Example #1
 def addClassifierModel(self, model, classifierModelPath, classifierParameters, threshold=None):
     classifierModel = model.get(self.tag+"classifier-model", True)
     shutil.copy2(classifierModelPath, classifierModel)
     model.addStr(self.tag+"classifier-parameter", Parameters.toString(Parameters.get(classifierParameters)))
     if threshold != None:
         model.addStr(self.tag+"threshold", str(threshold))
     return classifierModel
Example #3
def getConnection(connection): #, account=None, workDirBase=None, remoteSettingsPath=None):
    if connection == None: # return a "dummy" local connection
        return getConnection("connection=Unix:jobLimit=1")
    elif type(connection) in types.StringTypes and hasattr(Settings, connection): # connection is a Settings key
        print >> sys.stderr, "Using connection", connection
        return getConnection(getattr(Settings, connection))
        #return getConnection(*getattr(Settings, connection))
    else: # connection is a parameter string or dictionary
        defaultParams = dict.fromkeys(["connection", "account", "workdir", "settings", "memory", "cores", "modules", "wallTime", "jobLimit", "preamble", "debug"])
        defaultParams["debug"] = False
        connection = Parameters.get(connection, valueListKey="connection", valueTypes={"debug":[bool]}, defaults=defaultParams)
        if connection["connection"] == None:
            connection["connection"] = "Unix"
        if connection["account"] == None:
            assert connection["workdir"] == None
            #assert remoteSettingsPath == None
            print >> sys.stderr, "New local connection", Parameters.toString(connection)
        else: 
            print >> sys.stderr, "New remote connection:", Parameters.toString(connection)
        # Make the connection
        exec "ConnectionClass = " + connection["connection"] + "Connection"
        connectionArgs = {}
        for key in connection:
            if key != "connection" and connection[key] != None:
                connectionArgs[key] = connection[key]
        return ConnectionClass(**connectionArgs)
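The parameter strings passed to Parameters.get above (for example "connection=Unix:jobLimit=1") appear to be colon-separated key=value pairs. The following is a minimal sketch of such a parser, written only to illustrate the assumed format; it is not the actual Parameters module and ignores value lists, type checking and validation.

def parse_parameter_string(params, defaults=None):
    # Illustrative sketch: turn "connection=Unix:jobLimit=1" into a dict,
    # starting from an optional dict of default values.
    result = dict(defaults) if defaults else {}
    if not params:
        return result
    for item in params.split(":"):
        if "=" in item:
            key, value = item.split("=", 1)
        else:
            key, value = item, True  # bare flags (e.g. "keep_neg") become True
        result[key] = value
    return result

# parse_parameter_string("connection=Unix:jobLimit=1")
# -> {'connection': 'Unix', 'jobLimit': '1'}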
Example #4
def getSteps(step, omitSteps, mainSteps):
    # Determine substep to start from, for the main step from which processing starts
    step = Parameters.get(step, mainSteps)
    fromMainStep = None
    fromSubStep = {} # The substep to start from, for the main step to start from
    for mainStep in step.keys():
        fromSubStep[mainStep] = step[mainStep] # the sub step to start from
        if step[mainStep] != None:
            assert fromMainStep == None # processing can start from one place only
            fromMainStep = mainStep
            if step[mainStep] == True:
                fromSubStep[mainStep] = None
            else:
                assert type(step[mainStep]) in types.StringTypes # no list allowed, processing can start from one place only
    # Determine steps to omit
    omitSubSteps = {} # Skip these substeps. If the value is True, skip the entire main step.
    omitMainSteps = []
    omitSteps = Parameters.get(omitSteps, mainSteps)
    for mainStep in omitSteps.keys():
        omitSubSteps[mainStep] = omitSteps[mainStep]
        if omitSteps[mainStep] == True:
            omitMainSteps.append(mainStep)
            omitSubSteps[mainStep] = None
    # Initialize main step selector
    if fromMainStep != None:
        if fromSubStep[fromMainStep] != None:
            print >> sys.stderr, "Starting process from step", fromMainStep + ", substep", fromSubStep[fromMainStep]
        else:
            print >> sys.stderr, "Starting process from step", fromMainStep
    selector = StepSelector(mainSteps, fromStep=fromMainStep, omitSteps=omitMainSteps)
    return selector, fromSubStep, omitSubSteps
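To make the bookkeeping above easier to follow, here is a small self-contained sketch of the same "processing can start from one place only" rule, applied to a plain dict standing in for whatever Parameters.get returns (the step names are just examples):

def pick_start_step(step_dict):
    # Return (mainStep, subStep) for the single step processing starts from.
    # A value of True means "start from the beginning of that main step".
    from_main, from_sub = None, None
    for main, sub in step_dict.items():
        if sub is None:
            continue
        assert from_main is None, "processing can start from one place only"
        from_main = main
        from_sub = None if sub is True else sub
    return from_main, from_sub

# pick_start_step({"TRAIN": "EXAMPLES", "DEVEL": None, "TEST": None})
# -> ('TRAIN', 'EXAMPLES')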
Example #6
 def saveModel(self, teesModel, tag=""):
     if hasattr(self, "model") and self.model != None:
         teesModelPath = teesModel.get(tag+"classifier-model", True)
         shutil.copy2(self.model, teesModelPath)
     if hasattr(self, "parameters") and self.parameters != None:
         teesModel.addStr(tag+"classifier-parameter", Parameters.toString(Parameters.get(self.parameters)))
     if hasattr(self, "threshold") and self.threshold != None:
         teesModel.addStr(tag+"threshold", str(self.threshold))
Example #7
 def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False):
     assert step in ["BOTH", "SUBMIT", "RESULTS"], step
     outDir = os.path.abspath(outDir)
     # Initialize training (or reconnect to existing jobs)
     combinations = Parameters.getCombinations(Parameters.get(parameters, valueListKey="c")) #Core.OptimizeParameters.getParameterCombinations(parameters)
     trained = []
     for combination in combinations:
         trained.append( self.train(examples, outDir, combination, classifyExamples, replaceRemoteExamples=(len(trained) == 0), dummy=(step == "RESULTS")) )
     if step == "SUBMIT": # Return already
         classifier = copy.copy(self)
         classifier.setState("OPTIMIZE")
         return classifier
     
     # Wait for the training to finish
     finalJobStatus = self.connection.waitForJobs([x.getJob() for x in trained])
     # Evaluate the results
     print >> sys.stderr, "Evaluating results"
     #Stream.setIndent(" ")
     bestResult = None
     if evaluator == None:
         evaluator = self.defaultEvaluator
     for i in range(len(combinations)):
         id = trained[i].parameterIdStr
         #Stream.setIndent(" ")
         # Get predictions
         predictions = None
         if trained[i].getStatus() == "FINISHED":
             predictions = trained[i].downloadPredictions()
         else:
             print >> sys.stderr, "No results for combination" + id
             continue
         if downloadAllModels:
             trained[i].downloadModel()
         # Compare to other results
         print >> sys.stderr, "*** Evaluating results for combination" + id + " ***"
         threshold = None
         if determineThreshold:
             print >> sys.stderr, "Thresholding, original micro =",
             evaluation = evaluator.evaluate(classifyExamples, predictions, classIds, os.path.join(outDir, "evaluation-before-threshold" + id + ".csv"), verbose=False)
             print >> sys.stderr, evaluation.microF.toStringConcise()
             threshold, bestF = evaluator.threshold(classifyExamples, predictions)
             print >> sys.stderr, "threshold =", threshold, "at binary fscore", str(bestF)[0:6]
         evaluation = evaluator.evaluate(classifyExamples, ExampleUtils.loadPredictions(predictions, threshold=threshold), classIds, os.path.join(outDir, "evaluation" + id + ".csv"))
         if bestResult == None or evaluation.compare(bestResult[0]) > 0: #: averageResult.fScore > bestResult[1].fScore:
             bestResult = [evaluation, trained[i], combinations[i], threshold]
         if not self.connection.isLocal():
             os.remove(predictions) # remove predictions to save space
     #Stream.setIndent()
     if bestResult == None:
         raise Exception("No results for any parameter combination")
     print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***"
     print >> sys.stderr, "Selected parameters", bestResult[2]
     classifier = copy.copy(bestResult[1])
     classifier.threshold = bestResult[3]
     classifier.downloadModel()
     return classifier
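The selection loop above keeps the best (evaluation, classifier, parameters, threshold) combination seen so far, decided by evaluation.compare. A tiny generic sketch of that pattern, where compare is any function returning a positive number when its first argument is better:

def select_best(candidates, compare):
    # candidates: iterable of (evaluation, payload) pairs
    best = None
    for evaluation, payload in candidates:
        if best is None or compare(evaluation, best[0]) > 0:
            best = (evaluation, payload)
    return best

# select_best([(0.71, "c=1000"), (0.74, "c=5000"), (0.69, "c=10000")],
#             lambda a, b: (a > b) - (a < b))
# -> (0.74, 'c=5000')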
Example #8
 def saveModel(self, teesModel, tag=""):
     if hasattr(self, "model") and self.model != None:
         teesModelPath = teesModel.get(tag + "classifier-model", True)
         shutil.copy2(self.model, teesModelPath)
     if hasattr(self, "parameters") and self.parameters != None:
         teesModel.addStr(
             tag + "classifier-parameter",
             Parameters.toString(Parameters.get(self.parameters)))
     if hasattr(self, "threshold") and self.threshold != None:
         teesModel.addStr(tag + "threshold", str(self.threshold))
Example #9
    def doGrid(self):
        print >> sys.stderr, "--------- Booster parameter search ---------"
        # Build trigger examples
        self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir+"grid-trigger-examples.gz"])

        if self.fullGrid:
            # Parameters to optimize
            ALL_PARAMS={
                "trigger":[int(i) for i in Parameters.get(self.triggerClassifierParameters, valueListKey="c")["c"]], 
                "booster":[float(i) for i in self.recallAdjustParameters.split(",")], 
                "edge":[int(i) for i in Parameters.get(self.edgeClassifierParameters, valueListKey="c")["c"]] }
        else:
            ALL_PARAMS={"trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameter"), valueListKey="c")["c"],
                        "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
                        "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameter"), valueListKey="c")["c"]}
        
        paramCombinations = Parameters.getCombinations(ALL_PARAMS, ["trigger", "booster", "edge"])
        prevParams = None
        EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path)+"-edge-models/model-c_")
        TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path)+"-trigger-models/model-c_")
        bestResults = None
        for i in range(len(paramCombinations)):
            params = paramCombinations[i]
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            print >> sys.stderr, "Processing params", str(i+1) + "/" + str(len(paramCombinations)), params
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            # Triggers and Boost
            if prevParams == None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]:
                print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
                xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir+"grid-trigger-examples.gz", self.workDir+"grid-", classifierModel=TRIGGER_MODEL_STEM+str(params["trigger"]), recallAdjust=params["booster"])
            prevParams = params
            # Build edge examples
            self.edgeDetector.buildExamples(self.model, [xml], [self.workDir+"grid-edge-examples.gz"], [self.optData])
            # Classify with pre-defined model
            edgeClassifierModel=EDGE_MODEL_STEM+str(params["edge"])
            xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir+"grid-edge-examples.gz", self.workDir+"grid-", classifierModel=edgeClassifierModel)
            bestResults = self.evaluateGrid(xml, params, bestResults)
        print >> sys.stderr, "Booster search complete"
        print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
        print >> sys.stderr, "Best parameters:", bestResults[0]
        print >> sys.stderr, "Best result:", bestResults[2] # f-score
        # Save grid model
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model)
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
        if self.fullGrid: # define best models
            self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM+str(bestResults[0]["trigger"]), bestResults[0]["trigger"])
            self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM+str(bestResults[0]["edge"]), bestResults[0]["edge"])
        # Remove work files
        for stepTag in [self.workDir+"grid-trigger", self.workDir+"grid-edge", self.workDir+"grid-unmerging"]:
            for fileStem in ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]:
                if os.path.exists(stepTag+fileStem):
                    os.remove(stepTag+fileStem)
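Parameters.getCombinations above evidently expands a dict of value lists into the Cartesian product of single-value parameter dicts. A minimal sketch of that expansion with itertools.product (an illustration of the assumed behaviour, not the TEES implementation):

import itertools

def get_combinations(param_lists, order=None):
    # Expand {"trigger": [10, 100], "booster": [0.6]} into a list of
    # per-combination dicts, iterating the keys in the given order.
    keys = order if order is not None else sorted(param_lists)
    return [dict(zip(keys, values))
            for values in itertools.product(*[param_lists[k] for k in keys])]

# get_combinations({"trigger": [10, 100], "booster": [0.6, 0.8]}, ["trigger", "booster"])
# -> [{'trigger': 10, 'booster': 0.6}, {'trigger': 10, 'booster': 0.8},
#     {'trigger': 100, 'booster': 0.6}, {'trigger': 100, 'booster': 0.8}]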
Example #10
 def trainUnmergingDetector(self):
     xml = None
     if not self.unmerging:
         print >> sys.stderr, "No unmerging"
     if self.checkStep("SELF-TRAIN-EXAMPLES-FOR-UNMERGING", self.unmerging) and self.unmerging:
         # Self-classified train data for unmerging
         if self.doUnmergingSelfTraining:
             # This allows limiting to a subcorpus
             triggerStyle = copy.copy(Parameters.get(self.triggerExampleStyle))
             edgeStyle = copy.copy(Parameters.get(self.edgeExampleStyle))
             unmergingStyle = Parameters.get(self.unmergingExampleStyle)
             if "sentenceLimit" in unmergingStyle and unmergingStyle["sentenceLimit"]:
                 triggerStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
                 edgeStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]
             # Build the examples
             xml = self.triggerDetector.classifyToXML(self.trainData, self.model, None, self.workDir+"unmerging-extra-", exampleStyle=triggerStyle)#, recallAdjust=0.5)
             xml = self.edgeDetector.classifyToXML(xml, self.model, None, self.workDir+"unmerging-extra-", exampleStyle=edgeStyle)#, recallAdjust=0.5)
             assert xml != None
             EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml, self.trainData, self.parse)
         else:
             print >> sys.stderr, "No self-training for unmerging"
     if self.checkStep("UNMERGING-EXAMPLES", self.unmerging) and self.unmerging:
         # Unmerging example generation
         GOLD_TEST_FILE = self.optData.replace("-nodup", "")
         GOLD_TRAIN_FILE = self.trainData.replace("-nodup", "")
         if self.doUnmergingSelfTraining:
             if xml == None: 
                 xml = self.workDir+"unmerging-extra-edge-pred.xml.gz"
             self.unmergingDetector.buildExamples(self.model, [self.optData.replace("-nodup", ""), [self.trainData.replace("-nodup", ""), xml]], 
                                                  [self.workDir+"unmerging-opt-examples.gz", self.workDir+"unmerging-train-examples.gz"], 
                                                  [GOLD_TEST_FILE, [GOLD_TRAIN_FILE, GOLD_TRAIN_FILE]], 
                                                  exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
             xml = None
         else:
             self.unmergingDetector.buildExamples(self.model, [self.optData.replace("-nodup", ""), self.trainData.replace("-nodup", "")], 
                                                  [self.workDir+"unmerging-opt-examples.gz", self.workDir+"unmerging-train-examples.gz"], 
                                                  [GOLD_TEST_FILE, GOLD_TRAIN_FILE], 
                                                  exampleStyle=self.unmergingExampleStyle, saveIdsToModel=True)
             xml = None
         #UnmergingExampleBuilder.run("/home/jari/biotext/EventExtension/TrainSelfClassify/test-predicted-edges.xml", GOLD_TRAIN_FILE, UNMERGING_TRAIN_EXAMPLE_FILE, PARSE, TOK, UNMERGING_FEATURE_PARAMS, UNMERGING_IDS, append=True)
     if self.checkStep("BEGIN-UNMERGING-MODEL", self.unmerging) and self.unmerging:
         self.unmergingDetector.beginModel(None, self.model, self.workDir+"unmerging-train-examples.gz", self.workDir+"unmerging-opt-examples.gz")
     if self.checkStep("END-UNMERGING-MODEL", self.unmerging) and self.unmerging:
         self.unmergingDetector.endModel(None, self.model, self.workDir+"unmerging-opt-examples.gz")
         print >> sys.stderr, "Adding unmerging classifier model to test-set event model"
         if self.combinedModel != None:
             self.combinedModel.addStr("unmerging-example-style", self.model.getStr("unmerging-example-style"))
             self.combinedModel.insert(self.model.get("unmerging-ids.classes"), "unmerging-ids.classes")
             self.combinedModel.insert(self.model.get("unmerging-ids.features"), "unmerging-ids.features")
             self.unmergingDetector.addClassifierModel(self.combinedModel, self.model.get("unmerging-classifier-model", True), 
                                                       self.model.getStr("unmerging-classifier-parameter"))
             self.combinedModel.save()
Example #11
 def getClassifier(self, parameters):
     parameters = Parameters.get(parameters, ["TEES.threshold", "TEES.classifier", "c"], valueListKey="c")
     if parameters["TEES.classifier"] == None:
         return self.Classifier
     else:
         exec "from Classifiers." + parameters["TEES.classifier"] + " import " + parameters["TEES.classifier"] + " as " + parameters["TEES.classifier"]
         return eval(parameters["TEES.classifier"])
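The exec/eval pair above builds an import statement from the parameter value. A commonly used alternative for this kind of dynamic class loading is importlib; the sketch below shows that variant under the same "Classifiers.<Name>.<Name>" layout assumed by the original code (it is not the TEES implementation):

import importlib

def load_classifier_class(name, default=None):
    # Load Classifiers.<name>.<name> dynamically, or fall back to a default class.
    if name is None:
        return default
    module = importlib.import_module("Classifiers." + name)
    return getattr(module, name)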
Example #12
 def train(self, trainData=None, optData=None, model=None, combinedModel=None, exampleStyle=None, 
     classifierParameters=None, parse=None, tokenization=None, task=None, fromStep=None, toStep=None, 
     workDir=None):
     
     exampleStyle = Parameters.cat(exampleStyle, "keep_neg:no_features")
     EdgeDetector.train(self, trainData, optData, model, combinedModel, exampleStyle, classifierParameters, parse, tokenization, fromStep, toStep)
     self.classify(trainData, model, "classification-train/train", goldData=trainData, workDir="classification-train")
Example #13
 def train(self, examples, outDir, parameters, classifyExamples=None, dummy=False):
     outDir = os.path.abspath(outDir)
     
     examples = self.getExampleFile(examples, dummy=dummy)
     classifyExamples = self.getExampleFile(classifyExamples, dummy=dummy)
     
     # Return a new classifier instance for following the training process and using the model
     classifier = copy.copy(self)
     classifier.parameters = parameters
     classifier._filesToRelease = [examples, classifyExamples]
     
     if not os.path.exists(outDir):
         os.makedirs(outDir)
     
     trainFeatures, trainClasses = datasets.load_svmlight_file(examples)
     if classifyExamples != None:
         develFeatures, develClasses = datasets.load_svmlight_file(classifyExamples, trainFeatures.shape[1])
     binarizer = preprocessing.LabelBinarizer()
     binarizer.fit(trainClasses)
     trainClasses = binarizer.transform(trainClasses)
     if classifyExamples != None:
         develClasses = binarizer.transform(develClasses)
     
     print >> sys.stderr, "Training Keras model with parameters:", parameters
     parameters = Parameters.get(parameters, {"TEES.classifier":"KerasClassifier", "layers":5, "lr":0.001, "epochs":1, "batch_size":64, "patience":10})
     np.random.seed(10)
     classifier.kerasModel = classifier._defineModel(outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses)
     classifier._fitModel(outDir, parameters, trainFeatures, trainClasses, develFeatures, develClasses)
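The loading and binarization above appear to use scikit-learn's svmlight loader and LabelBinarizer. A minimal standalone sketch of just that preprocessing step, assuming scikit-learn is installed and "train-examples.svmlight" is a placeholder file name:

import sys
from sklearn import datasets, preprocessing

# Load sparse features and integer class labels from an svmlight-format file.
X_train, y_train = datasets.load_svmlight_file("train-examples.svmlight")
# Turn the label vector into a binary indicator matrix (one column per class
# when there are more than two classes).
binarizer = preprocessing.LabelBinarizer()
y_binary = binarizer.fit_transform(y_train)
sys.stderr.write("Classes: %s\n" % list(binarizer.classes_))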
Example #14
    def classifyToXML(self, data, model, exampleFileName=None, tag="", classifierModel=None, goldData=None, parse=None, recallAdjust=None, compressExamples=True, exampleStyle=None):
        model = self.openModel(model, "r")
        if parse == None:
            parse = self.getStr(self.tag+"parse", model)
        if exampleFileName == None:
            exampleFileName = tag+self.tag+"examples"
            if compressExamples:
                exampleFileName += ".gz"
        self.buildExamples(model, [data], [exampleFileName], [goldData], parse=parse, exampleStyle=exampleStyle)
        if classifierModel == None:
            classifierModel = model.get(self.tag+"classifier-model")
        else:
            assert os.path.exists(classifierModel), classifierModel
        classifier = self.Classifier()
        classifier.classify(exampleFileName, tag+self.tag+"classifications", classifierModel, finishBeforeReturn=True)
        threshold = model.getStr(self.tag+"threshold", defaultIfNotExist=None, asType=float)
        predictions = ExampleUtils.loadPredictions(tag+self.tag+"classifications", recallAdjust, threshold=threshold)
        evaluator = self.evaluator.evaluate(exampleFileName, predictions, model.get(self.tag+"ids.classes"))
        #outputFileName = tag+"-"+self.tag+"pred.xml.gz"
        #exampleStyle = self.exampleBuilder.getParameters(model.getStr(self.tag+"example-style"))
        if exampleStyle == None:
            exampleStyle = Parameters.get(model.getStr(self.tag+"example-style")) # no checking, but these should already have passed the ExampleBuilder
        return self.exampleWriter.write(exampleFileName, predictions, data, tag+self.tag+"pred.xml.gz", model.get(self.tag+"ids.classes"), parse, exampleStyle=exampleStyle)
#        if evaluator.getData().getTP() + evaluator.getData().getFP() > 0:
#            return self.exampleWriter.write(exampleFileName, predictions, data, outputFileName, model.get(self.tag+"ids.classes"), parse)
#        else:
#            # TODO: e.g. interactions must be removed if task does unmerging
#            print >> sys.stderr, "No positive", self.tag + "predictions, XML file", outputFileName, "unchanged from input"
#            if type(data) in types.StringTypes: # assume its a file
#                shutil.copy(data, outputFileName)
#            else: # assume its an elementtree
#                ETUtils.write(data, outputFileName)
#            #print >> sys.stderr, "No positive predictions, XML file", tag+self.tag+"pred.xml", "not written"
#            return data #None
Example #15
 def getBioNLPSharedTaskParams(self, parameters=None, model=None):
     if parameters == None:
         if model != None:
             model = self.openModel(model, "r")
             parameters = model.getStr("BioNLPSTParams", defaultIfNotExist=None)
         else:
             parameters = {}
     return Parameters.get(parameters, ["convert", "evaluate", "scores", "a2Tag"])
Example #16
 def getClassifier(self, parameters):
     #parameters = Parameters.get(parameters, ["TEES.threshold", "TEES.classifier", "c"], valueListKey="c")
     parameters = Parameters.get(parameters, ["TEES.threshold", "TEES.classifier"], allowNew=True, valueListKey="c")
     if parameters["TEES.classifier"] == None:
         return self.Classifier
     else:
         exec "from Classifiers." + parameters["TEES.classifier"] + " import " + parameters["TEES.classifier"] + " as " + parameters["TEES.classifier"]
         return eval(parameters["TEES.classifier"])
Example #18
def learnSettings(inputFiles, detector, classifierParameters):
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"):
            datasets = sorted(
                filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")

    # Choose detector
    if detector == None:
        if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in structureAnalyzer.targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets
    print >> sys.stderr, "Using detector '" + str(detector) + "'"

    # Set default parameters
    if detector == "Detectors.EventDetector":
        classifierParameters["unmerging"] = Parameters.cat(
            "c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000",
            classifierParameters["unmerging"],
            "Classifier parameters for unmerging")
        classifierParameters["modifiers"] = Parameters.cat(
            "c=5000,10000,20000,50000,100000",
            classifierParameters["modifiers"],
            "Classifier parameters for modifiers")
        classifierParameters["edge"] = Parameters.cat(
            "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
            classifierParameters["edge"], "Classifier parameters for edges")
        classifierParameters["trigger"] = Parameters.cat(
            "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
            classifierParameters["trigger"],
            "Classifier parameters for triggers")
        classifierParameters["recall"] = Parameters.cat(
            "0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2",
            classifierParameters["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        classifierParameters["examples"] = Parameters.cat(
            "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
            classifierParameters["examples"],
            "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        classifierParameters["examples"] = Parameters.cat(
            "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
            classifierParameters["examples"],
            "Classifier parameters for edges")

    return detector
Example #19
 def getParameters(self, parameters=None, model=None, defaultValue=None, modelParameterStringName=None):
     if modelParameterStringName == None:
         modelParameterStringName = self.modelParameterStringName
     if parameters == None and model != None:
         model = self.openModel(model, "r")
         parameters = model.getStr(modelParameterStringName, defaultIfNotExist=None)
     defaultStepNames = [x[0] for x in self.getDefaultSteps()]
     valueLimits={"omitSteps":defaultStepNames + [None], "intermediateFiles":defaultStepNames + [True, None]}
     defaults = self.getDefaultParameters(defaultValue=defaultValue)
     return Parameters.get(parameters, defaults, valueLimits=valueLimits)
Example #20
 def train(self, examples, outDir, parameters, classifyExamples=None, finishBeforeReturn=False, replaceRemoteExamples=True, dummy=False):
     outDir = os.path.abspath(outDir)
     
     examples = self.getExampleFile(examples, replaceRemote=replaceRemoteExamples, dummy=dummy)
     classifyExamples = self.getExampleFile(classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy)
     parameters = Parameters.get(parameters, valueListKey="c")
     trainDir = self.connection.getSetting(self.trainDirSetting)
     
     # Return a new classifier instance for following the training process and using the model
     classifier = copy.copy(self)
     classifier.setState("TRAIN")
     classifier.parameters = parameters
     classifier._filesToRelease = [examples, classifyExamples]
     # Train
     if not os.path.exists(outDir):
         os.makedirs(outDir)
     trainCommand = os.path.join(trainDir, self.trainCommand)
     paramKeys = sorted(parameters.keys())
     idStr = ""
     paramString = ""
     for key in paramKeys:
         if key.startswith("TEES."):
             continue
         if len(paramString) > 0 and not paramString.endswith(" "):
             paramString += " "
         if parameters[key] != None:
             paramString += self.parameterFormat.replace("%k", key).replace("%v", str(parameters[key])).strip()
             idStr += "-" + str(key) + "_" + str(parameters[key])
         else:
             paramString += self.parameterFormat.replace("%k", key).replace("%v", "").strip()
             idStr += "-" + str(key)
     classifier.parameterIdStr = idStr
     classifier.model = self.connection.getRemotePath(outDir + "/model" + idStr, True)
     modelPath = self.connection.getRemotePath(outDir + "/model" + idStr, False)
     trainCommand = trainCommand.replace("%p", paramString).replace("%e", examples).replace("%m", modelPath).strip()
     self.connection.addCommand(trainCommand)
     # Classify with the trained model (optional)
     if classifyExamples != None:
         classifier.predictions = self.connection.getRemotePath(outDir + "/predictions" + idStr, True)
         predictionsPath = self.connection.getRemotePath(outDir + "/predictions" + idStr, False)
         classifyDir = self.connection.getSetting(self.classifyDirSetting)
         classifyCommand = os.path.join(classifyDir, self.classifyCommand).replace("%e", classifyExamples).replace("%m", modelPath).replace("%c", predictionsPath).strip()
         self.connection.addCommand(classifyCommand)
     # Run the process
     jobName = self.trainCommand.split()[0] + idStr
     logPath = outDir + "/" + jobName
     if dummy: # return a classifier that connects to an existing job
         self.connection.clearCommands()
         classifier._job = self.connection.getJob(jobDir=outDir, jobName=jobName)
     else: # submit the job
         classifier._job = self.connection.submit(jobDir=outDir, jobName=jobName, stdout=logPath+".stdout")
         if finishBeforeReturn:
             self.connection.waitForJob(classifier._job)
             self.getStatus()
     return classifier
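For reference, the %k/%v loop above produces a command fragment and an id suffix of the following shape. This is a standalone sketch with a made-up parameterFormat default; the real format string comes from the classifier subclass:

def format_parameters(parameters, parameter_format="-%k %v"):
    # Build (paramString, idStr) the way the loop above does; "TEES." keys
    # are internal settings and are not passed to the external tool.
    param_string, id_str = "", ""
    for key in sorted(parameters):
        if key.startswith("TEES."):
            continue
        if param_string and not param_string.endswith(" "):
            param_string += " "
        value = parameters[key]
        if value is not None:
            param_string += parameter_format.replace("%k", key).replace("%v", str(value)).strip()
            id_str += "-" + key + "_" + str(value)
        else:
            param_string += parameter_format.replace("%k", key).replace("%v", "").strip()
            id_str += "-" + key
    return param_string, id_str

# format_parameters({"c": 10000}) -> ('-c 10000', '-c_10000')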
Example #21
 def initModel(self, model, saveParams=[]):
     if model == None:
         return model
     elif type(model) in types.StringTypes:
         model = self.openModel(model, "w")
     else:
         assert model.mode in ["a", "w"]
     for param in saveParams:
         model.addStr(param[1], Parameters.toString(getattr(self, param[0])))
     model.save()
     return model
Example #23
 def train(self, examples, outDir, parameters, classifyExamples=None, finishBeforeReturn=False, replaceRemoteExamples=True, dummy=False):
     outDir = os.path.abspath(outDir)
     
     examples = self.getExampleFile(examples, replaceRemote=replaceRemoteExamples, dummy=dummy)
     classifyExamples = self.getExampleFile(classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy)
     #parameters = Parameters.get(parameters, valueListKey="c")
     trainDir = ""
     if self.trainDirSetting:
         trainDir = os.path.normpath(self.connection.getSetting(self.trainDirSetting)) + os.path.sep
     
     # Return a new classifier instance for following the training process and using the model
     classifier = copy.copy(self)
     classifier.setState("TRAIN")
     classifier.parameters = parameters
     classifier._filesToRelease = [examples, classifyExamples]
     # Train
     if not os.path.exists(outDir):
         os.makedirs(outDir)
     #trainCommand = os.path.join(trainDir, self.trainCommand)
     trainCommand = self.trainCommand.replace("%d", trainDir)
     parameters = Parameters.get(parameters, self.parameterDefaults["train"], self.parameterAllowNew["train"], 
                                 self.parameterValueListKey["train"], self.parameterValueLimits["train"], 
                                 self.parameterValueTypes["train"])
     paramString, idStr = self._getParameterString(parameters)
     classifier.parameterIdStr = idStr
     classifier.model = self.connection.getRemotePath(outDir + "/model" + idStr, True)
     modelPath = self.connection.getRemotePath(outDir + "/model" + idStr, False)
     trainCommand = trainCommand.replace("%p", paramString).replace("%e", examples).replace("%m", modelPath).strip()
     self.connection.addCommand(trainCommand)
     # Classify with the trained model (optional)
     if classifyExamples != None:
         classifier.predictions = self.connection.getRemotePath(outDir + "/predictions" + idStr, True)
         predictionsPath = self.connection.getRemotePath(outDir + "/predictions" + idStr, False)
         classifyDir = ""
         if self.classifyDirSetting:
             classifyDir = os.path.normpath(self.connection.getSetting(self.classifyDirSetting)) + os.path.sep
         classifyCommand = self.classifyCommand.replace("%d", classifyDir).replace("%e", classifyExamples).replace("%m", modelPath).replace("%c", predictionsPath).strip()
         self.connection.addCommand(classifyCommand)
     # Run the process
     jobName = self.trainCommand.split()[0].replace("%d", "") + idStr
     logPath = outDir + "/" + jobName
     if dummy: # return a classifier that connects to an existing job
         self.connection.clearCommands()
         classifier._job = self.connection.getJob(jobDir=outDir, jobName=jobName)
     else: # submit the job
         classifier._job = self.connection.submit(jobDir=outDir, jobName=jobName, stdout=logPath+".stdout")
         if finishBeforeReturn:
             self.connection.waitForJob(classifier._job)
             self.getStatus()
     return classifier
Example #24
def getConnection(
        connection
):  #, account=None, workDirBase=None, remoteSettingsPath=None):
    if connection == None:  # return a "dummy" local connection
        return getConnection("connection=Unix:jobLimit=1")
    elif type(connection) in types.StringTypes and hasattr(
            Settings, connection):  # connection is a Settings key
        print >> sys.stderr, "Using connection", connection
        return getConnection(getattr(Settings, connection))
        #return getConnection(*getattr(Settings, connection))
    else:  # connection is a parameter string or dictionary
        defaultParams = dict.fromkeys([
            "connection", "account", "workdir", "settings", "memory", "cores",
            "modules", "wallTime", "jobLimit", "preamble", "debug"
        ])
        defaultParams["debug"] = False
        connection = Parameters.get(connection,
                                    valueListKey="connection",
                                    valueTypes={"debug": [bool]},
                                    defaults=defaultParams)
        if connection["connection"] == None:
            connection["connection"] = "Unix"
        if connection["account"] == None:
            assert connection["workdir"] == None
            #assert remoteSettingsPath == None
            print >> sys.stderr, "New local connection", Parameters.toString(
                connection)
        else:
            print >> sys.stderr, "New remote connection:", Parameters.toString(
                connection)
        # Make the connection
        exec "ConnectionClass = " + connection["connection"] + "Connection"
        connectionArgs = {}
        for key in connection:
            if key != "connection" and connection[key] != None:
                connectionArgs[key] = connection[key]
        return ConnectionClass(**connectionArgs)
Example #25
 def train(self, examples, outDir, parameters, classifyExamples=None, finishBeforeReturn=False, replaceRemoteExamples=True, dummy=False):
     outDir = os.path.abspath(outDir)
     
     examples = self.getExampleFile(examples, replaceRemote=replaceRemoteExamples, dummy=dummy)
     classifyExamples = self.getExampleFile(classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy)
     parameters = Parameters.get(parameters, valueListKey="c")
     svmMulticlassDir = self.connection.getSetting("SVM_MULTICLASS_DIR")
     
     # Return a new classifier instance for following the training process and using the model
     classifier = copy.copy(self)
     classifier.setState("TRAIN")
     classifier.parameters = parameters
     # Train
     if not os.path.exists(outDir):
         os.makedirs(outDir)
     trainCommand = svmMulticlassDir + "/svm_multiclass_learn "
     paramKeys = sorted(parameters.keys())
     idStr = ""
     for key in paramKeys:
         trainCommand += "-" + str(key) + " "
         idStr += "-" + str(key)
         if parameters[key] != None:
             trainCommand += str(parameters[key]) + " "
             idStr += "_" + str(parameters[key])
     classifier.parameterIdStr = idStr
     classifier.model = self.connection.getRemotePath(outDir + "/model" + idStr, True)
     modelPath = self.connection.getRemotePath(outDir + "/model" + idStr, False)
     trainCommand += examples + " " + modelPath
     self.connection.addCommand(trainCommand)
     # Classify with the trained model (optional)
     if classifyExamples != None:
         classifier.predictions = self.connection.getRemotePath(outDir + "/predictions" + idStr, True)
         predictionsPath = self.connection.getRemotePath(outDir + "/predictions" + idStr, False)
         classifyCommand = svmMulticlassDir + "/svm_multiclass_classify " + classifyExamples + " " + modelPath + " " + predictionsPath
         self.connection.addCommand(classifyCommand)
     # Run the process
     jobName = "svm_multiclass_learn" + idStr
     logPath = outDir + "/" + jobName
     if dummy: # return a classifier that connects to an existing job
         self.connection.clearCommands()
         classifier._job = self.connection.getJob(jobDir=outDir, jobName=jobName)
     else: # submit the job
         classifier._job = self.connection.submit(jobDir=outDir, jobName=jobName, stdout=logPath+".stdout")
         if finishBeforeReturn:
             self.connection.waitForJob(classifier._job)
     return classifier
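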
Example #26
    def train(self,
              examples,
              outDir,
              parameters,
              classifyExamples=None,
              dummy=False):
        outDir = os.path.abspath(outDir)

        examples = self.getExampleFile(examples, dummy=dummy)
        classifyExamples = self.getExampleFile(classifyExamples, dummy=dummy)

        # Return a new classifier instance for following the training process and using the model
        classifier = copy.copy(self)
        classifier.parameters = parameters
        classifier._filesToRelease = [examples, classifyExamples]

        if not os.path.exists(outDir):
            os.makedirs(outDir)

        trainFeatures, trainClasses = datasets.load_svmlight_file(examples)
        if classifyExamples != None:
            develFeatures, develClasses = datasets.load_svmlight_file(
                classifyExamples, trainFeatures.shape[1])
        binarizer = preprocessing.LabelBinarizer()
        binarizer.fit(trainClasses)
        trainClasses = binarizer.transform(trainClasses)
        if classifyExamples != None:
            develClasses = binarizer.transform(develClasses)

        print >> sys.stderr, "Training Keras model with parameters:", parameters
        parameters = Parameters.get(
            parameters, {
                "TEES.classifier": "KerasClassifier",
                "layers": 5,
                "lr": 0.001,
                "epochs": 1,
                "batch_size": 64,
                "patience": 10
            })
        np.random.seed(10)
        classifier.kerasModel = classifier._defineModel(
            outDir, parameters, trainFeatures, trainClasses, develFeatures,
            develClasses)
        classifier._fitModel(outDir, parameters, trainFeatures, trainClasses,
                             develFeatures, develClasses)
Example #27
 def getBioNLPSharedTaskParams(self, parameters=None, model=None):
     if parameters == None:
         if model != None:
             model = self.openModel(model, "r")
             parameters = model.getStr("BioNLPSTParams",
                                       defaultIfNotExist=None)
         else:
             parameters = {}
     elif parameters == "skip" or "skip" in parameters:
         parameters = {"convert": False}
     return Parameters.get(
         parameters, {
             "convert": None,
             "evaluate": None,
             "scores": None,
             "a2Tag": None,
             "evalSubTasks": "123"
         })
Example #28
 def getParameters(self,
                   parameters=None,
                   model=None,
                   defaultValue=None,
                   modelParameterStringName=None):
     if modelParameterStringName == None:
         modelParameterStringName = self.modelParameterStringName
     if parameters == None and model != None:
         model = self.openModel(model, "r")
         parameters = model.getStr(modelParameterStringName,
                                   defaultIfNotExist=None)
     defaultStepNames = [x[0] for x in self.getDefaultSteps()]
     valueLimits = {
         "omitSteps": defaultStepNames + [None],
         "intermediateFiles": defaultStepNames + [True, None]
     }
     defaults = self.getDefaultParameters(defaultValue=defaultValue)
     return Parameters.get(parameters, defaults, valueLimits=valueLimits)
Example #29
 def beginModel(self, step, model, trainExampleFiles, testExampleFile, importIdsFromModel=None):
     """
     Begin the training process leading to a new model.
     """
     if self.checkStep(step, False):
         if model != None:
             if self.state != None and step != None:
                 print >> sys.stderr, self.__class__.__name__ + ":" + self.state + ":" + step
             # Create combined model
             model = self.openModel(model, "w")
             assert model.mode in ["a", "w"], (model.path, model.mode)
             # Information can be imported from an existing model. In this case, model is trained
             # with the parameter already defined in the import source. This is used when training
             # the combined model.
             if importIdsFromModel != None:
                 model.importFrom(self.openModel(importIdsFromModel, "r"), [self.tag+"ids.classes", self.tag+"ids.features", "structure.txt"],
                                  [self.tag+"classifier-parameter", self.tag+"example-style", self.tag+"parse", self.tag+"task"])
                 # Train the model with the parameters defined in the import source
                 model.addStr(self.tag+"classifier-parameters-train", model.getStr(self.tag+"classifier-parameter"))
             if self.bioNLPSTParams != None and len(self.bioNLPSTParams) > 0:
                 model.addStr("BioNLPSTParams", Parameters.toString(self.bioNLPSTParams))
             # Catenate example files
             if type(trainExampleFiles) in types.StringTypes:
                 combinedTrainExamples = trainExampleFiles
             elif len(trainExampleFiles) == 1: 
                 combinedTrainExamples = trainExampleFiles[0]
             else:
                 combinedTrainExamples = self.workDir + os.path.normpath(model.path)+"-"+self.tag+"combined-examples.gz"
                 combinedTrainExamplesFile = gzip.open(combinedTrainExamples, 'wb')
                 for trainExampleFile in trainExampleFiles:
                     print >> sys.stderr, "Catenating", trainExampleFile, "to", combinedTrainExamples
                     shutil.copyfileobj(gzip.open(trainExampleFile, 'rb'), combinedTrainExamplesFile)
                 combinedTrainExamplesFile.close()
             # Upload training model
             # The parameter grid is stored in the model as "*classifier-parameters-train" so that endModel can 
             # use it, and also as annotation for the trained model. The final selected parameter will
             # be stored as "*classifier-parameter" 
             classifierWorkDir = self.workDir + os.path.normpath(model.path) + "-" + self.tag + "models"
             classifier = self.getClassifier(model.getStr(self.tag+"classifier-parameters-train"))(self.connection)
             classifier.optimize(combinedTrainExamples, classifierWorkDir, model.getStr(self.tag+"classifier-parameters-train"), testExampleFile, model.get(self.tag+"ids.classes"), step="SUBMIT", evaluator=self.evaluator)
             model.save()
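The example-file catenation above streams the decompressed content of each training example file into one combined gzip file with shutil.copyfileobj. A minimal standalone sketch of the same pattern (file names are placeholders):

import gzip
import shutil

def concatenate_gzip_files(input_paths, output_path):
    # Decompress each input .gz file and recompress everything into a single
    # combined .gz file, the way beginModel catenates example files.
    with gzip.open(output_path, "wb") as combined:
        for path in input_paths:
            with gzip.open(path, "rb") as part:
                shutil.copyfileobj(part, combined)

# concatenate_gzip_files(["train1-examples.gz", "train2-examples.gz"],
#                        "combined-examples.gz")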
Example #30
 def process(self, input, output, parameters=None, model=None, fromStep=None, toStep=None, omitSteps=None):
     self.initVariables(source=input, xml=input, outDir=os.path.dirname(output))
     if os.path.basename(output) != "":
         self.intermediateFileTag = os.path.basename(output)
     else:
         self.intermediateFileTag = ""
     self.enterState(self.STATE_TOOLCHAIN, [x[0] for x in self.steps], fromStep, toStep, omitSteps)
     parameters = self.getParameters(parameters, model, defaultValue=NOTHING)
     self.applyParameters(parameters)
     # Run the tools
     print >> sys.stderr, "Tool chain parameters:", Parameters.toString(parameters, skipKeysWithValues=[NOTHING], skipDefaults=self.getDefaultParameters())
     if os.path.exists(output) and not os.path.isdir(output):
         print >> sys.stderr, "Removing existing preprocessor output file", output
         os.remove(output)
     savedIntermediate = None # Output from a previous step if "fromStep" is used
     for step in self.steps:
         if self.checkStep(step[0]):
             if savedIntermediate != None: # A previous run of the program saved an intermediate file
                 print >> sys.stderr, "Reading input from saved intermediate file", savedIntermediate
                 self.xml = ETUtils.ETFromObj(savedIntermediate)
                 savedIntermediate = None
             stepArgs = copy.copy(step[2]) # make a copy of the arguments to which i/o can be added
             stepArgs[step[4]["input"]] = self.xml # the input
             if self.getIntermediateFilePath(step) != None: # this step should save an intermediate file
                 stepArgs[step[4]["output"]] = self.getIntermediateFilePath(step)
             print >> sys.stderr, "Running step", step[0], "with arguments", stepArgs
             step[1](**stepArgs) # call the tool
         elif self.getStepStatus(step[0]) == "BEFORE": # this step was run earlier
             savedIntermediate = self.getIntermediateFilePath(step)
     # End state and return
     xml = self.xml # state-specific member variable self.xml will be removed when exiting state
     self.exitState()
     if self.state == None: # if the whole toolchain has finished, return the final product
         if not os.path.isdir(output): # if output is a directory, it was given only for storing intermediate files ...
             ETUtils.write(xml, output) # ... otherwise, save the final output
         return xml
     else:
         return None
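Each step above is a tuple carrying at least a name, a callable, preset arguments and an i/o argument mapping. A stripped-down sketch of the same dispatch idea, simplified so that each tool returns its output directly instead of writing intermediate files (the tuple layout is an approximation, not the exact Preprocessor structure):

import sys

def run_steps(steps, data):
    # steps: iterable of (name, func, presetArgs, ioMap) tuples
    for name, func, preset_args, io_map in steps:
        args = dict(preset_args)      # copy so the preset arguments stay untouched
        args[io_map["input"]] = data  # plug the current intermediate data in
        sys.stderr.write("Running step %s with arguments %r\n" % (name, args))
        data = func(**args)           # feed the result to the next step
    return data

# run_steps([("UPPER", lambda text: text.upper(), {}, {"input": "text"})], "abc") -> 'ABC'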
Example #31
    def train(self,
              trainData=None,
              optData=None,
              model=None,
              combinedModel=None,
              exampleStyle=None,
              classifierParameters=None,
              parse=None,
              tokenization=None,
              task=None,
              fromStep=None,
              toStep=None,
              workDir=None):

        exampleStyle = Parameters.cat(exampleStyle, "keep_neg:no_features")
        EdgeDetector.train(self, trainData, optData, model, combinedModel,
                           exampleStyle, classifierParameters, parse,
                           tokenization, fromStep, toStep)
        self.classify(trainData,
                      model,
                      "classification-train/train",
                      goldData=trainData,
                      workDir="classification-train")
Example #32
def learnSettings(inputFiles, detector, classifierParameters):
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"): 
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")
    
    # Choose detector
    if detector == None:
        if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in structureAnalyzer.targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets
    print >> sys.stderr, "Using detector '" + str(detector) + "'"
    
    # Set default parameters
    if detector == "Detectors.EventDetector":
        classifierParameters["unmerging"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000", classifierParameters["unmerging"], "Classifier parameters for unmerging")        
        classifierParameters["modifiers"] = Parameters.cat("c=5000,10000,20000,50000,100000", classifierParameters["modifiers"], "Classifier parameters for modifiers")
        classifierParameters["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["edge"], "Classifier parameters for edges")
        classifierParameters["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["trigger"], "Classifier parameters for triggers")
        classifierParameters["recall"] = Parameters.cat("0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2", classifierParameters["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        classifierParameters["examples"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        classifierParameters["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["examples"], "Classifier parameters for edges")

    return detector
Example #33
 def beginModel(self,
                step,
                model,
                trainExampleFiles,
                testExampleFile,
                importIdsFromModel=None):
     """
     Begin the training process leading to a new model.
     """
     if self.checkStep(step, False):
         if model != None:
             if self.state != None and step != None:
                 print >> sys.stderr, self.__class__.__name__ + ":" + self.state + ":" + step
             # Create combined model
             model = self.openModel(model, "w")
             assert model.mode in ["a", "w"], (model.path, model.mode)
             # Information can be imported from an existing model. In this case, model is trained
             # with the parameter already defined in the import source. This is used when training
             # the combined model.
             if importIdsFromModel != None:
                 model.importFrom(self.openModel(importIdsFromModel, "r"), [
                     self.tag + "ids.classes", self.tag + "ids.features",
                     "structure.txt"
                 ], [
                     self.tag + "classifier-parameter", self.tag +
                     "example-style", self.tag + "parse", self.tag + "task"
                 ])
                 # Train the model with the parameters defined in the import source
                 model.addStr(
                     self.tag + "classifier-parameters-train",
                     model.getStr(self.tag + "classifier-parameter"))
             if self.bioNLPSTParams != None and len(
                     self.bioNLPSTParams) > 0:
                 model.addStr("BioNLPSTParams",
                              Parameters.toString(self.bioNLPSTParams))
             # Catenate example files
             if type(trainExampleFiles) in types.StringTypes:
                 combinedTrainExamples = trainExampleFiles
             elif len(trainExampleFiles) == 1:
                 combinedTrainExamples = trainExampleFiles[0]
             else:
                 combinedTrainExamples = self.workDir + os.path.normpath(
                     model.path) + "-" + self.tag + "combined-examples.gz"
                 combinedTrainExamplesFile = gzip.open(
                     combinedTrainExamples, 'wb')
                 for trainExampleFile in trainExampleFiles:
                     print >> sys.stderr, "Catenating", trainExampleFile, "to", combinedTrainExamples
                     shutil.copyfileobj(gzip.open(trainExampleFile, 'rb'),
                                        combinedTrainExamplesFile)
                 combinedTrainExamplesFile.close()
             # Upload training model
             # The parameter grid is stored in the model as "*classifier-parameters-train" so that endModel can
             # use it, and also as annotation for the trained model. The final selected parameter will
             # be stored as "*classifier-parameter"
             classifierWorkDir = self.workDir + os.path.normpath(
                 model.path) + "-" + self.tag + "models"
             classifier = self.getClassifier(
                 model.getStr(self.tag + "classifier-parameters-train"))(
                     self.connection)
             classifier.optimize(
                 combinedTrainExamples,
                 classifierWorkDir,
                 model.getStr(self.tag + "classifier-parameters-train"),
                 testExampleFile,
                 model.get(self.tag + "ids.classes"),
                 step="SUBMIT",
                 evaluator=self.evaluator)
             model.save()
Example #34
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, 
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None, 
          classifierParams=None,  doFullGrid=False, deleteOutput=False, copyFrom=None, 
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None, subset=None, 
          folds=None, corpusDir=None, corpusPreprocessing=None, evaluator=None):
    """
    Train a new model for event or relation detection.
    
    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names with keys "train", "devel", and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. Set to None to disable logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {"train":None, "devel":None, "test":None})
    models = setDictDefaults(models, {"devel":"model-devel", "test":"model-test"})
    exampleStyles = setDictDefaults(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = setDictDefaults(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    subset = setDictDefaults(Parameters.get(subset), {"train":None, "devel":None, "test":None, "seed":0, "all":None})
    folds = setDictDefaults(folds, {"train":None, "devel":None, "test":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    useKerasDetector = False
    if detector != None and "keras" in detector.lower():
        print >> sys.stderr, "Using a Keras Detector"
        useKerasDetector = True
        if detector.lower() == "keras":
            detector = None
    detector, bioNLPSTParams, preprocessorParams, folds = getTaskSettings(task, detector, 
        bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams, folds, corpusDir=corpusDir, useKerasDetector=useKerasDetector)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams, task, exampleStyles, useKerasDetector=useKerasDetector)   
    # Get corpus subsets   
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None: 
        task = task.replace("-FULL", "")
    if "." in task:
        _, subTask = getSubTask(task)
        if subTask != 3:
            processModifiers = False
    # Preprocess the corpus if required
    if corpusPreprocessing != None:
        preprocessor = Preprocessor(steps=corpusPreprocessing)
        assert preprocessor.steps[0].name == "MERGE_SETS"
        assert preprocessor.steps[-1].name == "DIVIDE_SETS"
        preprocessedCorpusDir = os.path.join(output, "corpus")
        #outputFiles = {x:os.path.join(preprocessedCorpusDir, os.path.basename(inputFiles[x])) for x in inputFiles}
        preprocessor.process(inputFiles, os.path.join(preprocessedCorpusDir, task))
        #inputFiles = outputFiles
        for setName in inputFiles.keys():
            if inputFiles[setName] != None:
                inputFiles[setName] = os.path.join(preprocessedCorpusDir, task + "-" + setName + ".xml")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    
    # Initialize the detector
    detector, detectorName = getDetector(detector, evaluator=evaluator)
    evaluator, evaluatorName = importClass(evaluator, "evaluator")
    detector = detector() # initialize object
    if evaluator != None:
        print >> sys.stderr, "Using evaluator", evaluator.__name__
        detector.evaluator = evaluator
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()
    
    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if not isinstance(detector, EventDetector):
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers, 
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training", testData=inputFiles["test"])
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if evaluatorName != None:
                    model.addStr("detector", evaluatorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        removalScope = "non-given"
        if "names" in str(exampleStyles["examples"]) or "names" in str(exampleStyles["trigger"]):
            removalScope = "all"
        elif "Edge" in detector.__class__.__name__:
            removalScope = "interactions"
        detector.classify(getEmptyCorpus(inputFiles["devel"], scope=removalScope), models["devel"], "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
        print >> sys.stderr, "*** Evaluate empty devel classification ***"
        if os.path.exists("classification-empty/devel-empty-pred.xml.gz"):
            EvaluateInteractionXML.run(detector.evaluator, "classification-empty/devel-empty-pred.xml.gz", inputFiles["devel"], parse)
        else:
            print >> sys.stderr, "No output file for evaluation"
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"] if models["test"] != None else models["devel"], "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                extension = ".zip" if (detector.bioNLPSTParams["convert"] == "zip") else ".tar.gz" 
                Utils.STFormat.Compare.compare("classification-test/test-events" + extension, "classification-devel/devel-events" + extension, "a2")
    # Stop logging
    if log != None:
        Stream.closeLog(log)
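
A hypothetical call to the train() function above; the output path, task identifier and parameter values are placeholders, not recommended settings:

train(output="output/GE11-run",                       # placeholder output directory
      task="GE11.2",                                  # placeholder TEES task identifier
      classifierParams={"trigger": "c=10000,50000"},  # partial override; missing keys fall back to task defaults
      step="TRAIN",                                   # begin processing from the TRAIN main step
      debug=True)
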
Example #35
0
    def classifyToXML(self,
                      data,
                      model,
                      exampleFileName=None,
                      tag="",
                      classifierModel=None,
                      goldData=None,
                      parse=None,
                      recallAdjust=None,
                      compressExamples=True,
                      exampleStyle=None,
                      useExistingExamples=False):
        model = self.openModel(model, "r")
        if parse == None:
            parse = self.getStr(self.tag + "parse", model)
        if useExistingExamples:
            assert exampleFileName != None
            assert os.path.exists(exampleFileName)
        if exampleFileName == None:
            exampleFileName = tag + self.tag + "examples"
            if compressExamples:
                exampleFileName += ".gz"
        if not useExistingExamples:
            self.buildExamples(model, [data], [exampleFileName], [goldData],
                               parse=parse,
                               exampleStyle=exampleStyle)
        if classifierModel == None:
            classifierModel = model.get(self.tag + "classifier-model",
                                        defaultIfNotExist=None)
        #else:
        #    assert os.path.exists(classifierModel), classifierModel
        classifier = self.getClassifier(
            model.getStr(self.tag + "classifier-parameter",
                         defaultIfNotExist=None))()
        classifier.classify(exampleFileName,
                            tag + self.tag + "classifications",
                            classifierModel,
                            finishBeforeReturn=True)
        threshold = model.getStr(self.tag + "threshold",
                                 defaultIfNotExist=None,
                                 asType=float)
        predictions = ExampleUtils.loadPredictions(tag + self.tag +
                                                   "classifications",
                                                   recallAdjust,
                                                   threshold=threshold)
        evaluator = self.evaluator.evaluate(
            exampleFileName, predictions, model.get(self.tag + "ids.classes"))
        #outputFileName = tag+"-"+self.tag+"pred.xml.gz"
        #exampleStyle = self.exampleBuilder.getParameters(model.getStr(self.tag+"example-style"))
        if exampleStyle == None:
            exampleStyle = Parameters.get(
                model.getStr(self.tag + "example-style")
            )  # no checking, but these should already have passed the ExampleBuilder
        self.structureAnalyzer.load(model)
        return self.exampleWriter.write(
            exampleFileName,
            predictions,
            data,
            tag + self.tag + "pred.xml.gz",
            model.get(self.tag + "ids.classes"),
            parse,
            exampleStyle=exampleStyle,
            structureAnalyzer=self.structureAnalyzer)


#        if evaluator.getData().getTP() + evaluator.getData().getFP() > 0:
#            return self.exampleWriter.write(exampleFileName, predictions, data, outputFileName, model.get(self.tag+"ids.classes"), parse)
#        else:
#            # TODO: e.g. interactions must be removed if task does unmerging
#            print >> sys.stderr, "No positive", self.tag + "predictions, XML file", outputFileName, "unchanged from input"
#            if type(data) in types.StringTypes: # assume its a file
#                shutil.copy(data, outputFileName)
#            else: # assume its an elementtree
#                ETUtils.write(data, outputFileName)
#            #print >> sys.stderr, "No positive predictions, XML file", tag+self.tag+"pred.xml", "not written"
#            return data #None
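
A hypothetical use of classifyToXML() on a trained detector instance; the detector object, corpus file and model directory below are placeholders:

xml = detector.classifyToXML("devel-set.xml",          # placeholder corpus file
                             "model-devel",            # placeholder model directory
                             tag="classification-devel/devel-",
                             recallAdjust=0.7)          # optional recall booster, e.g. from the grid search
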
Example #36
0
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory or file")
    optparser.add_option("-r", "--remote", default=None, dest="remote", help="Remote connection")
    #optparser.add_option("-c", "--classifier", default="SVMMultiClassClassifier", dest="classifier", help="Classifier Class")
    optparser.add_option("-p", "--parameters", default=None, dest="parameters", help="Parameters for the classifier")
    #optparser.add_option("-d", "--ids", default=None, dest="ids", help="")
    #optparser.add_option("--filterIds", default=None, dest="filterIds", help="")
    optparser.add_option("--install", default=None, dest="install", help="Install directory (or DEFAULT)")
    optparser.add_option("--installFromSource", default=False, action="store_true", dest="installFromSource", help="")
    (options, args) = optparser.parse_args()

    assert options.action in ["TRAIN", "CLASSIFY", "OPTIMIZE"]
    classifier = ScikitClassifier(Connection.getConnection(options.remote))
    if options.action == "TRAIN":
        import time
        trained = classifier.train(options.examples, options.output, options.parameters, options.classifyExamples)
        status = trained.getStatus()
        while status not in ["FINISHED", "FAILED"]:
            print >> sys.stderr, "Training classifier, status =", status
            time.sleep(10)
            status = trained.getStatus()
        print >> sys.stderr, "Training finished, status =", status
        if trained.getStatus() == "FINISHED":
            trained.downloadPredictions()
            trained.downloadModel()
    elif options.action == "CLASSIFY":
        classified = classifier.classify(options.examples, options.output, options.model, True)
        if classified.getStatus() == "FINISHED":
            classified.downloadPredictions()
    else: # OPTIMIZE
        options.parameters = Parameters.get(options.parameters)
        optimized = classifier.optimize(options.examples, options.output, options.parameters, options.classifyExamples, options.classIds, step=options.optimizeStep)
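
For the OPTIMIZE action the -p/--parameters string is treated as a parameter grid. A sketch of the assumed string convention (colon-separated settings, comma-separated value lists, bare keys acting as flags), using the Parameters module already imported by the script above:

params = Parameters.get("c=500,1000,5000:TEES.threshold", valueListKey="c")
# assumed result: {"c": ["500", "1000", "5000"], "TEES.threshold": True}
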
Example #37
0
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory or file")
    optparser.add_option("-r", "--remote", default=None, dest="remote", help="Remote connection")
    #optparser.add_option("-c", "--classifier", default="SVMMultiClassClassifier", dest="classifier", help="Classifier Class")
    optparser.add_option("-p", "--parameters", default=None, dest="parameters", help="Parameters for the classifier")
    #optparser.add_option("-d", "--ids", default=None, dest="ids", help="")
    #optparser.add_option("--filterIds", default=None, dest="filterIds", help="")
    optparser.add_option("--install", default=None, dest="install", help="Install directory (or DEFAULT)")
    optparser.add_option("--installFromSource", default=False, action="store_true", dest="installFromSource", help="")
    (options, args) = optparser.parse_args()

    assert options.action in ["TRAIN", "CLASSIFY", "OPTIMIZE"]
    classifier = ScikitClassifier(Connection.getConnection(options.remote))
    if options.action == "TRAIN":
        import time
        trained = classifier.train(options.examples, options.output, options.parameters, options.classifyExamples)
        status = trained.getStatus()
        while status not in ["FINISHED", "FAILED"]:
            print >> sys.stderr, "Training classifier, status =", status
            time.sleep(10)
            status = trained.getStatus()
        print >> sys.stderr, "Training finished, status =", status
        if trained.getStatus() == "FINISHED":
            trained.downloadPredictions()
            trained.downloadModel()
    elif options.action == "CLASSIFY":
        classified = classifier.classify(options.examples, options.output, options.model, True)
        if classified.getStatus() == "FINISHED":
            classified.downloadPredictions()
    else: # OPTIMIZE
        options.parameters = Parameters.get(options.parameters)
        optimized = classifier.optimize(options.examples, options.output, options.parameters, options.classifyExamples, options.classIds, step=options.optimizeStep)
Example #38
0
    def doGrid(self):
        print >> sys.stderr, "--------- Booster parameter search ---------"
        # Build trigger examples
        self.triggerDetector.buildExamples(
            self.model, [self.optData],
            [self.workDir + "grid-trigger-examples.gz"])

        if self.fullGrid:
            # Parameters to optimize
            ALL_PARAMS = {
                "trigger": [
                    int(i)
                    for i in Parameters.get(self.triggerClassifierParameters,
                                            valueListKey="c")["c"]
                ],
                "booster":
                [float(i) for i in self.recallAdjustParameters.split(",")],
                "edge": [
                    int(i)
                    for i in Parameters.get(self.edgeClassifierParameters,
                                            valueListKey="c")["c"]
                ]
            }
        else:
            ALL_PARAMS = {
                "trigger":
                Parameters.get(self.model.getStr(self.triggerDetector.tag +
                                                 "classifier-parameter"),
                               valueListKey="c")["c"],
                "booster":
                [float(i) for i in self.recallAdjustParameters.split(",")],
                "edge":
                Parameters.get(self.model.getStr(self.edgeDetector.tag +
                                                 "classifier-parameter"),
                               valueListKey="c")["c"]
            }

        paramCombinations = Parameters.getCombinations(
            ALL_PARAMS, ["trigger", "booster", "edge"])
        prevParams = None
        EDGE_MODEL_STEM = os.path.join(
            self.edgeDetector.workDir,
            os.path.normpath(self.model.path) + "-edge-models/model-c_")
        TRIGGER_MODEL_STEM = os.path.join(
            self.triggerDetector.workDir,
            os.path.normpath(self.model.path) + "-trigger-models/model-c_")
        bestResults = None
        for i in range(len(paramCombinations)):
            params = paramCombinations[i]
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            print >> sys.stderr, "Processing params", str(i + 1) + "/" + str(
                len(paramCombinations)), params
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            # Triggers and Boost
            if prevParams == None or prevParams["trigger"] != params[
                    "trigger"] or prevParams["booster"] != params["booster"]:
                print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(
                    params["trigger"]), "booster:" + str(params["booster"])
                xml = self.triggerDetector.classifyToXML(
                    self.optData,
                    self.model,
                    self.workDir + "grid-trigger-examples.gz",
                    self.workDir + "grid-",
                    classifierModel=TRIGGER_MODEL_STEM +
                    str(params["trigger"]),
                    recallAdjust=params["booster"])
            prevParams = params
            # Build edge examples
            self.edgeDetector.buildExamples(
                self.model, [xml], [self.workDir + "grid-edge-examples.gz"],
                [self.optData])
            # Classify with pre-defined model
            edgeClassifierModel = EDGE_MODEL_STEM + str(params["edge"])
            xml = self.edgeDetector.classifyToXML(
                xml,
                self.model,
                self.workDir + "grid-edge-examples.gz",
                self.workDir + "grid-",
                classifierModel=edgeClassifierModel)
            bestResults = self.evaluateGrid(xml, params, bestResults)
        print >> sys.stderr, "Booster search complete"
        print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
        print >> sys.stderr, "Best parameters:", bestResults[0]
        print >> sys.stderr, "Best result:", bestResults[2]  # f-score
        # Save grid model
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]),
                     self.model)
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]),
                     self.combinedModel, False)
        if self.fullGrid:  # define best models
            self.triggerDetector.addClassifierModel(
                self.model,
                TRIGGER_MODEL_STEM + str(bestResults[0]["trigger"]),
                bestResults[0]["trigger"])
            self.edgeDetector.addClassifierModel(
                self.model, EDGE_MODEL_STEM + str(bestResults[0]["edge"]),
                bestResults[0]["edge"])
        # Remove work files
        for stepTag in [
                self.workDir + "grid-trigger", self.workDir + "grid-edge",
                self.workDir + "grid-unmerging"
        ]:
            for fileStem in [
                    "-classifications", "-classifications.log", "examples.gz",
                    "pred.xml.gz"
            ]:
                if os.path.exists(stepTag + fileStem):
                    os.remove(stepTag + fileStem)
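
A small sketch of how the grid above is enumerated, using the Parameters module already imported by the code above; the second argument to getCombinations() is assumed to control only the order in which the parameters are varied, and the values are placeholders:

ALL_PARAMS = {"trigger": [10000, 50000], "booster": [0.7, 1.0], "edge": [7500]}
for params in Parameters.getCombinations(ALL_PARAMS, ["trigger", "booster", "edge"]):
    print params  # e.g. {'trigger': 10000, 'booster': 0.7, 'edge': 7500}
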
Example #39
0
    def optimize(self,
                 examples,
                 outDir,
                 parameters,
                 classifyExamples,
                 classIds,
                 step="BOTH",
                 evaluator=None,
                 determineThreshold=False,
                 timeout=None,
                 downloadAllModels=False):
        assert step in ["BOTH", "SUBMIT", "RESULTS"], step
        outDir = os.path.abspath(outDir)
        # Initialize training (or reconnect to existing jobs)
        combinations = Parameters.getCombinations(
            Parameters.get(parameters, valueListKey="c")
        )  #Core.OptimizeParameters.getParameterCombinations(parameters)
        trained = []
        for combination in combinations:
            trained.append(
                self.train(examples,
                           outDir,
                           combination,
                           classifyExamples,
                           replaceRemoteExamples=(len(trained) == 0),
                           dummy=(step == "RESULTS")))
        if step == "SUBMIT":  # Return already
            classifier = copy.copy(self)
            classifier.setState("OPTIMIZE")
            return classifier

        # Wait for the training to finish
        finalJobStatus = self.connection.waitForJobs(
            [x.getJob() for x in trained])
        # Evaluate the results
        print >> sys.stderr, "Evaluating results"
        #Stream.setIndent(" ")
        bestResult = None
        if evaluator == None:
            evaluator = self.defaultEvaluator
        for i in range(len(combinations)):
            id = trained[i].parameterIdStr
            #Stream.setIndent(" ")
            # Get predictions
            predictions = None
            if trained[i].getStatus() == "FINISHED":
                predictions = trained[i].downloadPredictions()
            else:
                print >> sys.stderr, "No results for combination" + id
                continue
            if downloadAllModels:
                trained[i].downloadModel()
            # Compare to other results
            print >> sys.stderr, "*** Evaluating results for combination" + id + " ***"
            threshold = None
            if determineThreshold:
                print >> sys.stderr, "Thresholding, original micro =",
                evaluation = evaluator.evaluate(
                    classifyExamples,
                    predictions,
                    classIds,
                    os.path.join(outDir,
                                 "evaluation-before-threshold" + id + ".csv"),
                    verbose=False)
                print >> sys.stderr, evaluation.microF.toStringConcise()
                threshold, bestF = evaluator.threshold(classifyExamples,
                                                       predictions)
                print >> sys.stderr, "threshold =", threshold, "at binary fscore", str(
                    bestF)[0:6]
            evaluation = evaluator.evaluate(
                classifyExamples,
                ExampleUtils.loadPredictions(predictions, threshold=threshold),
                classIds, os.path.join(outDir, "evaluation" + id + ".csv"))
            if bestResult == None or evaluation.compare(
                    bestResult[0]
            ) > 0:  #: averageResult.fScore > bestResult[1].fScore:
                bestResult = [
                    evaluation, trained[i], combinations[i], threshold
                ]
            if not self.connection.isLocal():
                os.remove(predictions)  # remove predictions to save space
        #Stream.setIndent()
        if bestResult == None:
            raise Exception("No results for any parameter combination")
        print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***"
        print >> sys.stderr, "Selected parameters", bestResult[2]
        classifier = copy.copy(bestResult[1])
        classifier.threshold = bestResult[3]
        classifier.downloadModel()
        return classifier
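
optimize() supports a two-phase workflow on a batch system: one run submits a job per parameter combination and returns, and a later run reconnects to the finished jobs and compares their results. A hypothetical sequence (all file names are placeholders):

# first run: submit one training job per combination and return without waiting
classifier.optimize("train-examples.gz", "work/models", "c=1000,10000",
                    "devel-examples.gz", "ids.classes", step="SUBMIT")
# later run, after the jobs have finished: download predictions and pick the best combination
best = classifier.optimize("train-examples.gz", "work/models", "c=1000,10000",
                           "devel-examples.gz", "ids.classes", step="RESULTS")
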
Example #40
0
def parameters():
    """This function deals with parameters passed to the script"""

    # Defines globals to be used above
    global mode, effectObject, target_packet_type, save_active, NFQUEUE_Active

    # Defaults
    mode = print_packet
    target_packet_type = 'ALL'
    save_active = False

    # Setup
    NFQUEUE_Active = True

    # Arguments
    parser = argparse.ArgumentParser(
        prog="Packet.py",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        allow_abbrev=False)

    parser.add_argument_group('Arguments', description=Parameter.Usage())

    # Mode parameters
    effect = parser.add_mutually_exclusive_group(required=True)

    effect.add_argument('--print',
                        Parameter.cmd_print,
                        action='store_true',
                        dest="output",
                        help=argparse.SUPPRESS)

    effect.add_argument('--ignore',
                        '-i',
                        action='store_true',
                        dest='ignore',
                        help=argparse.SUPPRESS)

    effect.add_argument('--latency',
                        Parameter.cmd_latency,
                        action='store',
                        help=argparse.SUPPRESS,
                        type=int)

    effect.add_argument('--packet-loss',
                        Parameter.cmd_packetloss,
                        action='store',
                        help=argparse.SUPPRESS,
                        type=int)

    effect.add_argument('--surge',
                        Parameter.cmd_throttle,
                        action='store',
                        help=argparse.SUPPRESS,
                        type=int)

    effect.add_argument('--display-bandwidth',
                        Parameter.cmd_bandwidth,
                        action='store_true',
                        help=argparse.SUPPRESS)

    effect.add_argument('--rate-limit',
                        Parameter.cmd_ratelimit,
                        action='store',
                        dest='rate_limit',
                        help=argparse.SUPPRESS,
                        type=int)

    # Extra parameters
    parser.add_argument('--target-packet',
                        Parameter.cmd_target_packet,
                        action='store',
                        dest='target',
                        help=argparse.SUPPRESS)

    parser.add_argument('--save',
                        Parameter.cmd_save,
                        nargs=1,
                        dest='save',
                        help=argparse.SUPPRESS)

    args = parser.parse_args()

    # Modes
    if args.output:

        effectObject = Print.Print()
        mode = print_packet

    elif args.ignore:
        mode = ignore_packet

    elif args.latency:
        effectObject = Latency.Latency(latency_value=args.latency)
        mode = packet_latency

    elif args.packet_loss:
        effectObject = PacketLoss.PacketLoss(percentage=args.packet_loss)
        mode = packet_loss

    elif args.surge:
        effectObject = Surge.Surge(period=args.surge)
        effectObject.start_purge_monitor()
        mode = surge

    elif args.display_bandwidth:
        effectObject = DisplayBandwidth.DisplayBandwidth()
        mode = track_bandwidth

    elif args.rate_limit:
        # Sets the bandwidth object with the specified bandwidth limit
        effectObject = LimitBandwidth.LimitBandwidth(bandwidth=args.rate_limit)
        mode = limit_bandwidth

    if args.save:
        print('[!] File saving on - Files will be saved under: \'{}.pcap\''.
              format(args.save[0]))

        save_active = True
        setup_packet_save(args.save[0])

    if args.target:
        target_packet_type = args.target

    # When all parameters are handled
    if NFQUEUE_Active:
        run_packet_manipulation()
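
A standalone sketch of the mutually-exclusive "effect" group pattern used above, with plain argparse; the option names are shortened stand-ins rather than the script's real flags:

import argparse

parser = argparse.ArgumentParser(prog="sketch.py")
effect = parser.add_mutually_exclusive_group(required=True)
effect.add_argument('--latency', type=int)
effect.add_argument('--packet-loss', dest='packet_loss', type=int)

args = parser.parse_args(['--latency', '500'])
print args.latency  # 500; passing two effect options at once is rejected by argparse
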
Example #41
0
def learnSettings(inputFiles, detector, classifierParameters, task, exampleStyles, useKerasDetector=False):
    if detector == None:
        print >> sys.stderr, "*** Analyzing input files to determine training settings ***"
        structureAnalyzer = StructureAnalyzer()
        if not os.path.exists("training/structure.txt"): 
            datasets = sorted(filter(None, [inputFiles["train"], inputFiles["devel"]]))
            print >> sys.stderr, "input files:", datasets
            structureAnalyzer.analyze(datasets)
            print >> sys.stderr, structureAnalyzer.toString()
            structureAnalyzer.save(None, "training/structure.txt")
        else:
            print >> sys.stderr, "Using existing analysis from training/structure.txt"
            structureAnalyzer.load(None, "training/structure.txt")
    
    # Choose detector
    if detector == None:
        if "ENTITY" in structureAnalyzer.targets and "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EventDetector"
        elif "ENTITY" in structureAnalyzer.targets:
            detector = "Detectors.EntityDetector"
        elif "INTERACTION" in structureAnalyzer.targets:
            detector = "Detectors.EdgeDetector"
        else:
            assert False, structureAnalyzer.targets

    if useKerasDetector and not "Keras" in detector:
        detector = detector.replace("Detectors.", "Detectors.Keras")
    print >> sys.stderr, "Using detector '" + str(detector) + "'"
    
    # Set default parameters
    cp = classifierParameters
    if detector == "Detectors.EventDetector":
        # Add common classifier parameters
        if cp["examples"] != None:
            cp["unmerging"] = Parameters.cat(cp["examples"], cp["unmerging"])
            cp["modifiers"] = Parameters.cat(cp["examples"], cp["modifiers"])
            cp["edge"] = Parameters.cat(cp["examples"], cp["edge"])
            cp["trigger"] = Parameters.cat(cp["examples"], cp["trigger"])
        cp["unmerging"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000", cp["unmerging"], "Classifier parameters for unmerging")        
        cp["modifiers"] = Parameters.cat("c=5000,10000,20000,50000,100000", cp["modifiers"], "Classifier parameters for modifiers")
        cp["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", cp["edge"], "Classifier parameters for edges")
        cp["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", cp["trigger"], "Classifier parameters for triggers")
        cp["recall"] = Parameters.cat("0.5,0.6,0.65,0.7,0.85,1.0,1.1,1.2", cp["recall"], "Recall adjustment parameters")
    elif detector == "Detectors.EntityDetector":
        cp["examples"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", cp["examples"], "Classifier parameters for entities")
    elif detector == "Detectors.EdgeDetector":
        cp["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", cp["examples"], "Classifier parameters for edges")
    elif detector == "Detectors.UnmergingDetector":
        cp["examples"] = Parameters.cat("c=1,10,100,500,1000,1500,2500,5000,10000,20000,50000,80000,100000", cp["examples"], "Classifier parameters for unmerging")
    
    #######################################################################
    # Keras example styles
    #######################################################################
    if useKerasDetector:
        task, subTask = getSubTask(task)
        msg = "Keras example style"
        #overrideStyles = {x:(Parameters.get(exampleStyles[x]) if (exampleStyles[x] != None and "override" in exampleStyles[x]) else {"override":True}) for x in exampleStyles}
        overrideStyles = {"all":{}}
        for key in exampleStyles:
            overrideStyles[key] = {}
            params = Parameters.get(exampleStyles[key])
            if "override" in params:
                exampleStyles[key] = None
                overrideStyles[key] = params
                overrideStyles[key].pop("override")
            elif "override_all" in params:
                exampleStyles[key] = None
                overrideStyles["all"] = params
                overrideStyles["all"].pop("override_all")
            #exampleStyles[key] = exampleStyles[key] if (exampleStyles[key] != None and not "override" in exampleStyles[key]) else None
        print >> sys.stderr, "Override styles:", overrideStyles
        if "EventDetector" in detector:
            if task == "EPI11":
                exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:epi_merge_negated", exampleStyles["trigger"])
            else:
                exampleStyles["trigger"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["trigger"])
            if task in ["GE09", "GE11", "GE13"] and subTask == 1:
                exampleStyles["edge"] = Parameters.cat("keras:genia_task1:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"])
            else:
                exampleStyles["edge"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["edge"])
            exampleStyles["unmerging"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["unmerging"])
            exampleStyles["modifiers"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:el=41:mods=20", exampleStyles["modifiers"])
        elif "EntityDetector" in detector:
            if task == "DDI13T91":
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20:names:build_for_nameless", exampleStyles["examples"])
            else:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=4:el=41:mods=20", exampleStyles["examples"])
        elif "EdgeDetector" in detector:
            if "DDI" in task:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=0:do=0.2:dense=800:ol=50:mods=20", exampleStyles["examples"])
            elif task == "CP17":
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=512:path=0:do=0.2:ol=50:skip_labels=CPR\:0,CPR\:1,CPR\:2,CPR\:7,CPR\:8,CPR\:10:mods=20", exampleStyles["examples"])
            else:
                exampleStyles["examples"] = Parameters.cat("keras:epochs=500:patience=10:nf=256:path=4:ol=15:mods=20", exampleStyles["examples"])
        print >> sys.stderr, "Keras initial example styles:", exampleStyles
        for key in exampleStyles:
            if exampleStyles[key] != None:
                exampleStyles[key] = Parameters.get(exampleStyles[key])
                exampleStyles[key].update(overrideStyles[key])
                exampleStyles[key].update(overrideStyles["all"])
                exampleStyles[key] = Parameters.toString(exampleStyles[key])
            print >> sys.stderr, "Keras final example style for " + key + ": ", exampleStyles[key]
        
    return detector
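
A sketch of the "override" merge performed above, using the Parameters module already imported by the code and assuming Parameters.get() parses a style string into a flat dictionary of settings: a user style marked with "override" replaces individual keys of the task default instead of the whole string.

style = Parameters.get("keras:epochs=500:patience=10:nf=256")  # placeholder task default
override = Parameters.get("override:epochs=100")               # user-supplied style
override.pop("override")
style.update(override)                                         # epochs becomes 100, the rest is kept
print Parameters.toString(style)
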
Example #42
0
 def addClassifierModel(self, model, classifierModelPath, classifierParameters):
     classifierModel = model.get(self.tag+"classifier-model", True)
     shutil.copy2(classifierModelPath, classifierModel)
     model.addStr(self.tag+"classifier-parameter", Parameters.toString(Parameters.get(classifierParameters)))
     return classifierModel
Example #43
0
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams,
                    inputFiles, exampleStyles, classifierParameters):
    if task != None:
        print >> sys.stderr, "*** Defining training settings for task", task, "***"
        fullTaskId = task
        subTask = 2
        if "." in task:
            task, subTask = task.split(".")
            subTask = int(subTask)
        dataPath = Settings.CORPUS_DIR
        for dataset in ["devel", "train", "test"]:
            if inputFiles[dataset] == None and inputFiles[dataset] != "None":
                inputFiles[dataset] = os.path.join(
                    dataPath,
                    task.replace("-FULL", "") + "-" + dataset + ".xml")
            if task == "ID11" and dataset == "train":
                inputFiles[dataset] = Catenate.catenate(
                    [
                        os.path.join(dataPath, "ID11-train.xml"),
                        os.path.join(dataPath, "GE11-devel.xml"),
                        os.path.join(dataPath, "GE11-train.xml")
                    ],
                    "training/ID11-train-and-GE11-devel-and-train.xml.gz",
                    fast=True)
            if inputFiles[dataset] == "None":
                inputFiles[dataset] = None
            if inputFiles[dataset] != None and not os.path.exists(
                    inputFiles[dataset]):
                inputFiles[dataset] = None
                print >> sys.stderr, "Input file", inputFiles[
                    dataset], "for set '" + dataset + "' does not exist, skipping."
        assert inputFiles["train"] != None  # at least training set must exist
        # Example generation parameters
        if task == "CO11":
            detector = "Detectors.CODetector"
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            detector = "Detectors.EventDetector"

        # BioNLP Shared Task and preprocessing parameters
        if task == "BI11-FULL":
            bioNLPSTParams = Parameters.cat(
                bioNLPSTParams, "convert:scores",
                "BioNLP Shared Task / " + fullTaskId, ["default"]
            )  # the shared task evaluator is not designed for predicted entities
        elif task == "REL11":
            bioNLPSTParams = Parameters.cat(
                bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel",
                "BioNLP Shared Task / " + fullTaskId, ["default"])
        elif task not in ["DDI11", "DDI11-FULL", "DDI13"]:
            bioNLPSTParams = Parameters.cat(
                bioNLPSTParams, "convert:evaluate:scores",
                "BioNLP Shared Task / " + fullTaskId, ["default"])

        # Preprocessing parameters
        if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL"]:
            Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS",
                           preprocessorParams, "Preprocessor /" + fullTaskId,
                           ["default"])
        else:  # parse only sentences where BANNER found an entity
            Parameters.cat(
                "intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities",
                preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])

        # Example style parameters for single-stage tasks
        if task == "REN11":
            exampleStyles["examples"] = Parameters.cat(
                "undirected:bacteria_renaming:maskTypeAsProtein=Gene",
                exampleStyles["examples"],
                "Single-stage example style / " + fullTaskId)
        elif task == "DDI11":
            exampleStyles["examples"] = Parameters.cat(
                "drugbank_features:ddi_mtmx:filter_shortest_path=conj_and",
                exampleStyles["examples"],
                "Single-stage example style / " + fullTaskId)
        elif task == "DDI13":
            exampleStyles["examples"] = Parameters.cat(
                "keep_neg:drugbank_features:filter_shortest_path=conj_and",
                exampleStyles["examples"],
                "Single-stage example style / " + fullTaskId)
        elif task == "BI11":
            exampleStyles["edge"] = Parameters.cat(
                "bi_features", exampleStyles["edge"],
                "Edge example style / " + fullTaskId)
        # Edge style
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["edge"] = Parameters.cat(
                "genia_features:genia_task1", exampleStyles["edge"])
        elif task in ["GE09", "GE11", "GE13"]:
            exampleStyles["edge"] = Parameters.cat("genia_features",
                                                   exampleStyles["edge"])
        elif task == "REL11":
            exampleStyles["edge"] = Parameters.cat(
                "rel_features", exampleStyles["edge"],
                "Edge example style / " + fullTaskId)
        elif task == "DDI11-FULL":
            exampleStyles["edge"] = Parameters.cat(
                "drugbank_features:filter_shortest_path=conj_and",
                exampleStyles["edge"], "Edge example style / " + fullTaskId)
        elif task == "CO11":
            exampleStyles["edge"] = Parameters.cat(
                "co_features", exampleStyles["edge"],
                "Edge example style / " + fullTaskId)
        elif task == "BI11-FULL":
            exampleStyles["edge"] = Parameters.cat(
                "bi_features", exampleStyles["edge"],
                "Edge example style / " + fullTaskId)
        # Trigger style
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["trigger"] = Parameters.cat(
                "genia_task1", exampleStyles["trigger"],
                "Trigger example style / " + fullTaskId)
        elif task in ["EPI11", "PC13"]:
            exampleStyles["trigger"] = Parameters.cat(
                "epi_merge_negated", exampleStyles["trigger"],
                "Trigger example style / " + fullTaskId)
        elif task == "BB11":  # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat(
                "bb_features:build_for_nameless", exampleStyles["trigger"],
                "Trigger example style / " + fullTaskId)
        elif task == "BB13T3":  # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat(
                "bb_features:build_for_nameless", exampleStyles["trigger"],
                "Trigger example style / " + fullTaskId)
        elif task == "REL11":
            exampleStyles["trigger"] = Parameters.cat(
                "rel_features", exampleStyles["trigger"],
                "Trigger example style / " + fullTaskId)
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            exampleStyles["trigger"] = "build_for_nameless:names"
        # Classifier parameters
        if task == "DDI11":
            classifierParameters["examples"] = Parameters.cat(
                "c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold",
                classifierParameters["examples"],
                "Classifier parameters for single-stage examples" + fullTaskId)
        #elif task == "DDI13":
        #    classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
        elif task == "CO11":
            classifierParameters["edge"] = Parameters.cat(
                "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                classifierParameters["examples"],
                "Classifier parameters for edges / " + fullTaskId)
            classifierParameters["trigger"] = Parameters.cat(
                "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
                classifierParameters["examples"],
                "Classifier parameters for triggers / " + fullTaskId)
            classifierParameters["recall"] = Parameters.cat(
                "0.8,0.9,0.95,1.0", classifierParameters["recall"],
                "Recall adjust / " + fullTaskId)

    return detector, bioNLPSTParams, preprocessorParams
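
Parameters.cat() is used throughout getTaskSettings() to lay a task-default parameter string alongside whatever the caller supplied (which may be None); the message argument appears to be used only for logging. A hedged sketch mirroring the example-style calls above, using the Parameters module already imported by the code; the precedence between the two strings on conflicting keys is not asserted here:

edgeStyle = None  # nothing given by the caller
edgeStyle = Parameters.cat("genia_features:genia_task1", edgeStyle,
                           "Edge example style / GE11.1")
print edgeStyle
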
Example #44
0
def getTaskSettings(task,
                    detector,
                    bioNLPSTParams,
                    preprocessorParams,
                    inputFiles,
                    exampleStyles,
                    classifierParameters,
                    folds,
                    corpusDir=None):
    if task != None:
        print >> sys.stderr, "*** Defining training settings for task", task, "***"
        fullTaskId = task
        subTask = 2
        if "." in task:
            task, subTask = task.split(".")
            subTask = int(subTask)
        if corpusDir == None:
            corpusDir = Settings.CORPUS_DIR
        for dataset in ["devel", "train", "test"]:
            if inputFiles[dataset] == None and inputFiles[dataset] != "None":
                if task.startswith("DDI13"):
                    if dataset in ["devel", "train"]:
                        inputFiles[dataset] = os.path.join(
                            corpusDir, "DDI13-train.xml")
                    elif dataset == "test":
                        if task.endswith("T91"):
                            inputFiles[dataset] = os.path.join(
                                corpusDir, "DDI13-test-task9.1.xml")
                        elif task.endswith("T92") or task.endswith("FULL"):
                            inputFiles[dataset] = os.path.join(
                                corpusDir, "DDI13-test-task9.2.xml")
                elif task == "ID11" and dataset == "train":
                    inputFiles[dataset] = Catenate.catenate(
                        [
                            os.path.join(corpusDir, "ID11-train.xml"),
                            os.path.join(corpusDir, "GE11-devel.xml"),
                            os.path.join(corpusDir, "GE11-train.xml")
                        ],
                        "training/ID11-train-and-GE11-devel-and-train.xml.gz",
                        fast=True)
                else:
                    inputFiles[dataset] = os.path.join(
                        corpusDir,
                        task.replace("-FULL", "") + "-" + dataset + ".xml")

            if inputFiles[dataset] == "None":
                inputFiles[dataset] = None
            if inputFiles[dataset] != None and not os.path.exists(
                    inputFiles[dataset]):
                fullPath = os.path.join(Settings.CORPUS_DIR,
                                        inputFiles[dataset])
                if os.path.exists(fullPath):
                    inputFiles[dataset] = fullPath
                else:
                    inputFiles[dataset] = None
                    print >> sys.stderr, "Input file", inputFiles[
                        dataset], "for set '" + dataset + "' does not exist, skipping."
        assert inputFiles["train"] != None  # at least training set must exist
        # Example generation parameters
        if task == "CO11":
            detector = "Detectors.CODetector"
        elif task in [
                "BI11-FULL", "DDI11-FULL", "DDI13-FULL", "BB_EVENT_16-FULL"
        ]:
            detector = "Detectors.EventDetector"
        elif task.startswith("DDI13"):
            if task.endswith("T91"):
                detector = "Detectors.EntityDetector"
            elif task.endswith("T92"):
                detector = "Detectors.EdgeDetector"

        #######################################################################
        # BioNLP Shared Task and preprocessing parameters
        #######################################################################
        if task == "BI11-FULL":
            bioNLPSTParams = Parameters.cat(
                bioNLPSTParams, "convert:scores",
                "BioNLP Shared Task / " + fullTaskId, ["default"]
            )  # the shared task evaluator is not designed for predicted entities
        elif task == "REL11":
            bioNLPSTParams = Parameters.cat(
                bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel",
                "BioNLP Shared Task / " + fullTaskId, ["default"])
        elif task in ("BB_EVENT_16", "BB_EVENT_16-FULL", "BB_EVENT_NER_16",
                      "SDB16"):
            bioNLPSTParams = Parameters.cat(
                bioNLPSTParams, "convert=zip",
                "BioNLP Shared Task / " + fullTaskId, ["default"])
        elif task not in [
                "DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL"
        ]:
            bioNLPSTParams = Parameters.cat(
                bioNLPSTParams, "convert:evaluate:scores",
                "BioNLP Shared Task / " + fullTaskId, ["default"])

        #######################################################################
        # Preprocessing parameters
        #######################################################################
        if task in [
                "BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL", "DDI13T91",
                "DDI13T92", "DDI13-FULL"
        ]:
            Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS",
                           preprocessorParams, "Preprocessor /" + fullTaskId,
                           ["default"])
        else:  # parse only sentences where BANNER found an entity
            Parameters.cat(
                "intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities",
                preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])

        #######################################################################
        # Example style parameters
        #######################################################################
        # Example style parameters for single-stage tasks #####################
        msg = "Single-stage example style / " + fullTaskId
        if task == "REN11":
            exampleStyles["examples"] = Parameters.cat(
                "undirected:bacteria_renaming:maskTypeAsProtein=Gene",
                exampleStyles["examples"], msg)
        elif task == "DDI11":
            exampleStyles["examples"] = Parameters.cat(
                "drugbank_features:ddi_mtmx:filter_shortest_path=conj_and",
                exampleStyles["examples"], msg)
        elif task.startswith("DDI13"):
            if task.endswith("T91"):
                exampleStyles["examples"] = Parameters.cat(
                    "names:build_for_nameless:ddi13_features:drugbank_features",
                    exampleStyles["examples"], msg)
            elif task.endswith("T92"):
                exampleStyles["examples"] = Parameters.cat(
                    "keep_neg:drugbank_features:filter_shortest_path=conj_and",
                    exampleStyles["examples"], msg)
        elif task == "BI11":
            exampleStyles["examples"] = Parameters.cat(
                "bi_features", exampleStyles["examples"], msg)
        elif task == "BB_EVENT_16":
            exampleStyles["examples"] = Parameters.cat(
                "keep_neg", exampleStyles["examples"], msg
            )  #exampleStyles["examples"] = Parameters.cat("linear_features:keep_neg", exampleStyles["examples"], msg)
        elif task == "SDB16":
            exampleStyles["examples"] = Parameters.cat(
                "sdb_merge:sdb_features", exampleStyles["examples"], msg)
        # Edge style ##########################################################
        msg = "Edge example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["edge"] = Parameters.cat(
                "genia_features:genia_task1", exampleStyles["edge"], msg)
        elif task in ["GE09", "GE11", "GE13"]:
            exampleStyles["edge"] = Parameters.cat("genia_features",
                                                   exampleStyles["edge"], msg)
        elif task == "REL11":
            exampleStyles["edge"] = Parameters.cat("rel_features",
                                                   exampleStyles["edge"], msg)
        elif task == "DDI11-FULL":
            exampleStyles["edge"] = Parameters.cat(
                "drugbank_features:filter_shortest_path=conj_and",
                exampleStyles["edge"], msg)
        elif task == "DDI13-FULL":
            exampleStyles["edge"] = Parameters.cat(
                "keep_neg:drugbank_features:filter_shortest_path=conj_and",
                exampleStyles["edge"], msg)
        elif task == "CO11":
            exampleStyles["edge"] = Parameters.cat("co_features",
                                                   exampleStyles["edge"], msg)
        elif task == "BI11-FULL":
            exampleStyles["edge"] = Parameters.cat("bi_features",
                                                   exampleStyles["edge"], msg)
        # Trigger style #######################################################
        msg = "Trigger example style / " + fullTaskId
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["trigger"] = Parameters.cat("genia_task1",
                                                      exampleStyles["trigger"],
                                                      msg)
        elif task in ["EPI11", "PC13"]:
            exampleStyles["trigger"] = Parameters.cat("epi_merge_negated",
                                                      exampleStyles["trigger"],
                                                      msg)
        elif task == "BB11":  # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features",
                                                      exampleStyles["trigger"],
                                                      msg)
        elif task == "BB13T3":  # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features",
                                                      exampleStyles["trigger"],
                                                      msg)
        elif task == "REL11":
            exampleStyles["trigger"] = Parameters.cat("rel_features",
                                                      exampleStyles["trigger"],
                                                      msg)
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            exampleStyles["trigger"] = "names:build_for_nameless"
        elif task == "DDI13-FULL":
            exampleStyles["trigger"] = "names:build_for_nameless:ddi13_features:drugbank_features"
        elif task == "BB_EVENT_16-FULL":
            exampleStyles["trigger"] = Parameters.cat(
                "bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens:only_types=Bacteria,Habitat,Geographical",
                exampleStyles["trigger"], msg)
        elif task in "BB_EVENT_NER_16":
            exampleStyles["trigger"] = Parameters.cat(
                "bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens",
                exampleStyles["trigger"], msg)

        #######################################################################
        # Classifier parameters
        #######################################################################
        if task == "DDI11":
            classifierParameters["examples"] = Parameters.cat(
                "c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold",
                classifierParameters["examples"],
                "Classifier parameters for single-stage examples" + fullTaskId)
        #elif task == "DDI13":
        #    classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
        elif task == "CO11":
            classifierParameters["edge"] = Parameters.cat(
                "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                classifierParameters["edge"],
                "Classifier parameters for edges / " + fullTaskId)
            classifierParameters["trigger"] = Parameters.cat(
                "c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000",
                classifierParameters["trigger"],
                "Classifier parameters for triggers / " + fullTaskId)
            classifierParameters["recall"] = Parameters.cat(
                "0.8,0.9,0.95,1.0", classifierParameters["recall"],
                "Recall adjust / " + fullTaskId)
        elif task == "BB_EVENT_16":
            classifierParameters["examples"] = Parameters.cat(
                "c=10,20,30,40,50,60,70,80,100,110,115,120,125,130,140,150,200,500,1000,2000,3000,4000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000",
                classifierParameters["examples"],
                "Classifier parameters for edges / " + fullTaskId)
        elif task in ("BB_EVENT_16-FULL", "BB_EVENT_NER_16"):
            classifierParameters["edge"] = Parameters.cat(
                "c=10,20,50,80,100,110,115,120,125,130,140,150,200,500,1000",
                classifierParameters["edge"],
                "Classifier parameters for edges / " + fullTaskId)
        elif task == "SDB16":
            classifierParameters["examples"] = Parameters.cat(
                "c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000,80000,100000,150000",
                classifierParameters["examples"],
                "Classifier parameters for single-stage examples / " +
                fullTaskId)
        # Training fold parameters ############################################
        if task.startswith("DDI13"):
            folds["devel"] = ["train1", "train2", "train3", "train4"]
            folds["train"] = ["train5", "train6", "train7", "train8", "train9"]

    return detector, bioNLPSTParams, preprocessorParams, folds
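The task defaults above are expressed as ":"-separated parameter strings, where a bare item acts as a boolean flag and "key=value" items carry values (comma-separated for value lists). The standalone sketch below, with the hypothetical helper parseParamString, only illustrates that format; it is not the actual Parameters module.

# Hypothetical helper illustrating the ":"-separated parameter format above;
# a simplified sketch, not the real Parameters.get implementation.
def parseParamString(paramString):
    params = {}
    if not paramString:
        return params
    for item in paramString.split(":"):
        if "=" in item:
            key, value = item.split("=", 1)
            params[key] = value.split(",") if "," in value else value
        else:
            params[item] = True  # a bare item is a boolean switch
    return params

# e.g. a classifier parameter string with a value list and a flag:
# parseParamString("c=10,100,1000:TEES.threshold")
# -> {"c": ["10", "100", "1000"], "TEES.threshold": True}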
Beispiel #45
0
    def doGrid(self):
        print >> sys.stderr, "--------- Parameter grid search ---------"
        # Build trigger examples
        self.triggerDetector.buildExamples(
            self.model, [self.optData],
            [self.workDir + "grid-trigger-examples"])

        if self.fullGrid:
            stepParams = {
                "trigger":
                Parameters.get(self.model.getStr(self.triggerDetector.tag +
                                                 "classifier-parameters-train",
                                                 defaultIfNotExist=""),
                               valueListKey="c"),
                "booster":
                [float(i) for i in self.recallAdjustParameters.split(",")],
                "edge":
                Parameters.get(self.model.getStr(self.edgeDetector.tag +
                                                 "classifier-parameters-train",
                                                 defaultIfNotExist=""),
                               valueListKey="c")
            }
        else:
            stepParams = {
                "trigger":
                Parameters.get(self.model.getStr(self.triggerDetector.tag +
                                                 "classifier-parameter",
                                                 defaultIfNotExist=""),
                               valueListKey="c"),
                "booster":
                [float(i) for i in self.recallAdjustParameters.split(",")],
                "edge":
                Parameters.get(self.model.getStr(self.edgeDetector.tag +
                                                 "classifier-parameter",
                                                 defaultIfNotExist=""),
                               valueListKey="c")
            }

        for step in ["trigger", "edge"]:
            stepParams[step] = Parameters.getCombinations(stepParams[step])
            for i in range(len(stepParams[step])):
                stepParams[step][i] = Parameters.toString(stepParams[step][i])
        print >> sys.stderr, "Parameters", [
            stepParams[x] for x in ["trigger", "booster", "edge"]
        ]
        paramCombinations = combine(
            *[stepParams[x] for x in ["trigger", "booster", "edge"]])
        print >> sys.stderr, "Combinations", paramCombinations
        for i in range(len(paramCombinations)):
            paramCombinations[i] = {
                "trigger": paramCombinations[i][0],
                "booster": paramCombinations[i][1],
                "edge": paramCombinations[i][2]
            }

        #paramCombinations = Parameters.getCombinations(ALL_PARAMS, ["trigger", "booster", "edge"])
        prevParams = None
        EDGE_MODEL_STEM = os.path.join(
            self.edgeDetector.workDir,
            os.path.normpath(self.model.path) + "-edge-models/model")
        TRIGGER_MODEL_STEM = os.path.join(
            self.triggerDetector.workDir,
            os.path.normpath(self.model.path) + "-trigger-models/model")
        self.structureAnalyzer.load(self.model)
        bestResults = None
        for i in range(len(paramCombinations)):
            params = paramCombinations[i]
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            print >> sys.stderr, "Processing params", str(i + 1) + "/" + str(
                len(paramCombinations)), params
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            # Triggers and Boost (the trigger predictions are recalculated only when the relevant parameters change)
            if (prevParams == None) or (
                    prevParams["trigger"] != params["trigger"]) or (
                        prevParams["booster"] != params["booster"]):
                print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(
                    params["trigger"]), "booster:" + str(params["booster"])
                xml = self.triggerDetector.classifyToXML(
                    self.optData,
                    self.model,
                    self.workDir + "grid-trigger-examples",
                    self.workDir + "grid-",
                    classifierModel=TRIGGER_MODEL_STEM +
                    Parameters.toId(params["trigger"]),
                    recallAdjust=params["booster"],
                    useExistingExamples=True)
            prevParams = params
            ## Build edge examples
            #self.edgeDetector.buildExamples(self.model, [xml], [self.workDir+"grid-edge-examples"], [self.optData])
            # Classify with pre-defined model
            edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId(
                params["edge"])
            xml = self.edgeDetector.classifyToXML(
                xml,
                self.model,
                self.workDir + "grid-edge-examples",
                self.workDir + "grid-",
                classifierModel=edgeClassifierModel,
                goldData=self.optData)
            bestResults = self.evaluateGrid(xml, params, bestResults)
        # Remove remaining intermediate grid files
        for tag1 in ["edge", "trigger", "unmerging"]:
            for tag2 in ["examples", "pred.xml.gz"]:
                if os.path.exists(self.workDir + "grid-" + tag1 + "-" + tag2):
                    os.remove(self.workDir + "grid-" + tag1 + "-" + tag2)
        print >> sys.stderr, "Parameter grid search complete"
        print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
        print >> sys.stderr, "Best parameters:", bestResults[0]
        print >> sys.stderr, "Best result:", bestResults[2]  # f-score
        # Save grid model
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]),
                     self.model)
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]),
                     self.combinedModel, False)
        if self.fullGrid:  # define best models
            self.triggerDetector.addClassifierModel(
                self.model,
                TRIGGER_MODEL_STEM + str(bestResults[0]["trigger"]),
                bestResults[0]["trigger"])
            self.edgeDetector.addClassifierModel(
                self.model, EDGE_MODEL_STEM + str(bestResults[0]["edge"]),
                bestResults[0]["edge"])
        # Remove work files
        for stepTag in [
                self.workDir + "grid-trigger", self.workDir + "grid-edge",
                self.workDir + "grid-unmerging"
        ]:
            for fileStem in [
                    "-classifications", "-classifications.log", "examples.gz",
                    "pred.xml.gz"
            ]:
                if os.path.exists(stepTag + fileStem):
                    os.remove(stepTag + fileStem)
Beispiel #46
0
 def process(self,
             input,
             output,
             parameters=None,
             model=None,
             fromStep=None,
             toStep=None,
             omitSteps=None):
     self.initVariables(source=input,
                        xml=input,
                        outDir=os.path.dirname(output))
     if os.path.basename(output) != "":
         self.intermediateFileTag = os.path.basename(output)
     else:
         self.intermediateFileTag = ""
     self.enterState(self.STATE_TOOLCHAIN, [x[0] for x in self.steps],
                     fromStep, toStep, omitSteps)
     parameters = self.getParameters(parameters,
                                     model,
                                     defaultValue=NOTHING)
     self.applyParameters(parameters)
     # Run the tools
     print >> sys.stderr, "Tool chain parameters:", Parameters.toString(
         parameters,
         skipKeysWithValues=[NOTHING],
         skipDefaults=self.getDefaultParameters())
     if os.path.exists(output) and not os.path.isdir(output):
         print >> sys.stderr, "Removing existing preprocessor output file", output
         os.remove(output)
     savedIntermediate = None  # Output from a previous step if "fromStep" is used
     for step in self.steps:
         if self.checkStep(step[0]):
             if savedIntermediate != None:  # A previous run of the program saved an intermediate file
                 print >> sys.stderr, "Reading input from saved intermediate file", savedIntermediate
                 self.xml = ETUtils.ETFromObj(savedIntermediate)
                 savedIntermediate = None
             stepArgs = copy.copy(step[2])  # make a copy of the arguments to which i/o can be added
             stepArgs[step[4]["input"]] = self.xml  # the input
             if self.getIntermediateFilePath(step) != None:  # this step should save an intermediate file
                 stepArgs[step[4]["output"]] = self.getIntermediateFilePath(step)
             print >> sys.stderr, "Running step", step[0], "with arguments", stepArgs
             step[1](**stepArgs)  # call the tool
         elif self.getStepStatus(step[0]) == "BEFORE":  # this step was run earlier
             savedIntermediate = self.getIntermediateFilePath(step)
     # End state and return
     xml = self.xml  # state-specific member variable self.xml will be removed when exiting state
     self.exitState()
     if self.state == None:  # if the whole toolchain has finished, return the final product
         if not os.path.isdir(output):  # if output is a directory, it was given only for storing intermediate files ...
             ETUtils.write(xml, output)  # ... otherwise, save the final output
         return xml
     else:
         return None
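The loop in process() above follows a simple chaining pattern: each step declares its callable, its fixed arguments, and which argument names receive the chained input and the optional intermediate-output path. The simplified, self-contained sketch below shows the pattern; the step layout and names are illustrative, not the actual Preprocessor internals.

def runToolChain(steps, data):
    # steps: list of (name, tool, fixedArgs, ioKeys) tuples
    for name, tool, fixedArgs, ioKeys in steps:
        args = dict(fixedArgs)        # copy so i/o can be added per run
        args[ioKeys["input"]] = data  # chain the previous step's output
        data = tool(**args)           # here the tool simply returns its result
    return data

# Example with two trivial "tools"
steps = [
    ("UPPERCASE", lambda text: text.upper(), {}, {"input": "text"}),
    ("STRIP", lambda text: text.strip(), {}, {"input": "text"}),
]
assert runToolChain(steps, "  hello  ") == "HELLO"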
Beispiel #47
0
    def doGrid(self):
        print >> sys.stderr, "--------- Parameter grid search ---------"
        # Build trigger examples
        self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir+"grid-trigger-examples.gz"])

        if self.fullGrid:
            stepParams = {
                "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameters-train", defaultIfNotExist=""), valueListKey="c"),
                "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
                "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameters-train", defaultIfNotExist=""), valueListKey="c")}
        else:
            stepParams = {
                "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameter", defaultIfNotExist=""), valueListKey="c"),
                "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
                "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameter", defaultIfNotExist=""), valueListKey="c")}
        
        for step in ["trigger", "edge"]:
            stepParams[step] = Parameters.getCombinations(stepParams[step])
            for i in range(len(stepParams[step])):
                stepParams[step][i] = Parameters.toString(stepParams[step][i])
        print >> sys.stderr, [stepParams[x] for x in ["trigger", "booster", "edge"]]
        paramCombinations = combine(*[stepParams[x] for x in ["trigger", "booster", "edge"]])
        print >> sys.stderr, paramCombinations
        for i in range(len(paramCombinations)):
            paramCombinations[i] = {"trigger":paramCombinations[i][0], "booster":paramCombinations[i][1], "edge":paramCombinations[i][2]}
        
        #paramCombinations = Parameters.getCombinations(ALL_PARAMS, ["trigger", "booster", "edge"])
        prevParams = None
        EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path)+"-edge-models/model")
        TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path)+"-trigger-models/model")
        self.structureAnalyzer.load(self.model)
        bestResults = None
        for i in range(len(paramCombinations)):
            params = paramCombinations[i]
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            print >> sys.stderr, "Processing params", str(i+1) + "/" + str(len(paramCombinations)), params
            print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            # Triggers and Boost
            if prevParams == None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]:
                print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
                xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir+"grid-trigger-examples", self.workDir+"grid-", classifierModel=TRIGGER_MODEL_STEM + Parameters.toId(params["trigger"]), recallAdjust=params["booster"])
            prevParams = params
            ## Build edge examples
            #self.edgeDetector.buildExamples(self.model, [xml], [self.workDir+"grid-edge-examples"], [self.optData])
            # Classify with pre-defined model
            edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId(params["edge"])
            xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir+"grid-edge-examples", self.workDir+"grid-", classifierModel=edgeClassifierModel, goldData=self.optData)
            bestResults = self.evaluateGrid(xml, params, bestResults)
        # Remove remaining intermediate grid files
        for tag1 in ["edge", "trigger", "unmerging"]:
            for tag2 in ["examples", "pred.xml.gz"]:
                if os.path.exists(self.workDir+"grid-"+tag1+"-"+tag2):
                    os.remove(self.workDir+"grid-"+tag1+"-"+tag2)
        print >> sys.stderr, "Parameter grid search complete"
        print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
        print >> sys.stderr, "Best parameters:", bestResults[0]
        print >> sys.stderr, "Best result:", bestResults[2] # f-score
        # Save grid model
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model)
        self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
        if self.fullGrid: # define best models
            self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM+str(bestResults[0]["trigger"]), bestResults[0]["trigger"])
            self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM+str(bestResults[0]["edge"]), bestResults[0]["edge"])
        # Remove work files
        for stepTag in [self.workDir+"grid-trigger", self.workDir+"grid-edge", self.workDir+"grid-unmerging"]:
            for fileStem in ["-classifications", "-classifications.log", "examples.gz", "pred.xml.gz"]:
                if os.path.exists(stepTag+fileStem):
                    os.remove(stepTag+fileStem)
Beispiel #48
0
    def train(self,
              examples,
              outDir,
              parameters,
              classifyExamples=None,
              finishBeforeReturn=False,
              replaceRemoteExamples=True,
              dummy=False):
        outDir = os.path.abspath(outDir)

        examples = self.getExampleFile(examples,
                                       replaceRemote=replaceRemoteExamples,
                                       dummy=dummy)
        classifyExamples = self.getExampleFile(
            classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy)
        #parameters = Parameters.get(parameters, valueListKey="c")
        trainDir = os.path.normpath(
            self.connection.getSetting(self.trainDirSetting)) + os.path.sep

        # Return a new classifier instance for following the training process and using the model
        classifier = copy.copy(self)
        classifier.setState("TRAIN")
        classifier.parameters = parameters
        classifier._filesToRelease = [examples, classifyExamples]
        # Train
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        #trainCommand = os.path.join(trainDir, self.trainCommand)
        trainCommand = self.trainCommand.replace("%d", trainDir)
        parameters = Parameters.get(parameters,
                                    self.parameterDefaults["train"],
                                    self.parameterAllowNew["train"],
                                    self.parameterValueListKey["train"],
                                    self.parameterValueLimits["train"],
                                    self.parameterValueTypes["train"])
        paramString, idStr = self._getParameterString(parameters)
        classifier.parameterIdStr = idStr
        classifier.model = self.connection.getRemotePath(
            outDir + "/model" + idStr, True)
        modelPath = self.connection.getRemotePath(outDir + "/model" + idStr,
                                                  False)
        trainCommand = trainCommand.replace("%p", paramString).replace(
            "%e", examples).replace("%m", modelPath).strip()
        self.connection.addCommand(trainCommand)
        # Classify with the trained model (optional)
        if classifyExamples != None:
            classifier.predictions = self.connection.getRemotePath(
                outDir + "/predictions" + idStr, True)
            predictionsPath = self.connection.getRemotePath(
                outDir + "/predictions" + idStr, False)
            classifyDir = os.path.normpath(
                self.connection.getSetting(
                    self.classifyDirSetting)) + os.path.sep
            classifyCommand = self.classifyCommand.replace(
                "%d", classifyDir).replace("%e", classifyExamples).replace(
                    "%m", modelPath).replace("%c", predictionsPath).strip()
            self.connection.addCommand(classifyCommand)
        # Run the process
        jobName = self.trainCommand.split()[0].replace("%d", "") + idStr
        logPath = outDir + "/" + jobName
        if dummy:  # return a classifier that connects to an existing job
            self.connection.clearCommands()
            classifier._job = self.connection.getJob(jobDir=outDir,
                                                     jobName=jobName)
        else:  # submit the job
            classifier._job = self.connection.submit(jobDir=outDir,
                                                     jobName=jobName,
                                                     stdout=logPath +
                                                     ".stdout")
            if finishBeforeReturn:
                self.connection.waitForJob(classifier._job)
                self.getStatus()
        return classifier
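The train() method above builds its shell command by filling placeholders in a template string. A short sketch of that substitution with made-up paths and a made-up template; the placeholder meanings follow the code (%d tool directory, %p parameter string, %e example file, %m model path).

# Illustrative values only; the real template comes from self.trainCommand
trainCommandTemplate = "%dsvm_multiclass_learn %p %e %m"
trainDir = "/opt/svm_multiclass/"      # normalized directory plus path separator
paramString = "-c 1000"
examples = "work/train-examples.gz"
modelPath = "work/model-c_1000"

trainCommand = trainCommandTemplate.replace("%d", trainDir)
trainCommand = (trainCommand.replace("%p", paramString)
                            .replace("%e", examples)
                            .replace("%m", modelPath).strip())
# -> "/opt/svm_multiclass/svm_multiclass_learn -c 1000 work/train-examples.gz work/model-c_1000"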
Beispiel #49
0
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams, 
                    inputFiles, exampleStyles, classifierParameters):
    if task != None:
        print >> sys.stderr, "*** Defining training settings for task", task, "***"
        fullTaskId = task
        subTask = 2
        if "." in task:
            task, subTask = task.split(".")
            subTask = int(subTask)
        dataPath = Settings.CORPUS_DIR
        for dataset in ["devel", "train", "test"]:
            if inputFiles[dataset] == None and inputFiles[dataset] != "None":
                inputFiles[dataset] = os.path.join(dataPath, task.replace("-FULL", "") + "-"+dataset+".xml")
            if task == "ID11" and dataset == "train":
                inputFiles[dataset] = Catenate.catenate([os.path.join(dataPath, "ID11-train.xml"), os.path.join(dataPath, "GE11-devel.xml"),
                                                         os.path.join(dataPath, "GE11-train.xml")], "training/ID11-train-and-GE11-devel-and-train.xml.gz", fast=True)
            if inputFiles[dataset] == "None":
                inputFiles[dataset] = None
            if inputFiles[dataset] != None and not os.path.exists(inputFiles[dataset]):
                print >> sys.stderr, "Input file", inputFiles[dataset], "for set '" + dataset + "' does not exist, skipping."
                inputFiles[dataset] = None
        assert inputFiles["train"] != None # at least training set must exist
        # Example generation parameters
        if task == "CO11":
            detector = "Detectors.CODetector"
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            detector = "Detectors.EventDetector"
        
        # BioNLP Shared Task and preprocessing parameters
        if task == "BI11-FULL":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:scores", "BioNLP Shared Task / " + fullTaskId, ["default"]) # the shared task evaluator is not designed for predicted entities
        elif task == "REL11":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", "BioNLP Shared Task / " + fullTaskId, ["default"])
        elif task not in ["DDI11", "DDI11-FULL", "DDI13"]:
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores", "BioNLP Shared Task / " + fullTaskId, ["default"])
        
        # Preprocessing parameters
        if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL"]:
            Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
        else: # parse only sentences where BANNER found an entity
            Parameters.cat("intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
        
        # Example style parameters for single-stage tasks
        if task == "REN11":
            exampleStyles["examples"] = Parameters.cat("undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], "Single-stage example style / " + fullTaskId)
        elif task == "DDI11":
            exampleStyles["examples"] = Parameters.cat("drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], "Single-stage example style / " + fullTaskId)
        elif task == "DDI13":
            exampleStyles["examples"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], "Single-stage example style / " + fullTaskId)
        elif task == "BI11":
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], "Edge example style / " + fullTaskId)
        # Edge style
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["edge"] = Parameters.cat("genia_features:genia_task1", exampleStyles["edge"])
        elif task in ["GE09", "GE11", "GE13"]:
            exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"])
        elif task == "REL11":
            exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], "Edge example style / " + fullTaskId)
        elif task == "DDI11-FULL":
            exampleStyles["edge"] = Parameters.cat("drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], "Edge example style / " + fullTaskId)
        elif task == "CO11":
            exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], "Edge example style / " + fullTaskId)
        elif task == "BI11-FULL":
            exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], "Edge example style / " + fullTaskId)
        # Trigger style
        if task in ["GE09", "GE11", "GE13"] and subTask == 1:
            exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], "Trigger example style / " + fullTaskId)
        elif task in ["EPI11", "PC13"]:
            exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], "Trigger example style / " + fullTaskId)
        elif task == "BB11": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features:build_for_nameless", exampleStyles["trigger"], "Trigger example style / " + fullTaskId)
        elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet"
            exampleStyles["trigger"] = Parameters.cat("bb_features:build_for_nameless", exampleStyles["trigger"], "Trigger example style / " + fullTaskId)
        elif task == "REL11":
            exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], "Trigger example style / " + fullTaskId)
        elif task in ["BI11-FULL", "DDI11-FULL"]:
            exampleStyles["trigger"] = "build_for_nameless:names"        
        # Classifier parameters
        if task == "DDI11":
            classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
        #elif task == "DDI13":
        #    classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
        elif task == "CO11":
            classifierParameters["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["examples"], "Classifier parameters for edges / " + fullTaskId)
            classifierParameters["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["examples"], "Classifier parameters for triggers / " + fullTaskId)
            classifierParameters["recall"] = Parameters.cat("0.8,0.9,0.95,1.0", classifierParameters["recall"], "Recall adjust / " + fullTaskId)
    
    return detector, bioNLPSTParams, preprocessorParams
Beispiel #50
0
def train(output, task=None, detector=None, inputFiles=None, models=None, parse=None,
          processUnmerging=None, processModifiers=None, isSingleStage=False, 
          bioNLPSTParams=None, preprocessorParams=None, exampleStyles=None, 
          classifierParams=None,  doFullGrid=False, deleteOutput=False, copyFrom=None, 
          log="log.txt", step=None, omitSteps=None, debug=False, connection=None):
    """
    Train a new model for event or relation detection.
    
    @param output: A directory where output files will appear.
    @param task: If defined, overridable default settings are used for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: a Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and, "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and, "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param isSingleStage: False for EventDetector, True for a single stage detector.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None is for no logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    """
    # Insert default arguments where needed
    inputFiles = Parameters.get(inputFiles, {"train":None, "devel":None, "test":None})
    models = Parameters.get(models, {"devel":None, "test":None})
    exampleStyles = Parameters.get(exampleStyles, {"examples":None, "trigger":None, "edge":None, "unmerging":None, "modifiers":None})
    classifierParams = Parameters.get(classifierParams, {"examples":None, "trigger":None, "recall":None, "edge":None, "unmerging":None, "modifiers":None})
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    detector, processUnmerging, processModifiers, isSingleStage, bioNLPSTParams, preprocessorParams, exampleStyles, classifierParams, removeNamesFromEmpty = getTaskSettings(task, 
        detector, processUnmerging, processModifiers, isSingleStage, bioNLPSTParams, preprocessorParams, inputFiles, exampleStyles, classifierParams)   
    if task != None: task = task.replace("-MINI", "").replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])
    
    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector() # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()
    
    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if isSingleStage:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["examples"], classifierParams["examples"], parse, None, task,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        else:
            detector.train(inputFiles["train"], inputFiles["devel"], models["devel"], models["test"],
                           exampleStyles["trigger"], exampleStyles["edge"], exampleStyles["unmerging"], exampleStyles["modifiers"],
                           classifierParams["trigger"], classifierParams["edge"], classifierParams["unmerging"], classifierParams["modifiers"],
                           classifierParams["recall"], processUnmerging, processModifiers, 
                           doFullGrid, task, parse, None,
                           fromStep=detectorSteps["TRAIN"], workDir="training")
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr("preprocessorParams", Parameters.toString(preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(inputFiles["devel"], models["devel"], "classification-devel/devel", goldData=inputFiles["devel"], fromStep=detectorSteps["DEVEL"], workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        detector.classify(getEmptyCorpus(inputFiles["devel"], removeNames=removeNamesFromEmpty), models["devel"], "classification-empty/devel-empty", fromStep=detectorSteps["EMPTY"], workDir="classification-empty")
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles["test"], "does not exist"
        else:
            detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"], models["test"], "classification-test/test", fromStep=detectorSteps["TEST"], workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare("classification-test/test-events.tar.gz", "classification-devel/devel-events.tar.gz", "a2")
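A hypothetical invocation of the train() function above, assuming the GE11 corpus files are already installed so that the task defaults can locate them; the output directory and log name are made up for illustration.

train("output/GE11-model",
      task="GE11.1",        # task id with subtask 1, split on "." as in getTaskSettings
      doFullGrid=False,     # grid-search only the recall adjustment
      log="train-log.txt",
      debug=True)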
Beispiel #51
0
 def trainUnmergingDetector(self):
     xml = None
     if not self.unmerging:
         print >> sys.stderr, "No unmerging"
     if self.checkStep("SELF-TRAIN-EXAMPLES-FOR-UNMERGING",
                       self.unmerging) and self.unmerging:
         # Self-classified train data for unmerging
         if self.doUnmergingSelfTraining:
             # This allows limiting to a subcorpus
             triggerStyle = copy.copy(
                 Parameters.get(self.triggerExampleStyle))
             edgeStyle = copy.copy(Parameters.get(self.edgeExampleStyle))
             unmergingStyle = Parameters.get(self.unmergingExampleStyle)
             if "sentenceLimit" in unmergingStyle and unmergingStyle[
                     "sentenceLimit"]:
                 triggerStyle["sentenceLimit"] = unmergingStyle[
                     "sentenceLimit"]
                 edgeStyle["sentenceLimit"] = unmergingStyle[
                     "sentenceLimit"]
             # Build the examples
             xml = self.triggerDetector.classifyToXML(
                 self.trainData,
                 self.model,
                 None,
                 self.workDir + "unmerging-extra-",
                 exampleStyle=triggerStyle)  #, recallAdjust=0.5)
             xml = self.edgeDetector.classifyToXML(
                 xml,
                 self.model,
                 None,
                 self.workDir + "unmerging-extra-",
                 exampleStyle=edgeStyle)  #, recallAdjust=0.5)
             assert xml != None
             EvaluateInteractionXML.run(self.edgeDetector.evaluator, xml,
                                        self.trainData, self.parse)
         else:
             print >> sys.stderr, "No self-training for unmerging"
     if self.checkStep("UNMERGING-EXAMPLES",
                       self.unmerging) and self.unmerging:
         # Unmerging example generation
         GOLD_TEST_FILE = self.optData.replace("-nodup", "")
         GOLD_TRAIN_FILE = self.trainData.replace("-nodup", "")
         if self.doUnmergingSelfTraining:
             if xml == None:
                 xml = self.workDir + "unmerging-extra-edge-pred.xml.gz"
             self.unmergingDetector.buildExamples(
                 self.model, [
                     self.optData.replace("-nodup", ""),
                     [self.trainData.replace("-nodup", ""), xml]
                 ], [
                     self.workDir + "unmerging-opt-examples.gz",
                     self.workDir + "unmerging-train-examples.gz"
                 ], [GOLD_TEST_FILE, [GOLD_TRAIN_FILE, GOLD_TRAIN_FILE]],
                 exampleStyle=self.unmergingExampleStyle,
                 saveIdsToModel=True)
             xml = None
         else:
             self.unmergingDetector.buildExamples(
                 self.model, [
                     self.optData.replace("-nodup", ""),
                     self.trainData.replace("-nodup", "")
                 ], [
                     self.workDir + "unmerging-opt-examples.gz",
                     self.workDir + "unmerging-train-examples.gz"
                 ], [GOLD_TEST_FILE, GOLD_TRAIN_FILE],
                 exampleStyle=self.unmergingExampleStyle,
                 saveIdsToModel=True)
             xml = None
         #UnmergingExampleBuilder.run("/home/jari/biotext/EventExtension/TrainSelfClassify/test-predicted-edges.xml", GOLD_TRAIN_FILE, UNMERGING_TRAIN_EXAMPLE_FILE, PARSE, TOK, UNMERGING_FEATURE_PARAMS, UNMERGING_IDS, append=True)
     if self.checkStep("BEGIN-UNMERGING-MODEL",
                       self.unmerging) and self.unmerging:
         self.unmergingDetector.beginModel(
             None, self.model, self.workDir + "unmerging-train-examples.gz",
             self.workDir + "unmerging-opt-examples.gz")
     if self.checkStep("END-UNMERGING-MODEL",
                       self.unmerging) and self.unmerging:
         self.unmergingDetector.endModel(
             None, self.model, self.workDir + "unmerging-opt-examples.gz")
         print >> sys.stderr, "Adding unmerging classifier model to test-set event model"
         if self.combinedModel != None:
             self.combinedModel.addStr(
                 "unmerging-example-style",
                 self.model.getStr("unmerging-example-style"))
             self.combinedModel.insert(
                 self.model.get("unmerging-ids.classes"),
                 "unmerging-ids.classes")
             self.combinedModel.insert(
                 self.model.get("unmerging-ids.features"),
                 "unmerging-ids.features")
             self.unmergingDetector.addClassifierModel(
                 self.combinedModel,
                 self.model.get("unmerging-classifier-model", True),
                 self.model.getStr("unmerging-classifier-parameter"))
             self.combinedModel.save()
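In trainUnmergingDetector above, the trigger and edge styles are shallow-copied before the unmerging style's sentenceLimit is pushed into them, so the shared style dictionaries stay untouched. A small standalone sketch of that pattern; the style contents are illustrative.

import copy

triggerStyle = {"genia_features": True}
unmergingStyle = {"sentenceLimit": 500}

limitedTriggerStyle = copy.copy(triggerStyle)   # work on a copy, not the shared dict
if unmergingStyle.get("sentenceLimit"):
    limitedTriggerStyle["sentenceLimit"] = unmergingStyle["sentenceLimit"]

assert "sentenceLimit" not in triggerStyle      # original left unmodified
assert limitedTriggerStyle["sentenceLimit"] == 500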
Beispiel #52
0
    def train(self,
              examples,
              outDir,
              parameters,
              classifyExamples=None,
              finishBeforeReturn=False,
              replaceRemoteExamples=True,
              dummy=False):
        outDir = os.path.abspath(outDir)

        examples = self.getExampleFile(examples,
                                       replaceRemote=replaceRemoteExamples,
                                       dummy=dummy)
        classifyExamples = self.getExampleFile(
            classifyExamples, replaceRemote=replaceRemoteExamples, dummy=dummy)
        parameters = Parameters.get(parameters, valueListKey="c")
        svmMulticlassDir = self.connection.getSetting("SVM_MULTICLASS_DIR")

        # Return a new classifier instance for following the training process and using the model
        classifier = copy.copy(self)
        classifier.setState("TRAIN")
        classifier.parameters = parameters
        # Train
        if not os.path.exists(outDir):
            os.makedirs(outDir)
        trainCommand = svmMulticlassDir + "/svm_multiclass_learn "
        paramKeys = sorted(parameters.keys())
        idStr = ""
        for key in paramKeys:
            trainCommand += "-" + str(key) + " "
            idStr += "-" + str(key)
            if parameters[key] != None:
                trainCommand += str(parameters[key]) + " "
                idStr += "_" + str(parameters[key])
        classifier.parameterIdStr = idStr
        classifier.model = self.connection.getRemotePath(
            outDir + "/model" + idStr, True)
        modelPath = self.connection.getRemotePath(outDir + "/model" + idStr,
                                                  False)
        trainCommand += examples + " " + modelPath
        self.connection.addCommand(trainCommand)
        # Classify with the trained model (optional)
        if classifyExamples != None:
            classifier.predictions = self.connection.getRemotePath(
                outDir + "/predictions" + idStr, True)
            predictionsPath = self.connection.getRemotePath(
                outDir + "/predictions" + idStr, False)
            classifyCommand = svmMulticlassDir + "/svm_multiclass_classify " + classifyExamples + " " + modelPath + " " + predictionsPath
            self.connection.addCommand(classifyCommand)
        # Run the process
        jobName = "svm_multiclass_learn" + idStr
        logPath = outDir + "/" + jobName
        if dummy:  # return a classifier that connects to an existing job
            self.connection.clearCommands()
            classifier._job = self.connection.getJob(jobDir=outDir,
                                                     jobName=jobName)
        else:  # submit the job
            classifier._job = self.connection.submit(jobDir=outDir,
                                                     jobName=jobName,
                                                     stdout=logPath +
                                                     ".stdout")
            if finishBeforeReturn:
                self.connection.waitForJob(classifier._job)
        return classifier
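The SVM-multiclass train() method above derives both the command-line options and the model id suffix from the same sorted parameter dictionary. A standalone sketch of that assembly with an illustrative parameter set.

parameters = {"c": 1000}

trainCommand = "svm_multiclass_learn "
idStr = ""
for key in sorted(parameters.keys()):
    trainCommand += "-" + str(key) + " "
    idStr += "-" + str(key)
    if parameters[key] is not None:
        trainCommand += str(parameters[key]) + " "
        idStr += "_" + str(parameters[key])

assert trainCommand == "svm_multiclass_learn -c 1000 "
assert idStr == "-c_1000"   # the model file becomes ".../model-c_1000"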
Beispiel #53
0
def getTaskSettings(task, detector, bioNLPSTParams, preprocessorParams, 
                    inputFiles, exampleStyles, classifierParameters, folds, corpusDir=None, useKerasDetector=False):
    if task != None:
        print >> sys.stderr, "*** Defining training settings for task", task, "***"
        fullTaskId = task
        task, subTask = getSubTask(task)
        if corpusDir == None:
            corpusDir = Settings.CORPUS_DIR
        print >> sys.stderr, "Loading corpus", task, "from", corpusDir
        for dataset in ["devel", "train", "test"]:
            if inputFiles[dataset] == None:
                if task.startswith("DDI13") and task != "DDI13":
                    if dataset in ["devel", "train"]:
                        inputFiles[dataset] = os.path.join(corpusDir, "DDI13-train.xml")
                    elif dataset == "test":
                        if task.endswith("T91"):
                            inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.1.xml")
                        elif task.endswith("T92") or task.endswith("FULL"):
                            inputFiles[dataset] = os.path.join(corpusDir, "DDI13-test-task9.2.xml")
                elif task == "ID11" and dataset == "train":
                    inputFiles[dataset] = Catenate.catenate([os.path.join(corpusDir, "ID11-train.xml"), os.path.join(corpusDir, "GE11-devel.xml"),
                                                             os.path.join(corpusDir, "GE11-train.xml")], "training/ID11-train-and-GE11-devel-and-train.xml.gz", fast=True)
                else:
                    inputFiles[dataset] = os.path.join(corpusDir, task.replace("-FULL", "") + "-"+dataset+".xml")
                
            if inputFiles[dataset] == "skip":
                inputFiles[dataset] = None
            if inputFiles[dataset] != None and not os.path.exists(inputFiles[dataset]):
                fullPath = os.path.join(Settings.CORPUS_DIR, inputFiles[dataset])
                if os.path.exists(fullPath):
                    inputFiles[dataset] = fullPath
                else:
                    print >> sys.stderr, "Input file", inputFiles[dataset], "for set '" + dataset + "' does not exist, skipping."
                    inputFiles[dataset] = None
        assert inputFiles["train"] != None # at least training set must exist
        # Example generation parameters
        if detector == None:
            if task == "CO11":
                detector = "Detectors.CODetector"
            elif task in ["BI11-FULL", "DDI11-FULL", "DDI13-FULL", "BB_EVENT_16-FULL"]:
                detector = "Detectors.EventDetector"
            elif task.startswith("DDI13"):
                if task.endswith("T91"):
                    detector = "Detectors.EntityDetector"
                elif task.endswith("T92") or task == "DDI13":
                    detector = "Detectors.EdgeDetector"
        
        #######################################################################
        # BioNLP Shared Task and preprocessing parameters
        #######################################################################
        if task == "BI11-FULL":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:scores", "BioNLP Shared Task / " + fullTaskId, ["default"]) # the shared task evaluator is not designed for predicted entities
        elif task == "REL11":
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores:a2Tag=rel", "BioNLP Shared Task / " + fullTaskId, ["default"])
        elif task in ("BB_EVENT_16", "BB_EVENT_16-FULL", "BB_EVENT_NER_16", "SDB16"):
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert=zip", "BioNLP Shared Task / " + fullTaskId, ["default"])
        elif task not in ["DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL", "DDI13", "CP17", "SEMEVAL10T8"]:
            bioNLPSTParams = Parameters.cat(bioNLPSTParams, "convert:evaluate:scores", "BioNLP Shared Task / " + fullTaskId, ["default"])
        else:
            bioNLPSTParams = "skip"
        
        #######################################################################
        # Preprocessing parameters
        #######################################################################
        if task in ["BI11", "BI11-FULL", "BB11", "DDI11", "DDI11-FULL", "DDI13T91", "DDI13T92", "DDI13-FULL", "DDI13"]:
            Parameters.cat("intermediateFiles:omitSteps=NER,DIVIDE-SETS", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
        else: # parse only sentences where BANNER found an entity
            Parameters.cat("intermediateFiles:omitSteps=DIVIDE-SETS:PARSE.requireEntities", preprocessorParams, "Preprocessor /" + fullTaskId, ["default"])
        
        #######################################################################
        # Example style parameters
        #######################################################################
        if not useKerasDetector:
            # Example style parameters for single-stage tasks #####################
            msg = "Single-stage example style / " + fullTaskId
            if task == "REN11":
                exampleStyles["examples"] = Parameters.cat("undirected:bacteria_renaming:maskTypeAsProtein=Gene", exampleStyles["examples"], msg)
            elif task == "DDI11":
                exampleStyles["examples"] = Parameters.cat("drugbank_features:ddi_mtmx:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
            elif task.startswith("DDI13"):
                if task.endswith("T91"):
                    exampleStyles["examples"] = Parameters.cat("names:build_for_nameless:ddi13_features:drugbank_features", exampleStyles["examples"], msg)
                elif task.endswith("T92") or task == "DDI13":
                    exampleStyles["examples"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["examples"], msg)
            elif task == "BI11":
                exampleStyles["examples"] = Parameters.cat("bi_features", exampleStyles["examples"], msg)
            elif task == "BB_EVENT_16":
                exampleStyles["examples"] = Parameters.cat("keep_neg", exampleStyles["examples"], msg) #exampleStyles["examples"] = Parameters.cat("linear_features:keep_neg", exampleStyles["examples"], msg)
            elif task == "SDB16":
                exampleStyles["examples"] = Parameters.cat("sdb_merge:sdb_features", exampleStyles["examples"], msg)
            # Edge style ##########################################################
            msg = "Edge example style / " + fullTaskId
            if task in ["GE09", "GE11", "GE13"] and subTask == 1:
                exampleStyles["edge"] = Parameters.cat("genia_features:genia_task1", exampleStyles["edge"], msg)
            elif task in ["GE09", "GE11", "GE13"]:
                exampleStyles["edge"] = Parameters.cat("genia_features", exampleStyles["edge"], msg)
            elif task == "REL11":
                exampleStyles["edge"] = Parameters.cat("rel_features", exampleStyles["edge"], msg)
            elif task == "DDI11-FULL":
                exampleStyles["edge"] = Parameters.cat("drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
            elif task == "DDI13-FULL":
                exampleStyles["edge"] = Parameters.cat("keep_neg:drugbank_features:filter_shortest_path=conj_and", exampleStyles["edge"], msg)
            elif task == "CO11":
                exampleStyles["edge"] = Parameters.cat("co_features", exampleStyles["edge"], msg)
            elif task == "BI11-FULL":
                exampleStyles["edge"] = Parameters.cat("bi_features", exampleStyles["edge"], msg)
            # Trigger style #######################################################
            msg = "Trigger example style / " + fullTaskId
            if task in ["GE09", "GE11", "GE13"] and subTask == 1:
                exampleStyles["trigger"] = Parameters.cat("genia_task1", exampleStyles["trigger"], msg)
            elif task in ["EPI11", "PC13"]:
                exampleStyles["trigger"] = Parameters.cat("epi_merge_negated", exampleStyles["trigger"], msg)
            elif task == "BB11": # "bb_features:build_for_nameless:wordnet"
                exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
            elif task == "BB13T3": # "bb_features:build_for_nameless:wordnet"
                exampleStyles["trigger"] = Parameters.cat("bb_features", exampleStyles["trigger"], msg)
            elif task == "REL11":
                exampleStyles["trigger"] = Parameters.cat("rel_features", exampleStyles["trigger"], msg)
            elif task in ["BI11-FULL", "DDI11-FULL"]:
                exampleStyles["trigger"] = "names:build_for_nameless"
            elif task == "DDI13-FULL":
                exampleStyles["trigger"] = "names:build_for_nameless:ddi13_features:drugbank_features"
            elif task == "BB_EVENT_16-FULL":
                exampleStyles["trigger"] = Parameters.cat("bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens:only_types=Bacteria,Habitat,Geographical", exampleStyles["trigger"], msg)
            elif task in "BB_EVENT_NER_16":
                exampleStyles["trigger"] = Parameters.cat("bb_spans:bb_features:ontobiotope_features:build_for_nameless:all_tokens", exampleStyles["trigger"], msg)            
                
            #######################################################################
            # Classifier parameters
            #######################################################################
            if task == "DDI11":
                classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
            #elif task == "DDI13":
            #    classifierParameters["examples"] = Parameters.cat("c=10,100,1000,2500,4000,5000,6000,7500,10000,20000,25000,50000:TEES.threshold", classifierParameters["examples"], "Classifier parameters for single-stage examples" + fullTaskId)
            elif task == "CO11":
                classifierParameters["edge"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId)
                classifierParameters["trigger"] = Parameters.cat("c=1000,5000,10000,20000,50000,80000,100000,150000,180000,200000,250000,300000,350000,500000,1000000", classifierParameters["trigger"], "Classifier parameters for triggers / " + fullTaskId)
                classifierParameters["recall"] = Parameters.cat("0.8,0.9,0.95,1.0", classifierParameters["recall"], "Recall adjust / " + fullTaskId)
            elif task == "BB_EVENT_16":
                classifierParameters["examples"] = Parameters.cat("c=10,20,30,40,50,60,70,80,100,110,115,120,125,130,140,150,200,500,1000,2000,3000,4000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000", classifierParameters["examples"], "Classifier parameters for edges / " + fullTaskId)
            elif task in ("BB_EVENT_16-FULL", "BB_EVENT_NER_16"):
                classifierParameters["edge"] = Parameters.cat("c=10,20,50,80,100,110,115,120,125,130,140,150,200,500,1000", classifierParameters["edge"], "Classifier parameters for edges / " + fullTaskId)
            elif task == "SDB16":
                classifierParameters["examples"] = Parameters.cat("c=1000,4500,5000,7500,10000,20000,25000,27500,28000,29000,30000,35000,40000,50000,60000,65000,80000,100000,150000", classifierParameters["examples"], "Classifier parameters for single-stage examples / " + fullTaskId)
        
        # Training fold parameters ############################################
        if task.startswith("DDI13") and task != "DDI13":
            #folds["devel"]=["train1", "train2", "train3", "train4"]
            #folds["train"]=["train5", "train6", "train7", "train8", "train9"]
            folds["devel"]=["train1", "train2", "train3"]
            folds["train"]=["train4", "train5", "train6", "train7", "train8", "train9"]
        
    return detector, bioNLPSTParams, preprocessorParams, folds
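
The colon-separated strings passed to Parameters.cat above are simple "key" or "key=value" lists, and Parameters.cat merges a new list into whatever the caller already supplied. The helper below is only a hypothetical sketch of that string format, not the real TEES Utils.Parameters implementation; the actual override and validation rules may differ.

# Hypothetical sketch: merge two colon-separated parameter strings,
# letting the newer string override values for keys that appear in both.
def mergeParams(newParams, existingParams):
    items = []
    seen = {}
    for chunk in (existingParams, newParams): # later chunks take precedence
        if not chunk:
            continue
        for item in chunk.split(":"):
            key = item.split("=")[0]
            if key in seen:
                items[seen[key]] = item # replace an earlier value for the same key
            else:
                seen[key] = len(items)
                items.append(item)
    return ":".join(items)

print mergeParams("genia_features:genia_task1", "trigger_features")
# prints: trigger_features:genia_features:genia_task1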
Beispiel #54
0
def train(output,
          task=None,
          detector=None,
          inputFiles=None,
          models=None,
          parse=None,
          processUnmerging=None,
          processModifiers=None,
          bioNLPSTParams=None,
          preprocessorParams=None,
          exampleStyles=None,
          classifierParams=None,
          doFullGrid=False,
          deleteOutput=False,
          copyFrom=None,
          log="log.txt",
          step=None,
          omitSteps=None,
          debug=False,
          connection=None,
          subset=None,
          folds=None):
    """
    Train a new model for event or relation detection.
    
    @param output: A directory where output files will appear.
    @param task: If defined, sets overridable defaults for many of the training parameters. Must be one of the supported TEES tasks.
    @param detector: A Detector object, or a string defining one to be imported
    @param inputFiles: A dictionary of file names, with keys "train", "devel" and "test"
    @param models: A dictionary of file names defining the place for the new models, with keys "devel" and "test"
    @param parse: The parse element name in the training interaction XML
    @param processUnmerging: Use the unmerging step of EventDetector. True, False or None for task default.
    @param processModifiers: Use the modifier detection step of EventDetector. True, False or None for task default.
    @param bioNLPSTParams: Parameters controlling BioNLP ST format output.
    @param preprocessorParams: Parameters controlling the preprocessor. Not used for training, but saved to the model for use when classifying.
    @param exampleStyles: A parameter set for controlling example builders.
    @param classifierParams: A parameter set for controlling classifiers.
    @param doFullGrid: Whether all parameters, as opposed to just recall adjustment, are tested in the EventDetector grid search.
    @param deleteOutput: Remove an existing output directory
    @param copyFrom: Copy an existing output directory for use as a template
    @param log: An optional alternative name for the log file. None disables logging.
    @param step: A step=substep pair, where the steps are "TRAIN", "DEVEL", "EMPTY" and "TEST"
    @param omitSteps: step=substep parameters, where multiple substeps can be defined.
    @param debug: In debug mode, more output is shown, and some temporary intermediate files are saved
    @param connection: A parameter set defining a local or remote connection for training the classifier
    @param subset: A parameter set for making subsets of input files
    @param folds: A parameter set mapping the "train", "devel" and "test" corpora to lists of existing subcorpus folds
    """
    # Insert default arguments where needed
    inputFiles = setDictDefaults(inputFiles, {
        "train": None,
        "devel": None,
        "test": None
    })
    models = setDictDefaults(models, {"devel": None, "test": None})
    exampleStyles = setDictDefaults(
        exampleStyles, {
            "examples": None,
            "trigger": None,
            "edge": None,
            "unmerging": None,
            "modifiers": None
        })
    classifierParams = setDictDefaults(
        classifierParams, {
            "examples": None,
            "trigger": None,
            "recall": None,
            "edge": None,
            "unmerging": None,
            "modifiers": None
        })
    subset = setDictDefaults(Parameters.get(subset), {
        "train": None,
        "devel": None,
        "test": None,
        "seed": 0,
        "all": None
    })
    folds = setDictDefaults(folds, {
        "train": None,
        "devel": None,
        "test": None
    })
    processUnmerging = getDefinedBool(processUnmerging)
    processModifiers = getDefinedBool(processModifiers)
    # Initialize working directory
    workdir(output, deleteOutput, copyFrom, log)
    # Get task specific parameters
    detector, bioNLPSTParams, preprocessorParams = getTaskSettings(
        task, detector, bioNLPSTParams, preprocessorParams, inputFiles,
        exampleStyles, classifierParams)
    # Learn training settings from input files
    detector = learnSettings(inputFiles, detector, classifierParams)
    # Get corpus subsets
    getFolds(inputFiles, folds)
    getSubsets(inputFiles, subset)
    if task != None:
        task = task.replace("-FULL", "")
    # Define processing steps
    selector, detectorSteps, omitDetectorSteps = getSteps(
        step, omitSteps, ["TRAIN", "DEVEL", "EMPTY", "TEST"])

    # Initialize the detector
    detector, detectorName = getDetector(detector)
    detector = detector()  # initialize object
    detector.debug = debug
    detector.bioNLPSTParams = detector.getBioNLPSharedTaskParams(
        bioNLPSTParams)
    #detector.useBioNLPSTFormat = useBioNLPSTFormat # classify-output and grid evaluation in ST-format
    #detector.stWriteScores = True # write confidence scores into additional st-format files
    connection = getConnection(connection)
    detector.setConnection(connection)
    connection.debug = debug
    if deleteOutput:
        connection.clearWorkDir()

    # Train
    if selector.check("TRAIN"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------------ Train Detector ------------------"
        print >> sys.stderr, "----------------------------------------------------"
        if isinstance(detector, SingleStageDetector):
            detector.train(inputFiles["train"],
                           inputFiles["devel"],
                           models["devel"],
                           models["test"],
                           exampleStyles["examples"],
                           classifierParams["examples"],
                           parse,
                           None,
                           task,
                           fromStep=detectorSteps["TRAIN"],
                           workDir="training")
        else:
            detector.train(inputFiles["train"],
                           inputFiles["devel"],
                           models["devel"],
                           models["test"],
                           exampleStyles["trigger"],
                           exampleStyles["edge"],
                           exampleStyles["unmerging"],
                           exampleStyles["modifiers"],
                           classifierParams["trigger"],
                           classifierParams["edge"],
                           classifierParams["unmerging"],
                           classifierParams["modifiers"],
                           classifierParams["recall"],
                           processUnmerging,
                           processModifiers,
                           doFullGrid,
                           task,
                           parse,
                           None,
                           fromStep=detectorSteps["TRAIN"],
                           workDir="training")
        # Save the detector type
        for model in [models["devel"], models["test"]]:
            if model != None and os.path.exists(model):
                model = Model(model, "a")
                model.addStr("detector", detectorName)
                if preprocessorParams != None:
                    preprocessor = Preprocessor()
                    model.addStr(
                        "preprocessorParams",
                        Parameters.toString(
                            preprocessor.getParameters(preprocessorParams)))
                model.save()
                model.close()
    if selector.check("DEVEL"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Check devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(inputFiles["devel"],
                          models["devel"],
                          "classification-devel/devel",
                          goldData=inputFiles["devel"],
                          fromStep=detectorSteps["DEVEL"],
                          workDir="classification-devel")
    if selector.check("EMPTY"):
        # By passing an emptied devel set through the prediction system, we can check that we get the same predictions
        # as in the DEVEL step, ensuring the model does not use leaked information.
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------ Empty devel classification ------------"
        print >> sys.stderr, "----------------------------------------------------"
        #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
        detector.classify(getEmptyCorpus(
            inputFiles["devel"],
            removeNames=("names" in str(exampleStyles["examples"])
                         or "names" in str(exampleStyles["trigger"]))),
                          models["devel"],
                          "classification-empty/devel-empty",
                          fromStep=detectorSteps["EMPTY"],
                          workDir="classification-empty")
    if selector.check("TEST"):
        print >> sys.stderr, "----------------------------------------------------"
        print >> sys.stderr, "------------- Test set classification --------------"
        print >> sys.stderr, "----------------------------------------------------"
        if inputFiles["test"] == None or not os.path.exists(
                inputFiles["test"]):
            print >> sys.stderr, "Skipping, test file", inputFiles[
                "test"], "does not exist"
        else:
            #detector.bioNLPSTParams["scores"] = False # the evaluation server doesn't like additional files
            detector.classify(inputFiles["test"],
                              models["test"],
                              "classification-test/test",
                              fromStep=detectorSteps["TEST"],
                              workDir="classification-test")
            if detector.bioNLPSTParams["convert"]:
                Utils.STFormat.Compare.compare(
                    "classification-test/test-events.tar.gz",
                    "classification-devel/devel-events.tar.gz", "a2")