def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False):
    """
    Train one classifier per parameter combination and return the best one.

    Each combination expanded from 'parameters' is passed to self.train. With
    step == "SUBMIT" the method returns right after the jobs have been
    submitted. Otherwise it waits for the jobs, evaluates the predictions of
    every job whose status is "FINISHED" and selects the combination whose
    evaluation compares highest.

    Arguments:
    examples -- training examples, passed through to self.train
    outDir -- output directory (made absolute); evaluation CSV files are written here
    parameters -- parameter definition, expanded with Parameters.get and Parameters.getCombinations
    classifyExamples -- examples passed to self.train and used for evaluation
    classIds -- class ids passed to the evaluator
    step -- "BOTH", "SUBMIT" or "RESULTS"; with "RESULTS" self.train is called with dummy=True
    evaluator -- evaluator to use; self.defaultEvaluator when None
    determineThreshold -- if True, the raw predictions are evaluated first
        ("evaluation-before-threshold<id>.csv"), then evaluator.threshold chooses
        a threshold that is applied when the predictions are loaded for the
        final evaluation
    timeout -- not used by this method
    downloadAllModels -- if True, downloadModel() is called for every finished combination

    Returns a copy of self with state "OPTIMIZE" when step == "SUBMIT",
    otherwise a copy of the best trained classifier with its 'threshold'
    attribute set and downloadModel() called.

    Raises Exception if no combination produced results.
    """
    assert step in ["BOTH", "SUBMIT", "RESULTS"], step
    outDir = os.path.abspath(outDir)
    # Initialize training (or reconnect to existing jobs)
    combinations = Parameters.getCombinations(Parameters.get(parameters, valueListKey="c"))
    trained = []
    for combination in combinations:
        # replaceRemoteExamples is True only for the first combination
        trained.append(self.train(examples, outDir, combination, classifyExamples,
                                  replaceRemoteExamples=(len(trained) == 0),
                                  dummy=(step == "RESULTS")))
    if step == "SUBMIT": # Return already
        classifier = copy.copy(self)
        classifier.setState("OPTIMIZE")
        return classifier

    # Wait for the training to finish
    finalJobStatus = self.connection.waitForJobs([x.getJob() for x in trained])
    # Evaluate the results
    print >> sys.stderr, "Evaluating results"
    bestResult = None # [evaluation, trained classifier, combination, threshold]
    if evaluator is None:
        evaluator = self.defaultEvaluator
    for trainedClassifier, combination in zip(trained, combinations):
        idStr = trainedClassifier.parameterIdStr
        # Get predictions
        if trainedClassifier.getStatus() != "FINISHED":
            print >> sys.stderr, "No results for combination" + idStr
            continue
        predictions = trainedClassifier.downloadPredictions()
        if downloadAllModels:
            trainedClassifier.downloadModel()
        # Compare to other results
        print >> sys.stderr, "*** Evaluating results for combination" + idStr + " ***"
        threshold = None
        if determineThreshold:
            # The trailing comma keeps the following score on the same output line
            print >> sys.stderr, "Thresholding, original micro =",
            evaluation = evaluator.evaluate(classifyExamples, predictions, classIds, os.path.join(outDir, "evaluation-before-threshold" + idStr + ".csv"), verbose=False)
            print >> sys.stderr, evaluation.microF.toStringConcise()
            threshold, bestF = evaluator.threshold(classifyExamples, predictions)
            print >> sys.stderr, "threshold =", threshold, "at binary fscore", str(bestF)[0:6]
        evaluation = evaluator.evaluate(classifyExamples, ExampleUtils.loadPredictions(predictions, threshold=threshold), classIds, os.path.join(outDir, "evaluation" + idStr + ".csv"))
        if bestResult is None or evaluation.compare(bestResult[0]) > 0:
            bestResult = [evaluation, trainedClassifier, combination, threshold]
        if not self.connection.isLocal():
            os.remove(predictions) # remove predictions to save space
    if bestResult is None:
        raise Exception("No results for any parameter combination")
    print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***"
    print >> sys.stderr, "Selected parameters", bestResult[2]
    classifier = copy.copy(bestResult[1])
    classifier.threshold = bestResult[3]
    classifier.downloadModel()
    return classifier
def doGrid(self):
    """
    Search the trigger / booster (recall adjustment) / edge parameter grid.

    Trigger examples are built once from self.optData. For every parameter
    combination the trigger examples are classified with the pre-trained
    trigger model (only when the trigger or booster value changed since the
    previous combination), edge examples are built from that output and
    classified with the pre-trained edge model, and the result is handed to
    self.evaluateGrid, which maintains the best result so far.

    The best booster value is saved as "recallAdjustParameter" to self.model
    and self.combinedModel. When self.fullGrid is set, the trigger and edge
    models of the best combination are also registered with addClassifierModel.
    Intermediate grid files in self.workDir are removed at the end.

    Raises Exception if no parameter combination produced a result.
    """
    print >> sys.stderr, "--------- Booster parameter search ---------"
    # Build trigger examples
    self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir+"grid-trigger-examples.gz"])

    if self.fullGrid: # Parameters to optimize
        ALL_PARAMS = {
            "trigger":[int(i) for i in Parameters.get(self.triggerClassifierParameters, valueListKey="c")["c"]],
            "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
            "edge":[int(i) for i in Parameters.get(self.edgeClassifierParameters, valueListKey="c")["c"]]}
    else:
        ALL_PARAMS = {
            "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameter"), valueListKey="c")["c"],
            "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
            "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameter"), valueListKey="c")["c"]}

    paramCombinations = Parameters.getCombinations(ALL_PARAMS, ["trigger", "booster", "edge"])
    prevParams = None
    EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path)+"-edge-models/model-c_")
    TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path)+"-trigger-models/model-c_")
    bestResults = None
    for i, params in enumerate(paramCombinations):
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        print >> sys.stderr, "Processing params", str(i+1) + "/" + str(len(paramCombinations)), params
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        # Triggers and Boost (re-classified only when one of their parameters changed)
        if prevParams is None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]:
            print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
            xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir+"grid-trigger-examples.gz", self.workDir+"grid-", classifierModel=TRIGGER_MODEL_STEM+str(params["trigger"]), recallAdjust=params["booster"])
        prevParams = params
        # Build edge examples
        self.edgeDetector.buildExamples(self.model, [xml], [self.workDir+"grid-edge-examples.gz"], [self.optData])
        # Classify with pre-defined model
        edgeClassifierModel = EDGE_MODEL_STEM+str(params["edge"])
        xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir+"grid-edge-examples.gz", self.workDir+"grid-", classifierModel=edgeClassifierModel)
        bestResults = self.evaluateGrid(xml, params, bestResults)
    print >> sys.stderr, "Booster search complete"
    print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
    if bestResults is None:
        # Without this guard the indexing below fails with an uninformative TypeError
        raise Exception("No results for any parameter combination")
    print >> sys.stderr, "Best parameters:", bestResults[0]
    print >> sys.stderr, "Best result:", bestResults[2] # f-score
    # Save grid model
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model)
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
    if self.fullGrid: # define best models
        self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM+str(bestResults[0]["trigger"]), bestResults[0]["trigger"])
        self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM+str(bestResults[0]["edge"]), bestResults[0]["edge"])
    # Remove work files. Every stem needs its leading dash: the example files
    # built above are named "grid-trigger-examples.gz" and "grid-edge-examples.gz".
    # NOTE(review): "-pred.xml.gz" presumably matches the classifyToXML output name; confirm.
    for stepTag in [self.workDir+"grid-trigger", self.workDir+"grid-edge", self.workDir+"grid-unmerging"]:
        for fileStem in ["-classifications", "-classifications.log", "-examples.gz", "-pred.xml.gz"]:
            workFile = stepTag+fileStem
            if os.path.exists(workFile):
                os.remove(workFile)
def doGrid(self):
    """
    Search the trigger / booster (recall adjustment) / edge parameter grid.

    Trigger examples are built once from self.optData. The trigger and edge
    classifier parameters are read from self.model (the
    "classifier-parameters-train" entries when self.fullGrid is set, the
    "classifier-parameter" entries otherwise), expanded into combinations and
    converted to strings; the booster values come from
    self.recallAdjustParameters. For every (trigger, booster, edge)
    combination the trigger examples are classified with the pre-trained
    trigger model (only when the trigger or booster value changed since the
    previous combination), the edges are classified with the pre-trained edge
    model, and the result is handed to self.evaluateGrid, which maintains the
    best result so far.

    The best booster value is saved as "recallAdjustParameter" to self.model
    and self.combinedModel. When self.fullGrid is set, the trigger and edge
    models of the best combination are also registered with addClassifierModel.
    Intermediate grid files in self.workDir are removed.

    Raises Exception if no parameter combination produced a result.
    """
    print >> sys.stderr, "--------- Parameter grid search ---------"
    # Build trigger examples
    self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir + "grid-trigger-examples"])

    if self.fullGrid:
        stepParams = {
            "trigger": Parameters.get(self.model.getStr(self.triggerDetector.tag + "classifier-parameters-train", defaultIfNotExist=""), valueListKey="c"),
            "booster": [float(i) for i in self.recallAdjustParameters.split(",")],
            "edge": Parameters.get(self.model.getStr(self.edgeDetector.tag + "classifier-parameters-train", defaultIfNotExist=""), valueListKey="c")}
    else:
        stepParams = {
            "trigger": Parameters.get(self.model.getStr(self.triggerDetector.tag + "classifier-parameter", defaultIfNotExist=""), valueListKey="c"),
            "booster": [float(i) for i in self.recallAdjustParameters.split(",")],
            "edge": Parameters.get(self.model.getStr(self.edgeDetector.tag + "classifier-parameter", defaultIfNotExist=""), valueListKey="c")}

    # Expand the classifier parameters of each step and represent every combination as a string
    for step in ["trigger", "edge"]:
        stepParams[step] = [Parameters.toString(x) for x in Parameters.getCombinations(stepParams[step])]
    print >> sys.stderr, "Parameters", [stepParams[x] for x in ["trigger", "booster", "edge"]]
    paramCombinations = combine(*[stepParams[x] for x in ["trigger", "booster", "edge"]])
    print >> sys.stderr, "Combinations", paramCombinations
    paramCombinations = [{"trigger": c[0], "booster": c[1], "edge": c[2]} for c in paramCombinations]

    prevParams = None
    EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path) + "-edge-models/model")
    TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path) + "-trigger-models/model")
    self.structureAnalyzer.load(self.model)
    bestResults = None
    for i, params in enumerate(paramCombinations):
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        print >> sys.stderr, "Processing params", str(i + 1) + "/" + str(len(paramCombinations)), params
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        # Triggers and Boost (the trigger predictions are recalculated only when the relevant parameters change)
        if prevParams is None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]:
            print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
            xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir + "grid-trigger-examples", self.workDir + "grid-", classifierModel=TRIGGER_MODEL_STEM + Parameters.toId(params["trigger"]), recallAdjust=params["booster"], useExistingExamples=True)
        prevParams = params
        # Classify with pre-defined model
        edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId(params["edge"])
        xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir + "grid-edge-examples", self.workDir + "grid-", classifierModel=edgeClassifierModel, goldData=self.optData)
        bestResults = self.evaluateGrid(xml, params, bestResults)
    # Remove remaining intermediate grid files
    for tag1 in ["edge", "trigger", "unmerging"]:
        for tag2 in ["examples", "pred.xml.gz"]:
            gridFile = self.workDir + "grid-" + tag1 + "-" + tag2
            if os.path.exists(gridFile):
                os.remove(gridFile)
    print >> sys.stderr, "Parameter grid search complete"
    print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
    if bestResults is None:
        # Without this guard the indexing below fails with an uninformative TypeError
        raise Exception("No results for any parameter combination")
    print >> sys.stderr, "Best parameters:", bestResults[0]
    print >> sys.stderr, "Best result:", bestResults[2] # f-score
    # Save grid model
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model)
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
    if self.fullGrid: # define best models
        # The model paths must be built with Parameters.toId, exactly as when the
        # models were used for classification in the loop above
        self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM + Parameters.toId(bestResults[0]["trigger"]), bestResults[0]["trigger"])
        self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM + Parameters.toId(bestResults[0]["edge"]), bestResults[0]["edge"])
    # Remove work files. Every stem needs its leading dash to match the
    # "grid-<step>-<name>" file naming used above.
    for stepTag in [self.workDir + "grid-trigger", self.workDir + "grid-edge", self.workDir + "grid-unmerging"]:
        for fileStem in ["-classifications", "-classifications.log", "-examples.gz", "-pred.xml.gz"]:
            workFile = stepTag + fileStem
            if os.path.exists(workFile):
                os.remove(workFile)
def doGrid(self):
    """
    Search the trigger / booster (recall adjustment) / edge parameter grid.

    Trigger examples are built once from self.optData. The trigger and edge
    classifier parameters are read from self.model (the
    "classifier-parameters-train" entries when self.fullGrid is set, the
    "classifier-parameter" entries otherwise), expanded into combinations and
    converted to strings; the booster values come from
    self.recallAdjustParameters. For every (trigger, booster, edge)
    combination the trigger examples are classified with the pre-trained
    trigger model (only when the trigger or booster value changed since the
    previous combination), the edges are classified with the pre-trained edge
    model, and the result is handed to self.evaluateGrid, which maintains the
    best result so far.

    The best booster value is saved as "recallAdjustParameter" to self.model
    and self.combinedModel. When self.fullGrid is set, the trigger and edge
    models of the best combination are also registered with addClassifierModel.
    Intermediate grid files in self.workDir are removed.

    Raises Exception if no parameter combination produced a result.
    """
    print >> sys.stderr, "--------- Parameter grid search ---------"
    # Build trigger examples
    # NOTE(review): the examples are built as "grid-trigger-examples.gz" but
    # classifyToXML below is given "grid-trigger-examples" -- confirm which
    # file name classifyToXML expects.
    self.triggerDetector.buildExamples(self.model, [self.optData], [self.workDir+"grid-trigger-examples.gz"])

    if self.fullGrid:
        stepParams = {
            "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameters-train", defaultIfNotExist=""), valueListKey="c"),
            "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
            "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameters-train", defaultIfNotExist=""), valueListKey="c")}
    else:
        stepParams = {
            "trigger":Parameters.get(self.model.getStr(self.triggerDetector.tag+"classifier-parameter", defaultIfNotExist=""), valueListKey="c"),
            "booster":[float(i) for i in self.recallAdjustParameters.split(",")],
            "edge":Parameters.get(self.model.getStr(self.edgeDetector.tag+"classifier-parameter", defaultIfNotExist=""), valueListKey="c")}

    # Expand the classifier parameters of each step and represent every combination as a string
    for step in ["trigger", "edge"]:
        stepParams[step] = [Parameters.toString(x) for x in Parameters.getCombinations(stepParams[step])]
    print >> sys.stderr, [stepParams[x] for x in ["trigger", "booster", "edge"]]
    paramCombinations = combine(*[stepParams[x] for x in ["trigger", "booster", "edge"]])
    print >> sys.stderr, paramCombinations
    paramCombinations = [{"trigger":c[0], "booster":c[1], "edge":c[2]} for c in paramCombinations]

    prevParams = None
    EDGE_MODEL_STEM = os.path.join(self.edgeDetector.workDir, os.path.normpath(self.model.path)+"-edge-models/model")
    TRIGGER_MODEL_STEM = os.path.join(self.triggerDetector.workDir, os.path.normpath(self.model.path)+"-trigger-models/model")
    self.structureAnalyzer.load(self.model)
    bestResults = None
    for i, params in enumerate(paramCombinations):
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        print >> sys.stderr, "Processing params", str(i+1) + "/" + str(len(paramCombinations)), params
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        # Triggers and Boost. The trigger output depends on both the trigger model
        # and the recall adjustment, so it must be recalculated when either one
        # changes (the booster check used to be a duplicate of the trigger check).
        if prevParams is None or prevParams["trigger"] != params["trigger"] or prevParams["booster"] != params["booster"]:
            print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(params["trigger"]), "booster:" + str(params["booster"])
            xml = self.triggerDetector.classifyToXML(self.optData, self.model, self.workDir+"grid-trigger-examples", self.workDir+"grid-", classifierModel=TRIGGER_MODEL_STEM + Parameters.toId(params["trigger"]), recallAdjust=params["booster"])
        prevParams = params
        # Classify with pre-defined model
        edgeClassifierModel = EDGE_MODEL_STEM + Parameters.toId(params["edge"])
        xml = self.edgeDetector.classifyToXML(xml, self.model, self.workDir+"grid-edge-examples", self.workDir+"grid-", classifierModel=edgeClassifierModel, goldData=self.optData)
        bestResults = self.evaluateGrid(xml, params, bestResults)
    # Remove remaining intermediate grid files
    for tag1 in ["edge", "trigger", "unmerging"]:
        for tag2 in ["examples", "pred.xml.gz"]:
            gridFile = self.workDir+"grid-"+tag1+"-"+tag2
            if os.path.exists(gridFile):
                os.remove(gridFile)
    print >> sys.stderr, "Parameter grid search complete"
    print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
    if bestResults is None:
        # Without this guard the indexing below fails with an uninformative TypeError
        raise Exception("No results for any parameter combination")
    print >> sys.stderr, "Best parameters:", bestResults[0]
    print >> sys.stderr, "Best result:", bestResults[2] # f-score
    # Save grid model
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.model)
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]), self.combinedModel, False)
    if self.fullGrid: # define best models
        # The model paths must be built with Parameters.toId, exactly as when the
        # models were used for classification in the loop above
        self.triggerDetector.addClassifierModel(self.model, TRIGGER_MODEL_STEM+Parameters.toId(bestResults[0]["trigger"]), bestResults[0]["trigger"])
        self.edgeDetector.addClassifierModel(self.model, EDGE_MODEL_STEM+Parameters.toId(bestResults[0]["edge"]), bestResults[0]["edge"])
    # Remove work files. Every stem needs its leading dash: the trigger examples
    # built above are named "grid-trigger-examples.gz".
    for stepTag in [self.workDir+"grid-trigger", self.workDir+"grid-edge", self.workDir+"grid-unmerging"]:
        for fileStem in ["-classifications", "-classifications.log", "-examples.gz", "-pred.xml.gz"]:
            workFile = stepTag+fileStem
            if os.path.exists(workFile):
                os.remove(workFile)
def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False):
    """
    Train one classifier per parameter combination and return the best one.

    Each combination expanded from 'parameters' is passed to self.train. With
    step == "SUBMIT" the method returns right after the jobs have been
    submitted. Otherwise it waits for the jobs, evaluates the predictions of
    every job whose status is "FINISHED" and selects the combination whose
    evaluation compares highest.

    Arguments:
    examples -- training examples, passed through to self.train
    outDir -- output directory (made absolute); evaluation CSV files are written here
    parameters -- parameter definition, expanded with Parameters.get and Parameters.getCombinations
    classifyExamples -- examples passed to self.train and used for evaluation
    classIds -- class ids passed to the evaluator
    step -- "BOTH", "SUBMIT" or "RESULTS"; with "RESULTS" self.train is called with dummy=True
    evaluator -- evaluator to use; self.defaultEvaluator when None
    determineThreshold -- if True, the raw predictions are evaluated first
        ("evaluation-before-threshold<id>.csv"), then evaluator.threshold chooses
        a threshold that is applied when the predictions are loaded for the
        final evaluation
    timeout -- not used by this method
    downloadAllModels -- if True, downloadModel() is called for every finished combination

    Returns a copy of self with state "OPTIMIZE" when step == "SUBMIT",
    otherwise a copy of the best trained classifier with its 'threshold'
    attribute set and downloadModel() called.

    Raises Exception if no combination produced results.
    """
    assert step in ["BOTH", "SUBMIT", "RESULTS"], step
    outDir = os.path.abspath(outDir)
    # Initialize training (or reconnect to existing jobs)
    combinations = Parameters.getCombinations(
        Parameters.get(parameters, valueListKey="c"))
    trained = []
    for combination in combinations:
        # replaceRemoteExamples is True only for the first combination
        trained.append(
            self.train(examples,
                       outDir,
                       combination,
                       classifyExamples,
                       replaceRemoteExamples=(len(trained) == 0),
                       dummy=(step == "RESULTS")))
    if step == "SUBMIT":  # Return already
        classifier = copy.copy(self)
        classifier.setState("OPTIMIZE")
        return classifier

    # Wait for the training to finish
    finalJobStatus = self.connection.waitForJobs(
        [x.getJob() for x in trained])
    # Evaluate the results
    print >> sys.stderr, "Evaluating results"
    bestResult = None  # [evaluation, trained classifier, combination, threshold]
    if evaluator is None:
        evaluator = self.defaultEvaluator
    for trainedClassifier, combination in zip(trained, combinations):
        idStr = trainedClassifier.parameterIdStr
        # Get predictions
        if trainedClassifier.getStatus() != "FINISHED":
            print >> sys.stderr, "No results for combination" + idStr
            continue
        predictions = trainedClassifier.downloadPredictions()
        if downloadAllModels:
            trainedClassifier.downloadModel()
        # Compare to other results
        print >> sys.stderr, "*** Evaluating results for combination" + idStr + " ***"
        threshold = None
        if determineThreshold:
            # The trailing comma keeps the following score on the same output line
            print >> sys.stderr, "Thresholding, original micro =",
            evaluation = evaluator.evaluate(
                classifyExamples,
                predictions,
                classIds,
                os.path.join(outDir,
                             "evaluation-before-threshold" + idStr + ".csv"),
                verbose=False)
            print >> sys.stderr, evaluation.microF.toStringConcise()
            threshold, bestF = evaluator.threshold(classifyExamples,
                                                   predictions)
            print >> sys.stderr, "threshold =", threshold, "at binary fscore", str(
                bestF)[0:6]
        evaluation = evaluator.evaluate(
            classifyExamples,
            ExampleUtils.loadPredictions(predictions, threshold=threshold),
            classIds, os.path.join(outDir, "evaluation" + idStr + ".csv"))
        if bestResult is None or evaluation.compare(bestResult[0]) > 0:
            bestResult = [
                evaluation, trainedClassifier, combination, threshold
            ]
        if not self.connection.isLocal():
            os.remove(predictions)  # remove predictions to save space
    if bestResult is None:
        raise Exception("No results for any parameter combination")
    print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***"
    print >> sys.stderr, "Selected parameters", bestResult[2]
    classifier = copy.copy(bestResult[1])
    classifier.threshold = bestResult[3]
    classifier.downloadModel()
    return classifier
def doGrid(self):
    """
    Search the trigger / booster (recall adjustment) / edge parameter grid.

    Trigger examples are built once from self.optData. For every parameter
    combination the trigger examples are classified with the pre-trained
    trigger model (only when the trigger or booster value changed since the
    previous combination), edge examples are built from that output and
    classified with the pre-trained edge model, and the result is handed to
    self.evaluateGrid, which maintains the best result so far.

    The best booster value is saved as "recallAdjustParameter" to self.model
    and self.combinedModel. When self.fullGrid is set, the trigger and edge
    models of the best combination are also registered with addClassifierModel.
    Intermediate grid files in self.workDir are removed at the end.

    Raises Exception if no parameter combination produced a result.
    """
    print >> sys.stderr, "--------- Booster parameter search ---------"
    # Build trigger examples
    self.triggerDetector.buildExamples(
        self.model, [self.optData],
        [self.workDir + "grid-trigger-examples.gz"])

    if self.fullGrid:  # Parameters to optimize
        ALL_PARAMS = {
            "trigger": [
                int(i)
                for i in Parameters.get(self.triggerClassifierParameters,
                                        valueListKey="c")["c"]
            ],
            "booster":
            [float(i) for i in self.recallAdjustParameters.split(",")],
            "edge": [
                int(i) for i in Parameters.get(self.edgeClassifierParameters,
                                               valueListKey="c")["c"]
            ]
        }
    else:
        ALL_PARAMS = {
            "trigger":
            Parameters.get(self.model.getStr(self.triggerDetector.tag +
                                             "classifier-parameter"),
                           valueListKey="c")["c"],
            "booster":
            [float(i) for i in self.recallAdjustParameters.split(",")],
            "edge":
            Parameters.get(self.model.getStr(self.edgeDetector.tag +
                                             "classifier-parameter"),
                           valueListKey="c")["c"]
        }

    paramCombinations = Parameters.getCombinations(
        ALL_PARAMS, ["trigger", "booster", "edge"])
    prevParams = None
    EDGE_MODEL_STEM = os.path.join(
        self.edgeDetector.workDir,
        os.path.normpath(self.model.path) + "-edge-models/model-c_")
    TRIGGER_MODEL_STEM = os.path.join(
        self.triggerDetector.workDir,
        os.path.normpath(self.model.path) + "-trigger-models/model-c_")
    bestResults = None
    for i, params in enumerate(paramCombinations):
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        print >> sys.stderr, "Processing params", str(i + 1) + "/" + str(
            len(paramCombinations)), params
        print >> sys.stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        # Triggers and Boost (re-classified only when one of their parameters changed)
        if (prevParams is None
                or prevParams["trigger"] != params["trigger"]
                or prevParams["booster"] != params["booster"]):
            print >> sys.stderr, "Classifying trigger examples for parameters", "trigger:" + str(
                params["trigger"]), "booster:" + str(params["booster"])
            xml = self.triggerDetector.classifyToXML(
                self.optData,
                self.model,
                self.workDir + "grid-trigger-examples.gz",
                self.workDir + "grid-",
                classifierModel=TRIGGER_MODEL_STEM + str(params["trigger"]),
                recallAdjust=params["booster"])
        prevParams = params
        # Build edge examples
        self.edgeDetector.buildExamples(
            self.model, [xml], [self.workDir + "grid-edge-examples.gz"],
            [self.optData])
        # Classify with pre-defined model
        edgeClassifierModel = EDGE_MODEL_STEM + str(params["edge"])
        xml = self.edgeDetector.classifyToXML(
            xml,
            self.model,
            self.workDir + "grid-edge-examples.gz",
            self.workDir + "grid-",
            classifierModel=edgeClassifierModel)
        bestResults = self.evaluateGrid(xml, params, bestResults)
    print >> sys.stderr, "Booster search complete"
    print >> sys.stderr, "Tested", len(paramCombinations), "combinations"
    if bestResults is None:
        # Without this guard the indexing below fails with an uninformative TypeError
        raise Exception("No results for any parameter combination")
    print >> sys.stderr, "Best parameters:", bestResults[0]
    print >> sys.stderr, "Best result:", bestResults[2]  # f-score
    # Save grid model
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]),
                 self.model)
    self.saveStr("recallAdjustParameter", str(bestResults[0]["booster"]),
                 self.combinedModel, False)
    if self.fullGrid:  # define best models
        self.triggerDetector.addClassifierModel(
            self.model, TRIGGER_MODEL_STEM + str(bestResults[0]["trigger"]),
            bestResults[0]["trigger"])
        self.edgeDetector.addClassifierModel(
            self.model, EDGE_MODEL_STEM + str(bestResults[0]["edge"]),
            bestResults[0]["edge"])
    # Remove work files. Every stem needs its leading dash: the example files
    # built above are named "grid-trigger-examples.gz" and "grid-edge-examples.gz".
    # NOTE(review): "-pred.xml.gz" presumably matches the classifyToXML output name; confirm.
    for stepTag in [
            self.workDir + "grid-trigger", self.workDir + "grid-edge",
            self.workDir + "grid-unmerging"
    ]:
        for fileStem in [
                "-classifications", "-classifications.log", "-examples.gz",
                "-pred.xml.gz"
        ]:
            workFile = stepTag + fileStem
            if os.path.exists(workFile):
                os.remove(workFile)