def crossValidate(exampleBuilder, corpusElements, examples, options, timer):
    """Run cross-validation over `examples` and report per-fold and combined results.

    The examples are divided into folds with Example.makeCorpusFolds /
    Example.divideExamples. For each fold a Classifier is created, its
    parameters are optimized, it is trained with the best parameters and
    the fold's test set is classified and evaluated. Parameter optimization
    uses one of three set-ups:

    * options.paramOptData is set: a constant optimization set is separated
      up front and its documents are deleted from corpusElements.documentsById.
    * options.folds[1] == 0: the fold preceding the test fold (cyclically)
      is used as the optimization set and excluded from training.
    * otherwise: the training data is divided into options.folds[1] >= 2
      sets which are all passed to classifier.optimize.

    Parameters:
        exampleBuilder: object whose featureSet, classSet and styles
            attributes are read.
        corpusElements: corpus object passed to the Example helpers; its
            documentsById mapping is mutated when options.paramOptData is set.
        examples: the examples to divide; example[3] must be a dict (the
            "visualizationSet" key is written to it).
        options: object with attributes paramOptData, folds, output,
            parameters, visualization and resultsToXML.
        timer: object whose toString() is printed after every fold.

    Returns:
        None. Progress goes to sys.stderr; result files are written below
        options.output when it is not None.
    """
    parameterOptimizationSet = None
    constantParameterOptimizationSet = None
    if options.paramOptData is not None:
        print >> sys.stderr, "Separating parameter optimization set"
        parameterOptimizationDivision = Example.makeCorpusDivision(
            corpusElements, float(options.paramOptData))
        exampleSets = Example.divideExamples(examples, parameterOptimizationDivision)
        constantParameterOptimizationSet = exampleSets[0]
        parameterOptimizationSet = constantParameterOptimizationSet
        # Documents assigned to division 0 form the optimization set; they
        # are removed from the corpus before the folds are made.
        optDocs = 0
        for k, v in parameterOptimizationDivision.iteritems():
            if v == 0:
                del corpusElements.documentsById[k]
                optDocs += 1
        print >> sys.stderr, " Documents for parameter optimization:", optDocs
    discardedParameterCombinations = []

    print >> sys.stderr, "Dividing data into folds"
    corpusFolds = Example.makeCorpusFolds(corpusElements, options.folds[0])
    exampleSets = Example.divideExamples(examples, corpusFolds)

    keys = sorted(exampleSets.keys())
    evaluations = []
    for foldIndex, key in enumerate(keys):
        foldName = "fold" + str(key + 1)
        testSet = exampleSets[key]
        for example in testSet:
            example[3]["visualizationSet"] = key + 1
        trainSet = []
        for key2 in keys:
            if key2 != key:
                trainSet.extend(exampleSets[key2])
        print >> sys.stderr, "Fold", str(key + 1)

        # Create classifier object
        if options.output is not None:
            foldDir = options.output + "/" + foldName
            if not os.path.exists(foldDir):
                os.mkdir(foldDir)
            classifier = Classifier(workDir=foldDir)
        else:
            classifier = Classifier()
        classifier.featureSet = exampleBuilder.featureSet

        # Optimize
        # Without an external optimization set and without nested folds,
        # use the previous fold as the optimization set ("8-1-1 folds").
        if parameterOptimizationSet is None and options.folds[1] == 0:
            assert len(keys) > 1
            # Index -1 wraps around to the last fold for the first fold.
            parameterOptimizationSetKey = keys[foldIndex - 1]
            parameterOptimizationSet = exampleSets[parameterOptimizationSetKey]
            trainSet = []
            for key2 in keys:
                if key2 != key and key2 != parameterOptimizationSetKey:
                    trainSet.extend(exampleSets[key2])

        evaluationArgs = {"classSet": exampleBuilder.classSet}
        if parameterOptimizationSet is not None:
            # Single train set evaluated against a separate optimization set.
            optimizationTrainSets = [trainSet]
            optimizationTestSets = [parameterOptimizationSet]
        else:
            # Nested x-fold parameter optimization inside the training data.
            assert options.folds[1] >= 2
            optimizationFolds = Example.makeExampleFolds(trainSet, options.folds[1])
            optimizationSets = Example.divideExamples(trainSet, optimizationFolds)
            optimizationSetList = [optimizationSets[optSetKey]
                                   for optSetKey in sorted(optimizationSets.keys())]
            optimizationTrainSets = optimizationSetList
            optimizationTestSets = optimizationSetList
        if options.parameters is not None:
            paramDict = splitParameters(options.parameters)
            bestResults = classifier.optimize(
                optimizationTrainSets, optimizationTestSets, paramDict,
                Evaluation, evaluationArgs,
                combinationsThatTimedOut=discardedParameterCombinations)
        else:
            bestResults = classifier.optimize(
                optimizationTrainSets, optimizationTestSets,
                evaluationClass=Evaluation, evaluationArgs=evaluationArgs,
                combinationsThatTimedOut=discardedParameterCombinations)

        # Classify
        print >> sys.stderr, "Classifying test data"
        bestParams = bestResults[2]
        # "timeout" is dropped before the parameters are handed to train().
        if "timeout" in bestParams:
            del bestParams["timeout"]
        print >> sys.stderr, "Parameters:", bestParams
        print >> sys.stderr, "Training",
        startTime = time.time()
        classifier.train(trainSet, bestParams)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        print >> sys.stderr, "Testing",
        startTime = time.time()
        predictions = classifier.classify(testSet)
        if options.output is not None:
            typed = "typed" in exampleBuilder.styles
            fieldnames = ["class", "prediction", "id", "fold"]
            pdict = []
            for p in predictions:
                if typed:
                    pdict.append({"class": exampleBuilder.classSet.getName(p[0][1]),
                                  "prediction": exampleBuilder.classSet.getName(p[1]),
                                  "id": p[0][0],
                                  "fold": key})
                else:
                    pdict.append({"class": p[0][1],
                                  "prediction": p[1],
                                  "id": p[0][0],
                                  "fold": key})
            TableUtils.addToCSV(pdict, options.output + "/predictions.csv", fieldnames)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"

        # Calculate statistics
        evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
        print >> sys.stderr, evaluation.toStringConcise()
        print >> sys.stderr, timer.toString()
        evaluations.append(evaluation)

        # Save example sets
        if options.output is not None:
            foldDir = options.output + "/" + foldName
            print >> sys.stderr, "Saving example sets to", options.output
            # Save the sets actually used for this fold; fixed keys 0 and 1
            # would write the same two folds into every fold directory.
            Example.writeExamples(testSet, foldDir + "/examplesTest.txt")
            Example.writeExamples(trainSet, foldDir + "/examplesTrain.txt")
            if parameterOptimizationSet is None:
                for k, v in optimizationSets.iteritems():
                    Example.writeExamples(
                        v, foldDir + "/examplesOptimizationSet" + str(k) + ".txt")
            else:
                Example.writeExamples(
                    parameterOptimizationSet,
                    foldDir + "/examplesOptimizationSetPredefined.txt")
            TableUtils.writeCSV(bestResults[2], foldDir + "/parameters.csv")
            evaluation.saveCSV(foldDir + "/results.csv")
            print >> sys.stderr, "Compressing folder"
            zipTree(options.output, foldName)
        # Reset to the constant set (or None) so that a per-fold optimization
        # set chosen above does not leak into the next fold.
        parameterOptimizationSet = constantParameterOptimizationSet

    print >> sys.stderr, "Cross-validation Results"
    for i, foldEvaluation in enumerate(evaluations):
        print >> sys.stderr, foldEvaluation.toStringConcise(" Fold " + str(i) + ": ")
    averageResult = Evaluation.average(evaluations)
    print >> sys.stderr, averageResult.toStringConcise(" Avg: ")
    pooledResult = Evaluation.pool(evaluations)
    print >> sys.stderr, pooledResult.toStringConcise(" Pool: ")
    if options.output is not None:
        for i, foldEvaluation in enumerate(evaluations):
            foldEvaluation.saveCSV(options.output + "/results.csv", i)
        averageResult.saveCSV(options.output + "/results.csv", "Avg")
        pooledResult.saveCSV(options.output + "/results.csv", "Pool")
        averageResult.saveCSV(options.output + "/resultsAverage.csv")
        pooledResult.saveCSV(options.output + "/resultsPooled.csv")

    # Visualize
    if options.visualization is not None:
        # NOTE(review): `sentences` is not defined in this function;
        # presumably a module-level global set by the caller -- confirm.
        visualize(sentences, pooledResult.classifications, options, exampleBuilder)

    # Save interactionXML
    if options.resultsToXML is not None:
        classSet = None
        if "typed" in exampleBuilder.styles:
            classSet = exampleBuilder.classSet
        Example.writeToInteractionXML(pooledResult.classifications, corpusElements,
                                      options.resultsToXML, classSet)
# Experiment settings.
CLASSIFIER_PARAMS = "c:25000,50000,87500"
WORKDIR = "/usr/share/biotext/GeniaChallenge/SharedTaskTriggerTest"
PARSE_TOK = "split-Charniak-Lease"

workdir(WORKDIR, False)
log()

# ---- Trigger detection ----
# Disabled steps (kept for reference):
# Gazetteer.run(TRAIN_FILE, "gazetteer-train")
# GeneralEntityTypeRecognizer.run(TRAIN_FILE, "trigger-train-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids")
GeneralEntityTypeRecognizer.run(
    TEST_FILE,
    "trigger-test-examples",
    PARSE_TOK,
    PARSE_TOK,
    "style:typed",
    "trigger-ids")
Cls.test(
    "trigger-test-examples",
    "trigger-param-opt/model-c_75000",
    "trigger-test-classifications")
evaluator = Ev.evaluate(
    "trigger-test-examples",
    "trigger-test-classifications",
    "trigger-ids.class_names")
# Disabled alternative that obtains the evaluator via optimize():
# evaluator = optimize(Cls, Ev, "trigger-train-examples", "trigger-test-examples",\
#     "trigger-ids.class_names", CLASSIFIER_PARAMS, "trigger-param-opt")[0]
ExampleUtils.writeToInteractionXML(
    evaluator.classifications,
    TEST_FILE,
    "test-predicted-triggers.xml",
    "trigger-ids.class_names",
    PARSE_TOK,
    PARSE_TOK)

# Disabled variant of the steps below that first runs RecallAdjust:
# RecallAdjust.run("test-predicted-triggers.xml",1.0,"test-predicted-triggers-adj.xml")
# ix.splitMergedElements("test-predicted-triggers-adj.xml", "test-predicted-triggers-adj-split.xml")
# ix.recalculateIds("test-predicted-triggers-adj-split.xml", "test-predicted-triggers-adj-split-recids.xml", True)
# EvaluateInteractionXML.run(Ev, "test-predicted-triggers-adj-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)

# Each step reads the file written by the previous one.
ix.splitMergedElements(
    "test-predicted-triggers.xml",
    "test-predicted-triggers-split.xml")
ix.recalculateIds(
    "test-predicted-triggers-split.xml",
    "test-predicted-triggers-split-recids.xml",
    True)
EvaluateInteractionXML.run(
    Ev,
    "test-predicted-triggers-split-recids.xml",
    GOLD_TEST_FILE,
    PARSE_TOK,
    PARSE_TOK)
# Disabled: collecting gold sentence graphs.
# goldSentences = []
# for sentence in goldCorpusElements.sentences:
#     goldSentences.append( [sentence.sentenceGraph,None] )

# Evaluate the predictions and print a concise summary to stderr.
evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
print >> sys.stderr, evaluation.toStringConcise()
if options.output != None:
    evaluation.saveCSV(options.output + "/results.csv")

# Write the classifications as interaction XML; the class set is only
# passed along when the example builder uses the "typed" style.
if options.resultsToXML != None:
    classSet = exampleBuilder.classSet if "typed" in exampleBuilder.styles else None
    Example.writeToInteractionXML(
        evaluation.classifications,
        testCorpusElements,
        options.resultsToXML,
        classSet)

# Disabled: comparison against a binary corpus.
# if options.binaryCorpus != None:
#     compareToBinary(corpusElements.sentencesById, predictions, exampleBuilder, options)

# Visualization: tag examples of set 0 as "train" and set 1 as "test".
if options.visualization != None:
    for setKey, setLabel in ((0, "train"), (1, "test")):
        for example in exampleSets[setKey]:
            example[3]["visualizationSet"] = setLabel
            # Disabled per-sentence variant:
            # corpusElements.sentencesById[example[0].rsplit(".",1)[0]].sentenceGraph.visualizationSet = setLabel
    if len(testSentences) > 0:
        visualize(
            testSentences,
            evaluation.classifications,
            options,
            exampleBuilder)
def crossValidate(exampleBuilder, corpusElements, examples, options, timer):
    """Cross-validate a classifier on `examples`, fold by fold.

    Folds come from Example.makeCorpusFolds / Example.divideExamples. Each
    fold gets its own Classifier whose parameters are optimized, which is
    then trained with the best parameters and evaluated on the fold's test
    set. The optimization data is chosen as follows:

    * options.paramOptData is set: a constant optimization set is split off
      first and its documents are deleted from corpusElements.documentsById.
    * options.folds[1] == 0: the fold before the test fold (wrapping around)
      is the optimization set and is left out of the training data.
    * otherwise: the training data is divided into options.folds[1] >= 2
      sets, all of which are passed to classifier.optimize.

    Parameters:
        exampleBuilder: object whose featureSet, classSet and styles
            attributes are read.
        corpusElements: corpus object passed to the Example helpers; its
            documentsById mapping is mutated when options.paramOptData is set.
        examples: the examples to divide; example[3] must be a dict (the
            "visualizationSet" key is written to it).
        options: object with attributes paramOptData, folds, output,
            parameters, visualization and resultsToXML.
        timer: object whose toString() is printed after every fold.

    Returns:
        None. Progress goes to sys.stderr; result files are written below
        options.output when it is not None.
    """
    parameterOptimizationSet = None
    constantParameterOptimizationSet = None
    if options.paramOptData is not None:
        print >> sys.stderr, "Separating parameter optimization set"
        parameterOptimizationDivision = Example.makeCorpusDivision(
            corpusElements, float(options.paramOptData))
        exampleSets = Example.divideExamples(examples, parameterOptimizationDivision)
        constantParameterOptimizationSet = exampleSets[0]
        parameterOptimizationSet = constantParameterOptimizationSet
        # Division 0 holds the optimization documents; drop them from the
        # corpus so the folds below are built without them.
        optDocs = 0
        for k, v in parameterOptimizationDivision.iteritems():
            if v == 0:
                del corpusElements.documentsById[k]
                optDocs += 1
        print >> sys.stderr, " Documents for parameter optimization:", optDocs
    discardedParameterCombinations = []

    print >> sys.stderr, "Dividing data into folds"
    corpusFolds = Example.makeCorpusFolds(corpusElements, options.folds[0])
    exampleSets = Example.divideExamples(examples, corpusFolds)

    keys = sorted(exampleSets.keys())
    evaluations = []
    for foldIndex, key in enumerate(keys):
        foldName = "fold" + str(key + 1)
        testSet = exampleSets[key]
        for example in testSet:
            example[3]["visualizationSet"] = key + 1
        trainSet = []
        for key2 in keys:
            if key2 != key:
                trainSet.extend(exampleSets[key2])
        print >> sys.stderr, "Fold", str(key + 1)

        # Create classifier object
        if options.output is not None:
            foldDir = options.output + "/" + foldName
            if not os.path.exists(foldDir):
                os.mkdir(foldDir)
            classifier = Classifier(workDir=foldDir)
        else:
            classifier = Classifier()
        classifier.featureSet = exampleBuilder.featureSet

        # Optimize
        # With neither an external optimization set nor nested folds, the
        # previous fold serves as the optimization set ("8-1-1 folds").
        if parameterOptimizationSet is None and options.folds[1] == 0:
            assert len(keys) > 1
            # For the first fold, index -1 selects the last fold.
            parameterOptimizationSetKey = keys[foldIndex - 1]
            parameterOptimizationSet = exampleSets[parameterOptimizationSetKey]
            trainSet = []
            for key2 in keys:
                if key2 != key and key2 != parameterOptimizationSetKey:
                    trainSet.extend(exampleSets[key2])

        evaluationArgs = {"classSet": exampleBuilder.classSet}
        if parameterOptimizationSet is not None:
            # One train set evaluated against a separate optimization set.
            optimizationTrainSets = [trainSet]
            optimizationTestSets = [parameterOptimizationSet]
        else:
            # Nested x-fold parameter optimization inside the training data.
            assert options.folds[1] >= 2
            optimizationFolds = Example.makeExampleFolds(trainSet, options.folds[1])
            optimizationSets = Example.divideExamples(trainSet, optimizationFolds)
            optimizationSetList = [optimizationSets[optSetKey]
                                   for optSetKey in sorted(optimizationSets.keys())]
            optimizationTrainSets = optimizationSetList
            optimizationTestSets = optimizationSetList
        if options.parameters is not None:
            paramDict = splitParameters(options.parameters)
            bestResults = classifier.optimize(
                optimizationTrainSets, optimizationTestSets, paramDict,
                Evaluation, evaluationArgs,
                combinationsThatTimedOut=discardedParameterCombinations)
        else:
            bestResults = classifier.optimize(
                optimizationTrainSets, optimizationTestSets,
                evaluationClass=Evaluation, evaluationArgs=evaluationArgs,
                combinationsThatTimedOut=discardedParameterCombinations)

        # Classify
        print >> sys.stderr, "Classifying test data"
        bestParams = bestResults[2]
        # "timeout" is removed before the parameters are passed to train().
        if "timeout" in bestParams:
            del bestParams["timeout"]
        print >> sys.stderr, "Parameters:", bestParams
        print >> sys.stderr, "Training",
        startTime = time.time()
        classifier.train(trainSet, bestParams)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        print >> sys.stderr, "Testing",
        startTime = time.time()
        predictions = classifier.classify(testSet)
        if options.output is not None:
            typed = "typed" in exampleBuilder.styles
            fieldnames = ["class", "prediction", "id", "fold"]
            pdict = []
            for p in predictions:
                if typed:
                    pdict.append({"class": exampleBuilder.classSet.getName(p[0][1]),
                                  "prediction": exampleBuilder.classSet.getName(p[1]),
                                  "id": p[0][0],
                                  "fold": key})
                else:
                    pdict.append({"class": p[0][1],
                                  "prediction": p[1],
                                  "id": p[0][0],
                                  "fold": key})
            TableUtils.addToCSV(pdict, options.output + "/predictions.csv", fieldnames)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"

        # Calculate statistics
        evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
        print >> sys.stderr, evaluation.toStringConcise()
        print >> sys.stderr, timer.toString()
        evaluations.append(evaluation)

        # Save example sets
        if options.output is not None:
            foldDir = options.output + "/" + foldName
            print >> sys.stderr, "Saving example sets to", options.output
            # Write this fold's own test and train sets; indexing exampleSets
            # with fixed keys 0 and 1 would save the same data for every fold.
            Example.writeExamples(testSet, foldDir + "/examplesTest.txt")
            Example.writeExamples(trainSet, foldDir + "/examplesTrain.txt")
            if parameterOptimizationSet is None:
                for k, v in optimizationSets.iteritems():
                    Example.writeExamples(
                        v, foldDir + "/examplesOptimizationSet" + str(k) + ".txt")
            else:
                Example.writeExamples(
                    parameterOptimizationSet,
                    foldDir + "/examplesOptimizationSetPredefined.txt")
            TableUtils.writeCSV(bestResults[2], foldDir + "/parameters.csv")
            evaluation.saveCSV(foldDir + "/results.csv")
            print >> sys.stderr, "Compressing folder"
            zipTree(options.output, foldName)
        # Restore the constant set (or None) so a fold-specific optimization
        # set is not reused by the next fold.
        parameterOptimizationSet = constantParameterOptimizationSet

    print >> sys.stderr, "Cross-validation Results"
    for i, foldEvaluation in enumerate(evaluations):
        print >> sys.stderr, foldEvaluation.toStringConcise(" Fold " + str(i) + ": ")
    averageResult = Evaluation.average(evaluations)
    print >> sys.stderr, averageResult.toStringConcise(" Avg: ")
    pooledResult = Evaluation.pool(evaluations)
    print >> sys.stderr, pooledResult.toStringConcise(" Pool: ")
    if options.output is not None:
        for i, foldEvaluation in enumerate(evaluations):
            foldEvaluation.saveCSV(options.output + "/results.csv", i)
        averageResult.saveCSV(options.output + "/results.csv", "Avg")
        pooledResult.saveCSV(options.output + "/results.csv", "Pool")
        averageResult.saveCSV(options.output + "/resultsAverage.csv")
        pooledResult.saveCSV(options.output + "/resultsPooled.csv")

    # Visualize
    if options.visualization is not None:
        # NOTE(review): `sentences` is not defined in this function;
        # presumably a module-level global set by the caller -- confirm.
        visualize(sentences, pooledResult.classifications, options, exampleBuilder)

    # Save interactionXML
    if options.resultsToXML is not None:
        classSet = None
        if "typed" in exampleBuilder.styles:
            classSet = exampleBuilder.classSet
        Example.writeToInteractionXML(pooledResult.classifications, corpusElements,
                                      options.resultsToXML, classSet)
# Classify example set 1 with the parameters stored in bestResults[2] and
# report the elapsed time on stderr.
startTime = time.time()
predictions = classifier.classify(
    exampleSets[1],
    bestResults[2])
print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"

# Evaluate the predictions and print a concise summary to stderr.
evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
print >> sys.stderr, evaluation.toStringConcise()
if options.output != None:
    evaluation.saveCSV(options.output + "/results.csv")

# Write the classifications as interaction XML; the class set is only
# passed along when the example builder uses the "typed" style.
if options.resultsToXML != None:
    classSet = exampleBuilder.classSet if "typed" in exampleBuilder.styles else None
    Example.writeToInteractionXML(
        evaluation.classifications,
        testCorpusElements,
        options.resultsToXML,
        classSet)

# Disabled: visualization of the train/test example sets.
# if options.visualization != None:
#     for example in exampleSets[0]:
#         example[3]["visualizationSet"] = "train"
#         #corpusElements.sentencesById[example[0].rsplit(".",1)[0]].sentenceGraph.visualizationSet = "train"
#     for example in exampleSets[1]:
#         example[3]["visualizationSet"] = "test"
#         #corpusElements.sentencesById[example[0].rsplit(".",1)[0]].sentenceGraph.visualizationSet = "test"
#     if len(testSentences) > 0:
#         visualize(testSentences, evaluation.classifications, options, exampleBuilder)
#     else:
#         visualize(sentences, evaluation.classifications, options, exampleBuilder)
# Disabled steps (kept for reference):
# Gazetteer.run(TRAIN_FILE, "gazetteer-train")
# GeneralEntityTypeRecognizer.run(TRAIN_FILE, "trigger-train-examples", PARSE_TOK, PARSE_TOK, "style:typed", "trigger-ids")
GeneralEntityTypeRecognizer.run(
    TEST_FILE,
    "trigger-test-examples",
    PARSE_TOK,
    PARSE_TOK,
    "style:typed",
    "trigger-ids")
Cls.test(
    "trigger-test-examples",
    "trigger-param-opt/model-c_75000",
    "trigger-test-classifications")
evaluator = Ev.evaluate(
    "trigger-test-examples",
    "trigger-test-classifications",
    "trigger-ids.class_names")
# Disabled alternative that obtains the evaluator via optimize():
# evaluator = optimize(Cls, Ev, "trigger-train-examples", "trigger-test-examples",\
#     "trigger-ids.class_names", CLASSIFIER_PARAMS, "trigger-param-opt")[0]
ExampleUtils.writeToInteractionXML(
    evaluator.classifications,
    TEST_FILE,
    "test-predicted-triggers.xml",
    "trigger-ids.class_names",
    PARSE_TOK,
    PARSE_TOK)

# Disabled variant of the steps below that first runs RecallAdjust:
# RecallAdjust.run("test-predicted-triggers.xml",1.0,"test-predicted-triggers-adj.xml")
# ix.splitMergedElements("test-predicted-triggers-adj.xml", "test-predicted-triggers-adj-split.xml")
# ix.recalculateIds("test-predicted-triggers-adj-split.xml", "test-predicted-triggers-adj-split-recids.xml", True)
# EvaluateInteractionXML.run(Ev, "test-predicted-triggers-adj-split-recids.xml", GOLD_TEST_FILE, PARSE_TOK, PARSE_TOK)

# Each step reads the file written by the previous one.
ix.splitMergedElements(
    "test-predicted-triggers.xml",
    "test-predicted-triggers-split.xml")
ix.recalculateIds(
    "test-predicted-triggers-split.xml",
    "test-predicted-triggers-split-recids.xml",
    True)
EvaluateInteractionXML.run(
    Ev,
    "test-predicted-triggers-split-recids.xml",
    GOLD_TEST_FILE,
    PARSE_TOK,
    PARSE_TOK)