def saveCSV(self, filename, fold=None):
    """Append this evaluation's results to a CSV file.

    One row per dictionary returned by self.toDict() is appended to
    *filename* using the module-level g_evaluatorFieldnames columns.

    filename -- path of the CSV file to append to
    fold     -- optional fold identifier; when given, it is stored in a
                "fold" column of every row
    """
    import sys
    sys.path.append("..")
    import Utils.TableUtils as TableUtils
    dicts = self.toDict()
    if fold is not None:  # identity test instead of '!= None'
        for d in dicts:
            d["fold"] = fold
    TableUtils.addToCSV(dicts, filename, g_evaluatorFieldnames)
def saveCSV(self, filename, fold=None):
    """Write this evaluation's results to a CSV file.

    Unlike the append-based variant, this overwrites *filename* with a
    fresh CSV including a title row (TableUtils.writeCSV with
    writeTitles=True), using the module-level g_evaluatorFieldnames
    columns.

    filename -- path of the CSV file to write
    fold     -- optional fold identifier; when given, it is stored in a
                "fold" column of every row
    """
    # Note: g_evaluatorFieldnames is only read, so no 'global'
    # declaration is needed.
    import sys
    sys.path.append("..")
    import Utils.TableUtils as TableUtils
    dicts = self.toDict()
    if fold is not None:
        for d in dicts:
            d["fold"] = fold
    TableUtils.writeCSV(dicts, filename, g_evaluatorFieldnames, writeTitles=True)
def analyzeLinearDistance(corpusElements):
    """Analyze the linear (token position) distance between entity head tokens.

    Over all sentences in *corpusElements*, counts the absolute
    difference of head-token positions both for annotated interaction
    edges and for every pair of entities. The distributions are printed
    to stderr and, when options.output is set, appended to CSV files.

    Relies on module-level 'options', 'TableUtils', 'sys' and
    'printPathDistribution'.
    """
    def tokenIndex(token):
        # Token ids end in "_<index>"; the last component is the linear
        # position of the token within the sentence.
        return int(token.get("id").split("_")[-1])

    interactionEdges = 0
    interactionLinearDistanceCounts = {}
    allEntitiesLinearDistanceCounts = {}
    for sentence in corpusElements.sentences:
        sentenceGraph = sentence.sentenceGraph
        interactionEdges += len(sentence.interactions)
        # Linear distance between end tokens of interaction edges
        for interaction in sentence.interactions:
            e1 = sentence.entitiesById[interaction.get("e1")]
            e2 = sentence.entitiesById[interaction.get("e2")]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            linDistance = abs(tokenIndex(t1) - tokenIndex(t2))
            if linDistance not in interactionLinearDistanceCounts:
                interactionLinearDistanceCounts[linDistance] = 0
            interactionLinearDistanceCounts[linDistance] += 1
        # Linear distance between all entity pairs
        for i in range(len(sentence.entities) - 1):
            for j in range(i + 1, len(sentence.entities)):
                tI = sentenceGraph.entityHeadTokenByEntity[sentence.entities[i]]
                tJ = sentenceGraph.entityHeadTokenByEntity[sentence.entities[j]]
                linDistance = abs(tokenIndex(tI) - tokenIndex(tJ))
                if linDistance not in allEntitiesLinearDistanceCounts:
                    allEntitiesLinearDistanceCounts[linDistance] = 0
                allEntitiesLinearDistanceCounts[linDistance] += 1
    print >> sys.stderr, "=== Linear Distance ==="
    print >> sys.stderr, "Interaction edges:", interactionEdges
    print >> sys.stderr, "Entity head token linear distance for interaction edges:"
    printPathDistribution(interactionLinearDistanceCounts)
    if options.output is not None:
        interactionLinearDistanceCounts["corpus"] = options.input
        interactionLinearDistanceCounts["parse"] = options.parse
        TableUtils.addToCSV(
            interactionLinearDistanceCounts,
            options.output + "/interactionEdgeLinearDistance.csv")
    print >> sys.stderr, "Linear distance between head tokens of all entities:"
    printPathDistribution(allEntitiesLinearDistanceCounts)
    if options.output is not None:
        allEntitiesLinearDistanceCounts["corpus"] = options.input
        allEntitiesLinearDistanceCounts["parse"] = options.parse
        TableUtils.addToCSV(allEntitiesLinearDistanceCounts,
                            options.output + "/allEntitiesLinearDistance.csv")
def resultsToCSV(results, filename=None):
    """Flatten a nested results dictionary into a list of CSV rows.

    results  -- mapping: evaluation name -> event class -> metric -> value
    filename -- when given, the rows are also written to this CSV file
                with a fixed column order (via TableUtils.writeCSV)

    Returns the list of row dictionaries; each row has "eval" and
    "event_class" keys plus one key per metric. Rows are ordered by
    sorted evaluation name, then sorted event class.
    """
    rows = []
    for evalName in sorted(results.keys()):
        for eventClass in sorted(results[evalName].keys()):
            # Build the whole row first, then append once.
            row = {"eval": evalName, "event_class": eventClass}
            for metric in sorted(results[evalName][eventClass].keys()):
                row[metric] = results[evalName][eventClass][metric]
            rows.append(row)
    if filename is not None:
        fieldnames = ["eval", "event_class", "gold", "gold_match",
                      "answer", "answer_match", "recall", "precision",
                      "fscore"]
        TableUtils.writeCSV(rows, filename, fieldnames)
    return rows
def analyzeLinearDistance(corpusElements):
    """Analyze linear token-index distance between entity head tokens.

    Counts the absolute difference of head-token positions for (a) the
    annotated interaction edges and (b) all pairs of entities in each
    sentence, prints both distributions to stderr and optionally appends
    them to CSV files under options.output.

    Uses the module-level 'options', 'TableUtils', 'sys' and
    'printPathDistribution'.
    """
    def headIndex(token):
        # The linear position is the last "_"-separated component of the
        # token id.
        return int(token.get("id").split("_")[-1])

    interactionEdges = 0
    interactionLinearDistanceCounts = {}
    allEntitiesLinearDistanceCounts = {}
    for sentence in corpusElements.sentences:
        sentenceGraph = sentence.sentenceGraph
        interactionEdges += len(sentence.interactions)
        # Linear distance between end tokens of interaction edges
        for interaction in sentence.interactions:
            e1 = sentence.entitiesById[interaction.get("e1")]
            e2 = sentence.entitiesById[interaction.get("e2")]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            linDistance = abs(headIndex(t1) - headIndex(t2))
            if linDistance not in interactionLinearDistanceCounts:
                interactionLinearDistanceCounts[linDistance] = 0
            interactionLinearDistanceCounts[linDistance] += 1
        # Linear distance between all entities
        for i in range(len(sentence.entities) - 1):
            for j in range(i + 1, len(sentence.entities)):
                tI = sentenceGraph.entityHeadTokenByEntity[sentence.entities[i]]
                tJ = sentenceGraph.entityHeadTokenByEntity[sentence.entities[j]]
                linDistance = abs(headIndex(tI) - headIndex(tJ))
                if linDistance not in allEntitiesLinearDistanceCounts:
                    allEntitiesLinearDistanceCounts[linDistance] = 0
                allEntitiesLinearDistanceCounts[linDistance] += 1
    print >> sys.stderr, "=== Linear Distance ==="
    print >> sys.stderr, "Interaction edges:", interactionEdges
    print >> sys.stderr, "Entity head token linear distance for interaction edges:"
    printPathDistribution(interactionLinearDistanceCounts)
    if options.output is not None:
        interactionLinearDistanceCounts["corpus"] = options.input
        interactionLinearDistanceCounts["parse"] = options.parse
        TableUtils.addToCSV(interactionLinearDistanceCounts, options.output + "/interactionEdgeLinearDistance.csv")
    print >> sys.stderr, "Linear distance between head tokens of all entities:"
    printPathDistribution(allEntitiesLinearDistanceCounts)
    if options.output is not None:
        allEntitiesLinearDistanceCounts["corpus"] = options.input
        allEntitiesLinearDistanceCounts["parse"] = options.parse
        TableUtils.addToCSV(allEntitiesLinearDistanceCounts, options.output + "/allEntitiesLinearDistance.csv")
def crossValidate(exampleBuilder, corpusElements, examples, options, timer):
    """Run n-fold cross-validation over the given examples.

    The examples are divided into options.folds[0] corpus folds. Each
    fold in turn is the test set while the remaining folds form the
    training set. Parameters are optimized per fold by one of three
    schemes: a constant held-out set separated via options.paramOptData,
    the preceding fold when options.folds[1] == 0 (the "8-1-1" scheme),
    or a nested options.folds[1]-fold split of the training data.
    Per-fold evaluations plus their average and pool are printed to
    stderr; when options.output is set, predictions, example sets,
    parameters and results are also written under per-fold directories.

    NOTE(review): relies on module-level names (Example, Classifier,
    Evaluation, TableUtils, splitParameters, zipTree, visualize,
    sentences, os, sys, time) -- confirm against the full file.
    """
    parameterOptimizationSet = None
    constantParameterOptimizationSet = None
    if options.paramOptData != None:
        # Separate a constant parameter-optimization set; documents used
        # for it are deleted from the corpus so they never enter a fold.
        print >> sys.stderr, "Separating parameter optimization set"
        parameterOptimizationDivision = Example.makeCorpusDivision(corpusElements, float(options.paramOptData))
        exampleSets = Example.divideExamples(examples, parameterOptimizationDivision)
        constantParameterOptimizationSet = exampleSets[0]
        parameterOptimizationSet = constantParameterOptimizationSet
        optDocs = 0
        for k,v in parameterOptimizationDivision.iteritems():
            if v == 0:
                del corpusElements.documentsById[k]
                optDocs += 1
        print >> sys.stderr, " Documents for parameter optimization:", optDocs
    discardedParameterCombinations = []
    print >> sys.stderr, "Dividing data into folds"
    corpusFolds = Example.makeCorpusFolds(corpusElements, options.folds[0])
    exampleSets = Example.divideExamples(examples, corpusFolds)
    keys = exampleSets.keys()
    keys.sort()
    evaluations = []
    for key in keys:
        # Current fold is the test set; all other folds are the training set.
        testSet = exampleSets[key]
        for example in testSet:
            example[3]["visualizationSet"] = key + 1
        trainSet = []
        for key2 in keys:
            if key != key2:
                trainSet.extend(exampleSets[key2])
        print >> sys.stderr, "Fold", str(key + 1)
        # Create classifier object (per-fold work directory when output
        # is requested)
        if options.output != None:
            if not os.path.exists(options.output+"/fold"+str(key+1)):
                os.mkdir(options.output+"/fold"+str(key+1))
#            if not os.path.exists(options.output+"/fold"+str(key+1)+"/classifier"):
#                os.mkdir(options.output+"/fold"+str(key+1)+"/classifier")
            classifier = Classifier(workDir = options.output + "/fold"+str(key + 1))
        else:
            classifier = Classifier()
        classifier.featureSet = exampleBuilder.featureSet
        # Optimize ####################
        # Check whether there is need for included param opt set
        if parameterOptimizationSet == None and options.folds[1] == 0: # 8-1-1 folds
            # Use the fold preceding the test fold (wrapping around) for
            # parameter optimization and rebuild the training set
            # without it.
            assert(len(keys) > 1)
            if keys.index(key) == 0:
                parameterOptimizationSetKey = keys[-1]
            else:
                parameterOptimizationSetKey = keys[keys.index(key)-1]
            parameterOptimizationSet = exampleSets[parameterOptimizationSetKey]
            trainSet = []
            for key2 in keys:
                if key2 != key and key2 != parameterOptimizationSetKey:
                    trainSet.extend(exampleSets[key2])
        if parameterOptimizationSet != None: # constant external parameter optimization set
            evaluationArgs = {"classSet":exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize([trainSet], [parameterOptimizationSet], paramDict, Evaluation, evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize([trainSet], [parameterOptimizationSet], evaluationClass=Evaluation, evaluationArgs=evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
        else: # nested x-fold parameter optimization
            assert (options.folds[1] >= 2)
            optimizationFolds = Example.makeExampleFolds(trainSet, options.folds[1])
            optimizationSets = Example.divideExamples(trainSet, optimizationFolds)
            optimizationSetList = []
            optSetKeys = optimizationSets.keys()
            optSetKeys.sort()
            for optSetKey in optSetKeys:
                optimizationSetList.append(optimizationSets[optSetKey])
            evaluationArgs = {"classSet":exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize(optimizationSetList, optimizationSetList, paramDict, Evaluation, evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize(optimizationSetList, optimizationSetList, evaluationClass=Evaluation, evaluationArgs=evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
        # Classify
        print >> sys.stderr, "Classifying test data"
        bestParams = bestResults[2]
        if bestParams.has_key("timeout"):
            # "timeout" is an optimization-time setting only; do not
            # pass it to the final training run.
            del bestParams["timeout"]
        print >> sys.stderr, "Parameters:", bestParams
        print >> sys.stderr, "Training",
        startTime = time.time()
        classifier.train(trainSet, bestParams)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        print >> sys.stderr, "Testing",
        startTime = time.time()
        predictions = classifier.classify(testSet)
        if options.output != None:
            # Append this fold's predictions to a cumulative CSV.
            pdict = []
            fieldnames = ["class","prediction","id","fold"]
            for p in predictions:
                if "typed" in exampleBuilder.styles:
                    pdict.append( {"class":exampleBuilder.classSet.getName(p[0][1]), "prediction":exampleBuilder.classSet.getName(p[1]), "id":p[0][0], "fold":key} )
                else:
                    pdict.append( {"class":p[0][1], "prediction":p[1], "id":p[0][0], "fold":key} )
            TableUtils.addToCSV(pdict, options.output +"/predictions.csv", fieldnames)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        # Calculate statistics
        evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
        print >> sys.stderr, evaluation.toStringConcise()
        print >> sys.stderr, timer.toString()
        evaluations.append(evaluation)
        # Save example sets
        if options.output != None:
            print >> sys.stderr, "Saving example sets to", options.output
            # NOTE(review): writes exampleSets[0] / exampleSets[1], not
            # this fold's testSet / trainSet -- confirm this is intended.
            Example.writeExamples(exampleSets[0], options.output +"/fold"+str(key+1) + "/examplesTest.txt")
            Example.writeExamples(exampleSets[1], options.output +"/fold"+str(key+1) + "/examplesTrain.txt")
            if parameterOptimizationSet == None:
                for k,v in optimizationSets.iteritems():
                    Example.writeExamples(v, options.output +"/fold"+str(key+1) + "/examplesOptimizationSet" + str(k) + ".txt")
            else:
                Example.writeExamples(parameterOptimizationSet, options.output +"/fold"+str(key+1) + "/examplesOptimizationSetPredefined.txt")
            TableUtils.writeCSV(bestResults[2], options.output +"/fold"+str(key+1) + "/parameters.csv")
            evaluation.saveCSV(options.output +"/fold"+str(key+1) + "/results.csv")
            print >> sys.stderr, "Compressing folder"
            zipTree(options.output, "fold"+str(key+1))
        # Restore the constant optimization set (None unless
        # options.paramOptData was given) before the next fold.
        parameterOptimizationSet = constantParameterOptimizationSet
    print >> sys.stderr, "Cross-validation Results"
    for i in range(len(evaluations)):
        print >> sys.stderr, evaluations[i].toStringConcise(" Fold "+str(i)+": ")
    averageResult = Evaluation.average(evaluations)
    print >> sys.stderr, averageResult.toStringConcise(" Avg: ")
    pooledResult = Evaluation.pool(evaluations)
    print >> sys.stderr, pooledResult.toStringConcise(" Pool: ")
    if options.output != None:
        for i in range(len(evaluations)):
            evaluations[i].saveCSV(options.output+"/results.csv", i)
        averageResult.saveCSV(options.output+"/results.csv", "Avg")
        pooledResult.saveCSV(options.output+"/results.csv", "Pool")
        averageResult.saveCSV(options.output+"/resultsAverage.csv")
        pooledResult.saveCSV(options.output+"/resultsPooled.csv")
    # Visualize
    if options.visualization != None:
        visualize(sentences, pooledResult.classifications, options, exampleBuilder)
    # Save interactionXML
    if options.resultsToXML != None:
        classSet = None
        if "typed" in exampleBuilder.styles:
            classSet = exampleBuilder.classSet
        Example.writeToInteractionXML(pooledResult.classifications, corpusElements, options.resultsToXML, classSet)
# Script entry point: read a CSV of predictions and evaluate it with the
# evaluator class chosen on the command line.
try:
    import psyco
    psyco.full()
    print >> sys.stderr, "Found Psyco, using"
except ImportError:
    print >> sys.stderr, "Psyco not installed"

sys.path.append("..")
from Utils.ProgressCounter import ProgressCounter
from Utils.Parameters import splitParameters
from optparse import OptionParser
import Core.ExampleUtils as ExampleUtils
from Core.IdSet import IdSet
import Utils.TableUtils as TableUtils

optparser = OptionParser(usage="%prog [options]\nCalculate f-score and other statistics.")
optparser.add_option("-i", "--input", default=None, dest="input", help="Input file in csv-format", metavar="FILE")
optparser.add_option("-o", "--output", default=None, dest="output", help="Output file for the statistics")
optparser.add_option("-e", "--evaluator", default="BinaryEvaluator", dest="evaluator", help="Prediction evaluator class")
(options, args) = optparser.parse_args()

print >> sys.stderr, "Importing modules"
# Import the requested evaluator class by name. importlib avoids
# building and exec'ing a code string from a command-line argument
# (which allowed arbitrary code injection via --evaluator).
import importlib
EvaluatorClass = getattr(importlib.import_module("Evaluators." + options.evaluator), options.evaluator)

if options.output != None:
    # Start from a clean output file; only announce removal when the
    # file actually exists (the message used to print unconditionally).
    if os.path.exists(options.output):
        print >> sys.stderr, "Outputfile exists, removing", options.output
        os.remove(options.output)

# Read input data
fieldnames = ["class","prediction","id","fold"]
rows = TableUtils.readCSV(options.input, fieldnames)
evaluateCSV(rows, options, EvaluatorClass)
classNameDict[classId] = className classNameFile.close() #classSet = IdSet(idDict=classNameDict, locked=True) if options.output != None: print >> sys.stderr, "Outputfile exists, removing", options.output if os.path.exists(options.output): os.remove(options.output) print >> sys.stderr, "Importing modules" exec "from Evaluators." + options.evaluator + " import " + options.evaluator + " as EvaluatorClass" fieldnames = ["class","prediction","id","fold","c"] # Find best c-parameter from parameter estimation data print >> sys.stderr, "Finding optimal c-parameters from", options.parameters rows = TableUtils.readCSV(options.parameters, fieldnames) folds = sorted(list(TableUtils.getValueSet(rows, "fold"))) cParameterByFold = {} for fold in folds: print >> sys.stderr, " Processing fold", fold foldRows = TableUtils.selectRowsCSV(rows, {"fold":fold}) cParameters = sorted(list(TableUtils.getValueSet(foldRows, "c"))) evaluators = [] cParameterByEvaluator = {} for cParameter in cParameters: print >> sys.stderr, " Processing c-parameter", cParameter, paramRows = TableUtils.selectRowsCSV(foldRows, {"c":cParameter}) evaluator = Evaluator.calculateFromCSV(paramRows, EvaluatorClass) #print evaluator.toStringConcise() cParameterByEvaluator[evaluator] = cParameter evaluators.append(evaluator)
bestResults[2][k] = v featureSet = IdSet() featureSet.load(os.path.join(classifierParamDict["predefined"][0], "feature_names.txt")) classSet = None if os.path.exists(os.path.join(classifierParamDict["predefined"][0], "class_names.txt")): classSet = IdSet() classSet.load(os.path.join(classifierParamDict["predefined"][0], "class_names.txt")) exampleBuilder = ExampleBuilder(featureSet=featureSet, classSet=classSet, **splitParameters(options.exampleBuilderParameters)) # Save training sets if options.output != None: print >> sys.stderr, "Saving example sets to", options.output Example.writeExamples(exampleSets[0], options.output + "/examplesTrain.txt") if not classifierParamDict.has_key("predefined"): Example.writeExamples(optimizationSets[0], options.output + "/examplesOptimizationTest.txt") Example.writeExamples(optimizationSets[1], options.output + "/examplesOptimizationTrain.txt") TableUtils.writeCSV(bestResults[2], options.output +"/best_parameters.csv") # Optimize and train if options.output != None: classifier = Classifier(workDir = options.output + "/classifier") else: classifier = Classifier() classifier.featureSet = exampleBuilder.featureSet if hasattr(exampleBuilder,"classSet"): classifier.classSet = exampleBuilder.classSet print >> sys.stderr, "Classifying test data" if bestResults[2].has_key("timeout"): del bestResults[2]["timeout"] print >> sys.stderr, "Parameters:", bestResults[2] print >> sys.stderr, "Training", startTime = time.time()
def crossValidate(exampleBuilder, corpusElements, examples, options, timer):
    """Run n-fold cross-validation over the given examples.

    The examples are divided into options.folds[0] corpus folds; each
    fold in turn is the test set and the rest the training set.
    Parameters are optimized per fold via one of three schemes: a
    constant held-out set (options.paramOptData), the preceding fold
    when options.folds[1] == 0 ("8-1-1"), or a nested
    options.folds[1]-fold split of the training data. Per-fold,
    averaged and pooled evaluations go to stderr; with options.output
    set, predictions, example sets, parameters and results are written
    to per-fold directories as well.

    NOTE(review): depends on module-level names (Example, Classifier,
    Evaluation, TableUtils, splitParameters, zipTree, visualize,
    sentences, os, sys, time) -- confirm against the full file.
    """
    parameterOptimizationSet = None
    constantParameterOptimizationSet = None
    if options.paramOptData != None:
        # Separate a constant parameter-optimization set; its documents
        # are removed from the corpus so they never enter a fold.
        print >> sys.stderr, "Separating parameter optimization set"
        parameterOptimizationDivision = Example.makeCorpusDivision(
            corpusElements, float(options.paramOptData))
        exampleSets = Example.divideExamples(examples,
                                             parameterOptimizationDivision)
        constantParameterOptimizationSet = exampleSets[0]
        parameterOptimizationSet = constantParameterOptimizationSet
        optDocs = 0
        for k, v in parameterOptimizationDivision.iteritems():
            if v == 0:
                del corpusElements.documentsById[k]
                optDocs += 1
        print >> sys.stderr, " Documents for parameter optimization:", optDocs
    discardedParameterCombinations = []
    print >> sys.stderr, "Dividing data into folds"
    corpusFolds = Example.makeCorpusFolds(corpusElements, options.folds[0])
    exampleSets = Example.divideExamples(examples, corpusFolds)
    keys = exampleSets.keys()
    keys.sort()
    evaluations = []
    for key in keys:
        # Current fold is the test set; all other folds are training data.
        testSet = exampleSets[key]
        for example in testSet:
            example[3]["visualizationSet"] = key + 1
        trainSet = []
        for key2 in keys:
            if key != key2:
                trainSet.extend(exampleSets[key2])
        print >> sys.stderr, "Fold", str(key + 1)
        # Create classifier object (per-fold work directory when output
        # is requested)
        if options.output != None:
            if not os.path.exists(options.output + "/fold" + str(key + 1)):
                os.mkdir(options.output + "/fold" + str(key + 1))
#            if not os.path.exists(options.output+"/fold"+str(key+1)+"/classifier"):
#                os.mkdir(options.output+"/fold"+str(key+1)+"/classifier")
            classifier = Classifier(workDir=options.output + "/fold" +
                                    str(key + 1))
        else:
            classifier = Classifier()
        classifier.featureSet = exampleBuilder.featureSet
        # Optimize ####################
        # Check whether there is need for included param opt set
        if parameterOptimizationSet == None and options.folds[
                1] == 0:  # 8-1-1 folds
            # Use the fold preceding the test fold (wrapping around) for
            # parameter optimization; rebuild the training set without it.
            assert (len(keys) > 1)
            if keys.index(key) == 0:
                parameterOptimizationSetKey = keys[-1]
            else:
                parameterOptimizationSetKey = keys[keys.index(key) - 1]
            parameterOptimizationSet = exampleSets[parameterOptimizationSetKey]
            trainSet = []
            for key2 in keys:
                if key2 != key and key2 != parameterOptimizationSetKey:
                    trainSet.extend(exampleSets[key2])
        if parameterOptimizationSet != None:  # constant external parameter optimization set
            evaluationArgs = {"classSet": exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize(
                    [trainSet], [parameterOptimizationSet], paramDict,
                    Evaluation, evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize(
                    [trainSet], [parameterOptimizationSet],
                    evaluationClass=Evaluation,
                    evaluationArgs=evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)
        else:  # nested x-fold parameter optimization
            assert (options.folds[1] >= 2)
            optimizationFolds = Example.makeExampleFolds(
                trainSet, options.folds[1])
            optimizationSets = Example.divideExamples(trainSet,
                                                      optimizationFolds)
            optimizationSetList = []
            optSetKeys = optimizationSets.keys()
            optSetKeys.sort()
            for optSetKey in optSetKeys:
                optimizationSetList.append(optimizationSets[optSetKey])
            evaluationArgs = {"classSet": exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize(
                    optimizationSetList, optimizationSetList, paramDict,
                    Evaluation, evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize(
                    optimizationSetList, optimizationSetList,
                    evaluationClass=Evaluation,
                    evaluationArgs=evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)
        # Classify
        print >> sys.stderr, "Classifying test data"
        bestParams = bestResults[2]
        if bestParams.has_key("timeout"):
            # "timeout" only applies during optimization; drop it before
            # the final training run.
            del bestParams["timeout"]
        print >> sys.stderr, "Parameters:", bestParams
        print >> sys.stderr, "Training",
        startTime = time.time()
        classifier.train(trainSet, bestParams)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        print >> sys.stderr, "Testing",
        startTime = time.time()
        predictions = classifier.classify(testSet)
        if options.output != None:
            # Append this fold's predictions to a cumulative CSV.
            pdict = []
            fieldnames = ["class", "prediction", "id", "fold"]
            for p in predictions:
                if "typed" in exampleBuilder.styles:
                    pdict.append({
                        "class": exampleBuilder.classSet.getName(p[0][1]),
                        "prediction": exampleBuilder.classSet.getName(p[1]),
                        "id": p[0][0],
                        "fold": key
                    })
                else:
                    pdict.append({
                        "class": p[0][1],
                        "prediction": p[1],
                        "id": p[0][0],
                        "fold": key
                    })
            TableUtils.addToCSV(pdict, options.output + "/predictions.csv",
                                fieldnames)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        # Calculate statistics
        evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
        print >> sys.stderr, evaluation.toStringConcise()
        print >> sys.stderr, timer.toString()
        evaluations.append(evaluation)
        # Save example sets
        if options.output != None:
            print >> sys.stderr, "Saving example sets to", options.output
            # NOTE(review): writes exampleSets[0] / exampleSets[1], not
            # this fold's testSet / trainSet -- confirm this is intended.
            Example.writeExamples(
                exampleSets[0],
                options.output + "/fold" + str(key + 1) + "/examplesTest.txt")
            Example.writeExamples(
                exampleSets[1],
                options.output + "/fold" + str(key + 1) + "/examplesTrain.txt")
            if parameterOptimizationSet == None:
                for k, v in optimizationSets.iteritems():
                    Example.writeExamples(
                        v, options.output + "/fold" + str(key + 1) +
                        "/examplesOptimizationSet" + str(k) + ".txt")
            else:
                Example.writeExamples(
                    parameterOptimizationSet,
                    options.output + "/fold" + str(key + 1) +
                    "/examplesOptimizationSetPredefined.txt")
            TableUtils.writeCSV(
                bestResults[2],
                options.output + "/fold" + str(key + 1) + "/parameters.csv")
            evaluation.saveCSV(options.output + "/fold" + str(key + 1) +
                               "/results.csv")
            print >> sys.stderr, "Compressing folder"
            zipTree(options.output, "fold" + str(key + 1))
        # Restore the constant optimization set (None unless
        # options.paramOptData was given) before the next fold.
        parameterOptimizationSet = constantParameterOptimizationSet
    print >> sys.stderr, "Cross-validation Results"
    for i in range(len(evaluations)):
        print >> sys.stderr, evaluations[i].toStringConcise(" Fold " +
                                                            str(i) + ": ")
    averageResult = Evaluation.average(evaluations)
    print >> sys.stderr, averageResult.toStringConcise(" Avg: ")
    pooledResult = Evaluation.pool(evaluations)
    print >> sys.stderr, pooledResult.toStringConcise(" Pool: ")
    if options.output != None:
        for i in range(len(evaluations)):
            evaluations[i].saveCSV(options.output + "/results.csv", i)
        averageResult.saveCSV(options.output + "/results.csv", "Avg")
        pooledResult.saveCSV(options.output + "/results.csv", "Pool")
        averageResult.saveCSV(options.output + "/resultsAverage.csv")
        pooledResult.saveCSV(options.output + "/resultsPooled.csv")
    # Visualize
    if options.visualization != None:
        visualize(sentences, pooledResult.classifications, options,
                  exampleBuilder)
    # Save interactionXML
    if options.resultsToXML != None:
        classSet = None
        if "typed" in exampleBuilder.styles:
            classSet = exampleBuilder.classSet
        Example.writeToInteractionXML(pooledResult.classifications,
                                      corpusElements, options.resultsToXML,
                                      classSet)
"feature_names.txt")) classSet = None if os.path.exists( os.path.join(classifierParamDict["predefined"][0], "class_names.txt")): classSet = IdSet() classSet.load( os.path.join(classifierParamDict["predefined"][0], "class_names.txt")) exampleBuilder = ExampleBuilder(featureSet=featureSet, classSet=classSet, **splitParameters( options.exampleBuilderParameters)) # Save training sets if options.output != None: TableUtils.writeCSV(bestResults[2], options.output + "/best_parameters.csv") # Optimize and train if options.output != None: classifier = Classifier(workDir=options.output + "/classifier") else: classifier = Classifier() classifier.featureSet = exampleBuilder.featureSet if hasattr(exampleBuilder, "classSet"): classifier.classSet = exampleBuilder.classSet print >> sys.stderr, "Classifying test data" if bestResults[2].has_key("timeout"): del bestResults[2]["timeout"] print >> sys.stderr, "Parameters:", bestResults[2] print >> sys.stderr, "Training", startTime = time.time()
def analyzeLengths(corpusElements):
    """Analyze shortest dependency-path lengths between entity head tokens.

    For every sentence, computes all-pairs shortest paths over the
    undirected dependency graph and tallies path lengths (number of
    edges) for (a) annotated interaction edges and (b) all entity
    pairs; unreachable pairs are counted under the "none" key and
    entities sharing a head token under length 0. Distributions are
    printed to stderr and optionally appended to CSVs under
    options.output.

    Uses the module-level 'options', 'NX10', 'TableUtils', 'sys' and
    'printPathDistribution'.
    """
    interactionEdges = 0
    dependencyEdges = 0
    pathsByLength = {}
    pathsBetweenAllEntitiesByLength = {}
    for sentence in corpusElements.sentences:
        sentenceGraph = sentence.sentenceGraph
        interactionEdges += len(sentence.interactions)
        dependencyEdges += len(sentenceGraph.dependencyGraph.edges())
        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Shortest path for each annotated interaction edge
        for interaction in sentence.interactions:
            e1 = sentence.entitiesById[interaction.attrib["e1"]]
            e2 = sentence.entitiesById[interaction.attrib["e2"]]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            if t1 in paths and t2 in paths[t1]:
                length = len(paths[t1][t2]) - 1  # nodes -> edges
                pathsByLength[length] = pathsByLength.get(length, 0) + 1
            else:
                pathsByLength["none"] = pathsByLength.get("none", 0) + 1
        # Shortest paths between all entity pairs
        for i in range(len(sentence.entities) - 1):
            for j in range(i + 1, len(sentence.entities)):
                tI = sentenceGraph.entityHeadTokenByEntity[
                    sentence.entities[i]]
                tJ = sentenceGraph.entityHeadTokenByEntity[
                    sentence.entities[j]]
                if tI in paths and tJ in paths[tI]:
                    length = len(paths[tI][tJ]) - 1
                    pathsBetweenAllEntitiesByLength[length] = \
                        pathsBetweenAllEntitiesByLength.get(length, 0) + 1
                elif tI == tJ:
                    # Two entities sharing the same head token: distance 0
                    pathsBetweenAllEntitiesByLength[0] = \
                        pathsBetweenAllEntitiesByLength.get(0, 0) + 1
                else:
                    pathsBetweenAllEntitiesByLength["none"] = \
                        pathsBetweenAllEntitiesByLength.get("none", 0) + 1
    print >> sys.stderr, "Interaction edges:", interactionEdges
    print >> sys.stderr, "Dependency edges:", dependencyEdges
    print >> sys.stderr, "Shortest path of dependencies for interaction edge:"
    printPathDistribution(pathsByLength)
    if options.output != None:
        pathsByLength["corpus"] = options.input
        pathsByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsByLength,
                            options.output + "/pathsByLength.csv")
    print >> sys.stderr, "Shortest path of dependencies between all entities:"
    printPathDistribution(pathsBetweenAllEntitiesByLength)
    if options.output != None:
        # BUG FIX: the corpus/parse metadata was previously set on
        # pathsByLength here, so the second CSV lacked those columns.
        pathsBetweenAllEntitiesByLength["corpus"] = options.input
        pathsBetweenAllEntitiesByLength["parse"] = options.parse
        TableUtils.addToCSV(
            pathsBetweenAllEntitiesByLength,
            options.output + "/pathsBetweenAllEntitiesByLength.csv")
def analyzeLengths(corpusElements):
    """Analyze shortest dependency-path lengths between entity head tokens.

    Computes per-sentence all-pairs shortest paths over the undirected
    dependency graph and counts path lengths (in edges) for annotated
    interaction edges and for all entity pairs. Unreachable pairs go
    under the "none" key; entities with the same head token count as
    length 0. Prints the distributions to stderr and optionally appends
    them to CSVs under options.output.

    Uses the module-level 'options', 'NX10', 'TableUtils', 'sys' and
    'printPathDistribution'.
    """
    interactionEdges = 0
    dependencyEdges = 0
    pathsByLength = {}
    pathsBetweenAllEntitiesByLength = {}
    for sentence in corpusElements.sentences:
        sentenceGraph = sentence.sentenceGraph
        interactionEdges += len(sentence.interactions)
        dependencyEdges += len(sentenceGraph.dependencyGraph.edges())
        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Shortest path for each annotated interaction edge
        for interaction in sentence.interactions:
            e1 = sentence.entitiesById[interaction.attrib["e1"]]
            e2 = sentence.entitiesById[interaction.attrib["e2"]]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            if t1 in paths and t2 in paths[t1]:
                pathLen = len(paths[t1][t2]) - 1  # node count -> edge count
                pathsByLength[pathLen] = pathsByLength.get(pathLen, 0) + 1
            else:
                pathsByLength["none"] = pathsByLength.get("none", 0) + 1
        # Shortest paths between all entity pairs
        for i in range(len(sentence.entities) - 1):
            for j in range(i + 1, len(sentence.entities)):
                tI = sentenceGraph.entityHeadTokenByEntity[sentence.entities[i]]
                tJ = sentenceGraph.entityHeadTokenByEntity[sentence.entities[j]]
                if tI in paths and tJ in paths[tI]:
                    pathLen = len(paths[tI][tJ]) - 1
                    pathsBetweenAllEntitiesByLength[pathLen] = pathsBetweenAllEntitiesByLength.get(pathLen, 0) + 1
                elif tI == tJ:
                    # Same head token for both entities: distance 0
                    pathsBetweenAllEntitiesByLength[0] = pathsBetweenAllEntitiesByLength.get(0, 0) + 1
                else:
                    pathsBetweenAllEntitiesByLength["none"] = pathsBetweenAllEntitiesByLength.get("none", 0) + 1
    print >> sys.stderr, "Interaction edges:", interactionEdges
    print >> sys.stderr, "Dependency edges:", dependencyEdges
    print >> sys.stderr, "Shortest path of dependencies for interaction edge:"
    printPathDistribution(pathsByLength)
    if options.output != None:
        pathsByLength["corpus"] = options.input
        pathsByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsByLength, options.output + "/pathsByLength.csv")
    print >> sys.stderr, "Shortest path of dependencies between all entities:"
    printPathDistribution(pathsBetweenAllEntitiesByLength)
    if options.output != None:
        # BUG FIX: corpus/parse metadata used to be written onto
        # pathsByLength here, leaving this CSV without those columns.
        pathsBetweenAllEntitiesByLength["corpus"] = options.input
        pathsBetweenAllEntitiesByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsBetweenAllEntitiesByLength, options.output + "/pathsBetweenAllEntitiesByLength.csv")
def optimize(self, trainSets, classifySets, parameters=defaultOptimizationParameters, evaluationClass=None, evaluationArgs={}, combinationsThatTimedOut=None):
    """Grid-search every combination of the given parameter values.

    For each combination, trains on the training sets (holding out the set
    currently used for classification) and evaluates each fold, then keeps
    the combination whose pooled evaluation compares best.

    trainSets -- list of example sets; trainSets[0] may instead be a
        filename string, in which case it is used directly.
    classifySets -- list of example sets, one evaluation fold each.
    parameters -- dict mapping parameter name -> list of candidate values.
        A "predefined" key skips estimation entirely.
    evaluationClass -- scoring class; must provide average() and pool(),
        and instances compare(), toStringConcise() and saveCSV().
    evaluationArgs -- extra keyword arguments for evaluationClass.
        NOTE(review): mutable default argument; harmless here since it is
        only unpacked via **, but callers should not mutate it.
    combinationsThatTimedOut -- optional list of parameter dicts to skip;
        combinations that time out during this run are appended to it.

    Returns {"predefined": ...} when predefined, else a tuple
    (None, pooledEvaluation, bestCombination), or None if no combination
    produced any fold result.
    """
    # Predefined model: nothing to estimate.
    if parameters.has_key("predefined"):
        print >> sys.stderr, "Predefined model, skipping parameter estimation"
        return {"predefined": parameters["predefined"]}
    print >> sys.stderr, "Optimizing parameters"
    # Sort parameter names so combinations are generated in a stable order.
    parameterNames = parameters.keys()
    parameterNames.sort()
    # for p in self.notOptimizedParameters:
    # if p in parameterNames:
    # parameterNames.remove(p)
    # Build, per parameter, the list of (name, value) candidates ...
    parameterValues = []
    for parameterName in parameterNames:
        parameterValues.append([])
        for value in parameters[parameterName]:
            parameterValues[-1].append((parameterName, value))
    # ... then take the cartesian product and turn each tuple-list into a dict.
    combinationLists = combine.combine(*parameterValues)
    combinations = []
    for combinationList in combinationLists:
        combinations.append({})
        for value in combinationList:
            combinations[-1][value[0]] = value[1]
    if combinationsThatTimedOut == None:
        combinationsThatTimedOut = []
    # # re-add non-optimized parameters to combinations
    # for p in self.notOptimizedParameters:
    # if parameters.has_key(p):
    # for combination in combinations:
    # combination[p] = parameters[p]
    bestResult = None
    combinationCount = 1
    # Remember the top-level temp dir / debug file; they are swapped to
    # per-fold locations below and restored before returning.
    if hasattr(self, "tempDir"):
        mainTempDir = self.tempDir
        mainDebugFile = self.debugFile
    for combination in combinations:
        print >> sys.stderr, " Parameters " + str(combinationCount) + "/" + str(len(combinations)) + ":", str(combination),
        # Skip combinations already known to time out.
        skip = False
        #print combinationsThatTimedOut
        for discarded in combinationsThatTimedOut:
            if self._dictIsIdentical(combination, discarded):
                print >> sys.stderr
                print >> sys.stderr, " Discarded before, skipping"
                skip = True
                break
        if skip:
            continue
        # Make copies of examples in case they are modified
        fold = 1
        foldResults = []
        for classifyExamples in classifySets:
            # Training data: either the given filename, or all sets except
            # the current classification fold.
            if type(trainSets[0]) == types.StringType:
                trainExamples = trainSets[0]
            else:
                trainExamples = []
                for trainSet in trainSets:
                    if trainSet != classifyExamples:
                        trainExamples.extend(trainSet)
            trainExamplesCopy = trainExamples
            # NOTE(review): copying is disabled; both branches keep the
            # original reference (see the commented-out copyExamples calls).
            if type(trainExamples) == types.ListType:
                trainExamplesCopy = trainExamples #ExampleUtils.copyExamples(trainExamples)
            classifyExamplesCopy = classifyExamples
            if type(classifyExamples) == types.ListType:
                classifyExamplesCopy = classifyExamples #ExampleUtils.copyExamples(classifyExamples)
            # Give each combination/fold its own temp dir and debug file.
            if hasattr(self, "tempDir"):
                self.tempDir = mainTempDir + "/parameters" + str(combinationCount) + "/optimization" + str(fold)
                if not os.path.exists(self.tempDir):
                    os.makedirs(self.tempDir)
                self.debugFile = open(self.tempDir + "/debug.txt", "wt")
            timer = Timer()
            #trainStartTime = time.time()
            trainRV = self.train(trainExamplesCopy, combination)
            #trainTime = time.time() - trainStartTime
            #print >> sys.stderr, " Time spent:", trainTime, "s"
            print >> sys.stderr, " Time spent:", timer.elapsedTimeToString()
            if trainRV == 0:
                # Training succeeded: classify and score this fold.
                predictions = self.classify(classifyExamplesCopy)
                evaluation = evaluationClass(predictions, **evaluationArgs)
                if len(classifySets) == 1:
                    print >> sys.stderr, evaluation.toStringConcise(" ")
                else:
                    print >> sys.stderr, evaluation.toStringConcise(indent=" ", title="Fold " + str(fold))
                foldResults.append(evaluation)
                if hasattr(self, "tempDir"):
                    evaluation.saveCSV(self.tempDir + "/results.csv")
            else:
                # Non-zero train() return value is treated as a timeout;
                # remember the combination so reruns can skip it.
                combinationsThatTimedOut.append(combination)
                print >> sys.stderr, " Timed out"
            fold += 1
        if len(foldResults) > 0:
            # Aggregate fold scores; the pooled result decides the winner.
            averageResult = evaluationClass.average(foldResults)
            poolResult = evaluationClass.pool(foldResults)
            if hasattr(self, "tempDir"):
                TableUtils.writeCSV(combination, mainTempDir + "/parameters" + str(combinationCount) + ".csv")
                averageResult.saveCSV(mainTempDir + "/parameters" + str(combinationCount) + "/resultsAverage.csv")
                poolResult.saveCSV(mainTempDir + "/parameters" + str(combinationCount) + "/resultsPooled.csv")
            if len(classifySets) > 1:
                print >> sys.stderr, averageResult.toStringConcise(" Avg: ")
                print >> sys.stderr, poolResult.toStringConcise(" Pool: ")
            if bestResult == None or poolResult.compare(bestResult[1]) > 0: #: averageResult.fScore > bestResult[1].fScore:
                #bestResult = (predictions, averageResult, combination)
                bestResult = (None, poolResult, combination)
            # Make sure memory is released, especially important since some of the previous steps
            # copy examples
            bestResult[1].classifications = None
            bestResult[1].predictions = None
        combinationCount += 1
        # NOTE(review): only the last fold's debug file handle is closed
        # here; earlier per-fold handles are overwritten above — confirm.
        if hasattr(self, "tempDir"):
            self.debugFile.close()
    # Restore the top-level temp dir / debug file swapped out above.
    if hasattr(self, "tempDir"):
        self.tempDir = mainTempDir
        self.debugFile = mainDebugFile
    return bestResult
# NOTE(review): fragment — the first two statements appear to finish a loop
# that reads the class-name file (classId/className are bound outside this
# view); confirm against the preceding code.
classNameDict[classId] = className
classNameFile.close()
#classSet = IdSet(idDict=classNameDict, locked=True)
# Remove any previous output file so results are written fresh.
if options.output != None:
    # NOTE(review): the message is printed even when the file does not
    # exist; only the removal itself is guarded by the existence check.
    print >> sys.stderr, "Outputfile exists, removing", options.output
    if os.path.exists(options.output):
        os.remove(options.output)
print >> sys.stderr, "Importing modules"
# Dynamically import the evaluator class named on the command line.
exec "from Evaluators." + options.evaluator + " import " + options.evaluator + " as EvaluatorClass"
fieldnames = ["class", "prediction", "id", "fold", "c"]
# Find best c-parameter from parameter estimation data
print >> sys.stderr, "Finding optimal c-parameters from", options.parameters
rows = TableUtils.readCSV(options.parameters, fieldnames)
folds = sorted(list(TableUtils.getValueSet(rows, "fold")))
cParameterByFold = {}
for fold in folds:
    print >> sys.stderr, " Processing fold", fold
    foldRows = TableUtils.selectRowsCSV(rows, {"fold": fold})
    # Every c value that appears in this fold's rows, in sorted order.
    cParameters = sorted(list(TableUtils.getValueSet(foldRows, "c")))
    evaluators = []
    cParameterByEvaluator = {}
    # Score each candidate c-parameter on this fold's predictions.
    for cParameter in cParameters:
        print >> sys.stderr, " Processing c-parameter", cParameter,
        paramRows = TableUtils.selectRowsCSV(foldRows, {"c": cParameter})
        evaluator = Evaluator.calculateFromCSV(paramRows, EvaluatorClass)
        #print evaluator.toStringConcise()
        # Map each evaluator back to the c value it was computed for.
        cParameterByEvaluator[evaluator] = cParameter
        evaluators.append(evaluator)
# NOTE(review): fragment — pId and the `finished` flag are bound by an
# enclosing polling loop outside this view; confirm against preceding code.
gridPointDir = "grid/gridpoint-" + pId
assert gridCSC.exists(gridPointDir)
# Download this grid point's results if the remote run has produced them.
if gridCSC.exists(gridPointDir + "/results.csv"):
    print >> sys.stderr, "Downloading results"
    gridCSC.download(gridPointDir + "/results.csv", "results" + pId + ".csv")
else:
    # Not done yet: clear the flag so the outer wait loop keeps polling.
    print >> sys.stderr, "Run not yet finished"
    finished = False
    time.sleep(60)
if options.mode in ["ALL", "GRID_EVALUATE"]:
    # Scan all non-empty downloaded CSVs for the best overall F-score.
    bestResult = (-1, None, None)
    for filename in os.listdir(WORKDIR):
        if filename[-4:] == ".csv" and os.path.getsize(filename) != 0:
            gridRows = TableUtils.readCSV(filename)
            fscore = None
            # The summary row: approximate evaluation over all event classes.
            for row in gridRows:
                if row["eval"] == "approximate" and row["event_class"] == "ALL-TOTAL":
                    fscore = row["fscore"]
                    break
            assert fscore != None, row
            # NOTE(review): fscore likely comes out of the CSV as a string,
            # making this comparison non-numeric — confirm TableUtils types.
            if fscore > bestResult[0]:
                bestResult = (fscore, gridRows, filename)
    print bestResult
#if options.mode in ["]
# print >> sys.stderr, "Grid search complete"
# print >> sys.stderr, "Tested", count - options.startFrom, "out of", count, "combinations"
# print >> sys.stderr, "Best parameter combination:", bestResults[0]
pId = getCombinationString(params) #"-boost_"+str(param)[0:3] # param id gridPointDir = "grid/gridpoint-"+pId assert gridCSC.exists(gridPointDir) if gridCSC.exists(gridPointDir + "/results.csv"): print >> sys.stderr, "Downloading results" gridCSC.download(gridPointDir + "/results.csv", "results"+pId+".csv") else: print >> sys.stderr, "Run not yet finished" finished = False time.sleep(60) if options.mode in ["ALL", "GRID_EVALUATE"]: bestResult = (-1, None, None) for filename in os.listdir(WORKDIR): if filename[-4:] == ".csv" and os.path.getsize(filename) != 0: gridRows = TableUtils.readCSV(filename) fscore = None for row in gridRows: if row["eval"] == "approximate" and row["event_class"] == "ALL-TOTAL": fscore = row["fscore"] break assert fscore != None, row if fscore > bestResult[0]: bestResult = (fscore, gridRows, filename) print bestResult #if options.mode in ["] # print >> sys.stderr, "Grid search complete" # print >> sys.stderr, "Tested", count - options.startFrom, "out of", count, "combinations" # print >> sys.stderr, "Best parameter combination:", bestResults[0]