def test(cls, examples, modelPath, output=None, parameters=None, timeout=None):
    """
    Baseline "All-True" classifier: ignores the model and predicts class 2
    for every example.

    @param examples: a list of examples, or the name of an example file
    @param modelPath: unused, kept for interface compatibility
    @param output: predictions filename; defaults to "predictions"
    @return: a list with one single-element prediction list per example
    """
    # NOTE(review): the first parameter is named 'cls' but the list branch
    # uses 'self' (filterClassificationSet, tempDir) -- as written that
    # branch raises NameError. Confirm instance method vs. classmethod.
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with All-True Classifier"
        examples, predictions = self.filterClassificationSet(examples, False)
        testPath = self.tempDir+"/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with All-True Classifier"
        testPath = examples
        examples = Example.readExamples(examples,False)
    print >> sys.stderr, "Note! Classification must be binary"
    #examples, predictions = self.filterClassificationSet(examples, True)
    predictions = []
    for example in examples:
        #predictions.append( (example, example[1]) )
        # Always predict class 2 ("true"), regardless of the example.
        predictions.append( [2] ) #[example[1]] )
    if output == None:
        output = "predictions"
    # Write one predicted class id per line.
    f = open(output, "wt")
    for p in predictions:
        f.write(str(p[0])+"\n")
    f.close()
    return predictions
def train(self, examples, parameters=None):
    """
    Train a libSVM model on the given examples.

    @param examples: a list of examples in (id, class, features, ...) format
    @param parameters: optional dict of svm_parameter keyword arguments;
        a lowercase "c" is accepted as an alias for the SVM "C" parameter.
    Side effects: writes the filtered training set to tempDir/train.dat
    and stores the trained model in self.model.
    """
    self.isBinary = self.isBinaryProblem(examples)
    examples = self.filterTrainingSet(examples)
    ExampleUtils.writeExamples(examples, self.tempDir + "/train.dat")
    # prepare parameters: work on a copy so the caller's dict is not
    # mutated, and tolerate the default parameters=None (previously this
    # raised AttributeError on None.has_key).
    if parameters == None:
        parameters = {}
    else:
        parameters = copy.copy(parameters)
    if parameters.has_key("c"):
        assert (not parameters.has_key("C"))
        # libSVM expects the capitalized "C" keyword
        parameters["C"] = parameters["c"]
        del parameters["c"]
    # Weight each class inversely to its frequency in the training data.
    totalExamples = float(sum(self.classes.values()))
    weight_label = self.classes.keys()
    weight_label.sort()
    weight = []
    for k in weight_label:
        weight.append(1.0 - self.classes[k] / totalExamples)
    libSVMparam = svm.svm_parameter(nr_weight=len(self.classes), weight_label=weight_label, weight=weight, **parameters)
    labels = []
    samples = []
    for example in examples:
        labels.append(example[1])
        samples.append(example[2])
    problem = svm.svm_problem(labels, samples)
    self.model = svm.svm_model(problem, libSVMparam)
def test(cls, examples, modelPath, output=None, parameters=None, timeout=None):
    """
    Baseline "All-Correct" classifier: ignores the model and echoes each
    example's known class as its prediction.

    @param examples: a list of examples, or the name of an example file
    @param modelPath: unused, kept for interface compatibility
    @param output: predictions filename; defaults to "predictions"
    @return: a list with one single-element prediction list per example
    """
    # NOTE(review): the first parameter is named 'cls' but the list branch
    # uses 'self' (filterClassificationSet, tempDir) -- as written that
    # branch raises NameError. Confirm instance method vs. classmethod.
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with All-Correct Classifier"
        examples, predictions = self.filterClassificationSet(examples, False)
        testPath = self.tempDir + "/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with All-Correct Classifier"
        testPath = examples
        examples = Example.readExamples(examples, False)
    #examples, predictions = self.filterClassificationSet(examples, True)
    predictions = []
    for example in examples:
        #predictions.append( (example, example[1]) )
        # "Predict" the gold class stored in the example itself.
        predictions.append([example[1]])
    if output == None:
        output = "predictions"
    # Write one predicted class id per line.
    f = open(output, "wt")
    for p in predictions:
        f.write(str(p[0]) + "\n")
    f.close()
    return predictions
def classify(self, examples, parameters=None):
    """
    Classify examples by running the external classifier binary.

    @param examples: the name of a test file in SVM-light format, or a
        list of examples (filtered and written to tempDir/test.dat)
    @param parameters: optional dict of extra command-line parameters
    @return: a list of prediction tuples; for list input
        (example, score, type, raw output line), for file input
        ([None, realClass], score, type)
    """
    if type(examples) == types.StringType:
        # Input is a file: recover the gold classes from the first
        # column of each SVM-light formatted line.
        testFilePath = examples
        predictions = []
        realClasses = []
        exampleFile = open(examples,"rt")
        for line in exampleFile.readlines():
            realClasses.append(int(line.split(" ",1)[0].strip()))
        exampleFile.close()
    elif type(examples) == types.ListType:
        examples, predictions = self.filterClassificationSet(examples, True)
        Example.writeExamples(examples, self.tempDir+"/test.dat")
        testFilePath = self.tempDir+"/test.dat"
    # NOTE(review): if 'examples' is neither a string nor a list,
    # testFilePath/predictions are unbound and the code below fails.
    args = [self.classifyBin]
    if parameters != None:
        self.__addParametersToSubprocessCall(args, parameters)
    args += [testFilePath, self.tempDir+"/model", self.tempDir+"/predictions"]
    #print args
    subprocess.call(args, stdout = self.debugFile)
    # The model file is deleted after a single classification run.
    os.remove(self.tempDir+"/model")
    predictionsFile = open(self.tempDir+"/predictions", "rt")
    lines = predictionsFile.readlines()
    predictionsFile.close()
    #predictions = []
    for i in range(len(lines)):
        if type(examples) == types.ListType:
            predictions.append( (examples[i],float(lines[i]),self.type,lines[i]) )
        else:
            predictions.append( ([None,realClasses[i]],float(lines[i]),self.type) )
    return predictions
def buildGraphKernelFeatures(self, sentenceGraph, path):
    """
    Build graph-kernel features for the given token path.

    Collects the dependency edges between consecutive path tokens (in
    both directions), forms the adjacency matrix, closes it over all
    paths and converts the result into features. For the debug sentence
    "LLL.d0.s0" the intermediate matrices and the example are also
    dumped to files.
    """
    depGraph = sentenceGraph.dependencyGraph
    edges = []
    # Gather edges between each consecutive token pair, both directions.
    for prev, curr in zip(path[:-1], path[1:]):
        edges.extend(depGraph.getEdges(curr, prev))
        edges.extend(depGraph.getEdges(prev, curr))
    adjacencyMatrix, labels = self._buildAdjacencyMatrix(sentenceGraph, path, edges)
    node_count = 2 * len(sentenceGraph.tokens) + len(sentenceGraph.dependencies)
    isDebugSentence = sentenceGraph.sentenceElement.attrib["id"] == "LLL.d0.s0"
    if isDebugSentence:
        adjacencyMatrixToHtml(adjacencyMatrix, labels, "LLL.d0.s0_adjacency_matrix.html")
    allPathsMatrix = self._prepareMatrix(adjacencyMatrix, node_count)
    self._matrixToFeatures(allPathsMatrix, labels)
    if isDebugSentence:
        adjacencyMatrixToHtml(allPathsMatrix, labels, "LLL.d0.s0_all_paths_matrix.html")
        commentLines = []
        commentLines.extend(self.featureSet.toStrings())
        example = ["example_" + self.entity1.attrib["id"] + "_" + self.entity2.attrib["id"], "unknown", self.features]
        ExampleUtils.writeExamples([example], "LLL.d0.s0_example.txt", commentLines)
def train(cls, examples, parameters, outputFile=None): #, timeout=None):
    """
    Train the SVM-multiclass classifier on a set of examples.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type outputFile: string
    @param outputFile: the name of the model file to be written
    """
    timer = Timer()
    parameters = cls.getParams(parameters)
    # If examples are in a list, they will be written to a file for SVM-multiclass
    if type(examples) == types.ListType:
        print >> sys.stderr, "Training SVM-MultiClass on", len(examples), "examples"
        # NOTE(review): 'self' is not defined in this classmethod -- this
        # branch raises NameError as written; confirm how the temp dir
        # should be obtained.
        trainPath = self.tempDir+"/train.dat"
        examples = self.filterTrainingSet(examples)
        Example.writeExamples(examples, trainPath)
    else:
        print >> sys.stderr, "Training SVM-MultiClass on file", examples
        trainPath = cls.stripComments(examples)
    # NOTE(review): hard-coded user-specific binary path; presumably this
    # should come from Settings like the other classifier binaries.
    args = ["/home/jari/Programs/liblinear-1.5-poly2/train"]
    cls.__addParametersToSubprocessCall(args, parameters)
    if outputFile == None:
        args += [trainPath, "model"]
        logFile = open("svmmulticlass.log","at")
    else:
        args += [trainPath, outputFile]
        logFile = open(outputFile+".log","wt")
    rv = subprocess.call(args, stdout = logFile)
    logFile.close()
    print >> sys.stderr, timer.toString()
    return rv
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False): # , timeout=None):
    """
    Classify examples with a pre-trained model.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type output: string
    @param output: the name of the predictions file to be written
    @type forceInternal: Boolean
    @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
    """
    # Fall back to the pure-python implementation when requested or when
    # no external binary directory is configured.
    if forceInternal or Settings.SVMMultiClassDir == None:
        return cls.testInternal(examples, modelPath, output)
    timer = Timer()
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
        # NOTE(review): 'self' is not defined in this classmethod -- this
        # branch raises NameError as written; confirm.
        examples, predictions = self.filterClassificationSet(examples, False)
        testPath = self.tempDir+"/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
        testPath = cls.stripComments(examples)
        examples = Example.readExamples(examples,False)
    # NOTE(review): hard-coded user-specific binary path, see train().
    args = ["/home/jari/Programs/liblinear-1.5-poly2/predict"]
    if modelPath == None:
        modelPath = "model"
    if parameters != None:
        # Copy before deleting keys so the caller's dict is untouched.
        parameters = copy.copy(parameters)
        if parameters.has_key("c"):
            del parameters["c"]
        if parameters.has_key("predefined"):
            parameters = copy.copy(parameters)
            modelPath = os.path.join(parameters["predefined"][0],"classifier/model")
            del parameters["predefined"]
        self.__addParametersToSubprocessCall(args, parameters)
    if output == None:
        output = "predictions"
        logFile = open("svmmulticlass.log","at")
    else:
        logFile = open(output+".log","wt")
    args += [testPath, modelPath, output]
    #if timeout == None:
    #    timeout = -1
    #print args
    # NOTE(review): logFile is never closed after the subprocess call.
    subprocess.call(args, stdout = logFile, stderr = logFile)
    # Each output line: predicted class id followed by per-class values.
    predictionsFile = open(output, "rt")
    lines = predictionsFile.readlines()
    predictionsFile.close()
    predictions = []
    for i in range(len(lines)):
        predictions.append( [int(lines[i].split()[0])] + lines[i].split()[1:] )
        #predictions.append( (examples[i],int(lines[i].split()[0]),"multiclass",lines[i].split()[1:]) )
    print >> sys.stderr, timer.toString()
    return predictions
def classify(self, examples, parameters=None):
    """
    Predict a class for each example with the trained libSVM model.

    Writes the filtered test set to tempDir/test.dat and returns a list
    of (example, predicted class, task type) tuples, where the task type
    is "binary" or "multiclass" according to self.isBinary.
    """
    examples, predictions = self.filterClassificationSet(examples, self.isBinary)
    ExampleUtils.writeExamples(examples, self.tempDir + "/test.dat")
    # Both branches of the original task split call the same predictor;
    # only the label attached to the tuple differs.
    taskType = "binary" if self.isBinary else "multiclass"
    for example in examples:
        predictedClass = self.model.predict(example[2])
        predictions.append((example, predictedClass, taskType))
    return predictions
def classify(self, examples, parameters=None):
    """
    Classify examples with the trained libSVM model in self.model.

    Returns a list of (example, predicted class, "binary"/"multiclass")
    tuples; also writes the filtered test set to tempDir/test.dat.
    """
    examples, predictions = self.filterClassificationSet(examples, self.isBinary)
    ExampleUtils.writeExamples(examples, self.tempDir+"/test.dat")
    for example in examples:
        if self.isBinary:
            predictions.append( (example, self.model.predict(example[2]), "binary") )
        else:
            predictions.append( (example, self.model.predict(example[2]), "multiclass") )
    return predictions
def buildGraphKernelFeatures(self, sentenceGraph, path):
    """
    Turn the dependency structure along a token path into graph-kernel
    features: collect path edges (both directions), build the adjacency
    matrix, compute the all-paths closure and emit features. Debug dumps
    are written when processing sentence "LLL.d0.s0".
    """
    dependencyGraph = sentenceGraph.dependencyGraph
    pathEdges = []
    for index in range(len(path) - 1):
        a, b = path[index], path[index + 1]
        # edges in both directions between the consecutive tokens
        pathEdges.extend(dependencyGraph.getEdges(b, a))
        pathEdges.extend(dependencyGraph.getEdges(a, b))
    adjacencyMatrix, labels = self._buildAdjacencyMatrix(sentenceGraph, path, pathEdges)
    node_count = 2 * len(sentenceGraph.tokens) + len(sentenceGraph.dependencies)
    if sentenceGraph.sentenceElement.attrib["id"] == "LLL.d0.s0":
        adjacencyMatrixToHtml(adjacencyMatrix, labels, "LLL.d0.s0_adjacency_matrix.html")
    allPathsMatrix = self._prepareMatrix(adjacencyMatrix, node_count)
    self._matrixToFeatures(allPathsMatrix, labels)
    if sentenceGraph.sentenceElement.attrib["id"] == "LLL.d0.s0":
        adjacencyMatrixToHtml(allPathsMatrix, labels, "LLL.d0.s0_all_paths_matrix.html")
        commentLines = []
        commentLines.extend(self.featureSet.toStrings())
        exampleId = "example_" + self.entity1.attrib["id"] + "_" + self.entity2.attrib["id"]
        example = [exampleId, "unknown", self.features]
        ExampleUtils.writeExamples([example], "LLL.d0.s0_example.txt", commentLines)
def train(self, examples, parameters=None, outputDir=None):
    """
    Train the external (SVM-light style) classifier binary.

    @param examples: the name of a training file in SVM-light format, or
        a list of examples (filtered, optionally de-duplicated, and
        written to tempDir/train.dat)
    @param parameters: optional dict of command-line parameters; the
        "style" and "timeout" entries are consumed here, the rest are
        passed to the binary
    @param outputDir: unused here
    @return: the exit status of the (killable) training subprocess
    """
    timeout = -1
    if type(examples) == types.StringType:
        trainFilePath = examples
    elif type(examples) == types.ListType:
        examples = self.filterTrainingSet(examples)
        # Copy so the caller's parameter dict is not mutated below.
        parameters = copy.copy(parameters)
        if parameters.has_key("style"):
            if "no_duplicates" in parameters["style"]:
                examples = Example.removeDuplicates(examples)
            del parameters["style"]
        Example.writeExamples(examples, self.tempDir+"/train.dat")
        trainFilePath = self.tempDir+"/train.dat"
        # NOTE(review): "timeout" is only stripped in this list branch;
        # with file input a "timeout" parameter would be forwarded to the
        # binary -- confirm that is intended.
        if parameters.has_key("timeout"):
            timeout = parameters["timeout"]
            del parameters["timeout"]
    # NOTE(review): trainFilePath is unbound if 'examples' is neither a
    # string nor a list.
    args = [self.trainBin]
    if parameters != None:
        self.__addParametersToSubprocessCall(args, parameters)
    args += [trainFilePath, self.tempDir+"/model"]
    return killableprocess.call(args, stdout = self.debugFile, timeout = timeout)
def train(self, examples, parameters=None):
    """
    Train a libSVM model on the given examples and store it in self.model.

    @param examples: a list of examples in (id, class, features, ...) format
    @param parameters: optional dict of svm_parameter keyword arguments;
        "c" is accepted as an alias for "C".
    Side effect: writes the filtered training set to tempDir/train.dat.
    """
    self.isBinary = self.isBinaryProblem(examples)
    examples = self.filterTrainingSet(examples)
    ExampleUtils.writeExamples(examples, self.tempDir+"/train.dat")
    #prepare parameters:
    # Guard the default parameters=None (previously crashed on has_key)
    # and copy so the caller's dict is not mutated by the key rename.
    if parameters == None:
        parameters = {}
    else:
        parameters = copy.copy(parameters)
    if parameters.has_key("c"):
        assert(not parameters.has_key("C"))
        parameters["C"] = parameters["c"]  # libSVM uses capitalized "C"
        del parameters["c"]
    # Per-class weights: rarer classes get a higher weight.
    totalExamples = float(sum(self.classes.values()))
    weight_label = self.classes.keys()
    weight_label.sort()
    weight = []
    for k in weight_label:
        weight.append(1.0-self.classes[k]/totalExamples)
    libSVMparam = svm.svm_parameter(nr_weight = len(self.classes), weight_label=weight_label, weight=weight, **parameters)
    labels = []
    samples = []
    for example in examples:
        labels.append(example[1])
        samples.append(example[2])
    problem = svm.svm_problem(labels, samples)
    self.model = svm.svm_model(problem, libSVMparam)
def train(self, examples, parameters=None, outputDir=None):
    """
    Train the external (SVM-light style) classifier binary.

    @param examples: a training filename in SVM-light format, or a list
        of examples written to tempDir/train.dat after filtering
    @param parameters: optional dict; "style" and "timeout" entries are
        consumed here, the rest go to the command line
    @param outputDir: unused here
    @return: exit status of the (killable) training subprocess
    """
    timeout = -1
    if type(examples) == types.StringType:
        trainFilePath = examples
    elif type(examples) == types.ListType:
        examples = self.filterTrainingSet(examples)
        # Copy so the caller's parameter dict is not mutated below.
        parameters = copy.copy(parameters)
        if parameters.has_key("style"):
            if "no_duplicates" in parameters["style"]:
                examples = Example.removeDuplicates(examples)
            del parameters["style"]
        Example.writeExamples(examples, self.tempDir + "/train.dat")
        trainFilePath = self.tempDir + "/train.dat"
        # NOTE(review): "timeout" is only stripped in this list branch --
        # confirm file input is never combined with a timeout parameter.
        if parameters.has_key("timeout"):
            timeout = parameters["timeout"]
            del parameters["timeout"]
    # NOTE(review): trainFilePath is unbound for other input types.
    args = [self.trainBin]
    if parameters != None:
        self.__addParametersToSubprocessCall(args, parameters)
    args += [trainFilePath, self.tempDir + "/model"]
    return killableprocess.call(args, stdout=self.debugFile, timeout=timeout)
def classify(self, examples, parameters=None):
    """
    Classify examples by running the external classifier binary.

    @param examples: a test filename in SVM-light format, or a list of
        examples (filtered and written to tempDir/test.dat)
    @param parameters: optional dict of extra command-line parameters
    @return: prediction tuples; for list input
        (example, score, type, raw line), for file input
        ([None, realClass], score, type)
    """
    if type(examples) == types.StringType:
        # File input: read the gold class from the first column of each
        # SVM-light formatted line.
        testFilePath = examples
        predictions = []
        realClasses = []
        exampleFile = open(examples, "rt")
        for line in exampleFile.readlines():
            realClasses.append(int(line.split(" ", 1)[0].strip()))
        exampleFile.close()
    elif type(examples) == types.ListType:
        examples, predictions = self.filterClassificationSet(examples, True)
        Example.writeExamples(examples, self.tempDir + "/test.dat")
        testFilePath = self.tempDir + "/test.dat"
    # NOTE(review): testFilePath/predictions are unbound if 'examples'
    # is neither a string nor a list.
    args = [self.classifyBin]
    if parameters != None:
        self.__addParametersToSubprocessCall(args, parameters)
    args += [
        testFilePath, self.tempDir + "/model", self.tempDir + "/predictions"
    ]
    #print args
    subprocess.call(args, stdout=self.debugFile)
    # The model file is removed after a single classification run.
    os.remove(self.tempDir + "/model")
    predictionsFile = open(self.tempDir + "/predictions", "rt")
    lines = predictionsFile.readlines()
    predictionsFile.close()
    #predictions = []
    for i in range(len(lines)):
        if type(examples) == types.ListType:
            predictions.append(
                (examples[i], float(lines[i]), self.type, lines[i]))
        else:
            predictions.append(
                ([None, realClasses[i]], float(lines[i]), self.type))
    return predictions
def train(cls, examples, parameters, outputFile=None): #, timeout=None):
    """
    Train the SVM-multiclass classifier on a set of examples.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type outputFile: string
    @param outputFile: the name of the model file to be written
    """
    timer = Timer()
    parameters = cls.getParams(parameters)
    # If examples are in a list, they will be written to a file for SVM-multiclass
    if type(examples) == types.ListType:
        print >> sys.stderr, "Training SVM-MultiClass on", len(examples), "examples"
        # NOTE(review): 'self' is undefined in this classmethod -- this
        # branch raises NameError as written; confirm how the temp dir
        # should be obtained.
        trainPath = self.tempDir + "/train.dat"
        examples = self.filterTrainingSet(examples)
        Example.writeExamples(examples, trainPath)
    else:
        print >> sys.stderr, "Training SVM-MultiClass on file", examples
        trainPath = cls.stripComments(examples)
    # NOTE(review): hard-coded user-specific binary path; presumably
    # should be configured via Settings.
    args = ["/home/jari/Programs/liblinear-1.5-poly2/train"]
    cls.__addParametersToSubprocessCall(args, parameters)
    if outputFile == None:
        args += [trainPath, "model"]
        logFile = open("svmmulticlass.log", "at")
    else:
        args += [trainPath, outputFile]
        logFile = open(outputFile + ".log", "wt")
    rv = subprocess.call(args, stdout=logFile)
    logFile.close()
    print >> sys.stderr, timer.toString()
    return rv
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None):
    """
    Classify examples with a pre-trained model.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type output: string
    @param output: the name of the predictions file to be written
    @type forceInternal: Boolean
    @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
    """
    if type(parameters) == types.StringType:
        parameters = splitParameters(parameters)
    timer = Timer()
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
        # NOTE(review): 'self' is undefined in this classmethod -- this
        # branch raises NameError as written; confirm.
        examples, predictions = self.filterClassificationSet(examples, False)
        testPath = self.tempDir+"/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
        testPath = examples
        examples = Example.readExamples(examples,False)
    if parameters != None:
        # Copy before deleting keys so the caller's dict is untouched.
        parameters = copy.copy(parameters)
        if parameters.has_key("c"):
            del parameters["c"]
        if parameters.has_key("predefined"):
            parameters = copy.copy(parameters)
            modelPath = os.path.join(parameters["predefined"][0],"classifier/model")
            del parameters["predefined"]
    # Read model
    # The multilabel model file maps each class name to a per-class
    # model file and an optional decision threshold.
    if modelPath == None:
        modelPath = "model-multilabel"
    classModels = {}
    if modelPath.endswith(".gz"):
        f = gzip.open(modelPath, "rt")
    else:
        f = open(modelPath, "rt")
    thresholds = {}
    for line in f:
        key, value, threshold = line.split()
        classModels[key] = value
        if threshold != "None":
            thresholds[key] = float(threshold)
        else:
            thresholds[key] = 0.0
    f.close()
    mergedPredictions = []
    if type(classIds) == types.StringType:
        classIds = IdSet(filename=classIds)
    #print classModels
    print "Thresholds", thresholds
    classifierBin = Settings.SVMMultiClassDir+"/svm_multiclass_classify"
    print parameters
    # NOTE(review): if parameters is None the membership test below
    # raises TypeError; also 'output' defaults to None and open(None)
    # below would fail -- confirm callers always supply both.
    if "classifier" in parameters and "svmperf" in parameters["classifier"]:
        classifierBin = Settings.SVMPerfDir+"/svm_perf_classify"
        parameters = copy.copy(parameters)
        del parameters["classifier"]
    # Run one binary classification per non-negative, non-composite class
    # and merge the per-class outputs into a single prediction list.
    for className in classIds.getNames():
        if className != "neg" and not "---" in className:
            classId = classIds.getId(className)
            if thresholds[str(className)] != 0.0:
                print >> sys.stderr, "Classifying", className, "with threshold", thresholds[str(className)]
            else:
                print >> sys.stderr, "Classifying", className
            args = [classifierBin]
            #self.__addParametersToSubprocessCall(args, parameters)
            classOutput = "predictions" + ".cls-" + className
            logFile = open("svmmulticlass" + ".cls-" + className + ".log","at")
            args += [testPath, classModels[str(className)], classOutput]
            print args
            subprocess.call(args, stdout = logFile, stderr = logFile)
            cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=thresholds[str(className)])
    print >> sys.stderr, timer.toString()
    # Write merged predictions: drop the negative class "1" when another
    # class was also predicted, then emit comma-joined class ids.
    predFileName = output
    f = open(predFileName, "wt")
    for mergedPred in mergedPredictions:
        if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
            mergedPred[0].remove("1")
        mergedPred[1] = str(mergedPred[1])
        mergedPred[0] = ",".join(sorted(list(mergedPred[0])))
        f.write(" ".join(mergedPred) + "\n")
    f.close()
    return mergedPredictions
def crossValidate(exampleBuilder, corpusElements, examples, options, timer):
    """
    Run n-fold cross-validation over the documents of a corpus.

    For each fold a classifier is trained on the remaining folds, with
    parameters optimized on either a constant held-out set
    (options.paramOptData), the preceding fold (options.folds[1] == 0),
    or a nested x-fold split (options.folds[1] >= 2). Each fold's test
    examples are classified and evaluated; per-fold artifacts are
    written under options.output when given. Finally average and pooled
    evaluations are printed and optionally saved/visualized/exported.

    @param exampleBuilder: supplies featureSet, classSet and styles
    @param corpusElements: parsed corpus; documentsById may be pruned here
    @param examples: all examples built from the corpus
    @param options: parsed command-line options (folds, output, parameters, ...)
    @param timer: Timer used for progress reporting
    """
    parameterOptimizationSet = None
    constantParameterOptimizationSet = None
    if options.paramOptData != None:
        # Carve a fixed fraction of documents out as a constant
        # parameter-optimization set; those documents leave the corpus.
        print >> sys.stderr, "Separating parameter optimization set"
        parameterOptimizationDivision = Example.makeCorpusDivision(corpusElements, float(options.paramOptData))
        exampleSets = Example.divideExamples(examples, parameterOptimizationDivision)
        constantParameterOptimizationSet = exampleSets[0]
        parameterOptimizationSet = constantParameterOptimizationSet
        optDocs = 0
        for k,v in parameterOptimizationDivision.iteritems():
            if v == 0:
                del corpusElements.documentsById[k]
                optDocs += 1
        print >> sys.stderr, " Documents for parameter optimization:", optDocs
    discardedParameterCombinations = []
    print >> sys.stderr, "Dividing data into folds"
    corpusFolds = Example.makeCorpusFolds(corpusElements, options.folds[0])
    exampleSets = Example.divideExamples(examples, corpusFolds)
    keys = exampleSets.keys()
    keys.sort()
    evaluations = []
    for key in keys:
        # Current fold is the test set; all other folds form the train set.
        testSet = exampleSets[key]
        for example in testSet:
            example[3]["visualizationSet"] = key + 1
        trainSet = []
        for key2 in keys:
            if key != key2:
                trainSet.extend(exampleSets[key2])
        print >> sys.stderr, "Fold", str(key + 1)
        # Create classifier object
        if options.output != None:
            if not os.path.exists(options.output+"/fold"+str(key+1)):
                os.mkdir(options.output+"/fold"+str(key+1))
            # if not os.path.exists(options.output+"/fold"+str(key+1)+"/classifier"):
            #     os.mkdir(options.output+"/fold"+str(key+1)+"/classifier")
            classifier = Classifier(workDir = options.output + "/fold"+str(key + 1))
        else:
            classifier = Classifier()
        classifier.featureSet = exampleBuilder.featureSet
        # Optimize ####################
        # Check whether there is need for included param opt set
        if parameterOptimizationSet == None and options.folds[1] == 0: # 8-1-1 folds
            # Use the preceding fold (wrapping around) as the
            # optimization set and exclude it from training.
            assert(len(keys) > 1)
            if keys.index(key) == 0:
                parameterOptimizationSetKey = keys[-1]
            else:
                parameterOptimizationSetKey = keys[keys.index(key)-1]
            parameterOptimizationSet = exampleSets[parameterOptimizationSetKey]
            trainSet = []
            for key2 in keys:
                if key2 != key and key2 != parameterOptimizationSetKey:
                    trainSet.extend(exampleSets[key2])
        if parameterOptimizationSet != None: # constant external parameter optimization set
            evaluationArgs = {"classSet":exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize([trainSet], [parameterOptimizationSet], paramDict, Evaluation, evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize([trainSet], [parameterOptimizationSet], evaluationClass=Evaluation, evaluationArgs=evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
        else: # nested x-fold parameter optimization
            assert (options.folds[1] >= 2)
            optimizationFolds = Example.makeExampleFolds(trainSet, options.folds[1])
            optimizationSets = Example.divideExamples(trainSet, optimizationFolds)
            optimizationSetList = []
            optSetKeys = optimizationSets.keys()
            optSetKeys.sort()
            for optSetKey in optSetKeys:
                optimizationSetList.append(optimizationSets[optSetKey])
            evaluationArgs = {"classSet":exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize(optimizationSetList, optimizationSetList, paramDict, Evaluation, evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize(optimizationSetList, optimizationSetList, evaluationClass=Evaluation, evaluationArgs=evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
        # Classify
        print >> sys.stderr, "Classifying test data"
        bestParams = bestResults[2]
        if bestParams.has_key("timeout"):
            del bestParams["timeout"]
        print >> sys.stderr, "Parameters:", bestParams
        print >> sys.stderr, "Training",
        startTime = time.time()
        classifier.train(trainSet, bestParams)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        print >> sys.stderr, "Testing",
        startTime = time.time()
        predictions = classifier.classify(testSet)
        if options.output != None:
            # Append this fold's predictions to a cumulative CSV.
            pdict = []
            fieldnames = ["class","prediction","id","fold"]
            for p in predictions:
                if "typed" in exampleBuilder.styles:
                    pdict.append( {"class":exampleBuilder.classSet.getName(p[0][1]), "prediction":exampleBuilder.classSet.getName(p[1]), "id":p[0][0], "fold":key} )
                else:
                    pdict.append( {"class":p[0][1], "prediction":p[1], "id":p[0][0], "fold":key} )
            TableUtils.addToCSV(pdict, options.output +"/predictions.csv", fieldnames)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        # Calculate statistics
        evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
        print >> sys.stderr, evaluation.toStringConcise()
        print >> sys.stderr, timer.toString()
        evaluations.append(evaluation)
        # Save example sets
        if options.output != None:
            print >> sys.stderr, "Saving example sets to", options.output
            Example.writeExamples(exampleSets[0], options.output +"/fold"+str(key+1) + "/examplesTest.txt")
            Example.writeExamples(exampleSets[1], options.output +"/fold"+str(key+1) + "/examplesTrain.txt")
            if parameterOptimizationSet == None:
                for k,v in optimizationSets.iteritems():
                    Example.writeExamples(v, options.output +"/fold"+str(key+1) + "/examplesOptimizationSet" + str(k) + ".txt")
            else:
                Example.writeExamples(parameterOptimizationSet, options.output +"/fold"+str(key+1) + "/examplesOptimizationSetPredefined.txt")
            TableUtils.writeCSV(bestResults[2], options.output +"/fold"+str(key+1) + "/parameters.csv")
            evaluation.saveCSV(options.output +"/fold"+str(key+1) + "/results.csv")
            print >> sys.stderr, "Compressing folder"
            zipTree(options.output, "fold"+str(key+1))
        # Restore the constant optimization set (None when using the
        # preceding-fold scheme, so it is re-picked next iteration).
        parameterOptimizationSet = constantParameterOptimizationSet
    print >> sys.stderr, "Cross-validation Results"
    for i in range(len(evaluations)):
        print >> sys.stderr, evaluations[i].toStringConcise(" Fold "+str(i)+": ")
    averageResult = Evaluation.average(evaluations)
    print >> sys.stderr, averageResult.toStringConcise(" Avg: ")
    pooledResult = Evaluation.pool(evaluations)
    print >> sys.stderr, pooledResult.toStringConcise(" Pool: ")
    if options.output != None:
        for i in range(len(evaluations)):
            evaluations[i].saveCSV(options.output+"/results.csv", i)
        averageResult.saveCSV(options.output+"/results.csv", "Avg")
        pooledResult.saveCSV(options.output+"/results.csv", "Pool")
        averageResult.saveCSV(options.output+"/resultsAverage.csv")
        pooledResult.saveCSV(options.output+"/resultsPooled.csv")
    # Visualize
    if options.visualization != None:
        # NOTE(review): 'sentences' is not defined in this function --
        # this branch raises NameError as written; confirm where the
        # sentence list should come from.
        visualize(sentences, pooledResult.classifications, options, exampleBuilder)
    # Save interactionXML
    if options.resultsToXML != None:
        classSet = None
        if "typed" in exampleBuilder.styles:
            classSet = exampleBuilder.classSet
        Example.writeToInteractionXML(pooledResult.classifications, corpusElements, options.resultsToXML, classSet)
return examples if __name__=="__main__": # Import Psyco if available try: import psyco psyco.full() print >> sys.stderr, "Found Psyco, using" except ImportError: print >> sys.stderr, "Psyco not installed" from optparse import OptionParser import os optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.") optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE") optparser.add_option("-o", "--output", default=None, dest="output", help="Output directory, useful for debugging") (options, args) = optparser.parse_args() print >> sys.stderr, "Reading input from " + options.input examples = readARFF(options.input) if options.output == None: if options.input.rsplit(".",1)[-1] == "arff": options.output = options.input.rsplit(".",1)[0] + ".examples" else: options.output = options.input + ".examples" print >> sys.stderr, "Writing output to " + options.output ExampleUtils.writeExamples(examples, options.output)
# Script fragment: remap trigger examples from the "variant" experiment's
# feature/class id space into the "invariant" experiment's id space, then
# write the realigned examples back out.
# NOTE(review): relies on a command-line 'options' object (with .variant
# and .invariant directories) defined elsewhere in this file.
variantExamples = ExampleUtils.readExamples(
    os.path.join(options.variant, "test-triggers.examples"))
invariantFeatureSet = IdSet()
invariantFeatureSet.load(
    os.path.join(options.invariant, "feature_names.txt"))
invariantClassSet = IdSet()
invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))
variantFeatureSet = IdSet()
variantFeatureSet.load(
    os.path.join(options.variant, "test-triggers.examples.feature_names"))
variantClassSet = IdSet()
variantClassSet.load(
    os.path.join(options.variant, "test-triggers.examples.class_names"))
counter = ProgressCounter(len(variantExamples))
for example in variantExamples:
    counter.update()
    # Map the class id via its name: variant id -> name -> invariant id.
    example[1] = invariantClassSet.getId(
        variantClassSet.getName(example[1]))
    # Rebuild the feature dict with invariant feature ids.
    newFeatures = {}
    for k, v in example[2].iteritems():
        newFeatures[invariantFeatureSet.getId(
            variantFeatureSet.getName(k))] = v
    example[2] = newFeatures
ExampleUtils.writeExamples(
    variantExamples, os.path.join(options.variant, "realignedExamples.txt"))
else: print >> sys.stderr, "Using predefined model" bestResults = [None,None,{}] for k,v in classifierParamDict.iteritems(): bestResults[2][k] = v featureSet = IdSet() featureSet.load(os.path.join(classifierParamDict["predefined"][0], "feature_names.txt")) classSet = None if os.path.exists(os.path.join(classifierParamDict["predefined"][0], "class_names.txt")): classSet = IdSet() classSet.load(os.path.join(classifierParamDict["predefined"][0], "class_names.txt")) exampleBuilder = ExampleBuilder(featureSet=featureSet, classSet=classSet, **splitParameters(options.exampleBuilderParameters)) # Save training sets if options.output != None: print >> sys.stderr, "Saving example sets to", options.output Example.writeExamples(exampleSets[0], options.output + "/examplesTrain.txt") if not classifierParamDict.has_key("predefined"): Example.writeExamples(optimizationSets[0], options.output + "/examplesOptimizationTest.txt") Example.writeExamples(optimizationSets[1], options.output + "/examplesOptimizationTrain.txt") TableUtils.writeCSV(bestResults[2], options.output +"/best_parameters.csv") # Optimize and train if options.output != None: classifier = Classifier(workDir = options.output + "/classifier") else: classifier = Classifier() classifier.featureSet = exampleBuilder.featureSet if hasattr(exampleBuilder,"classSet"): classifier.classSet = exampleBuilder.classSet print >> sys.stderr, "Classifying test data" if bestResults[2].has_key("timeout"):
def crossValidate(exampleBuilder, corpusElements, examples, options, timer):
    """
    Run n-fold cross-validation over the documents of a corpus.

    For each fold one example set is held out for testing, a parameter
    optimization set is chosen (a predefined fraction of the corpus, the
    neighbouring fold, or nested folds, depending on options.paramOptData
    and options.folds) and the remaining folds are used for training.
    Per-fold results are printed to stderr and optionally written under
    options.output; finally averaged and pooled results over all folds
    are reported.

    exampleBuilder -- supplies featureSet/classSet (and styles) used for
        classification and result reporting
    corpusElements -- the corpus; documents moved to a predefined
        optimization set are deleted from corpusElements.documentsById
    examples -- examples built from the corpus
    options -- parsed command line options (folds, parameters, output,
        paramOptData, visualization, resultsToXML, ...)
    timer -- timer object used for progress reporting
    """
    parameterOptimizationSet = None
    constantParameterOptimizationSet = None
    if options.paramOptData != None:
        # A fixed fraction of the corpus is split off once and reused as the
        # parameter optimization set for every fold.
        print >> sys.stderr, "Separating parameter optimization set"
        parameterOptimizationDivision = Example.makeCorpusDivision(
            corpusElements, float(options.paramOptData))
        exampleSets = Example.divideExamples(examples,
                                             parameterOptimizationDivision)
        constantParameterOptimizationSet = exampleSets[0]
        parameterOptimizationSet = constantParameterOptimizationSet
        optDocs = 0
        # Remove the optimization documents from the corpus so they are not
        # reused in the cross-validation folds.
        for k, v in parameterOptimizationDivision.iteritems():
            if v == 0:
                del corpusElements.documentsById[k]
                optDocs += 1
        print >> sys.stderr, " Documents for parameter optimization:", optDocs
    discardedParameterCombinations = []
    print >> sys.stderr, "Dividing data into folds"
    corpusFolds = Example.makeCorpusFolds(corpusElements, options.folds[0])
    exampleSets = Example.divideExamples(examples, corpusFolds)
    keys = exampleSets.keys()
    keys.sort()
    evaluations = []
    for key in keys:
        # Current fold is the test set; tag its examples for visualization.
        testSet = exampleSets[key]
        for example in testSet:
            example[3]["visualizationSet"] = key + 1
        trainSet = []
        for key2 in keys:
            if key != key2:
                trainSet.extend(exampleSets[key2])
        print >> sys.stderr, "Fold", str(key + 1)
        # Create classifier object
        if options.output != None:
            if not os.path.exists(options.output + "/fold" + str(key + 1)):
                os.mkdir(options.output + "/fold" + str(key + 1))
            # if not os.path.exists(options.output+"/fold"+str(key+1)+"/classifier"):
            # os.mkdir(options.output+"/fold"+str(key+1)+"/classifier")
            classifier = Classifier(workDir=options.output + "/fold" +
                                    str(key + 1))
        else:
            classifier = Classifier()
        classifier.featureSet = exampleBuilder.featureSet
        # Optimize ####################
        # Check whether there is need for included param opt set
        if parameterOptimizationSet == None and options.folds[
                1] == 0:  # 8-1-1 folds
            # Use the preceding fold (wrapping around) as the parameter
            # optimization set and rebuild the train set without it.
            assert (len(keys) > 1)
            if keys.index(key) == 0:
                parameterOptimizationSetKey = keys[-1]
            else:
                parameterOptimizationSetKey = keys[keys.index(key) - 1]
            parameterOptimizationSet = exampleSets[parameterOptimizationSetKey]
            trainSet = []
            for key2 in keys:
                if key2 != key and key2 != parameterOptimizationSetKey:
                    trainSet.extend(exampleSets[key2])
        if parameterOptimizationSet != None:  # constant external parameter optimization set
            evaluationArgs = {"classSet": exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize(
                    [trainSet], [parameterOptimizationSet], paramDict,
                    Evaluation, evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize(
                    [trainSet], [parameterOptimizationSet],
                    evaluationClass=Evaluation,
                    evaluationArgs=evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)
        else:  # nested x-fold parameter optimization
            assert (options.folds[1] >= 2)
            optimizationFolds = Example.makeExampleFolds(
                trainSet, options.folds[1])
            optimizationSets = Example.divideExamples(trainSet,
                                                      optimizationFolds)
            optimizationSetList = []
            optSetKeys = optimizationSets.keys()
            optSetKeys.sort()
            for optSetKey in optSetKeys:
                optimizationSetList.append(optimizationSets[optSetKey])
            evaluationArgs = {"classSet": exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize(
                    optimizationSetList, optimizationSetList, paramDict,
                    Evaluation, evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize(
                    optimizationSetList, optimizationSetList,
                    evaluationClass=Evaluation,
                    evaluationArgs=evaluationArgs,
                    combinationsThatTimedOut=discardedParameterCombinations)
        # Classify
        print >> sys.stderr, "Classifying test data"
        bestParams = bestResults[2]
        # "timeout" is an optimization-time option, not a classifier parameter
        if bestParams.has_key("timeout"):
            del bestParams["timeout"]
        print >> sys.stderr, "Parameters:", bestParams
        print >> sys.stderr, "Training",
        startTime = time.time()
        classifier.train(trainSet, bestParams)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        print >> sys.stderr, "Testing",
        startTime = time.time()
        predictions = classifier.classify(testSet)
        if options.output != None:
            # Append this fold's predictions to a cumulative CSV file.
            pdict = []
            fieldnames = ["class", "prediction", "id", "fold"]
            for p in predictions:
                if "typed" in exampleBuilder.styles:
                    # Typed task: translate class ids back to names
                    pdict.append({
                        "class": exampleBuilder.classSet.getName(p[0][1]),
                        "prediction": exampleBuilder.classSet.getName(p[1]),
                        "id": p[0][0],
                        "fold": key
                    })
                else:
                    pdict.append({
                        "class": p[0][1],
                        "prediction": p[1],
                        "id": p[0][0],
                        "fold": key
                    })
            TableUtils.addToCSV(pdict, options.output + "/predictions.csv",
                                fieldnames)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        # Calculate statistics
        evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
        print >> sys.stderr, evaluation.toStringConcise()
        print >> sys.stderr, timer.toString()
        evaluations.append(evaluation)
        # Save example sets
        if options.output != None:
            print >> sys.stderr, "Saving example sets to", options.output
            # NOTE(review): writes exampleSets[0] / exampleSets[1] (fixed
            # fold indices), not this fold's testSet/trainSet -- confirm
            # this is intended.
            Example.writeExamples(
                exampleSets[0],
                options.output + "/fold" + str(key + 1) + "/examplesTest.txt")
            Example.writeExamples(
                exampleSets[1],
                options.output + "/fold" + str(key + 1) + "/examplesTrain.txt")
            if parameterOptimizationSet == None:
                for k, v in optimizationSets.iteritems():
                    Example.writeExamples(
                        v, options.output + "/fold" + str(key + 1) +
                        "/examplesOptimizationSet" + str(k) + ".txt")
            else:
                Example.writeExamples(
                    parameterOptimizationSet,
                    options.output + "/fold" + str(key + 1) +
                    "/examplesOptimizationSetPredefined.txt")
            TableUtils.writeCSV(
                bestResults[2],
                options.output + "/fold" + str(key + 1) + "/parameters.csv")
            evaluation.saveCSV(options.output + "/fold" + str(key + 1) +
                               "/results.csv")
            print >> sys.stderr, "Compressing folder"
            zipTree(options.output, "fold" + str(key + 1))
        # Restore the constant optimization set (or None) for the next fold.
        parameterOptimizationSet = constantParameterOptimizationSet
    print >> sys.stderr, "Cross-validation Results"
    for i in range(len(evaluations)):
        print >> sys.stderr, evaluations[i].toStringConcise(" Fold " +
                                                            str(i) + ": ")
    averageResult = Evaluation.average(evaluations)
    print >> sys.stderr, averageResult.toStringConcise(" Avg: ")
    pooledResult = Evaluation.pool(evaluations)
    print >> sys.stderr, pooledResult.toStringConcise(" Pool: ")
    if options.output != None:
        for i in range(len(evaluations)):
            evaluations[i].saveCSV(options.output + "/results.csv", i)
        averageResult.saveCSV(options.output + "/results.csv", "Avg")
        pooledResult.saveCSV(options.output + "/results.csv", "Pool")
        averageResult.saveCSV(options.output + "/resultsAverage.csv")
        pooledResult.saveCSV(options.output + "/resultsPooled.csv")
    # Visualize
    if options.visualization != None:
        # NOTE(review): 'sentences' is not defined in this function --
        # presumably a module-level global; verify before relying on this path.
        visualize(sentences, pooledResult.classifications, options,
                  exampleBuilder)
    # Save interactionXML
    if options.resultsToXML != None:
        classSet = None
        if "typed" in exampleBuilder.styles:
            classSet = exampleBuilder.classSet
        Example.writeToInteractionXML(pooledResult.classifications,
                                      corpusElements, options.resultsToXML,
                                      classSet)
# Script entry: realign a "variant" example set onto the feature/class id
# spaces of an "invariant" model directory.
defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE")
optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE")
(options, args) = optparser.parse_args()

#invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples"))
# Id sets of the model the examples must be mapped onto
invariantFeatureSet = IdSet()
invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
invariantClassSet = IdSet()
invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))
# Id sets the variant examples were originally built with
variantFeatureSet = IdSet()
variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
variantClassSet = IdSet()
variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))
counter = ProgressCounter(len(variantExamples))
for example in variantExamples:
    counter.update()
    # example[1] is the class id: map variant id -> name -> invariant id
    example[1] = invariantClassSet.getId(variantClassSet.getName(example[1]))
    # example[2] is the feature dict: rebuild it keyed by invariant ids
    newFeatures = {}
    for k,v in example[2].iteritems():
        newFeatures[ invariantFeatureSet.getId(variantFeatureSet.getName(k)) ] = v
    example[2] = newFeatures
ExampleUtils.writeExamples(variantExamples, os.path.join(options.variant, "realignedExamples.txt"))
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None): """ Classify examples with a pre-trained model. @type examples: string (filename) or list (or iterator) of examples @param examples: a list or file containing examples in SVM-format @type modelPath: string @param modelPath: filename of the pre-trained model file @type parameters: a dictionary or string @param parameters: parameters for the classifier @type output: string @param output: the name of the predictions file to be written @type forceInternal: Boolean @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py """ if type(parameters) == types.StringType: parameters = splitParameters(parameters) timer = Timer() if type(examples) == types.ListType: print >> sys.stderr, "Classifying", len( examples), "with SVM-MultiClass model", modelPath examples, predictions = self.filterClassificationSet( examples, False) testPath = self.tempDir + "/test.dat" Example.writeExamples(examples, testPath) else: print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath testPath = examples examples = Example.readExamples(examples, False) if parameters != None: parameters = copy.copy(parameters) if parameters.has_key("c"): del parameters["c"] if parameters.has_key("predefined"): parameters = copy.copy(parameters) modelPath = os.path.join(parameters["predefined"][0], "classifier/model") del parameters["predefined"] # Read model if modelPath == None: modelPath = "model-multilabel" classModels = {} if modelPath.endswith(".gz"): f = gzip.open(modelPath, "rt") else: f = open(modelPath, "rt") thresholds = {} for line in f: key, value, threshold = line.split() classModels[key] = value if threshold != "None": thresholds[key] = float(threshold) else: thresholds[key] = 0.0 f.close() mergedPredictions = [] if type(classIds) == types.StringType: classIds = IdSet(filename=classIds) #print classModels 
print "Thresholds", thresholds classifierBin = Settings.SVMMultiClassDir + "/svm_multiclass_classify" print parameters if "classifier" in parameters and "svmperf" in parameters["classifier"]: classifierBin = Settings.SVMPerfDir + "/svm_perf_classify" parameters = copy.copy(parameters) del parameters["classifier"] for className in classIds.getNames(): if className != "neg" and not "---" in className: classId = classIds.getId(className) if thresholds[str(className)] != 0.0: print >> sys.stderr, "Classifying", className, "with threshold", thresholds[ str(className)] else: print >> sys.stderr, "Classifying", className args = [classifierBin] #self.__addParametersToSubprocessCall(args, parameters) classOutput = "predictions" + ".cls-" + className logFile = open("svmmulticlass" + ".cls-" + className + ".log", "at") args += [testPath, classModels[str(className)], classOutput] print args subprocess.call(args, stdout=logFile, stderr=logFile) cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=thresholds[str(className)]) print >> sys.stderr, timer.toString() predFileName = output f = open(predFileName, "wt") for mergedPred in mergedPredictions: if len(mergedPred[0]) > 1 and "1" in mergedPred[0]: mergedPred[0].remove("1") mergedPred[1] = str(mergedPred[1]) mergedPred[0] = ",".join(sorted(list(mergedPred[0]))) f.write(" ".join(mergedPred) + "\n") f.close() return mergedPredictions
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None):  # , timeout=None):
    """
    Classify examples with a pre-trained model.

    Runs the external svm_classify binary of SVM-Light on a test file and
    parses its predictions file back into a list.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type output: string
    @param output: the name of the predictions file to be written
    @type forceInternal: Boolean
    @param forceInternal: Use python classifier even if SVM Multiclass binary
    is defined in Settings.py
    """
    #if forceInternal or Settings.SVMMultiClassDir == None:
    #    return cls.testInternal(examples, modelPath, output)
    timer = Timer()
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(
            examples), "with SVM-Light model", modelPath
        # NOTE(review): 'self' is undefined here since the first parameter
        # is 'cls' -- this branch presumably relies on instance binding;
        # confirm against the callers.
        examples, predictions = self.filterClassificationSet(
            examples, False)
        testPath = self.tempDir + "/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-Light model", modelPath
        testPath = examples
        #examples = Example.readExamples(examples,False)
    # Choose the classifier binary: the METAWRK environment variable marks
    # a cluster environment with its own binary directory.
    if os.environ.has_key("METAWRK"):
        args = [SVMMultiClassClassifier.louhiBinDir + "/svm_classify"]
    else:
        args = [self.binDir + "/svm_classify"]
    if modelPath == None:
        modelPath = "model"
    if parameters != None:
        parameters = copy.copy(parameters)
        # "c" and "predefined" are training-time options; strip them before
        # building the classification command line.
        if parameters.has_key("c"):
            del parameters["c"]
        if parameters.has_key("predefined"):
            parameters = copy.copy(parameters)
            modelPath = os.path.join(parameters["predefined"][0],
                                     "classifier/model")
            del parameters["predefined"]
    self.__addParametersToSubprocessCall(args, parameters)
    # Default output name; append to a shared log when none was given,
    # otherwise write a per-output log.
    if output == None:
        output = "predictions"
        logFile = open("svmlight.log", "at")
    else:
        logFile = open(output + ".log", "wt")
    args += [testPath, modelPath, output]
    #if timeout == None:
    #    timeout = -1
    #print args
    subprocess.call(args, stdout=logFile, stderr=logFile)
    # Parse the predictions file: first token per line is the predicted
    # class (as int), the rest are kept as strings.
    predictionsFile = open(output, "rt")
    lines = predictionsFile.readlines()
    predictionsFile.close()
    predictions = []
    for i in range(len(lines)):
        predictions.append([int(lines[i].split()[0])] +
                           lines[i].split()[1:])
        #predictions.append( (examples[i],int(lines[i].split()[0]),"multiclass",lines[i].split()[1:]) )
    print >> sys.stderr, timer.toString()
    return predictions
# Build the ExampleBuilder from a predefined classifier directory: the class
# set is optional (only loaded when class_names.txt exists).
classSet = None
if os.path.exists(
        os.path.join(classifierParamDict["predefined"][0],
                     "class_names.txt")):
    classSet = IdSet()
    classSet.load(
        os.path.join(classifierParamDict["predefined"][0],
                     "class_names.txt"))
exampleBuilder = ExampleBuilder(featureSet=featureSet,
                                classSet=classSet,
                                **splitParameters(
                                    options.exampleBuilderParameters))

# Save training sets
if options.output != None:
    print >> sys.stderr, "Saving example sets to", options.output
    Example.writeExamples(exampleSets[0],
                          options.output + "/examplesTrain.txt")
    # Optimization splits only exist when no predefined model is used
    if not classifierParamDict.has_key("predefined"):
        Example.writeExamples(
            optimizationSets[0],
            options.output + "/examplesOptimizationTest.txt")
        Example.writeExamples(
            optimizationSets[1],
            options.output + "/examplesOptimizationTrain.txt")
    TableUtils.writeCSV(bestResults[2],
                        options.output + "/best_parameters.csv")

# Optimize and train
if options.output != None:
    classifier = Classifier(workDir=options.output + "/classifier")
else:
    classifier = Classifier()