def test(cls, examples, modelPath, output=None, parameters=None, timeout=None):
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with All-True Classifier"
        examples, predictions = cls.filterClassificationSet(examples, False)
        testPath = cls.tempDir + "/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with All-True Classifier"
        testPath = examples
        examples = Example.readExamples(examples, False)
    print >> sys.stderr, "Note! Classification must be binary"
    #examples, predictions = self.filterClassificationSet(examples, True)
    predictions = []
    for example in examples:
        #predictions.append( (example, example[1]) )
        predictions.append([2]) #[example[1]] ) # always predict the positive class (id 2)
    if output == None:
        output = "predictions"
    f = open(output, "wt")
    for p in predictions:
        f.write(str(p[0]) + "\n")
    f.close()
    return predictions
def __init__(self, examples, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)

    SharedTaskEvaluator.corpusElements = Core.SentenceGraph.loadCorpus(SharedTaskEvaluator.corpusFilename, SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Build interaction xml
    xml = BioTextExampleWriter.write(examples, predictions, SharedTaskEvaluator.corpusElements, None, SharedTaskEvaluator.ids + ".class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Convert to GENIA format
    gifxmlToGenia(xml, SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
    # Use GENIA evaluation tool
    self.results = evaluateSharedTask(SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task, evaluations=["approximate"], verbose=False)
def __init__(self, examples, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    self.classSet = classSet
    # define class ids in alphabetical order
    if classSet != None:
        classNames = sorted(classSet.Ids.keys())
    else:
        classNames = []
    # make an ordered list of class ids
    self.classes = []
    for className in classNames:
        self.classes.append(classSet.getId(className))
    # create data structures for per-class evaluation
    self.dataByClass = {}
    for cls in self.classes:
        self.dataByClass[cls] = EvaluationData()
    # hack for unnamed classes
    if len(self.dataByClass) == 0:
        self.dataByClass[1] = EvaluationData()
        self.dataByClass[2] = EvaluationData()
    #self.untypedUndirected = None
    self.untypedCurrentMajorId = None
    self.untypedPredictionQueue = []
    self.untypedUndirected = EvaluationData()
    #self.AUC = None
    if predictions != None:
        self._calculate(examples, predictions)
def classify(self, examples, output, model=None, finishBeforeReturn=False, replaceRemoteFiles=True):
    output = os.path.abspath(output)
    # Get examples
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with All-Correct Classifier"
    else:
        print >> sys.stderr, "Classifying file", examples, "with All-Correct Classifier"
        examples = self.getExampleFile(examples, upload=False, replaceRemote=False, dummy=False)
        examples = Example.readExamples(examples, False)
    # Return a new classifier instance for following the training process and using the model
    classifier = copy.copy(self)
    # Classify
    f = open(output, "wt")
    for example in examples:
        f.write(str(example[1]) + "\n")
    f.close()
    classifier.predictions = output
    return classifier
def test(cls, examples, modelPath, output=None, parameters=None, timeout=None):
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with All-Correct Classifier"
        examples, predictions = cls.filterClassificationSet(examples, False)
        testPath = cls.tempDir + "/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with All-Correct Classifier"
        testPath = examples
        examples = Example.readExamples(examples, False)
    #examples, predictions = self.filterClassificationSet(examples, True)
    predictions = []
    for example in examples:
        #predictions.append( (example, example[1]) )
        predictions.append([example[1]]) # echo the gold class
    if output == None:
        output = "predictions"
    f = open(output, "wt")
    for p in predictions:
        f.write(str(p[0]) + "\n")
    f.close()
    return predictions
def __init__(self, examples=None, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    self.keep = set(["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"])
    self.classSet = classSet
    self.results = None
    self.internal = None
    if predictions != None:
        # Check only the first example for the extra info needed by the ChemProt evaluation
        for example in examples:
            if example[3] != None:
                print >> sys.stderr, "ChemProt Evaluator:"
                self._calculateExamples(examples, predictions)
            else:
                print >> sys.stderr, "No example extra info, skipping ChemProt evaluation"
            break
        self.internal = AveragingMultiClassEvaluator(examples, predictions, classSet)
        print >> sys.stderr, "AveragingMultiClassEvaluator:"
        print >> sys.stderr, self.internal.toStringConcise()
def loadExamples(self, examples, predictions):
    if type(predictions) == types.StringType:
        print >> sys.stderr, "Reading predictions from", predictions
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType:
        print >> sys.stderr, "Reading examples from", examples
        examples = ExampleUtils.readExamples(examples, False)
    return examples, predictions
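# Hedged usage sketch for loadExamples above: both arguments accept either a
# filename or an already-loaded object, so the two (hypothetical) calls below
# behave identically. "test.examples" and "predictions" are placeholder file
# names, not files that ship with the code.
#
#   examples, predictions = evaluator.loadExamples("test.examples", "predictions")
#   examples, predictions = evaluator.loadExamples(exampleList, predictionList)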
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False): # , timeout=None):
    """
    Classify examples with a pre-trained model.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type output: string
    @param output: the name of the predictions file to be written
    @type forceInternal: Boolean
    @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
    """
    if forceInternal or Settings.SVMMultiClassDir == None:
        return cls.testInternal(examples, modelPath, output)
    timer = Timer()
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
        examples, predictions = cls.filterClassificationSet(examples, False)
        testPath = cls.tempDir + "/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
        testPath = cls.stripComments(examples)
        examples = Example.readExamples(examples, False)
    args = ["/home/jari/Programs/liblinear-1.5-poly2/predict"]
    if modelPath == None:
        modelPath = "model"
    if parameters != None:
        parameters = copy.copy(parameters)
        if parameters.has_key("c"):
            del parameters["c"]
        if parameters.has_key("predefined"):
            parameters = copy.copy(parameters)
            modelPath = os.path.join(parameters["predefined"][0], "classifier/model")
            del parameters["predefined"]
    cls.__addParametersToSubprocessCall(args, parameters)
    if output == None:
        output = "predictions"
        logFile = open("svmmulticlass.log", "at")
    else:
        logFile = open(output + ".log", "wt")
    args += [testPath, modelPath, output]
    #if timeout == None:
    #    timeout = -1
    #print args
    subprocess.call(args, stdout=logFile, stderr=logFile)
    predictionsFile = open(output, "rt")
    lines = predictionsFile.readlines()
    predictionsFile.close()
    predictions = []
    for i in range(len(lines)):
        predictions.append([int(lines[i].split()[0])] + lines[i].split()[1:])
        #predictions.append( (examples[i],int(lines[i].split()[0]),"multiclass",lines[i].split()[1:]) )
    print >> sys.stderr, timer.toString()
    return predictions
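# Format note (derived from the parsing loop above): each row returned by
# test() is [predictedClassId, score1, score2, ...], i.e. the winning class
# id as an int followed by the per-class confidence fields kept verbatim as
# the strings the external binary wrote.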
def addExamples(exampleFile, predictionFile, classFile, matrix):
    classSet = IdSet(filename=classFile)
    f = open(predictionFile, "rt")
    for example in ExampleUtils.readExamples(exampleFile, False):
        pred = int(f.readline().split()[0])
        predClasses = classSet.getName(pred)
        goldClasses = classSet.getName(example[1])
        for predClass in predClasses.split("---"):
            for goldClass in goldClasses.split("---"):
                matrix[predClass][goldClass] # touch the mirror cell so the matrix stays key-symmetric (assumes defaultdict-style matrix)
                matrix[goldClass][predClass] += 1
    f.close()
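# A minimal driver sketch for addExamples. It assumes (as the increments
# above require) a confusion matrix that tolerates missing keys, e.g. a
# nested defaultdict of ints; the file names in the usage comment are
# hypothetical placeholders.
from collections import defaultdict

def buildConfusionMatrix(exampleFile, predictionFile, classFile):
    matrix = defaultdict(lambda: defaultdict(int))
    addExamples(exampleFile, predictionFile, classFile, matrix)
    return matrix

#   matrix = buildConfusionMatrix("test.examples", "predictions", "class_names.txt")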
def write(cls, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, insertWeights=False):
    if type(examples) == types.StringType:
        print >> sys.stderr, "Reading examples from", examples
        examples = ExampleUtils.readExamples(examples, False)
    # This looks a bit strange, but should work with the re-iterable
    # generators that readExamples returns
    xType = None
    for example in examples:
        assert example[3].has_key("xtype")
        xType = example[3]["xtype"]
        break
    if xType == "token":
        w = EntityExampleWriter()
        if insertWeights:
            w.insertWeights = True
    elif xType == "edge":
        w = EdgeExampleWriter()
    elif xType == "task3":
        w = ModifierExampleWriter()
    elif xType == "entRel":
        w = EntityRelationExampleWriter()
    elif xType == "phrase":
        w = PhraseTriggerExampleWriter()
    #IF LOCAL
    elif xType == "um":
        w = UnmergingExampleWriter()
    #elif xType == "ue":
    #    w = UnmergedEdgeExampleWriter()
    #elif xType == "asym":
    #    w = AsymmetricEventExampleWriter()
    #ENDIF
    else:
        assert False, ("Unknown entity type", xType)
    return w.writeXML(examples, predictions, corpus, outputFile, classSet, parse, tokenization, goldCorpus=goldCorpus)
def __init__(self, examples, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    self.classSet = classSet
    self.dataByClass = defaultdict(EvaluationData)
    #self.untypedUndirected = None
    self.untypedCurrentMajorId = None
    self.untypedPredictionQueue = []
    self.untypedUndirected = EvaluationData()
    #self.AUC = None
    if predictions != None:
        self._calculate(examples, predictions)
def __init__(self, examples=None, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    #self.examples = examples
    #self.predictions = predictions
    self.truePositives = 0
    self.falsePositives = 0
    self.trueNegatives = 0
    self.falseNegatives = 0
    self.precision = None
    self.recall = None
    self.fScore = None
    self.AUC = None
    self.type = "binary"
    if predictions != None:
        self._calculate(examples, predictions)
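# A sketch of the arithmetic the four counters above feed into. This is not
# the class's own _calculate (whose body is not shown here); it only spells
# out the standard precision/recall/F definitions the attribute names imply.
def binaryMetrics(tp, fp, tn, fn):
    # tn is not needed for precision/recall/F but is kept for symmetry
    precision = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
    fScore = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, fScore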
def __init__(self, examples, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)

    corpusElements = Core.SentenceGraph.loadCorpus(BXEvaluator.corpusFilename, BXEvaluator.parse, BXEvaluator.tokenization)
    # Build interaction xml
    xml = BioTextExampleWriter.write(examples, predictions, corpusElements, None, BXEvaluator.ids + ".class_names", BXEvaluator.parse, BXEvaluator.tokenization)
    xml = ix.splitMergedElements(xml, None)
    xml = ix.recalculateIds(xml, None, True)
    #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Convert to GENIA format
    STFormat.ConvertXML.toSTFormat(xml, BXEvaluator.geniaDir, outputTag="a2")
    #gifxmlToGenia(xml, BXEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
    # Use GENIA evaluation tool
    self.results = BioNLP11GeniaTools.evaluateBX(BXEvaluator.geniaDir, corpusName=BXEvaluator.corpusTag)
    corpusElements = None
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet):
    outFile = open(outFile, "wt")
    addCount = 0

    f = open(exampleFile)
    numExamples = sum([1 for line in f])
    f.close()
    counter = ProgressCounter(numExamples, "Polynomize examples", step=0)

    weightFeatureIds = {}
    for weightFeature in weightFeatures:
        wId = idSet.getId(weightFeature, False)
        if wId == None:
            sys.exit("Weight vector feature " + weightFeature + " not in id file")
        weightFeatureIds[weightFeature] = wId

    print "Polynomizing", exampleFile
    exampleCache = []
    for example in ExampleUtils.readExamples(exampleFile):
        counter.update(1, "Processing example (" + example[0] + "): ")
        features = example[2]
        for i in range(len(weightFeatures) - 1):
            wI = weightFeatures[i]
            wIid = weightFeatureIds[wI]
            if not features.has_key(wIid):
                continue
            for j in range(i + 1, len(weightFeatures)):
                wJ = weightFeatures[j]
                wJid = weightFeatureIds[wJ]
                if not features.has_key(wJid):
                    continue
                # Make polynomial feature
                features[idSet.getId(wI + "_AND_" + wJ)] = 1
                addCount += 1
        exampleCache.append(example)
        if len(exampleCache) > 50:
            ExampleUtils.appendExamples(exampleCache, outFile)
            exampleCache = []
    ExampleUtils.appendExamples(exampleCache, outFile)
    outFile.close()
    print "Added", addCount, "polynomial features"
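# Hedged usage sketch for polynomizeExamples (all names are placeholders):
# the id set must already contain the weight features, while the conjunction
# features (wI + "_AND_" + wJ) are created in it on demand by getId.
#
#   featureIds = IdSet(filename="feature_names.txt")
#   polynomizeExamples("train.examples", "train-poly.examples",
#                      ["featureA", "featureB", "featureC"], featureIds)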
def threshold(cls, examples, predictions):
    # Make negative confidence score / true class pairs
    if type(examples) in types.StringTypes:
        examples = ExampleUtils.readExamples(examples, False)
    if type(predictions) in types.StringTypes:
        predictions = ExampleUtils.loadPredictions(predictions)
    pairs = []
    realPositives = 0
    for example, prediction in itertools.izip(examples, predictions):
        trueClass = example[1]
        assert trueClass > 0 # multiclass classification uses non-negative integers
        if trueClass > 1:
            realPositives += 1
        negClassValue = prediction[1]
        pairs.append((negClassValue, trueClass))
    pairs.sort(reverse=True)
    realNegatives = len(pairs) - realPositives
    # When starting thresholding, all examples are considered positive
    binaryF = EvaluationData()
    binaryF._tp = realPositives
    binaryF._fp = realNegatives
    binaryF._fn = 0
    binaryF.calculateFScore()
    fscore = binaryF.fscore
    threshold = pairs[0][0] - 1.
    # Turn one example negative at a time
    for pair in pairs:
        if pair[1] == 1: # the real class is negative
            binaryF._fp -= 1 # false positive -> true negative
        else: # the real class is a positive class
            binaryF._tp -= 1 # true positive -> ...
            binaryF._fn += 1 # ... false negative
        binaryF.calculateFScore()
        if binaryF.fscore > fscore:
            fscore = binaryF.fscore
            threshold = pair[0] + 0.00000001
    return threshold, fscore
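# Worked mini-example for threshold() above (invented numbers): for true
# classes [1, 2, 2] and negative-class confidences [0.9, 0.4, -0.3], the
# sweep starts from tp=2, fp=1, fn=0 (F=0.8), first flips the (0.9, 1) pair
# (a false positive becomes a true negative: tp=2, fp=0, fn=0, F=1.0), and
# later flips only hurt, so it returns threshold 0.9 + 1e-8 with fscore 1.0.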
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None):
    """
    Classify examples with a pre-trained model.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type output: string
    @param output: the name of the predictions file to be written
    @type forceInternal: Boolean
    @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
    """
    if type(parameters) == types.StringType:
        parameters = splitParameters(parameters)
    timer = Timer()
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
        examples, predictions = cls.filterClassificationSet(examples, False)
        testPath = cls.tempDir + "/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
        testPath = examples
        examples = Example.readExamples(examples, False)
    if parameters != None:
        parameters = copy.copy(parameters)
        if parameters.has_key("c"):
            del parameters["c"]
        if parameters.has_key("predefined"):
            parameters = copy.copy(parameters)
            modelPath = os.path.join(parameters["predefined"][0], "classifier/model")
            del parameters["predefined"]
    # Read model
    if modelPath == None:
        modelPath = "model-multilabel"
    classModels = {}
    if modelPath.endswith(".gz"):
        f = gzip.open(modelPath, "rt")
    else:
        f = open(modelPath, "rt")
    thresholds = {}
    for line in f:
        key, value, threshold = line.split()
        classModels[key] = value
        if threshold != "None":
            thresholds[key] = float(threshold)
        else:
            thresholds[key] = 0.0
    f.close()
    mergedPredictions = []
    if type(classIds) == types.StringType:
        classIds = IdSet(filename=classIds)
    #print classModels
    print "Thresholds", thresholds
    classifierBin = Settings.SVMMultiClassDir + "/svm_multiclass_classify"
    print parameters
    if "classifier" in parameters and "svmperf" in parameters["classifier"]:
        classifierBin = Settings.SVMPerfDir + "/svm_perf_classify"
        parameters = copy.copy(parameters)
        del parameters["classifier"]
    for className in classIds.getNames():
        if className != "neg" and not "---" in className:
            classId = classIds.getId(className)
            if thresholds[str(className)] != 0.0:
                print >> sys.stderr, "Classifying", className, "with threshold", thresholds[str(className)]
            else:
                print >> sys.stderr, "Classifying", className
            args = [classifierBin]
            #self.__addParametersToSubprocessCall(args, parameters)
            classOutput = "predictions" + ".cls-" + className
            logFile = open("svmmulticlass" + ".cls-" + className + ".log", "at")
            args += [testPath, classModels[str(className)], classOutput]
            print args
            subprocess.call(args, stdout=logFile, stderr=logFile)
            cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=thresholds[str(className)])
    print >> sys.stderr, timer.toString()
    predFileName = output
    f = open(predFileName, "wt")
    for mergedPred in mergedPredictions:
        if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
            mergedPred[0].remove("1")
        mergedPred[1] = str(mergedPred[1])
        mergedPred[0] = ",".join(sorted(list(mergedPred[0])))
        f.write(" ".join(mergedPred) + "\n")
    f.close()
    return mergedPredictions
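# Format note (inferred from the writing loop above, so partly an assumption
# about what addPredictions builds): mergedPred[0] is a set of predicted
# class-name strings in which "1" marks the negative class and is dropped
# whenever a real class is also present; it is serialized as a comma-joined
# sorted list, with the remaining fields space-separated on the same line.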
def compareExamples(examples1, examples2, features1, features2=None):
    exampleIter1 = ExampleUtils.readExamples(examples1)
    exampleIter2 = ExampleUtils.readExamples(examples2)
    features1 = IdSet(filename=features1)
    if features2 != None:
        features2 = IdSet(filename=features2)
    else:
        features2 = features1
    # Compare feature sets
    if set(features1.Ids.keys()) != set(features2.Ids.keys()):
        print "Feature sets differ"
    # Compare examples
    counter = ProgressCounter(step=1)
    for e1, e2 in itertools.izip(exampleIter1, exampleIter2):
        counter.update()
        assert e1[0] == e2[0], (removeFeatures(e1), removeFeatures(e2))
        if e1[1] != e2[1]:
            print "Class differs"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
        f1 = getFeatureNames(e1, features1)
        f2 = getFeatureNames(e2, features2)
        f1Set = set(f1)
        f2Set = set(f2)
        f1Only = f1Set.difference(f2Set)
        f2Only = f2Set.difference(f1Set)
        if len(f1Only) > 0 or len(f2Only) > 0:
            print "Features differ"
            print "  E1", removeFeatures(e1)
            print "  E2", removeFeatures(e2)
            if len(f1Only) > 0:
                print "  E1-only features:", f1Only
            if len(f2Only) > 0:
                print "  E2-only features:", f2Only
        else:
            assert len(f1) == len(f2)
            fCount = 0
            differ = False
            for feature1, feature2 in zip(f1, f2):
                #f1Id = features1.getId(feature1, createIfNotExist=False)
                #if f1Id == 454 or feature1 == "e1_strength_Positive_regulation":
                #    print "!!!!!!!!!!!", 454, feature1, e1[2][f1Id]
                if feature1 != feature2:
                    if not differ:
                        print "Feature order differs for example", e1[0]
                        differ = True
                    print "[" + feature1 + "/" + feature2 + "](" + str(fCount) + ") ",
                else:
                    f1Id = features1.getId(feature1, createIfNotExist=False)
                    f2Id = features2.getId(feature2, createIfNotExist=False)
                    f1Value = e1[2][f1Id]
                    f2Value = e2[2][f2Id]
                    if f1Value != f2Value:
                        if not differ:
                            print "Feature values differ", e1[0]
                            differ = True
                        print "[" + feature1 + "/" + str(f1Id) + "]" + "[" + str(f1Value) + "/" + str(f2Value) + "]" + "(" + str(fCount) + ") ",
                fCount += 1
            if differ:
                print
    counter.endUpdate()
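# Hedged usage sketch for compareExamples (placeholder paths): the second
# feature id file is only needed when the two runs used different feature
# spaces; otherwise the first id set is reused for both sides.
#
#   compareExamples("run1/test.examples", "run2/test.examples",
#                   "run1/feature_names.txt", "run2/feature_names.txt")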
def determineThreshold(self, examples, predictions):
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)

    examplesByClass = {}
    for cls in self.classes:
        examplesByClass[cls] = []
    # prepare examples
    for example, prediction in itertools.izip(examples, predictions):
        # Check true class for multilabel
        trueClass = example[1]
        trueClassName = self.classSet.getName(trueClass)
        assert trueClass > 0 # multiclass classification uses non-negative integers
        if "---" in trueClassName:
            trueClass = set()
            for name in trueClassName.split("---"):
                trueClass.add(self.classSet.getId(name))
        else:
            trueClass = [trueClass]
        # Check prediction for multilabel
        predictedClasses = prediction[0]
        if type(predictedClasses) == types.IntType:
            predictedClasses = [predictedClasses]
        for predType in predictedClasses:
            if predType != 1:
                exTrueClass = 1
                if predType in trueClass:
                    exTrueClass = 2
                examplesByClass[predType].append((prediction[predType], exTrueClass, 2))
        # positives are negatives for other classes
        for cls in self.classes:
            if cls not in predictedClasses:
                exTrueClass = 1
                if cls in trueClass:
                    exTrueClass = 2
                examplesByClass[cls].append((prediction[cls], exTrueClass, 1))
    # do the thresholding
    thresholdByClass = {}
    for cls in self.classes:
        if cls == 1:
            continue
        thresholdByClass[cls] = 0.0
        examplesByClass[cls].sort()
        # Start with all below zero being negative, and all above it being what is predicted
        ev = EvaluationData()
        for example in examplesByClass[cls]:
            #print example
            if example[0] < 0.0:
                updateF(ev, example[1], 2, 1) # always negative
            else:
                updateF(ev, example[1], example[2], 1) # what is predicted
        count = 0
        bestF = [self.dataByClass[cls].fscore, None, (0.0, None), None]
        for example in examplesByClass[cls]:
            if example[0] < 0.0:
                # Remove original example
                updateF(ev, example[1], 2, -1)
                # Add new example
                updateF(ev, example[1], example[2], 1)
                # Calculate F for this point
            else:
                # Remove original example
                updateF(ev, example[1], example[2], -1)
                # Add new example
                updateF(ev, example[1], 1, 1)
                # Calculate F for this point
            ev.calculateFScore()
            #print example, ev.toStringConcise()
            count += 1
            #if self.classSet.getName(cls) == "Binding":
            #    print count, example, ev.toStringConcise()
            if ev.fscore > bestF[0]:
                bestF = (ev.fscore, count, example, ev.toStringConcise())
                self.dataByClass[cls] = copy.copy(ev)
        print >> sys.stderr, "Threshold", self.classSet.getName(cls), bestF
        if bestF[2][0] != 0.0:
            thresholdByClass[cls] = bestF[2][0] + 0.00000001
        else:
            thresholdByClass[cls] = 0.0
    #print thresholdByClass
    self.thresholds = thresholdByClass
    #self._calculate(examples, predictions, thresholdByClass)
    #print >> sys.stderr, "Optimal", self.toStringConcise()
    return thresholdByClass
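# Reading the triples built in determineThreshold above: each entry of
# examplesByClass[cls] is (confidence, exTrueClass, exPredClass);
# exTrueClass is 2 when the example really belongs to cls and 1 otherwise,
# and exPredClass is 2 when cls was among the predicted classes and 1
# otherwise. The thresholding sweep then slides a per-class decision
# boundary through the sorted confidences, using updateF to shift counts,
# and keeps the boundary giving the best per-class F-score.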
optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE") optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE") (options, args) = optparser.parse_args() #invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt")) variantExamples = ExampleUtils.readExamples( os.path.join(options.variant, "test-triggers.examples")) invariantFeatureSet = IdSet() invariantFeatureSet.load( os.path.join(options.invariant, "feature_names.txt")) invariantClassSet = IdSet() invariantClassSet.load(os.path.join(options.invariant, "class_names.txt")) variantFeatureSet = IdSet() variantFeatureSet.load( os.path.join(options.variant, "test-triggers.examples.feature_names")) variantClassSet = IdSet() variantClassSet.load( os.path.join(options.variant, "test-triggers.examples.class_names")) counter = ProgressCounter(len(variantExamples))
def threshold(examples, predictionsDir=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    classIds = set()
    if type(examples) == types.StringType: # examples are in file
        examplesTemp = ExampleUtils.readExamples(examples, False)
        examples = []
        for example in examplesTemp:
            examples.append(example)
            classIds.add(example[1])
    classIds = list(classIds)
    classIds.sort()
    #multilabel = MultiLabelMultiClassEvaluator(None, None, classSet)
    #multilabel._calculate(examples, predictions)
    #print multilabel.toStringConcise(title="multilabel")

    bestThrF = [0]
    bestBaseF = [0]
    predFileNames = []
    for filename in os.listdir(predictionsDir):
        if "predictions" in filename:
            predFileNames.append((int(filename.rsplit("_")[-1]), filename))
    predFileNames.sort()
    for predFileName in predFileNames:
        predictionsTemp = ExampleUtils.loadPredictions(os.path.join(predictionsDir, predFileName[1]))
        predictions = []
        for prediction in predictionsTemp:
            predictions.append(prediction)
        baseEv = AveragingMultiClassEvaluator(None, None, classSet)
        baseEv._calculate(examples, predictions)
        print "============================"
        print predFileName[1]
        print "============================"
        #print baseEv.toStringConcise(title="baseline")
        baseLineF = baseEv.microF.fscore
        for step in [0]:
            for classId in [1]: #classIds:
                cls = None
                if classSet != None:
                    cls = classSet.getName(classId)
                else:
                    cls = str(classId)
                bestF = thresholdClass(examples, predictions, classId, baseLineF)
                for prediction in predictions:
                    prediction[classId] -= bestF[2][0] + 0.00000001
                changed = 0
                for prediction in predictions:
                    maxVal = -999999
                    maxClass = None
                    for i in range(1, len(prediction)):
                        if prediction[i] > maxVal:
                            maxVal = prediction[i]
                            maxClass = i
                    if maxClass != prediction[0]:
                        prediction[0] = maxClass
                        changed += 1
                print step, cls, "changed", changed, bestF[0]
                baseLineF = bestF[0]
        if bestF[0] > bestThrF[0]:
            bestThrF = (bestF[0], predFileName[1], bestF[1], bestF[2], bestF[3])
        if baseEv.microF.fscore > bestBaseF[0]:
            bestBaseF = (baseEv.microF.fscore, predFileName[1], baseEv.microF.toStringConcise())
        print "-------- Baseline ------------"
        print baseEv.toStringConcise()
        print "-------- Best ------------"
        print bestF[0], bestF[1], bestF[2]
        print bestF[3]
        thEv = AveragingMultiClassEvaluator(None, None, classSet)
        thEv._calculate(examples, predictions)
        print thEv.toStringConcise()
    print "=============== All Best ==============="
    print "Threshold", bestThrF
    print "Base", bestBaseF
    sys.exit()

    # NOTE: everything below is unreachable after the sys.exit() call above,
    # and it references a name ("pairs") that is never defined in this scope.
    memPredictions = []
    bestEv = baseEv
    bestPair = [None, None, None]
    for p in predictions:
        memPredictions.append(p)
    for pair in pairs:
        modifier = pair[0] + 0.00000001
        changedClass = 0
        for pred in memPredictions:
            negPred = pred[1] - modifier
            maxVal = negPred
            maxClass = 1
            for i in range(2, len(pred)):
                if pred[i] > maxVal:
                    maxVal = pred[i]
                    maxClass = i
            if pred[0] != maxClass:
                changedClass += 1
                pred[0] = maxClass
        ev = AveragingMultiClassEvaluator(None)
        ev._calculate(examples, memPredictions)
        print pair[0], pair[2], changedClass
        print ev.toStringConcise()
        if ev.compare(bestEv) == 1:
            print "Improved"
            bestPair = pair
            bestEv = ev
    print "---------------------------------------------"
    print baseEv.toStringConcise(title="baseline")
    print bestPair[0], bestPair[2]
    print bestEv.toStringConcise(title="best")
# Import Psyco if available
try:
    import psyco
    psyco.full()
    print >> sys.stderr, "Found Psyco, using"
except ImportError:
    print >> sys.stderr, "Psyco not installed"

defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE")
optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE")
(options, args) = optparser.parse_args()

#invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples"))

invariantFeatureSet = IdSet()
invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
invariantClassSet = IdSet()
invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

variantFeatureSet = IdSet()
variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
variantClassSet = IdSet()
variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))

counter = ProgressCounter(len(variantExamples))
for example in variantExamples:
    counter.update()
    example[1] = invariantClassSet.getId(variantClassSet.getName(example[1]))
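# The remapping loop above translates each example's class id from the
# variant id space to the invariant one via the shared class name, e.g.
# (illustrative numbers only): variantClassSet.getName(3) -> "Binding",
# then invariantClassSet.getId("Binding") -> 7.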
print >> sys.stderr, "Psyco not installed" from optparse import OptionParser import os optparser = OptionParser(description="Analyze SVM example files") optparser.add_option("-s", "--source", default=None, dest="source", help="examples", metavar="FILE") optparser.add_option("-t", "--target", default=None, dest="target", help="examples") optparser.add_option("-f", "--sourceFeatureIds", default=None, dest="sourceFeatureIds", help="examples", metavar="FILE") optparser.add_option("-g", "--targetFeatureIds", default=None, dest="targetFeatureIds", help="examples") (options, args) = optparser.parse_args() print "Loading ids" sFeatIds = IdSet(filename=options.sourceFeatureIds) tFeatIds = IdSet(filename=options.targetFeatureIds) print "Loading examples" sExamples = ExampleUtils.readExamples(options.source) tExamples = ExampleUtils.readExamples(options.target) print "Making name sets" s = getFeatureNames(sExamples, sFeatIds) t = getFeatureNames(tExamples, tFeatIds) print "Source features:", len(s) print "Target features:", len(t) print "Intersection:", len(s & t) onlyS = s - t onlyT = t - s print "Only source:", len(onlyS) print "Only target:", len(onlyT) # state = {} # for n in onlyS: # presence = state.setdefault(n, [0,0]) # presence[0] = 1