def main(phase, make): if (phase == 4): f = FeatureExtractor2.FeatureExtractor(createFile=make) ft = FeatureExtractor3.FeatureExtractor(createFile=make) idlist = f.IDs idlist2 = ft.IDs FeatureTable = orange.ExampleTable("table2") TestTable = orange.ExampleTable("table3") training, test = SplitDataInHalf(FeatureTable, f.size) learner = orngTree.TreeLearner(training) res = orngTest.testOnData([learner], test) if make == True: learner = orngTree.TreeLearner(FeatureTable) res = orngTest.testOnData([learner], TestTable) res2 = orngTest.testOnData([learner], FeatureTable) WriteToFile("dev_tonder_olsen.txt", res2, idlist) WriteToFile("test_tonder_olsen.txt", res, idlist2) printresult() else: f = featureExtractor.FeatureExtractor(createFile=True) FeatureTable = orange.ExampleTable("table") learner, res = CrossValidation(FeatureTable, f.size, 10) guessyes = 0 guessno = 0 correctyes = 0 correctno = 0 for r in res.results: if str(r.classes[0]) == "1": prtres = "Yes" else: prtres = "No" if str(r.actualClass) == "1": prttrue = "Yes" correctyes = correctyes + 1 else: prttrue = "No" correctno = correctno + 1 #print str(r.classes[0]) + " vs correct: " + str(r.actualClass) if prtres == "No" and prttrue == "No": guessno = guessno + 1 elif prtres == "Yes" and prttrue == "Yes": guessyes = guessyes + 1 print "Guessed " + prtres + " and the correct answer was: " + prttrue #res = orngTest.leaveOneOut([learner],FeatureTable) #printresult = orngStat.CA(res, orngStat.IS(res)) #print "Yes Accuracy: " + str(float(guessyes)/float(correctyes)) #print "No Accuracy: " + str(float(guessno)/float(correctno)) printresult = orngStat.CA(res) print "Accuracy: " + str(printresult[0])
def main(phase,make): if(phase == 4): f = FeatureExtractor2.FeatureExtractor(createFile =make) ft = FeatureExtractor3.FeatureExtractor(createFile=make) idlist = f.IDs idlist2 = ft.IDs FeatureTable = orange.ExampleTable("table2") TestTable = orange.ExampleTable("table3") training,test = SplitDataInHalf(FeatureTable,f.size) learner = orngTree.TreeLearner(training) res = orngTest.testOnData([learner],test) if make == True: learner = orngTree.TreeLearner(FeatureTable) res = orngTest.testOnData([learner],TestTable) res2 = orngTest.testOnData([learner],FeatureTable) WriteToFile("dev_tonder_olsen.txt",res2,idlist) WriteToFile("test_tonder_olsen.txt",res,idlist2) printresult() else: f = featureExtractor.FeatureExtractor(createFile =True) FeatureTable = orange.ExampleTable("table") learner,res = CrossValidation(FeatureTable,f.size,10) guessyes = 0 guessno = 0 correctyes = 0 correctno = 0 for r in res.results: if str(r.classes[0]) == "1": prtres = "Yes" else: prtres = "No" if str(r.actualClass) == "1": prttrue = "Yes" correctyes = correctyes +1 else: prttrue = "No" correctno = correctno +1 #print str(r.classes[0]) + " vs correct: " + str(r.actualClass) if prtres == "No" and prttrue == "No": guessno = guessno +1 elif prtres == "Yes" and prttrue == "Yes": guessyes = guessyes +1 print "Guessed " + prtres +" and the correct answer was: " + prttrue #res = orngTest.leaveOneOut([learner],FeatureTable) #printresult = orngStat.CA(res, orngStat.IS(res)) #print "Yes Accuracy: " + str(float(guessyes)/float(correctyes)) #print "No Accuracy: " + str(float(guessno)/float(correctno)) printresult = orngStat.CA(res) print "Accuracy: " + str(printresult[0])
def __call__(self, examples, weightID=0): if self.classifier is None: classifier = self.learner(examples, weightID=weightID) else: classifier = self.classifier for f in examples.domain.classVar.values: print "f", f, f.__class__ classIndex = examples.domain.classVar.values.index(self.positiveValue) results = orngTest.testOnData([classifier], examples) thresholds = list(na.arange(0, 1.1, 0.01)) matrices = [ orngStat.confusionMatrices(results, classIndex=classIndex, cutoff=threshold)[0] for threshold in thresholds ] fscores = map(fScore, matrices) i, score = math2d.argMax(fscores) threshold = thresholds[i] print "fscores", fscores print "threshold", threshold print "score", score return ThresholdProbabilityClassifier(self.classifier, threshold, self.positiveValue, self.negativeValue)
def __call__(self, examples, weightID=0):
    """Fit a single-attribute threshold classifier on `examples`.

    Sweeps `steps` evenly spaced thresholds over the observed range of
    self.attributeName, trying both label orderings at each threshold,
    and returns the candidate with the highest F-score (class index 0).
    """
    # Hoist the attribute scan so examples are traversed once, not twice.
    attrValues = [x[self.attributeName] for x in examples]
    maxValue = max(attrValues)
    minValue = min(attrValues)
    steps = 10
    values = ["True", "False"]  # examples.domain.classVar.values
    assert len(values) == 2
    reversedValues = [x for x in reversed(values)]
    classifiers = []
    # NOTE(review): assumes the attribute values are floats; with ints the
    # step (maxValue - minValue) / steps truncates under Python 2 division.
    for threshold in arange(minValue, maxValue, (maxValue - minValue) / steps):
        classifiers.append(
            ThresholdClassifier(self.attributeName, threshold, values,
                                self.classifyFunction))
        classifiers.append(
            ThresholdClassifier(self.attributeName, threshold, reversedValues,
                                self.classifyFunction))
    # (Removed dead locals bestAccuracy/bestThreshold from the original.)
    maxFScore = 0
    bestClassifier = None
    for candidate in classifiers:
        results = orngTest.testOnData([candidate], examples)
        fscore = fScore(orngStat.confusionMatrices(results, classIndex=0)[0])
        # <= keeps the last of equally scoring candidates, as before.
        if maxFScore <= fscore:
            maxFScore = fscore
            bestClassifier = candidate
    return bestClassifier
def CrossValidation(FeatureTable, n, p):
    """Pick the best decision tree over p contiguous folds.

    FeatureTable -- an orange ExampleTable with training data
    n -- the number of examples to use
    p -- the number of folds

    Each fold is held out in turn; a tree is trained on the rest and
    scored on the fold. Returns (learner, results) for the tree that
    classified the most held-out examples correctly.
    """
    best_learner = None
    best_results = None
    best_count = 0
    for i in range(p):
        lo = i * n / p
        hi = lo + (n / p)
        testData = FeatureTable.getItems(range(lo, hi))
        trainingData = FeatureTable.getItems(range(0, lo))
        for idx in range(hi, n):
            trainingData.append(FeatureTable[idx])
        tree = orngTree.TreeLearner(trainingData)
        res = orngTest.testOnData([tree], testData)
        correct = len([r for r in res.results
                       if r.classes[0] == r.actualClass])
        if correct > best_count:
            best_count = correct
            best_learner = tree
            best_results = res
    return best_learner, best_results
def CrossValidation(FeatureTable, n, p):
    """Best-of-p-folds tree selection.

    FeatureTable -- an orange ExampleTable with training data
    n -- the number of examples to use
    p -- the number of folds

    Splits the first n rows into p contiguous folds, trains a tree on
    everything outside each fold, and returns the (learner, results)
    pair from the fold with the most correct test classifications.
    """
    best = (0, None, None)  # (correct count, learner, results)
    for fold in range(p):
        start = fold * n / p
        end = start + (n / p)
        testData = FeatureTable.getItems(range(start, end))
        trainingData = FeatureTable.getItems(range(0, start))
        for j in range(end, n):
            trainingData.append(FeatureTable[j])
        candidate = orngTree.TreeLearner(trainingData)
        res = orngTest.testOnData([candidate], testData)
        correct = sum(1 for r in res.results
                      if r.classes[0] == r.actualClass)
        if correct > best[0]:
            best = (correct, candidate, res)
    return best[1], best[2]
def test_classifier(model, data):
    """Evaluate `model` on `data`.

    Returns (classification accuracy, raw orngTest results).
    """
    # testOnData requires a sequence of models, so wrap the single model.
    results = orngTest.testOnData((model, ), data)
    accuracy = orngStat.CA(results)[0]
    return accuracy, results
def saveClassifiers():
    """Train verb classifiers from direction-annotation assignments,
    profile the training run, pickle the classifiers, and plot a ROC
    curve for each classifier on a held-out assignment.
    """
    import psyco  # JIT compiler, speeds up the (long) training run
    # Map, skeleton and ground-truth tag files for direction_floor_3.
    map_fn = "%s/data/directions/direction_floor_3/direction_floor_3_small_filled.cmf" % TKLIB_HOME
    cluster_fn = "%s/data/directions/direction_floor_3/skels/direction_floor_3_skel.pck" % TKLIB_HOME
    gtruth_tag_fn = "%s/data/directions/direction_floor_3/tags/df3_small_tags.tag" % TKLIB_HOME
    # Human-labeled training assignments.
    assignment_fns = ["%s/nlp/data/aaai_2010_smv/stefie10/assignment1.1.yaml" % TKLIB_HOME,
                      "%s/nlp/data/aaai_2010_smv/stefie10/assignment1.2.yaml" % TKLIB_HOME,
                      "%s/nlp/data/aaai_2010_smv/stefie10/assignment2.1.yaml" % TKLIB_HOME,
                      "%s/nlp/data/aaai_2010_smv/tkollar/assignment3.1.yaml" % TKLIB_HOME,
                      ]
    tagFile = tag_util.tag_file(gtruth_tag_fn, map_fn)
    tagFile.get_map()
    tagFile.get_tag_names()
    #print cluster_fn
    #raw_input()
    skeleton = carmen_map_skeletonizer.load(cluster_fn, map_fn)
    assignments = [Assignment.load(fn, tagFile, skeleton) for fn in assignment_fns]
    #classifiers = makeClassifiers(assignment)
    result = []

    def run():
        # Closure: cProfile.runctx discards return values, so the trained
        # classifiers are smuggled out through the enclosing `result` list.
        classifiers = makeClassifiers(assignments)
        result.append(classifiers)

    start = time.time()
    cProfile.runctx("run()", globals(), locals(), "profile.out")
    end = time.time()
    print "took", (end - start)/60., "minutes"
    classifiers = result[0]
    fname = "%s/nlp/data/engines.verbs.floor3.stefie10.pck" % TKLIB_HOME
    cPickle.dump(classifiers, open(fname, "w"))
    print "wrote", fname
    #testingAssignment = Assignment.load("%s/nlp/data/aaai_2010_smv/stefie10/assignment1.1.yaml" % TKLIB_HOME, tagFile, skeleton)
    #testingAssignment = Assignment.load("%s/nlp/data/aaai_2010_smv/tkollar/assignment3.1.yaml" % TKLIB_HOME, tagFile, skeleton)
    # Held-out assignment (not in assignment_fns) used for evaluation.
    testingAssignment = Assignment.load("%s/nlp/data/aaai_2010_smv/stefie10/assignment4.1.yaml" % TKLIB_HOME, tagFile, skeleton)
    for name, c in classifiers.iteritems():
        engine = c.engine
        testing = makeTable(engine, [testingAssignment])
        results = orngTest.testOnData([c.classifier], testing)
        mpl.figure()
        line, = orangeGui.rocCurve(results, engine.name, stepSize=0.001,
                                   marker="x", plotArgs=dict(color="k"))
        # NOTE(review): axis labels look swapped relative to the other ROC
        # plots in this file (which put FP on x, TP on y) -- confirm.
        mpl.title(engine.name.capitalize(), fontsize=30)
        mpl.xlabel("TP")
        mpl.ylabel("FP")
        mpl.xticks([0, 1], fontsize=20)
        mpl.yticks([0, 1], fontsize=20)
        line.set_label(engine.name.upper())
        mpl.savefig("roc.%s.png" % engine.name)
        orangeUtils.displayResults(results)
    #mpl.legend(loc="lower right")
    #mpl.title("Classifiers for Verbs")
    mpl.show()
def doRun(title, newDomain, learner, training, testing, marker): newTraining = orangeUtils.convertTable(training, newDomain) newTesting = orangeUtils.convertTable(testing, newDomain) classifier = learner(newTraining) results = orngTest.testOnData([classifier], testing, storeClassifiers=1) print "title", title displayResults(results) line, = rocCurve(results, title, stepSize=0.001, marker=marker) line.set_label(title) lines.append(line) return results
def smallRocCurve(): trainer = Trainer() keys = None keys = None #keys = ["towards"] for i, key in enumerate(trainer.annotationEngines): if keys != None and not key in keys: continue print "*****************************************************" print key engine = trainer.engineMap[key] mpl.figure(figsize=(8, 8)) print "training" table = trainer.makeTable(engine) cv_indices = orange.MakeRandomIndices2(table, p0=0.75) training = table.select(cv_indices, 0, negate=True) testing = table.select(cv_indices, 0, negate=False) classifier = orangePickle.PickleableClassifier(training, orngBayes.BayesLearner) #orange.LogRegLearner) results = orngTest.testOnData([classifier], testing) displayResults(results) line = rocCurve(results, "", stepSize=0.001, marker=".", plotArgs=dict(linewidth=5)) line[0].set_label(engine.name()) mpl.xlabel("FP", fontsize=25) mpl.ylabel("TP", fontsize=25) mpl.xticks([0, 1], fontsize=20) mpl.yticks([0, 1], fontsize=20) ax = mpl.gca() ax.set_aspect(1. / ax.get_data_ratio()) mpl.title(engine.name().capitalize(), fontsize=30) #mpl.legend(loc='lower right', prop=FontProperties(size=25)) mpl.savefig("roc.%s.png" % engine.name()) mpl.show()
def getConfMat(testData, model):
    """Return the confusion matrix for `model` evaluated on `testData`."""
    results = orngTest.testOnData([model], testData)
    return ConfMat(results)
def main():
    """Train and evaluate one random-forest classifier per verb engine,
    printing per-example correctness and saving a ROC plot per verb.

    Command line: map_fn gtruth_tag_fn cluster_fn assignment_fn [...]
    """
    from sys import argv
    map_fn = argv[1]
    gtruth_tag_fn = argv[2]
    cluster_fn = argv[3]
    assignment_fns = argv[4:]
    tagFile = tag_util.tag_file(gtruth_tag_fn, map_fn)
    tagFile.get_map()
    tagFile.get_tag_names()
    skeleton = carmen_map_skeletonizer.load(cluster_fn, map_fn)
    assignments = [Assignment.load(assignment_fn, tagFile, skeleton)
                   for assignment_fn in assignment_fns]
    engineMap = dict((x.name, x) for x in [bring.Engine(),
                                           follow.Engine(),
                                           meet.Engine(),
                                           avoid.Engine(),
                                           #wander.Engine(),
                                           #go.Engine(),
                                           ])
    for engine in engineMap.values():
        verb = engine.name
        # NOTE(review): `and False` makes this filter dead code, so every
        # verb runs; remove or re-enable deliberately.
        if verb != "follow" and False:
            continue
        def run():
            return makeTable(engine, assignments)
        #cProfile.runctx("run()", globals(), locals(), "profile.out")
        #return
        table = run()
        print "verb", verb, len(table)
        # 2-fold split: half human-labeled training, half testing.
        cv_indices = orange.MakeRandomIndicesCV(table, 2)
        humanLabeledTraining = table.select(cv_indices, 0)
        training = orange.ExampleTable(humanLabeledTraining.domain)
        training.extend(humanLabeledTraining)
        # Augment the human-labeled half with generated subset examples.
        generatedTraining = makeSubsetExamples(engine, humanLabeledTraining)
        training.extend(generatedTraining)
        print "Using", len(generatedTraining), "subset examples"
        testing = table.select(cv_indices, 1)
        #testFeatureSubsets(engine, training, testing)
        #classifier = orngBayes.BayesLearner(training)
        classifier = RandomForestLearner(training)
        results = orngTest.testOnData([classifier], testing)
        print "results", results
        # Pair each test example with its prediction, sorted by description.
        tuples = list(zip(testing, results.results))
        tuples.sort(key=lambda x: x[0]["description"])
        for e, r in tuples:
            # print e["description"], e["hasApproach"], e["hasFollow"],
            if r.actualClass == r.classes[0]:
                print "correct", e["description"], e["entry"].value.id
            else:
                print "incorrect", e["description"], e["entry"].value.id
        mpl.figure(figsize=(6,6))
        mpl.subplots_adjust(bottom=0.13)
        line, = orangeGui.rocCurve(results, engine.name, stepSize=0.001,
                                   plotArgs={"color":"black"})
        orangeUtils.displayResults(results)
        mpl.xlabel("FP", fontsize=32)
        mpl.ylabel("TP", fontsize=32)
        mpl.xticks((0, 1), fontsize=20)
        mpl.yticks((0, 1), fontsize=20)
        line.set_label(engine.name)
        mpl.title(engine.name.capitalize(), fontsize=32)
        mpl.savefig("roc_%s.png" % engine.name)
        mpl.savefig("roc_%s.ps" % engine.name)
    mpl.show()
if __name__ == "__main__": """ Script to calculate the accuracy on a temporal test set with a saved model. Usage; python getTempAcc.py testDataPath modelPath """ # Full path to temporal test data file (with descriptors) in Orange format #testDataFile = "/home/jonna/projects/M-Lab/scfbmPaper/data/trainData.tab" testDataFile = sys.argv[1] # Read the test data testData = dataUtilities.DataTable(testDataFile) # Full path to the model #modelFile = "/home/jonna/projects/M-Lab/scfbmPaper/data/optRF.model" modelFile = sys.argv[2] # Read the model model = AZBaseClasses.modelRead(modelFile) # Use Orange methods to get the accuracy (Please see Orange doc) results = orngTest.testOnData([model], testData) print "Classification accuracy" print orngStat.CA(results)
def ablateFeaturesForCls(engineCls):
    """Feature-ablation study for one engine class: train a naive-Bayes
    classifier on each single feature in isolation, plus one on all
    features, and overlay their ROC curves in a single saved figure.
    """
    mpl.figure()
    trainer = Trainer()
    engine = engineCls()
    trainer.configureClassifier(engine)
    # One distinct marker/color per single-feature classifier.
    markers = [
        '.',
        ',',
        'v',
        '^',
        '<',
        '>',
        '1',
        '2',
        '3',
        '4',
        's',
        'p',
        '*',
        'h',
        'H',
    ]
    colors = ["b", "g", "r", "c", "m", "y"]
    sub_engines = []
    # One sub-engine per feature, restricted to exactly that feature.
    for i, name in enumerate(sorted(engine.masterList)):
        sub_engine = engineCls()
        sub_engine.setFeatureList([name])
        sub_engines.append((name, sub_engine))
    markers = markers[0:len(sub_engines)]
    colors = colors[0:len(sub_engines)]
    # Baseline: the full engine with all features, drawn in black circles.
    sub_engines.append(("all", engineCls()))
    markers.append("o")
    colors.append("k")
    for i, (name, sub_engine) in enumerate(sub_engines):
        table = trainer.configureClassifier(sub_engine)
        # 75/25 train/test split.
        cv_indices = orange.MakeRandomIndices2(table, p0=0.75)
        training = table.select(cv_indices, 0, negate=True)
        testing = table.select(cv_indices, 0, negate=False)
        #classifier = orange.LogRegLearner(training)
        classifier = orngBayes.BayesLearner(training)
        results = orngTest.testOnData([classifier], testing)
        displayResults(results)
        line = rocCurve(
            results,
            "",
            stepSize=0.001,
            marker=markers[i % len(markers)],
            plotArgs=dict(linewidth=5, markersize=10,
                          color=colors[i % len(colors)]),
        )
        line[0].set_label(name)
    mpl.title(engine.name(), size=30)
    mpl.xlabel("FP", fontsize=30)
    mpl.ylabel("TP", fontsize=30)
    mpl.xticks([0, 1], fontsize=17)
    mpl.yticks([0, 1], fontsize=17)
    mpl.subplots_adjust(bottom=0.14, top=0.91)
    mpl.legend(loc="lower right", prop=dict(size=17))
    mpl.savefig("roc.ablate.%s.png" % engine.name())
# Drop these RDKit fragment/VSA descriptors before training.
descList = ["rdk.fr_dihydropyridine", "rdk.fr_nitroso", "rdk.fr_benzodiazepine",
            "rdk.fr_thiocyan", "rdk.VSA_EState4", "rdk.VSA_EState6",
            "rdk.VSA_EState7", "rdk.VSA_EState1", "rdk.VSA_EState2",
            "rdk.VSA_EState3", "rdk.SlogP_VSA9", "rdk.SMR_VSA8",
            "rdk.fr_diazo", "rdk.fr_prisulfonamd", "rdk.fr_isocyan",
            "rdk.fr_azide", "rdk.fr_isothiocyan"]
# `data` is assumed to be loaded earlier in the script (not visible in
# this chunk) -- TODO confirm against the full file.
data = dataUtilities.attributeDeselectionData(data, descList)
print "Length domain ", len(data.domain)
# SVM with fixed hyperparameters; alternative learners kept for reference.
learner = AZorngCvSVM.CvSVMLearner(C=32, gamma=0.03125)
#learner = AZorngRF.RFLearner()
#learner = AZorngRF.RFLearner(stratify = "Yes")  # No effect
#learner = AZorngCvBoost.CvBoostLearner()
#learner.stratify = "Yes"  # No effect
#learner.priors = {"Active":0.80, "Inactive":0.20}
# Test set accuracy
# NOTE(review): the model is evaluated on its own training data below,
# so these are resubstitution numbers, not held-out accuracy -- confirm.
model = learner(data)
res = orngTest.testOnData([model], data)
CM = evalUtilities.ConfMat(res)[0]
CA = round(orngStat.CA(res)[0], 3)
MCC = round(evalUtilities.calcMCC(CM), 3)
# TH, FL, FH, TL
resList = [
    str(CM[0][0]),
    str(CM[0][1]),
    str(CM[1][0]),
    str(CM[1][1]),
    str(CA),
    str(MCC)
]
wrtStr = string.join(resList, "\t")
print "nonIID test set results"
print wrtStr
def test_classifier(model, data):
    """Score `model` on `data`.

    Returns a (classification accuracy, orngTest results) pair.
    """
    # testOnData wants a model sequence, so the model rides in a 1-tuple.
    res = orngTest.testOnData((model,), data)
    return orngStat.CA(res)[0], res
def nway():
    """Build one n-way classification problem from all annotation
    engines' features, train naive Bayes on a 75/25 split, print the
    confusion matrix, plot a per-class ROC curve, and pickle the
    classifier to nway.pck.
    """
    engine_to_examples = {}
    trainer = Trainer()
    classes = set()
    # For every labeled geometry, build one example per engine.
    for i, key in enumerate(trainer.annotationEngines):
        engine = trainer.engineMap[key]
        table = trainer.makeTable(engine)
        for ex in table:
            if ex["farAway"].value:
                cls = "null"
            else:
                cls = ex["sourceEngineName"].value
            geometry = ex["geometry"].value
            engine_to_examples.setdefault(cls, [])
            classes.add(cls)
            examples = [trainer.engineMap[key].makeExample(expectInsane=True,
                                                           **geometry)
                        for key in trainer.annotationEngines
                        if not len(geometry["figure"]) == 0]
            engine_to_examples[cls].append(examples)
        if i >= 1:
            #break
            pass
    # Flat domain built from the attributes of the LAST example list.
    # NOTE(review): relies on `examples` leaking out of the loop above and
    # assumes every engine contributes the same attribute names -- confirm.
    variables = []
    for ex in examples:
        for attr in ex.domain:
            if attr.name == "class":
                continue
            new_attr = orange.FloatVariable(attr.name)
            variables.append(new_attr)
    domain = orange.Domain(variables,
                           orange.EnumVariable("class", values=list(classes)))
    table = orange.ExampleTable(domain)
    # Merge each per-engine example list into a single wide example.
    for engine_name, example_lists in engine_to_examples.iteritems():
        for example_list in example_lists:
            ex = orange.Example(domain)
            for engine_ex in example_list:
                for attr in engine_ex.domain:
                    ex[attr.name] = engine_ex[attr.name]
            ex["class"] = engine_name
            table.append(ex)
    print "domain", domain
    # 75/25 train/test split.
    cv_indices = orange.MakeRandomIndices2(table, p0=0.75)
    training = table.select(cv_indices, 0, negate=True)
    testing = table.select(cv_indices, 0, negate=False)
    #classifier = orngBayes.BayesLearner(training)
    classifier = orangePickle.PickleableClassifier(training,
                                                   orngBayes.BayesLearner)
    results = orngTest.testOnData([classifier], testing)
    print orngStat.CA(results)
    # Print the confusion matrix with right-justified columns.
    cm = orngStat.confusionMatrices(results)[0]
    classes = list(domain.classVar.values)
    print " ", " ".join([c.rjust(12) for c in classes + ["", ""]])
    for className, classConfusions in zip(classes, cm):
        #format = ("%s" + ("\t%i" * len(classes)))
        values = (className, ) + tuple(classConfusions)
        print " ".join([str(c).rjust(12) for c in values])
        #print format % values
    # One ROC figure per class.
    for name in classes:
        classIndex = classes.index(name)
        mpl.figure()
        rocCurve(results, "", classIndex,
                 stepSize=0.001, plotArgs=dict(linewidth=5, markersize=10))
        mpl.title(name, size=30)
        mpl.xlabel("FP", fontsize=30)
        mpl.ylabel("TP", fontsize=30)
        mpl.xticks([0, 1], fontsize=17)
        mpl.yticks([0, 1], fontsize=17)
    fname = "nway.pck"
    print "saving", fname
    with open(fname, "w") as f:
        pickle.dump(classifier, f, protocol=2)
    mpl.show()
import orange, orngWrap, orngTest, orngStat data = orange.ExampleTable("bupa") ri2 = orange.MakeRandomIndices2(data, 0.7) train = data.select(ri2, 0) test = data.select(ri2, 1) bayes = orange.BayesLearner(train) thresholds = [.2, .5, .8] models = [orngWrap.ThresholdClassifier(bayes, thr) for thr in thresholds] res = orngTest.testOnData(models, test) cm = orngStat.confusionMatrices(res) print for i, thr in enumerate(thresholds): print "%1.2f: TP %5.3f, TN %5.3f" % (thr, cm[i].TP, cm[i].TN)
from trainingMethods import AZBaseClasses from AZutilities import dataUtilities from AZutilities import paramOptUtilities if __name__ == "__main__": """ Script to calculate the accuracy on a temporal test set with a saved model. Usage; python getTempAcc.py testDataPath modelPath """ # Full path to temporal test data file (with descriptors) in Orange format #testDataFile = "/home/jonna/projects/M-Lab/scfbmPaper/data/trainData.tab" testDataFile = sys.argv[1] # Read the test data testData = dataUtilities.DataTable(testDataFile) # Full path to the model #modelFile = "/home/jonna/projects/M-Lab/scfbmPaper/data/optRF.model" modelFile = sys.argv[2] # Read the model model = AZBaseClasses.modelRead(modelFile) # Use Orange methods to get the accuracy (Please see Orange doc) results = orngTest.testOnData([model], testData) print "Classification accuracy" print orngStat.CA(results)