def test_rnd_sampling(data, learners, p=0.9, n=30):
    acc = [0.0] * len(learners)
    tpos = [0.0] * len(learners)
    tneg = [0.0] * len(learners)
    fpos = [0.0] * len(learners)
    fneg = [0.0] * len(learners)
    for i in range(n):
        # Draw independent test (10%) and training (p) samples, seeding from the clock
        newselection = orange.MakeRandomIndices2(
            data, 0.1, randseed=(datetime.datetime.now().time().microsecond + i))
        selection = orange.MakeRandomIndices2(
            data, p, randseed=(datetime.datetime.now().time().microsecond - i))
        train_data = data.select(selection, 0)
        test_data = data.select(newselection, 0)
        classifiers = [l(train_data) for l in learners]
        # Evaluate once per repetition and unpack all five statistics
        acc1, tpos1, tneg1, fpos1, fneg1 = accuracy(test_data, classifiers)[:5]
        for j in range(len(learners)):
            acc[j] += acc1[j]
            tpos[j] += tpos1[j]
            tneg[j] += tneg1[j]
            fpos[j] += fpos1[j]
            fneg[j] += fneg1[j]
    for j in range(len(learners)):
        acc[j] /= n
        tpos[j] /= n
        tneg[j] /= n
        fpos[j] /= n
        fneg[j] /= n
    return (acc, tpos, tneg, fpos, fneg)
def test_distance_on(self, dataset):
    import numpy
    indices = orange.MakeRandomIndices2(dataset, min(20, len(dataset)))
    dataset = dataset.select(indices, 0)
    with member_set(self.distance_constructor, "ignore_class", True):
        mat = distance_matrix(dataset, self.distance_constructor)

    self.assertIsInstance(mat, Orange.misc.SymMatrix)
    self.assertEqual(mat.dim, len(dataset))

    m = numpy.array(list(mat))
    self.assertTrue((m >= 0.0).all())

    if dataset.domain.class_var:
        with member_set(self.distance_constructor, "ignore_class", False):
            try:
                mat = distance_matrix(dataset, self.distance_constructor)
            except orange.KernelException, ex:
                if "not supported" in str(ex):
                    return
                else:
                    raise
        m1 = numpy.array(list(mat))
        self.assertTrue(
            (m1 != m).all() or dataset,
            "%r does not seem to respect the 'ignore_class' flag"
            % self.distance_constructor)
def learningCurveWithTestData(learners, learnset, testset, times=10,
                              proportions=orange.frange(0.1),
                              strat=orange.MakeRandomIndices.StratifiedIfPossible,
                              pps=[], **argkw):
    verb = argkw.get("verbose", 0)
    learnset, learnweight = demangleExamples(learnset)
    testweight = demangleExamples(testset)[1]

    randomGenerator = argkw.get("indicesrandseed", 0) or \
                      argkw.get("randseed", 0) or \
                      argkw.get("randomGenerator", 0)
    pick = orange.MakeRandomIndices2(stratified=strat,
                                     randomGenerator=randomGenerator)
    allResults = []
    for p in proportions:
        printVerbose("Proportion: %5.3f" % p, verb)
        testResults = ExperimentResults(
            times, [l.name for l in learners],
            testset.domain.classVar.values.native(),
            testweight != 0, testset.domain.classVar.baseValue)
        testResults.results = []
        for t in range(times):
            printVerbose("  repetition %d" % t, verb)
            learnAndTestOnTestData(
                learners,
                (learnset.selectref(pick(learnset, p), 0), learnweight),
                testset, testResults, t)
        allResults.append(testResults)
    return allResults
def test_pickling_on(self, dataset):
    """ Test learner and classifier pickling.
    """
    classifier = self.learner(dataset)

    s = pickle.dumps(classifier)
    classifier_clone = pickle.loads(s)

    indices = orange.MakeRandomIndices2(p0=20)(dataset)
    test = dataset.select(indices, 0)

    for ex in test:
        if isinstance(dataset.domain.class_var, Orange.feature.Continuous):
            # Compare to at most the third digit after the decimal point
            self.assertAlmostEqual(
                classifier(ex, orange.GetValue).native(),
                classifier_clone(ex, orange.GetValue).native(),
                min(3, dataset.domain.class_var.number_of_decimals),
                "Pickled and original classifier return a different value!")
        else:
            self.assertEqual(
                classifier(ex, orange.GetValue),
                classifier_clone(ex, orange.GetValue),
                "Pickled and original classifier return a different value!")
def main():
    print "loading"
    annotations = annotation_reader.from_file(
        "%s/data/directions/breadbox/nouns_stefie10.txt" % TKLIB_HOME)
    table = annotations.as_orange_table()
    cv_indices = orange.MakeRandomIndices2(table, p0=0.5)
    print "indices", set(cv_indices)
    print "splitting"
    training, testing = annotation_reader.split(annotations, cv_indices)
    print "features"
    engine = PairwiseEngine(training)
    training_table = engine.training_table
    testing_table = engine.makeTable(testing)
    print len(training_table), "training"
    print len(testing_table), "testing"
    learners = [orange.MajorityLearner(),
                orngEnsemble.RandomForestLearner()]
    results = orngTest.learnAndTestOnTestData(learners, training_table,
                                              testing_table)
    for accuracy, cm in zip(orngStat.CA(results),
                            orngStat.confusionMatrices(results)):
        print orangeUtils.confusion_matrix_to_string(table.domain, cm)
        print "accuracy: %.2f%%" % (accuracy * 100)
def proportionTest(learners, examples, learnProp, times=10,
                   strat=orange.MakeRandomIndices.StratifiedIfPossible,
                   pps=[], callback=None, **argkw):
    """Train-and-test evaluation (train on a subset, test on the remaining examples)."""
    # randomGenerator is set either to what the user provided or to orange.RandomGenerator(0).
    # If we left it None, or if we set MakeRandomIndices2.randseed, it would give
    # the same indices each time it is called.
    randomGenerator = argkw.get("indicesrandseed", 0) or \
                      argkw.get("randseed", 0) or \
                      argkw.get("randomGenerator", 0)
    pick = orange.MakeRandomIndices2(stratified=strat, p0=learnProp,
                                     randomGenerator=randomGenerator)

    examples, weight = demangleExamples(examples)
    classVar = examples.domain.classVar
    if classVar.varType == orange.VarTypes.Discrete:
        values = list(classVar.values)
        baseValue = classVar.baseValue
    else:
        baseValue = values = None
    testResults = ExperimentResults(times, [l.name for l in learners],
                                    values, weight != 0, baseValue)

    for time in range(times):
        indices = pick(examples)
        learnset = examples.selectref(indices, 0)
        testset = examples.selectref(indices, 1)
        learnAndTestOnTestData(learners, (learnset, weight), (testset, weight),
                               testResults, time, pps, **argkw)
        if callback:
            callback()
    return testResults
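
# The seeding behaviour noted in the comment above can be checked in isolation.
# A minimal sketch, assuming the lenses data set (used by a later snippet) is
# on the path: with randseed set, the generator is re-seeded on every call and
# the same indices come back each time, while a shared randomGenerator keeps
# advancing between calls.
import orange

data = orange.ExampleTable("lenses")

pick = orange.MakeRandomIndices2(p0=0.7)
pick.randseed = 42
print list(pick(data)) == list(pick(data))   # True: randseed re-seeds per call

pick = orange.MakeRandomIndices2(p0=0.7)
pick.randomGenerator = orange.RandomGenerator(42)
print list(pick(data)) == list(pick(data))   # typically False: the generator advances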
def learningCurveN(learners, examples, folds=10,
                   strat=orange.MakeRandomIndices.StratifiedIfPossible,
                   proportions=orange.frange(0.1), pps=[], **argkw):
    """Construct a learning curve for the given learners."""
    seed = argkw.get("indicesrandseed", -1) or argkw.get("randseed", -1)
    if seed:
        randomGenerator = orange.RandomGenerator(seed)
    else:
        randomGenerator = argkw.get("randomGenerator", orange.RandomGenerator())

    if strat:
        cv = orange.MakeRandomIndicesCV(folds=folds, stratified=strat,
                                        randomGenerator=randomGenerator)
        pick = orange.MakeRandomIndices2(stratified=strat,
                                         randomGenerator=randomGenerator)
    else:
        cv = orange.RandomIndicesCV(folds=folds, stratified=strat,
                                    randomGenerator=randomGenerator)
        pick = orange.RandomIndices2(stratified=strat,
                                     randomGenerator=randomGenerator)
    return learningCurve(*(learners, examples, cv, pick, proportions, pps),
                         **argkw)
def test_rnd_sampling(data, learners, p=0.9, n=30):
    acc = [0.0] * len(learners)
    for i in range(n):
        newselection = orange.MakeRandomIndices2(data, 0.1, randseed=i + 10)
        selection = orange.MakeRandomIndices2(data, p, randseed=i)
        train_data = data.select(selection, 0)  # the 0-marked examples form the training set
        test_data = data.select(newselection, 0)
        classifiers = []
        for l in learners:
            classifiers.append(l(train_data))
        acc1 = accuracy(test_data, classifiers)
        print "%d: %s" % (i + 1, acc1)
        for j in range(len(learners)):
            acc[j] += acc1[j]
    for j in range(len(learners)):
        acc[j] = acc[j] / n
    return acc
def partition_data(data, percent_train=0.5):
    indx = orange.MakeRandomIndices2(p0=percent_train)
    train_indices = indx(data)
    train = data.select(train_indices)
    test = data.select(train_indices, negate=True)
    return (train, test)
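
# A brief usage sketch for partition_data above, assuming the iris data set is
# available: both selects share one index vector and negate=True returns its
# complement, so the two parts are disjoint and together cover the data.
import orange

data = orange.ExampleTable("iris")
train, test = partition_data(data, percent_train=0.7)
assert len(train) + len(test) == len(data)   # complementary by construction
print len(train), len(test)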
def cforange_split_dataset(input_dict):
    import orange
    output_dict = {}
    data = input_dict['dataset']
    selection = orange.MakeRandomIndices2(data, float(input_dict['p']))
    train_data = data.select(selection, 0)
    test_data = data.select(selection, 1)
    output_dict['train_data'] = train_data
    output_dict['test_data'] = test_data
    return output_dict
def FindSmilesAttr(self, data):
    # Sample at most 20 examples and count, for each string attribute, how
    # many of its values parse as SMILES; return the best-scoring attribute.
    data = data.select(orange.MakeRandomIndices2(data, min(20, len(data))))
    stringVars = filter(lambda var: type(var) == orange.StringVariable,
                        data.domain.attributes + data.domain.getmetas().values())
    count = dict.fromkeys(stringVars, 0)
    for example in data:
        for var in stringVars:
            if LoadMolFromSmiles(str(example[var])):
                count[var] += 1
    count = count.items()
    count.sort(lambda a, b: cmp(a[1], b[1]))
    return count[-1][0]
def test_rnd_sampling(data, learners, p=0.9, n=30):
    acc = [0.0] * len(learners)
    sens = [0.0] * len(learners)
    spec = [0.0] * len(learners)
    fpos = [0.0] * len(learners)
    fneg = [0.0] * len(learners)
    for i in range(n):
        newselection = orange.MakeRandomIndices2(data, 0.1, randseed=i + 10)
        selection = orange.MakeRandomIndices2(data, p, randseed=i)
        train_data = data.select(selection, 0)  # the 0-marked examples form the training set
        test_data = data.select(newselection, 0)
        classifiers = [l(train_data) for l in learners]
        # Evaluate once per repetition and unpack all five statistics
        acc1, sens1, spec1, fpos1, fneg1 = accuracy(test_data, classifiers)[:5]
        print "%d: %s" % (i + 1, acc1)
        for j in range(len(learners)):
            acc[j] += acc1[j]
            sens[j] += sens1[j]
            spec[j] += spec1[j]
            fpos[j] += fpos1[j]
            fneg[j] += fneg1[j]
    for j in range(len(learners)):
        acc[j] /= n
        sens[j] /= n
        spec[j] /= n
        fpos[j] /= n
        fneg[j] /= n
    return (acc, sens, spec, fpos, fneg)
def randSamp(self, inData, trainingFrac):
    """Use random sampling to partition inData into a training and a test set.

    The seed is derived from the data, so the same partitioning is obtained
    each time the same data is partitioned.
    """
    # p0 marks the training fraction; those examples are labelled 0
    indices = orange.MakeRandomIndices2(p0=trainingFrac)
    indices.randomGenerator = None
    indices.randseed = len(inData)
    selection = indices(inData)
    train_data = inData.select(selection, 0)
    test_data = inData.select(selection, 1)
    return train_data, test_data
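
# A hedged usage sketch for randSamp: `splitter` is a hypothetical instance of
# the class defining it and `data` any ExampleTable. Because randseed is
# derived from len(inData), repeating the call on the same data reproduces the
# partitioning exactly.
train1, test1 = splitter.randSamp(data, trainingFrac=0.7)
train2, test2 = splitter.randSamp(data, trainingFrac=0.7)
# same data -> same seed -> identical split on both calls
assert [str(ex) for ex in train1] == [str(ex) for ex in train2]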
def test_MakeRandomIndices2(self):
    d = orange.ExampleTable("iris")

    inds = orange.MakeRandomIndices2(10, p0=5)
    self.assertEqual(sum(inds), 5)

    inds = orange.MakeRandomIndices2(10, p0=0.5)
    self.assertEqual(sum(inds), 5)

    inds = orange.MakeRandomIndices2(10, p0=0)
    self.assertEqual(sum(inds), 10)

    inds = orange.MakeRandomIndices2(10, p0=1)
    self.assertEqual(sum(inds), 0)

    mr = orange.MakeRandomIndices2(p0=0.3)
    self.assertEqual(sum(mr(10)), 7)

    mr.p0 = 0.9
    inds = mr(d)
    self.assertEqual(sum(inds), 15)
    self.assertEqual(
        len([i for i, fold in enumerate(inds)
             if fold == 0 and d[i].getclass() == 0]), 45)

    mr.stratified = mr.Stratification.NotStratified
    inds = mr(d)
    self.assertEqual(sum(inds), 15)
    ## Probably not equal... ;)
    self.assertNotEqual(
        len([i for i, fold in enumerate(inds)
             if fold == 0 and d[i].getclass() == 0]), 45)
def test_rnd_sampling(data, learners, p=0.7, n=10):
    acc = [0.0] * len(learners)
    for i in range(n):
        selection = orange.MakeRandomIndices2(data, p)
        train_data = data.select(selection, 0)
        test_data = data.select(selection, 1)
        classifiers = []
        for l in learners:
            classifiers.append(l(train_data))
        acc1 = accuracy(test_data, classifiers)
        print "%d: %s" % (i + 1, acc1)
        for j in range(len(learners)):
            acc[j] += acc1[j]
    for j in range(len(learners)):
        acc[j] = acc[j] / n
    return acc
def smallRocCurve():
    trainer = Trainer()
    keys = None
    #keys = ["towards"]
    for i, key in enumerate(trainer.annotationEngines):
        if keys is not None and key not in keys:
            continue
        print "*****************************************************"
        print key
        engine = trainer.engineMap[key]
        mpl.figure(figsize=(8, 8))
        print "training"
        table = trainer.makeTable(engine)
        cv_indices = orange.MakeRandomIndices2(table, p0=0.75)
        training = table.select(cv_indices, 0, negate=True)
        testing = table.select(cv_indices, 0, negate=False)
        classifier = orangePickle.PickleableClassifier(training,
                                                       orngBayes.BayesLearner)
                                                       #orange.LogRegLearner)
        results = orngTest.testOnData([classifier], testing)
        displayResults(results)
        line = rocCurve(results, "", stepSize=0.001, marker=".",
                        plotArgs=dict(linewidth=5))
        line[0].set_label(engine.name())
        mpl.xlabel("FP", fontsize=25)
        mpl.ylabel("TP", fontsize=25)
        mpl.xticks([0, 1], fontsize=20)
        mpl.yticks([0, 1], fontsize=20)
        ax = mpl.gca()
        ax.set_aspect(1. / ax.get_data_ratio())
        mpl.title(engine.name().capitalize(), fontsize=30)
        #mpl.legend(loc='lower right', prop=FontProperties(size=25))
        mpl.savefig("roc.%s.png" % engine.name())
    mpl.show()
def main():
    print "loading"
    annotations = annotation_reader.from_file(
        "%s/data/directions/breadbox/nouns_stefie10.txt" % TKLIB_HOME)
    annotator2 = annotation_reader.from_file(
        "%s/data/directions/breadbox/nouns_dlaude.partial.txt" % TKLIB_HOME)
    #histogram(annotations)
    print "table"
    table = annotations.as_orange_table()
    cv_indices = orange.MakeRandomIndices2(table, p0=0.5)
    print "indices", set(cv_indices)
    print "splitting"
    training, testing = annotation_reader.split(annotations, cv_indices)
    print "features"
    engine = WordnetParentsEngine(training)
    training_table = engine.makeTable(training)
    testing_table = engine.makeTable(testing)
    #training_table, testing_table = wordnet_parents(training, testing)
    #training_table, testing_table = wordnet_glosses(training, testing)
    #training_table, testing_table = flickr_parents(training, testing)
    print len(training_table), "training examples"
    print len(testing_table), "testing examples"
    #training_table = annotation_reader.to_big_small(training_table)
    #testing_table = annotation_reader.to_big_small(testing_table)
    #information_gain = orange.MeasureAttribute_info()
    #for x in training_table.domain.attributes:
    #    print "x", information_gain(x, training_table)
    learners = [orange.MajorityLearner(),
                orngEnsemble.RandomForestLearner(),
                WordnetKnnClassifier,
                agreement.WizardOfOzLearner(annotator2.as_orange_table())]
    results = orngTest.learnAndTestOnTestData(learners, training_table,
                                              testing_table)
    for accuracy, cm in zip(orngStat.CA(results),
                            orngStat.confusionMatrices(results)):
        print orangeUtils.confusion_matrix_to_string(table.domain, cm)
        print "accuracy: %.2f%%" % (accuracy * 100)
def learningCurve(learners, examples, cv=None, pick=None,
                  proportions=orange.frange(0.1), pps=[], **argkw):
    verb = argkw.get("verbose", 0)
    cache = argkw.get("cache", 0)
    callback = argkw.get("callback", 0)

    for pp in pps:
        if pp[0] != "L":
            raise SystemError("cannot preprocess testing examples")

    if not cv or not pick:
        seed = argkw.get("indicesrandseed", -1) or argkw.get("randseed", -1)
        if seed:
            randomGenerator = orange.RandomGenerator(seed)
        else:
            randomGenerator = argkw.get("randomGenerator",
                                        orange.RandomGenerator())
    if not cv:
        cv = orange.MakeRandomIndicesCV(
            folds=10,
            stratified=orange.MakeRandomIndices.StratifiedIfPossible,
            randomGenerator=randomGenerator)
    if not pick:
        pick = orange.MakeRandomIndices2(
            stratified=orange.MakeRandomIndices.StratifiedIfPossible,
            randomGenerator=randomGenerator)

    examples, weight = demangleExamples(examples)
    folds = cv(examples)
    ccsum = hex(examples.checksum())[2:]
    ppsp = encodePP(pps)
    nLrn = len(learners)

    allResults = []
    for p in proportions:
        printVerbose("Proportion: %5.3f" % p, verb)
        if (cv.randseed < 0) or (pick.randseed < 0):
            cache = 0
        else:
            fnstr = "{learningCurve}_%s_%s_%s_%s%s-%s" % (
                "%s", p, cv.randseed, pick.randseed, ppsp, ccsum)
            if "*" in fnstr:
                cache = 0

        conv = examples.domain.classVar.varType == orange.VarTypes.Discrete \
               and int or float
        testResults = ExperimentResults(
            cv.folds, [l.name for l in learners],
            examples.domain.classVar.values.native(),
            weight != 0, examples.domain.classVar.baseValue)
        testResults.results = [
            TestedExample(folds[i], conv(examples[i].getclass()), nLrn,
                          examples[i].getweight(weight))
            for i in range(len(examples))]

        if cache and testResults.loadFromFiles(learners, fnstr):
            printVerbose("  loaded from cache", verb)
        else:
            for fold in range(cv.folds):
                printVerbose("  fold %d" % fold, verb)

                # learning
                learnset = examples.selectref(folds, fold, negate=1)
                learnset = learnset.selectref(pick(learnset, p0=p), 0)
                if not len(learnset):
                    continue
                for pp in pps:
                    learnset = pp[1](learnset)

                classifiers = [None] * nLrn
                for i in range(nLrn):
                    if not cache or not testResults.loaded[i]:
                        classifiers[i] = learners[i](learnset, weight)

                # testing
                for i in range(len(examples)):
                    if folds[i] == fold:
                        # This is to prevent cheating:
                        ex = orange.Example(examples[i])
                        ex.setclass("?")
                        for cl in range(nLrn):
                            if not cache or not testResults.loaded[cl]:
                                cls, pro = classifiers[cl](ex, orange.GetBoth)
                                testResults.results[i].setResult(cl, cls, pro)
            if callback:
                callback()
            if cache:
                testResults.saveToFiles(learners, fnstr)

        allResults.append(testResults)

    return allResults
def __call__(self, data, weight=None):
    bestSeed = None
    bestAcc = None
    bestNiter = None
    bestModel = None
    # Fix self.nDiffIniWeights for the disabled mode
    if self.nDiffIniWeights <= 1:
        self.nDiffIniWeights = 1  # loop over n different initial weights disabled
    # Fix self.stopUPs for the disabled mode
    if self.stopUPs <= 0:
        self.stopUPs = 0  # optimization of nIter will be disabled

    self.NTrainEx = len(data)
    # Remove from the domain any unused values of discrete attributes, including the class
    data = dataUtilities.getDataWithoutUnusedValues(data, True)
    #dataUtilities.rmAllMeta(data)
    if len(data.domain.getmetas()) == 0:
        cleanedData = data
    else:
        cleanedData = dataUtilities.getCopyWithoutMeta(data)
    # Create the imputer
    self.imputer = orange.ImputerConstructor_average(cleanedData)
    # Impute the data
    self.trainData = self.imputer(cleanedData)

    # If we are setting neither the initial-weights optimization nor the
    # nIter optimization (opencv layer), do not split the data
    if self.stopUPs != 0 or self.nDiffIniWeights > 1:
        # Define a training set (80%) and a validation set (20%) of the input data
        indices = orange.MakeRandomIndices2(
            p0=0.2, stratified=orange.MakeRandomIndices.StratifiedIfPossible)
        ind = indices(cleanedData)
        self.trainData = cleanedData.select(ind, 1)
        validationSet = cleanedData.select(ind, 0)
    else:
        validationSet = None

    if self.verbose and self.nDiffIniWeights > 1:
        print "=========== Training ", self.nDiffIniWeights, \
              " times with different initial weights =============="
    for n in range(self.nDiffIniWeights):
        if self.nDiffIniWeights <= 1:
            # In opencv ml ANN, seed=0 means the seed is disabled and the original seed is used
            seed = 0
        else:
            seed = len(cleanedData) * len(cleanedData.domain) * (n + 1)  # seed can be any integer
        # Create a model with a specific seed for training the opencv ANN.
        # Also pass the step for the nIter optimization (self.stopUPs=0 disables it)
        # and the validation set used by the internal opencv nEpochs optimization.
        model = self.__train__(weight=None, seed=seed,
                               validationSet=validationSet)
        # Skip evaluation if the initial-weights loop is disabled
        if self.nDiffIniWeights <= 1:
            return model
        if cleanedData.domain.classVar.varType == orange.VarTypes.Discrete:
            Acc = evalUtilities.getClassificationAccuracy(validationSet, model)
        else:
            Acc = -evalUtilities.getRMSE(validationSet, model)
        if bestModel is None or (Acc > bestAcc) or \
           (Acc == bestAcc and model.nIter < bestNiter):
            bestSeed = seed
            bestAcc = Acc
            bestNiter = model.nIter
            bestModel = model
        if self.verbose:
            print "nIter:%-7s Acc:%-20s seed: %s" % (model.nIter, Acc, seed)

    if self.verbose:
        print "================ Best model found: ==================="
        print "nIter:%-7s Acc:%-20s seed: %s" % (bestNiter, bestAcc, bestSeed)
    # DEBUG check that the returned model is indeed the best one, not the last trained:
    #if cleanedData.domain.classVar.varType == orange.VarTypes.Discrete:
    #    Acc = evalUtilities.getClassificationAccuracy(validationSet, bestModel)
    #else:
    #    Acc = -evalUtilities.getRMSE(validationSet, bestModel)
    #if self.verbose: print "================ Best model returned: ==================="
    #if self.verbose: print "nIter:%-7s Acc:%-20s seed: %s" % (bestModel.nIter, Acc, bestModel.seed)
    return bestModel
import orange, orngWrap, orngTest, orngStat

data = orange.ExampleTable("bupa")
ri2 = orange.MakeRandomIndices2(data, 0.7)
train = data.select(ri2, 0)
test = data.select(ri2, 1)

bayes = orange.BayesLearner(train)

thresholds = [.2, .5, .8]
models = [orngWrap.ThresholdClassifier(bayes, thr) for thr in thresholds]

res = orngTest.testOnData(models, test)
cm = orngStat.confusionMatrices(res)

print
for i, thr in enumerate(thresholds):
    print "%1.2f: TP %5.3f, TN %5.3f" % (thr, cm[i].TP, cm[i].TN)
##res = orngTest.proportionTest(learners, data, 0.7, 100, pps=[("L", classnoise)])
##printResults(res)

print "\nGood old 10-fold cross validation"
res = orngTest.crossValidation(learners, data)
printResults(res)

print "\nLearning curve"
prop = orange.frange(0.2, 1.0, 0.2)
res = orngTest.learningCurveN(learners, data, folds=5, proportions=prop)
for i in range(len(prop)):
    print "%5.3f:" % prop[i],
    printResults(res[i])

print "\nLearning curve with pre-separated data"
indices = orange.MakeRandomIndices2(data, p0=0.7)
train = data.select(indices, 0)
test = data.select(indices, 1)
res = orngTest.learningCurveWithTestData(learners, train, test, times=5,
                                         proportions=prop)
for i in range(len(prop)):
    print "%5.3f:" % prop[i],
    printResults(res[i])

print "\nLearning and testing on pre-separated data"
res = orngTest.learnAndTestOnTestData(learners, train, test)
printResults(res)
def ablateFeaturesForCls(engineCls):
    mpl.figure()
    trainer = Trainer()
    engine = engineCls()
    trainer.configureClassifier(engine)
    markers = ['.', ',', 'v', '^', '<', '>', '1', '2', '3', '4',
               's', 'p', '*', 'h', 'H']
    colors = ["b", "g", "r", "c", "m", "y"]

    sub_engines = []
    for i, name in enumerate(sorted(engine.masterList)):
        sub_engine = engineCls()
        sub_engine.setFeatureList([name])
        sub_engines.append((name, sub_engine))
    markers = markers[0:len(sub_engines)]
    colors = colors[0:len(sub_engines)]

    sub_engines.append(("all", engineCls()))
    markers.append("o")
    colors.append("k")

    for i, (name, sub_engine) in enumerate(sub_engines):
        table = trainer.configureClassifier(sub_engine)
        cv_indices = orange.MakeRandomIndices2(table, p0=0.75)
        training = table.select(cv_indices, 0, negate=True)
        testing = table.select(cv_indices, 0, negate=False)
        #classifier = orange.LogRegLearner(training)
        classifier = orngBayes.BayesLearner(training)
        results = orngTest.testOnData([classifier], testing)
        displayResults(results)
        line = rocCurve(results, "", stepSize=0.001,
                        marker=markers[i % len(markers)],
                        plotArgs=dict(linewidth=5, markersize=10,
                                      color=colors[i % len(colors)]))
        line[0].set_label(name)

    mpl.title(engine.name(), size=30)
    mpl.xlabel("FP", fontsize=30)
    mpl.ylabel("TP", fontsize=30)
    mpl.xticks([0, 1], fontsize=17)
    mpl.yticks([0, 1], fontsize=17)
    mpl.subplots_adjust(bottom=0.14, top=0.91)
    mpl.legend(loc="lower right", prop=dict(size=17))
    mpl.savefig("roc.ablate.%s.png" % engine.name())
        self.changedFlag = False


##############################################################################
# Test the widget, run from DOS prompt

if __name__ == "__main__":
    a = QApplication(sys.argv)
    ow = OWPredictions()
    ow.show()

    import orngTree

    dataset = orange.ExampleTable('../../doc/datasets/iris.tab')
#    dataset = orange.ExampleTable('../../doc/datasets/auto-mpg.tab')
    ind = orange.MakeRandomIndices2(p0=0.5)(dataset)
    data = dataset.select(ind, 0)
    test = dataset.select(ind, 1)
    testnoclass = orange.ExampleTable(
        orange.Domain(test.domain.attributes, False), test)

    tree = orngTree.TreeLearner(data)
    tree.name = "tree"
    maj = orange.MajorityLearner(data)
    maj.name = "maj"
    knn = orange.kNNLearner(data, k=10)
    knn.name = "knn"

#    ow.setData(test)
#
#    ow.setPredictor(maj, 1)
import orange
import orngClustering

data = orange.ExampleTable("iris")
sample = data.selectref(orange.MakeRandomIndices2(data, 20), 0)
root = orngClustering.hierarchicalClustering(sample)
orngClustering.dendrogram_draw(
    "hclust-dendrogram.png", root, data=sample,
    labels=[str(d.getclass()) for d in sample])
# Description: Shows how to sample examples by random division into two groups
# Category:    sampling
# Classes:     MakeRandomIndices, MakeRandomIndices2, RandomGenerator
# Uses:        lenses
# Referenced:  RandomIndices.htm

import orange

data = orange.ExampleTable("lenses")

indices2 = orange.MakeRandomIndices2(p0=6)

ind = indices2(data)
print ind
data0 = data.select(ind, 0)
data1 = data.select(ind, 1)
print len(data0), len(data1)

print "\nIndices without playing with random generator"
for i in range(5):
    print indices2(data)

print "\nIndices with random generator"
indices2.randomGenerator = orange.RandomGenerator(42)
for i in range(5):
    print indices2(data)

print "\nIndices with randseed"
indices2.randomGenerator = None
indices2.randseed = 42
for i in range(5):
    print indices2(data)
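
# Several snippets here pass stratified=orange.MakeRandomIndices.StratifiedIfPossible.
# A minimal sketch of that flag on MakeRandomIndices2, reusing the lenses data
# loaded above: with stratification, the class distribution of each group
# roughly mirrors that of the whole data set.
pick = orange.MakeRandomIndices2(
    p0=0.5, stratified=orange.MakeRandomIndices.StratifiedIfPossible)
ind = pick(data)
for group in (0, 1):
    part = data.select(ind, group)
    print group, orange.Distribution(data.domain.classVar, part)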
def nway():
    engine_to_examples = {}
    trainer = Trainer()
    classes = set()
    for i, key in enumerate(trainer.annotationEngines):
        engine = trainer.engineMap[key]
        table = trainer.makeTable(engine)
        for ex in table:
            if ex["farAway"].value:
                cls = "null"
            else:
                cls = ex["sourceEngineName"].value
            geometry = ex["geometry"].value
            engine_to_examples.setdefault(cls, [])
            classes.add(cls)
            examples = [trainer.engineMap[key].makeExample(expectInsane=True,
                                                           **geometry)
                        for key in trainer.annotationEngines
                        if not len(geometry["figure"]) == 0]
            engine_to_examples[cls].append(examples)
        if i >= 1:
            #break
            pass

    variables = []
    for ex in examples:
        for attr in ex.domain:
            if attr.name == "class":
                continue
            new_attr = orange.FloatVariable(attr.name)
            variables.append(new_attr)

    domain = orange.Domain(variables,
                           orange.EnumVariable("class", values=list(classes)))
    table = orange.ExampleTable(domain)
    for engine_name, example_lists in engine_to_examples.iteritems():
        for example_list in example_lists:
            ex = orange.Example(domain)
            for engine_ex in example_list:
                for attr in engine_ex.domain:
                    ex[attr.name] = engine_ex[attr.name]
            ex["class"] = engine_name
            table.append(ex)
    print "domain", domain

    cv_indices = orange.MakeRandomIndices2(table, p0=0.75)
    training = table.select(cv_indices, 0, negate=True)
    testing = table.select(cv_indices, 0, negate=False)
    #classifier = orngBayes.BayesLearner(training)
    classifier = orangePickle.PickleableClassifier(training,
                                                   orngBayes.BayesLearner)
    results = orngTest.testOnData([classifier], testing)
    print orngStat.CA(results)
    cm = orngStat.confusionMatrices(results)[0]
    classes = list(domain.classVar.values)
    print " ", " ".join([c.rjust(12) for c in classes + ["", ""]])
    for className, classConfusions in zip(classes, cm):
        #format = ("%s" + ("\t%i" * len(classes)))
        values = (className,) + tuple(classConfusions)
        print " ".join([str(c).rjust(12) for c in values])
        #print format % values
    for name in classes:
        classIndex = classes.index(name)
        mpl.figure()
        rocCurve(results, "", classIndex, stepSize=0.001,
                 plotArgs=dict(linewidth=5, markersize=10))
        mpl.title(name, size=30)
        mpl.xlabel("FP", fontsize=30)
        mpl.ylabel("TP", fontsize=30)
        mpl.xticks([0, 1], fontsize=17)
        mpl.yticks([0, 1], fontsize=17)
    fname = "nway.pck"
    print "saving", fname
    with open(fname, "w") as f:
        pickle.dump(classifier, f, protocol=2)
    mpl.show()
# Description: Shows how to use the nearest-neighbour learning
# Category:    learning
# Classes:     kNNLearner, kNNClassifier, ExamplesDistance, ExamplesDistanceConstructor
# Uses:        iris
# Referenced:  kNNLearner.htm

import orange, orngTest, orngStat

data = orange.ExampleTable("iris")

rndind = orange.MakeRandomIndices2(data, p0=0.8)
train = data.select(rndind, 0)
test = data.select(rndind, 1)

knn = orange.kNNLearner(train, k=10)
for i in range(5):
    example = test.randomexample()
    print example.getclass(), knn(example)

print "\n\n"
data = orange.ExampleTable("iris")
knn = orange.kNNLearner()
knn.k = 10
knn.distanceConstructor = orange.ExamplesDistanceConstructor_Hamming()
knn = knn(train)
for i in range(5):
    example = test.randomexample()
    print example.getclass(), knn(example)
print

classes = vehicle.domain.classVar.values
AUCmatrix = orngStat.AUC_matrix(resVeh)[0]
print "\t" + "\t".join(classes[:-1])
for className, AUCrow in zip(classes[1:], AUCmatrix[1:]):
    print ("%s" + ("\t%5.3f" * len(AUCrow))) % ((className,) + tuple(AUCrow))

print
print "AUCs for detecting various pairs of classes in 'vehicle'"
for c1, s1 in enumerate(classes):
    for c2 in range(c1):
        print "%s vs %s: \t%5.3f\t%5.3f\t%5.3f" % \
              ((s1, classes[c2]) + tuple(orngStat.AUC_pair(resVeh, c1, c2)))

ri2 = orange.MakeRandomIndices2(voting, 0.6)
train = voting.selectref(ri2, 0)
test = voting.selectref(ri2, 1)
res1 = orngTest.learnAndTestOnTestData(learners, train, test)

print
print "AUC and SE for voting"
AUCs = orngStat.AUCWilcoxon(res1)
for li, lrn in enumerate(learners):
    print "%s: %5.3f+-%5.3f" % (lrn.name, AUCs[li][0], AUCs[li][1])

print
print "Difference between naive Bayes and tree: %5.3f+-%5.3f" % \
      tuple(orngStat.compare2AUCs(res1, 0, 1)[2])

print
print "ROC (first 20 points) for bayes on 'voting'"
# Description: Builds regression models from data and outputs predictions for the first ten instances
# Category:    modelling
# Uses:        housing
# Classes:     MakeRandomIndices2, MajorityLearner, orngTree.TreeLearner, orange.kNNLearner
# Referenced:  regression.htm

import orange, orngTree, orngTest, orngStat

data = orange.ExampleTable("housing.tab")
selection = orange.MakeRandomIndices2(data, 0.5)
train_data = data.select(selection, 0)
test_data = data.select(selection, 1)

maj = orange.MajorityLearner(train_data)
maj.name = "default"

rt = orngTree.TreeLearner(train_data, measure="retis",
                          mForPruning=2, minExamples=20)
rt.name = "reg. tree"

k = 5
knn = orange.kNNLearner(train_data, k=k)
knn.name = "k-NN (k=%i)" % k

regressors = [maj, rt, knn]

print "\n%10s " % "original",
for r in regressors:
    print "%10s " % r.name,
print

for i in range(10):
    # print the true value followed by each regressor's prediction
    print "%10.1f " % test_data[i].getclass(),
    for r in regressors:
        print "%10.1f " % r(test_data[i]),
    print
def findProjection(self, method, attrIndices=None, setAnchors=0,
                   percentDataUsed=100):
    if not self.graph.haveData:
        return
    ai = self.graph.attributeNameIndex
    if attrIndices is None:
        attributes = self.getShownAttributeList()
        attrIndices = [ai[label] for label in attributes]
    if len(attrIndices) == 0:
        return None

    validData = self.graph.getValidList(attrIndices)
    if sum(validData) == 0:
        return None

    dataMatrix = numpy.compress(validData,
                                numpy.take(self.graph.noJitteringScaledData,
                                           attrIndices, axis=0),
                                axis=1)
    if self.graph.dataHasClass:
        classArray = numpy.compress(
            validData,
            self.graph.noJitteringScaledData[self.graph.dataClassIndex])

    if percentDataUsed != 100:
        indices = orange.MakeRandomIndices2(
            self.graph.rawData, 1.0 - (float(percentDataUsed) / 100.0))
        try:
            dataMatrix = numpy.compress(indices, dataMatrix, axis=1)
        except:
            pass
        if self.graph.dataHasClass:
            classArray = numpy.compress(indices, classArray)

    vectors = None
    if method == DR_PCA:
        vals, vectors = createPCAProjection(
            dataMatrix, NComps=2,
            useGeneralizedEigenvectors=self.useGeneralizedEigenvectors)
    elif method == DR_SPCA and self.graph.dataHasClass:
        vals, vectors = createPCAProjection(
            dataMatrix, classArray, NComps=2,
            useGeneralizedEigenvectors=self.useGeneralizedEigenvectors)
    elif method == DR_PLS and self.graph.dataHasClass:
        dataMatrix = dataMatrix.transpose()
        classMatrix = numpy.transpose(numpy.matrix(classArray))
        vectors = createPLSProjection(dataMatrix, classMatrix, 2)
        vectors = vectors.T

    # Test if all values are 0, if there is an invalid number in the array,
    # and if there are complex numbers in the array
    if vectors is None or not vectors.any() or \
       False in numpy.isfinite(vectors) or False in numpy.isreal(vectors):
        self.setStatusBarText(
            "Unable to compute anchor positions for the selected attributes")
        return None

    xAnchors = vectors[0]
    yAnchors = vectors[1]

    m = math.sqrt(max(xAnchors ** 2 + yAnchors ** 2))
    xAnchors /= m
    yAnchors /= m
    names = self.graph.attributeNames
    attributes = [names[attrIndices[i]] for i in range(len(attrIndices))]

    if setAnchors:
        self.graph.setAnchors(list(xAnchors), list(yAnchors), attributes)
        self.graph.updateData()
        self.graph.repaint()
    return xAnchors, yAnchors, (attributes, attrIndices)