def catClassify(botData, kernelType, nTopic):
    X = botData[:, :nTopic]
    y = botData[:, nTopic]
    # Run classifier
    # classifier = svm.SVC(kernel='linear', probability=True)
    classifier = svm.NuSVC(probability=True)
    #cross-validation
    cv = StratifiedKFold(y, k=nFold)
    #select classifier
    #classifier = svm.SVC(kernel=kernelType, probability=True)
    metricstemp = np.zeros((nFold, nMetrics), np.float)
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        #fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) #@UnusedVariable
        #roc_auc = auc(fpr, tpr)
        precision, recall, thresholds = precision_recall_curve(y[test], probas_[:, 1]) #@UnusedVariable
        pr_auc = auc(recall, precision)
        metricstemp[i] = [pr_auc]
    return [np.mean(metricstemp), np.std(metricstemp)]
def svmClassify(data, probability=True, kernelType='rbf', nFold=10, beta=1, nMetrics=3):
    X = data[:, :-1]
    y = data[:, -1]
    clfParamList = {'kernel': kernelType, 'gamma': 1e-3, 'C': 1, 'degree': 4,
                    'probability': probability, 'shrinking': True, 'cache_size': 10000}
    classifier = SVC(**clfParamList)
    #cross-validation
    cv = StratifiedKFold(y, k=nFold)
    metricstemp = np.zeros((nFold, nMetrics), np.float)
    for i, (train, test) in enumerate(cv):
        clf = classifier.fit(X[train], y[train])
        ypred = clf.predict(X[test])
        # f1 is computed from the hard predictions in both branches so that all
        # three metrics are always defined before they are stored below
        f1score = f1_score(y[test], ypred)
        if probability:
            probas_ = clf.predict_proba(X[test])
            precision, recall, thresholds = precision_recall_curve(y[test], probas_[:, 1]) #@UnusedVariable
            fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) #@UnusedVariable
            roc_auc = auc(fpr, tpr)
            pr_auc = auc(recall, precision)
        else:
            fpr, tpr, thresholds = roc_curve(y[test], ypred) #@UnusedVariable
            roc_auc = auc(fpr, tpr)
            pr_auc = 0.  # PR-AUC needs probability estimates; placeholder otherwise
        metricstemp[i] = [f1score, roc_auc, pr_auc]
    return metricstemp
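# A minimal hedged usage sketch (not from the original code): `mydata` is a
# hypothetical (n_samples, n_features + 1) array whose last column holds the
# class labels, which is the layout svmClassify expects. The per-fold metrics
# it returns are summarised with their mean and standard deviation over folds.
import numpy as np

metrics = svmClassify(mydata, probability=True, kernelType='rbf', nFold=10)
print 'mean [f1, roc_auc, pr_auc]:', np.mean(metrics, axis=0)
print 'std  [f1, roc_auc, pr_auc]:', np.std(metrics, axis=0)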
def catClassify(dataPath, catname, kernelType, dataext, catmap, nTopic):
    #read the category data which will be positive
    fname = dataPath + catname + dataext
    catpos = np.genfromtxt(fname, dtype=np.int)
    # catpos
    catpos = catpos[:, :nTopic + 1]
    catpos[:, nTopic] = 1
    #read the category data of the remaining classes
    firstvisit = True
    for cats in catmap.keys():
        if cats != catname:
            # read each remaining category's own file as negative data
            fname = dataPath + cats + dataext
            if firstvisit:
                catneg = np.genfromtxt(fname, dtype=np.int)
                firstvisit = False
            else:
                catneg = np.concatenate((catneg, np.genfromtxt(fname, dtype=np.int)), axis=0)
    #sample the negative data to have equal size as the positive
    nPos = catpos.shape[0]
    nNeg = catneg.shape[0]
    catneg = catneg[np.random.randint(0, nNeg, nPos), :]
    # catneg
    catneg = catneg[:, :nTopic + 1]
    catneg[:, nTopic] = 0
    #combine positive and negative data
    data = np.concatenate((catpos, catneg), axis=0)
    #shuffle the rows to aid in random selection of train and test
    np.random.shuffle(data)
    X = data[:, :nTopic]
    y = data[:, nTopic]
    #cross-validation
    cv = StratifiedKFold(y, k=nFold)
    #select classifier
    classifier = svm.SVC(kernel=kernelType, probability=True)
    metricstemp = np.zeros((nFold, nMetrics), np.float)
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) #@UnusedVariable
        roc_auc = auc(fpr, tpr)
        precision, recall, thresholds = precision_recall_curve(y[test], probas_[:, 1]) #@UnusedVariable
        pr_auc = auc(recall, precision)
        metricstemp[i] = [roc_auc, pr_auc]
    return [np.mean(metricstemp, axis=0), np.std(metricstemp, axis=0)]
def ParameterGridSearch(self, callback=None, nValidation=5):
    '''
    Grid search for the best C and gamma parameters for the RBF Kernel.
    The efficiency of the parameters is evaluated using nValidation-fold
    cross-validation of the training data.

    As this process is time consuming and parallelizable, a number of threads
    equal to the number of cores in the computer is used for the calculations
    '''
    from scikits.learn.grid_search import GridSearchCV
    from scikits.learn.metrics import precision_score
    from scikits.learn.cross_val import StratifiedKFold
    #
    # XXX: program crashes with >1 worker when running cpa.py
    #      No crash when running from classifier.py. Why?
    #
    n_workers = 1
    #try:
    #    from multiprocessing import cpu_count
    #    n_workers = cpu_count()
    #except:
    #    n_workers = 1

    # Define the parameter ranges for C and gamma and perform a grid search
    # for the optimal setting
    parameters = {'C': 2 ** np.arange(-5, 11, 2, dtype=float),
                  'gamma': 2 ** np.arange(3, -11, -2, dtype=float)}
    clf = GridSearchCV(SVC(kernel='rbf'), parameters,
                       n_jobs=n_workers, score_func=precision_score)
    clf.fit(self.svm_train_values, self.svm_train_labels,
            cv=StratifiedKFold(self.svm_train_labels, nValidation))

    # Pick the best parameters as the ones with the maximum cross-validation rate
    bestParameters = max(clf.grid_scores_, key=lambda a: a[1])
    bestC = bestParameters[0]['C']
    bestGamma = bestParameters[0]['gamma']
    logging.info('Optimal values: C=%s g=%s rate=%s' %
                 (bestC, bestGamma, bestParameters[1]))
    return bestC, bestGamma
def knnClassify(data, nFold=10, nNeighbor=10, nMetrics=2):
    X = data[:, :-1]
    y = data[:, -1]
    clfParamList = {'n_neighbors': nNeighbor, 'algorithm': 'auto'}
    classifier = NeighborsClassifier(**clfParamList)
    #cross-validation
    cv = StratifiedKFold(y, k=nFold)
    metricstemp = np.zeros((nFold, nMetrics), np.float)
    for i, (train, test) in enumerate(cv):
        clf = classifier.fit(X[train], y[train])
        ypred = clf.predict(X[test])
        f1score = f1_score(y[test], ypred)
        fpr, tpr, thresholds = roc_curve(y[test], ypred) #@UnusedVariable
        roc_auc = auc(fpr, tpr)
        metricstemp[i] = [f1score, roc_auc]
    return metricstemp
v, w = np.linalg.eigh(gmm.covars[n][:2, :2])
u = w[0] / np.linalg.norm(w[0])
angle = np.arctan(u[1] / u[0])
angle = 180 * angle / np.pi  # convert to degrees
v *= 9
ell = mpl.patches.Ellipse(gmm.means[n, :2], v[0], v[1],
                          180 + angle, color=color)
ell.set_clip_box(ax.bbox)
ell.set_alpha(0.5)
ax.add_artist(ell)


iris = datasets.load_iris()

# Break up the dataset into non-overlapping training (75%) and testing
# (25%) sets.
skf = StratifiedKFold(iris.target, k=4)
# Only take the first fold.
train_index, test_index = skf.__iter__().next()

X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
classifiers = dict((x, GMM(n_states=n_classes, cvtype=x))
                   for x in ['spherical', 'diag', 'tied', 'full'])
def main():
    try:
        kernelType = sys.argv[1]
    except IndexError:
        kernelType = 'linear'
    #catmap = getCatMap(dataset)

    #initialise output matrices
    rocauc = np.zeros((nDim, nCategory), dtype=np.float32)
    mapauc = np.zeros((nDim, nCategory), dtype=np.float32)
    nSamplesPerCat = int(np.round(nClusterSamples / nCategory))

    for iLDim, ldim in enumerate(ldims):
        #write the lower dimensional projections for each category
        for iCategory, catname in enumerate(catList):
            dataOuttemp = dimred(iCategory, catname, ldim)
            dataOut = np.array(np.round(dataOuttemp).astype(np.int16), dtype=np.int16)
            outFilename = tempPath + catname + '.' + dataExt
            np.savetxt(outFilename, dataOut, delimiter=' ', fmt='%d')
            if dataOut.shape[0] <= nSamplesPerCat:
                catSample = dataOut
            else:
                rndsample = np.random.randint(0, dataOut.shape[0], nSamplesPerCat)
                catSample = dataOut[rndsample, :]
            if iCategory == 0:
                dataLower = catSample
            else:
                dataLower = np.concatenate((dataLower, catSample), axis=0)

        #cluster random sampled lower dimensional data
        # compute the code-book for the data-set
        [CodeBook, label] = kmeans2(dataLower, nCodewords, iter=nIterKmeans,
                                    minit='points', missing='warn') #@UnusedVariable

        # write code-book to file
        cbfilepath = tempPath + dataset + codebookext
        cbfile = open(cbfilepath, 'w')
        np.savetxt(cbfile, CodeBook, fmt='%f', delimiter=' ')
        cbfile.close()

        for iCategory, catname in enumerate(catList):
            tempFilename = tempPath + catname + '.' + dataExt
            catData = np.loadtxt(tempFilename, dtype=np.int16, delimiter=' ')
            [catLabel, catDist] = vq(catData, CodeBook) #@UnusedVariable
            catfilePath = dataPath + catname + '.' + dataExt
            catImgId = np.genfromtxt(catfilePath, dtype=np.int, usecols=[-2])
            catId = np.genfromtxt(catfilePath, dtype=np.int, usecols=[-1])[0]
            ImgId = np.unique(catImgId)
            catboffilepath = tempPath + catname + bofext
            imgcount = 0
            for imgid in ImgId:
                imgLabel = catLabel[catImgId == imgid]
                [hist, edges] = np.histogram(imgLabel, nCodewords) #@UnusedVariable
                if imgcount == 0:
                    dataout = np.hstack((hist.T, imgid, catId))
                else:
                    dataout = np.vstack((dataout, np.hstack((hist.T, imgid, catId))))
                imgcount += 1
            np.savetxt(catboffilepath, dataout, fmt='%d', delimiter=' ')

        select = np.concatenate((np.arange(nCodewords), [nCodewords + 1]), axis=1)

        for iCategory, catname in enumerate(catList):
            #posLabel = catmap.get(catname)
            #negLabel = 0
            #read the category data which will be positive
            catboffilepath = tempPath + catname + bofext
            catpos = np.genfromtxt(catboffilepath, dtype=np.int)
            catpos = catpos.take(select, axis=1)
            catpos[:, -1] = 1
            #posLabel = catpos[0][-1]
            catset = set(catList)
            catset.remove(catname)
            firstvisit = True
            for cat in catset:
                # read each remaining category's bag-of-features file as negative data
                catboffilepath = tempPath + cat + bofext
                if firstvisit:
                    catneg = np.genfromtxt(catboffilepath, dtype=np.int)
                    firstvisit = False
                else:
                    catneg = np.concatenate((catneg, np.genfromtxt(catboffilepath, dtype=np.int)), axis=0)
            #sample the negative data to have equal size as the positive
            nPos = catpos.shape[0]
            nNeg = catneg.shape[0]
            catneg = catneg[np.random.randint(0, nNeg, nPos), :]
            catneg = catneg.take(select, axis=1)
            catneg[:, -1] = -1
            #combine positive and negative data
            data = np.concatenate((catpos, catneg), axis=0)
            #shuffle the rows to aid in random selection of train and test
            #np.random.shuffle(data)
            X = data[:, :nCodewords]
            y = data[:, nCodewords]
            #labels for cross validation
            #y2 = np.where(y!=posLabel,0,y)
            #y2 = np.where(y2==posLabel,1,y2)
            #cross-validation
            cv = StratifiedKFold(y, k=nFold)
            #select classifier
            classifier = svm.SVC(kernel=kernelType, probability=True)
            metricstemp = np.zeros((nFold, nMetrics), np.float)
            for i, (train, test) in enumerate(cv):
                probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
                print y[test]
                print probas_[:, 1]
                try:
                    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1]) #@UnusedVariable
                    roc_auc = auc(fpr, tpr)
                except:
                    roc_auc = 0.
                try:
                    precision, recall, thresholds = precision_recall_curve(y[test], probas_[:, 1]) #@UnusedVariable
                    pr_auc = auc(recall, precision)
                except:
                    pr_auc = 0.
                metricstemp[i] = [roc_auc, pr_auc]
            # average each metric column over the folds
            rocauc[iLDim, iCategory] = np.mean(metricstemp[:, 0])
            mapauc[iLDim, iCategory] = np.mean(metricstemp[:, 1])
            print '%s classified...' % (catname)

    outPath = rootDir + dataset + outDir + '%s%s%s%s' % ('dimensionality', dataset, kernelType, '.svg')
    outPath1 = rootDir + dataset + outDir + '%s%s%s%s' % ('dimensionality', dataset, kernelType, '.npz')

    plt.figure(0)
    #ax = plt.subplot(111)
    plt.errorbar(np.arange(1, nDim + 1), np.mean(rocauc, axis=1), np.std(rocauc, axis=1),
                 fmt='-', elinewidth=1, marker='x', label='AUC-ROC')
    plt.errorbar(np.arange(1, nDim + 1), np.mean(mapauc, axis=1), np.std(mapauc, axis=1),
                 fmt='--', elinewidth=1, marker='o', label='MAP')
    plt.xlabel('Visual Categories')
    plt.ylabel('Performance Metric')
    plt.title('BOF Performance: %s : %s' % (dataset, kernelType))
    plt.legend(loc="lower right")
    #ax.set_xticks()
    #ax.set_xticklabels(ldim,size='small',ha='center')
    plt.savefig(outPath, format='svg')
    try:
        np.savez(outPath1, rocauc, mapauc)
    except:
        print 'unable to write file %s' % (outPath1)
    plt.show()
    plt.close()
##############################################################################
# Loading a dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target
n_classes = np.unique(y).size

# Some noisy data not correlated with the labels
random = np.random.RandomState(seed=0)
E = random.normal(size=(len(X), 2200))

# Add the noisy data to the informative features to make the task harder
X = np.c_[X, E]

svm = SVC(kernel='linear')
cv = StratifiedKFold(y, 2)

score, permutation_scores, pvalue = permutation_test_score(
    svm, X, y, zero_one_score, cv=cv, n_permutations=100, n_jobs=1)

print "Classification score %s (pvalue : %s)" % (score, pvalue)

###############################################################################
# View histogram of permutation scores
pl.hist(permutation_scores, label='Permutation scores')
ylim = pl.ylim()
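# A hedged continuation sketch (not part of the original excerpt): mark the
# observed classification score and the chance level on the permutation-score
# histogram started above, using plain matplotlib calls.
pl.vlines(score, ylim[0], ylim[1], colors='g', linestyles='--',
          label='Classification score (pvalue %s)' % pvalue)
pl.vlines(1.0 / n_classes, ylim[0], ylim[1], colors='k', linestyles='--',
          label='Luck')
pl.ylim(ylim)
pl.legend()
pl.xlabel('Score')
pl.show()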
def test_algorithm(algorithm, results, train_data, train_target, test_data):
    # (def line reconstructed from the call sites below; the body is original)
    algorithm.fit(train_data, train_target)
    y_pred = algorithm.predict(test_data)
    results.append(precision(y_pred))


def precision(y_pred):
    # note: relies on the module-level test_target set in the loop below
    prec = sum(y_pred == test_target)
    return float(prec) / len(test_target)


svmclf = svm.SVC()
logisticclf = LogisticRegression()
nnclf = neighbors.Neighbors()

svmli = []
logli = []
nnli = []

cv = StratifiedKFold(iris.target, 20)
for train_index, test_index in cv:
    train_data = iris.data[train_index]
    train_target = iris.target[train_index]
    test_data = iris.data[test_index]
    test_target = iris.target[test_index]

    #svm
    test_algorithm(svmclf, svmli, train_data, train_target, test_data)
    #logistic regression
    test_algorithm(logisticclf, logli, train_data, train_target, test_data)
    #NN
    test_algorithm(nnclf, nnli, train_data, train_target, test_data)
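# A minimal follow-up sketch (not from the original snippet): average the
# per-fold precision lists collected above to compare the three classifiers.
import numpy as np

for name, scores in [('SVM', svmli),
                     ('Logistic regression', logli),
                     ('Nearest neighbors', nnli)]:
    print '%s: mean precision %.3f (+/- %.3f)' % (name, np.mean(scores), np.std(scores))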
from scikits.learn.cross_val import StratifiedKFold
from scikits.learn.feature_selection import RFECV
from scikits.learn.datasets import samples_generator
from scikits.learn.metrics import zero_one

################################################################################
# Loading a dataset
X, y = samples_generator.test_dataset_classif(n_features=500, k=5, seed=0)

################################################################################
# Create the RFE object and compute a cross-validated score
svc = SVC(kernel='linear')
rfecv = RFECV(estimator=svc, n_features=2, percentage=0.1, loss_func=zero_one)
rfecv.fit(X, y, cv=StratifiedKFold(y, 2))

print 'Optimal number of features : %d' % rfecv.support_.sum()

import pylab as pl
pl.figure()
pl.semilogx(rfecv.n_features_, rfecv.cv_scores_)
pl.xlabel('Number of features selected')
pl.ylabel('Cross validation score (nb of misclassifications)')

# 15 ticks regularly spaced in log
x_ticks = np.unique(np.logspace(np.log10(2),
                                np.log10(rfecv.n_features_.max()),
                                15).astype(np.int))
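# A small hedged continuation (not part of the original snippet): apply the
# log-spaced ticks computed above to the x-axis and display the figure.
ax = pl.gca()
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_ticks)
pl.show()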
def process_speaker(db):
    TEST_FOLD = 2
    #db = connect_to_database()
    #db.add_son_manipulator(TransformToBinary())
    #db = NeoEngine('/data/neo4j')

    #Impostor ratio is ratio of impostor records in training
    #and testing population. For ratio=N, subject is 1/(N+1),
    #impostor N/(N+1) of population
    IMPOSTOR_RATIO = 3
    LIMIT = 30

    #Set up queue
    ctx = zmq.Context()
    q = ctx.socket(zmq.PULL)
    q.connect("tcp://127.0.0.1:5555")
    outQ = ctx.socket(zmq.PUSH)
    outQ.connect("tcp://127.0.0.1:5556")

    while True:
        speaker_name = q.recv_json()
        print 'Received %s' % speaker_name
        #time.sleep(random.randint(1,10))

        #Find all SVs for current subject
        #print 'Speaker Name: %s' % speaker.name
        #print 'Count:', db.sv.find({'speaker_name': speaker.name}).count()
        #cursor_subject = Concurrent_cursor(SV.objects(speaker_name=speaker.name))
        sv_subject = stack_SVs(db.get('sv', {'speaker_name': speaker_name}), limit=LIMIT)
        num_subject = np.size(sv_subject, 0)
        print num_subject
        if num_subject < 20:
            continue

        #Get random SVs from rest of database for test population
        #cursor_impostor = db.sv.find({'speaker_name': {'$ne': speaker['name']}})
        sv_impostor = stack_SVs_random(db, speaker_name, num_subject * IMPOSTOR_RATIO)
        num_impostor = np.size(sv_impostor, 0)
        print 'Subject: %i, Impostor: %i' % (num_subject, num_impostor)

        #generate total dataset of observations X with class labels y
        X = np.vstack((sv_subject, sv_impostor))
        y = np.array([1] * num_subject + [0] * num_impostor)

        #Pick random assortment from each set to form training observations
        #Switch ensures that smaller number always used for training
        if TEST_FOLD < 3:
            train, test = iter(StratifiedKFold(y, TEST_FOLD)).next()
        else:
            test, train = iter(StratifiedKFold(y, TEST_FOLD)).next()

        #Perform crossvalidated SVM training
        #print type(X), type(y)
        #print np.shape(X[train]), np.shape(y[train])
        clf = train_svm_crossvalidated(X[train], y[train])
        #print type(clf)
        #clf_rec = {'classifier': SVMModelField(clf), 'speaker_name': speaker.name}
        #db.svm.insert(clf_rec, safe=True)

        #Collect classification statistics
        accuracy = test_svm_accuracy(X[test], y[test], clf)
        num_subject_test = np.sum(y[test])
        num_impostor_test = len(y[test]) - num_subject_test
        print 'Accuracy: %f' % (float(accuracy['correct_subject']) / float(num_subject_test))
        #print 'Sub: %i/%i Imp: %i/%i' % (accuracy['correct_subject'], num_subject_test, accuracy['correct_impostor'], num_impostor_test)
        #print 'False Neg: %i False Pos: %i' % (accuracy['false_neg'], accuracy['false_pos'])

        msg = {'speaker_name': speaker_name,
               'accuracy': accuracy,
               'num_subject': num_subject,
               'num_subject_test': num_subject_test,
               'num_impostor_test': num_impostor_test}
        outQ.send_pyobj(msg)
def blWord():
    (options, args) = parser.parse_args(sys.argv[1:]) #@UnusedVariable
    dataset = options.dataset
    kernelType = options.kernelType
    nFold = options.nFold
    nCodeword = options.nCodeword
    dataPath = rootDir + dataset + bofDir
    catmap = getCatMap(dataset)
    catList = catmap.keys()
    dataext = str(nCodeword) + bofext
    nCategory = len(catList)
    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)
    for iCat, catname in enumerate(catList):
        print catname
        #read the category data which will be positive
        fname = dataPath + catname + dataext
        catpos = np.genfromtxt(fname, dtype=np.int)
        # catpos
        catpos = catpos[:, :nCodeword + 1]
        catpos[:, nCodeword] = 1
        #read the category data of the remaining classes
        firstvisit = True
        for cats in catList:
            if cats != catname:
                # read each remaining category's own file as negative data
                fname = dataPath + cats + dataext
                if firstvisit:
                    catneg = np.genfromtxt(fname, dtype=np.int)
                    firstvisit = False
                else:
                    catneg = np.concatenate((catneg, np.genfromtxt(fname, dtype=np.int)), axis=0)
        #sample the negative data to have equal size as the positive
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]
        # catneg
        catneg = catneg[:, :nCodeword + 1]
        catneg[:, nCodeword] = 0
        #combine positive and negative data
        data = np.concatenate((catpos, catneg), axis=0)
        #shuffle the rows to aid in random selection of train and test
        np.random.shuffle(data)
        X = data[:, :nCodeword]
        y = data[:, nCodeword]
        clfParamList = {'kernel': kernelType, 'gamma': 1e-3, 'C': 1, 'degree': 4,
                        'probability': True, 'shrinking': True, 'cache_size': 1000}
        classifier = SVC(**clfParamList)
        cv = StratifiedKFold(y, k=nFold)
        avgprec = np.zeros(nFold)
        for icv, (train, test) in enumerate(cv):
            clf = classifier.fit(X[train], y[train])
            probas_ = clf.predict_proba(X[test])
            precision, recall, thresholds = precision_recall_curve(y[test], probas_[:, 1]) #@UnusedVariable
            avgprec[icv] = auc(recall, precision)
        perfMean[iCat] = np.mean(avgprec)
        perfStd[iCat] = np.std(avgprec)
    return [perfMean, perfStd]
def test_accuracy_concurrent(worker, concurrency):
    TEST_FOLD = 2
    RUN_MAX = False
    iter_num = 0
    db = connect_to_database()
    db.add_son_manipulator(TransformToBinary())

    #take this out later when testing complete
    fid = open('/home/ubuntu/project/backend-search/src/spkrec/utils/hist' + str(worker) + '.csv', 'wb')
    csv_writer = csv.writer(fid)
    if worker == 0:
        csv_writer.writerow(['name', 'num speaker SVs', 'test subjects', 'test impostors',
                             'correct subject', 'correct impostor', 'false neg', 'false pos'])

    cursor = Concurrent_cursor(Speaker.objects())
    cursor.set_concurrency(concurrency)
    cursor.set_worker(worker)

    for speaker in Speaker.objects():
        #Impostor ratio is ratio of impostor records in training
        #and testing population. For ratio=N, subject is 1/(N+1),
        #impostor N/(N+1) of population
        IMPOSTOR_RATIO = 3

        #Find all SVs for current subject
        print 'Speaker Name: %s' % speaker.name
        #print 'Count:', db.sv.find({'speaker_name': speaker.name}).count()
        #cursor_subject = Concurrent_cursor(SV.objects(speaker_name=speaker.name))
        sv_subject = stack_SVs(db.sv.find({'speaker_name': speaker.name}))
        num_subject = np.size(sv_subject, 0)
        #csv_writer.writerow([speaker.name, num_subject])
        #print num_subject
        if num_subject < 20:
            continue

        #Get random SVs from rest of database for test population
        #cursor_impostor = db.sv.find({'speaker_name': {'$ne': speaker['name']}})
        sv_impostor = stack_SVs_random(db, speaker.name, num_subject * IMPOSTOR_RATIO)
        num_impostor = np.size(sv_impostor, 0)
        print 'Subject: %i, Impostor: %i' % (num_subject, num_impostor)

        #generate total dataset of observations X with class labels y
        X = np.vstack((sv_subject, sv_impostor))
        y = np.array([1] * num_subject + [0] * num_impostor)

        #Pick random assortment from each set to form training observations
        #Switch ensures that smaller number always used for training
        if TEST_FOLD < 3:
            train, test = iter(StratifiedKFold(y, TEST_FOLD)).next()
        else:
            test, train = iter(StratifiedKFold(y, TEST_FOLD)).next()
        #print train

        #Perform crossvalidated SVM training
        #print type(X), type(y)
        #print np.shape(X[train]), np.shape(y[train])
        clf = train_svm_crossvalidated(X[train], y[train])
        #print type(clf)
        #clf_rec = {'classifier': SVMModelField(clf), 'speaker_name': speaker.name}
        #db.svm.insert(clf_rec, safe=True)

        #Collect classification statistics
        accuracy = test_svm_accuracy(X[test], y[test], clf)
        num_subject_test = np.sum(y[test])
        num_impostor_test = len(y[test]) - num_subject_test
        print 'Accuracy: %f' % (float(accuracy['correct_subject']) / float(num_subject_test))
        print 'Sub: %i/%i Imp: %i/%i' % (accuracy['correct_subject'], num_subject_test,
                                         accuracy['correct_impostor'], num_impostor_test)
        print 'False Neg: %i False Pos: %i' % (accuracy['false_neg'], accuracy['false_pos'])

        csv_writer.writerow([speaker.name, num_subject, num_subject_test, num_impostor_test,
                             accuracy['correct_subject'], accuracy['correct_impostor'],
                             accuracy['false_neg'], accuracy['false_pos']])

        iter_num = iter_num + 1
        #if RUN_MAX and iter_num >= RUN_MAX:
        #    print "I'm breaking"
        #    break
        #print num_subject, num_impostor

    fid.close()
    print "Complete"
y = digits.target

################################################################################
# Create the RFE object and compute a cross-validated score, compared to a
# univariate feature selection
rfe = RFE(estimator=SVC(kernel="linear", C=1), n_features=10, percentage=0.1)
anova_filter = UnivariateFilter(SelectKBest(k=10), f_classif)
clf = SVC(kernel="linear", C=1)

y_pred_rfe = []
y_pred_univ = []
y_true = []
for train, test in StratifiedKFold(y, 2):
    Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]

    ### Fit and predict rfe
    support = rfe.fit(X[train], y[train]).support_
    y_pred_rfe.append(clf.fit(X[train, support], y[train]).predict(X[test, support]))

    ### Fit and predict univariate feature selection
    xr = anova_filter.fit(Xtrain, ytrain).transform(Xtrain)
    y_pred_univ.append(clf.fit(Xtrain[:, anova_filter.support_], ytrain).predict(
        Xtest[:, anova_filter.support_]))
    y_true.append(ytest)

y_pred_univ = np.concatenate(y_pred_univ)
y_true = np.concatenate(y_true)
from scikits.learn.metrics import precision_score
from scikits.learn.metrics import recall_score
from scikits.learn.svm import SVC

################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# split the dataset in two equal parts respecting label proportions
train, test = iter(StratifiedKFold(y, 2)).next()

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]
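# A hedged sketch of how the pieces above are typically wired together (this
# loop is not part of the original excerpt): for each score function, run a
# grid search over tuned_parameters on the stratified training split, mirroring
# the GridSearchCV(..., score_func=...) and grid_scores_ usage shown earlier in
# this section, then evaluate the selected parameters on the held-out split.
from scikits.learn.grid_search import GridSearchCV

for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    best = max(clf.grid_scores_, key=lambda a: a[1])
    best_params, best_score = best[0], best[1]
    print "Best grid point for %s: %r (CV score %0.3f)" % (score_name, best_params, best_score)
    # refit on the training split with the selected parameters
    y_pred = SVC(**best_params).fit(X[train], y[train]).predict(X[test])
    print "%s on the evaluation set: %0.3f" % (score_name, score_func(y[test], y_pred))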
v *= 9
ell = mpl.patches.Ellipse(gmm.means[n, :2], v[0], v[1],
                          180 + angle, color=color)
ell.set_clip_box(ax.bbox)
ell.set_alpha(0.5)
ax.add_artist(ell)


iris = datasets.load_iris()

# Break up the dataset into non-overlapping training (75%) and testing
# (25%) sets.
skf = StratifiedKFold(iris.target, k=4)
# Only take the first fold.
train_index, test_index = skf.__iter__().next()

X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]

n_classes = len(np.unique(y_train))

# Try GMMs using different types of covariances.
classifiers = dict((x, GMM(n_states=n_classes, cvtype=x))
                   for x in ['spherical', 'diag', 'tied', 'full'])

n_classifiers = len(classifiers)
def draw_roc(self, fields_used=None, label_field='', pca=False):
    # Classification and ROC analysis
    # Run classifier with crossvalidation and plot ROC curves
    cv = StratifiedKFold(self.labels, k=self.folds)
    self.classifier = self.get_ml(probability=True)

    mean_tpr = 0.0
    mean_fpr = numpy.linspace(0, 1, 100)

    for i, (train, test) in enumerate(cv):
        train_data = self.data[train]
        test_data = self.data[test]
        if pca:
            reducer = GlassPCA()
            reducer.get_pca(self.data[train])
            train_data, test_data = reducer.project_data(self.data[train], self.data[test])

        fitted = self.classifier.fit(train_data, self.labels[train], class_weight='auto')
        decisions = fitted.predict(test_data)

        print classification_report(self.labels[test], decisions)
        print confusion_matrix(self.labels[test], decisions)
        print mean_square_error(self.labels[test], decisions)

        if getattr(self.classifier, 'classes', None) is None \
                or self.classifier.classes.shape[0] == 2:
            probas_ = fitted.predict_proba(test_data)
            # Compute ROC curve and area under the curve
            try:
                fpr, tpr, thresholds = roc_curve(self.labels[test], probas_[:, 1])
            except IndexError:
                fpr, tpr, thresholds = roc_curve(self.labels[test], probas_)
            mean_tpr += scipy.interp(mean_fpr, fpr, tpr)
            mean_tpr[0] = 0.0
            roc_auc = auc(fpr, tpr)
            pl.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_tpr = numpy.ma.fix_invalid(mean_tpr, fill_value=0)
    mean_auc = auc(mean_fpr, mean_tpr)
    pl.plot(mean_fpr, mean_tpr, 'k--',
            label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    pl.xlim([-0.05, 1.05])
    pl.ylim([-0.05, 1.05])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')

    title = '%s ROC curve: %s' % (self.class_name, label_field)
    if fields_used:
        annot_fields = ['%s (%.2f)' % (fields_used[i], self.classifier.coef_[0:, i])
                        for i in xrange(0, len(fields_used))]
        title += '\nUsing fields: ' + '\n'.join([', '.join(annot_fields[x:x + 6])
                                                 for x in xrange(0, len(annot_fields), 6)])
    pl.title(title, fontsize='small')
    pl.legend(loc="lower right")
    #pl.savefig('/Users/karmel/Desktop/pic_%d.png' % random.randint(0,99999))
    pl.show()
# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print "Total dataset size:"
print "n_samples: %d" % n_samples
print "n_features: %d" % n_features
print "n_classes: %d" % n_classes

################################################################################
# Split into a training set and a test set using a stratified k fold

# split into a training and testing set
train, test = iter(StratifiedKFold(y, k=4)).next()
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]

################################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150

print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print "done in %0.3fs" % (time() - t0)

eigenfaces = pca.components_.T.reshape((n_components, h, w))
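# A short hedged continuation (not part of the original excerpt): project the
# stratified train/test splits onto the eigenface basis before handing them to
# a classifier.
print "Projecting the input data on the eigenfaces orthonormal basis"
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)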
# import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
X, y = X[y != 2], y[y != 2]
n_samples, n_features = X.shape

# Add noisy features
X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

################################################################################
# Classification and ROC analysis

# Run classifier with crossvalidation and plot ROC curves
cv = StratifiedKFold(y, k=6)
classifier = svm.SVC(kernel='linear', probability=True)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    pl.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
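# A hedged sketch of the usual conclusion of this cross-validated ROC example
# (not shown in the excerpt above): average the per-fold curves, draw the
# chance line, and finish the plot, mirroring the mean-ROC handling in the
# draw_roc method earlier in this section.
pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
pl.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

pl.xlim([-0.05, 1.05])
pl.ylim([-0.05, 1.05])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()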