def AUC(targetVariable, allPredictions):
    """Return the area under the ROC curve for rows with a finite target.

    Entries of ``targetVariable`` that are NaN or infinite are masked out,
    together with the matching entries of ``allPredictions``, before the
    curve is built with ``roc`` and integrated with ``auc``.
    """
    finite_rows = numpy.isfinite(targetVariable)
    FPR, TPR, _ = roc(targetVariable[finite_rows], allPredictions[finite_rows])
    return auc(FPR, TPR)
def catClassify(botData, kernelType, nTopic):
    """Cross-validate a NuSVC on ``botData`` and summarise the PR-AUC.

    The first ``nTopic`` columns of ``botData`` are used as features and
    column ``nTopic`` as the label. A stratified ``nFold``-fold split is
    evaluated; for every fold the area under the precision-recall curve of
    the class-1 probabilities is stored in a ``(nFold, nMetrics)`` table.

    ``kernelType`` is accepted for signature compatibility but is not used:
    the classifier is ``svm.NuSVC`` with its default kernel.

    Returns a two-element list ``[mean, std]`` computed over all entries of
    the per-fold metric table.
    """
    X = botData[:, :nTopic]
    y = botData[:, nTopic]

    classifier = svm.NuSVC(probability=True)
    cv = StratifiedKFold(y, k=nFold)

    # Builtin float instead of np.float: the latter was only an alias for it
    # and no longer exists in current NumPy releases.
    metricstemp = np.zeros((nFold, nMetrics), float)
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        precision, recall, _ = precision_recall_curve(y[test], probas_[:, 1])
        metricstemp[i] = [auc(recall, precision)]
    return [np.mean(metricstemp), np.std(metricstemp)]
def catClassify(dataPath, catname, kernelType, dataext, catmap, nTopic):
    """One-vs-rest SVM evaluation of category ``catname``.

    Rows of ``dataPath + catname + dataext`` are the positive class (label
    1). Rows of the files of every other key in ``catmap`` form the negative
    pool (label 0), from which as many rows as there are positives are drawn
    with replacement. The combined data is shuffled and evaluated with a
    stratified ``nFold``-fold cross-validation of ``svm.SVC``.

    Returns ``[mean, std]`` where each element is an array holding the
    per-column (ROC-AUC, PR-AUC) statistic over the folds.

    Raises ValueError when ``catmap`` contains no category other than
    ``catname`` (there would be no negative data).
    """
    # read the category data which will be positive
    catpos = np.genfromtxt(dataPath + catname + dataext, dtype=int)
    catpos = catpos[:, :nTopic + 1]
    catpos[:, nTopic] = 1

    # Read the data of the remaining categories. Each category's own file is
    # loaded; the earlier code re-read the positive file for every category
    # and kept only the last read, so the "negatives" were positive rows.
    neg_parts = [
        np.genfromtxt(dataPath + cats + dataext, dtype=int)
        for cats in catmap.keys()
        if cats != catname
    ]
    if not neg_parts:
        raise ValueError(
            "no negative categories available besides %r" % (catname,))
    catneg = np.concatenate(neg_parts, axis=0)

    # sample the negative data to have equal size as the positive
    nPos = catpos.shape[0]
    nNeg = catneg.shape[0]
    catneg = catneg[np.random.randint(0, nNeg, nPos), :]
    catneg = catneg[:, :nTopic + 1]
    catneg[:, nTopic] = 0

    # combine positive and negative data, then shuffle the rows to aid in
    # random selection of train and test
    data = np.concatenate((catpos, catneg), axis=0)
    np.random.shuffle(data)
    X = data[:, :nTopic]
    y = data[:, nTopic]

    cv = StratifiedKFold(y, k=nFold)
    classifier = svm.SVC(kernel=kernelType, probability=True)
    # builtin float/int replace the removed np.float/np.int aliases
    metricstemp = np.zeros((nFold, nMetrics), float)
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, _ = roc_curve(y[test], probas_[:, 1])
        precision, recall, _ = precision_recall_curve(y[test], probas_[:, 1])
        metricstemp[i] = [auc(fpr, tpr), auc(recall, precision)]
    return [np.mean(metricstemp, axis=0), np.std(metricstemp, axis=0)]
def _calculate_auc(classifier, Xt, yt):
    """Score a fitted linear classifier on ``(Xt, yt)`` by ROC-AUC.

    The decision values ``Xt . coef_.T + intercept_[0]`` are passed through
    ``sigmoid`` and the ROC curve is evaluated on 1000 thresholds spaced
    evenly between the smallest and the largest resulting probability.

    Returns a tuple ``(auc_score, fpr, tpr)``.
    """
    w = classifier.coef_
    b = classifier.intercept_[0]
    prob = sigmoid(np.dot(Xt, w.T) + b)
    # The sample count must be an integer; a float such as 1e3 is rejected
    # by np.linspace.
    thresholds = np.linspace(prob.min(), prob.max(), 1000)
    fpr, tpr, _ = roc_curve(yt, prob, thresholds=thresholds)
    return auc(fpr, tpr), fpr, tpr
def AUC(targetVariable, allPredictions):
    """Return the ROC-AUC over finite targets, using rounded predictions.

    Rows whose target is NaN or infinite are dropped. The remaining
    predictions are rounded to ``AUC_DEC_PTS`` decimals before the curve is
    built with ``roc`` and integrated with ``auc``.
    """
    # decimal points to round predictions to, to speed AUC calculation
    AUC_DEC_PTS = 3
    finite_rows = numpy.isfinite(targetVariable)
    rounded_scores = numpy.round(allPredictions[finite_rows],
                                 decimals=AUC_DEC_PTS)
    FPR, TPR, _ = roc(targetVariable[finite_rows], rounded_scores)
    return auc(FPR, TPR)
def AUCkFoldLogisticRegression(regularization, inData, penalty, kFolds):
    """Estimate the ROC-AUC of a logistic regression by k-fold cross-validation.

    ``inData`` is split into features and targets with ``getXYData``. For each
    of the ``kFolds`` folds a fresh ``LogisticRegression(C=regularization,
    penalty=penalty)`` is fitted on the training part and its predicted
    probabilities for the test part are collected. Progress and per-fold AUC
    are printed to stdout.

    Returns the AUC of the ROC curve computed once over the pooled test
    targets and probabilities of all folds (not the mean of per-fold AUCs).

    Raises UserWarning (used here as an exception) when the number of samples
    is not divisible by ``kFolds``.
    """
    print "\n\tCalculating AUC for regularization", regularization, "using", kFolds, "folds"
    sys.stdout.flush()
    xData, yData = getXYData(inData)
    nSamples, nFeatures = xData.shape
    # NOTE(review): the message hard-codes 5922 samples although the check
    # itself works for any nSamples.
    if nSamples % kFolds != 0:
        raise UserWarning(
            "Uneven fold sizes! Must evenly divide 5922 (e.g. 2,3,7 or 9 folds"
        )
    # 2, 3, 7, and 9 are factors of 5922 (#data points) & yield equal fold sizes
    crossValFolds = KFold(nSamples, kFolds)
    # Pooled test targets / probabilities, grown fold by fold.
    yTestDataAllFolds = array([])
    probasTestDataAllFolds = array([])
    # NOTE(review): sumAUC is accumulated below but never read afterwards.
    sumAUC = 0.0
    for foldNum, (train, test) in enumerate(crossValFolds):
        # fit a new LR model for each fold's data & evaluate using AUC
        LRclassifier = LogisticRegression(C=regularization, penalty=penalty)
        probas_ = LRclassifier.fit(xData[train], yData[train]).predict_proba(xData[test])
        # Count the non-zero coefficients in the first row of coef_.
        numNon0Coefs = sum(
            [1 for coef in LRclassifier.coef_[:][0] if coef != 0])
        # probas_ contains 2 columns of probabilities, one for each of the 2 classes (0,1)
        # In the documentation, seems like col 1 is for class 1,
        # but tests show it seems like col 0 is for class 1, so we use that below.
        # NOTE(review): the column order depends on the library version and on
        # the label order - confirm that column 0 really is class 1 here.
        CLASS_1_COL = 0
        # Compute ROC curve and area under the curve
        FPR, TPR, thresholds = roc(yData[test], probas_[:, CLASS_1_COL])
        roc_auc = auc(FPR, TPR)
        # Trailing commas keep the three prints on one output line.
        print "\tFold:", foldNum, " AUC:", roc_auc, "Non0Coefs:", numNon0Coefs,
        print "Reg:", regularization,
        print localTimeString()
        sys.stdout.flush()
        sumAUC += roc_auc
        yTestDataAllFolds = numpy.concatenate((yTestDataAllFolds, yData[test]))
        probasTestDataAllFolds = \
            numpy.concatenate((probasTestDataAllFolds,probas_[:,CLASS_1_COL]) )
    # One ROC curve over the pooled out-of-fold predictions.
    FPRallFolds, TPRallFolds, thresholds = roc(yTestDataAllFolds, probasTestDataAllFolds)
    roc_auc_allFolds = auc(FPRallFolds, TPRallFolds)
    print "AUC_all_folds:", roc_auc_allFolds,
    print "Reg:", regularization, "Penalty:", penalty, "kFolds:", kFolds,
    print localTimeString()
    return roc_auc_allFolds
def plot_precision_recall(precision, recall):
    """Plot a precision-recall curve and display it.

    Typical use::

        precision, recall, thresholds = precision_recall_curve(y[half:], probas_[:,1])
        plot_precision_recall(precision, recall)

    The area under the curve, ``auc(recall, precision)``, is shown in the
    legend and in the title.

    Code from http://scikit-learn.sourceforge.net/auto_examples/plot_precision_recall.html
    """
    pl.figure(-1)
    pl.clf()
    area = auc(recall, precision)
    curve_label = 'Precision-Recall curve (area = %0.2f)' % area
    pl.plot(recall, precision, label=curve_label)
    # axis labels and limits
    pl.xlabel('Recall')
    pl.ylabel('Precision')
    pl.ylim([0.0, 1.05])
    pl.xlim([0.0, 1.0])
    pl.title('Precision-Recall example: AUC=%0.2f' % area)
    pl.legend(loc="lower left")
    pl.show()
def plot_roc(fpr, tpr):
    """Plot a ROC curve together with the diagonal and display it.

    Typical use::

        fpr, tpr, thresholds = roc_curve(y[half:], probas_[:,1])
        plot_roc(fpr, tpr)

    The area under the curve, ``auc(fpr, tpr)``, is shown in the legend.

    Code from http://scikit-learn.sourceforge.net/auto_examples/plot_roc.html
    """
    pl.figure(-1)
    pl.clf()
    roc_auc = auc(fpr, tpr)
    curve_label = 'ROC curve (area = %0.2f)' % roc_auc
    pl.plot(fpr, tpr, label=curve_label)
    # dashed black diagonal from (0, 0) to (1, 1)
    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.0])
    pl.ylim([0.0, 1.0])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title('Receiver operating characteristic example')
    pl.legend(loc="lower right")
    pl.show()
def svm_roc(table, kernel='linear', C=1.0):
    '''Train an SVC on half of ``table`` and plot the ROC curve of the rest.

    Column 0 of ``table`` holds the labels, the remaining columns the
    features. Rows are shuffled with NumPy's global generator seeded to 0,
    the first half is used for fitting and the second half for the ROC
    analysis. The AUC is printed and the curve is shown with pylab.
    '''
    from scikits.learn import svm
    from scikits.learn.metrics import roc_curve, auc
    import pylab as pl

    features = table[:, 1:]
    labels = table[:, 0]
    n_samples, n_features = features.shape

    # deterministic shuffle of the row order
    order = list(range(n_samples))
    np.random.seed(0)
    np.random.shuffle(order)
    features, labels = features[order], labels[order]
    half = n_samples // 2

    # Run classifier
    classifier = svm.SVC(kernel=kernel, probability=True, C=C)
    classifier.fit(features[:half], labels[:half])
    probas_ = classifier.predict_proba(features[half:])

    # Compute ROC curve and area under the curve
    fpr, tpr, _ = roc_curve(labels[half:], probas_[:, 1])
    roc_auc = auc(fpr, tpr)
    print("Area under the ROC curve : %f" % roc_auc)

    # Plot ROC curve
    pl.figure(-1)
    pl.clf()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.0])
    pl.ylim([0.0, 1.0])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title('Receiver operating characteristic example')
    pl.legend(loc="lower right")
    pl.show()
# Shuffle the sample order with a fixed seed, then split in two halves
p = list(range(n_samples))
random.seed(0)
random.shuffle(p)
X, y = X[p], y[p]
half = n_samples // 2

# Add noisy features
X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

# Run classifier: fit on the first half, predict the second half
classifier = svm.SVC(kernel='linear', probability=True)
classifier.fit(X[:half], y[:half])
probas_ = classifier.predict_proba(X[half:])

# Compute ROC curve and area under the curve
fpr, tpr, thresholds = roc_curve(y[half:], probas_[:, 1])
roc_auc = auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

# Plot ROC curve with the dashed diagonal as reference
pl.figure(-1)
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()
def calculate_auc_score(nn, X, y):
    """Score the network ``nn`` on ``(X, y)`` by ROC-AUC.

    ``nn.activate`` is called once per row of ``X``; the collected outputs
    are compared against ``y`` on 1000 thresholds spaced evenly over
    ``[0, 1]``.

    Returns a tuple ``(auc_score, fpr, tpr)``.
    """
    probabilities = np.array([nn.activate(row).tolist() for row in X])
    # The sample count must be an integer; a float such as 1e3 is rejected
    # by np.linspace.
    thresholds = np.linspace(0, 1, 1000)
    fpr, tpr, _ = roc_curve(y, probabilities, thresholds=thresholds)
    return auc(fpr, tpr), fpr, tpr
n_samples, n_features = X.shape

# Shuffle samples with a fixed seed
p = list(range(n_samples))
random.seed(0)
random.shuffle(p)
X, y = X[p], y[p]
half = n_samples // 2

# Add noisy features
np.random.seed(0)
X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

# Run classifier: fit on the first half, predict the second half
classifier = svm.SVC(kernel='linear', probability=True)
classifier.fit(X[:half], y[:half])
probas_ = classifier.predict_proba(X[half:])

# Compute Precision-Recall and plot curve
precision, recall, thresholds = precision_recall_curve(y[half:],
                                                       probas_[:, 1])
area = auc(recall, precision)
print("Area Under Curve: %0.2f" % area)

pl.figure(-1)
pl.clf()
pl.plot(recall, precision, label='Precision-Recall curve')
pl.xlabel('Recall')
pl.ylabel('Precision')
pl.ylim([0.0, 1.05])
pl.xlim([0.0, 1.0])
pl.title('Precision-Recall example: AUC=%0.2f' % area)
pl.legend(loc="lower left")
pl.show()
n_samples, n_features = X.shape

# Deterministic shuffle of the sample order
p = list(range(n_samples))
random.seed(0)
random.shuffle(p)
X, y = X[p], y[p]
half = n_samples // 2

# Append 200 * n_features columns of Gaussian noise to the features
np.random.seed(0)
X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

# Train on the first half, get class probabilities for the second half
classifier = svm.SVC(kernel='linear', probability=True)
classifier.fit(X[:half], y[:half])
probas_ = classifier.predict_proba(X[half:])

# Precision-recall curve of the class-1 probabilities and its area
precision, recall, thresholds = precision_recall_curve(y[half:],
                                                       probas_[:, 1])
area = auc(recall, precision)
print("Area Under Curve: %0.2f" % area)

# Draw and show the curve
pl.figure(-1)
pl.clf()
pl.plot(recall, precision, label='Precision-Recall curve')
pl.xlabel('Recall')
pl.ylabel('Precision')
pl.ylim([0.0, 1.05])
pl.xlim([0.0, 1.0])
pl.title('Precision-Recall example: AUC=%0.2f' % area)
pl.legend(loc="lower left")
pl.show()
# Run classifier with crossvalidation and plot ROC curves
cv = StratifiedKFold(y, k=6)
classifier = svm.SVC(kernel='linear', probability=True)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    classifier.fit(X[train], y[train])
    probas_ = classifier.predict_proba(X[test])
    # Compute ROC curve and area under the curve for this fold
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    # Resample the fold's curve on the shared FPR grid so folds can be summed
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    pl.plot(fpr, tpr, lw=1,
            label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

# Grey dashed diagonal as reference
pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

# Average the summed curves and pin the last point to 1.0
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
pl.plot(mean_fpr, mean_tpr, 'k--',
        label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

pl.xlim([-0.05, 1.05])
pl.ylim([-0.05, 1.05])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
def main():
    """Evaluate bag-of-features classification for several projection sizes.

    For every entry of ``ldims``:

    1. each category in ``catList`` is projected with ``dimred``, rounded to
       int16 and written to ``tempPath``; up to ``nSamplesPerCat`` rows per
       category are sampled (with replacement) for clustering;
    2. a codebook of ``nCodewords`` entries is learnt with ``kmeans2`` and
       written to disk;
    3. every image of every category is turned into a codeword histogram
       and written to the category's bag-of-features file;
    4. a one-vs-rest ``svm.SVC`` is cross-validated per category and the
       fold-averaged ROC-AUC and PR-AUC are stored in ``rocauc``/``mapauc``.

    Finally both metrics are plotted with error bars, saved as SVG and NPZ,
    and the figure is shown. The SVM kernel is taken from ``sys.argv[1]``
    and defaults to ``'linear'``.

    Raises ValueError when ``catList`` holds fewer than two distinct
    categories (no negative data can be built).
    """
    try:
        kernelType = sys.argv[1]
    except IndexError:
        kernelType = 'linear'

    # initialise output matrices
    rocauc = np.zeros((nDim, nCategory), dtype=np.float32)
    mapauc = np.zeros((nDim, nCategory), dtype=np.float32)
    nSamplesPerCat = int(np.round(nClusterSamples / nCategory))

    # Columns kept from a bag-of-features row: the nCodewords histogram bins
    # plus column nCodewords + 1 (category id), later overwritten with the
    # class label. Both inputs are 1-D, so no axis argument is given
    # (axis=1 is out of bounds for 1-D arrays).
    select = np.concatenate((np.arange(nCodewords), [nCodewords + 1]))

    for iLDim, ldim in enumerate(ldims):
        # write the lower dimensional projections for each category
        samples = []
        for iCategory, catname in enumerate(catList):
            dataOuttemp = dimred(iCategory, catname, ldim)
            dataOut = np.array(np.round(dataOuttemp).astype(np.int16),
                               dtype=np.int16)
            outFilename = tempPath + catname + '.' + dataExt
            np.savetxt(outFilename, dataOut, delimiter=' ', fmt='%d')
            if dataOut.shape[0] <= nSamplesPerCat:
                samples.append(dataOut)
            else:
                rndsample = np.random.randint(0, dataOut.shape[0],
                                              nSamplesPerCat)
                samples.append(dataOut[rndsample, :])
        dataLower = np.concatenate(samples, axis=0)

        # cluster random sampled lower dimensional data:
        # compute the code-book for the data-set
        CodeBook, _ = kmeans2(dataLower, nCodewords, iter=nIterKmeans,
                              minit='points', missing='warn')

        # write code-book to file (closed even if savetxt fails)
        cbfilepath = tempPath + dataset + codebookext
        with open(cbfilepath, 'w') as cbfile:
            np.savetxt(cbfile, CodeBook, fmt='%f', delimiter=' ')

        # build one codeword histogram per image and category
        for iCategory, catname in enumerate(catList):
            tempFilename = tempPath + catname + '.' + dataExt
            catData = np.loadtxt(tempFilename, dtype=np.int16, delimiter=' ')
            catLabel, _ = vq(catData, CodeBook)
            catfilePath = dataPath + catname + '.' + dataExt
            catImgId = np.genfromtxt(catfilePath, dtype=int, usecols=[-2])
            catId = np.genfromtxt(catfilePath, dtype=int, usecols=[-1])[0]
            rows = []
            for imgid in np.unique(catImgId):
                imgLabel = catLabel[catImgId == imgid]
                # Fixed range so that bin k always counts codeword k; without
                # it the bins span each image's own min..max label and the
                # histograms of different images are not comparable.
                hist, _ = np.histogram(imgLabel, bins=nCodewords,
                                       range=(0, nCodewords))
                rows.append(np.hstack((hist, imgid, catId)))
            # vstack keeps the output 2-D even for a single image
            dataout = np.vstack(rows)
            catboffilepath = tempPath + catname + bofext
            np.savetxt(catboffilepath, dataout, fmt='%d', delimiter=' ')

        # one-vs-rest classification per category
        for iCategory, catname in enumerate(catList):
            # read the category data which will be positive
            catpos = np.genfromtxt(tempPath + catname + bofext, dtype=int)
            catpos = catpos.take(select, axis=1)
            catpos[:, -1] = 1

            # Read the other categories' files. The path is built from the
            # loop variable; the earlier code used catname and therefore
            # loaded the positive file as negative data.
            catset = set(catList)
            catset.remove(catname)
            if not catset:
                raise ValueError(
                    "no negative categories available besides %r"
                    % (catname,))
            catneg = np.concatenate(
                [np.genfromtxt(tempPath + cat + bofext, dtype=int)
                 for cat in catset],
                axis=0)

            # sample the negative data to have equal size as the positive
            nPos = catpos.shape[0]
            nNeg = catneg.shape[0]
            catneg = catneg[np.random.randint(0, nNeg, nPos), :]
            catneg = catneg.take(select, axis=1)
            catneg[:, -1] = -1

            # combine positive and negative data
            data = np.concatenate((catpos, catneg), axis=0)
            X = data[:, :nCodewords]
            y = data[:, nCodewords]

            cv = StratifiedKFold(y, k=nFold)
            classifier = svm.SVC(kernel=kernelType, probability=True)
            metricstemp = np.zeros((nFold, nMetrics), float)
            for i, (train, test) in enumerate(cv):
                probas_ = classifier.fit(X[train],
                                         y[train]).predict_proba(X[test])
                print(y[test])
                print(probas_[:, 1])
                # Best effort: a fold whose curve cannot be computed scores 0
                try:
                    fpr, tpr, _ = roc_curve(y[test], probas_[:, 1])
                    roc_auc = auc(fpr, tpr)
                except Exception:
                    roc_auc = 0.
                try:
                    precision, recall, _ = precision_recall_curve(
                        y[test], probas_[:, 1])
                    pr_auc = auc(recall, precision)
                except Exception:
                    pr_auc = 0.
                metricstemp[i] = [roc_auc, pr_auc]

            # Average each metric over the folds: column 0 is ROC-AUC and
            # column 1 is PR-AUC (rows are folds).
            rocauc[iLDim, iCategory] = np.mean(metricstemp[:, 0])
            mapauc[iLDim, iCategory] = np.mean(metricstemp[:, 1])
            print('%s classified...' % (catname))

    outPath = rootDir + dataset + outDir + '%s%s%s%s' % (
        'dimensionality', dataset, kernelType, '.svg')
    outPath1 = rootDir + dataset + outDir + '%s%s%s%s' % (
        'dimensionality', dataset, kernelType, '.npz')

    plt.figure(0)
    plt.errorbar(np.arange(1, nDim + 1), np.mean(rocauc, axis=1),
                 np.std(rocauc, axis=1), fmt='-', elinewidth=1, marker='x',
                 label='AUC-ROC')
    plt.errorbar(np.arange(1, nDim + 1), np.mean(mapauc, axis=1),
                 np.std(mapauc, axis=1), fmt='--', elinewidth=1, marker='o',
                 label='MAP')
    plt.xlabel('Visual Categories')
    plt.ylabel('Performance Metric')
    plt.title('BOF Performance: %s : %s' % (dataset, kernelType))
    plt.legend(loc="lower right")
    plt.savefig(outPath, format='svg')
    # Saving the raw numbers is best effort; the plot is still shown
    try:
        np.savez(outPath1, rocauc, mapauc)
    except Exception:
        print('unable to write file %s' % (outPath1))
    plt.show()
    plt.close()
from src.utils import L_ex, sigmoid, roc_curve, get_path

# Scores a fixed, hard-coded linear model on random row samples of D_ex and
# stores the resulting AUC values as JSON.
path = get_path(__file__) + '/..'
w = np.array([-410.6073, 0.1494, 4.4185])
idxs = [L_ex.index(f) for f in ['sdE5', 'V11', 'E9']]
Xf = D_ex[:, idxs]
num_tests = 5
results = []
for i in range(num_tests):
    # 100000 row indices drawn with replacement from [0, n_rows).
    # randint with an exclusive upper bound replaces the deprecated
    # random_integers(0, n_rows - 1, ...); the sample count must be an int,
    # not the float 1e5.
    test_rows = np.random.randint(0, D_ex.shape[0], 100000)
    X = Xf[test_rows, :]
    y = D_ex[test_rows, 2]
    lin = np.dot(X, w)
    probs = sigmoid(lin)
    # integer sample count for the threshold grid as well
    fpr, tpr, thresholds = roc_curve(y, probs,
                                     thresholds=np.linspace(0, 1, 1000))
    results.append(auc(fpr, tpr))

json_path = '{0}/data/hard-coded-results-{1}-tests.json'.format(path, num_tests)
with open(json_path, 'w') as f:
    json.dump(results, f, indent=4)
# Deterministic shuffle of the sample order
p = list(range(n_samples))
random.seed(0)
random.shuffle(p)
X, y = X[p], y[p]
half = n_samples // 2

# Add noisy features
X = np.c_[X, np.random.randn(n_samples, 200 * n_features)]

# Run classifier: train on the first half, score the second half
classifier = svm.SVC(kernel='linear', probability=True)
classifier.fit(X[:half], y[:half])
probas_ = classifier.predict_proba(X[half:])

# Compute ROC curve and area under the curve
fpr, tpr, thresholds = roc_curve(y[half:], probas_[:, 1])
roc_auc = auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)

# Plot ROC curve
pl.figure(-1)
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k--')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()
def calcAUC(targetData):
    """Return the ROC-AUC of random scores against ``targetData``.

    One score per target is drawn with ``numpy.random.random``; the curve is
    built with ``roc`` and integrated with ``auc``.
    """
    random_scores = numpy.random.random((len(targetData)))
    FPR, TPR, _ = roc(targetData, random_scores)
    return auc(FPR, TPR)
def test_roc():
    """test Receiver operating characteristic (ROC)"""
    # second half of the labels vs. the column-1 probabilities
    held_out_labels = y[half:]
    positive_scores = probas_[:, 1]
    fpr, tpr, _ = roc(held_out_labels, positive_scores)
    assert_array_almost_equal(auc(fpr, tpr), 0.80, decimal=2)
def test_precision_recall():
    """test Precision-Recall"""
    held_out_labels = y[half:]
    positive_scores = probas_[:, 1]
    precision, recall, _ = precision_recall(held_out_labels, positive_scores)
    # NOTE(review): auc is called as (precision, recall) here; the pinned
    # value 0.3197 belongs to this argument order - confirm before swapping.
    area = auc(precision, recall)
    assert_array_almost_equal(area, 0.3197, 3)
# Run classifier with crossvalidation and plot ROC curves
cv = StratifiedKFold(y, k=6)
classifier = svm.SVC(kernel='linear', probability=True)

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    # fit on the training indices, predict probabilities for the test ones
    classifier.fit(X[train], y[train])
    probas_ = classifier.predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    # accumulate the fold's curve on the common FPR grid
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    pl.plot(fpr, tpr, lw=1,
            label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

# reference diagonal
pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

# turn the sum into a mean and pin the last point to 1.0
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
pl.plot(mean_fpr, mean_tpr, 'k--',
        label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

pl.xlim([-0.05, 1.05])
pl.ylim([-0.05, 1.05])