def knnClassify(data, n_neighbors=10, nFold=10, beta=1.0, nMetrics=1):
    """Cross-validate a k-nearest-neighbours classifier and score each fold.

    Args:
        data: 2-D array. Every column except the last is used as a feature;
            the last column is used as the class label.
        n_neighbors: Number of neighbours handed to the classifier.
        nFold: Number of stratified cross-validation folds.
        beta: Weighting parameter forwarded to ``fbeta_score``.
        nMetrics: Unused; kept so existing callers keep working.

    Returns:
        Array of length ``nFold`` holding the F-beta score of each fold.
    """
    X = data[:, :-1]
    y = data[:, -1]

    # Honour the caller's n_neighbors instead of a fixed value of 10.
    classifier = NeighborsClassifier(n_neighbors=n_neighbors,
                                     algorithm='auto')
    cv = StratifiedKFold(y, k=nFold)

    avgprec = np.zeros(nFold)
    for icv, (train, test) in enumerate(cv):
        clf = classifier.fit(X[train], y[train])
        ypred = clf.predict(X[test])
        avgprec[icv] = fbeta_score(y[test], ypred, beta=beta)
    return avgprec
def test(train_data, test_data, train_class, test_class):
    """Fit k-NN classifiers for several k and print each success rate.

    One classifier is trained on (train_data, train_class) for each
    k in 1, 3, 5, 11, 17, 21. After all of them are fitted, the value of
    ``success_rate`` on (test_data, test_class) is printed for each, in
    the same order.
    """
    neighbor_counts = (1, 3, 5, 11, 17, 21)

    # Train every model first; reporting only starts once all fits succeed.
    fitted_models = []
    for k in neighbor_counts:
        model = NeighborsClassifier(n_neighbors=k)
        model.fit(train_data, train_class)
        fitted_models.append(model)

    for model in fitted_models:
        print(success_rate(model, test_data, test_class))
def blWord():
    """Run one-vs-rest k-NN classification on bag-of-words features.

    Options (``dataset``, ``nFold``, ``nCodeword``, ``verbose``) are parsed
    from ``sys.argv`` with the module-level ``parser``. For each category
    returned by ``getCatMap(dataset)``:

    * its own file supplies the positive samples (label 1);
    * the files of all *other* categories are pooled and randomly sampled
      (with replacement) down to the number of positives (label 0);
    * a 10-nearest-neighbour classifier is evaluated with stratified
      ``nFold``-fold cross-validation using the F1 score.

    Returns:
        ``[perfMean, perfStd]``: two arrays with one entry per category,
        the mean and standard deviation of the per-fold F1 scores.

    Raises:
        ValueError: If a category has no other category to draw negative
            samples from.
    """
    options, _ = parser.parse_args(sys.argv[1:])
    dataset = options.dataset
    nFold = options.nFold
    nCodeword = options.nCodeword

    dataPath = rootDir + dataset + bofDir
    catList = list(getCatMap(dataset).keys())

    # The 1000-codeword files use the bare extension; any other size is
    # prefixed with the codeword count.
    if nCodeword == 1000:
        dataext = bofext
    else:
        dataext = str(nCodeword) + bofext

    # Read every category file once; each is needed as the positive set for
    # its own category and as part of the negative pool for all the others.
    catData = {}
    for name in catList:
        features = np.genfromtxt(dataPath + name + dataext, dtype=int)
        catData[name] = features[:, :nCodeword]

    nCategory = len(catList)
    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)

    for iCat, catname in enumerate(catList):
        print(catname)

        # Positive samples: the current category, labelled 1.
        catpos = catData[catname]
        posLabel = np.ones((catpos.shape[0], 1), dtype=int)
        catpos = np.concatenate((catpos, posLabel), axis=1)

        # Negative samples: the data of every remaining category.
        negParts = [catData[cats] for cats in catList if cats != catname]
        if not negParts:
            raise ValueError(
                "category %r has no other category to use as negatives"
                % (catname,))
        catneg = np.concatenate(negParts, axis=0)

        # Sample the negatives (with replacement) so both classes have the
        # same number of rows, then label them 0.
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]
        negLabel = np.zeros((catneg.shape[0], 1), dtype=int)
        catneg = np.concatenate((catneg, negLabel), axis=1)

        # Combine and shuffle rows to aid random train/test selection.
        data = np.concatenate((catpos, catneg), axis=0)
        np.random.shuffle(data)
        X = data[:, :-1]
        y = data[:, -1]

        cv = StratifiedKFold(y, k=nFold)
        classifier = NeighborsClassifier(n_neighbors=10, algorithm='auto')

        avgprec = np.zeros(nFold)
        for icv, (train, test) in enumerate(cv):
            clf = classifier.fit(X[train], y[train])
            ypred = clf.predict(X[test])
            avgprec[icv] = f1_score(y[test], ypred)

        perfMean[iCat] = np.mean(avgprec)
        perfStd[iCat] = np.std(avgprec)

    if options.verbose:
        print(perfMean)
        print(perfStd)

    return [perfMean, perfStd]
# NOTE(review): this section relies on `data`, `data_pca`, `normalize`, `PCA`,
# `rates` and `print_result` being defined earlier; none are visible here.

# Normalise the first element of data_pca and keep the second element as is.
# presumably element 0 holds the samples and element 1 the labels — TODO confirm
data_pca = (normalize(data_pca[0]).tolist(), data_pca[1])

# Fit a PCA on the normalised values and replace them with their projection.
pca = PCA()
pca.fit(data_pca[0])
data_pca = (pca.transform(data_pca[0]).tolist(), data_pca[1])

# Parameter grids handed to rates() below.
kneighbors_grid = [1, 3, 5, 11, 21, 31]  # n_neighbors values
svm_linear_grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]  # C values

# RBF grid: every [C, gamma] pair drawn from svm_linear_grid x svm_linear_grid.
svm_rbf_grid = []
for i in svm_linear_grid:
    for j in svm_linear_grid:
        svm_rbf_grid.append([i, j])

# Not used in the visible part of the file.
# presumably consumed by a random-forest run further down — TODO confirm
randforest_grid = [2, 3, 5, 10, 20, 40, 60]

# Each run below passes a dataset, a grid and a factory that builds one
# classifier per grid entry to rates(), then prints what rates() returned.

# k-nearest neighbours on data and on the PCA-transformed data.
kneighbors_rates = rates(data, kneighbors_grid, lambda K: NeighborsClassifier(n_neighbors=K))
print_result(kneighbors_rates)
kneighbors_pca_rates = rates(data_pca, kneighbors_grid, lambda K: NeighborsClassifier(n_neighbors=K))
print_result(kneighbors_pca_rates)

# Linear-kernel SVM on data and on the PCA-transformed data.
svm_linear_rates = rates(data, svm_linear_grid, lambda C: SVC(C=C, kernel='linear'))
print_result(svm_linear_rates)
svm_linear_pca_rates = rates(data_pca, svm_linear_grid, lambda C: SVC(C=C, kernel='linear'))
print_result(svm_linear_pca_rates)

# RBF-kernel SVM on data; each grid entry is a [C, gamma] pair.
svm_rbf_rates = rates(data, svm_rbf_grid, lambda C: SVC(C=C[0], gamma=C[1], kernel='rbf'))
print_result(svm_rbf_rates)