def classify_nn(X, y, k):
    """Split X/y into 75% train / 25% test, fit a k-NN classifier, print accuracy.

    X: (m, n_features) sample array; y: (m,) label array; k: neighbor count.
    Prints the fitted classifier and the test accuracy; returns nothing.
    """
    m = X.shape[0]
    m_test = int(m * 0.25)
    m_train = m - m_test
    # Split data in train and test data
    # A random permutation, to split the data randomly
    #np.random.seed(k)
    indices = np.random.permutation(m)
    X_train = X[indices[:m_train]]
    y_train = y[indices[:m_train]]
    X_test = X[indices[m_train:]]
    y_test = y[indices[m_train:]]
    # Create and fit a nearest-neighbor classifier
    from sklearn.neighbors import NeighborsClassifier
    # BUG FIX: the original built NeighborsClassifier() with no arguments, so
    # the k parameter was never used and every call evaluated the same model
    # while the printout claimed a different k. Pass k through explicitly.
    knn = NeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print('knn=%s' % knn)
    y_pred = knn.predict(X_test)
    correct = y_pred == y_test
    print('k=%2d: Num tests=%6d correct=%6d = %2d%%' % (k, correct.shape[0],
        correct.sum(), int(100 * correct.sum() / correct.shape[0])))
    if False:  # per-sample debug dump, deliberately disabled
        for i in range(correct.shape[0]):
            print(' %d==%d => %d' % (y_pred[i], y_test[i], correct[i]))
        exit()
def run_tests(classifier, test_data, c_type, name, k_kmeans, knn_ks, t_shelf):
    """Evaluate a classifier over `test_data` once for each k in `knn_ks`.

    For c_type 'knn', `classifier` is a (means, labels) pair used to fit a
    fresh NeighborsClassifier per k; for 'svm' it is used as-is.  Prediction
    time, predictions and targets are persisted via save_time/save_results
    and echoed with print_results.
    """
    for n_neighbors in knn_ks:
        if c_type == 'knn':
            centers, center_labels = classifier
            model = NeighborsClassifier(n_neighbors=n_neighbors)
            model.fit(centers, center_labels)
        elif c_type == 'svm':
            model = classifier
        # Label every test sample with the model, timing the predict call.
        samples = np.vstack(test_data)
        t0 = time.time()
        predictions = model.predict(samples)
        save_time(time.time() - t0, len(samples), t_shelf, name, k_kmeans,
                  n_neighbors)
        expected = make_targets(test_data)
        save_results(predictions, expected, name, k_kmeans, n_neighbors)
        print_results(predictions, expected)
def knnClassify(data, n_neighbors=10, nFold=10, beta=1.0, nMetrics=1):
    """Cross-validated k-NN classification over a labeled data matrix.

    data: array whose last column holds the class labels and whose remaining
          columns are the features.
    n_neighbors: neighbor count for the k-NN classifier.
    nFold: number of stratified CV folds.
    beta: beta for the F-beta score.
    nMetrics: unused; kept for interface compatibility.
    Returns a length-nFold array of per-fold F-beta scores.
    """
    X = data[:, :-1]
    y = data[:, -1]
    # BUG FIX: the original hard-coded 'n_neighbors': 10 here, silently
    # ignoring the n_neighbors argument; pass the parameter through.
    clfParamList = {'n_neighbors': n_neighbors, 'algorithm': 'auto'}
    classifier = NeighborsClassifier(**clfParamList)
    cv = StratifiedKFold(y, k=nFold)
    avgprec = np.zeros(nFold)
    for icv, (train, test) in enumerate(cv):
        clf = classifier.fit(X[train], y[train])
        ypred = clf.predict(X[test])
        avgprec[icv] = fbeta_score(y[test], ypred, beta)
    return avgprec
def blWord():
    """Per-category one-vs-rest k-NN evaluation on bag-of-features data.

    Reads command-line options (dataset, nFold, nCodeword), loads each
    category's codeword histograms as positives and an equal-sized random
    sample of all other categories as negatives, then runs stratified
    nFold CV with a 10-NN classifier.
    Returns [perfMean, perfStd]: per-category mean and std of the F1 score.
    """
    (options, args) = parser.parse_args(sys.argv[1:])  #@UnusedVariable
    dataset = options.dataset
    # kernelType = options.kernelType
    nFold = options.nFold
    nCodeword = options.nCodeword
    dataPath = rootDir + dataset + bofDir
    catmap = getCatMap(dataset)
    catList = catmap.keys()
    if nCodeword == 1000:
        dataext = bofext
    else:
        dataext = str(nCodeword) + bofext
    nCategory = len(catList)
    perfMean = np.zeros(nCategory)
    perfStd = np.zeros(nCategory)
    for iCat, catname in enumerate(catList):
        print(catname)
        # read the category data which will be positive
        fname = dataPath + catname + dataext
        catpos = np.genfromtxt(fname, dtype=np.int)
        catpos = catpos[:, :nCodeword]
        posLabel = np.ones((catpos.shape[0], 1), dtype=np.int)
        catpos = np.concatenate((catpos, posLabel), axis=1)
        # read the category data of the remaining classes
        # BUG FIX: firstvisit must be initialised ONCE before the loop — the
        # original reset it to True on every iteration, so the concatenate
        # branch was dead and only one negative file was ever kept.
        # BUG FIX: each negative class must be read from its own file — the
        # original re-read `fname` (the positive category's file), so the
        # "negatives" were copies of the positives.
        firstvisit = True
        for cats in catList:
            if cats != catname:
                negfname = dataPath + cats + dataext
                if firstvisit:
                    catneg = np.genfromtxt(negfname, dtype=np.int)
                    firstvisit = False
                else:
                    catneg = np.concatenate(
                        (catneg, np.genfromtxt(negfname, dtype=np.int)),
                        axis=0)
        # sample the negative data to have equal size as the positive
        nPos = catpos.shape[0]
        nNeg = catneg.shape[0]
        catneg = catneg[np.random.randint(0, nNeg, nPos), :]
        catneg = catneg[:, :nCodeword]
        negLabel = np.zeros((catneg.shape[0], 1), dtype=np.int)
        catneg = np.concatenate((catneg, negLabel), axis=1)
        # combine positive and negative data
        data = np.concatenate((catpos, catneg), axis=0)
        # shuffle the rows to aid in random selection of train and test
        np.random.shuffle(data)
        X = data[:, :nCodeword]
        y = data[:, nCodeword]
        # clfParamList = {'kernel': kernelType, 'gamma': 1e-3, 'C': 1,
        #                 'degree': 4, 'probability': True,
        #                 'shrinking': True, 'cache_size': 1000}
        # classifier = SVC(**clfParamList)
        cv = StratifiedKFold(y, k=nFold)
        clfParamList = {'n_neighbors': 10, 'algorithm': 'auto'}
        classifier = NeighborsClassifier(**clfParamList)
        avgprec = np.zeros(nFold)
        for icv, (train, test) in enumerate(cv):
            clf = classifier.fit(X[train], y[train])
            # probas_ = clf.predict_proba(X[test])
            # precision, recall, thresholds = \
            #     precision_recall_curve(y[test], probas_[:, 1])
            # avgprec[icv] = auc(recall, precision)
            ypred = clf.predict(X[test])
            avgprec[icv] = f1_score(y[test], ypred)
        perfMean[iCat] = np.mean(avgprec)
        perfStd[iCat] = np.std(avgprec)
    if options.verbose:
        print(perfMean)
        print(perfStd)
    return [perfMean, perfStd]
def test(train_data, test_data, train_class, test_class):
    """Fit k-NN classifiers for several neighbor counts and print each
    classifier's success rate on the test split.

    Fits models for k = 1, 3, 5, 11, 17, 21 (in that order), then prints
    success_rate(model, test_data, test_class) for each, in the same order.
    """
    models = []
    for k in (1, 3, 5, 11, 17, 21):
        model = NeighborsClassifier(n_neighbors=k)
        model.fit(train_data, train_class)
        models.append(model)
    for model in models:
        print(success_rate(model, test_data, test_class))