def NearestCentroidImplementation(X_train, X_test, y_train, y_test, x_classA, x_classB, userID):
    print("Implementing Nearest Centroid")
    clf = NearestCentroid()
    clf.fit(X_train, y_train)
    print("Predicting the train data")
    trainAccuracy = clf.score(X_train, y_train)
    print("Train accuracy =", trainAccuracy)
    print("Predicting the test data")
    testAccuracy = clf.score(X_test, y_test)
    print("Test accuracy =", testAccuracy)

    # Getting the centroid of Class A and Class B
    centroids = clf.centroids_
    centroidClassA = np.array(centroids[0])
    centroidClassB = np.array(centroids[1])
    distanceA, distanceB = distForIndivClassesFromCentroid(
        x_classA, centroidClassA, centroidClassB, x_classB, userID)
    analysisClassWisePointsToCentroid(x_classA, centroidClassA, centroidClassB,
                                      x_classB, userID, distanceA, distanceB)
    analysisAllPointsToBothCentroids(x_classA, centroidClassA, centroidClassB,
                                     x_classB, userID, distanceA, distanceB)
def pickData(filename, class_numbers, training_instances, test_instances):
    data = np.genfromtxt(filename, delimiter=",")  # row 0 holds class labels; each column is one sample

    train_label_final = []
    test_label_final = []
    train_data_final = []
    test_data_final = []
    count = 1

    # columns (samples) per class in each supported dataset
    if filename == "HandWrittenLetters.txt":
        class_count = 39
    elif filename == "ATNTFaceImages400.txt":
        class_count = 10

    for i in range(len(class_numbers)):
        column_from = (class_numbers[i] - 1) * class_count
        column_to = column_from + class_count
        # the last `test_instances` columns of each class block are held out for
        # testing (so `training_instances` is implied as class_count - test_instances)
        training_column_end = column_to - test_instances
        train_label = data[0, column_from:training_column_end]
        train_data = data[1:, column_from:training_column_end]
        test_label = data[0, training_column_end:column_to]
        test_data = data[1:, training_column_end:column_to]
        if count == 1:
            train_label_final = train_label
            test_label_final = test_label
            train_data_final = train_data
            test_data_final = test_data
            count = 0
        else:
            train_label_final = np.hstack((train_label_final, train_label))
            test_label_final = np.hstack((test_label_final, test_label))
            train_data_final = np.hstack((train_data_final, train_data))
            test_data_final = np.hstack((test_data_final, test_data))

    # transpose so samples become rows, as scikit-learn expects
    train_data_final_t = train_data_final.transpose()
    test_data_final_t = test_data_final.transpose()
    clf = NearestCentroid()
    clf.fit(train_data_final_t, train_label_final)
    # predictions = clf.predict(test_data_final_t)
    # print("Test set predictions:\n{}".format(clf.predict(test_data_final_t)))
    # print("Test set accuracy: {:.2f}".format(clf.score(test_data_final_t, test_label_final)))
    accuracy = clf.score(test_data_final_t, test_label_final)
    return accuracy
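# A hedged usage sketch (not in the original): with the ATNT data each class
# block has 10 columns, so training_instances + test_instances should sum to 10.
# The class numbers and split sizes below are illustrative assumptions.
acc = pickData("ATNTFaceImages400.txt", class_numbers=[1, 2, 3, 4],
               training_instances=7, test_instances=3)
print("nearest-centroid accuracy:", acc)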
def ml_algo(inp):
    df = pd.read_csv("data/final_preprocess.csv")
    X = np.array(df.drop(['Result'], axis=1))
    y = np.array(df['Result'])
    X, y = shuffle(X, y, random_state=1)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2)

    model_centroid = NearestCentroid().fit(X_train, y_train)
    model_knn = KNeighborsClassifier(25).fit(X_train, y_train)
    model_svm = SVC().fit(X_train, y_train)
    # note: LinearRegression is a regressor, so its score() below is R^2, not accuracy
    model_lr = LinearRegression().fit(X_train, y_train)
    model_nb = BernoulliNB().fit(X_train, y_train)
    # criterion -> gini or entropy; splitter -> best or random; max_depth -> any integer value or None;
    # min_samples_split -> min no. of samples required to split an internal node;
    # min_samples_leaf -> the minimum number of samples required to be at a leaf node;
    # min_impurity_split -> threshold for early stopping of tree growth
    model_dtree = DecisionTreeClassifier(criterion="entropy", random_state=100,
                                         max_depth=3, min_samples_leaf=5).fit(X_train, y_train)

    # print("[1] ACCURACY OF DIFFERENT MODELS", '\n___________________')
    accu_centroid = model_centroid.score(X_test, y_test)
    # print("NearestCentroid -> ", accu_centroid)
    accu_knn = model_knn.score(X_test, y_test)
    # print("Knn -> ", accu_knn)
    accu_svm = model_svm.score(X_test, y_test)
    # print("SVM -> ", accu_svm)
    accu_lr = model_lr.score(X_test, y_test)
    # print("Linear Regr -> ", accu_lr)
    accu_nb = model_nb.score(X_test, y_test)
    # print("Naive Bayes -> ", accu_nb)
    accu_dtree = model_dtree.score(X_test, y_test)
    # print("Decision Tree -> ", accu_dtree, "\n")

    result_centroid = model_centroid.predict(inp)
    result_knn = model_knn.predict(inp)
    result_svm = model_svm.predict(inp)
    result_lr = model_lr.predict(inp)
    result_nb = model_nb.predict(inp)
    result_dtree = model_dtree.predict(inp)

    # disease-name, description, [list of steps to be taken], [list of whom to contact]
    # print("[2] PREDICTION", '\n___________________')
    # print("NearestCentroid -> ", result_centroid)
    # print("knn -> ", result_knn)
    # print("svm -> ", result_svm)
    # print("LinearReg -> ", result_lr)
    # print("Naive Bayes -> ", result_nb)
    # print("Decision Tree -> ", result_dtree)
    # return map_disease[str(result_knn[0])]
    return result_knn[0]
def NC_select_cv(X, Y, num_features):
    # the removed sklearn.cross_validation API is replaced with the modern
    # StratifiedKFold (assumes: from sklearn.model_selection import StratifiedKFold)
    scores = []
    skf = StratifiedKFold(n_splits=10)
    for train, test in skf.split(X, Y):
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        XRF_train, imp, ind, std = fitRF(X_train, y_train, est=2000)  # RF-based feature selection
        XRF_test = X_test[:, ind]  # reorder the test set to match the RF feature ranking
        clf = NearestCentroid()
        clf.fit(XRF_train[:, :num_features], y_train)
        scores.append(clf.score(XRF_test[:, :num_features], y_test))
    return np.mean(scores)
def model_train(train_datas, train_labels):
    """Train a nearest-centroid classifier.

    (The original docstring said "generate a decision tree", but the code
    fits NearestCentroid.)
    """
    clf = NearestCentroid()
    clf.fit(train_datas, train_labels)
    # persist the trained model
    with open(model_save, 'wb') as f:
        pickle.dump(clf, f)
    train_acc = clf.score(train_datas, train_labels)
    print("Accuracy on the training set:", train_acc)
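# For completeness, a matching load step for the pickled model -- a sketch
# that assumes the same `model_save` path used in model_train above.
import pickle

with open(model_save, 'rb') as f:
    clf = pickle.load(f)
print(clf.predict(train_datas[:5]))  # sanity-check predictions on a few rows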
def Centroid(i, X, y):
    kf = KFold(n_splits=i, random_state=None, shuffle=True)
    print("printing kf", kf)
    kf.get_n_splits(X)
    clf = NearestCentroid()
    accuracy_centroid = 0
    for train_index, test_index in kf.split(X):
        # print('TRAIN:', train_index, 'TEST:', test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        # print("Test set predictions:\n{}".format(clf.predict(X_test)))
        fold_score = clf.score(X_test, y_test)
        accuracy_centroid += fold_score
        print("Centroid Accuracy with ", i, " Fold: ", fold_score)
    print("Average accuracy of centroid with all folds: ", accuracy_centroid / i)
    centroid_accuracy_list.append(accuracy_centroid / i)
class Knn():
    def __init__(self, method, n_neighbors, weights, radius):
        if method == 'knn_class':
            self.clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        elif method == 'knn_rad':
            self.clf = RadiusNeighborsClassifier(radius=radius)
        elif method == 'knn_cent':
            self.clf = NearestCentroid()

    def train_model(self, train):
        self.clf.fit(train[0], train[1])

    def predict(self, data):
        return self.clf.predict(data)

    def test_model(self, test):
        return self.clf.score(test[0], test[1])
def main(CV=False, PLOT=True):
    """Entry Point.

    Parameters
    ----------
    CV: bool
        Cross-validation flag
    PLOT: bool
        Plotting flag
    """
    _data = fetch_data()

    if CV:
        method, params = cross_validate(_data)
    else:
        method = 'l2'
        params = {'metric': chisquare}

    data = normalise(_data, method)
    X_train, y_train = data['train']
    X_test, y_test = data['test']

    classifier = NearestCentroid(**params)
    classifier.fit(X_train, y_train)
    print('ACCURACY: ', classifier.score(X_test, y_test))

    if PLOT:
        y_hat = classifier.predict(X_test)
        cnf_matrix = confusion_matrix(y_test, y_hat)
        plot_confusion_matrix(cnf_matrix,
                              classes=list(set(y_test)),
                              title='Nearest Centroid\nConfusion Matrix',
                              cmap=plt.cm.Blues)
        plt.savefig('data/out/nc_cnf_matrix.pdf', format='pdf', dpi=300, transparent=True)
def train():
    df = pd.read_csv('data.csv')
    df.drop(['id'], axis=1, inplace=True)
    X = np.array(df.drop(['move'], axis=1))
    y = np.array(df['move'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    clf = NearestCentroid(metric='euclidean', shrink_threshold=None)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    print(accuracy)

    # a single hand-written 25-feature example to classify
    example_measures = np.array([0, 0, 1, 0, 0,
                                 0, 0, 1, 0, 0,
                                 0, 0, 1, 0, 0,
                                 0, 0, 1, 0, 0,
                                 0, 0, 1, 0, 0])
    example_measures = example_measures.reshape(1, -1)
    prediction = clf.predict(example_measures)
    print(prediction)
def text_classify(X_train, X_test, y_train, y_test):
    """
    machine learning classifiers

    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :return:
    """
    print('=' * 100)
    print('start launching MLP Classifier......')
    mlp = MLPClassifier(solver='lbfgs', alpha=1e-4,
                        hidden_layer_sizes=(50, 30, 20, 20, 20, 30, 50), random_state=1)
    mlp.fit(X_train, y_train)
    print('finish launching MLP Classifier, the test accuracy is {:.5%}'.format(mlp.score(X_test, y_test)))

    print('=' * 100)
    print('start launching SVM Classifier......')
    svc = svm.SVC(decision_function_shape='ovo')
    svc.fit(X_train, y_train)
    print('finish launching SVM Classifier, the test accuracy is {:.5%}'.format(svc.score(X_test, y_test)))

    print('=' * 100)
    print('start launching Decision Tree Classifier......')
    dtree = tree.DecisionTreeClassifier()
    dtree.fit(X_train, y_train)
    print('finish launching Decision Tree Classifier, the test accuracy is {:.5%}'.format(
        dtree.score(X_test, y_test)))

    print('=' * 100)
    print('start launching Nearest Centroid Classifier......')
    nc = NearestCentroid()  # labelled "KNN" in the original, but this is a nearest-centroid model
    nc.fit(X_train, y_train)
    print('finish launching Nearest Centroid Classifier, the test accuracy is {:.5%}'.format(
        nc.score(X_test, y_test)))

    print('=' * 100)
    print('start launching Random Forest Classifier......')
    rf = RandomForestClassifier(n_estimators=20)
    rf.fit(X_train, y_train)
    print('finish launching Random Forest Classifier, the test accuracy is {:.5%}'.format(rf.score(X_test, y_test)))
def ncc_classify(X_train, y_train, X_test, y_test):
    # simple hyper-parameter search over the distance metric
    params = {'metric': ['euclidean', 'manhattan']}
    best_accuracy = 0
    best_param = params['metric'][0]
    for metric in params['metric']:
        model = NearestCentroid(metric=metric)
        model.fit(X_train, y_train)
        accuracy = model.score(X_test, y_test)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_param = metric

    # refit with the best metric and plot *that* model's predictions
    # (the original plotted the last fitted model's predictions and
    # returned an unfitted model)
    model = NearestCentroid(metric=best_param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plot_confusion_matrix3(y_test, y_pred)
    print(model.metric)
    return model, y_pred
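# The manual metric search above selects on the test set, which leaks test
# information into model selection. A minimal alternative sketch (assuming
# the same X_train/y_train arrays) lets GridSearchCV pick the metric by
# cross-validation on the training data instead:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestCentroid

grid = GridSearchCV(NearestCentroid(), {'metric': ['euclidean', 'manhattan']}, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)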
def nc_train(training_data):
    '''
    nearest-centroid take on the digit-classification approach from
    https://www.kaggle.com/archaeocharlie/a-beginner-s-approach-to-classification
    (the kernel itself uses an SVM)
    '''
    labeled_images = pd.read_csv(training_data)
    images = labeled_images.iloc[0:10000, 1:]
    labels = labeled_images.iloc[0:10000, :1]
    train_images, test_images, train_labels, test_labels = train_test_split(
        images, labels, train_size=0.8, random_state=0)

    # convert all pixels to black and white
    test_images[test_images > 0] = 1
    train_images[train_images > 0] = 1

    clf = NearestCentroid(shrink_threshold=1)

    # train the model on the training set and check the test score
    start = time.time()
    clf.fit(train_images, train_labels.values.ravel())
    end = time.time()
    print("Training time: ", end - start)
    print("Accuracy: ", clf.score(test_images, test_labels))
    return clf
# scores = cross_validation.cross_val_score(clf, data[:, 3:15], data[:, 2], cv=5)
# print(scores)

# Nearest Neighbor
nbrs = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
nbrs_y_pred = nbrs.predict(X_test)
nbrs_pr = precision_score(y_test, nbrs_y_pred)
nbrs_rc = recall_score(y_test, nbrs_y_pred)
nbrs_CM = confusion_matrix(y_test, nbrs_y_pred)
print("------------------")
print("\tNearest Neighbor")
print("------------------")
print("Real: ")
print(y_test)
print("Predict")
print(nbrs_y_pred)
print("Score:")  # note: this actually prints the precision, not the accuracy
print(nbrs_pr)

# NearestCentroid
clf = NearestCentroid().fit(X_train, y_train)
print("------------------")
print("\tNearest Centroid")
print("------------------")
print("Real: ")
print(y_test)
print("Predict")
print(clf.predict(X_test))
print("Score: ")
print(clf.score(X_test, y_test))
print('=' * 100)
print('start launching Decision Tree Classifier......')
dtree = tree.DecisionTreeClassifier()
dtree.fit(train_X, training_label)
print('finish launching Decision Tree Classifier, the test accuracy is {:.5%}'
      .format(dtree.score(test_X, test_label)))

print('=' * 100)
print('start launching Nearest Centroid Classifier......')
nc = NearestCentroid()  # labelled "KNN" in the original, but this is a nearest-centroid model
nc.fit(train_X, training_label)
print('finish launching Nearest Centroid Classifier, the test accuracy is {:.5%}'.format(
    nc.score(test_X, test_label)))

print('=' * 100)
print('start launching Random Forest Classifier......')
rf = RandomForestClassifier(n_estimators=10)
rf.fit(train_X, training_label)
print('finish launching Random Forest Classifier, the test accuracy is {:.5%}'
      .format(rf.score(test_X, test_label)))

"""
train_X, training_label, test_X, test_label = init_20groups_data(TEXT_DIR)
print('=' * 100)
print('start launching MLP Classifier......')
mlp = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(50, 30, 20, 20), random_state=1)
mlp.fit(train_X, training_label)
print('finish launching MLP Classifier, the test accuracy is {:.5%}'.format(mlp.score(test_X, test_label)))
import numpy as np
import scipy.io as sio
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestCentroid  # modern import path

banana = sio.loadmat("banana.mat")
train_data = banana["train_data"]
train_labels = np.array(banana["train_labels"])
test_data = banana["test_data"]
test_labels = np.array(banana["test_labels"])

data = np.concatenate((train_data, test_data), axis=0)
data_labels = np.concatenate((train_labels, test_labels), axis=0)

# split the whole set: training 30%, testing 70%
train, test, train_targets, test_targets = train_test_split(
    data, data_labels.ravel(), test_size=0.70, random_state=42)

# training the classifier
tmp = NearestCentroid()
tmp.fit(train, train_targets)

# score
print("the percentage of correct classifications:", tmp.score(test, test_targets))
import scipy.io as sio
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.neighbors import NearestCentroid  # modern import path

banana = sio.loadmat("banana.mat")
train_data = banana["train_data"]
train_labels = np.array(banana["train_labels"])
test_data = banana["test_data"]
test_labels = np.array(banana["test_labels"])

# keep 30% of the original training set and 70% of the original test set
train, _, train_targets, _ = train_test_split(train_data, train_labels.ravel(), test_size=0.70)
_, test, _, test_targets = train_test_split(test_data, test_labels.ravel(), test_size=0.70)

clf = NearestCentroid()
clf.fit(train, train_targets)
Z = clf.predict(test)
print("Percentage of correct classifications:",
      round(clf.score(test, test_targets) * 100, 2), "%")
# -*- coding: utf-8 -*-
"""
Created on Sun Jun  4 09:20:28 2017

@author: 凯风
"""

from sklearn.neighbors import NearestCentroid  # modern import path
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# prepare the data
iris_dataset = load_iris()
X, Y = iris_dataset.data, iris_dataset.target
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=.3)

'''
Nearest-centroid classification:
    much like KNN; it computes the centroid of each class from that class's
    data and then uses the centroid to represent the class.
    A fairly simple base classifier with few parameters.
'''
rlf = NearestCentroid(metric='euclidean', shrink_threshold=None)
rlf.fit(trainX, trainY)
rlf.score(testX, testY)
preY = rlf.predict(testX)

'''
metric              how distances are computed
shrink_threshold    threshold for shrinking centroids to remove features
'''
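# To make the comment block above concrete, a plain-NumPy sketch (not part of
# the original) that reproduces the euclidean nearest-centroid rule on the
# same iris split:
import numpy as np

classes = np.unique(trainY)
centroids = np.array([trainX[trainY == c].mean(axis=0) for c in classes])  # one mean per class
dists = np.linalg.norm(testX[:, None, :] - centroids[None, :, :], axis=2)  # test-to-centroid distances
manual_pred = classes[dists.argmin(axis=1)]  # assign each sample its nearest centroid's class
print((manual_pred == preY).all())  # agrees with NearestCentroid's predictions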
for train_index, test_index in kfold.split(data_opto_SOM, target):
    ## CONTROL ##
    x_train, x_test = data_control[train_index, :], data_control[test_index, :]
    y_train, y_test = target[train_index], target[test_index]

    mul_lr = LogisticRegression(multi_class='multinomial', solver='newton-cg', max_iter=300)
    mul_lr.fit(x_train, y_train)
    score_control_LR[n, f] = mul_lr.score(x_test, y_test) * 100
    print(mul_lr.score(x_test, y_test))

    # nearest centroid (scores stored in the *_NN arrays)
    clf = NearestCentroid(metric='euclidean', shrink_threshold=None)
    clf.fit(x_train, y_train)
    score_control_NN[n, f] = clf.score(x_test, y_test) * 100

    lda = LinearDiscriminantAnalysis(solver='svd')
    lda.fit(x_train, y_train)
    score_control_LDA[n, f] = lda.score(x_test, y_test) * 100
    print(lda.score(x_test, y_test))

    svm_algo = svm.SVC(decision_function_shape='ovo', kernel='linear')
    svm_algo.fit(x_train, y_train)
    score_control_SVM[n, f] = svm_algo.score(x_test, y_test) * 100

    ## DBS ##
    x_train, x_test = data_DBS[train_index, :], data_DBS[test_index, :]
    y_train, y_test = target[train_index], target[test_index]
    mul_lr = LogisticRegression(multi_class='multinomial',
print('Reading features... Done!')

# STEP 2 - computing scores
print('Training...')
tfidf = models.TfidfModel(dictionary=features)  # tf-idf model to be queried
tfidf.save('reuters/data/tfidf.model')

# STEP 3 - computing centroids
tfidf = models.TfidfModel.load('reuters/data/tfidf.model')
features = corpora.Dictionary.load_from_text('reuters/data/word.dict')
by_bow = Corpus2Dictionary(features)
train_corpus = ReutersCorpus('training')
tfidf_train = tfidf[by_bow[by_word[train_corpus]]]
X = matutils.corpus2csc(tfidf_train)  # gensim corpus -> scipy sparse matrix
X = X.transpose()  # from csc (document per column) to csr (document per row)
y = train_corpus.category_mask  # label for each document
rocchio = NearestCentroid()
rocchio.fit(X, y)
print('Training... Done!')

# STEP 4 - evaluate prediction
test_corpus = ReutersCorpus('test')
tfidf_test = tfidf[by_bow[by_word[test_corpus]]]
# num_terms is required: otherwise X would shrink to the largest feature found
X = matutils.corpus2csc(tfidf_test, num_terms=len(features))
X = X.transpose()
y_true = test_corpus.category_mask
y_pred = rocchio.predict(X)
# print(precision_score(y_true, y_pred))
print(rocchio.score(X, y_true))
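# The same Rocchio idea (tf-idf vectors classified by nearest class centroid)
# can be sketched with scikit-learn alone. This is an illustrative stand-in --
# 20 newsgroups instead of Reuters, TfidfVectorizer instead of gensim -- not
# the pipeline the snippet above actually uses.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestCentroid

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
vectorizer = TfidfVectorizer()
Xtr = vectorizer.fit_transform(newsgroups_train.data)  # sparse tf-idf, one row per document
Xte = vectorizer.transform(newsgroups_test.data)
rocchio_sk = NearestCentroid()
rocchio_sk.fit(Xtr, newsgroups_train.target)
print(rocchio_sk.score(Xte, newsgroups_test.target))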
predicted = clf.predict(X_test)

# report and confusion matrix for the kNN results
print(clf.score(X_test, y_test))
print("Classification report for kNN classifier %s:\n%s\n"
      % (clf, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
print(clf.score(X_test, y_test))

# time the fitting for Nearest Centroid
start = int(round(time.time() * 1000))
classifier = NearestCentroid()  # euclidean metric, no shrinkage (the defaults)
classifier.fit(X_lda, y_train)  # fit on the LDA-projected training data, so
                                # X_test must live in the same projected space
print(classifier)
end = int(round(time.time() * 1000))
print("--Centroid fitting finished in ", (end - start), "ms")
expected = y_test
predicted = classifier.predict(X_test)

# report and confusion matrix for the centroid results
print(classifier.score(X_test, y_test))
print("Classification report for Centroid classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
print(classifier.score(X_test, y_test))
def cross_validate(data):
    """Cross-Validate Nearest Centroid.

    Parameters
    ----------
    data: dict
        * train: tuple
            - X: features
            - y: labels
        * test: tuple
            - X: features
            - y: labels

    Returns
    -------
    method: str
        Transformation function
    params: dict
        * metric: function | str
            Distance metric function
        * metric_params: dict
            Parameters of `metric` function
    """
    norm_methods = ['none', 'l1', 'l2', 'max', 'standard', 'maxabs', 'minmax', 'robust']
    params_grid = [('Intersection', {'metric': intersection}),
                   ('Correlation', {'metric': correlation}),
                   ('Manhattan', {'metric': 'manhattan'}),
                   ('Euclidean', {'metric': 'euclidean'}),
                   ('Chebyshev', {'metric': 'chebyshev'}),
                   ('Chi-Square', {'metric': chisquare})]
    results = {}
    best_params = {}
    best_score = -1
    for method in norm_methods:
        data = normalise(data, method=method)
        X_train, y_train = data['train']
        X_test, y_test = data['test']
        results[method] = {}
        for name, params in params_grid:
            classifier = NearestCentroid(**params)
            acc = cross_val_score(classifier, X_train, y_train, cv=3).mean()
            results[method][name] = acc
        best_metric = None
        best_accuracy = -1
        for name, accuracy in results[method].items():
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_metric = name
        best_params_ = {**dict(params_grid)[best_metric]}
        print('')
        print('[%s]' % method, 'Best params:', best_params_)
        print('[%s]' % method, 'Best CV Score:', results[method][best_metric])
        best_classifier_ = NearestCentroid(**best_params_)
        best_classifier_.fit(X_train, y_train)
        best_score_ = best_classifier_.score(X_test, y_test)
        print('[%s]' % method, 'Accuracy:', best_score_)
        if best_score_ > best_score:
            print('[%s]' % method, 'New Best:', best_params_)
            best_params = (method, best_params_)
            best_score = best_score_
    print('')
    print('Cross Validation Results:', best_params)
    print('')
    return best_params
counter2 = 0
for i in range(len(x)):
    if z[i] == 1:
        plt.scatter(x[i], y[i], c="RED")
        counter1 += 1
    else:
        plt.scatter(x[i], y[i], c="GREEN")
        counter2 += 1
c1 = [c1[0] / counter1, c1[1] / counter1]
c2 = [c2[0] / counter2, c2[1] / counter2]
plt.scatter(c1[0], c1[1], c="BLUE")
plt.scatter(c2[0], c2[1], c="BROWN")
plt.show()

# Exercise 8:
print("Classifier efficiency: %f" % clf.score(train, np.ravel(train_targets)))

# Exercise 9:
k_best = [0, 0]
for k in range(5, 17):
    knn = neighbors.KNeighborsClassifier(k, weights='uniform', metric='euclidean')
    knn.fit(train, np.ravel(train_targets))
    predicted = knn.predict(test)
    # note: this scores on the *training* set, so it measures fit, not generalisation
    sc = knn.score(train, np.ravel(train_targets))
    if sc > k_best[1]:
        k_best = [k, sc]
print("Best efficiency in 5-16 is for k =", k_best[0], " --> efficiency : ", k_best[1])
### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

## nearest centroid (from the nearest-neighbors module)
from sklearn.neighbors import NearestCentroid

clf = NearestCentroid()
T0 = time()
clf = clf.fit(features_train, labels_train)
print("nearest centroid training time:", round(time() - T0, 3), "s")
T1 = time()
PRED = clf.predict(features_test)
print("nearest centroid prediction time:", round(time() - T1, 3), "s")
ACC = clf.score(features_test, labels_test)
print(ACC)

# ## adaboost
from sklearn import ensemble

clf = ensemble.AdaBoostClassifier()
T0 = time()
clf = clf.fit(features_train, labels_train)
print("adaboost training time:", round(time() - T0, 3), "s")
T1 = time()
PRED = clf.predict(features_test)
print("adaboost prediction time:", round(time() - T1, 3), "s")
ACC = clf.score(features_test, labels_test)
print(ACC)

# ## random forest
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()

################################################################################
### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

# nearest centroid (the original comment said "kmeans", but this is a supervised classifier)
from sklearn.neighbors import NearestCentroid

clf = NearestCentroid()
clf.fit(features_train, labels_train)
predict = clf.predict(features_test)
acc = clf.score(features_test, labels_test)
print(acc)
# 0.908

# AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(features_train, labels_train)
clf.predict(features_test)
acc = clf.score(features_test, labels_test)
print(acc)
# 0.924

# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
def myclassify(numfiers=5, xtrain=xtrain, ytrain=ytrain, xtest=xtest, ytest=ytest):
    count = 0

    bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
    bagging2.fit(xtrain, ytrain)
    # print(bagging2.score(xtest, ytest))
    count += 1
    classifiers = [bagging2.score(xtest, ytest)]

    if count < numfiers:
        tree2 = ETC()
        tree2.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree2.score(xtest, ytest))
        print("1")
        print(tree2.score(xtest, ytest))

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, bagging1.score(xtest, ytest))
        print("2")
        print(bagging1.score(xtest, ytest))

    # if count < numfiers:
    #     # voting classifiers combine completely different machine learning
    #     # classifiers and use a majority vote
    #     clff1 = SVC()
    #     clff2 = RFC(bootstrap=False)
    #     clff3 = ETC()
    #     clff4 = neighbors.KNeighborsClassifier()
    #     clff5 = quadda()
    #     print("3")
    #     eclf = VotingClassifier(estimators=[('svc', clff1), ('rfc', clff2), ('etc', clff3),
    #                                         ('knn', clff4), ('qda', clff5)])
    #     eclf = eclf.fit(xtrain, ytrain)
    #     # for claf, label in zip([clff1, clff2, clff3, clff4, clff5, eclf],
    #     #                        ['SVC', 'RFC', 'ETC', 'KNN', 'QDA', 'Ensemble']):
    #     #     scores = crossvalidation.cross_val_score(claf, xtrain, ytrain, scoring='accuracy')
    #     count += 1
    #     classifiers = np.append(classifiers, eclf.score(xtest, ytest))

    # if count < numfiers:
    #     svc1 = SVC()
    #     svc1.fit(xtrain, ytrain)
    #     dec = svc1.score(xtest, ytest)
    #     count += 1
    #     classifiers = np.append(classifiers, svc1.score(xtest, ytest))
    #     print("3")

    if count < numfiers:
        # quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, qda.score(xtest, ytest))
        print("4")

    if count < numfiers:
        tree1 = DTC()
        tree1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree1.score(xtest, ytest))

    if count < numfiers:
        # classifies based on the k nearest neighbors, where k is defined by the user
        knn1 = neighbors.KNeighborsClassifier()
        knn1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn1.score(xtest, ytest))

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary
        lda = linda()
        lda.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, lda.score(xtest, ytest))

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree3.score(xtest, ytest))

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False)
        bagging3.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, bagging3.score(xtest, ytest))

    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False)
        bagging4.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, bagging4.score(xtest, ytest))

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree4.score(xtest, ytest))

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree6.score(xtest, ytest))

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors=10)
        knn2.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn2.score(xtest, ytest))

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors=3)
        knn3.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn3.score(xtest, ytest))

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm='ball_tree')
        knn4.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn4.score(xtest, ytest))

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm='kd_tree')
        knn5.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, knn5.score(xtest, ytest))

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, ncc1.score(xtest, ytest))

    if count < numfiers:
        # nearest shrunken centroid: one score is appended per shrinkage value,
        # so this block can add up to seven entries while the label list below
        # has a single "Shrunken Centroid" slot -- the labels drift after this point
        for shrinkage in [None, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
            ncc2 = NearestCentroid(shrink_threshold=shrinkage)
            ncc2.fit(xtrain, ytrain)
            count += 1
            classifiers = np.append(classifiers, ncc2.score(xtest, ytest))

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain, ytrain)
        count += 1
        classifiers = np.append(classifiers, tree5.score(xtest, ytest))

    classifierlabel = ["BaggingETC (with bootstraps set to false)", "ETC", "BaggingETC",
                       "Voting Classifier", "svm", "QDA", "DTC", "KNN (default)", "LDA", "RFC",
                       "BaggingRFC (with bootstraps set to false)",
                       "BaggingSVC (with bootstraps set to false)", "RFC (bootstrap false)", "GBC",
                       "knn (n_neighbors = 10)", "knn (n_neighbors = 3)",
                       "knn (ball tree algorithm)", "knn (kd_tree algorithm)",
                       "Nearest Centroid", "Shrunken Centroid?", "ABC"]
    classifierlabel = classifierlabel[:len(classifiers)]
    # print(len(classifiers))
    # print(classifiers)
    for i in range(len(classifiers)):
        print("{} classifier has percent correct {}".format(classifierlabel[i], classifiers[i]))
def Euclidean_MDC(X_train, X_test, y_train, y_test):
    clf = NearestCentroid(metric='euclidean')
    clf.fit(X_train, y_train.values.ravel())
    print(clf.score(X_test, y_test))
import pickle

import numpy as np
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestCentroid  # this import was missing in the original

features = [[20, 11], [20, 5], [20, 12], [17, 7], [16, 7],
            [18, 7], [19, 7], [20, 4], [20, 9], [20, 10]]
r_fea = [[a[1], a[0]] for a in features]  # mirrored copies of the points
# labels = [[0], [1], [1], [0], [1], [0], [1], [0], [0], [1]]
labels = [0, 1, 1, 0, 1, 0, 1, 0, 0, 1]
r_lab = [(a - 1) * (a - 1) for a in labels]  # flipped labels for the mirrored points

X = np.array(features + r_fea)
y = np.array(labels + r_lab)
clf = NearestCentroid()
clf.fit(X, y)
print(clf.centroids_)
print(clf.score(X, y))
print(clf.predict([[20, 7]]))
print(clf.predict([[7, 20]]))

with open('lr.pkl', 'wb') as list_pickle:
    pickle.dump(clf, list_pickle)

# grid for plotting the decision boundary
cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
h = .02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
import winsound as s

s.Beep(1300, 2000)
neigh.score(arr[n:], target[n:])

# Rocchio / nearest centroid
from sklearn.neighbors import NearestCentroid
from sklearn.model_selection import cross_val_score
import numpy as np

n = 50000
clf = NearestCentroid()  # euclidean metric, no shrinkage (the defaults)
clf.fit(arr[:n], target[:n])
s.Beep(1300, 2000)
print(clf.score(arr[n:], target[n:]))
s.Beep(1300, 2000)
scores = cross_val_score(clf, arr[n:], target[n:], cv=5)
print(scores)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

# gnb = GaussianNB()
# y_pred = gnb.fit(arr[:500], target[:500]).predict(arr[500:600])
# print("Number of mislabeled points out of a total %d points : %d"
#       % (arr.shape[0], (target != y_pred).sum()))
n = 50000
clf = GaussianNB()
clf.fit(arr[:n], target[:n])
clf.score(arr[n:], target[n:])
clf_pf = GaussianNB()
def Mahalanobis_MDC(X_train, X_test, y_train, y_test):
    # note: newer scikit-learn releases restrict NearestCentroid's metric to
    # 'euclidean' and 'manhattan', so metric='mahalanobis' only runs on older versions
    clf = NearestCentroid(metric='mahalanobis')
    clf.fit(X_train, y_train.values.ravel())
    print(clf.score(X_test, y_test))
    # (inside the batching loop over the test set)
    t[:, 2:30, 2:30, 1] = x_test[k:(k + 100)]
    t[:, 2:30, 2:30, 2] = x_test[k:(k + 100)]
    _ = model.predict(t)
    out = [model.layers[5].output]
    func = K.function([model.input, K.learning_phase()], out)
    test[k:(k + 100), :] = func([t, 1.])[0]  # grab layer-5 activations as the embedding
    k += 100

np.save("mnist_test_embedded.npy", test)

# use the 128-element vectors as training data for other models
print("Full MNIST dataset:")
print()

print("Training Nearest Centroid")
clf0 = NearestCentroid()
clf0.fit(train, y_train)
nscore = 100.0 * clf0.score(test, y_test)

print("Training 3-NN")
clf1 = KNeighborsClassifier(n_neighbors=3)
clf1.fit(train, y_train)
kscore = 100.0 * clf1.score(test, y_test)

print("Training Random Forest")
clf2 = RandomForestClassifier(n_estimators=50)
clf2.fit(train, y_train)
rscore = 100.0 * clf2.score(test, y_test)

print("Training Linear SVM")
clf3 = LinearSVC(C=0.1)
clf3.fit(train, y_train)
sscore = 100.0 * clf3.score(test, y_test)
                          usecols=tuple(columns))

# standard-normally distributed data: Gaussian with zero mean and unit variance
trainingData_scaled = preprocessing.scale(trainingData)

# get a 50000 x 1 column array for all of the results (boolean) (just 1000 x 1 for now)
results = np.loadtxt(filePath, delimiter=',', skiprows=numRowsToSkip, usecols=(622,))

# TRAIN THE MODELS
# randomly split the data into training set and test set (40% testing)
X_train, X_test, y_train, y_test = train_test_split(trainingData_scaled, results,
                                                    test_size=0.4, random_state=0)

# Accuracy: 70%
model = NearestCentroid()
# fit the model according to the given training data
model.fit(X_train, y_train)

# evaluate the trained model on the test set:
# score() returns the mean accuracy on the given test data and labels
testAccuracy = model.score(X_test, y_test)
print("Final results for '%s': testing accuracy of %f%%" % (model, testAccuracy * 100))
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
conf = confusion_matrix(y_test, y_pred)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))
print("K-nn", acc, prec, rec, conf)

# +++++++++++++++++++++++++++++++++++++++++++ Nearest Centroid ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
from sklearn.neighbors import NearestCentroid  # modern import path

clf = NearestCentroid()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(clf.predict(X_test))
print(clf.score(X_test, y_test))
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
conf = confusion_matrix(y_test, y_pred)
print("Nearest Centroid", acc, prec, rec, conf)

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++ EM +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=2)
gmm.fit(X_train)
# print(gmm.means_)
# print(gmm.covariances_)
                       hidden_layer_sizes=(3, 3), random_state=1)
model3 = model3.fit(textBowTrain, y_train)
model3.score(textBowTest, y_test)
predictions3 = model3.predict(textBowTest)
# get the metrics for the model
print(classification_report(y_test, predictions3))
predictions3 = pd.DataFrame(predictions3)
predictions3.to_csv('predictmodelMLP.csv', index=False)

# Nearest centroid (labelled "K-NN" in the original)
from sklearn.neighbors import NearestCentroid

# build the learning model
model4 = NearestCentroid()
model4 = model4.fit(textBowTrain, y_train)
model4.score(textBowTest, y_test)
predictions4 = model4.predict(textBowTest)
print(classification_report(y_test, predictions4))
predictions4 = pd.DataFrame(predictions4)
predictions4.to_csv('predictmodelKNN.csv', index=False)

# build the confusion matrix for each model
predictions = [predictions1, predictions2, predictions3, predictions4]
for i in range(len(predictions)):
    cm = confusion_matrix(y_test, predictions[i])
    print(cm)
    classes = [0, 1, 2]
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("title")
# Run the random forest classifier with 10, 100 and 500 trees, 10 times each
for j in 10, 100, 500:
    print("Number of trees: " + str(j))
    soma = 0
    for i in range(0, 10):
        clf = RandomForestClassifier(n_estimators=j)
        clf = clf.fit(treino_caracteristicas, treino_rotulos)
        print(clf.score(teste_caracteristicas, teste_rotulos))
        soma += clf.score(teste_caracteristicas, teste_rotulos)
    media = soma / 10
    print("Mean: " + str(media))

# feature importances come from the last forest fitted above
featureImp = clf.feature_importances_
print(featureImp)
print("odor position: " + str(featureImp[4]))

clf2 = svm.SVC()
clf2.fit(treino_caracteristicas, treino_rotulos)
print("SVM")
print(clf2.score(teste_caracteristicas, teste_rotulos))
# print(clf2.support_vectors_)

clf3 = NearestCentroid()
clf3.fit(treino_caracteristicas, treino_rotulos)
print("Nearest centroid")
print(clf3.score(teste_caracteristicas, teste_rotulos))
# print(clf3.centroids_)
# rnc1.fit(xtrain, ytrain1)
# print(rnc1.score(xtest, ytest1))

# In[ ]:

get_ipython().magic(u'whos')

# In[17]:

# Nearest centroid
from sklearn.neighbors import NearestCentroid  # modern import path

ncc1 = NearestCentroid()
ncc1.fit(xtrain, ytrain1)
print(ncc1.score(xtest, ytest1))

# In[18]:

# Nearest shrunken centroid
for shrinkage in [None, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
    ncc2 = NearestCentroid(shrink_threshold=shrinkage)
    ncc2.fit(xtrain, ytrain1)
    print(ncc2.score(xtest, ytest1))

# In[19]:

# linear discriminant analysis - classifier with linear decision boundary
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as linda