def nearest_centroid(input_file,Output): lvltrace.lvltrace("LVLEntree dans nearest_centroid") ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] n_samples, n_features = X.shape clf = NearestCentroid() clf.fit(X,y) y_pred = clf.predict(X) print "#########################################################################################################\n" print "Nearest Centroid Classifier " print "classification accuracy:", metrics.accuracy_score(y, y_pred) print "precision:", metrics.precision_score(y, y_pred) print "recall:", metrics.recall_score(y, y_pred) print "f1 score:", metrics.f1_score(y, y_pred) print "\n" print "#########################################################################################################\n" results = Output+"Nearest_Centroid_metrics.txt" file = open(results, "w") file.write("Nearest Centroid Classifier estimator accuracy\n") file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred)) file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred)) file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred)) file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred)) file.write("\n") file.write("True Value, Predicted Value, Iteration\n") for n in xrange(len(y)): file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1))) file.close() title = "Nearest Centroid Classifier" save = Output + "Nearest_Centroid_Classifier_confusion_matrix.png" plot_confusion_matrix(y, y_pred,title,save) lvltrace.lvltrace("LVLSortie dans nearest_centroid")
def nearest_centroid(input_file,Output,test_size): ncol=tools.file_col_coma(input_file) data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1)) X = data[:,1:] y = data[:,0] n_samples, n_features = X.shape X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size) print X_train.shape, X_test.shape clf = NearestCentroid() clf.fit(X_train,y_train) y_pred = clf.predict(X_test) print "Nearest Centroid Classifier " print "classification accuracy:", metrics.accuracy_score(y_test, y_pred) print "precision:", metrics.precision_score(y_test, y_pred) print "recall:", metrics.recall_score(y_test, y_pred) print "f1 score:", metrics.f1_score(y_test, y_pred) print "\n" results = Output+"Nearest_Centroid_metrics_test.txt" file = open(results, "w") file.write("Nearest Centroid Classifier estimator accuracy\n") file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred)) file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred)) file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred)) file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred)) file.write("\n") file.write("True Value, Predicted Value, Iteration\n") for n in xrange(len(y_test)): file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1))) file.close() title = "Nearest Centroid %f"%test_size save = Output + "Nearest_Centroid_confusion_matrix"+"_%s.png"%test_size plot_confusion_matrix(y_test, y_pred,title,save) lvltrace.lvltrace("LVLSortie dans stochasticGD split_test")
def _clustering(self, targetgame, games):
    '''
    Find similar games with clustering.

    Every game in ``games`` becomes its own class (label = position in the
    list) and is described by the second field of each entry of its
    ``data``.  The label predicted for ``targetgame`` is printed; nothing is
    returned.

    TODO: still experimental - the result is only printed.
    '''
    preparegames = [[entry[1] for entry in game.data] for game in games]
    preparegame = [entry[1] for entry in targetgame.data]
    labels = list(range(len(games)))
    clf = NearestCentroid()
    clf.fit(preparegames, labels)
    # predict() takes a 2-D (n_samples, n_features) array, so the single
    # target sample is wrapped in a list instead of being passed as 1-D.
    print(clf.predict([preparegame]))
def NC_select_cv(X, Y, num_features):
    """Return the mean 10-fold accuracy of NearestCentroid on selected features.

    In every stratified fold, ``fitRF`` is run on the training part; the
    column order it returns is applied to the test part, and only the first
    ``num_features`` columns of both parts are given to the classifier.

    Args:
        X: feature array indexable by the fold index arrays.
        Y: label array used for stratification and scoring.
        num_features: number of leading columns kept after ``fitRF``.

    Returns:
        The mean of the per-fold ``score`` values.
    """
    fold_scores = []
    folds = cross_validation.StratifiedKFold(Y, n_folds=10)
    for train_idx, test_idx in folds:
        fold_X_train, fold_y_train = X[train_idx], Y[train_idx]
        fold_X_test, fold_y_test = X[test_idx], Y[test_idx]
        # fitRF returns the reordered training matrix and the column order.
        ranked_train, _imp, order, _std = fitRF(fold_X_train, fold_y_train, est=2000)
        # Apply the same column order to the held-out samples.
        ranked_test = fold_X_test[:, order]
        model = NearestCentroid()
        model.fit(ranked_train[:, :num_features], fold_y_train)
        fold_scores.append(model.score(ranked_test[:, :num_features], fold_y_test))
    return np.mean(fold_scores)
def _nebulosa_b_split(dataset):
    """Return ``(features, target)`` for a raw Nebulosa array, as itemB uses them.

    Rows containing any NaN are dropped and the first two columns are
    discarded; the last remaining column is the target and the features are
    all remaining columns except the last two.
    """
    dataset = dataset[~np.isnan(dataset).any(axis=1)]
    dataset = dataset[:, 2:]
    # NOTE(review): ":-2" also drops the column just before the target -
    # confirm that column is meant to be excluded from the features.
    return dataset[:, :-2], dataset[:, -1]


def itemB():
    """Compare 1-NN and nearest centroid on Nebulosa using the 3 best features.

    Rows with missing values are removed from both sets, the three features
    with the best ``f_classif`` score on the training set are kept, and the
    test accuracy of both classifiers is printed.
    """
    train_dataset, train_target = _nebulosa_b_split(load_nebulosa_train())
    test_dataset, test_target = _nebulosa_b_split(load_nebulosa_test())

    # The selector is fitted on the training set only and reused for the test set.
    kbest = SelectKBest(f_classif, k=3).fit(train_dataset, train_target)
    train_dataset = kbest.transform(train_dataset)
    test_dataset = kbest.transform(test_dataset)

    nn = KNeighborsClassifier(n_neighbors=1)
    nn.fit(train_dataset, train_target)
    nn_target_pred_test = nn.predict(test_dataset)
    nn_accuracy_test = accuracy_score(test_target, nn_target_pred_test)
    print("NN: Acurácia (Teste): %.2f" % (nn_accuracy_test))

    nc = NearestCentroid(metric="euclidean")
    nc.fit(train_dataset, train_target)
    nc_target_pred_test = nc.predict(test_dataset)
    nc_accuracy_test = accuracy_score(test_target, nc_target_pred_test)
    print("Rocchio: Acurácia (Teste): %.2f" % (nc_accuracy_test))
def fit_dev(self,net_data,nnet_cluster='auto',nSubtypes=3): self.nnet_cluster = nnet_cluster self.nSubtypes = nSubtypes if nnet_cluster == 'auto': #self.nnet_cluster = self.getClusters(net_data) self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data,nnet_cluster,algo='meanshift') else: self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data,nnet_cluster,algo='kmeans') #self.valid_cluster = self.clust_list #self.valid_net_idx = range(len(self.valid_cluster)) for i in range(net_data.shape[0]): if i == 0 : self.assign_net = self.assigneDist(net_data[i,:,:],self.valid_cluster, self.valid_net_idx) else: self.assign_net = np.vstack(((self.assign_net,self.assigneDist(net_data[i,:,:],self.valid_cluster, self.valid_net_idx)))) print 'Size of the new data map: ',self.assign_net.shape # group subjects with the most network classifing them together # compute the consensus clustering self.consensus = cls.hclustering(self.assign_net,self.nSubtypes) # save the centroids in a method self.clf_subtypes = NearestCentroid() self.clf_subtypes.fit(self.assign_net,self.consensus) self.consensus = self.clf_subtypes.predict(self.assign_net) #print "score: ", self.clf_subtypes.score(self.assign_net,self.consensus) return self.consensus
def _nebulosa_a_split(dataset):
    """Return ``(features, target)`` for a raw Nebulosa array, as itemA uses them.

    The last column is the target.  Rows whose target is NaN are removed;
    NaNs left in the features are replaced through ``np.nan_to_num``.
    """
    target = dataset[:, -1]
    features = dataset[:, :-1]
    missing_target = np.where(np.isnan(target))
    target = np.delete(target, missing_target)
    features = np.delete(features, missing_target, 0)
    return np.nan_to_num(features), target


def itemA():
    """Compare 1-NN and nearest centroid on Nebulosa with NaNs zero-filled.

    Prints the size of both sets and the test accuracy of each classifier.
    """
    train_dataset, train_target = _nebulosa_a_split(load_nebulosa_train())
    test_dataset, test_target = _nebulosa_a_split(load_nebulosa_test())

    n_train_samples, n_train_features = train_dataset.shape
    print("Nebulosa Train dataset: %d amostras(%d características)" % (n_train_samples, n_train_features))
    n_test_samples, n_test_features = test_dataset.shape
    print("Nebulosa Test dataset: %d amostras(%d características)" % (n_test_samples, n_test_features))

    nn = KNeighborsClassifier(n_neighbors=1)
    nn.fit(train_dataset, train_target)
    nn_target_pred_test = nn.predict(test_dataset)
    nn_accuracy_test = accuracy_score(test_target, nn_target_pred_test)
    print("NN: Acurácia (Teste): %.2f" % (nn_accuracy_test))

    nc = NearestCentroid(metric="euclidean")
    nc.fit(train_dataset, train_target)
    nc_target_pred_test = nc.predict(test_dataset)
    nc_accuracy_test = accuracy_score(test_target, nc_target_pred_test)
    print("Rocchio: Acurácia (Teste): %.2f" % (nc_accuracy_test))
def __init__(self): x = [] y = [] small = False #clf = svm.LinearSVC() self.clf = NearestCentroid() folder = "gyro_side\\" files = ['still.csv', 'yes.csv', 'no.csv'] for i in range(3): f =open(folder+files[i], 'r') for line in f.readlines(): #print line line = [int(a) for a in line.split(',')] lines = [self.removeMag(line[9*j:9*j+9]) for j in range(9)] # smallLine=[] # for j in range(5): # smallLine = smallLine + line[6*j:6*j+3] # if small: # line=smallLine # if len(x)==0: # x= np.array(np.array([line])) # else: # x=np.append(x,np.array([line]), axis=0) # #print np.shape(x) x += [reduce(lambda x,y: x+y, lines[:5], [])] y += [i] x += [reduce(lambda x,y: x+y, lines[4:], [])] y += [i] try: z=1 except Exception as e: #print e print i, line #z=1/0 f.close() x= np.array([np.array(z) for z in x]) y = np.array(y) print y print np.shape(y) print np.shape(x) print type(x[0]), np.array(x[0]) self.clf.fit(x,y) self.data = [] #self.ser = serial.Serial('COM3', 9600) print "Classifier trained"
def loadData(self, ressourcepath1, ressourcepath2):
    """Fit ``self.clf`` on distance features of two folders of WAV files.

    Every file of ``ressourcepath1`` is a sample of class 1 and every file
    of ``ressourcepath2`` a sample of class 2.  A sample is described by its
    distances to ``self.word1`` and ``self.word2``.

    Args:
        ressourcepath1: directory holding the class-1 recordings.
        ressourcepath2: directory holding the class-2 recordings.
    """
    print(ressourcepath1)
    # Both directories are listed before any file is read.  os.path.join
    # also works when the directory is given without a trailing separator.
    labelled_paths = [
        (label, os.path.join(folder, fname))
        for label, folder in ((1, ressourcepath1), (2, ressourcepath2))
        for fname in os.listdir(folder)
    ]
    # Collect the rows in lists and convert once instead of re-stacking the
    # whole array for every file.
    X = np.array([self._word_distances(path) for _, path in labelled_paths])
    y = np.array([label for label, _ in labelled_paths])

    # Public import path; the sklearn.neighbors.nearest_centroid submodule
    # is no longer importable in current scikit-learn releases.
    from sklearn.neighbors import NearestCentroid
    self.clf = NearestCentroid()
    self.clf.fit(X, y)


def _word_distances(self, path):
    """Return the two distance features ``[to word1, to word2]`` of one WAV file."""
    _rate, wave = wavfile.read(path)
    wave = self.scaler(wave)
    # Keep only the span between the detected start and end points.
    wave = wave[self.get_startingpoint(wave):self.get_endingpoint(wave), :]
    return [self.euclidean_distance(self.word1, wave),
            self.euclidean_distance(self.word2, wave)]
col_input=['genre', 'year', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8', 'col9', 'col10', 'col11', 'col12', 'col13', 'col14', 'col15', 'col16', 'col17', 'col18', 'col19', 'col20', 'col21', 'col22', 'col23', 'col24', 'col25', 'col26', 'col27', 'col28', 'col29', 'col30', 'col31', 'col32', 'col33', 'col34', 'col35', 'col36', 'col37', 'col38', 'col39', 'col40', 'col41', 'col42', 'col43', 'col44', 'col45', 'col46', 'col47', 'col48', 'col49', 'col50', 'col51', 'col52', 'col53', 'col54', 'col55', 'col56', 'col57', 'col58', 'col59', 'col60', 'col61', 'col62', 'col63', 'col64', 'col65', 'col66', 'col67', 'col68', 'col69', 'col70', 'col71', 'col72'] df_input = pandas.read_csv('pandas_output_missing_data_fixed.csv', header=None, delimiter = ",", names=col_input) # range(2,74) means its goes from col 2 to col 73 df_input_data = df_input[list(range(2,74))].as_matrix() # test with few good features as determined through PCA? df_input_target = df_input[list(range(0,1))].as_matrix() colors = numpy.random.rand(len(df_input_target)) # splitting the data into training and testing sets from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(df_input_data, df_input_target.tolist()) # k-NN from sklearn.neighbors.nearest_centroid import NearestCentroid knc = NearestCentroid() knc.fit(X_train[:],numpy.ravel(y_train[:])) predicted = knc.predict(X_test) print y_test[60:90] , len(y_test[60:90]) print predicted[60:90] , len(predicted[60:90]) print knc.classes_ # Prediction Performance Measurement matches = (predicted == [item for sublist in y_test for item in sublist]) print matches.sum() print len(matches) print matches[10:50], len(matches[10:50])
def train(self):
    """Fit a NearestCentroid classifier on the first half of ``self.X``/``self.Y``.

    The fitted estimator returned by ``fit`` is stored in ``self.fit``.
    """
    clf = NearestCentroid()
    # Floor division keeps the slice bound an integer on Python 3 as well
    # (plain "/" yields a float there, which cannot be used as a slice index).
    half = len(self.X) // 2
    self.fit = clf.fit(self.X[0:half], self.Y[0:half])
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast") plt.scatter(grade_slow, bumpy_slow, color="r", label="slow") plt.legend() plt.xlabel("bumpiness") plt.ylabel("grade") plt.show() ################################################################################ ### your code here! name your classifier object clf if you want the ### visualization code (prettyPicture) to show you the decision boundary # Choose a smaller dataset features_train = features_train[:len(features_train) / 100] labels_train = labels_train[:len(labels_train) / 100] clf = NearestCentroid() t0 = time() clf = clf.fit(features_train, labels_train) print "Training time:", round(time() - t0, 2), "s" accurary = clf.score(features_test, labels_test) t1 = time() pred = clf.predict(features_test) print "Predicting time:", round(time() - t1, 2), "s" acc = accuracy_score(pred, labels_test) print "Accuracy:", acc try: prettyPicture(clf, features_test, labels_test) except NameError:
svm_model = SVC(kernel="rbf", probability=True, max_iter=10000) svm_model.fit(X_cropped, y_cropped) y_train_predicted = svm_model.predict(X_train) print "SVM Error rate on training data (t1): ", ml_aux.get_error_rate(y_train, y_train_predicted) # ml_aux.plot_confusion_matrix(y_train, y_train_predicted, "CM SVM Training (t1)") # plt.show() y_validation_predicted = svm_model.predict(X_validation) print "SVM Error rate on validation (t1): ", ml_aux.get_error_rate(y_validation, y_validation_predicted) # Start k nearest Centroid Classification print "Performing kNC Classification:" from sklearn.neighbors.nearest_centroid import NearestCentroid knnc_model = NearestCentroid() knnc_model.fit(X_cropped, y_cropped) y_validation_predicted = knnc_model.predict(X_validation) print "Error Rate on kNNC (t1) Validation: ", ml_aux.get_error_rate(y_validation, y_validation_predicted) # Start Bagging Classification print "Performing Bagging Classification:" # Bagging from sklearn.ensemble import BaggingClassifier from sklearn.neighbors import KNeighborsClassifier # Bagging bagging1 = BaggingClassifier(KNeighborsClassifier(n_neighbors=2), max_samples=1.0, max_features=0.1) bagging1.fit(X_cropped, y_cropped) y_validation_predicted = bagging1.predict(X_validation) print "Error Rate kNN with Baggging Validation: ", ml_aux.get_error_rate(y_validation, y_validation_predicted)
# SGD classifier - gives about 73% accuracy
cl4 = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    l1_ratio=0.15,
    fit_intercept=True,
    n_iter=5,
    shuffle=True,
    verbose=0,
    epsilon=0.1,
    n_jobs=1,
    random_state=None,
    learning_rate='optimal',
    eta0=0.0,
    power_t=0.5,
    class_weight=None,
    warm_start=False,
    average=False,
)
cl4.fit(X_train, target)
pr4 = cl4.predict(X_test)
allpred += pr4
print("SGD: %.2f%%" % evaluate(pr4, test_jokes))

# Nearest centroid classifier (reported under the label "KNN") - gives about
# 59% accuracy.  Its predictions are not added to the vote in allpred.
cl5 = NearestCentroid()
cl5.fit(X_train, target)
pr5 = cl5.predict(X_test)
print("KNN: %.2f%%" % evaluate(pr5, test_jokes))

# Decision tree classifier - gives about 75% accuracy
cl6 = tree.DecisionTreeClassifier()
cl6.fit(X_train, target)
pr6 = cl6.predict(X_test)
allpred += pr6
print("Decision tree: %.2f%%" % evaluate(pr6, test_jokes))

# Final vote: 1 where the summed predictions exceed half of the largest sum.
maxpred = max(allpred)
pr7 = []
for x in allpred:
    if x > maxpred / 2:
        pr7.append(1)
    else:
        pr7.append(0)
def feature_redux_and_classify(df, target, selection, reduction, classifier, n_features, n_reduction=None):
    """Select features, optionally reduce dimensions, and cross-validate a classifier.

    Feature selection is applied once to the whole of ``df`` before cross
    validation; the optional reduction step and the classifier are chained
    in a ``Pipeline`` that is evaluated with a 20-fold stratified split.

    Args:
        df: feature table (column-indexable, e.g. a DataFrame).
        target: class labels aligned with the rows of ``df``.
        selection: "Kruskal-Wallis", "ROC", "K-Best" or "RFE"; any other
            value skips feature selection.
        reduction: "PCA" or "LDA"; any other value skips the reduction.
        classifier: "Euclidean", "Mahalanobis", "Bayes", "K-Nearest", "SVC"
            or "Parzen Window"; any other value selects Fisher LDA.
        n_features: number of features kept by the selection step.
        n_reduction: number of PCA components; defaults to ``n_features``.

    Returns:
        The dictionary returned by ``cross_validate`` with accuracy,
        precision, recall and f1 scorers.
    """
    # 1 - Feature selection
    if n_reduction is None:
        n_reduction = n_features
    sequence = []
    if selection == "Kruskal-Wallis":
        # 1.1 - Kruskal: rank the columns by their H statistic.
        kruskal_stats = []
        for column in df:
            stats, _ = ss.kruskal(df[column], target)
            kruskal_stats.append((column, stats))
        kruskal_stats.sort(key=lambda x: x[1], reverse=True)
        selected_columns = [kruskal_stats[i][0] for i in range(n_features)]
        df = df[selected_columns]
    elif selection == "ROC":
        # 1.2 - ROC: rank the columns by the AUC of a one-feature logistic
        # regression (computed on its predicted labels).
        roc_values = []
        for column in df:
            est = LogisticRegression(solver='liblinear', class_weight='balanced')
            est.fit(df[column].to_frame(), target)
            roc_values.append((column, roc_auc_score(target, est.predict(df[column].to_frame()))))
        roc_values.sort(key=lambda x: x[1], reverse=True)
        selected_columns = [roc_values[i][0] for i in range(n_features)]
        df = df[selected_columns]
    elif selection == "K-Best":
        skb = SelectKBest(k=n_features, score_func=mutual_info_classif)
        df = skb.fit_transform(df, target)
    elif selection == "RFE":
        estimator = LogisticRegression(solver='liblinear', class_weight='balanced')
        # n_features_to_select is passed by keyword: current scikit-learn
        # releases no longer accept it positionally.
        rfe = RFE(estimator, n_features_to_select=n_features)
        df = rfe.fit_transform(df, target)

    # 2 - Dimension reduction
    if reduction == "PCA":
        sequence.append(('PCA', PCA(n_components=n_reduction)))
    elif reduction == "LDA":
        sequence.append(('LDARed', LinearDiscriminantAnalysis()))

    # 3 - Classifiers
    if classifier == "Euclidean":
        sequence.append(('Euclidean', NearestCentroid(metric='euclidean', shrink_threshold=None)))
    elif classifier == "Mahalanobis":
        # NOTE(review): confirm that the installed scikit-learn accepts
        # metric='mahalanobis' for NearestCentroid.
        sequence.append(('Mahalanobis', NearestCentroid(metric='mahalanobis', shrink_threshold=None)))
    elif classifier == "Bayes":
        # Naive Gaussian Bayes
        sequence.append(('Bayes', GaussianNB()))
    elif classifier == "K-Nearest":
        # K-Nearest Neighbors
        sequence.append(('K-Nearest', KNeighborsClassifier(n_neighbors=5)))
    elif classifier == "SVC":
        sequence.append(('SVC', SVC(gamma='auto')))
    elif classifier == "Parzen Window":
        # Parzen (Kernel Density Estimation)
        sequence.append(('Parzen', KDEClassifier(kernel='gaussian', bandwidth=1)))
    else:
        # Fisher LDA is the fallback for any other classifier name.
        sequence.append(('LDAClass', LinearDiscriminantAnalysis()))

    pipe = Pipeline(sequence)
    kfold = StratifiedKFold(n_splits=20, shuffle=True, random_state=10)
    scoring = {'accuracy': make_scorer(accuracy_score),
               'precision': make_scorer(precision_score),
               'recall': make_scorer(recall_score),
               'f1_score': make_scorer(f1_score)}
    cv_results = cross_validate(pipe, df, target, cv=kfold, scoring=scoring)
    return cv_results
#performance: Euclidean, Cosine, or Manhattan. In `scikit-learn` you can see the
#documentation for NearestCentroid here:
#- http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestCentroid.html#sklearn.neighbors.NearestCentroid
#
#and for supported distance metrics here:
#- http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html#sklearn.metrics.pairwise.distance_metrics

#%%
# Public import path; the sklearn.neighbors.nearest_centroid submodule is no
# longer importable in current scikit-learn releases.
from sklearn.neighbors import NearestCentroid

# the parameters for the nearest centroid metric to test are:
#    l1, l2, and cosine (all are optimized)
# fill in the training and testing data and save as separate variables

# The best metric is taken from the measured accuracies instead of being
# hard-coded; on a tie the metric tested first is kept.
p = None
best_acc = None
for d in ['l1', 'l2', 'cosine', 'euclidean', 'manhattan']:
    clf = NearestCentroid(metric=d)
    clf.fit(X_train, y_train)
    yhat = clf.predict(X_test)
    acc = accuracy_score(y_test, yhat)
    print(d, acc)
    if best_acc is None or acc > best_acc:
        best_acc = acc
        p = d

print('The best distance metric is: ', p)

#%% [markdown]
# ___
# <a id="naive"></a> <a href="#top">Back to Top</a>
# ## Naive Bayes Classification
# Now let's look at the use of the Naive Bayes classifier. The 20 newsgroups
# dataset has 20 classes and about 130,000 features per instance. Recall that
# the Naive Bayes classifier calculates a posterior distribution for each
# Hold out 25% of the samples; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    mdata, mlabels, test_size=0.25, random_state=55)

print('X_train dimensions: ', X_train.shape)
print('y_train dimensions: ', y_train.shape)
print('X_test dimensions: ', X_test.shape)
print('y_test dimensions: ', y_test.shape)

# Three candidate models are listed below; keep exactly one "model = ..."
# line active and leave the others commented out.
neigh = KNeighborsClassifier(n_neighbors=3)
#model = neigh.fit(X_train,y_train.ravel())
#model = GaussianNB().fit(X_train, y_train.ravel())
model = NearestCentroid().fit(X_train, y_train.ravel())

# Predict the training samples themselves
y_train_pred = model.predict(X_train)

# Show the training predictions next to the training ground truth
print("Training Data prediction: \n", y_train_pred)
print("Training Data ground truth: \n", y_train.ravel())

# Confusion matrix and accuracy (in percent) on the training set
matrix = metrics.confusion_matrix(y_train, y_train_pred)
accuracy = 100 * accuracy_score(y_train, y_train_pred)
print("Accuracy for training dataset: ", accuracy, "%")

# Plot the confusion matrix
plt.matshow(matrix)
def rocchio_clas(X_train, y_train, X_test):
    """Return nearest-centroid (Rocchio) predictions for ``X_test``.

    Args:
        X_train: training samples.
        y_train: training labels aligned with ``X_train``.
        X_test: samples to classify.

    Returns:
        The labels predicted for ``X_test`` by a default ``NearestCentroid``
        fitted on the training data.
    """
    # Public import path; the sklearn.neighbors.nearest_centroid submodule
    # is no longer importable in current scikit-learn releases.
    from sklearn.neighbors import NearestCentroid

    model = NearestCentroid()
    model.fit(X_train, y_train)
    return model.predict(X_test)
class RocchioClassifier(NLTKClassifier):
    """NLTKClassifier variant backed by scikit-learn's NearestCentroid."""

    # Built once, when the class body is executed: an NLTK SklearnClassifier
    # wrapping a default NearestCentroid estimator.
    # NOTE(review): presumably NLTKClassifier reads ``nltk_class`` to obtain
    # the classifier to train - confirm against the base class.
    nltk_class = nltk.classify.SklearnClassifier(NearestCentroid())
# NOTE(review): the two appends below look like the body of a row loop whose
# header is outside the reviewed text - confirm their indentation in context.
Y.append(row[0])
X.append(row[1:])

# close CSV file
genderFile.close()

# convert the first three values of every sample from strings to numbers
X_len = len(X)
for row in range(X_len):
    for column in (0, 1, 2):
        X[row][column] = float(X[row][column])

# initialize the classifiers and train them on the data set
# (fit returns the estimator itself)
clf_LinearSVC = svm.LinearSVC().fit(X, Y)
clf_NearestCentroid = NearestCentroid().fit(X, Y)
clf_SVC = svm.SVC().fit(X, Y)

# test the classifiers on the same data set (accuracy in percent)
acc_LinearSVC = accuracy_score(Y, clf_LinearSVC.predict(X)) * 100.0
acc_NearestCentroid = accuracy_score(Y, clf_NearestCentroid.predict(X)) * 100.0
acc_SVC = accuracy_score(Y, clf_SVC.predict(X)) * 100.0

# identify the best classifier: position of the highest accuracy
index = np.argmax([acc_LinearSVC, acc_NearestCentroid, acc_SVC])
classifiers = {0: 'LinearSVC', 1: 'NearestCentroid', 2: 'SVC'}
# NOTE(review): sklearn.cross_validation and the KFold(n, n_folds=...) call
# below belong to old scikit-learn releases - confirm the pinned version.
from sklearn.cross_validation import KFold

# Read the input file and convert it to a matrix.  header=None states that
# the file has no header row (negative header values are rejected by current
# pandas) and .values replaces the removed DataFrame.as_matrix().
frame = pd.read_csv('ATNTFaceImages400.txt', header=None)
data = frame.values
print(data.shape)

# Split features and labels: row 0 holds the labels, the remaining rows hold
# the features; transpose so that every sample becomes a row.
X = np.transpose(data[1:, :])
y = np.transpose(data[0, :])
print(X)
print(y)

# Split the data for 5-fold cross validation.
kf = KFold(len(y), n_folds=5, shuffle=True)
print(kf)

# Train and evaluate one nearest-centroid classifier per fold.
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Centroid classification: train on the training part of the fold ...
    centroid_classifier = NearestCentroid()
    centroid_classifier.fit(X_train, y_train)
    # ... and predict the labels of the held-out part.
    predictions = centroid_classifier.predict(X_test)

    # Accuracy between actual and predicted labels, in percent.
    actual = y_test
    accuracy = metrics.accuracy_score(actual, predictions) * 100
    print("Accuracy is: ", accuracy)
def stacking(clf, train_x, train_y, test_x, clf_name, class_num=1):
    """Build out-of-fold and test-set predictions for one base model.

    For every (train, validation) index pair yielded by the module-level
    ``kf``, the model selected by ``clf_name`` is trained on the training
    part; its validation predictions fill the out-of-fold matrix and its
    predictions for ``test_x`` are kept and averaged over the folds.

    Args:
        clf: object exposing ``Dataset`` and ``train``; only used when
            ``clf_name == "lgb"`` (presumably the lightgbm module - confirm).
        train_x: training features, indexable by the fold index arrays.
        train_y: training targets, indexable the same way.
        test_x: features of the set to predict.
        clf_name: "lgb", "lr", "svm", "ridge", "roc" or "ftrl".
        class_num: number of prediction columns per sample.

    Returns:
        ``(train, test, mean_score)``: the out-of-fold predictions, the
        fold-averaged test predictions (both with ``class_num`` columns) and
        the mean log-loss over the folds.

    Raises:
        ValueError: if ``clf_name`` is not a supported model name.

    Side effects:
        Prints the running scores and appends them to ``score_cv.txt``.
    """
    supported = ("lgb", "lr", "svm", "ridge", "roc", "ftrl")
    if clf_name not in supported:
        # Fail early: no branch below would define the predictions.
        raise ValueError("unknown clf_name %r, expected one of %s"
                         % (clf_name, ", ".join(supported)))

    train = np.zeros((train_x.shape[0], class_num))
    test = np.zeros((test_x.shape[0], class_num))
    test_pre = np.zeros((folds, test_x.shape[0], class_num))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.5,
                'colsample_bylevel': 0.5,
                'learning_rate': 0.1,
                'scale_pos_weight': 20,
                'seed': 2018,
                'nthread': 16,
                'num_class': class_num,
                'silent': True,
            }
            num_round = 2000
            early_stopping_rounds = 100
            model = clf.train(params, train_matrix, num_round,
                              valid_sets=test_matrix,
                              early_stopping_rounds=early_stopping_rounds)
            pre = model.predict(te_x, num_iteration=model.best_iteration).reshape(
                (te_x.shape[0], class_num))
            pred = model.predict(test_x, num_iteration=model.best_iteration).reshape(
                (test_x.shape[0], class_num))
        elif clf_name == "lr":
            model = LogisticRegression(C=4, dual=False)
            model.fit(tr_x, tr_y)
            pre = model.predict_proba(te_x)
            pred = model.predict_proba(test_x)
        elif clf_name == "svm":
            model = svm.LinearSVC()
            model.fit(tr_x, tr_y)
            pre = model.decision_function(te_x)
            pred = model.decision_function(test_x)
        elif clf_name == "ridge":
            model = Ridge(alpha=20, copy_X=True, fit_intercept=True,
                          solver='auto', max_iter=100, normalize=False,
                          random_state=0, tol=0.0025)
            model.fit(tr_x, tr_y)
            pre = model.predict(te_x)
            pred = model.predict(test_x)
        elif clf_name == "roc":
            model = NearestCentroid()
            model.fit(tr_x, tr_y)
            pre = model.predict(te_x)
            pred = model.predict(test_x)
        else:  # "ftrl"
            model = FM_FTRL(
                alpha=0.02,
                beta=0.01,
                L1=0.00001,
                L2=30.0,
                D=tr_x.shape[1],
                alpha_fm=0.1,
                L2_fm=0.5,
                init_fm=0.01,
                weight_fm=50.0,
                D_fm=200,
                e_noise=0.0,
                iters=3,
                inv_link="identity",
                threads=15,
            )
            model.fit(tr_x, tr_y)
            pre = model.predict(te_x)
            pred = model.predict(test_x)

        # Predictions are stored with class_num columns; reshaping to a
        # single column only fitted the class_num == 1 case.
        train[test_index] = pre.reshape((-1, class_num))
        test_pre[i, :] = pred.reshape((-1, class_num))
        cv_scores.append(log_loss(te_y, pre))

        print("%s now score is:" % clf_name, cv_scores)

    # Average the test predictions over the folds.
    test[:] = test_pre.mean(axis=0)
    with open("score_cv.txt", "a") as f:
        f.write("%s now score is:" % clf_name + str(cv_scores) + "\n")
        f.write("%s_score_mean:" % clf_name + str(np.mean(cv_scores)) + "\n")
    return train.reshape(-1, class_num), test.reshape(
        -1, class_num), np.mean(cv_scores)
def _load_feature_columns(path, n_columns=16):
    """Load a whitespace-separated text file and return its first columns.

    Returns an array with one row per sample and ``n_columns`` columns
    (named RTP_0..RTP_4, RTT_01..RTT_34 and RTJ_0 by the original code).
    Fancy indexing is used so a file with fewer columns raises IndexError.
    """
    data = np.loadtxt(path)
    return data[:, np.arange(n_columns)]


def get_model(classifierType):
    """
    Train a classifier on the collected raw data, store it and evaluate it.

    The working directory is changed to ``data/``.  The model is fitted on
    ``TrainingInput.txt``/``TargetTraining.txt``, dumped to
    ``<classifierType>.pkl`` and evaluated on ``TestInput.txt``/
    ``TestTarget.txt``; a classification report and the confusion matrix
    are printed and the matrix is shown as an image.

    Args:
    ---------
        classifierType: 'SVM', 'NearestCentroid', 'KNN' or 'ANN'.  Any other
            value prints the possible options and exits the interpreter.

    Return:
    ---------
        The fitted classifier.
    """
    os.chdir('data/')

    if classifierType == 'SVM':
        # Create a classifier: a support vector classifier
        classifier = svm.SVC(
            gamma=0.001,
            kernel='linear'
        )
    elif classifierType == 'NearestCentroid':
        # Nearest Centroid Classifier
        classifier = NearestCentroid()
    elif classifierType == 'KNN':
        # KNeighborsClassifier
        classifier = KNeighborsClassifier(n_neighbors=23)
    elif classifierType == 'ANN':
        # Neural network
        classifier = MLPClassifier(
            hidden_layer_sizes=(50, 50),
            max_iter=100,
            alpha=1e-4,
            solver='sgd',
            verbose=10,
            tol=1e-4,
            random_state=1,
            learning_rate_init=.1
        )
    else:
        print("Possible options: SVM/ANN/KNN/NearestCentroid")
        exit()

    # Get set of training data collected from LeapMotion sensor.
    InputSamples = _load_feature_columns("TrainingInput.txt")
    print((InputSamples))

    dataTarget = np.loadtxt("TargetTraining.txt")
    dataTarget = dataTarget.T

    classifier.fit(InputSamples, dataTarget)

    # Storage model: the file is named after the classifier type, which was
    # validated above.
    joblib.dump(classifier, classifierType + '.pkl')

    # Predict the value of the set of symbols/gestures of the test set.
    InputSamplesTest = _load_feature_columns("TestInput.txt")

    dataTargetTest = np.loadtxt("TestTarget.txt")
    dataTargetTest = dataTargetTest.T

    expected = dataTargetTest
    predicted = classifier.predict(InputSamplesTest)

    print(
        "Classification report for classifier %s:\n%s\n"
        % (classifier, metrics.classification_report(expected, predicted))
    )
    print(
        "Confusion matrix:\n%s"
        % metrics.confusion_matrix(expected, predicted)
    )

    conf = metrics.confusion_matrix(expected, predicted)
    plt.imshow(conf, cmap='binary', interpolation='None')
    plt.show()

    # The function name and its documentation promise the trained model.
    return classifier
# Stratified, shuffled folds; the fixed seed makes the split reproducible.
kfold = StratifiedKFold(n_splits=number_splits, shuffle=True, random_state=seed)
# f is the column index used in the score arrays below.
# NOTE(review): f is not advanced in the visible part of the loop, and n is
# defined outside this fragment - confirm both against the full script.
f=0;
for train_index, test_index in kfold.split(data_opto_SOM,target):
    ## Opto SOM ##
    x_train, x_test = data_opto_SOM[train_index,:],data_opto_SOM[test_index,:]
    y_train,y_test = target[train_index],target[test_index]

    # Multinomial logistic regression; test accuracy stored in percent.
    mul_lr = LogisticRegression(multi_class='multinomial', solver='newton-cg',max_iter=max_i)
    mul_lr.fit(x_train, y_train)
    score_opto_SOM_LR[n,f] = mul_lr.score(x_test, y_test)*100
    print(mul_lr.score(x_test,y_test))

    # Nearest centroid classifier (its scores go to the *_NN array).
    clf = NearestCentroid(metric='euclidean',shrink_threshold=None)
    clf.fit(x_train,y_train)
    score_opto_SOM_NN[n,f] = clf.score(x_test,y_test)*100

    # Linear discriminant analysis.
    lda = LinearDiscriminantAnalysis(solver='svd')
    lda.fit(x_train,y_train)
    score_opto_SOM_LDA[n,f]=lda.score(x_test,y_test)*100
    print(lda.score(x_test,y_test))

    # Linear-kernel SVM with one-vs-one decision function.
    svm_algo = svm.SVC(decision_function_shape='ovo',kernel='linear')
    svm_algo.fit(x_train,y_train)
    score_opto_SOM_SVM[n,f]=svm_algo.score(x_test,y_test)*100

    ## Opto PV ##
    # Same fold indices, applied to the PV dataset.
    x_train, x_test = data_opto_PV[train_index,:],data_opto_PV[test_index,:]
    y_train,y_test = target[train_index],target[test_index]
# rnc1 = RadiusNeighborsClassifier()
# #default is r = 1.0
# rnc1.fit(xtrain,ytrain1)
# print (rnc1.score(xtest,ytest1))


# In[ ]:


get_ipython().magic(u'whos')


# In[17]:


# Nearest centroid
# Public import path; the sklearn.neighbors.nearest_centroid submodule is no
# longer importable in current scikit-learn releases.
from sklearn.neighbors import NearestCentroid
ncc1 = NearestCentroid()
ncc1.fit(xtrain,ytrain1)
print (ncc1.score(xtest,ytest1))


# In[18]:


# Nearest shrunken centroid: test accuracy for several shrink thresholds
# (None disables shrinkage).
for shrinkage in [None,0.05,0.1,0.2,0.3,0.4,0.5]:
    ncc2 = NearestCentroid(shrink_threshold = shrinkage)
    ncc2.fit(xtrain,ytrain1)
    print(ncc2.score(xtest,ytest1))


# In[19]:
def do_centroid():
    """Fit a default NearestCentroid on the module-level X, Y and test it.

    Returns:
        Whatever ``do_testcase`` returns for the fitted classifier.
    """
    centroid_model = NearestCentroid()
    centroid_model.fit(X, Y)
    outcome = do_testcase(centroid_model)
    return outcome
'mdl__metric': ['euclidean', 'manhattan'],
    'mdl__n_neighbors': [1, 10, 50, 100]
}

# NCentroid parameters: grid of distance metrics for the 'mdl' pipeline step.
NCentroidParameters = {'mdl__metric': ['euclidean', 'manhattan']}

# Each test case is [display name, estimator, parameter grid].
testCases = [['Linear kernel SVM',
              LinearSVC(), LinearParameters],
             ['RBF kernel SVM', SVC(), RbfParameters],
             ['Polynomial kernel SVM',
              SVC(), PolynomialParameters],
             ['K-NearestNeighbors',
              KNeighborsClassifier(), KNNParameters],
             ['NearestCentroid',
              NearestCentroid(), NCentroidParameters]]

for method, model, parameters in testCases:
    # Construct a pipeline: scaling, PCA (0.9 passed as n_components), then
    # the model under the step name 'mdl' used by the parameter grids.
    pipeline = Pipeline([('scaler', MinMaxScaler()), ('pca', PCA(0.9)),
                         ('mdl', model)])

    print()
    print(
        '----------------------------------------------------------------------------'
    )
    print('Tuning parameters to find the best accuracy using the %s.' % method)

    # Execute gridsearch.
    clf = GridSearchCV(pipeline,
test_classifier(clf, my_dataset, financial_features) from sklearn import tree clf1 = tree.DecisionTreeClassifier() test_classifier(clf1, my_dataset, financial_features) from sklearn.ensemble import AdaBoostClassifier # clf2 = AdaBoostClassifier() # test_classifier(clf2,my_dataset,financial_features) # from sklearn.neighbors import KNeighborsClassifier # clf3=KNeighborsClassifier(n_neighbors = 4) # test_classifier(clf3,my_dataset,financial_features) from sklearn.neighbors.nearest_centroid import NearestCentroid clf4 = NearestCentroid() test_classifier(clf4, my_dataset, financial_features) ### Task 5: Tune your classifier to achieve better than .3 precision and recall ### using our testing script. Check the tester.py script in the final project ### folder for details on the evaluation method, especially the test_classifier ### function. Because of the small size of the dataset, the script uses ### stratified shuffle split cross validation. For more info: ### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html # Example starting point. Try investigating other evaluation techniques! from sklearn.cross_validation import train_test_split features_train, features_test, labels_train, labels_test = \ train_test_split(features, labels, test_size=0.3, random_state=42) ''' OUR FINAL ALGORITHM
# Minimal NearestCentroid demo: fit two 3-point classes in 2-D and classify
# a single query point.
# NearestCentroid is imported from the public ``sklearn.neighbors`` package;
# the private ``sklearn.neighbors.nearest_centroid`` module path is not
# available in current scikit-learn releases.
from sklearn.neighbors import NearestCentroid
import numpy as np

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

clf = NearestCentroid()
clf.fit(X, y)

# predict() expects a 2-D (n_samples, n_features) input, hence the nested list.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(clf.predict([[-0.8, -1]]))
### Task 4: Try a varity of classifiers ### Please name your classifier clf for easy export below. ### Note that if you want to do PCA or other multi-stage operations, ### you'll need to use Pipelines. For more info: ### http://scikit-learn.org/stable/modules/pipeline.html # Provided to give you a starting point. Try a variety of classifiers. from sklearn.neighbors.nearest_centroid import NearestCentroid from sklearn.ensemble import AdaBoostClassifier from sklearn.feature_selection import SelectKBest, f_classif from sklearn.cross_validation import StratifiedShuffleSplit from sklearn.preprocessing import MinMaxScaler min_max_scaler = MinMaxScaler() nc = NearestCentroid() adc = AdaBoostClassifier() nc_report = {'accuracy': list(), 'precision': list(), 'recall': list()} adc_report = {'accuracy': list(), 'precision': list(), 'recall': list()} # this function is derived from tester.py def create_report(clf, features, labels): cv = StratifiedShuffleSplit(labels, 100, random_state=42) true_negatives = 0 false_negatives = 0 true_positives = 0 false_positives = 0 for train_idx, test_idx in cv: features_train = []
# the Y (expected output) converted into the 3 closest bass for each test input convertedy = MatchBasses(Y, convertedx) # the test output converted into just numbers to make it easier for knn convertedinput = convert_to_numbers(input_midifile) indexarray = [ ] # y array used for the predicted indexs of the convertedy array # loops through the number of indexs for the array and stores the each index numer in the indexarray for i in range(len(convertedy)): indexarray.append(i) deletewaste(convertedx, convertedy, indexarray) # does all the k nearest neighbor stuff neighbor = NearestCentroid() neighbor.fit(convertedx, indexarray) predictionsindex = [] # stores the prediction indexs in an array predictionsindex.append((neighbor.predict(convertedinput))) print(predictionsindex[0]) predictions = [] # this is where the real predictions are stored # loops through the prediction index array and stores the correct predictions in it (by using the indexs inside the convertedy array) for index in predictionsindex[0]: predictions.append(convertedy[index]) # this adds that track1 format stuff to the database that we are outputing
# Train a NearestCentroid classifier on PhishingData.txt and report the
# accuracy on a 30% hold-out split.
# NearestCentroid is imported from the public ``sklearn.neighbors`` package;
# the private ``sklearn.neighbors.nearest_centroid`` module path is not
# available in current scikit-learn releases.
from sklearn.neighbors import NearestCentroid
import numpy as np
from numpy import loadtxt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

data = loadtxt('PhishingData.txt', delimiter=",")

# split data into X and y: the first nine columns are used as features and
# the tenth column as the label
X = data[:, 0:9]
y = data[:, 9]

seed = 7
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed)

clf = NearestCentroid()
clf.fit(X_train, y_train)
print(clf)

y_pred = clf.predict(X_test)
# predict() returns class labels taken from y_train, so they are compared
# with y_test directly; no rounding is needed.
predictions = y_pred

# verify predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
return features[:MIN] if os.path.exists(os.path.join(here, 'X_train.csv')): print('loading X_train ....') X_train = pd.read_csv(os.path.join(here, 'X_train.csv'), index_col=0) print('shape of X_train', X_train.shape) else: print('making X_train from trainData ...') X_train = pd.DataFrame(index=trainData.index, data=trainData['time_series_file'].apply(featurize).tolist()) X_train.to_csv(os.path.join(here, 'X_train.csv')) print('shape of X_train', X_train.shape) from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors.nearest_centroid import NearestCentroid clf = clf = NearestCentroid(shrink_threshold=None) clf.fit(X_train, trainTargets.ravel()) # print('=========================================================') X_test = pd.DataFrame(index=testData.index, data=testData['time_series_file'].apply(featurize).tolist()) print('shape of X_test', X_test.shape) y_pred = clf.predict(X_test) y_truth = testTargets.ravel() from sklearn.metrics import accuracy_score, f1_score accuracy = accuracy_score(y_truth, y_pred) f1 = f1_score(y_truth, y_pred, average='macro') print('F1 (macro) score on test data', f1)
X_test_main = np.loadtxt("X_test.dat")
y_test = np.loadtxt("y_test.dat")
#####################
# Keep only the feature columns selected by ``column``.
X_test = np.array(X_test_main[:, column])

# Scale every sample (row) to unit L2 norm.
X_train = preprocessing.normalize(X_train, norm='l2')
X_test = preprocessing.normalize(X_test, norm='l2')

# Standardise features. The scaler is fitted on the training data only and
# the SAME mean/variance is then applied to the test data; refitting on the
# test set would put the two sets on different scales and leak test
# statistics into the evaluation.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
##########################
print("Train:", X_train.shape)
print("Test:", X_test.shape)

classifier = NearestCentroid()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion-matrix cells are read with the binary layout
# [[TN, FP], [FN, TP]] (rows = true class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred)
print(cm)
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
TP = cm[1, 1]

# True-positive rate (sensitivity) and true-negative rate (specificity).
TPR = (TP / (TP + FN))
print("TPR: {:0.2f}".format(TPR))
TNR = (TN / (TN + FP))
specific analysis or data in MNIST_NearestNeighborsCentroid.anls this code use NearestNeighborsCentroid method , results shows no specific advancement. with about 89% precision or so. ''' from sklearn.neighbors.nearest_centroid import NearestCentroid from sklearn import metrics import numpy import transform_data_to_format as tdtf #train_x , train_y = tdtf.read_data_to_ndarray("../data/train.csv",42000) #train_x , train_y = tdtf.read_data_to_ndarray("../data/train.csv",2100) #valid_x , valid_y = tdtf.read_data_to_ndarray("../data/valid.csv",21000) #test_x = tdtf.read_test_data_to_ndarray("../data/test.csv",28000); clf = NearestCentroid() clf.fit(train_x,train_y) #NearestCentroid(metric='euclidean', shrink_threshold=None) #pred_y = clf.predict(test_x) #pred_train_y = clf.predict(train_x[0:21000]) pred_valid_y = clf.predict(valid_x) #print pred_y #tdtf.write_to_csv(pred_y,"../data/MNIST_NearestNeighborsCentroid.out") #print("Classification report for classifier %s:\n%s\n" # % (clf , metrics.classification_report(train_y , pred_train_y ))) ''' print("Classification report for classifier %s:\n%s\n"
clf_ada = AdaBoostClassifier(n_estimators=100) clf_bdt_real = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=600, learning_rate=1.) clf_bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=600, learning_rate=1.5, algorithm="SAMME") #create gradient boosting clf_gbdt = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3) #create nearest centroid clf_nn = NearestCentroid() #create a stochastic gradient descent classifier clf_sgd = SGDClassifier(loss="modified_huber", penalty="l2") #define a training sample and train train_start = 0 train_stop = n_samples / 2 #clf_svm.fit(digits.data[train_start:train_stop], digits.target[train_start:train_stop]) #clf_rf.fit(digits.data[train_start:train_stop], digits.target[train_start:train_stop]) #clf_dt = DecisionTreeClassifier().fit(digits.data[train_start:train_stop], digits.target[train_start:train_stop]) #define a test sample and test test_start = n_samples / 2 test_stop = n_samples #expected_test_sample = digits.target[test_start:test_stop]
# averaging out the means for all channels mean_avg = [] for i in range(0, 15): mean_avg.append((means_data_1[i] + means_data_2[i] + means_data_3[i] + means_data_4[i]) / 4) # print len(mean_avg) ratio_avg = [] for i in range(0, 15): ratio_avg.append((ratio_data_1[i] + ratio_data_2[i] + ratio_data_3[i] + ratio_data_4[i]) / 4) # print (ratio_avg) # mean_center,mean_lab = trainSet(mean_avg) # ratio_center, ratio_lab = trainSet(ratio_avg) clf = NearestCentroid() X = [] Y = [] for i in range(0, 4): X.append([int(mean_avg[i]), ratio_avg[i]]) Y.append(0) for i in range(5, 9): X.append([int(mean_avg[i]), ratio_avg[i]]) Y.append(1) for i in range(10, 14): X.append([int(mean_avg[i]), ratio_avg[i]]) Y.append(2) # print X # print Y clf.fit(X, Y) res = clf.predict([[mean_avg[14], ratio_avg[14]]])
class clusteringST:
    '''
    Identification of sub-types for prediction.

    The methods rely on names imported elsewhere in this module: ``np``,
    ``MeanShift``, ``KMeans``, ``NearestCentroid``, ``preprocessing``,
    ``pdist``, ``squareform`` and the project helper module ``cls``
    (``cls.ind2matrix``, ``cls.hclustering``).
    '''

    def getClusters(self, net_data):
        '''
        Run MeanShift on every slice ``net_data[:, :, i]``.

        Side effects: stores the fitted estimators in ``self.clust_list``,
        the average of the thresholded ``cls.ind2matrix`` outputs in
        ``self.avg_bin_mat`` and the average number of distinct labels in
        ``self.avg_n_clusters``.

        Returns ``self.avg_n_clusters``.
        '''
        n_networks = net_data.shape[2]
        self.avg_bin_mat = np.zeros((net_data.shape[0], net_data.shape[0]))
        self.avg_n_clusters = 0
        self.clust_list = []
        for i in range(n_networks):
            ms = MeanShift()
            ms.fit(net_data[:, :, i])
            self.clust_list.append(ms)
            labels = ms.labels_
            # labels are shifted by one before being handed to ind2matrix
            bin_mat = cls.ind2matrix(labels + 1) > 0
            self.avg_bin_mat += bin_mat
            self.avg_n_clusters += len(np.unique(labels))
        self.avg_bin_mat /= n_networks
        self.avg_n_clusters /= n_networks
        return self.avg_n_clusters

    def getMeanClustering(self):
        '''Return ``self.avg_bin_mat`` as set by the last clustering run.'''
        return self.avg_bin_mat

    def get_match_network(self, net_data, ncluster, algo='kmeans'):
        '''
        Cluster the subjects independently for every slice
        ``net_data[:, :, i]``.

        net_data: 3d volume (subjects x vecnetwork x vecnetwork)
        ncluster: number of groups to partition the subjects
                  (only used by kmeans)
        algo: (default: kmeans) kmeans, meanshift. Any value other than
              'kmeans' selects MeanShift.

        Side effects: updates ``self.avg_bin_mat`` and
        ``self.avg_n_clusters`` with the averages over all slices.

        Returns ``(valid_cluster, valid_net_idx)``: the fitted estimators
        and the slice index each one was fitted on.
        '''
        n_networks = net_data.shape[2]
        valid_net_idx = []
        valid_cluster = []
        self.avg_bin_mat = np.zeros((net_data.shape[0], net_data.shape[0]))
        self.avg_n_clusters = 0
        for i in range(n_networks):
            # Compute the clustering for each network
            if algo == 'kmeans':
                clust = KMeans(init='k-means++', n_clusters=ncluster, n_init=10)
            else:
                clust = MeanShift()
            clust.fit(net_data[:, :, i])
            # Compute the stability matrix among networks
            bin_mat = cls.ind2matrix(clust.labels_ + 1) > 0
            self.avg_bin_mat += bin_mat
            self.avg_n_clusters += len(np.unique(clust.labels_))
            valid_cluster.append(clust)
            valid_net_idx.append(i)
        self.avg_bin_mat /= n_networks
        self.avg_n_clusters /= n_networks
        return valid_cluster, valid_net_idx

    def _dist_to_centers(self, vector, clust):
        '''Euclidean distance from ``vector`` to every center of ``clust``.'''
        points = np.vstack((vector, clust.cluster_centers_))
        # Row 0 of the square distance matrix is ``vector``; dropping its
        # first entry (distance to itself) leaves the distances to the centers.
        return squareform(pdist(points, metric='euclidean'))[0, 1:]

    def _fit_network_clusters(self, net_data, nnet_cluster):
        '''
        Fill ``self.valid_cluster`` / ``self.valid_net_idx``, using
        MeanShift when ``nnet_cluster`` is 'auto' and KMeans otherwise.
        '''
        algo = 'meanshift' if nnet_cluster == 'auto' else 'kmeans'
        self.valid_cluster, self.valid_net_idx = self.get_match_network(
            net_data, nnet_cluster, algo=algo)

    def assigneSubtype(self, nets, valid_cluster, valid_net_idx):
        '''
        Assign one subject to a cluster of every fitted network clustering.

        nets: 2-D array of one subject; column ``valid_net_idx[i]`` is
              matched against ``valid_cluster[i]``.

        Returns ``(classes, dist_centroid)``: the list of predicted cluster
        labels (one per clustering) and a 1-D array with the concatenated
        euclidean distances to all cluster centers.
        '''
        classes = []
        dist_centroid = np.array([])
        for clust, net_idx in zip(valid_cluster, valid_net_idx):
            vector = nets[:, net_idx]
            # predict() needs a 2-D (n_samples, n_features) input; the
            # column is a single sample.
            classes.append(clust.predict(np.reshape(vector, (1, -1)))[0])
            dist_centroid = np.hstack(
                (dist_centroid, self._dist_to_centers(vector, clust)))
        return classes, dist_centroid

    def assigneDist(self, nets, valid_cluster, valid_net_idx):
        '''
        Return, for one subject, a 1-D array with the concatenated euclidean
        distances between column ``valid_net_idx[i]`` of ``nets`` and the
        centers of ``valid_cluster[i]``.
        '''
        classes = np.array([])
        for clust, net_idx in zip(valid_cluster, valid_net_idx):
            classes = np.hstack(
                (classes, self._dist_to_centers(nets[:, net_idx], clust)))
        return classes

    def fit_old(self, net_data, nnet_cluster='auto', nSubtypes=3):
        '''
        Sub-type the subjects from their per-network cluster assignments.

        net_data: 3-D array whose first axis indexes the subjects.
        nnet_cluster: 'auto' (MeanShift) or the number of KMeans clusters.
        nSubtypes: number of sub-types passed to ``cls.hclustering``.

        Side effects: sets ``self.assign_net`` (cluster labels, one row per
        subject), ``self.dist_net`` (distances to the centers, one row per
        subject), ``self.clf_subtypes`` and ``self.consensus``.

        Returns the sub-type of every subject as predicted by the fitted
        NearestCentroid classifier.
        '''
        self.nnet_cluster = nnet_cluster
        self.nSubtypes = nSubtypes
        self._fit_network_clusters(net_data, nnet_cluster)

        class_rows = []
        dist_rows = []
        for i in range(net_data.shape[0]):
            classes_, dist_ = self.assigneSubtype(
                net_data[i, :, :], self.valid_cluster, self.valid_net_idx)
            class_rows.append(classes_)
            dist_rows.append(dist_)
        # Stack once instead of growing the arrays row by row.
        self.assign_net = np.vstack(class_rows)
        self.dist_net = np.vstack(dist_rows)

        # group subjects with the most network classifying them together
        # compute the consensus clustering
        self.consensus = cls.hclustering(self.assign_net, self.nSubtypes)
        # save the centroids in a classifier
        self.clf_subtypes = NearestCentroid()
        self.clf_subtypes.fit(self.assign_net, self.consensus)
        self.consensus = self.clf_subtypes.predict(self.assign_net)
        return self.consensus

    def transform_low_scale_old(self, net_data):
        '''
        Average ``net_data`` along its second axis within each group of
        ``self.ind_low_scale`` (groups are numbered from 1).

        Returns net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        '''
        nnet_cluster = np.max(self.ind_low_scale)
        net_data_low = np.zeros(
            (net_data.shape[0], nnet_cluster, net_data.shape[2]))
        for i in range(nnet_cluster):
            # average the appropriate parcels
            net_data_low[:, i, :] = net_data[:, self.ind_low_scale == i + 1, :].mean(axis=1)
        return net_data_low

    def fit(self, net_data_low, nSubtypes=3, reshape_w=True):
        '''
        Build one template per (network, sub-type) and weight every subject
        against them.

        net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        nSubtypes: number of sub-types passed to ``cls.hclustering``.
        reshape_w: when true the weights are flattened with ``reshapeW``.

        Side effects: sets ``self.net_data_low``, ``self.st_templates``
        (Dimensions: nNetwork_low, nSubtypes, nNetwork) and ``self.W``.

        Returns the weights, flattened per subject unless ``reshape_w`` is
        false.
        '''
        self.nnet_cluster = net_data_low.shape[1]
        self.nSubtypes = nSubtypes
        self.net_data_low = net_data_low

        templates = []
        # One pass per low-scale network; the count comes from the array
        # shape so it does not depend on a second subject being present.
        for i in range(self.nnet_cluster):
            # subjects X network_nodes
            net_i = net_data_low[:, i, :]
            ind_st = cls.hclustering(net_i, nSubtypes)
            # template = mean map of the subjects assigned to sub-type j+1
            templates.append([net_i[ind_st == j + 1, :].mean(axis=0)
                              for j in range(nSubtypes)])
        self.st_templates = np.array(templates)

        # calculate the weights for each subject
        self.W = self.compute_weights(net_data_low)
        if reshape_w:
            return self.reshapeW(self.W)
        return self.W

    def compute_weights(self, net_data_low):
        '''
        Correlate every subject map with every sub-type template.

        Both maps are demeaned with the median, across subjects, of the
        data stored by ``fit`` (``self.net_data_low``); the subject map is
        additionally passed through ``preprocessing.scale``.

        Returns W --> Dimensions: nSubjects, nNetwork_low, nSubtypes
        '''
        n_subjects = net_data_low.shape[0]
        n_networks = self.st_templates.shape[0]
        n_subtypes = self.st_templates.shape[1]
        W = np.zeros((n_subjects, n_networks, n_subtypes))
        for j in range(n_networks):
            # The reference map and the demeaned templates depend on the
            # network only, so they are computed once per network.
            average_template = np.median(self.net_data_low[:, j, :], axis=0)
            st_dm_maps = self.st_templates[j, :, :] - average_template
            for i in range(n_subjects):
                dm_map = preprocessing.scale(net_data_low[i, j, :] - average_template)
                for k in range(n_subtypes):
                    # off-diagonal entry of the 2x2 correlation matrix
                    W[i, j, k] = np.corrcoef(st_dm_maps[k, :], dm_map)[-1, 0]
        return W

    def transform(self, net_data_low, reshape_w=True):
        '''
        Calculate the weights for each sub-types previously computed
        '''
        W = self.compute_weights(net_data_low)
        if reshape_w:
            return self.reshapeW(W)
        return W

    def reshapeW(self, W):
        '''
        Reshape the matrix from [subjects, Nsubtypes, weights] to
        [subjects, vector of weights].
        '''
        xw = W.reshape((W.shape[0], W.shape[1] * W.shape[2]))
        return xw

    def fit_dev(self, net_data, nnet_cluster='auto', nSubtypes=3):
        '''
        Variant of ``fit_old`` that sub-types the subjects from their
        distances to the cluster centers instead of their cluster labels.

        Side effects: sets ``self.assign_net`` (one row of distances per
        subject), ``self.clf_subtypes`` and ``self.consensus``; prints the
        shape of ``self.assign_net``.

        Returns the sub-type of every subject as predicted by the fitted
        NearestCentroid classifier.
        '''
        self.nnet_cluster = nnet_cluster
        self.nSubtypes = nSubtypes
        self._fit_network_clusters(net_data, nnet_cluster)

        self.assign_net = np.vstack([
            self.assigneDist(net_data[i, :, :], self.valid_cluster, self.valid_net_idx)
            for i in range(net_data.shape[0])])
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Size of the new data map:  %s' % (self.assign_net.shape,))

        # group subjects with the most network classifying them together
        # compute the consensus clustering
        self.consensus = cls.hclustering(self.assign_net, self.nSubtypes)
        # save the centroids in a classifier
        self.clf_subtypes = NearestCentroid()
        self.clf_subtypes.fit(self.assign_net, self.consensus)
        self.consensus = self.clf_subtypes.predict(self.assign_net)
        return self.consensus
print 'Reading features... Done!' # STEP 2 - computing scores print 'Training...' tfidf = models.TfidfModel(dictionary=features) # Computing tfidf model to be queried. tfidf.save('reuters/data/tfidf.model') # STEP 3 - computing centroids tfidf = models.TfidfModel.load('reuters/data/tfidf.model') features = corpora.Dictionary.load_from_text('reuters/data/word.dict') by_bow = Corpus2Dictionary(features) train_corpus = ReutersCorpus('training') tfidf_train = tfidf[by_bow[by_word[train_corpus]]] X = matutils.corpus2csc(tfidf_train) # to gensim into scipy sparse matrix X = X.transpose() # from csc (document as column) to csr (document as row) y = train_corpus.category_mask # label for doc rocchio = NearestCentroid() rocchio.fit(X, y) print 'Training... Done!' # STEP 4 - evaluate prediction test_corpus = ReutersCorpus('test') tfidf_test = tfidf[by_bow[by_word[test_corpus]]] # num_terms required: otherwise Z shrink to the max feature found X = matutils.corpus2csc(tfidf_test, num_terms=len(features)) X = X.transpose() y_true = test_corpus.category_mask y_pred = rocchio.predict(X) # print precision_score(y_true, y_pred) print rocchio.score(X, y_true)
apr_dbz[:,iiap[0][1]:iiap[0][0]].T.shape # In[217]: ncc_sum = np.sum(apr_dbz[:,iiap[0][1]:iiap[0][0]],axis=0) ncc_set = ncc_sum.copy()*0.0 ncc_set[ncc_sum>0] = 1.0 # In[218]: nc = NearestCentroid() ncc = nc.fit(apr_dbz[:,iiap[0][1]:iiap[0][0]].T,ncc_set) # In[220]: ncc.centroids_[1] # In[237]: plt.figure() #plt.plot(ncc.centroids_[0],apr['altflt'][:,iap[0]],'.') plt.plot(ncc.centroids_[1],apr['altflt'][:,iap[0]],'.')
print("--------------------Results-------------------") print("Classification report for kNN classifier %s:\n%s\n" % (clf, metrics.classification_report(expected, predicted))) print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)) #Nearest Centroid classification start = int(round(time.time() * 1000)) classifier = NearestCentroid() classifier.fit(X_lda, y_train) NearestCentroid(metric='euclidean', shrink_threshold=None) print (classifier) print("---------(5) Cross validation accuracy--------") print(cross_validation.cross_val_score(classifier, X_lda,y_train, cv=5)) end = int(round(time.time() * 1000)) print("--Centroid fitting finished in ", (end-start), "ms--------------") print("---------Test-set dimensions after PCA--------")
#!/usr/bin/python
# Minimal NearestCentroid demo: fit two 3-point classes in 2-D and classify
# the point (0, 1).
# NearestCentroid is imported from the public ``sklearn.neighbors`` package;
# the private ``sklearn.neighbors.nearest_centroid`` module path is not
# available in current scikit-learn releases.
from sklearn.neighbors import NearestCentroid
import numpy

X = numpy.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = numpy.array([1, 1, 1, 2, 2, 2])

clf = NearestCentroid()
clf.fit(X, y)

# predict() expects a 2-D (n_samples, n_features) input, so the single
# sample is wrapped in an outer list.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(clf.predict([[0, 1]]))
def myclassify(numfiers=5,xtrain=xtrain,ytrain=ytrain,xtest=xtest,ytest=ytest):
    """Fit a fixed sequence of classifiers and report their test accuracy.

    The classifiers are tried in a fixed order. The first one is always
    fitted; each following group is fitted only while fewer than
    ``numfiers`` scores have been collected. The shrunken-centroid sweep is
    one group: once started, all of its thresholds are evaluated.

    Args:
        numfiers: number of scores to collect before stopping.
        xtrain, ytrain: training features and labels (default to the
            module-level arrays bound when the function is defined).
        xtest, ytest: held-out features and labels used for ``score``.

    Returns:
        A list of ``(label, score)`` pairs in evaluation order. One line
        per pair is also printed.
    """
    def knn(**kwargs):
        # Factory for a k-nearest-neighbours variant.
        return lambda: neighbors.KNeighborsClassifier(**kwargs)

    def shrunken(threshold):
        # Factory for a nearest shrunken centroid with the given threshold.
        return lambda: NearestCentroid(shrink_threshold=threshold)

    # Each label lives next to the factory it describes, so the report can
    # never drift out of step with the classifiers that actually ran.
    groups = [
        [("BaggingETC (with bootstraps set to false)",
          lambda: BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False))],
        [("ETC", lambda: ETC())],
        [("BaggingETC", lambda: BaggingClassifier(ETC()))],
        # Quadratic discriminant analysis - quadratic decision boundary
        [("QDA", lambda: quadda())],
        [("DTC", lambda: DTC())],
        [("KNN (default)", knn())],
        # Linear discriminant analysis - linear decision boundary
        [("LDA", lambda: linda())],
        [("RFC", lambda: RFC())],
        [("BaggingRFC (with bootstraps set to false)",
          lambda: BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False))],
        [("BaggingSVC (with bootstraps set to false)",
          lambda: BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False))],
        [("RFC (bootstrap false)", lambda: RFC(bootstrap=False))],
        [("GBC", lambda: GBC())],
        [("knn (n_neighbors = 10)", knn(n_neighbors=10))],
        [("knn (n_neighbors = 3)", knn(n_neighbors=3))],
        [("knn (ball tree algorithm)", knn(algorithm='ball_tree'))],
        [("knn (kd_tree algorithm)", knn(algorithm='kd_tree'))],
        [("Nearest Centroid", lambda: NearestCentroid())],
        # Nearest shrunken centroid, one entry per threshold
        [("Shrunken Centroid (shrink_threshold={})".format(threshold),
          shrunken(threshold))
         for threshold in [None, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]],
        [("ABC", lambda: ABC())],
    ]

    results = []
    for position, group in enumerate(groups):
        # The first classifier always runs, whatever ``numfiers`` is.
        if position > 0 and len(results) >= numfiers:
            break
        for label, make_classifier in group:
            model = make_classifier()
            model.fit(xtrain, ytrain)
            results.append((label, model.score(xtest, ytest)))

    for label, score in results:
        print("{} classifier has percent correct {}".format(label, score))
    return results
'finish launching Random Forest Classifier, the test accuracy is {:.5%}' .format(rf.score(X_val, y_val))) rf_predict = rf.predict(X_test) print('=' * 100) print('start launching SVM Classifier......') svm = svm.SVC() svm.fit(X_train, y_train) print( 'finish launching SVM Classifier, the test accuracy is {:.5%}'.format( svm.score(X_val, y_val))) svm_predict = svm.predict(X_test) print('=' * 100) print('start launching KNN Classifier......') knn = NearestCentroid() knn.fit(X_train, y_train) print( 'finish launching KNN Classifier, the test accuracy is {:.5%}'.format( knn.score(X_val, y_val))) knn.predict(X_test) print('=' * 100) print('start launching Decision Tree Classifier......') dtree = tree.DecisionTreeClassifier() dtree.fit(X_train, y_train) print( 'finish launching Decision Tree Classifier, the test accuracy is {:.5%}' .format(dtree.score(X_val, y_val))) dtree_predict = dtree.predict(X_test)
all_instances.append(row1) if(row1[0] > maxlength): maxlength = row1[0]; for row2 in negative: row2 = row2[:-1] row2 = row2.split(',') row2 = [int(i) for i in row2] all_instances.append(row2) if(row2[0] > maxlength): maxlength = row2[0]; for instance in all_instances: instance[0] = instance[0]/maxlength; random.shuffle(all_instances) # print all_instances[0:700] print "all_instances size: ", len(all_instances) train_set = np.array(all_instances[0:700]) test_set = np.array(all_instances[701:]) print train_set[:,:-1] X = np.array(train_set[:,:-1]) Y = np.array(train_set[:,-1]) clf = NearestCentroid() clf.fit(X, Y) predication = clf.predict(test_set[:,:-1]) evaluation(predication, test_set[:,-1])
from sklearn import svm import numpy as np import serial from sklearn.neighbors.nearest_centroid import NearestCentroid def removeMag(line): return line[6:] x = [] y = [] small = False #clf = svm.LinearSVC() clf = NearestCentroid() folder = "gyro_side\\" files = ['still.csv', 'yes.csv', 'no.csv'] for i in range(3): f =open(folder+files[i], 'r') for line in f.readlines(): #print line line = [int(a) for a in line.split(',')] lines = [removeMag(line[9*j:9*j+9]) for j in range(9)] # smallLine=[] # for j in range(5): # smallLine = smallLine + line[6*j:6*j+3] # if small: # line=smallLine # if len(x)==0: # x= np.array(np.array([line]))
from sklearn.neighbors import KNeighborsClassifier from sklearn.linear_model import LogisticRegression, SGDClassifier from sklearn.neighbors.nearest_centroid import NearestCentroid from sklearn.svm import SVC from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.naive_bayes import GaussianNB from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis) from sklearn.neural_network import MLPClassifier classifiers_by_name = { '10-nearest-neighbors': lambda: KNeighborsClassifier(n_neighbors=10), 'nearest-centroid-mean': lambda: NearestCentroid(metric='euclidean'), 'nearest-centroid-median': lambda: NearestCentroid(metric='manhattan'), 'logistic-regression': LogisticRegression, 'sgd': SGDClassifier, 'linear-svm': lambda: SVC(kernel='linear'), 'quadratic-svm': lambda: SVC(kernel='poly', degree=2), 'cubic-svm': lambda: SVC(kernel='poly', degree=3), 'rbf-svm': lambda: SVC(kernel='rbf'), 'decision-tree': DecisionTreeClassifier, 'random-forest': RandomForestClassifier, 'adaboost': AdaBoostClassifier, 'gaussian-naive-bayes': GaussianNB, 'lda': LinearDiscriminantAnalysis, 'qda': QuadraticDiscriminantAnalysis, 'multilayer-perceptron': MLPClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid import time conf_mat = numpy.zeros( (len(no_imgs), len(no_imgs))) # Initializing the Confusion Matrix n_neighbors = 1 # better to have this at the start of the code # 10-fold Cross Validation for i in range(kfold): train_indices = skfind[i][0] test_indices = skfind[i][1] clf = [] clf = NearestCentroid() X_train = X[train_indices] y_train = y[train_indices] X_test = X[test_indices] y_test = y[test_indices] # Training tic = time.time() clf.fit(X_train, y_train) toc = time.time() print "training time= ", toc - tic # roughly 2.5 secs # Testing y_predict = [] tic = time.time()
columns = list(df.columns.values) df = df.values words = df[:, :-1] #selecting words labels = df[:, -1] #selecting Labels X_train, X_test, Y_train, Y_test = train_test_split(words, labels, test_size=0.2, random_state=40) from sklearn.metrics import accuracy_score, confusion_matrix from matplotlib import style import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') style.use('ggplot') # Rocchio Algorithm clf = NearestCentroid() clf.fit(X_train, Y_train) predict = clf.predict(X_test) accuracy = accuracy_score(Y_test, predict) print('\nAccuracy of Rocchio:\n') print(accuracy) conf_mat = confusion_matrix(Y_test, predict) print('\nConfusion Matrix: \n', conf_mat) plt.matshow(conf_mat) plt.title('Confusion Matrix for test Data\t') plt.colorbar() plt.ylabel('True label') plt.xlabel('Predicted label') plt.show() # Naive Bayes clf_1 = GaussianNB()
results.append( benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"), X_train, y_train, X_test, y_test, target_names, feature_names=feature_names)) # Train NearestCentroid without threshold #print('=' * 80) #print("NearestCentroid (aka Rocchio classifier)") results.append( benchmark(NearestCentroid(), X_train, y_train, X_test, y_test, target_names, feature_names=feature_names)) # Train sparse Naive Bayes classifiers #print('=' * 80) #print("Naive Bayes") results.append( benchmark(MultinomialNB(alpha=.01), X_train, y_train, X_test,
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('knn_benchmark_targets: ' + str(len(knn_benchmark_targets)))
print('rf_benchmark_targets: ' + str(len(rf_benchmark_targets)))


# Takes a list, creates a csv file
def submitFile(x, pre):
    """Write every value of *x* to the file ``<pre>_submission.csv``.

    Each value is written as ``str(val)`` followed by a comma and a
    carriage return. The file is closed even if a write fails.
    """
    with open(pre + '_submission.csv', 'w') as f:
        for val in x:
            f.write(str(val) + ',\r')


# ==============================================================================
# Nearest centroid classifier
# ==============================================================================
# NearestCentroid is imported from the public ``sklearn.neighbors`` package;
# the private ``sklearn.neighbors.nearest_centroid`` module path is not
# available in current scikit-learn releases.
from sklearn.neighbors import NearestCentroid

ncc = NearestCentroid()
ncc.fit(features_to_train, targets_to_train)
predicted_targets = ncc.predict(features_to_test)

# Just print out the precision and f1 scores
print('precision: %0.5f' % metrics.precision_score(rf_benchmark_targets, predicted_targets))
print('f1 score: %0.5f' % metrics.f1_score(rf_benchmark_targets, predicted_targets))

# The following scores are used for classification models.
# accuracy_score / zero_one_loss are the current names of the former
# zero_one_score / zero_one helpers; normalize=False keeps the loss as a
# count of misclassified samples, matching the %d format.
print('accuracy: %0.5f' % metrics.accuracy_score(rf_benchmark_targets, predicted_targets))
print('loss: %d' % metrics.zero_one_loss(rf_benchmark_targets, predicted_targets, normalize=False))

# ==============================================================================
# Multinomial naive bayes
# ==============================================================================
>>> sigmoid_svc = svm.SVC(kernel='sigmoid') >>> sigmoid_svc.fit(X_train, Y_train) >>> accuracy_score(Y_test,sigmoid_svc.predict(X_test).round()) #0.5617977528089888 #.................................................................................................# ## Nearest Centroid Classifier >>> from sklearn.neighbors.nearest_centroid import NearestCentroid >>> import numpy as np >>> file = open("/home/banafshbts/Desktop/hosh/76/all") >>> file.readline() >>> data = np.loadtxt(file,delimiter=',') >>> data = np.loadtxt(file,delimiter=',') >>> X_train = data[0:810, 0:12] >>> Y_train = data[0:810, 13] >>> X_test = data[810:, 0:12] >>> Y_test = data[810:, 13] >>> clf = NearestCentroid() >>> clf.fit(X_train, Y_train) >>> accuracy_score(clf.predict(X_test),Y_test) #0.5842696629213483 #.................................................................................................# ##Gaussian Naive Bayes >>> from sklearn.naive_bayes import GaussianNB >>> import numpy as np >>> file = open("/home/banafshbts/Desktop/hosh/76/all") >>> file.readline() >>> data = np.loadtxt(file,delimiter=',') >>> data = np.loadtxt(file,delimiter=',') >>> X_train = data[0:810, 0:12] >>> Y_train = data[0:810, 13] >>> X_test = data[810:, 0:12] >>> Y_test = data[810:, 13] >>> gnb = GaussianNB()
def nn_centroid(self, X, y, test):
    """Classify ``test`` with a NearestCentroid model trained on ``X``/``y``.

    The predictions are printed with the prefix ``"nn_centroid:"`` and
    then returned.

    Args:
        X: training samples passed to ``fit``.
        y: training labels passed to ``fit``.
        test: samples passed to ``predict``.

    Returns:
        Whatever ``NearestCentroid.predict`` returns for ``test``.
    """
    model = NearestCentroid()
    model.fit(X, y)
    predictions = model.predict(test)
    print("nn_centroid:", predictions)
    return predictions
accuracy_score(Y_test, rbf_svc.predict(X_test).round())  #0.4157303370786517 0.0092165898617511521 0.33640552995391704
#pre.append(precision_score(Y_test, rbf_svc.predict(X_test), average='macro'))

sigmoid_svc = svm.SVC(kernel='sigmoid')
sigmoid_svc.fit(X_train, Y_train)
accuracy_score(Y_test, sigmoid_svc.predict(X_test).round())  #0.5617977528089888 0.027649769585253458 0.16589861751152074
#pre.append(precision_score(Y_test, sigmoid_svc.predict(X_test), average='macro'))

#.................................................................................................#
## Nearest Centroid Classifier
# Import from the public package path; the ``nearest_centroid`` submodule is private.
from sklearn.neighbors import NearestCentroid
ncc_clf = NearestCentroid()
ncc_clf.fit(X_train, Y_train)
# Arguments are passed as (y_true, y_pred) like the SVC calls above; accuracy
# is symmetric, so the recorded scores are unaffected.
accuracy_score(Y_test, ncc_clf.predict(X_test))  #0.5842696629213483 0.72811059907834097 0.3686635944700461

#.................................................................................................#
##Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
accuracy_score(Y_test, gnb.fit(X_train, Y_train).predict(X_test))  #0.5955056179775281 0.26728110599078342 0.25345622119815669

#.................................................................................................#
##DecisionTreeClassifier
from sklearn import tree
DT_clf = tree.DecisionTreeClassifier()
class TwoWordRecognizer:
    """Tell two spoken words apart using one reference recording per word.

    Every recording is scaled, trimmed to a detected start/end index and
    reduced to two features: its distance to each of the two reference
    words.  A NearestCentroid model is fitted on those features, with
    label 1 for the first word and label 2 for the second.

    The detectors rely on the module-level constants ``STEPS`` (window
    length, in samples) and ``FACTOR`` (threshold multiplier).  Sample
    arrays are indexed as 2-D; only column 0 is analysed.
    """

    # Index returned by the start/end detectors when no window crosses the
    # threshold.
    _NOT_FOUND = 10000

    def scaler(self, arr):
        """Scale ``arr`` so that its largest absolute value becomes 100."""
        return arr / np.max(np.abs(arr)) * 100

    def get_startingpoint(self, arr):
        """Return the start index of the first window that jumps in energy.

        Windows of ``STEPS`` samples are scanned front to back.  A window
        qualifies when the sum of its absolute column-0 values exceeds
        ``FACTOR`` times the largest sum seen so far.  Returns 10000 when
        no window qualifies.
        """
        arr = np.abs(arr)
        st_i = 0
        e_i = STEPS
        old_value = np.sum(arr[st_i:e_i, 0])
        while e_i < arr.shape[0]:
            arr_sum = np.sum(arr[st_i:e_i, 0])
            if arr_sum > old_value * FACTOR:
                return st_i
            if old_value < arr_sum:
                old_value = arr_sum
            st_i += STEPS
            e_i += STEPS
        return self._NOT_FOUND

    def get_endingpoint(self, arr):
        """Return the end index of the last window that jumps in energy.

        Mirror image of ``get_startingpoint``: windows are scanned back to
        front and the end index of the first qualifying window is returned.
        Returns 10000 when no window qualifies.
        """
        arr = np.abs(arr)
        e_i = arr.shape[0] - 1
        st_i = e_i - STEPS
        old_value = np.sum(arr[st_i:e_i, 0])
        while st_i > 0:
            arr_sum = np.sum(arr[st_i:e_i, 0])
            if arr_sum > old_value * FACTOR:
                return e_i
            if old_value < arr_sum:
                old_value = arr_sum
            st_i -= STEPS
            e_i -= STEPS
        return self._NOT_FOUND

    def euclidean_distance(self, arr1, arr2):
        """Return the summed absolute difference of the two column-0 signals.

        The shorter signal is zero-padded at the end to the length of the
        longer one.  Despite the name this is a sum of per-sample absolute
        differences (the original took ``sqrt`` of each squared difference),
        not the square root of a summed square.
        """
        first = arr1[:, 0]
        second = arr2[:, 0]
        length = max(first.shape[0], second.shape[0])
        first = np.pad(first, (0, length - first.shape[0]), mode='constant')
        second = np.pad(second, (0, length - second.shape[0]), mode='constant')
        return np.sum(np.abs(second - first))

    def _load_trimmed(self, path):
        """Read a wav file, scale it and cut it to the detected start/end."""
        _, samples = wavfile.read(path)
        samples = self.scaler(samples)
        start = self.get_startingpoint(samples)
        end = self.get_endingpoint(samples)
        return samples[start:end, :]

    def _features(self, word):
        """Return the distances of ``word`` to both reference words."""
        return np.array([self.euclidean_distance(self.word1, word),
                         self.euclidean_distance(self.word2, word)])

    def loadReferenceWords(self, word1_path, word2_path):
        """Load the two reference recordings into ``self.word1``/``self.word2``."""
        self.word1 = self._load_trimmed(word1_path)
        self.word2 = self._load_trimmed(word2_path)

    def loadData(self, ressourcepath1, ressourcepath2):
        """Fit ``self.clf`` on every recording found in the two directories.

        Files in ``ressourcepath1`` are labelled 1 and files in
        ``ressourcepath2`` are labelled 2.  ``loadReferenceWords`` must have
        been called first because the features are distances to the
        reference words.

        Raises:
            ValueError: if neither directory contains any file.
        """
        print(ressourcepath1)
        # List both directories up front so a bad path fails before any
        # audio is read.  os.path.join also works when the directory path
        # lacks a trailing separator.
        fullpath1 = [os.path.join(ressourcepath1, fname)
                     for fname in os.listdir(ressourcepath1)]
        fullpath2 = [os.path.join(ressourcepath2, fname)
                     for fname in os.listdir(ressourcepath2)]

        rows = []
        labels = []
        for label, paths in ((1, fullpath1), (2, fullpath2)):
            for path in paths:
                rows.append(self._features(self._load_trimmed(path)))
                labels.append(label)
        if not rows:
            raise ValueError("no training recordings found")

        # Stack once at the end instead of growing the arrays per file.
        X = np.vstack(rows)
        y = np.array(labels)

        # Import from the public package path; the ``nearest_centroid``
        # submodule is private.
        from sklearn.neighbors import NearestCentroid
        self.clf = NearestCentroid()
        self.clf.fit(X, y)

    def predict(self, input_path):
        """Return the predicted label array for the recording at ``input_path``."""
        x0 = self._features(self._load_trimmed(input_path))
        # Estimators expect a 2-D (n_samples, n_features) array, so the
        # single sample is reshaped into one row.
        return self.clf.predict(x0.reshape(1, -1))
models.append(classifier.fit(X_train, y_train))

from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB()
models.append(classifier.fit(X_train, y_train))

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
models.append(classifier.fit(X_train, y_train))

from sklearn.neighbors import KNeighborsClassifier  # KNN
classifier = KNeighborsClassifier()
models.append(classifier.fit(X_train, y_train))

# Import from the public package path, like KNeighborsClassifier above; the
# ``nearest_centroid`` submodule is private.
from sklearn.neighbors import NearestCentroid
classifier = NearestCentroid()
models.append(classifier.fit(X_train, y_train))

from sklearn.gaussian_process import GaussianProcessClassifier  # gaussian process
classifier = GaussianProcessClassifier()
models.append(classifier.fit(X_train, y_train))

from sklearn.tree import DecisionTreeClassifier  # decision trees. For interesting tree visualization, see graphviz module
classifier = DecisionTreeClassifier()
models.append(classifier.fit(X_train, y_train))

from sklearn.ensemble import BaggingClassifier  # bagging meta classifier
classifier = BaggingClassifier()
models.append(classifier.fit(X_train, y_train))

from sklearn.ensemble import RandomForestClassifier  # everyone's favorite homeboy random forest
# ``.values`` replaces ``DataFrame.as_matrix()``, which newer pandas releases
# no longer provide; both yield the same ndarray.
df_input3_target = filtered3[list(range(0,1))].values
df_input4_data = filtered4[list(range(2,76))].values
df_input4_target = filtered4[list(range(0,1))].values
df_input5_data = filtered5[list(range(2,76))].values
df_input5_target = filtered5[list(range(0,1))].values

# df_input_data = filtered[list(range(2,76))].as_matrix()
# df_input_target = filtered[list(range(0,1))].as_matrix()

# Nearest Centroid
# Import from the public package path; the ``nearest_centroid`` submodule is private.
from sklearn.neighbors import NearestCentroid

# Fit one model per input set and pickle it.  Each output file is opened in a
# ``with`` block so the handle is flushed and closed deterministically.
knc1 = NearestCentroid()
knc1.fit(df_input1_data,numpy.ravel(df_input1_target))
with open('model_knc_t1.pkl', 'wb') as model_file:
    pickle.dump(knc1, model_file)

knc2 = NearestCentroid()
knc2.fit(df_input2_data,numpy.ravel(df_input2_target))
with open('model_knc_t2.pkl', 'wb') as model_file:
    pickle.dump(knc2, model_file)

knc3 = NearestCentroid()
knc3.fit(df_input3_data,numpy.ravel(df_input3_target))
with open('model_knc_t3.pkl', 'wb') as model_file:
    pickle.dump(knc3, model_file)

knc4 = NearestCentroid()
knc4.fit(df_input4_data,numpy.ravel(df_input4_target))
with open('model_knc_t4.pkl', 'wb') as model_file:
    pickle.dump(knc4, model_file)
# Split the raw array: every column but the last is a feature, the last
# column is the label.
df = df.values
words, labels = df[:, :-1], df[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(
    words,
    labels,
    test_size=0.2,
    random_state=50,
)

from sklearn.metrics import accuracy_score, confusion_matrix
from matplotlib import style
import matplotlib.pyplot as plt

# Notebook export: render figures inline and use the ggplot theme.
get_ipython().run_line_magic('matplotlib', 'inline')
style.use('ggplot')

# Rocchio Algorithm
clf = NearestCentroid()
clf.fit(X_train, Y_train)
predict = clf.predict(X_test)

# Report accuracy, then the confusion matrix, on the held-out split.
accuracy = accuracy_score(Y_test, predict)
print('\nAccuracy of Rocchio:\n')
print(accuracy)

conf_mat = confusion_matrix(Y_test, predict)
print('\nConfusion Matrix: \n', conf_mat)

# Draw the confusion matrix as a colour-coded grid.
plt.matshow(conf_mat)
plt.title('Confusion Matrix for test Data\t')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# Naive Bayes
clf_1 = GaussianNB()
def _voting_ensemble():
    """Build the majority-vote ensemble of five different classifier types."""
    return VotingClassifier(estimators=[
        ('svc', SVC()),
        ('rfc', RFC(bootstrap=False)),
        ('etc', ETC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('qda', quadda()),
    ])


# Zero-argument factories, in the order the classifiers are tried.  Lambdas
# keep construction lazy: a classifier is only built if it is actually used.
_PRACTICE_SET_CLASSIFIERS = (
    _voting_ensemble,
    lambda: BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
    lambda: ETC(),
    lambda: BaggingClassifier(ETC()),
    lambda: SVC(),
    lambda: quadda(),   # quadratic discriminant analysis
    lambda: DTC(),
    lambda: neighbors.KNeighborsClassifier(),
    lambda: linda(),    # linear discriminant analysis
    lambda: RFC(),
    lambda: BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
    lambda: BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
    lambda: RFC(bootstrap=False),
    lambda: GBC(),
    lambda: neighbors.KNeighborsClassifier(n_neighbors=10),
    lambda: neighbors.KNeighborsClassifier(n_neighbors=3),
    lambda: neighbors.KNeighborsClassifier(algorithm='ball_tree'),
    lambda: neighbors.KNeighborsClassifier(algorithm='kd_tree'),
    lambda: NearestCentroid(),
    lambda: ABC(),
)


def myclassify_practice_set(numfiers,xtrain,ytrain,xtltrain,xtltest,xtest,ytarget=None,testing=False,grids='ABCDEFGHI'):
    """Fit the first ``numfiers`` classifiers and post-process their predictions.

    Each classifier from ``_PRACTICE_SET_CLASSIFIERS`` is fitted on
    ``xtrain``/``ytrain`` and its prediction for ``xtest`` is stored in one
    column of an ``(n_test, numfiers)`` matrix.  Every column is then passed
    through the window mode finder and converted to a string.

    Args:
        numfiers: number of classifiers to use (columns of the matrix).
        xtrain: training feature matrix.
        ytrain: training labels (flattened with ``np.ravel``).
        xtltrain: indices of the recordings behind ``xtrain``/``ytrain``;
            not used by this function.
        xtltest: indices of the recordings behind ``xtest``.
        xtest: feature matrix to classify; NaN/Inf rows are removed first
            via ``removeNanAndInf``.
        ytarget: optional target labels for ``xtest``.
        testing: when true, the ``temp*`` helper variants that take
            ``grids`` are used for the predictions.
        grids: grid letters forwarded to the ``temp*`` helpers.

    Returns:
        Tuple ``(predictionStringMat, targetStringMat, finalPredMat)``:
        per-classifier prediction strings, per-classifier target strings and
        the flat list of integer mode predictions of all classifiers.
    """
    # Remove NaN, Inf and -Inf values from the xtest feature matrix.
    xtest, xtltest, ytarget = removeNanAndInf(xtest, xtltest, ytarget)
    # Record this before np.ravel: ravel never returns None, so a later
    # ``ytarget != None`` on the array cannot detect a missing target (and is
    # ambiguous for multi-element arrays).
    has_target = ytarget is not None
    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)

    # One column per classifier.
    # NOTE(review): if numfiers exceeds the number of available classifiers,
    # the extra columns keep whatever np.empty left in them and are still
    # post-processed below -- confirm callers never request more.
    predictionMat = np.empty((xtest.shape[0], numfiers))
    for count, make_classifier in enumerate(_PRACTICE_SET_CLASSIFIERS):
        if count >= numfiers:
            break
        model = make_classifier()
        model.fit(xtrain, ytrain)
        predictionMat[:, count] = model.predict(xtest)

    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:, colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol, xtltest, 4, grids, isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol, xtltest, 4, isPrint=0)

        ytarg = predWindowVecModeFinder(ytarget, xtltest, 1, isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol, grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)

        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat.extend(int(value) for value in modeCol)
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)

    if not testing and has_target and targets1:
        # Confusion matrix of the first classifier; the result is currently
        # unused and kept only for debugging.
        confusionme = confusion_matrix(targets1[0], predictions1[0])

    return predictionStringMat, targetStringMat, finalPredMat
def run_knn(train_varnames, train_labels, test_varnames, test_labels):
    """Run ``fit_predict`` with a NearestCentroid classifier.

    Despite the function name, the estimator is ``NearestCentroid`` and it
    is reported under the label "Nearest Centroid Classifier".

    Returns:
        The ``(result, accuracy)`` pair produced by ``fit_predict``.
    """
    outcome, score = fit_predict(
        NearestCentroid(),
        "Nearest Centroid Classifier",
        train_varnames,
        train_labels,
        test_varnames,
        test_labels,
    )
    return outcome, score