def train_nearest_centroid(params, x_train, y_train, n_folds, random_state, stratified=True, shuffle=True): # Model and hyperparameter selection if stratified: kf = StratifiedKFold(n_splits=n_folds, random_state=random_state, shuffle=shuffle) else: kf = KFold(n_splits=n_folds, random_state=random_state, shuffle=shuffle) nearest_centroid_model = NearestCentroid(**params) i = 0 # Model Training for (train_index, test_index) in kf.split(x_train, y_train): # cross-validation randomly splits train data into train and validation data print('\n Fold %d' % (i + 1)) x_train_cv, x_val_cv = x_train.iloc[train_index], x_train.iloc[ test_index] y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[ test_index] # declare your model nearest_centroid_model.fit(x_train_cv, y_train_cv) # predict train and validation set accuracy and get eval metrics scores_cv = nearest_centroid_model.predict(x_train_cv) scores_val = nearest_centroid_model.predict(x_val_cv) # training evaluation train_pc = accuracy_score(y_train_cv, scores_cv) train_pp = precision_score(y_train_cv, scores_cv) train_re = recall_score(y_train_cv, scores_cv) print('\n train-Accuracy: %.6f' % train_pc) print(' train-Precision: %.6f' % train_pp) print(' train-Recall: %.6f' % train_re) eval_pc = accuracy_score(y_val_cv, scores_val) eval_pp = precision_score(y_val_cv, scores_val) eval_re = recall_score(y_val_cv, scores_val) print('\n eval-Accuracy: %.6f' % eval_pc) print(' eval-Precision: %.6f' % eval_pp) print(' eval-Recall: %.6f' % eval_re) i = i + 1 # return model for evaluation and prediction return nearest_centroid_model
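# Note on the CV helper above (a standalone sketch, not from the original):
# precision_score and recall_score default to average='binary', so the fold
# loop only works as written for two-class targets. For multi-class labels,
# pass an explicit average such as 'macro', 'micro', or 'weighted':
from sklearn.metrics import precision_score, recall_score

y_true = [0, 1, 2, 2, 1, 0]
y_pred = [0, 2, 2, 2, 1, 0]
print(precision_score(y_true, y_pred, average='macro'))  # the default 'binary' would raise here
print(recall_score(y_true, y_pred, average='macro'))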
def build_model():
    clf = NearestCentroid()
    clf.fit(trainX, trainY)
    joblib.dump(clf, 'models/nearest.model')
    predictY = clf.predict(testX)
    acc = get_acc(predictY, testY)
    print('* acc on test:', acc)
    predictY = clf.predict(validX)
    acc = get_acc(predictY, validY)
    print('* acc on valid:', acc)
    return 0
def pickData(filename, class_numbers, training_instances, test_instances): data1 = np.genfromtxt(filename, delimiter=",") #### Reading File array = np.array(data1) data = array class_count = 0 test_instance = test_instances training_instance = training_instances count = 1 file_name = filename if (file_name == "HandWrittenLetters.txt"): class_count = 39 elif (file_name == "ATNTFaceImages400.txt"): class_count = 10 for i in range(len(class_numbers)): column_from = (class_numbers[i] - 1) * class_count column_to = column_from + class_count training_column_end = column_to - test_instance train_label = data[0, column_from:training_column_end] train_data = data[1:, column_from:training_column_end] test_label = data[0, training_column_end:column_to] test_data = data[1:, training_column_end:column_to] if (count == 1): train_label_final = train_label test_label_final = test_label train_data_final = train_data test_data_final = test_data count = 0 else: train_label_final = np.hstack((train_label_final, train_label)) test_label_final = np.hstack((test_label_final, test_label)) train_data_final = np.hstack((train_data_final, train_data)) test_data_final = np.hstack((test_data_final, test_data)) train_data_final_t = train_data_final.transpose() test_data_final_t = test_data_final.transpose() outfile(train_data_final, test_data_final, train_label_final, test_label_final) clf = NearestCentroid() clf.fit(train_data_final_t, train_label_final) predictions = clf.predict(test_data_final_t) print("Test set predictions:\n{}".format(clf.predict(test_data_final_t))) print("Test set accuracy: {:.2f}".format( clf.score(test_data_final_t, test_label_final)))
def nearest_centroid(input_file, Output):
    lvltrace.lvltrace("LVLEntree dans nearest_centroid")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    clf = NearestCentroid()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print("#########################################################################################################\n")
    print("Nearest Centroid Classifier ")
    print("classification accuracy:", metrics.accuracy_score(y, y_pred))
    print("precision:", metrics.precision_score(y, y_pred))
    print("recall:", metrics.recall_score(y, y_pred))
    print("f1 score:", metrics.f1_score(y, y_pred))
    print("\n")
    print("#########################################################################################################\n")
    results = Output + "Nearest_Centroid_metrics.txt"
    out = open(results, "w")
    out.write("Nearest Centroid Classifier estimator accuracy\n")
    out.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y, y_pred))
    out.write("Precision Score: %f\n" % metrics.precision_score(y, y_pred))
    out.write("Recall Score: %f\n" % metrics.recall_score(y, y_pred))
    out.write("F1 Score: %f\n" % metrics.f1_score(y, y_pred))
    out.write("\n")
    out.write("True Value, Predicted Value, Iteration\n")
    for n in range(len(y)):
        out.write("%f,%f,%i\n" % (y[n], y_pred[n], (n + 1)))
    out.close()
    title = "Nearest Centroid Classifier"
    save = Output + "Nearest_Centroid_Classifier_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans nearest_centroid")
def rocchio_algorithm():
    clf = NearestCentroid()
    clf.fit(X_train, Y_train)
    pred_result = clf.predict(X_test)
    print(pred_result)
    print()
    print(Y_test)
    print('classification report: ')
    print(classification_report(Y_test, pred_result))
    print('f1 score')
    print(f1_score(Y_test, pred_result, average='macro'))
    print('accuracy score')
    print(accuracy_score(Y_test, pred_result))
    precision = precision_score(Y_test, pred_result, average=None)
    print("Precision : ")
    print(precision)
    recall = recall_score(Y_test, pred_result, average=None)
    print("Recall : ")
    print(recall)
def handwritingClassTest(self):
    hwLabels = []
    # load the training data set
    trainingFileList = listdir(Config.DATAS + 'KNN/digits/trainingDigits')
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = self.img2vector(
            Config.DATAS + 'KNN/digits/trainingDigits/%s' % fileNameStr)
    # train the classifier
    clf = NearestCentroid()
    clf.fit(trainingMat, hwLabels)
    testFileList = listdir(Config.DATAS + 'KNN/digits/testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = self.img2vector(Config.DATAS +
                                          'KNN/digits/testDigits/%s' % fileNameStr)
        # predict returns an array with one label per input row
        classifierResult = clf.predict(vectorUnderTest)[0]
        print("the classifier came back with: %d, the real answer is: %d" % (
            classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
def flw_dataset_classify(): f = Feature() paths, classes = loadFaceData('face.csv', nrows=82) X = [] y = [] for index, path in enumerate(paths): ar = f.getFeature(path) print(index, path) if ar.all() == 0: continue X.append(ar) y.append(classes[index]) X = np.array(X) y = np.array(y) print(X.shape) print(X) print(y) X_train_data, X_test_data, y_train_data, y_test_data = train_test_split( X, y, test_size=0.3, stratify=y) nearestCentroid = NearestCentroid() nearestCentroid.fit(X_train_data, y_train_data) predict_y = nearestCentroid.predict(X_test_data) acc = accuracy_score(y_test_data, predict_y) print(acc)
def classifyUsingKNNCentroid(trainX, trainY, testX, testY): print( '################# classifyUsingKNNCentroid() started ##################' ) start_time = time.time() clf = NearestCentroid() print('KNN Initialized') clf.fit(trainX, trainY) print('KNN Trained') predictedY = clf.predict(testX) print('KNN prediction completed') accuracy = accuracy_score(testY, predictedY) confusionMatrix = confusion_matrix(testY, predictedY) f1Score = f1_score(testY, predictedY, average='weighted') print('accuracy:', accuracy) print('confusionMatrix: ', confusionMatrix) print('f1Score: ', f1Score) print( '################# classifyUsingKNNCentroid() finished ##################' ) print("--- %s seconds ---" % (time.time() - start_time))
def exeML(mlmethod, xtr, ytr, xte, yte, islog=True, isfeatureselection=True):
    if islog:
        xtr = np.log(np.abs(xtr)).tolist()
        ytr = np.log(np.abs(ytr)).tolist()
        xte = np.log(np.abs(xte)).tolist()
        yte = np.log(np.abs(yte)).tolist()
    if isfeatureselection:
        estimator = SVR(kernel="linear")
        selector = RFE(estimator, n_features_to_select=100, step=1)
        selector = selector.fit(xtr, ytr)
        xtr = np.array(xtr)[:, selector.support_].tolist()
        xte = np.array(xte)[:, selector.support_].tolist()
    np.random.seed(1000)
    if mlmethod == "SVM":
        clf = svm.SVR(kernel='poly')
    elif mlmethod == "NeaNei":
        clf = NearestCentroid()
    elif mlmethod == "dtree":
        clf = tree.DecisionTreeClassifier()
    elif mlmethod == "lda":
        clf = lda(solver="svd")
    predval = []
    clf.fit(xtr, ytr)
    for i in range(len(xte)):
        # predict expects a 2D array, so wrap the single sample in a list
        predval.append(float(clf.predict([xte[i]])[0]))
    return predval
def r3_get_r2(r3list, label, app_statedict: dict, psdict: dict): print('Using history data to extract r2 belonging to r3') df = pd.DataFrame(columns=list(psdict.keys())) r3combi = {} # generating dataframe for key, ps in psdict.items(): app_state = app_statedict[key] state_num = len(app_state) clf = NearestCentroid() clf.fit(np.append([0], np.array([i.center_value for i in app_state])).reshape(-1, 1), np.array(range(state_num + 1))) df[key] = clf.predict(ps.values.reshape(-1, 1)) for idn, r3 in enumerate(r3list): idx = label == idn tempt = df.iloc[idx] combination = set([tuple(i) for i in list(tempt.values)]) r3combi[idn] = combination r2list = [] state_count = [] for r2row in combination: app_state_tuple = [] for kk, key in enumerate(df.columns): if r2row[kk] > 0: app_state_tuple.append(app_statedict[key][r2row[kk] - 1]) if app_state_tuple != []: r2list.append(State_r2(tuple(app_state_tuple))) state_count.append(np.count_nonzero((tempt == np.array(r2row)).all(1))) else: r2list.append(State_r2(None)) state_count.append(np.count_nonzero((tempt == np.array(r2row)).all(1))) r3.set_state_r2_list(r2list, False) r3.statecount = state_count if (len(r2list) != len(state_count)): print() print('r2 extracting finished')
def nearest_centroid(input_file, Output, test_size):
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print(X_train.shape, X_test.shape)
    clf = NearestCentroid()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Nearest Centroid Classifier ")
    print("classification accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("precision:", metrics.precision_score(y_test, y_pred))
    print("recall:", metrics.recall_score(y_test, y_pred))
    print("f1 score:", metrics.f1_score(y_test, y_pred))
    print("\n")
    results = Output + "Nearest_Centroid_metrics_test.txt"
    out = open(results, "w")
    out.write("Nearest Centroid Classifier estimator accuracy\n")
    out.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
    out.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
    out.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
    out.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
    out.write("\n")
    out.write("True Value, Predicted Value, Iteration\n")
    for n in range(len(y_test)):
        out.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n + 1)))
    out.close()
    title = "Nearest Centroid %f" % test_size
    save = Output + "Nearest_Centroid_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans nearest_centroid")
def vote_scheme(df_peaks):
    # Apply a voting scheme using nearest centroids to see if peaks in the
    # other axes are near the one found for X. If a peak gets 3 votes then it
    # gives higher confidence that the motion was genuine
    n_peaks = [len(df_peaks[col].dropna()) for col in df_peaks.columns]

    # Use the direction with the greatest number of peaks as the base;
    # the other two directions vote on which peaks they have matching.
    # If they all have the same number of peaks, default to the X-axis
    if len(set(n_peaks)) == 1:
        base_dir = df_peaks.iloc[:, 0]
        voting_dirs = df_peaks.iloc[:, 1:]
    else:
        lower_n_peak = [peak != max(n_peaks) for peak in n_peaks]
        base_dir = df_peaks.X_filt_hp
        voting_dirs = df_peaks.loc[:, lower_n_peak]

    X = np.array(base_dir.values).reshape(-1, 1)
    y = np.array(base_dir.index.values)
    clf = NearestCentroid()
    clf.fit(X, y)

    total_votes = np.ones(len(base_dir))
    for col in voting_dirs:
        votes = clf.predict(
            np.array(voting_dirs[col].dropna().values).reshape(-1, 1))
        total_votes[votes] += 1
    peaks = (total_votes == len(df_peaks.columns))
    #print(peaks, votes)
    return peaks
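# How the voting trick above works (a toy sketch with illustrative numbers):
# fitting NearestCentroid with one sample per "class" makes each base peak its
# own centroid, so predict() returns the index of the nearest base peak for
# every vote.
import numpy as np
from sklearn.neighbors import NearestCentroid

base_peaks = np.array([10, 50, 90]).reshape(-1, 1)  # sample indices of the base peaks
labels = np.arange(len(base_peaks))                 # one label per peak
clf = NearestCentroid().fit(base_peaks, labels)
print(clf.predict(np.array([[12], [48]])))          # -> [0 1]: nearest base peak per vote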
def kNCN(x, Y, newData):
    global model_kncn, modelCreated_kncn, predictBuf_kncn, pbDetected_kncn
    if not modelCreated_kncn:
        print('Training Initiated. . .')
        feature = np.array(x, dtype=np.float32)
        # np.int was removed from NumPy; use the builtin int instead
        label = np.array(Y, dtype=int)
        model_kncn = NearestCentroid(metric='euclidean', shrink_threshold=None)
        model_kncn.fit(feature, label)
        modelCreated_kncn = True
        print('Training Complete')
    else:
        predicted = model_kncn.predict(newData)
        predictBuf_kncn = np.array(predicted, dtype=int)
        for i in range(len(predictBuf_kncn)):
            if predictBuf_kncn[i] == 0:
                for j in newData:
                    print('>>>Cost:', j, '\n>>>Prediction: Prolonged')
                pbDetected_kncn += 1
                print('Prolonged detection number', pbDetected_kncn)
            elif predictBuf_kncn[i] == 1:
                for j in newData:
                    print('>>>Cost:', j, '\n>>>Prediction: Right')
            elif predictBuf_kncn[i] == 2:
                for j in newData:
                    print('>>>Cost:', j, '\n>>>Prediction: Left')
            else:
                print('>>>hmmm...')
def predictor(final):
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neighbors import NearestCentroid
    from sklearn import svm
    from sklearn.model_selection import cross_val_score

    KNN = KNeighborsClassifier()
    cen = NearestCentroid()
    SVM = svm.SVC()
    TrainX = final[1]
    TrainY = final[2]
    TestX = final[3]
    KNN.fit(TrainX, TrainY)
    cen.fit(TrainX, TrainY)
    SVM.fit(TrainX, TrainY)
    abc = []
    # print("The predicted values using KNN are", KNN.predict(TestX))
    abc.append(KNN.predict(TestX))
    # print("The predicted values using Centroid are", cen.predict(TestX))
    abc.append(cen.predict(TestX))
    # print("The predicted values using SVM are", SVM.predict(TestX))
    abc.append(SVM.predict(TestX))
    return abc
def scut_fbp_test(): f = Feature() # af1and5 0.890287769784 paths, classes = loadFaceData( './dataset/af1and5.csv', nrows=100) # './dataset/all(round_score).csv' for full class X = [] y = [] for index, path in enumerate(paths): ar = f.getFeature(path) print(index, path) if ar.all() == 0: continue X.append(ar) y.append(round(classes[index])) X = np.array(X) y = np.array(y) print(X.shape) print(X) print(y) X_train_data, X_test_data, y_train_data, y_test_data = train_test_split( X, y, test_size=0.3, stratify=y) nearestCentroid = NearestCentroid() nearestCentroid.fit(X_train_data, y_train_data) predict_y = nearestCentroid.predict(X_test_data) acc = accuracy_score(y_test_data, predict_y) print(acc)
def sk_nearest_neighbour(X_train, y_train, X_test, y_test):
    """
    Wrapper over sklearn's NearestCentroid (despite the function name,
    this is the nearest-centroid classifier, not k-NN).
    """
    clf = NearestCentroid()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    c = np.sum(y_pred == y_test)
    accuracy = c * 100.0 / len(y_test)
    return accuracy, y_pred
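# The manual accuracy above matches the estimator's built-in score method
# (mean accuracy, inherited from ClassifierMixin); a quick sketch with toy data:
import numpy as np
from sklearn.neighbors import NearestCentroid

X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([0, 0, 1, 1])
clf = NearestCentroid().fit(X, y)
assert clf.score(X, y) == np.mean(clf.predict(X) == y)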
def run_nearest_neighbour(feature, label): print ('nearest neighbour begin...\n') x_train, x_test, y_train, y_test = tts(feature, label, test_size=0.2) clf = NearestCentroid() clf.fit(x_train, y_train) preds = clf.predict(x_test) run_result(y_test, preds) print ('...nearest neighbour complete\n')
def knn_classify0(training_set, training_labels, test_set, test_labels, num_neighbors): clf = NearestCentroid(metric='euclidean') clf.fit(training_set, training_labels) input_test_predictions = clf.predict(test_set) test_result = np.sum( input_test_predictions == test_labels) * 100.0 / float( len(test_labels)) # type: ndarray return 0.0, test_result
def ml_algo(inp): df = pd.read_csv("data/final_preprocess.csv") X = np.array(df.drop(['Result'], axis=1)) y = np.array(df['Result']) X, y = shuffle(X, y, random_state=1) X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.2) model_centroid = NearestCentroid().fit(X_train, y_train) model_knn = KNeighborsClassifier(25).fit(X_train, y_train) model_svm = SVC().fit(X_train, y_train) model_lr = LinearRegression().fit(X_train, y_train) model_nb = BernoulliNB().fit(X_train, y_train) # criterion-> gini or entropy; splitter-> best or random; max_depth-> any integer value or None; # min_samples_split-> min no. of samples reqd. to split an internal node; # min_samples_leaf -> The minimum number of samples required to be at a leaf node. # min_impurity_split -> It defines the threshold for early stopping tree growth. model_dtree = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5).fit( X_train, y_train) # print ("[1] ACCURACY OF DIFFERENT MODELS ",'\n___________________') accu_centroid = model_centroid.score(X_test, y_test) # print ("NearestCentroid -> ", accu_centroid) accu_knn = model_knn.score(X_test, y_test) # print ("Knn -> ",accu_knn) accu_svm = model_svm.score(X_test, y_test) # print ("SVM -> ", accu_svm,) accu_lr = model_lr.score(X_test, y_test) # print ("Linear Regr -> ", accu_lr) accu_nb = model_nb.score(X_test, y_test) # print ("Naive Bayes -> ", accu_nb) accu_dtree = model_dtree.score(X_test, y_test) # print ("Decission Tree -> ", accu_dtree, "\n") result_centroid = model_centroid.predict(inp) result_knn = model_knn.predict(inp) result_svm = model_svm.predict(inp) result_lr = model_lr.predict(inp) result_nb = model_nb.predict(inp) result_dtree = model_dtree.predict(inp) # disease-name, description, [list of step to be taken], [list of to whom we can contact] # print ("[2] PREDICTION ",'\n___________________') # print ("NearestCentroid -> ", result_centroid) # print ("knn -> ", result_centroid) # print ("svm -> ", result_svm) # print ("LinearReg -> ", result_lr) # print ("Naive Bayes -> ", result_nb) # print ("Decission Tree -> ", result_dtree) # return map_disease[str(result_knn[0])] return result_knn[0]
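# Caveat for the model comparison above (an illustrative sketch, not from the
# original): LinearRegression.score returns the R^2 of the regression fit, not
# classification accuracy, so accu_lr is not directly comparable to the
# classifiers' scores. Thresholding the regression output gives an accuracy:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

X = np.array([[0], [1], [2], [3]])
y = np.array([0, 0, 1, 1])
lr = LinearRegression().fit(X, y)
print(lr.score(X, y))                                      # R^2, not accuracy
print(accuracy_score(y, (lr.predict(X) > 0.5).astype(int)))  # thresholded accuracy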
def _clustering(self, targetgame, games):
    '''
    Find similar games with clustering
    TODO
    '''
    preparegames = list(map(lambda x: [i[1] for i in x.data], games))
    preparegame = list(map(lambda x: x[1], targetgame.data))
    labels = list(range(len(games)))
    clf = NearestCentroid()
    clf.fit(preparegames, labels)
    # predict expects a 2D array: one row per sample
    print(clf.predict([preparegame]))
def nearestCentroid(self, x_train, y_train, x_test, y_test): # Test with Nearest Centroid clf = None clf = NearestCentroid(metric='euclidean') # Train created model clf.fit(x_train, y_train) # Predict on test data prediction = None prediction = clf.predict(x_test) # Log results self.logResults(y_test, prediction, kn=False)
def KNN_K1(Xtrain, Ytrain, Xtest, err, Name):
    from sklearn.neighbors import NearestCentroid
    clf = NearestCentroid()
    clf.fit(Xtrain, Ytrain)
    a = clf.predict(Xtest)
    result = pd.read_csv("./upload" + ".csv", sep=',', delimiter=None)
    result['proba'] = a * (1.0 - err[0]) + (1 - a) * err[1]
    result.to_csv(Name[0] + str(Name[1]) + ".csv",
                  sep=',',
                  encoding='utf-8',
                  index=False)
def knn_scargc(X, y, Ut): '''clf = KNeighborsClassifier(n_neighbors=1).fit(X, y) predicted_label = clf.predict(Ut.reshape(1, -1))''' best_distance, ind = NearestNeighbors(n_neighbors=1).fit(X).kneighbors( Ut.reshape(1, -1)) nearest = X[ind] clf = NearestCentroid(metric='euclidean').fit(X, y) predicted_label = clf.predict(Ut.reshape(1, -1)) #print(clf.centroids_) #exemplo [[ 0.25940611 -0.02868181] [ 5.450457 5.40674248]] #print("nearest scikit",nearest[0][0]) #print("predicted scikit",predicted_label) return predicted_label, best_distance, nearest[0]
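# The commented-out clf.centroids_ above is the fitted per-class centroid
# matrix with shape (n_classes, n_features); a standalone sketch:
import numpy as np
from sklearn.neighbors import NearestCentroid

X = np.array([[0.2, -0.1], [0.3, 0.0], [5.4, 5.3], [5.5, 5.5]])
y = np.array([0, 0, 1, 1])
clf = NearestCentroid(metric='euclidean').fit(X, y)
print(clf.centroids_)  # one row per class: the class means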
class centroid():
    def __init__(self, bm):
        self.input = File[bm]
        self.clf = NearestCentroid()
        self.x = []  # store the properties of the object
        self.y = []  # store the kind of class
        self.readData(bm)
        print('centroid Fitting...')
        self.clf.fit(self.x, self.y)
        print('Done!')

    def readData(self, bm):
        '''
        readData: get the data in the source data
        '''
        read = open(self.input, 'r')
        for line in read.readlines():
            one = line.strip('\n').split(' ')
            tmp = list()
            for i in range(N[bm]):
                tmp.append(float(one[i]))
            self.x.append(tmp)
            self.y.append(int(one[N[bm]]))
        read.close()
        self.x = np.array(self.x)
        self.y = np.array(self.y)

    def doIt(self):
        accuracy = 0
        for i in range(len(self.y)):
            # predict takes a 2D array, so wrap the single sample in a list
            if self.y[i] == self.clf.predict([self.x[i]])[0]:
                accuracy += 1
        print(accuracy * 1.0 / len(self.y))
        return accuracy * 1.0 / len(self.y)

    def predict(self, x):
        return self.clf.predict(x)
def predictor(TrainX_F, TrainY_F, TestX): cen = NearestCentroid() SVM = svm.SVC() regr = LinearRegression() cen.fit(TrainX_F, TrainY_F) SVM.fit(TrainX_F, TrainY_F) regr.fit(TrainX_F, TrainY_F) print("Centroid Predicted Labels: ", end='') print(cen.predict(TestX)) print("SVM Predicted Labels: ", end='') print(SVM.predict(TestX)) print("LR Predicted Labels: ", end='') print(regr.predict(TestX))
def itemB():
    train_dataset = load_nebulosa_train()
    # remove missing values
    # print(train_dataset)
    train_dataset = train_dataset[~np.isnan(train_dataset).any(axis=1)]
    train_dataset = train_dataset[:, 2:]
    train_target = train_dataset[:, -1]
    train_dataset = train_dataset[:, :-2]
    # train_dataset = normalize(train_dataset, axis=0)

    test_dataset = load_nebulosa_test()
    # remove missing values
    test_dataset = test_dataset[~np.isnan(test_dataset).any(axis=1)]
    test_dataset = test_dataset[:, 2:]
    test_target = test_dataset[:, -1]
    test_dataset = test_dataset[:, :-2]
    # print(test_dataset)
    # test_dataset = normalize(test_dataset, axis=1)
    # print(test_dataset)

    kbest = SelectKBest(f_classif, k=3).fit(train_dataset, train_target)
    train_dataset = kbest.transform(train_dataset)
    test_dataset = kbest.transform(test_dataset)
    # print(train_dataset)

    n_train_samples = train_dataset.shape[0]
    n_train_features = train_dataset.shape[1]
    # print("Nebulosa Train dataset: %d samples (%d features)" % (n_train_samples, n_train_features))
    n_test_samples = test_dataset.shape[0]
    n_test_features = test_dataset.shape[1]
    # print("Nebulosa Test dataset: %d samples (%d features)" % (n_test_samples, n_test_features))

    nn = KNeighborsClassifier(n_neighbors=1)
    nn.fit(train_dataset, train_target)
    nn_target_pred_test = nn.predict(test_dataset)
    nn_accuracy_test = accuracy_score(test_target, nn_target_pred_test)
    print("NN: accuracy (test): %.2f" % nn_accuracy_test)

    nc = NearestCentroid(metric="euclidean")
    nc.fit(train_dataset, train_target)
    nc_target_pred_test = nc.predict(test_dataset)
    nc_accuracy_test = accuracy_score(test_target, nc_target_pred_test)
    print("Rocchio: accuracy (test): %.2f" % nc_accuracy_test)
class NearestCentroidImpl(): def __init__(self, metric='euclidean', shrink_threshold=None): self._hyperparams = { 'metric': metric, 'shrink_threshold': shrink_threshold } def fit(self, X, y=None): self._sklearn_model = SKLModel(**self._hyperparams) if (y is not None): self._sklearn_model.fit(X, y) else: self._sklearn_model.fit(X) return self def predict(self, X): return self._sklearn_model.predict(X)
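# A short usage sketch for the wrapper above (the toy data is illustrative;
# SKLModel is assumed to be an alias for sklearn.neighbors.NearestCentroid):
import numpy as np

X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
model = NearestCentroidImpl(metric='euclidean').fit(X, y)
print(model.predict([[0.5, 0.5]]))  # -> [2]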
class Knn(): def __init__(self, method, n_neighbors, weights, radius): if method == 'knn_class': self.clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights) elif method == 'knn_rad': self.clf = RadiusNeighborsClassifier(radius=radius) elif method == 'knn_cent': self.clf = NearestCentroid() def train_model(self, train): self.clf.fit(train[0], train[1]) def predict(self, data): return self.clf.predict(data) def test_model(self, test): return self.clf.score(test[0], test[1])
def vote_scheme(df_peaks):
    # Apply a voting scheme using nearest centroids to see if peaks in the
    # other axes are near the one found for X. If a peak gets 3 votes then it
    # gives higher confidence that the motion was genuine
    n_peaks = [len(df_peaks[col].dropna()) for col in df_peaks.columns]

    # Use the direction with the greatest number of peaks as the base;
    # the other two directions vote on which peaks they have matching.
    # If they all have the same number of peaks, default to the X-axis
    if len(set(n_peaks)) == 1:
        base_dir = df_peaks.iloc[:, 0]
        voting_dirs = df_peaks.iloc[:, 1:]
    else:
        highest_n_peak = [peak == max(n_peaks) for peak in n_peaks]
        lower_n_peak = [peak != max(n_peaks) for peak in n_peaks]
        base_dir = df_peaks.iloc[:, highest_n_peak]
        voting_dirs = df_peaks.iloc[:, lower_n_peak].dropna()

    if len(base_dir) == 0 or len(voting_dirs) == 0:
        df_peaks_voted = pd.DataFrame()
        return df_peaks_voted

    # Fit a nearest-centroid model so both directions can vote for the
    # closest base peak; allow for a 5% max fluctuation
    X = np.array(base_dir.values).reshape(-1, 1)
    y = np.array(base_dir.index.values)
    clf = NearestCentroid()
    clf.fit(X, y)

    total_votes = np.ones(len(base_dir))
    for col in voting_dirs:
        votes = clf.predict(np.array(voting_dirs[col].values).reshape(-1, 1))
        total_votes[votes] += 1
    peaks_votes = (total_votes == len(df_peaks.columns))

    # Check how much each row differs - max 5%
    df_base = base_dir[peaks_votes]
    df_base.reset_index(drop=True, inplace=True)
    df_peaks_temp = pd.concat([df_base, voting_dirs], axis=1)
    df_peaks_temp['10pt_diff'] = df_peaks_temp['X_filt_bp'].sub(
        df_peaks_temp['Y_filt_bp']).abs() < 10
    df_peaks_voted = df_peaks_temp[df_peaks_temp['10pt_diff']]
    df_peaks_voted.drop('10pt_diff', axis=1, inplace=True)
    return df_peaks_voted
def itemA():
    train_dataset = load_nebulosa_train()
    train_target = train_dataset[:, -1]
    train_dataset = train_dataset[:, :-1]
    nam_target = np.where(np.isnan(train_target))
    train_target = np.delete(train_target, nam_target)
    train_dataset = np.delete(train_dataset, nam_target, 0)
    train_dataset = np.nan_to_num(train_dataset)

    test_dataset = load_nebulosa_test()
    test_target = test_dataset[:, -1]
    test_dataset = test_dataset[:, :-1]
    nam_target = np.where(np.isnan(test_target))
    test_target = np.delete(test_target, nam_target)
    test_dataset = np.delete(test_dataset, nam_target, 0)
    test_dataset = np.nan_to_num(test_dataset)

    n_train_samples = train_dataset.shape[0]
    n_train_features = train_dataset.shape[1]
    print("Nebulosa Train dataset: %d samples (%d features)" %
          (n_train_samples, n_train_features))
    n_test_samples = test_dataset.shape[0]
    n_test_features = test_dataset.shape[1]
    print("Nebulosa Test dataset: %d samples (%d features)" %
          (n_test_samples, n_test_features))

    nn = KNeighborsClassifier(n_neighbors=1)
    nn.fit(train_dataset, train_target)
    nn_target_pred_test = nn.predict(test_dataset)
    nn_accuracy_test = accuracy_score(test_target, nn_target_pred_test)
    print("NN: accuracy (test): %.2f" % nn_accuracy_test)

    # train_target[18] = 1
    nc = NearestCentroid(metric="euclidean")
    nc.fit(train_dataset, train_target)
    nc_target_pred_test = nc.predict(test_dataset)
    # print(nc_target_pred_test)
    nc_accuracy_test = accuracy_score(test_target, nc_target_pred_test)
    print("Rocchio: accuracy (test): %.2f" % nc_accuracy_test)
def predict_with_nearestcentroid(train_features, test_features, train_labels,
                                 test_labels, metric='cosine'):
    """Use a nearest centroid classifier to evaluate the results.

    :train_features, test_features: the feature vectors of the train and test sets
    :train_labels, test_labels: the labels of the train and test sets
    :metric: the metric used to compute distances
    :return: CRR and the center vector of each class
    :rtype: tuple
    """
    clf = NearestCentroid(metric=metric)
    clf.fit(train_features, train_labels)
    predicted = clf.predict(test_features)
    return cal_crr(test_labels, predicted), clf.centroids_
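# A direct sketch of the cosine-metric variant used above. Note that older
# scikit-learn releases accept any pairwise metric here, while recent ones
# restrict NearestCentroid to 'euclidean' and 'manhattan':
import numpy as np
from sklearn.neighbors import NearestCentroid

train_features = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
train_labels = np.array([0, 0, 1, 1])
clf = NearestCentroid(metric='cosine')
clf.fit(train_features, train_labels)
print(clf.predict([[0.8, 0.2]]))  # -> [0]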
def main(CV=False, PLOT=True): """Entry Point. Parameters ---------- CV: bool Cross-validation flag PLOT: bool Plotting flag """ _data = fetch_data() if CV: method, params = cross_validate(_data) else: method = 'l2' params = {'metric': chisquare} data = normalise(_data, method) X_train, y_train = data['train'] X_test, y_test = data['test'] classifier = NearestCentroid(**params) classifier.fit(X_train, y_train) print('ACCURACY: ', classifier.score(X_test, y_test)) if PLOT: y_hat = classifier.predict(X_test) cnf_matrix = confusion_matrix(y_test, y_hat) plot_confusion_matrix(cnf_matrix, classes=list(set(y_test)), title='Nearest Centroid\nConfusion Matrix', cmap=plt.cm.Blues) plt.savefig('data/out/nc_cnf_matrix.pdf', format='pdf', dpi=300, transparent=True)
def text_classify(X_train, X_test, y_train, y_test):
    """
    machine learning classifiers
    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :return:
    """
    # print('=' * 100)
    # print('start launching MLP Classifier......')
    # mlp = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(50, 30, 20, 20, 20, 30, 50), random_state=1)
    # mlp.fit(X_train, y_train)
    # print('finish launching MLP Classifier, the test accuracy is {:.5%}'.format(mlp.score(X_test, y_test)))

    print('=' * 100)
    print('start launching SVM Classifier......')
    svc = svm.SVC(decision_function_shape='ovo')
    svc.fit(X_train, y_train)
    print('finish launching SVM Classifier, the test accuracy is {:.5%}'.format(
        accuracy_score(svc.predict(X_test), y_test)))

    print('=' * 100)
    print('start launching Decision Tree Classifier......')
    dtree = tree.DecisionTreeClassifier()
    dtree.fit(X_train, y_train)
    print('finish launching Decision Tree Classifier, the test accuracy is {:.5%}'.format(
        accuracy_score(dtree.predict(X_test), y_test)))

    print('=' * 100)
    print('start launching Nearest Centroid Classifier......')
    nc = NearestCentroid()
    nc.fit(X_train, y_train)
    print('finish launching Nearest Centroid Classifier, the test accuracy is {:.5%}'.format(
        accuracy_score(nc.predict(X_test), y_test)))

    print('=' * 100)
    print('start launching Random Forest Classifier......')
    rf = RandomForestClassifier(n_estimators=20)
    rf.fit(X_train, y_train)
    print('finish launching Random Forest Classifier, the test accuracy is {:.5%}'.format(
        accuracy_score(rf.predict(X_test), y_test)))
# Takes a list, creates a csv file
def submitFile(x, pre):
    f = open(pre + '_submission.csv', 'w')
    for val in x:
        f.write(str(val) + ',\r')
    f.close()


# ==============================================================================
#    Nearest centroid classifier
# ==============================================================================
from sklearn.neighbors import NearestCentroid

ncc = NearestCentroid()
ncc.fit(features_to_train, targets_to_train)
predicted_targets = ncc.predict(features_to_test)

# Just print out the precision and f1 scores
print('precision: %0.5f' % metrics.precision_score(rf_benchmark_targets, predicted_targets))
print('f1 score: %0.5f' % metrics.f1_score(rf_benchmark_targets, predicted_targets))

# The following scores are used for classification models
# (zero_one_score / zero_one were renamed in later scikit-learn releases)
print('accuracy: %0.5f' % metrics.accuracy_score(rf_benchmark_targets, predicted_targets))
print('loss: %d' % metrics.zero_one_loss(rf_benchmark_targets, predicted_targets, normalize=False))

# ==============================================================================
#    Multinomial naive bayes
# ==============================================================================
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
from sklearn.neighbors import NearestCentroid
import numpy as np

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
clf = NearestCentroid()
clf.fit(X, y)
print(clf.predict([[-0.8, -1]]))
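# The same estimator also implements the "nearest shrunken centroid" variant:
# shrink_threshold moves each class centroid toward the overall centroid and
# zeroes out small per-feature deviations, which acts as feature selection.
# A sketch on iris (the threshold values are illustrative):
from sklearn.datasets import load_iris
from sklearn.neighbors import NearestCentroid

X, y = load_iris(return_X_y=True)
for t in [None, 0.1, 0.5]:
    clf = NearestCentroid(shrink_threshold=t).fit(X, y)
    print(t, clf.score(X, y))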
class clusteringST:
    """
    Identification of sub-types for prediction
    """

    def __init__(self, verbose=True):
        self.verbose = verbose

    def fit(self, net_data_low, nSubtypes=3, reshape_w=True):
        # net_data_low = net_data_low_main.copy()
        self.flag_2level = False
        self.nnet_cluster = net_data_low.shape[1]
        self.nSubtypes = nSubtypes
        # ind_low_scale = cls.get_ind_high2low(low_res_template,orig_template)
        # self.ind_low_scale = ind_low_scale
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        # net_data_low = transform_low_scale(ts_data,self.ind_low_scale)
        # self.net_data_low = net_data_low
        self.normalized_net_template = []
        for i in range(net_data_low.shape[1]):
            # average template
            if nSubtypes < 1:
                self.normalized_net_template.append(np.zeros_like(net_data_low[0, i, :]).astype(float))
            else:
                self.normalized_net_template.append(np.mean(net_data_low[:, i, :], axis=0))

            # identity matrix of the correlation between subjects
            # tmp_subj_identity = np.corrcoef(net_data_low[:,i,:])
            # ind_st = cls.hclustering(tmp_subj_identity,nSubtypes)

            # subjects X network_nodes
            ind_st = cls.hclustering(net_data_low[:, i, :] - self.normalized_net_template[-1], nSubtypes)
            # ind_st = cls.hclustering(net_data_low[:,i,:],nSubtypes)
            for j in range(nSubtypes):
                if j == 0:
                    st_templates_tmp = np.median(net_data_low[:, i, :][ind_st == j + 1, :], axis=0)[np.newaxis, ...]
                else:
                    st_templates_tmp = np.vstack(
                        (
                            st_templates_tmp,
                            np.median(net_data_low[:, i, :][ind_st == j + 1, :], axis=0)[np.newaxis, ...],
                        )
                    )

            # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
            if i == 0:
                self.st_templates = st_templates_tmp[np.newaxis, ...]
            else:
                self.st_templates = np.vstack((self.st_templates, st_templates_tmp[np.newaxis, ...]))
            del st_templates_tmp

        # calculate the weights for each subject
        self.W = self.compute_weights(net_data_low, self.st_templates)
        if reshape_w:
            return self.reshapeW(self.W)
        else:
            return self.W

    def norm_subjects(self, data, ref=[]):
        if len(data.shape) == 2:
            ref_avg_rmaps = ref.mean()
            avrg_rmaps = data.mean(1)
            scaling_factor = ref_avg_rmaps / avrg_rmaps
            return data * scaling_factor.reshape(-1, 1)
        else:
            ref_avg_rmaps = np.array(self.normalized_net_template).mean(1)
            avrg_rmaps = data.mean(2)
            scaling_factor = ref_avg_rmaps / avrg_rmaps
            print(ref_avg_rmaps.shape, avrg_rmaps.shape, scaling_factor.shape)
            return np.swapaxes(np.swapaxes(data, 0, 2) * np.swapaxes(scaling_factor, 0, 1), 0, 2)

    def robust_st(self, net_data_low, nSubtypes, n_iter=50):
        bs_cluster = []
        n = net_data_low.shape[0]
        stab_ = np.zeros((n, n)).astype(float)
        rs = ShuffleSplit(net_data_low.shape[0], n_iter=n_iter, test_size=0.05, random_state=1)
        for train, test in rs:
            # identity matrix of the correlation between subjects
            ind_st = cls.hclustering(net_data_low[train, :], nSubtypes)
            mat_ = (cls.ind2matrix(ind_st) > 0).astype(float)
            for ii in range(len(train)):
                stab_[train, train[ii]] += mat_[:, ii]
        stab_ = stab_ / n_iter
        ms = KMeans(nSubtypes)
        ind = ms.fit_predict(stab_)
        # row_clusters = linkage(stab_, method='ward')
        # ind = fcluster(row_clusters, nSubtypes, criterion='maxclust')
        return ind + 1, stab_

    def fit_robust(self, net_data_low, nSubtypes=3, reshape_w=True, stab_thereshold=0.5):
        self.flag_2level = False
        self.nnet_cluster = net_data_low.shape[1]
        self.nSubtypes = nSubtypes
        self.normalized_net_template = []
        for i in range(net_data_low.shape[1]):
            # average template
            self.normalized_net_template.append(np.mean(net_data_low[:, i, :], axis=0))
            # self.normalized_net_template.append(np.zeros_like(net_data_low[0,i,:]))

            # identity matrix of the correlation between subjects
            # ind_st = cls.hclustering(net_data_low[:,i,:],nSubtypes)
            ind_st, stab_ = self.robust_st(net_data_low[:, i, :] - self.normalized_net_template[-1], nSubtypes)

            for j in range(nSubtypes):
                mask_stable = (stab_[ind_st == j + 1, :].mean(0) > stab_thereshold)[ind_st == j + 1]
                if self.verbose:
                    print("Robust: new N ", mask_stable.sum(), " old N ", mask_stable.shape)
                data_ = net_data_low[ind_st == j + 1, i, :][mask_stable, :]
                if j == 0:
                    st_templates_tmp = np.median(data_, axis=0)[np.newaxis, ...]
                else:
                    st_templates_tmp = np.vstack((st_templates_tmp, np.median(data_, axis=0)[np.newaxis, ...]))

            # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
            if i == 0:
                self.st_templates = st_templates_tmp[np.newaxis, ...]
            else:
                self.st_templates = np.vstack((self.st_templates, st_templates_tmp[np.newaxis, ...]))
            del st_templates_tmp

        # calculate the weights for each subject
        self.W = self.compute_weights(net_data_low, self.st_templates)
        if reshape_w:
            return self.reshapeW(self.W)
        else:
            return self.W

    def fit_robust_network(self, net_data_low, nSubtypes=3, reshape_w=True, stab_thereshold=0.5):
        self.flag_2level = False
        self.nnet_cluster = 1
        self.nSubtypes = nSubtypes
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        self.normalized_net_template = []
        # average template
        self.normalized_net_template.append(np.mean(net_data_low[:, :], axis=0))
        # self.normalized_net_template.append(np.zeros_like(net_data_low[0,:]))

        # identity matrix of the correlation between subjects
        ind_st, stab_ = self.robust_st(net_data_low - self.normalized_net_template[-1], nSubtypes)
        for j in range(nSubtypes):
            mask_stable = (stab_[ind_st == j + 1, :].mean(0) > stab_thereshold)[ind_st == j + 1]
            if self.verbose:
                print("Robust: new N ", mask_stable.sum(), " old N ", mask_stable.shape)
            data_ = net_data_low[ind_st == j + 1, :][mask_stable, :]
            if j == 0:
                st_templates_tmp = np.median(data_, axis=0)[np.newaxis, ...]
            else:
                st_templates_tmp = np.vstack((st_templates_tmp, np.median(data_, axis=0)[np.newaxis, ...]))

        # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
        self.st_templates = st_templates_tmp[np.newaxis, ...]
        del st_templates_tmp

        # calculate the weights for each subject
        self.W = self.compute_weights(net_data_low, self.st_templates)
        if reshape_w:
            return self.reshapeW(self.W)
        else:
            return self.W

    def fit_network(self, net_data_low, nSubtypes=3, reshape_w=True):
        self.flag_2level = False
        self.nnet_cluster = 1
        self.nSubtypes = nSubtypes
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        self.normalized_net_template = []
        # average template
        self.normalized_net_template.append(np.mean(net_data_low, axis=0))
        # self.normalized_net_template.append(np.zeros_like(net_data_low[0,:]))

        # identity matrix of the correlation between subjects
        ind_st = cls.hclustering(net_data_low - self.normalized_net_template[-1], nSubtypes)
        for j in range(nSubtypes):
            data_tmp = np.median(net_data_low[ind_st == j + 1, :] - self.normalized_net_template[-1], axis=0)[
                np.newaxis, ...
            ]
            if j == 0:
                st_templates_tmp = data_tmp
            else:
                st_templates_tmp = np.vstack((st_templates_tmp, data_tmp))

        # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
        self.st_templates = st_templates_tmp[np.newaxis, ...]
        del st_templates_tmp

        # calculate the weights for each subject
        self.W = self.compute_weights(net_data_low, self.st_templates)
        if reshape_w:
            return self.reshapeW(self.W)
        else:
            return self.W

    def fit_2level(self, net_data_low_l1, net_data_low_l2, nSubtypes_l1=5, nSubtypes_l2=2, reshape_w=True):
        self.flag_2level = True
        self.nnet_cluster = net_data_low_l1.shape[1]
        self.nSubtypes = nSubtypes_l1 * nSubtypes_l2
        self.nSubtypes_l1 = nSubtypes_l1
        self.nSubtypes_l2 = nSubtypes_l2
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        self.net_data_low = net_data_low_l1
        self.net_data_low_l2 = net_data_low_l2

        ####
        # LEVEL 1
        ####
        # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
        st_templates = []
        for i in range(net_data_low_l1.shape[1]):
            # identity matrix of the correlation between subjects
            ind_st = cls.hclustering(net_data_low_l1[:, i, :], nSubtypes_l1)
            for j in range(nSubtypes_l1):
                if j == 0:
                    st_templates_tmp = net_data_low_l1[:, i, :][ind_st == j + 1, :].mean(axis=0)[np.newaxis, ...]
                else:
                    st_templates_tmp = np.vstack(
                        (st_templates_tmp, net_data_low_l1[:, i, :][ind_st == j + 1, :].mean(axis=0)[np.newaxis, ...])
                    )
            if i == 0:
                st_templates = st_templates_tmp[np.newaxis, ...]
            else:
                st_templates = np.vstack((st_templates, st_templates_tmp[np.newaxis, ...]))

        self.st_templates_l1 = st_templates
        # calculate the weights for each subject
        # W --> Dimensions: nSubjects, nNetwork_low, nSubtypes
        net_data_low_l2_tmp = np.vstack((net_data_low_l1, net_data_low_l2))
        self.W_l1 = self.compute_weights(net_data_low_l2_tmp, self.st_templates_l1)

        ####
        # LEVEL 2
        ####
        # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
        st_templates = []
        # st_templates = self.st_templates_l1.copy()
        # st_templates = st_templates[:,:,np.newaxis,:]
        for i in range(net_data_low_l2.shape[1]):
            # Iterate on all the Level1 subtypes (normal variability subtypes)
            for k in range(self.st_templates_l1.shape[1]):
                # Find the L1 subtype
                max_w = np.max(self.W_l1[:, i, :], axis=1)
                mask_selected_subj = self.W_l1[:, i, k] == max_w
                template2substract = self.st_templates_l1[i, k, :]
                if np.sum(mask_selected_subj) <= 3:
                    print("Less than 2 subjects for network: " + str(i) + " level1 ST: " + str(k))
                    for j in range(nSubtypes_l2):
                        if (k == 0) & (j == 0):
                            st_templates_tmp = self.st_templates_l1[i, k, :][np.newaxis, ...]
                        else:
                            st_templates_tmp = np.vstack(
                                (st_templates_tmp, self.st_templates_l1[i, k, :][np.newaxis, ...])
                            )
                else:
                    # identity matrix of the correlation between subjects
                    ind_st = cls.hclustering(
                        net_data_low_l2_tmp[:, i, :][mask_selected_subj, ...] - template2substract, nSubtypes_l2
                    )
                    # ind_st = cls.hclustering(net_data_low[:,i,:],nSubtypes)
                    if len(np.unique(ind_st)) < nSubtypes_l2:
                        print(
                            "Clustering generated fewer classes than requested, nsubjects: "
                            + str(len(ind_st))
                            + " network: "
                            + str(i)
                            + " level1 ST: "
                            + str(k)
                        )
                    # if (i==6) & (k==3):
                    #     print(ind_st)
                    for j in range(nSubtypes_l2):
                        if (k == 0) & (j == 0):
                            st_templates_tmp = (
                                net_data_low_l2_tmp[:, i, :][mask_selected_subj, ...][ind_st == j + 1, :]
                                - template2substract
                            ).mean(axis=0)[np.newaxis, ...]
                        else:
                            st_templates_tmp = np.vstack(
                                (
                                    st_templates_tmp,
                                    (
                                        net_data_low_l2_tmp[:, i, :][mask_selected_subj, ...][ind_st == j + 1, :]
                                        - template2substract
                                    ).mean(axis=0)[np.newaxis, ...],
                                )
                            )
            if i == 0:
                st_templates = st_templates_tmp[np.newaxis, ...]
            else:
                print(st_templates.shape, st_templates_tmp.shape)
                st_templates = np.vstack((st_templates, st_templates_tmp[np.newaxis, ...]))

        self.st_templates_l2 = st_templates
        # calculate the weights for each subject
        self.W_l2 = self.compute_weights(net_data_low_l2, self.st_templates_l2)
        if reshape_w:
            return self.reshapeW(self.W_l2)
        else:
            return self.W_l2

    def compute_weights_old(self, net_data_low, st_templates):
        # calculate the weights for each subject
        W = np.zeros((net_data_low.shape[0], st_templates.shape[0], st_templates.shape[1]))
        for i in range(net_data_low.shape[0]):
            for j in range(st_templates.shape[0]):
                for k in range(st_templates.shape[1]):
                    # Demean
                    average_template = np.median(self.net_data_low[:, j, :], axis=0)
                    # average_template = self.st_templates[j,:,:].mean(axis=0)
                    dm_map = net_data_low[i, j, :] - average_template
                    dm_map = preprocessing.scale(dm_map)
                    st_dm_map = st_templates[j, k, :] - average_template
                    W[i, j, k] = np.corrcoef(st_dm_map, dm_map)[-1, 0:-1]
        return W

    def compute_weights(self, net_data_low, st_templates=None, mask_part=None):
        # `st_templates == []` is ambiguous once an ndarray is passed in,
        # so use an explicit None/empty check instead
        if st_templates is None or len(st_templates) == 0:
            st_templates = self.st_templates
        # calculate the weights for each subject
        for j in range(st_templates.shape[0]):
            average_template = self.normalized_net_template[j]
            if len(net_data_low.shape) == 2:
                rmaps = net_data_low - average_template
            else:
                rmaps = net_data_low[:, j, :] - average_template
            st_rmap = st_templates[j, :, :] - average_template
            tmp_rmap = self.compute_w(rmaps, st_rmap, mask_part)
            if j == 0:
                W = np.zeros((net_data_low.shape[0], st_templates.shape[0], tmp_rmap.shape[1]))
            W[:, j, :] = tmp_rmap
        return np.nan_to_num(W)

    def compute_w_global(self, X, ref):
        range_ = 1
        if len(X.shape) == 3:
            # multiple networks
            for net in range(X.shape[1]):
                if len(ref.shape) > 2:
                    range_ = ref.shape[1]
                w_global = np.corrcoef(ref[net, ...], X[:, net, :])[range_:, 0:range_]
        else:
            # one network
            if len(ref.shape) > 1:
                range_ = ref.shape[0]
            w_global = np.corrcoef(ref, X)[range_:, 0:range_]
        return w_global

    def compute_w(self, X, ref, mask_part=None):
        if mask_part is not None and len(mask_part) > 0:
            # sub-weights based on the partition
            w_ = []
            list_id = np.unique(mask_part)
            for idx in np.delete(list_id, np.where(list_id == 0)):
                mask_ = mask_part == idx
                w_.append(self.compute_w_global(X[..., mask_], ref[..., mask_]))
            w_ = np.hstack(w_)
        else:
            # global mode, no sub-partition
            w_ = self.compute_w_global(X, ref)
        return w_

    def compute_weights_l2(self, net_data_low):
        corrected_ndl = net_data_low.copy()
        W_l1 = self.compute_weights(net_data_low, self.st_templates_l1)
        # calculate the weights for each subject
        for i in range(net_data_low.shape[1]):
            for k in range(self.st_templates_l1.shape[1]):
                # Find the L1 subtype
                max_w = np.max(W_l1[:, i, :], axis=1)
                mask_selected_subj = W_l1[:, i, k] == max_w
                corrected_ndl[mask_selected_subj, i, :] = (
                    corrected_ndl[mask_selected_subj, i, :] - self.st_templates_l1[i, k, :]
                )
        return self.compute_weights(corrected_ndl, self.st_templates_l2)

    def transform(self, net_data_low, mask_part=None, reshape_w=True):
        """
        Calculate the weights for each previously computed sub-type
        """
        # compute the low scale version of the data
        # net_data_low = transform_low_scale(ts_data,self.ind_low_scale)
        if self.flag_2level:
            # calculate the weights for each subject
            # W = self.compute_weights(net_data_low,self.st_templates_l2)
            W = self.compute_weights_l2(net_data_low)
        else:
            # calculate the weights for each subject
            W = self.compute_weights(net_data_low, self.st_templates, mask_part)
        if reshape_w:
            return self.reshapeW(W)
        else:
            return W

    def reshapeW(self, W):
        # reshape the matrix from [subjects, nSubtypes, weights] to [subjects, vector of weights]
        xw = W.reshape((W.shape[0], W.shape[1] * W.shape[2]))
        return xw

    def fit_dev(self, net_data, nnet_cluster="auto", nSubtypes=3):
        self.nnet_cluster = nnet_cluster
        self.nSubtypes = nSubtypes
        if nnet_cluster == "auto":
            # self.nnet_cluster = self.getClusters(net_data)
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo="meanshift")
        else:
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo="kmeans")
        # self.valid_cluster = self.clust_list
        # self.valid_net_idx = range(len(self.valid_cluster))
        for i in range(net_data.shape[0]):
            if i == 0:
                self.assign_net = self.assigneDist(net_data[i, :, :], self.valid_cluster, self.valid_net_idx)
            else:
                self.assign_net = np.vstack(
                    ((self.assign_net, self.assigneDist(net_data[i, :, :], self.valid_cluster, self.valid_net_idx)))
                )
        print("Size of the new data map: ", self.assign_net.shape)

        # group subjects by the networks that classify them together
        # compute the consensus clustering
        self.consensus = cls.hclustering(self.assign_net, self.nSubtypes)
        # save the centroids in a classifier
        self.clf_subtypes = NearestCentroid()
        self.clf_subtypes.fit(self.assign_net, self.consensus)
        self.consensus = self.clf_subtypes.predict(self.assign_net)
        # print("score: ", self.clf_subtypes.score(self.assign_net, self.consensus))
        return self.consensus
# range(2, 74) means it goes from col 2 to col 73
df_input_data = df_input[list(range(2, 74))].values
# test with a few good features as determined through PCA?
df_input_target = df_input[list(range(0, 1))].values
colors = numpy.random.rand(len(df_input_target))

# splitting the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_input_data, df_input_target.tolist())

# nearest centroid classifier
from sklearn.neighbors import NearestCentroid
knc = NearestCentroid()
knc.fit(X_train[:], numpy.ravel(y_train[:]))
predicted = knc.predict(X_test)
print(y_test[60:90], len(y_test[60:90]))
print(predicted[60:90], len(predicted[60:90]))
print(knc.classes_)

# Prediction Performance Measurement
matches = (predicted == [item for sublist in y_test for item in sublist])
print(matches.sum())
print(len(matches))
print(matches[10:50], len(matches[10:50]))
print("Accuracy : ", (matches.sum() / float(len(matches))))
class clusteringST:
    '''
    Identification of sub-types for prediction
    '''

    def getClusters(self, net_data):
        self.avg_bin_mat = np.zeros((net_data.shape[0], net_data.shape[0]))
        self.avg_n_clusters = 0
        self.clust_list = []
        for i in range(net_data.shape[2]):
            ms = MeanShift()
            ms.fit(net_data[:, :, i])
            self.clust_list.append(ms)
            labels = ms.labels_
            cluster_centers = ms.cluster_centers_
            n_clusters_ = len(np.unique(labels))
            #print(labels,cluster_centers.shape,n_clusters_)
            #bin_mat = np.zeros(avg_bin_mat.shape)
            bin_mat = cls.ind2matrix(labels + 1) > 0
            self.avg_bin_mat += bin_mat
            self.avg_n_clusters += n_clusters_
        self.avg_bin_mat /= net_data.shape[2]
        self.avg_n_clusters /= net_data.shape[2]
        return self.avg_n_clusters

    def getMeanClustering(self):
        return self.avg_bin_mat

    def get_match_network(self, net_data, ncluster, algo='kmeans'):
        '''
        net_data: 3d volume (subjects x vecnetwork x vecnetwork)
        ncluster: number of groups to partition the subjects
        algo: (default: kmeans) kmeans, meanshift.
        '''
        valid_net_idx = []
        valid_cluster = []
        self.avg_bin_mat = np.zeros((net_data.shape[0], net_data.shape[0]))
        self.avg_n_clusters = 0
        for i in range(net_data.shape[2]):
            # Compute a clustering for each network
            if algo == 'kmeans':
                clust = KMeans(init='k-means++', n_clusters=ncluster, n_init=10)
            else:
                clust = MeanShift()
            #t0 = time.time()
            clust.fit(net_data[:, :, i])
            #t_batch = time.time() - t0
            # Compute the stability matrix among networks
            bin_mat = cls.ind2matrix(clust.labels_ + 1) > 0
            self.avg_bin_mat += bin_mat
            self.avg_n_clusters += len(np.unique(clust.labels_))
            valid_cluster.append(clust)
            valid_net_idx.append(i)
        self.avg_bin_mat /= net_data.shape[2]
        self.avg_n_clusters /= net_data.shape[2]
        return valid_cluster, valid_net_idx

    def assigneSubtype(self, nets, valid_cluster, valid_net_idx):
        classes = []
        dist_centroid = np.array([])
        for i in range(len(valid_net_idx)):
            classes.append(valid_cluster[i].predict(nets[:, valid_net_idx[i]])[0])
            #points = np.vstack((nets[:,valid_net_idx[i]],valid_cluster[i].cluster_centers_))
            #dist_ = squareform(pdist(points, metric='euclidean'))[0,1:]
            #classes.append(np.argmin(dist_))
            points = np.vstack((nets[:, valid_net_idx[i]], valid_cluster[i].cluster_centers_))
            dist_ = squareform(pdist(points, metric='euclidean'))[0, 1:]
            dist_centroid = np.hstack((dist_centroid, dist_))
        return classes, dist_centroid

    def assigneDist(self, nets, valid_cluster, valid_net_idx):
        classes = np.array([])
        for i in range(len(valid_net_idx)):
            #print(np.hstack((classes,(valid_cluster[i].transform(nets[:,valid_net_idx[i]])[0]))))
            points = np.vstack((nets[:, valid_net_idx[i]], valid_cluster[i].cluster_centers_))
            dist_ = squareform(pdist(points, metric='euclidean'))[0, 1:]
            #dist_ = squareform(pdist(points, metric='correlation'))[0,1:]
            classes = np.hstack((classes, dist_))
            #classes.append(np.argmin(dist_))
        return classes

    def fit_old(self, net_data, nnet_cluster='auto', nSubtypes=3):
        self.nnet_cluster = nnet_cluster
        self.nSubtypes = nSubtypes
        if nnet_cluster == 'auto':
            #self.nnet_cluster = self.getClusters(net_data)
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo='meanshift')
        else:
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo='kmeans')
        #self.valid_cluster = self.clust_list
        #self.valid_net_idx = range(len(self.valid_cluster))
        self.assign_net = np.array([])
        self.dist_net = np.array([])
        for i in range(net_data.shape[0]):
            if i == 0:
                classes_, dist_ = self.assigneSubtype(net_data[i, :, :], self.valid_cluster, self.valid_net_idx)
                self.dist_net = dist_
                self.assign_net = classes_
            else:
                classes_, dist_ = self.assigneSubtype(net_data[i, :, :], self.valid_cluster, self.valid_net_idx)
                self.dist_net = np.vstack((self.dist_net, dist_))
                self.assign_net = np.vstack((self.assign_net, classes_))

        # group subjects by the networks that classify them together
        # compute the consensus clustering
        self.consensus = cls.hclustering(self.assign_net, self.nSubtypes)
        # save the centroids in a classifier
        self.clf_subtypes = NearestCentroid()
        self.clf_subtypes.fit(self.assign_net, self.consensus)
        self.consensus = self.clf_subtypes.predict(self.assign_net)
        #print("score: ", self.clf_subtypes.score(self.assign_net, self.consensus))
        return self.consensus

    def transform_low_scale_old(self, net_data):
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        nnet_cluster = np.max(self.ind_low_scale)
        net_data_low = []
        net_data_low = np.zeros((net_data.shape[0], nnet_cluster, net_data.shape[2]))
        for i in range(nnet_cluster):
            # average the appropriate parcels and scale them
            #net_data_low[:,i,:] = preprocessing.scale(net_data[:,self.ind_low_scale==i+1,:].mean(axis=1), axis=1)
            net_data_low[:, i, :] = net_data[:, self.ind_low_scale == i + 1, :].mean(axis=1)
        return net_data_low

    def fit(self, net_data_low, nSubtypes=3, reshape_w=True):
        self.nnet_cluster = net_data_low.shape[1]
        self.nSubtypes = nSubtypes
        #ind_low_scale = cls.get_ind_high2low(low_res_template,orig_template)
        #self.ind_low_scale = ind_low_scale
        # net_data_low --> Dimensions: nSubjects, nNetwork_low, nNetwork
        #net_data_low = transform_low_scale(ts_data,self.ind_low_scale)
        self.net_data_low = net_data_low

        # st_templates --> Dimensions: nNetwork_low, nSubtypes, nNetwork
        st_templates = []
        for i in range(len(net_data_low[1])):
            # identity matrix of the correlation between subjects
            #tmp_subj_identity = np.corrcoef(net_data_low[:,i,:])
            #ind_st = cls.hclustering(tmp_subj_identity,nSubtypes)
            # subjects X network_nodes
            #ind_st = cls.hclustering(net_data_low[:,i,:]-np.mean(net_data_low[:,i,:],axis=0),nSubtypes)
            ind_st = cls.hclustering(net_data_low[:, i, :], nSubtypes)
            for j in range(nSubtypes):
                if j == 0:
                    st_templates_tmp = net_data_low[:, i, :][ind_st == j + 1, :].mean(axis=0)[np.newaxis, ...]
                else:
                    st_templates_tmp = np.vstack(
                        (st_templates_tmp, net_data_low[:, i, :][ind_st == j + 1, :].mean(axis=0)[np.newaxis, ...]))
            if i == 0:
                st_templates = st_templates_tmp[np.newaxis, ...]
            else:
                st_templates = np.vstack((st_templates, st_templates_tmp[np.newaxis, ...]))

        self.st_templates = st_templates

        # calculate the weights for each subject
        self.W = self.compute_weights(net_data_low)
        if reshape_w:
            return self.reshapeW(self.W)
        else:
            return self.W

    def compute_weights(self, net_data_low):
        # calculate the weights for each subject
        W = np.zeros((net_data_low.shape[0], self.st_templates.shape[0], self.st_templates.shape[1]))
        for i in range(net_data_low.shape[0]):
            for j in range(self.st_templates.shape[0]):
                for k in range(self.st_templates.shape[1]):
                    # Demean
                    average_template = np.median(self.net_data_low[:, j, :], axis=0)
                    #average_template = self.st_templates[j,:,:].mean(axis=0)
                    dm_map = net_data_low[i, j, :] - average_template
                    dm_map = preprocessing.scale(dm_map)
                    st_dm_map = self.st_templates[j, k, :] - average_template
                    W[i, j, k] = np.corrcoef(st_dm_map, dm_map)[-1, 0:-1]
        return W

    def transform(self, net_data_low, reshape_w=True):
        '''
        Calculate the weights for each previously computed sub-type
        '''
        # compute the low scale version of the data
        #net_data_low = transform_low_scale(ts_data,self.ind_low_scale)

        # calculate the weights for each subject
        W = self.compute_weights(net_data_low)
        if reshape_w:
            return self.reshapeW(W)
        else:
            return W

    def reshapeW(self, W):
        # reshape the matrix from [subjects, nSubtypes, weights] to [subjects, vector of weights]
        xw = W.reshape((W.shape[0], W.shape[1] * W.shape[2]))
        return xw

    def fit_dev(self, net_data, nnet_cluster='auto', nSubtypes=3):
        self.nnet_cluster = nnet_cluster
        self.nSubtypes = nSubtypes
        if nnet_cluster == 'auto':
            #self.nnet_cluster = self.getClusters(net_data)
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo='meanshift')
        else:
            self.valid_cluster, self.valid_net_idx = self.get_match_network(net_data, nnet_cluster, algo='kmeans')
        #self.valid_cluster = self.clust_list
        #self.valid_net_idx = range(len(self.valid_cluster))
        for i in range(net_data.shape[0]):
            if i == 0:
                self.assign_net = self.assigneDist(net_data[i, :, :], self.valid_cluster, self.valid_net_idx)
            else:
                self.assign_net = np.vstack(((self.assign_net, self.assigneDist(net_data[i, :, :], self.valid_cluster, self.valid_net_idx))))
        print('Size of the new data map: ', self.assign_net.shape)

        # group subjects by the networks that classify them together
        # compute the consensus clustering
        self.consensus = cls.hclustering(self.assign_net, self.nSubtypes)
        # save the centroids in a classifier
        self.clf_subtypes = NearestCentroid()
        self.clf_subtypes.fit(self.assign_net, self.consensus)
        self.consensus = self.clf_subtypes.predict(self.assign_net)
        #print("score: ", self.clf_subtypes.score(self.assign_net, self.consensus))
        return self.consensus
#!/usr/bin/python
from sklearn.neighbors import NearestCentroid
import numpy

X = numpy.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = numpy.array([1, 1, 1, 2, 2, 2])
clf = NearestCentroid()
clf.fit(X, y)
# predict expects a 2D array: one row per sample
print(clf.predict([[0, 1]]))
# SGD classifier - gives about 73% accuracy
cl4 = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                    fit_intercept=True, n_iter=5, shuffle=True, verbose=0,
                    epsilon=0.1, n_jobs=1, random_state=None, learning_rate='optimal',
                    eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, average=False)
cl4.fit(X_train, target)
pr4 = cl4.predict(X_test)
allpred += pr4
print "SGD: " + "%.2f" % (evaluate(pr4, test_jokes)) + "%"

# Nearest centroid classifier (not kNN) - gives about 59% accuracy
cl5 = NearestCentroid()
cl5.fit(X_train, target)
pr5 = cl5.predict(X_test)
print "Nearest centroid: " + "%.2f" % (evaluate(pr5, test_jokes)) + "%"

# Decision tree classifier - gives about 75% accuracy
cl6 = tree.DecisionTreeClassifier()
cl6.fit(X_train, target)
pr6 = cl6.predict(X_test)
allpred += pr6
print "Decision tree: " + "%.2f" % (evaluate(pr6, test_jokes)) + "%"

# majority vote over the summed 0/1 predictions
maxpred = max(allpred)
pr7 = [1 if x > maxpred / 2 else 0 for x in allpred]
print "Bagging: " + "%.2f" % (evaluate(pr7, test_jokes)) + "%"
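# The block above sums 0/1 prediction vectors from several classifiers and
# thresholds the sum to get an ensemble vote. A minimal sketch of that idea on
# stand-in arrays; the toy matrix and the threshold "more than half the models"
# are illustrative assumptions (the snippet above thresholds on maxpred/2
# instead).
import numpy as np

preds = np.array([[1, 0, 1, 1],    # classifier 1
                  [1, 0, 0, 1],    # classifier 2
                  [0, 0, 1, 1]])   # classifier 3
votes = preds.sum(axis=0)          # per-sample vote counts
n_models = preds.shape[0]
majority = [1 if v > n_models / 2.0 else 0 for v in votes]
print "Majority vote:", majority   # -> [1, 0, 1, 1]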
from sklearn.neighbors.nearest_centroid import NearestCentroid
import numpy as np

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
clf = NearestCentroid()
clf.fit(X, y)
print clf.predict([[-0.8, -1]])
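# For reference, the fitted model above exposes the per-class centroids, and
# predict assigns each row of a 2-D array to its nearest centroid. A short
# continuation of the same toy session:
print clf.centroids_                   # one row per class, ordered like clf.classes_
print clf.classes_
print clf.predict([[-1, 0], [2, 2]])   # -> [1 2]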
    all_instances.append(row1)
    if row1[0] > maxlength:
        maxlength = row1[0]

for row2 in negative:
    row2 = row2[:-1]
    row2 = row2.split(',')
    row2 = [int(i) for i in row2]
    all_instances.append(row2)
    if row2[0] > maxlength:
        maxlength = row2[0]

# normalise the first feature (float() avoids integer truncation in Python 2)
for instance in all_instances:
    instance[0] = instance[0] / float(maxlength)

random.shuffle(all_instances)
# print all_instances[0:700]
print "all_instances size: ", len(all_instances)

train_set = np.array(all_instances[0:700])
test_set = np.array(all_instances[700:])   # was [701:], which silently dropped instance 700
print train_set[:,:-1]

X = np.array(train_set[:,:-1])
Y = np.array(train_set[:,-1])
clf = NearestCentroid()
clf.fit(X, Y)
prediction = clf.predict(test_set[:,:-1])
evaluation(prediction, test_set[:,-1])
def myclassify_AudPow(numfiers, xtrain_1, xtrain_2, ytrain_1, ytrain_2, xtest):
    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest = xtest[~np.isnan(xtest).any(axis=1),:]
    xtest = xtest[~np.isinf(xtest).any(axis=1),:]

    xtrain = np.append(xtrain_1, xtrain_2, 0)
    ytrain = np.append(ytrain_1, ytrain_2)
    ytrain = np.ravel(ytrain)

    xtrunclength = sio.loadmat('../Files/xtrunclength.mat')
    xtrunclength = xtrunclength['xtrunclength'][0]

    # if xtest is an NxM matrix, returns an N x numfiers matrix where each
    # column corresponds to one classifier's prediction vector
    count = 0
    # print numfiers
    predictionMat = np.empty((xtest.shape[0], numfiers))
    predictionStringMat = []
    finalPredMat = []

    bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
    bagging2.fit(xtrain, ytrain)
    #print bagging2.score(xtest, ytest)
    ytest = bagging2.predict(xtest)
    predictionMat[:,count] = ytest
    count += 1

    if count < numfiers:
        tree2 = ETC()
        tree2.fit(xtrain, ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain, ytrain)
        #print bagging1.score(xtest, ytest)
        ytest = bagging1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # voting classifiers combine completely different machine learning
        # classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()

        eclf = VotingClassifier(estimators=[('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
        eclf = eclf.fit(xtrain, ytrain)
        #print(eclf.score(xtest, ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     scores = crossvalidation.cross_val_score(claf, xtrain, ytrain, scoring='accuracy')
        #     print ()
        ytest = eclf.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain, ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain, ytrain)
        #print(qda.score(xtest, ytest))
        ytest = qda.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree1 = DTC()
        tree1.fit(xtrain, ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # classifies based on the k nearest neighbors, where k is defined by the user
        knn1 = neighbors.KNeighborsClassifier()
        knn1.fit(xtrain, ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary
        lda = linda()
        lda.fit(xtrain, ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain, ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False)
        bagging3.fit(xtrain, ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False)
        bagging4.fit(xtrain, ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain, ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain, ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors=10)
        knn2.fit(xtrain, ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors=3)
        knn3.fit(xtrain, ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm='ball_tree')
        knn4.fit(xtrain, ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm='kd_tree')
        knn5.fit(xtrain, ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain, ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain, ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:,colCount]
        modeCol = predWindowVecModeFinder(tempCol, xtrunclength)
        modeStr = predVec2Str(modeCol)
        predictionStringMat.append(modeStr)
        finalPredMat += map(int, modeCol)

    return predictionStringMat, finalPredMat
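# The long if-chain above fills one column of predictionMat per classifier. A
# compact sketch of the same pattern with a plain list of estimators; the toy
# arrays and the particular estimator list are illustrative assumptions.
import numpy as np
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

xtrain = np.array([[0., 0.], [0., 1.], [5., 5.], [5., 6.]])
ytrain = np.array([0, 0, 1, 1])
xtest = np.array([[0., 0.5], [5., 5.5]])

classifiers = [NearestCentroid(),
               KNeighborsClassifier(n_neighbors=1),
               DecisionTreeClassifier()]
predictionMat = np.empty((xtest.shape[0], len(classifiers)))
for count, clf in enumerate(classifiers):
    clf.fit(xtrain, ytrain)
    predictionMat[:, count] = clf.predict(xtest)   # one column per classifier
print predictionMat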
def ncentr(train, labels, test):
    clf = NearestCentroid()
    clf.fit(train, labels)
    return clf.predict(test)
class Gestures:
    def removeMag(self, line):
        return line[6:]

    def __init__(self):
        x = []
        y = []
        small = False
        #clf = svm.LinearSVC()
        self.clf = NearestCentroid()
        folder = "gyro_side\\"
        files = ['still.csv', 'yes.csv', 'no.csv']
        for i in range(3):
            f = open(folder + files[i], 'r')
            for line in f.readlines():
                #print line
                line = [int(a) for a in line.split(',')]
                lines = [self.removeMag(line[9*j:9*j+9]) for j in range(9)]
                # smallLine = []
                # for j in range(5):
                #     smallLine = smallLine + line[6*j:6*j+3]
                # if small:
                #     line = smallLine
                # if len(x) == 0:
                #     x = np.array(np.array([line]))
                # else:
                #     x = np.append(x, np.array([line]), axis=0)
                # #print np.shape(x)
                x += [reduce(lambda x, y: x + y, lines[:5], [])]
                y += [i]
                x += [reduce(lambda x, y: x + y, lines[4:], [])]
                y += [i]
                try:
                    z = 1
                except Exception as e:
                    #print e
                    print i, line
                    #z = 1/0
            f.close()
        x = np.array([np.array(z) for z in x])
        y = np.array(y)
        print y
        print np.shape(y)
        print np.shape(x)
        print type(x[0]), np.array(x[0])
        self.clf.fit(x, y)
        self.data = []
        #self.ser = serial.Serial('COM3', 9600)
        print "Classifier trained"

    def setInitialData(self, init):
        self.data = []
        for i in init:
            self.data = np.append(self.data, self.removeMag(i))

    def updateData(self, line):
        self.data = np.append(self.data[len(self.removeMag([[]]*9)):], np.array([self.removeMag(line)]))

    def predictGesture(self, line):
        self.updateData(line)
        # note: newer scikit-learn versions expect a 2-D array here
        return self.clf.predict(np.array(self.data))
from sklearn import metrics
from sklearn.neighbors.nearest_centroid import NearestCentroid   # was missing
import numpy
import transform_data_to_format as tdtf

# one of the loaders below must be uncommented before fitting
#train_x, train_y = tdtf.read_data_to_ndarray("../data/train.csv", 42000)
#train_x, train_y = tdtf.read_data_to_ndarray("../data/train.csv", 2100)
#valid_x, valid_y = tdtf.read_data_to_ndarray("../data/valid.csv", 21000)
#test_x = tdtf.read_test_data_to_ndarray("../data/test.csv", 28000)

clf = NearestCentroid()
clf.fit(train_x, train_y)
#NearestCentroid(metric='euclidean', shrink_threshold=None)
#pred_y = clf.predict(test_x)
#pred_train_y = clf.predict(train_x[0:21000])
pred_valid_y = clf.predict(valid_x)
#print pred_y
#tdtf.write_to_csv(pred_y, "../data/MNIST_NearestNeighborsCentroid.out")

#print("Classification report for classifier %s:\n%s\n"
#      % (clf, metrics.classification_report(train_y, pred_train_y)))
'''
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(train_y[0:21000], pred_train_y)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(train_y[0:21000], pred_train_y))
'''
print("Classification report for classifier %s:\n%s\n"
      % (clf, metrics.classification_report(valid_y, pred_valid_y)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(valid_y, pred_valid_y))
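# Since the CSV loaders above are commented out, here is a self-contained
# variant of the same evaluation pattern using scikit-learn's bundled digits
# data as a stand-in for the MNIST CSVs; the half/half split is an assumption
# for illustration.
from sklearn import metrics
from sklearn.datasets import load_digits
from sklearn.neighbors import NearestCentroid

digits = load_digits()
n = len(digits.data) // 2
clf = NearestCentroid()
clf.fit(digits.data[:n], digits.target[:n])      # train on the first half
pred = clf.predict(digits.data[n:])              # evaluate on the second half
print metrics.classification_report(digits.target[n:], pred)
print metrics.confusion_matrix(digits.target[n:], pred)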
class TwoWordRecognizer:
    def scaler(self, arr):
        return arr / np.max(np.abs(arr)) * 100

    def get_startingpoint(self, arr):
        arr = np.abs(arr)
        st_i = 0
        e_i = STEPS
        old_value = np.sum(arr[st_i:e_i,0])
        while e_i < arr.shape[0]:
            arr_sum = np.sum(arr[st_i:e_i,0])
            if arr_sum > old_value * FACTOR:
                return st_i
            else:
                if old_value < arr_sum:
                    old_value = arr_sum
            st_i += STEPS
            e_i += STEPS
        return 10000

    def get_endingpoint(self, arr):
        arr = np.abs(arr)
        e_i = arr.shape[0] - 1
        st_i = e_i - STEPS
        old_value = np.sum(arr[st_i:e_i,0])
        while st_i > 0:
            arr_sum = np.sum(arr[st_i:e_i,0])
            if arr_sum > old_value * FACTOR:
                return e_i
            else:
                if old_value < arr_sum:
                    old_value = arr_sum
            st_i -= STEPS
            e_i -= STEPS
        return 10000

    def euclidean_distance(self, arr1, arr2):
        # zero-pad the shorter signal so both have the same length
        a1 = arr1.copy()
        a2 = arr2.copy()
        if a1.shape[0] < a2.shape[0]:
            zero_rows = a2[a1.shape[0]:a2.shape[0],[0,1]].copy()
            zero_rows[:,:] = 0
            a1 = np.concatenate((a1, zero_rows))
        elif a1.shape[0] > a2.shape[0]:
            zero_rows = a1[a2.shape[0]:a1.shape[0],[0,1]].copy()
            zero_rows[:,:] = 0
            a2 = np.concatenate((a2, zero_rows))
        dist = np.sqrt((a2[:,0] - a1[:,0])**2)
        return np.sum(dist)

    def loadReferenceWords(self, word1_path, word2_path):
        fs, self.word1 = wavfile.read(word1_path)
        fs, self.word2 = wavfile.read(word2_path)
        self.word1 = self.scaler(self.word1)
        self.word2 = self.scaler(self.word2)
        self.word1 = self.word1[self.get_startingpoint(self.word1):self.get_endingpoint(self.word1),:]
        self.word2 = self.word2[self.get_startingpoint(self.word2):self.get_endingpoint(self.word2),:]

    def loadData(self, ressourcepath1, ressourcepath2):
        print(ressourcepath1)
        fullpath1 = [ressourcepath1 + fname for fname in os.listdir(ressourcepath1)]
        fullpath2 = [ressourcepath2 + fname for fname in os.listdir(ressourcepath2)]

        counter = 0
        for path in fullpath1:
            fs, w1 = wavfile.read(path)
            w1 = self.scaler(w1)
            w1 = w1[self.get_startingpoint(w1):self.get_endingpoint(w1),:]
            row = np.array([self.euclidean_distance(self.word1, w1), self.euclidean_distance(self.word2, w1)])
            if counter == 0:
                X = row
                y = np.array([1])
                counter = 1
            else:
                X = np.vstack((X, row))
                y = np.hstack((y, np.array([1])))

        for path in fullpath2:
            fs, w2 = wavfile.read(path)
            w2 = self.scaler(w2)
            w2 = w2[self.get_startingpoint(w2):self.get_endingpoint(w2),:]
            X = np.vstack((X, np.array([self.euclidean_distance(self.word1, w2), self.euclidean_distance(self.word2, w2)])))
            y = np.hstack((y, np.array([2])))

        from sklearn.neighbors.nearest_centroid import NearestCentroid
        self.clf = NearestCentroid()
        self.clf.fit(X, y)
        #import matplotlib.pyplot as plt
        #plt.scatter(X[:,0], X[:,1])
        #plt.show()

    def predict(self, input_path):
        fs, raw_arr = wavfile.read(input_path)
        raw_arr = self.scaler(raw_arr)
        word = raw_arr[self.get_startingpoint(raw_arr):self.get_endingpoint(raw_arr),:]
        x0 = np.array([self.euclidean_distance(self.word1, word), self.euclidean_distance(self.word2, word)])
        return self.clf.predict(x0.reshape(1, -1))   # predict expects a 2-D array
data = np.array([])
print "Starting to read"
for i in range(5):
    line = ser.readline()[:-2]
    line = removeMag([int(a) for a in line.split(',')])
    if small:
        line = smallLine
    data = np.append(data, np.array(line))

prevs = [0, 0]
while True:
    #fullLine = reduce(lambda a, b: a + b, data, [])
    #print data
    p = clf.predict(np.array(data))[0]   # take the scalar label out of the returned array
    if p != 0:
        prevs[p-1] += 1
    if p == 0:
        if prevs[0] > 2 or prevs[1] > 5:
            if prevs[0] > prevs[1]:
                print "Yes"
            else:
                print "No"
        prevs = [0, 0]
    line = ser.readline()[:-2]
    line = removeMag([int(a) for a in line.split(',')])
    smallLine = line[0:3]
    if small:
        line = smallLine
    #data = np.append(data[(3 if small else 6):], np.array([line]))
#print y_test
#scores = cross_validation.cross_val_score(clf, data[:, 3:15], data[:, 2], cv=5)
#print scores

# Nearest Neighbor
nbrs = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
nbrs_y_pred = nbrs.predict(X_test)
nbrs_pr = precision_score(y_test, nbrs_y_pred)
nbrs_rc = recall_score(y_test, nbrs_y_pred)
nbrs_CM = confusion_matrix(y_test, nbrs_y_pred)
print "------------------"
print "\tNearest Neighbor"
print "------------------"
print "Real: "
print y_test
print "Predict"
print nbrs_y_pred
print "Precision:"   # nbrs_pr is a precision, not a generic score
print nbrs_pr

# Nearest Centroid
clf = NearestCentroid().fit(X_train, y_train)
print "------------------"
print "\tNearest Centroid"
print "------------------"
print "Real: "
print y_test
print "Predict"
print clf.predict(X_test)
print "Score: "
print clf.score(X_test, y_test)
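# The two blocks above report different metrics (precision for the kNN model,
# accuracy for the centroid model), which makes them hard to compare. A small
# sketch that scores both the same way; X_train/X_test/y_train/y_test are
# assumed to be the same variables used above.
for name, model in [("kNN (k=2)", KNeighborsClassifier(n_neighbors=2)),
                    ("Nearest centroid", NearestCentroid())]:
    model.fit(X_train, y_train)
    print name, "accuracy:", model.score(X_test, y_test)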
def nn_centroid(self, X, y, test):
    clf = NearestCentroid()
    clf.fit(X, y)
    t = clf.predict(test)
    print("nn_centroid:", t)
    return t
# Nearest Centroid classification
start = int(round(time.time() * 1000))
classifier = NearestCentroid()
classifier.fit(X_lda, y_train)
# NearestCentroid(metric='euclidean', shrink_threshold=None)  <- repr of the fitted estimator
print (classifier)
print("---------(5) Cross validation accuracy--------")
print(cross_validation.cross_val_score(classifier, X_lda, y_train, cv=5))
end = int(round(time.time() * 1000))
print("--Centroid fitting finished in ", (end-start), "ms--------------")
print("---------Test-set dimensions after PCA--------")
print(X_test.shape)
expected = y_test
# note: the model was fitted on the projected X_lda, so X_test must be passed
# through the same projection before predict, or the dimensions will not match
predicted = classifier.predict(X_test)
print("--------------------Results-------------------")
print("Classification report for Centroid classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
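# To avoid the train/test projection mismatch flagged above, the transform and
# the classifier can be chained in a Pipeline so the test data is projected
# automatically. A hedged sketch: LinearDiscriminantAnalysis is assumed as the
# LDA step, and X_train/X_test/y_train/y_test are the untransformed arrays.
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import NearestCentroid

pipe = Pipeline([('lda', LinearDiscriminantAnalysis()),
                 ('centroid', NearestCentroid())])
pipe.fit(X_train, y_train)        # fits the LDA projection, then the centroids
predicted = pipe.predict(X_test)  # X_test is projected with the same LDA first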
clf = NearestCentroid()
X = []
Y = []
# note: range(0, 4) etc. take four samples per gesture; samples 4, 9, and 14
# are left out, with sample 14 used below as the test input
for i in range(0, 4):
    X.append([int(mean_avg[i]), ratio_avg[i]])
    Y.append(0)
for i in range(5, 9):
    X.append([int(mean_avg[i]), ratio_avg[i]])
    Y.append(1)
for i in range(10, 14):
    X.append([int(mean_avg[i]), ratio_avg[i]])
    Y.append(2)
# print X
# print Y
clf.fit(X, Y)

res = clf.predict([[mean_avg[14], ratio_avg[14]]])[0]   # [0]: compare the scalar label
if res == 0:
    print "rock"
if res == 1:
    print "scissor"
if res == 2:
    print "paper"

"""
# Plot individual channels data
plt.figure(1)
plt.subplot(431)
plt.plot(x)
plt.ylabel('x')

#plt.figure(2)
plt.subplot(412)
svm_model.fit(X_cropped, y_cropped)
y_train_predicted = svm_model.predict(X_train)
print "SVM Error rate on training data (t1): ", ml_aux.get_error_rate(y_train, y_train_predicted)
# ml_aux.plot_confusion_matrix(y_train, y_train_predicted, "CM SVM Training (t1)")
# plt.show()
y_validation_predicted = svm_model.predict(X_validation)
print "SVM Error rate on validation (t1): ", ml_aux.get_error_rate(y_validation, y_validation_predicted)

# Start Nearest Centroid Classification (despite the kNC name, there is no k)
print "Performing Nearest Centroid Classification:"
from sklearn.neighbors.nearest_centroid import NearestCentroid
knnc_model = NearestCentroid()
knnc_model.fit(X_cropped, y_cropped)
y_validation_predicted = knnc_model.predict(X_validation)
print "Error Rate on Nearest Centroid (t1) Validation: ", ml_aux.get_error_rate(y_validation, y_validation_predicted)

# Start Bagging Classification
print "Performing Bagging Classification:"
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
# Bagging
bagging1 = BaggingClassifier(KNeighborsClassifier(n_neighbors=2), max_samples=1.0, max_features=0.1)
bagging1.fit(X_cropped, y_cropped)
y_validation_predicted = bagging1.predict(X_validation)
print "Error Rate kNN with Bagging Validation: ", ml_aux.get_error_rate(y_validation, y_validation_predicted)
knc3.fit(df_input3_data, numpy.ravel(df_input3_target))
pickle.dump(knc3, open('model_knc_t3.pkl', 'wb'))

knc4 = NearestCentroid()
knc4.fit(df_input4_data, numpy.ravel(df_input4_target))
pickle.dump(knc4, open('model_knc_t4.pkl', 'wb'))

knc5 = NearestCentroid()
knc5.fit(df_input5_data, numpy.ravel(df_input5_target))
pickle.dump(knc5, open('model_knc_t5.pkl', 'wb'))

# knc = KMeans(n_clusters=5, random_state=RandomState(9))
# knc.fit(df_input_data, numpy.ravel(df_input_target))
# pickle.dump(knc, open('model_knc_train.pkl', 'wb'))

predicted1 = knc1.predict(df_input1_data)
predicted2 = knc2.predict(df_input2_data)
predicted3 = knc3.predict(df_input3_data)
predicted4 = knc4.predict(df_input4_data)
predicted5 = knc5.predict(df_input5_data)
# predicted = knc.predict(df_input_data)

matches1 = (predicted1 == [item for sublist in df_input1_target for item in sublist])
matches2 = (predicted2 == [item for sublist in df_input2_target for item in sublist])
matches3 = (predicted3 == [item for sublist in df_input3_target for item in sublist])
matches4 = (predicted4 == [item for sublist in df_input4_target for item in sublist])
matches5 = (predicted5 == [item for sublist in df_input5_target for item in sublist])
# matches = (predicted == [item for sublist in df_input_target for item in sublist])

print 'using excess rock & uncats removed'
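# For completeness, a minimal sketch of reading one of the pickled models back
# and predicting with it; the file name matches the dump above, and
# df_input4_data is assumed to be the same array used there.
import pickle
model = pickle.load(open('model_knc_t4.pkl', 'rb'))
print model.predict(df_input4_data)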
print 'Reading features... Done!'

# STEP 2 - computing scores
print 'Training...'
tfidf = models.TfidfModel(dictionary=features)  # tf-idf model to be queried
tfidf.save('reuters/data/tfidf.model')

# STEP 3 - computing centroids
tfidf = models.TfidfModel.load('reuters/data/tfidf.model')
features = corpora.Dictionary.load_from_text('reuters/data/word.dict')
by_bow = Corpus2Dictionary(features)

train_corpus = ReutersCorpus('training')
tfidf_train = tfidf[by_bow[by_word[train_corpus]]]
X = matutils.corpus2csc(tfidf_train)  # gensim corpus -> scipy sparse matrix
X = X.transpose()  # from csc (document per column) to csr (document per row)
y = train_corpus.category_mask  # label for each doc

rocchio = NearestCentroid()
rocchio.fit(X, y)
print 'Training... Done!'

# STEP 4 - evaluate prediction
test_corpus = ReutersCorpus('test')
tfidf_test = tfidf[by_bow[by_word[test_corpus]]]
# num_terms is required: otherwise X shrinks to the largest feature actually seen
X = matutils.corpus2csc(tfidf_test, num_terms=len(features))
X = X.transpose()
y_true = test_corpus.category_mask
y_pred = rocchio.predict(X)

# print precision_score(y_true, y_pred)
print rocchio.score(X, y_true)
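# The gensim pipeline above amounts to a Rocchio-style classifier: TF-IDF
# vectors plus NearestCentroid. The same idea in pure scikit-learn, as a
# hedged sketch with toy documents standing in for the Reuters corpus:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestCentroid

docs = ["wheat corn harvest", "corn exports wheat",
        "oil barrel prices", "crude oil barrel"]
labels = ["grain", "grain", "oil", "oil"]

vec = TfidfVectorizer()
X = vec.fit_transform(docs)       # sparse TF-IDF matrix, one document per row
rocchio = NearestCentroid()
rocchio.fit(X, labels)            # centroids = per-class mean TF-IDF vectors
print rocchio.predict(vec.transform(["wheat harvest report"]))   # -> ['grain']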
def myclassify_practice_set(numfiers, xtrain, ytrain, xtltrain, xtltest, xtest, ytarget=None, testing=False, grids='ABCDEFGHI'):
    # NOTE: we might not need xtltrain
    # xtrain and ytrain are the training set; xtltrain holds the indices of the
    # corresponding recordings in xtrain and ytrain and is always present.
    # xtest is the testing set; xtltest holds the corresponding recording
    # indices (for the practice set, xtltest = xtrunclength).
    # ytarget is optional and depends on whether a labelled testing set is used.

    # remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest, xtltest, ytarget = removeNanAndInf(xtest, xtltest, ytarget)
    # print 'finished removal of Nans'

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)

    # if xtest is an NxM matrix, returns an N x numfiers matrix where each
    # column corresponds to one classifier's prediction vector
    count = 0
    # print numfiers
    predictionMat = np.empty((xtest.shape[0], numfiers))
    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    # svc1 = SVC()
    # svc1.fit(xtrain, ytrain)
    # ytest = svc1.predict(xtest)
    # predictionMat[:,count] = ytest
    # count += 1

    if count < numfiers:
        # voting classifiers combine completely different machine learning
        # classifiers and use a majority vote
        clff1 = SVC()
        clff2 = RFC(bootstrap=False)
        clff3 = ETC()
        clff4 = neighbors.KNeighborsClassifier()
        clff5 = quadda()

        eclf = VotingClassifier(estimators=[('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
        eclf = eclf.fit(xtrain, ytrain)
        #print(eclf.score(xtest, ytest))
        # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
        #     scores = crossvalidation.cross_val_score(claf, xtrain, ytrain, scoring='accuracy')
        #     print ()
        ytest = eclf.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging2 = BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False)
        bagging2.fit(xtrain, ytrain)
        #print bagging2.score(xtest, ytest)
        ytest = bagging2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree2 = ETC()
        tree2.fit(xtrain, ytrain)
        ytest = tree2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain, ytrain)
        #print bagging1.score(xtest, ytest)
        ytest = bagging1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        svc1 = SVC()
        svc1.fit(xtrain, ytrain)
        ytest = svc1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # quadratic discriminant analysis - classifier with quadratic decision boundary
        qda = quadda()
        qda.fit(xtrain, ytrain)
        #print(qda.score(xtest, ytest))
        ytest = qda.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree1 = DTC()
        tree1.fit(xtrain, ytrain)
        ytest = tree1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # classifies based on the k nearest neighbors, where k is defined by the user
        knn1 = neighbors.KNeighborsClassifier()
        knn1.fit(xtrain, ytrain)
        ytest = knn1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary
        lda = linda()
        lda.fit(xtrain, ytrain)
        ytest = lda.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain, ytrain)
        ytest = tree3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False)
        bagging3.fit(xtrain, ytrain)
        ytest = bagging3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False)
        bagging4.fit(xtrain, ytrain)
        ytest = bagging4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain, ytrain)
        ytest = tree4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain, ytrain)
        ytest = tree6.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors=10)
        knn2.fit(xtrain, ytrain)
        ytest = knn2.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors=3)
        knn3.fit(xtrain, ytrain)
        ytest = knn3.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm='ball_tree')
        knn4.fit(xtrain, ytrain)
        ytest = knn4.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm='kd_tree')
        knn5.fit(xtrain, ytrain)
        ytest = knn5.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain, ytrain)
        ytest = ncc1.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain, ytrain)
        ytest = tree5.predict(xtest)
        predictionMat[:,count] = ytest
        count += 1

    # print xtltest
    # print len(ytest)

    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:,colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol, xtltest, 4, grids, isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol, xtltest, 4, isPrint=0)
            ytarg = predWindowVecModeFinder(ytarget, xtltest, 1, isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol, grids)
        else:
            modeStr = predVec2Str(modeCol)
            modeStrans = predVec2Str(ytarg)
        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat += map(int, modeCol)
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)

    if testing == False:
        if ytarget is not None:   # 'ytarget != None' compares elementwise on arrays
            #print targets1
            #print ""
            #print predictions1
            confusionme = confusion_matrix(targets1[0], predictions1[0])
            #print "Confusion Matrix is: "
            #print confusionme

    return predictionStringMat, targetStringMat, finalPredMat
#train
np.random.seed(i)
random.seed(i)
random.shuffle(FRAMES)

data = np.array([[frame.distances[joint] for joint in frame.distances.keys()] for frame in FRAMES])
target = np.array([frame.label for frame in FRAMES])

# split the frames half/half into train and test
indices = np.random.permutation(len(data))
data_train = data[indices[:-len(data)/2]]
target_train = target[indices[:-len(data)/2]]
data_test = data[indices[-len(data)/2:]]
target_test = target[indices[-len(data)/2:]]

knn = NearestCentroid()   # (historical name: this is a centroid model, not kNN)
knn.fit(data_train, target_train)
accuracy = sum(1 for (actual, correct) in zip(knn.predict(data_test), target_test)
               if actual == correct) / float(len(target_test))

if scale:
    if accuracy > last_accuracy:
        times_better += 1
    elif accuracy < last_accuracy:
        times_wrong += 1
    else:
        times_same += 1
    last_accuracy = accuracy
    print times_better / float(times_better + times_wrong + times_same)
    print (times_better, times_wrong, times_same)
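# The accuracy above is computed by hand from zip(); scikit-learn can run the
# split-and-score loop directly. A sketch against the same data/target arrays
# built above (sklearn.cross_validation matches the era of this code; newer
# versions use sklearn.model_selection instead):
from sklearn.cross_validation import cross_val_score
print cross_val_score(NearestCentroid(), data, target, cv=5).mean()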