import random

import numpy as np
from IPython import embed
from sklearn import tree
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.neighbors import KNeighborsClassifier

# accuracy, precision, recall, f1, read_mnist, Winnow, KNN_Classifier and
# DT_Classifier are project-local helpers; import them from wherever this
# repo defines them.


def test_metric(test_label, knn_predict):
    # sanity-check our metric helpers against sklearn's macro-averaged ones
    # (a sketch of the assumed f1 computation follows this function)
    print("###precision###")
    print(precision(test_label, knn_predict))
    print(precision_score(test_label, knn_predict, average='macro'))
    print("###recall###")
    print(recall(test_label, knn_predict))
    print(recall_score(test_label, knn_predict, average='macro'))
    print("###f1###")
    print(f1(test_label, knn_predict))
    print(f1_score(test_label, knn_predict, average='macro'))
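
# A minimal sketch of what the project-local f1 helper is assumed to compute:
# an unweighted (macro) average of per-class F1, matching the average='macro'
# sklearn calls it is checked against above. The name f1_macro_sketch is
# illustrative and not part of this repo.
def f1_macro_sketch(y_true, y_pred):
    labels = sorted(set(y_true) | set(y_pred))
    total = 0.0
    for c in labels:
        tp = sum(1 for t, p in zip(y_true, y_pred) if t == c and p == c)
        fp = sum(1 for t, p in zip(y_true, y_pred) if t != c and p == c)
        fn = sum(1 for t, p in zip(y_true, y_pred) if t == c and p != c)
        prec = tp / (tp + fp) if tp + fp else 0.0
        rec = tp / (tp + fn) if tp + fn else 0.0
        total += 2 * prec * rec / (prec + rec) if prec + rec else 0.0
    return total / len(labels)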
def main_winnow(argv):
    """MNIST Winnow experiment. argv: [size_training, epoch, learning_rate, mnist_dir]"""
    size_training = int(argv[0])
    epoch = int(argv[1])
    learning_rate = float(argv[2])
    path = argv[3]

    training_img_path = path + '/train-images-idx3-ubyte.gz'
    training_label_path = path + '/train-labels-idx1-ubyte.gz'
    testing_img_path = path + '/t10k-images-idx3-ubyte.gz'
    testing_label_path = path + '/t10k-labels-idx1-ubyte.gz'

    # read the files and flatten each 28x28 image to a 784-d vector
    training_img = read_mnist(training_img_path)[:size_training].reshape(
        size_training, 784)
    training_label = read_mnist(training_label_path)[:size_training]
    testing_img = read_mnist(testing_img_path)
    testing_img = testing_img.reshape(len(testing_img), 28 * 28)
    testing_label = read_mnist(testing_label_path)

    # shuffle the training samples together with their labels
    training_set = list(zip(training_img, training_label))
    random.shuffle(training_set)
    training_img, training_label = zip(*training_set)

    # binarize pixel intensities: scale [0, 255] down to {0, 1}
    training_img = [[round(pixel / 255.0) for pixel in sample]
                    for sample in training_img]
    testing_img = [[round(pixel / 255.0) for pixel in sample]
                   for sample in testing_img]

    # build 10 one-vs-rest classifiers, one per digit
    # (a sketch of the Winnow update follows this function)
    perceptron_clfs = []
    for target_label in range(10):
        local_training_label = [
            1 if label == target_label else -1 for label in training_label
        ]
        # pack features and binary labels together
        local_training_set = list(zip(training_img, local_training_label))
        perceptron_clfs.append(
            Winnow(local_training_set, learning_rate, epoch))

    # training
    for clf in perceptron_clfs:
        clf.train()

    # test on training set: predict the digit whose classifier scores highest
    predict_label = []
    for img in training_img:
        scores = [clf.predict(img) for clf in perceptron_clfs]
        predict_label.append(scores.index(max(scores)))
    print("Train F1 score: ", f1(training_label, predict_label))

    # test on testing set
    predict_label = []
    for img in testing_img:
        scores = [clf.predict(img) for clf in perceptron_clfs]
        predict_label.append(scores.index(max(scores)))
    print("Test F1 score: ", f1(testing_label, predict_label))
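
# A minimal sketch of the Winnow classifier assumed above; the constructor
# signature and the train/predict methods are inferred from the calls in
# main_winnow, and the class name WinnowSketch is illustrative only. It uses
# the standard multiplicative update: weights start at 1, the threshold is
# the feature dimension, and on a mistake the weights of active (non-zero)
# features are promoted or demoted by the learning rate.
class WinnowSketch(object):
    def __init__(self, training_set, learning_rate, epoch):
        self.training_set = training_set  # list of (features, label in {-1, 1})
        self.alpha = learning_rate        # multiplicative promotion/demotion factor
        self.epoch = epoch
        n = len(training_set[0][0])
        self.weights = [1.0] * n
        self.threshold = float(n)         # classic Winnow threshold: the dimension

    def predict(self, features):
        # raw margin; the same threshold is subtracted in every one-vs-rest
        # classifier, so the caller can take an argmax over the 10 scores
        return sum(w * x for w, x in zip(self.weights, features)) - self.threshold

    def train(self):
        for _ in range(self.epoch):
            for features, label in self.training_set:
                predicted = 1 if self.predict(features) >= 0 else -1
                if predicted == label:
                    continue
                # promote on a missed positive, demote on a false positive
                factor = self.alpha if label == 1 else 1.0 / self.alpha
                self.weights = [
                    w * factor if x else w
                    for w, x in zip(self.weights, features)
                ]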
def KFoldCross(model, feature=None, label=None, cv=4):
    """Nested k-fold cross validation with per-fold and averaged reports.

    model   : estimator exposing reset(), fit() and predict()
    feature : array-like, samples to fit
    label   : array-like, target variable
    cv      : int, number of folds for both the outer and inner split
    """
    assert len(feature) == len(label)
    num_test_sample = len(feature) // cv
    # num_train_sample = num_train_train_sample + num_validation_sample
    num_train_sample = len(feature) - num_test_sample
    num_validation_sample = num_train_sample // cv
    num_train_train_sample = num_train_sample - num_validation_sample
    assert num_test_sample + num_train_train_sample + num_validation_sample == len(
        feature)

    total_train_f1 = 0
    total_train_accuracy = 0
    total_validation_f1 = 0
    total_validation_accuracy = 0
    total_test_f1 = 0
    total_test_accuracy = 0

    # outer loop: hold out one fold as the test set
    for test_fold in range(0, cv):
        # split test and train
        test_sample_feature = feature[test_fold * num_test_sample:
                                      (test_fold + 1) * num_test_sample]
        test_sample_label = label[test_fold * num_test_sample:
                                  (test_fold + 1) * num_test_sample]
        train_sample_feature = np.concatenate(
            (feature[:test_fold * num_test_sample],
             feature[(test_fold + 1) * num_test_sample:]),
            axis=0)
        train_sample_label = np.concatenate(
            (label[:test_fold * num_test_sample],
             label[(test_fold + 1) * num_test_sample:]),
            axis=0)

        # sanity-check the split sizes
        assert len(test_sample_feature) == num_test_sample == len(
            test_sample_label)
        assert len(train_sample_feature) == num_train_sample == len(
            train_sample_label)

        total_fold_train_f1 = 0
        total_fold_train_accuracy = 0
        total_fold_validation_f1 = 0
        total_fold_validation_accuracy = 0

        # inner loop: hold out one fold of the training data for validation
        for validation_fold in range(0, cv):
            model.reset()
            # split train and validation
            validation_sample_feature = train_sample_feature[
                validation_fold * num_validation_sample:
                (validation_fold + 1) * num_validation_sample]
            validation_sample_label = train_sample_label[
                validation_fold * num_validation_sample:
                (validation_fold + 1) * num_validation_sample]
            train_train_sample_feature = np.concatenate(
                (train_sample_feature[:validation_fold * num_validation_sample],
                 train_sample_feature[(validation_fold + 1) *
                                      num_validation_sample:]),
                axis=0)
            train_train_sample_label = np.concatenate(
                (train_sample_label[:validation_fold * num_validation_sample],
                 train_sample_label[(validation_fold + 1) *
                                    num_validation_sample:]),
                axis=0)
            assert len(validation_sample_label) == len(
                validation_sample_feature) == num_validation_sample
            assert len(train_train_sample_feature) == len(
                train_train_sample_label) == num_train_train_sample

            model.fit(train_train_sample_feature, train_train_sample_label)

            # training statistics
            train_output = model.predict(train_train_sample_feature)
            total_train_f1 += f1(train_train_sample_label, train_output)
            total_train_accuracy += accuracy(train_train_sample_label,
                                             train_output)
            total_fold_train_f1 += f1(train_train_sample_label, train_output)
            total_fold_train_accuracy += accuracy(train_train_sample_label,
                                                  train_output)

            # validation statistics
            validation_output = model.predict(validation_sample_feature)
            total_validation_f1 += f1(validation_sample_label,
                                      validation_output)
            total_validation_accuracy += accuracy(validation_sample_label,
                                                  validation_output)
            total_fold_validation_f1 += f1(validation_sample_label,
                                           validation_output)
            total_fold_validation_accuracy += accuracy(validation_sample_label,
                                                       validation_output)

        # refit on the full outer-training split and predict on the test fold
        # (reset first, mirroring the inner loop, so no inner-fold state leaks)
        model.reset()
        model.fit(train_sample_feature, train_sample_label)
        output = model.predict(test_sample_feature)

        # fold statistics (overall)
        total_test_f1 += f1(test_sample_label, output)
        total_test_accuracy += accuracy(test_sample_label, output)

        # fold statistics (local)
        fold_train_f1 = total_fold_train_f1 / cv
        fold_train_accuracy = total_fold_train_accuracy / cv
        fold_validation_f1 = total_fold_validation_f1 / cv
        fold_validation_accuracy = total_fold_validation_accuracy / cv
        fold_test_f1 = f1(test_sample_label, output)
        fold_test_accuracy = accuracy(test_sample_label, output)
        print("Fold-", test_fold + 1)
        print("Training: F1 score: ", fold_train_f1,
              ", Accuracy: ", fold_train_accuracy)
        print("Validation: F1 score: ", fold_validation_f1,
              ", Accuracy: ", fold_validation_accuracy)
        print("Testing: F1 score: ", fold_test_f1,
              ", Accuracy: ", fold_test_accuracy)
        print()

    # overall statistics, averaged over the cv*cv inner runs and cv test folds
    train_f1 = total_train_f1 / (cv * cv)
    train_accuracy = total_train_accuracy / (cv * cv)
    validation_f1 = total_validation_f1 / (cv * cv)
    validation_accuracy = total_validation_accuracy / (cv * cv)
    test_f1 = total_test_f1 / cv
    test_accuracy = total_test_accuracy / cv
    print("Average")
    print("Training: F1 score: ", train_f1, ", Accuracy: ", train_accuracy)
    print("Validation: F1 score: ", validation_f1,
          ", Accuracy: ", validation_accuracy)
    print("Testing: F1 score: ", test_f1, ", Accuracy: ", test_accuracy)
    print()
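
# For comparison, the outer train/test split above can be reproduced with
# sklearn's KFold (shuffle left off, so test folds are contiguous slices as
# in KFoldCross). A minimal sketch of the outer loop only; the function name
# kfold_test_sketch is illustrative:
from sklearn.model_selection import KFold


def kfold_test_sketch(model, feature, label, cv=4):
    for fold, (train_idx, test_idx) in enumerate(
            KFold(n_splits=cv).split(feature)):
        model.reset()
        model.fit(feature[train_idx], label[train_idx])
        output = model.predict(feature[test_idx])
        print("Fold-", fold + 1, " Test F1: ", f1(label[test_idx], output))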
def main():
    my_data = np.genfromtxt('../winequality-white.csv',
                            delimiter=';',
                            dtype=float,
                            skip_header=1)

    # preprocess (normalization and weighted vote)
    # train set
    train_feature = my_data[:3000, :-1]
    train_label = my_data[:3000, -1]
    # test set
    test_feature = my_data[3000:, :-1]
    test_label = my_data[3000:, -1]
    # full feature and label arrays for cross validation
    feature = my_data[:, :-1]
    label = my_data[:, -1]

    embed()  # drop into an IPython shell for interactive inspection
    # test_metric(test_label, knn_predict)
    """
    # test knn
    knn = KNN_Classifier(k=1)
    knn.fit(train_feature, train_label)
    train_predict = knn.predict(train_feature)
    print(accuracy(train_label, train_predict))
    embed()
    return
    """

    """
    # knn kfold
    print("----KNN kfold with p = 1----")
    for i in range(1, 5):
        print("--k = ", i, "--")
        knn = KNN_Classifier(k=i, p=1)
        KFoldCross(knn, feature, label, 4)
    print("##########################")
    print("----KNN kfold with p = 2----")
    for i in range(1, 5):
        print("--k = ", i, "--")
        knn = KNN_Classifier(k=i, p=2)
        KFoldCross(knn, feature, label, 4)
    print("##########################")
    print("----KNN kfold with cosine----")
    for i in range(1, 5):
        print("--k = ", i, "--")
        knn = KNN_Classifier(k=i, metric='cosine')
        KFoldCross(knn, feature, label, 4)
    """

    """
    # dt kfold
    print("----DT kfold----")
    for i in range(0, 7):
        print("--max depth = ", i, "--")
        dt = DT_Classifier(max_depth=i, sigmoid=True, min_impurity_decrease=1.0)
        KFoldCross(dt, feature, label, 4)
    """

    # test knn
    # for i in range(5):
    #     knn = KNN_Classifier(k=1)
    #     knn.fit(train_feature, train_label)
    #     knn_predict = knn.predict(test_feature)
    #     print(accuracy(test_label, knn_predict))
    #     print(precision(test_label, knn_predict))
    #     print(recall(test_label, knn_predict))
    #     print(f1(test_label, knn_predict))

    # test metric
    # test_metric(test_label, knn_predict)

    # test dt
    dt = DT_Classifier()
    dt.fit(train_feature, train_label)
    dt_predict = dt.predict(test_feature)
    print(accuracy_score(test_label, dt_predict))

    # use sklearn dt
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(train_feature, train_label)
    sklearn_predict = clf.predict(test_feature)
    print("sklearn dt accuracy: ", accuracy_score(test_label, sklearn_predict))
    print("sklearn dt f1: ", f1(test_label, sklearn_predict))

    # use sklearn knn
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(train_feature, train_label)
    sklearn_knn_predict = neigh.predict(test_feature)
    print("sklearn knn accuracy: ",
          accuracy_score(test_label, sklearn_knn_predict))
    print("sklearn knn f1: ", f1(test_label, sklearn_knn_predict))
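
# The commented-out KNN sweeps in main() differ only in the distance used:
# Minkowski with p=1 (Manhattan), p=2 (Euclidean), or cosine. A minimal
# sketch of those distances, assuming that is what KNN_Classifier's p and
# metric arguments select; both function names are illustrative only.
def minkowski_distance_sketch(a, b, p):
    return sum(abs(x - y) ** p for x, y in zip(a, b)) ** (1.0 / p)


def cosine_distance_sketch(a, b):
    # 1 - cosine similarity; assumes neither vector has zero norm
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(y * y for y in b) ** 0.5
    return 1.0 - dot / (norm_a * norm_b)


if __name__ == '__main__':
    main()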