def preprocess(corpus_folder, mode, settings, out_file, verbose_level):
    """Partition a corpus and run the classifier selected by *mode*.

    Formats the raw corpus, clears and regenerates the partitioned
    directory via sliding-window splits, then dispatches to either the
    SVM runner or the k-means runner.

    Args:
        corpus_folder: path to the raw corpus directory.
        mode: module-level SVM or KMEANS constant selecting the model.
        settings: dict of preprocessing and model hyper-parameters.
        out_file: output destination forwarded to the model runner.
        verbose_level: values > 1 print the preprocessing configuration.
    """
    # Hoisted: the partitioned-directory path is used three times below.
    partitioned_folder = corpus_folder + "_partitioned"
    preprocessor.format_corpus(corpus_folder)
    preprocessor.clean_directory(partitioned_folder)
    # Any non-zero settings value enables word filtering (was a
    # True-default-then-flip dance in the original).
    filter_words = settings['filter_words'] != 0
    preprocessor.split_files(settings['num_lines_split'],
                             settings['sliding_window_size'], filter_words,
                             corpus_folder, partitioned_folder)
    all_files = load_files(partitioned_folder)
    if verbose_level > 1:
        print(
            "mode : {} filter: {} window size: {} num_lines_split: {}".format(
                mode, filter_words, settings['sliding_window_size'],
                settings['num_lines_split']))
    if mode == SVM:
        run_svm(all_files, settings['svm']['num_runs'], out_file,
                verbose_level)
    elif mode == KMEANS:
        # Build the k-means parameter dict in one literal instead of
        # piecemeal key-by-key assignment.
        kmeans_settings = settings['kmeans']
        param_dict = {
            'n_init': kmeans_settings['n_init'],
            'max_iter': kmeans_settings['max_iter'],
            'tol': kmeans_settings['tol'],
            'num_runs': kmeans_settings['num_runs'],
            'k': kmeans_settings['k'],
        }
        run_cluster(all_files, param_dict, out_file, verbose_level)
def main(args):
    """Dispatch flower-recognition training to the model named by args.method."""
    method = args.method
    if method == 'vgg':
        print('Using vgg network for flowers recognition')
        vgg.run_vgg(args.lr, args.epochs, args.batch_size, args.reg)
    elif method == 'fc':
        print('Using fully connected network for flowers recognition')
        fc.run_fc(args.lr, args.epochs, args.batch_size, args.reg)
    elif method == 'resnet34':
        print('Using deep residual network(34-layers) for flowers recognition')
        resnet.run_resnet(args.lr, args.epochs, args.batch_size, args.reg)
    elif method == 'resnet50':
        print(
            'Using deep residual network(50-layers) pretrained for flowers recognition'
        )
        res.run_resnet50(args.batch_size, args.epochs, args.lr)
    elif method == 'svm':
        print('Using Support Vector Machine for flowers recognition')
        svm.run_svm()
    elif method == 'knn':
        print('Using K nearest neighbors for flowers recognition')
        knn.run_knn()
def main():
    """Train and evaluate each individual classifier, then the combined models."""
    # Multiclass logistic regression.
    logreg = LogisticRegressionMulticlass.LogisticRegression()
    logreg.start_logistic_regression(
        learning_rate=0.7, num_epoch=50, theta=1, show_graph=False)
    # Linear-kernel SVM on a 20k-sample training subset.
    # ("kernal" is the parameter name the svm module declares.)
    svm.run_svm(kernal=svm.linear, train_size=20000)
    RandomForest.run_random_forest()
    # Feed-forward neural network.
    network = NeuralNetwork.NeuralNetwork()
    network.start_neural_network(
        num_epoch=100, learning_rate=0.015, show_graph=False)
    # Combined (ensemble) model on both datasets.
    combine_model.combine_model(combine_model.MNIST)
    combine_model.combine_model(combine_model.USPS)
def cross_validate(train_inputs, train_labels, identity):
    """Stratified 3-fold cross-validation; return the per-fold SVM errors.

    Folds are stratified on *identity* with a fixed random seed so runs
    are reproducible.
    """
    folds = cross_validation.StratifiedKFold(
        identity, n_folds=3, shuffle=True, random_state=5)
    # One SVM run per fold: train on the fold's training split, score on
    # its validation split.
    return [
        svm.run_svm(train_inputs[train_idx], train_labels[train_idx],
                    train_inputs[valid_idx], train_labels[valid_idx])
        for train_idx, valid_idx in folds
    ]
def test_images(labels, images, classifier):
    """Classify candidate regions in each test image and print accuracy.

    For every image: detect candidate boxes, classify each box's HOG
    descriptor with the selected classifier, annotate the image, and
    count the image as correct if any box's prediction matches its label.

    Args:
        labels: ground-truth label per image (parallel to *images*).
        images: list of image file paths.
        classifier: one of "svm-linear", "svm-rbf", "rf", "mlp".

    Raises:
        ValueError: if *classifier* is not a supported name.
    """
    global label_dict
    # Renamed from `sum`, which shadowed the builtin.
    correct_count = 0
    for idx, image in enumerate(images):
        img = cv2.imread(image)
        correct_label = labels[idx]
        rectified_img, boxes = get_candidates(img)
        # Draw boxes and labels around ROI
        img_clone = rectified_img.copy()
        answers = []
        for box in boxes:
            (x, y, w, h) = box
            cv2.rectangle(img_clone, (x, y), (x + w, y + h), (0, 255, 0), 1)
            # Run classifier. NOTE: `h` (box height) is deliberately
            # reused below for the HOG descriptor, as in the original.
            h = hog(rectified_img, box)
            if classifier == "svm-linear" or classifier == "svm-rbf":
                output = int(run_svm(h)[1][0][0])
            elif classifier == "rf":
                output = int(run_rf(h)[1][0][0])
            elif classifier == "mlp":
                output = int(run_mlp(h))
            else:
                # BUG FIX: the original constructed ValueError(...) without
                # raising it, then fell through to exit(1).
                raise ValueError("Wrong classifier")
            output_str = label_dict[output]
            answers.append(output)
            cv2.putText(img_clone, output_str, (x, y - 3), 3, 1, (0, 255, 0),
                        2, cv2.LINE_AA)
        Image.fromarray(img_clone).show()
        if correct_label in answers:
            correct_count += 1
        # Reverse Rectification with labels
        # img_clone = reverse_rectification(img_clone)
        # Image.fromarray(img_clone).show()
    # Evaluation
    print("Test Accuracy: {:6}".format(correct_count / len(labels)))
def test_sliding_window():
    """Sweep sliding-window overlap from 20 down to 0 and plot the test error."""
    # Testing-set error (100 - accuracy) per overlap value.
    test_error_list = []
    overlap_list = []
    # Same sequence as the original while-loop: 20, 18, ..., 0.
    for overlap in range(20, -1, -2):
        print("overlap num : {}".format(overlap))
        preprocessor.format_corpus("sermons")
        preprocessor.clean_directory("sermons_partitioned")
        preprocessor.split_files(38, overlap, True, "sermons",
                                 "sermons_partitioned")
        all_files = load_files("./sermons_partitioned/")
        test_error_list.append(100 - run_svm(all_files, 4, None, 3))
        overlap_list.append(overlap)
    print(test_error_list)
    plot_sliding_window(test_error_list, overlap_list)
def test_split_num():
    """Sweep the split size from 38 down to 4 and plot the test error."""
    # Testing-set error (100 - accuracy) per split size.
    test_error_list = []
    split_num_list = []
    # Same sequence as the original while-loop: 38, 36, ..., 4.
    for split_num in range(38, 3, -2):
        print("split num : {}".format(split_num))
        preprocessor.format_corpus("sermons")
        preprocessor.clean_directory("sermons_partitioned")
        preprocessor.split_files(split_num, 0, True, "sermons",
                                 "sermons_partitioned")
        all_files = load_files("./sermons_partitioned/")
        test_error_list.append(100 - run_svm(all_files, 4, None, 3))
        split_num_list.append(split_num)
    print(test_error_list)
    plot_split_num(test_error_list, split_num_list)
# Banner (opening lines of this header appear before this chunk).
print('# Classifying Online Review Sentiment with Machine Learning #')
print('# #')
print('#################################################################')
print()

# Load the sentiment corpus; exposes train_X/train_y and test_X/test_y.
dataset = SentimentCorpus()

# --- Multinomial Naive Bayes: train, predict on both splits, evaluate ---
nb = MultinomialNaiveBayes()
params = nb.train(dataset.train_X, dataset.train_y)
predict_train = nb.test(dataset.train_X, params)
eval_train = nb.evaluate(predict_train, dataset.train_y)
predict_test = nb.test(dataset.test_X, params)
eval_test = nb.evaluate(predict_test, dataset.test_y)
print("\n=======================================================\n")
print("+++ Naive Bayes +++")
print ("Accuracy on training data = %f \n Accuracy on testing data = %f" % (eval_train, eval_test))
print("Confusion Matrix:")
print(confusion_matrix(dataset.test_y,predict_test))
print(classification_report(dataset.test_y,predict_test))
print("=======================================================\n")

# --- Support Vector Machine baseline (prints its own metrics) ---
print("+++ Support Vector Machine +++")
svm.run_svm(dataset.train_X, dataset.train_y, dataset.test_X, dataset.test_y)
print("=======================================================\n")

# --- Neural network baseline (prints its own metrics) ---
print("+++ Neural Network +++")
nn.run_nn(dataset.train_X, dataset.train_y, dataset.test_X, dataset.test_y)
print("=======================================================")
####################### if chosen_classifier == "knn": # Enable to run knn classifier print("Running knn classifier") # knn.run_knn(train_images, train_labels, valid_images, valid_labels) # knn.run_knn(lbp_train_images, train_labels, lbp_val_images, valid_labels) # knn.run_knn(exp_train_images, exp_train_labels, exp_val_images, exp_val_labels) knn.run_knn(gc_train, exp_train_labels, gc_val, exp_val_labels) elif chosen_classifier == "svm": # svm classifier # print("Running svm classifier") # svm.run_svm(exp_train_images, exp_train_labels, exp_val_images, exp_val_labels) print("Running svm classifier on gamma corrected images") svm.run_svm(gc_train, exp_train_labels, gc_val, exp_val_labels) elif chosen_classifier == "mog": # mog classifier print("Running mog classifier") mog.run_mog(gc_train, exp_train_labels, gc_val, exp_val_labels) elif chosen_classifier == "dt": print("Running decision tree classifier") dt.decision_tree(gc_train, exp_train_labels, gc_val, exp_val_labels) elif chosen_classifier == "ensemble": print("Running ensemble method") ensemble.run_ensemble(exp_train_images, exp_train_labels, exp_val_images, exp_val_labels) else:
if __name__ == '__main__':
    # First pass: evaluate every classifier using only the top-5
    # selected features (get_data(True) presumably applies the feature
    # selection — TODO confirm against get_data's definition).
    x_train, x_test, y_train, y_test = get_data(True)
    print(
        "\n-------------------------------------\nAccuracies with top 5 features:\n-------------------------------------"
    )
    run_decision_tree(x_train, x_test, y_train, y_test)
    run_k_nearest_neighbour(x_train, x_test, y_train, y_test)
    run_logistic_regression(x_train, x_test, y_train, y_test)
    run_naive_bayes(x_train, x_test, y_train, y_test)
    run_neural_network(x_train, x_test, y_train, y_test)
    run_perceptron(x_train, x_test, y_train, y_test)
    run_random_forest(x_train, x_test, y_train, y_test)
    run_svm(x_train, x_test, y_train, y_test)
    run_xg_boost(x_train, x_test, y_train, y_test)
    # Voting ensemble over the same top-5-feature data.
    print(
        "\n-------------------------------------\nAccuracy with Voting in top 5 features:\n-------------------------------------"
    )
    run_voting(x_train, x_test, y_train, y_test)
    # Second pass: reload with all 22 features and repeat.
    # (This chunk is truncated; the remaining runs continue beyond it.)
    x_train, x_test, y_train, y_test = get_data()
    print(
        "\n-------------------------------------\nAccuracies with all 22 features:\n-------------------------------------"
    )
    run_decision_tree(x_train, x_test, y_train, y_test)
    run_k_nearest_neighbour(x_train, x_test, y_train, y_test)
    run_logistic_regression(x_train, x_test, y_train, y_test)
#logging.info('This is an info log') #logging.warning('This is a warning log') #logging.error('This is an error log') #******************************************************************************* # DATA EXTRACTION TRAIN_DATA = data_extractor.get_data(defines.DATA_TRAIN_CSV_FILE) TEST_DATA = data_extractor.get_data(defines.DATA_TEST_CSV_FILE) #******************************************************************************* # STOP WORDS FILTER #logging.info('Prepare Data') #nb_lib.nb_lib_prepare(TRAIN_DATA) #******************************************************************************* # FEATURE SELECTION logging.info('Feature selection') #feature_select.get_selected_features(TRAIN_DATA) #******************************************************************************* logging.info('SVM') svm.run_svm(TRAIN_DATA, TEST_DATA) logging.info ('Cross Validation') #CrossVal.run_crossval(TRAIN_DATA) print ("done!") sys.exit()
def execute(topic1, topic2, test, dump_files):
    """Run the full text-classification pipeline and an XGBoost ensemble.

    Optionally downloads/cleans the datasets and dumps TF-IDF features,
    then runs a Bayes baseline followed by CNN, FastText, SVM and
    logistic-regression models, and finally stacks their predictions
    into an XGBoost ensemble evaluated on the test set.

    Args:
        topic1, topic2: the two topic names; joined to form the unique
            id under which models/features are dumped.
        test: passed to dataset creation (later shadowed by the stacked
            test-prediction matrix of the same name).
        dump_files: the string "True" enables the download/dump step;
            any other value skips it.
    """
    if dump_files == "True":
        print_bold("\n" + "Downloading the datasets ..." + "\n")
        create_cleaned_files(topic1, topic2, test)
        print_bold("Dumps TFIDF features ..." + "\n")
        # category is used to specify the unique Id of the dumped model
        category = topic1 + "-" + topic2
        dump_tfidf(category)
    # NOTE(review): `category` is assigned inside the dump branch above
    # but used unconditionally below — when dump_files != "True" this
    # raises NameError. Confirm whether the assignment should be hoisted.
    print("=========================================================")
    print_bold("Start Running bayes model to establish a baseline")
    print("=========================================================")
    print_bold("\n" + "Run Bayes model ..." + "\n")
    # NOTE(review): the Bayes predictions are not fed into the ensemble
    # below — presumably baseline-only; confirm.
    pred_train_bayes, pred_test_bayes = run_bayes(category)
    print("=========================================================")
    print_bold("Improvement of the baseline")
    print("=========================================================")
    print_bold("Run Cnn model ..." + "\n")
    pred_train_cnn, pred_test_cnn = run_cnn()
    print(
        "--------------------------------------------------------------------------"
    )
    print_bold("Run Fasttext model ..." + "\n")
    pred_train_fasttext, pred_test_fasttext = run_fasttext()
    print(
        "--------------------------------------------------------------------------"
    )
    print_bold("Run SVM model ..." + "\n")
    pred_train_svm, pred_test_svm, y_train = run_svm(category)
    print(
        "--------------------------------------------------------------------------"
    )
    print_bold("Run Logistic Regression model ..." + "\n")
    pred_train_logreg, pred_test_logreg, y_test = run_logreg(category)
    print(
        "--------------------------------------------------------------------------"
    )
    print_bold("Starting Ensemble Method")
    # using train+val for training the ensemble (training on more dataset == stronger results)
    train = np.column_stack((pred_train_svm, pred_train_logreg,
                             pred_train_cnn, pred_train_fasttext))
    # Shadows the `test` parameter from here on.
    test = np.column_stack(
        (pred_test_svm, pred_test_logreg, pred_test_cnn, pred_test_fasttext))
    model = xgb().fit(train, y_train)
    print(
        "--------------------------------------------------------------------------"
    )
    print_bold("Final results on the test set : ")
    print(classification_report(y_test, model.predict(test)))