def first_set(min_sup):
    """Evaluate the bbp2 database.

    Mines frequent subgraphs from the train/test .gsp files, builds
    feature matrices, and scores a random forest on them.

    min_sup is a relative support threshold; it is scaled by the number
    of training examples before being handed to the miner.
    Returns (score, num_freq) on success, or the error message string
    if the classifier raises.
    """
    print("-- Database: bbp2 --")

    # Ground-truth labels for each split.
    labels_train = load_labels("train_bbp2.gt")
    labels_test = load_labels("test_bbp2.gt")

    # Absolute minimum support derived from the relative threshold.
    support = len(labels_train) * min_sup
    matrix_train, matrix_test, num_freq = graph_mining(
        os_join(DATA_PATH, "train_bbp2.gsp"),
        os_join(DATA_PATH, "test_bbp2.gsp"),
        support)

    # Fit and evaluate the random forest on the mined feature matrices.
    try:
        score = run_random_forest(matrix_train, labels_train,
                                  matrix_test, labels_test)
    except Exception as e:
        print(str(e))
        return str(e)

    print("Accuracy of classifier: " + str(score))
    return score, num_freq
def second_set(min_sup):
    """Evaluate the molecules database.

    Mines frequent subgraphs from the train/test .gsp files, builds
    feature matrices, and scores a random forest on them.

    min_sup is a relative support threshold; it is scaled by the number
    of training examples before being handed to the miner.
    Returns (score, num_freq) on success, or the error message string
    if the classifier raises.
    """
    print("-- Database: molecules --")

    # Ground-truth labels for each split (space-separated files here).
    labels_train = load_labels("train_molecules.groundTruth", sep=" ")
    labels_test = load_labels("test_molecules.groundTruth", sep=" ")

    # Absolute minimum support derived from the relative threshold.
    support = len(labels_train) * min_sup
    matrix_train, matrix_test, num_freq = graph_mining(
        os_join(DATA_PATH, "train_molecules.gsp"),
        os_join(DATA_PATH, "test_molecules.gsp"),
        support)

    # Fit and evaluate the random forest on the mined feature matrices.
    try:
        score = run_random_forest(matrix_train, labels_train,
                                  matrix_test, labels_test)
    except Exception as e:
        print(str(e))
        return str(e)

    print("Accuracy of classifier: " + str(score))
    return score, num_freq
from ada_boost import run_ada_boost
from data import get_data_frame, split_train_test
from decision_tree import run_decision_tree
from knn import run_knn
from linear_regression import run_linear_regression
from logistic_regression import run_logistic_regression
from naive_bayes import run_gaussianNB, run_multinomialNB, run_bernoulliNB
from neural_network import run_neural_network
from random_forest import run_random_forest
from svm import run_svm

if __name__ == "__main__":
    # Load the full dataset and split it once; every algorithm below
    # shares the same train/test partition.
    frame = get_data_frame()
    X_train, X_test, y_train, y_test = split_train_test(frame)

    # Uncomment whichever algorithms you want to run.
    # Careful with knn — it is too slow.
    #run_gaussianNB(X_train, X_test, y_train, y_test)
    #run_multinomialNB(X_train, X_test, y_train, y_test)
    #run_bernoulliNB(X_train, X_test, y_train, y_test)
    #run_knn(X_train, X_test, y_train, y_test)
    #run_linear_regression(X_train, X_test, y_train, y_test)
    #run_logistic_regression(X_train, X_test, y_train, y_test)
    #run_svm(X_train, X_test, y_train, y_test)
    #run_decision_tree(X_train, X_test, y_train, y_test)
    run_random_forest(X_train, X_test, y_train, y_test)
    #run_neural_network(X_train, X_test, y_train, y_test)
    #run_ada_boost(X_train, X_test, y_train, y_test)
from data_preprocessor import get_data

if __name__ == '__main__':
    # First pass: reduced feature set.
    # NOTE(review): get_data(True) presumably selects the top-5-feature
    # subset, matching the banner printed below — confirm in
    # data_preprocessor. The run_* callables are not imported in this
    # chunk; they must come from earlier in the file.
    x_train, x_test, y_train, y_test = get_data(True)
    print(
        "\n-------------------------------------\nAccuracies with top 5 features:\n-------------------------------------"
    )
    # Run every individual classifier on the reduced feature set.
    run_decision_tree(x_train, x_test, y_train, y_test)
    run_k_nearest_neighbour(x_train, x_test, y_train, y_test)
    run_logistic_regression(x_train, x_test, y_train, y_test)
    run_naive_bayes(x_train, x_test, y_train, y_test)
    run_neural_network(x_train, x_test, y_train, y_test)
    run_perceptron(x_train, x_test, y_train, y_test)
    run_random_forest(x_train, x_test, y_train, y_test)
    run_svm(x_train, x_test, y_train, y_test)
    run_xg_boost(x_train, x_test, y_train, y_test)
    print(
        "\n-------------------------------------\nAccuracy with Voting in top 5 features:\n-------------------------------------"
    )
    # Ensemble (voting) classifier on the same reduced feature set.
    run_voting(x_train, x_test, y_train, y_test)

    # Second pass: the full feature set (banner says all 22 features).
    x_train, x_test, y_train, y_test = get_data()
    print(
        "\n-------------------------------------\nAccuracies with all 22 features:\n-------------------------------------"
    )
    run_decision_tree(x_train, x_test, y_train, y_test)
    run_k_nearest_neighbour(x_train, x_test, y_train, y_test)
    # Tail of a preprocessing function whose definition starts before this
    # chunk: fill NaNs, drop the target column, and print diagnostics.
    df_movie = fill_nan(df_movie)
    df_movie = df_movie.drop(columns="imdb_score")
    print(df_movie["director_name"].head())
    # NOTE(review): print() returns None, so col_mask is always None here —
    # this looks like a bug; the intent was probably
    # col_mask = df_movie.isna().any(axis=0). Confirm before fixing.
    col_mask = print(df_movie.isna().any(axis=0))
    print(col_mask)
    # return the processed dataset.
    return df_movie


if __name__ == "__main__":
    # Earlier KNN experiment, kept for reference:
    # df_movie, df_standard = data_prepocessing()
    # df_knn = df_movie
    # df_knn = df_knn.reset_index()
    # df_knn["class"] = df_knn.apply(classify, axis=1)
    # classes = list(df_knn["class"])
    # amazing = classes==['AMAZING']
    # print(amazing)
    # df_knn = df_knn.drop(columns="imdbRating")

    # Load the movie metadata and train the active model.
    df_movie = load_metadata_dataset()
    #run_knn(df_movie)
    #run_logistic_regression(df_movie)
    classifier = run_random_forest(df_movie)
    run(classifier)