import numpy as np

# Assumed project-local imports (the original import block is not part of
# this file; adjust the module paths to the actual repo layout). Data exposes
# read_x_array / read_y_array, and the classifier wrappers used below.
import Data
from models import (BernoulliNaiveBayesClassifier, DecisionTree,
                    GaussianNaiveBayesClassifier,
                    LinearSupportVectorClassifier)


def run_naive_bayes_gaussian():
    OUTPUT_PATH = "Outputs/yelp/naive-bayes-fbow-out.txt"
    TRAINING_DATA_PATH = "Data/FrequencyBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/FrequencyBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/FrequencyBOW/yelp-test"

    with open(OUTPUT_PATH, "w") as f:
        f.write("Loading Frequency Bag-Of-Words Representation for Training Data\n")
        training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
        training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

        f.write("Initializing Gaussian Naive Bayes Classifier with training data\n")
        gnb = GaussianNaiveBayesClassifier(training_data_x=training_data_x,
                                           training_data_y=training_data_y)

        f.write("Finding best variance smoothing value for Gaussian Naive Bayes Model\n")
        f.write("Loading validation data\n")
        validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
        validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")
        best_params, results = gnb.find_best_params(validation_data_x,
                                                    validation_data_y,
                                                    n_jobs=1)
        f.write("The best variance smoothing value found was {}\n".format(
            best_params["var_smoothing"]))

        # Report the score for (at most) the first 100 candidate values.
        f.write("\nPerformance metrics for the first 100 var_smoothing values tested:\n\n")
        for index in range(min(100, len(results['params']))):
            f.write("var_smoothing: {} --> {}\n".format(
                results['params'][index]['var_smoothing'],
                results['mean_test_score'][index]))

        f.write("\n\nInitializing and training a Gaussian Naive Bayes Model "
                "with the best hyper-parameters\n")
        gnb = GaussianNaiveBayesClassifier(training_data_x, training_data_y)
        gnb.initialize_classifier(var_smoothing=best_params['var_smoothing'])
        gnb.train()

        # Evaluate the tuned model on all three splits.
        testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
        testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")
        f.write("Finding F1-Measure for different datasets\n")
        f1_train = gnb.get_f1_measure(training_data_x, training_data_y)
        f1_valid = gnb.get_f1_measure(validation_data_x, validation_data_y)
        f1_test = gnb.get_f1_measure(testing_data_x, testing_data_y)
        f.write("The F1-Measure on training data is {}\n".format(f1_train))
        f.write("The F1-Measure on validation data is {}\n".format(f1_valid))
        f.write("The F1-Measure on testing data is {}\n".format(f1_test))
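
# The classifier wrappers imported above are project-local, and their source
# is not part of this file. As a hedged reference, the sketch below shows one
# plausible shape for GaussianNaiveBayesClassifier.find_best_params: a
# GridSearchCV over a PredefinedSplit built from the training and validation
# rows, so that only the validation split drives model selection. Every name,
# the scoring choice, and the var_smoothing grid here are assumptions, not
# the repository's actual code.
class _SketchGaussianNBWrapper:
    """Illustrative stand-in for GaussianNaiveBayesClassifier (assumed API)."""

    def __init__(self, training_data_x, training_data_y):
        self.training_data_x = training_data_x
        self.training_data_y = training_data_y

    def find_best_params(self, validation_data_x, validation_data_y, n_jobs=1):
        from sklearn.model_selection import GridSearchCV, PredefinedSplit
        from sklearn.naive_bayes import GaussianNB

        # Stack train + validation; -1 marks rows that are never scored,
        # 0 marks the single validation fold used for model selection.
        all_x = np.concatenate([self.training_data_x, validation_data_x])
        all_y = np.concatenate([self.training_data_y, validation_data_y])
        test_fold = np.concatenate([
            np.full(len(self.training_data_x), -1),
            np.zeros(len(validation_data_x), dtype=int),
        ])
        param_grid = {"var_smoothing": np.logspace(-12, 0, num=100)}  # assumed grid
        search = GridSearchCV(GaussianNB(),
                              param_grid,
                              scoring="f1_weighted",  # assumed F1 variant
                              cv=PredefinedSplit(test_fold),
                              n_jobs=n_jobs)
        search.fit(all_x, all_y)
        # Mirrors the (best_params, results) pair consumed above.
        return search.best_params_, search.cv_results_
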

def run_naive_bayes_bernoulli():
    OUTPUT_PATH = "Outputs/yelp/naive-bayes-bbow-out.txt"
    TRAINING_DATA_PATH = "Data/BinaryBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/BinaryBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/BinaryBOW/yelp-test"

    with open(OUTPUT_PATH, "w") as f:
        f.write("Loading Binary Bag-Of-Words Representation for Training Data\n")
        training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
        training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

        f.write("Initializing Bernoulli Naive Bayes Classifier with training data\n")
        bnb = BernoulliNaiveBayesClassifier(training_data_x=training_data_x,
                                            training_data_y=training_data_y)

        f.write("Finding best alpha value for Bernoulli Naive Bayes Model\n")
        f.write("Loading validation data\n")
        validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
        validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")

        # Candidate smoothing values: 200 evenly spaced points in [0, 1].
        alpha_start = 0
        alpha_stop = 1
        num_intervals = 200
        alpha_vals = np.linspace(start=alpha_start,
                                 stop=alpha_stop,
                                 num=num_intervals)
        f.write("Testing {} alpha values between {} and {}:\n".format(
            num_intervals, alpha_start, alpha_stop))
        best_params, results = bnb.find_best_params(validation_data_x,
                                                    validation_data_y,
                                                    alpha_vals,
                                                    n_jobs=10)
        f.write("The best alpha value found was {}\n".format(
            best_params["alpha"]))

        f.write("\nPerformance metrics for all alpha values tested:\n\n")
        for alpha_val, score in zip(alpha_vals, results['mean_test_score']):
            f.write("Alpha value: {} --> F1-Score: {}\n".format(
                alpha_val, score))

        alpha = float(best_params['alpha'])
        f.write("\n\nInitializing and training a Bernoulli Naive Bayes Model "
                "with alpha={}\n".format(alpha))
        bnb = BernoulliNaiveBayesClassifier(training_data_x, training_data_y,
                                            alpha)
        bnb.train()

        # Evaluate the tuned model on all three splits.
        testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
        testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")
        f.write("Finding F1-Measure for different datasets\n")
        f1_train = bnb.get_f1_measure(training_data_x, training_data_y)
        f1_valid = bnb.get_f1_measure(validation_data_x, validation_data_y)
        f1_test = bnb.get_f1_measure(testing_data_x, testing_data_y)
        f.write("The F1-Measure on training data with alpha={} is {}\n".format(
            alpha, f1_train))
        f.write("The F1-Measure on validation data with alpha={} is {}\n".format(
            alpha, f1_valid))
        f.write("The F1-Measure on testing data with alpha={} is {}\n".format(
            alpha, f1_test))
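
# Data.read_x_array and Data.read_y_array are project-local CSV loaders whose
# implementation is likewise not shown in this file. The sketch below captures
# the behavior the functions above assume (a dense numeric matrix for X, a
# 1-D label vector for Y); the real helpers may differ, e.g. by returning
# sparse matrices. Both names are hypothetical stand-ins.
def _sketch_read_x_array(path):
    # One row per document: its bag-of-words feature vector.
    return np.loadtxt(path, delimiter=",")


def _sketch_read_y_array(path):
    # One integer class label per document.
    return np.loadtxt(path, delimiter=",", dtype=int)
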

def run_decision_tree():
    OUTPUT_PATH = "Outputs/yelp/decision-tree-bbow-out.txt"
    TRAINING_DATA_PATH = "Data/BinaryBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/BinaryBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/BinaryBOW/yelp-test"

    with open(OUTPUT_PATH, "w") as f:
        f.write("Loading Binary Bag-Of-Words Representation for Training Data\n")
        training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
        training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

        f.write("Initializing Decision Tree Classifier with training data\n")
        dt = DecisionTree(training_data_x=training_data_x,
                          training_data_y=training_data_y)

        f.write("Loading validation data\n")
        validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
        validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")

        f.write("Finding the best hyper-parameters:\n")
        best_params, best_score, results = dt.find_best_params(
            validation_data_x, validation_data_y, n_jobs=1)
        f.write("The best hyper-parameters are as follows:\n")
        f.write("max_depth: {}\t| min_samples_split: {}\t| "
                "min_samples_leaf: {}\t| max_features: {} "
                "with an average F1-Measure of {}\n\n".format(
                    best_params['max_depth'],
                    best_params['min_samples_split'],
                    best_params['min_samples_leaf'],
                    best_params['max_features'], best_score))

        # Report the score for (at most) the first 100 candidate settings.
        f.write("\nPerformance metrics for the first 100 hyper-parameters tested:\n\n")
        for index in range(min(100, len(results['params']))):
            f.write("max_depth: {}\t| min_samples_split: {}\t| "
                    "min_samples_leaf: {}\t| max_features: {} --> {}\n".format(
                        results['params'][index]['max_depth'],
                        results['params'][index]['min_samples_split'],
                        results['params'][index]['min_samples_leaf'],
                        results['params'][index]['max_features'],
                        results['mean_test_score'][index]))

        f.write("\n\nInitializing and training a Decision Tree Classifier "
                "with the best parameters\n")
        dt = DecisionTree(training_data_x, training_data_y)
        dt.initialize_classifier(
            max_depth=best_params['max_depth'],
            min_samples_split=best_params['min_samples_split'],
            min_samples_leaf=best_params['min_samples_leaf'],
            max_features=best_params['max_features'])
        dt.train()

        # Evaluate the tuned tree on all three splits.
        testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
        testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")
        f.write("Finding F1-Measure for different datasets\n")
        f1_train = dt.get_f1_measure(training_data_x, training_data_y)
        f1_valid = dt.get_f1_measure(validation_data_x, validation_data_y)
        f1_test = dt.get_f1_measure(testing_data_x, testing_data_y)
        f.write("The F1-Measure on training data with these parameters is {}\n"
                .format(f1_train))
        f.write("The F1-Measure on validation data with these parameters is {}\n"
                .format(f1_valid))
        f.write("The F1-Measure on testing data with these parameters is {}\n"
                .format(f1_test))
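
# DecisionTree.find_best_params tunes four hyper-parameters at once; its grid
# lives inside the wrapper and is not visible in this file. The dictionary
# below is one plausible, purely hypothetical grid matching the parameter
# names reported in the output above, which would be scored on the validation
# rows via the same PredefinedSplit pattern sketched earlier.
_SKETCH_DT_PARAM_GRID = {
    "max_depth": [None, 10, 50, 100, 500],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5],
    "max_features": [None, "sqrt", "log2"],
}
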

def run_linear_svm():
    OUTPUT_PATH = "Outputs/yelp/linear-svm-bbow-out.txt"
    TRAINING_DATA_PATH = "Data/BinaryBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/BinaryBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/BinaryBOW/yelp-test"

    with open(OUTPUT_PATH, "w") as f:
        f.write("Loading Binary Bag-Of-Words Representation for Training Data\n")
        training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
        training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

        f.write("Initializing Linear Support Vector Classifier with training data\n")
        lsvc = LinearSupportVectorClassifier(training_data_x=training_data_x,
                                             training_data_y=training_data_y)

        f.write("Loading validation data\n")
        validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
        validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")

        f.write("Finding the best hyper-parameters:\n")
        best_params, best_score, results = lsvc.find_best_params(
            validation_data_x, validation_data_y, n_jobs=10)
        f.write("The best hyper-parameters are as follows:\n")
        f.write("C: {}\t| tol: {} with an F1-Measure of {}\n\n".format(
            best_params['C'], best_params['tol'], best_score))

        # Report the score for (at most) the first 100 candidate settings.
        f.write("\nPerformance metrics for the first 100 hyper-parameters tested:\n\n")
        for index in range(min(100, len(results['params']))):
            f.write("C: {}\t| tol: {} --> {}\n".format(
                results['params'][index]['C'],
                results['params'][index]['tol'],
                results['mean_test_score'][index]))

        best_C = float(best_params['C'])
        best_tol = float(best_params['tol'])
        f.write("\n\nInitializing and training a Linear Support Vector "
                "Classifier with C={} and tol={}\n".format(best_C, best_tol))
        lsvc = LinearSupportVectorClassifier(training_data_x, training_data_y)
        lsvc.initialize_classifier(tol=best_tol, C=best_C)
        lsvc.train()

        # Evaluate the tuned model on all three splits.
        testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
        testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")
        f.write("Finding F1-Measure for different datasets\n")
        f1_train = lsvc.get_f1_measure(training_data_x, training_data_y)
        f1_valid = lsvc.get_f1_measure(validation_data_x, validation_data_y)
        f1_test = lsvc.get_f1_measure(testing_data_x, testing_data_y)
        f.write("The F1-Measure on training data with C={} and tol={} is {}\n"
                .format(best_C, best_tol, f1_train))
        f.write("The F1-Measure on validation data with C={} and tol={} is {}\n"
                .format(best_C, best_tol, f1_valid))
        f.write("The F1-Measure on testing data with C={} and tol={} is {}\n"
                .format(best_C, best_tol, f1_test))
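
# Example entry point (assumed; the original driver code is not shown). The
# functions above open their output files for writing, so the Outputs/yelp
# directory must exist before they run.
if __name__ == "__main__":
    import os

    os.makedirs("Outputs/yelp", exist_ok=True)
    run_naive_bayes_gaussian()
    run_naive_bayes_bernoulli()
    run_decision_tree()
    run_linear_svm()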