def find_best_params(self, validation_data_x, validation_data_y, n_jobs=1, params=None):
    if not params:
        params = self.get_default_param_grid()
    # Merge the training and validation sets so GridSearchCV can split them
    # back apart with a PredefinedSplit.
    merged_x = Data.merge_arrays(self.training_data_x, validation_data_x)
    merged_y = Data.merge_arrays(self.training_data_y, validation_data_y)
    # -1 marks samples that are never placed in a test fold (the training
    # data); 0 puts every validation sample in the single predefined test
    # fold, so each candidate is fit on the training rows and scored on the
    # validation rows exactly once.
    test_fold = [-1] * len(self.training_data_y) + [0] * len(validation_data_y)
    cv = PredefinedSplit(test_fold)
    gs = GridSearchCV(estimator=GaussianNB(),
                      scoring='f1_micro',
                      param_grid=params,
                      n_jobs=n_jobs,
                      cv=cv)
    gs.fit(merged_x, merged_y)
    best_params = gs.best_params_
    results = gs.cv_results_
    return best_params, results
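# get_default_param_grid() is called above but not shown in this file. A
# minimal sketch of what such a method could return for GaussianNB -- a
# logarithmic sweep over var_smoothing, the only hyper-parameter tuned here.
# The range and grid size below are assumptions, not the project's values
# (sklearn's own default for var_smoothing is 1e-9):
import numpy as np

def get_default_param_grid(self):
    # Hypothetical grid: log-spaced var_smoothing values from 1e-12 to 1.
    return {"var_smoothing": np.logspace(-12, 0, num=100)}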
def find_best_params(self, validation_data_x, validation_data_y, alpha_vals, n_jobs=1):
    merged_x = Data.merge_arrays(self.training_data_x, validation_data_x)
    merged_y = Data.merge_arrays(self.training_data_y, validation_data_y)
    # -1 keeps the training samples out of every test fold; 0 places the
    # validation samples in the single predefined test fold.
    test_fold = [-1] * len(self.training_data_y) + [0] * len(validation_data_y)
    cv = PredefinedSplit(test_fold)
    param_grid = {"alpha": alpha_vals}
    gs = GridSearchCV(estimator=BernoulliNB(),
                      scoring='f1_micro',
                      param_grid=param_grid,
                      n_jobs=n_jobs,
                      cv=cv)
    gs.fit(merged_x, merged_y)
    best_params = gs.best_params_
    results = gs.cv_results_
    return best_params, results
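# Both find_best_params variants rely on sklearn's PredefinedSplit: entries
# of -1 in test_fold are never used as test samples, so GridSearchCV trains
# on the training rows and evaluates on the validation rows in a single
# split. A self-contained illustration with toy data (not the project's):
import numpy as np
from sklearn.model_selection import PredefinedSplit

toy_test_fold = [-1, -1, -1, 0, 0]  # 3 training rows, 2 validation rows
toy_cv = PredefinedSplit(toy_test_fold)
for train_idx, test_idx in toy_cv.split(np.zeros((5, 1))):
    print(train_idx, test_idx)  # -> [0 1 2] [3 4]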
def run_naive_bayes_gaussian():
    OUTPUT_PATH = "Outputs/yelp/naive-bayes-fbow-out.txt"
    TRAINING_DATA_PATH = "Data/FrequencyBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/FrequencyBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/FrequencyBOW/yelp-test"
    with open(OUTPUT_PATH, "w") as f:
        f.write("Loading Frequency Bag-Of-Words Representation for Training Data\n")
        training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
        training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

        f.write("Initializing Gaussian Naive Bayes Classifier with training data\n")
        gnb = GaussianNaiveBayesClassifier(training_data_x=training_data_x,
                                           training_data_y=training_data_y)

        f.write("Finding best variance smoothing value for Gaussian Naive Bayes Model\n")
        f.write("Loading validation data\n")
        validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
        validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")
        best_params, results = gnb.find_best_params(validation_data_x,
                                                    validation_data_y,
                                                    n_jobs=1)
        f.write("The best variance smoothing value found was {}\n".format(
            best_params["var_smoothing"]))

        f.write("\nPerformance metrics for all var_smoothing values tested:\n\n")
        # Report at most the first 100 parameter settings.
        index = 0
        while index < 100 and index < len(results['params']):
            f.write("var_smoothing: {} --> {}\n".format(
                results['params'][index]['var_smoothing'],
                results['mean_test_score'][index]))
            index += 1

        f.write("\n\nInitializing and training a Gaussian Naive Bayes Model "
                "with the best hyper-parameters\n")
        gnb = GaussianNaiveBayesClassifier(training_data_x, training_data_y)
        gnb.initialize_classifier(var_smoothing=best_params['var_smoothing'])
        gnb.train()

        testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
        testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")
        f.write("Finding F1-Measure for the different datasets\n")
        f1_train = gnb.get_f1_measure(training_data_x, training_data_y)
        f1_valid = gnb.get_f1_measure(validation_data_x, validation_data_y)
        f1_test = gnb.get_f1_measure(testing_data_x, testing_data_y)
        f.write("The F1-Measure on training data is {}\n".format(f1_train))
        f.write("The F1-Measure on validation data is {}\n".format(f1_valid))
        f.write("The F1-Measure on testing data is {}\n".format(f1_test))
# NOTE: this is the tail of the Preprocess routine invoked in the __main__
# block below. The beginning of the function (which builds `result` and the
# *_column name arrays) is not shown; the signature here is inferred from
# the call site, and the default for k is a placeholder.
def Preprocess(data, k=10):
    ...  # omitted: construction of `result` and the column-name arrays

    result.columns = np.concatenate(
        [ask_column, bid_column, ask_vol_column, bid_vol_column, mid_column])
    # Express the 20 price columns as relative offsets from the mid price.
    result.iloc[:, :20] = result.iloc[:, :20].values / result[
        'mid'].values[:, np.newaxis] - 1
    # Turn the per-level ask/bid volume columns into cumulative sums.
    for i in range(9):
        result.iloc[:, i + 21] += result.iloc[:, i + 20]
        result.iloc[:, i + 31] += result.iloc[:, i + 30]
    result = result.iloc[1:]
    # Prediction horizons; k selects the matching label column from the
    # last five columns of the raw data.
    labels = [1, 2, 3, 5, 10]
    label = data.iloc[:, labels.index(k) - 5]
    return result, label


if __name__ == '__main__':
    from data_processor import DataForModel as Data

    data = DataGenerator(test=True)
    data, label = Preprocess(data)
    data = pd.concat([data, label], axis=1)
    print(data.shape)
    data = Data(data, test_ratio=1)
    print(data.len_train)
    x, y = data.get_test_batch(300, False)
    print(x.shape)
def run_naive_bayes_bernoulli():
    OUTPUT_PATH = "Outputs/yelp/naive-bayes-bbow-out.txt"
    TRAINING_DATA_PATH = "Data/BinaryBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/BinaryBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/BinaryBOW/yelp-test"
    with open(OUTPUT_PATH, "w+") as f:
        f.write("Loading Binary Bag-Of-Words Representation for Training Data\n")
        training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
        training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

        f.write("Initializing Bernoulli Naive Bayes Classifier with training data\n")
        bnb = BernoulliNaiveBayesClassifier(training_data_x=training_data_x,
                                            training_data_y=training_data_y)

        f.write("Finding best alpha value for the Bernoulli Naive Bayes Model\n")
        f.write("Loading validation data\n")
        validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
        validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")

        # Candidate smoothing values, evenly spaced over [0, 1].
        alpha_start = 0
        alpha_stop = 1
        num_intervals = 200
        alpha_vals = np.linspace(start=alpha_start,
                                 stop=alpha_stop,
                                 num=num_intervals)
        f.write("Testing {} alpha values between {} and {}:\n".format(
            num_intervals, alpha_start, alpha_stop))
        best_params, results = bnb.find_best_params(validation_data_x,
                                                    validation_data_y,
                                                    alpha_vals,
                                                    n_jobs=10)
        f.write("The best alpha value found was {}\n".format(best_params["alpha"]))

        f.write("\nPerformance metrics for all alpha values tested:\n\n")
        for i in range(len(alpha_vals)):
            f.write("Alpha value: {} --> F1-Score: {}\n".format(
                alpha_vals[i], results['mean_test_score'][i]))

        f.write("\n\nInitializing and training a Bernoulli Naive Bayes Model "
                "with alpha={}\n".format(best_params['alpha']))
        alpha = float(best_params['alpha'])
        bnb = BernoulliNaiveBayesClassifier(training_data_x, training_data_y, alpha)
        bnb.train()

        testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
        testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")
        f.write("Finding F1-Measure for the different datasets\n")
        f1_train = bnb.get_f1_measure(training_data_x, training_data_y)
        f1_valid = bnb.get_f1_measure(validation_data_x, validation_data_y)
        f1_test = bnb.get_f1_measure(testing_data_x, testing_data_y)
        f.write("The F1-Measure on training data with alpha={} is {}\n".format(
            alpha, f1_train))
        f.write("The F1-Measure on validation data with alpha={} is {}\n".format(
            alpha, f1_valid))
        f.write("The F1-Measure on testing data with alpha={} is {}\n".format(
            alpha, f1_test))
def run_decision_tree():
    OUTPUT_PATH = "Outputs/yelp/decision-tree-bbow-out.txt"
    TRAINING_DATA_PATH = "Data/BinaryBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/BinaryBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/BinaryBOW/yelp-test"
    with open(OUTPUT_PATH, "w") as f:
        f.write("Loading Binary Bag-Of-Words Representation for Training Data\n")
        training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
        training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

        f.write("Initializing Decision Tree Classifier with training data\n")
        dt = DecisionTree(training_data_x=training_data_x,
                          training_data_y=training_data_y)

        f.write("Loading validation data\n")
        validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
        validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")

        f.write("Finding the best hyper-parameters:\n")
        best_params, best_score, results = dt.find_best_params(
            validation_data_x, validation_data_y, n_jobs=1)
        f.write("The best hyper-parameters are as follows:\n")
        f.write("max_depth: {}\t| min_samples_split: {}\t| min_samples_leaf: {}\t"
                "| max_features: {} with an average F1-Measure of {}\n\n".format(
                    best_params['max_depth'], best_params['min_samples_split'],
                    best_params['min_samples_leaf'], best_params['max_features'],
                    best_score))

        f.write("\nPerformance metrics for the first 100 hyper-parameters tested:\n\n")
        index = 0
        while index < 100 and index < len(results['params']):
            f.write("max_depth: {}\t| min_samples_split: {}\t| min_samples_leaf: {}\t"
                    "| max_features: {} --> {}\n".format(
                        results['params'][index]['max_depth'],
                        results['params'][index]['min_samples_split'],
                        results['params'][index]['min_samples_leaf'],
                        results['params'][index]['max_features'],
                        results['mean_test_score'][index]))
            index += 1

        f.write("\n\nInitializing and training a Decision Tree Classifier "
                "with the best parameters\n")
        dt = DecisionTree(training_data_x, training_data_y)
        dt.initialize_classifier(
            max_depth=best_params['max_depth'],
            min_samples_split=best_params['min_samples_split'],
            min_samples_leaf=best_params['min_samples_leaf'],
            max_features=best_params['max_features'])
        dt.train()

        testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
        testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")
        f.write("Finding F1-Measure for the different datasets\n")
        f1_train = dt.get_f1_measure(training_data_x, training_data_y)
        f1_valid = dt.get_f1_measure(validation_data_x, validation_data_y)
        f1_test = dt.get_f1_measure(testing_data_x, testing_data_y)
        f.write("The F1-Measure on training data with these parameters is {}\n".format(
            f1_train))
        f.write("The F1-Measure on validation data with these parameters is {}\n".format(
            f1_valid))
        f.write("The F1-Measure on testing data with these parameters is {}\n".format(
            f1_test))
def run_linear_svm():
    OUTPUT_PATH = "Outputs/yelp/linear-svm-bbow-out.txt"
    TRAINING_DATA_PATH = "Data/BinaryBOW/yelp-train"
    VALIDATION_DATA_PATH = "Data/BinaryBOW/yelp-valid"
    TESTING_DATA_PATH = "Data/BinaryBOW/yelp-test"
    with open(OUTPUT_PATH, "w") as f:
        f.write("Loading Binary Bag-Of-Words Representation for Training Data\n")
        training_data_x = Data.read_x_array(TRAINING_DATA_PATH + "-X.csv")
        training_data_y = Data.read_y_array(TRAINING_DATA_PATH + "-Y.csv")

        f.write("Initializing Linear Support Vector Classifier with training data\n")
        lsvc = LinearSupportVectorClassifier(training_data_x=training_data_x,
                                             training_data_y=training_data_y)

        f.write("Loading validation data\n")
        validation_data_x = Data.read_x_array(VALIDATION_DATA_PATH + "-X.csv")
        validation_data_y = Data.read_y_array(VALIDATION_DATA_PATH + "-Y.csv")

        f.write("Finding the best hyper-parameters:\n")
        best_params, best_score, results = lsvc.find_best_params(
            validation_data_x, validation_data_y, n_jobs=10)
        f.write("The best hyper-parameters are as follows:\n")
        f.write("C: {}\t| tol: {} with an F1-Measure of {}\n\n".format(
            best_params['C'], best_params['tol'], best_score))

        f.write("\nPerformance metrics for the first 100 hyper-parameters tested:\n\n")
        index = 0
        while index < 100 and index < len(results['params']):
            f.write("C: {}\t| tol: {} --> {}\n".format(
                results['params'][index]['C'],
                results['params'][index]['tol'],
                results['mean_test_score'][index]))
            index += 1

        f.write("\n\nInitializing and training a Linear Support Vector Classifier "
                "with C={} and tol={}\n".format(best_params['C'], best_params['tol']))
        best_C = float(best_params['C'])
        best_tol = float(best_params['tol'])
        lsvc = LinearSupportVectorClassifier(training_data_x, training_data_y)
        lsvc.initialize_classifier(tol=best_tol, C=best_C)
        lsvc.train()

        testing_data_x = Data.read_x_array(TESTING_DATA_PATH + "-X.csv")
        testing_data_y = Data.read_y_array(TESTING_DATA_PATH + "-Y.csv")
        f.write("Finding F1-Measure for the different datasets\n")
        f1_train = lsvc.get_f1_measure(training_data_x, training_data_y)
        f1_valid = lsvc.get_f1_measure(validation_data_x, validation_data_y)
        f1_test = lsvc.get_f1_measure(testing_data_x, testing_data_y)
        f.write("The F1-Measure on training data with C={} and tol={} is {}\n".format(
            best_C, best_tol, f1_train))
        f.write("The F1-Measure on validation data with C={} and tol={} is {}\n".format(
            best_C, best_tol, f1_valid))
        f.write("The F1-Measure on testing data with C={} and tol={} is {}\n".format(
            best_C, best_tol, f1_test))
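# A minimal driver for the four runners above, assuming they live in this
# module and that the Outputs/yelp/ directory already exists (each runner
# opens its output file for writing):
if __name__ == "__main__":
    run_naive_bayes_gaussian()
    run_naive_bayes_bernoulli()
    run_decision_tree()
    run_linear_svm()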