def get_accuracy_graph(n_runs=10, data_path='../data/weather_data.csv'):
    """Plot averaged Bernoulli NB accuracy as a function of the smoothing alpha.

    For each alpha in [0.01, 0.5) with step 0.01, runs the naive Bayes
    classifier ``n_runs`` times and averages the two accuracy figures
    that ``naive_bayes_classifier.run`` yields.

    Parameters
    ----------
    n_runs : int, optional
        Repetitions averaged per alpha (default 10, matching the
        previously hard-coded behaviour).
    data_path : str, optional
        CSV file with the weather data (default unchanged).

    Returns
    -------
    A ggplot object with one line per score series.
    """
    df = pd.read_csv(data_path)

    # Split up X and y: 'RainTomorrow' is the prediction target.
    y = df['RainTomorrow']
    X = df.drop(columns=['RainTomorrow'])

    alphas = np.arange(0.01, 0.5, 0.01).tolist()

    y_scores = []  # averaged first score returned by the classifier run
    y_trains = []  # averaged accuracy of the test-set predictions
    for alpha in alphas:
        score_total = 0.0
        train_total = 0.0
        for _ in range(n_runs):
            test_score, y_hat_testing, y_testing = naive_bayes_classifier.run(
                X, y, alph=alpha)
            score_total += test_score
            train_total += my_get_accuracy.run(y_hat_testing, y_testing, True)
        y_scores.append(score_total / n_runs)
        y_trains.append(train_total / n_runs)

    # NOTE(review): the first value returned by naive_bayes_classifier.run
    # is its validation accuracy, yet it lands in the 'train_score' column.
    # Column names kept as-is to preserve the plot's existing legend.
    # (The redundant `alp` list that mirrored `alphas`, and a dead bare
    # `test_df` expression statement, were removed — both were no-ops.)
    test_df = pd.DataFrame(list(zip(alphas, y_scores, y_trains)),
                           columns=['alphas', 'train_score', 'test_score'])

    p = ggplot(test_df) \
        + geom_line(aes(x='alphas', y='train_score'), color='blue') \
        + geom_line(aes(x='alphas', y='test_score'), color='red') \
        + labs(y='Accuracy', x='Parameter Value') \
        + ggtitle('Alpha vs. Accuracy')
    return p
def run(X, y, impurity_decrease, np_seed=None):
    """Train and evaluate a decision tree for one impurity-decrease setting.

    Shuffles the rows into 60% training / 20% validation / 20% testing
    splits, fits a ``DecisionTreeClassifier`` and returns the validation
    accuracy together with the test-set predictions and labels.

    Parameters
    ----------
    X : pandas DataFrame of features.
    y : pandas Series of target labels.
    impurity_decrease : float
        Passed to ``min_impurity_decrease`` of the tree.
    np_seed : int, optional
        Seeds NumPy's global RNG for reproducible splits.

    Returns
    -------
    (validation_accuracy_score, clf_testing_predict, y_testing)
    """
    # Bug fix: `if np_seed:` silently ignored a legitimate seed of 0.
    if np_seed is not None:
        np.random.seed(np_seed)

    # Random permutation of all row indices.
    indices = list(np.random.choice(len(X), len(X), replace=False))

    # Generalization: the split points were hard-coded for a 1000-row
    # dataset (600/800/1000), and y_testing used `[800:]` while X_testing
    # used `[800:1000]`. Derive them from the actual length instead; for a
    # 1000-row input this reproduces the original 60/20/20 split exactly.
    n = len(X)
    train_end = int(n * 0.6)
    val_end = int(n * 0.8)

    # training subset: first 60% of the shuffled indices
    X_training = X.iloc[indices[:train_end]]
    y_training = y.iloc[indices[:train_end]].tolist()

    # validation subset: next 20%
    X_validation = X.iloc[indices[train_end:val_end]]
    y_validation = y.iloc[indices[train_end:val_end]].tolist()

    # testing subset: remaining 20%
    X_testing = X.iloc[indices[val_end:]]
    y_testing = y.iloc[indices[val_end:]].tolist()

    clf = DecisionTreeClassifier(min_impurity_decrease=impurity_decrease)
    clf = clf.fit(X_training, y_training)
    clf_testing_predict = clf.predict(X_testing)
    clf_validation_predict = clf.predict(X_validation)

    validation_accuracy_score = my_get_accuracy.run(clf_validation_predict,
                                                    y_validation, True)
    return (validation_accuracy_score, clf_testing_predict, y_testing)
def run(X, y, alph, np_seed=None):
    """Train and evaluate a Bernoulli naive Bayes model for one alpha.

    Shuffles the rows into 60% training / 20% validation / 20% testing
    splits, fits a ``BernoulliNB`` and returns the validation accuracy
    together with the test-set predictions and labels.

    Parameters
    ----------
    X : pandas DataFrame of features.
    y : pandas Series of target labels.
    alph : float
        Additive (Laplace/Lidstone) smoothing parameter for BernoulliNB.
    np_seed : int, optional
        Seeds NumPy's global RNG for reproducible splits.

    Returns
    -------
    (validation_accuracy, y_hat_testing, y_testing)
    """
    # Bug fix: `if np_seed:` silently ignored a legitimate seed of 0.
    if np_seed is not None:
        np.random.seed(np_seed)

    # Randomly permute all row indices, then carve out the three splits.
    indices = list(np.random.choice(len(X), len(X), replace=False))

    # Generalization: split points were hard-coded for a 1000-row dataset
    # (600/800/1000); derive them from the actual length. For 1000 rows
    # this reproduces the original 60/20/20 split exactly.
    n = len(X)
    train_end = int(n * 0.6)
    val_end = int(n * 0.8)

    X_training = X.iloc[indices[:train_end]]
    y_training = y.iloc[indices[:train_end]].tolist()

    X_validation = X.iloc[indices[train_end:val_end]]
    y_validation = y.iloc[indices[train_end:val_end]].tolist()

    X_testing = X.iloc[indices[val_end:]]
    y_testing = y.iloc[indices[val_end:]].tolist()

    # Training
    clf = BernoulliNB(alpha=alph)
    clf.fit(X_training, y_training)

    # (Removed the unused duplicate `y_pred = clf.predict(X_testing)` —
    # the test set was being predicted twice for no reason.)
    y_hat_validation = clf.predict(X_validation)
    y_hat_testing = clf.predict(X_testing)

    return (my_get_accuracy.run(y_hat_validation, y_validation, True),
            y_hat_testing, y_testing)
# Sweep the NB smoothing alpha over (0.01, 0.5) with a user-chosen step,
# averaging each alpha's scores over `num_test` runs (defined earlier in
# the file, along with `X`, `y`, `train_outputs` and `testing_outputs`).
step = float(input("Enter step value: "))  # *step of 0.01 used in experiments conducted*
# (Cleanup: the previous f-string interpolated the literals {0.01}/{0.5},
# and the archive/path strings carried f-prefixes with no placeholders —
# the printed/written text is unchanged.)
print(f'Hyperparameter Range being Run: (0.01, 0.5), step={step}.')

alphas = np.arange(0.01, 0.5, step)

# Testing for alpha
for alpha in alphas:
    test_avg = 0.0
    train_avg = 0.0
    for _ in range(num_test):
        test_score, y_hat_testing, y_testing = naive_bayes_classifier.run(
            X, y, alph=alpha)
        test_avg += test_score
        train_avg += my_get_accuracy.run(y_hat_testing, y_testing, True)
    testing_outputs.append(test_avg / num_test)
    train_outputs.append(train_avg / num_test)

# NOTE(review): naive_bayes_classifier.run's first return value is a
# validation score, but it is averaged into the 'Testing Score' column
# while the test-set accuracy lands in 'Train Score' — confirm the
# intended labelling. Column names kept for output compatibility.
df = pd.DataFrame(list(zip(alphas, train_outputs, testing_outputs)),
                  columns=['Alpha', 'Train Score', 'Testing Score'])
print(df)

# Persist the results as a zipped CSV under ../output/.
compression_opts = dict(method='zip', archive_name='bayes_output_data.csv')
df.to_csv('../output/bayes_output_data.zip', index=False,
          compression=compression_opts)
# Sweep the decision tree's min-impurity-decrease hyperparameter
# (`inputs`, `n_tests`, `X`, `y` and `start` are defined earlier in the file).
validation_outputs = []
testing_outputs = []

# test for every hyperparameter in the list
for impurity in inputs:
    validation_total = 0.0
    testing_total = 0.0
    # run n_tests and average the accuracy scores obtained from the
    # specified hyperparameter
    for _ in range(n_tests):
        validation_score, y_hat_testing, y_testing = decision_tree_classifier.run(
            X, y, impurity)
        validation_total += validation_score
        # Bug fix: my_get_accuracy.run(..., True) already returns an
        # accuracy fraction — it is used directly as one at every other
        # call site in this project. The old code divided it by
        # len(y_hat_testing) a second time, deflating the testing column.
        testing_total += my_get_accuracy.run(y_hat_testing, y_testing, True)
    validation_outputs.append(validation_total / n_tests)
    testing_outputs.append(testing_total / n_tests)

# form pandas dataframe from data
df = pd.DataFrame(list(zip(inputs, validation_outputs, testing_outputs)))
df.columns = [
    'Min Impurity Decrease', 'Validation Accuracy', 'Testing Accuracy'
]
print(df)
print(f'Runtime {time.perf_counter() - start} seconds')

# (f-prefix dropped: the archive name has no placeholders.)
compression_opts = dict(method='zip', archive_name='dtc_output_data.csv')
# Sweep one 0.1-wide partition of the hyperparameter range, selected by
# `test_num` (this, `step`, `n_tests`, `X`, `y` and `start` come from
# earlier in the file).
inputs = np.arange(((test_num * 0.1) - 0.1), test_num * 0.1, step).tolist()
validation_outputs = []
testing_outputs = []

# test for every hyperparameter in the list
for hyper in inputs:
    # Bug fix: the inner loop previously ASSIGNED each run's validation
    # score straight into the accumulator instead of adding it, so only
    # the last of the n_tests runs survived — and that single score was
    # then divided by n_tests, wrecking the validation average.
    validation_total = 0.0
    testing_total = 0.0
    # run n_tests and calculate average accuracy score obtained from
    # the specified hyperparameter
    for _ in range(n_tests):
        validation_score, y_hat_testing, y_testing = decision_tree_classifier.run(
            X, y, hyper)
        validation_total += validation_score
        testing_total += my_get_accuracy.run(y_hat_testing, y_testing)
    validation_outputs.append(validation_total / n_tests)
    testing_outputs.append(testing_total / n_tests)

# form pandas dataframe from data
df = pd.DataFrame(list(zip(inputs, validation_outputs, testing_outputs)))
print(df)
print(f'Runtime {time.perf_counter() - start} seconds')

# Persist this partition's results as a zipped CSV keyed by test_num.
compression_opts = dict(method='zip',
                        archive_name=f'dtc_output_data_{test_num}.csv')
df.to_csv(f'../output/dtc_output_data_{test_num}.zip',
          index=False,
          compression=compression_opts)
# Split up X and y y = df['RainTomorrow'] X = df.drop(columns=['RainTomorrow']) # Get rid of prediction # Split the Training and Test Data X_train, X_test, y_train, y_test = train_test_split(X, y) y_test = y_test.to_numpy() # Train data clf = BernoulliNB() clf.fit(X_train, y_train) y_pred = clf.predict(X_test) # Accuracy Output accuracy = my_get_accuracy.run(y_test, y_pred) print('Test Accuracy : %.4f' % clf.score(X_test, y_test)) print('Training Accuracy : %.3f' % clf.score(X_train, y_train)) # Gridsearch for best alpha # will implement own version from sklearn.model_selection import GridSearchCV params = { 'alpha': [0.01, 0.1, 0.5, 1.0], } bernoulli_nb_grid = GridSearchCV(BernoulliNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)