Esempio n. 1
0
def get_accuracy_graph(n_trials=10):
    """Sweep the Naive Bayes smoothing parameter alpha and plot accuracy.

    For each alpha in [0.01, 0.5) with step 0.01, the classifier is run
    ``n_trials`` times and the two accuracy scores returned per run are
    averaged, then plotted against alpha.

    Args:
        n_trials: number of repeated runs averaged per alpha value
            (default 10, matching the original hard-coded behaviour).

    Returns:
        A ggplot object with a blue and a red accuracy line over alpha.
    """
    df = pd.read_csv('../data/weather_data.csv')
    # Split up X and y
    y = df['RainTomorrow']
    X = df.drop(columns=['RainTomorrow'])  # Get rid of prediction

    # hyperparameter grid
    alphas = np.arange(0.01, 0.5, 0.01).tolist()

    # averaged scores per alpha
    y_scores = []
    y_trains = []

    for alpha in alphas:
        temp_test = 0
        temp_trains = 0
        for _ in range(n_trials):
            test_score, y_hat_testing, y_testing = naive_bayes_classifier.run(
                X, y, alph=alpha)
            temp_test += test_score
            temp_trains += my_get_accuracy.run(y_hat_testing, y_testing, True)
        y_scores.append(temp_test / n_trials)
        y_trains.append(temp_trains / n_trials)

    # NOTE(review): the column labels look swapped relative to the data —
    # y_scores comes from the classifier's validation/test score while
    # y_trains is the testing accuracy. Kept as-is to preserve the plot's
    # original output; confirm the intended labelling.
    test_df = pd.DataFrame(list(zip(alphas, y_scores, y_trains)),
                           columns=['alphas', 'train_score', 'test_score'])
    p = ggplot(test_df) + geom_line(aes(x='alphas', y='train_score'), color='blue') \
                  + geom_line(aes(x='alphas', y='test_score'), color='red') \
                  + labs(y='Accuracy', x='Parameter Value') \
                  + ggtitle('Alpha vs. Accuracy')
    return p
Esempio n. 2
0
def run(X, y, impurity_decrease, np_seed=None):
    """Train and evaluate a decision tree on a random 60/20/20 split.

    Args:
        X: feature dataframe.
        y: label series aligned with X.
        impurity_decrease: passed to DecisionTreeClassifier as
            ``min_impurity_decrease``.
        np_seed: optional numpy RNG seed for a reproducible split.

    Returns:
        Tuple of (validation accuracy score, predictions on the test
        subset, true labels of the test subset).
    """
    # `if np_seed:` would silently ignore a legitimate seed of 0.
    if np_seed is not None:
        np.random.seed(np_seed)

    # randomly permute all row indices for the train/validation/test split
    testing_indices = list(np.random.choice(len(X), len(X), replace=False))

    # Split boundaries computed from the actual dataset size (the original
    # hard-coded 600/800/1000, which assumed exactly 1000 rows and made
    # y_testing ([800:]) longer than X_testing ([800:1000]) for larger
    # datasets). For len(X) == 1000 these are identical to the originals.
    n = len(X)
    train_end = int(n * 0.6)
    valid_end = int(n * 0.8)

    # training subset utilizes 60% of the data
    X_training = X.iloc[testing_indices[:train_end]]
    y_training = y.iloc[testing_indices[:train_end]].tolist()

    # validation subset utilizes the next 20%
    X_validation = X.iloc[testing_indices[train_end:valid_end]]
    y_validation = y.iloc[testing_indices[train_end:valid_end]].tolist()

    # testing subset utilizes the remaining 20%
    X_testing = X.iloc[testing_indices[valid_end:]]
    y_testing = y.iloc[testing_indices[valid_end:]].tolist()

    clf = DecisionTreeClassifier(min_impurity_decrease=impurity_decrease)
    clf = clf.fit(X_training, y_training)
    clf_testing_predict = clf.predict(X_testing)

    clf_validation_predict = clf.predict(X_validation)
    validation_accuracy_score = my_get_accuracy.run(clf_validation_predict,
                                                    y_validation, True)

    return (validation_accuracy_score, clf_testing_predict, y_testing)
Esempio n. 3
0
def run(X, y, alph, np_seed=None):
    """Train and evaluate a Bernoulli Naive Bayes model on a 60/20/20 split.

    Args:
        X: feature dataframe.
        y: label series aligned with X.
        alph: Laplace/Lidstone smoothing parameter passed to BernoulliNB.
        np_seed: optional numpy RNG seed for a reproducible split.

    Returns:
        Tuple of (validation accuracy score, predictions on the test
        subset, true labels of the test subset).
    """
    # `if np_seed:` would silently ignore a legitimate seed of 0.
    if np_seed is not None:
        np.random.seed(np_seed)

    # Randomly permute all row indices for the train/validation/test split
    testing_indices = list(np.random.choice(len(X), len(X), replace=False))

    # Split boundaries computed from the actual dataset size (the original
    # hard-coded 600/800/1000, assuming exactly 1000 rows; for
    # len(X) == 1000 these are identical to the originals).
    n = len(X)
    train_end = int(n * 0.6)
    valid_end = int(n * 0.8)

    X_training = X.iloc[testing_indices[:train_end]]
    y_training = y.iloc[testing_indices[:train_end]].tolist()

    X_validation = X.iloc[testing_indices[train_end:valid_end]]
    y_validation = y.iloc[testing_indices[train_end:valid_end]].tolist()

    X_testing = X.iloc[testing_indices[valid_end:]]
    y_testing = y.iloc[testing_indices[valid_end:]].tolist()

    # Training (the original also computed an unused duplicate prediction
    # `y_pred = clf.predict(X_testing)`, removed here)
    clf = BernoulliNB(alpha=alph)
    clf.fit(X_training, y_training)

    y_hat_validation = clf.predict(X_validation)
    y_hat_testing = clf.predict(X_testing)

    return my_get_accuracy.run(y_hat_validation, y_validation,
                               True), y_hat_testing, y_testing
step = float(input("Enter step value: "))

# *step of 0.01 used in experiments conducted*
print(f'Hyperparameter Range being Run: ({0.01}, {0.5}), step={step}.')

# hyperparameter grid for the Bernoulli NB smoothing parameter
alphas = np.arange(0.01, 0.5, step)

# Testing for alpha.
# NOTE(review): `X`, `y`, `num_test`, `testing_outputs` and `train_outputs`
# are assumed to be defined earlier in this file — confirm.
for alpha in alphas:
    test_avg = 0
    train_avg = 0
    # average validation and testing accuracy over num_test repeated runs
    for i in range(num_test):
        test_score, y_hat_testing, y_testing = naive_bayes_classifier.run(
            X, y, alph=alpha)
        test_avg += test_score
        train_avg += my_get_accuracy.run(y_hat_testing, y_testing, True)
    test_avg = test_avg / num_test
    train_avg = train_avg / num_test
    testing_outputs.append(test_avg)
    train_outputs.append(train_avg)

# collect results into a dataframe
df = pd.DataFrame(list(zip(alphas, train_outputs, testing_outputs)),
                  columns=['Alpha', 'Train Score', 'Testing Score'])
print(df)

# write a zip-compressed CSV (redundant f-prefixes removed: the path
# strings contained no placeholders)
compression_opts = dict(method='zip', archive_name='bayes_output_data.csv')
df.to_csv('../output/bayes_output_data.zip',
          index=False,
          compression=compression_opts)
validation_outputs = []
testing_outputs = []

# Test for every hyperparameter in the list.
# NOTE(review): `inputs`, `n_tests`, `X`, `y` and `start` are assumed to be
# defined earlier in this file — confirm.
for i in range(len(inputs)):
    validation_accuracy_score = 0
    testing_accuracy_score = 0

    # run n_tests and calculate average accuracy score obtained from
    # specified hyperparameter
    for j in range(n_tests):
        validation_score, y_hat_testing, y_testing = decision_tree_classifier.run(
            X, y, inputs[i])
        validation_accuracy_score = validation_accuracy_score + validation_score

        # BUG FIX: my_get_accuracy.run(..., True) is treated as a
        # ready-to-use accuracy at every other call site in this file; the
        # original additionally divided it by len(y_hat_testing), scaling
        # the testing score down by the test-set size.
        testing_accuracy_score += my_get_accuracy.run(y_hat_testing,
                                                      y_testing, True)

    validation_outputs.append(validation_accuracy_score / n_tests)
    testing_outputs.append(testing_accuracy_score / n_tests)

# form pandas dataframe from data
df = pd.DataFrame(list(zip(inputs, validation_outputs, testing_outputs)))
df.columns = [
    'Min Impurity Decrease', 'Validation Accuracy', 'Testing Accuracy'
]
print(df)
print(f'Runtime {time.perf_counter() - start} seconds')

# redundant f-prefix removed: the archive name has no placeholders
compression_opts = dict(method='zip', archive_name='dtc_output_data.csv')
Esempio n. 6
0
inputs = np.arange(((test_num * 0.1) - 0.1), test_num * 0.1, step).tolist()
validation_outputs = []
testing_outputs = []

# Test for every hyperparameter in the list.
# NOTE(review): `test_num`, `step`, `n_tests`, `X`, `y` and `start` are
# assumed to be defined earlier in this file — confirm.
for i in range(len(inputs)):
    validation_accuracy_score = 0
    testing_accuracy_score = 0

    # run n_tests and calculate average accuracy score obtained from
    # specified hyperparameter
    for j in range(n_tests):
        # BUG FIX: the original assigned the per-run validation score
        # directly to `validation_accuracy_score`, overwriting it on every
        # iteration, and then divided the LAST run's score by n_tests.
        # The scores must be summed so the append below yields a true
        # average (matching the parallel sweep loop elsewhere in this file).
        validation_score, y_hat_testing, y_testing = decision_tree_classifier.run(
            X, y, inputs[i])
        validation_accuracy_score += validation_score
        testing_accuracy_score += my_get_accuracy.run(y_hat_testing, y_testing)

    validation_outputs.append(validation_accuracy_score / n_tests)
    testing_outputs.append(testing_accuracy_score / n_tests)

# form pandas dataframe from data
df = pd.DataFrame(list(zip(inputs, validation_outputs, testing_outputs)))
print(df)
print(f'Runtime {time.perf_counter() - start} seconds')

compression_opts = dict(method='zip',
                        archive_name=f'dtc_output_data_{test_num}.csv')
df.to_csv(f'../output/dtc_output_data_{test_num}.zip',
          index=False,
          compression=compression_opts)
Esempio n. 7
0
# Split up X and y.
# NOTE(review): `df` is assumed to be loaded earlier in the file — confirm.
y = df['RainTomorrow']
X = df.drop(columns=['RainTomorrow'])  # Get rid of prediction

# Split the Training and Test Data (default 75/25 shuffle split)
X_train, X_test, y_train, y_test = train_test_split(X, y)
y_test = y_test.to_numpy()

# Train a Bernoulli Naive Bayes model and predict on the held-out set
clf = BernoulliNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy Output.
# NOTE(review): `accuracy` is computed but never used in this snippet, and
# the argument order (y_test, y_pred) is reversed relative to the other
# my_get_accuracy.run call sites, which pass (predictions, truth) — confirm
# whether the helper is symmetric in its arguments.
accuracy = my_get_accuracy.run(y_test, y_pred)
print('Test Accuracy : %.4f' % clf.score(X_test, y_test))
print('Training Accuracy : %.3f' % clf.score(X_train, y_train))

# Gridsearch for best alpha
# will implement own version
from sklearn.model_selection import GridSearchCV
params = {
    'alpha': [0.01, 0.1, 0.5, 1.0],
}

# 5-fold cross-validated grid search over alpha; the grid object is only
# constructed here — presumably fit later in the file (not shown).
bernoulli_nb_grid = GridSearchCV(BernoulliNB(),
                                 param_grid=params,
                                 n_jobs=-1,
                                 cv=5,
                                 verbose=5)