# Compare the project's NaiveBayesClassifier with GaussianNB over a grid of
# train/(train+test) ratios and record which ratio gives the highest
# accuracy for each classifier.
nbc = NaiveBayesClassifier()
gnb = GaussianNB()

# finding best train/(train+test) ratio
train_fractions = np.linspace(start=0.1, stop=0.9, num=17)

# The result arrays are sized from the grid itself rather than repeating the
# literal 17, so changing `num` above cannot desynchronise them from
# `train_fractions`. The (n, 1) column shape is kept as-is.
nbc_prediction_accuracies = np.zeros((train_fractions.size, 1))
for idx, train_frac in enumerate(train_fractions):
    # alternatively sklearn.model_selection.train_test_split can be used
    X_train, X_test, y_train, y_test = split_dataset(
        main_df, train_frac=train_frac)
    nbc.fit(X_train, y_train)
    predictions = nbc.predict(X_test)
    nbc_prediction_accuracies[idx] = accuracy_score(y_test, predictions)

# np.argmax works on the flattened array, so for an (n, 1) column the
# returned index is directly the row index into `train_fractions`.
best_train_fraction_nbc = train_fractions[np.argmax(nbc_prediction_accuracies)]

# NOTE(review): split_dataset is called again for every fraction here, so if
# it shuffles without a fixed seed the two classifiers are scored on
# different splits -- confirm against split_dataset's implementation.
gnb_prediction_accuracies = np.zeros((train_fractions.size, 1))
for idx, train_frac in enumerate(train_fractions):
    # alternatively sklearn.model_selection.train_test_split can be used
    X_train, X_test, y_train, y_test = split_dataset(
        main_df, train_frac=train_frac)
    gnb.fit(X_train, y_train)
    predictions = gnb.predict(X_test)
    gnb_prediction_accuracies[idx] = accuracy_score(y_test, predictions)

best_train_fraction_gnb = train_fractions[np.argmax(gnb_prediction_accuracies)]
print("X_train_val shape: {}, X_test shape: {}".format( X_train_val.shape, X_test.shape)) print("y_train_val shape: {}, y_test shape: {}".format( y_train_val.shape, y_test.shape)) X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1, random_state=42) print("X_train shape: {}, X_val shape: {}".format(X_train.shape, X_val.shape)) print("y_train shape: {}, y_val shape: {}".format(y_train.shape, y_val.shape)) nb_clf = NaiveBayesClassifier() nb_clf.fit(X_train, y_train) y_pred_val = nb_clf.predict(X_val) y_pred_test = nb_clf.predict(X_test) print('NB validation acc: {}'.format((y_pred_val == y_val).mean())) evaluate(y_test, y_pred_test) for k in [1, 5, 9]: knn_clf = KNNClassifier(k) knn_clf.fit(X_train, y_train) y_pred_val = knn_clf.predict(X_val) y_pred_test = knn_clf.predict(X_test) print('{}-nn validation acc: {}'.format(k, (y_pred_val == y_val).mean())) evaluate(y_test, y_pred_test) c_values = [0.5, 1, 1.5, 2] for C in c_values: