def run_kfold(fields, labels):
    """Pick the best of five K-fold splits for a CategoricalNB classifier.

    For every split produced by ``KFold(n_splits=5)`` a fresh
    ``CategoricalNB`` is fitted on the training part and scored on the
    held-out part; the accuracy of each scored fold is printed.

    Args:
        fields: Feature table. It is sliced with ``.iloc``, so a pandas
            DataFrame (or compatible object) is expected.
        labels: Labels aligned positionally with the rows of ``fields``.

    Returns:
        The ``(train_index, test_index)`` pair of the fold with the highest
        accuracy, or ``([], [])`` when no fold scored above zero.
    """
    kf = KFold(n_splits=5)
    best = [], []
    best_accuracy = 0
    # train_index and test_index index into fields and labels
    for train_index, test_index in kf.split(fields):
        train_fields = fields.iloc[train_index].reset_index(drop=True)
        train_labels = [labels[i] for i in train_index]
        test_fields = fields.iloc[test_index].reset_index(drop=True)
        test_labels = [labels[i] for i in test_index]

        clf = CategoricalNB()
        clf.fit(train_fields, train_labels)
        try:
            res = clf.predict(test_fields).tolist()
        except IndexError:
            # Folds whose prediction fails with IndexError are skipped
            # (presumably a category value unseen during training - confirm).
            continue

        # Single pass over the predictions; the original built this list
        # twice and threw the first result away.
        hits = sum(
            1 for predicted, actual in zip(res, test_labels)
            if predicted == actual
        )
        acc = hits / len(res)
        if acc > best_accuracy:
            best = train_index, test_index
            best_accuracy = acc
        print("accuracy rate: ", acc)
    return best
def categoricalNaiveBayes(dtrain, dtest):
    """Fit a CategoricalNB on ``dtrain`` and predict labels for ``dtest``.

    The last column of ``dtrain`` is used as the target and the remaining
    columns as features. Training features and ``dtest`` are each min-max
    scaled with their own, independently fitted ``MinMaxScaler``. Several
    fitted attributes of the model are printed for inspection.

    Args:
        dtrain: 2-D array indexed as ``dtrain[:, -1]`` / ``dtrain[:, :-1]``.
        dtest: 2-D array of feature rows to classify.

    Returns:
        The array returned by ``CategoricalNB.predict`` for the scaled test
        rows.
    """
    # can use split?
    targets = dtrain[:, -1]
    features = dtrain[:, :-1]

    features = preprocessing.MinMaxScaler().fit_transform(features)
    # NOTE(review): the test rows get a scaler fitted on the test rows
    # themselves, not the training scaler - confirm this is intended.
    dtest = preprocessing.MinMaxScaler().fit_transform(dtest)

    model = CategoricalNB()
    model.fit(features, targets)

    print("GNB Features")
    # Attributes are looked up lazily, one at a time, right before printing.
    report = (
        ("GNB cat count Features", "category_count_"),
        ("GNB class count Features", "class_count_"),
        ("GNB feature log prob Features", "feature_log_prob_"),
        ("GNB n Features", "n_features_"),
    )
    for heading, attribute in report:
        print(heading)
        print(getattr(model, attribute))
    print("Length test")
    print(len(dtest[0]))

    return model.predict(dtest)
def run_model(training, testing, fields, labels):
    """Train CategoricalNB on one split and report error rates on the other.

    Args:
        training: Positional row indices used for fitting.
        testing: Positional row indices used for evaluation.
        fields: Feature table, sliced with ``.iloc``.
        labels: Labels aligned positionally with the rows of ``fields``.

    Returns:
        ``(res, acc, fp, fn)``: the predicted labels as a list, then the
        fractions of test rows that were neither error type, were predicted
        1 while labelled 0, and were predicted 0 while labelled 1.
    """
    train_fields = fields.iloc[training].reset_index(drop=True)
    test_fields = fields.iloc[testing].reset_index(drop=True)
    train_labels = [labels[i] for i in training]
    test_labels = [labels[i] for i in testing]

    model = CategoricalNB()
    model.fit(train_fields, train_labels)
    res = model.predict(test_fields).tolist()

    false_positives = 0
    false_negatives = 0
    for predicted, actual in zip(res, test_labels):
        if predicted == 1 and actual == 0:
            false_positives += 1
        elif predicted == 0 and actual == 1:
            false_negatives += 1

    total = len(res)
    fp = false_positives / total
    fn = false_negatives / total
    # Everything that is not one of the two error types counts as correct.
    acc = (total - false_positives - false_negatives) / total

    print("false positive rate: %4f" % fp)
    print("false negative rate: %4f" % fn)
    print("accuracy: %4f" % acc)
    return res, acc, fp, fn
def test_categoricalnb_min_categories_errors(min_categories, error_msg):
    """Fitting with the given ``min_categories`` must raise ``ValueError``.

    The raised message has to match ``error_msg``.
    """
    samples = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    targets = np.array([1, 1, 2, 2])
    estimator = CategoricalNB(
        alpha=1, fit_prior=False, min_categories=min_categories
    )
    with pytest.raises(ValueError, match=error_msg):
        estimator.fit(samples, targets)
def setUp(self):
    """Fit a CategoricalNB on seeded random data and keep it on the instance.

    Stores the (6, 100) integer sample matrix as ``self.X`` and the fitted
    estimator as ``self.model``.
    """
    generator = np.random.RandomState(1)
    self.X = generator.randint(5, size=(6, 100))
    targets = np.array([1, 2, 3, 4, 5, 6])
    fitted = CategoricalNB()
    fitted.fit(self.X, targets)
    self.model = fitted
def test(X_train, Y_train, X_val, Y_val, categorical=False):
    """Fit a naive Bayes model and return its F1 score on the validation data.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_val: Validation features.
        Y_val: Validation labels.
        categorical: Use ``CategoricalNB`` when true, ``GaussianNB`` otherwise.

    Returns:
        ``f1_score(Y_val, predictions)`` for the fitted model.
    """
    from sklearn.naive_bayes import GaussianNB, CategoricalNB

    estimator_cls = CategoricalNB if categorical else GaussianNB
    model = estimator_cls()
    model.fit(X_train, Y_train)
    predictions = model.predict(X_val)
    return f1_score(Y_val, predictions)
def check_sklearn_dev():
    """Sanity-check the installed scikit-learn via a CategoricalNB prediction.

    Fits the classifier on seeded random integers with six distinct labels
    and asserts that the third training row is predicted as label 3.
    (The original note says this verifies that sklearn 0.23-dev is installed
    properly.)
    """
    generator = np.random.RandomState(1)
    samples = generator.randint(5, size=(6, 100))
    targets = np.array([1, 2, 3, 4, 5, 6])

    model = CategoricalNB()
    model.fit(samples, targets)

    predicted = model.predict(samples[2:3])
    assert [3] == predicted
def fit(self, X, Y):
    """Fit one CategoricalNB per label column.

    Arguments:
        X (np.array): training data matrix of shape (n_samples, n_features)
        Y (np.array): label matrix of shape (n_samples, n_labels)

    Each fitted estimator is appended to ``self.predictors`` in column
    order. The list is not cleared here, so it must already exist.
    """
    for column in range(Y.shape[1]):
        estimator = CategoricalNB()
        estimator.fit(X, Y[:, column])
        self.predictors.append(estimator)
def cnb(train_x, train_y, test_x, test_y):
    """Train a CategoricalNB, print its scores and plot a confusion matrix.

    Prints the root-mean-squared error and the macro-averaged F1 score of
    the predictions on the test data, then draws the row-normalised
    confusion matrix as a heatmap, saves it to ``./output/CompNB.png`` and
    shows it.

    Args:
        train_x: Training features.
        train_y: Training labels.
        test_x: Test features.
        test_y: Test labels.
    """
    model = CategoricalNB()
    model.fit(train_x, train_y)
    y_predictions = model.predict(test_x)

    # The value is labelled RMSE, so take the square root of the MSE
    # (the original printed the raw MSE and named the wrong model).
    rmse = mean_squared_error(test_y, y_predictions) ** 0.5
    print("RMSE for Categorical Naive Bayes model = ", rmse)

    my_f1 = f1_score(test_y, y_predictions, average='macro')
    print("f1_macro for Categorical Naive Bayes Classifier = ", my_f1)

    cm = confusion_matrix(test_y, y_predictions, normalize='true')
    sns.heatmap(cm, annot=True)
    plt.title('Confusion matrix of the Categorical Naive Bayes classifier')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    # File name kept as-is so existing consumers of the image still find it.
    plt.savefig('./output/CompNB.png')
    plt.show()
def test_categoricalnb_with_min_categories(
    min_categories, exp_X1_count, exp_X2_count, new_X, exp_n_categories_
):
    """Check counts, prediction and ``n_categories_`` for ``min_categories``.

    After fitting on a fixed four-sample data set, the per-feature category
    counts must equal ``exp_X1_count`` / ``exp_X2_count``, ``new_X`` must be
    classified as 1, and ``n_categories_`` must equal ``exp_n_categories_``.
    """
    train_X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    train_y = np.array([1, 1, 2, 2])
    wanted_prediction = np.array([1])

    model = CategoricalNB(
        alpha=1, fit_prior=False, min_categories=min_categories
    )
    model.fit(train_X, train_y)

    first_count, second_count = model.category_count_
    assert_array_equal(first_count, exp_X1_count)
    assert_array_equal(second_count, exp_X2_count)

    assert_array_equal(model.predict(new_X), wanted_prediction)
    assert_array_equal(model.n_categories_, exp_n_categories_)
def bayes(test_set, training_set, categories):
    """Train CategoricalNB on ``training_set`` and report on ``test_set``.

    Both sets are converted to feature/label pairs with ``build_xy``. The
    mean accuracy returned by ``classifier.score`` and a classification
    report for the test predictions are printed; nothing is returned.

    Args:
        test_set: Data passed to ``build_xy`` for evaluation.
        training_set: Data passed to ``build_xy`` for fitting.
        categories: Forwarded unchanged to ``build_xy``.
    """
    classifier = CategoricalNB()
    train_x, train_y = build_xy(training_set, categories)
    classifier.fit(train_x, train_y)

    # The four confusion counters of the original were never read and have
    # been removed.
    x, y = build_xy(test_set, categories)
    y_predicted = classifier.predict(x)
    print(f'score: {classifier.score(x, y)}')
    print('bayes confusion matrix')
    print(classification_report(y, y_predicted))
def perform_bayes(df):
    """Label-encode ``df``, train a CategoricalNB and report its quality.

    Every column is encoded with the matching encoder from
    ``build_labelencoders``; the last column is the target. The model is
    trained on a 75/25 split, the test accuracy (in percent) is printed, a
    hard-coded example patient is classified and printed, and a ROC curve
    plus a normalised confusion matrix are plotted.

    Args:
        df: Table whose last column holds the class label.
    """
    les = build_labelencoders(df)
    encoded_columns = [
        les[i].transform(df.iloc[:, i].values)
        for i in range(len(df.columns))
    ]
    res = pd.DataFrame(encoded_columns).transpose()
    x = res.iloc[:, :-1]
    y = res.iloc[:, -1:]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=10)
    model = CategoricalNB()
    model.fit(x_train, y_train.values.ravel())
    y_pred = model.predict(x_test)
    y_pred_probability = model.predict_proba(x_test)[::, 1]
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(accuracy)

    # example patient
    test = ['50-59', 'ge40', '50-54', '24-26', 'no', '1', 'right', 'left_up', 'yes']
    print(test)
    # transform using labelencoders
    encoded_patient = np.array([
        les[i].transform(np.array(value).reshape(1, ))
        for i, value in enumerate(test)
    ])
    # do prediction
    patient_class = model.predict(encoded_patient.reshape(1, -1))
    # translate back
    print(les[-1].inverse_transform(patient_class)[0])

    a, b, _ = roc_curve(y_test, y_pred_probability)
    area_under_curve = roc_auc_score(y_test, y_pred_probability)
    plt.plot(a, b, label="area under curve="+str(area_under_curve))
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    # The bare ``plt.axis`` attribute access of the original did nothing and
    # has been dropped.
    plt.legend(loc=4)
    # NOTE(review): the confusion matrix is drawn for the training split,
    # not the test split - confirm this is intended.
    plot_confusion_matrix(model, x_train, y_train.values.ravel(),
                          normalize='true',
                          display_labels=les[-1].inverse_transform([0, 1]))
    plt.show()
def test_categoricalnb():
    """End-to-end checks for CategoricalNB.

    Covers prediction of the training set, ``n_categories_``, rejection of
    negative inputs, the effect of ``alpha`` smoothing on the predicted
    probabilities, and ``sample_weight`` handling.
    """
    # The training set must be predicted perfectly.
    model = CategoricalNB()
    assert_array_equal(model.fit(X2, y2).predict(X2), y2)

    X3 = np.array([[1, 4], [2, 5]])
    y3 = np.array([1, 2])
    model = CategoricalNB(alpha=1, fit_prior=False)
    model.fit(X3, y3)
    assert_array_equal(model.n_categories_, np.array([3, 6]))

    # Negative entries in X must be rejected by both predict and fit.
    negative_X = np.array([[0, -1]])
    negative_y = np.array([1])
    error_msg = re.escape("Negative values in data passed to CategoricalNB (input X)")
    with pytest.raises(ValueError, match=error_msg):
        model.predict(negative_X)
    with pytest.raises(ValueError, match=error_msg):
        model.fit(negative_X, negative_y)

    # Test alpha
    X3_test = np.array([[2, 5]])
    # alpha=1 increases the count of all categories by one so the final
    # probability for each category is not 50/50 but 1/3 to 2/3
    numerator = np.array([[1 / 3 * 1 / 3, 2 / 3 * 2 / 3]])
    expected_proba = numerator / numerator.sum()
    assert_array_almost_equal(model.predict_proba(X3_test), expected_proba)

    # category_count_ holds one entry per feature.
    assert len(model.category_count_) == X3.shape[1]

    # Check sample_weight
    weighted_X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    weighted_y = np.array([1, 1, 2, 2])
    probe = np.array([[0, 0]])

    model = CategoricalNB(alpha=1, fit_prior=False)
    model.fit(weighted_X, weighted_y)
    assert_array_equal(model.predict(probe), np.array([1]))
    assert_array_equal(model.n_categories_, np.array([2, 2]))

    for factor in [1.0, 0.3, 5, 0.0001]:
        weights = np.array([1, 1, 10, 0.1]) * factor
        model = CategoricalNB(alpha=1, fit_prior=False)
        model.fit(weighted_X, weighted_y, sample_weight=weights)
        assert_array_equal(model.predict(probe), np.array([2]))
        assert_array_equal(model.n_categories_, np.array([2, 2]))
def pengujian():
    """Render the model test page for logged-in admins.

    Trains a CategoricalNB on the ``preprocessing`` rows created in
    2005-2019, predicts the rows created in 2020 and renders
    ``pengujian.html`` with the confusion matrix (``hasil``) and the
    accuracy as a rounded percentage (``akurasi``).

    Returns:
        A redirect to ``index`` when no admin is in the session, otherwise
        the rendered template.
    """
    if "admin" not in session:
        return redirect(url_for("index"))

    mydb.connect()
    cursor = mydb.cursor()
    try:
        cursor.execute(
            "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019)'"
        )
        training = cursor.fetchall()
        cursor.execute(
            "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2020)'"
        )
        testing = cursor.fetchall()
    finally:
        # Release the cursor even when a query fails.
        cursor.close()

    # Columns 0-4 are the features, column 5 is the label.
    X = [[x[0], x[1], x[2], x[3], x[4]] for x in training]
    y = [x[5] for x in training]
    clf = CategoricalNB()
    clf.fit(X, y)

    X_test = [[x[0], x[1], x[2], x[3], x[4]] for x in testing]
    y_test = [x[5] for x in testing]
    predicted = clf.predict(X_test)

    # Each test row is paired with its own label. The original indexed the
    # *training* labels here, which gave wrong labels and raised IndexError
    # when there were more test rows than training rows.
    # NOTE(review): payload is built but not passed to the template.
    payload = [
        {
            "no": number,
            "stasiuntv": row[0],
            "genre": row[1],
            "writer": row[2],
            "director": row[3],
            "actor": row[4],
            "status": label,
        }
        for number, (row, label) in enumerate(zip(X_test, y_test), start=1)
    ]

    hasil = confusion_matrix(y_test, predicted)
    # Correct predictions sit on the diagonal; this equals the original
    # 2x2 formula and also works for any other number of classes.
    total = hasil.sum()
    akurasi = hasil.trace() / total if total else 0.0
    return render_template("pengujian.html", hasil=hasil, akurasi=round(akurasi * 100))
def example_weather_nominal():
    """Compare GaussianNB and CategoricalNB on the nominal weather data.

    Reads ``weather-nominal.csv`` from ``base_path``, label-encodes every
    feature column, fits both classifiers with the last column as target and
    prints the prediction and class probabilities for one encoded sample.
    """
    csv_path = (base_path / "weather-nominal.csv").resolve()
    series = pd.read_csv(csv_path)

    # Features are all columns but the last; the last column is the target.
    features = series.iloc[:, :-1].apply(LabelEncoder().fit_transform)
    target = series.iloc[:, -1]

    gaussian = GaussianNB()
    gaussian.fit(features, target)
    categorical = CategoricalNB()
    categorical.fit(features, target)

    sample = [[2, 0, 0, 1]]
    print(
        f"Prediction GaussianNB ([Sunny,Cool,High,True]]): {gaussian.predict(sample)}"
    )
    print(f"Probability GaussianNB: {gaussian.predict_proba(sample)}")
    print("\n")
    print(
        f"Prediction CategoricalNB ([Sunny,Cool,High,True]]): {categorical.predict(sample)}"
    )
    print(f"Probability CategoricalNB: {categorical.predict_proba(sample)}")
def test_predict_meta_override():
    """``predict_meta`` lets ParallelPostFit predict with a value-dependent model.

    The dask frame's ``_meta`` is replaced by a frame holding the value 5,
    which is not in the training data. Without ``predict_meta`` the wrapped
    predict must raise ``ValueError``; with it the result must equal the
    base estimator's prediction.
    """
    frame = pd.DataFrame({"c_0": [1, 2, 3, 4]})
    targets = np.array([1, 2, 3, 4])

    base = CategoricalNB()
    base.fit(pd.DataFrame(frame), targets)

    dask_frame = dd.from_pandas(frame, npartitions=2)
    dask_frame._meta = pd.DataFrame({"c_0": [5]})

    # Failure when predict_meta is not provided, because the model is
    # value dependent.
    plain_wrapper = ParallelPostFit(base)
    with pytest.raises(ValueError):
        plain_wrapper.predict(dask_frame)

    # Success when the meta override is provided.
    meta_wrapper = ParallelPostFit(base, predict_meta=np.array([1]))
    actual = meta_wrapper.predict(dask_frame)
    assert_eq_ar(actual, base.predict(frame))
def combine_two_categorical():
    """Show that two single-feature CategoricalNB models combine into one.

    Fits one model on ``Kontonummer`` and one on ``BLZ``, combines their
    log-probabilities externally (subtracting the class log-prior once and
    renormalising), fits a third model on both columns together and prints
    the total absolute difference between the two probability tables.
    """
    data = Data()
    data.cleanse()
    data.encode()

    knr = np.array(data.x["Kontonummer"]).reshape((-1, 1))
    text = np.array(data.x["BLZ"]).reshape((-1, 1))
    k_cat_nb = CategoricalNB()
    t_cat_nb = CategoricalNB()
    k_cat_nb.fit(knr, data.y)
    t_cat_nb.fit(text, data.y)

    k_proba = k_cat_nb.predict_log_proba(knr)
    t_proba = t_cat_nb.predict_log_proba(text)
    # Each model's output already contains the class prior, so it is
    # subtracted once before renormalising.
    combined_proba = k_proba + t_proba - k_cat_nb.class_log_prior_
    combined_proba -= np.expand_dims(logsumexp(combined_proba, axis=1), axis=1)

    # now the same thing but in one cat_nb
    combi = data.x.loc[:, ["BLZ", "Kontonummer"]]
    c_cat_nb = CategoricalNB()
    c_cat_nb.fit(combi, data.y)
    proba = c_cat_nb.predict_log_proba(combi)

    diff = np.exp(proba) - np.exp(combined_proba)
    # %g keeps the fractional part; the original %d truncated the float to
    # an integer, so any difference below 1 was printed as 0.
    print("total difference in probabilities: %g" % np.sum(np.abs(diff)))
def example_weather_numeric():
    """Compare GaussianNB and CategoricalNB on the numeric weather data.

    Reads ``weather-numeric.csv`` from ``base_path``, label-encodes the
    ``outlook`` and ``windy`` columns, fits both classifiers with the last
    column as target and prints the prediction and class probabilities for
    one sample.
    """
    path = (base_path / "weather-numeric.csv").resolve()
    series = pd.read_csv(path)

    # arrange table in X(features) and y(target)
    # Work on an explicit copy so the column assignments below do not write
    # into a slice of ``series`` (chained-assignment warning).
    X = series.iloc[:, :-1].copy()
    X["outlook"] = LabelEncoder().fit_transform(X["outlook"])
    X["windy"] = LabelEncoder().fit_transform(X["windy"])
    y = series.iloc[:, -1]

    # apply GaussianNB and CategoricalNB
    gNB = GaussianNB()
    gNB.fit(X, y)
    cNB = CategoricalNB()
    cNB.fit(X, y)

    sample = [[2, 66, 90, 1]]
    print(
        f"Prediction GaussianNB ([Sunny,66,90,True]]]): {gNB.predict(sample)}"
    )
    print(f"Probability GaussianNB: {gNB.predict_proba(sample)}")
    print("\n")
    print(
        f"Prediction CategoricalNB ([Sunny,66,90,True]]): {cNB.predict(sample)}"
    )
    print(f"Probability CategoricalNB: {cNB.predict_proba(sample)}")
def classificationCategoricalNaiveBayes():
    """Train, persist and evaluate a CategoricalNB on ``data.csv``.

    Uses the ``cosine``, ``len``, ``word`` and ``sameDomain`` columns as
    features and ``label`` as target, with an 80/20 train/test split. The
    fitted model is dumped to ``model.pkl``; prediction time, accuracy,
    precision, recall and the confusion matrix are printed.
    """
    col_names = [
        '*', 'web1', 'web2', 'cosine', 'len', 'word', 'sameDomain', 'label'
    ]
    # load dataset
    pima = pd.read_csv("data.csv", names=col_names)

    # split dataset in features and target variable
    feature_cols = ['cosine', 'len', 'word', 'sameDomain']
    X = pima[feature_cols]  # Features
    y = pima.label  # Target variable

    # Split dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=4)  # 80% training and 20% test

    clf = CategoricalNB()
    clf.fit(X_train, y_train)

    # save the model; the context manager closes the file handle, which the
    # original left open
    with open('model.pkl', 'wb') as model_file:
        dump(clf, model_file)

    startTime = datetime.now()
    # Predict the response for test dataset
    y_pred = clf.predict(X_test)
    endTime = datetime.now()
    print("exec time :", endTime - startTime)

    # Model Accuracy, how often is the classifier correct?
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    # The line is labelled "precision", so report precision_score; the
    # original printed average_precision_score, a different metric.
    print("precision:", metrics.precision_score(y_test, y_pred))
    print("recall:", metrics.recall_score(y_test, y_pred))
    print()
    print(confusion_matrix(y_test, y_pred))
def naive_bayes_adapter(**kwargs):
    """Train CategoricalNB on ``train`` and score its predictions on ``test``.

    Keyword Args:
        train: DataFrame with the feature columns and a ``class`` column
            holding ``'yes'`` / ``'no'``.
        test: DataFrame with the same layout.

    Returns:
        Whatever ``create_return_dict`` builds from the predictions and the
        encoded test classes.
    """
    train = kwargs['train']
    test = kwargs['test']

    # Merge both sets so the encoder knows every value that occurs anywhere.
    merged_data = pd.concat([train, test])

    # improove_train extends the training frame using the merged data
    # (per the original note: it adds rows for values unknown to the
    # training set, filled with the most common values).
    train = improove_train(train, merged_data)

    # Build the encoder on all feature columns. ``columns=`` replaces the
    # positional axis argument, which pandas 2.0 no longer accepts.
    encoder = OrdinalEncoder()
    encoder.fit(merged_data.drop(columns='class'))

    # Separate the classification column from the features and encode both.
    class_codes = {'yes': 1, 'no': 0}
    encoded_train_without_class = encoder.transform(train.drop(columns='class'))
    encoded_test_without_class = encoder.transform(test.drop(columns='class'))
    encoded_train_classifications = train['class'].map(class_codes)
    encoded_test_classifications = test['class'].map(class_codes)

    # alpha=1 is Laplace smoothing
    clf = CategoricalNB(alpha=1)
    clf.fit(encoded_train_without_class, encoded_train_classifications)

    predictions = clf.predict(encoded_test_without_class)
    return create_return_dict(predictions, encoded_test_classifications)
def test_categoricalnb():
    """End-to-end checks for CategoricalNB.

    Covers prediction of the training set, rejection of negative inputs and
    of inputs with the wrong number of features, the effect of ``alpha``
    smoothing on the predicted probabilities, and ``sample_weight``.
    """
    # The training set must be predicted perfectly.
    model = CategoricalNB()
    assert_array_equal(model.fit(X2, y2).predict(X2), y2)

    X3 = np.array([[1, 4], [2, 5]])
    y3 = np.array([1, 2])
    model = CategoricalNB(alpha=1, fit_prior=False)
    model.fit(X3, y3)

    # Negative entries in X must be rejected by both predict and fit.
    negative_X = np.array([[0, -1]])
    negative_y = np.array([1])
    error_msg = "X must not contain negative values."
    assert_raise_message(ValueError, error_msg, model.predict, negative_X)
    assert_raise_message(ValueError, error_msg, model.fit, negative_X, negative_y)

    # An input with the wrong number of features must be rejected.
    wide_X = np.array([[1, 4, 1], [2, 5, 6]])
    msg = "Expected input with 2 features, got 3 instead"
    assert_raise_message(ValueError, msg, model.predict, wide_X)

    # Test alpha
    X3_test = np.array([[2, 5]])
    # alpha=1 increases the count of all categories by one so the final
    # probability for each category is not 50/50 but 1/3 to 2/3
    numerator = np.array([[1 / 3 * 1 / 3, 2 / 3 * 2 / 3]])
    expected_proba = numerator / numerator.sum()
    assert_array_almost_equal(model.predict_proba(X3_test), expected_proba)

    # category_count_ holds one entry per feature.
    assert len(model.category_count_) == X3.shape[1]

    # Check sample_weight
    weighted_X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    weighted_y = np.array([1, 1, 2, 2])
    probe = np.array([[0, 0]])

    model = CategoricalNB(alpha=1, fit_prior=False)
    model.fit(weighted_X, weighted_y)
    assert_array_equal(model.predict(probe), np.array([1]))

    for factor in [1., 0.3, 5, 0.0001]:
        weights = np.array([1, 1, 10, 0.1]) * factor
        model = CategoricalNB(alpha=1, fit_prior=False)
        model.fit(weighted_X, weighted_y, sample_weight=weights)
        assert_array_equal(model.predict(probe), np.array([2]))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]) X_df = train_df clf = CategoricalNB() # errorList = [658, 1562, 5532, 5629, 7401, 9458, 9981, 14080, 17258, 24716, 25047] binned_test_df.loc[658,:] = 0 binned_test_df.loc[1562,:] = 0 binned_test_df.loc[5532,:] = 0 binned_test_df.loc[5629,:] = 0 binned_test_df.loc[7401,:] = 0 binned_test_df.loc[9458,:] = 0 binned_test_df.loc[9981,:] = 0 binned_test_df.loc[14080,:] = 0 binned_test_df.loc[17258,:] = 0 binned_test_df.loc[24716,:] = 0 binned_test_df.loc[25047,:] = 0 predict = clf.fit(X_df, Y_df).predict(test_df.loc[2500:5000]) f = open("CNB.csv", "w") f.write("ImageId,Label\n") for i in range(0, predict.size): f.write("{},{}\n".format(i+1, predict[i])) f.close()
# Tail of the logistic-regression report: confusion matrix of the
# predictions stored under 'Logistic Regression'.
print("Confusion Matrix:")
print(confusion_matrix(y_test, prediction['Logistic Regression']))
#scoring with train data
print('train score:', LR_final.score(X_train_new, y_train))
# scoring with test data
print('test score:', LR_final.score(X_test_new, y_test))
# Bare expression: the probabilities are computed but not stored or printed.
LR_final.predict_proba(X_test_new)

"""# Naive Bayes"""

#use the same train test set as logistic regression
# NOTE: rebinding `prediction` discards the 'Logistic Regression' entry.
prediction = dict()
NB = CategoricalNB()
NB.fit(X_train_new, y_train)
prediction['Naive Bayes'] = NB.predict(X_test_new)

#accuracy, precision, recall, confusion matrix
print("Acurracy:")
print(accuracy_score(y_test, prediction['Naive Bayes']))
print("\n")
print("Classfication report:")
print(classification_report(y_test, prediction['Naive Bayes']))
print("\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, prediction['Naive Bayes']))
#scoring with train data
print('train score:', NB.score(X_train_new, y_train))
# 交叉验证 X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True) # 特征提取 + 数据标准化 pipe = Pipeline([ ('count', CountVectorizer(max_features=100, tokenizer=jieba_tokenizer, stop_words=stop_words, min_df=200)), ('tf-idf', TfidfTransformer()), ('norm', Normalizer()), ]) X_train = pipe.fit_transform(X_train).toarray() print("train size", X_train.shape) X_test = pipe.transform(X_test).toarray() # 训练 model = CategoricalNB() model.fit(X_train, y_train) # 模型评估 y_train_pred = model.predict_proba(X_train)[:, 1] y_test_pred = model.predict_proba(X_test)[:, 1] plot_rocs([y_train, y_test], [y_train_pred, y_test_pred], ["train", "test"]) plot_pcs([y_train, y_test], [y_train_pred, y_test_pred], ["train", "test"])
# Convert all four splits to NumPy arrays.
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

label_encoder = LabelEncoder()
# Encode every index listed in disc_columns.
# NOTE(review): the encoder is re-fitted on the test slice, so train and
# test codes may not correspond; also, on a NumPy array `x_train[i]`
# selects along the first axis - confirm both are intended.
for i in disc_columns:
    x_train[i] = label_encoder.fit_transform(x_train[i])
    x_test[i] = label_encoder.fit_transform(x_test[i])

n_b = MixedNB(categorical_features=disc_columns)

# Everything from the triple quote onwards is a string literal (disabled
# code); its closing quotes lie outside this excerpt.
'''
# "uczymy" sie na zbiorze treningowym
start_time = time.time()
print("Learning and predicting with naive_bayes ...", end=" ")
n_b.fit(x_train, y_train)
# przewidujemy na testowym
y_pred = n_b.predict(x_test)
print(" took %s seconds " % round((time.time() - start_time), 5))
# na testowym znalismy prawdziwe klasy, mozemy porownac jak "dobrze" poszlo
metric_accuracy = metrics.accuracy_score(y_test, y_pred)
print("naive_bayes: accuracy = ", metric_accuracy)
print("full classification report:")
if type(classes_names) is not list:
    target_nms = classes_names.astype(str)
else:
    target_nms = classes_names
# Features are every column except 'y'; 'y' is the target.
X = data.drop(['y'], axis= 1)
print(X)
Y = data.loc[:,'y']
print(Y)

# Build the model
from sklearn.naive_bayes import CategoricalNB
# Create the model instance
model = CategoricalNB()
# Train the model
model.fit(X, Y)

# Class probabilities for the training rows themselves.
y_prdict_prob = model.predict_proba(X)
print(y_prdict_prob)

# Output the predicted y
y_predict = model.predict(X)
print(y_predict)

# Compute the model accuracy (measured on the same data used for training).
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(Y, y_predict)
print(accuracy)
def test_alpha():
    """Check how the naive Bayes estimators handle ``alpha`` of 0 and below.

    ``alpha=0`` must emit a ``UserWarning`` and still produce finite
    probabilities, for dense and sparse input; a negative ``alpha`` must
    raise ``ValueError`` from ``fit`` and ``partial_fit``.
    """
    # Setting alpha=0 should not output nan results when p(x_i|y_j)=0 is a case
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    msg = "alpha too small will result in numeric errors, setting alpha = 1.0e-10"

    bernoulli_proba = np.array([[1, 0], [0, 1]])
    multinomial_proba = np.array([[2.0 / 3, 1.0 / 3], [0, 1]])
    incremental_cases = (
        (BernoulliNB, bernoulli_proba),
        (MultinomialNB, multinomial_proba),
    )

    for estimator_cls, expected in incremental_cases:
        nb = estimator_cls(alpha=0.0)
        with pytest.warns(UserWarning, match=msg):
            nb.partial_fit(X, y, classes=[0, 1])
        with pytest.warns(UserWarning, match=msg):
            nb.fit(X, y)
        assert_array_almost_equal(nb.predict_proba(X), expected)

    # CategoricalNB is only exercised through fit here.
    nb = CategoricalNB(alpha=0.0)
    with pytest.warns(UserWarning, match=msg):
        nb.fit(X, y)
    assert_array_almost_equal(
        nb.predict_proba(X), np.array([[1.0, 0.0], [0.0, 1.0]])
    )

    # Test sparse X
    X = scipy.sparse.csr_matrix(X)
    for estimator_cls, expected in incremental_cases:
        nb = estimator_cls(alpha=0.0)
        with pytest.warns(UserWarning, match=msg):
            nb.fit(X, y)
        assert_array_almost_equal(nb.predict_proba(X), expected)

    # Test for alpha < 0
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    expected_msg = re.escape(
        "Smoothing parameter alpha = -1.0e-01. alpha should be > 0."
    )

    # Estimators are created outside the ``raises`` blocks so that only the
    # fitting call is allowed to raise.
    fit_estimators = [
        BernoulliNB(alpha=-0.1),
        MultinomialNB(alpha=-0.1),
        CategoricalNB(alpha=-0.1),
    ]
    for estimator in fit_estimators:
        with pytest.raises(ValueError, match=expected_msg):
            estimator.fit(X, y)

    partial_fit_estimators = [
        BernoulliNB(alpha=-0.1),
        MultinomialNB(alpha=-0.1),
    ]
    for estimator in partial_fit_estimators:
        with pytest.raises(ValueError, match=expected_msg):
            estimator.partial_fit(X, y, classes=[0, 1])
testingSet = getTestingSet(full_data)
# Training and testing rows combined, so the encoder below is fitted on
# every row of both sets.
dataset = trainingSet + testingSet
print('Classifying')
classifier = CategoricalNB()
encoder = OrdinalEncoder()
# In every row the last element is the label; the rest are the features.
encoder.fit([row[:-1] for row in dataset])
print('Encoding')
X = [row[:-1] for row in trainingSet]
X = encoder.transform(X)
Y = [row[-1] for row in trainingSet]
classifier.fit(X, Y)
test_set_x = encoder.transform([row[:-1] for row in testingSet])
test_set_y = [row[-1] for row in testingSet]
print('Predicting')
predictions = classifier.predict(test_set_x)
# Count exact matches between true and predicted labels.
right = 0
for y, prediction in zip(test_set_y, predictions):
    if y == prediction:
        right += 1
# Fraction of test rows classified correctly.
accuracy = right / len(testingSet)
# Columns removed from `data` before modelling.
drop_cols = ['id']
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
# Replace every column named in cat_cols by integer codes, using a fresh
# encoder per column.
for i in cat_cols:
    le = LabelEncoder()
    data[i] = le.fit_transform(data[i])
data.drop(columns=drop_cols, inplace=True)
# Rows with risk_flag == -1 are set aside as test_df; all other rows are
# used for the train/validation split below.
train_df = data.loc[data['risk_flag'] != -1]
test_df = data.loc[data['risk_flag'] == -1]
# Split stratified on the target.
X_tr, X_tst, y_tr, y_tst = train_test_split(
    train_df.drop(columns=['risk_flag']), train_df['risk_flag'], stratify=train_df['risk_flag'])
from sklearn.naive_bayes import CategoricalNB
clf = CategoricalNB()
clf.fit(X_tr, y_tr)
# Bare expression: the value is discarded.
# NOTE(review): verify that CategoricalNB exposes `feature_count_`.
clf.feature_count_
k = clf.predict(X_tst)
# The score is computed but neither stored nor printed.
roc_auc_score(y_tst, clf.predict(X_tst))
# NOTE(review): `X` is not defined in this excerpt - confirm it exists.
print(clf.predict(X[2:3]))
target_names = ["0", "1"]
# Training data bundled in one dict: feature matrix, targets and the names
# of both.
dataset = {
    "data": dataArray,
    "target": target,
    "feature_names": columnsIncluded,
    "target_names": target_names
}
# predict and output the test result
df_test = pd.DataFrame(data_test)
# Replace the Geography and Gender strings by integer codes.
# NOTE(review): presumably these match the codes used for dataArray -
# confirm.
df_test.loc[df_test["Geography"]=="France", "Geography"] = 0
df_test.loc[df_test["Geography"]=="Spain", "Geography"] = 1
df_test.loc[df_test["Geography"]=="Germany", "Geography"] = 2
df_test.loc[df_test["Gender"]=="Male", "Gender"] = 0
df_test.loc[df_test["Gender"]=="Female", "Gender"] = 1
##########################################################
# train the model
clf = CategoricalNB(alpha = 1)
clf.fit(dataset['data'],dataset['target'])
predictedTestResult = clf.predict(df_test[columnsIncluded].values)
# Submission file: RowNumber plus the predicted "Exited" column.
df_testOutput = df_test[["RowNumber"]]
df_testOutput.insert(1, "Exited", predictedTestResult, True)
df_testOutput.to_csv("submission_2_Bayes.csv", index=False)
# compute f1 score
f1_score_result = evaluateTask2.f1_score(predictedTestResult)
print("f1-score: " + str(f1_score_result))