def test_categoricalnb():
    """Smoke-test CategoricalNB: training-set round-trip, n_categories_,
    input-validation errors, alpha smoothing math, category counting, and
    sample_weight handling."""
    # Check the ability to predict the training set.
    clf = CategoricalNB()
    y_pred = clf.fit(X2, y2).predict(X2)
    assert_array_equal(y_pred, y2)
    X3 = np.array([[1, 4], [2, 5]])
    y3 = np.array([1, 2])
    clf = CategoricalNB(alpha=1, fit_prior=False)
    clf.fit(X3, y3)
    # n_categories_ per feature is max(category index) + 1: [3, 6]
    assert_array_equal(clf.n_categories_, np.array([3, 6]))
    # Check error is raised for X with negative entries
    X = np.array([[0, -1]])
    y = np.array([1])
    error_msg = "Negative values in data passed to CategoricalNB (input X)"
    assert_raise_message(ValueError, error_msg, clf.predict, X)
    assert_raise_message(ValueError, error_msg, clf.fit, X, y)
    # Check error is raised for incorrect X
    X = np.array([[1, 4, 1], [2, 5, 6]])
    msg = "Expected input with 2 features, got 3 instead"
    assert_raise_message(ValueError, msg, clf.predict, X)
    # Test alpha
    X3_test = np.array([[2, 5]])
    # alpha=1 increases the count of all categories by one so the final
    # probability for each category is not 50/50 but 1/3 to 2/3
    bayes_numerator = np.array([[1/3*1/3, 2/3*2/3]])
    bayes_denominator = bayes_numerator.sum()
    assert_array_almost_equal(clf.predict_proba(X3_test),
                              bayes_numerator / bayes_denominator)
    # Assert category_count has counted all features
    assert len(clf.category_count_) == X3.shape[1]
    # Check sample_weight
    X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    y = np.array([1, 1, 2, 2])
    clf = CategoricalNB(alpha=1, fit_prior=False)
    clf.fit(X, y)
    assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([1]))
    assert_array_equal(clf.n_categories_, np.array([2, 2]))
    for factor in [1., 0.3, 5, 0.0001]:
        X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
        y = np.array([1, 1, 2, 2])
        # weighting sample 3 (class 2) heavily must flip the [0, 0]
        # prediction to class 2, independent of the overall scale factor
        sample_weight = np.array([1, 1, 10, 0.1]) * factor
        clf = CategoricalNB(alpha=1, fit_prior=False)
        clf.fit(X, y, sample_weight=sample_weight)
        assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([2]))
        assert_array_equal(clf.n_categories_, np.array([2, 2]))
def categoricalNaiveBayes(dtrain, dtest):
    """Fit CategoricalNB on dtrain (last column = labels) and return the
    predictions for dtest.

    NOTE(review): MinMax scaling produces continuous floats, while
    CategoricalNB expects non-negative integer category codes — confirm
    the inputs are integer-coded before/after scaling. TODO confirm.
    """
    y_train = dtrain[:, -1]
    x_train = dtrain[:, :-1]
    # Fit the scaler on the training features only and reuse it for the
    # test data.  The original fitted a *second* scaler on dtest, which
    # leaks test-set statistics and maps the two sets inconsistently.
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    dtest = scaler.transform(dtest)
    gnb = CategoricalNB()
    gnb.fit(x_train, y_train)
    print("GNB Features")
    print("GNB cat count Features")
    print(gnb.category_count_)
    print("GNB class count Features")
    print(gnb.class_count_)
    print("GNB feature log prob Features")
    print(gnb.feature_log_prob_)
    print("GNB n Features")
    print(gnb.n_features_)
    print("Length test")
    print(len(dtest[0]))
    predictions = gnb.predict(dtest)
    return predictions
def run_kfold(fields, labels):
    """Run 5-fold cross-validation of CategoricalNB over fields/labels and
    return the (train_index, test_index) pair of the best-scoring fold.

    Folds whose test split contains a category never seen during fit are
    skipped (CategoricalNB raises IndexError for those).
    """
    kf = KFold(n_splits=5)
    best = [], []
    best_accuracy = 0
    # train_index and test_index index into fields and labels
    for train_index, test_index in kf.split(fields):
        train_fields = fields.iloc[train_index].reset_index(drop=True)
        train_labels = [labels[i] for i in train_index]
        test_fields = fields.iloc[test_index].reset_index(drop=True)
        test_labels = [labels[i] for i in test_index]
        clf = CategoricalNB()
        clf.fit(train_fields, train_labels)
        try:
            res = clf.predict(test_fields).tolist()
        except IndexError:
            continue
        # The original built this list twice — a for-loop immediately
        # overwritten by an identical comprehension; keep only one.
        accuracy = [1 if res[i] == test_labels[i] else 0
                    for i in range(len(res))]
        acc = sum(accuracy) / len(accuracy)
        if acc > best_accuracy:
            best = train_index, test_index
            best_accuracy = acc
        print("accuracy rate: ", acc)
    return best
def run_model(training, testing, fields, labels):
    """Train CategoricalNB on the given index split and report the
    false-positive, false-negative, and accuracy rates on the test split.

    Returns (predictions, accuracy, fp_rate, fn_rate).
    """
    train_fields = fields.iloc[training].reset_index(drop=True)
    train_labels = [labels[i] for i in training]
    test_fields = fields.iloc[testing].reset_index(drop=True)
    test_labels = [labels[i] for i in testing]

    clf = CategoricalNB()
    clf.fit(train_fields, train_labels)
    res = clf.predict(test_fields).tolist()

    # Encode each prediction: 1 = false positive, -1 = false negative,
    # 0 = correct.
    outcome = []
    for predicted, actual in zip(res, test_labels):
        if predicted == 1 and actual == 0:
            outcome.append(1)
        elif predicted == 0 and actual == 1:
            outcome.append(-1)
        else:
            outcome.append(0)

    total = len(outcome)
    fp = sum(1 for flag in outcome if flag == 1) / total
    fn = sum(1 for flag in outcome if flag == -1) / total
    acc = sum(1 for flag in outcome if flag == 0) / total
    print("false positive rate: %4f" % fp)
    print("false negative rate: %4f" % fn)
    print("accuracy: %4f" % acc)
    return res, acc, fp, fn
def perform_bayes(df):
    """Label-encode every column of df, train CategoricalNB on a 75/25
    split, print the accuracy, classify one hard-coded example patient,
    and plot the ROC curve plus a normalized confusion matrix."""
    les = build_labelencoders(df)
    res = []
    for i in range(len(df.columns)):
        col = df.iloc[:, i].values
        res.append(les[i].transform(col))
    # rows of `res` are encoded columns, so transpose back to samples x features
    res = pd.DataFrame(res).transpose()
    # features = all but the last column; target = last column
    x = res.iloc[:, :-1]
    y = res.iloc[:, -1:]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)
    model = CategoricalNB()
    model.fit(x_train, y_train.values.ravel())
    y_pred = model.predict(x_test)
    # probability of the positive class (column 1)
    y_pred_probability = model.predict_proba(x_test)[::, 1]
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(accuracy)
    # example patient
    test = ['50-59','ge40','50-54','24-26','no','1','right','left_up','yes']
    print(test)
    # transform using labelencoders
    for i in range(len(test)):
        e = test[i]
        test[i] = les[i].transform(np.array(e).reshape(1, ))
    test = np.array(test)
    # do prediction
    y = model.predict(test.reshape(1, -1))
    # translate back via the label column's encoder
    y = les[-1].inverse_transform(y)[0]
    print(y)
    a, b, _ = roc_curve(y_test, y_pred_probability)
    area_under_curve = roc_auc_score(y_test, y_pred_probability)
    plt.plot(a, b, label="area under curve="+str(area_under_curve))
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    # NOTE(review): `plt.axis` is referenced but never called — a no-op as
    # written; likely leftover. Confirm intent before removing.
    plt.axis
    plt.legend(loc=4)
    plot_confusion_matrix(model, x_train, y_train.values.ravel(),
                          normalize='true',
                          display_labels=les[-1].inverse_transform([0, 1]))
    plt.show()
def test(X_train, Y_train, X_val, Y_val, categorical=False):
    """Fit a Naive Bayes model (CategoricalNB when `categorical` is true,
    otherwise GaussianNB) and return the F1 score on the validation split."""
    from sklearn.naive_bayes import GaussianNB, CategoricalNB

    model = CategoricalNB() if categorical else GaussianNB()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_val)
    return f1_score(Y_val, Y_pred)
def check_sklearn_dev():
    """
    This just verifies that sklearn 0.23-dev is installed properly
    by checking CategoricalNB results
    """
    rng = np.random.RandomState(1)
    features = rng.randint(5, size=(6, 100))
    labels = np.array([1, 2, 3, 4, 5, 6])
    model = CategoricalNB()
    model.fit(features, labels)
    # sample 2 was labeled 3, so the fitted model must recover it
    assert [3] == model.predict(features[2:3])
def test_categoricalnb_with_min_categories(
    min_categories, exp_X1_count, exp_X2_count, new_X, exp_n_categories_
):
    """Check that min_categories is reflected in category_count_,
    n_categories_, and in the prediction for new_X."""
    features = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    targets = np.array([1, 1, 2, 2])

    clf = CategoricalNB(alpha=1, fit_prior=False, min_categories=min_categories)
    clf.fit(features, targets)

    counts_f1, counts_f2 = clf.category_count_
    assert_array_equal(counts_f1, exp_X1_count)
    assert_array_equal(counts_f2, exp_X2_count)

    assert_array_equal(clf.predict(new_X), np.array([1]))
    assert_array_equal(clf.n_categories_, exp_n_categories_)
def cnb(train_x, train_y, test_x, test_y):
    """Fit a Categorical Naive Bayes classifier, print MSE and macro-F1 on
    the test split, and save a normalized confusion-matrix heatmap to
    ./output/CompNB.png."""
    compnb = CategoricalNB()
    compnb.fit(train_x, train_y)
    y_predictions = compnb.predict(test_x)
    # mean_squared_error returns MSE (not RMSE) and the model is
    # CategoricalNB (not Complement) — the original message said
    # "RMSE for Complement Naive Bayes model", both wrong.
    print("MSE for Categorical Naive Bayes model = ",
          mean_squared_error(test_y, y_predictions))
    my_f1 = f1_score(test_y, y_predictions, average='macro')
    print("f1_macro for Categorical Naive Bayes Classifier = ", my_f1)
    cm = confusion_matrix(test_y, y_predictions, normalize='true')
    sns.heatmap(cm, annot=True)
    plt.title('Confusion matrix of the Categorical Naive Bayes classifier')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig('./output/CompNB.png')
    plt.show()
def bayes(test_set, training_set, categories):
    """Train CategoricalNB on training_set and print its score and a
    classification report on test_set."""
    classifier = CategoricalNB()
    train_x, train_y = build_xy(training_set, categories)
    classifier.fit(train_x, train_y)
    # NOTE: these counters are never used (kept from the original code)
    false_positives = 0
    false_negatives = 0
    true_positives = 0
    true_negatives = 0
    test_x, test_y = build_xy(test_set, categories)
    y_predicted = classifier.predict(test_x)
    print(f'score: {classifier.score(test_x, test_y)}')
    print('bayes confusion matrix')
    print(classification_report(test_y, y_predicted))
class CategoricalBatchNB(TransformerMixin):
    """CategoricalNB wrapper that fits and predicts in fixed-size batches
    via partial_fit, densifying each sparse slice with .toarray()."""

    def __init__(self, batch_size, classes, *args, **kwargs):
        self._batch_size = batch_size
        self._classes = classes
        self._args = args
        self._kwargs = kwargs
        self._model = CategoricalNB(*args, **kwargs)

    def fit(self, x, y, **fit_params):
        """Incrementally fit a fresh CategoricalNB over batches of x/y."""
        size = self._batch_size
        self._model = CategoricalNB(*self._args, **self._kwargs)
        # iterate over batch end-offsets; the final batch may be short
        for end in tqdm(range(size, x.shape[0] + size, size)):
            start = end - size
            self._model.partial_fit(
                x[start:end, :].toarray(),
                y[start:end],
                classes=self._classes
            )
        return self

    @staticmethod
    def transform(x, y=None, **fit_params):
        """Pass-through: this transformer does not modify x."""
        return x

    def predict(self, x):
        """Predict labels batch by batch and return one flat array."""
        size = self._batch_size
        collected = []
        for end in tqdm(range(size, x.shape[0] + size, size)):
            batch_pred = self._model.predict(x[end - size:end, :].toarray())
            collected.extend(batch_pred.tolist())
        return np.array(collected).ravel()

    def score(self, x, y):
        """Accuracy of batched predictions against y."""
        return accuracy_score(y, self.predict(x))

    def __str__(self):
        return "CategoricalBatchNB()"

    def __repr__(self):
        return self.__str__()
def pengujian():
    """Flask view: train CategoricalNB on 2005-2019 rows of the
    `preprocessing` table, evaluate on 2020 rows, and render the confusion
    matrix plus accuracy.  (Indonesian: "pengujian" = "testing".)"""
    # admin-only page
    if "admin" not in session:
        return redirect(url_for("index"))
    mydb.connect()
    cursor = mydb.cursor()
    cursor.execute(
        "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019)'"
    )
    training = cursor.fetchall()
    # columns 0-4 are the encoded features, column 5 is the label
    X = [[x[0], x[1], x[2], x[3], x[4]] for x in training]
    y = [x[5] for x in training]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
    clf = CategoricalNB()
    clf.fit(X, y)
    cursor.execute(
        "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2020)'"
    )
    testing = cursor.fetchall()
    X_test = [[x[0], x[1], x[2], x[3], x[4]] for x in testing]
    y_test = [x[5] for x in testing]
    predicted = clf.predict(X_test)
    payload = []
    for index, x in enumerate(X_test):
        arr = x
        # NOTE(review): this appends the *training* label y[index] to a
        # test row — y_test[index] (or predicted[index]) was probably
        # intended.  `payload` is also never passed to render_template
        # below.  Confirm before changing.
        arr.append(y[index])
        payload.append({
            "no": index + 1,
            "stasiuntv": arr[0],
            "genre": arr[1],
            "writer": arr[2],
            "director": arr[3],
            "actor": arr[4],
            "status": arr[5],
        })
    hasil = confusion_matrix(y_test, predicted)
    # binary accuracy = (TP + TN) / total
    akurasi = (hasil[0][0] + hasil[1][1]) / (hasil[0][0] + hasil[0][1] + hasil[1][0] + hasil[1][1])
    return render_template("pengujian.html", hasil=hasil, akurasi=round(akurasi * 100))
def test_predict_meta_override():
    """ParallelPostFit.predict on a dask frame whose _meta mismatches the
    data: fails without predict_meta, succeeds when it is provided."""
    X = pd.DataFrame({"c_0": [1, 2, 3, 4]})
    y = np.array([1, 2, 3, 4])
    base = CategoricalNB()
    base.fit(pd.DataFrame(X), y)

    dd_X = dd.from_pandas(X, npartitions=2)
    dd_X._meta = pd.DataFrame({"c_0": [5]})

    # Without predict_meta, meta inference runs the value-dependent model
    # on the fake _meta values and fails
    with pytest.raises(ValueError):
        ParallelPostFit(base).predict(dd_X)

    # Providing an explicit predict_meta over-ride succeeds
    wrap = ParallelPostFit(base, predict_meta=np.array([1]))
    result = wrap.predict(dd_X)
    assert_eq_ar(result, base.predict(X))
def classificationCategoricalNaiveBayes():
    """Train CategoricalNB on data.csv, persist the model to model.pkl,
    and print prediction timing plus accuracy/precision/recall and the
    confusion matrix for a held-out 20% test split."""
    col_names = [
        '*', 'web1', 'web2', 'cosine', 'len', 'word', 'sameDomain', 'label'
    ]
    # load dataset
    pima = pd.read_csv("data.csv", names=col_names)

    # features and target variable
    feature_cols = ['cosine', 'len', 'word', 'sameDomain']
    X = pima[feature_cols]
    y = pima.label

    # 80% training and 20% test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=4)

    clf = CategoricalNB()
    clf.fit(X_train, y_train)
    # save the model
    dump(clf, open('model.pkl', 'wb'))

    # time only the prediction step
    startTime = datetime.now()
    y_pred = clf.predict(X_test)
    endTime = datetime.now()
    print("exec time :", endTime - startTime)

    # model quality metrics
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    print("precision:", metrics.average_precision_score(y_test, y_pred))
    print("recall:", metrics.recall_score(y_test, y_pred))
    print()
    print(confusion_matrix(y_test, y_pred))
def naive_bayes_adapter(**kwargs):
    """Train CategoricalNB (alpha=1, Laplace smoothing) on kwargs['train']
    and score it on kwargs['test'].

    Features are ordinal-encoded over the union of both frames so that
    categories unseen in train are still representable; the 'class'
    column is mapped yes/no -> 1/0.  Returns the dict produced by
    create_return_dict (confusion matrix + score).
    """
    train = kwargs['train']
    test = kwargs['test']
    # merging data to get all uniques and to build one encoder for both
    merged_data = pd.concat([train, test])
    # add rows for uniques missing from train (rest filled with the most
    # common values) so the encoder never sees an unknown category
    train = improove_train(train, merged_data)
    # Positional `axis` for DataFrame.drop was deprecated and removed in
    # pandas 2.0 — use the keyword form throughout.
    merged_data_without_class = merged_data.drop('class', axis=1)
    encoder = OrdinalEncoder()
    encoder.fit(merged_data_without_class)
    # separate the classification column from the datasets
    train_without_class = train.drop('class', axis=1)
    test_without_class = test.drop('class', axis=1)
    train_classifications = train['class']
    test_classifications = test['class']
    # encode features and labels
    encoded_train_without_class = encoder.transform(train_without_class)
    encoded_test_without_class = encoder.transform(test_without_class)
    encoded_train_classifications = train_classifications.map({
        'yes': 1,
        'no': 0
    })
    encoded_test_classifications = test_classifications.map({
        'yes': 1,
        'no': 0
    })
    # alpha=1 is Laplace smoothing
    clf = CategoricalNB(alpha=1)
    clf.fit(encoded_train_without_class, encoded_train_classifications)
    predictions = clf.predict(encoded_test_without_class)
    # returning matrix and calculating score
    return create_return_dict(predictions, encoded_test_classifications)
y_test = np.array(y_test) label_encoder = LabelEncoder() for i in disc_columns: x_train[i] = label_encoder.fit_transform(x_train[i]) x_test[i] = label_encoder.fit_transform(x_test[i]) n_b = MixedNB(categorical_features=disc_columns) ''' # "uczymy" sie na zbiorze treningowym start_time = time.time() print("Learning and predicting with naive_bayes ...", end=" ") n_b.fit(x_train, y_train) # przewidujemy na testowym y_pred = n_b.predict(x_test) print(" took %s seconds " % round((time.time() - start_time), 5)) # na testowym znalismy prawdziwe klasy, mozemy porownac jak "dobrze" poszlo metric_accuracy = metrics.accuracy_score(y_test, y_pred) print("naive_bayes: accuracy = ", metric_accuracy) print("full classification report:") if type(classes_names) is not list: target_nms = classes_names.astype(str) else: target_nms = classes_names print(classification_report(y_test, y_pred, target_names=target_nms))
#建立模型 from sklearn.naive_bayes import CategoricalNB #建立模型实例 model = CategoricalNB() #训练模型 model.fit(X, Y) y_prdict_prob = model.predict_proba(X) print(y_prdict_prob) #输出预测y y_predict = model.predict(X) print(y_predict) #计算模型准确率 from sklearn.metrics import accuracy_score accuracy = accuracy_score(Y, y_predict) print(accuracy) #测试样本预测 X_test = np.array([[0,0,0,1,1,0]]) #模型的可能性预测 y_test_proba = model.predict_proba(X_test) print(y_test_proba) #模型预测结果 y_test = model.predict(X_test)
# class labels for the sklearn-style dataset dict
target_names = ["0", "1"]
dataset = {
    "data": dataArray,
    "target": target,
    "feature_names": columnsIncluded,
    "target_names": target_names
}

# predict and output the test result
df_test = pd.DataFrame(data_test)
# manually ordinal-encode the two string columns
df_test.loc[df_test["Geography"]=="France", "Geography"] = 0
df_test.loc[df_test["Geography"]=="Spain", "Geography"] = 1
df_test.loc[df_test["Geography"]=="Germany", "Geography"] = 2
df_test.loc[df_test["Gender"]=="Male", "Gender"] = 0
df_test.loc[df_test["Gender"]=="Female", "Gender"] = 1
##########################################################
# train the model (alpha=1 -> Laplace smoothing)
clf = CategoricalNB(alpha = 1)
clf.fit(dataset['data'],dataset['target'])
predictedTestResult = clf.predict(df_test[columnsIncluded].values)
# NOTE(review): this slice is a view of df_test; calling .insert on it may
# raise SettingWithCopyWarning — consider .copy(). Behavior unchanged here.
df_testOutput = df_test[["RowNumber"]]
df_testOutput.insert(1, "Exited", predictedTestResult, True)
df_testOutput.to_csv("submission_2_Bayes.csv", index=False)
# compute f1 score
f1_score_result = evaluateTask2.f1_score(predictedTestResult)
print("f1-score: " + str(f1_score_result))
# Fit the encoder on the feature columns (all but the last) of every row
encoder.fit([row[:-1] for row in dataset])
print('Encoding')
# features = all but last column; label = last column
X = [row[:-1] for row in trainingSet]
X = encoder.transform(X)
Y = [row[-1] for row in trainingSet]
classifier.fit(X, Y)
test_set_x = encoder.transform([row[:-1] for row in testingSet])
test_set_y = [row[-1] for row in testingSet]
print('Predicting')
predictions = classifier.predict(test_set_x)
# manual accuracy: fraction of exact matches
right = 0
for y, prediction in zip(test_set_y, predictions):
    if y == prediction:
        right += 1
accuracy = right / len(testingSet)
print(accuracy)
print(f1_score(test_set_y, predictions))
# compare against a random forest on the same encoded split
randomForestClassification(X, Y, test_set_x, test_set_y)
test_primeroci = pd.read_csv("Sample.csv").values.tolist()
test_primeroci = getSampleDataset(test_primeroci)
# Linear regression used as a classifier by rounding its continuous output
model_linreg = LinearRegression()
print_dict(model_linreg.get_params(), 'LinearRegressor params:')
model_linreg.fit(X_train, y_train)
y_predict_linreg = model_linreg.predict(X_test)
y_predict_linreg = np.round(y_predict_linreg).astype(
    int)  # Regressor -> classifier!
error_rate_linreg = test(y_predict_linreg, y_test)
print(f'Linear Regressor score: {model_linreg.score(X_test, y_test):.3g}')

# %% [markdown]
# # Naive Bayesian:

# %%
model_bayes = CategoricalNB()
print_dict(model_bayes.get_params(), 'CategoricalNB params:')
model_bayes.fit(X_train, y_train)
y_predict_bayes = model_bayes.predict(X_test)
error_rate_bayes = test(y_predict_bayes, y_test)
print(f'Naive Bayesian score: {model_bayes.score(X_test, y_test):.3g}')

# %% [markdown]
# # NearestNeighbors:

# %%
model_nn = KNeighborsClassifier()
print_dict(model_nn.get_params(), 'KNeighborsClassifier params:')
model_nn.fit(X_train, y_train)
y_predict_nn = model_nn.predict(X_test)
error_rate_nn = test(y_predict_nn, y_test)
print(f'Nearest Neighbors score: {model_nn.score(X_test, y_test):.3g}')

# %% [markdown]
# # DecisionTree:
# Read pixel values into X, read class values into y df_X = pandas.read_csv("../../data/x_train_gr_smpl.csv") df_y = pandas.read_csv("../../data/y_train_smpl.csv") # Shuffle the order of the data (keeping the X and y rows in sync) df_X, df_y = shuffle(df_X, df_y) # Split dataset into training and testing set, 90% and 10%, respectively X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.1, random_state=0) naive_bayes = CategoricalNB() classifier = naive_bayes.fit(X_train, y_train) y_predicted = naive_bayes.predict(X_test) print("\nNaive Bayes accuracy score: ", round(metrics.accuracy_score(y_test, y_predicted) * 100, 2), "%\n") # Plot non-normalized confusion matrix labels = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"] np.set_printoptions(precision=2) # Plot non-normalized confusion matrix titles_options = [("Confusion matrix, without normalization", None), ("Normalized confusion matrix", 'true')] for title, normalize in titles_options: disp = plot_confusion_matrix(classifier, X_test,
def class_metric_full_process(data, target):
    """Evaluate four classifiers (MLP, KNN, CategoricalNB, DecisionTree)
    with leave-one-out cross-validation and print a classification report
    for each.  The original repeated the same ~15-line loop four times;
    it is factored into _loo_report below."""
    x = data
    y = target
    models = [
        ("MLP", MLPClassifier(hidden_layer_sizes=(10, ))),
        ("KNN", KNeighborsClassifier()),
        ("Naive Bayes", CategoricalNB()),
        ("Decision Tree", DecisionTreeClassifier()),
    ]
    for name, model in models:
        print(name)
        _loo_report(model, x, y)


def _loo_report(model, x, y):
    """Leave-one-out evaluation of `model`; prints a classification report.

    Samples whose prediction raises (e.g. CategoricalNB hitting an unseen
    category) are skipped, matching the original behavior.
    """
    total_pred = list()
    total_res = list()
    loo = LeaveOneOut()
    skipped = 0
    for train_index, test_index in loo.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(x_train, y_train)
        try:
            results = model.predict(x_test)
            total_pred.append(results)
            total_res.append(y_test)
        except Exception:
            # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate
            skipped += 1
    # classification_report expects (y_true, y_pred); the original passed
    # predictions first, which swaps precision and recall in the report.
    print(classification_report(total_res, total_pred, digits=3))
# Saves off encoded labels separately train_labels = train_data['Label'] test_labels = test_data['Label'] # Dropping unneccessary features (and labels) train_data = train_data.drop("Label", axis=1) test_data = test_data.drop("Label", axis=1) columns = list(train_data.to_dict().keys()) # Create NB model clf = CategoricalNB() clf.fit(train_data[columns], train_labels) # Get TP, FP, TN, and FN rates nb_predictions = clf.predict(test_data[columns]) tp = 0 fp = 0 tn = 0 fn = 0 for i in range(len(nb_predictions)): # True positive if nb_predictions[i] == 'Win' and test_labels[i] == 'Win': tp += 1 # False positive elif nb_predictions[i] == 'Win' and test_labels[i] == 'Lose': fp += 1 #False negative elif nb_predictions[i] == 'Lose' and test_labels[i] == 'Win': fn += 1 # True negative
(tbl.loc['setting'] == top_model[1]).values] best_params = tmp.loc['params'].iloc[0] #%% # ## Test model nb = CategoricalNB() nb.set_params(**best_params) #Prep data x, y = prep_nb(x_train, y_train) #Fit model nb.fit(x, y) #Compute performance on training set pred = nb.predict(x) score_training = [m(y, pred) for m in metrics] #Predict on test set x, y = prep_nb(x_test, y_test) pred = nb.predict(x) #Compute scores score = [m(y, pred) for m in metrics] # We can see that training scores and test scores are equivalent, i.e. we are confident to not have overfitted. #%% plot_confusion_matrix(nb, x, y, cmap=plt.cm.Blues, normalize='true') #fig =plot_roc_curve(nb, x,y, response_method='predict_proba')
X = df.drop(df.columns[-1], axis=1) y = df[df.columns[-1]] X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0) clf = CategoricalNB(min_categories=np.array(list( min_categories.values())).astype(int)[:-1]) clf.fit(X_train, y_train) if args.v: print("----------------------") print("Initial accuracy:") print("Train accuracy: ", accuracy_score(clf.predict(X_train), y_train)) print("Test accuracy: ", accuracy_score(clf.predict(X_test), y_test)) print("----------------------") if args.p is not None: print("----------------------") print("Rounded accuracy (precision=" + str(args.p) + "):") print("Train accuracy: ", accuracy_score(predict_proba(X_train, clf, args.p), y_train)) print("Test accuracy: ", accuracy_score(predict_proba(X_test, clf, args.p), y_test)) print("----------------------") if args.ox: if not os.path.exists(os.path.dirname(args.ox)):
# accuracy of the amount-based model on the dev split
accuracy = np.mean(
    pred == dev_labels['bank_transaction_category'].values.astype('U'))
print(accuracy, 'amounts')
# doesnt take into account correlations between features

# model for transaction type
encoder = OrdinalEncoder()
train_cat = encoder.fit_transform(
    (train_X['bank_transaction_type'].values.astype('U')).reshape(-1, 1))
# NOTE(review): fit_transform on the dev split *refits* the encoder — the
# train-time encoding should probably be reused (encoder.transform).
# Confirm before changing; behavior left as-is.
dev_cat = encoder.fit_transform(
    (dev_X['bank_transaction_type'].values.astype('U')).reshape(-1, 1))
clf_type = CategoricalNB()
clf_type.fit(train_cat,
             train_labels['bank_transaction_category'].values.astype('U'))
predicted = clf_type.predict(dev_cat)
accuracy = np.mean(
    predicted == dev_labels['bank_transaction_category'].values.astype('U'))
print(accuracy, 'transaction type')

# combine features: weighted sum of the three models' class probabilities
total_probs = 0.91 * clf_desc.predict_proba(
    X_dev_tfidf) + 0.6 * clf_amount.predict_proba(dev_amount.reshape(
        -1, 1)) + 0.6 * clf_type.predict_proba(dev_cat)
index = clf_desc.classes_
predicted = []
for probs in total_probs:
    # nanargmax ignores NaN probabilities contributed by any single model
    max_index = np.nanargmax(probs)
    predicted.append(index[max_index])
def public_classification():
    """Flask view.

    POST: label-encode the submitted show attributes, train CategoricalNB
    on all `preprocessing` rows (2005-2020), and render the predicted
    status.  GET: render the form with distinct dropdown values from the
    `dataset` table.
    """
    if request.method == "POST":
        stasiuntv_ = request.form["stasiuntv"]
        genre_ = request.form["genre"]
        penulis_ = request.form["penulis"]
        direktur_ = request.form["direktur"]
        tokohutama_ = request.form["tokohutama"]
        mydb.connect()
        cursor = mydb.cursor()
        cursor.execute("SELECT * FROM dataset")
        data = cursor.fetchall()
        # Re-fit one LabelEncoder per attribute over the whole dataset so
        # the submitted strings map to the same codes used for training.
        labelEncoderStasiunTV = LabelEncoder()
        stasiuntv = [x[1] for x in data]
        stasiuntv = labelEncoderStasiunTV.fit_transform(stasiuntv)
        labelEncoderGenre = LabelEncoder()
        genre = [x[2] for x in data]
        genre = labelEncoderGenre.fit_transform(genre)
        labelEncoderWriter = LabelEncoder()
        writer = [x[3] for x in data]
        writer = labelEncoderWriter.fit_transform(writer)
        labelEncoderDirector = LabelEncoder()
        director = [x[4] for x in data]
        director = labelEncoderDirector.fit_transform(director)
        labelEncoderActor = LabelEncoder()
        actor = [x[5] for x in data]
        actor = labelEncoderActor.fit_transform(actor)
        labelEncoderStatus = LabelEncoder()
        status = [x[10] for x in data]
        status = labelEncoderStatus.fit_transform(status)
        # encode the submitted form values
        s = labelEncoderStasiunTV.transform([stasiuntv_])[0]
        g = labelEncoderGenre.transform([genre_])[0]
        p = labelEncoderWriter.transform([penulis_])[0]
        d = labelEncoderDirector.transform([direktur_])[0]
        t = labelEncoderActor.transform([tokohutama_])[0]
        cursor.execute(
            "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019|2020)'"
        )
        training = cursor.fetchall()
        # columns 0-4 are encoded features, column 5 is the label
        X = [[x[0], x[1], x[2], x[3], x[4]] for x in training]
        y = [x[5] for x in training]
        clf = CategoricalNB()
        clf.fit(X, y)
        # decode the numeric prediction back to the status string
        hasil = labelEncoderStatus.inverse_transform(
            [clf.predict([[s, g, p, d, t]])[0]])[0]
        mydb.close()
        return render_template("public_classification.html", hasil=hasil)
    # GET: collect distinct values for each dropdown
    mydb.connect()
    cursor = mydb.cursor()
    cursor.execute("SELECT DISTINCT(stasiuntv) FROM dataset")
    stasiuntv = [x[0] for x in cursor.fetchall()]
    cursor.execute("SELECT DISTINCT(genre) FROM dataset")
    genre = [x[0] for x in cursor.fetchall()]
    cursor.execute("SELECT DISTINCT(penulis) FROM dataset")
    penulis = [x[0] for x in cursor.fetchall()]
    cursor.execute("SELECT DISTINCT(direktur) FROM dataset")
    direktur = [x[0] for x in cursor.fetchall()]
    cursor.execute("SELECT DISTINCT(tokohutama) FROM dataset")
    tokohutama = [x[0] for x in cursor.fetchall()]
    cursor.close()
    mydb.close()
    # NOTE(review): template name here is "public_Classification.html"
    # (capital C) while the POST branch uses "public_classification.html" —
    # case matters on Linux filesystems; confirm which file exists.
    return render_template("public_Classification.html",
                           genre=genre,
                           stasiuntv=stasiuntv,
                           penulis=penulis,
                           direktur=direktur,
                           tokohutama=tokohutama)
#scoring with train data print('train score:', LR_final.score(X_train_new, y_train)) # scoring with test data print('test score:', LR_final.score(X_test_new, y_test)) LR_final.predict_proba(X_test_new) """# Naive Bayes""" #use the same train test set as logistic regression prediction = dict() NB = CategoricalNB() NB.fit(X_train_new, y_train) prediction['Naive Bayes'] = NB.predict(X_test_new) #accuracy, precision, recall, confusion matrix print("Acurracy:") print(accuracy_score(y_test, prediction['Naive Bayes'])) print("\n") print("Classfication report:") print(classification_report(y_test, prediction['Naive Bayes'])) print("\n") print("Confusion Matrix:") print(confusion_matrix(y_test, prediction['Naive Bayes'])) #scoring with train data print('train score:', NB.score(X_train_new, y_train)) # scoring with test data
# columns to discard before modelling
drop_cols = ['id']
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score
# integer-encode every categorical column in place
for i in cat_cols:
    le = LabelEncoder()
    data[i] = le.fit_transform(data[i])
data.drop(columns=drop_cols, inplace=True)
# rows with risk_flag == -1 are the unlabeled test portion
train_df = data.loc[data['risk_flag'] != -1]
test_df = data.loc[data['risk_flag'] == -1]
X_tr, X_tst, y_tr, y_tst = train_test_split(
    train_df.drop(columns=['risk_flag']),
    train_df['risk_flag'],
    stratify=train_df['risk_flag'])
from sklearn.naive_bayes import CategoricalNB
clf = CategoricalNB()
clf.fit(X_tr, y_tr)
# NOTE(review): bare attribute access — no effect outside a notebook cell
clf.feature_count_
k = clf.predict(X_tst)
# NOTE(review): result unused — likely notebook cell output
roc_auc_score(y_tst, clf.predict(X_tst))
# NOTE(review): `X` is not defined in this chunk — presumably from earlier
# code in the original file; verify.
print(clf.predict(X[2:3]))
""" clf_b = BernoulliNB() clf_b.fit(X_train, y_train) prd = clf_b.predict(X_test) metrics.accuracy_score(y_test, prd) # 0.9818 roc_auc_score(y_test, prd) # 0.6832 # --------------------------------- clf_c = CategoricalNB() clf_c.fit(X_train, y_train) prd = clf_c.predict(X_test) metrics.accuracy_score(y_test, prd) # 0.9827 roc_auc_score(y_test, prd) # 0.6793 """ ------------------------------------------------------------------ Hyper parameter tuning - gridSearch (on smaller subsets of data - memory & time) May have to check params individually on best model fit ------------------------------------------------------------------ """ param_grid = { 'class_weight': ['balanced', 'balanced_subsample', None], 'max_depth': [2, 4, 6, 10, None], 'max_features': ['auto', 'sqrt', 'log2'],