Example #1
0
def test_categoricalnb():
    """Smoke tests for CategoricalNB: training-set recovery, category
    counting, input validation, alpha smoothing and sample_weight."""
    # The classifier must reproduce the labels of the set it was fit on.
    clf = CategoricalNB()
    assert_array_equal(clf.fit(X2, y2).predict(X2), y2)

    X3 = np.array([[1, 4], [2, 5]])
    y3 = np.array([1, 2])
    clf = CategoricalNB(alpha=1, fit_prior=False)
    clf.fit(X3, y3)
    # Categories are inferred as 0..max(value) per feature: 3 and 6 here.
    assert_array_equal(clf.n_categories_, np.array([3, 6]))

    # Negative category codes are rejected at both predict and fit time.
    X = np.array([[0, -1]])
    y = np.array([1])
    error_msg = "Negative values in data passed to CategoricalNB (input X)"
    assert_raise_message(ValueError, error_msg, clf.predict, X)
    assert_raise_message(ValueError, error_msg, clf.fit, X, y)

    # A feature-count mismatch with the fitted model is also an error.
    X = np.array([[1, 4, 1], [2, 5, 6]])
    msg = "Expected input with 2 features, got 3 instead"
    assert_raise_message(ValueError, msg, clf.predict, X)

    # With alpha=1 every category count is incremented by one, so the
    # posterior for X3_test is 1/3 vs 2/3 rather than 50/50.
    X3_test = np.array([[2, 5]])
    bayes_numerator = np.array([[1 / 3 * 1 / 3, 2 / 3 * 2 / 3]])
    bayes_denominator = bayes_numerator.sum()
    assert_array_almost_equal(clf.predict_proba(X3_test),
                              bayes_numerator / bayes_denominator)

    # category_count_ holds one entry per feature.
    assert len(clf.category_count_) == X3.shape[1]

    # Unweighted fit: pattern [0, 0] is assigned class 1.
    X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    y = np.array([1, 1, 2, 2])
    clf = CategoricalNB(alpha=1, fit_prior=False)
    clf.fit(X, y)
    assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([1]))
    assert_array_equal(clf.n_categories_, np.array([2, 2]))

    # sample_weight flips the decision for [0, 0] to class 2, and the
    # outcome is invariant to a global rescaling of the weights.
    for factor in [1., 0.3, 5, 0.0001]:
        X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
        y = np.array([1, 1, 2, 2])
        sample_weight = np.array([1, 1, 10, 0.1]) * factor
        clf = CategoricalNB(alpha=1, fit_prior=False)
        clf.fit(X, y, sample_weight=sample_weight)
        assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([2]))
        assert_array_equal(clf.n_categories_, np.array([2, 2]))
Example #2
0
def categoricalNaiveBayes(dtrain, dtest):
    """Fit a CategoricalNB classifier on dtrain and predict labels for dtest.

    dtrain: 2-D array whose last column is the class label.
    dtest: 2-D array of feature rows (no label column).
    Returns the array of predicted labels for dtest.
    """
    y_train = dtrain[:, -1]
    x_train = dtrain[:, :-1]
    # Fit the scaler on the TRAINING features only and reuse it for the
    # test data.  The original fitted a second MinMaxScaler on dtest,
    # which maps train and test values onto different scales and makes
    # the encoded categories inconsistent between fit and predict.
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    dtest = scaler.transform(dtest)
    gnb = CategoricalNB()
    gnb.fit(x_train, y_train)
    print("GNB Features")
    print("GNB cat count Features")
    print(gnb.category_count_)
    print("GNB class count Features")
    print(gnb.class_count_)
    print("GNB feature log prob Features")
    print(gnb.feature_log_prob_)
    print("GNB n Features")
    # NOTE(review): `n_features_` is deprecated/removed in newer
    # scikit-learn; `n_features_in_` is the replacement — confirm the
    # installed version before changing it.
    print(gnb.n_features_)
    print("Length test")
    print(len(dtest[0]))
    predictions = gnb.predict(dtest)
    return predictions
def run_kfold(fields, labels):
    """5-fold cross-validate a CategoricalNB on (fields, labels).

    fields: DataFrame indexed positionally; labels: sequence aligned
    with it.  Prints the accuracy of each fold and returns the
    (train_index, test_index) pair of the best-scoring fold
    (a pair of empty lists if every fold failed to predict).
    """
    kf = KFold(n_splits=5)
    best = [], []
    best_accuracy = 0

    # train_index and test_index index into fields and labels
    for train_index, test_index in kf.split(fields):
        train_fields = fields.iloc[train_index].reset_index(drop=True)
        train_labels = [labels[i] for i in train_index]
        test_fields = fields.iloc[test_index].reset_index(drop=True)
        test_labels = [labels[i] for i in test_index]

        clf = CategoricalNB()
        clf.fit(train_fields, train_labels)

        # CategoricalNB raises IndexError when the test fold contains a
        # category value never seen during fit; skip such folds.
        try:
            res = clf.predict(test_fields).tolist()
        except IndexError:
            continue

        # The original computed this twice (an append loop followed by an
        # identical comprehension); once is enough.
        accuracy = [1 if res[i] == test_labels[i] else 0 for i in range(len(res))]
        acc = sum(accuracy) / len(accuracy)

        if acc > best_accuracy:
            best = train_index, test_index
            best_accuracy = acc

        print("accuracy rate: ", acc)
    return best
def run_model(training, testing, fields, labels):
    """Train CategoricalNB on the `training` indices of (fields, labels)
    and evaluate it on the `testing` indices.

    Prints and returns the false-positive rate, false-negative rate and
    accuracy along with the raw predictions.
    """
    train_fields = fields.iloc[training].reset_index(drop=True)
    train_labels = [labels[i] for i in training]
    test_fields = fields.iloc[testing].reset_index(drop=True)
    test_labels = [labels[i] for i in testing]

    clf = CategoricalNB()
    clf.fit(train_fields, train_labels)

    res = clf.predict(test_fields).tolist()

    # Encode each prediction: 1 = false positive, -1 = false negative,
    # 0 = correct.
    accuracy = []
    for predicted, actual in zip(res, test_labels):
        if predicted == 1 and actual == 0:
            accuracy.append(1)
        elif predicted == 0 and actual == 1:
            accuracy.append(-1)
        else:
            accuracy.append(0)

    n = len(accuracy)
    fp = accuracy.count(1) / n
    fn = accuracy.count(-1) / n
    acc = accuracy.count(0) / n
    print("false positive rate: %4f" % fp)
    print("false negative rate: %4f" % fn)
    print("accuracy: %4f" % acc)
    return res, acc, fp, fn
Example #5
0
def perform_bayes(df):
    """Label-encode every column of df, train a CategoricalNB on the
    encoded data (last column is the target), print its accuracy,
    classify one hard-coded example row, and plot a ROC curve plus a
    normalized confusion matrix.
    """
    # One fitted LabelEncoder per column, built by a project helper.
    les = build_labelencoders(df)
    res = []
    for i in range(len(df.columns)):
        col = df.iloc[:, i].values
        res.append(les[i].transform(col))
    # Columns were collected as rows; transpose back to the original layout.
    res = pd.DataFrame(res).transpose()
    x = res.iloc[:, :-1]
    y = res.iloc[:, -1:]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)

    model = CategoricalNB()
    model.fit(x_train, y_train.values.ravel())

    y_pred = model.predict(x_test)
    # Probability of the positive (index-1) class, for the ROC curve.
    y_pred_probability = model.predict_proba(x_test)[::, 1]

    accuracy = accuracy_score(y_test, y_pred) * 100
    print(accuracy)

    # example patient
    test = ['50-59','ge40','50-54','24-26','no','1','right','left_up','yes']
    print(test)

    # transform using labelencoders
    for i in range(len(test)):
        e = test[i]
        test[i] = les[i].transform(np.array(e).reshape(1, ))
    test = np.array(test)

    # do prediction
    y = model.predict(test.reshape(1, -1))

    # translate back
    y = les[-1].inverse_transform(y)[0]
    print(y)

    # ROC curve and AUC from the positive-class probabilities.
    a, b, _ = roc_curve(y_test, y_pred_probability)
    area_under_curve = roc_auc_score(y_test, y_pred_probability)
    plt.plot(a, b, label="area under curve="+str(area_under_curve))
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    # NOTE(review): bare attribute access — this line does nothing;
    # probably meant plt.axis(...) with arguments.
    plt.axis
    plt.legend(loc=4)
    plot_confusion_matrix(model, x_train, y_train.values.ravel(), normalize='true', display_labels=les[-1].inverse_transform([0, 1]))
    plt.show()
Example #6
0
def test(X_train, Y_train, X_val, Y_val, categorical=False):
    """Fit a naive Bayes model on the training data and return the F1
    score of its predictions on the validation data.

    categorical selects CategoricalNB; otherwise GaussianNB is used.
    """
    from sklearn.naive_bayes import GaussianNB, CategoricalNB
    model = CategoricalNB() if categorical else GaussianNB()
    model.fit(X_train, Y_train)
    return f1_score(Y_val, model.predict(X_val))
def check_sklearn_dev():
    """
    Sanity-check that a CategoricalNB-capable scikit-learn (0.23-dev+)
    is installed by fitting random categorical data and predicting a
    training row back.
    """
    rng = np.random.RandomState(1)
    features = rng.randint(5, size=(6, 100))
    labels = np.array([1, 2, 3, 4, 5, 6])

    model = CategoricalNB()
    model.fit(features, labels)
    # Row 2 carries label 3; a working install predicts it back exactly.
    assert [3] == model.predict(features[2:3])
Example #8
0
def test_categoricalnb_with_min_categories(
    min_categories, exp_X1_count, exp_X2_count, new_X, exp_n_categories_
):
    """Check CategoricalNB's `min_categories` handling: per-feature
    category counts, prediction on new data, and n_categories_.

    Parametrized externally (pytest): each argument holds the expected
    values for one `min_categories` setting.
    """
    X_n_categories = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    y_n_categories = np.array([1, 1, 2, 2])
    expected_prediction = np.array([1])

    clf = CategoricalNB(alpha=1, fit_prior=False, min_categories=min_categories)
    clf.fit(X_n_categories, y_n_categories)
    # category_count_ holds one per-feature array of per-class counts.
    X1_count, X2_count = clf.category_count_
    assert_array_equal(X1_count, exp_X1_count)
    assert_array_equal(X2_count, exp_X2_count)
    predictions = clf.predict(new_X)
    assert_array_equal(predictions, expected_prediction)
    assert_array_equal(clf.n_categories_, exp_n_categories_)
Example #9
0
def cnb(train_x, train_y, test_x, test_y):
    """Fit a CategoricalNB classifier, print its RMSE and macro-F1 on the
    test split, and save a normalized confusion-matrix heatmap to
    ./output/CompNB.png."""
    catnb = CategoricalNB()
    catnb.fit(train_x, train_y)
    y_predictions = catnb.predict(test_x)
    # The original message said "Complement Naive Bayes" although the
    # fitted model is CategoricalNB; the label is corrected here.
    print("RMSE for Categorical Naive Bayes model = ",
          mean_squared_error(test_y, y_predictions))
    my_f1 = f1_score(test_y, y_predictions, average='macro')
    print("f1_macro for Categorical Naive Bayes Classifier = ", my_f1)
    cm = confusion_matrix(test_y, y_predictions, normalize='true')
    sns.heatmap(cm, annot=True)
    plt.title('Confusion matrix of the Categorical Naive Bayes classifier')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig('./output/CompNB.png')
    plt.show()
Example #10
0
def bayes(test_set, training_set, categories):
    """Train a CategoricalNB on training_set and print its score and a
    classification report on test_set.

    build_xy (project helper) converts a record set plus the category
    spec into a feature matrix and label vector.
    """
    classifier = CategoricalNB()

    x, y = build_xy(training_set, categories)
    classifier.fit(x, y)

    # The original initialized false/true positive/negative counters here
    # but never updated or read them; the dead variables are removed.
    x, y = build_xy(test_set, categories)
    y_predicted = classifier.predict(x)
    print(f'score: {classifier.score(x, y)}')
    print('bayes confusion matrix')
    print(classification_report(y, y_predicted))
class CategoricalBatchNB(TransformerMixin):
    """Batch-wise wrapper around CategoricalNB.

    Fits and predicts in fixed-size chunks via partial_fit, converting
    each sparse slice to a dense array only while it is processed, so a
    large sparse matrix never has to be densified as a whole.
    """

    def __init__(self, batch_size, classes, *args, **kwargs):
        self._batch_size = batch_size
        self._classes = classes
        self._args = args
        self._kwargs = kwargs
        self._model = CategoricalNB(*args, **kwargs)

    def fit(self, x, y, **fit_params):
        # Rebuild the model so repeated fit() calls start from scratch
        # instead of accumulating counts from a previous fit.
        self._model = CategoricalNB(*self._args, **self._kwargs)
        step = self._batch_size
        for start in tqdm(range(0, x.shape[0], step)):
            stop = start + step
            self._model.partial_fit(
                x[start:stop, :].toarray(),
                y[start:stop],
                classes=self._classes,
            )
        return self

    @staticmethod
    def transform(x, y=None, **fit_params):
        # Identity transform: this class only adds batched fit/predict.
        return x

    def predict(self, x):
        step = self._batch_size
        predictions = []
        for start in tqdm(range(0, x.shape[0], step)):
            stop = start + step
            batch_pred = self._model.predict(x[start:stop, :].toarray())
            predictions.extend(batch_pred.tolist())
        return np.array(predictions).ravel()

    def score(self, x, y):
        # Plain accuracy of the batched predictions.
        return accuracy_score(y, self.predict(x))

    def __str__(self):
        return "CategoricalBatchNB()"

    def __repr__(self):
        return self.__str__()
def pengujian():
    """Flask view: train a CategoricalNB on preprocessed shows from
    2005-2019, test on shows from 2020, and render the confusion matrix
    and accuracy percentage."""
    if "admin" not in session:
        return redirect(url_for("index"))
    mydb.connect()
    cursor = mydb.cursor()
    cursor.execute(
        "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019)'"
    )
    training = cursor.fetchall()
    X = [[x[0], x[1], x[2], x[3], x[4]] for x in training]
    y = [x[5] for x in training]
    clf = CategoricalNB()
    clf.fit(X, y)
    cursor.execute(
        "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2020)'"
    )
    testing = cursor.fetchall()
    X_test = [[x[0], x[1], x[2], x[3], x[4]] for x in testing]
    y_test = [x[5] for x in testing]
    predicted = clf.predict(X_test)
    payload = []
    for index, x in enumerate(X_test):
        # Copy the row so X_test is not mutated, and attach the TEST
        # label: the original appended y[index] (a TRAINING label), which
        # paired each test row with an unrelated status.
        arr = x + [y_test[index]]
        payload.append({
            "no": index + 1,
            "stasiuntv": arr[0],
            "genre": arr[1],
            "writer": arr[2],
            "director": arr[3],
            "actor": arr[4],
            "status": arr[5],
        })
    # NOTE(review): payload is built but never passed to the template —
    # confirm whether pengujian.html needs it.
    hasil = confusion_matrix(y_test, predicted)
    # Accuracy = trace / total for the 2x2 confusion matrix.
    akurasi = (hasil[0][0] + hasil[1][1]) / (hasil[0][0] + hasil[0][1] +
                                             hasil[1][0] + hasil[1][1])
    # Release DB resources (the original left both open).
    cursor.close()
    mydb.close()

    return render_template("pengujian.html",
                           hasil=hasil,
                           akurasi=round(akurasi * 100))
Example #13
0
def test_predict_meta_override():
    """predict() on a dask collection must fail without predict_meta here
    (value-dependent model) and succeed when a meta override is given."""
    features = pd.DataFrame({"c_0": [1, 2, 3, 4]})
    labels = np.array([1, 2, 3, 4])

    base = CategoricalNB()
    base.fit(pd.DataFrame(features), labels)

    dd_X = dd.from_pandas(features, npartitions=2)
    dd_X._meta = pd.DataFrame({"c_0": [5]})

    # Without predict_meta, meta inference runs the model on the meta
    # row, whose value lies outside the fitted categories -> ValueError.
    wrap = ParallelPostFit(base)
    with pytest.raises(ValueError):
        wrap.predict(dd_X)

    # Supplying predict_meta bypasses the inference and succeeds.
    wrap = ParallelPostFit(base, predict_meta=np.array([1]))
    result = wrap.predict(dd_X)
    assert_eq_ar(result, base.predict(features))
def classificationCategoricalNaiveBayes():
    """Train a CategoricalNB on data.csv, persist it to model.pkl and
    print prediction timing, accuracy/precision/recall and the confusion
    matrix for a held-out 20% test split."""
    col_names = [
        '*', 'web1', 'web2', 'cosine', 'len', 'word', 'sameDomain', 'label'
    ]
    # Load the dataset with explicit column names.
    pima = pd.read_csv("data.csv", names=col_names)

    # Feature matrix and target vector.
    feature_cols = ['cosine', 'len', 'word', 'sameDomain']
    X = pima[feature_cols]
    y = pima.label

    # 80% training / 20% test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=4)

    clf = CategoricalNB()
    clf.fit(X_train, y_train)

    # Persist the fitted model for later reuse.
    dump(clf, open('model.pkl', 'wb'))

    # Time only the prediction step.
    startTime = datetime.now()
    y_pred = clf.predict(X_test)
    endTime = datetime.now()

    print("exec time :", endTime - startTime)

    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

    print("precision:", metrics.average_precision_score(y_test, y_pred))

    print("recall:", metrics.recall_score(y_test, y_pred))

    print()

    print(confusion_matrix(y_test, y_pred))
Example #15
0
def naive_bayes_adapter(**kwargs):
    """Train a Laplace-smoothed CategoricalNB on kwargs['train'] and
    evaluate it on kwargs['test'].

    Both inputs are DataFrames whose 'class' column holds 'yes'/'no'.
    Returns the dict built by create_return_dict (confusion matrix plus
    score for the predictions).
    """
    train = kwargs['train']
    test = kwargs['test']
    # Merge both sets so the encoder sees every category that can occur.
    merged_data = pd.concat([train, test])
    # Add rows for categories unseen in train (otherwise CategoricalNB
    # would fail on unknown test categories).
    train = improove_train(train, merged_data)
    # Build the ordinal encoder over the merged feature columns.
    # The positional `axis` argument to drop() is deprecated and removed
    # in modern pandas; the keyword form is equivalent and future-proof.
    merged_data_without_class = merged_data.drop(columns='class')
    encoder = OrdinalEncoder()
    encoder.fit(merged_data_without_class)
    # Separate the classification column from the feature columns.
    train_without_class = train.drop(columns='class')
    test_without_class = test.drop(columns='class')
    train_classifications = train['class']
    test_classifications = test['class']
    # Encode features ordinally and map the labels to 1/0.
    encoded_train_without_class = encoder.transform(train_without_class)
    encoded_test_without_class = encoder.transform(test_without_class)
    encoded_train_classifications = train_classifications.map({
        'yes': 1,
        'no': 0
    })
    encoded_test_classifications = test_classifications.map({
        'yes': 1,
        'no': 0
    })
    # alpha=1 gives Laplace smoothing.
    clf = CategoricalNB(alpha=1)
    clf.fit(encoded_train_without_class, encoded_train_classifications)
    predictions = clf.predict(encoded_test_without_class)
    return create_return_dict(predictions, encoded_test_classifications)
Example #16
0
    y_test = np.array(y_test)
    label_encoder = LabelEncoder()
    for i in disc_columns:
        x_train[i] = label_encoder.fit_transform(x_train[i])
        x_test[i] = label_encoder.fit_transform(x_test[i])
    n_b = MixedNB(categorical_features=disc_columns)

'''

# "uczymy" sie na zbiorze treningowym
start_time = time.time()
print("Learning and predicting with naive_bayes ...", end=" ")
n_b.fit(x_train, y_train)

# przewidujemy na testowym
y_pred = n_b.predict(x_test)
print("  took %s seconds " % round((time.time() - start_time), 5))

# na testowym znalismy prawdziwe klasy, mozemy porownac jak "dobrze" poszlo

metric_accuracy = metrics.accuracy_score(y_test, y_pred)
print("naive_bayes: accuracy = ", metric_accuracy)

print("full classification report:")
if type(classes_names) is not list:
    target_nms = classes_names.astype(str)
else:
    target_nms = classes_names

print(classification_report(y_test, y_pred, target_names=target_nms))
Example #17
0
# Build the model
from sklearn.naive_bayes import CategoricalNB

# Create a model instance
model = CategoricalNB()

# Train the model (X and Y are defined earlier in the file)
model.fit(X, Y)

# Class probabilities for the training data
y_prdict_prob = model.predict_proba(X)

print(y_prdict_prob)

# Output the predicted y
y_predict = model.predict(X)
print(y_predict)

# Compute the model's accuracy on the training data
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(Y, y_predict)
print(accuracy)


# Prediction for one test sample
X_test = np.array([[0,0,0,1,1,0]])
# Class-probability prediction for the sample
y_test_proba = model.predict_proba(X_test)
print(y_test_proba)
# Predicted class for the sample
y_test = model.predict(X_test)
Example #18
0
# Binary class labels used in the dataset bundle below.
target_names = ["0", "1"]

# sklearn-style dataset dict; dataArray, target and columnsIncluded are
# built earlier in the file — TODO confirm their construction matches.
dataset = {
"data": dataArray,
"target": target,
"feature_names": columnsIncluded,
"target_names": target_names
}

# predict and output the test result
# Integer-encode the categorical test columns; the codes must match the
# encoding applied to the training data.
df_test = pd.DataFrame(data_test)
df_test.loc[df_test["Geography"]=="France", "Geography"] = 0
df_test.loc[df_test["Geography"]=="Spain", "Geography"] = 1
df_test.loc[df_test["Geography"]=="Germany", "Geography"] = 2
df_test.loc[df_test["Gender"]=="Male", "Gender"] = 0
df_test.loc[df_test["Gender"]=="Female", "Gender"] = 1

##########################################################
# train the model (alpha=1 gives Laplace smoothing)
clf = CategoricalNB(alpha = 1)
clf.fit(dataset['data'],dataset['target'])
predictedTestResult = clf.predict(df_test[columnsIncluded].values)

# Write RowNumber plus the predicted Exited column to a submission CSV.
df_testOutput = df_test[["RowNumber"]]
df_testOutput.insert(1, "Exited", predictedTestResult, True)
df_testOutput.to_csv("submission_2_Bayes.csv", index=False)

# compute f1 score (evaluateTask2 is a project module — presumably
# compares against held-out ground truth; verify)
f1_score_result = evaluateTask2.f1_score(predictedTestResult)
print("f1-score: " + str(f1_score_result))
Example #19
0
    encoder.fit([row[:-1] for row in dataset])

    print('Encoding')

    X = [row[:-1] for row in trainingSet]
    X = encoder.transform(X)
    Y = [row[-1] for row in trainingSet]

    classifier.fit(X, Y)

    test_set_x = encoder.transform([row[:-1] for row in testingSet])

    test_set_y = [row[-1] for row in testingSet]

    print('Predicting')
    predictions = classifier.predict(test_set_x)

    right = 0

    for y, prediction in zip(test_set_y, predictions):
        if y == prediction:
            right += 1

    accuracy = right / len(testingSet)
    print(accuracy)
    print(f1_score(test_set_y, predictions))

    randomForestClassification(X, Y, test_set_x, test_set_y)

    test_primeroci = pd.read_csv("Sample.csv").values.tolist()
    test_primeroci = getSampleDataset(test_primeroci)
Example #20
0
# Linear regression used as a classifier by rounding its real-valued
# predictions to the nearest integer label.
model_linreg = LinearRegression()
print_dict(model_linreg.get_params(), 'LinearRegressor params:')
model_linreg.fit(X_train, y_train)
y_predict_linreg = model_linreg.predict(X_test)
y_predict_linreg = np.round(y_predict_linreg).astype(
    int)  # Regressor -> classifier!
error_rate_linreg = test(y_predict_linreg, y_test)
print(f'Linear Regressor score: {model_linreg.score(X_test, y_test):.3g}')

# %% [markdown]
# # Naive Bayesian:
# %%
# Categorical naive Bayes on the same train/test split.
model_bayes = CategoricalNB()
print_dict(model_bayes.get_params(), 'CategoricalNB params:')
model_bayes.fit(X_train, y_train)
y_predict_bayes = model_bayes.predict(X_test)
error_rate_bayes = test(y_predict_bayes, y_test)
print(f'Naive Bayesian score: {model_bayes.score(X_test, y_test):.3g}')

# %% [markdown]
# # NearestNeighbors:
# %%
# k-nearest neighbours with default parameters.
model_nn = KNeighborsClassifier()
print_dict(model_nn.get_params(), 'KNeighborsClassifier params:')
model_nn.fit(X_train, y_train)
y_predict_nn = model_nn.predict(X_test)
error_rate_nn = test(y_predict_nn, y_test)
print(f'Nearest Neighbors score: {model_nn.score(X_test, y_test):.3g}')

# %% [markdown]
# # DecisionTree:
Example #21
0
# Read pixel values into X, read class values into y
df_X = pandas.read_csv("../../data/x_train_gr_smpl.csv")
df_y = pandas.read_csv("../../data/y_train_smpl.csv")

# Shuffle the order of the data (keeping the X and y rows in sync)
df_X, df_y = shuffle(df_X, df_y)

# Split dataset into training and testing set, 90% and 10%, respectively
X_train, X_test, y_train, y_test = train_test_split(df_X,
                                                    df_y,
                                                    test_size=0.1,
                                                    random_state=0)

naive_bayes = CategoricalNB()
classifier = naive_bayes.fit(X_train, y_train)
y_predicted = naive_bayes.predict(X_test)
print("\nNaive Bayes accuracy score: ",
      round(metrics.accuracy_score(y_test, y_predicted) * 100, 2), "%\n")

# Plot non-normalized confusion matrix
labels = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]

np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]

for title, normalize in titles_options:
    disp = plot_confusion_matrix(classifier,
                                 X_test,
Example #22
0
def _loo_report(model, x, y, title):
    """Leave-one-out cross-validate `model` on (x, y), printing `title`
    and then a classification report over all successful predictions.

    Samples whose prediction raises (e.g. a category unseen during fit
    for CategoricalNB) are skipped, mirroring the original best-effort
    behavior but without a bare `except:`.
    """
    print(title)
    total_pred = list()
    total_res = list()
    skipped = 0
    for train_index, test_index in LeaveOneOut().split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(x_train, y_train)
        try:
            results = model.predict(x_test)
            total_pred.append(results)
            total_res.append(y_test)
        except Exception:
            skipped += 1
    # NOTE: classification_report takes y_true first; the original passed
    # (predictions, truth), which mislabels precision/recall per class.
    print(classification_report(total_res, total_pred, digits=3))


def class_metric_full_process(data, target):
    """Run leave-one-out evaluation of four classifiers (MLP, KNN,
    CategoricalNB, decision tree) on (data, target), printing a
    classification report for each.

    The original repeated the identical LOO loop four times; the shared
    logic now lives in _loo_report.
    """
    x = data
    y = target

    _loo_report(MLPClassifier(hidden_layer_sizes=(10, )), x, y, "MLP")
    _loo_report(KNeighborsClassifier(), x, y, "KNN")
    _loo_report(CategoricalNB(), x, y, "Naive Bayes")
    _loo_report(DecisionTreeClassifier(), x, y, "Decision Tree")
Example #23
0
    # Saves off encoded labels separately
    train_labels = train_data['Label']
    test_labels = test_data['Label']

    # Dropping unneccessary features (and labels)
    train_data = train_data.drop("Label", axis=1)
    test_data = test_data.drop("Label", axis=1)

    columns = list(train_data.to_dict().keys())

    # Create NB model
    clf = CategoricalNB()
    clf.fit(train_data[columns], train_labels)

    # Get TP, FP, TN, and FN rates
    nb_predictions = clf.predict(test_data[columns])
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for i in range(len(nb_predictions)):
        # True positive
        if nb_predictions[i] == 'Win' and test_labels[i] == 'Win':
            tp += 1
        # False positive
        elif nb_predictions[i] == 'Win' and test_labels[i] == 'Lose':
            fp += 1
        #False negative
        elif nb_predictions[i] == 'Lose' and test_labels[i] == 'Win':
            fn += 1
        # True negative
              (tbl.loc['setting'] == top_model[1]).values]
best_params = tmp.loc['params'].iloc[0]

#%%
# ## Test model
nb = CategoricalNB()
nb.set_params(**best_params)

#Prep data
x, y = prep_nb(x_train, y_train)

#Fit model
nb.fit(x, y)

#Compute performance on training set
pred = nb.predict(x)
score_training = [m(y, pred) for m in metrics]

#Predict on test set
x, y = prep_nb(x_test, y_test)
pred = nb.predict(x)

#Compute scores
score = [m(y, pred) for m in metrics]

# We can see that training scores and test scores are equivalent, i.e. we are confident to not have overfitted.

#%%
plot_confusion_matrix(nb, x, y, cmap=plt.cm.Blues, normalize='true')
#fig =plot_roc_curve(nb, x,y, response_method='predict_proba')
Example #25
0
    X = df.drop(df.columns[-1], axis=1)
    y = df[df.columns[-1]]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)
    clf = CategoricalNB(min_categories=np.array(list(
        min_categories.values())).astype(int)[:-1])
    clf.fit(X_train, y_train)

    if args.v:
        print("----------------------")
        print("Initial accuracy:")
        print("Train accuracy: ", accuracy_score(clf.predict(X_train),
                                                 y_train))
        print("Test accuracy: ", accuracy_score(clf.predict(X_test), y_test))
        print("----------------------")

        if args.p is not None:
            print("----------------------")
            print("Rounded accuracy (precision=" + str(args.p) + "):")
            print("Train accuracy: ",
                  accuracy_score(predict_proba(X_train, clf, args.p), y_train))
            print("Test accuracy: ",
                  accuracy_score(predict_proba(X_test, clf, args.p), y_test))
            print("----------------------")

    if args.ox:
        if not os.path.exists(os.path.dirname(args.ox)):
Example #26
0
# Accuracy of the amount-feature model (`pred` comes from the preceding
# section of the file, outside this excerpt).
accuracy = np.mean(
    pred == dev_labels['bank_transaction_category'].values.astype('U'))
print(accuracy, 'amounts')

# doesnt take into account correlations between features

# model for transaction type
# Ordinal-encode the single transaction-type column.
# NOTE(review): fit_transform is re-run on the dev split, so dev codes
# can disagree with the training encoding — should probably be
# encoder.transform(...) for dev; confirm.
encoder = OrdinalEncoder()
train_cat = encoder.fit_transform(
    (train_X['bank_transaction_type'].values.astype('U')).reshape(-1, 1))
dev_cat = encoder.fit_transform(
    (dev_X['bank_transaction_type'].values.astype('U')).reshape(-1, 1))
clf_type = CategoricalNB()
clf_type.fit(train_cat,
             train_labels['bank_transaction_category'].values.astype('U'))
predicted = clf_type.predict(dev_cat)
accuracy = np.mean(
    predicted == dev_labels['bank_transaction_category'].values.astype('U'))
print(accuracy, 'transaction type')

# combine features
# weighted probabilites: blend the three per-feature classifiers' class
# probabilities with hand-chosen weights.
total_probs = 0.91 * clf_desc.predict_proba(
    X_dev_tfidf) + 0.6 * clf_amount.predict_proba(dev_amount.reshape(
        -1, 1)) + 0.6 * clf_type.predict_proba(dev_cat)
index = clf_desc.classes_

# Pick the class with the highest combined probability for each row.
predicted = []
for probs in total_probs:
    max_index = np.nanargmax(probs)
    predicted.append(index[max_index])
def public_classification():
    """Flask view: on POST, classify a user-submitted show (station,
    genre, writer, director, lead actor) with a CategoricalNB trained on
    the `preprocessing` table and render the predicted status; on GET,
    render the form with the distinct values of each field.
    """
    if request.method == "POST":

        # Raw string values submitted through the form.
        stasiuntv_ = request.form["stasiuntv"]
        genre_ = request.form["genre"]
        penulis_ = request.form["penulis"]
        direktur_ = request.form["direktur"]
        tokohutama_ = request.form["tokohutama"]

        mydb.connect()
        cursor = mydb.cursor()
        cursor.execute("SELECT * FROM dataset")
        data = cursor.fetchall()

        # Re-fit one LabelEncoder per column on the raw `dataset` table
        # so the form values can be mapped to integer codes.
        # NOTE(review): this assumes the `preprocessing` table was
        # encoded with exactly these encoders (same data, same fit
        # order) — confirm, otherwise codes won't line up.
        labelEncoderStasiunTV = LabelEncoder()
        stasiuntv = [x[1] for x in data]
        stasiuntv = labelEncoderStasiunTV.fit_transform(stasiuntv)

        labelEncoderGenre = LabelEncoder()
        genre = [x[2] for x in data]
        genre = labelEncoderGenre.fit_transform(genre)

        labelEncoderWriter = LabelEncoder()
        writer = [x[3] for x in data]
        writer = labelEncoderWriter.fit_transform(writer)

        labelEncoderDirector = LabelEncoder()
        director = [x[4] for x in data]
        director = labelEncoderDirector.fit_transform(director)

        labelEncoderActor = LabelEncoder()
        actor = [x[5] for x in data]
        actor = labelEncoderActor.fit_transform(actor)

        labelEncoderStatus = LabelEncoder()
        status = [x[10] for x in data]
        status = labelEncoderStatus.fit_transform(status)

        # Encode the submitted form values with the fitted encoders.
        s = labelEncoderStasiunTV.transform([stasiuntv_])[0]
        g = labelEncoderGenre.transform([genre_])[0]
        p = labelEncoderWriter.transform([penulis_])[0]
        d = labelEncoderDirector.transform([direktur_])[0]
        t = labelEncoderActor.transform([tokohutama_])[0]

        # Train on all preprocessed rows from 2005-2020.
        cursor.execute(
            "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019|2020)'"
        )
        training = cursor.fetchall()
        X = [[x[0], x[1], x[2], x[3], x[4]] for x in training]
        y = [x[5] for x in training]
        clf = CategoricalNB()
        clf.fit(X, y)

        # Predict the encoded sample and decode the status label back.
        hasil = labelEncoderStatus.inverse_transform(
            [clf.predict([[s, g, p, d, t]])[0]])[0]
        mydb.close()

        return render_template("public_classification.html", hasil=hasil)
    # GET: collect the distinct option values for the form drop-downs.
    mydb.connect()
    cursor = mydb.cursor()

    cursor.execute("SELECT DISTINCT(stasiuntv) FROM dataset")
    stasiuntv = [x[0] for x in cursor.fetchall()]

    cursor.execute("SELECT DISTINCT(genre) FROM dataset")
    genre = [x[0] for x in cursor.fetchall()]

    cursor.execute("SELECT DISTINCT(penulis) FROM dataset")
    penulis = [x[0] for x in cursor.fetchall()]

    cursor.execute("SELECT DISTINCT(direktur) FROM dataset")
    direktur = [x[0] for x in cursor.fetchall()]

    cursor.execute("SELECT DISTINCT(tokohutama) FROM dataset")
    tokohutama = [x[0] for x in cursor.fetchall()]

    cursor.close()
    mydb.close()
    # NOTE(review): the POST branch renders "public_classification.html"
    # while this branch renders "public_Classification.html" — confirm
    # which filename is correct on a case-sensitive filesystem.
    return render_template("public_Classification.html",
                           genre=genre,
                           stasiuntv=stasiuntv,
                           penulis=penulis,
                           direktur=direktur,
                           tokohutama=tokohutama)
Example #28
0
# Accuracy of the final logistic-regression model on its training data.
#scoring with train data
print('train score:', LR_final.score(X_train_new, y_train))

# scoring with test data
print('test score:', LR_final.score(X_test_new, y_test))

# NOTE(review): the probabilities are computed but not stored or
# printed — confirm whether this call is intentional.
LR_final.predict_proba(X_test_new)
"""# Naive Bayes"""

#use the same train test set as logistic regression
prediction = dict()
NB = CategoricalNB()
NB.fit(X_train_new, y_train)

prediction['Naive Bayes'] = NB.predict(X_test_new)

#accuracy, precision, recall, confusion matrix
print("Acurracy:")
print(accuracy_score(y_test, prediction['Naive Bayes']))
print("\n")
print("Classfication report:")
print(classification_report(y_test, prediction['Naive Bayes']))
print("\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, prediction['Naive Bayes']))

#scoring with train data
print('train score:', NB.score(X_train_new, y_train))

# scoring with test data
Example #29
0
# Columns dropped before modelling.
drop_cols = ['id']

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score

# Integer-encode every categorical column in place (the whole dataset —
# labelled and unlabelled rows — is encoded together, so the codes are
# consistent across the later split).
for i in cat_cols:
    le = LabelEncoder()
    data[i] = le.fit_transform(data[i])

data.drop(columns=drop_cols, inplace=True)
# Rows with risk_flag == -1 are the unlabelled test portion.
train_df = data.loc[data['risk_flag'] != -1]
test_df = data.loc[data['risk_flag'] == -1]

# Stratified split of the labelled rows into train/validation.
X_tr, X_tst, y_tr, y_tst = train_test_split(
    train_df.drop(columns=['risk_flag']),
    train_df['risk_flag'],
    stratify=train_df['risk_flag'])

from sklearn.naive_bayes import CategoricalNB
clf = CategoricalNB()
clf.fit(X_tr, y_tr)

# Per-feature per-class counts (displayed in a notebook cell).
clf.feature_count_

k = clf.predict(X_tst)
# AUC of the hard predictions on the validation split.
roc_auc_score(y_tst, clf.predict(X_tst))

# NOTE(review): `X` is not defined in this excerpt — this likely should
# read X_tst (or an `X` from an earlier section); confirm before running.
print(clf.predict(X[2:3]))
Example #30
0
"""
clf_b = BernoulliNB()

clf_b.fit(X_train, y_train)
prd = clf_b.predict(X_test)

metrics.accuracy_score(y_test, prd)
# 0.9818
roc_auc_score(y_test, prd)
# 0.6832

# ---------------------------------
clf_c = CategoricalNB()

clf_c.fit(X_train, y_train)
prd = clf_c.predict(X_test)

metrics.accuracy_score(y_test, prd)
# 0.9827
roc_auc_score(y_test, prd)
# 0.6793
""" 
------------------------------------------------------------------
Hyper parameter tuning - gridSearch (on smaller subsets of data - memory & time)
May have to check params individually on best model fit 
------------------------------------------------------------------
"""
param_grid = {
    'class_weight': ['balanced', 'balanced_subsample', None],
    'max_depth': [2, 4, 6, 10, None],
    'max_features': ['auto', 'sqrt', 'log2'],