def run_kfold(fields, labels):
    kf = KFold(n_splits=5)
    best = ([], [])  # best (train_index, test_index) pair found so far
    best_accuracy = 0

    # train_index and test_index index into fields and labels
    for train_index, test_index in kf.split(fields):
        train_fields = fields.iloc[train_index].reset_index(drop=True)
        train_labels = [labels[i] for i in train_index]
        test_fields = fields.iloc[test_index].reset_index(drop=True)
        test_labels = [labels[i] for i in test_index]

        clf = CategoricalNB()
        clf.fit(train_fields, train_labels)

        try:
            res = clf.predict(test_fields).tolist()
        except IndexError:
            # predict raises IndexError when this test fold contains a
            # category value that never appeared in the training folds
            continue
        
        # 1 if the prediction matches the true label, else 0
        accuracy = [1 if res[i] == test_labels[i] else 0 for i in range(len(res))]
        acc = sum(accuracy)/len(accuracy)

        if acc > best_accuracy:
            best = train_index, test_index
            best_accuracy = acc

        print("accuracy rate: ", acc)
    return best
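The IndexError caught above fires when a CV test fold contains a category value never seen in the training folds. A minimal sketch, on hypothetical toy data, of one way around it using the min_categories parameter (available in scikit-learn >= 0.24):

import numpy as np
from sklearn.naive_bayes import CategoricalNB

X_train = np.array([[0, 0], [1, 1], [0, 1]])
y_train = np.array([0, 1, 0])
X_test = np.array([[2, 1]])  # category 2 never appeared during fit

clf = CategoricalNB(min_categories=3)  # reserve room for 3 categories per feature
clf.fit(X_train, y_train)
print(clf.predict(X_test))  # no IndexError: smoothed counts exist for category 2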
Example #2
def categoricalNaiveBayes(dtrain, dtest):
    y_train = dtrain[:, -1]
    x_train = dtrain[:, :-1]
    # CategoricalNB expects non-negative integer category codes, so encode
    # the features with an OrdinalEncoder fit on the training data only;
    # fitting a second MinMaxScaler on the test set both leaks test
    # statistics and produces floats CategoricalNB cannot index as categories
    encoder = preprocessing.OrdinalEncoder()
    x_train = encoder.fit_transform(x_train)
    dtest = encoder.transform(dtest)
    cnb = CategoricalNB()
    cnb.fit(x_train, y_train)
    print("CategoricalNB category counts")
    print(cnb.category_count_)
    print("CategoricalNB class counts")
    print(cnb.class_count_)
    print("CategoricalNB feature log probabilities")
    print(cnb.feature_log_prob_)
    print("CategoricalNB number of features")
    print(cnb.n_features_in_)  # n_features_ was removed in scikit-learn 1.2
    print("Length of first test row")
    print(len(dtest[0]))
    predictions = cnb.predict(dtest)
    return predictions
def run_model(training, testing, fields, labels):
    train_fields = fields.iloc[training].reset_index(drop=True)
    train_labels = [labels[i] for i in training]
    test_fields = fields.iloc[testing].reset_index(drop=True)
    test_labels = [labels[i] for i in testing]

    clf = CategoricalNB()
    clf.fit(train_fields, train_labels)

    res = clf.predict(test_fields).tolist()

    # 1 = false positive, -1 = false negative, 0 = correct prediction
    accuracy = []
    for i in range(len(res)):
        if res[i] == 1 and test_labels[i] == 0:
            accuracy.append(1)
        elif res[i] == 0 and test_labels[i] == 1:
            accuracy.append(-1)
        else:
            accuracy.append(0)

    fp = accuracy.count(1) / len(accuracy)
    fn = accuracy.count(-1) / len(accuracy)
    acc = accuracy.count(0) / len(accuracy)
    print("false positive rate: %.4f" % fp)
    print("false negative rate: %.4f" % fn)
    print("accuracy: %.4f" % acc)
    return res, acc, fp, fn
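The same three rates can be read off scikit-learn's confusion matrix; a minimal equivalent sketch, assuming the binary 0/1 labels used above:

from sklearn.metrics import confusion_matrix

tn, n_fp, n_fn, tp = confusion_matrix(test_labels, res, labels=[0, 1]).ravel()
total = tn + n_fp + n_fn + tp
print("false positive rate: %.4f" % (n_fp / total))
print("false negative rate: %.4f" % (n_fn / total))
print("accuracy: %.4f" % ((tn + tp) / total))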
Example #4
def test_categoricalnb_min_categories_errors(min_categories, error_msg):

    X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    y = np.array([1, 1, 2, 2])

    clf = CategoricalNB(alpha=1, fit_prior=False, min_categories=min_categories)
    with pytest.raises(ValueError, match=error_msg):
        clf.fit(X, y)
Example #5
    def setUp(self):
        rng = np.random.RandomState(1)

        self.X = rng.randint(5, size=(6, 100))
        y = np.array([1, 2, 3, 4, 5, 6])

        model = CategoricalNB()
        model.fit(self.X, y)
        self.model = model
Example #6
def test(X_train, Y_train, X_val, Y_val, categorical=False):
    from sklearn.naive_bayes import GaussianNB, CategoricalNB
    from sklearn.metrics import f1_score
    if categorical:
        model = CategoricalNB()
    else:
        model = GaussianNB()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_val)
    # accuracy_score(Y_val, Y_pred) could be reported instead of F1
    return f1_score(Y_val, Y_pred)
def check_sklearn_dev():
    """
    This just verifies that sklearn 0.23-dev is installed properly
    by checking CategoricalNB results
    """
    rng = np.random.RandomState(1)
    X = rng.randint(5, size=(6, 100))
    y = np.array([1, 2, 3, 4, 5, 6])

    clf = CategoricalNB()
    clf.fit(X, y)
    assert [3] == clf.predict(X[2:3])
Example #8
    def fit(self, X, Y):
        """
        Fit the classifier to training data X and labels Y.

        Arguments:
            X (np.array): training data matrix of shape (n_samples, n_features)
            Y (np.array): label matrix of shape (n_samples, n_labels)
        """
        n_labels = Y.shape[1]
        for idx in range(n_labels):
            Y_col = Y[:, idx]
            predictor = CategoricalNB()
            predictor.fit(X, Y_col)
            self.predictors.append(predictor)
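The excerpt only shows fit; a minimal sketch of the matching predict, assuming self.predictors was initialized as an empty list and numpy is imported as np:

    def predict(self, X):
        """Stack per-label CategoricalNB predictions into an
        (n_samples, n_labels) matrix (sketch, not from the original source)."""
        columns = [predictor.predict(X) for predictor in self.predictors]
        return np.stack(columns, axis=1)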
Example #9
def cnb(train_x, train_y, test_x, test_y):
    catnb = CategoricalNB()
    catnb.fit(train_x, train_y)
    y_predictions = catnb.predict(test_x)
    print("MSE for Categorical Naive Bayes model = ",
          mean_squared_error(test_y, y_predictions))
    my_f1 = f1_score(test_y, y_predictions, average='macro')
    print("f1_macro for Categorical Naive Bayes Classifier = ", my_f1)
    cm = confusion_matrix(test_y, y_predictions, normalize='true')
    sns.heatmap(cm, annot=True)
    plt.title('Confusion matrix of the Categorical Naive Bayes classifier')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig('./output/CompNB.png')
    plt.show()
Example #10
def test_categoricalnb_with_min_categories(
    min_categories, exp_X1_count, exp_X2_count, new_X, exp_n_categories_
):
    X_n_categories = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    y_n_categories = np.array([1, 1, 2, 2])
    expected_prediction = np.array([1])

    clf = CategoricalNB(alpha=1, fit_prior=False, min_categories=min_categories)
    clf.fit(X_n_categories, y_n_categories)
    X1_count, X2_count = clf.category_count_
    assert_array_equal(X1_count, exp_X1_count)
    assert_array_equal(X2_count, exp_X2_count)
    predictions = clf.predict(new_X)
    assert_array_equal(predictions, expected_prediction)
    assert_array_equal(clf.n_categories_, exp_n_categories_)
Example #11
def bayes(test_set, training_set, categories):
    classifier = CategoricalNB()

    x, y = build_xy(training_set, categories)
    classifier.fit(x, y)

    x, y = build_xy(test_set, categories)
    y_predicted = classifier.predict(x)
    print(f'score: {classifier.score(x, y)}')
    print('bayes classification report')
    print(classification_report(y, y_predicted))
Example #12
def perform_bayes(df):
    les = build_labelencoders(df)
    res = []
    for i in range(len(df.columns)):
        col = df.iloc[:, i].values
        res.append(les[i].transform(col))
    res = pd.DataFrame(res).transpose()
    x = res.iloc[:, :-1]
    y = res.iloc[:, -1:]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)

    model = CategoricalNB()
    model.fit(x_train, y_train.values.ravel())

    y_pred = model.predict(x_test)
    y_pred_probability = model.predict_proba(x_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred) * 100
    print(accuracy)

    # example patient
    test = ['50-59','ge40','50-54','24-26','no','1','right','left_up','yes']
    print(test)

    # transform using the labelencoders
    for i in range(len(test)):
        test[i] = les[i].transform([test[i]])[0]
    test = np.array(test)

    # do prediction
    y = model.predict(test.reshape(1, -1))

    # translate back
    y = les[-1].inverse_transform(y)[0]
    print(y)

    a, b, _ = roc_curve(y_test, y_pred_probability)
    area_under_curve = roc_auc_score(y_test, y_pred_probability)
    plt.plot(a, b, label="area under curve="+str(area_under_curve))
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.legend(loc=4)
    plot_confusion_matrix(model, x_train, y_train.values.ravel(), normalize='true', display_labels=les[-1].inverse_transform([0, 1]))
    plt.show()
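plot_confusion_matrix was removed in scikit-learn 1.2; on current versions, a sketch of the equivalent call is:

from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(
    model, x_train, y_train.values.ravel(), normalize='true',
    display_labels=les[-1].inverse_transform([0, 1]))
plt.show()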
Example #13
def test_categoricalnb():
    # Check the ability to predict the training set.
    clf = CategoricalNB()
    y_pred = clf.fit(X2, y2).predict(X2)
    assert_array_equal(y_pred, y2)

    X3 = np.array([[1, 4], [2, 5]])
    y3 = np.array([1, 2])
    clf = CategoricalNB(alpha=1, fit_prior=False)

    clf.fit(X3, y3)
    assert_array_equal(clf.n_categories_, np.array([3, 6]))

    # Check error is raised for X with negative entries
    X = np.array([[0, -1]])
    y = np.array([1])
    error_msg = re.escape("Negative values in data passed to CategoricalNB (input X)")
    with pytest.raises(ValueError, match=error_msg):
        clf.predict(X)
    with pytest.raises(ValueError, match=error_msg):
        clf.fit(X, y)

    # Test alpha
    X3_test = np.array([[2, 5]])
    # alpha=1 increases the count of all categories by one so the final
    # probability for each category is not 50/50 but 1/3 to 2/3
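    # Worked out: with one training sample per class, alpha=1, and
    # n_categories_ = [3, 6], P(x0=2|y=1) = (0+1)/(1+3) = 1/4 vs
    # P(x0=2|y=2) = (1+1)/(1+3) = 1/2, and P(x1=5|y=1) = (0+1)/(1+6) = 1/7
    # vs P(x1=5|y=2) = (1+1)/(1+6) = 2/7: each feature favors class 2 by
    # the same 1/3-to-2/3 ratio used in the numerator below.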
    bayes_numerator = np.array([[1 / 3 * 1 / 3, 2 / 3 * 2 / 3]])
    bayes_denominator = bayes_numerator.sum()
    assert_array_almost_equal(
        clf.predict_proba(X3_test), bayes_numerator / bayes_denominator
    )

    # Assert category_count has counted all features
    assert len(clf.category_count_) == X3.shape[1]

    # Check sample_weight
    X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    y = np.array([1, 1, 2, 2])
    clf = CategoricalNB(alpha=1, fit_prior=False)
    clf.fit(X, y)
    assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([1]))
    assert_array_equal(clf.n_categories_, np.array([2, 2]))

    for factor in [1.0, 0.3, 5, 0.0001]:
        X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
        y = np.array([1, 1, 2, 2])
        sample_weight = np.array([1, 1, 10, 0.1]) * factor
        clf = CategoricalNB(alpha=1, fit_prior=False)
        clf.fit(X, y, sample_weight=sample_weight)
        assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([2]))
        assert_array_equal(clf.n_categories_, np.array([2, 2]))
def pengujian():
    if "admin" not in session:
        return redirect(url_for("index"))
    mydb.connect()
    cursor = mydb.cursor()
    cursor.execute(
        "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2005|2006|2007|2008|2009|2010|2011|2012|2013|2014|2015|2016|2017|2018|2019)'"
    )
    training = cursor.fetchall()
    X = [[x[0], x[1], x[2], x[3], x[4]] for x in training]
    y = [x[5] for x in training]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
    clf = CategoricalNB()
    clf.fit(X, y)
    cursor.execute(
        "SELECT * FROM preprocessing WHERE preprocessing.tahundibuat REGEXP '(2020)'"
    )
    testing = cursor.fetchall()
    X_test = [[x[0], x[1], x[2], x[3], x[4]] for x in testing]
    y_test = [x[5] for x in testing]
    predicted = clf.predict(X_test)
    payload = []
    for index, x in enumerate(X_test):
        arr = x
        arr.append(y_test[index])
        payload.append({
            "no": index + 1,
            "stasiuntv": arr[0],
            "genre": arr[1],
            "writer": arr[2],
            "director": arr[3],
            "actor": arr[4],
            "status": arr[5],
        })
    hasil = confusion_matrix(y_test, predicted)
    # binary accuracy from the confusion matrix, i.e. accuracy_score(y_test, predicted)
    akurasi = (hasil[0][0] + hasil[1][1]) / (hasil[0][0] + hasil[0][1] +
                                             hasil[1][0] + hasil[1][1])

    return render_template("pengujian.html",
                           hasil=hasil,
                           akurasi=round(akurasi * 100))
Example #15
def example_weather_nominal():
    path = (base_path / "weather-nominal.csv").resolve()
    series = pd.read_csv(path)
    # arrange table in X(features) and y(target)
    X = series.iloc[:, :-1]
    X = X.apply(LabelEncoder().fit_transform)
    y = series.iloc[:, -1]
    # apply GaussianNB and CategoricalNB
    gNB = GaussianNB()
    gNB.fit(X, y)
    cNB = CategoricalNB()
    cNB.fit(X, y)
    print(
        f"Prediction GaussianNB ([Sunny,Cool,High,True]): {gNB.predict([[2,0,0,1]])}"
    )
    print(f"Probability GaussianNB: {gNB.predict_proba([[2,0,0,1]])}")
    print("\n")
    print(
        f"Prediction CategoricalNB ([Sunny,Cool,High,True]): {cNB.predict([[2, 0, 0, 1]])}"
    )
    print(f"Probability CategoricalNB: {cNB.predict_proba([[2, 0, 0, 1]])}")
Example #16
def test_predict_meta_override():
    X = pd.DataFrame({"c_0": [1, 2, 3, 4]})
    y = np.array([1, 2, 3, 4])

    base = CategoricalNB()
    base.fit(pd.DataFrame(X), y)

    dd_X = dd.from_pandas(X, npartitions=2)
    dd_X._meta = pd.DataFrame({"c_0": [5]})

    # Failure when not providing predict_meta,
    # because of the value-dependent model
    wrap = ParallelPostFit(base)
    with pytest.raises(ValueError):
        wrap.predict(dd_X)

    # Success when providing a meta override
    wrap = ParallelPostFit(base, predict_meta=np.array([1]))
    result = wrap.predict(dd_X)
    expected = base.predict(X)
    assert_eq_ar(result, expected)
Example #17
def combine_two_categorical():
    """
    run this so see how combining 2 sklearn.CategoricalNB == combining them externally
    """
    data = Data()
    data.cleanse()
    data.encode()

    knr = np.array(data.x["Kontonummer"]).reshape((-1, 1))
    text = np.array(data.x["BLZ"]).reshape((-1, 1))

    k_cat_nb = CategoricalNB()
    t_cat_nb = CategoricalNB()

    k_cat_nb.fit(knr, data.y)
    t_cat_nb.fit(text, data.y)

    k_proba = k_cat_nb.predict_log_proba(knr)
    t_proba = t_cat_nb.predict_log_proba(text)

    # naive Bayes factorization: log P(y|x1,x2) = log P(y|x1) + log P(y|x2)
    # - log P(y) + const; renormalize with logsumexp so each row sums to one
    combined_proba = k_proba + t_proba - k_cat_nb.class_log_prior_
    combined_proba -= np.expand_dims(logsumexp(combined_proba, axis=1), axis=1)

    # now the same thing but in one cat_nb
    combi = data.x.loc[:, ["BLZ", "Kontonummer"]]

    c_cat_nb = CategoricalNB()

    c_cat_nb.fit(combi, data.y)

    proba = c_cat_nb.predict_log_proba(combi)

    diff = np.exp(proba) - np.exp(combined_proba)

    print("total difference in probabilities: %d" % np.sum(np.abs(diff)))
Example #18
def example_weather_numeric():
    path = (base_path / "weather-numeric.csv").resolve()
    series = pd.read_csv(path)
    # arrange table in X(features) and y(target)
    X = series.iloc[:, :-1]
    X.outlook = LabelEncoder().fit_transform(X.outlook)
    X.windy = LabelEncoder().fit_transform(X.windy)
    y = series.iloc[:, -1]
    # apply GaussianNB and CategoricalNB
    gNB = GaussianNB()
    gNB.fit(X, y)
    cNB = CategoricalNB()
    cNB.fit(X, y)
    print(
        f"Prediction GaussianNB ([Sunny,66,90,True]): {gNB.predict([[2, 66, 90, 1]])}"
    )
    print(f"Probability GaussianNB: {gNB.predict_proba([[2, 66, 90, 1]])}")
    print("\n")
    print(
        f"Prediction CategoricalNB ([Sunny,66,90,True]): {cNB.predict([[2, 66, 90, 1]])}"
    )
    print(f"Probability CategoricalNB: {cNB.predict_proba([[2, 66, 90, 1]])}")
def classificationCategoricalNaiveBayes():
    col_names = [
        '*', 'web1', 'web2', 'cosine', 'len', 'word', 'sameDomain', 'label'
    ]
    #load dataset
    pima = pd.read_csv("data.csv", names=col_names)

    #split dataset in features and target variable
    feature_cols = ['cosine', 'len', 'word', 'sameDomain']
    X = pima[feature_cols]  # Features
    y = pima.label  # Target variable

    #Split dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=4)  # 80% training and 20% test

    clf = CategoricalNB()
    clf.fit(X_train, y_train)

    # save the model
    with open('model.pkl', 'wb') as f:
        dump(clf, f)
    startTime = datetime.now()
    #Predict the response for test dataset
    y_pred = clf.predict(X_test)
    endTime = datetime.now()

    print("exec time :", endTime - startTime)

    #Model Accuracy, how often is the classifier correct?
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

    print("precision:", metrics.average_precision_score(y_test, y_pred))

    print("recall:", metrics.recall_score(y_test, y_pred))

    print()

    print(confusion_matrix(y_test, y_pred))
Example #20
def naive_bayes_adapter(**kwargs):
    # getting data from kwargs
    train = kwargs['train']
    test = kwargs['test']
    # merging data to get all unique values and to build one encoder for all of them
    merged_data = pd.concat([train, test])
    # adding unknown unique values to the train dataframe by appending a new
    # row with each unknown value while the rest are the most common values
    train = improove_train(train, merged_data)
    # building encoder
    merged_data_without_class = merged_data.drop(columns='class')
    encoder = OrdinalEncoder()
    encoder.fit(merged_data_without_class)
    # separating the classification column from the datasets
    train_without_class = train.drop(columns='class')
    test_without_class = test.drop(columns='class')
    train_classifications = train['class']
    test_classifications = test['class']
    # encoding them all
    encoded_train_without_class = encoder.transform(train_without_class)
    encoded_test_without_class = encoder.transform(test_without_class)
    encoded_train_classifications = train_classifications.map({
        'yes': 1,
        'no': 0
    })
    encoded_test_classifications = test_classifications.map({
        'yes': 1,
        'no': 0
    })
    # building the classifier
    clf = CategoricalNB(alpha=1)  # alpha=1 gives Laplace smoothing
    clf.fit(encoded_train_without_class, encoded_train_classifications)
    # predicting with the classifier
    predictions = clf.predict(encoded_test_without_class)
    # returning the confusion matrix and calculating the score
    return create_return_dict(predictions, encoded_test_classifications)
def test_categoricalnb():
    # Check the ability to predict the training set.
    clf = CategoricalNB()
    y_pred = clf.fit(X2, y2).predict(X2)
    assert_array_equal(y_pred, y2)

    X3 = np.array([[1, 4], [2, 5]])
    y3 = np.array([1, 2])
    clf = CategoricalNB(alpha=1, fit_prior=False)

    clf.fit(X3, y3)

    # Check error is raised for X with negative entries
    X = np.array([[0, -1]])
    y = np.array([1])
    error_msg = "X must not contain negative values."
    assert_raise_message(ValueError, error_msg, clf.predict, X)
    assert_raise_message(ValueError, error_msg, clf.fit, X, y)

    # Check error is raised for incorrect X
    X = np.array([[1, 4, 1], [2, 5, 6]])
    msg = "Expected input with 2 features, got 3 instead"
    assert_raise_message(ValueError, msg, clf.predict, X)

    # Test alpha
    X3_test = np.array([[2, 5]])
    # alpha=1 increases the count of all categories by one so the final
    # probability for each category is not 50/50 but 1/3 to 2/3
    bayes_numerator = np.array([[1 / 3 * 1 / 3, 2 / 3 * 2 / 3]])
    bayes_denominator = bayes_numerator.sum()
    assert_array_almost_equal(clf.predict_proba(X3_test),
                              bayes_numerator / bayes_denominator)

    # Assert category_count has counted all features
    assert len(clf.category_count_) == X3.shape[1]

    # Check sample_weight
    X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    y = np.array([1, 1, 2, 2])
    clf = CategoricalNB(alpha=1, fit_prior=False)
    clf.fit(X, y)
    assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([1]))

    for factor in [1., 0.3, 5, 0.0001]:
        X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
        y = np.array([1, 1, 2, 2])
        sample_weight = np.array([1, 1, 10, 0.1]) * factor
        clf = CategoricalNB(alpha=1, fit_prior=False)
        clf.fit(X, y, sample_weight=sample_weight)
        assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([2]))
Example #22
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

X_df = train_df

clf = CategoricalNB()

# zero out the problem rows listed in errorList
errorList = [658, 1562, 5532, 5629, 7401, 9458, 9981, 14080, 17258, 24716, 25047]
for row in errorList:
    binned_test_df.loc[row, :] = 0



predict = clf.fit(X_df, Y_df).predict(test_df.loc[2500:5000])

f = open("CNB.csv", "w")
f.write("ImageId,Label\n")
for i in range(0, predict.size):
    f.write("{},{}\n".format(i+1, predict[i]))

f.close()
Example #23
print("Confusion Matrix:")
print(confusion_matrix(y_test, prediction['Logistic Regression']))

#scoring with train data
print('train score:', LR_final.score(X_train_new, y_train))

# scoring with test data
print('test score:', LR_final.score(X_test_new, y_test))

LR_final.predict_proba(X_test_new)
"""# Naive Bayes"""

#use the same train test set as logistic regression
prediction = dict()
NB = CategoricalNB()
NB.fit(X_train_new, y_train)

prediction['Naive Bayes'] = NB.predict(X_test_new)

# accuracy, precision, recall, confusion matrix
print("Accuracy:")
print(accuracy_score(y_test, prediction['Naive Bayes']))
print("\n")
print("Classification report:")
print(classification_report(y_test, prediction['Naive Bayes']))
print("\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, prediction['Naive Bayes']))

#scoring with train data
print('train score:', NB.score(X_train_new, y_train))
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.7,
                                                    shuffle=True)

# feature extraction + normalization
pipe = Pipeline([
    ('count',
     CountVectorizer(max_features=100,
                     tokenizer=jieba_tokenizer,
                     stop_words=stop_words,
                     min_df=200)),
    ('tf-idf', TfidfTransformer()),
    ('norm', Normalizer()),
])

X_train = pipe.fit_transform(X_train).toarray()
print("train size", X_train.shape)
X_test = pipe.transform(X_test).toarray()

# train
model = CategoricalNB()
model.fit(X_train, y_train)

# model evaluation
y_train_pred = model.predict_proba(X_train)[:, 1]
y_test_pred = model.predict_proba(X_test)[:, 1]
plot_rocs([y_train, y_test], [y_train_pred, y_test_pred], ["train", "test"])
plot_pcs([y_train, y_test], [y_train_pred, y_test_pred], ["train", "test"])
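Note that CategoricalNB indexes each feature as an integer category code, so the float TF-IDF values produced by this pipeline will not fit as-is; a minimal sketch of one workaround (bin count chosen arbitrarily) that discretizes the features first:

from sklearn.preprocessing import KBinsDiscretizer

# turn each TF-IDF column into ordinal bin indices CategoricalNB can index
binner = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform")
X_train_binned = binner.fit_transform(X_train)
X_test_binned = binner.transform(X_test)

model = CategoricalNB()
model.fit(X_train_binned, y_train)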
Example #25
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_test = np.array(x_test)
    y_test = np.array(y_test)
    label_encoder = LabelEncoder()
    for i in disc_columns:
        x_train[i] = label_encoder.fit_transform(x_train[i])
        x_test[i] = label_encoder.fit_transform(x_test[i])
    n_b = MixedNB(categorical_features=disc_columns)

'''

# "uczymy" sie na zbiorze treningowym
start_time = time.time()
print("Learning and predicting with naive_bayes ...", end=" ")
n_b.fit(x_train, y_train)

# predict on the test set
y_pred = n_b.predict(x_test)
print("  took %s seconds " % round((time.time() - start_time), 5))

# the true classes of the test set are known, so we can check how "well" it went

metric_accuracy = metrics.accuracy_score(y_test, y_pred)
print("naive_bayes: accuracy = ", metric_accuracy)

print("full classification report:")
if type(classes_names) is not list:
    target_nms = classes_names.astype(str)
else:
    target_nms = classes_names
Example #26
X = data.drop(['y'], axis=1)

print(X)

Y = data.loc[:, 'y']
print(Y)

# build the model
from sklearn.naive_bayes import CategoricalNB

# create a model instance
model = CategoricalNB()

# train the model
model.fit(X, Y)

y_predict_prob = model.predict_proba(X)

print(y_predict_prob)

# output the predicted y
y_predict = model.predict(X)
print(y_predict)

# compute the model accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(Y, y_predict)
print(accuracy)

Example #27
def test_alpha():
    # Setting alpha=0 should not produce NaN results when p(x_i|y_j)=0 occurs
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    nb = BernoulliNB(alpha=0.0)
    msg = "alpha too small will result in numeric errors, setting alpha = 1.0e-10"
    with pytest.warns(UserWarning, match=msg):
        nb.partial_fit(X, y, classes=[0, 1])
    with pytest.warns(UserWarning, match=msg):
        nb.fit(X, y)
    prob = np.array([[1, 0], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    nb = MultinomialNB(alpha=0.0)
    with pytest.warns(UserWarning, match=msg):
        nb.partial_fit(X, y, classes=[0, 1])
    with pytest.warns(UserWarning, match=msg):
        nb.fit(X, y)
    prob = np.array([[2.0 / 3, 1.0 / 3], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    nb = CategoricalNB(alpha=0.0)
    with pytest.warns(UserWarning, match=msg):
        nb.fit(X, y)
    prob = np.array([[1.0, 0.0], [0.0, 1.0]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    # Test sparse X
    X = scipy.sparse.csr_matrix(X)
    nb = BernoulliNB(alpha=0.0)
    with pytest.warns(UserWarning, match=msg):
        nb.fit(X, y)
    prob = np.array([[1, 0], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    nb = MultinomialNB(alpha=0.0)
    with pytest.warns(UserWarning, match=msg):
        nb.fit(X, y)
    prob = np.array([[2.0 / 3, 1.0 / 3], [0, 1]])
    assert_array_almost_equal(nb.predict_proba(X), prob)

    # Test for alpha < 0
    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])
    expected_msg = re.escape(
        "Smoothing parameter alpha = -1.0e-01. alpha should be > 0."
    )
    b_nb = BernoulliNB(alpha=-0.1)
    m_nb = MultinomialNB(alpha=-0.1)
    c_nb = CategoricalNB(alpha=-0.1)
    with pytest.raises(ValueError, match=expected_msg):
        b_nb.fit(X, y)
    with pytest.raises(ValueError, match=expected_msg):
        m_nb.fit(X, y)
    with pytest.raises(ValueError, match=expected_msg):
        c_nb.fit(X, y)

    b_nb = BernoulliNB(alpha=-0.1)
    m_nb = MultinomialNB(alpha=-0.1)
    with pytest.raises(ValueError, match=expected_msg):
        b_nb.partial_fit(X, y, classes=[0, 1])
    with pytest.raises(ValueError, match=expected_msg):
        m_nb.partial_fit(X, y, classes=[0, 1])
Example #28
    testingSet = getTestingSet(full_data)
    dataset = trainingSet + testingSet

    print('Classifying')

    classifier = CategoricalNB()
    encoder = OrdinalEncoder()
    encoder.fit([row[:-1] for row in dataset])

    print('Encoding')

    X = [row[:-1] for row in trainingSet]
    X = encoder.transform(X)
    Y = [row[-1] for row in trainingSet]

    classifier.fit(X, Y)

    test_set_x = encoder.transform([row[:-1] for row in testingSet])

    test_set_y = [row[-1] for row in testingSet]

    print('Predicting')
    predictions = classifier.predict(test_set_x)

    right = 0

    for y, prediction in zip(test_set_y, predictions):
        if y == prediction:
            right += 1

    accuracy = right / len(testingSet)
Example #29
drop_cols = ['id']

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score

for i in cat_cols:
    le = LabelEncoder()
    data[i] = le.fit_transform(data[i])

data.drop(columns=drop_cols, inplace=True)
train_df = data.loc[data['risk_flag'] != -1]
test_df = data.loc[data['risk_flag'] == -1]

X_tr, X_tst, y_tr, y_tst = train_test_split(
    train_df.drop(columns=['risk_flag']),
    train_df['risk_flag'],
    stratify=train_df['risk_flag'])

from sklearn.naive_bayes import CategoricalNB
clf = CategoricalNB()
clf.fit(X_tr, y_tr)

print(clf.category_count_)  # per-feature category counts (CategoricalNB has no feature_count_)

k = clf.predict(X_tst)
# AUC on hard 0/1 predictions; predict_proba(X_tst)[:, 1] would give a smoother score
print(roc_auc_score(y_tst, k))

print(clf.predict(X_tst[2:3]))
Example #30
target_names = ["0", "1"]

dataset = {
    "data": dataArray,
    "target": target,
    "feature_names": columnsIncluded,
    "target_names": target_names
}

# predict and output the test result
df_test = pd.DataFrame(data_test)
df_test.loc[df_test["Geography"]=="France", "Geography"] = 0
df_test.loc[df_test["Geography"]=="Spain", "Geography"] = 1
df_test.loc[df_test["Geography"]=="Germany", "Geography"] = 2
df_test.loc[df_test["Gender"]=="Male", "Gender"] = 0
df_test.loc[df_test["Gender"]=="Female", "Gender"] = 1

##########################################################
# train the model
clf = CategoricalNB(alpha=1)
clf.fit(dataset['data'], dataset['target'])
predictedTestResult = clf.predict(df_test[columnsIncluded].values)

df_testOutput = df_test[["RowNumber"]].copy()  # copy so insert() does not warn on a view
df_testOutput.insert(1, "Exited", predictedTestResult, True)
df_testOutput.to_csv("submission_2_Bayes.csv", index=False)

# compute f1 score
f1_score_result = evaluateTask2.f1_score(predictedTestResult)
print("f1-score: " + str(f1_score_result))