Example #1
0
def test_categoricalnb():
    """End-to-end checks for CategoricalNB.

    Covers: training-set prediction round-trip, ``n_categories_``
    inference, rejection of negative / wrongly-shaped inputs, the
    Laplace-smoothing (alpha) arithmetic, ``category_count_``, and
    ``sample_weight`` handling.

    NOTE(review): the assertions are order-sensitive. The ``clf`` fitted
    on (X3, y3) is reused for the alpha check further down, *after* calls
    that are expected to raise (a fit that raises leaves the previously
    fitted state intact) — do not reorder statements.
    """
    # Check the ability to predict the training set.
    clf = CategoricalNB()
    y_pred = clf.fit(X2, y2).predict(X2)
    assert_array_equal(y_pred, y2)

    X3 = np.array([[1, 4], [2, 5]])
    y3 = np.array([1, 2])
    clf = CategoricalNB(alpha=1, fit_prior=False)

    clf.fit(X3, y3)
    # Per-feature category count is max category value + 1 (codes 0..max).
    assert_array_equal(clf.n_categories_, np.array([3, 6]))

    # Check error is raised for X with negative entries
    X = np.array([[0, -1]])
    y = np.array([1])
    error_msg = "Negative values in data passed to CategoricalNB (input X)"
    assert_raise_message(ValueError, error_msg, clf.predict, X)
    assert_raise_message(ValueError, error_msg, clf.fit, X, y)

    # Check error is raised for incorrect X
    X = np.array([[1, 4, 1], [2, 5, 6]])
    msg = "Expected input with 2 features, got 3 instead"
    assert_raise_message(ValueError, msg, clf.predict, X)

    # Test alpha
    X3_test = np.array([[2, 5]])
    # alpha=1 increases the count of all categories by one so the final
    # probability for each category is not 50/50 but 1/3 to 2/3
    bayes_numerator = np.array([[1/3*1/3, 2/3*2/3]])
    bayes_denominator = bayes_numerator.sum()
    assert_array_almost_equal(clf.predict_proba(X3_test),
                              bayes_numerator / bayes_denominator)

    # Assert category_count has counted all features
    assert len(clf.category_count_) == X3.shape[1]

    # Check sample_weight
    X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
    y = np.array([1, 1, 2, 2])
    clf = CategoricalNB(alpha=1, fit_prior=False)
    clf.fit(X, y)
    assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([1]))
    assert_array_equal(clf.n_categories_, np.array([2, 2]))

    # Up-weighting sample 3 (class 2) flips the prediction for [0, 0] from
    # class 1 to class 2 — and the flip is scale-invariant in the weights.
    for factor in [1., 0.3, 5, 0.0001]:
        X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]])
        y = np.array([1, 1, 2, 2])
        sample_weight = np.array([1, 1, 10, 0.1]) * factor
        clf = CategoricalNB(alpha=1, fit_prior=False)
        clf.fit(X, y, sample_weight=sample_weight)
        assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([2]))
        assert_array_equal(clf.n_categories_, np.array([2, 2]))
Example #2
0
def perform_bayes(df):
    """Label-encode *df*, train a CategoricalNB, report accuracy, classify
    one example patient, and plot the ROC curve and confusion matrix.

    The last column of *df* is treated as the target; all other columns
    are features. Relies on module-level helpers/imports:
    ``build_labelencoders``, ``train_test_split``, ``CategoricalNB``,
    ``accuracy_score``, ``roc_curve``, ``roc_auc_score``,
    ``plot_confusion_matrix``, ``plt``, ``pd``, ``np``.
    """
    les = build_labelencoders(df)
    # Encode every column with its matching LabelEncoder; pd.DataFrame on
    # the list of column arrays yields columns-as-rows, hence transpose.
    encoded = [les[i].transform(df.iloc[:, i].values)
               for i in range(len(df.columns))]
    res = pd.DataFrame(encoded).transpose()
    x = res.iloc[:, :-1]
    y = res.iloc[:, -1:]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)

    model = CategoricalNB()
    model.fit(x_train, y_train.values.ravel())

    y_pred = model.predict(x_test)
    # Probability of the positive class (column 1) for the ROC curve.
    y_pred_probability = model.predict_proba(x_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred) * 100
    print(accuracy)

    # example patient
    test = ['50-59','ge40','50-54','24-26','no','1','right','left_up','yes']
    print(test)

    # transform using labelencoders
    for i in range(len(test)):
        e = test[i]
        test[i] = les[i].transform(np.array(e).reshape(1, ))
    test = np.array(test)

    # do prediction
    y = model.predict(test.reshape(1, -1))

    # translate back
    y = les[-1].inverse_transform(y)[0]
    print(y)

    a, b, _ = roc_curve(y_test, y_pred_probability)
    area_under_curve = roc_auc_score(y_test, y_pred_probability)
    plt.plot(a, b, label="area under curve="+str(area_under_curve))
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    # BUG FIX: the original had a bare `plt.axis` attribute access — a
    # no-op (the method was never called) — which has been removed.
    plt.legend(loc=4)
    plot_confusion_matrix(model, x_train, y_train.values.ravel(), normalize='true', display_labels=les[-1].inverse_transform([0, 1]))
    plt.show()
Example #3
0
def test_predict_proba_meta_override():
    """A value-dependent estimator needs an explicit predict_proba meta
    when wrapped for dask; verify both the failure and the override path.
    """
    frame = pd.DataFrame({"c_0": [1, 2, 3, 4]})
    labels = np.array([1, 2, 3, 4])

    estimator = CategoricalNB()
    estimator.fit(pd.DataFrame(frame), labels)

    ddf = dd.from_pandas(frame, npartitions=2)
    # Force a meta containing a value the fitted model has never seen.
    ddf._meta = pd.DataFrame({"c_0": [5]})

    # Without a predict_proba_meta, meta inference must fail because the
    # model's output depends on the input values.
    with pytest.raises(ValueError):
        ParallelPostFit(estimator).predict_proba(ddf)

    # Providing the meta override makes the call succeed and the result
    # match plain (non-dask) predict_proba.
    wrapper = ParallelPostFit(estimator,
                              predict_proba_meta=np.array([[0.0, 0.1, 0.8, 0.1]]))
    outcome = wrapper.predict_proba(ddf)
    assert_eq_ar(outcome, estimator.predict_proba(frame))
Example #4
0
# Show the feature matrix X (defined earlier, outside this excerpt).
print(X)

Y = data.loc[:,'y']
print(Y)

# Build the model
from sklearn.naive_bayes import CategoricalNB

# Instantiate the model
model = CategoricalNB()

# Train the model
model.fit(X, Y)

# Predicted class probabilities on the training data.
# (Variable name preserves the original's "prdict" typo.)
y_prdict_prob = model.predict_proba(X)

print(y_prdict_prob)

# Output the predicted y
y_predict = model.predict(X)
print(y_predict)

# Compute the model's accuracy on its own training data
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(Y, y_predict)
print(accuracy)


# Predict on a test sample (single row with 6 binary features —
# presumably matching X's columns; TODO confirm).
X_test = np.array([[0,0,0,1,1,0]])
# Training-set scores: `y` and `pred` here are the training-split labels
# and predictions computed earlier (outside this excerpt) — TODO confirm.
score_training = [m(y, pred) for m in metrics]

#Predict on test set
x, y = prep_nb(x_test, y_test)
pred = nb.predict(x)

#Compute scores
score = [m(y, pred) for m in metrics]

# We can see that training scores and test scores are equivalent, i.e. we are confident to not have overfitted.

#%%
# Row-normalized confusion matrix for the fitted naive Bayes model.
plot_confusion_matrix(nb, x, y, cmap=plt.cm.Blues, normalize='true')
#fig =plot_roc_curve(nb, x,y, response_method='predict_proba')

# Positive-class probability (column 1) drives the ROC curve.
y_prop = nb.predict_proba(x)
y_prop = y_prop[:, 1]
roc = roc_curve(y_test, y_prop)

# roc = (fpr, tpr, thresholds); plot TPR vs FPR with AUC in the legend.
label = 'AUC: {:.4}'.format(auc(roc[0], roc[1]))
plt.plot(roc[0], roc[1], label=label)
# Diagonal reference line = performance of a random classifier.
plt.plot([0, 1], [0, 1],
         linestyle='--',
         lw=1,
         color='gray',
         label='Random',
         alpha=.8)
plt.legend()
plt.title('ROC Curve')
plt.xlabel('FPR')
plt.ylabel('TPR')
Example #6
0
    (train_X['bank_transaction_type'].values.astype('U')).reshape(-1, 1))
# Encode the transaction-type strings as a single categorical feature.
# NOTE(review): fit_transform on the dev split re-fits the encoder; for a
# held-out set this should normally be encoder.transform(...) so dev uses
# the categories learned from train — confirm intent.
dev_cat = encoder.fit_transform(
    (dev_X['bank_transaction_type'].values.astype('U')).reshape(-1, 1))
clf_type = CategoricalNB()
clf_type.fit(train_cat,
             train_labels['bank_transaction_category'].values.astype('U'))
predicted = clf_type.predict(dev_cat)
# Accuracy = fraction of dev rows whose predicted category matches.
accuracy = np.mean(
    predicted == dev_labels['bank_transaction_category'].values.astype('U'))
print(accuracy, 'transaction type')

# combine features
# weighted probabilites
# Blend the three per-feature classifiers' class probabilities; the
# weights (0.91 / 0.6 / 0.6) are hand-tuned constants.
total_probs = 0.91 * clf_desc.predict_proba(
    X_dev_tfidf) + 0.6 * clf_amount.predict_proba(dev_amount.reshape(
        -1, 1)) + 0.6 * clf_type.predict_proba(dev_cat)
index = clf_desc.classes_

# Pick the highest-probability class per row (nanargmax skips NaNs).
predicted = []
for probs in total_probs:
    max_index = np.nanargmax(probs)
    predicted.append(index[max_index])

accuracy = np.mean(
    np.array(predicted) ==
    dev_labels['bank_transaction_category'].values.astype('U'))
print(accuracy, 'NB combination')

# test data on the selection of models

#svm
Example #7
0
# Description: train a categorical naive Bayes model on task2_data.csv and
# export probabilities + predictions for five hand-written test rows.
# Author: Zhu Yong
# Time: 2021/3/6 10:41
import pandas as pd
import numpy as np
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score

# Load the data; every column except "y" is a feature.
df = pd.read_csv("task2_data.csv")
features = df.drop(["y"], axis=1)
target = df.loc[:, "y"]

# Fit and report training-set accuracy.
clf = CategoricalNB()
clf.fit(features, target)
print(accuracy_score(target, clf.predict(features)))

# Five manually specified test samples.
samples = np.array([[2, 1, 1, 1, 1], [2, 1, 1, 1, 0], [2, 1, 1, 0, 0],
                    [2, 1, 0, 0, 0], [2, 0, 0, 0, 0]])
probabilities = clf.predict_proba(samples)
predictions = clf.predict(samples)

# Assemble features + class probabilities + hard prediction into one
# labeled frame and write it out.
combined = np.concatenate(
    (samples, probabilities, predictions.reshape(5, 1)), axis=1)
result = pd.DataFrame(combined,
                      columns=[
                          "score", "school", "award", "gender", "English",
                          "p0", "p1", "p2", "y_test_predict"
                      ])
result.to_csv("test_data_result.csv")
Example #8
0
from Postprocessing import *



from sklearn.naive_bayes import CategoricalNB
from Preprocessing import preprocess
from Postprocessing import *
from utils import *

# Feature names used to build the dataset.
metrics = ["race", "sex", "age", 'c_charge_degree', 'priors_count', 'c_charge_desc']
training_data, training_labels, test_data, test_labels, categories, mappings = preprocess(metrics)

# Fit a categorical naive Bayes classifier on the training split.
NBC = CategoricalNB()
NBC.fit(training_data, training_labels)

# Positive-class probability for every sample on both splits.
training_class_predictions = NBC.predict_proba(training_data)
test_class_predictions = NBC.predict_proba(test_data)
training_predictions = [training_class_predictions[k][1]
                        for k in range(len(training_labels))]
test_predictions = [test_class_predictions[k][1]
                    for k in range(len(test_labels))]

# Group outcomes by race, then post-process toward equal opportunity.
training_race_cases = get_cases_by_metric(training_data, categories, "race", mappings, training_predictions, training_labels)
test_race_cases = get_cases_by_metric(test_data, categories, "race", mappings, test_predictions, test_labels)

training_race_cases, thresholds = enforce_equal_opportunity(training_race_cases, 0.01)
Example #9
0
# label encoder for target.
# FIX: fit_transform both fits and transforms, so the separate
# le_attend.fit(...) call that preceded it was redundant and was removed.
attend_y = le_attend.fit_transform(globo_df_long['attended'])

# view encodings
print(oe_weather.categories_)
# [array(['cloudy', 'rainy', 'snowy', 'sunny'], dtype=object)]

print(le_attend.classes_)
# ['no' 'yes']


# set and fit classifier on the encoded weather feature
clf = CategoricalNB()
clf.fit(weather, attend_y)

# predict and view attendance for every distinct weather encoding
weather_classes = np.unique(weather)

for i in weather_classes:
    print("weather", i, "-",
          "attendance probability:", np.round(clf.predict_proba([[i]]), 2),
          ", predicted attendance:", clf.predict([[i]])[0])


# weather 0 - attendance probability: [[0.32 0.68]] , predicted attendance: 1
# weather 1 - attendance probability: [[0.59 0.41]] , predicted attendance: 0
# weather 2 - attendance probability: [[0.66 0.34]] , predicted attendance: 0
# weather 3 - attendance probability: [[0.32 0.68]] , predicted attendance: 1
Example #10
0
clf = CategoricalNB()

# Fit on inputs/target prepared earlier (outside this excerpt).
clf.fit(inputs, target)

# Collect every key of dict1 except the 'junk' sentinel entry.
list0 = []

for i in dict1:
    if i != 'junk':
        list0.append(i)

# Wrap each encoded value in its own single-feature row for predict_proba.
list2 = []
for i in list0:
    list2.append([dict1[i]])

# Per-row class probabilities; one row per non-junk key of dict1.
list1 = clf.predict_proba(list2)

# Class labels to exclude when ranking predictions below.
list3 = [
    'Turtle Rock Medallion', 'Misery Mire Medallion', 'Waterfall Bottle',
    'Pyramid Bottle'
]

dict4 = {}
# NOTE: this initial j is immediately shadowed by the loop variable.
j = 0
for j in range(len(list1)):
    # Class indices sorted ascending by probability (all of them).
    top_idx = np.argsort(list1[j])[-len(list1[j]):]
    top_values = [list1[j][i] for i in top_idx]
    counter = 0
    totalPercent = 0
    # Walk classes from most to least probable; dict3 presumably maps a
    # stringified class index to its label — TODO confirm.
    for i in top_idx[::-1]:
        if dict3[str(i)] not in list3:
Example #11
0
# import matplotlib.pyplot as plt

X_train = train_data.iloc[:, 2:-1].values  # feature columns (cols 2..-2; original comment said "last 3 columns" — TODO confirm)
y_train = train_data.iloc[:, 5].values  # label values

X_test = test_data.iloc[:, 2:].values  # test features (columns from index 2 onward)

from sklearn.naive_bayes import CategoricalNB

# Create a categorical naive Bayes classifier
# (original comment said "Gaussian", but CategoricalNB is used)
model = CategoricalNB()  # categorical naive Bayes algorithm is used

# Train the model using the training sets
model.fit(X_train, y_train)

testResult = model.predict_proba(
    X_test)  # probabilities for 2 classes: 0 = survives, 1 = death

testResultDF = pd.DataFrame(columns=["0 yaşam", "1 ölüm"], data=testResult)

# Threshold the death-probability column at 0.50 for a hard 0/1 estimate.
DeathEstimation = []
for i in range(0, len(testResultDF)):
    if testResultDF["1 ölüm"][i] > 0.50:
        DeathEstimation.append(1)
    else:
        DeathEstimation.append(0)
testResultDF.insert(0, "death_estimation value", DeathEstimation)

writer = pd.ExcelWriter('quiz3_bayes_classification_out_nuriozbey.xlsx',
                        engine='xlsxwriter')

# Write each dataframe to a different worksheet.
Example #12
0
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import brier_score_loss

# Two Gaussian blobs -> a binary classification toy problem.
X, y = make_blobs(n_samples=[500, 500],
                  centers=[[0.0, 0.0], [2.0, 2.0]],
                  cluster_std=[0.5, 0.5],
                  random_state=0,
                  shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=420)

# Discretize: quantile-bin each feature into 10 ordinal categories so the
# continuous features can be fed to CategoricalNB.
from sklearn.preprocessing import KBinsDiscretizer
binner = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
binner.fit(X_train)
train_binned = binner.transform(X_train)
test_binned = binner.transform(X_test)

print('分类值建模')
model = CategoricalNB()
model.fit(train_binned, y_train)
print('test accuracy: {}'.format(model.score(test_binned, y_test)))
print('test brier_score_loss: {}'.format(
    brier_score_loss(y_test, model.predict_proba(test_binned)[:, 1],
                     pos_label=1)))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split

# Heart-disease dataset fetched over HTTP.
df = pd.read_csv('https://raw.githubusercontent.com/grbruns/cst383/master/heart.csv')
# NOTE(review): this assumes 'output' is coded so that subtracting 1 gives
# valid 0-based labels; if the column is already 0/1, this yields -1 and
# CategoricalNB rejects negative categories — confirm against the CSV.
df['output'] = df['output'] - 1

# Two categorical predictors only.
predictors = ['chestpain', 'exercise']
X = df[predictors].values
y = df['output'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Baseline: predict the training-set median label for everyone.
blind_prediction = np.median(y_train) #median? i thought blind was mean?
print((y_test == blind_prediction).mean())

clf = CategoricalNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test) # why do we want probability/what probability

# Bare expressions below are notebook-style cells; in a plain script their
# return values are discarded.
clf.score(X_test, y_test)

df.describe()
# Train/test split (70% train, shuffled).
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.7,
                                                    shuffle=True)

# Feature extraction + normalization: bag-of-words (jieba tokenizer,
# capped at 100 features) -> tf-idf -> L2 normalization.
pipe = Pipeline([
    ('count',
     CountVectorizer(max_features=100,
                     tokenizer=jieba_tokenizer,
                     stop_words=stop_words,
                     min_df=200)),
    ('tf-idf', TfidfTransformer()),
    ('norm', Normalizer()),
])

X_train = pipe.fit_transform(X_train).toarray()
print("train size", X_train.shape)
X_test = pipe.transform(X_test).toarray()

# Train.
# NOTE(review): CategoricalNB expects non-negative integer category codes;
# feeding it continuous, normalized tf-idf values is questionable —
# MultinomialNB is the usual choice for tf-idf features. Confirm intent.
model = CategoricalNB()
model.fit(X_train, y_train)

# Model evaluation: positive-class probabilities for ROC / PR curves.
y_train_pred = model.predict_proba(X_train)[:, 1]
y_test_pred = model.predict_proba(X_test)[:, 1]
plot_rocs([y_train, y_test], [y_train_pred, y_test_pred], ["train", "test"])
plot_pcs([y_train, y_test], [y_train_pred, y_test_pred], ["train", "test"])
Example #15
0
from sklearn.neighbors import NearestNeighbors as kNN
from sklearn.neighbors import DistanceMetric as DM
import scipy
import sympy
import statsmodels.api as stats
import sklearn.naive_bayes as nb
from sklearn.naive_bayes import CategoricalNB

# Purchase-likelihood data: three categorical predictors plus the target.
purchase = pd.read_csv(
    "F:/Assigmnents/Machine Learning/Assigmnent_04/Purchase_Likelihood.csv",
    delimiter=',',
    usecols=['group_size', 'homeowner', 'married_couple', 'insurance'])

purchase = purchase.dropna()

feature = ['group_size', 'homeowner', 'married_couple']
target = ['insurance']

xTrain = purchase[feature].astype('category')
# NOTE(review): yTrain is a single-column DataFrame; sklearn expects a 1-D
# y (e.g. purchase['insurance']) and will warn and ravel it — confirm.
yTrain = purchase[target].astype('category')

model = CategoricalNB()
fitt = model.fit(xTrain, yTrain)

# One row per distinct (group_size, homeowner, married_couple) combination.
xTest = xTrain.groupby(feature).first().reset_index()
xTest = pd.DataFrame(xTest)
# Bare expression: notebook-style cell output; a no-op in a plain script.
xTest

# Predicted class probabilities for every distinct feature combination.
pro = model.predict_proba(xTest)

print(pro)
Example #16
0
# Evaluate the naive Bayes predictions stored in `prediction`
# (computed earlier, outside this excerpt).
print("Acurracy:")
print(accuracy_score(y_test, prediction['Naive Bayes']))
print("\n")
print("Classfication report:")
print(classification_report(y_test, prediction['Naive Bayes']))
print("\n")
print("Confusion Matrix:")
print(confusion_matrix(y_test, prediction['Naive Bayes']))

#scoring with train data
print('train score:', NB.score(X_train_new, y_train))

# scoring with test data
print('test score:', NB.score(X_test_new, y_test))

# Notebook-style cell: the returned probabilities are discarded here.
NB.predict_proba(X_test_new)
"""# Random Forest"""

#reassemble training dataset
#for numerical features, use the ones selected in Logistic Regression
#for categorical features, use the datasets after applying label encoding

#training dataset
x = X_sm_num.drop(columns=['hour', 'N1', 'N2', 'N5', 'N6', 'N7'], axis=1)
train1 = x.join(X_sm_c1).join(X_sm['newlabel'])
# Bare expression: notebook cell output, no effect in a plain script.
train1

#split the training dataset into train and test set
# NOTE(review): X1 still contains the 'newlabel' target column, so any
# model fit on this split would see the label (leakage) — confirm whether
# the column is dropped later.
X1 = train1
y1 = train1['newlabel']
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2)
# 40-fold stratified CV: collect out-of-fold validation probabilities and
# fold-averaged test probabilities for a CategoricalNB(alpha=5) model.
folds = StratifiedKFold(n_splits=40, shuffle=True, random_state=1990)

# BUG FIX: time.clock() was removed in Python 3.8; time.perf_counter() is
# the documented replacement for elapsed-time measurement.
t1 = time.perf_counter()
traintion = np.zeros(len(train))   # averaged in-fold training probabilities
validation = np.zeros(len(train))  # out-of-fold validation probabilities
predictions = np.zeros(len(test))  # test probabilities averaged over folds
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target)):
    print("fold n°{}".format(fold_))
    train_x = train.iloc[trn_idx][usedfeatures].reset_index(drop=True)
    valid_x = train.iloc[val_idx][usedfeatures].reset_index(drop=True)
    target_train = target.iloc[trn_idx].reset_index(drop=True)
    target_valid = target.iloc[val_idx].reset_index(drop=True)

    CB = CategoricalNB(alpha=5)
    CB.fit(train_x, target_train)
    # Each sample appears in the training part of n_splits - 1 folds,
    # hence the divisor when accumulating its average train probability.
    traintion[trn_idx] += CB.predict_proba(train_x)[:,
                                                    1] / (folds.n_splits - 1)
    validation[val_idx] = CB.predict_proba(valid_x)[:, 1]

    # Average the full test-set probabilities over all folds.
    predictions += CB.predict_proba(test[usedfeatures])[:, 1] / folds.n_splits
t2 = time.perf_counter() - t1
print("Train AUC score: {:<8.5f}".format(roc_auc_score(target, traintion)))
print("Valid AUC score: {:<8.5f}".format(roc_auc_score(target, validation)))
# Observed AUC at different alpha values:
#0.5
# Train AUC score: 0.79510
# Valid AUC score: 0.78065
# 1
# Train AUC score: 0.79500
# Valid AUC score: 0.78083
# 5
# Train AUC score: 0.79385
# Valid AUC score: 0.78095