Ejemplo n.º 1
0
## quick test ##
# Repeated random hold-out evaluation of NaiveBayes on the chess data: every
# round fits on a random subset of rows, scores the remaining rows, and records
# the accuracy so that both the mean and the spread can be inspected.
#print("starting to train Graph")
class_col_name = "ak"
#traindf = pd.read_csv("../data/train-chess.csv")
#testdf = pd.read_csv("../data/test-chess.csv")
df = pd.read_csv("../../TAN/data/chess.csv")
n = df.shape[0]

power = []  # hold-out accuracy of each round
for _ in tqdm(range(1000)):
    # Each row joins the training set independently with probability 0.75,
    # so the split sizes vary slightly from round to round.
    ind = np.random.rand(n) < 0.75
    traindf = df.loc[ind]
    testdf = df.loc[~ind]
    model = NaiveBayes(traindf, class_col_name=class_col_name)

    results = model.Predict(testdf)
    # The prediction is the label of the column holding the row-wise maximum
    # (presumably one score column per class -- confirm against
    # NaiveBayes.Predict).  It is stored under class_col_name rather than a
    # hard-coded "ak", so retargeting the script to another class column only
    # requires editing the assignment at the top.
    results[class_col_name] = results.idxmax(axis=1).values
    matches = testdf[class_col_name].values == results[class_col_name]
    accuracy = matches.mean()
    power.append(accuracy)
    #print(f"TAN accuracy: {round(accuracy, 4)}")

# Mean accuracy over all rounds.
answer = sum(power) / len(power)
print(f"final answer: {round(answer,4)}")

# Histogram of the per-round accuracies.
res = pd.DataFrame(power, columns=["accuracy"])
res.hist(bins=20)
plt.show()

#with open("results.txt", "w+") as myfile:
#    for line in power:
#        myfile.write(f"{line}\n")
Ejemplo n.º 2
0
# Fit NaiveBayes on a random subset of the diabetes data, report its accuracy
# on those same training rows, and derive a BIC value from the deviance.
class_col_name = "IsDiabetic"
#df = pd.read_csv("../data/Pima.tr.csv")
#class_col_name = "type"
# NOTE(review): `df` is not created in this snippet; it must already be loaded
# by the time this runs.
n = df.shape[0]

# Each row joins the training set independently with probability 0.75.
ind = np.random.rand(n) < 0.75
traindf = df.loc[ind]
testdf = df.loc[~ind]  # NOTE(review): not used anywhere below
traincols = [
    'NoPregnancies', 'PlasmaGlucose', 'DiastolicBP', 'TricepsSkinThickness',
    '2HourSerumInsulin', 'BMI', 'DiabetesPedigreeFunc', 'Age', 'IsDiabetic'
]
nbmodel = NaiveBayes(traindf[traincols],
                     class_col_name=class_col_name,
                     progress_bar=False)
# Predictions are made for the training rows themselves, so the figure printed
# below is a training accuracy, not a hold-out accuracy.
results = nbmodel.Predict(newdf=traindf)
accuracy = (traindf[class_col_name].values == results[class_col_name]).mean()
# NOTE(review): the label says "TAN" although the model above is NaiveBayes;
# the string is left untouched -- confirm which label is intended.
print(f"TAN accuracy: {round(accuracy, 4)}")

# Columns labelled 0 and 1 of the prediction frame (presumably the per-class
# probabilities -- confirm against NaiveBayes.Predict).
Lik = results[[0, 1]]
loglike = []
# NOTE(review): `g` is not defined in this snippet.  The loop unpacks
# (name, frame) pairs and indexes each frame by its own name, which looks like
# iteration over a pandas groupby keyed by class label (possibly built from
# `Lik`) -- confirm and define `g` before running.
for name, frame in g:
    s = 1 - frame[name]  ## calc deviance from true prob
    slog = np.log(s).sum()
    loglike.append(slog)

deviance = -2 * sum(loglike)
# Count the predictors the model was actually fitted on.  The model only sees
# traindf[traincols], so counting traindf.columns would inflate k whenever df
# carries columns outside traincols.
k = len(traincols) - 1  ## -1 for class column
n = traindf.shape[0]

BIC = deviance + k * (np.log(n) - np.log(2 * np.pi))
Ejemplo n.º 3
0
# Repeated random hold-out evaluation of NaiveBayes: 100 rounds, each fitting
# on a random subset of rows and scoring the rest; the per-round accuracies
# are then shown as a histogram.
col = 'IsDiabetic'
#df = pd.read_csv("../../TAN/data/Pima.tr.csv")
#print(df.dtypes)
#col = 'type'
n = len(df)

results = []  # hold-out accuracy of each round
for i in tqdm(range(100)):
    # Each row joins the training set independently with probability 0.75.
    ind = np.random.rand(n) < 0.75
    traindf, testdf = df.loc[ind], df.loc[~ind]

    nbmodel = NaiveBayes(traindf, col)
    testresults = nbmodel.Predict(testdf)

    # Predicted class = label of the column holding the row-wise maximum.
    testresults[col] = testresults.idxmax(axis='columns')
    hits = testdf[col].values == testresults[col]
    accuracy = np.mean(hits)
    #print(accuracy)
    results.append(accuracy)

res = pd.DataFrame({'accuracy': results})
res.hist(bins=20)
plt.show()

#with open("tmp.pickle", "wb+") as myfile:
#    pickle.dump(nbmodel, myfile)

#print('delete tmp.pickle')
Ejemplo n.º 4
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 11 12:06:30 2017

@author: jonathan
"""
import sys
sys.path.append("../src/")
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np
from NaiveBayes import NaiveBayes

# Single random hold-out evaluation of NaiveBayes on the digits training data.
df = pd.read_csv("../data/digits/train.csv")
n = len(df)
# Each row joins the training set independently with probability 0.75.
ind = np.random.rand(n) < 0.75

traindf, testdf = df[ind], df[~ind]

## fit on the training rows
nbmodel = NaiveBayes(traindf, 'label', progress_bar=True)
## predict the held-out rows
testresults = nbmodel.Predict(testdf, progress_bar=True)
## fraction of held-out rows whose predicted label equals the true label
matches = testdf['label'].values == testresults['label']
accuracy = np.mean(matches)
print(accuracy)