## quick test ##
# Monte-Carlo cross-validation of NaiveBayes on the chess data set:
# 1000 independent random 75/25 train/test splits, report the mean test
# accuracy and plot the distribution of per-split accuracies.
class_col_name = "ak"
# alternative fixed split (pre-made train/test files):
#traindf = pd.read_csv("../data/train-chess.csv")
#testdf = pd.read_csv("../data/test-chess.csv")
df = pd.read_csv("../../TAN/data/chess.csv")
n = len(df)

power = []  # one test accuracy per random split
for x in tqdm(range(1000)):
    # boolean mask: ~75% of rows go to training, the rest to testing
    ind = np.random.rand(n) < 0.75
    traindf = df.loc[ind]
    testdf = df.loc[~ind]

    model = NaiveBayes(traindf, class_col_name=class_col_name)
    results = model.Predict(testdf)
    # predicted class = column of the prediction frame with the largest score
    results['ak'] = results.idxmax(axis=1).values
    accuracy = (testdf.ak.values == results.ak).mean()
    power.append(accuracy)
    #print(f"TAN accuracy: {round(accuracy, 4)}")

# average accuracy over all 1000 splits
answer = sum(power) / len(power)
print(f"final answer: {round(answer,4)}")

res = pd.DataFrame(power, columns=["accuracy"])
res.hist(bins=20)
plt.show()

# optionally dump the raw per-split accuracies to disk:
#with open("results.txt", "w+") as myfile:
#    for line in power:
#        myfile.write(f"{line}\n")
class_col_name = "IsDiabetic" #df = pd.read_csv("../data/Pima.tr.csv") #class_col_name = "type" n = df.shape[0] ind = np.random.rand(n) < 0.75 traindf = df.loc[ind] testdf = df.loc[~ind] traincols = [ 'NoPregnancies', 'PlasmaGlucose', 'DiastolicBP', 'TricepsSkinThickness', '2HourSerumInsulin', 'BMI', 'DiabetesPedigreeFunc', 'Age', 'IsDiabetic' ] nbmodel = NaiveBayes(traindf[traincols], class_col_name=class_col_name, progress_bar=False) results = nbmodel.Predict(newdf=traindf) accuracy = (traindf[class_col_name].values == results[class_col_name]).mean() print(f"TAN accuracy: {round(accuracy, 4)}") Lik = results[[0, 1]] loglike = [] for name, frame in g: s = 1 - frame[name] ## calc deviance from true prob slog = np.log(s).sum() loglike.append(slog) deviance = -2 * sum(loglike) k = traindf.columns.shape[0] - 1 ## -1 for class column n = traindf.shape[0] BIC = deviance + k * (np.log(n) - np.log(2 * np.pi))
# Repeated random-holdout evaluation of NaiveBayes on the Pima data:
# 100 independent 75/25 splits, then a histogram of the test accuracies.
col = 'IsDiabetic'
# alternative data set / sanity checks:
#df = pd.read_csv("../../TAN/data/Pima.tr.csv")
#print(df.dtypes)
#col = 'type'
n = len(df)

results = []  # test accuracy for each random split
for i in tqdm(range(100)):
    # fresh random ~75/25 train/test partition each round
    ind = np.random.rand(n) < 0.75
    traindf = df.loc[ind]
    testdf = df.loc[~ind]

    nbmodel = NaiveBayes(traindf, col)
    testresults = nbmodel.Predict(testdf)
    # predicted label = column of the prediction frame with the largest score
    testresults[col] = testresults.idxmax(axis=1)
    accuracy = np.mean(testdf[col].values == testresults[col])
    results.append(accuracy)

res = pd.DataFrame(results, columns=['accuracy'])
res.hist(bins=20)
plt.show()

# optionally persist the last fitted model:
#with open("tmp.pickle", "wb+") as myfile:
#    pickle.dump(nbmodel, myfile)
#print('delete tmp.pickle')
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 11 12:06:30 2017

@author: jonathan
"""
import sys
sys.path.append("../src/")

import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np

from NaiveBayes import NaiveBayes

# Single 75/25 random-holdout evaluation of NaiveBayes on the digits CSV.
df = pd.read_csv("../data/digits/train.csv")
n = len(df)

# boolean mask: ~75% of rows for training, the remainder for testing
ind = np.random.rand(n) < 0.75
traindf = df.loc[ind]
testdf = df.loc[~ind]

## build model
nbmodel = NaiveBayes(traindf, 'label', progress_bar=True)

## test model and get predictions
testresults = nbmodel.Predict(testdf, progress_bar=True)

## compare accuracy
accuracy = np.mean(testdf['label'].values == testresults['label'])
print(accuracy)