Example #1
def test_std_deviation():

    classifier = NaiveBayesClassifier()
    numbers = [0.5, 1, 4.56, 3]

    assert np.around(classifier.std_deviation(numbers), 13) == 1.8728498783049
    assert classifier.std_deviation(numbers) == np.std(numbers, ddof=1)
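
The assertions pin std_deviation to the sample standard deviation (ddof=1, i.e. Bessel's correction). A minimal sketch of an implementation that would satisfy them, shown as a free function rather than a method:

import math

def std_deviation(numbers):
    # Sample standard deviation with Bessel's correction (ddof=1),
    # the same quantity as np.std(numbers, ddof=1).
    mean = sum(numbers) / len(numbers)
    variance = sum((x - mean) ** 2 for x in numbers) / (len(numbers) - 1)
    return math.sqrt(variance)
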
Example #2

def test_load_dataset_from_csv():

    classifier = NaiveBayesClassifier()

    csv_filename = 'datasets/iris.csv'

    data_0 = ['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
    data_2 = ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa']
    data_39 = ['5.1', '3.4', '1.5', '0.2', 'Iris-setosa']
    data_60 = ['5.0', '2.0', '3.5', '1.0', 'Iris-versicolor']
    data_81 = ['5.5', '2.4', '3.7', '1.0', 'Iris-versicolor']
    data_89 = ['5.5', '2.5', '4.0', '1.3', 'Iris-versicolor']
    data_104 = ['6.5', '3.0', '5.8', '2.2', 'Iris-virginica']
    data_110 = ['6.5', '3.2', '5.1', '2.0', 'Iris-virginica']
    data_125 = ['7.2', '3.2', '6.0', '1.8', 'Iris-virginica']
    data_143 = ['6.8', '3.2', '5.9', '2.3', 'Iris-virginica']

    loaded_dataset = classifier.load_dataset_from_csv(csv_filename)

    assert loaded_dataset[0] == data_0
    assert loaded_dataset[2] == data_2
    assert loaded_dataset[39] == data_39
    assert loaded_dataset[60] == data_60
    assert loaded_dataset[81] == data_81
    assert loaded_dataset[89] == data_89
    assert loaded_dataset[104] == data_104
    assert loaded_dataset[110] == data_110
    assert loaded_dataset[125] == data_125
    assert loaded_dataset[143] == data_143

    csv_filename_2 = 'tests/unit_tests/resources/load_test.csv'

    loaded_dataset_2 = classifier.load_dataset_from_csv(csv_filename_2)

    assert len(loaded_dataset_2) == 3
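
The test compares rows against lists of strings, so the loader presumably returns raw CSV fields without type conversion. A minimal sketch, assuming the standard csv module and skipping blank lines:

import csv

def load_dataset_from_csv(filename):
    # Return every non-empty row as a list of strings; type conversion
    # happens later (see map_class_names_to_ints below).
    with open(filename, newline='') as csv_file:
        return [row for row in csv.reader(csv_file) if row]
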
Example #3

def test_map_class_names_to_ints():

    classifier = NaiveBayesClassifier()

    dataset = [['3.393533211', '2.331273381', '0'],
               ['3.110073483', '1.781539638', '0'],
               ['1.343808831', '3.368360954', '0'],
               ['3.582294042', '4.67917911', '0'],
               ['2.280362439', '2.866990263', '0'],
               ['7.423436942', '4.696522875', '1'],
               ['5.745051997', '3.533989803', '1'],
               ['9.172168622', '2.511101045', '1'],
               ['7.792783481', '3.424088941', '1'],
               ['7.939820817', '0.791637231', '1']]

    classifier.map_class_names_to_ints(dataset, len(dataset[0]) - 1, True)

    # after mapping, every row's class column should be an int
    for row in dataset:
        assert isinstance(row[-1], int)

    classifier.map_class_names_to_ints(dataset, len(dataset[0]) - 1, False)

    for row in dataset:
        assert isinstance(row[-1], int)
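
The test only checks that the class column ends up as ints; the meaning of the boolean flag is not visible here. A hypothetical sketch in which the flag controls the sort order of the label-to-int mapping:

def map_class_names_to_ints(dataset, column, reverse_sort):
    # Hypothetical: assign each distinct label in `column` an int and
    # rewrite the column in place; already-mapped values pass through.
    labels = sorted({row[column] for row in dataset}, reverse=reverse_sort)
    mapping = {label: i for i, label in enumerate(labels)}
    for row in dataset:
        row[column] = mapping.get(row[column], row[column])
    return mapping
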
Example #4
    def __init__(self):

        self.dataset_filename = 'datasets/iris.csv'
        self.description_filename = 'datasets/iris.names'
        self.nbc = NaiveBayesClassifier()
        self.dataset = self.nbc.load_dataset_from_csv(self.dataset_filename)
        self.class_map = dict()
Example #5

def test_gaussian_probability():

    classifier = NaiveBayesClassifier()

    numbers = [[1.0, 1.0, 1.0], [2.0, 1.0, 1.0], [0.0, 1.0, 1.0]]
    results = [0.3989422804014327, 0.24197072451914337, 0.24197072451914337]

    for i in range(0, len(numbers)):
        assert classifier.gaussian_probability(numbers[i][0], numbers[i][1],
                                               numbers[i][2]) == results[i]
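
The expected values are exactly the normal probability density at x for the given mean and standard deviation; for instance f(1; 1, 1) = 1/sqrt(2*pi) ≈ 0.39894. A minimal sketch:

import math

def gaussian_probability(x, mean, stdev):
    # Normal pdf: exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)
    exponent = math.exp(-((x - mean) ** 2) / (2 * stdev ** 2))
    return exponent / (math.sqrt(2 * math.pi) * stdev)
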
Example #6
def test_gather_data_params():

    classifier = NaiveBayesClassifier()

    dataset = [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0], [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]

    results_dataset = [(5.178333386499999, 2.7665845055177263, 10),
                       (2.9984683241, 1.218556343617447, 10)]
    test_results = classifier.gather_data_params(dataset)

    assert test_results == results_dataset
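
The expected output is one (mean, sample standard deviation, count) tuple per feature column, with the trailing class column dropped; 5.17833... is the mean of the first column. A sketch in terms of the arithmetic_mean and std_deviation helpers tested elsewhere on this page:

def gather_data_params(dataset):
    # Transpose rows into columns, summarise each column, drop the class column.
    params = [(arithmetic_mean(list(col)), std_deviation(list(col)), len(col))
              for col in zip(*dataset)]
    return params[:-1]
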
Example #7

def test_evaluate_algorithm():

    classifier = NaiveBayesClassifier()

    dataset = [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0], [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]

    n_folds = 5
    results_data = classifier.evaluate_algorithm(dataset, n_folds)

    assert len(results_data) == n_folds
    # every fold's score should be a percentage in [0, 100]
    assert all(0 <= data <= 100 for data in results_data)
Example #8
def test_k_fold_cross_validation_split():

    classifier = NaiveBayesClassifier()

    dataset = [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0], [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]

    folds_num = 5

    results_dataset = classifier.k_fold_cross_validation_split(
        dataset, folds_num)

    assert len(results_dataset) == folds_num
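
Only the fold count is asserted here. A common implementation (as in the classic from-scratch tutorials) deals rows into n equal folds by sampling without replacement; a sketch under that assumption:

import random

def k_fold_cross_validation_split(dataset, n_folds):
    # Pop len(dataset) // n_folds random rows into each fold.
    rows = list(dataset)
    fold_size = len(dataset) // n_folds
    return [[rows.pop(random.randrange(len(rows))) for _ in range(fold_size)]
            for _ in range(n_folds)]
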
Example #9
def test_predict():

    classifier = NaiveBayesClassifier()

    dataset = {
        1: [(2.7420144012, 0.9265683289298018, 5),
            (3.0054686692, 1.1073295894898725, 5)],
        0: [(7.6146523718, 1.2344321550313704, 5),
            (2.9914679790000003, 1.4541931384601618, 5)]
    }

    row = [3.7, 2.9, 0]

    results_predict = classifier.predict(dataset, row)

    assert results_predict == 1
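
The parameter dict maps each class label to per-feature (mean, std, count) tuples, so predict presumably scores each class as its prior times the product of Gaussian densities and returns the argmax; for row [3.7, 2.9, 0] that is class 1. A sketch under that assumption, reusing the gaussian_probability sketch above:

def predict(class_params, row):
    # Score = class prior * product of per-feature normal densities.
    total_rows = sum(params[0][2] for params in class_params.values())
    best_label, best_score = None, -1.0
    for label, params in class_params.items():
        score = params[0][2] / total_rows  # class prior
        for value, (mean, stdev, _count) in zip(row, params):
            score *= gaussian_probability(value, mean, stdev)
        if score > best_score:
            best_label, best_score = label, score
    return best_label
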
Example #10
def test_calculate_class_parameters():

    classifier = NaiveBayesClassifier()

    dataset = [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0], [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]

    results_dataset = {
        0: [(2.7420144012, 0.9265683289298018, 5),
            (3.0054686692, 1.1073295894898725, 5)],
        1: [(7.6146523718, 1.2344321550313704, 5),
            (2.9914679790000003, 1.4541931384601618, 5)]
    }

    assert classifier.calculate_class_parameters(dataset) == results_dataset
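
Judging by the other tests on this page, calculate_class_parameters is simply the composition of divide_data_by_class and gather_data_params; a sketch:

def calculate_class_parameters(dataset):
    # Per class: (mean, sample std, count) for every feature column.
    return {label: gather_data_params(rows)
            for label, rows in divide_data_by_class(dataset).items()}
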
Example #11
def main():
    '''Train and evaluate a Naive Bayes classifier on Fashion-MNIST; returns None.'''

    # Load Data
    x_train, y_train, x_test, y_test, label_dict = load_mnist(
        which_type='fashion', threshold=0.5)

    # Get the Model
    nbc = NaiveBayesClassifier()

    # Train
    nbc.fit(x_train, y_train)

    # Test
    predictions = nbc.predict(x_test)

    # Evaluate accuracy
    accuracy = np.sum(np.uint8(predictions == y_test)) / len(y_test)
    print("Accuracy: ", accuracy)

    # Show Confusion Matrix
    plot_confusion_matrix(targets=y_test,
                          predictions=predictions,
                          classes=[label_dict[l] for l in label_dict])

    # Plot random test predictions (a click or keypress advances)
    plt.figure()
    while True:
        idx = np.random.randint(0, x_test.shape[0])
        x = x_test[idx]
        p = predictions[idx]
        y = y_test[idx]

        plt.imshow(x, cmap='gray')
        plt.title('Target: {}, Prediction: {}'.format(label_dict[int(y)],
                                                      label_dict[int(p)]))
        plt.waitforbuttonpress()
Example #12

def test_divide_data_by_class():

    classifier = NaiveBayesClassifier()

    dataset = [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
               [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
               [2.280362439, 2.866990263, 0], [7.423436942, 4.696522875, 1],
               [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1],
               [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]

    results_dataset = {
        0: [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0],
            [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0],
            [2.280362439, 2.866990263, 0]],
        1: [[7.423436942, 4.696522875, 1], [5.745051997, 3.533989803, 1],
            [9.172168622, 2.511101045, 1], [7.792783481, 3.424088941, 1],
            [7.939820817, 0.791637231, 1]]
    }

    assert classifier.divide_data_by_class(dataset) == results_dataset
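
divide_data_by_class groups rows by their trailing class value; a minimal sketch:

def divide_data_by_class(dataset):
    # Group rows by their last element (the class label).
    divided = {}
    for row in dataset:
        divided.setdefault(row[-1], []).append(row)
    return divided
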
Example #13
def main():
    """ Main function """

    # load data
    x_train, y_train, x_test, y_test, label_dict = load_mnist(
        which_type='digits', threshold=0.5)

    # get the model
    nbc = NaiveBayesClassifier()

    # train
    nbc.fit(x_train, y_train)

    # test
    predictions = nbc.predict(x_test)

    # evaluate performances
    accuracy = np.sum(np.uint8(predictions == y_test)) / len(y_test)
    print('Accuracy: {}'.format(accuracy))

    # show confusion matrix
    plot_confusion_matrix(targets=y_test,
                          predictions=predictions,
                          classes=[label_dict[l] for l in label_dict])

    # plot random test predictions (a click or keypress advances)
    plt.figure()
    while True:
        idx = np.random.randint(0, x_test.shape[0])

        x = x_test[idx]
        p = predictions[idx]
        y = y_test[idx]

        plt.imshow(x, cmap='gray')
        plt.title('Target: {}, Prediction: {}'.format(label_dict[int(y)],
                                                      label_dict[int(p)]))
        plt.waitforbuttonpress()
Example #14
'''
print()
print(classification_report(y_test, predictions))
'''

### K-NEAREST NEIGHBORS ###
'''
from sklearn.datasets import load_iris
from knn import KNN

X, y = load_iris(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

model = KNN()

model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
'''

### NAIVE BAYES CLASSIFIER ###
from sklearn.datasets import load_wine
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from naive_bayes import NaiveBayesClassifier

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

model = NaiveBayesClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
Example #15

import sys

# make the chapter exercises importable from their local directory
sys.path.append(
    '/Users/rileylittlefield/Desktop/notes/readingnotes/python-ml/data-science-from-scratch/12-exercises'
)
from data_split_for_model_training import split_data

from naive_bayes import NaiveBayesClassifier
from data_harvester import data
import random
from collections import defaultdict

random.seed(0)
train_data, test_data = split_data(data, 0.75)
print("train_data_length = %s" % len(train_data))
print("test_data_length = %s" % len(test_data))

classifier = NaiveBayesClassifier()
classifier.train(train_data)
classified = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

true_positives = []
true_negatives = []
false_positives = []
false_negatives = []
# positional indices into each classified tuple
subject, classification, predicted_prob = 0, 1, 2
for my_tuple in classified:
    is_spam = my_tuple[classification]
    predict_is_spam = (my_tuple[predicted_prob] > 0.5)
    if is_spam and predict_is_spam:
        true_positives.append(my_tuple)
    elif is_spam:
        false_negatives.append(my_tuple)
    elif predict_is_spam:
        false_positives.append(my_tuple)
    else:
        true_negatives.append(my_tuple)
Example #16
X2 = titanic_data.iloc[:, 1:]
y2 = titanic_data.iloc[:, 0]

########################## Classification ##################################

X_train, X_test, y_train, y_test = train_test_split(X2,
                                                    y2,
                                                    test_size=0.2,
                                                    random_state=42)

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

clf = NaiveBayesClassifier(type='Gaussian')

####### Convert X_train and X_test into an np array for Logistic Regression ######
# clf = LogisticRegression(num_steps=5000, regularisation='L2')

# clf1 = DecisionTree(max_depth=5, split_val_metric='mean', split_node_criterion='gini')
# clf = RandomForest(n_trees=10, sample_size=0.8, max_features=6,
#                    max_depth=5, split_val_metric='mean', split_node_criterion='gini')

##### Stacking: two decision trees and one naive Bayes as base learners; logistic regression is the default meta-learner
# clf = Stacking([(clf, 1), (clf1, 2)])

# clf1 = BoostingDecisionTree(max_depth=5, split_val_metric='mean', split_node_criterion='gini')
# clf = AdaBoostClassifier(n_trees=100, learning_rate=1)

#### For Logistic Regression
Example #17
def test_arithmetic_mean():

    classifier = NaiveBayesClassifier()
    assert classifier.arithmetic_mean(numbers=[1, 2, 3, 4, 5, 6, 7]) == 4
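
For completeness, the matching one-liner (the mean of 1..7 is 28/7 = 4):

def arithmetic_mean(numbers):
    # Plain average of a list of numbers.
    return sum(numbers) / len(numbers)
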
Example #18
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from naive_bayes import NaiveBayesClassifier


def compute_accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)


if __name__ == '__main__':
    X, y = make_classification(n_samples=1000, n_features=10, n_classes=2)
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    clf = NaiveBayesClassifier()
    clf.fit(X_train, Y_train)

    predictions = clf.predict(X_test)

    accuracy = compute_accuracy(Y_test, predictions)
    print("The accuracy of the model is: {}".format(accuracy))
Example #19
import codecs
import os

# ModelReader and NaiveBayesClassifier come from the surrounding project
def train(nb, file):
    correct = 0
    count = 0
    with codecs.open(file, 'r', encoding=ModelReader.encoding) as f:
        for line in f:
            count += 1
            lang, sent = line.strip().split("\t")
            pred_lang = nb.predict(sent)
            print("Predicted {0}, actual {1}".format(pred_lang, lang))
            correct += (pred_lang == lang)
    print("Accuracy: {0}".format(correct / count))

def test(nb, file):
    with codecs.open(file, 'r', encoding=ModelReader.encoding) as f:
        for line in f:
            _, sent = line.strip().split("\t")
            pred_lang = nb.predict(sent)
            print("Predicted '{0}' for '{1}'".format(pred_lang, sent))

if __name__ == "__main__":
    test_pass = True

    dir = "/opt/dropbox/17-18/473/project5/language-models"
    files = [os.path.abspath(os.path.join(dir, file)) for file in os.listdir(dir)]
    lang_file_pairs = { file.split(".")[0][-3:]: ModelReader(file).get() for file in files }
    nb = NaiveBayesClassifier(lang_file_pairs, verbose=True)

    if test_pass:
        test(nb, "/opt/dropbox/17-18/473/project5/test.txt")
    else:
        train(nb, "/opt/dropbox/17-18/473/project5/train.txt")
Example #20

    def __init__(self):

        self.dataset_filename = 'datasets/pima-indians-diabetes.csv'
        self.description_filename = 'datasets/pima-indians-diabetes.names'
        self.nbc = NaiveBayesClassifier()
        self.dataset = self.nbc.load_dataset_from_csv(self.dataset_filename)