Example #1
# Constructor excerpt from the NBStopWords class (shown in full in Example #2).
def __init__(self, train_file, test_file, N):
    self.V = ['lib', 'con']
    self.test_file = test_file
    self.NB = NaiveBayes(train_file)
    self.N = N
    self.exclude_stopwords()
    self.NB.calculate_p_wv()
    self.test()
Example #2
class NBStopWords:
    def __init__(self, train_file, test_file, N):
        self.V = ['lib', 'con']
        self.test_file = test_file
        self.NB = NaiveBayes(train_file)
        self.N = N
        self.exclude_stopwords()
        self.NB.calculate_p_wv()
        self.test()

    def exclude(self, word, counter):
        if word in counter:
            del counter[word]

    def exclude_stopwords(self):
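        # Treat the N most frequent vocabulary words as stopwords: remove them
        # from both per-class word counters, then from the vocabulary itself.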
        stopwords = list()
        for word in self.NB.vocabulary.most_common(self.N):
            w = word[0]
            stopwords.append(w)
            self.exclude(w, self.NB.text_libs)
            self.exclude(w, self.NB.text_cons)
        for word in stopwords:
            self.exclude(word, self.NB.vocabulary)

    def test(self):
        # Classify every document listed in the test file and report accuracy.
        accurate = 0
        total = 0
        with open(self.test_file, "r") as docs:
            for doc in docs:
                total += 1
                doc = doc.rstrip("\n\r")
                if self.V[0] in doc:
                    label = self.V[0]
                else:
                    label = self.V[1]
                with open(doc, "r") as text:
                    v_lib, v_con = self.NB.classify(text)
                if v_lib > v_con:
                    v_nb = self.V[0]
                    print("L")
                else:
                    v_nb = self.V[1]
                    print("C")
                if v_nb == label:
                    accurate += 1
        accuracy = float(accurate) / total
        print("Accuracy: %.04f" % accuracy)
Example #3
def digitClassification(percent: int):
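    # Train and time three classifiers (Perceptron, Naive Bayes, KNN) on the
    # digit data; `percent` controls how much of the 450-image training set is used.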
    n_total_digits = 450
    n_samples_digits = int(n_total_digits * (percent / 100))
    n_testing_digits = 1000
    digit_list = read_digits_file("digitdata/trainingimages", n_samples_digits)
    digit_test_list = read_digits_file("digitdata/testimages",
                                       n_testing_digits)

    y = np.array(
        util.loadLabelsFile("digitdata/traininglabels", n_samples_digits))

    train_y = y
    test_y = np.array(
        util.loadLabelsFile("digitdata/testlabels", n_testing_digits))

    x = perceptron_faces_features(digit_list)
    train_x = x
    test_x = perceptron_faces_features(digit_test_list)
    p_model = Perceptron(100)

    start_time = time.time()
    p_model.train(train_x, train_y)
    elapsed_time = time.time() - start_time
    print('%.3f' % (elapsed_time) + " seconds for training")

    start_time = time.time()
    matches = list(p_model.predict(test_x) == test_y).count(True)
    elapsed_time = time.time() - start_time
    print('%.3f' % (elapsed_time) + " seconds for predicting")

    accuracy = matches / n_testing_digits
    print('Perceptron accuracy:', 100 * accuracy, '%')
    print()

    x = nb_faces_features(digit_list)
    train_x = x
    test_x = nb_faces_features(digit_test_list)
    nb_model = NaiveBayes()

    start_time = time.time()
    nb_model.train(train_x, train_y)
    elapsed_time = time.time() - start_time
    print('%.3f' % (elapsed_time) + " seconds for training")

    start_time = time.time()
    matches = list(nb_model.predict(test_x) == test_y).count(True)
    elapsed_time = time.time() - start_time
    print('%.3f' % (elapsed_time) + " seconds for predicting")

    accuracy = matches / n_testing_digits
    print('Naive Bayes accuracy:', 100 * accuracy, '%')
    print()

    x = knn_faces_features(digit_list)
    train_x = x
    test_x = knn_faces_features(digit_test_list)
    knn_model = KNN()

    start_time = time.time()
    knn_model.train(train_x, train_y)
    elapsed_time = time.time() - start_time
    print('%.3f' % (elapsed_time) + " seconds for training")

    start_time = time.time()
    matches = list(knn_model.predict(test_x) == test_y).count(True)
    elapsed_time = time.time() - start_time
    print('%.3f' % (elapsed_time) + " seconds for predicting")

    accuracy = matches / n_testing_digits
    print('KNN accuracy:', 100 * accuracy, '%')
    print()
Example #4
from nb import NaiveBayes
import numpy as np
import scipy.io as sio

from sklearn.metrics import zero_one_loss

# change this to the path where XwindowsDocData.mat is stored
mat_dict = sio.loadmat('XwindowsDocData.mat')

Xtrain = mat_dict['xtrain'].toarray()
Xtest = mat_dict['xtest'].toarray()

ytrain = mat_dict['ytrain'].flatten()
ytest = mat_dict['ytest'].flatten()

nb = NaiveBayes()
pi, theta = nb.fit(Xtrain, ytrain)

ypred_train = nb.predict(Xtrain)
ypred_test = nb.predict(Xtest)

print(ypred_train)
print(ypred_test)

# the classes are labeled 1 and 2, so shift the argmax column index by 1
ypred_train = 1 + ypred_train.argmax(axis=1)
ypred_test = 1 + ypred_test.argmax(axis=1)

print(ypred_train[-20:])
print(ytrain[-20:])
print(ypred_test[-20:])
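
# zero_one_loss is imported above but never used; a minimal sketch of the
# error check it suggests (an addition, not part of the original):
print("train error:", zero_one_loss(ytrain, ypred_train))
print("test error:", zero_one_loss(ytest, ypred_test))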
Example #5
from nb import NaiveBayes
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np


def accuracy(pred, label):
    acc = np.sum(pred == label) / len(label)
    return acc


X, y = datasets.make_classification(n_samples=1000,
                                    n_features=4,
                                    n_classes=2,
                                    random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)

model = NaiveBayes()
model.fit(X_train, y_train)
predict = model.predict(X_test)

acc = accuracy(predict, y_test)
print(acc)
Example #6
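# Benchmark three classifiers (ANN, SVM, Naive Bayes) on the mushrooms.csv
# data set: time train + predict for each and print its accuracy.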
data = Data("mushrooms.csv")
method = ANN(data)
i = time()
method.train()
method.predict()
tempo = time() - i
result = method.getPercentage()
print('Time (s):', tempo)
print('Accuracy:', result)
print()

data = Data("mushrooms.csv")
method = SVM(data)
i = time()
method.train()
method.predict()
tempo = time() - i
result = method.getPercentage()
print('Time (s):', tempo)
print('Accuracy:', result)
print()

data = Data("mushrooms.csv")
method = NaiveBayes(data)
i = time()
method.train()
method.predict()
tempo = time() - i
result = method.getPercentage()
print('Time (s):', tempo)
print('Accuracy:', result)
Example #7
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from nb import NaiveBayes


def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


X, y = datasets.make_classification(n_samples=1000,
                                    n_features=10,
                                    n_classes=2,
                                    random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

# print(type(y_train))
print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
Example #8
def main():

    start_time = time.time()

    # Read documents, divide according to the topics and separate train and test data-set.

    t_path = os.getcwd() + "/bbc/"

    all_docs = defaultdict(lambda: list())

    topic_list = list()

    print("Reading all the documents...\n")

    for topic in os.listdir(t_path):
        d_path = t_path + topic + '/'
        topic_list.append(topic)
        temp_docs = list()

        for f in os.listdir(d_path):
            f_path = d_path + f
            temp_docs.append(Document(f_path, topic))

        all_docs[topic] = temp_docs[:]

    fold_count = 10

    train_docs, test_docs = list(), list()

    for key, value in all_docs.items():
        random.shuffle(value)
        test_len = int(len(value) / fold_count)
        train_docs += value[:-test_len]
        test_docs += value[-test_len:]

    # Create tfidf and tfidfie index of training docs, and store into the docs.
    index = Index(train_docs)

    print("Train Document Count: " + str(len(train_docs)))
    print("Test  Document Count: " + str(len(test_docs)))

    test_topics = [d.topic for d in test_docs]

    for doc in train_docs:
        doc.vector = doc.tfidfie

    for doc in test_docs:
        doc.vector = doc.tf

    # create classifier instances.
    nb = NaiveBayes()
    rc = RankClassifier()
    kmeans = KMeans(topic_list)

    classifier_list = [rc, nb, kmeans]

    for i in range(len(classifier_list)):

        print("\nClassifier #" + str(i + 1) + "\n")

        classifier = classifier_list[i]

        classifier.confusion_matrix, c_dict = init_confusion_matrix(topic_list)

        print("Training...\n")

        classifier.train(train_docs)

        print("Testing... Classifying the test docs...\n")

        predictions = classifier.classify(test_docs)

        # Update the confusion matrix and statistics with updated values.
        classifier.confusion_matrix = update_confusion_matrix(
            test_topics, predictions, classifier.confusion_matrix, c_dict)

        classifier.stats = cal_stats(classifier.confusion_matrix)

        print("Confusion Matrix\n")
        for item in classifier.confusion_matrix:
            print(item)

        print("\nStatistics\n")
        print_table(get_stats_table(classifier.stats))

    print("Run time...{} secs \n".format(round(time.time() - start_time, 4)))

    # call recommendation system once classifiers are ready.
    recommendation(all_docs, test_docs, classifier_list)
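
# To run the example as a script (an addition; it assumes the bbc/ corpus and
# the helper modules used above are importable):
if __name__ == '__main__':
    main()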
Example #9
# label_encoder = LabelEncoder()

# for col in X.columns:
#     X[col] = label_encoder.fit_transform(X[col])

# y['Play'] = label_encoder.fit_transform(y['Play'])
# print(X.head())
# print(y)

# X = np.array(X, dtype=np.float64)
# y = np.array(y, dtype=np.float64)

# X, y = datasets.make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=123)
# Imports needed to run the uncommented lines below; the `nb` module name is
# an assumption carried over from the other examples.
from nb import NaiveBayes
from sklearn import datasets
from sklearn.model_selection import train_test_split

X, y = datasets.make_classification(n_samples=100, n_features=4, n_classes=2, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# print(len(y[y == 0]) / len(y))
# print(len(y[y == 1]) / len(y))

nb = NaiveBayes()
# print(X_test)
nb.fit(X_train, y_train)

# predictions = nb.predict(X_test)
# print(predictions)

# predictions = nb.predict([[ 0.83617024,  0.47576265, 0.76693704,  1.54433392]])
# print(predictions)

# print('Naive Bayes Classification Accuracy: ', accuracy(y_test, predictions))
Example #10
from nb import NaiveBayes
from itertools import islice
import sys

if __name__ == '__main__':
    nb = NaiveBayes(sys.argv[1])
    nb.calculate_p_wv()
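    # Show the 20 words with the highest conditional probability P(w|v) for
    # each of the two classes.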
    for w, p in islice(
            sorted(nb.p_wv.items(), key=lambda pair: pair[1][0], reverse=True),
            20):
        print(w + " %.04f" % p[0])
    print()
    for w, p in islice(
            sorted(nb.p_wv.items(), key=lambda pair: pair[1][1], reverse=True),
            20):
        print(w + " %.04f" % p[1])
Example #11
def main():
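    # Same BBC classification pipeline as Example #8, but reading the corpus
    # from ../data_set/bbc/.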

    start_time = time.time()

    t_path = "../data_set/bbc/"

    all_docs = defaultdict(lambda: list())

    topic_list = list()

    print("Reading all the documents...\n")
    print(os.listdir(t_path))
    for topic in os.listdir(t_path):
        d_path = t_path + topic + '/'
        topic_list.append(topic)
        temp_docs = list()

        for f in os.listdir(d_path):
            f_path = d_path + f
            temp_docs.append(Document(f_path, topic))

        all_docs[topic] = temp_docs[:]

    fold_count = 10

    train_docs, test_docs = list(), list()

    for key, value in all_docs.items():
        random.shuffle(value)
        test_len = int(len(value) / fold_count)
        train_docs += value[:-test_len]
        test_docs += value[-test_len:]

    index = Index(train_docs)

    print("Train Document Count: " + str(len(train_docs)))
    print("Test  Document Count: " + str(len(test_docs)))

    test_topics = [d.topic for d in test_docs]

    for doc in train_docs:
        doc.vector = doc.tfidfie

    for doc in test_docs:
        doc.vector = doc.tf

    nb = NaiveBayes()
    rc = RankClassifier()
    kmeans = KMeans(topic_list)

    classifier_list = [rc, nb, kmeans]

    for i in range(len(classifier_list)):

        print("\nClassifier #" + str(i + 1) + "\n")

        classifier = classifier_list[i]

        classifier.confusion_matrix, c_dict = init_confusion_matrix(topic_list)

        print("Training...\n")

        classifier.train(train_docs)

        print("Testing... Classifying the test docs...\n")

        predictions = classifier.classify(test_docs)

        classifier.confusion_matrix = update_confusion_matrix(
            test_topics, predictions, classifier.confusion_matrix, c_dict)

        classifier.stats = cal_stats(classifier.confusion_matrix)

        print("Confusion Matrix\n")
        for item in classifier.confusion_matrix:
            print(item)

        print("\nStatistics\n")
        print_table(get_stats_table(classifier.stats))

    print("Run time...{} secs \n".format(round(time.time() - start_time, 4)))

    recommendation(all_docs, test_docs, classifier_list)
def main():

    start_time = time.time()

    # Read documents, divide according to the topics and separate train and test data-set.

    t_path = "../bbc/"

    all_docs = defaultdict(lambda: list())

    topic_list = list()
    for topic in os.listdir(t_path):
        d_path = t_path + topic + '/'
        topic_list.append(topic)
        temp_docs = list()

        for f in os.listdir(d_path):
            f_path = d_path + f
            temp_docs.append(Document(f_path, topic))

        all_docs[topic] = temp_docs[:]
    fold_count = 10

    train_docs, test_docs = list(), list()

    for key, value in all_docs.items():
        random.shuffle(value)
        test_len = int(len(value) / fold_count)
        train_docs += value[:-test_len]
        # Slicing reminder: for lis = [1, 2, 3, 4, 5] and test_len = 4,
        # lis[:-4] -> [1] (training share) and lis[-4:] -> [2, 3, 4, 5]
        # (test share).
        test_docs += value[-test_len:]

    # Create tfidf and tfidfie index of training docs, and store into the docs.
    index = Index(train_docs)

    test_topics = [d.topic for d in test_docs]

    for doc in train_docs:
        doc.vector = doc.tfidfie

    for doc in test_docs:
        doc.vector = doc.tf

    # create classifier instances.
    nb = NaiveBayes()
    rc = RankClassifier()
    kmeans = KMeans(topic_list)

    classifier_list = [nb, rc, kmeans]

    for i in range(len(classifier_list)):

        classifier = classifier_list[i]

        classifier.confusion_matrix, c_dict = init_confusion_matrix(topic_list)

        classifier.train(train_docs)
        predictions = classifier.classify(test_docs)

        # Update the confusion matrix and statistics with updated values.
        classifier.confusion_matrix = update_confusion_matrix(
            test_topics, predictions, classifier.confusion_matrix, c_dict)

        classifier.stats = cal_stats(classifier.confusion_matrix)

    # Stash the results in a module-level list, presumably for the /recommend
    # route to read after the redirect below.
    global lst
    lst = []
    lst.append(all_docs)
    lst.append(test_docs)
    lst.append(classifier_list)
    return redirect('http://localhost:5000/recommend')