Ejemplo n.º 1
0
class CrossValidation(object):
    """Single hold-out evaluation of a Naive Bayes tweet classifier.

    Labelled tweets are gathered per category, shuffled, split once into a
    training part and a test part, and the classifier is fitted on the
    former and queried on the latter.
    """

    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        """Return a shuffled list of ``(tweet, category)`` pairs.

        user_ids: mapping from category to the IDs passed to ``get_tweets``.
        """
        data = []
        for category, ids in user_ids.items():
            # Pair every tweet with its category directly, so the result of
            # get_tweets only has to be iterable (no len() required).
            data.extend((tweet, category) for tweet in get_tweets(ids))

        np.random.shuffle(data)
        return data

    @staticmethod
    def _unzip(pairs):
        """Transpose ``[(x, y), ...]`` into ``(xs, ys)``.

        An empty input gives ``((), ())`` so callers can always unpack the
        result into two names.
        """
        if len(pairs) == 0:
            # zip(*[]) yields nothing at all, which cannot be unpacked.
            return (), ()
        return tuple(zip(*pairs))

    def split(self, data, test_percentage):
        """Split ``data`` into ``(training, test)``.

        data: sequence of ``(tweet, category)`` pairs.
        test_percentage: fraction of ``data`` (rounded down) placed in the
        test part, taken from the end of the sequence.

        Each returned part is a ``(tweets, categories)`` pair of tuples.
        """
        n_test = int(len(data) * test_percentage)
        n_training = len(data) - n_test

        training = self._unzip(data[:n_training])
        test = self._unzip(data[n_training:])
        return training, test

    def show_tweets_with_labels(self, tweets, labels):
        """Print every tweet below its label, separated by blank lines."""
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        Fit the classifier on the training part and predict the test part.

        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted
        from tweets.
        verbose: when true, print each test tweet with its predicted label.

        Returns ``(results, answers)``: the predictions for the test tweets
        and their true categories.

        Raises ValueError if test_percentage is outside [0, 1].
        """

        if not 0 <= test_percentage <= 1:
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        train_tweets, categories = training
        self.classifier.fit(train_tweets, categories)

        test_tweets, answers = test
        results = self.classifier.predict(test_tweets)

        if verbose:
            self.show_tweets_with_labels(test_tweets, results)

        return results, answers
Ejemplo n.º 2
0
class CrossValidation(object):
    """Evaluate a Naive Bayes tweet classifier on one train/test split.

    The workflow is: collect labelled tweets, shuffle them, cut the list
    into a training and a test portion, fit on the first and predict the
    second.
    """

    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        """Build a shuffled list of ``(tweet, category)`` tuples.

        user_ids: mapping of category -> IDs handed to ``get_tweets``.
        """
        data = []
        for category, ids in user_ids.items():
            for tweet in get_tweets(ids):
                data.append((tweet, category))

        np.random.shuffle(data)
        return data

    def split(self, data, test_percentage):
        """Cut ``data`` into a training and a test portion.

        data: sequence of ``(tweet, category)`` tuples.
        test_percentage: fraction of the items (rounded down) that go to
        the test portion; they are taken from the end of ``data``.

        Returns ``(training, test)`` where each portion is a
        ``(tweets, categories)`` pair of tuples. An empty portion is
        ``((), ())`` rather than an empty iterator, so it can always be
        unpacked into two names.
        """
        n_test = int(len(data) * test_percentage)
        n_training = len(data) - n_test

        def unzip(pairs):
            # Inverse of zip; zip(*[]) would yield nothing for an empty
            # slice, hence the explicit empty pair.
            return tuple(zip(*pairs)) if len(pairs) else ((), ())

        return unzip(data[:n_training]), unzip(data[n_training:])

    def show_tweets_with_labels(self, tweets, labels):
        """Print each label followed by its tweet and a blank line."""
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        Train on one portion of the tweets and predict the other.

        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted
        from tweets.
        verbose: print the test tweets with their predicted labels.

        Returns a ``(results, answers)`` pair: predicted and true
        categories of the test tweets.

        Raises ValueError when test_percentage is not within [0, 1].
        """

        if not 0 <= test_percentage <= 1:
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        training_tweets, categories = training
        self.classifier.fit(training_tweets, categories)

        test_tweets, answers = test
        results = self.classifier.predict(test_tweets)

        if verbose:
            self.show_tweets_with_labels(test_tweets, results)

        return results, answers
Ejemplo n.º 3
0
    if c_true == pos_class:
        pos += 1
    else:
        neg += 1

# One list per ranking criterion. Every entry is a tuple
# (score, flag, true class, first NB prediction) built in the loop below.
result_pos = []  # scored by the accumulated weight of pos_class
result_neg = []  # scored by the accumulated weight of all other classes
result_dif = []  # scored by the difference wy - wn
result_nor = []  # scored by the normalised weight wy / (wy + wn)
for (v, c_true) in d.test_set:
    # NOTE(review): the string below is a bare expression statement used as
    # a block comment; it has no runtime effect.
    """
    prepare predictions for sorting
    in case of equal weight, positive instances come first
    store both true class and first NB prediction
    """
    c_pred_nb = prnb.predict(v)
    # wy: summed value_weight for pos_class; wn: summed over the other classes.
    wy = 0
    wn = 0
    for c in prnb.clssprobs:
        if c == pos_class: wy += prnb.value_weight(v, c)
        else: wn += prnb.value_weight(v, c)
    result_dif.append((wy - wn, c_true == pos_class, c_true, c_pred_nb[0]))
    result_pos.append((wy, c_true == pos_class, c_true, c_pred_nb[0]))
    # The flag is inverted here: it is true when the instance is NOT positive.
    result_neg.append((wn, c_true != pos_class, c_true, c_pred_nb[0]))
    # NOTE(review): divides by zero if wy + wn == 0 -- confirm value_weight
    # can never make both sums vanish.
    result_nor.append(
        (wy / (wy + wn), c_true == pos_class, c_true, c_pred_nb[0]))

# Orange line from just below (0, 0) to just above (1, 1).
plt.plot([-0.001, 1.001], [-0.001, 1.001],
         color="orange")  # diagonal reference

# Counter initialised for the code that follows (presumably true positives
# -- TODO confirm against the rest of the script).
trpos = 0
Ejemplo n.º 4
0
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from naivebayes import NaiveBayes


def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    y_true, y_pred: label sequences of identical shape (lists, tuples or
    NumPy arrays).

    Raises ValueError if the two inputs do not have the same shape.
    """
    # Convert first: `==` between two plain lists yields one bool instead of
    # an element-wise mask, which would silently give a wrong ratio.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape.")
    return np.sum(y_true == y_pred) / len(y_true)


# Synthetic two-class problem: 1000 samples with 10 features each.
X, y = datasets.make_classification(
    n_samples=1000, n_features=10, n_classes=2, random_state=123
)

# Keep 20% of the samples aside for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Fit on the training part, then predict the held-out part.
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

score = accuracy(y_test, predictions)
print("Naive Bayes classification accuracy", score)
Ejemplo n.º 5
0
from data import Data
from naivebayes import NaiveBayes

# Dataset file to load; uncomment one of the alternatives to try another.
filename = "datasets/weatherNominal.td"
# filename = "datasets/titanic.td"
# filename = "datasets/cmc.td"

# Load the data and call its report() (presumably prints a summary).
d = Data(filename)
d.report()

# Build the predictor on the loaded data, train it and display it via show().
pr = NaiveBayes(d)
pr.train()
pr.show()

# For every (instance, true class) pair of the test set, print the instance
# followed by the first element of the prediction and the true class.
for (v, c_true) in d.test_set:
    c_pred = pr.predict(v)[0]
    print(v, ":")
    print("   ", c_pred, "( true class:", c_true, ")")

# Manual spot checks (apparently for the titanic dataset), left disabled:
#    print(pr.predict(("Class:1st","Sex:Female","Age:Child")))

#    print(pr.predict(("Class:Crew","Sex:Female","Age:Child")))
Ejemplo n.º 6
0
def accuracy(y_true, y_pred):
    """Return the share of entries in ``y_pred`` that match ``y_true``.

    Both arguments may be lists, tuples or NumPy arrays, as long as they
    have the same shape.

    Raises ValueError when the shapes differ.
    """
    # Plain Python sequences compare as a whole with `==` (one bool), so
    # coerce to arrays to get an element-wise comparison.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape != pred_arr.shape:
        raise ValueError("y_true and y_pred must have the same shape.")
    return np.sum(true_arr == pred_arr) / len(true_arr)


# 1000 two-dimensional points drawn around three cluster centres.
X, y = datasets.make_blobs(
    n_samples=1000,
    n_features=2,
    centers=3,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=123,
    return_centers=False,
)

# NOTE(review): test_size=0.9 leaves only 10% of the points for training --
# confirm this is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=1234
)

clf = NaiveBayes()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(accuracy(y_test, y_pred))

# One matplotlib colour code per predicted class label.
color_map = {0: "r", 1: "k", 2: "g"}

label_color = [color_map[label] for label in y_pred]
plt.scatter(X_test[:, 0], X_test[:, 1], c=label_color)
plt.show()
filename = "ds/titanicTr.txt"

# NOTE(review): the meaning of the second argument (75) is defined by Data;
# presumably the percentage of rows used for training -- confirm.
d = Data(filename, 75)

# Train both predictors on the same data.
prmap = MaxAPost(d)
prmap.train()

prnb = NaiveBayes(d)
prnb.train()

# One confusion matrix per predictor, indexed as mat[predicted, true].
cmmap = ConfMat(prmap.clsscnts)
cmnb = ConfMat(prnb.clsscnts)

# Test instances on which the predictors disagree, or where at least one of
# them returned no prediction at all.
comparing = set()
for v, c_true in d.test_set:
    c_pred_map = tuple(prmap.predict(v))
    c_pred_nb = tuple(prnb.predict(v))
    if c_pred_map and c_pred_nb:
        warn = c_pred_map[0] != c_pred_nb[0]
        cmmap.mat[c_pred_map[0], c_true] += 1
        cmnb.mat[c_pred_nb[0], c_true] += 1
    else:
        # An empty prediction is never counted in the matrices, only listed.
        warn = True
    if warn:
        comparing.add((v, c_true, c_pred_map, c_pred_nb))

# print() calls instead of Python 2 print statements, which are syntax
# errors under Python 3; the output layout is unchanged.
print()
for r in sorted(comparing):
    print(r[0], ": true class ", r[1])
    # end=" " keeps both predictions on one line, as the trailing comma of
    # the former print statement did.
    print("    MAP pred", r[2], end=" ")
    print("    NB pred", r[3])
Ejemplo n.º 8
0
    validation_set_size = 10000
    train_set, validation_set = split_train_validation(dataset,
                                                       validation_set_size)
    num_to_train_on = 10000000
    time_before("training adaboost")
    ab.train_set(dataset[:num_to_train_on])
    time_after("training adaboost")
    time_before("training naive bayes")
    nb.train_set(dataset[:num_to_train_on])
    time_after("training naive bayes")

    kg_validations_nb = []
    kg_validations_ab = []

    for i in validation_set:
        kg_validations_nb.append(nb.predict(*i[1:]) == i[0])
        kg_validations_ab.append(ab.predict(*i[1:]) == i[0])

    # print("Errors nb: %s " % sum([0 if i else 1 for i in kg_validations_nb]))
    print("Errors ab: %s " % sum([0 if i else 1 for i in kg_validations_ab]))

    # import pdb; pdb.set_trace()

    predictions = []

    print("creating predictions...")
    with open(testset, "r") as testfile:
        data = testfile.read()
        lines = data.split('\n')[1:][:num_to_train_on]
        for line in lines:
            if not line:
Ejemplo n.º 9
0
# Set up the text preprocessing engine (n-gram settings: ngrams_n=4,
# ngrams_count=2000) and load the dataset's stop words into it.
textPrep = TextPreprocessing(ngrams_n=4,
                             ngrams_count=2000).load(dataset.stopWords)

# Set up the FeaturesMatrixBuilder on top of the dataset and preprocessor.
featuresMatrix = FeaturesMatrixBuilder(dataset, textPrep)

# Train on the first 22000 reviews only, so the rest can serve as a
# validation set.
XTrain, yTrain = featuresMatrix.buildTrainingData()
nb = NaiveBayes()
nb.fit(XTrain[:22000, :], yTrain[:22000])

# Validate on the remaining rows (index 22000 onwards).
# NOTE(review): `y` is assigned but the confusion matrix below re-slices
# yTrain instead of using it.
y = yTrain[22000:]
yhat = nb.predict(XTrain[22000:, :])
m = getConfusionMatrix(yTrain[22000:], yhat)
print("\n=== RESULTS ===")
# `t` is not defined in this part of the script; presumably a timer started
# earlier -- TODO confirm.
endTimer(t)
printResults(m)

# Refit on the complete training set, then predict the test set.
print("Training using the whole training set this time")
nb.fit(XTrain, yTrain)
(XTest, ids) = featuresMatrix.buildTestData()
yhat = nb.predict(XTest)
with open("output/test.txt", "w") as f:
    t = timer()
    print("Writing the test results file")
    f.write("Id,Category\n")
    for i, yi in tqdm(enumerate(yhat)):
Ejemplo n.º 10
0
def simplified_bayes(train_letters, test_letters, prior):
    """Predict every test letter with Naive Bayes and join the predictions.

    A ``NaiveBayes`` model is built from ``train_letters`` and ``prior``.
    Its ``predict`` result for each item of ``test_letters`` is
    concatenated, in order, into a single string.
    """
    model = NaiveBayes(train_letters, prior)
    predicted = []
    for letter in test_letters:
        predicted.append(model.predict(letter))
    return ''.join(predicted)