Ejemplo n.º 1
0
class Classifier(object):
    """Thin interactive wrapper around a ``NaiveBayes`` classifier.

    The wrapped model is stored in ``self.classifier``; training data is
    fetched through ``get_tweets`` (defined outside this class).
    """

    def __init__(self):
        self.classifier = NaiveBayes()

    def learn_from_tweets(self, user_ids, category):
        """
        Train the classifier on tweets that all share one category.

        user_ids : A list of twitter ids whose tweets belong to the
        category.
        category : The category label attached to every fetched tweet.
        """
        tweets = get_tweets(user_ids)
        categories = [category] * len(tweets)
        # Announce *before* fitting so the message is shown while the
        # training is actually running, not after it has finished.
        print("Training...")
        self.classifier.fit(tweets, categories)

    def predict_user_input(self):
        """
        Classify sentences typed by the user.

        The loop ends when the user enters 'exit' or when standard input
        is exhausted (Ctrl-D, or piped input running out).
        """
        while True:
            try:
                sentence = input("input =>")
            except EOFError:
                # No more input available: leave quietly instead of
                # crashing with a traceback.
                break
            if sentence == 'exit':
                break
            category = self.classifier.predict_(sentence)
            print("{}\n".format(category))

    def save(self, filename):
        """Save the model by delegating to the classifier's dump_json."""
        self.classifier.dump_json(filename)

    def load(self, filename):
        """Load the model by delegating to the classifier's load_json."""
        self.classifier.load_json(filename)
Ejemplo n.º 2
0
class Classifier(object):
    """Command-line front end for a ``NaiveBayes`` model.

    Holds the model in ``self.classifier`` and forwards training,
    prediction and persistence calls to it.
    """

    def __init__(self):
        self.classifier = NaiveBayes()

    def learn_from_tweets(self, user_ids, category):
        """
        Train the classifier by tweets.

        user_ids : A list of twitter ids whose tweets are included
        in the category.
        category : The category of the tweets.
        """
        tweets = get_tweets(user_ids)
        labels = [category] * len(tweets)
        # The progress message is emitted first so that it is visible
        # during the fit instead of once the fit is already done.
        print("Training...")
        self.classifier.fit(tweets, labels)

    def predict_user_input(self):
        """Read user input until 'exit' is entered or input ends.

        Each sentence is passed to the classifier's ``predict_`` and the
        result is printed followed by a blank line.
        """
        prompt = "input =>"
        while True:
            try:
                sentence = input(prompt)
            except EOFError:
                # stdin closed: treat it the same as an explicit 'exit'.
                return
            if sentence == 'exit':
                return
            print("{}\n".format(self.classifier.predict_(sentence)))

    def save(self, filename):
        """Save the model (forwards to the classifier's dump_json)."""
        self.classifier.dump_json(filename)

    def load(self, filename):
        """Load the model from a file (forwards to load_json)."""
        self.classifier.load_json(filename)
Ejemplo n.º 3
0
class CrossValidation(object):
    """Hold-out evaluation of a ``NaiveBayes`` classifier on labelled tweets.

    Tweets are fetched through ``get_tweets`` (defined outside this
    class), shuffled, split into a training and a test part, and the
    classifier stored in ``self.classifier`` is fitted and queried.
    """

    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        """
        Build a shuffled list of (tweet, category) pairs.

        user_ids: Mapping from category to the ids handed to get_tweets.
        """
        data = []
        for category, ids in user_ids.items():
            tweets = get_tweets(ids)
            categories = [category] * len(tweets)
            data += list(zip(tweets, categories))

        np.random.shuffle(data)
        return data

    @staticmethod
    def _unzip(pairs):
        """Turn a list of (tweet, label) pairs into (tweets, labels)."""
        if len(pairs) == 0:
            # zip(*[]) yields nothing, which would break the two-name
            # unpacking done by callers; return two empty tuples instead.
            return (), ()
        tweets, labels = zip(*pairs)
        return tweets, labels

    def split(self, data, test_percentage):
        """
        Split (tweet, label) pairs into a training and a test part.

        The last ``int(len(data) * test_percentage)`` pairs form the test
        part. Returns ``(training, test)`` where each element is a
        ``(tweets, labels)`` pair of tuples; an empty part is returned as
        ``((), ())`` so it can always be unpacked into two names.
        """
        n_test = int(len(data) * test_percentage)
        n_training = len(data) - n_test

        training = self._unzip(data[:n_training])
        test = self._unzip(data[n_training:])
        return training, test

    def show_tweets_with_labels(self, tweets, labels):
        """Print every tweet preceded by its label."""
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        Fit on the training part and predict the test part.

        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted
        from tweets.
        verbose: When true, print each test tweet with its predicted
        label.

        Returns the pair (predicted labels, true labels) for the test
        part. Raises ValueError if test_percentage is outside [0, 1].
        """

        if not (0 <= test_percentage <= 1):
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        tweets, categories = training
        self.classifier.fit(tweets, categories)

        tweets, answers = test
        results = self.classifier.predict(tweets)

        if verbose:
            self.show_tweets_with_labels(tweets, results)

        return results, answers
Ejemplo n.º 4
0
class CrossValidation(object):
    """Train/test evaluation helper built around a ``NaiveBayes`` model.

    The model lives in ``self.classifier``. Tweets come from
    ``get_tweets``, which is defined outside this class.
    """

    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        """
        Collect (tweet, category) pairs for every category and shuffle them.

        user_ids: Mapping of category -> ids that are passed to get_tweets.
        """
        data = []
        for category, ids in user_ids.items():
            tweets = get_tweets(ids)
            categories = [category] * len(tweets)
            data += list(zip(tweets, categories))

        np.random.shuffle(data)
        return data

    @staticmethod
    def _columns(pairs):
        """Return (tweets, labels) tuples for a list of (tweet, label) pairs."""
        # Unzipping an empty list produces no columns at all, so that case
        # is answered explicitly to keep the result a two-element pair.
        if len(pairs) == 0:
            return (), ()
        tweets, labels = zip(*pairs)
        return tweets, labels

    def split(self, data, test_percentage):
        """
        Cut ``data`` into a leading training part and a trailing test part.

        ``int(len(data) * test_percentage)`` pairs go to the test part.
        Both parts are returned as ``(tweets, labels)`` pairs of tuples,
        also when a part is empty.
        """
        n_test = int(len(data) * test_percentage)
        n_training = len(data) - n_test

        head, tail = data[:n_training], data[n_training:]
        return self._columns(head), self._columns(tail)

    def show_tweets_with_labels(self, tweets, labels):
        """Print each label followed by the tweet it belongs to."""
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        Train on one part of the tweets and predict the rest.

        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted
        from tweets.
        verbose: Print the test tweets with their predicted labels.

        Returns (predictions, true labels) for the test part and raises
        ValueError when test_percentage is not within [0, 1].
        """

        if not (0 <= test_percentage <= 1):
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        tweets, categories = training
        self.classifier.fit(tweets, categories)

        tweets, answers = test
        results = self.classifier.predict(tweets)

        if verbose:
            self.show_tweets_with_labels(tweets, results)

        return results, answers
Ejemplo n.º 5
0
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from naivebayes import NaiveBayes


def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both arguments are converted with ``np.asarray`` so plain Python
    sequences are compared element by element (comparing two lists with
    ``==`` would yield a single bool and a wrong score).

    Raises ValueError when the two inputs do not have the same shape,
    instead of silently broadcasting them against each other.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape, "
                         "got {} and {}".format(y_true.shape, y_pred.shape))
    return np.sum(y_true == y_pred) / len(y_true)


# Synthetic two-class data set with ten features, generated from a fixed seed.
X, y = datasets.make_classification(
    n_samples=1000, n_features=10, n_classes=2, random_state=123
)

# Keep 20% of the samples aside for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Fit on the training part, then predict the held-out part.
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
Ejemplo n.º 6
0
def accuracy(y_true, y_pred):
    """Return the share of entries in ``y_pred`` that match ``y_true``.

    The inputs are coerced to NumPy arrays first: with plain lists the
    expression ``y_true == y_pred`` is one bool, not an element-wise
    comparison, and the score would be wrong.

    Raises ValueError if the inputs differ in shape.
    """
    truth = np.asarray(y_true)
    guess = np.asarray(y_pred)
    if truth.shape != guess.shape:
        raise ValueError("y_true and y_pred must have the same shape, "
                         "got {} and {}".format(truth.shape, guess.shape))
    return np.sum(truth == guess) / len(truth)


# Three 2-D blobs generated from a fixed seed.
X, y = datasets.make_blobs(
    n_samples=1000,
    n_features=2,
    centers=3,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=123,
    return_centers=False,
)

# 90% of the samples are held out for testing; only 10% are used to fit.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=1234
)

clf = NaiveBayes()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(accuracy(y_test, y_pred))

# One colour per predicted class label.
color_map = {0: 'r', 1: 'k', 2: 'g'}

label_color = [color_map[label] for label in y_pred]
plt.scatter(X_test[:, 0], X_test[:, 1], c=label_color)
plt.show()
Ejemplo n.º 7
0
#!/usr/bin/env python3

import pandas as pd
import numpy as np
from naivebayes import NaiveBayes
import sklearn.model_selection

# Load the player table and keep only the columns used below.
data = pd.read_csv("predict_player_value.csv")
data = data[["overall", "value_eur", "pace", "shooting", "passing", "dribbling", "defending", "physic"]]
#print(data.head())
#print(data.iloc[[1]])


# Name of the target column; every other selected column is a feature.
predict = "value_eur"

# ``columns=`` is used instead of a positional axis argument: pandas 2.0
# no longer accepts ``drop(labels, 1)`` and raises a TypeError for it.
x = np.array(data.drop(columns=[predict]))
y = np.array(data[predict])

# Hold out 1% of the rows; the split is random because no seed is given.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size = 0.01)
clf = NaiveBayes()
print(len(x_train), len(y_train))
clf.fit(x_train, y_train)

# Disabled prediction examples, kept for reference:
#acc = clf.predict(x_test)
#print(x_test[0], acc[0])
#print(clf.predict([[60, 55, 65, 50, 70, 60, 60]]))
#for i in range(len(x_test)):
#    print(x_test[i], acc[i])
Ejemplo n.º 8
0
# Start the timer; it is stopped with endTimer(t) once validation is done.
t = timer()

# Loading datasets
dataset = Lexicons().load()

# Warming up the text-preprocessing engine (n-gram settings are passed in,
# the stop words come from the loaded dataset)
textPrep = TextPreprocessing(ngrams_n=4,
                             ngrams_count=2000).load(dataset.stopWords)

# Warming up the FeaturesMatrixBuilder
featuresMatrix = FeaturesMatrixBuilder(dataset, textPrep)

# Doing the actual training on the first 22000 reviews
XTrain, yTrain = featuresMatrix.buildTrainingData()
nb = NaiveBayes()
nb.fit(XTrain[:22000, :], yTrain[:22000])

# Validating on the remaining rows (everything from index 22000 onwards)
y = yTrain[22000:]
yhat = nb.predict(XTrain[22000:, :])
m = getConfusionMatrix(yTrain[22000:], yhat)
print("\n=== RESULTS ===")
endTimer(t)
printResults(m)

# Running the model on the test set: the classifier is fitted again, this
# time on all training rows, before predicting the test matrix.
print("Training using the whole training set this time")
nb.fit(XTrain, yTrain)
(XTest, ids) = featuresMatrix.buildTestData()
yhat = nb.predict(XTest)
with open("output/test.txt", "w") as f: