class Classifier(object):
    """Interactive text classifier that wraps a ``NaiveBayes`` model."""

    def __init__(self):
        self.classifier = NaiveBayes()

    def learn_from_tweets(self, user_ids, category):
        """
        Train the classifier by tweets.

        user_ids : A list of twitter ids whose tweets are included in
                   the category (passed straight to ``get_tweets``).
        category : The category label attached to every returned tweet.
        """
        tweets = get_tweets(user_ids)
        categories = [category] * len(tweets)
        # Announce *before* fitting so the message is shown while the
        # training is actually in progress, not after it has finished.
        print("Training...")
        self.classifier.fit(tweets, categories)

    def predict_user_input(self):
        """Read user input until 'exit' is entered.

        Each sentence is passed to ``self.classifier.predict_`` and the
        result is printed. End of input (EOF, e.g. piped stdin or Ctrl-D)
        ends the session the same way 'exit' does instead of raising.
        """
        while True:
            try:
                sentence = input("input =>")
            except EOFError:
                # No more input available: behave as if 'exit' was typed.
                break
            if sentence == 'exit':
                break
            category = self.classifier.predict_(sentence)
            print("{}\n".format(category))

    def save(self, filename):
        """Save the model by delegating to the classifier's ``dump_json``."""
        self.classifier.dump_json(filename)

    def load(self, filename):
        """Load the model by delegating to the classifier's ``load_json``."""
        self.classifier.load_json(filename)
class Classifier(object):
    """Interactive text classifier that wraps a ``NaiveBayes`` model."""

    def __init__(self):
        self.classifier = NaiveBayes()

    def learn_from_tweets(self, user_ids, category):
        """
        Train the classifier by tweets.

        user_ids : A list of twitter ids whose tweets are included in
                   the category (passed straight to ``get_tweets``).
        category : The category label attached to every returned tweet.
        """
        tweets = get_tweets(user_ids)
        categories = [category] * len(tweets)
        # Announce *before* fitting so the message is shown while the
        # training is actually in progress, not after it has finished.
        print("Training...")
        self.classifier.fit(tweets, categories)

    def predict_user_input(self):
        """Read user input until 'exit' is entered.

        Each sentence is passed to ``self.classifier.predict_`` and the
        result is printed. End of input (EOF, e.g. piped stdin or Ctrl-D)
        ends the session the same way 'exit' does instead of raising.
        """
        while True:
            try:
                sentence = input("input =>")
            except EOFError:
                # No more input available: behave as if 'exit' was typed.
                break
            if sentence == 'exit':
                break
            category = self.classifier.predict_(sentence)
            print("{}\n".format(category))

    def save(self, filename):
        """Save the model by delegating to the classifier's ``dump_json``."""
        self.classifier.dump_json(filename)

    def load(self, filename):
        """Load the model by delegating to the classifier's ``load_json``."""
        self.classifier.load_json(filename)
class CrossValidation(object):
    """Hold-out evaluation of a ``NaiveBayes`` classifier on labelled tweets."""

    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        """Build a shuffled list of ``(tweet, category)`` pairs.

        user_ids: Mapping of category -> ids; each ids value is passed to
                  ``get_tweets`` and every returned tweet gets that category.
        """
        data = []
        for category, ids in user_ids.items():
            tweets = get_tweets(ids)
            categories = [category] * len(tweets)
            data += list(zip(tweets, categories))
        np.random.shuffle(data)
        return data

    @staticmethod
    def _unzip(pairs):
        """Transpose a sequence of pairs into a tuple of column tuples."""
        if len(pairs) == 0:
            # zip(*[]) yields nothing, which cannot be unpacked into
            # (tweets, categories); hand back two empty columns instead.
            return (), ()
        return tuple(zip(*pairs))

    def split(self, data, test_percentage):
        """Split ``data`` into a training part and a test part.

        data: Sequence of ``(tweet, category)`` pairs.
        test_percentage: Fraction of ``data`` (taken from the end) that
                         becomes the test part.

        Returns ``(training, test)``; each one is a ``(tweets, categories)``
        tuple of tuples, and an empty part is ``((), ())``.
        """
        n_test = int(len(data) * test_percentage)
        n_training = len(data) - n_test
        # unzip (inverse of zip)
        training = self._unzip(data[:n_training])
        test = self._unzip(data[n_training:])
        return training, test

    def show_tweets_with_labels(self, tweets, labels):
        """Print every tweet under its label."""
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        Fit on one part of the tweets and predict the remaining part.

        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted
                         from tweets.
        verbose: When true, print each test tweet with its predicted label.

        Returns ``(results, answers)``: the predictions for the test tweets
        and their true categories.
        Raises ValueError if test_percentage is outside [0, 1].
        """
        if not (0 <= test_percentage <= 1):
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        tweets, categories = training
        self.classifier.fit(tweets, categories)

        tweets, answers = test
        results = self.classifier.predict(tweets)

        if verbose:
            self.show_tweets_with_labels(tweets, results)
        return results, answers
class CrossValidation(object):
    """Hold-out evaluation of a ``NaiveBayes`` classifier on labelled tweets."""

    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        """Build a shuffled list of ``(tweet, category)`` pairs.

        user_ids: Mapping of category -> ids; each ids value is passed to
                  ``get_tweets`` and every returned tweet gets that category.
        """
        data = []
        for category, ids in user_ids.items():
            tweets = get_tweets(ids)
            categories = [category] * len(tweets)
            data += list(zip(tweets, categories))
        np.random.shuffle(data)
        return data

    @staticmethod
    def _unzip(pairs):
        """Transpose a sequence of pairs into a tuple of column tuples."""
        if len(pairs) == 0:
            # zip(*[]) yields nothing, which cannot be unpacked into
            # (tweets, categories); hand back two empty columns instead.
            return (), ()
        return tuple(zip(*pairs))

    def split(self, data, test_percentage):
        """Split ``data`` into a training part and a test part.

        data: Sequence of ``(tweet, category)`` pairs.
        test_percentage: Fraction of ``data`` (taken from the end) that
                         becomes the test part.

        Returns ``(training, test)``; each one is a ``(tweets, categories)``
        tuple of tuples, and an empty part is ``((), ())``.
        """
        n_test = int(len(data) * test_percentage)
        n_training = len(data) - n_test
        # unzip (inverse of zip)
        training = self._unzip(data[:n_training])
        test = self._unzip(data[n_training:])
        return training, test

    def show_tweets_with_labels(self, tweets, labels):
        """Print every tweet under its label."""
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        Fit on one part of the tweets and predict the remaining part.

        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted
                         from tweets.
        verbose: When true, print each test tweet with its predicted label.

        Returns ``(results, answers)``: the predictions for the test tweets
        and their true categories.
        Raises ValueError if test_percentage is outside [0, 1].
        """
        if not (0 <= test_percentage <= 1):
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        tweets, categories = training
        self.classifier.fit(tweets, categories)

        tweets, answers = test
        results = self.classifier.predict(tweets)

        if verbose:
            self.show_tweets_with_labels(tweets, results)
        return results, answers
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from naivebayes import NaiveBayes


def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments are converted with ``np.asarray`` so that plain Python
    sequences are compared element-wise; without the conversion two lists
    would be compared as whole objects and yield a single boolean.
    """
    matches = np.asarray(y_true) == np.asarray(y_pred)
    return np.sum(matches) / len(y_true)


# Synthetic two-class problem with fixed seeds so the run is reproducible.
X, y = datasets.make_classification(
    n_samples=1000, n_features=10, n_classes=2, random_state=123
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments are converted with ``np.asarray`` so that plain Python
    sequences are compared element-wise; without the conversion two lists
    would be compared as whole objects and yield a single boolean.
    """
    matches = np.asarray(y_true) == np.asarray(y_pred)
    return np.sum(matches) / len(y_true)


# Three 2-D blobs with fixed seeds so the run is reproducible.
X, y = datasets.make_blobs(
    n_samples=1000,
    n_features=2,
    centers=3,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=123,
    return_centers=False,
)
# NOTE(review): test_size=0.9 trains on only 10% of the samples -- confirm
# this is intentional and not a swapped train/test ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.9, random_state=1234
)

clf = NaiveBayes()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy(y_test, y_pred))

# Colour each test point by its predicted class.
color_map = {0: 'r', 1: 'k', 2: 'g'}
label_color = [color_map[label] for label in y_pred]
plt.scatter(X_test[:, 0], X_test[:, 1], c=label_color)
plt.show()
#!/usr/bin/env python3
import pandas as pd
import numpy as np
from naivebayes import NaiveBayes
import sklearn.model_selection

# Load the player table and keep only the columns used by the model.
data = pd.read_csv("predict_player_value.csv")
data = data[["overall", "value_eur", "pace", "shooting", "passing",
             "dribbling", "defending", "physic"]]
#print(data.head())
#print(data.iloc[[1]])

# Column to predict; every other selected column is a feature.
predict = "value_eur"

# Use the ``columns=`` keyword: passing the axis positionally
# (``drop([predict], 1)``) is rejected by pandas 2.x.
x = np.array(data.drop(columns=[predict]))
y = np.array(data[predict])

x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.01
)

# NOTE(review): "value_eur" looks like a continuous quantity -- confirm that
# NaiveBayes is meant to treat every distinct value as its own class.
clf = NaiveBayes()
print(len(x_train), len(y_train))
clf.fit(x_train, y_train)

#acc = clf.predict(x_test)
#print(x_test[0], acc[0])
#print(clf.predict([[60, 55, 65, 50, 70, 60, 60]]))
#for i in range(len(x_test)):
#    print(x_test[i], acc[i])
t = timer() # Loading datasets dataset = Lexicons().load() # Warming up textrocessing engines textPrep = TextPreprocessing(ngrams_n=4, ngrams_count=2000).load(dataset.stopWords) # Warming up the FeaturesMatrixBuilder featuresMatrix = FeaturesMatrixBuilder(dataset, textPrep) # Doing the actual training on the first 22000 reviews XTrain, yTrain = featuresMatrix.buildTrainingData() nb = NaiveBayes() nb.fit(XTrain[:22000, :], yTrain[:22000]) # Validating on the remaining y = yTrain[22000:] yhat = nb.predict(XTrain[22000:, :]) m = getConfusionMatrix(yTrain[22000:], yhat) print("\n=== RESULTS ===") endTimer(t) printResults(m) # Running the model on the test set print("Training using the whole training set this time") nb.fit(XTrain, yTrain) (XTest, ids) = featuresMatrix.buildTestData() yhat = nb.predict(XTest) with open("output/test.txt", "w") as f: