def __init__(self):
    """Create the classifier and the POS tagger, then set the feature names.

    The NaiveBayes instance is built from the same label list that is
    stored on ``self.classes``.
    """
    labels = ["drug", "group", "brand", "drug_n", "none"]
    self.classes = labels
    self.nb = NaiveBayes(labels)
    self.tagger = PosTagger()
    self.set_feature_names()
Example #2
0
 def __init__(self, spam_emails_path, ham_emails_path, unknown_emails_path):
     """Create a two-label NaiveBayes model and remember the three paths.

     Label 0 stands for ham (no spam) and label 1 for spam.
     """
     ham_label, spam_label = 0, 1
     self.naive_bayes = NaiveBayes([ham_label, spam_label])
     (self.spam_emails_path,
      self.ham_emails_path,
      self.unknown_emails_path) = (spam_emails_path,
                                   ham_emails_path,
                                   unknown_emails_path)
Example #3
0
 def test_blogs_bag(self):
     # Blog-author classification with bag-of-words features: the value
     # returned by compute_all_stats has to exceed 0.55.
     training_docs, held_out = self.split_blogs_corpus(BagOfWords)
     model = NaiveBayes()
     model.train(training_docs)
     predictions = classify(model, held_out)
     score = compute_all_stats(held_out, predictions)
     self.assertGreater(score, 0.55)
Example #4
0
 def test_even_odd(self):
     """Train on 0 (even) and 1 (odd); expect a score of 1.0 on 2..999."""
     model = NaiveBayes()
     seed_examples = [EvenOdd(0, True), EvenOdd(1, False)]
     model.train(seed_examples)
     held_out = []
     for number in range(2, 1000):
         held_out.append(EvenOdd(number, number % 2 == 0))
     predictions = classify(model, held_out)
     self.assertEqual(compute_all_stats(held_out, predictions), 1.0)
Example #5
0
 def test_names_nltk(self):
     # Name classification on the split produced by split_names_corpus():
     # the value returned by compute_all_stats has to exceed 0.70.
     training_names, held_out = self.split_names_corpus()
     model = NaiveBayes()
     model.train(training_names)
     predictions = classify(model, held_out)
     score = compute_all_stats(held_out, predictions)
     self.assertGreater(score, 0.70)
Example #6
0
class Stacking():
    """Stacked ensemble: a decision tree trained on three base predictions.

    The base learners are a RandomForest, a NaiveBayes and a KNN model.
    Their predictions, one column per learner in that order, form the
    input of the DecisionTree stored on ``self.model``.
    """

    def __init__(self):
        pass

    def fit(self, X, y):
        """Fit the three base learners on (X, y), then the meta model.

        Each learner is created, fitted and asked to predict on X before
        the next one is created. The meta model is trained on those
        training-set predictions.
        """
        forest = RandomForest(num_trees=15, max_depth=np.inf)
        self.rf = forest
        forest.fit(X, y)
        forest_votes = forest.predict(X)

        bayes = NaiveBayes()
        self.nb = bayes
        bayes.fit(X, y)
        bayes_votes = bayes.predict(X)

        neighbours = KNN(k=3)
        self.knn = neighbours
        neighbours.fit(X, y)
        neighbour_votes = neighbours.predict(X)

        # One row per sample, one column per base learner.
        stacked = np.array([forest_votes, bayes_votes, neighbour_votes]).T

        meta = DecisionTree(max_depth=np.inf,
                            stump_class=DecisionStumpErrorRate)
        self.model = meta
        meta.fit(stacked, y)

    def predict(self, X):
        """Return the meta model's prediction for the samples in X."""
        base_votes = [
            self.rf.predict(X),
            self.nb.predict(X),
            self.knn.predict(X),
        ]
        stacked = np.array(base_votes).T
        return self.model.predict(stacked)
Example #7
0
 def test_blogs_bag(self):
     """Classify blog authors using bag-of-words"""
     # print() with a single argument produces the same output on Python 2
     # and is valid syntax on Python 3, unlike the print statement.
     print("\nsplit_blogs_corpus")
     train, test = self.split_blogs_corpus(BagOfWords)
     classifier = NaiveBayes()
     classifier.train(train)
     # The value returned by accuracy() must exceed 0.55.
     self.assertGreater(accuracy(classifier, test), 0.55)
Example #8
0
 def test_blogs_bag(self):
     """Bag-of-words blog-author classifier: accuracy() must exceed 0.55."""
     training_docs, held_out = self.split_blogs_corpus(BagOfWords)
     model = NaiveBayes()
     model.train(training_docs)
     score = accuracy(model, held_out)
     self.assertGreater(score, 0.55)
Example #9
0
    def fit(self, X, y):
        """Fit three base models on (X, y) and a decision tree on their output.

        The fitted base models are stored on ``self.rf``, ``self.knn`` and
        ``self.nb``; the tree trained on their training-set predictions is
        stored on ``self.meta_classifier``.
        """
        # Base learners are all created before any of them is trained.
        forest = RandomForest(num_trees=15)
        neighbours = KNN(k=3)
        bayes = NaiveBayes(num_classes=2)

        # Random forest: splits are created from X before fitting.
        forest.create_splits(X)
        forest.fit(X, y)
        forest_out = forest.predict(X)

        # K-nearest neighbours
        neighbours.fit(X, y)
        neighbour_out = neighbours.predict(X)

        # Naive Bayes
        bayes.fit(X, y)
        bayes_out = bayes.predict(X)

        # Each prediction becomes one column of the meta-classifier input,
        # in the order forest, neighbours, bayes.
        columns = [
            out.reshape((out.size, 1))
            for out in (forest_out, neighbour_out, bayes_out)
        ]
        meta_input = np.hstack(columns)

        # Decision tree acting as meta-classifier
        tree = DecisionTree(max_depth=np.inf)
        tree.fit(meta_input, y)

        self.rf, self.knn, self.nb = forest, neighbours, bayes
        self.meta_classifier = tree
 def test_names_nltk(self):
     """Name classifier on the names corpus split: accuracy() must exceed 0.70."""
     training_names, held_out = self.split_names_corpus()
     model = NaiveBayes()
     model.train(training_names)
     self.assertGreater(accuracy(model, held_out), 0.70)
def mnb(x_train, y_train, x_test, y_test):
    """Fit NaiveBayes on the training data, print its test score, return predictions.

    The printed score is the number of positions where the prediction
    equals ``y_test`` divided by the number of predictions.
    """
    model = NaiveBayes()
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    score = sum(pred == y_test) / len(pred)
    print("The score of my Naive Bayes result (based on testing data): " +
          str(score))
    return pred
Example #12
0
 def test_blogs_imba(self):
     # Uses the split from split_blogs_corpus_imba; the original author
     # notes that this test does not need to pass.
     training_docs, held_out = self.split_blogs_corpus_imba(BagOfWords)
     model = NaiveBayes()
     model.train(training_docs)
     predictions = classify(model, held_out)
     score = compute_all_stats(held_out, predictions)
     self.assertGreater(score, 0.1)
Example #13
0
def main():
    """Evaluate Naive Bayes and logistic regression on the rt-polarity files.

    Negative and positive reviews are read from the two rt-polarity files
    and every model is constructed with ``val_split=0.2``.
    """
    neg_revs = read_reviews_in_file("./rt-polaritydata/rt-polarity.neg")
    pos_revs = read_reviews_in_file("./rt-polaritydata/rt-polarity.pos")

    bayes_model = NaiveBayes(neg_revs, pos_revs, val_split=0.2)
    bayes_model.evaluate_naive_bayes()

    # Same logistic-regression configuration, run with two iteration counts.
    for iterations in (1000, 3000):
        regression = LogisticRegression(neg_revs,
                                        pos_revs,
                                        val_split=0.2,
                                        lr=0.85,
                                        num_inter=iterations)
        regression.evaluate_logistic_regression()

    # Just for fun – tensorflow
    LogisticRegression_tf(neg_revs,
                          pos_revs,
                          val_split=0.2,
                          lr=0.01,
                          num_inter=200)
Example #14
0
    def __init__(self, no_of_testcases=100, verbose=True, nb=None, bw=None):
        """Set up the logger, both classifiers and the result counters.

        Args:
            no_of_testcases: stored on the instance and passed as
                ``test_set_count`` when a NaiveBayes is created here.
            verbose: passed to the Logger as ``is_verbose``.
            nb: optional ready-made NaiveBayes; if given, its logger is
                switched to non-verbose instead of building a new one.
            bw: optional ready-made BagOfWordSentiment, handled like ``nb``.
        """
        self.logger = Logger('Comparer',
                             'logs\\comparer.log',
                             is_verbose=verbose)
        self.load_html_structure()

        if nb is not None:
            self.nb = nb
            self.nb.logger.is_verbose = False
        else:
            self.nb = NaiveBayes(verbose=False,
                                 test_set_count=no_of_testcases,
                                 no_of_grams=4)
            self.nb.ready()

        if bw is not None:
            self.bw = bw
            self.bw.logger.is_verbose = False
        else:
            self.bw = BagOfWordSentiment(verbose=False, no_of_grams=4)
            self.bw.ready()

        self.no_of_testcases = no_of_testcases

        # Per-model tallies, all starting at zero.
        self.nb_correct = 0
        self.bw_correct = 0
        self.tb_correct = 0
        self.nb_wrong = 0
        self.bw_wrong = 0
        self.tb_wrong = 0
        self.nb_accuracy = 0
        self.bw_accuracy = 0
        self.tb_accuracy = 0

        self.counter = 0
        self.testcases = dict()
Example #15
0
 def test_names_nltk(self):
     """Classify names using NLTK features"""
     # print() with a single argument produces the same output on Python 2
     # and is valid syntax on Python 3, unlike the print statement.
     print("\ntest_names_nltk")
     train, test = self.split_names_corpus()
     classifier = NaiveBayes()
     classifier.train(train)
     # The value returned by accuracy() must exceed 0.70.
     self.assertGreater(accuracy(classifier, test), 0.70)
Example #16
0
 def test_even_odd(self):
     """Classify numbers as even or odd"""
     # print() with a single argument produces the same output on Python 2
     # and is valid syntax on Python 3, unlike the print statement.
     print("\ntest_even_odd")
     classifier = NaiveBayes()
     # Trained on only two examples: 0 (even) and 1 (odd).
     classifier.train([EvenOdd(0, True), EvenOdd(1, False)])
     test = [EvenOdd(i, i % 2 == 0) for i in range(2, 1000)]
     self.assertEqual(accuracy(classifier, test), 1.0)
 def test_collect_counts(self):
     """Check the raw count tables filled by _collect_counts."""
     classifier = NaiveBayes()
     classifier._collect_counts(self.training_set)

     cat_index = classifier.label_codebook.get_index('cat')
     dog_index = classifier.label_codebook.get_index('dog')
     purr_index = classifier.feature_codebook.get_index('purr')
     meow_index = classifier.feature_codebook.get_index('meow')
     bark_index = classifier.feature_codebook.get_index('bark')
     woof_index = classifier.feature_codebook.get_index('woof')

     # print() with a single argument produces the same output on Python 2
     # and is valid syntax on Python 3, unlike the print statement.
     print("""Test collecting counts
     If any of these fails, check if you have updated the codebooks
     and check if the counts have been collected correctly without smoothing""")
     count_x_y = classifier.count_x_y_table

     # Feature counts are indexed as [feature_index, label_index].
     self.assertEqual(count_x_y[purr_index, cat_index], 2)
     self.assertEqual(count_x_y[meow_index, cat_index], 2)
     self.assertEqual(count_x_y[bark_index, cat_index], 0)
     self.assertEqual(count_x_y[woof_index, cat_index], 1)

     self.assertEqual(count_x_y[purr_index, dog_index], 0)
     self.assertEqual(count_x_y[meow_index, dog_index], 1)
     self.assertEqual(count_x_y[bark_index, dog_index], 1)
     self.assertEqual(count_x_y[woof_index, dog_index], 2)

     # Label counts are indexed by label_index.
     count_y = classifier.count_y_table
     self.assertEqual(count_y[cat_index], 3)
     self.assertEqual(count_y[dog_index], 2)
Example #18
0
def index(request):
	"""Classify the posted text as spam or not and render the result page.

	Any non-POST request, and a POST whose form does not validate, renders
	the index template with the form. A valid POST trains a NaiveBayes on
	all stored SpamData rows and classifies the submitted text.
	"""
	if request.method != 'POST':
		return render(request, 'spam_classifier/index.html', {'form': TextForm()})

	form = TextForm(request.POST)
	if not form.is_valid():
		return render(request, 'spam_classifier/index.html', {'form': form})

	# One row per stored record; each record supplies 58 values.
	records = SpamData.objects.all()
	training_data = np.empty([len(records), 58])
	for row_number in range(len(records)):
		training_data[row_number, :] = records[row_number].get_data()

	input_vector = text_to_frequencies(form.cleaned_data['text'])
	bayes = NaiveBayes(training_data)
	classification = bayes.classify(input_vector)
	details = str(np.append(classification, input_vector))

	if classification == 1:
		verdict = "SPAM"
	else:
		verdict = "NOT SPAM"

	context = {
		'input': ImmutableTextForm(request.POST),
		'isspam': verdict,
		'details': details
	}
	return render(request, 'spam_classifier/results.html', context)
Example #19
0
def get_predictions_naive_bayes(train_data,
                                train_target,
                                test_data,
                                q_tag=None):
    """Train a NaiveBayes model and return its predictions for ``test_data``.

    The model is constructed with the serial filename returned by
    ``get_serial_filename_nb(q_tag=q_tag)``.
    """
    from naive_bayes import NaiveBayes
    serial_path = get_serial_filename_nb(q_tag=q_tag)
    model = NaiveBayes(serial_filename=serial_path)
    model.train(train_data, train_target)
    predictions = model.get_predictions(test_data)
    return predictions
Example #20
0
 def setUp(self):
     """Feed every record of ./data/jojo.dat to a fresh NaiveBayes instance.

     Each line is stripped and split on a tab into two fields. The first
     is set as ``category`` and the second as ``word`` before ``learn()``
     is called.
     """
     learner = NaiveBayes()
     self.naive_bayes = learner
     with open('./data/jojo.dat', 'r') as records:
         for record in records:
             category, phrase = record.strip().split("\t")
             learner.category = category
             learner.word = phrase
             learner.learn()
Example #21
0
def k_cross_validation(X, T, K, binary=0):
    """Run K-fold cross-validation of a ``NaiveBayes(alpha=1)`` model.

    The data is cut into K contiguous folds of ``len(X) // K`` samples
    each; any remainder at the end of ``X`` is not used. Each fold serves
    once as the test set while the other folds are concatenated, in order,
    into the training set.

    Args:
        X: sequence of samples; it is sliced and the slices are iterated.
        T: sequence of targets aligned with ``X``.
        K: number of folds; must be at least 1.
        binary: forwarded unchanged as the third argument of ``model.fit``.

    Returns:
        A tuple ``(results, mean_test_accuracy, mean_f_score,
        stddev_test_accuracy, stddev_f_score)``. ``results`` holds one
        ``(test_accuracy, f_score)`` pair per fold, and the deviations are
        divided by the number of folds (population form).

    Raises:
        ValueError: if ``K`` is smaller than 1.
    """
    if K < 1:
        raise ValueError("K must be a positive number of folds")

    # The fold size has to follow K. A fixed divisor of 5 left data unused
    # for K < 5 and produced empty folds for K > 5.
    fold_len = len(X) // K
    X_folds = [X[i * fold_len:(i + 1) * fold_len] for i in range(K)]
    T_folds = [T[i * fold_len:(i + 1) * fold_len] for i in range(K)]

    results = []
    for i in range(K):
        X_test = X_folds[i]
        T_test = T_folds[i]

        # Training data is every fold except the held-out one.
        X_train = []
        T_train = []
        for j in range(K):
            if j != i:
                X_train.extend(X_folds[j])
                T_train.extend(T_folds[j])

        model = NaiveBayes(alpha=1)
        prior, likelihood, classes, vocabulary = model.fit(
            X_train, T_train, binary)
        model_prediction = model.predict(X_test, prior, likelihood, classes,
                                         vocabulary)
        test_accuracy, f_score = model.evaluate(model_prediction, T_test)
        results.append((test_accuracy, f_score))

    mean_test_accuracy = sum(accuracy for accuracy, _ in results) / K
    mean_f_score = sum(f_score for _, f_score in results) / K

    squared_accuracy_dev = sum(
        (accuracy - mean_test_accuracy)**2 for accuracy, _ in results)
    squared_f_score_dev = sum(
        (f_score - mean_f_score)**2 for _, f_score in results)

    stddev_test_accuracy = np.sqrt(squared_accuracy_dev / len(results))
    stddev_f_score = np.sqrt(squared_f_score_dev / len(results))

    return results, mean_test_accuracy, mean_f_score, stddev_test_accuracy, stddev_f_score
Example #22
0
def main(argv):
    """Train NaiveBayes from a file and report its errors on a test file.

    Options are parsed from ``argv``: ``-t/--train <file>`` and
    ``-e/--test <file>``. The last element of every test row is compared
    with the prediction; '1' is counted as the positive ("Yes") class and
    '0' as the negative ("No") class in the printed confusion matrix.

    Args:
        argv: option list handed to ``getopt.getopt``.

    Exits with status 2 after printing the usage line when the options
    cannot be parsed, when fewer than five entries are in ``sys.argv``, or
    when either file option is missing.
    """
    setpath()
    trainfile = None
    testfile = None
    try:
        opts, _ = getopt.getopt(argv, "ht:e:", ["train=", "test="])
        if (len(sys.argv) < 5):
            raise getopt.GetoptError(None)

    except getopt.GetoptError:
        print('\nusage: run.py -t <trainfile> -e <testfile>\n')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('run.py -t <trainfile> -e <testfile>')
            sys.exit()
        elif opt in ("-t", "--train"):
            trainfile = arg
        elif opt in ("-e", "--test"):
            testfile = arg

    # Enough arguments can be present without both options being given
    # (e.g. "-t a -t b x"); report that as a usage error instead of
    # failing later with a NameError.
    if trainfile is None or testfile is None:
        print('\nusage: run.py -t <trainfile> -e <testfile>\n')
        sys.exit(2)

    from file_reader import FileReader
    from naive_bayes import NaiveBayes
    nb = NaiveBayes(trainfile)

    test_file_reader = FileReader(testfile)
    testData = test_file_reader.getRows()

    num_errors = 0
    true_positive = 0
    false_positive = 0
    true_negative = 0
    false_negative = 0

    #Testing phase
    for idx, row in enumerate(testData):
        prediction = nb.binary_classify(row)
        if row[-1] != prediction:
            num_errors += 1.0
            print("Error on row: %s" % str(idx + 1))
            # A missed '1' is a false negative; any other miss counts as
            # a false positive.
            if row[-1] == '1':
                false_negative += 1
            else:
                false_positive += 1
        elif row[-1] == '0':
            true_negative += 1
        else:
            true_positive += 1

    print('\n\n--------------Error Count----------------')
    print(num_errors)
    print('\n\n--------------Accuracy----------------')

    print("\n\nThe Accuracy is " +
          str((len(testData) - num_errors) * 100 / len(testData)) + "%")
    print("\n===========The confusion matrix===========")
    print("\t No \t Yes")
    print("No \t", str(true_negative) + "\t", str(false_positive))
    print("Yes \t", str(false_negative) + "\t", str(true_positive))
    def test_predict_record_with_binary_dataset(self):
        """predict_record must return 1 for the record [1, 1, 0]."""
        model = NaiveBayes(self.extract_features)
        model.fit(self.design_matrix, self.target_values)

        record = [1, 1, 0]
        outcome = model.predict_record(record)

        self.assertEqual(1, outcome)
Example #24
0
def main():
    """Fit NaiveBayes on the loan-defaulters data and print one prediction.

    The last column of every dataset row is the target; the remaining
    columns form the design matrix. The record ``[1, 1, 50700]`` is then
    classified and a one-line message is printed.
    """
    dataset = load_loan_defaulters()
    design_matrix = [row[:-1] for row in dataset]
    target_values = [row[-1] for row in dataset]
    clf = NaiveBayes(extract_features)
    clf.fit(design_matrix, target_values)
    prediction = clf.predict_record([1, 1, 50700])
    # The joiner must contain a space in both cases; an empty string glued
    # the two halves of the sentence together ("sentimentof").
    negation_word = " not " if prediction == 0.0 else " "
    print("testing negative sentiment" + negation_word + "of the tweet")
Example #25
0
def main(argv):
  """Train NaiveBayes from a file and report its errors on a test file.

  Options are parsed from ``argv``: ``-t/--train <file>`` and
  ``-e/--test <file>``. The last element of every test row is compared
  with the prediction; '1' is counted as the positive ("Yes") class and
  '0' as the negative ("No") class in the printed confusion matrix.

  Args:
    argv: option list handed to ``getopt.getopt``.

  Exits with status 2 after printing the usage line when the options
  cannot be parsed, when fewer than five entries are in ``sys.argv``, or
  when either file option is missing.
  """
  setpath()
  trainfile = None
  testfile = None
  try:
    opts, _ = getopt.getopt(argv,"ht:e:",["train=","test="])
    if(len(sys.argv) < 5):
      raise getopt.GetoptError(None)

  except getopt.GetoptError:
    print('\nusage: run.py -t <trainfile> -e <testfile>\n')
    sys.exit(2)
  for opt, arg in opts:
    if opt == '-h':
      print('run.py -t <trainfile> -e <testfile>')
      sys.exit()
    elif opt in ("-t", "--train"):
      trainfile = arg
    elif opt in ("-e", "--test"):
      testfile = arg

  # Enough arguments can be present without both options being given
  # (e.g. "-t a -t b x"); report that as a usage error instead of
  # failing later with a NameError.
  if trainfile is None or testfile is None:
    print('\nusage: run.py -t <trainfile> -e <testfile>\n')
    sys.exit(2)

  from file_reader import FileReader
  from naive_bayes import NaiveBayes
  nb = NaiveBayes(trainfile)

  test_file_reader = FileReader(testfile)
  testData = test_file_reader.getRows()

  num_errors = 0
  true_positive = 0
  false_positive = 0
  true_negative = 0
  false_negative = 0

  #Testing phase
  for idx, row in enumerate(testData):
    prediction = nb.binary_classify(row)
    if row[-1] != prediction:
      num_errors += 1.0
      print("Error on row: %s" % str(idx+1))
      # A missed '1' is a false negative; any other miss counts as a
      # false positive.
      if row[-1] == '1':
        false_negative += 1
      else:
        false_positive += 1
    elif row[-1] == '0':
      true_negative += 1
    else:
      true_positive += 1

  print('\n\n--------------Error Count----------------')
  print(num_errors)
  print('\n\n--------------Accuracy----------------')

  print("\n\nThe Accuracy is " +str((len(testData) - num_errors)*100/len(testData)) + "%")
  print("\n===========The confusion matrix===========")
  print("\t No \t Yes")
  print("No \t", str(true_negative) + "\t", str(false_positive))
  print("Yes \t", str(false_negative) +"\t", str(true_positive))
 def test_prediction(self):
     """Classify the training instances themselves and check the first five labels."""
     # print() with a single argument produces the same output on Python 2
     # and is valid syntax on Python 3, unlike the print statement.
     print("""Test basic classification""")
     classifier = NaiveBayes()
     classifier.train(self.training_set)
     predictions = [classifier.classify_instance(x) for x in self.training_set]
     self.assertEqual(predictions[0], 'cat')
     self.assertEqual(predictions[1], 'dog')
     self.assertEqual(predictions[2], 'cat')
     self.assertEqual(predictions[3], 'dog')
     self.assertEqual(predictions[4], 'dog')
Example #27
0
def main():
    """Fit NaiveBayes on the loan-defaulters data and print one prediction.

    The last column of every dataset row is the target; the remaining
    columns form the design matrix. The record ``[1, 1, 50700]`` is then
    classified. A prediction equal to 0.0 is reported as "will not
    default", anything else as "will default".
    """
    dataset = load_loan_defaulters()
    design_matrix = [row[:-1] for row in dataset]
    target_values = [row[-1] for row in dataset]
    clf = NaiveBayes(extract_features)
    clf.fit(design_matrix, target_values)
    prediction = clf.predict_record([1, 1, 50700])
    # The joiner must contain a space in both cases; an empty string
    # produced "willdefault".
    negation_word = " not " if prediction == 0.0 else " "
    print("We predict this person will" + negation_word +
          "default on their loans.")
Example #28
0
def cross_validation_nb(Xs_train, Ys_train, feats):
    """Pick the NaiveBayes bandwidth with the lowest mean validation error.

    Every bandwidth in ``np.arange(0.02, 0.6, 0.02)`` is scored with
    5-fold stratified cross-validation on the training data.

    Returns:
        ``(best_bandwidth, bws, train_errors, val_errors)``. The three
        lists are aligned and hold, per bandwidth, the value tried and the
        training and validation errors averaged over the folds.
    """
    folds = 5
    splitter = StratifiedKFold(n_splits=folds)
    bws, train_errors, val_errors = [], [], []

    for bw in np.arange(0.02, 0.6, 0.02):
        train_total = 0
        val_total = 0

        # The labels are passed as both arguments of split().
        for train_idx, valid_idx in splitter.split(Ys_train, Ys_train):
            fold_x, fold_y = Xs_train[train_idx], Ys_train[train_idx]
            held_x, held_y = Xs_train[valid_idx], Ys_train[valid_idx]

            # A fresh model for this bandwidth and fold
            nb = NaiveBayes(bw, feats)
            train_error, kde_list, prior_class0, prior_class1 = nb.fit(
                fold_x, fold_y)
            val_error, _ = nb.predict(held_x, held_y, kde_list,
                                      prior_class0, prior_class1)

            train_total += train_error
            val_total += val_error

        bws.append(bw)
        train_errors.append(train_total / folds)
        val_errors.append(val_total / folds)

    # First bandwidth with the strictly lowest validation error; the
    # starting threshold is 100.
    best_bandwidth = 0
    best_bw_val_error = 100
    for candidate, error in zip(bws, val_errors):
        if error < best_bw_val_error:
            best_bandwidth, best_bw_val_error = candidate, error
    print("Best BW training")
    print(best_bandwidth)

    return (best_bandwidth, bws, train_errors, val_errors)
def main():
    """Train a NaiveBayes spam model and print precision, recall, accuracy and F1.

    Command-line options: ``--suffix`` (passed to ``DataLoader.load_data``)
    and ``--alpha`` (passed with the vocabulary size to ``NaiveBayes``).
    ``model.predict`` is used as a mapping from label to score; the label
    with the highest score is the prediction. Label 1 is the positive
    class and label 0 the negative class.

    A metric whose denominator is zero is reported as 0 instead of
    raising ZeroDivisionError.
    """
    parser = argparse.ArgumentParser(
        description='Naive Bayes for Spam Classification')

    parser.add_argument('--suffix',
                        default="",
                        type=str,
                        help='Dataset to be used for training')

    parser.add_argument('--alpha',
                        default=0.001,
                        type=float,
                        help='Smoothing factor of the model')

    args = parser.parse_args()

    # Load data
    data = DataLoader()
    data.load_data(args.suffix)

    # Initialize model
    model = NaiveBayes(data.vocab_size, args.alpha)

    # Train
    model.fit(data.trainX, data.trainY)

    # Evaluation
    tp, tn, fp, fn = 0, 0, 0, 0
    for (x, y) in zip(data.testX, data.testY):
        probs = model.predict(x)
        labels = list(probs.keys())
        label = labels[np.argmax(list(probs.values()))]
        if label == 0 and y == 0:
            tn += 1
        elif label == 1 and y == 1:
            tp += 1
        elif label == 0 and y == 1:
            fn += 1
        elif label == 1 and y == 0:
            fp += 1

    def ratio(numerator, denominator):
        # An empty denominator means the metric is undefined (for example
        # no positive predictions at all); report 0.0 rather than crash.
        return numerator / denominator if denominator else 0.0

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    accuracy = ratio(tp + tn, len(data.testY))
    f1_score = ratio(2 * (precision * recall), precision + recall)
    print(f"Precision: {precision * 100:.2f}%")
    print(f"Recall: {recall * 100:.2f}%")
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 Score: {f1_score}")
Example #30
0
def naive_bayes():
    """Fit NaiveBayes on the dbworld split and print accuracy and run time.

    The timer covers model creation, fitting and prediction. Accuracy is
    the number of predictions equal to ``y_test`` divided by ``len(y_test)``.
    """
    X_train, X_test, y_train, y_test = data.load_dbworld()

    started = timeit.default_timer()

    model = NaiveBayes()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    finished = timeit.default_timer()

    hit_rate = (y_pred == y_test).sum()/len(y_test)
    elapsed = finished - started
    print("Accuracy of the models is : %f" % (hit_rate))
    print("Running time : %f" % (elapsed))
Example #31
0
def train_and_val():
    """Train NaiveBayes on the training indices and score it on the validation indices.

    Prints a 3x3 confusion matrix indexed ``[prediction, correct_class]``,
    followed by the fraction of correct predictions.
    """
    training_data = dp.read_data('dataset/splice-Xtrain.dat',
                                 'dataset/splice-Ytrain.dat')
    training_set_indices, validation_set_indices = dp.read_training_val_set(
        'dataset/train.txt', 'dataset/val.txt')
    feature = Features()
    features_labels_pair = feature.simple(training_data)
    training_set = [features_labels_pair[index]
                    for index in training_set_indices]

    # Ambiguous entries are removed from the validation set only; the
    # matching call for the training set is disabled in the original code.
    naive_bayes = NaiveBayes(training_set, 4, False)

    validation_set = [features_labels_pair[index]
                      for index in validation_set_indices]

    dp.remove_ambiguous_entry(validation_set)

    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in validation_set:
        prediction = naive_bayes.predict(feature_vector)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Count the pair only when both values equal one of 0, 1, 2. The
        # comparison uses == (not indexing), so values outside that range
        # are skipped as before.
        for predicted in range(3):
            for actual in range(3):
                if prediction == predicted and correct_class == actual:
                    confusion_matrix[predicted, actual] += 1
    # print() with a single argument produces the same output on Python 2
    # and is valid syntax on Python 3.
    print(confusion_matrix)
    print(correct / total)
    def __init__(self):
        """Load tweets.txt, preprocess and tokenize it, and fit NaiveBayes.

        File format as read here: the lines of a tweet are followed by a
        line starting with ``$$$$$`` whose remainder is the integer label.
        The lines of a tweet are stripped, lower-cased and concatenated
        without a separator. Text after the last label line is not stored.
        """
        # download stopwords
        nltk.download('stopwords')

        # tweets and their labels
        self.tweets = []
        self.labels = []

        # text of the tweet currently being assembled
        tweet = ""
        # The with-block closes the file; it was previously left open.
        with open("tweets.txt", "r") as tweets_file:
            for line in tweets_file:
                if line.startswith("$$$$$"):
                    # label line: store the finished tweet and its label
                    self.tweets.append(tweet)
                    self.labels.append(int(line[5:].replace("\n", "")))
                    tweet = ""
                else:
                    # any other line is part of the current tweet
                    tweet += line.replace("\n", "").strip().lower()

        # Preprocessing
        preprocessor = Preprocessor(self.tweets, nltk.PorterStemmer())
        tweets = preprocessor.start()

        # Tokenize tweets
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(tweets)

        tokenized_tweets = self.tokenizer.texts_to_sequences(tweets)
        num_tokens = np.array([len(tokens) for tokens in tokenized_tweets])

        # Sequence length limit: mean token count plus two standard deviations
        self.max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))

        tokenized_tweets_padding = pad_sequences(tokenized_tweets, maxlen=self.max_tokens)
        # Only the training part of the 80/20 split is used in this method.
        X_train, X_test, y_train, y_test = train_test_split(tokenized_tweets_padding, self.labels, test_size=0.2,
                                                            random_state=123)

        self.nb = NaiveBayes()
        self.nb.fit(X_train, y_train)
    def setUpClass(cls):
        """Fit a shared NaiveBayes on the six-separable-points dataset.

        The last element of each row is the target value; the elements
        before it form the design-matrix row.
        """
        rows = cls.get_six_separable_points()
        cls.dataset = rows
        cls.design_matrix = [record[:-1] for record in rows]
        cls.target_values = [record[-1] for record in rows]

        model = NaiveBayes(cls.extract_features)
        cls.clf = model
        model.fit(cls.design_matrix, cls.target_values)
    def setUpClass(cls):
        """Fit a shared NaiveBayes on the loan-defaulters dataset.

        The last element of each row is the target value; the elements
        before it form the design-matrix row.
        """
        rows = load_loan_defaulters()
        cls.dataset = rows
        cls.design_matrix = [record[:-1] for record in rows]
        cls.target_values = [record[-1] for record in rows]

        model = NaiveBayes(cls.extract_features)
        cls.clf = model
        model.fit(cls.design_matrix, cls.target_values)
Example #35
0
def main():
    """Train an SMS classifier, label the unlabeled texts and save the database.

    Positional command-line arguments: the labeled data file, the
    non-labeled data file and the output database path. 90% of the
    shuffled labeled documents train the classifier and the remaining
    10% are used to print a confusion matrix. Every document whose label
    is ``None`` is then labeled, and ``Database(documents, keywords)`` is
    pickled to the output file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("labeled_data",
                        help="The data file of labeled SMS texts")
    parser.add_argument("non_labeled_data",
                        help="The data file of non-labeled SMS texts")
    parser.add_argument("database_file",
                        help="The path of file to store the database")
    args = parser.parse_args()

    # The output file is still opened before any work is done, as before,
    # so an unwritable path fails early. The with-block closes the handle,
    # which was previously left open.
    with open(args.database_file, "wb") as db_file:
        # Load the labeled dataset
        load_dataset(args.labeled_data, True)

        # Split the labeled dataset into a training set (90%) and a test
        # set (10%) in order to measure classification accuracy
        labeled_documents = list(documents)
        random.shuffle(labeled_documents)
        labeled_count = len(labeled_documents)
        labeled_train_count = int(math.ceil(labeled_count * 0.9))
        labeled_train = labeled_documents[:labeled_train_count]
        labeled_test = labeled_documents[labeled_train_count:]

        # Load the unlabeled dataset
        load_dataset(args.non_labeled_data, False)
        calc_tfidf()

        # Train the classifier
        classifier = NaiveBayes(
            2, [(document.words, document.label)
                for document in labeled_train if document.label is not None])

        # Measure classifier accuracy; the matrix is indexed
        # [real label][predicted label]
        confusion_matrix = [[0, 0], [0, 0]]
        for document in labeled_test:
            predicted_label = classifier.predict(document.words)
            confusion_matrix[document.label][predicted_label] += 1
        # Printed rows are predictions and printed columns are real labels,
        # so the cell under "Real r" in row "Predicted p" is
        # confusion_matrix[r][p]. The off-diagonal cells were swapped before.
        print(
            "Confusion Matrix:\tReal 0\t\tReal 1\n\tPredicted 0\t%d\t\t%d\n\tPredicted 1\t%d\t\t%d\n"
            % (confusion_matrix[0][0], confusion_matrix[1][0],
               confusion_matrix[0][1], confusion_matrix[1][1]))

        # Classify the documents that have no label
        for document in documents:
            if document.label is None:
                document.label = classifier.predict(document.words)

        # Save the search database
        pickle.dump(Database(documents, keywords), db_file)
def train_and_val():
    """Train NaiveBayes on the training indices and score it on the validation indices.

    Prints a 3x3 confusion matrix indexed ``[prediction, correct_class]``,
    followed by the fraction of correct predictions.
    """
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    training_set_indices, validation_set_indices = dp.read_training_val_set('dataset/train.txt', 'dataset/val.txt')
    feature = Features()
    features_labels_pair = feature.simple(training_data)
    training_set = [features_labels_pair[index] for index in training_set_indices]

    # Ambiguous entries are removed from the validation set only; the
    # matching call for the training set is disabled in the original code.
    naive_bayes = NaiveBayes(training_set, 4, False)

    validation_set = [features_labels_pair[index] for index in validation_set_indices]

    dp.remove_ambiguous_entry(validation_set)

    confusion_matrix = np.zeros([3,3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in validation_set:
        prediction = naive_bayes.predict(feature_vector)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Count the pair only when both values equal one of 0, 1, 2. The
        # comparison uses == (not indexing), so values outside that range
        # are skipped as before.
        for predicted in range(3):
            for actual in range(3):
                if prediction == predicted and correct_class == actual:
                    confusion_matrix[predicted, actual] += 1
    # print() with a single argument produces the same output on Python 2
    # and is valid syntax on Python 3.
    print(confusion_matrix)
    print(correct/total)
    def test_save_load(self):
        """A classifier saved to "model" and loaded into a new instance must still exceed 0.55."""
        training_docs, held_out = self.split_blogs_corpus(BlogFeatures)
        original = NaiveBayes()
        original.train(training_docs)
        original.save("model")

        restored = NaiveBayes()
        restored.load("model")
        score = accuracy(restored, held_out)
        self.assertGreater(score, 0.55)
Example #38
0
def cross_validation(corpus, idf):
    """Compare NaiveBayes and KNN over ten cross-validation rounds.

    ``corpus`` is shuffled in place. For each of the ten splits both
    models are trained and their precision, recall and f1 recorded. Per
    metric, the per-round values, their means, the paired differences
    (nb minus knn) and a t value for those differences are printed.
    """
    metric_names = ['precision', 'recall', 'f1']
    nb_results = {name: [] for name in metric_names}
    knn_results = {name: [] for name in metric_names}

    vocab = sorted(idf.keys())

    random.shuffle(corpus)

    for round_number in range(10):
        print('cross validation', round_number)

        training, testing = split_data(corpus, round_number, 10)

        nb = NaiveBayes(training, vocab, 0.1)
        knn = KNN(5, 5)
        knn.fit([doc.vector for doc in training],
                [doc.label for doc in training])

        labels = [doc.label for doc in testing]
        nb_preds = [nb.predict(doc) for doc in testing]
        knn_preds = [knn.predict(doc.vector) for doc in testing]

        # Record the three metrics for each model, NaiveBayes first.
        for store, preds in ((nb_results, nb_preds), (knn_results, knn_preds)):
            scores = model_metrics(labels, preds)
            for value, name in zip(scores, metric_names):
                store[name].append(value)

    for name in metric_names:
        nb_scores = nb_results[name]
        knn_scores = knn_results[name]

        print('nb', name)
        print(nb_scores)
        print(name, 'nb mean', mean(nb_scores))

        print('knn', name)
        print(knn_scores)
        print(name, 'knn mean', mean(knn_scores))

        diff = [a - b for a, b in zip(nb_scores, knn_scores)]
        print(name, 'diff')
        print(diff)

        # Mean difference divided by its standard error
        standard_error = stdev(diff) / len(diff)**0.5
        t = mean(diff) / standard_error
        print(name, 't value:', t)
Example #39
0
    def rebuild_models(self):
        ''' Rebuilds all models over the current labeled datasets.

        When ``self.undersample_first`` is set, the undersampled datasets
        are used instead of ``self.labeled_datasets``. One NB_Model is
        built per training set; all of them share the same labels.
        '''
        datasets = self.labeled_datasets
        if self.undersample_first:
            # print() with a single argument produces the same output on
            # Python 2 and is valid syntax on Python 3.
            print("undersampling before building models..")
            datasets = self.undersample_labeled_datasets()

        all_train_sets, labels = self._datasets_to_matrices(datasets)
        self.models = [NB_Model(NaiveBayes.train(training_set, labels)) for training_set in all_train_sets]
 def test_smoothing(self):
     """Feature/label pairs that never occur together must still get a non-zero probability."""
     # print() with a single argument produces the same output on Python 2
     # and is valid syntax on Python 3, unlike the print statement.
     print("""Test smoothing
     
     Zero counts must not result in zero probability. 
     When turning the counts into probability, some smoothing must be done
     """)

     classifier = NaiveBayes()
     classifier.train(self.training_set)

     cat_index = classifier.label_codebook.get_index('cat')
     dog_index = classifier.label_codebook.get_index('dog')
     purr_index = classifier.feature_codebook.get_index('purr')
     bark_index = classifier.feature_codebook.get_index('bark')

     # The table is indexed as [feature_index, label_index].
     p_x_given_y = classifier.p_x_given_y_table
     self.assertNotEqual(p_x_given_y[bark_index, cat_index], 0)
     self.assertNotEqual(p_x_given_y[purr_index, dog_index], 0)
 def test_probability_tables(self):
     """Check the ordering of entries in the trained probability tables."""
     # print() with a single argument produces the same output on Python 2
     # and is valid syntax on Python 3, unlike the print statement.
     print("""Test probability tables
     
     Regardless of what kind of smoothing you do, the signs have to be right
     """)
     classifier = NaiveBayes()
     classifier.train(self.training_set)

     cat_index = classifier.label_codebook.get_index('cat')
     dog_index = classifier.label_codebook.get_index('dog')
     meow_index = classifier.feature_codebook.get_index('meow')
     bark_index = classifier.feature_codebook.get_index('bark')

     # 'meow' must score higher for cat than for dog, and 'bark' lower.
     p_x_given_y = classifier.p_x_given_y_table
     self.assertGreater(p_x_given_y[meow_index, cat_index], p_x_given_y[meow_index, dog_index])
     self.assertLess(p_x_given_y[bark_index, cat_index], p_x_given_y[bark_index, dog_index])

     # The cat entry of the label table must exceed the dog entry.
     p_y = classifier.p_y_table
     self.assertGreater(p_y[cat_index], p_y[dog_index])
    def test_save_load_blogs_bag(self):
        """Saving to 'trained_model.p' and loading it back must preserve three attributes.

        The attributes compared are ``model``, ``priorCount`` and
        ``countPerFeature``. The ``test`` half of the split is not used.
        """
        training_docs, _ = self.split_blogs_corpus(BagOfWords)
        original = NaiveBayes()
        original.train(training_docs)
        original.save('trained_model.p')

        restored = NaiveBayes()
        restored.load('trained_model.p')

        self.assertEqual(original.model, restored.model)
        self.assertEqual(original.priorCount, restored.priorCount)
        self.assertEqual(original.countPerFeature, restored.countPerFeature)
def train_and_test():
    """Train a NaiveBayes model on the splice data and print its test results.

    Reads the training and test files through ``dp.read_data``, turns both
    into feature sets with ``Features.simple`` and then prints a 3x3
    confusion matrix (row = predicted class, column = correct class)
    followed by the accuracy. With an empty test set the accuracy is
    printed as 0.0 instead of raising ZeroDivisionError.
    """
    training_data = dp.read_data("dataset/splice-Xtrain.dat", "dataset/splice-Ytrain.dat")
    test_data = dp.read_data("dataset/test40.txt", "dataset/ytest40.txt")
    feature = Features()
    training_set = feature.simple(training_data)
    test_set = feature.simple(test_data)

    # dp.remove_ambiguous_entry(training_set)
    naive_bayes = NaiveBayes(training_set, 4, False)

    class_labels = (0, 1, 2)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    # Index-based access is kept so test_set only has to support len() and [].
    for index in range(len(test_set)):
        feature_vector, correct_class = test_set[index]
        prediction = naive_bayes.predict(feature_vector)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Only the labels 0, 1 and 2 have a cell; any other value is left
        # uncounted, as in the former chain of nine comparisons.
        if prediction in class_labels and correct_class in class_labels:
            confusion_matrix[int(prediction), int(correct_class)] += 1
    # Parenthesised single-argument prints behave the same on Python 2 and 3.
    print(confusion_matrix)
    print(correct / total if total else 0.0)
def main(args):
    """Train a NaiveBayes model and then a Perceptron on the restaurant data.

    ``args`` is accepted but not read by this function.
    """
    unused_model_path = "./perc_mod.m"  # assigned as before; never read here

    dataset = Data()
    dataset.read_train_file("./data/restaurant_train.txt")
    dataset.read_test_file("./data/restaurant_test.txt")

    # Both learners are built from, and trained on, the same Data object.
    bayes = NaiveBayes(dataset)
    bayes.train(dataset)

    perceptron = Perceptron(dataset)
    perceptron.train(dataset, 5)
Example #45
0
# In this file we extract features from .txt files. We assume that training
# uses each instance's raw_data, while testing uses each instance's data.
# For convenience when using test_naive_bayes.py, both data and raw_data are
# populated on every instance at the same time.
# coding: utf-8
import nltk
from helper import Alphabet, Instance
from naive_bayes import NaiveBayes
import util
from evaluator import split_train_test
import random
import argparse

#filelist1 = get_ipython().getoutput(u'ls txt_sentoken/neg/')
#filelist2 = get_ipython().getoutput(u'ls txt_sentoken/pos/')
#ins_list = []

# Classifier instance shared by the rest of this script.
nb = NaiveBayes()

# Placeholders; both are overwritten from the command line just below.
ID = limits = None

# Two required positional integers. ID is the feature selection function
# number; both arguments are shown as "N" in the usage text.
parser = argparse.ArgumentParser(
    description='choose certain feature selection fucntion'
)
parser.add_argument('ID', type=int, metavar='N')
parser.add_argument('limits', type=int, metavar='N')
args = parser.parse_args()
ID, limits = args.ID, args.limits

def load_instance(filepath):
	# Lists the entries under `filepath` by running `ls` through IPython and
	# opens each listed name for reading.
	# NOTE(review): only the start of this function is present in this excerpt;
	# nothing is returned and the opened files are not closed in the visible part.
	ins_list=[]
	# NOTE(review): get_ipython() is only available inside an IPython session - confirm this is never run as a plain script.
	filelist = get_ipython().getoutput(u'ls '+filepath)
	for filename in filelist:
		# Plain concatenation: presumably filepath already ends with a path separator - TODO confirm.
		f = open(filepath+filename,'r')
def main():
	"""Command-line driver for the maximum entropy classifier experiments.

	Parses the options, obtains a list of instances (unpickled from
	--instances, or featurized from --datafile), splits it 70% / 20% / 10%
	into training / test / dev, loads or creates a MaxEnt classifier and then
	runs whichever experiment branch --mode matches. Any exception prints the
	usage text and is re-raised.
	"""
	parser = AP.ArgumentParser(description = "A command-line interface for " \
		"the maximum entropy classifier.")
	#NOTE(review): the help text below ends with an unclosed "(default: " - confirm the intended wording
	parser.add_argument("-d", "--datafile", action = "store", default = "blog-gender-dataset.txt", \
		help = "specify the input data file (default: ")
	#no default is given, so args.gpv is None unless -g is passed
	parser.add_argument("-g", "--gaussian_prior", dest = "gpv", action = "store", \
		help = "specify the Gaussian prior variance")
	#the default "train" matches none of the exp.*1 / exp.*2 / exp.*3 patterns tested below
	parser.add_argument("-m", "--mode", dest = "mode", action = "store", default = "train", \
		help = "run as train, train/ test, exp(eriment)1, exp(eriment)2, exp(eriment)3")
	parser.add_argument("-s", "--save", dest = "outfile", action = "store", default = None, \
		help = "specify output file to serialize trained classifier")
	parser.add_argument("-l", "--load", dest = "infile", action = "store", default = None, \
		help = "specify input file to load trained classifier")
	parser.add_argument("-i", "--instances", dest = "instances", action = "store", default = None, \
		help = "load preprocessed instances instead of data")
	parser.add_argument("-f", "--featurefile", dest = "featfile", action = "store", default = None, \
		help = "serialize preprocessed instances")	
	
	args = parser.parse_args() #parse argument structure
	
	#begin running classifier
	try:
		print "Importing data ... "
		if args.instances: #get serialized features
			#NOTE(review): the file object passed to cPickle.load is never explicitly closed
			#NOTE(review): unpickling can run code stored in the file; only load trusted files
			instance_list = cPickle.load(open(args.instances, 'rb'))
			print "Done."
		else: #create features from data
			data_list = import_data(args.datafile)
			print "Done.\nExtracting features ... "
			instance_list = []
			l = len(data_list)
			for i, (label, post) in enumerate(data_list):
				#i is zero-based, so the last message printed is "l-1 of l"
				print "Featurizing string %d of %d ... " % (i, l)
				instance_list.append(Instance(label = label, data = featurize(post)))
			print "Done."
		if args.featfile: #serialize instance_list
			with open(args.featfile, 'wb') as outf:
				cPickle.dump(instance_list, outf)
		#first 70% -> training, next 20% -> test, last 10% -> dev
		piv1 = int(.7 * len(instance_list)) #split training from test
		piv2 = int(.9 * len(instance_list)) #split test from dev
		training, test, dev = instance_list[:piv1], instance_list[piv1:piv2], \
			instance_list[piv2:]
			
		if args.infile: #load a previously trained classifier
			with open(args.infile, 'rb') as inf:
				me_classifier = MaxEnt.from_dict(cPickle.load(inf))
		else: #create a new classifier
			#NOTE(review): this executes the raw -g command-line text as Python source
			#(flagged, not replaced); without -g it runs MaxEnt(None)
			exec('me_classifier = MaxEnt(%s)' % args.gpv)

		#experiment one
		if re.search(r'exp.*1', args.mode):
		
			#training is skipped when a classifier was loaded with -l
			if not args.infile:
				print "Training classifier ... "
				me_classifier.train(training)
				print "Done.\nTesting classification ... "
			#NOTE(review): the same dump to args.outfile is repeated after the mode branches below
			if args.outfile:
				with open(args.outfile, 'wb') as outf:
					cPickle.dump(me_classifier.to_dict(), outf)
		
			#reports on the training set first, then on the test set
			for data in [training, test]:
				test_classifier(me_classifier, data).print_out()
				
		#experiment two; run in batch as for i in {.05,...,numpy.Infinity} ...
		#run with -s $i.classifier
		elif re.search(r'exp.*2', args.mode):
			#for value in [.05, 0.1, .5, 1, 3, 5, 10, numpy.Infinity]:
			#for value in [10, numpy.Infinity]:
			#me_classifier = MaxEnt(value)
			#unlike experiments one and three, this branch always trains, even after -l
			print "Training classifier with Gaussian prior variance %s ..." \
				% str(me_classifier.gaussian_prior_variance)
			me_classifier.train(training)
			print "Done. Testing classifier over dev set ..."
			test_classifier(me_classifier, dev).print_out()
			print "Done. Testing classifier over test set ..."
			test_classifier(me_classifier, test).print_out()
			print "Done.\n\n\n"
			
		#experiment three; run with -l 1.classifier
		elif re.search(r'exp.*3', args.mode):
			if not args.infile:
				print "Training Maximum Entropy classifier ... "
				me_classifier.train(training)
				print "Done."
			#the Naive Bayes classifier is always trained here, on the same training split
			nb_classifier = NaiveBayes()
			print "Training Naive Bayes classifier ... "
			nb_classifier.train(training)
			print "Done.\nTesting Maximum Entropy over test set ... "
			test_classifier(me_classifier, test).print_out()
			print "Done.\nTesting Naive Bayes over test set ... "
			test_classifier(nb_classifier, test).print_out()
			
		#runs for every mode, including the default one that matched no branch above
		if args.outfile: #serialize trained classifier
			with open(args.outfile, 'wb') as outf:
				cPickle.dump(me_classifier.to_dict(), outf)

	#bare except: every exception prints the usage text and is then re-raised unchanged
	except: #something is WROOOONG
		parser.print_help()
		raise
from similarity_calculator import SimilarityCalculator
from naive_bayes import NaiveBayes
import constants
import pickle
import sys
import pdb
from collections import OrderedDict

if __name__ == '__main__':
    # For every line read from standard input, compute the cosine similarity
    # between that line's word counts and the word counts of each category
    # of a previously pickled classifier, and print one line per category.
    sc = SimilarityCalculator()
    # NOTE(review): pickle.load can run code stored in the file; only load a trusted pickle.
    with open(constants.NB_PKL_FILENAME, 'rb') as f:
        nb_classifier = pickle.load(f)

        # Second NaiveBayes object, used below only through train() and word_count.
        nb_input = NaiveBayes()

        # The loop is still inside the `with`, so the pickle file stays open while stdin is read.
        for query in sys.stdin:
            nb_input.word_count = {}  # start from empty counts for every input line
            nb_input.train(query, 'input')  # file the line under the 'input' category
            results = OrderedDict()

            for category in nb_classifier.word_count:
                sim_cos = sc.sim_cos(nb_input.word_count['input'], nb_classifier.word_count[category])
                results[category] = sim_cos

            # The message is Japanese; roughly "similarity with category '%s' is %f".
            for result in results:
                print('カテゴリ「%s」との類以度は %f です' % (result, results[result]))

            best_score_before = 0.0
            best_category = ''
            
            # NOTE(review): the body of the following loop is not part of this excerpt.
            for i, category in enumerate(results):
import os
import pickle
import constants
from naive_bayes import NaiveBayes
import utils

if __name__ == '__main__':
    utils.go_to_fetched_pages_dir()
    pages = utils.load_html_files()
    pkl_nb_path = os.path.join('..', constants.NB_PKL_FILENAME)

    # Start from a fresh classifier unless a NaiveBayes object was already
    # pickled earlier, in which case that one is trained further.
    if not os.path.exists(pkl_nb_path):
        nb = NaiveBayes()
    else:
        with open(pkl_nb_path, 'rb') as f:
            nb = pickle.load(f)

    # Every fetched page body is learned under the same category.
    for page in pages:
        nb.train(page.html_body, constants.QUERY)

    # Having gone to the trouble of training it, save the result.
    with open(pkl_nb_path, 'wb') as f:
        pickle.dump(nb, f)
Example #49
0
from helper import Instance
from naive_bayes import NaiveBayes

# Five hand-made instances: three labelled 'cat' and two labelled 'dog'.
cat1 = Instance(label='cat', data=[1,0], raw_data=['purr' ,'purr' ,'meow'])
cat2 = Instance(label='cat', data=[0,2],raw_data=['meow','woof'])
cat3 = Instance(label='cat', data=[1],raw_data=['purr'])
dog1 = Instance(label='dog', data=[3,2],raw_data=['bark','woof'])
dog2 = Instance(label='dog', data=[2,0],raw_data=['woof','meow'])

training_set = [cat1, cat2, cat3, dog1, dog2]

# Only the counting step is run, by calling _collect_counts directly.
classifier = NaiveBayes()
classifier._collect_counts(training_set)

# Positions of each label and feature, as reported by the two codebooks.
cat_index = classifier.label_codebook.get_index('cat')
dog_index = classifier.label_codebook.get_index('dog') 
purr_index = classifier.feature_codebook.get_index('purr') 
meow_index = classifier.feature_codebook.get_index('meow') 
bark_index = classifier.feature_codebook.get_index('bark') 
woof_index = classifier.feature_codebook.get_index('woof') 

# The bare string below is an expression statement; it has no effect when run as a script.
"""Test counting"""
count_x_y = classifier.count_table
# NOTE(review): the comparisons below discard their results, so nothing is
# checked when this runs as a script (an interactive session would echo
# True/False). Presumably each "(n+1)" is an observed count plus one for
# smoothing - confirm before turning these into assertions.
count_x_y[purr_index, cat_index] == (2+1)
count_x_y[meow_index, cat_index] == (2+1)
count_x_y[bark_index, cat_index] == (0+1)
count_x_y[woof_index, cat_index] == (1+1)

count_x_y[purr_index, dog_index] == (0+1)
count_x_y[meow_index, dog_index] == (1+1)
count_x_y[bark_index, dog_index] == (1+1)
Example #50
0
# Raw test examples as (text, label) pairs, in the shape create_data() unpacks.
test_data_raw = [("dogs like the pool", 0),
                 ("seals like the sea", 1)]

def create_data(data):
    """Split (text, label) pairs into word-count Counters and labels.

    Each text is split on whitespace and counted. Returns a tuple
    ``(counts, labels)`` of two lists in input order.
    """
    # A single pass over `data`, so one-shot iterables behave as before.
    pairs = [(Counter(text.split()), label) for text, label in data]
    counts = [word_counts for word_counts, _ in pairs]
    labels = [tag for _, tag in pairs]
    return (counts, labels)

if __name__ == "__main__":
    # Train on the raw training pairs and show what went in.
    train_data = create_data(train_data_raw)
    print("word_counts: {0}\nlabels: {1}\n".format(train_data[0], train_data[1]))

    NB = NaiveBayes()
    NB.train(train_data)

    # exp() is applied to every stored value before it is printed.
    print("p_label0: {0}\np_label1: {1}\n".format(exp(NB.p_c[0]), exp(NB.p_c[1])))
    label0_p_x_given_c = [
        (word, exp(prob)) for word, prob in NB.p_x_given_c[0].items()
    ]
    label1_p_x_given_c = [
        (word, exp(prob)) for word, prob in NB.p_x_given_c[1].items()
    ]
    print("p_x_given_c_label0: {0}\np_x_given_c_label1: {1}\n".format(
        label0_p_x_given_c, label1_p_x_given_c))

    # Predict the held-out pairs and report against their labels.
    test_data = create_data(test_data_raw)
    predictions = NB.predict(test_data)
    data, labels = test_data
    NB.report(predictions, labels)
Example #51
0
X = pca.transform(X, n_components=5) # Reduce to 5 dimensions


# ..........................
#  TRAIN / TEST SPLIT
# ..........................
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# Rescale label for Adaboost to {-1, 1}
# (2*y - 1 element-wise; presumably y holds 0/1 labels - TODO confirm)
rescaled_y_train = 2*y_train - np.ones(np.shape(y_train))
rescaled_y_test = 2*y_test - np.ones(np.shape(y_test))

# .......
#  SETUP
# .......
adaboost = Adaboost(n_clf = 8)
naive_bayes = NaiveBayes()
knn = KNN(k=4)
logistic_regression = LogisticRegression()
mlp = MultilayerPerceptron(n_hidden=20)
perceptron = Perceptron()
decision_tree = DecisionTree()
random_forest = RandomForest(n_estimators=150)
support_vector_machine = SupportVectorMachine(C=1, kernel=rbf_kernel)

# ........
#  TRAIN
# ........
# Parenthesised single-argument prints write the same text under Python 2
# and are also valid syntax under Python 3.
print("Training:")
print("\tAdaboost")
adaboost.fit(X_train, rescaled_y_train)
print("\tNaive Bayes")

if __name__ == '__main__':
    # Compare this project's NaiveBayes with sklearn's MultinomialNB on the
    # spam data: the last CSV column is used as the target, the rest as features.
    data = np.genfromtxt('data/spam.csv', delimiter=',')

    y = data[:, -1]
    X = data[:, 0:-1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    # Each print takes one pre-formatted string so the output is the same
    # under Python 2 (where print(a, b) would show a tuple) and Python 3.
    print('Train shape: %s' % (X_train.shape,))
    print('Test shape: %s' % (X_test.shape,))

    print('')

    print("My Implementation:")
    my_nb = NaiveBayes()
    my_nb.fit(X_train, y_train)
    print('Accuracy: %s' % (my_nb.score(X_test, y_test),))
    my_predictions = my_nb.predict(X_test)

    print('')

    print("sklearn's Implementation")
    mnb = MultinomialNB()
    mnb.fit(X_train, y_train)
    print('Accuracy: %s' % (mnb.score(X_test, y_test),))
    sklearn_predictions = mnb.predict(X_test)

    # Assert I get the same results as sklearn
    # (will give an error if different)
    assert np.all(sklearn_predictions == my_predictions)
 def test_even_odd(self):
     """Train on 0 (even) and 1 (odd); accuracy over 2..999 must be exactly 1.0."""
     model = NaiveBayes()
     seed_examples = [EvenOdd(0, True), EvenOdd(1, False)]
     model.train(seed_examples)

     # Held-out numbers, each tagged with whether it is even.
     held_out = [EvenOdd(number, number % 2 == 0) for number in range(2, 1000)]
     score = accuracy(model, held_out)
     self.assertEqual(score, 1.0)
 def test_blogs_bag(self):
     """Bag-of-words blog author classification must exceed 0.55 accuracy."""
     training_part, held_out = self.split_blogs_corpus(BagOfWords)
     model = NaiveBayes()
     model.train(training_part)

     score = accuracy(model, held_out)
     self.assertGreater(score, 0.55)
from naive_bayes import NaiveBayes
import constants
import pickle
import sys
import pdb
from collections import OrderedDict


if __name__ == '__main__':
    # NOTE(review): SimCalculator is not among the imports visible directly above - confirm where it is defined.
    sc = SimCalculator()
    # NOTE(review): pickle.load can run code stored in the file; only load a trusted pickle.
    with open(constants.NB_PKL_FILENAME, 'rb') as f:
        nb_classifier = pickle.load(f)

    # A NaiveBayes object is used so that train() and word_count shape the string read from
    # standard input into the form {'input': {'cedar pollen': 4, 'medicine': 3}}.
    # It is not used as a classifier, so a separate class should really be written for this, but that is a hassle.
    nb_input = NaiveBayes()

    for query in sys.stdin:
        nb_input.word_count = {}  # reset for the second and later inputs
        nb_input.train(query, 'input')  # learn the string from standard input as the 'input' category
        results = OrderedDict()
        for category in nb_classifier.word_count:
            # sim_simpson can be used instead of sim_cos
            sim_cos = sc.sim_cos(nb_input.word_count['input'], nb_classifier.word_count[category])
            results[category] = sim_cos

        # The message is Japanese; roughly "similarity with category '%s' is %f".
        for result in results:
            print('カテゴリー「%s」との類似度は %f です' % (result, results[result]))

        # Following http://cointoss.hatenablog.com/entry/2013/10/16/123129 still did not give me the key of the max value
        best_score_before = 0.0
Example #56
0
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when denominator is 0."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator


def _validate(classifier, token_sets, labels):
    """Classify every token set and return (precision, recall, f_score).

    A wrong prediction is counted as both a false positive and a false
    negative, exactly as the inline loops did, so precision and recall come
    out equal. Zero denominators yield 0.0 instead of ZeroDivisionError.
    """
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    # Index-based access is kept so token_sets only has to support len() and [].
    for i in range(len(token_sets)):
        predicted_class = classifier.classify(token_sets[i], N_OF_WORDS, N_OF_COMMAS)
        if VERBOSE:
            print("Predicted " + labels[i] + " as " + predicted_class)
        if predicted_class == labels[i]:
            true_positives += 1
        else:
            false_positives += 1
            false_negatives += 1

    precision = _safe_ratio(true_positives, true_positives + false_positives)
    recall = _safe_ratio(true_positives, true_positives + false_negatives)
    f_score = _safe_ratio(2 * precision * recall, precision + recall)
    return precision, recall, f_score


def main():
    """Prepare the data, train NaiveBayes and print training and test scores.

    Steps: optional clean-up and raw-data split (both skipped when RUN_DIRTY
    is set), reading the label files, tokenizing either set whose token path
    does not exist yet, loading the tokens, training, then printing
    precision, recall and F-score for the training set and for the test set.
    """
    if not RUN_DIRTY:
        clean_up()

    if not RUN_DIRTY and not os.path.exists(TRAINING_PATH) and not os.path.exists(TEST_PATH):
        print("Splicing raw data")
        split_data.split_data(RAW_PATH)

    training_labels_file = TRAINING_PATH + "/_label"
    test_labels_file = TEST_PATH + "/_label"

    print("Reading labels")
    training_labels = file_util.read_line_list(training_labels_file)
    test_labels = file_util.read_line_list(test_labels_file)

    training_tokens_path = tokenizer.get_token_path(TRAINING_PATH)
    test_tokens_path = tokenizer.get_token_path(TEST_PATH)

    print("Tokenizing...")
    if not os.path.exists(training_tokens_path):
        print("Tokenizing training set...")
        tokenizer.tokenize_path(TRAINING_PATH)
        print("Training set tokenization complete")

    if not os.path.exists(test_tokens_path):
        print("Tokenizing test set...")
        tokenizer.tokenize_path(TEST_PATH)
        print("Test set tokenization complete")

    print("Reading tokens")
    training_set_tokens = article_util.load_tokenized_articals(training_tokens_path)
    test_set_tokens = article_util.load_tokenized_articals(test_tokens_path)

    print("Training naive bayes")
    naive_bayes = NaiveBayes(training_set_tokens, training_labels)

    print("Validating with training set")
    training_precisions, training__recall, training_class_f_score = _validate(
        naive_bayes, training_set_tokens, training_labels)

    print("Training Precision " + str(training_precisions))
    print("Training Recall " + str(training__recall))
    print("Training F-Score " + str(training_class_f_score))

    print("*" * 50)

    print("Validating with test set")
    test_precisions, test__recall, test_class_f_score = _validate(
        naive_bayes, test_set_tokens, test_labels)

    print("Test Precision " + str(test_precisions))
    print("Test Recall " + str(test__recall))
    print("Test F-Score " + str(test_class_f_score))