class TestNaiveBayesClassifier(unittest.TestCase):
    """Exercise NaiveBayesClassifier on the shared train_set/test_set fixtures.

    Covers feature extraction, classification, incremental updates, and
    construction from open CSV/JSON/TSV file objects or a custom format.
    """

    def setUp(self):
        # Every test gets its own classifier trained on the shared train_set.
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        # With no custom extractor, features must match basic_extractor.
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text),
                     basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        # Classifying must not change the size of the stored training set.
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        # A pre-tokenized document is accepted as well as a plain string.
        res = self.classifier.classify(
            ["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        # Adding one positive example must raise the positive probability
        # for that text and grow the training set by exactly one item.
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        # Smoke test only: the call must complete without raising.
        self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="csv")
            assert_equal(cl.classify("I feel happy this morning"), 'pos')
            training_sentence = cl.train_set[0][0]
            assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        # The format is expected to be detected from the file contents.
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
            assert_equal(cl.classify("I feel happy this morning"), 'pos')
            training_sentence = cl.train_set[0][0]
            assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="json")
            assert_equal(cl.classify("I feel happy this morning"), 'pos')
            training_sentence = cl.train_set[0][0]
            assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        # The format is expected to be detected from the file contents.
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
            assert_equal(cl.classify("I feel happy this morning"), 'pos')
            training_sentence = cl.train_set[0][0]
            assert_true(isinstance(training_sentence, unicode))

    def test_init_with_custom_format(self):
        redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')]

        class MockRedisFormat(formats.BaseFormat):
            # Minimal stand-in format: accepts extra constructor arguments
            # and always yields the fixed redis_train data.
            def __init__(self, client, port):
                self.client = client
                self.port = port

            @classmethod
            def detect(cls, stream):
                return True

            def to_iterable(self):
                return redis_train

        # NOTE(review): this registration is never undone in this test, so
        # 'redis' stays registered for whatever runs afterwards -- confirm
        # that this is acceptable for the formats registry.
        formats.register('redis', MockRedisFormat)
        mock_redis = mock.Mock()
        cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234)
        assert_equal(cl.train_set, redis_train)

    def test_data_with_no_available_format(self):
        # A stream whose read() returns '' should match no known format.
        mock_fp = mock.Mock()
        mock_fp.read.return_value = ''
        assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp))

    def test_accuracy_on_a_csv_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_true(isinstance(a, float))

    def test_accuracy_on_json_file(self):
        # Must read the JSON fixture; opening CSV_FILE here would merely
        # repeat test_accuracy_on_a_csv_file.
        with open(JSON_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_true(isinstance(a, float))

    def test_init_with_tsv_file(self):
        with open(TSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
            assert_equal(cl.classify("I feel happy this morning"), 'pos')
            training_sentence = cl.train_set[0][0]
            assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
                      lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(
            repr(self.classifier),
            "<NaiveBayesClassifier trained on {0} instances>".format(
                len(train_set)))
# Build the label columns and the test documents.
# Alternative input kept for reference (raw titles):
#train_data = train.Title
train_target = remove_prefix(train.Team, 'UCM ')
test_data = remove_stopwords(test.Title)
# Alternative input kept for reference (raw titles):
#test_data = test.Title
test_target = remove_prefix(test.Team, 'UCM ')

# From here on `train` and `test` are lists of (document, label) pairs.
train = list(zip(train_data, train_target))
test = list(zip(test_data, test_target))

# --- Naive Bayes: train, evaluate, report, and time the whole run ---
start_time = time.time()
cl = NaiveBayesClassifier(train)

# Compute accuracy
print("NaiveBayes Accuracy: {0}".format(cl.accuracy(test)))

# Show 10 most informative features
cl.show_informative_features(10)
print(cl.informative_features(10))

elapsed_time = time.time() - start_time
print(elapsed_time)

# --- Decision tree: same procedure, skipped when ignoreDT is set ---
if not ignoreDT:
    start_time = time.time()
    cl = DecisionTreeClassifier(train)
    print("DecisionTree Accuracy: {0}".format(cl.accuracy(test)))
    print(cl.pseudocode())
    elapsed_time = time.time() - start_time
    print(elapsed_time)

# Restart the timer for the section that follows.
start_time = time.time()

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
class TestNaiveBayesClassifier(unittest.TestCase):
    """Checks NaiveBayesClassifier against the shared fixtures, including
    construction directly from CSV, JSON and TSV file names."""

    def setUp(self):
        # One freshly trained classifier per test.
        self.classifier = NaiveBayesClassifier(train_set)

    def _assert_trained_from_file(self, file_classifier):
        # Shared checks for a classifier built from one of the data files:
        # it labels the sample sentence 'pos' and stores unicode sentences.
        assert_equal(
            file_classifier.classify("I feel happy this morning"), 'pos')
        first_sentence = file_classifier.train_set[0][0]
        assert_true(isinstance(first_sentence, unicode))

    def test_default_extractor(self):
        sample = "I feel happy this morning."
        extracted = self.classifier.extract_features(sample)
        assert_equal(extracted, basic_extractor(sample, train_set))

    def test_classify(self):
        label = self.classifier.classify("I feel happy this morning")
        assert_equal(label, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        words = ["I", "feel", "happy", "this", "morning"]
        assert_equal(self.classifier.classify(words), "positive")

    def test_train_from_lists_of_words(self):
        # Training on tokenized documents must score the same as training
        # on the plain strings.
        tokenized = []
        for document, tag in train_set:
            tokenized.append((document.split(), tag))
        word_classifier = NaiveBayesClassifier(tokenized)
        from_words = word_classifier.accuracy(test_set)
        from_strings = self.classifier.accuracy(test_set)
        assert_equal(from_words, from_strings)

    def test_prob_classify(self):
        dist = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(dist.max(), "positive")
        p_positive = dist.prob("positive")
        p_negative = dist.prob("negative")
        assert_true(p_positive > p_negative)

    def test_accuracy(self):
        assert_true(isinstance(self.classifier.accuracy(test_set), float))

    def test_update(self):
        before = self.classifier.prob_classify("lorem ipsum")
        size_before = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        size_after = len(self.classifier.train_set)
        after = self.classifier.prob_classify("lorem ipsum")
        assert_true(after.prob("positive") > before.prob("positive"))
        assert_equal(size_before + 1, size_after)

    def test_labels(self):
        known = self.classifier.labels()
        for expected in ("positive", "negative"):
            assert_true(expected in known)

    def test_show_informative_features(self):
        # Smoke test: only checks that the call completes.
        self.classifier.show_informative_features()

    def test_informative_features(self):
        top = self.classifier.informative_features(3)
        assert_true(isinstance(top, list))
        assert_true(isinstance(top[0], tuple))

    def test_custom_feature_extractor(self):
        custom = NaiveBayesClassifier(train_set, custom_extractor)
        custom.classify("Yay! I'm so happy it works.")
        first_label = custom.train_features[0][1]
        assert_equal(first_label, 'positive')

    def test_init_with_csv_file(self):
        self._assert_trained_from_file(
            NaiveBayesClassifier(CSV_FILE, format="csv"))

    def test_init_with_csv_file_without_format_specifier(self):
        self._assert_trained_from_file(NaiveBayesClassifier(CSV_FILE))

    def test_init_with_json_file(self):
        self._assert_trained_from_file(
            NaiveBayesClassifier(JSON_FILE, format="json"))

    def test_init_with_json_file_without_format_specifier(self):
        self._assert_trained_from_file(NaiveBayesClassifier(JSON_FILE))

    def test_accuracy_on_a_csv_file(self):
        score = self.classifier.accuracy(CSV_FILE)
        assert_true(isinstance(score, float))

    def test_accuracy_on_json_file(self):
        score = self.classifier.accuracy(JSON_FILE)
        assert_true(isinstance(score, float))

    def test_init_with_tsv_file(self):
        self._assert_trained_from_file(NaiveBayesClassifier(TSV_FILE))

    def test_init_with_bad_format_specifier(self):
        def build_with_unknown_format():
            return NaiveBayesClassifier(CSV_FILE, format='unknown')

        assert_raises(ValueError, build_with_unknown_format)

    def test_repr(self):
        actual = repr(self.classifier)
        expected = "<NaiveBayesClassifier trained on {0} instances>".format(
            len(train_set))
        assert_equal(actual, expected)
class TestNaiveBayesClassifier(unittest.TestCase):
    """Unit tests for NaiveBayesClassifier built from in-memory training
    data and from CSV/JSON/TSV data files given by name."""

    def setUp(self):
        # Each test starts from a classifier trained on train_set.
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        sentence = "I feel happy this morning."
        actual_features = self.classifier.extract_features(sentence)
        expected_features = basic_extractor(sentence, train_set)
        assert_equal(actual_features, expected_features)

    def test_classify(self):
        predicted = self.classifier.classify("I feel happy this morning")
        assert_equal(predicted, 'positive')
        stored_count = len(self.classifier.train_set)
        assert_equal(stored_count, len(train_set))

    def test_classify_a_list_of_words(self):
        predicted = self.classifier.classify(
            ["I", "feel", "happy", "this", "morning"])
        assert_equal(predicted, "positive")

    def test_train_from_lists_of_words(self):
        # Word lists are a valid document representation for training.
        split_train = [(text.split(), tag) for text, tag in train_set]
        split_classifier = NaiveBayesClassifier(split_train)
        assert_equal(split_classifier.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        probabilities = self.classifier.prob_classify(
            "I feel happy this morning")
        assert_equal(probabilities.max(), "positive")
        assert_true(
            probabilities.prob("positive") > probabilities.prob("negative"))

    def test_accuracy(self):
        accuracy = self.classifier.accuracy(test_set)
        assert_true(isinstance(accuracy, float))

    def test_update(self):
        # Probabilities and training-set size are sampled before and after
        # the update so the two can be compared.
        phrase = "lorem ipsum"
        prior = self.classifier.prob_classify(phrase)
        prior_size = len(self.classifier.train_set)
        self.classifier.update([(phrase, "positive")])
        updated_size = len(self.classifier.train_set)
        posterior = self.classifier.prob_classify(phrase)
        assert_true(posterior.prob("positive") > prior.prob("positive"))
        assert_equal(prior_size + 1, updated_size)

    def test_labels(self):
        available = self.classifier.labels()
        for wanted in ("positive", "negative"):
            assert_true(wanted in available)

    def test_show_informative_features(self):
        # Only verifies that the call runs to completion.
        self.classifier.show_informative_features()

    def test_informative_features(self):
        informative = self.classifier.informative_features(3)
        assert_true(isinstance(informative, list))
        head = informative[0]
        assert_true(isinstance(head, tuple))

    def test_custom_feature_extractor(self):
        classifier = NaiveBayesClassifier(train_set, custom_extractor)
        classifier.classify("Yay! I'm so happy it works.")
        assert_equal(classifier.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        classifier = NaiveBayesClassifier(CSV_FILE, format="csv")
        verdict = classifier.classify("I feel happy this morning")
        assert_equal(verdict, 'pos')
        sentence = classifier.train_set[0][0]
        assert_true(isinstance(sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        classifier = NaiveBayesClassifier(CSV_FILE)
        verdict = classifier.classify("I feel happy this morning")
        assert_equal(verdict, 'pos')
        sentence = classifier.train_set[0][0]
        assert_true(isinstance(sentence, unicode))

    def test_init_with_json_file(self):
        classifier = NaiveBayesClassifier(JSON_FILE, format="json")
        verdict = classifier.classify("I feel happy this morning")
        assert_equal(verdict, 'pos')
        sentence = classifier.train_set[0][0]
        assert_true(isinstance(sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        classifier = NaiveBayesClassifier(JSON_FILE)
        verdict = classifier.classify("I feel happy this morning")
        assert_equal(verdict, 'pos')
        sentence = classifier.train_set[0][0]
        assert_true(isinstance(sentence, unicode))

    def test_accuracy_on_a_csv_file(self):
        csv_accuracy = self.classifier.accuracy(CSV_FILE)
        assert_true(isinstance(csv_accuracy, float))

    def test_accuracy_on_json_file(self):
        json_accuracy = self.classifier.accuracy(JSON_FILE)
        assert_true(isinstance(json_accuracy, float))

    def test_init_with_tsv_file(self):
        classifier = NaiveBayesClassifier(TSV_FILE)
        verdict = classifier.classify("I feel happy this morning")
        assert_equal(verdict, 'pos')
        sentence = classifier.train_set[0][0]
        assert_true(isinstance(sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        # An unrecognised format name must be rejected with ValueError.
        assert_raises(ValueError, NaiveBayesClassifier, CSV_FILE,
                      format='unknown')

    def test_repr(self):
        template = "<NaiveBayesClassifier trained on {0} instances>"
        assert_equal(repr(self.classifier),
                     template.format(len(train_set)))