Example #1
from textblob.classifiers import NaiveBayesClassifier


class Model(object):
    """Thin wrapper around a TextBlob NaiveBayesClassifier."""
    def __init__(self, name='Guess', config=None):
        self.name = name
        self.config = config if config is not None else {}  # avoid a shared mutable default
        self.clf = NaiveBayesClassifier([])

    def train(self, training_data):
        # training_data: iterable of dicts with 'text' and 'label' keys
        safe_training = [(example.get('text'), example.get('label'))
                         for example in training_data]
        self.clf.update(safe_training)

    def evaluate(self, text):
        label = self.clf.classify(text)
        prob_dist = self.clf.prob_classify(text)
        label_prob = prob_dist.prob(label)
        return label, label_prob

    def get_classes(self):
        return self.clf.labels()

    def save(self):
        pass

    def load(self):
        pass
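
A minimal usage sketch for the wrapper above; train expects an iterable of dicts with 'text' and 'label' keys, and the sample sentences, labels, and probabilities below are made up for illustration:

# hypothetical usage of the Model wrapper defined above
model = Model(name='sentiment')
model.train([
    {'text': 'I love this library', 'label': 'pos'},
    {'text': 'this is terrible', 'label': 'neg'},
])
print(model.evaluate('I love it'))   # e.g. ('pos', 0.7...)
print(model.get_classes())           # e.g. ['pos', 'neg']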
Example #2
import json

from textblob.classifiers import NaiveBayesClassifier


class Classifier:

  def __init__(self):
    # Train from a CSV file of "text,label" rows.
    with open("./data/train.csv") as fp:
      self.cl = NaiveBayesClassifier(fp, format="csv")

  def test(self):
    return self.cl.classify("This is a test sentence")

  def classify(self, text):
    return self.cl.classify(text)

  def n_classify(self, text):
    dist = self.cl.prob_classify(text)

    probs = {"sentiments": []}
    for s in dist.samples():
      if dist.prob(s) >= .10:
        probs["sentiments"].append({s: dist.prob(s)})

    return json.dumps(probs)

  def accuracy(self):
    # Evaluate against the training file and a held-out test file.
    with open('./data/train.csv') as fp:
      train_accuracy = self.cl.accuracy(fp, format="csv")
    with open('./data/test.csv') as fp:
      test_accuracy = self.cl.accuracy(fp, format="csv")
    return json.dumps({"train_accuracy": train_accuracy, "test_accuracy": test_accuracy})

  def labels(self):
    return json.dumps({"labels": self.cl.labels()})
Example #3
# confusion_matrix is assumed to come from scikit-learn, matching its (y_true, y_pred) usage below
from sklearn.metrics import confusion_matrix
from textblob.classifiers import NaiveBayesClassifier


def run_test(train, test, name):
   print("Training...")
   cll = NaiveBayesClassifier(train)
   print("Done training\n")
   accuracy = cll.accuracy(test)
   print("Accuracy: " + str(accuracy))

   # get matching lists of predicted and true labels
   pred_labels = list()
   true_labels = list()
   for obj in test:
      prob_label = cll.prob_classify(obj[0]).max()
      true_label = obj[1]
      true_labels.append(true_label)
      pred_labels.append(prob_label)

   # transform our labels to numbers
   labels = cll.labels()
   label_num = {label: i for i, label in enumerate(labels)}

   # match our predicted and true labels with the number representations
   true_label_nums = list()
   pred_label_nums = list()
   for true_l, pred_l in zip(true_labels, pred_labels):
      true_label_nums.append(label_num[true_l])
      pred_label_nums.append(label_num[pred_l])

   cm = confusion_matrix(true_label_nums, pred_label_nums)
   print(cm)
   print("\n")

   with open("test_results.txt", "a") as tr:
      tr.write(str(name) + "\n")
      tr.write(str(accuracy) + "\n")
      tr.write(str(cm))
      tr.write("\n\n")

   import matplotlib.pyplot as plt
   fig = plt.figure()
   ax = fig.add_subplot(111)
   cax = ax.matshow(cm)
   plt.title("Confusion Matrix For "+name)
   fig.colorbar(cax)
   ax.set_xticklabels(['']+labels)
   ax.set_yticklabels(['']+labels)
   plt.xlabel("Predicted")
   plt.ylabel("True")
   plt.savefig('plots/'+name+'.pdf', bbox_inches='tight') 
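
A usage sketch with a tiny hand-made train/test split; the sentences, labels, and the plots/ output directory are assumptions (the directory must exist before savefig runs):

import os

train = [('I love this product', 'pos'), ('This is awful', 'neg'),
         ('Really great value', 'pos'), ('Worst purchase ever', 'neg')]
test = [('great product', 'pos'), ('awful purchase', 'neg')]

os.makedirs('plots', exist_ok=True)  # run_test writes plots/<name>.pdf
run_test(train, test, 'smoke_test')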
Example #4
class TestNaiveBayesClassifier(unittest.TestCase):

    def setUp(self):
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text), basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                        self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        feats = self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        with open(CSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        with open(JSON_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_custom_format(self):
        redis_train = [('I like turtles', 'pos'), ('I hate turtles', 'neg')]

        class MockRedisFormat(formats.BaseFormat):
            def __init__(self, client, port):
                self.client = client
                self.port = port

            @classmethod
            def detect(cls, stream):
                return True

            def to_iterable(self):
                return redis_train

        formats.register('redis', MockRedisFormat)
        mock_redis = mock.Mock()
        cl = NaiveBayesClassifier(mock_redis, format='redis', port=1234)
        assert_equal(cl.train_set, redis_train)

    def test_data_with_no_available_format(self):
        mock_fp = mock.Mock()
        mock_fp.read.return_value = ''

        assert_raises(FormatError, lambda: NaiveBayesClassifier(mock_fp))

    def test_accuracy_on_a_csv_file(self):
        with open(CSV_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_accuracy_on_json_file(self):
        with open(JSON_FILE) as fp:
            a = self.classifier.accuracy(fp)
        assert_equal(type(a), float)

    def test_init_with_tsv_file(self):
        with open(TSV_FILE) as fp:
            cl = NaiveBayesClassifier(fp)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
            lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(repr(self.classifier),
            "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set)))
Example #5
class TestNaiveBayesClassifier(unittest.TestCase):
    def setUp(self):
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text),
                     basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(
            ["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                     self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        feats = self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        cl = NaiveBayesClassifier(CSV_FILE, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        cl = NaiveBayesClassifier(CSV_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        cl = NaiveBayesClassifier(JSON_FILE, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        cl = NaiveBayesClassifier(JSON_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_accuracy_on_a_csv_file(self):
        a = self.classifier.accuracy(CSV_FILE)
        assert_true(isinstance(a, float))

    def test_accuracy_on_json_file(self):
        a = self.classifier.accuracy(JSON_FILE)
        assert_true(isinstance(a, float))

    def test_init_with_tsv_file(self):
        cl = NaiveBayesClassifier(TSV_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
                      lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(
            repr(self.classifier),
            "<NaiveBayesClassifier trained on {0} instances>".format(
                len(train_set)))
Example #6
from textblob.classifiers import NaiveBayesClassifier

# the experience_utterances and environment_utterances tuples are truncated from this excerpt
working_on_utterances = ('what are you working on',
                         'what you making')

experience_utterances = [(x, 'experience') for x in experience_utterances]
environment_utterances = [(x, 'environment') for x in environment_utterances]
working_on_utterances = [(x, 'working') for x in working_on_utterances]

# FIXME: find better way to flatten lists together
training_set = []
training_set.extend(experience_utterances)
training_set.extend(environment_utterances)
training_set.extend(working_on_utterances)


classifier = NaiveBayesClassifier(training_set)
print(classifier.show_informative_features(), classifier.labels())

bogus_utterances = (
        'if you going to use nltk u may want to check this out spacy .io',
        'sup people? I see the weather\'s getting better over there, Ben.',
        'i had the same problem your having so thats my i made my own.',
        'try http, instead of https'
        )

# TODO: Figure out how to make this stronger
dual_utterance = ('how long have you been coding and what IDE do you use',)

test_utterances = ('what are you making',
                   'hey that nyancat is cool, how do you get that?')

for t in test_utterances:
    prob_dist = classifier.prob_classify(t)
Example #7
#
# Review
#
from textblob.classifiers import NaiveBayesClassifier

# training data: 'Eu sou lindo' = 'I am beautiful' (A); 'Eu sou feio' = 'I am ugly' (B)
treinamento = [('Eu sou lindo', 'A'), ('Eu sou feio', 'B')]

c = NaiveBayesClassifier(treinamento)

print(c.classify('seu feio'))       # B  ('you ugly')
print(c.classify('seu horroroso'))  # B  ('you hideous')
print(c.classify('seu bonito'))     # B  ('you handsome')
print(c.classify('seu lindo'))      # A  ('you beautiful')

print('labels:', c.labels())  # labels: ['A', 'B']

test = [('Voce e muito gato', 'A'), ('Voce e muito feio', 'B')]  # 'You are very good-looking' / 'You are very ugly'

print('accuracy:', c.accuracy(test))  # accuracy: 0.5

test = [('Voce e muito lindo', 'A'), ('Voce e muito feio', 'B')]  # 'You are very beautiful' / 'You are very ugly'

print('accuracy:', c.accuracy(test))  # accuracy: 1.0

print('features:', c.extract_features('Eu sou horroroso'))
# features: {'contains(Eu)': True, 'contains(sou)': True, 'contains(lindo)': False, 'contains(feio)': False}

c.show_informative_features()
# Most Informative Features
#            contains(sou) = True                B : A      =      1.0 : 1.0
#             contains(Eu) = True                B : A      =      1.0 : 1.0
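
To see how confident the classifier is, the probability distribution can be inspected as well; a short sketch using the same prob_classify API shown elsewhere on this page (the rounded values are illustrative):

dist = c.prob_classify('seu lindo')
print(dist.max())                                          # A
print(round(dist.prob('A'), 2), round(dist.prob('B'), 2))  # e.g. 0.75 0.25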
Example #8
from textblob.classifiers import NaiveBayesClassifier

# the experience_utterances and environment_utterances tuples are truncated from this excerpt
working_on_utterances = ('what are you doing',
                         'what are you doing at the moment',
                         'what are you working on', 'what you making')

experience_utterances = [(x, 'experience') for x in experience_utterances]
environment_utterances = [(x, 'environment') for x in environment_utterances]
working_on_utterances = [(x, 'working') for x in working_on_utterances]

# FIXME: find better way to flatten lists together
training_set = []
training_set.extend(experience_utterances)
training_set.extend(environment_utterances)
training_set.extend(working_on_utterances)

classifier = NaiveBayesClassifier(training_set)
print(classifier.show_informative_features(), classifier.labels())

bogus_utterances = (
    'if you going to use nltk u may want to check this out spacy .io',
    'sup people? I see the weather\'s getting better over there, Ben.',
    'i had the same problem your having so thats my i made my own.',
    'try http, instead of https')

# TODO: Figure out how to make this stronger
dual_utterance = ('how long have you been coding and what IDE do you use', )

test_utterances = ('what are you making',
                   'hey that nyancat is cool, how do you get that?')

for t in test_utterances:
    prob_dist = classifier.prob_classify(t)
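
The loop above ends with the probability distribution unused; a sketch of how it might be reported (the print format is an assumption):

for t in test_utterances:
    prob_dist = classifier.prob_classify(t)
    best = prob_dist.max()
    print(t, '->', best, round(prob_dist.prob(best), 3))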
Example #9
class TestNaiveBayesClassifier(unittest.TestCase):

    def setUp(self):
        self.classifier = NaiveBayesClassifier(train_set)

    def test_default_extractor(self):
        text = "I feel happy this morning."
        assert_equal(self.classifier.extract_features(text), basic_extractor(text, train_set))

    def test_classify(self):
        res = self.classifier.classify("I feel happy this morning")
        assert_equal(res, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_classify_a_list_of_words(self):
        res = self.classifier.classify(["I", "feel", "happy", "this", "morning"])
        assert_equal(res, "positive")

    def test_train_from_lists_of_words(self):
        # classifier can be trained on lists of words instead of strings
        train = [(doc.split(), label) for doc, label in train_set]
        classifier = NaiveBayesClassifier(train)
        assert_equal(classifier.accuracy(test_set),
                        self.classifier.accuracy(test_set))

    def test_prob_classify(self):
        res = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(res.max(), "positive")
        assert_true(res.prob("positive") > res.prob("negative"))

    def test_accuracy(self):
        acc = self.classifier.accuracy(test_set)
        assert_true(isinstance(acc, float))

    def test_update(self):
        res1 = self.classifier.prob_classify("lorem ipsum")
        original_length = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        new_length = len(self.classifier.train_set)
        res2 = self.classifier.prob_classify("lorem ipsum")
        assert_true(res2.prob("positive") > res1.prob("positive"))
        assert_equal(original_length + 1, new_length)

    def test_labels(self):
        labels = self.classifier.labels()
        assert_true("positive" in labels)
        assert_true("negative" in labels)

    def test_show_informative_features(self):
        feats = self.classifier.show_informative_features()

    def test_informative_features(self):
        feats = self.classifier.informative_features(3)
        assert_true(isinstance(feats, list))
        assert_true(isinstance(feats[0], tuple))

    def test_custom_feature_extractor(self):
        cl = NaiveBayesClassifier(train_set, custom_extractor)
        cl.classify("Yay! I'm so happy it works.")
        assert_equal(cl.train_features[0][1], 'positive')

    def test_init_with_csv_file(self):
        cl = NaiveBayesClassifier(CSV_FILE, format="csv")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_csv_file_without_format_specifier(self):
        cl = NaiveBayesClassifier(CSV_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file(self):
        cl = NaiveBayesClassifier(JSON_FILE, format="json")
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_json_file_without_format_specifier(self):
        cl = NaiveBayesClassifier(JSON_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_accuracy_on_a_csv_file(self):
        a = self.classifier.accuracy(CSV_FILE)
        assert_true(isinstance(a, float))

    def test_accuracy_on_json_file(self):
        a = self.classifier.accuracy(JSON_FILE)
        assert_true(isinstance(a, float))

    def test_init_with_tsv_file(self):
        cl = NaiveBayesClassifier(TSV_FILE)
        assert_equal(cl.classify("I feel happy this morning"), 'pos')
        training_sentence = cl.train_set[0][0]
        assert_true(isinstance(training_sentence, unicode))

    def test_init_with_bad_format_specifier(self):
        assert_raises(ValueError,
            lambda: NaiveBayesClassifier(CSV_FILE, format='unknown'))

    def test_repr(self):
        assert_equal(repr(self.classifier),
            "<NaiveBayesClassifier trained on {0} instances>".format(len(train_set)))
Example #10
         ("What hunger in this poor town", 'y'),
         ('I do not like this poor weather', 'y'), ('I do have money.', 'y'),
         ("The weather is so warm", 'n'), ('The food was lacking', 'n'),
         ('My neighbor is rich.', 'n')]
test = [('The poor suffer from lack of money.', 'y'),
        ('The weather is getting hot', 'n'),
        ("the lack of food is overwhelming.", 'n'),
        ("I don't have money", 'y'), ('Gary does not have any food', 'n'),
        ("Global climate weather change is happening.", 'n')]

cl = NaiveBayesClassifier(train)

# Classify some text
print(cl.classify("Their village was so poor"))  # "y"
#print(cl.classify("I have my sandals"))   # "n"
print('labels : ', cl.labels())

# See the probability of each label
prob_dist = cl.prob_classify("Their village was so poor")
print(prob_dist.max())      # the highest-probability label
print(prob_dist.prob("n"))  # probability of "n" (no)
print(prob_dist.prob("y"))  # probability of "y" (yes)

# Classify a TextBlob
# blob = TextBlob("The poor weather was increasing in temperature. There were people without food. "
#                 "My family did not have any money, they were very poor.", classifier=cl)
# print(blob)
# print(blob.classify())
#
# for sentence in blob.sentences:
#     print(sentence)
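
The commented-out block above hints at classifying a whole TextBlob and its sentences; a runnable sketch of that idea, reusing the classifier cl from this example (assumes the textblob package is installed):

from textblob import TextBlob

blob = TextBlob("The poor weather was increasing in temperature. "
                "There were people without food. "
                "My family did not have any money, they were very poor.",
                classifier=cl)
print(blob.classify())  # label for the whole text, e.g. "y"
for sentence in blob.sentences:
    print(sentence, '->', sentence.classify())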