Example #1
    def test_get_data_constructor(self):
        ds = data_sources.RedisDataSource(self.db, 'stanford-corpus',
                                          ['positive', 'negative'])
        c = mock.MagicMock()

        sentiments = ds.get_data(c)
        self.assertEqual(c.call_count, 3)
def main(collection, destination, nfeats, nbigrams, classifier_type):
    LOGGER.info("Started classifier")
    if not destination:
        destination = generate_path_for_classifier(collection, nfeats,
                                                   nbigrams, classifier_type)
    LOGGER.info("Classifier will be saved in: %s" % destination)
    LOGGER.info("Training a %s classifier with %s feats and %s bigrams" %
                (classifier_type, nfeats, nbigrams))

    # Get training data using data source
    LOGGER.info("Building datasource")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    ds = data_sources.RedisDataSource(redis.Redis(), collection,
                                      ['positive', 'negative'])
    phrases = ds.get_data(make_phrase)

    # Initialize the Text Processor, get bigrams and informative features
    LOGGER.info("Building text processor")
    processor = phrase.TextProcessor(phrases, FORMATTER)

    # Train the classifier using the Text Processor
    meta = {'train_corpus': collection, 'classifier_type': classifier_type}
    LOGGER.info("Training Classifier")
    classifier = processor.train_classifier(FORMATTER, nbigrams, nfeats, meta)

    # Serialize the classifier
    LOGGER.info("Serializing classifier")
    if not os.path.exists(destination):
        os.makedirs(destination)
    classifier.serialize(destination)
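
The training entry point above receives its parameters as plain function arguments; the command-line wiring is not part of this excerpt. Below is a minimal sketch of one possible wrapper using argparse. The flag names and default values are assumptions chosen for illustration, not taken from the original project.

import argparse

def parse_args():
    # Hypothetical CLI wrapper for main(); flag names and defaults are assumed.
    parser = argparse.ArgumentParser(description="Train a sentiment classifier")
    parser.add_argument("collection",
                        help="Redis collection holding the training corpus")
    parser.add_argument("--destination", default=None,
                        help="directory to serialize the classifier into")
    parser.add_argument("--nfeats", type=int, default=10000,
                        help="number of informative features to keep")
    parser.add_argument("--nbigrams", type=int, default=5000,
                        help="number of bigram collocations to include")
    parser.add_argument("--classifier-type", default="naivebayes",
                        help="type of classifier to train")
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()
    main(args.collection, args.destination, args.nfeats,
         args.nbigrams, args.classifier_type)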
Example #3
    def test_get_data(self):
        ds = data_sources.RedisDataSource(self.db, 'stanford-corpus',
                                          ['positive', 'negative'])

        sentiments = ds.get_data()
        self.assertEqual(len(sentiments.keys()), 2)
        self.assertEqual(sentiments['positive'][0], 'I am positive')
        self.assertEqual(sentiments['negative'][0], 'I am negative')
def main(path, against, nodb):
    LOGGER.info("Started testing")

    LOGGER.info("Loading classifier")
    classifier = phrase.TrainedClassifier.load(path, FORMATTER)

    LOGGER.info("Loading testing data")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    ds = data_sources.RedisDataSource(redis.Redis(), against,
                                      ['positive', 'negative'])
    data = ds.get_data(make_phrase)

    LOGGER.info("Making testing data")
    test_data = []
    for sentiment, phrases in data.iteritems():
        for p in phrases:
            test_data.append((p, sentiment))

    try:
        classifier.show_most_informative_features()
    except AttributeError:
        pass
    accuracy = nltk.classify.util.accuracy(classifier, test_data)
    LOGGER.info("Accuracy is: %s" % accuracy)

    if not nodb:
        conn = pymongo.Connection()
        db = conn['worldmood']
        coll = db['statistics']

        s = classifier.meta
        s['accuracy'] = accuracy
        s['test_corpus'] = against

        coll.update({ 'uid': classifier.get_uid() }, s, upsert=True)
        LOGGER.info("Updated collection database: %s" % s)
Example #5
    def test_it_initializes(self):
        ds = data_sources.RedisDataSource(self.db, 'stanford-corpus',
                                          ['positive', 'negative'])
        self.assertEqual(len(ds.get_classes()), 2)
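
All three tests rely on a self.db fixture seeded with a small 'stanford-corpus'. The original setUp is not shown; a sketch that would satisfy the assertions, using fakeredis and the same assumed '<collection>:<class>' key layout as the sketch above, could look like this (the class name is hypothetical). The fixture contents, three items in total so that the mocked constructor sees call_count == 3, are inferred from the assertions; the extra positive text is invented.

import unittest

import fakeredis

class RedisDataSourceTest(unittest.TestCase):
    def setUp(self):
        # In-memory Redis substitute; decode_responses keeps values as str.
        self.db = fakeredis.FakeRedis(decode_responses=True)
        # Assumed key layout: one list per class, three items in total so
        # that test_get_data_constructor's call_count assertion holds.
        self.db.rpush('stanford-corpus:positive', 'I am positive', 'I am happy')
        self.db.rpush('stanford-corpus:negative', 'I am negative')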