Example 1
    def _build_prob_dist(self, fd, cfd):
        LOGGER.info("Building frequency distribution")
        # fd counts every formatted word; cfd keys the same counts by
        # sentiment (NLTK 2's FreqDist.inc API).
        for word, sentiment in self.phrases_it.iterate_formatted_words(
                self.formatter):
            fd.inc(word)
            cfd[sentiment].inc(word)
        return fd, cfd
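FreqDist.inc and the per-condition inc calls are the NLTK 2 counting API; NLTK 3 removed inc and made FreqDist a collections.Counter subclass. A minimal sketch of the equivalent loop on NLTK 3, assuming the same iterator:

# NLTK 3 style: FreqDist supports Counter-style increments.
for word, sentiment in phrases_it.iterate_formatted_words(formatter):
    fd[word] += 1
    cfd[sentiment][word] += 1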
Example 2
    def train_classifier(self, formatter, n_bigrams, n_feats, meta=None):
        # Avoid a mutable default argument: a shared default dict would
        # leak metadata between calls.
        if meta is None:
            meta = {}
        freq_dist, cond_freq_dist = self._build_prob_dist(
            FreqDist(), ConditionalFreqDist())
        feats = self._get_most_informative_features(n_feats, freq_dist,
                                                    cond_freq_dist)
        bigrams = self.get_bigram_analyzer(n_bigrams, freq_dist.iterkeys())

        meta['n_bigrams'] = n_bigrams
        meta['n_feats'] = n_feats

        LOGGER.info("Building TrainedClassifier")
        return TrainedClassifier(formatter, bigrams, feats, meta,
                                 phrases_iterator=self.phrases_it)
Example 3
    def train_classifier(self, formatter, n_bigrams, n_feats, meta=None):
        if meta is None:
            meta = {}
        freq_dist, cond_freq_dist = self._build_prob_dist(
            FreqDist(), ConditionalFreqDist())
        feats = self._get_most_informative_features(n_feats, freq_dist,
                                                    cond_freq_dist)
        bigrams = self.get_bigram_analyzer(n_bigrams, freq_dist.iterkeys())

        meta['n_bigrams'] = n_bigrams
        meta['n_feats'] = n_feats

        LOGGER.info("Building TrainedClassifier")
        return TrainedClassifier(formatter,
                                 bigrams,
                                 feats,
                                 meta,
                                 phrases_iterator=self.phrases_it)
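The meta=None / if meta is None idiom in train_classifier sidesteps Python's mutable-default pitfall: a default dict would be created once at definition time and shared, and mutated, by every call. A minimal illustration:

def remember(item, seen=[]):
    # The default list is built once and shared across calls.
    seen.append(item)
    return seen

remember('a')   # ['a']
remember('b')   # ['a', 'b'] -- state leaked from the first call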
Example 4
    def _get_most_informative_features(self, nfeats, freq_dist,
                                       cond_freq_dist):

        LOGGER.info("Getting most informative features")

        LOGGER.info("Building Heap")
        # Min-heap of (score, word) pairs; heap[0] is always the weakest
        # feature kept so far.
        heap = []

        for word, total_freq in freq_dist.iteritems():
            # Accumulate the full chi-squared score across all sentiments
            # before touching the heap.
            score = 0
            for sentiment in self._get_class_sentiments():
                score += BigramAssocMeasures.chi_sq(
                    cond_freq_dist[sentiment][word],
                    (total_freq, cond_freq_dist[sentiment].N()), freq_dist.N())

            if len(heap) < nfeats:
                heapq.heappush(heap, (score, word))
            elif score > heap[0][0]:
                heapq.heapreplace(heap, (score, word))
                LOGGER.info("Smallest score has increased to: %s" %
                            heap[0][0])

        # Popping yields ascending scores; reverse so the strongest
        # features come first.
        sorted_res = []
        while heap:
            score, word = heapq.heappop(heap)
            sorted_res.append(word)
        sorted_res.reverse()
        return sorted_res
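The manual min-heap keeps the nfeats highest-scoring words in a single pass over the vocabulary. The standard library's heapq.nlargest performs the same top-N selection in one call; a sketch with made-up scores:

import heapq

scores = [('great', 41.2), ('the', 0.3), ('awful', 38.9), ('a', 0.1)]
top = heapq.nlargest(2, scores, key=lambda pair: pair[1])
print([word for word, _ in top])  # ['great', 'awful']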
Example 5
    def _get_most_informative_features(self, nfeats,
                                       freq_dist, cond_freq_dist):

        LOGGER.info("Getting most informative features")

        LOGGER.info("Building Heap")
        # Min-heap of (score, word) pairs; heap[0] is always the weakest
        # feature kept so far.
        heap = []

        for word, total_freq in freq_dist.iteritems():
            # Accumulate the full chi-squared score across all sentiments
            # before touching the heap.
            score = 0
            for sentiment in self._get_class_sentiments():
                score += BigramAssocMeasures.chi_sq(
                    cond_freq_dist[sentiment][word],
                    (total_freq, cond_freq_dist[sentiment].N()),
                    freq_dist.N()
                )

            if len(heap) < nfeats:
                heapq.heappush(heap, (score, word))
            elif score > heap[0][0]:
                heapq.heapreplace(heap, (score, word))
                LOGGER.info("Smallest score has increased to: %s" % heap[0][0])

        # Popping yields ascending scores; reverse so the strongest
        # features come first.
        sorted_res = []
        while heap:
            score, word = heapq.heappop(heap)
            sorted_res.append(word)
        sorted_res.reverse()
        return sorted_res
Example 6
def main(path):
    LOGGER.info("Started worker")

    LOGGER.info("Loading classifier")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    classifier = phrase.TrainedClassifier.load(path, FORMATTER)

    LOGGER.info("Ready and waiting for work")
    socket = get_zmq_socket(DEFAULT_ADDRESS)
    while True:
        try:
            message = socket.recv()
            data = json.loads(message)
            p = make_phrase(data['text'])

            result = prob_dist_to_dict(classifier.prob_classify(p))
            if not result:
                socket.send('')
                continue

            LOGGER.info("[%s] %s" % (result['result'], data['text']))
            data['prediction'] = result
            socket.send(json.dumps(data))

        except zmq.error.ZMQError:
            # Assuming a REP socket, every recv must be answered, so an
            # empty reply is sent even when handling fails.
            LOGGER.error("Trying to recover from ZMQError crash, sending NIL")
            socket.send('')

        except Exception as e:
            LOGGER.error(e)
Example 7
def main(path):
    LOGGER.info("Started worker")

    LOGGER.info("Loading classifier")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    classifier = phrase.TrainedClassifier.load(path, FORMATTER)

    LOGGER.info("Ready and waiting for work")
    socket = get_zmq_socket(DEFAULT_ADDRESS)
    while True:
        try:
            message = socket.recv()
            data = json.loads(message)
            p = make_phrase(data["text"])

            result = prob_dist_to_dict(classifier.prob_classify(p))
            if not result:
                socket.send("")
                continue

            LOGGER.info("[%s] %s" % (result["result"], data["text"]))
            data["prediction"] = result
            socket.send(json.dumps(data))

        except zmq.error.ZMQError:
            LOGGER.error("Trying to recover from ZMQError crash, sending NIL")
            socket.send("")

        except Exception as e:
            LOGGER.error(e)
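Neither worker shows get_zmq_socket. Given the strict recv/send alternation in the loop, including the empty replies on every error path, it plausibly returns a bound REP socket; a minimal sketch under that assumption:

import zmq

def get_zmq_socket(address):
    # Hypothetical reconstruction; the real helper is not in this listing.
    context = zmq.Context.instance()
    socket = context.socket(zmq.REP)
    socket.bind(address)
    return socket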
Example 8
def main(path, against, nodb):
    LOGGER.info("Started testing")

    LOGGER.info("Loading classifier")
    classifier = phrase.TrainedClassifier.load(path, FORMATTER)

    LOGGER.info("Loading testing data")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    ds = data_sources.RedisDataSource(redis.Redis(), against,
                                      ['positive', 'negative'])
    data = ds.get_data(make_phrase)

    LOGGER.info("Making testing data")
    test_data = []
    for sentiment, phrases in data.iteritems():
        for p in phrases:
            test_data.append((p, sentiment))

    # Not every classifier type exposes this helper, so skip it if absent.
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        pass
    accuracy = nltk.classify.util.accuracy(classifier, test_data)
    LOGGER.info("Accuracy is: %s" % accuracy)

    if not nodb:
        conn = pymongo.Connection()
        db = conn['worldmood']
        coll = db['statistics']

        s = classifier.meta
        s['accuracy'] = accuracy
        s['test_corpus'] = against

        coll.update({'uid': classifier.get_uid()}, s, upsert=True)
        LOGGER.info("Updated collection database: %s" % s)
Example 9
    def get_bigram_analyzer(self, n, words):
        LOGGER.info("Building Bigram Analyzer")
        bigram_measures = collocations.BigramAssocMeasures()
        finder = collocations.BigramCollocationFinder.from_words(words)
        return BigramAnalyzer(
            finder.above_score(bigram_measures.likelihood_ratio, n))
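above_score yields every bigram whose association score clears the threshold, so n here is a minimum likelihood-ratio score rather than a bigram count. A standalone sketch of the same NLTK calls:

from nltk import collocations

words = "the quick brown fox and the quick brown dog".split()
measures = collocations.BigramAssocMeasures()
finder = collocations.BigramCollocationFinder.from_words(words)
# Generator of (w1, w2) pairs scoring above 3.0 on likelihood ratio.
for bigram in finder.above_score(measures.likelihood_ratio, 3.0):
    print(bigram)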
Example 10
def main(collection, destination, nfeats, nbigrams, classifier_type):
    LOGGER.info("Started classifier")
    if not destination:
        destination = generate_path_for_classifier(collection, nfeats,
                                                   nbigrams, classifier_type)
    LOGGER.info("Classifier will be saved in: %s" % destination)
    LOGGER.info("Training a %s classifier with %s feats and %s bigrams" % (classifier_type, nfeats, nbigrams))

    # Get training data using data source
    LOGGER.info("Building datasource")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    ds = data_sources.RedisDataSource(redis.Redis(), collection,
                                      ['positive', 'negative'])
    phrases = ds.get_data(make_phrase)

    # Initialize the Text Processor, get bigrams and informative features
    LOGGER.info("Building text processor")
    processor = phrase.TextProcessor(phrases, FORMATTER)

    # Train the classifier using the Text Processor
    meta = {
        'train_corpus': collection,
        'classifier_type': classifier_type
    }
    LOGGER.info("Training Classifier")
    classifier = processor.train_classifier(FORMATTER, nbigrams, nfeats, meta)

    # Serialize the classifier
    LOGGER.info("Serializing classifier")
    if not os.path.exists(destination):
        os.makedirs(destination)
    classifier.serialize(destination)
Example 11
def main(collection, destination, nfeats, nbigrams, classifier_type):
    LOGGER.info("Started classifier")
    if not destination:
        destination = generate_path_for_classifier(collection, nfeats,
                                                   nbigrams, classifier_type)
    LOGGER.info("Classifier will be saved in: %s" % destination)
    LOGGER.info("Training a %s classifier with %s feats and %s bigrams" %
                (classifier_type, nfeats, nbigrams))

    # Get training data using data source
    LOGGER.info("Building datasource")
    make_phrase = functools.partial(phrase.Phrase, tokenizer=TOKENIZER)
    ds = data_sources.RedisDataSource(redis.Redis(), collection,
                                      ['positive', 'negative'])
    phrases = ds.get_data(make_phrase)

    # Initialize the Text Processor, get bigrams and informative features
    LOGGER.info("Building text processor")
    processor = phrase.TextProcessor(phrases, FORMATTER)

    # Train the classifier using the Text Processor
    meta = {'train_corpus': collection, 'classifier_type': classifier_type}
    LOGGER.info("Training Classifier")
    classifier = processor.train_classifier(FORMATTER, nbigrams, nfeats, meta)

    # Serialize the classifier
    LOGGER.info("Serializing classifier")
    if not os.path.exists(destination):
        os.makedirs(destination)
    classifier.serialize(destination)
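Both training scripts freeze the tokenizer into the Phrase constructor with functools.partial so each call site only supplies the text. A minimal illustration of the pattern, with make_phrase as a hypothetical stand-in for phrase.Phrase:

import functools

def make_phrase(text, tokenizer):
    # Stand-in for phrase.Phrase(text, tokenizer=...).
    return tokenizer(text)

split_phrase = functools.partial(make_phrase, tokenizer=str.split)
print(split_phrase("mood of the world"))  # ['mood', 'of', 'the', 'world']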
Example 12
    def _build_prob_dist(self, fd, cfd):
        LOGGER.info("Building frequency distribution")
        for word, sentiment in self.phrases_it.iterate_formatted_words(self.formatter):
            fd.inc(word)
            cfd[sentiment].inc(word)
        return fd, cfd
Example 13
    def get_bigram_analyzer(self, n, words):
        LOGGER.info("Building Bigram Analyzer")
        bigram_measures = collocations.BigramAssocMeasures()
        finder = collocations.BigramCollocationFinder.from_words(words)
        return BigramAnalyzer(finder.above_score(bigram_measures.likelihood_ratio, n))