Example #1
        def global_collocations_rejoin():
            # note: cls, cluster_id, all_collocs, json, nlp and TestCollocations
            # are resolved in the enclosing method/module scope
            print('Global re-population...')
            # add existing collocations if they do not exist yet
            for article in cls.objects.filter(cluster_id=cluster_id):
                index = json.loads(article.index)
                for colloc in all_collocs.intersection(index.keys()):
                    # get_or_create because we are not filtering out old ones
                    TestCollocations.objects.get_or_create(ngram=colloc,
                                                           article=article,
                                                           defaults={'count': index[colloc]})
            # the step above can leave counts inconsistent, so recompute them
            print('Starting updates...')
            from axel.libs.utils import print_progress
            from axel.libs.nlp import _update_ngram_counts

            for article in print_progress(cls.objects.filter(cluster_id=cluster_id)):
                ngrams = sorted(article.testcollocations_set.values_list('ngram', 'count'),
                                key=lambda x: (x[1], x[0]))
                if not ngrams:
                    continue
                index = json.loads(article.index)
                # re-derive candidate n-grams from the current collocation strings
                new_ngrams = nlp._generate_possible_ngrams(
                    [tuple(c.split()) for c, _ in ngrams], index)
                new_ngrams = _update_ngram_counts(new_ngrams, index)
                new_ngrams = sorted(new_ngrams.items(), key=lambda x: (x[1], x[0]))
                # drop zero counts
                new_ngrams = [k for k in new_ngrams if k[1] > 0]
                if new_ngrams != ngrams:
                    obsolete_ngrams = set(ngrams).difference(new_ngrams)
                    article.testcollocations_set.filter(
                        ngram__in=[ngram for ngram, _ in obsolete_ngrams]).delete()
                    for ngram, score in set(new_ngrams).difference(ngrams):
                        TestCollocations.objects.create(ngram=ngram, count=score, article=article)
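
For context, `global_collocations_rejoin` is a nested helper: `cls`, `cluster_id`, `all_collocs` and the `TestCollocations` model are free variables supplied by the enclosing scope. A minimal sketch of what such an enclosing classmethod could look like, with a hypothetical name (`rejoin_for_cluster`) and `all_collocs` gathered from the existing table; this wrapper is an assumption, not part of the original snippet:

    @classmethod
    def rejoin_for_cluster(cls, cluster_id):
        # hypothetical wrapper: collect every known collocation string once,
        # then let the nested helper re-attach and re-count them per article
        all_collocs = set(TestCollocations.objects.values_list('ngram', flat=True))

        def global_collocations_rejoin():
            ...  # body as in Example #1 above

        global_collocations_rejoin()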
Example #2
    def collocations(text, index, measures):
        """
        Extract collocations from an n-gram index.
        :param text: raw text, used to build the word frequency distribution
        :type index: dict
        :param measures: iterable of (measure name, scoring function) pairs
        :rtype: generator of (measure name, ranked collocations) pairs
        """
        # note: nltk, defaultdict, _PUNKT_RE, nlp and the private nlp helpers
        # (_update_ngram_counts, _generate_possible_ngrams, _STOPWORDS) are
        # resolved at module scope; their imports are not shown in this snippet

        def filter_punkt(word):
            # match pure-punctuation tokens
            return _PUNKT_RE.match(word)

        def filter_len(word):
            # drop short tokens unless they are upper-case acronyms
            return len(word) < 3 and not word.isupper()

        # keep only bigrams with frequency > 2
        bigram_index = {tuple(k.split()): v for k, v in index.items()
                        if len(k.split()) == 2 and v > 2}

        # use the abstract finder directly because we already have an n-gram index
        finder = nltk.collocations.AbstractCollocationFinder(None, bigram_index)
        # remove collocations made of two identical words
        finder.apply_ngram_filter(lambda x, y: x == y)
        # remove collocations where both words are digits
        finder.apply_ngram_filter(lambda x, y: nlp._DIGIT_RE.match(x) and nlp._DIGIT_RE.match(y))
        # remove punctuation, short words and stopwords
        finder.apply_word_filter(filter_punkt)
        finder.apply_word_filter(filter_len)
        finder.apply_word_filter(lambda w: w in nlp._STOPWORDS)

        # build the word frequency distribution from the raw text
        from nltk.probability import FreqDist
        word_fd = FreqDist(text.split())
        finder_big = nltk.collocations.BigramCollocationFinder(word_fd, finder.ngram_fd)

        filtered_collocs = _update_ngram_counts(
            _generate_possible_ngrams(finder.ngram_fd, index), index)
        filtered_collocs = sorted(filtered_collocs.items(), key=lambda col: col[1], reverse=True)
        # do not keep zero scores, so they are excluded from the other rankings as well
        filtered_collocs = [col for col, score in filtered_collocs if score > 0]
        yield 'raw', filtered_collocs

        # map every bigram to the longer collocations that contain it
        corr_dict = defaultdict(list)
        for ngram in filtered_collocs:
            for bigram in nltk.ngrams(ngram.split(), 2):
                corr_dict[bigram].append(ngram)

        for measure_name, measure_func in measures:
            bigrams = finder_big.score_ngrams(measure_func)
            # give each collocation the best score among its constituent bigrams
            scored_ngrams = defaultdict(float)
            for bigram, score in bigrams:
                for ngram in corr_dict[bigram]:
                    if scored_ngrams[ngram] < score:
                        scored_ngrams[ngram] = score
            scored_ngrams = sorted(scored_ngrams.items(), key=lambda col: col[1], reverse=True)
            yield measure_name, [ngram for ngram, _ in scored_ngrams]
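
Because `collocations` is a generator, callers iterate over (measure name, ranked collocations) pairs, starting with the 'raw' frequency ranking. A minimal usage sketch under the same assumptions as Example #1; the particular measure list and the `article.text` attribute are illustrative, not taken from the source:

    import json
    from nltk.metrics import BigramAssocMeasures

    index = json.loads(article.index)
    # hypothetical choice of association measures to rank by
    measures = [('pmi', BigramAssocMeasures.pmi),
                ('chi_sq', BigramAssocMeasures.chi_sq)]
    for measure_name, ranked in collocations(article.text, index, measures):
        print(measure_name, list(ranked)[:10])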