def collocations(index, cutoff=2):
    """
    Extract collocations from n-gram index

    Keys of ``index`` that split on whitespace into exactly two words and
    whose frequency is strictly greater than ``cutoff`` become candidate
    bigrams. The candidates are filtered through the collocation finder and
    then passed, together with the full index, to
    ``_generate_possible_ngrams`` and ``_update_ngram_counts``.

    :param index: mapping of whitespace-separated n-gram strings to their
        frequencies
    :type index: dict
    :param cutoff: frequency threshold (default 2); a bigram is kept only
        if its frequency is strictly greater than this value
    :return: whatever ``_update_ngram_counts`` returns for the generated
        n-grams
    :rtype: list
    """
    # NOTE(review): ``:rtype: list`` is carried over from the original
    # docstring; the actual type is decided by ``_update_ngram_counts``,
    # which is defined elsewhere -- confirm.

    def filter_punkt(word):
        # Truthy (a match object) when the word matches _PUNKT_RE.
        return _PUNKT_RE.match(word)

    def filter_len(word):
        # Flags words shorter than 3 characters unless they are upper-case.
        return len(word) < 3 and not word.isupper()

    # Keep only bigrams with frequency > cutoff. Iterating over the keys
    # (rather than the Python 2-only ``iteritems``) works on Python 2 and 3
    # alike without copying the index, and each key is split only once.
    bigram_index = {}
    for ngram in index:
        words = tuple(ngram.split())
        if len(words) != 2:
            continue
        frequency = index[ngram]
        if frequency > cutoff:
            bigram_index[words] = frequency

    # Get abstract finder because we already have index
    # NOTE(review): presumably nltk.collocations.AbstractCollocationFinder,
    # whose filters drop a candidate when the callback returns a truthy
    # value -- confirm against the file's imports.
    finder = AbstractCollocationFinder(None, bigram_index)
    # remove collocations made of 2 equal words
    finder.apply_ngram_filter(lambda x, y: x == y)
    # remove weird collocations: both words match _DIGIT_RE
    finder.apply_ngram_filter(
        lambda x, y: _DIGIT_RE.match(x) and _DIGIT_RE.match(y))
    # remove punctuation, short words and stopwords
    finder.apply_word_filter(filter_punkt)
    finder.apply_word_filter(filter_len)
    finder.apply_word_filter(lambda w: w in _STOPWORDS)

    # Bigrams that survived every filter (the original hinted this as a dict).
    candidates = finder.ngram_fd

    # generate possible n-grams
    return _update_ngram_counts(
        _generate_possible_ngrams(candidates, index), index)