def __init__(self, n, word_fd, otherngram_fd, wildcard_fd, ngram_fd):
    """Set up an n-gram collocation finder from precomputed FreqDists.

    ``word_fd`` and ``ngram_fd`` are forwarded to
    ``AbstractCollocationFinder.__init__``; ``n``, ``otherngram_fd`` and
    ``wildcard_fd`` are stored on the instance unchanged.

    NOTE(review): presumably ``otherngram_fd`` counts the lower-order
    n-grams and ``wildcard_fd`` counts words separated by arbitrary
    intervening words -- confirm against the code that builds them.
    """
    # The base class keeps the word and top-level n-gram distributions.
    AbstractCollocationFinder.__init__(self, word_fd, ngram_fd)

    # Everything specific to this finder is kept as plain attributes.
    self.n = n
    self.otherngram_fd = otherngram_fd
    self.wildcard_fd = wildcard_fd
def __init__(self, n, word_fd, otherngram_fd, wildcard_fd, ngram_fd):
    """Set up an n-gram collocation finder from precomputed FreqDists.

    ``word_fd`` and ``ngram_fd`` are forwarded to
    ``AbstractCollocationFinder.__init__``; ``n``, ``otherngram_fd`` and
    ``wildcard_fd`` are stored on the instance unchanged.

    NOTE(review): presumably ``otherngram_fd`` counts the lower-order
    n-grams and ``wildcard_fd`` counts words separated by arbitrary
    intervening words -- confirm against the code that builds them.
    """
    # The base class keeps the word and top-level n-gram distributions.
    AbstractCollocationFinder.__init__(self, word_fd, ngram_fd)

    # Everything specific to this finder is kept as plain attributes.
    self.n = n
    self.otherngram_fd = otherngram_fd
    self.wildcard_fd = wildcard_fd
def __init__(self, word_fd, bigram_fd, wildcard2_fd, trigram_fd,
             wildcard3_fd, quadgram_fd):
    """Set up a quadgram collocation finder from precomputed FreqDists.

    ``word_fd`` and ``quadgram_fd`` are forwarded to
    ``AbstractCollocationFinder.__init__``; the remaining distributions
    (``bigram_fd``, ``wildcard2_fd``, ``trigram_fd``, ``wildcard3_fd``)
    are stored on the instance unchanged.

    NOTE(review): presumably the ``wildcard*_fd`` arguments count words
    separated by arbitrary intervening words -- confirm against the code
    that builds them.
    """
    # The base class keeps the word and quadgram distributions.
    AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)

    # Lower-order and wildcard counts, kept in signature order.
    self.bigram_fd = bigram_fd
    self.wildcard2_fd = wildcard2_fd
    self.trigram_fd = trigram_fd
    self.wildcard3_fd = wildcard3_fd
def __init__(self, word_fd, bigram_fd, wildcard2_fd, trigram_fd,
             wildcard3_fd, quadgram_fd):
    """Set up a quadgram collocation finder from precomputed FreqDists.

    ``word_fd`` and ``quadgram_fd`` are forwarded to
    ``AbstractCollocationFinder.__init__``; the remaining distributions
    (``bigram_fd``, ``wildcard2_fd``, ``trigram_fd``, ``wildcard3_fd``)
    are stored on the instance unchanged.

    NOTE(review): presumably the ``wildcard*_fd`` arguments count words
    separated by arbitrary intervening words -- confirm against the code
    that builds them.
    """
    # The base class keeps the word and quadgram distributions.
    AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)

    # Lower-order and wildcard counts, kept in signature order.
    self.bigram_fd = bigram_fd
    self.wildcard2_fd = wildcard2_fd
    self.trigram_fd = trigram_fd
    self.wildcard3_fd = wildcard3_fd
def collocations(index, cutoff=2):
    """Extract collocations from an n-gram index.

    Bigram keys of ``index`` whose count is strictly greater than ``cutoff``
    are loaded into an ``AbstractCollocationFinder`` and filtered; the
    surviving bigrams are then expanded by ``_generate_possible_ngrams`` and
    re-counted by ``_update_ngram_counts``.

    :param index: mapping from whitespace-separated n-gram strings to counts
    :type index: dict
    :param cutoff: a bigram is kept only when its count is ``> cutoff``
    :type cutoff: int
    :return: whatever ``_update_ngram_counts`` returns for the expanded
        n-grams
    :rtype: list  (NOTE(review): as originally declared -- confirm against
        ``_update_ngram_counts``)
    """

    def filter_punkt(word):
        # Truthy (a match object) when _PUNKT_RE matches at the word start.
        return _PUNKT_RE.match(word)

    def filter_len(word):
        # Drop words shorter than three characters unless all upper-case.
        return len(word) < 3 and not word.isupper()

    # Keep only bigrams above the frequency cutoff.  ``items()`` works on
    # both Python 2 and 3 (``iteritems`` is Python 2 only), and each key is
    # split exactly once.
    bigram_index = {}
    for key, count in index.items():
        words = key.split()
        if len(words) == 2 and count > cutoff:
            bigram_index[tuple(words)] = count

    # Use the abstract finder directly because the index is already built;
    # no word frequency distribution is supplied.
    finder = AbstractCollocationFinder(None, bigram_index)

    # Remove collocations made of two equal words.
    finder.apply_ngram_filter(lambda x, y: x == y)
    # Remove collocations where both words match _DIGIT_RE.
    finder.apply_ngram_filter(
        lambda x, y: _DIGIT_RE.match(x) and _DIGIT_RE.match(y))

    # Remove punctuation, too-short words and stopwords.
    finder.apply_word_filter(filter_punkt)
    finder.apply_word_filter(filter_len)
    finder.apply_word_filter(lambda w: w in _STOPWORDS)

    # Filtered bigram counts (documented as a dict in the original code).
    filtered_collocs = finder.ngram_fd

    # Generate the possible longer n-grams and refresh their counts.
    candidates = _generate_possible_ngrams(filtered_collocs, index)
    return _update_ngram_counts(candidates, index)