def __init__(self, n, word_fd, otherngram_fd, wildcard_fd, ngram_fd):
     """Set up the finder from precomputed frequency distributions.

     ``word_fd`` and ``ngram_fd`` are forwarded to
     ``AbstractCollocationFinder.__init__``; the remaining arguments are
     stored unchanged on the instance.

     :param n: stored as ``self.n``; presumably the n-gram order
         (TODO confirm against callers)
     :param word_fd: frequency distribution of single words
     :param otherngram_fd: stored as ``self.otherngram_fd``; presumably
         counts for n-grams of other orders (TODO confirm)
     :param wildcard_fd: stored as ``self.wildcard_fd``; presumably counts
         for n-grams containing wildcard positions (TODO confirm)
     :param ngram_fd: frequency distribution of the n-grams themselves
     """
     # The base initialiser receives only the word and n-gram distributions.
     AbstractCollocationFinder.__init__(self, word_fd, ngram_fd)
     self.n = n
     self.otherngram_fd = otherngram_fd
     self.wildcard_fd = wildcard_fd
 def __init__(self, n, word_fd, otherngram_fd, wildcard_fd, ngram_fd):
     """Initialise the finder with already-built frequency distributions.

     The word and n-gram distributions go to the
     ``AbstractCollocationFinder`` initialiser; ``n``, ``otherngram_fd``
     and ``wildcard_fd`` are kept as instance attributes of the same name.

     :param n: presumably the order of the n-grams (TODO confirm)
     :param word_fd: frequency distribution of individual words
     :param otherngram_fd: presumably counts of n-grams of other orders
         (TODO confirm against callers)
     :param wildcard_fd: presumably counts of n-grams with wildcard
         positions (TODO confirm)
     :param ngram_fd: frequency distribution of the n-grams themselves
     """
     AbstractCollocationFinder.__init__(self, word_fd, ngram_fd)
     # Auxiliary inputs are stored as given; nothing is copied or validated.
     self.wildcard_fd, self.otherngram_fd, self.n = (
         wildcard_fd,
         otherngram_fd,
         n,
     )
 def __init__(self, word_fd, bigram_fd, wildcard2_fd, trigram_fd, wildcard3_fd, quadgram_fd):
     """Set up a quadgram collocation finder from frequency distributions.

     ``word_fd`` and ``quadgram_fd`` are forwarded to
     ``AbstractCollocationFinder.__init__``; every other argument is stored
     unchanged as an instance attribute of the same name.

     :param word_fd: frequency distribution of single words
     :param bigram_fd: frequency distribution of bigrams
     :param wildcard2_fd: stored as ``self.wildcard2_fd``; presumably counts
         of word patterns with wildcard gaps (TODO confirm exact pattern)
     :param trigram_fd: frequency distribution of trigrams
     :param wildcard3_fd: stored as ``self.wildcard3_fd``; presumably counts
         of word patterns with wildcard gaps (TODO confirm exact pattern)
     :param quadgram_fd: frequency distribution of quadgrams
     """
     # The base initialiser receives only the word and quadgram distributions.
     AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)
     self.bigram_fd = bigram_fd
     self.wildcard2_fd = wildcard2_fd
     self.trigram_fd = trigram_fd
     self.wildcard3_fd = wildcard3_fd
 def __init__(self, word_fd, bigram_fd, wildcard2_fd, trigram_fd, wildcard3_fd, quadgram_fd):
     """Initialise a quadgram collocation finder with prebuilt counts.

     The word and quadgram distributions go to the
     ``AbstractCollocationFinder`` initialiser; the bigram, trigram and both
     wildcard distributions are kept as same-named instance attributes.

     :param word_fd: frequency distribution of individual words
     :param bigram_fd: frequency distribution of bigrams
     :param wildcard2_fd: presumably counts of word patterns with wildcard
         gaps (TODO confirm exact pattern against callers)
     :param trigram_fd: frequency distribution of trigrams
     :param wildcard3_fd: presumably counts of word patterns with wildcard
         gaps (TODO confirm exact pattern against callers)
     :param quadgram_fd: frequency distribution of quadgrams
     """
     AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd)
     # Lower-order distributions are stored as given, without copying.
     (self.wildcard3_fd,
      self.wildcard2_fd,
      self.trigram_fd,
      self.bigram_fd) = (wildcard3_fd, wildcard2_fd, trigram_fd, bigram_fd)
Example #5
0
def collocations(index, cutoff=2):
    """
    Extract collocations from an n-gram index.

    Two-word keys of ``index`` whose count is strictly greater than
    ``cutoff`` are collected as bigram candidates, filtered through an
    ``AbstractCollocationFinder``, and then passed together with the full
    index to ``_generate_possible_ngrams`` and ``_update_ngram_counts``.

    :param index: mapping from whitespace-separated n-gram strings to counts
    :type index: dict
    :param cutoff: a bigram is kept only if its count is greater than this
    :return: whatever ``_update_ngram_counts`` returns for the surviving
        candidates (the original docstring declares a list -- TODO confirm)
    :rtype: list
    """

    def filter_punkt(word):
        # Truthy when the word matches the punctuation pattern.
        return _PUNKT_RE.match(word)

    def filter_len(word):
        # Drop words shorter than three characters unless fully upper-case.
        return len(word) < 3 and not word.isupper()

    # Keep only bigrams with frequency > cutoff.  ``items()`` is used instead
    # of the Python-2-only ``iteritems()`` so this runs on Python 2 and 3;
    # each key is split once instead of twice.
    bigram_index = {}
    for key, count in index.items():
        words = tuple(key.split())
        if len(words) == 2 and count > cutoff:
            bigram_index[words] = count

    # Use the abstract finder directly because the counts already exist;
    # no word frequency distribution is supplied.
    finder = AbstractCollocationFinder(None, bigram_index)
    # remove collocations made of two equal words
    finder.apply_ngram_filter(lambda x, y: x == y)
    # remove collocations in which both words match the digit pattern
    finder.apply_ngram_filter(lambda x, y: _DIGIT_RE.match(x) and _DIGIT_RE.match(y))
    # remove punctuation, short words and stopwords
    finder.apply_word_filter(filter_punkt)
    finder.apply_word_filter(filter_len)
    finder.apply_word_filter(lambda w: w in _STOPWORDS)

    # Bigram counts that survived all filters (annotated as a dict originally).
    filtered_collocs = finder.ngram_fd

    # generate possible longer n-grams from the bigrams and refresh counts
    candidates = _generate_possible_ngrams(filtered_collocs, index)
    return _update_ngram_counts(candidates, index)