コード例 #1
0
class NGramCounter(object):
    """
    Counts the n-grams found in a corpus.

    The corpus is iterated as a collection of reader objects, each of which
    must provide a ``words()`` method. Counts are accumulated in a
    ``Frequency`` instance (created in the constructor) through its
    ``increment()`` method.
    """
    def __init__(self, corpus, N=1):
        """
        Remember the corpus and the n-gram size ``N`` (1 by default), and
        create the ``Frequency`` object that ``count()`` fills in.
        """
        self.N = N
        self.corpus = corpus
        self.frequency = Frequency()

    def words(self):
        """
        Generate every word of every reader in the corpus, with surrounding
        whitespace stripped and the result lowercased. Words that are empty
        after stripping are skipped.
        """
        for reader in self.corpus:
            for raw in reader.words():
                stripped = raw.strip()
                if not stripped:
                    continue
                yield stripped.lower()

    def __iter__(self):
        """
        Generate the n-grams of the corpus in order.

        When ``N`` is 1 the plain word strings are produced. Otherwise each
        item is a tuple of ``N`` consecutive words taken from a sliding
        window over the flattened word stream (the window is not reset
        between readers).
        """
        if self.N == 1:
            # Unigrams are emitted as bare strings, not 1-tuples.
            for token in self.words():
                yield token
            return

        window = []
        for token in self.words():
            if len(window) < self.N:
                window.append(token)
            if len(window) != self.N:
                # Window not at the requested size; nothing to emit yet.
                continue
            yield tuple(window)
            # Drop the oldest word so the next one completes a new n-gram.
            window = window[1:]

    def count(self):
        """
        Return the frequency object, filling it from the corpus first when
        it is still falsy (i.e. nothing appears to have been counted yet).
        """
        if self.frequency:
            return self.frequency
        for gram in self:
            self.frequency.increment(gram)
        return self.frequency
コード例 #2
0
ファイル: ngram.py プロジェクト: bbengfort/ngram-light
class NGramCounter(object):
    """
    Walks a corpus and tallies its n-grams in an internal ``Frequency``.

    ``corpus`` is iterated to obtain readers; each reader supplies its words
    through a ``words()`` method. Iterating the counter itself yields the
    n-grams, and ``count()`` records them via ``Frequency.increment()``.
    """

    def __init__(self, corpus, N=1):
        """
        Keep references to the corpus and the n-gram size ``N`` (default 1)
        and start with a newly constructed ``Frequency``.
        """
        self.corpus, self.N = corpus, N
        self.frequency = Frequency()

    def words(self):
        """
        Lazily produce the corpus words: each one is stripped of leading and
        trailing whitespace, dropped if nothing remains, and lowercased.
        """
        for source in self.corpus:
            trimmed = (item.strip() for item in source.words())
            for term in trimmed:
                if term:
                    yield term.lower()

    def __iter__(self):
        """
        Produce n-grams from the word stream.

        For ``N`` other than 1, tuples of ``N`` consecutive words are
        yielded by sliding a buffer over the words one position at a time.
        For ``N`` equal to 1, the words themselves are yielded as strings.
        """
        if self.N != 1:
            buf = []
            for term in self.words():
                # Grow the buffer until it holds N words.
                if len(buf) < self.N:
                    buf.append(term)
                if len(buf) == self.N:
                    yield tuple(buf)
                    # Slide forward by discarding the first word.
                    buf = buf[1:]
        else:
            # Unigram case: no tuple wrapping.
            for term in self.words():
                yield term

    def count(self):
        """
        Tally every n-gram into the frequency object unless it is already
        truthy, then return the frequency object.
        """
        tally = self.frequency
        if not tally:
            for item in self:
                tally.increment(item)
        return tally
コード例 #3
0
 def __init__(self, corpus, N=1):
     """
     Store the given corpus and ``N`` (default 1) on the instance and
     attach a newly created ``Frequency`` object as ``self.frequency``.
     """
     # NOTE(review): N is presumably the n-gram size -- confirm against
     # the rest of the class, which is not visible here.
     self.N = N
     self.corpus = corpus
     self.frequency = Frequency()
コード例 #4
0
ファイル: ngram.py プロジェクト: bbengfort/ngram-light
 def __init__(self, corpus, N=1):
     """
     Initialise the instance with the supplied corpus, the value of ``N``
     (1 unless overridden), and an empty-constructed ``Frequency``.
     """
     # NOTE(review): N looks like the n-gram order given the project
     # name -- verify with the methods that read it.
     self.corpus, self.N = corpus, N
     self.frequency = Frequency()