class NGramCounter(object):
    """
    Counts the n-grams found in a corpus.

    Iterating over an instance yields every n-gram in corpus order;
    ``count`` feeds those n-grams into the internal ``frequency`` object.
    """

    def __init__(self, corpus, N=1):
        """
        Store the corpus and the n-gram size, and attach a new Frequency.

        :param corpus: iterable of readers; each reader must provide a
                       ``words()`` method that returns an iterable of
                       strings.
        :param N: number of words per n-gram (1 means unigrams).
        :raises ValueError: if ``N`` is smaller than 1.
        """
        # Validate before doing anything else: N == 0 would make __iter__
        # yield an empty tuple for every word, and a negative N would
        # silently yield nothing at all.
        if N < 1:
            raise ValueError("N must be at least 1, got %r" % (N,))

        self.corpus = corpus
        self.frequency = Frequency()
        self.N = N

    def words(self):
        """
        A generator that goes through all the words of every reader in
        the corpus, strips surrounding whitespace, drops words that are
        empty after stripping, and yields the rest in lowercase.

        Punctuation and stopwords are not removed here.
        """
        for reader in self.corpus:
            for word in reader.words():
                word = word.strip()
                if word:
                    yield word.lower()

    def __iter__(self):
        """
        Yield the n-grams to be saved in the frequency counts.

        For ``N == 1`` each item is the bare lowercase word (a string);
        for larger ``N`` each item is a tuple of ``N`` consecutive words.
        The window is never reset between readers, so an n-gram can span
        the boundary between two readers.
        """
        if self.N == 1:
            # Special case for unigrams: yield plain strings, not 1-tuples.
            for word in self.words():
                yield word
        else:
            ngram = []
            for word in self.words():
                if len(ngram) < self.N:
                    ngram.append(word)

                if len(ngram) == self.N:
                    yield tuple(ngram)
                    # Slide the window: drop the oldest word so that the
                    # next word completes the following n-gram.
                    ngram = ngram[1:]

    def count(self):
        """
        Return ``self.frequency``, filling it first if it is still falsy.

        Every n-gram produced by iterating over this counter is passed to
        ``self.frequency.increment``. The corpus is only walked while the
        frequency object evaluates as false.

        NOTE(review): this presumably relies on Frequency being falsy
        while it is empty -- confirm against the Frequency class.
        """
        if not self.frequency:
            for ngram in self:
                self.frequency.increment(ngram)
        return self.frequency
def __init__(self, corpus, N=1):
    """
    Initialise a counter: keep the corpus and the n-gram size on the
    instance and attach a newly created Frequency object.

    :param corpus: stored as-is on ``self.corpus``.
    :param N: stored as-is on ``self.N``; defaults to 1.
    """
    # NOTE(review): this definition has the same signature and body as
    # NGramCounter.__init__ -- confirm whether it is a stray duplicate.
    self.corpus = corpus
    self.frequency = Frequency()
    self.N = N