def __init__(self, length, dbfile="ngrams.db"):
    """
    Set up a new nGramPrinter for n-grams of the given length.

    length: the n-gram length; stored on the instance and passed to
        find_nGrams_by_length.
    dbfile: handed to db.get_session; defaults to "ngrams.db".
    """
    # db.get_session returns a callable; calling it produces the session
    # object kept on this instance.
    session_factory = db.get_session(dbfile)
    self.session = session_factory()
    self.length = length
    # Looked up last, once the session and length are already in place.
    self.ngrams = self.find_nGrams_by_length(length)
def __init__(self, filename, ngram_count):
    """
    Populate the reader's table of word counts from a corpus file.

    Each run of [ngram_count] consecutive words in [filename] (lower-cased
    and joined with single spaces) becomes a key of the word table. The
    value under that key is a dict mapping every word seen directly after
    the run to the number of times it followed it.

    filename: path of the text file that acts as the corpus.
    ngram_count: how many previous words make up each key.

    Attributes set:
        word_table: dict of {previous words: {next word: count}}.
        total_words: number of transitions recorded. The first
            [ngram_count] words only prime the window and are not counted.
        ngram_size: the [ngram_count] that was passed in.
        Session: whatever db.get_session() returns.
    """
    word_table = {}
    total_words = 0
    # deque automatically evicts the oldest entry when over maxlen
    window = collections.deque(maxlen=ngram_count)

    with open(filename) as corpus:
        for line in corpus:
            # split() with no argument collapses runs of whitespace, so
            # blank lines, repeated spaces and tabs never produce an
            # empty-string "word" in the table.
            for token in line.split():
                word = token.lower()

                # still priming: not enough previous words collected yet
                if len(window) != ngram_count:
                    window.append(word)
                    continue

                # count this word as a follower of the previous words
                followers = word_table.setdefault(" ".join(window), {})
                followers[word] = followers.get(word, 0) + 1

                # slide the window forward by one word
                window.append(word)
                total_words += 1

    self.total_words = total_words
    self.word_table = word_table
    self.ngram_size = ngram_count
    self.Session = db.get_session()