Beispiel #1
0
    def __init__(self, length, dbfile="ngrams.db"):
        """
        Sets up a new nGramPrinter.

        Instantiates the session factory that db.get_session returns for
        [dbfile] (default "ngrams.db"), remembers [length], and stores the
        result of find_nGrams_by_length([length]) as self.ngrams.
        """
        # get_session hands back a factory; calling it yields the session
        self.session = db.get_session(dbfile)()
        self.length = length
        # the lookup runs after the session exists
        # NOTE(review): presumably find_nGrams_by_length queries through
        # self.session -- confirm against its definition
        self.ngrams = self.find_nGrams_by_length(length)
Beispiel #2
0
    def __init__(self, filename, ngram_count):
        """
        Populates the dictionary of word counts that make up the reader
        with counts based on [ngram_count] previous words when given a
        [filename] file that acts as a corpus.

        Words are lower-cased and separated on any run of whitespace, so
        blank lines and repeated spaces never produce empty "words".

        Attributes set:
            total_words: number of words that were counted as the follower
                of a full [ngram_count]-word prefix (the words that only
                fill the very first prefix are not counted).
            word_table: maps each space-joined prefix to a dict of
                {following word: number of times it followed that prefix}.
            ngram_size: the [ngram_count] that was passed in.
            Session: whatever db.get_session() returns.
        """

        word_table = {}
        totalwords = 0
        # deque automatically evicts the first entry when over maxlen
        tempwords = collections.deque(maxlen=ngram_count)

        with open(filename) as f:
            for line in f:
                # split() with no argument splits on any whitespace run and
                # yields nothing for a blank line; split(" ") would hand
                # back empty-string tokens in those cases
                for word in line.split():
                    word = word.lower()
                    # still filling the first prefix window of the corpus
                    if len(tempwords) != ngram_count:
                        tempwords.append(word)
                        continue
                    # otherwise we use the previous words to generate the next
                    tempword = " ".join(tempwords)
                    # add counts to the word table
                    followers = word_table.setdefault(tempword, {})
                    followers[word] = followers.get(word, 0) + 1
                    # add the current word to the previous words and evict one
                    tempwords.append(word)
                    # add to the total count of words
                    totalwords += 1

        self.total_words = totalwords
        self.word_table = word_table
        self.ngram_size = ngram_count
        self.Session = db.get_session()