def populateWordDict(wordDict=None, write=True):
    '''Populate the word dictionary with the counts from the n-gram file.

    wordDict -- dictionary to fill; when None it is obtained from
                getWordObjectDictionary("dict.p").
    write    -- when true, the filled dictionary is pickled to
                "populatedDict.p".

    Returns the dictionary that was passed in (or loaded).
    '''
    if wordDict is None:
        wordDict = getWordObjectDictionary("dict.p")

    print("getting the ngrams")
    # "all.grams" is the file produced by parsing all of google's input;
    # it is handed to the helper as a list of lines.
    gramLines = files.readFile("all.grams").splitlines()

    populateWordDictHelper(gramLines, wordDict)
    populatePossesives(wordDict)
    print("wordDict populated")

    if write:
        files.writePickle("populatedDict.p", wordDict)
    return wordDict
def add2grams(wordDict=None, write=True):
    '''Record, for every known word in every book, its neighbouring words.

    For each word position that has two words on either side, the words at
    distance one and two before and after it are registered on that word's
    dictionary entry through addWordBefore / addWordAfter (second=True
    marks the neighbour at distance two).

    wordDict -- dictionary of word objects to update; when None it is
                loaded from the pickle "./populatedDict.p".
    write    -- when true, the updated dictionary is pickled to
                "2gramDict.p" once all books have been processed.

    Returns the updated dictionary.
    '''
    if wordDict is None:
        wordDict = files.readPickle("./populatedDict.p")

    for text in getBooks():
        name = os.path.basename(text)
        print("reading book: %s" % name)
        book = files.readFile(text)
        print("generating the list of words")
        words = generateWordList(book)

        count = 0
        # Start at 2 and stop 2 short of the end so that both neighbours on
        # each side always exist.
        for i in range(2, len(words) - 2):
            # The counter used to stay at 0 and the test was inverted, so
            # progress was never reported; count every position examined
            # and report on each multiple of 10000.
            count += 1
            if count % 10000 == 0:
                print("%d words read" % count)

            word = words[i]
            if word not in wordDict:
                continue

            entry = wordDict[word]
            entry.addWordAfter(words[i + 2], second=True)
            entry.addWordAfter(words[i + 1], second=False)
            entry.addWordBefore(words[i - 2], second=True)
            entry.addWordBefore(words[i - 1], second=False)

    if write:
        files.writePickle("2gramDict.p", wordDict)
    return wordDict