Esempio n. 1
0
def populateWordDict(wordDict=None, write=True):
    """Fill the word dictionary with the counts parsed from Google's input.

    wordDict -- dictionary to populate; when None it is loaded through
                getWordObjectDictionary("dict.p").
    write    -- when true, the populated dictionary is pickled to
                "populatedDict.p" before returning.

    Returns the populated dictionary (the same object that was passed in,
    or the freshly loaded one).
    """
    if wordDict is None:
        wordDict = getWordObjectDictionary("dict.p")

    print("getting the ngrams")
    # all.grams is the file produced by parsing all of Google's input;
    # each of its lines is handed to the helper below.
    gramLines = files.readFile("all.grams").splitlines()
    populateWordDictHelper(gramLines, wordDict)
    populatePossesives(wordDict)
    print("wordDict populated")

    if not write:
        return wordDict
    files.writePickle("populatedDict.p", wordDict)
    return wordDict
Esempio n. 2
0
def add2grams(wordDict=None, write=True):
    """Record, for every known word in every book, its nearby words.

    For each book path yielded by getBooks(), the book is read and turned
    into a word list.  Every word that has at least two neighbours on each
    side and is a key of wordDict gets the words one and two positions
    before and after it registered on its dictionary entry.

    wordDict -- dictionary of word objects to update; when None it is
                loaded from the pickle "./populatedDict.p".
    write    -- when true, the updated dictionary is pickled to
                "2gramDict.p" before returning.

    Returns the updated dictionary.
    """
    if wordDict is None:
        wordDict = files.readPickle("./populatedDict.p")
    for text in getBooks():
        # Only the file name (not the whole path) is shown in the log.
        name = os.path.basename(text)
        print("reading book: %s" % name)
        book = files.readFile(text)
        print("generating the list of words")
        words = generateWordList(book)
        count = 0  # words of this book visited so far
        # Start at 2 and stop 2 short of the end so that i - 2 and i + 2
        # are always valid indices.
        for i in xrange(2, len(words) - 2):
            # The counter used to stay at 0 and the modulus test was
            # inverted, so progress was never reported.  Count every word
            # and report on each multiple of 10000.
            count += 1
            if count % 10000 == 0:
                print("%d words read" % count)
            word = words[i]
            if word not in wordDict:
                continue
            entry = wordDict[word]
            # second=True is passed for the neighbour two positions away,
            # second=False for the adjacent one.
            entry.addWordAfter(words[i + 2], second=True)
            entry.addWordAfter(words[i + 1], second=False)
            entry.addWordBefore(words[i - 2], second=True)
            entry.addWordBefore(words[i - 1], second=False)
    if write:
        files.writePickle("2gramDict.p", wordDict)
    return wordDict