Beispiel #1
0
def add2grams(wordDict=None, write=True):
    """Record the neighbouring words of every known word in every book.

    For each word of each book yielded by ``getBooks()`` that is a key of
    ``wordDict``, the words one and two positions before and after it are
    registered on its entry through ``addWordBefore`` / ``addWordAfter``.
    ``second=True`` is passed for the neighbour two positions away and
    ``second=False`` for the adjacent one.

    Arguments:
    wordDict -- mapping from word to entry object; when None it is loaded
                from "./populatedDict.p" with ``files.readPickle``.
    write    -- when true, the updated mapping is pickled to "2gramDict.p".

    Returns the same ``wordDict`` object, mutated in place.
    """
    if wordDict is None:
        wordDict = files.readPickle("./populatedDict.p")
    for text in getBooks():
        name = os.path.basename(text)
        print("reading book: %s" % name)
        book = files.readFile(text)
        print("generating the list of words")
        words = generateWordList(book)
        count = 0
        # Start two words in and stop two words early so that every index
        # from i - 2 to i + 2 is valid; the first and last two words of a
        # book are therefore only ever used as neighbours.
        for i in xrange(2, len(words) - 2):
            # Count every word visited (not only the known ones) so the
            # progress report reflects how far into the book we are.
            count += 1
            if count % 10000 == 0:
                print("%d words read" % count)
            word = words[i]
            if word not in wordDict:
                continue
            entry = wordDict[word]
            entry.addWordAfter(words[i + 2], second=True)
            entry.addWordAfter(words[i + 1], second=False)
            entry.addWordBefore(words[i - 2], second=True)
            entry.addWordBefore(words[i - 1], second=False)
    if write:
        files.writePickle("2gramDict.p", wordDict)
    return wordDict
Beispiel #2
0
def populatePossesives(wordDict, force=False, ratio=10, debug=False):
    """Give possessive forms ("man's") a count derived from their base word.

    The n-gram list contains few words in possessive form, so every key of
    ``wordDict`` ending in "'s" is artificially assigned the count of the
    word without that suffix, divided by ``ratio``.

    Arguments:
    wordDict -- mapping from word to an entry object with a ``count``
                attribute; entries are modified in place.
    force    -- when true, overwrite the count of every possessive; when
                false, only possessives whose count is 0 are updated.
    ratio    -- divisor applied to the base word's count.
    debug    -- when true, print progress every 5000 words and report the
                possessives whose base word is missing from ``wordDict``.

    Returns the same ``wordDict`` object.
    """
    print("populating possesives")
    size = 5000
    for count, word in enumerate(wordDict, 1):
        if debug and count % size == 0:
            print("%d words done" % count)
        if not word.endswith("'s"):
            continue
        entry = wordDict[word]
        if not (force or entry.count == 0):
            continue
        # Only the missing-base-word case is an expected failure; any other
        # error (for example ratio == 0) is a real problem and propagates.
        try:
            base = wordDict[word[:-2]]
        except KeyError:
            if debug:
                print("error at %s" % word)
            continue
        entry.count = base.count / ratio
    return wordDict


if __name__ == "__main__":
    # Manual smoke check: load the pickled 2-gram dictionary and dump the
    # entry of one possessive followed by a handful of sample words.
    wordDict = files.readPickle("./2gramDict.p")
    samples = (
        "argument's",
        "fun", "help", "what", "the", "equality", "parity", "argument",
        "aardvark", "Alps", "elk",
    )
    for sample in samples:
        print(readWord(wordDict[sample]))