def add2grams(wordDict=None, write=True):
    """Record the neighbours up to two positions away for every known word.

    For each book path yielded by ``getBooks()`` the text is read, turned
    into a word list by ``generateWordList`` and scanned.  Every word that
    is already a key of ``wordDict`` has its two following and its two
    preceding words registered through ``addWordAfter`` / ``addWordBefore``;
    ``second=True`` marks the neighbour that is two positions away.

    The first two and the last two words of each book are never used as the
    centre word, so all four neighbours always exist.

    Args:
        wordDict: mapping from word to an entry object exposing
            ``addWordAfter`` and ``addWordBefore``.  When ``None`` it is
            loaded from ``./populatedDict.p``.
        write: when true, the updated mapping is pickled to ``2gramDict.p``
            once every book has been processed.

    Returns:
        The same ``wordDict`` object, updated in place.
    """
    if wordDict is None:
        wordDict = files.readPickle("./populatedDict.p")

    for text in getBooks():
        name = os.path.basename(text)
        print("reading book: %s" % name)
        book = files.readFile(text)

        print("generating the list of words")
        words = generateWordList(book)

        count = 0
        # NOTE: xrange keeps the index sequence lazy; this file targets
        # Python 2 (see the print statements elsewhere in the module).
        for i in xrange(2, len(words) - 2):
            # Count every scanned position, including words that are not in
            # the dictionary, so the progress report reflects words read.
            count += 1
            if count % 10000 == 0:
                print("%d words read" % count)

            word = words[i]
            if word not in wordDict:
                continue

            entry = wordDict[word]
            entry.addWordAfter(words[i + 2], second=True)
            entry.addWordAfter(words[i + 1], second=False)
            entry.addWordBefore(words[i - 2], second=True)
            entry.addWordBefore(words[i - 1], second=False)

    if write:
        files.writePickle("2gramDict.p", wordDict)
    return wordDict
def populatePossesives(wordDict, force=False, ratio=10, debug=False):
    """Give possessive forms ("man's") a count derived from their base word.

    The n-gram source contains few possessive forms, so each key ending in
    ``'s`` is assigned an artificial count equal to the count of the word
    without the suffix divided by ``ratio``.

    Args:
        wordDict: mapping from word to an entry object with a ``count``
            attribute.  It is updated in place.
        force: when true, overwrite the count of every possessive; otherwise
            only possessives whose count is 0 are touched.
        ratio: divisor applied to the base word's count.
        debug: when true, print progress every 5000 words and report
            possessives whose base word is missing.

    Returns:
        The same ``wordDict`` object.

    Raises:
        ZeroDivisionError: if ``ratio`` is 0 and a possessive with a known
            base word is processed.
    """
    print("populating possesives")
    count = 0
    size = 5000  # progress is reported every `size` words when debugging

    for word in wordDict:
        count += 1
        if debug and count % size == 0:
            print("%d words done" % count)

        if not word.endswith("'s"):
            continue
        entry = wordDict[word]
        if not (force or entry.count == 0):
            continue

        # The word is a possessive (e.g. "man's"); look up its base form.
        # Only a missing base word is expected here, so only KeyError is
        # handled -- any other failure is a real bug and must surface.
        try:
            base = wordDict[word[:-2]]
        except KeyError:
            if debug:
                print("error at %s" % word)
            continue

        # `/` is kept as in the original: with integer counts on Python 2
        # this is floor division.
        entry.count = base.count / ratio

    return wordDict


if __name__ == "__main__":
    # Quick manual inspection of a few entries of the pickled 2-gram dict.
    wordDict = files.readPickle("./2gramDict.p")
    print(readWord(wordDict["argument's"]))
    lis = ["fun", "help", "what", "the", "equality",
           "parity", "argument", "aardvark", "Alps", "elk"]
    for word in lis:
        print(readWord(wordDict[word]))