def getAllParses(compendium, tfidfVals, threshold=1.0):
    """Collect standardized three-part parses for every sentence in compendium.

    compendium is indexed as compendium[topic][subtopic] and each such entry
    is iterated as a sequence of paragraphs. Every paragraph is split into
    sentences with nltk.tokenize.sent_tokenize, and every non-empty parse
    yielded by getSentParses(sentence) contributes one entry to the result.

    For each parse, the leaves of its first three elements are passed through
    stripPunct and each word through CW.standardizeWords. For the second and
    third parts only (index > 0), when CW.getAvgTfIdf(topic, tfidfVals, words)
    is below threshold, the standardized words of the topic key and of the
    subtopic key are appended to that part.

    Args:
        compendium: nested mapping topic -> subtopic -> iterable of paragraph
            strings (topic and subtopic keys are whitespace-split strings).
        tfidfVals: passed through unchanged to CW.getAvgTfIdf.
        threshold: average tf-idf value below which a part is extended with
            the topic and subtopic words.

    Returns:
        A new list with one entry per non-empty parse; each entry is a list
        of three word lists. Empty when compendium is empty.

    Progress and every processed parse are also reported on stdout (via
    print and printParse).
    """
    outputParses = []
    # Loop-invariant; previously recomputed on every topic iteration.
    totalTopics = len(compendium)

    for num, (key, subtopics) in enumerate(compendium.items()):
        # Single-argument print(...) emits the same text as the former
        # Python 2 print statement and is also valid Python 3 syntax.
        print("Currently working on topic %d / %d" % (num + 1, totalTopics))
        topic = [CW.standardizeWords(word) for word in key.split()]

        for subkey, paragraphs in subtopics.items():
            subtopic = [CW.standardizeWords(word) for word in subkey.split()]

            for paragraph in paragraphs:
                for sentence in nltk.tokenize.sent_tokenize(paragraph):
                    print("Current sentence: %s" % sentence)

                    for parse in getSentParses(sentence):
                        if len(parse) == 0:
                            continue

                        # Only the first three elements of a parse are used;
                        # presumably a (subject, verb, object)-style triple --
                        # TODO confirm against getSentParses.
                        strippedParts = [stripPunct(parse[i].leaves())
                                         for i in range(3)]

                        rawParse = []
                        for i, part in enumerate(strippedParts):
                            words = [CW.standardizeWords(word)
                                     for word in part]
                            # The first part (i == 0) is never extended.
                            # NOTE(review): the original formatting was lost;
                            # both the topic and the subtopic extension are
                            # assumed to be conditional on the tf-idf test --
                            # confirm.
                            if i > 0 and CW.getAvgTfIdf(
                                    key, tfidfVals, words) < threshold:
                                words += topic
                                words += subtopic
                            rawParse.append(words)

                        printParse(rawParse)
                        outputParses.append(rawParse)

    return outputParses