コード例 #1
0
ファイル: phraseParser.py プロジェクト: Jnanayogi33/ScienceQA
def getAllParses(compendium, tfidfVals, threshold = 1.0):
    """Collect standardized three-part parses for every sentence in compendium.

    compendium is walked as a two-level mapping: topic string -> subtopic
    string -> iterable of paragraph strings.  Each paragraph is split into
    sentences with nltk, and every sentence is handed to getSentParses.

    For each non-empty parse, elements 0, 1 and 2 are reduced to their
    leaves, stripped of punctuation and run word-by-word through
    CW.standardizeWords.  Parts 1 and 2 (never part 0) whose
    CW.getAvgTfIdf score is below threshold get the standardized topic and
    subtopic words appended.

    Args:
        compendium: nested mapping as described above.
        tfidfVals: passed through unchanged to CW.getAvgTfIdf.
        threshold: score below which parts 1 and 2 are extended with the
            topic and subtopic words.

    Returns:
        A list of parses, each a list of three word lists.

    Progress and every processed sentence are printed to stdout, and each
    finished parse is passed to printParse.
    """
    outputParses = []
    totalTopics = len(compendium)  # loop-invariant, computed once

    for num, (key, subsections) in enumerate(compendium.items()):
        # Single pre-formatted argument: prints the same text whether print
        # is a statement (Python 2) or a function (Python 3).
        print("Currently working on topic %d / %d" % (num + 1, totalTopics))
        topic = [CW.standardizeWords(word) for word in key.split()]

        for subkey, paragraphs in subsections.items():
            subtopic = [CW.standardizeWords(word) for word in subkey.split()]

            for paragraph in paragraphs:
                for sentence in nltk.tokenize.sent_tokenize(paragraph):
                    print("Current sentence: %s" % (sentence,))

                    for parse in getSentParses(sentence):
                        if len(parse) == 0:
                            continue

                        # NOTE(review): assumes every non-empty parse has at
                        # least three elements exposing .leaves() -- confirm
                        # against getSentParses.
                        stripped = [stripPunct(parse[i].leaves()) for i in range(3)]

                        rawParse = []
                        for position, part in enumerate(stripped):
                            words = [CW.standardizeWords(word) for word in part]
                            # Only parts after the first are padded with
                            # topic/subtopic context when they score low.
                            if position > 0 and CW.getAvgTfIdf(key, tfidfVals, words) < threshold:
                                words.extend(topic)
                                words.extend(subtopic)
                            rawParse.append(words)

                        printParse(rawParse)
                        outputParses.append(rawParse)

    return outputParses