Code example #1
    def CalVector(self, sentencelist):
        debug_print("Standard.CalVector(%s)" % sentencelist, level=5)
        text_words = []
        
        # Gather words from all sentences
        for sentence in sentencelist:
            debug_print("sentence: " + str(sentence), level=6)
            raw = self.ParseKeyword(sentence['KeyS'])
            text = nltk.word_tokenize(raw)
            part_of_speech_tagged_words = nltk.pos_tag(text)
            debug_print("part_of_speech_tagged_words = %s" % str(part_of_speech_tagged_words), level=4)
            stopwords_list = nltk.corpus.stopwords.raw('english').split()
            # OLD: #word.lower() + '/' + tag
            words = list(nltk.corpus.wordnet.morphy(word.lower())
                         for word, tag in part_of_speech_tagged_words
                         # TODO: allow for comparatives and particles (e.g., back/RP)
                         if (tag.startswith('V') or tag.startswith('NN') or tag == 'JJ' or tag == 'DT' or tag == 'RB')
                         and word not in stopwords_list)
            words_proper = list(word for word in words if word)
            if self.use_part_of_speech:
                # Prefix each word with wordnet part-of-speech indicator (e.g., ['fast', 'car'] => ['a:fast', 'n:car'])
                # Note: `words` was filtered by tag/stopword above, so this zip only stays
                # aligned with the tags when no tokens were dropped by that filter.
                words_proper = [wordnet.get_part_of_speech(tag) + ":" + word
                                for (word, (token, tag)) in zip(words, part_of_speech_tagged_words) 
                                if word]
            debug_print("words_proper: " + str(words_proper), level=7)

            # remove empty words and store in SenWords property
            sentence['SenWords'] = words_proper
            text_words += sentence['SenWords']

        # Get frequency distribution
        debug_print("text_words: " + str(text_words), level=6)
        textfdist = nltk.FreqDist(text_words)
        debug_print("Standard.CalVector => %s" % str(textfdist), level=5)
        return textfdist
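
A minimal, self-contained sketch of the per-sentence filtering that CalVector applies, assuming the usual NLTK data packages (tokenizer, POS tagger model, wordnet, stopwords) are installed; ParseKeyword is bypassed here (the raw sentence string is used directly) and the optional part-of-speech prefixing is omitted:

import nltk

def sentence_lemmas(raw):
    # Tokenize and Penn-Treebank tag the sentence
    tagged = nltk.pos_tag(nltk.word_tokenize(raw))
    stopwords_list = nltk.corpus.stopwords.words('english')
    # Keep verbs, nouns, adjectives, and adverbs that are not stopwords,
    # reduce each to its WordNet base form, and drop anything morphy cannot resolve
    lemmas = [nltk.corpus.wordnet.morphy(word.lower())
              for word, tag in tagged
              if (tag.startswith('V') or tag.startswith('NN')
                  or tag == 'JJ' or tag == 'RB')
              and word not in stopwords_list]
    return [w for w in lemmas if w]

sentences = ['Fast cars accelerate quickly.', 'The car was fast.']
fdist = nltk.FreqDist(w for s in sentences for w in sentence_lemmas(s))
print(sorted(fdist.items()))

As in CalVector, the frequency distribution is built over the surviving lemmas of all sentences together.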
Code example #2
File: standard.py    Project: lion416/intemass_old
    def UpdateKBVec(self, sentencelist):
        """
        Update KeyBVec with only proper words
        """
        for sentence in sentencelist:
            print "sentence['KeyBVec'] = ", sentence['KeyBVec']
            keybvec = '.'.join(sentence['KeyBVec'])
            print 'remove_latex(keybvec) = ', remove_latex(keybvec)
            raw = self.ParseKeyword(remove_latex(keybvec))
            print "raw = ", raw
            text = nltk.word_tokenize(raw)
            part_of_speech_tagged_words = nltk.pos_tag(text)
            stopwords_list = nltk.corpus.stopwords.raw('english').split()
            # OLD: #word.lower() + '/' + tag
            words = list(
                nltk.corpus.wordnet.morphy(word.lower())
                for word, tag in part_of_speech_tagged_words
                if (tag.startswith('V') or tag.startswith('NN') or tag == 'JJ'
                    or tag == 'DT' or tag == 'RB')
                and word not in stopwords_list)
            words_proper = list(word for word in words if word)
            if self.use_part_of_speech:
                # Prefix each word with wordnet part-of-speech indicator (e.g., ['fast', 'car'] => ['a:fast', 'n:car'])
                # Note: `words` was filtered by tag/stopword above, so this zip only stays
                # aligned with the tags when no tokens were dropped by that filter.
                words_proper = [
                    wordnet.get_part_of_speech(tag) + ":" + word
                    for (word, (token, tag)) in zip(words, part_of_speech_tagged_words)
                    if word
                ]
            print 'words_proper = ', words_proper
            sentence['KeyBVec'] = words_proper
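
For illustration, a stripped-down sketch of the KeyBVec round trip above: the existing entries are joined with '.', re-tokenized, and reduced to WordNet lemmas. remove_latex and ParseKeyword are project helpers, so identity stubs stand in for them here (an assumption, not the real implementations):

import nltk

def remove_latex_stub(s):    # hypothetical stand-in for remove_latex
    return s

def parse_keyword_stub(s):   # hypothetical stand-in for Standard.ParseKeyword
    return s

keybvec = ['quadratic formula', 'negative discriminant']
raw = parse_keyword_stub(remove_latex_stub('.'.join(keybvec)))
tagged = nltk.pos_tag(nltk.word_tokenize(raw))
lemmas = [nltk.corpus.wordnet.morphy(w.lower()) for w, t in tagged]
print([w for w in lemmas if w])   # lemmas that survive the None filter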
Code example #3
def preprocess(text, use_part_of_speech):
    # Tokenize and part-of-speech tag
    text_tokens = nltk.word_tokenize(text)
    part_of_speech_tagged_words = nltk.pos_tag(text_tokens)
    text_words = [nltk.corpus.wordnet.morphy(word.lower()) for (word, tag) in part_of_speech_tagged_words]
    text_words_proper = list(word for word in text_words if word)
    # Optionally prefix each word with WordNet part-of-speech indicator (e.g., ['fast', 'car'] => ['a:fast', 'n:car'])
    if use_part_of_speech:
        text_words_proper = [wordnet.get_part_of_speech(tag) + ":" + word
                             for (word, (token, tag)) in zip(text_words, part_of_speech_tagged_words) 
                             if word]
    return text_words_proper
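
The `if word` filter above matters because nltk.corpus.wordnet.morphy returns None for tokens it cannot map to a WordNet lemma, and the optional prefix relies on the project's wordnet.get_part_of_speech helper. A small sketch, with a hypothetical Penn-tag-to-WordNet-letter mapping standing in for that helper:

import nltk
from nltk.corpus import wordnet as wn

def penn_to_wordnet_letter(tag):
    # Hypothetical stand-in for wordnet.get_part_of_speech
    if tag.startswith('J'):
        return 'a'
    if tag.startswith('V'):
        return 'v'
    if tag.startswith('R'):
        return 'r'
    return 'n'

tagged = nltk.pos_tag(nltk.word_tokenize('The fastest cars braked suddenly'))
words = [wn.morphy(w.lower()) for w, t in tagged]
print(words)      # out-of-vocabulary tokens (e.g. 'The') come back as None
prefixed = [penn_to_wordnet_letter(t) + ':' + w
            for (w, (tok, t)) in zip(words, tagged) if w]
print(prefixed)   # roughly ['a:fast', 'n:car', 'v:brake', 'r:suddenly']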
Code example #4
    def SentenceAnalysis(self, fulltext, textfdist):
        ans_sentencelist = []
        text = fulltext.replace('\n', ' ')

        # Separate text into sentences
        # TODO: See if NLTK sentence tokenizer works better
        #p = re.compile(r'.+\.')
        p = re.compile(r'([\w\"\'\<\(][\S ]+?[\.!?])[ \n\"]')
        keysen = p.findall(text)
        sen_no = 0
        for sen in keysen:
            debug_print("sen: " + str(sen), level=6)
            sen_no += 1
            # Tokenize text, part-of-speech tag, derive WordNet base word (lemma), and then add information for words found.
            # Note: An optional part-of-speech tag prefix can be included.
            # TODO: Isolate text preprocessing code in a separate function 
            text = nltk.word_tokenize(sen)
            part_of_speech_tagged_words = nltk.pos_tag(text)
            text_words = list(nltk.corpus.wordnet.morphy(word.lower()) for (word, tag) in part_of_speech_tagged_words)
            text_words_proper = list(word for word in text_words if word)
            if self.use_part_of_speech:
                # Prefix each word with wordnet part-of-speech indicator (e.g., ['fast', 'car'] => ['a:fast', 'n:car'])
                text_words_proper = [wordnet.get_part_of_speech(tag) + ":" + word
                                     for (word, (token, tag)) in zip(text_words, part_of_speech_tagged_words) 
                                     if word]
            ans_sentencelist.append({'StuS': sen,
                                     'StuWords': text_words_proper,
                                     'No': sen_no})

        # Compute TF/IDF-style weighting scheme
        for sentence in ans_sentencelist:
            debug_print("sentence: " + str(sentence), level=6)
            fdist = nltk.FreqDist(sentence['StuWords'])
            max_freq = max([f for f in fdist.values()])
            log_max_freq = math.log(max_freq) if (max_freq > 1) else 1
            senvec = {}
            for word in sorted(textfdist):
                if fdist[word]:
                    wordfreq = sum(1 for senten in ans_sentencelist if word in senten['StuWords'])
                    if (self.use_true_tf_idf):
                        tf = 1 + math.log(fdist[word]) / log_max_freq
                        idf = 1 + math.log(len(keysen) / wordfreq)
                        senvec[word] = tf * idf
                    else:
                        senvec[word] = (1 + math.log(2.0 * fdist[word])) * math.log(2.0 * len(keysen) / wordfreq)
                else:
                    senvec[word] = 0
            sentence['StuSVec'] = senvec
        debug_print("Answer.SentenceAnalysis(%s,_) => %s" % (str(fulltext), str(ans_sentencelist)), level=6)
        debug_print("\t_ [textfdist]: %s" % str(textfdist), level=7)
        return ans_sentencelist
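
To make the weighting concrete: with use_true_tf_idf the sentence vector entry is (1 + log f / log max_f) * (1 + log(N/n)); otherwise it is (1 + log 2f) * log(2N/n), where f is the word's frequency within the sentence, max_f the largest within-sentence frequency, N = len(keysen), and n the number of sentences containing the word. A standalone sketch (floating-point division is assumed here):

import math

def weight(f, max_f, n_sentences, n_containing, true_tf_idf=False):
    # Mirrors the two scoring branches above; assumes f > 0
    if true_tf_idf:
        log_max = math.log(max_f) if max_f > 1 else 1
        tf = 1 + math.log(f) / log_max
        idf = 1 + math.log(float(n_sentences) / n_containing)
        return tf * idf
    return (1 + math.log(2.0 * f)) * math.log(2.0 * n_sentences / n_containing)

# A word occurring twice in a sentence (highest in-sentence frequency 3),
# appearing in 4 of 10 sentences:
print(weight(2, 3, 10, 4))                    # ~3.84
print(weight(2, 3, 10, 4, true_tf_idf=True))  # ~3.13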
Code example #5
    def SentenceAnalysis(self, fulltext, textfdist):
        debug_print("Answer.SentenceAnalysis(_,_)", level=5)
        ans_sentencelist = []
        # Perform text normalization, while preserving offsets
        text = fulltext.replace('\n', ' ')

        # Separate text into sentences
        # TODO: See if NLTK sentence tokenizer works better
        ## OLD: p = re.compile(r'.+\.')
        p = re.compile(r'([\w\"\'\<\(][\S ]+?[\.!?])[ \n\"]')
        ## OLD: keysen = p.findall(text)
        offset = 0
        keysen = []
        starts = []
        ends = []
        while (len(text) > 0):
            match = p.search(text)
            if not match:
                break
            keysen.append(match.group(0))
            starts.append(offset + match.start(0))
            ends.append(offset + match.end(0))
            text = text[match.end(0) : ]
            offset += match.end(0)

        # Create hash entries for each sentence
        sen_no = 0
        for sen in keysen:
            debug_print("sen: " + str(sen), level=6)
            sen_no += 1
            # Tokenize text, part-of-speech tag, derive WordNet base word (lemma), and then add information for words found.
            # Note: An optional part-of-speech tag prefix can be included.
            # TODO: Isolate text preprocessing code in a separate function 
            text = nltk.word_tokenize(sen)
            part_of_speech_tagged_words = nltk.pos_tag(text)
            text_words = list(nltk.corpus.wordnet.morphy(word.lower()) for (word, tag) in part_of_speech_tagged_words)
            text_words_proper = list(word for word in text_words if word)
            if self.use_part_of_speech:
                # Prefix each word with wordnet part-of-speech indicator (e.g., ['fast', 'car'] => ['a:fast', 'n:car'])
                text_words_proper = [wordnet.get_part_of_speech(tag) + ":" + word
                                     for (word, (token, tag)) in zip(text_words, part_of_speech_tagged_words) 
                                     if word]
            ans_sentencelist.append({'StuS': sen,
                                     'StuWords': text_words_proper,
                                     'No': sen_no, 'Start': starts[sen_no - 1], 'End': ends[sen_no - 1]})

        # Compute TF/IDF-style weighting scheme
        for sentence in ans_sentencelist:
            debug_print("sentence: " + str(sentence), level=6)
            fdist = nltk.FreqDist(sentence['StuWords'])
            try:
                max_freq = max([f for f in fdist.values()])
            except ValueError:
                print_stderr("Exception in Answer.SentenceAnalysis: " + str(sys.exc_info()))
                max_freq = 1
            log_max_freq = math.log(max_freq) if (max_freq > 1) else 1
            senvec = {}
            for word in sorted(textfdist):
                if fdist[word]:
                    wordfreq = sum(1 for senten in ans_sentencelist if word in senten['StuWords'])
                    if (self.use_true_tf_idf):
                        tf = 1 + math.log(fdist[word]) / log_max_freq
                        idf = 1 + math.log(len(keysen) / wordfreq)
                        senvec[word] = tf * idf
                    else:
                        senvec[word] = (1 + math.log(2.0 * fdist[word])) * math.log(2.0 * len(keysen) / wordfreq)
                else:
                    senvec[word] = 0
            sentence['StuSVec'] = senvec
        debug_print("Answer.SentenceAnalysis(%s,_) => %s" % (str(fulltext), str(ans_sentencelist)), level=6)
        debug_print("\t_ [textfdist]: %s" % str(textfdist), level=7)
        return ans_sentencelist
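
A short standalone sketch of the offset-preserving splitting loop above: the same regex is applied repeatedly to the remaining text, and the recorded start/end offsets index back into the original (newline-flattened) string:

import re

p = re.compile(r'([\w\"\'\<\(][\S ]+?[\.!?])[ \n\"]')
fulltext = 'First sentence. Second one! A third? '
text = fulltext
offset = 0
spans = []
while text:
    match = p.search(text)
    if not match:
        break
    # Record the matched sentence plus its absolute offsets in fulltext
    spans.append((match.group(0), offset + match.start(0), offset + match.end(0)))
    offset += match.end(0)
    text = text[match.end(0):]

for sen, start, end in spans:
    print((start, end, fulltext[start:end]))

Each recorded span corresponds to one sentence dict's 'Start' and 'End' entries in SentenceAnalysis.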