def CalVector(self, sentencelist):
    debug_print("Standard.CalVector(%s)" % sentencelist, level=5)
    text_words = []
    # Gather words from all sentences
    for sentence in sentencelist:
        debug_print("sentence: " + str(sentence), level=6)
        raw = self.ParseKeyword(sentence['KeyS'])
        text = nltk.word_tokenize(raw)
        part_of_speech_tagged_words = nltk.pos_tag(text)
        debug_print("part_of_speech_tagged_words = %s" % str(part_of_speech_tagged_words), level=4)
        stopwords_list = nltk.corpus.stopwords.raw('english').split()
        # OLD: word.lower() + '/' + tag
        words = list(nltk.corpus.wordnet.morphy(word.lower())
                     for word, tag in part_of_speech_tagged_words
                     # TODO: allow for comparatives and particles (e.g., back/RP)
                     if (tag.startswith('V') or tag.startswith('NN')
                         or tag == 'JJ' or tag == 'DET' or tag == 'RB')
                     and word not in stopwords_list)
        words_proper = list(word for word in words if word)
        if self.use_part_of_speech:
            # Prefix each word with wordnet part-of-speech indicator
            # (e.g., ['fast', 'car'] => ['a:fast', 'n:car'])
            words_proper = [wordnet.get_part_of_speech(tag) + ":" + word
                            for (word, (token, tag)) in zip(words, part_of_speech_tagged_words)
                            if word]
        debug_print("words_proper: " + str(words_proper), level=7)
        # Remove empty words and store in SenWords property
        sentence['SenWords'] = words_proper
        text_words += sentence['SenWords']
    # Get frequency distribution
    debug_print("text_words: " + str(text_words), level=6)
    textfdist = nltk.FreqDist(text_words)
    debug_print("Standard.CalVector => %s" % str(textfdist), level=5)
    return textfdist
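# Note: CalVector returns an nltk.FreqDist keyed by the lemmas extracted from the
# standard (reference) sentences; SentenceAnalysis below iterates over
# sorted(textfdist), so each sentence vector ('StuSVec') is indexed by this same
# reference vocabulary.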
def UpdateKBVec(self, sentencelist):
    """Update KeyBVec with only proper words"""
    for sentence in sentencelist:
        debug_print("sentence['KeyBVec'] = %s" % str(sentence['KeyBVec']), level=6)
        keybvec = '.'.join(sentence['KeyBVec'])
        debug_print("remove_latex(keybvec) = %s" % remove_latex(keybvec), level=6)
        raw = self.ParseKeyword(remove_latex(keybvec))
        debug_print("raw = %s" % raw, level=6)
        text = nltk.word_tokenize(raw)
        part_of_speech_tagged_words = nltk.pos_tag(text)
        stopwords_list = nltk.corpus.stopwords.raw('english').split()
        # OLD: word.lower() + '/' + tag
        words = list(nltk.corpus.wordnet.morphy(word.lower())
                     for word, tag in part_of_speech_tagged_words
                     if (tag.startswith('V') or tag.startswith('NN')
                         or tag == 'JJ' or tag == 'DET' or tag == 'RB')
                     and word not in stopwords_list)
        words_proper = list(word for word in words if word)
        if self.use_part_of_speech:
            # Prefix each word with wordnet part-of-speech indicator
            # (e.g., ['fast', 'car'] => ['a:fast', 'n:car'])
            words_proper = [wordnet.get_part_of_speech(tag) + ":" + word
                            for (word, (token, tag)) in zip(words, part_of_speech_tagged_words)
                            if word]
        debug_print("words_proper: " + str(words_proper), level=7)
        sentence['KeyBVec'] = words_proper
def preprocess(text, use_part_of_speech):
    """Tokenize TEXT, part-of-speech tag it, and return the WordNet base forms
    (lemmas), optionally prefixed with a part-of-speech indicator."""
    # Tokenize and part-of-speech tag
    text_tokens = nltk.word_tokenize(text)
    part_of_speech_tagged_words = nltk.pos_tag(text_tokens)
    text_words = [nltk.corpus.wordnet.morphy(word.lower())
                  for (word, tag) in part_of_speech_tagged_words]
    text_words_proper = list(word for word in text_words if word)
    # Optionally prefix each word with WordNet part-of-speech indicator
    # (e.g., ['fast', 'car'] => ['a:fast', 'n:car'])
    if use_part_of_speech:
        text_words_proper = [wordnet.get_part_of_speech(tag) + ":" + word
                             for (word, (token, tag)) in zip(text_words, part_of_speech_tagged_words)
                             if word]
    return text_words_proper
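# Illustrative usage sketch for preprocess() (not part of the original module).
# It assumes the NLTK 'punkt', 'averaged_perceptron_tagger', and 'wordnet' data
# packages are installed; exact output depends on the tagger and WordNet version.
#
#     lemmas = preprocess("Fast cars raced.", use_part_of_speech=False)
#     # => roughly ['fast', 'car', 'race']; tokens with no WordNet lemma
#     #    (e.g., 'the', punctuation) are dropped.
#     tagged = preprocess("Fast cars raced.", use_part_of_speech=True)
#     # => the same lemmas prefixed with a part-of-speech letter,
#     #    e.g. something like ['a:fast', 'n:car', 'v:race'].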
def SentenceAnalysis(self, fulltext, textfdist):
    ans_sentencelist = []
    text = fulltext.replace('\n', ' ')
    # Separate text into sentences
    # TODO: See if NLTK sentence tokenizer works better
    # OLD: p = re.compile(r'.+\.')
    p = re.compile(r'([\w\"\'\<\(][\S ]+?[\.!?])[ \n\"]')
    keysen = p.findall(text)
    sen_no = 0
    for sen in keysen:
        debug_print("sen: " + str(sen), level=6)
        sen_no += 1
        # Tokenize text, part-of-speech tag, derive WordNet base word (lemma),
        # and then add information for words found.
        # Note: An optional part-of-speech tag prefix can be included.
        # TODO: Isolate text preprocessing code in a separate function
        text = nltk.word_tokenize(sen)
        part_of_speech_tagged_words = nltk.pos_tag(text)
        text_words = list(nltk.corpus.wordnet.morphy(word.lower())
                          for (word, tag) in part_of_speech_tagged_words)
        text_words_proper = list(word for word in text_words if word)
        if self.use_part_of_speech:
            # Prefix each word with wordnet part-of-speech indicator
            # (e.g., ['fast', 'car'] => ['a:fast', 'n:car'])
            text_words_proper = [wordnet.get_part_of_speech(tag) + ":" + word
                                 for (word, (token, tag)) in zip(text_words, part_of_speech_tagged_words)
                                 if word]
        ans_sentencelist.append({'StuS': sen, 'StuWords': text_words_proper, 'No': sen_no})
    # Compute TF/IDF-style weighting scheme
    for sentence in ans_sentencelist:
        debug_print("sentence: " + str(sentence), level=6)
        fdist = nltk.FreqDist(sentence['StuWords'])
        max_freq = max([f for f in fdist.values()])
        log_max_freq = math.log(max_freq) if (max_freq > 1) else 1
        senvec = {}
        for word in sorted(textfdist):
            if fdist[word]:
                wordfreq = sum(1 for senten in ans_sentencelist if word in senten['StuWords'])
                if (self.use_true_tf_idf):
                    tf = 1 + math.log(fdist[word]) / log_max_freq
                    idf = 1 + math.log(len(keysen) / wordfreq)
                    senvec[word] = tf * idf
                else:
                    senvec[word] = (1 + math.log(2.0 * fdist[word])) * math.log(2.0 * len(keysen) / wordfreq)
            else:
                senvec[word] = 0
        sentence['StuSVec'] = senvec
    debug_print("Answer.SentenceAnalysis(%s,_) => %s" % (str(fulltext), str(ans_sentencelist)), level=6)
    debug_print("\t_ [textfdist]: %s" % str(textfdist), level=7)
    return ans_sentencelist
def SentenceAnalysis(self, fulltext, textfdist):
    debug_print("Answer.SentenceAnalysis(_,_)", level=5)
    ans_sentencelist = []
    # Perform text normalization, while preserving offsets
    text = fulltext.replace('\n', ' ')
    # Separate text into sentences
    # TODO: See if NLTK sentence tokenizer works better
    ## OLD: p = re.compile(r'.+\.')
    p = re.compile(r'([\w\"\'\<\(][\S ]+?[\.!?])[ \n\"]')
    ## OLD: keysen = p.findall(text)
    offset = 0
    keysen = []
    starts = []
    ends = []
    while (len(text) > 0):
        match = p.search(text)
        if not match:
            break
        keysen.append(match.group(0))
        starts.append(offset + match.start(0))
        ends.append(offset + match.end(0))
        text = text[match.end(0):]
        offset += match.end(0)
    # Create hash entries for each sentence
    sen_no = 0
    for sen in keysen:
        debug_print("sen: " + str(sen), level=6)
        sen_no += 1
        # Tokenize text, part-of-speech tag, derive WordNet base word (lemma),
        # and then add information for words found.
        # Note: An optional part-of-speech tag prefix can be included.
        # TODO: Isolate text preprocessing code in a separate function
        text = nltk.word_tokenize(sen)
        part_of_speech_tagged_words = nltk.pos_tag(text)
        text_words = list(nltk.corpus.wordnet.morphy(word.lower())
                          for (word, tag) in part_of_speech_tagged_words)
        text_words_proper = list(word for word in text_words if word)
        if self.use_part_of_speech:
            # Prefix each word with wordnet part-of-speech indicator
            # (e.g., ['fast', 'car'] => ['a:fast', 'n:car'])
            text_words_proper = [wordnet.get_part_of_speech(tag) + ":" + word
                                 for (word, (token, tag)) in zip(text_words, part_of_speech_tagged_words)
                                 if word]
        ans_sentencelist.append({'StuS': sen, 'StuWords': text_words_proper, 'No': sen_no,
                                 'Start': starts[sen_no - 1], 'End': ends[sen_no - 1]})
    # Compute TF/IDF-style weighting scheme
    for sentence in ans_sentencelist:
        debug_print("sentence: " + str(sentence), level=6)
        fdist = nltk.FreqDist(sentence['StuWords'])
        try:
            max_freq = max([f for f in fdist.values()])
        except ValueError:
            print_stderr("Exception in Answer.SentenceAnalysis: " + str(sys.exc_info()))
            max_freq = 1
        log_max_freq = math.log(max_freq) if (max_freq > 1) else 1
        senvec = {}
        for word in sorted(textfdist):
            if fdist[word]:
                wordfreq = sum(1 for senten in ans_sentencelist if word in senten['StuWords'])
                if (self.use_true_tf_idf):
                    tf = 1 + math.log(fdist[word]) / log_max_freq
                    idf = 1 + math.log(len(keysen) / wordfreq)
                    senvec[word] = tf * idf
                else:
                    senvec[word] = (1 + math.log(2.0 * fdist[word])) * math.log(2.0 * len(keysen) / wordfreq)
            else:
                senvec[word] = 0
        sentence['StuSVec'] = senvec
    debug_print("Answer.SentenceAnalysis(%s,_) => %s" % (str(fulltext), str(ans_sentencelist)), level=6)
    debug_print("\t_ [textfdist]: %s" % str(textfdist), level=7)
    return ans_sentencelist
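# Illustrative helper (not part of the original module) that factors out the
# per-word TF/IDF-style weight computed inside SentenceAnalysis above, to make
# the formula easier to read. The name and default argument are assumptions for
# this sketch only; it relies on the module's existing `import math`.
def _tf_idf_weight(word_count, max_count, num_sentences, sentences_with_word,
                   use_true_tf_idf=True):
    """Return the weight for a word occurring WORD_COUNT times in a sentence,
    given the sentence's maximum term count and the word's sentence frequency."""
    if word_count == 0:
        return 0
    if use_true_tf_idf:
        # Augmented term frequency, normalized by the log of the most frequent term
        log_max = math.log(max_count) if (max_count > 1) else 1
        tf = 1 + math.log(word_count) / log_max
        # Inverse sentence frequency over the answer's sentences
        idf = 1 + math.log(num_sentences / sentences_with_word)
        return tf * idf
    # Legacy variant used when use_true_tf_idf is disabled
    return (1 + math.log(2.0 * word_count)) * math.log(2.0 * num_sentences / sentences_with_word)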