import string

import nltk
from nltk.wsd import lesk as nltk_lesk


def style_convert_string(self, input_text):
    """
    For each word in the input text, look up synonyms in the author's
    thesaurus and probabilistically select a replacement word.
    Return the converted text as a string.
    """
    text = tokenize_string(input_text)
    output = ""
    tagged_tuples = nltk.pos_tag(text)
    untagged_string = " ".join(
        [tagged_tuple[0] for tagged_tuple in tagged_tuples])
    for index, tagged_tuple in enumerate(tagged_tuples):
        orig_word, temp_pos = tagged_tuple
        word = orig_word.strip().lower()
        was_title = orig_word.istitle()        # "Title"
        was_capitalized = orig_word.isupper()  # "UPPER"
        was_lower = orig_word.islower()        # "lower"

        # Don't replace determiners
        if temp_pos == u'DT':
            weighted_key = word
        else:
            # Skip inflected verb forms and plural nouns
            if temp_pos in ['VBD', 'VBG', 'VBN', 'NNS', 'NNPS']:
                synset = None
            else:
                # Convert the Penn Treebank POS tag to a WordNet POS tag
                wordnet_pos = reduce_pos_tagset(temp_pos)
                if wordnet_pos:
                    synset = nltk_lesk(untagged_string, word, wordnet_pos)
                else:
                    synset = nltk_lesk(untagged_string, word)
            # Probabilistically choose a synonym in thesaurus[synset]
            weighted_key = self._weighted_choice_lesk(str(synset), word)

        # Match the capitalization of the original word
        if was_title:
            weighted_key = weighted_key.title()
        elif was_capitalized:
            weighted_key = weighted_key.upper()
        elif not was_lower:
            weighted_key = orig_word

        # Add a space between words, but no space before punctuation
        if word not in string.punctuation and index != 0:
            output += " "
        output += weighted_key
    return output
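
# The probabilistic replacement above hinges on _weighted_choice_lesk, which
# these snippets don't show. A minimal sketch, assuming the thesaurus maps
# str(synset) keys to {synonym: count} dicts and keeps the original word when
# no entry exists (both assumptions, not the project's actual code):
import random

def _weighted_choice_lesk(self, synset_key, word):
    """Pick a synonym with probability proportional to its count.

    Hypothetical sketch: assumes self.thesaurus[synset_key] is a
    {synonym: count} dict.
    """
    entry = self.thesaurus.get(synset_key)
    if not entry:
        # No thesaurus entry for this synset: keep the original word
        return word
    total = sum(entry.values())
    threshold = random.uniform(0, total)
    running = 0
    # Walk the cumulative counts until we pass the random threshold
    for synonym, count in entry.items():
        running += count
        if running >= threshold:
            return synonym
    return word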
def lesk_builtin(wsd):
    """Return the word sense for the synset found by Lesk's algorithm."""
    synset = nltk_lesk(wsd.context, wsd.lemma)
    if synset is not None:
        return get_first_sense_key(synset)
    else:
        logger.debug('synset empty for {}'.format(wsd.lemma))
        return None
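
# get_first_sense_key isn't shown either. One plausible version using NLTK's
# WordNet API, where each Lemma exposes a sense key via .key() (assumption:
# "first sense key" means the key of the synset's first lemma):

def get_first_sense_key(synset):
    """Return the sense key of the synset's first lemma.

    Hypothetical helper; yields a sense-key string such as
    'bank%1:17:01::' for Synset('bank.n.01').
    """
    return synset.lemmas()[0].key()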
def lesk(self, context, word):
    from nltk.wsd import lesk as nltk_lesk
    context_words = self.context2words(context)
    # Restrict disambiguation to noun senses ('n')
    return nltk_lesk(context_words, word, 'n')
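
# For reference, nltk.wsd.lesk takes a tokenized context, the ambiguous word,
# and an optional POS, and returns a Synset (or None if nothing matches).
# The classic example from the NLTK documentation, restricted to noun senses
# as the wrapper above does:

from nltk.wsd import lesk

context = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money']
print(lesk(context, 'bank', 'n'))
# -> a noun Synset such as Synset('savings_bank.n.02'); the exact sense
#    depends on the installed WordNet data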
def style_convert_lesk(self, infile_name, outfile_name):
    """
    For each word in the input text, look up synonyms in the author's
    thesaurus and probabilistically select a replacement word.
    Write the output to outfile.
    """
    with open(infile_name, 'r') as infile, open(outfile_name, 'w') as outfile:
        for line in infile:
            # POS-tag and Lesk-ify the input, then look each word up
            # in the thesauri
            try:
                line = line.decode('ascii', 'ignore')
            except (UnicodeDecodeError, UnicodeEncodeError):
                continue
            line = tokenize_string(line)
            tagged_tuples = nltk.pos_tag(line)
            untagged_string = " ".join(
                [tagged_tuple[0] for tagged_tuple in tagged_tuples])
            for index, tagged_tuple in enumerate(tagged_tuples):
                orig_word, temp_pos = tagged_tuple
                word = orig_word.strip().lower()
                was_title = orig_word.istitle()        # "Title"
                was_capitalized = orig_word.isupper()  # "UPPER"
                was_lower = orig_word.islower()        # "lower"

                # Don't replace determiners
                if temp_pos == u'DT':
                    weighted_key = word
                else:
                    # Skip inflected verb forms and plural nouns
                    if temp_pos in ['VBD', 'VBG', 'VBN', 'NNS', 'NNPS']:
                        synset = None
                    else:
                        # Convert the Penn Treebank POS tag to a WordNet POS tag
                        wordnet_pos = reduce_pos_tagset(temp_pos)
                        if wordnet_pos:
                            synset = nltk_lesk(untagged_string, word, wordnet_pos)
                        else:
                            synset = nltk_lesk(untagged_string, word)
                    # Probabilistically choose a synonym in thesaurus[synset];
                    # falls back to a non-WordNet entry if no synset was found
                    weighted_key = self._weighted_choice_lesk(str(synset), word)

                # Match the capitalization of the original word
                if was_title:
                    weighted_key = weighted_key.title()
                elif was_capitalized:
                    weighted_key = weighted_key.upper()
                elif not was_lower:
                    weighted_key = orig_word

                # Add a space between words, but no space before punctuation
                if word not in string.punctuation and index != 0:
                    outfile.write(" ")
                outfile.write(weighted_key)
            outfile.write('\n')
    return outfile_name
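
# Both conversion routines call reduce_pos_tagset to collapse Penn Treebank
# tags into WordNet's four-way tagset. It isn't included in these snippets;
# a minimal sketch using the conventional first-letter mapping (an assumption
# about the project's actual helper):

from nltk.corpus import wordnet

def reduce_pos_tagset(treebank_tag):
    """Map a Penn Treebank tag to a WordNet POS constant, or None.

    Hypothetical implementation based on the usual first-letter convention.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ    # 'a'
    elif treebank_tag.startswith('V'):
        return wordnet.VERB   # 'v'
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN   # 'n'
    elif treebank_tag.startswith('R'):
        return wordnet.ADV    # 'r'
    return None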