from functools import partial

from nltk.corpus import wordnet as wn

# NOTE: `nlp`, `vsm_similarity`, `supported_pos_tags`, `_get_wordnet_pos`,
# `_synonym_prefilter_fn`, and `SubstitutionCandidate` are expected to be
# defined at module level elsewhere in this file.


def _generate_synonym_candidates(doc, disambiguate=None, rank_fn=None):
    '''
    Generate synonym candidates.

    For each token in the doc, the list of WordNet synonyms is expanded.
    The synonyms are then ranked by their GloVe similarity to the original
    token and a context window around the token.

    :param disambiguate: Optional word-sense disambiguation function
            (e.g. a Lesk-based disambiguator) taking
            (sentence, word, pos=...) and returning a single WordNet synset.
            If None, all synsets matching the token's POS are expanded.
    :param rank_fn: Function that takes (doc, original_token, synonym) and
            returns a similarity score.
    '''
    if rank_fn is None:
        rank_fn = vsm_similarity

    candidates = []
    for position, token in enumerate(doc):
        if token.tag_ not in supported_pos_tags:
            continue
        wordnet_pos = _get_wordnet_pos(token)

        # Collect WordNet lemmas, either from the single disambiguated
        # synset or from every synset matching the token's POS.
        wordnet_synonyms = []
        if disambiguate is not None:
            try:
                synset = disambiguate(doc.text, token.text, pos=wordnet_pos)
                wordnet_synonyms = synset.lemmas()
            except Exception:
                # Skip tokens the disambiguator cannot resolve.
                continue
        else:
            synsets = wn.synsets(token.text, pos=wordnet_pos)
            for synset in synsets:
                wordnet_synonyms.extend(synset.lemmas())

        # Re-tokenize each lemma with spaCy so it can be compared to the
        # original token (multi-word lemmas keep only their first token).
        synonyms = []
        for wordnet_synonym in wordnet_synonyms:
            spacy_synonym = nlp(wordnet_synonym.name().replace('_', ' '))[0]
            synonyms.append(spacy_synonym)

        # Drop unsuitable synonyms, then rank the rest from most to least
        # similar to the original token (rank 0 = most similar).
        synonyms = filter(partial(_synonym_prefilter_fn, token), synonyms)
        synonyms = reversed(sorted(synonyms, key=partial(rank_fn, doc, token)))

        for rank, synonym in enumerate(synonyms):
            candidate = SubstitutionCandidate(
                token_position=position,
                similarity_rank=rank,
                original_token=token,
                candidate_word=synonym.text)
            candidates.append(candidate)

    return candidates
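# --- Illustrative sketch (not part of the original module) ------------------
# The default ranking function, `vsm_similarity`, is defined elsewhere in
# this module and is not shown here.  As a rough sketch of the interface
# `_generate_synonym_candidates` expects from `rank_fn`, a ranking function
# could compare the candidate synonym against a small context window around
# the original token.  The window size and the use of spaCy's
# `Span.similarity` below are assumptions, not the repository's actual
# implementation.
def _example_context_rank_fn(doc, original_token, synonym, window=3):
    """Score `synonym` by vector similarity to a context window around
    `original_token` (hypothetical example)."""
    start = max(0, original_token.i - window)
    end = min(len(doc), original_token.i + window + 1)
    # Span.similarity() compares the averaged span vector with the synonym
    # token's vector; this requires a pipeline loaded with word vectors.
    return doc[start:end].similarity(synonym)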
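# --- Illustrative usage sketch (not part of the original module) ------------
# Assumes the module-level `nlp` pipeline was loaded with word vectors,
# e.g. nlp = spacy.load('en_core_web_md').  The candidate with
# similarity_rank == 0 is the highest-ranked synonym for its token.
if __name__ == '__main__':
    example_doc = nlp("The film was a surprisingly good experience.")
    for cand in _generate_synonym_candidates(example_doc):
        if cand.similarity_rank == 0:
            print(cand.original_token.text, '->', cand.candidate_word)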