def simplified_lesk(word: str, sentence: str) -> Synset:
    """Disambiguate *word* using the simplified Lesk algorithm.

    For each sense of the word, builds a "signature" (lemmatized tokens from
    the sense's gloss and example sentences) and picks the sense whose
    signature overlaps most with the lemmatized tokens of *sentence*.

    :param word: word to disambiguate
    :param sentence: sentence in which the word appears
    :return: the Synset maximizing the overlap (first/most-frequent sense on
        ties or zero overlap), or ``Synset(None)`` if the word has no synsets
    """
    synsets = wordnet.synsets(word)
    try:
        lemmatizer = WordNetLemmatizer()
        best_sense = synsets[0]  # default: WordNet's most frequent sense
        max_overlap = 0
        context = {lemmatizer.lemmatize(token) for token in sentence.split(" ")}
        for sense in synsets:
            # Signature: lemmas from the gloss...
            signature = {lemmatizer.lemmatize(token)
                         for token in sense.definition().split(" ")}
            # ...plus lemmas from every example sentence.
            # Bug fix: set.union() returns a NEW set; the original discarded
            # that result, so examples never contributed to the signature.
            for example in sense.examples():
                signature |= {lemmatizer.lemmatize(token)
                              for token in example.split(" ")}
            overlap = len(signature & context)
            if overlap > max_overlap:
                max_overlap = overlap
                best_sense = sense
        return best_sense
    except IndexError:
        # synsets[0] failed: the word is not in WordNet. The original used a
        # bare except; IndexError is the only expected failure here.
        return Synset(None)
def extended_context_lesk(word: str, sentence: str) -> Synset:
    """Disambiguate *word* with an extended Lesk: glosses of the sense's
    hypernyms and hyponyms are added to the signature, and English stopwords
    are removed from it before computing the overlap.

    :param word: the word to be disambiguated
    :param sentence: input sentence which contains param 'word'
    :return: the Synset maximizing the overlap (first/most-frequent sense on
        ties or zero overlap), or ``Synset(None)`` if the word has no synsets
    """
    stopwords_set = set(stopwords.words('english'))
    synsets = wordnet.synsets(word)
    try:
        lemmatizer = WordNetLemmatizer()
        best_sense = synsets[0]  # default: WordNet's most frequent sense
        max_overlap = 0
        context = {lemmatizer.lemmatize(token) for token in sentence.split(" ")}
        for sense in synsets:
            # Signature: lemmas from the sense's own gloss...
            signature = {lemmatizer.lemmatize(token)
                         for token in sense.definition().split(" ")}
            # ...its example sentences...
            # Bug fix: set.union() returns a NEW set; the original discarded
            # that result, so examples never contributed to the signature.
            for example in sense.examples():
                signature |= {lemmatizer.lemmatize(token)
                              for token in example.split(" ")}
            # ...and the glosses of its hypernyms and hyponyms.
            for hypernym in sense.hypernyms():
                signature |= {lemmatizer.lemmatize(token)
                              for token in hypernym.definition().split(" ")}
            for hyponym in sense.hyponyms():
                signature |= {lemmatizer.lemmatize(token)
                              for token in hyponym.definition().split(" ")}
            # Bug fix: set.difference() also returns a new set; the original
            # discarded it, so stopwords were never actually filtered out.
            signature -= stopwords_set
            overlap = len(signature & context)
            if overlap > max_overlap:
                max_overlap = overlap
                best_sense = sense
        return best_sense
    except IndexError:
        # synsets[0] failed: the word is not in WordNet. The original used a
        # bare except; IndexError is the only expected failure here.
        return Synset(None)
def lowest_common_hypernyms(self, synset, other, simulate_root=False, use_min_depth=False):
    """-- NOTE: THIS CODE IS ADAPTED FROM NLTK3 --

    Return the lowest synset(s) shared as a hypernym by both input synsets.

    With ``use_min_depth == False`` the common hypernym(s) with the greatest
    maximum depth are returned (all of them, if several share that depth).
    With ``use_min_depth == True`` the minimum depth is used instead,
    preserving NLTK2 behavior; NLTK3 switched to max depth for more accurate
    results in a few cases (e.g. 'chef.n.01' vs 'fireman.n.01'). This is an
    implementation of Ted Pedersen's "Lowest Common Subsumer" from the Perl
    WordNet module; it may return ``synset`` or ``other`` themselves when one
    is a hypernym of the other. When ``self.core_taxonomy`` is set, candidates
    are additionally restricted to ``self.known_concepts``.

    :type other: Synset
    :param other: other input synset
    :type simulate_root: bool
    :param simulate_root: The verb taxonomies do not share a single root, so
        unconnected synsets would otherwise yield no result. When True, a fake
        root joining all taxonomies is added. Nouns usually have a default
        root, except in WordNet 1.6 where the fake root is needed too.
    :type use_min_depth: bool
    :param use_min_depth: mimic older (v2) NLTK behavior by selecting on
        min_depth; known to give odd results for some pairs but kept for
        backwards compatibility.
    :return: the synsets that are the lowest common hypernyms of both inputs
    """
    root = Synset(None)
    root._name = '*ROOT*'
    root.hypernyms = lambda: []
    root.instance_hypernyms = lambda: []

    first_paths = synset._iter_hypernym_lists()
    second_paths = other._iter_hypernym_lists()
    if simulate_root:
        # Append the fake root as a final hypernym level on both sides.
        first_paths = chain(first_paths, [[root]])
        second_paths = chain(second_paths, [[root]])

    # Flatten each side's hypernym levels into a set of ancestor synsets.
    candidates = {s for level in first_paths for s in level}
    other_ancestors = {s for level in second_paths for s in level}

    if self.core_taxonomy is not None:
        # Restrict both sides to the concepts this taxonomy knows about.
        known = {wordnet.synset(name) for name in self.known_concepts}
        candidates &= known
        other_ancestors &= known

    # Keep only the ancestors common to both synsets.
    candidates &= other_ancestors

    depth_of = (lambda s: s.min_depth()) if use_min_depth else (lambda s: s.max_depth())
    try:
        deepest = max(depth_of(s) for s in candidates)
    except ValueError:
        # Empty intersection: the synsets share no (known) hypernym.
        return []
    return sorted(s for s in candidates if depth_of(s) == deepest)