def get_hypernyms(self, pos_tags):
    """
    Return the hypernyms for each word in a list of POS tagged words.

    Each entry in the result is either the first hypernym's lemma name
    for the word's first synset, or the original word when no synset
    or hypernym is available.
    """
    hypernym_words = []

    for word, pos in pos_tags:
        try:
            synsets = wordnet.synsets(
                word,
                utils.treebank_to_wordnet(pos),
                lang=self.language.ISO_639
            )
        except (WordNetError, LookupError):
            # WordNetError: the lookup itself failed.
            # LookupError: the language is not supported.
            # In both cases fall back to having no synsets.
            synsets = None

        hypernyms = synsets[0].hypernyms() if synsets else []

        if hypernyms:
            # Synset names look like 'dog.n.01'; keep only the lemma part
            hypernym_words.append(hypernyms[0].name().split('.')[0])
        else:
            hypernym_words.append(word)

    return hypernym_words
def compare(self, statement, other_statement):
    """
    Return the calculated similarity of two statements
    based on the Jaccard index.

    :param statement: A statement object with a ``text`` attribute.
    :param other_statement: A statement object with a ``text`` attribute.
    :returns: A similarity ratio in the range 0 to 1.
    """
    import nltk
    import string

    # Get default English stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    # Make both strings lowercase
    a = statement.text.lower()
    b = other_statement.text.lower()

    # Remove punctuation from each string
    table = str.maketrans(dict.fromkeys(string.punctuation))
    a = a.translate(table)
    b = b.translate(table)

    pos_a = nltk.pos_tag(nltk.tokenize.word_tokenize(a))
    pos_b = nltk.pos_tag(nltk.tokenize.word_tokenize(b))

    lemma_a = [
        lemmatizer.lemmatize(
            token, utils.treebank_to_wordnet(pos)
        ) for token, pos in pos_a if token not in stopwords
    ]
    lemma_b = [
        lemmatizer.lemmatize(
            token, utils.treebank_to_wordnet(pos)
        ) for token, pos in pos_b if token not in stopwords
    ]

    # Calculate Jaccard similarity
    numerator = len(set(lemma_a).intersection(lemma_b))
    denominator = float(len(set(lemma_a).union(lemma_b)))

    # Guard against division by zero: when neither statement yields
    # any lemmas (empty text or nothing but stopwords/punctuation),
    # the Jaccard ratio is undefined — treat it as zero similarity
    # instead of raising ZeroDivisionError.
    if not denominator:
        return 0

    return numerator / denominator
def compare(self, statement, other_statement):
    """
    Return the calculated similarity of two statements
    based on the Jaccard index.

    :param statement: A statement object with a ``text`` attribute.
    :param other_statement: A statement object with a ``text`` attribute.
    :returns: A similarity ratio in the range 0 to 1.
    """
    from nltk import pos_tag

    word_tokenizer = self.get_word_tokenizer()

    # Get the stopwords for the current language
    stopwords = self.get_stopwords()

    lemmatizer = self.get_lemmatizer()

    # Make both strings lowercase
    a = statement.text.lower()
    b = other_statement.text.lower()

    # Remove punctuation from each string
    a = a.translate(self.punctuation_table)
    b = b.translate(self.punctuation_table)

    pos_a = pos_tag(word_tokenizer.tokenize(a))
    pos_b = pos_tag(word_tokenizer.tokenize(b))

    lemma_a = [
        lemmatizer.lemmatize(token, utils.treebank_to_wordnet(pos))
        for token, pos in pos_a if token not in stopwords
    ]
    lemma_b = [
        lemmatizer.lemmatize(token, utils.treebank_to_wordnet(pos))
        for token, pos in pos_b if token not in stopwords
    ]

    # Calculate Jaccard similarity
    numerator = len(set(lemma_a).intersection(lemma_b))
    denominator = float(len(set(lemma_a).union(lemma_b)))

    # Guard against division by zero: when neither statement yields
    # any lemmas (empty text or nothing but stopwords/punctuation),
    # the Jaccard ratio is undefined — treat it as zero similarity
    # instead of raising ZeroDivisionError.
    if not denominator:
        return 0

    return numerator / denominator
def get_hypernyms(self, pos_tags):
    """
    Return the hypernyms for each word in a list of POS tagged words.

    Each entry in the result is either the first hypernym's lemma name
    for the word's first synset, or the original word when no synset
    or hypernym is available.
    """
    hypernym_words = []

    for word, pos in pos_tags:
        synsets = wordnet.synsets(word, treebank_to_wordnet(pos))
        hypernyms = synsets[0].hypernyms() if synsets else []

        if hypernyms:
            # Synset names look like 'dog.n.01'; keep only the lemma part
            hypernym_words.append(hypernyms[0].name().split('.')[0])
        else:
            hypernym_words.append(word)

    return hypernym_words
def test_treebank_to_wordnet_no_match(self):
    # An unrecognized treebank tag has no WordNet part-of-speech equivalent
    self.assertIsNone(utils.treebank_to_wordnet('XXX'))
def test_treebank_to_wordnet(self):
    # The plural-noun treebank tag 'NNS' maps to the WordNet noun tag 'n'
    wordnet_pos = utils.treebank_to_wordnet('NNS')
    self.assertEqual('n', wordnet_pos)