Example #1
0
    def get_hypernyms(self, pos_tags):
        """
        Map each POS tagged word to the name of a hypernym, when one exists.

        For every ``(word, pos)`` pair the first synset found by WordNet is
        used, and the first hypernym of that synset supplies the result (the
        part of its name before the first ``.``). The word itself is kept
        whenever no synset or no hypernym is available.

        :param pos_tags: An iterable of ``(word, treebank_pos)`` pairs.
        :returns: A list with one string per input pair, in input order.
        """
        hypernym_names = []

        for token, tag in pos_tags:
            try:
                candidates = wordnet.synsets(
                    token,
                    utils.treebank_to_wordnet(tag),
                    lang=self.language.ISO_639
                )
            except (WordNetError, LookupError):
                # A LookupError means the language is not supported;
                # in both cases behave as if no synsets were found.
                candidates = None

            # Default to the original word unless a hypernym is found
            label = token

            if candidates:
                parents = candidates[0].hypernyms()

                if parents:
                    label = parents[0].name().split('.')[0]

            hypernym_names.append(label)

        return hypernym_names
Example #2
0
    def compare(self, statement, other_statement):
        """
        Return the calculated similarity of two
        statements based on the Jaccard index.

        Each statement's text is lowercased, stripped of the characters in
        ``string.punctuation``, tokenized, POS tagged and lemmatized. Tokens
        found in the English stopword list are dropped. The similarity is the
        size of the intersection of the two lemma sets divided by the size of
        their union.

        :param statement: An object with a string ``text`` attribute.
        :param other_statement: An object with a string ``text`` attribute.
        :returns: A float between 0.0 and 1.0. When neither statement yields
            any lemma (for example both are empty, or contain only stopwords
            and punctuation) 0.0 is returned rather than dividing by zero.
        """
        import nltk
        import string

        # Get default English stopwords; a set keeps the per-token
        # membership test constant-time
        stopwords = set(nltk.corpus.stopwords.words('english'))

        lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

        # Translation table that removes punctuation from a string
        table = str.maketrans(dict.fromkeys(string.punctuation))

        def lemma_set(text):
            # Lowercase, strip punctuation, then tokenize and POS tag
            cleaned = text.lower().translate(table)
            tagged = nltk.pos_tag(nltk.tokenize.word_tokenize(cleaned))

            return {
                lemmatizer.lemmatize(token, utils.treebank_to_wordnet(pos))
                for token, pos in tagged if token not in stopwords
            }

        lemma_a = lemma_set(statement.text)
        lemma_b = lemma_set(other_statement.text)

        # Calculate Jaccard similarity
        union = lemma_a | lemma_b

        if not union:
            # No lemmas on either side: the ratio would be 0 / 0
            return 0.0

        return len(lemma_a & lemma_b) / float(len(union))
Example #3
0
    def compare(self, statement, other_statement):
        """
        Return the calculated similarity of two
        statements based on the Jaccard index.

        Each statement's text is lowercased, passed through
        ``self.punctuation_table``, tokenized with the tokenizer returned by
        ``self.get_word_tokenizer()``, POS tagged and lemmatized. Tokens found
        in ``self.get_stopwords()`` are dropped. The similarity is the size of
        the intersection of the two lemma sets divided by the size of their
        union.

        :param statement: An object with a string ``text`` attribute.
        :param other_statement: An object with a string ``text`` attribute.
        :returns: A float between 0.0 and 1.0. When neither statement yields
            any lemma (for example both are empty, or contain only stopwords
            and punctuation) 0.0 is returned rather than dividing by zero.
        """
        from nltk import pos_tag

        word_tokenizer = self.get_word_tokenizer()

        # Get the stopwords for the current language
        stopwords = self.get_stopwords()

        lemmatizer = self.get_lemmatizer()

        def lemma_set(text):
            # Lowercase, strip punctuation, then tokenize and POS tag
            cleaned = text.lower().translate(self.punctuation_table)
            tagged = pos_tag(word_tokenizer.tokenize(cleaned))

            return {
                lemmatizer.lemmatize(token, utils.treebank_to_wordnet(pos))
                for token, pos in tagged if token not in stopwords
            }

        lemma_a = lemma_set(statement.text)
        lemma_b = lemma_set(other_statement.text)

        # Calculate Jaccard similarity
        union = lemma_a | lemma_b

        if not union:
            # No lemmas on either side: the ratio would be 0 / 0
            return 0.0

        return len(lemma_a & lemma_b) / float(len(union))
Example #4
0
    def get_hypernyms(self, pos_tags):
        """
        Replace each POS tagged word with the name of a hypernym if possible.

        The first synset WordNet returns for a ``(word, pos)`` pair is
        consulted, and the first hypernym of that synset provides the output
        (its name up to the first ``.``). A word with no synsets, or whose
        first synset has no hypernyms, is passed through unchanged.

        :param pos_tags: An iterable of ``(word, treebank_pos)`` pairs.
        :returns: A list with one string per input pair, in input order.
        """
        names = []

        for token, tag in pos_tags:
            matches = wordnet.synsets(token, treebank_to_wordnet(tag))

            # Only look up hypernyms when at least one synset was found
            parents = matches[0].hypernyms() if matches else None

            names.append(
                parents[0].name().split('.')[0] if parents else token
            )

        return names
Example #5
0
 def test_treebank_to_wordnet_no_match(self):
     """
     A tag with no WordNet equivalent should convert to None.
     """
     wordnet_pos = utils.treebank_to_wordnet('XXX')

     self.assertEqual(wordnet_pos, None)
Example #6
0
 def test_treebank_to_wordnet(self):
     """
     The Treebank tag 'NNS' should convert to the WordNet tag 'n'.
     """
     wordnet_pos = utils.treebank_to_wordnet('NNS')

     self.assertEqual(wordnet_pos, 'n')