def synset_distance(statement, other_statement):
    """
    Calculate the similarity of two statements.

    Both statement texts are lower-cased, tokenized and stripped of English
    stop words. For every pair of words (one from each statement) whose
    WordNet lookups both return synsets, the highest ``path_similarity``
    found between any two of their synsets is added to a running total.

    :param statement: Object whose ``text`` attribute holds the first string.
    :param other_statement: Object whose ``text`` attribute holds the second
        string.
    :returns: The sum of the best similarity of each word pair, or ``0``
        when no word pair has synsets on both sides. The total is a plain
        sum; it is not normalized by the number of words.
    """
    from chatterbot.utils.pos_tagger import POSTagger
    from chatterbot.utils.stop_words import StopWordsManager
    from chatterbot.utils.word_net import Wordnet
    import itertools

    wordnet = Wordnet()
    tagger = POSTagger()
    stopwords = StopWordsManager()

    def get_tokens(text, exclude_stop_words=True):
        """
        Lower-case and tokenize a string.

        If 'exclude_stop_words' is True, the result is a set of the tokens
        with common English stop words ("is", "the", "a", ...) removed;
        otherwise the tokenizer's output is returned unchanged.
        """
        tokens = tagger.tokenize(text.lower())

        # Remove any stop words from the string
        if exclude_stop_words:
            tokens = set(tokens) - set(stopwords.words('english'))

        return tokens

    # Each distinct word is looked up at most once; without this cache the
    # same word would be queried again for every pair it takes part in.
    synset_cache = {}

    def get_synsets(word):
        """Return the synsets of a word, querying WordNet once per word."""
        if word not in synset_cache:
            synset_cache[word] = wordnet.synsets(word)
        return synset_cache[word]

    tokens1 = get_tokens(statement.text)
    tokens2 = get_tokens(other_statement.text)

    total_similarity = 0

    # Visit the word pairs in the same order as a product of the two token
    # collections so the floating point sum is accumulated identically.
    for word1 in tokens1:
        synsets1 = get_synsets(word1)

        # A word without synsets cannot contribute to any pair
        if not synsets1:
            continue

        for word2 in tokens2:
            synsets2 = get_synsets(word2)

            if not synsets2:
                continue

            max_similarity = 0

            # Get the highest similarity for each combination of synsets
            for first, second in itertools.product(synsets1, synsets2):
                similarity = first.path_similarity(second)

                # Falsy results (presumably None when the two synsets are
                # not connected) never raise the maximum.
                if similarity and (similarity > max_similarity):
                    max_similarity = similarity

            # Add the most similar path value to the total
            total_similarity += max_similarity

    return total_similarity
def __init__(self, **kwargs):
    """
    Set up the adapter.

    All keyword arguments are forwarded to the parent class initializer.
    Afterwards the helper objects used by this adapter are created and
    stored on the instance as ``wordnet``, ``tagger`` and ``stopwords``.
    """
    super(ClosestMeaningAdapter, self).__init__(**kwargs)

    # Helper objects, created once per adapter instance
    self.wordnet = Wordnet()
    self.tagger = POSTagger()
    self.stopwords = StopWordsManager()
def test_word_net(self):
    """
    Check the path similarity between the first two synsets that
    Wordnet returns for the word 'test'.
    """
    wordnet = Wordnet()
    synsets = wordnet.synsets('test')

    similarity = synsets[0].path_similarity(synsets[1])

    # The similarity is a computed float, so compare with a tolerance
    # instead of requiring bit-for-bit equality.
    self.assertAlmostEqual(0.06666666666666667, similarity)