def test_wordnet(self):
    """Smoke-test the `wordnet` module: languages, lookups and similarity."""
    # The language inventory must be a list that contains Thai.
    self.assertIsInstance(wordnet.langs(), list)
    self.assertIn("tha", wordnet.langs())

    # Thai lemma names of a known synset.
    spy_thai_names = wordnet.synset("spy.n.01").lemma_names("tha")
    self.assertEqual(spy_thai_names, ["สปาย", "สายลับ"])

    # Lookup helpers only need to return something other than None.
    self.assertIsNotNone(wordnet.synsets("นก"))
    self.assertIsNotNone(wordnet.all_synsets(pos=wn.ADJ))
    self.assertIsNotNone(wordnet.lemmas("นก"))
    self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV))
    self.assertIsNotNone(wordnet.lemma("cat.n.01.cat"))

    # Morphological reduction of an English plural.
    self.assertEqual(wordnet.morphy("dogs"), "dog")

    # Each module-level similarity function must agree with the
    # equally named method on the synset itself.
    bird_synset = wordnet.synset("bird.n.01")
    mouse_synset = wordnet.synset("mouse.n.01")

    module_path = wordnet.path_similarity(bird_synset, mouse_synset)
    method_path = bird_synset.path_similarity(mouse_synset)
    self.assertEqual(module_path, method_path)

    module_wup = wordnet.wup_similarity(bird_synset, mouse_synset)
    method_wup = bird_synset.wup_similarity(mouse_synset)
    self.assertEqual(module_wup, method_wup)

    module_lch = wordnet.lch_similarity(bird_synset, mouse_synset)
    method_lch = bird_synset.lch_similarity(mouse_synset)
    self.assertEqual(module_lch, method_lch)

    # A key taken from the first lemma of the first synset must resolve.
    first_cat_synset = wordnet.synsets("แมว")[0]
    first_cat_lemma = first_cat_synset.lemmas()[0]
    self.assertIsNotNone(wordnet.lemma_from_key(first_cat_lemma.key()))
def test_wordnet(self):
    """Smoke-test the `wordnet` module: languages, lookups and similarity."""
    # The language inventory only needs to be available.
    self.assertIsNotNone(wordnet.langs())

    # Thai lemma names of a known synset.
    spy_thai_names = wordnet.synset("spy.n.01").lemma_names("tha")
    self.assertEqual(spy_thai_names, ["สปาย", "สายลับ"])

    # Lookup helpers only need to return something other than None.
    self.assertIsNotNone(wordnet.synsets("นก"))
    self.assertIsNotNone(wordnet.all_synsets(pos=wn.ADJ))
    self.assertIsNotNone(wordnet.lemmas("นก"))
    self.assertIsNotNone(wordnet.all_lemma_names(pos=wn.ADV))
    self.assertIsNotNone(wordnet.lemma("cat.n.01.cat"))

    # Morphological reduction of an English plural.
    self.assertEqual(wordnet.morphy("dogs"), "dog")

    # Each module-level similarity function must agree with the
    # equally named method on the synset itself.
    bird_synset = wordnet.synset("bird.n.01")
    mouse_synset = wordnet.synset("mouse.n.01")

    module_path = wordnet.path_similarity(bird_synset, mouse_synset)
    method_path = bird_synset.path_similarity(mouse_synset)
    self.assertEqual(module_path, method_path)

    module_wup = wordnet.wup_similarity(bird_synset, mouse_synset)
    method_wup = bird_synset.wup_similarity(mouse_synset)
    self.assertEqual(module_wup, method_wup)

    # A key taken from the first lemma of the first synset must resolve.
    first_cat_synset = wordnet.synsets("แมว")[0]
    first_cat_lemma = first_cat_synset.lemmas()[0]
    self.assertIsNotNone(wordnet.lemma_from_key(first_cat_lemma.key()))
def compute_wordnet_path_scores(pairs):
    """
    Compute WordNet path similarity for a list of input word pairs.

    Note: Thai WordNet has 3 methods to compute a similarity value:
    wordnet.path_similarity, wordnet.lch_similarity, wordnet.wup_similarity.
    lch_similarity can't be used here, and path_similarity seems to have
    better results than wup_similarity.

    The module-level WORDNET_PATH_SIMILARITY_TYPE selects the strategy:
      - 'first_synset': compare only the first synset of each word.
      - 'most_similar': take the highest similarity over all synset
        combinations of the two words.

    If we don't find a path between the two words, we add None to the
    result list.

    @param pairs: iterable of word pairs; pair[0] and pair[1] are each
        looked up with wordnet.synsets.
    @returns: the list of similarity scores (one entry per input pair),
        and the number of OOV word pairs, i.e. pairs where at least one
        word has no synset.
    @raises RuntimeError: if WORDNET_PATH_SIMILARITY_TYPE is neither
        'first_synset' nor 'most_similar' (checked only once a pair with
        synsets for both words is reached).
    """
    structed_oov_pairs = 0
    wn_scores = []

    for pair in pairs:
        w1 = wordnet.synsets(pair[0])
        w2 = wordnet.synsets(pair[1])

        # At least one word is unknown to WordNet: record a gap and move on.
        if not (w1 and w2):
            wn_scores.append(None)
            structed_oov_pairs += 1
            continue

        if WORDNET_PATH_SIMILARITY_TYPE == 'first_synset':
            # Just use the first synset of each term.
            path = wordnet.path_similarity(w1[0], w2[0])
        elif WORDNET_PATH_SIMILARITY_TYPE == 'most_similar':
            # Highest similarity between all synset combinations. Falsy
            # scores (e.g. None) are skipped; if nothing remains, the
            # result is None.
            scores = (
                wordnet.path_similarity(syn1, syn2)
                for syn1 in w1
                for syn2 in w2
            )
            path = max((score for score in scores if score), default=None)
        else:
            raise RuntimeError(
                'WORDNET_PATH_SIMILARITY_TYPE is not set in config!')

        wn_scores.append(path)

    return wn_scores, structed_oov_pairs
def compute_wordnet_path_scores(pairs):
    """
    Compute WordNet path similarity for a list of input word pairs.

    Note: Thai WordNet has 3 methods to compute a similarity value:
    wordnet.path_similarity, wordnet.lch_similarity, wordnet.wup_similarity.
    lch_similarity can't be used because it requires the same
    part-of-speech for both words, and path_similarity seems to have
    better results than wup_similarity.

    The module-level WORDNET_PATH_SIMILARITY_TYPE selects the strategy:
      - 'first_synset': compare only the first synset of each word.
      - 'most_similar': take the highest similarity over all synset
        combinations of the two words.

    If we don't find a path between the two words, we add None to the
    result list.

    @param pairs: iterable of word pairs; pair[0] and pair[1] are each
        looked up with wordnet.synsets.
    @returns: the list of similarity scores (one entry per input pair),
        and the number of OOV word pairs, i.e. pairs where at least one
        word has no synset.
    @raises RuntimeError: if WORDNET_PATH_SIMILARITY_TYPE is neither
        'first_synset' nor 'most_similar' (checked only once a pair with
        synsets for both words is reached). Previously this case failed
        with an unbound local variable instead.
    """
    import logging

    from pythainlp.corpus import wordnet

    # Start-up trace goes to the logger at debug level rather than stdout.
    logging.getLogger(__name__).debug(
        "starting compute_wordnet_path_scores")

    structed_oov_pairs = 0  # word pairs where a word has no synset
    wn_scores = []

    for pair in pairs:
        w1 = wordnet.synsets(pair[0])
        w2 = wordnet.synsets(pair[1])

        # At least one word is unknown to WordNet: record a gap and move on.
        if not (w1 and w2):
            wn_scores.append(None)
            structed_oov_pairs += 1
            continue

        if WORDNET_PATH_SIMILARITY_TYPE == 'first_synset':
            # Just use the first synset of each term.
            path = wordnet.path_similarity(w1[0], w2[0])
        elif WORDNET_PATH_SIMILARITY_TYPE == 'most_similar':
            # Highest similarity between all synset combinations. Falsy
            # scores (e.g. None) are skipped; if nothing remains, the
            # result is None.
            scores = (
                wordnet.path_similarity(syn1, syn2)
                for syn1 in w1
                for syn2 in w2
            )
            path = max((score for score in scores if score), default=None)
        else:
            # Fail with a clear message instead of leaving `path` unbound.
            raise RuntimeError(
                'WORDNET_PATH_SIMILARITY_TYPE is not set in config!')

        wn_scores.append(path)

    return wn_scores, structed_oov_pairs