def test_word_similarity():
    """Smoke-test WordNetSimilarity across methods, languages and language pairs.

    Each call is only checked for returning a value (not None); the inline
    numbers record the scores observed when the test was written.
    """
    from sematch.semantic.similarity import WordNetSimilarity

    wns = WordNetSimilarity()
    dog_synsets = wns.word2synset('dog')
    cat_synsets = wns.word2synset('cat')

    # Concept-level similarity, Path method (was ~0.2).
    assert wns.similarity(dog_synsets[0], cat_synsets[0], 'path') is not None

    # English word similarity, Li method (was ~0.449327301063).
    assert wns.word_similarity('dog', 'cat', 'li') is not None

    # Spanish word similarity, Lin method (was ~0.876800984373).
    assert wns.monol_word_similarity('perro', 'gato', 'spa', 'lin') is not None

    # Chinese word similarity, Wu & Palmer method (was ~0.857142857143).
    assert wns.monol_word_similarity('狗', '猫', 'cmn', 'wup') is not None

    # Spanish-English cross-lingual similarity, Resnik method (was ~7.91166650904).
    assert wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res') is not None

    # Spanish-Chinese cross-lingual similarity, Jiang & Conrath method (was ~0.31023804699).
    assert wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn') is not None

    # Chinese-English cross-lingual similarity, WPath method (was ~0.593666388463).
    assert wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath') is not None
def map_subjects(subjects: list, filter_dis: float = 0.2):
    """Pair up words from different subject lists whose similarity passes a threshold.

    Compares every word of every subject list against every word of each
    LATER subject list (unordered pairs, no self-comparison) using Chinese
    monolingual Wu & Palmer similarity, e.g. for input like
    [['中国人', '安乐死'], ['太阳', '很好']].

    :param subjects: list of word lists (one inner list per subject).
    :param filter_dis: minimum similarity for a pair to be kept.
    :return: list of ([row_a, col_a], [row_b, col_b]) index pairs locating
             each matching word pair inside ``subjects``.
    """
    wns = WordNetSimilarity()
    pair = []       # retained similarity scores (kept for parity with original; not returned)
    pair_idxs = []  # index pairs of the matching words
    for row_a, words_a in enumerate(subjects):
        # Only look at later rows so each unordered pair is visited once.
        for row_b in range(row_a + 1, len(subjects)):
            words_b = subjects[row_b]
            for col_a, word_a in enumerate(words_a):
                for col_b, word_b in enumerate(words_b):
                    pair_distance = wns.monol_word_similarity(
                        word_a, word_b, 'cmn', 'wup')
                    if pair_distance > filter_dis:
                        pair.append(pair_distance)
                        # BUGFIX: the original used list.index(word), which
                        # returns the FIRST occurrence and thus mis-locates
                        # duplicate words; enumerate gives the true position.
                        pair_idxs.append(([row_a, col_a], [row_b, col_b]))
    return pair_idxs
def test_wordnet_similarity():
    """Exercise WordNetSimilarity end to end (mono- and cross-lingual).

    Asserts only that each method yields a result; the commented figures are
    the historical scores.
    """
    from sematch.semantic.similarity import WordNetSimilarity

    engine = WordNetSimilarity()
    dog = engine.word2synset('dog')
    cat = engine.word2synset('cat')

    checks = [
        # Concept similarity via Path (was ~0.2).
        engine.similarity(dog[0], cat[0], 'path'),
        # English, Li method (was ~0.449327301063).
        engine.word_similarity('dog', 'cat', 'li'),
        # Spanish, Lin method (was ~0.876800984373).
        engine.monol_word_similarity('perro', 'gato', 'spa', 'lin'),
        # Chinese, Wu & Palmer (was ~0.857142857143).
        engine.monol_word_similarity('狗', '猫', 'cmn', 'wup'),
        # Spanish-English, Resnik (was ~7.91166650904).
        engine.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res'),
        # Spanish-Chinese, Jiang & Conrath (was ~0.31023804699).
        engine.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn'),
        # Chinese-English, WPath (was ~0.593666388463).
        engine.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath'),
    ]
    for score in checks:
        assert score is not None
def test_wordsim_evaluation():
    """Evaluate similarity metrics on word-similarity datasets and print results.

    BUGFIX: the original used Python 2 ``print`` statements, which are a
    SyntaxError under Python 3 (the rest of this file targets Python 3);
    converted to ``print()`` calls.
    """
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity

    evaluation = WordSimEvaluation()
    print(evaluation.dataset_names())
    wns = WordNetSimilarity()
    # Define similarity metrics as two-argument callables.
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    print(evaluation.evaluate_metric('wpath', wpath, 'noun_simlex'))
    # Perform Steiger's Z significance test between two metrics.
    print(evaluation.statistical_test('wpath', 'path', 'noun_simlex'))
    wpath_es = lambda x, y: wns.monol_word_similarity(x, y, 'spa', 'path')
    wpath_en_es = lambda x, y: wns.crossl_word_similarity(
        x, y, 'eng', 'spa', 'wpath')
    print(evaluation.evaluate_metric('wpath_es', wpath_es, 'rg65_spanish'))
    print(evaluation.evaluate_metric('wpath_en_es', wpath_en_es, 'rg65_EN-ES'))
# Quick smoke check of Chinese monolingual WordNet similarity (Wu & Palmer).
from sematch.semantic.similarity import WordNetSimilarity
# import jieba
# import synonyms
# import jieba.posseg as pseg

wns = WordNetSimilarity()

# First call: result intentionally discarded (warm-up / sanity check).
wns.monol_word_similarity('狗', '猫', 'cmn', 'wup')

# print(wns.word_similarity('dog', 'cat', 'li'))
# print(wns.monol_word_similarity('忧患', '安乐', 'cmn', 'wup'))

# The measure should be symmetric: both orders print the same score.
print(wns.monol_word_similarity('狗', '猫', 'cmn', 'wup'))
print(wns.monol_word_similarity('猫', '狗', 'cmn', 'wup'))

# Other pairs tried during development:
# print(wns.monol_word_similarity('电脑', '键盘', 'cmn', 'wup'))
# print(wns.monol_word_similarity('电脑', '电脑', 'cmn', 'wup'))
# print(wns.monol_word_similarity('国家', '国家', 'cmn', 'wup'))

# Experimental jieba-based tokenization helpers, kept for reference:
#
# def parse_token(data):
#     # jieba.enable_paddle()
#     seg_data = pseg.cut(data, use_paddle=True)  # default mode
#     # per_word = [str(word) for word in seg_data if not str(word) in jieba_sp_words]
#     # for word, flag in seg_data:
#     #     print(f'{word}, {flag}')
#     return seg_data
#
# def word_flag(sentence: list):
#     for word, flag in sentence:
#         return word, flag
# Demo of WordNetSimilarity across methods and languages.
# Each call's result is discarded; the trailing figures are the scores
# observed when this snippet was written.
from sematch.semantic.similarity import WordNetSimilarity

wns = WordNetSimilarity()

# English, Li method (was ~0.449327301063).
wns.word_similarity('dog', 'cat', 'li')
# Spanish, Lin method (was ~0.876800984373).
wns.monol_word_similarity('perro', 'gato', 'spa', 'lin')
# Chinese, Wu & Palmer method (was ~0.857142857143).
wns.monol_word_similarity('狗', '猫', 'cmn', 'wup')
# Spanish-English cross-lingual, Resnik method (was ~7.91166650904).
wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res')
# Spanish-Chinese cross-lingual, Jiang & Conrath method (was ~0.31023804699).
wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn')
# Chinese-English cross-lingual, WPath method (was ~0.593666388463).
wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath')