def pair(self):
    """Link words that look like character-level variants of each other.

    Two words are paired when they have equal length, share at least one
    character that is not in the common-character set, and their
    WordSimilarity2010 score reaches ``self.threshold``.  State is read
    with ``super().load()`` and written back with ``super().dump()``:

    * ``mapping`` gains the canonically ordered pair ``(w1, w2)``
      (shorter word first; lexicographic order breaks length ties)
    * ``keep[w]`` accumulates the set of words paired with ``w``
    """
    words, keep, drop, mapping = super().load()
    common_chars = common_char()
    ws_tool = WordSimilarity2010()
    visit = set()  # unordered pairs already examined
    for word1 in list(words):
        for word2 in list(words):
            if word1 == word2:
                continue
            # Canonical order: shorter first; ties broken lexicographically.
            if len(word1) > len(word2):
                w1, w2 = word2, word1
            elif len(word1) == len(word2) and word1 > word2:
                w1, w2 = word2, word1
            else:
                w1, w2 = word1, word2
            if (w1, w2) in visit:
                continue
            visit.add((w1, w2))
            # Only equal-length words can pair; hoisted out of the
            # per-character loop where it was re-checked every iteration.
            if len(w1) != len(w2):
                continue
            # Require at least one shared character outside the
            # high-frequency common set.
            if not any(c in w2 for c in set(w1) if c not in common_chars):
                continue
            # Score the (expensive) similarity once per candidate pair,
            # rather than once per shared character as before.  The
            # original's repeated set.add calls were idempotent, so this
            # is behavior-preserving.
            sim = ws_tool.similarity(w1, w2)
            if sim < self.threshold:
                continue
            mapping.add((w1, w2))
            keep.setdefault(w1, set()).add(w2)
            keep.setdefault(w2, set()).add(w1)
    super().dump(words, keep, drop, mapping)
def pair(self):
    """Link words whose WordSimilarity2010 score reaches the threshold.

    A candidate pair is rejected when BOTH words contain at least one
    character from the common-character set.  Accepted pairs are stored
    in canonical order (shorter word first; lexicographic order breaks
    length ties) and persisted via ``super().dump()``:

    * ``mapping`` gains the ordered pair ``(w1, w2)``
    * ``keep[w]`` accumulates the set of words paired with ``w``
    """
    words, keep, drop, mapping = super().load()
    ws_tool = WordSimilarity2010()
    common_chars = common_char()
    visit = set()  # unordered pairs already examined
    for word1 in words:
        for word2 in words:
            if word1 == word2:
                continue
            # Canonical order: shorter first; ties broken lexicographically.
            if len(word1) > len(word2):
                w1, w2 = word2, word1
            elif len(word1) == len(word2) and word1 > word2:
                w1, w2 = word2, word1
            else:
                w1, w2 = word1, word2
            # The ordered double loop visits every unordered pair twice;
            # the original paid the expensive similarity call both times.
            # Skipping the repeat halves the cost with identical results.
            if (w1, w2) in visit:
                continue
            visit.add((w1, w2))
            # Cheap character filter first, before the expensive
            # similarity call (outcome-independent reordering).
            if set(w1) & common_chars and set(w2) & common_chars:
                # Both words contain common characters — not variant words.
                continue
            sim = ws_tool.similarity(w1, w2)
            if sim < self.threshold:
                continue
            mapping.add((w1, w2))
            keep.setdefault(w1, set()).add(w2)
            keep.setdefault(w2, set()).add(w1)
    super().dump(words, keep, drop, mapping)
def test_similarity_2010(self):
    """Smoke-test WordSimilarity2010 by printing scores for sample pairs."""
    tool = WordSimilarity2010()
    # Expected: 抄袭 vs 克隆 scores ~0.585642777645155.
    first, second = "抄袭", "克隆"
    print(first, second, '相似度为', tool.similarity(first, second))
    base = '人民'
    candidates = [
        "国民", "群众", "党群", "良民", "同志",
        "成年人", "市民", "亲属", "志愿者", "先锋",
    ]
    for cand in candidates:
        print(base, cand, '相似度为', tool.similarity(base, cand))
import thulac import json import sys import os import os.path as osp from tqdm import tqdm from word_similarity import WordSimilarity2010 ws_tool = WordSimilarity2010() import numpy as np thu = thulac.thulac() word2sem = json.load(open('/home/anonymous/Sememe/data/word2sememe.json')) word2id = json.load(open('/home/anonymous/Sememe/data/word2id.json')) word2pos = json.load(open('/home/anonymous/Sememe/data/word2pos.json')) word2topk = {} correct1 = np.load('/home/anonymous/lcqmc.t.npy') correct2 = np.load('/home/anonymous/lcqmc.tsep.npy') fname = sys.argv[1] def score(src_word, tgt_word): return ws_tool.similarity(src_word, tgt_word) def get_top_k_words(src_word, label, top_k): if src_word not in word2id: return [] if src_word in word2topk: return word2topk[src_word] word_with_score_list = [] for tgt_word in word2sem: # ------------------------------------------------------------------------------ # 1) Filter by vocab if tgt_word not in word2id: continue