Example #1
    def pair(self):
        words, keep, drop, mapping = super().load()
        common_chars = common_char()
        ws_tool = WordSimilarity2010()

        visit = set()
        for word1 in list(words):
            for word2 in list(words):
                if word1 == word2:
                    continue
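                # Canonicalise the pair (shorter word first, ties broken
                # lexicographically) so both orderings map to the same key in `visit`.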
                if len(word1) > len(word2):
                    w1, w2 = word2, word1
                elif len(word1) == len(word2) and word1 > word2:
                    w1, w2 = word2, word1
                else:
                    w1, w2 = word1, word2
                if (w1, w2) in visit:
                    continue
                visit.add((w1, w2))
                # Link the pair only if the equal-length words share at least one
                # non-common character and are semantically similar enough.
                chars = set(w1)
                for char in chars:
                    if char in common_chars:
                        continue
                    if char in w2 and len(w1) == len(w2):
                        sim = ws_tool.similarity(w1, w2)
                        if sim < self.threshold:
                            continue
                        mapping.add((w1, w2))
                        keep.setdefault(w1, set()).add(w2)
                        keep.setdefault(w2, set()).add(w1)
                        # One qualifying shared character is enough; stop to avoid
                        # recomputing the similarity for the remaining characters.
                        break

        super().dump(words, keep, drop, mapping)
Example #2
    def pair(self):
        words, keep, drop, mapping = super().load()
        ws_tool = WordSimilarity2010()
        common_chars = common_char()

        for word1 in words:
            for word2 in words:
                if word1 == word2:
                    continue
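                # Each unordered pair is scored twice here (there is no `visit` set);
                # the set-based adds below keep the duplicates harmless.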
                sim = ws_tool.similarity(word1, word2)
                if sim >= self.threshold:
                    # Canonicalise the pair: shorter word first, ties broken
                    # lexicographically.
                    if len(word1) > len(word2):
                        w1, w2 = word2, word1
                    elif len(word1) == len(word2) and word1 > word2:
                        w1, w2 = word2, word1
                    else:
                        w1, w2 = word1, word2
                    # Skip the pair when both words contain high-frequency
                    # ("common") characters.
                    if set(w1) & common_chars and set(w2) & common_chars:
                        # print("they are not similar words", w1, w2)
                        continue
                    mapping.add((w1, w2))
                    keep.setdefault(w1, set()).add(w2)
                    keep.setdefault(w2, set()).add(w1)

        super().dump(words, keep, drop, mapping)
    def test_similarity_2010(self):
        ws_tool = WordSimilarity2010()

        b_a = "抄袭"
        b_b = "克隆"
        sim_b = ws_tool.similarity(b_a, b_b)
        print(b_a, b_b, 'similarity:', sim_b)
        # 抄袭 克隆 final similarity: 0.585642777645155

        w_a = '人民'
        sample_list = [
            "国民", "群众", "党群", "良民", "同志", "成年人", "市民", "亲属", "志愿者", "先锋"
        ]

        for s_a in sample_list:
            sim_a = ws_tool.similarity(w_a, s_a)
            print(w_a, s_a, 'similarity:', sim_a)
import json
import os
import os.path as osp
import sys

import numpy as np
import thulac
from tqdm import tqdm
from word_similarity import WordSimilarity2010

ws_tool = WordSimilarity2010()

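# THULAC tokenizer, sememe/vocabulary lookup tables, and reference arrays
# loaded from hard-coded paths.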
thu = thulac.thulac()
word2sem = json.load(open('/home/anonymous/Sememe/data/word2sememe.json'))
word2id = json.load(open('/home/anonymous/Sememe/data/word2id.json'))
word2pos = json.load(open('/home/anonymous/Sememe/data/word2pos.json'))
word2topk = {}
correct1 = np.load('/home/anonymous/lcqmc.t.npy')
correct2 = np.load('/home/anonymous/lcqmc.tsep.npy')

fname = sys.argv[1]

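# Similarity score between a source word and a candidate target word.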
def score(src_word, tgt_word):
    return ws_tool.similarity(src_word, tgt_word)

def get_top_k_words(src_word, label, top_k):
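    # Collect candidate words from word2sem, score them against src_word, and
    # cache the result in word2topk; words outside the vocabulary are skipped.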
    if src_word not in word2id: return []
    if src_word in word2topk: return word2topk[src_word]
    word_with_score_list = []
    for tgt_word in word2sem:
        # ------------------------------------------------------------------------------
        # 1) Filter by vocab
        if tgt_word not in word2id: continue