def get_synonyms_word(word):
    """Yield words similar to `word` with similarity in the open range (0.6, 1).

    The upper bound excludes the query word itself (score 1.0).
    """
    # synonyms.nearby returns ([words], [scores]); call it once, not three times.
    word_list, score_list = synonyms.nearby(word)
    # Threshold filter: keep only sufficiently similar neighbours.
    for candidate, score in zip(word_list, score_list):
        if 0.6 < score < 1:
            yield candidate
Exemple #2
0
def get_similar_words_2(word):
    """Return second-hop similar words of `word` (the word itself included)."""
    # Bug fix: set(word) built a set of the word's CHARACTERS; the intent is a
    # set containing the word itself.
    visited = {word}

    words = synonyms.nearby(word)[0]

    for w in words:
        # Expand each first-hop neighbour to its own neighbours (second hop).
        for ext_w in synonyms.nearby(w)[0]:
            visited.add(ext_w)

    return list(visited)
Exemple #3
0
def data_enforce(label_file, review_file):
    """Data augmentation: with probability 0.3, replace aspect/opinion terms
    with one of their top synonyms.

    Reads the label and review CSVs, runs 10 augmentation passes, and writes
    the augmented copies under zhejiang/enforce_data/.

    :param label_file: path to the labels CSV
    :param review_file: path to the reviews CSV
    """
    columns_1 = "id,AspectTerms,A_start,A_end,OpinionTerms,O_start,O_end,Categories,Polarities".split(
        ",")
    columns_2 = "id,Reviews".split(",")
    # Let pandas open/close the files itself; the previous open() calls leaked
    # the file handles.
    df_labels = pd.read_csv(label_file, header=0, encoding="utf-8")[columns_1]
    df_reviews = pd.read_csv(review_file, header=0, encoding="utf-8")[columns_2]
    df_reviews.index = df_reviews["id"].values
    print(df_labels[:3])
    print(df_reviews[:3])
    res_1 = []
    res_2 = []
    # NOTE(review): uniform() and ranint() are helpers defined elsewhere in
    # this module — presumably random draws in [0,1) and {0,1,2}; verify.
    for pass_no in range(10):
        print(pass_no)
        for row1 in df_labels.values:
            # Review row matching this label row's id.
            row2 = df_reviews.loc[row1[0]].values
            row_label = row1
            row_review = row2
            if row_label[1] != "_":
                # Randomly replace the AspectTerm with one of its top synonyms.
                aspect = row_label[1]
                aspect_syn = synonyms.nearby(aspect)[0]
                if uniform() < 0.3 and len(aspect_syn) > 3:
                    aspect_replace = aspect_syn[1:4][ranint()]
                    row_label[1] = aspect_replace
                    row_review[1] = row_review[1].replace(
                        aspect, aspect_replace)

            if row_label[4] != "_":
                # Randomly replace the OpinionTerm with one of its top synonyms.
                opinion = row_label[4]
                opinion_syn = synonyms.nearby(opinion)[0]
                if uniform() < 0.3 and len(opinion_syn) > 3:
                    opinion_replace = opinion_syn[1:4][ranint()]
                    row_label[4] = opinion_replace
                    row_review[1] = row_review[1].replace(
                        opinion, opinion_replace)

            res_1.append(row_label)
            res_2.append(row_review)
    pd.DataFrame(data=res_1, columns=columns_1).to_csv(
        "zhejiang/enforce_data/train_labels_enforce.csv",
        index=False,
        encoding="utf-8")
    pd.DataFrame(data=res_2, columns=columns_2).to_csv(
        "zhejiang/enforce_data/train_reviews_enforce.csv",
        index=False,
        encoding="utf-8")
Exemple #4
0
def get_synonyms(word):
    """Return the list of words near *word* per the synonyms toolkit."""
    nearby_words, _scores = synonyms.nearby(word)
    return nearby_words
def get_synonyms(word):
    '''
    Look up synonym candidates for a word.
    :param word: the word to expand
    :return: list of similar words (may be empty)
    '''
    return list(synonyms.nearby(word)[0])
def get_syn_word(input1):
    """Return the synonyms.nearby() result — a (words, scores) pair — for
    every item of `input1`, in order."""
    # Iterate the items directly instead of indexing via range(len(...)).
    return [synonyms.nearby(item) for item in input1]
Exemple #7
0
def get_fixed_keywords(query_word_list, min_match=1):
    """
    For every word in the query list, collect its synonyms scoring above
    `min_match`; each result list starts with the original word.

    :param query_word_list: list of query words
    :param min_match: similarity threshold, clamped to [0.5, 1]; 1 disables
        synonym lookup entirely (value range depends on the synonym corpus)
    :return: [[str, ...], ...] or None for an empty query list
    """
    if len(query_word_list) == 0:
        return None
    # Clamp: thresholds below 0.5 would pull in unrelated words.
    if min_match < 0.5:
        min_match = 0.5
    if min_match < 1:
        # Imported lazily so the synonyms package is only required when used.
        import synonyms as sy
    fixed = []
    for w in query_word_list:
        r = []
        if min_match < 1:
            sy_words, sy_scores = sy.nearby(w)
            # Walk words and scores in lockstep; keep strong matches only.
            r = [s_word for s_word, s_score in zip(sy_words, sy_scores)
                 if s_score > min_match]
        if w not in r:
            r.insert(0, w)
        fixed.append(r)
    return fixed
def replace_word(similar_question):
    """Replace one random word of `similar_question` with its closest synonym.

    If no replaceable word is found after len+1 attempts, a random word is
    dropped instead. Returns the rebuilt sentence.
    """
    cut_list = list(jieba.cut(similar_question))
    # Guard: randint(0, -1) would raise ValueError on an empty segmentation.
    if not cut_list:
        return similar_question
    attempts = 0
    while True:
        idx = random.randint(0, len(cut_list) - 1)
        # Call nearby() once per attempt (the old code queried it twice).
        candidates = synonyms.nearby(cut_list[idx])[0]
        if len(candidates) >= 2:
            # candidates[0] is typically the word itself; take the next one.
            change_word = candidates[1]
            break
        attempts += 1
        if attempts > len(cut_list):
            break
    if attempts > len(cut_list):
        cut_list.pop(idx)
    else:
        cut_list[idx] = change_word
    return ''.join(cut_list)
Exemple #9
0
 def assoSynAll(self, sentence):
     # sentence: a Chinese sentence/word. Tokens are first expanded to their
     # near-synonyms, then each synonym is looked up for associated classical
     # words. Returns the associated words of the top-5 synonyms per token,
     # ordered by relevance. (Potentially computationally heavy.)
     toks = jieba.lcut(sentence)
     assoRes = []
     synWords = []
     for word in toks:
         synlist, score = synonyms.nearby(word)
         # Keep only the five closest synonyms with their scores.
         synlist, score = synlist[:5], score[:5]
         synWords.extend(zip(synlist, score))
     for synword in synWords:
         try:
             synwordList = self.assoDict[synword[0]]
             # In the synonyms library a LARGER score means CLOSER, so
             # (1.1 - score) converts similarity into a distance-like weight.
             synwordList = [(x[0], x[1] * (1.1 - synword[1]))
                            for x in synwordList]
             assoRes.extend(synwordList)
         except:
             # Synonym absent from assoDict: best-effort skip.
             continue
     # Sort ascending by weight (smaller weight = more relevant).
     assoRes = sorted(assoRes, key=lambda x: x[1])
     assoRes = [x[0] for x in assoRes]
     # Deduplicate while preserving the relevance order established above.
     finalRes = list(set(assoRes))
     finalRes.sort(key=assoRes.index)
     return finalRes
Exemple #10
0
    def edaRepalcement(self, text, stop_words, replace_num):
        # Uses the `synonyms` Chinese near-synonym toolkit for replacement.
        '''
        Random synonym replacement (EDA-style augmentation).

        :param text: list of tokens making up the sentence
        :param stop_words: words that must never be replaced
        :param replace_num: maximum number of tokens to replace
        :return: list of tokens after replacement
        '''
        new_words = text.copy()
        # Candidate tokens: unique, non-stop-word; shuffled so which tokens
        # get replaced is random.
        random_word_list = list(
            set([word for word in text if word not in stop_words]))
        random.shuffle(random_word_list)
        num_replaced = 0
        for random_word in random_word_list:

            # synonyms.nearby returns [[words], [scores]]; keep the word list.
            synonym_list = synonyms.nearby(random_word)[
                0]

            if len(synonym_list) >= 1:
                synonym = random.choice(synonym_list)  # pick one synonym at random
                # Replace every occurrence of the chosen token.
                new_words = [
                    synonym if word == random_word else word
                    for word in new_words
                ]
                num_replaced += 1

            if num_replaced >= replace_num:
                break
        # Re-join and re-split to normalise any stray whitespace in tokens.
        sentence = ' '.join(new_words)
        sentence = sentence.strip()
        new_words = sentence.split(' ')

        return new_words  # the token list after replacement
Exemple #11
0
def add_keys_relations(keywords, start, end):
    """Build a symmetric synonym-relation map for keywords[start:end].

    For each key in the slice, every nearby word that is also a known key is
    recorded as related (in both directions). The result is dumped to
    keywords/keywords{start}_{end}.json.

    :param keywords: dict whose keys are the known keywords
    :param start: first index (inclusive) into the key list
    :param end: last index (exclusive) into the key list
    """
    tmp_keywords = {}
    keys = list(keywords.keys())
    # Set for O(1) membership tests (the old `n in keys` list scan was O(n)).
    key_set = set(keys)
    for i in range(start, end):
        tmp_keywords[keys[i]] = []
    for i in range(start, end):
        print(i)
        k = keys[i]
        neighbors = synonyms.nearby(k)[0]
        for n in neighbors:
            if n == k:
                continue
            if n in key_set:
                # setdefault replaces the old try/except KeyError dance:
                # `n` may lie outside [start, end) and thus not be pre-seeded.
                tmp_keywords.setdefault(n, []).append(k)
                tmp_keywords[k].append(n)

    # Deduplicate the relation lists before persisting.
    for k in list(tmp_keywords.keys()):
        tmp_keywords[k] = list(set(tmp_keywords[k]))
    with open('keywords/keywords{0}_{1}.json'.format(start, end),
              'w') as json_file:
        json.dump(tmp_keywords, json_file)
Exemple #12
0
def change(text, rate=1, level=2):
    """Rewrite `text` by substituting each word with its third-nearest synonym.

    :param text: input sentence
    :param rate: accepted for interface compatibility; not used
    :param level: number of substitution passes to run
    :return: the rewritten sentence
    """
    # Bug fix: the inner `for i in range(...)` used to clobber the outer
    # `while i < level` counter, so only a single pass ever ran.
    for _pass in range(level):
        seg_list = list(jieba.cut(text, cut_all=False))
        for idx in range(len(seg_list)):
            s = seg_list[idx]
            try:
                if synonyms.nearby(s)[0] != None:
                    s = synonyms.nearby(s)[0][2]
            except:
                # Fewer than 3 candidates (or lookup failure): keep the word.
                pass
            seg_list[idx] = s

        text = ''.join(seg_list)
    return text
Exemple #13
0
def get_synonyms(word):
    """
    Fetch word2vec-based synonym candidates.
    :param word: the word to look up
    :return: list of nearby words
    """
    candidates, _ = synonyms.nearby(word)
    return candidates
Exemple #14
0
def synonym_replacement(words):
    """Replace roughly 60% of the (non-stop) words of a sentence with synonyms.

    :param words: the raw sentence (str); segmented with jieba first
    :return: the augmented sentence as a single string (spaces stripped)
    """
    # ====1. segment, truncate long input, drop stop words, shuffle==== #
    words = list(jieba.cut(words))
    new_words = words.copy()
    if len(words) > 512:
        new_words = new_words[:256] + new_words[-256:]
    # Ignore stop words.
    random_word_list = list(
        set([word for word in new_words if word not in stop_words]))
    np.random.shuffle(random_word_list)
    n = len(random_word_list) * 0.6
    num_replaced = 0
    # ====2. walk the candidates and substitute up to n synonyms==== #
    for random_word in random_word_list:
        # Bug fix: the old code took nearby(...)[0][1] — a single WORD — and
        # np.random.choice then picked a CHARACTER out of it. Choose from the
        # candidate list instead, skipping index 0 (the word itself).
        candidates = synonyms.nearby(random_word)[0][1:]
        if len(candidates) >= 1:
            synonym = np.random.choice(candidates)
            new_words = [
                synonym if word == random_word else word for word in new_words
            ]
            num_replaced += 1
        if num_replaced >= n:
            break

    new_words = ''.join(new_words).replace(' ', '')
    return new_words
Exemple #15
0
def sameword():
    """For each line (word) of `wordpath`, write up to 3 nearby words to
    `newwordpath`; lines with no synonyms are written back unchanged."""
    # `with` guarantees both handles are closed even on error (the old code
    # relied on manual close calls).
    with open(wordpath, encoding="UTF-8-sig") as f, \
            open(newwordpath, 'w', encoding="UTF-8-sig") as w:
        for line in f:
            count = 0
            collected = []
            for a in synonyms.nearby(str(line))[0]:
                count += 1
                # Bug fix: `a != []` compared a string to a list (always
                # true); the intent is to skip empty candidates.
                if a:
                    collected.append(a)
                if count > 2:
                    break
            if count == 0:
                # No synonyms at all: keep the original word, newline stripped.
                collected.append(line.strip('\n'))
            w.write(" ".join(collected) + '\n')
Exemple #16
0
def get_synonyms(word):
    "Synonym selection, approach #1: the `synonyms` package."
    candidates = set(sy.nearby(word)[0])
    # The query word itself is not its own synonym; drop it if present.
    candidates.discard(word)
    return candidates
Exemple #17
0
 def get_synonyms(word, size=10):
     """
     Fetch up to `size` near-synonyms of a word.

     :param word: word to look up
     :param size: number of neighbours requested from synonyms.nearby
     :return: list of neighbour words
     """
     words, _scores = synonyms.nearby(word, size=size)
     return words
Exemple #18
0
def get_synonyms(word):
    """Merge synonym candidates from WordNet (Chinese) and the synonyms package."""
    synonyms_word = set()
    for syn in wordnet.synsets(word, lang='cmn'):
        # Bug fix: plain `=` overwrote the set on every synset, keeping only
        # the LAST synset's lemmas; accumulate the union instead.
        synonyms_word |= set(syn.lemma_names('cmn'))
    for w in synonyms.nearby(word)[0]:
        synonyms_word.add(w)
    return list(synonyms_word)
 def get_nearby_words(self, words_list):
     """Map each word to its 4th nearby word and join the results into one string.

     Words with no (or fewer than four) neighbours are kept as-is — the old
     code indexed [3] unconditionally and crashed on 1-3 neighbours.
     """
     parts = []
     for one in words_list:
         nearby_words = synonyms.nearby(one)[0]
         if len(nearby_words) > 3:
             parts.append(nearby_words[3])
         else:
             parts.append(one)
     # str.join avoids the quadratic `words = words + ...` concatenation.
     return "".join(parts)
Exemple #20
0
def get_syns(token):
    """Return a set containing `token` plus its synonyms scoring above 0.75."""
    # Leftover debug print removed from the loop body.
    syns = {token}
    syn_raw, scores = synonyms.nearby(token)
    for syn, score in zip(syn_raw, scores):
        if score > 0.75:
            syns.add(syn)
    return syns
Exemple #21
0
def search_fun(keyword):
    """
    Find resource files whose names contain the query (or a synonym of it).

    :param keyword: question typed into the web search
    :return: list of {"fileName": ...} dicts for matching resource files
    """
    resources_list = []

    def retrieval(file_dir):
        # os.walk yields (dirpath, dirnames, filenames); collect filenames.
        for filenames in os.walk(file_dir):
            resources_list.append(filenames[2])

    resources_dir = './static/images/'  # resource library path
    retrieval(resources_dir)

    seg_list = jieba.lcut_for_search(keyword)  # search-engine segmentation mode

    synonyms_list = []

    for word in seg_list:
        # Call nearby() once per word (the old code queried it twice).
        nearby_words = synonyms.nearby(word)[0]
        if nearby_words:
            synonyms_list.append(nearby_words)

    resources = []

    # NOTE(review): only resources_list[0] (the first directory walked) is
    # searched; subdirectory listings are collected but ignored — confirm
    # this is intended.
    for word_group in synonyms_list:
        for candidate in word_group:
            for file_name in resources_list[0]:
                if candidate in file_name:
                    if file_name not in resources:  # de-duplicate
                        resources.append(file_name)

    print(resources)

    # Wrap each file name in the {'fileName': ...} shape the caller expects.
    resources_item_list = [{'fileName': name} for name in resources]

    print(resources_item_list)

    return resources_item_list
def find_shanglian(input_info, tag_mode=1, final_output_number=5):
    """Retrieve candidate couplet first-lines matching the image tags in
    `input_info['description']['tags']`.

    Tags are translated, expanded with synonyms (similarity >= 0.7, top 3),
    then used to score lines of couplet_100k.txt by match frequency.

    :param input_info: dict with ['description']['tags'] holding tag strings
    :param tag_mode: 1 = flat tag list; otherwise grouped per source tag
    :param final_output_number: accepted for interface compatibility
    :return: list of matching couplet lines (newline stripped)
    """
    # Dead timing code removed: time.clock() was deleted in Python 3.8 and
    # crashed this function on modern interpreters.
    filename = './couplet_100k.txt'
    input_tag_list = input_info['description']['tags']
    print(input_tag_list)
    list_trans = []
    final_input_tag_list = []
    # Translate every tag, then de-duplicate the translations.
    for tag in input_tag_list:
        list_trans.append(get_reuslt(translate(tag)))
    print(list_trans)
    list_trans = list(set(list_trans))
    print(list_trans)
    for word in list_trans:
        # Keep synonyms with similarity >= 0.7 and take at most the top 3.
        synonyms_words = synonyms.nearby(word)
        d = dict(zip(synonyms_words[0], synonyms_words[1]))
        synonyms_words = [k for k, v in d.items() if v >= 0.7]
        if tag_mode == 1:
            final_input_tag_list += synonyms_words[:3]
        else:
            final_input_tag_list.append(synonyms_words[:3])
    retrieval_results = []
    print(final_input_tag_list)
    with open(filename, 'r', encoding='utf-8') as in_file:
        all_lines = in_file.readlines()
    for tag in final_input_tag_list:
        # tag_mode 1: tags are flat strings; otherwise grouped tag lists.
        if tag_mode == 1:
            retrieval_results += retrieve_tag(all_lines, -1, tag)
        else:
            retrieval_results += retrieve_tag(all_lines, -1, tag, tag_mode=2)
    # Rank candidate line indices by how many tags matched them.
    results = {}
    for i in retrieval_results:
        results[i] = results.get(i, 0) + 1
    results = sorted(results.items(), key=lambda item: item[1], reverse=True)
    # NOTE(review): final_result_length is a module-level constant defined
    # elsewhere — presumably the number of lines to return; verify.
    output_results_index = [
        index[0] for index in results[:final_result_length]
    ]
    print([index[1] for index in results[:final_result_length]])
    results = [all_lines[i][:-1] for i in output_results_index]
    return results
Exemple #23
0
def get_one_syn_words(word, syn_score=0.7):
    """Pick one random synonym of `word` scoring above `syn_score`.

    :param word: the word to expand
    :param syn_score: minimum similarity score a synonym must exceed
    :return: a synonym string, or False when none qualifies
    """
    nearby_words, nearby_scores = synonyms.nearby(word)
    # Walk words and scores in lockstep instead of enumerate+index.
    candidates = [
        w for w, score in zip(nearby_words, nearby_scores)
        if score > syn_score
    ]
    # The word itself may appear among its own neighbours; drop it.
    # (The old `len(...) >= 1 and ... != []` guard was redundant.)
    if word in candidates:
        candidates.remove(word)
    if not candidates:
        return False
    return random.choice(candidates)
def get_syn_word(word):
    """
        Get synonyms of a word.
    :param word: str, like "学生"
    :return: list of str, like ["学生仔", ...]; [word] when none found
    """
    # NOTE(review): the `or` means only words that are BOTH numeric AND
    # English skip the lookup — confirm `and` was not intended here.
    if not is_number(word.strip()) or not is_english(word.strip()):
        # Bug fix: nearby() returns a (words, scores) tuple which is always
        # truthy, so the old `word_syn if not word_syn else [word]` always
        # returned [word]. Use the word list and fall back to [word].
        word_syn = synonyms.nearby(word)[0]
        return word_syn if word_syn else [word]
    else:
        return [word]
def add_word(new_words):
    """Insert a random synonym of a randomly chosen word into `new_words` (in place).

    At most 10 attempts are made to find a word with synonyms; if none is
    found, the list is left untouched.
    """
    candidates = []
    attempts = 0
    while not candidates:
        picked = new_words[random.randint(0, len(new_words) - 1)]
        candidates = synonyms.nearby(picked)[0]
        attempts += 1
        if attempts >= 10:
            return
    chosen = random.choice(candidates)
    position = random.randint(0, len(new_words) - 1)
    new_words.insert(position, chosen)
Exemple #26
0
 def testNearbyWords(self):
     # Segment a sample phrase with thulac, then print the nearby words for
     # every noun/verb token.
     thu1 = thulac.thulac()  # default segmentation mode
     text = thu1.cut("人脸识别", text=True)  # segment a single phrase
     words, tags = [], []
     # Each token comes back as "word_tag"; split once from the right.
     data = [x.rsplit('_', 1) for x in text.split()]
     for _ in data:
         assert len(_) == 2, "seg len should be 2"
         words.append(_[0])
         tags.append(_[1])
     for (k, v) in enumerate(tags):
         if v.startswith("n") or v.startswith("v"):  # keep nouns/verbs; drop stop words, punctuation, adverbs, adjectives, pronouns, etc.
             print("%s: %s" % (words[k], synonyms.nearby(words[k])))
Exemple #27
0
 def add_word(self, new_words):
     """Insert a synonym of a randomly picked word into `new_words` (in place).

     Gives up (returns None) after 10 failed attempts at finding a word
     that has any synonyms.
     """
     synonym_words = []
     counter = 0
     while len(synonym_words) < 1:
         random_word = new_words[random.randint(0, len(new_words) - 1)]
         synonym_words, _ = synonyms.nearby(random_word)
         counter += 1
         if counter >= 10:
             return
     # NOTE(review): nearby()'s first entry is often the query word itself,
     # so this may re-insert the same word — confirm index 0 is intended
     # (sibling implementations in this file use random.choice instead).
     random_synonym = synonym_words[0]
     random_idx = random.randint(0, len(new_words) - 1)
     new_words.insert(random_idx, random_synonym)
Exemple #28
0
    def bm25_syn(self, query):
        """BM25 retrieval with synonym substitution of the query's key word.

        Finds the single most influential word of `query` (by its BM25
        contribution to the best-matching document), tries each of its
        synonyms in that position, keeps whichever scores best, and returns
        the final ranking.

        :param query: list of query tokens (mutated in place: the key word
            is replaced by its best-scoring synonym)
        :return: (sorted_scores, max_pos, answers)
        """
        bm25_model = self.bm25_model_uncat

        query_weights = bm25_model.get_scores(query)  # plain BM25 scoring
        max_pos = np.argsort(query_weights)[::-1][0]  # index (not value) of the top score

        # Find which query word contributes most to the top document.
        max_score = 0
        kw = ''  # the most influential word
        kw_idx = -1
        for idx, word in enumerate(query):
            word_weight = bm25_model.get_score([word], index=max_pos)
            if word_weight > max_score:
                max_score = word_weight
                kw = word
                kw_idx = idx

        # Build a synonym candidate list for that key word.
        nearby_list = synonyms.nearby(kw)
        syn_list = [kw]  # seed the list with the key word itself
        for word, score in zip(nearby_list[0], nearby_list[1]):
            # Condition: similarity must clear the configured threshold.
            if score > args.syn_threshold and word not in syn_list:
                syn_list.append(word)

        # Determine which synonym yields the highest score.
        max_score = -1
        best_kw = ''  # best-scoring word
        for syn in syn_list:
            query[kw_idx] = syn  # substitute the key word in the query
            weights = bm25_model.get_scores(query)  # plain BM25 scoring
            score = sorted(weights, reverse=True)[0]  # take the single best score
            if score > max_score:
                max_score = score
                best_kw = syn

        # Key word settled: compute the final sorted_scores, max_pos, answers.
        query[kw_idx] = best_kw
        bm25_weights = bm25_model.get_scores(query)

        sorted_scores = sorted(bm25_weights, reverse=True)  # descending scores
        sorted_scores = [s / (len(query) + 1) for s in sorted_scores]  # normalise by query length
        max_pos = np.argsort(bm25_weights)[::-1]  # indices sorted by descending score
        answers = self.__max_pos2answers(max_pos, self.uncut_answers)

        return sorted_scores, max_pos, answers
Exemple #29
0
def get_synonyms(word):
    """Collect synonym candidates for *word* from the `sy` toolkit.

    The word itself is excluded from the result list.
    """
    candidates = set(sy.nearby(word)[0])
    candidates.discard(word)
    return list(candidates)
Exemple #30
0
 def _add_words(self, new_words):
     """Insert one random synonym of a randomly picked word into `new_words`.

     Tries up to 10 random words; returns without inserting when none of
     them has any synonyms.
     """
     candidates = []
     attempts = 0
     while not candidates:
         picked = new_words[random.randint(0, len(new_words) - 1)]
         candidates = synonyms.nearby(picked)[0]
         attempts += 1
         # Give up after ten tries with no synonyms found.
         if attempts >= 10:
             return
     chosen = random.choice(candidates)
     insert_at = random.randint(0, len(new_words) - 1)
     new_words.insert(insert_at, chosen)