def get_synonyms_word(word):
    # Find words close to the given entity; nearby() returns ([words], [scores]),
    # so call it once and unpack instead of calling it three times.
    word_list, score_list = synonyms.nearby(word)
    # Apply a similarity threshold to keep only close (but not identical) entities.
    for x, y in zip(word_list, score_list):
        if 0.6 < y < 1:
            yield x
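# A minimal usage sketch for get_synonyms_word, assuming the synonyms package
# and its word vectors are installed; the example word "照相机" is illustrative,
# not taken from the original code.
import synonyms

for near_word in get_synonyms_word("照相机"):
    print(near_word)  # each neighbor with similarity strictly between 0.6 and 1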
def get_similar_words_2(word):
    # Expand one hop further: collect the neighbors of each neighbor.
    visited = {word}  # note: set(word) would split the word into characters
    words = synonyms.nearby(word)[0]
    for w in words:
        ext_words = synonyms.nearby(w)[0]
        for ext_w in ext_words:
            visited.add(ext_w)
    return list(visited)
def data_enforce(label_file, review_file):
    """Data augmentation: replace terms in a sample with probability 0.3."""
    columns_1 = "id,AspectTerms,A_start,A_end,OpinionTerms,O_start,O_end,Categories,Polarities".split(",")
    columns_2 = "id,Reviews".split(",")
    df_labels = pd.read_csv(open(label_file, encoding="utf-8"), header=0)[columns_1]
    df_reviews = pd.read_csv(open(review_file, encoding="utf-8"), header=0)[columns_2]
    df_reviews.index = df_reviews["id"].values
    print(df_labels[:3])
    print(df_reviews[:3])
    res_1 = []
    res_2 = []
    for _ in range(10):
        print(_)
        for row1 in df_labels.values:
            row2 = df_reviews.loc[row1[0]].values
            row_label = row1
            row_review = row2
            if row_label[1] != "_":  # randomly replace AspectTerms
                aspect = row_label[1]
                aspect_syn = synonyms.nearby(aspect)[0]
                if random.random() < 0.3 and len(aspect_syn) > 3:
                    # Pick one of the 2nd-4th nearest neighbors at random.
                    aspect_replace = random.choice(aspect_syn[1:4])
                    row_label[1] = aspect_replace
                    row_review[1] = row_review[1].replace(aspect, aspect_replace)
            if row_label[4] != "_":  # randomly replace OpinionTerms
                opinion = row_label[4]
                opinion_syn = synonyms.nearby(opinion)[0]
                if random.random() < 0.3 and len(opinion_syn) > 3:
                    opinion_replace = random.choice(opinion_syn[1:4])
                    row_label[4] = opinion_replace
                    row_review[1] = row_review[1].replace(opinion, opinion_replace)
            res_1.append(row_label)
            res_2.append(row_review)
    pd.DataFrame(data=res_1, columns=columns_1).to_csv(
        "zhejiang/enforce_data/train_labels_enforce.csv", index=False, encoding="utf-8")
    pd.DataFrame(data=res_2, columns=columns_2).to_csv(
        "zhejiang/enforce_data/train_reviews_enforce.csv", index=False, encoding="utf-8")
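# Hypothetical invocation sketch for data_enforce: the two input paths are
# placeholders and must point at CSVs with the column layout named above.
import random
import pandas as pd
import synonyms

data_enforce("zhejiang/train_labels.csv", "zhejiang/train_reviews.csv")
# Augmented copies are written under zhejiang/enforce_data/.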
def get_synonyms(word):
    """
    Get the synonyms of a word.
    :param word:
    :return:
    """
    return synonyms.nearby(word)[0]
def get_synonyms(word):
    '''
    Get synonyms.
    :param word:
    :return:
    '''
    return synonyms.nearby(word)[0]
def get_syn_word(input1):
    # Collect the (neighbors, scores) tuple for every word in the input list.
    str_onw_syn_list = []
    for w in input1:
        str_onw_syn_list.append(synonyms.nearby(w))
    return str_onw_syn_list
def get_fixed_keywords(query_word_list, min_match=1):
    """
    For every word in the query list, look up its synonyms and return the
    combined list of lists.
    :param query_word_list: list of query words
    :param min_match: similarity threshold in 0-1; 1 disables synonym lookup,
                      usable values depend on the synonym vocabulary
    :return: [[str, str], []] or None
    """
    if len(query_word_list) == 0:
        return None
    if min_match < 0.5:
        min_match = 0.5
    if min_match < 1:
        import synonyms as sy
    fixed = []
    for w in query_word_list:
        r = []
        if min_match < 1:
            sy_words, sy_scores = sy.nearby(w)
            for i in range(len(sy_words)):
                if sy_scores[i] > min_match:
                    r.append(sy_words[i])
        # Always keep the original word at the head of its group.
        if w not in r:
            r.insert(0, w)
        fixed.append(r)
    return fixed
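# Usage sketch for get_fixed_keywords: expand a query with a 0.7 similarity
# floor; the query words are illustrative.
expanded = get_fixed_keywords(["手机", "便宜"], min_match=0.7)
print(expanded)
# e.g. [["手机", <close synonyms>...], ["便宜", ...]]; with min_match=1 each
# inner list contains only the original word.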
def replace_word(similar_question):
    # Randomly pick one token and swap it for its nearest synonym; if no token
    # with a synonym is found within len(cut_list) tries, drop a token instead.
    cut_list = list(jieba.cut(similar_question))
    count1 = 0
    while True:
        idx = random.randint(0, len(cut_list) - 1)
        neighbors = synonyms.nearby(cut_list[idx])[0]
        if len(neighbors) >= 2:
            change_word = neighbors[1]
            break
        count1 += 1
        if count1 > len(cut_list):
            break
    if count1 > len(cut_list):
        cut_list.pop(idx)
    else:
        cut_list[idx] = change_word
    return ''.join(cut_list)
def assoSynAll(self, sentence):
    # sentence: a Chinese sentence/word. First find synonyms for each token,
    # then look up the associated classical words for each synonym.
    # Returns all classical words associated with the top five synonyms, as a
    # list ordered by relevance. The computation may be fairly heavy.
    toks = jieba.lcut(sentence)
    assoRes = []
    synWords = []
    for word in toks:
        synlist, score = synonyms.nearby(word)
        synlist, score = synlist[:5], score[:5]
        synWords.extend(zip(synlist, score))
    for synword in synWords:
        try:
            synwordList = self.assoDict[synword[0]]
            # In the synonyms library a larger score means closer, so invert
            # it into a distance-like weight.
            synwordList = [(x[0], x[1] * (1.1 - synword[1])) for x in synwordList]
            assoRes.extend(synwordList)
        except KeyError:
            continue
    assoRes = sorted(assoRes, key=lambda x: x[1])
    assoRes = [x[0] for x in assoRes]
    # De-duplicate while preserving the sorted order.
    finalRes = list(set(assoRes))
    finalRes.sort(key=assoRes.index)
    return finalRes
def edaRepalcement(self, text, stop_words, replace_num):
    # The synonyms package is a Chinese synonym toolkit usable for many NLU
    # tasks: text alignment, recommendation, similarity computation, semantic
    # shift, keyword and concept extraction, summarization, search, etc.
    '''Random replacement.'''
    new_words = text.copy()
    random_word_list = list(set([word for word in text if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        # nearby returns ([synonyms], [similarity scores]); keep the word list.
        synonym_list = synonyms.nearby(random_word)[0]
        if len(synonym_list) >= 1:
            synonym = random.choice(synonym_list)  # pick one synonym at random
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= replace_num:
            break
    sentence = ' '.join(new_words)
    sentence = sentence.strip()
    new_words = sentence.split(' ')
    return new_words  # the token list after replacement
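# Usage sketch for edaRepalcement, assuming an instance `augmenter` of the
# enclosing class; the tokens and stop words below are illustrative.
tokens = ["这", "件", "衣服", "很", "漂亮"]
stop_words = {"这", "件", "很"}
new_tokens = augmenter.edaRepalcement(tokens, stop_words, replace_num=1)
print(new_tokens)  # same tokens with at most one word swapped for a synonym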
def add_keys_relations(keywords, start, end):
    # Build bidirectional synonym relations among the keyword slice [start, end).
    tmp_keywords = {}
    keys = list(keywords.keys())
    for i in range(start, end):
        tmp_keywords[keys[i]] = []
    for i in range(start, end):
        print(i)
        k = keys[i]
        neighbors = synonyms.nearby(k)[0]
        for n in neighbors:
            if n == k:
                continue
            if n in keys:
                # Record the relation in both directions.
                tmp_keywords.setdefault(n, []).append(k)
                tmp_keywords[k].append(n)
    # De-duplicate the relation lists.
    for k in list(tmp_keywords.keys()):
        tmp_keywords[k] = list(set(tmp_keywords[k]))
    with open('keywords/keywords{0}_{1}.json'.format(start, end), 'w') as json_file:
        json.dump(tmp_keywords, json_file)
def change(text, rate=1, level=2):
    # Rewrite the text `level` times, replacing each token with its
    # third-nearest synonym when one exists. Use distinct loop counters so
    # the inner loop cannot clobber the pass counter.
    pass_num = 0
    while pass_num < level:
        pass_num += 1
        seg_list = list(jieba.cut(text, cut_all=False))
        for j in range(len(seg_list)):
            s = seg_list[j]
            try:
                neighbors = synonyms.nearby(s)[0]
                if len(neighbors) > 2:
                    s = neighbors[2]
            except Exception:
                pass
            seg_list[j] = s
        text = ''.join(seg_list)
    return text
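# Usage sketch for change: two passes of third-nearest-synonym rewriting over
# an illustrative sentence.
import jieba
import synonyms

print(change("今天天气真好", level=2))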
def get_synonyms(word):
    """
    Get synonyms, based on word2vec.
    :param word:
    :return:
    """
    return synonyms.nearby(word)[0]
def synonym_replacement(words):
    # ===== 1. Segment, drop stop words, and shuffle the candidates ===== #
    words = list(jieba.cut(words))
    new_words = words.copy()
    if len(words) > 512:
        new_words = new_words[:256] + new_words[-256:]
    # Ignore stop words.
    random_word_list = list(set([word for word in new_words if word not in stop_words]))
    np.random.shuffle(random_word_list)
    n = len(random_word_list) * 0.6
    num_replaced = 0
    # ===== 2. Walk the sentence and replace up to n words with synonyms ===== #
    for random_word in random_word_list:
        # Take the full neighbor list; choosing from nearby(...)[0][1] (a
        # single string) would sample characters instead of words.
        synonym_list = synonyms.nearby(random_word)[0]
        if len(synonym_list) >= 1:
            synonym = np.random.choice(synonym_list)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    new_words = ''.join(new_words).replace(' ', '')
    return new_words
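# Usage sketch for synonym_replacement, assuming a module-level `stop_words`
# collection is in scope; the sentence and stop-word set are illustrative.
import jieba
import numpy as np
import synonyms

stop_words = {"的", "了", "是"}  # placeholder stop-word set
print(synonym_replacement("这家餐厅的服务态度非常好"))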
def sameword():
    # For each word in the input file, write up to three of its synonyms;
    # if none are found, write the word itself.
    w = open(newwordpath, 'w', encoding="UTF-8-sig")
    f = open(wordpath, encoding="UTF-8-sig")
    for line in f:
        word = line.strip('\n')  # strip the newline before the lookup
        z = 0
        i = []
        for a in synonyms.nearby(word)[0]:
            if a:
                i.append(a)
                z += 1
            if z > 2:
                break
        if z == 0:
            i.append(word)
        w.write(" ".join(i) + '\n')
    f.close()
    w.close()
def get_synonyms(word):
    "Synonym selection, approach 1: the synonyms package."
    synonyms_candidate = set()
    for sy_word in sy.nearby(word)[0]:
        synonyms_candidate.add(sy_word)
    if word in synonyms_candidate:
        synonyms_candidate.remove(word)
    return synonyms_candidate
def get_synonyms(word, size=10):
    """
    Fetch synonyms.
    :param size: number of synonyms to fetch
    :param word:
    :return:
    """
    return synonyms.nearby(word, size=size)[0]
def get_synonyms(word):
    # Merge the synonym recall of WordNet (Chinese lemmas) and synonyms.
    synonyms_word = set()
    for syn in wordnet.synsets(word, lang='cmn'):
        # Union, not assignment, so earlier synsets are not discarded.
        synonyms_word |= set(syn.lemma_names('cmn'))
    for w in synonyms.nearby(word)[0]:
        synonyms_word.add(w)
    return list(synonyms_word)
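# Usage sketch for the merged recall, assuming NLTK's WordNet and the Open
# Multilingual WordNet data have been downloaded; "汽车" is illustrative.
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet
import synonyms

print(get_synonyms("汽车"))  # union of WordNet cmn lemmas and synonyms neighbors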
def get_nearby_words(self, words_list):
    # Replace each word with its fourth-nearest neighbor when one exists;
    # otherwise keep the word (guarding against short neighbor lists).
    words = ""
    for one in words_list:
        nearby_words_tmp = synonyms.nearby(one)
        if len(nearby_words_tmp[0]) < 4:
            words = words + one
        else:
            words = words + nearby_words_tmp[0][3]
    return words
def get_syns(token):
    # Keep the token itself plus any neighbor scoring above 0.75.
    syns = set()
    syns.add(token)
    syn_raw, scores = synonyms.nearby(token)
    for syn, score in zip(syn_raw, scores):
        if score > 0.75:
            syns.add(syn)
    return syns
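# Usage sketch for get_syns: the returned set always contains the token itself
# plus neighbors above the 0.75 cutoff; the word is illustrative.
print(get_syns("识别"))  # e.g. {"识别", ...close neighbors}, depending on the vectors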
def search_fun(keyword):
    """
    :param keyword: the question typed into the web search
    :return: names of matching resource files
    """
    resources_list = []

    def retrieval(file_dir):
        for filenames in os.walk(file_dir):
            resources_list.append(filenames[2])

    resources_dir = './static/images/'  # path to the resource library
    retrieval(resources_dir)
    seg_list = jieba.lcut_for_search(keyword)  # search-engine segmentation mode
    synonyms_list = []
    for word in seg_list:
        neighbors = synonyms.nearby(word)[0]
        if neighbors:
            synonyms_list.append(neighbors)
    resources = []
    for i in synonyms_list:
        for j in i:
            for x in resources_list[0]:
                if j in x and x not in resources:  # de-duplicate
                    resources.append(x)
    print(resources)
    resources_item = ['fileName']
    resources_item_list = []
    for n in range(0, len(resources)):
        resources_item_list.append(dict(zip(resources_item, resources[n:n + 1])))
    print(resources_item_list)
    return resources_item_list
def find_shanglian(input_info, tag_mode=1, final_output_number=5):
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    filename = './couplet_100k.txt'
    input_tag_list = input_info['description']['tags']
    print(input_tag_list)
    list_trans = []
    final_input_tag_list = []
    # Translate each tag into Chinese and de-duplicate.
    for i in range(len(input_tag_list)):
        list_trans.append(get_reuslt(translate(input_tag_list[i])))
    print(list_trans)
    list_trans = list(set(list_trans))
    print(list_trans)
    # Expand every translated tag with synonyms scoring at least 0.7.
    for i in range(len(list_trans)):
        synonyms_words = synonyms.nearby(list_trans[i])
        d = dict(zip(synonyms_words[0], synonyms_words[1]))
        synonyms_words = [k for k, v in d.items() if v >= 0.7]
        if tag_mode == 1:
            final_input_tag_list += synonyms_words[:3]
        else:
            final_input_tag_list.append(synonyms_words[:3])
    retrieval_results = []
    print(final_input_tag_list)
    with open(filename, 'r', encoding='utf-8') as in_file:
        all_lines = in_file.readlines()
    for i in range(len(final_input_tag_list)):
        tag = final_input_tag_list[i]
        if tag_mode == 1:
            tag_retrieval_result = retrieve_tag(all_lines, -1, tag)
        else:
            tag_retrieval_result = retrieve_tag(all_lines, -1, tag, tag_mode=2)
        retrieval_results += tag_retrieval_result
    # Rank candidate lines by how many tags retrieved them.
    results = {}
    for i in retrieval_results:
        results[i] = results.get(i, 0) + 1
    results = sorted(results.items(), key=lambda item: item[1], reverse=True)
    output_results_index = [index[0] for index in results[:final_output_number]]
    print([index[1] for index in results[:final_output_number]])
    results = [all_lines[i][:-1] for i in output_results_index]
    return results
def get_one_syn_words(word, syn_score=0.7):
    # Return one random synonym above the score threshold, or False if none.
    syn_words = synonyms.nearby(word)
    syn_words = [
        syn_words[0][i] for i, score in enumerate(syn_words[1])
        if score > syn_score
    ]
    if word in syn_words:
        syn_words.remove(word)
    if len(syn_words) == 0:
        return False
    return random.choice(syn_words)
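# Usage sketch for get_one_syn_words: the word is illustrative; the False
# return value signals that nothing cleared the threshold.
syn = get_one_syn_words("喜欢", syn_score=0.7)
if syn:
    print(syn)
else:
    print("no synonym above the threshold")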
def get_syn_word(word):
    """
    Fetch synonyms.
    :param word: str, like "学生"
    :return: list, like ["学生仔"]
    """
    # Only look up words that are neither pure numbers nor pure English;
    # fall back to the word itself when no synonyms are found.
    if not is_number(word.strip()) and not is_english(word.strip()):
        word_syn = synonyms.nearby(word)[0]
        return word_syn if word_syn else [word]
    else:
        return [word]
def add_word(new_words):
    # Insert the synonym of a random word at a random position; give up after
    # 10 failed attempts to find a word that has synonyms.
    syn_words = []
    counter = 0
    while len(syn_words) < 1:
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        syn_words = synonyms.nearby(random_word)[0]
        counter += 1
        if counter >= 10:
            return
    random_synonym = random.choice(syn_words)
    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, random_synonym)
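# Usage sketch for add_word: it mutates the token list in place, so the caller
# inspects the same list afterwards; the tokens are illustrative.
import random
import synonyms

tokens = ["我", "喜欢", "看", "电影"]
add_word(tokens)
print(tokens)  # one synonym inserted at a random position, if any was found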
def testNearbyWords(self):
    thu1 = thulac.thulac()  # default mode
    text = thu1.cut("人脸识别", text=True)  # segment a single sentence
    words, tags = [], []
    data = [x.rsplit('_', 1) for x in text.split()]
    for _ in data:
        assert len(_) == 2, "seg len should be 2"
        words.append(_[0])
        tags.append(_[1])
    for (k, v) in enumerate(tags):
        # Keep nouns and verbs; drop stop words, punctuation, adverbs,
        # adjectives, pronouns, etc.
        if v.startswith("n") or v.startswith("v"):
            print("%s: %s" % (words[k], synonyms.nearby(words[k])))
def add_word(self, new_words):
    # Random insertion: pick a word that has synonyms and insert its nearest
    # neighbor at a random position; give up after 10 failed attempts.
    synonym_words = []
    counter = 0
    while len(synonym_words) < 1:
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        synonym_words, _ = synonyms.nearby(random_word)
        counter += 1
        if counter >= 10:
            return
    # Note: nearby() often returns the word itself first in the list.
    random_synonym = synonym_words[0]
    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, random_synonym)
def bm25_syn(self, query):
    bm25_model = self.bm25_model_uncat
    query_weights = bm25_model.get_scores(query)  # plain BM25
    max_pos = np.argsort(query_weights)[::-1][0]  # index of the top score (not the value)
    # Find the single most important word in the query.
    max_score = 0
    kw = ''  # the key word
    kw_idx = -1
    for idx, word in enumerate(query):
        word_weight = bm25_model.get_score([word], index=max_pos)
        if word_weight > max_score:
            max_score = word_weight
            kw = word
            kw_idx = idx
    # Build a synonym list for that key word.
    nearby_list = synonyms.nearby(kw)
    syn_list = [kw]  # start with the key word itself
    for word, score in zip(nearby_list[0], nearby_list[1]):
        # Condition: similarity above the threshold.
        if score > args.syn_threshold and word not in syn_list:
            syn_list.append(word)
    # Find the synonym that scores best.
    max_score = -1
    best_kw = ''  # the best-scoring word
    for syn in syn_list:
        query[kw_idx] = syn  # substitute the key word in the query
        weights = bm25_model.get_scores(query)  # plain BM25
        score = sorted(weights, reverse=True)[0]  # take the top score
        if score > max_score:
            max_score = score
            best_kw = syn
    # With the best key word chosen, return sorted_scores, max_pos, answers.
    query[kw_idx] = best_kw
    bm25_weights = bm25_model.get_scores(query)
    sorted_scores = sorted(bm25_weights, reverse=True)  # descending scores
    sorted_scores = [s / (len(query) + 1) for s in sorted_scores]  # normalize by query length
    max_pos = np.argsort(bm25_weights)[::-1]  # indices sorted by descending score
    answers = self.__max_pos2answers(max_pos, self.uncut_answers)
    return sorted_scores, max_pos, answers
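# Pseudo-usage sketch for bm25_syn: it assumes an object exposing
# self.bm25_model_uncat (a BM25 model with get_scores/get_score),
# self.uncut_answers, and a global `args.syn_threshold`. The retriever name
# and the pre-segmented query below are hypothetical.
# scores, positions, answers = retriever.bm25_syn(["怎么", "办理", "签证"])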
def get_synonyms(word):
    synonyms_candidate = set()
    for sy_word in sy.nearby(word)[0]:
        synonyms_candidate.add(sy_word)
    if word in synonyms_candidate:
        synonyms_candidate.remove(word)
    return list(synonyms_candidate)
def _add_words(self, new_words):
    synonym = []
    count = 0
    while len(synonym) < 1:
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        synonym = synonyms.nearby(random_word)[0]
        count += 1
        # If no synonym is found after 10 tries, give up.
        if count >= 10:
            return
    random_sysnonym = random.choice(synonym)
    random_index = random.randint(0, len(new_words) - 1)
    new_words.insert(random_index, random_sysnonym)