def jinyi():
    """Flask view: return a synonym-substituted version of the input text.

    Reads the text from the ``word`` query parameter (or the ``word``
    form field on POST), segments it with ``synonyms.seg``, and rebuilds
    the sentence token by token via the module-level helper
    ``_nearby_word()``, leaving tokens with ignored POS tags untouched.

    Returns:
        A Flask response whose body is the rewritten sentence, or a
        plain error string when no ``word`` was supplied.
    """
    word = request.args.get('word')
    if request.method == 'POST':
        word = request.form['word']
    if word is None:  # fixed: identity comparison with None, not `==`
        return '请提供要取近义的文字,参数名为 word '
    # (the original bound `str = word`, shadowing the builtin; use the
    # parameter directly instead)
    lstseg = synonyms.seg(word)
    nword = ''
    for i, token in enumerate(lstseg[0]):
        # POS tags whose tokens are never replaced (non-morpheme chars,
        # proper nouns, foreign words, ...).
        ignore_cixing = ['x', 'nz', 'nr', 'eng', 'nrfg']
        if lstseg[1][i] in ignore_cixing:
            nstr = token
        else:
            nstr = _nearby_word(token)
            if nstr == '':
                # No synonym found: keep the original token.
                nstr = token
            else:
                # With probability 1/10, keep the original token anyway.
                irnd = random.randint(1, 10)
                if irnd <= 1:
                    nstr = token
        nword += nstr
    return make_response(nword)
def fenci(self, s):
    """Segment string *s*; return the token list, or None on failure."""
    segmented = synonyms.seg(s)
    # synonyms.seg returns (tokens, tags); anything shorter is a failure.
    if len(segmented) <= 1:
        print("获取分词失败")
        return None
    print("获取分词成功")
    return segmented[0]
def nermatch():
    # NOTE(review): `keywords` is not defined in this block — presumably a
    # module-level DataFrame-like object with 'keywords' and 'entity'
    # columns; confirm against the caller.
    keywords['flag']=''
    for i in range(len(keywords)):
        keywords['flag'][i]=[]
        # Split the row's comma-separated keyword string and tag each part.
        for ii in range(len(keywords['keywords'][i].split(','))):
            words = synonyms.seg(keywords['keywords'][i].split(',')[ii])
            # NOTE(review): words[1] is the whole POS-tag list, so the
            # comparison below compares a list against 'nt'/'nr' — likely
            # words[1][0] was intended; verify.
            keywords['flag'][i].append(words[1])
            # Keep organisation (nt) / person (nr) tokens as entities.
            if keywords['flag'][i][ii]=='nt' or keywords['flag'][i][ii]=='nr':
                # NOTE(review): `word` is undefined here — probably should
                # come from `words`; verify before use.
                keywords['entity'][i].append(word.word)
def find_synonyms(word):
    """Find profession names related to the input *word*.

    Candidates come from the Synonyms nearby list (top 5), falling back
    to the noun tokens of a word segmentation when no nearby word
    exists, plus results from the kmcha crawler. Candidates are then
    matched against the professions file.

    Args:
        word (string): the query word.

    Returns:
        list: profession names whose name or kwords matched a candidate.
    """
    # Import the professions file. json.load sniffs the encoding of the
    # binary stream itself; the `encoding` kwarg was removed in
    # Python 3.9 and raised TypeError there.
    with open('./dataset/profession2.json', 'rb') as jsonfile:
        profession_json = json.load(jsonfile)
    profession_list = []
    prof_kwords_list = []
    for profession in profession_json['data']:
        profession_list.append(profession['name'])
        prof_kwords_list.append(profession['kwords'])
    # Find the nearby words in the Synonyms module.
    nearby_words = synonyms.nearby(word)
    if len(nearby_words[0]):
        s_words = nearby_words[0][0:5]
        print(s_words)
    # If no nearby word was found, use word segmentation and keep nouns.
    else:
        seg_words = []
        seg = synonyms.seg(word)
        for i, v in enumerate(seg[1]):
            if v == 'n':
                seg_words.append(seg[0][i])
        s_words = seg_words
    # Use kmcha to search synonyms.
    km_words = kmcha_crawler_model.kmcha_search((word, '01'))
    # Merge the kmcha results into the candidates. Fixed: the loop
    # variable used to shadow the `word` parameter.
    for km_word in km_words:
        if km_word not in s_words:
            s_words.append(km_word)
    # Add professions to the result when a candidate matches.
    result = []
    for i, profession in enumerate(profession_list):
        prof_kword = prof_kwords_list[i]
        for seg in s_words:
            # if seg == profession or seg in profession:  # fully match
            if seg == profession or seg == prof_kword:  # partly match only with kwords
                result.append(profession)
    return result
def generate_vocab(text_data):
    """
    Extract all the noun from dataset
    :param text: list<str>
    :return:
    """
    vocab = set()
    for sentence in text_data:
        tokens, tags = synonyms.seg(sentence)
        # Keep nouns of at least two characters.
        for token, tag in zip(tokens, tags):
            if tag == "n" and len(token) >= 2:
                vocab.add(token)
    # Replace any previously written vocab file.
    if os.path.exists(config.vocab_path):
        os.remove(config.vocab_path)
    logger.info("Writing the vocab...")
    with codecs.open(config.vocab_path, "wb") as f:
        pickle.dump(vocab, f)
    del vocab
    logger.warning("vocab.pkl: %s" % config.vocab_path)
    print("Done!")
def text(self, text):
    """Rewrite *text* by replacing each token with its closest synonym
    whenever the top synonym's score exceeds 0.8; otherwise the token
    is kept unchanged."""
    tokens, tags = synonyms.seg(text)
    rewritten = ''
    for position, _tag in enumerate(tags):
        candidates, scores = synonyms.nearby(tokens[position])
        print(candidates)
        print(scores)
        if len(candidates) > 1 and scores[1] > 0.8:
            print(candidates[1])
            print('建议选择')
            rewritten = rewritten + candidates[1]
        else:
            rewritten = rewritten + tokens[position]
    return rewritten
# Quick demo of the synonyms toolkit: segmentation, nearby words and
# sentence comparison (seg=False compares the raw strings).
import synonyms

seg_a = synonyms.seg("中国南海")
print(seg_a)
seg_b = synonyms.seg("中南海")
print(seg_b)
print("中国:{}".format(synonyms.nearby("人脸")))
print(synonyms.compare("西北", "塑料", seg=False))
def cPrintSeg(text):
    """Segment *text* with synonyms.seg and print the result framed by
    '@+@' / '@-@' marker lines.

    The parameter was renamed from `str`, which shadowed the builtin;
    positional callers are unaffected.
    """
    result = synonyms.seg(text)
    print('@+@')
    print('分词为:', result)
    print('@-@')
# import sklearn # from sklearn.preprocessing import PolynomialFeatures # from sklearn.linear_model import LinearRegression # import matplotlib.pyplot as plt # import matplotlib.ticker as ticker # import matplotlib as mpl # from scipy import interpolate # from statsmodels.tsa.ar_model import AR data_long = np.loadtxt("data_test1.csv",str,delimiter=",", skiprows=1) data_long ("国际劳工组织") cixing=[] for i in range(0,len(data_long)): cixing.append(synonyms.seg(data_long[i])) test=synonyms.nearby("人脸") test[0] print("识别: %s" % (synonyms.nearby("识别"))) print("NOT_EXIST: %s" % (synonyms.nearby("NOT_EXIST"))) synonyms.display("金融") synonyms.v() print(1) cixiangliang=[] for i in range(0,len(data_long)):
def fenci(self, s):
    """Segment *s*; return the token list, or None when segmentation
    does not yield a (tokens, tags) pair."""
    result = synonyms.seg(s)
    return result[0] if len(result) > 1 else None
def test_wordseg(self):
    """Smoke-test word segmentation on a sample phrase."""
    print("test_wordseg")
    segmented = synonyms.seg("中文近义词工具包")
    print(segmented)
def aug_df(reviews_df, labels_df, op, n=3):
    """Augment review sentences while keeping label spans aligned.

    For every review row: whitespace is stripped from the text (shifting
    the A_*/O_* character offsets in labels_df to match), the sentence is
    segmented, and tokens NOT covered by any labelled span are edited
    according to *op*:

    - 'delete' / 'replace' / 'insert': up to *n* tokens are removed,
      replaced by a score-weighted synonym, or followed by an inserted
      synonym (via synonyms.nearby), with label offsets shifted.
    - 'swap': up to *n* pairwise token swaps, with span bookkeeping for
      unequal-length tokens.

    Args:
        reviews_df: DataFrame with 'id' and 'Reviews' columns (mutated).
        labels_df: DataFrame with 'id', 'A_start', 'A_end', 'O_start',
            'O_end' columns holding string offsets ('' when absent)
            (mutated).
        op: one of 'delete', 'replace', 'insert', 'swap'.
        n: maximum number of edits per review (default 3).

    Returns:
        The (mutated) reviews_df and labels_df.
    """
    for idx in reviews_df.index:
        id = reviews_df.loc[idx, 'id']  # NOTE(review): shadows builtin `id`
        rv = reviews_df.loc[idx, 'Reviews']
        # Pass 1: walk right-to-left over the text; for every whitespace
        # char, shift any label span starting after it one char left.
        for i in reversed(range(len(rv))):
            if rv[i].strip() == '':
                for j in labels_df[labels_df['id'] == id].index:
                    lb = labels_df[labels_df['id'] == id].loc[j]
                    a_s = lb['A_start'].strip()
                    a_e = lb['A_end'].strip()
                    if a_s != '' and a_e != '':
                        a_s = int(a_s)
                        a_e = int(a_e)
                        if a_s > i:
                            a_s -= 1
                            a_e -= 1
                        labels_df.loc[j, 'A_start'] = str(a_s)
                        labels_df.loc[j, 'A_end'] = str(a_e)
                    o_s = lb['O_start'].strip()
                    o_e = lb['O_end'].strip()
                    if o_s != '' and o_e != '':
                        o_s = int(o_s)
                        o_e = int(o_e)
                        if o_s > i:
                            o_s -= 1
                            o_e -= 1
                        labels_df.loc[j, 'O_start'] = str(o_s)
                        labels_df.loc[j, 'O_end'] = str(o_e)
        # NOTE(review): only ' ' is removed here, although the shifting
        # loop above treats any whitespace as removable — confirm.
        rv = rv.replace(' ', '')
        # Collect the character spans that must stay intact (label terms).
        still_spans = []
        for i in labels_df[labels_df['id'] == id].index:
            lb = labels_df.loc[i]
            a_s = lb['A_start'].strip()
            a_e = lb['A_end'].strip()
            if a_s != '' and a_e != '':
                still_spans.append((int(a_s), int(a_e)))
            o_s = lb['O_start'].strip()
            o_e = lb['O_end'].strip()
            if o_s != '' and o_e != '':
                still_spans.append((int(o_s), int(o_e)))
        still_spans.sort(key=lambda x: x[0])
        # Segment the cleaned review and keep tokens (with char spans)
        # that neither intersect a protected span nor are punctuation.
        rv_tokens = synonyms.seg(rv)[0]
        editable_tokens = []
        editable_spans = []
        cur = 0
        for i in range(len(rv_tokens)):
            end = cur + len(rv_tokens[i])
            editable = True
            for span in still_spans:
                if is_intersec(cur, end, span[0], span[1]):
                    editable = False
                    break
            if editable and (rv_tokens[i] not in [',', ',', '!', '。', '*', '?', '?']):
                editable_spans.append((cur, end))
                editable_tokens.append(rv_tokens[i])
            cur = end
        if not editable_tokens:
            continue
        rv_list = list(rv)
        if op == 'delete' or op == 'replace' or op == 'insert':
            # Pick up to n editable tokens; process right-to-left so the
            # spans of not-yet-edited tokens stay valid.
            to_edit = sorted(np.random.choice(range(len(editable_tokens)),
                                              size=min(len(editable_tokens), n),
                                              replace=False),
                             reverse=True)
            for ii in to_edit:
                span = editable_spans[ii]
                token = editable_tokens[ii]
                if op == 'delete' or op == 'replace':
                    left, right = span
                    if op == 'delete':
                        target_token = ''
                    else:
                        # Replace with a synonym sampled by its score
                        # (index 0 is the token itself, hence [1:]).
                        candi, probs = synonyms.nearby(token)
                        if len(candi) <= 1:
                            target_token = ''
                        else:
                            probs = np.array(probs[1:]) / sum(probs[1:])
                            target_token = np.random.choice(candi[1:], p=probs)
                else:
                    # insert: place a sampled synonym right after the token.
                    left, right = span[-1], span[-1]
                    token = ''
                    candi, probs = synonyms.nearby(editable_tokens[ii])
                    if len(candi) <= 1:
                        target_token = ''
                    else:
                        probs = np.array(probs[1:]) / sum(probs[1:])
                        target_token = np.random.choice(candi[1:], p=probs)
                # Shift every label span starting at/after the edit point.
                shift = len(target_token)-len(token)
                for i in labels_df[labels_df['id'] == id].index:
                    lb = labels_df.loc[i]
                    a_s = lb['A_start'].strip()
                    a_e = lb['A_end'].strip()
                    if a_s != '' and a_e != '':
                        a_s = int(a_s)
                        a_e = int(a_e)
                        if a_s >= span[-1]:
                            a_s += shift
                            a_e += shift
                        labels_df.loc[i, 'A_start'] = str(a_s)
                        labels_df.loc[i, 'A_end'] = str(a_e)
                    o_s = lb['O_start'].strip()
                    o_e = lb['O_end'].strip()
                    if o_s != '' and o_e != '':
                        o_s = int(o_s)
                        o_e = int(o_e)
                        if o_s >= span[-1]:
                            o_s += shift
                            o_e += shift
                        labels_df.loc[i, 'O_start'] = str(o_s)
                        labels_df.loc[i, 'O_end'] = str(o_e)
                print(token)
                print(''.join(rv_list[:left]), ''.join(rv_list[right:]))
                rv_list = rv_list[:left] + list(target_token) + rv_list[right:]
        elif op == 'swap':
            cur_time = 0
            if len(editable_tokens) < 2:
                continue
            # NOTE(review): `time` shadows the stdlib module name.
            if len(editable_tokens) == 2:
                time = 1
            else:
                time = n
            while cur_time != time:
                # Pick two distinct editable tokens (idx0 < idx1) and swap.
                idx0, idx1 = sorted(np.random.choice(range(len(editable_tokens)),
                                                     size=2, replace=False))
                token0, token1 = editable_tokens[idx0], editable_tokens[idx1]
                span0, span1 = editable_spans[idx0], editable_spans[idx1]
                print(token0, token1)
                editable_tokens[idx0], editable_tokens[idx1] = token1, token0
                if len(token0) != len(token1):
                    # Unequal lengths: shift the spans lying strictly
                    # between the two swapped tokens, and the label spans
                    # likewise.
                    shift = len(token1) - len(token0)
                    editable_spans[idx0] = (span0[0], span0[0]+len(token1))
                    editable_spans[idx1] = (span1[0]+shift, span1[0] + shift + len(token0))
                    for idx_edt in range(len(editable_tokens)):
                        cur_span = editable_spans[idx_edt]
                        if cur_span[0] >= span0[1] and cur_span[1] <= span1[0]:
                            editable_spans[idx_edt] = (cur_span[0]+shift, cur_span[1]+shift)
                    for i in labels_df[labels_df['id'] == id].index:
                        lb = labels_df.loc[i]
                        a_s = lb['A_start'].strip()
                        a_e = lb['A_end'].strip()
                        if a_s != '' and a_e != '':
                            a_s = int(a_s)
                            a_e = int(a_e)
                            if a_s >= span0[1] and a_e <= span1[0]:
                                a_s += shift
                                a_e += shift
                            labels_df.loc[i, 'A_start'] = str(a_s)
                            labels_df.loc[i, 'A_end'] = str(a_e)
                        o_s = lb['O_start'].strip()
                        o_e = lb['O_end'].strip()
                        if o_s != '' and o_e != '':
                            o_s = int(o_s)
                            o_e = int(o_e)
                            if o_s >= span0[1] and o_e <= span1[0]:
                                o_s += shift
                                o_e += shift
                            labels_df.loc[i, 'O_start'] = str(o_s)
                            labels_df.loc[i, 'O_end'] = str(o_e)
                # Rebuild the char list with the two tokens exchanged.
                rv_list = rv_list[:span0[0]] + list(token1) + rv_list[span0[1]: span1[0]] + list(token0) + rv_list[span1[1]:]
                cur_time += 1
        rv_new = ''.join(rv_list)
        reviews_df.loc[idx, 'Reviews'] = rv_new
        print(rv)
        print(rv_new)
        print(labels_df[labels_df['id'] == id])
    return reviews_df, labels_df
def segment(text):
    """Tokenise *text* with the synonyms segmenter and return the
    (tokens, tags) result unchanged."""
    result = synonyms.seg(text)
    return result
# Scaffolding for comparing other Chinese segmenters (SnowNLP, pkuseg,
# thulac, pyhanlp) was tried here; only the synonyms run is active.
print("*****synonyms******")
import synonyms

content = '解放军信息工程大学网络空间安全学院'
segmentation = synonyms.seg(content)
print(segmentation)

# Compare two related security terms with segmentation enabled.
sen1 = '漏洞挖掘'
sen2 = '漏洞检测'
print(synonyms.compare(sen1, sen2, seg=True))
import numpy as np
import jieba
import synonyms
import random

# Tokenise a sample question with jieba and show the pieces.
similar_question = '执行案件在财务系统获取不案件信息'
cut_list = list(jieba.cut(similar_question))
print(cut_list)

# (an experiment that replaced a random token with a synonym was
# removed; only the inspection calls below remain active)
print(synonyms.seg('执行案件在财务系统获取不案件信息'))
print(synonyms.nearby('执行'))

# Drop the third candidate from a hand-made nearby list and show the rest.
list1 = ['执行', '继续执行', '督导', '指派', '可执行', '执行者', '监督', '制订', '分派', '拒绝执行']
list1.pop(2)
print(list1)
# Expand each standard-question group with synonym-substituted variants.
writer = csv.writer(expand)  # NOTE(review): `expand` is opened elsewhere
with open('final_shuffle.csv', 'r', encoding='gbk') as all:
    reader = csv.reader(all)
    part_rand = []
    stand = ''
    for line in reader:
        # Collect the variant questions sharing one standard question.
        # The last standard question would skip the expansion step below,
        # so the file needs one extra trailing row with a different
        # standard question.
        if line[1] == stand:
            part_rand.append(line[0])
        else:
            for rand in part_rand:
                # Write out the original sentence.
                writer.writerow([rand, stand])
                # Segment the sentence.
                cut_word = synonyms.seg(rand)
                syns = []
                # Collect the ten nearest synonyms of each token.
                for word in cut_word[0]:
                    syn = synonyms.nearby(word)
                    syns.append(syn[0])
                # Expansions per sentence (was once tied to data volume).
                # for i in range(250//len(part_rand)):
                for i in range(1):
                    new = ''
                    # Per token: 0.6 keep, 0.2 replace, 0.2 unk.
                    for index, word in enumerate(cut_word[0]):
                        syn = syns[index]
                        k = random.randint(0, 9)
                        if k in range(6):
                            new += word
                        # NOTE(review): the replace/unk branches are
                        # truncated in this chunk.
def data_enforce_(label_file, review_file):
    """Data augmentation: probabilistically mutate review/label samples.

    Runs 20 passes over the reviews; in each pass a review row and one of
    its label rows may receive aspect/opinion synonym substitutions, an
    aspect/opinion position swap, random word replacement/swap, or a
    random character deletion. Writes the augmented tables to
    zhejiang/enforce_data/.

    Args:
        label_file: CSV path with the label columns (id, AspectTerms, ...).
        review_file: CSV path with id and Reviews columns.
    """
    columns_1 = "id,AspectTerms,A_start,A_end,OpinionTerms,O_start,O_end,Categories,Polarities".split(
        ",")
    columns_2 = "id,Reviews".split(",")
    df_labels = pd.read_csv(open(label_file, encoding="utf-8"), header=0)[columns_1]
    df_reviews = pd.read_csv(open(review_file, encoding="utf-8"), header=0)[columns_2]
    # Collapse whitespace runs in the review text to a comma.
    df_reviews.Reviews = [re.sub("\s+", ",", v) for v in df_reviews.Reviews]
    print(df_labels[:3])
    print(df_reviews[:3])
    res_1 = []
    res_2 = []
    count = 0
    for _ in range(20):
        # for _ in range(3):  # test
        print(_)
        for row_re in df_reviews.values:
            count += 1
            # logger.info(count)
            is_fake = False
            change_type = ""
            # Randomly pick one label row of this review to mutate.
            rows_la = df_labels[df_labels.id == row_re[0]].values.copy()
            one_index = random.randint(0, len(rows_la) - 1)
            # print(rows_la)
            row_la = rows_la[one_index]
            # print(row_la)
            row_label = list(row_la)
            row_review = list(row_re)
            if row_label[1] != "_":
                # Randomly replace the AspectTerms.
                aspect = row_label[1]
                # Any synonym with confidence above 0.4 is a candidate.
                aspect_syn = [
                    word for word, _ in zip(*synonyms.nearby(aspect))
                    if _ > 0.4
                ]
                if uniform() < 0.5 and aspect_syn:
                    # Pick one replacement at random.
                    aspect_replace = random.choice(aspect_syn)
                    row_label[1] = aspect_replace
                    row_review[1] = row_review[1].replace(
                        aspect, aspect_replace)
                    is_fake = True
                    change_type += "+" + "替换aspect"
            if row_label[4] != "_":
                # Randomly replace the opinion term.
                opinion = row_label[4]
                # print(synonyms.nearby(opinion))
                opinion_syn = [
                    word for word, _ in zip(*synonyms.nearby(opinion))
                    if _ > 0.4
                ]
                if uniform() < 0.5 and opinion_syn:
                    opinion_replace = random.choice(opinion_syn)
                    row_label[4] = opinion_replace
                    row_review[1] = row_review[1].replace(
                        opinion, opinion_replace)
                    is_fake = True
                    change_type += "+" + "替换opinion"
            # After the two steps above the replacement chance is < 0.49.
            if uniform() < 0.1:
                # With low probability swap aspect and opinion positions.
                if row_label[1] != "_" and row_label[4] != "_":
                    # Swap the label offsets.
                    tmp = row_label[2]
                    row_label[2] = row_label[5]
                    row_label[5] = tmp
                    tmp = row_label[3]
                    row_label[3] = row_label[6]
                    row_label[6] = tmp
                    # Swap the two terms in the text.
                    # NOTE(review): chained replace() does not swap — after
                    # both calls every occurrence of either term becomes
                    # the aspect term again; verify intent.
                    row_review[1] = row_review[1] \
                        .replace(row_label[1], row_label[4]) \
                        .replace(row_label[4], row_label[1])
                    is_fake = True
                    change_type += "+" + "交换item"
            # After the above the mutation chance is below 0.54.
            if uniform() < 0.3:
                # Replace 1-3 random words (aspect/opinion masked as '@').
                seg_words = synonyms.seg(row_review[1].replace(
                    row_label[1], "@").replace(row_label[4], "@"))[0]
                num = random_pick([1, 2, 3], [0.7, 0.25, 0.05])
                # print(seg_words, num)
                god_words = np.random.choice(seg_words,
                                             min(num, len(seg_words)),
                                             replace=False)
                for god_word in god_words:
                    if god_word != "@" and god_word != "&":
                        tmp = synonyms.nearby(god_word)[0]
                        # NOTE(review): row_review has only 2 columns, so
                        # row_review[4] would raise IndexError; it is only
                        # reached when the first `not in` check passes —
                        # verify this condition.
                        if tmp and god_word not in row_review[
                                1] and god_word not in row_review[4]:
                            # Synonym replacement.
                            row_review[1] = row_review[1].replace(
                                god_word, random.choice(tmp))
                            is_fake = True
                            change_type += "+" + "替换其他"
            # After the above, fake-sample probability is below 0.74.
            if uniform() < 0.1:
                # Randomly swap two words.
                # NOTE(review): unlike the branch above, [0] is not taken
                # here, so seg_words is the 2-tuple (tokens, tags) and
                # `len(seg_words) > 5` is never true — dead branch; verify.
                seg_words = synonyms.seg(row_review[1].replace(
                    row_label[1], "@").replace(row_label[4], "@"))
                if len(seg_words) > 5:
                    tmp = np.random.choice(seg_words, 2, replace=False)
                    row_review[1] = row_review[1].replace(tmp[0], tmp[1]).replace(
                        tmp[1], tmp[0])
                    is_fake = True
                    change_type += "+" + "交换其他词汇"
            if uniform() < 0.3:
                # Randomly delete one character not used by either term.
                char_index = random.randint(0, len(row_review[1]) - 1)
                if row_review[1][char_index] not in {
                        v for v in (row_label[1] + row_label[4])
                }:
                    row_review[1] = row_review[1][:char_index] + row_review[1][
                        char_index + 1:]
                    is_fake = True
                    change_type += "+" + "删除"
            # After all steps, augmented samples stay below 0.9 of output.
            # Re-number ids sequentially across passes.
            rows_la[one_index] = np.array(row_label)
            for v in range(len(rows_la)):
                rows_la[v][0] = count
            row_review[0] = count
            row_review[1] = re.sub("\s+", ",", row_review[1])
            # logger.info(rows_la)
            res_1.extend(rows_la)
            res_2.append(row_review + [is_fake, change_type, row_re[-1]])
    pd.DataFrame(data=res_1, columns=columns_1).to_csv(
        "zhejiang/enforce_data/train_labels_enforce.csv",
        index=False,
        encoding="utf-8")
    pd.DataFrame(data=res_2,
                 columns=columns_2 +
                 ["is_fake", "change_type", "original_review"]).to_csv(
                     "zhejiang/enforce_data/train_reviews_enforce.csv",
                     index=False,
                     encoding="utf-8")
# coding:utf-8
"""Demo: segment a single word with the Chinese synonyms toolkit.

The toolkit supports many NLU tasks: text alignment, recommendation,
similarity computation, semantic shift, keyword/concept extraction,
automatic summarisation, search engines, etc.

Expected output: (['能量'], ['n'])
"""
import synonyms

segmentation = synonyms.seg("能量")
print(segmentation)
#!/usr/bin/python
# -*- coding: utf8 -*-
"""Demo of synonyms nearby / seg / compare.

Ported from Python 2: `print` statements became function calls and
`x.__len__()` became `len(x)`; the printed fields are unchanged.
"""
import synonyms

print("人脸")
result = synonyms.nearby("人脸")
words = result[0]
scores = result[1]
print(len(words))
print(len(scores))
# Print each nearby word with its similarity score.
for i in range(len(words)):
    print(words[i], "=", scores[i])

result1 = synonyms.seg("我是中国人")
words1 = result1[0]
tags = result1[1]
# Print each token with its POS tag.
for i in range(len(words1)):
    print(words1[i], "->", tags[i])

r = synonyms.compare('商贸城', '贸易', seg=True)
print(r)
# @Software: PyCharm
# @Time : 2020-12-14 10:52
# @Author : Super-Zhang
# @Description :
import synonyms

# Segmentation returns a tuple of two lists — tokens and POS tags,
# e.g. (['中文', '近义词', '工具包'], ['nz', 'n', 'n']).
print(synonyms.seg("中文近义词工具包"))

print("交换: ", synonyms.nearby("交换"))
print("交换: ", synonyms.nearby("交换"))
print("两数: ", synonyms.nearby("两个数"))

print("=======语义相似度计算==========")
# Nouns and verbs compare reasonably well.
for left, right in (
    ("两数交换", "交换两个变量"),
    ("我爱看中国有嘻哈", "中国有嘻哈是我爱看的节目"),
    ("打开主函数", "开启main函数"),
):
    print(left, right, synonyms.compare(left, right, seg=True))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 27 14:17:27 2018

@author: madawei1
"""
import synonyms

# Chinese word segmentation (result discarded here).
synonyms.seg("中文近义词工具包")

# Nearby-word lookups, including a deliberate miss.
print("破洞: ", (synonyms.nearby("破洞")))
print("女人: ", (synonyms.nearby("女人")))
print("NOT_EXIST: ", (synonyms.nearby("NOT_EXIST")))

# Similarity of two related nouns (value computed but not printed).
sen1 = "快递"
sen2 = "物流"
r = synonyms.compare(sen1, sen2, seg=True)