def test_swap_sent(self):
    """Check that ``synonyms.compare`` is symmetric in its two arguments."""
    print("test_swap_sent")
    left, right = "教学", "老师"
    forward = synonyms.compare(left, right)
    backward = synonyms.compare(right, left)
    print('"教学", "老师": %s ' % forward)
    print('"老师", "教学": %s ' % backward)
    assert forward == backward, "Scores should be the same after swap sents"
def list_projects_chat(cls, search_query, page_no=None, page_size=None,
                       default_max_score=0.4, privacy="public"):
    """Rank projects by similarity to ``search_query`` and return one page.

    Every project read from ``cls.repo`` with the given ``privacy`` gets a
    ``score`` attribute: the mean of the ``synonyms.compare`` scores of the
    query against the project's name and description. Projects scoring
    below ``default_max_score`` are dropped; the rest are sorted by
    descending score.

    :param search_query: text to match against project name/description
    :param page_no: 1-based page number; ``None`` disables pagination
    :param page_size: items per page; ``None`` disables pagination
    :param default_max_score: minimum score a project must reach
    :param privacy: value used in the repository query
    :return: ``Objects`` holding the requested slice and the total count
    """
    # The signature defaults are None; without this guard the arithmetic
    # below raised TypeError whenever a caller relied on the defaults.
    if page_no is None or page_size is None:
        start = end = None
    else:
        start = (page_no - 1) * page_size
        end = page_no * page_size

    all_apps = cls.repo.read(query={"privacy": privacy})
    # Score each project against the query.
    for app in all_apps:
        name_score = synonyms.compare(search_query, app.name, seg=True)
        description_score = synonyms.compare(
            search_query, app.description, seg=True)
        app.score = (name_score + description_score) / 2

    # Keep only the projects that reach the threshold, best first.
    apps = [app for app in all_apps if app.score >= default_max_score]
    count = len(apps)
    apps.sort(key=lambda item: -item.score)
    return Objects(objects=apps[start:end], count=count,
                   page_no=page_no, page_size=page_size)
def test_similarity(self):
    """Print sentence-similarity scores for a fixed set of sample pairs.

    Nothing is asserted; each ``synonyms.compare`` score is only printed
    for manual inspection.
    """
    # Pairs printed with a fixed label; all use word segmentation.
    labelled_cases = (
        ("旗帜引领方向", "道路决定命运", "旗帜引领方向 vs 道路决定命运:"),
        ("旗帜引领方向", "旗帜指引道路", "旗帜引领方向 vs 旗帜指引道路:"),
        ("发生历史性变革", "发生历史性变革", "发生历史性变革 vs 发生历史性变革:"),
    )
    for sen1, sen2, label in labelled_cases:
        print(label, synonyms.compare(sen1, sen2, seg=True))

    # Pairs printed with a formatted label; the last one skips segmentation.
    formatted_cases = (
        ("骨折", "巴赫", True),
        ("你们好呀", "大家好", False),
    )
    for sen1, sen2, use_seg in formatted_cases:
        r = synonyms.compare(sen1, sen2, seg=use_seg)
        print("%s vs %s" % (sen1, sen2), r)
def if_has_relation(words):
    """Report whether any word in ``words`` matches a known relation.

    Each word is first compared with the keys of the module-level
    ``relations`` mapping, then with every entry of its values. A match
    needs a ``synonyms.compare`` score above 0.98 and a candidate that is
    not listed in ``wrong_relation``. Only the first match is reported.

    :param words: iterable of words from one sentence
    :return: ``(0, relation_word)`` on a match, ``(-1, "")`` otherwise.
        A key match reports the key; a value match reports the input word.
    """
    def _matches(word, candidate):
        return (synonyms.compare(word, candidate) > 0.98
                and candidate not in wrong_relation)

    for word in words:
        for key in relations:
            if _matches(word, key):
                return 0, key
        for group in relations.values():
            for relation in group:
                if _matches(word, relation):
                    return 0, word
    return -1, ""
def simCal(self, word, entities, flag):
    """Score ``word`` against every candidate in ``entities``.

    Up to two measures are averaged per entity:

    * the ``synonyms.compare`` score, when the call succeeds and the
      result is not NaN;
    * ``1 - editDistanceDP(word, entity) / (len(word) + len(entity))``,
      kept only when it exceeds 0.5.

    :param word: str, the word to look up
    :param entities: iterable of candidate strings
    :param flag: opaque value copied into every result tuple
    :return: list of ``(entity, score, flag)`` with ``score >= 0.7``,
        sorted by descending score
    """
    import synonyms as sy

    word_len = len(word)
    scores = []
    for entity in entities:
        measures = []
        try:
            # Computed once; the original evaluated it twice.
            semantic = sy.compare(word, entity)
            if not np.isnan(semantic):
                measures.append(semantic)
        except Exception:
            # Best effort: a failing comparison just leaves the
            # edit-distance measure as the only signal.
            pass

        edit_sim = 1 - self.editDistanceDP(word, entity) / (word_len + len(entity))
        if edit_sim > 0.5:
            measures.append(edit_sim)

        # Without any usable measure the average is undefined (this used
        # to raise ZeroDivisionError); such an entity cannot qualify.
        if not measures:
            continue

        score = sum(measures) / len(measures)
        if score >= 0.7:
            scores.append((entity, score, flag))

    scores.sort(key=lambda item: item[1], reverse=True)
    return scores
def matchBestTriple(self,des):
    '''Return the triple list of the stored document that best matches ``des``.

    Triples are extracted from ``des`` with ``self.extor.triples_main``.
    For each extracted triple and each document from
    ``self.collect.find()``, the best relation score (index 1) and the best
    node scores (indices 2 and 0) over the document's triples are combined
    as ``0.6 * relation + 0.2 * node1 + 0.2 * node2``. A document's score
    is the sum over all extracted triples.

    :param des: text to analyse
    :return: the ``"triples"`` value of the highest-scoring document, or an
        empty list when no document scores above 0
    '''
    des_triples = self.extor.triples_main(des)

    best_score = 0
    best_triples = []
    for doc in self.collect.find():
        triple_list = doc["triples"]
        doc_score = 0
        for query_triple in des_triples:
            rela_score = n1_score = n2_score = 0
            for triple in triple_list:
                # Keep the best match per component. The original
                # comparisons could only lower a score that starts at 0,
                # so every score stayed 0.
                rela_score = max(
                    rela_score,
                    sy.compare(triple[1], query_triple[1], seg=True))
                n1_score = max(
                    n1_score,
                    sy.compare(triple[2], query_triple[2], seg=True))
                n2_score = max(
                    n2_score,
                    sy.compare(triple[0], query_triple[0], seg=True))
            # Accumulate; the original overwrote the sum on each triple.
            doc_score += 0.6 * rela_score + 0.2 * n1_score + 0.2 * n2_score

        if doc_score > best_score:
            best_score = doc_score
            # Remember the triples directly rather than re-querying by
            # title, which failed when no document had been selected.
            best_triples = triple_list
    return best_triples
def testSenSimilarity(self):
    """Assert the expected similarity for two fixed sentence pairs."""
    unrelated = synonyms.compare("旗帜引领方向", "道路决定命运")
    assert unrelated == 0.0, "the similarity should be zero"

    related = synonyms.compare("发生历史性变革", "取得历史性成就")
    assert related > 0, "the similarity should be bigger then zero"
def getTop10(Allwords,Allbag_file):
    """Return the ten categories most similar to the user's keywords.

    The category file is read whole and split on commas. Each category's
    score is the sum, over all entries of ``Allwords``, of
    ``synonyms.compare(entry, category)`` divided by the category count.

    :param Allwords: the user's keywords
    :param Allbag_file: path of the comma-separated category file
    :return: list of at most ten category names, highest score first
    """
    with open(Allbag_file,'r') as f:
        all_bag = f.read().split(',')

    bag_len = len(all_bag)
    bag_num = {}
    for category in all_bag:
        bag_num[category] = sum(
            synonyms.compare(word, category) / bag_len for word in Allwords)

    ranked = sorted(bag_num.items(), key=lambda e: e[1], reverse=True)[:10]
    return [name for name, _ in ranked]
def NLP(self,context,score,cla_id):
    """Fold one review's keyword sentiment into a course's feature vector.

    Up to five weighted keywords are extracted from ``context`` with
    ``jae.extract_tags`` and a sentiment value is computed with
    ``SnowNLP``. Each keyword is mapped to the entry of ``self.Taglist``
    with the highest ``synonyms.compare`` score, and that tag is assigned
    ``score * sentiment * similarity * weight``. The feature vector of
    ``cla_id`` is read through ``self.DataGetter``, incremented by 0.2
    times each tag value, and written back.

    :param context: review text
    :param score: rating attached to the review
    :param cla_id: identifier passed to ``self.DataGetter``
    """
    tag_words = jae.extract_tags(context,topK = 5, withWeight = True, allowPOS = ())
    Predict_Score = SnowNLP(context).sentiments

    # Tag -> sentiment-weighted value for the extracted keywords.
    Final_Tag_Score = {}
    for word, weight in tag_words:
        Rank_Dict = {
            origin_tag: synonyms.compare(word, origin_tag, seg = True)
            for origin_tag in self.Taglist
        }
        # The original passed the unbound ``Rank_Dict.items`` to sorted(),
        # which raises TypeError; only the best entry is needed anyway.
        Most_Similiar_Tag, Similiar_Score = max(
            Rank_Dict.items(), key = lambda x : x[1])
        Final_Tag_Score[Most_Similiar_Tag] = (
            score * Predict_Score * Similiar_Score * weight)

    # Add the review-derived values to the course's features.
    origin_feat_vector = self.DataGetter.Get_Cla_Feats({'cla_id':cla_id})
    for tag, tag_score in Final_Tag_Score.items():
        index = self.Taglist.index(tag)
        origin_feat_vector[index] += 0.2 * tag_score
    new_info = {'cla_id':cla_id,'feats':origin_feat_vector}
    self.DataGetter.Update_ClaFeats(new_info)
def test_eval_synonyms(self):
    """Write a markdown report of predicted vs. annotated similarity.

    Reads tab-separated rows with five fields from the evaluation data
    file, scales each ``synonyms.compare`` result by 5.0 and appends one
    table row per input row to the report, which is recreated on each run.
    """
    logging.info("test_eval_synonyms")
    data_dir = os.path.join(curdir, os.path.pardir, "data")
    from_ = os.path.join(data_dir, "simtrain_to05sts.txt")
    to_ = os.path.join(data_dir, "synonyms_eval.txt")
    if os.path.exists(to_):
        os.remove(to_)

    header_lines = (
        "# [synonyms](https://github.com/huyingxi/Synonyms) v(%s) 相似度评测 \n"
        % synonyms.__version__,
        "评测数据源: https://github.com/IAdmireu/ChineseSTS \n \n",
        "置信区间 [0,5],分数越高越相似 \n",
        "| %s | %s | %s | %s | %s | %s | \n"
        % ("句子A ID", "句子A", "句子B ID", "句子 B", "标注分数", "预测分数"),
        "| --- | --- | --- | --- | --- | --- | \n",
    )
    for header in header_lines:
        append_line_to_file(to_, header)

    with open(from_, "r") as fin:
        for row in tqdm(fin.readlines()):
            fields = row.strip().split("\t")
            if len(fields) != 5:
                continue
            yid, yc, zid, zc, yzs = fields
            syn_score = synonyms.compare(yc, zc) * 5.0
            append_line_to_file(
                to_, "| %s | %s | %s | %s | %s | %.3f | \n" %
                (yid, yc, zid, zc, yzs, syn_score))
def json_rewrit_statement():
    """Return a rewritten version of the ``text`` query argument as JSON.

    The result is cached under a key derived from the text. On a cache
    miss the text is rewritten with ``libs.RewritStatement`` and the
    ``synonyms.compare`` score of original vs. rewritten text is stored
    alongside both strings.

    :return: ``jsonify`` response with ``text``, ``new_text``, ``relevant``
    """
    text = request.args.get('text')
    print('text:',text)
    key ='json_rewrit_statement'+str(text)

    # One lookup only: the original fetched the entry a second time, which
    # could return None if it expired between the two calls.
    items = cache.get(key)
    if items is None:
        print('创建新缓存')
        rs = libs.RewritStatement()
        t = rs.text(text)
        rnk = synonyms.compare(text, t, seg=True)
        items = {
            'text':text,
            'new_text':t,
            'relevant':rnk
        }
        cache.set(key ,items)
    else:
        print('获取缓存')
    print('json_rewrit_statement',items)
    return jsonify(items)
def matrix_dis(self, keys):
    """Greedily match ``self.keylist`` against ``keys`` and average the scores.

    Only the first 20 entries of ``keys`` are used. Each pair
    ``(self.keylist[i], keys[j])`` is scored with ``synonyms.compare``
    (no segmentation) multiplied by ``1 + w_i``, where ``w_i`` is the
    min-max normalised ``self.weight_list[i]``. The highest-scoring pair is
    taken repeatedly, and its row and column are removed, until no pair
    remains.

    :param keys: sequence of keywords to match
    :return: sum of the selected scores divided by (number of matches + 1)
    """
    keys = keys[:20]
    x_max = np.max(self.weight_list)
    x_min = np.min(self.weight_list)
    spread = x_max - x_min

    # Min-max normalisation. When all weights are equal the range is zero;
    # use 0 instead of dividing by zero, which made every score NaN.
    boosts = {
        i: (self.weight_list[i] - x_min) / spread if spread else 0.0
        for i in range(len(self.keylist))
    }

    matrix = {
        (i, j): synonyms.compare(self.keylist[i], keys[j], seg=False)
        * (1 + boosts[i])
        for i in range(len(self.keylist))
        for j in range(len(keys))
    }

    sim, k = 0, 1
    while matrix:
        best = max(matrix, key=matrix.get)
        row, col = best
        k += 1
        sim += matrix[best]
        # Drop every remaining pair that shares the chosen row or column.
        matrix = {
            pair: value for pair, value in matrix.items()
            if pair[0] != row and pair[1] != col
        }
    return sim / k
def similar_scores(word1, word2):
    """Return the ``synonyms.compare`` similarity of two words.

    :param word1: first word
    :param word2: second word
    :return: the score returned by ``synonyms.compare(word1, word2)``
    """
    from synonyms import compare

    return compare(word1, word2)
def find_answer(input, file_url, top_n=5):
    """Return the answers of the corpus questions most similar to ``input``.

    Each corpus line is split on whitespace; the first token is the
    question and the second the answer. Pairs are ranked by
    ``synonyms.compare(input, question, seg=True)``, best first.

    :param input: the sentence to answer
    :param file_url: path of the corpus file
    :param top_n: maximum number of answers to return (default 5)
    :return: up to ``top_n`` answers joined by newlines, ``""`` when the
        corpus has no usable line, or ``-1`` when the file cannot be opened
    """
    try:
        corpus = open(file_url, 'r')
    except (OSError, TypeError, ValueError):
        print("Could not open file")
        return -1

    pairs = []
    # ``with`` closes the handle, which the original never did.
    with corpus:
        for line in corpus:
            fields = line.split()
            # Blank or single-token lines have no answer; skip them
            # instead of failing with IndexError.
            if len(fields) < 2:
                continue
            pairs.append((fields[0], fields[1]))

    if not pairs:
        return ""

    print('inputSentece=', input, "pairs[0]", pairs[0])
    pairs.sort(key=lambda pair: -synonyms.compare(input, pair[0], seg=True))
    # Slicing tolerates corpora with fewer than ``top_n`` pairs.
    return "\n".join(answer for _, answer in pairs[:top_n])
def string_align(word, content_list):
    """Flatten, de-duplicate and merge near-duplicate entries.

    Empty strings and empty lists in ``content_list`` are dropped, and
    non-empty lists are flattened one level. Two entries are treated as
    duplicates when their ``synonyms.compare`` score exceeds 0.5 or their
    ``Levenshtein.ratio`` exceeds 0.8; the shorter one replaces the longer.

    :param word: unused
    :param content_list: strings and/or lists of strings
    :return: list of the surviving entries (order follows set iteration)
    """
    unique_items = set()
    for entry in content_list:
        if entry == '':
            continue
        is_list = type(entry).__name__ == 'list'
        if is_list and len(entry) > 0:
            for member in entry:
                unique_items.add(member)
        elif not is_list:
            unique_items.add(entry)

    new_list = list(unique_items)
    size = len(new_list)
    # Pairwise pass: overwrite the longer of two similar entries with the
    # shorter, so that the final set() collapses them.
    for i in range(size):
        for j in range(i + 1, size):
            first, second = new_list[i], new_list[j]
            similar = (synonyms.compare(first, second) > 0.5
                       or Levenshtein.ratio(first, second) > 0.8)
            if not similar:
                continue
            if len(first) > len(second):
                new_list[i] = second
            else:
                new_list[j] = first
    return list(set(new_list))
def visit_baidu(msg):
    """Answer ``msg`` from a Baidu Baike page.

    Tokens whose part-of-speech flag is in a fixed list are collected from
    ``msg``. With exactly one such token, the page's meta description is
    returned. With several, the first token names the page and the
    remaining text is compared (``synonyms.compare``, no segmentation)
    with the names of the page's basic-info items; the value of the best
    matching item is returned. Any exception is printed and turned into a
    fixed apology string. With no matching token the function returns None.

    :param msg: user message
    :return: str answer, or None when no token matches
    """
    try:
        jieba.load_userdict("dict.txt")
        n_flags = ["ns", "n", "nr", "nz", "nt", "nw", "t"]
        task = []
        for token, pos in pseg.cut(msg, use_paddle=True):
            print(token, pos)
            if pos in n_flags:
                task.append(token)
        print("分词结果如下:", task)
        headers = {"User-Agent": "Mozilla/5.0 "}

        if len(task) == 1:
            api = 'https://baike.baidu.com/search/word?word=' + task[0]
            r = requests.get(api, headers=headers, timeout=10)
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'html.parser')
            metas = soup.findAll(name='meta', attrs={"name": "description"})
            return metas[0]['content']

        if len(task) > 1:
            word = task[0]
            # Everything except the page name is the attribute asked for.
            target = "".join(task).replace(word, "")
            print(target)
            api = 'https://baike.baidu.com/search/word?word=' + word
            r = requests.get(api, headers=headers, timeout=10)
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'html.parser')

            def cleaned(selector):
                texts = (node.text.replace("\xa0", "")
                         for node in soup.select(selector))
                return [text for text in texts if text]

            names = cleaned(".basicInfo-item.name")
            values = cleaned(".basicInfo-item.value")
            # Indexed on purpose: a shorter ``values`` list raises and is
            # reported through the handler below.
            info_items = [{"name": names[k], "value": values[k]}
                          for k in range(len(names))]

            similarity = -1
            result = "什么也没有找到~~"
            for item in info_items:
                compare = synonyms.compare(target, item["name"], seg=False)
                if compare > similarity:
                    similarity = compare
                    print(item["name"])
                    result = item["value"]
            return "已经为你找到如下信息:\r" + result
    except Exception as e:
        print(e)
        return "抱歉出错了什么也没找到~"
def similarityComparison():
    """Walk column E of the active worksheet (unfinished synonym handling).

    Loads the workbook at the module-level ``path``, iterates the cells of
    ``E2:E<max_row>`` and calls ``synonyms.compare`` for each cell value.

    NOTE(review): this looks like work in progress. The inner loop over
    ``E3:E<max_row>`` does nothing, and ``synonyms.compare`` is called with
    a single argument, so it presumably raises TypeError — confirm the
    intended second argument. The nesting of the last statement is assumed.
    """
    # Synonym handling; the synonym lexicon is still missing.
    wb = load_workbook(path)
    sheet = wb.active
    for i in sheet["E2:E{}".format(sheet.max_row)]:
        for j in i:
            for i2 in sheet["E3:E{}".format(sheet.max_row)]:
                pass
            r = synonyms.compare(j.value, )
def get_similrity(self, word1, word2):
    """Return the similarity of two words.

    The cosine similarity of the two vectors looked up in ``synonyms.v``
    is used when both lookups succeed and neither vector has zero norm;
    otherwise the result of ``synonyms.compare(word1, word2, seg=False)``
    is returned.

    :param word1: first word
    :param word2: second word
    :return: similarity score
    """
    try:
        v1 = synonyms.v[word1]
        v2 = synonyms.v[word2]
        denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
        # A zero-norm vector would make the cosine NaN; use the fallback.
        if denominator != 0:
            return np.dot(v1, v2) / denominator
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt and
        # SystemExit are no longer swallowed; lookup failures still fall
        # through to the fallback below.
        pass
    return synonyms.compare(word1, word2, seg=False)
def get_api_list(get_type, search_query, user_ID, page_no=default_page_no,
                 page_size=default_page_size, default_max_score=0.4):
    """Return one page of APIs selected according to ``get_type``.

    * ``ApiGetType.all``: search results (or all APIs when there is no
      query), ordered by ``-create_time``.
    * ``favor`` / ``used`` / ``star``: the user's ``<get_type>_apis``,
      optionally restricted to the search results, passed through
      ``custom_sort``.
    * ``chat``: every API as a dict with an added ``"score"`` — the
      ``synonyms.compare`` of the query against the API keyword — sorted
      by descending score.

    :param get_type: one of the ``ApiGetType`` values
    :param search_query: query text; may be empty except for ``chat``
    :param user_ID: user whose lists are read for favor/used/star
    :param page_no: 1-based page number
    :param page_size: items per page
    :param default_max_score: minimum best score required in ``chat`` mode
    :return: the requested slice
    :raises Warning: in ``chat`` mode when nothing matches well enough
    :raises Error: for an unknown ``get_type``
    """
    start = (page_no - 1) * page_size
    end = page_no * page_size

    if get_type == ApiGetType.all:
        apis = search(search_query) if search_query else api_business.get()
        return apis.order_by('-create_time')[start:end]

    if get_type in (ApiGetType.favor, ApiGetType.used, ApiGetType.star):
        user = UserBusiness.get_by_user_ID(user_ID=user_ID)
        user_apis = user[get_type + "_apis"]
        if search_query:
            # Search first, then keep only the hits the user already has.
            user_apis = [match_api for match_api in search(search_query)
                         if match_api in user_apis]
        return custom_sort(user_apis)[start:end]

    if get_type == ApiGetType.chat:
        apis = api_business.get_all().order_by('-create_time')
        apis_score = [
            {
                **api.to_mongo(),
                "score": synonyms.compare(search_query, api.keyword, seg=True)
            }
            for api in apis
        ]
        # With no API at all there is no best score to inspect; report
        # "no match" instead of failing with IndexError.
        if not apis_score:
            raise Warning(ErrorMessage.no_match_apis)
        apis_score.sort(key=lambda item: -item["score"])
        if apis_score[0]["score"] < default_max_score:
            raise Warning(ErrorMessage.no_match_apis)
        return apis_score[start:end]

    raise Error(ErrorMessage.error_get_type)
def similarmatch():
    """Attach similar entity names to each row of the global ``keywords``.

    For the first ``testnum`` rows, every comma-separated keyword is
    compared (``synonyms.compare``, no segmentation) with every
    ``entities['name']`` entry; names scoring above ``threshold`` are
    appended to that row's ``'entity'`` list. ``nermatch()`` is called
    afterwards.

    :return: the module-level ``keywords`` object
    """
    keywords['entity']=''
    for i in range(testnum):
        keywords['entity'][i]=[]
        candidates = keywords['keywords'][i].split(',')
        for candidate in candidates:
            for j in range(len(entities)):
                name = entities['name'][j]
                # Above the threshold the name counts as a similar word.
                if synonyms.compare(name, candidate, seg=False) > threshold:
                    keywords['entity'][i].append(name)
    nermatch()
    return(keywords)
def compare_chs_sentence(sen1, sen2, threshold=0):
    """Tell whether two Chinese sentences are similar enough.

    :param sen1: the original sentence
    :param sen2: the mutated sentence
    :param threshold: score that must be exceeded for the pair to count
        as similar
    :return: False when ``synonyms.compare(sen1, sen2)`` is less than or
        equal to ``threshold``, True otherwise
    """
    similarity_score = synonyms.compare(sen1, sen2)
    if similarity_score <= threshold:
        return False
    return True
def similarity(db, keywords):
    """Find the vocabulary word most similar to ``keywords``.

    The vocabulary is the comma-separated first line of a fixed word file.
    Every word different from the UTF-8 encoded ``keywords`` is scored with
    ``synonyms.compare`` (no segmentation) and the best one is kept.

    NOTE(review): the byte-string BOM handling and ``keywords.encode``
    suggest this was written for Python 2 — confirm before relying on it
    under Python 3. The ``print(...)`` calls behave the same on both.

    :param db: unused
    :param keywords: the word to look up
    :return: ``(best_word, best_score)``; ``(0, 0.01)`` when no word
        scores above 0.01 or the file is empty
    """
    # ``with`` closes the handle, which the original left open.
    with open("D:\\git\\bigdata_test\\bigdata_test\\wordresult.txt", 'r') as fp:
        sentences = fp.readlines()

    best_word = 0
    best_sim = 0.01
    # An empty file has no first line to split.
    if not sentences:
        return best_word, best_sim

    # Vocabulary list, with any UTF-8 BOM bytes and whitespace removed.
    st_list = [item.replace("\xef\xbb\xbf", "").strip()
               for item in sentences[0].split(',')]
    print(type(st_list))
    print(st_list)
    sentences1 = [[entry] for entry in st_list]
    print(type(sentences1))
    print(sentences1)

    keywords1 = keywords.encode('utf-8')
    for wrapped in sentences1:
        sent = wrapped[0]
        if keywords1 != sent:
            sim = synonyms.compare(keywords1, sent, seg=False)
            if sim > best_sim:
                best_sim = sim
                best_word = sent
    return best_word, best_sim
def comp():
    """Score every line of ``data.txt`` against a fixed product name.

    Writes ``<line>,<score>`` rows to ``output.txt``, where the score is
    ``synonyms.compare`` with segmentation, then prints ``success``.
    """
    sen1 = "海尔CXW-200"
    with open("data.txt", "r", encoding="utf-8") as source:
        data_list = [line.strip("\n") for line in source]

    with open("output.txt", "w", encoding="utf-8") as sink:
        for sen2 in data_list:
            r = synonyms.compare(sen1, sen2, seg=True)
            sink.write(sen2 + "," + str(r) + "\n")
    print("success")
def get_reply_content(pre_compare, msg):
    """Pick the reply for ``msg`` from a rule tuple.

    :param pre_compare: sequence whose item 1 is the rule kind. For
        ``'content'`` item 0 is the reply itself. For ``'corpus'`` item 0
        is a comma-separated list of phrases and item 2 is the reply.
    :param msg: incoming message
    :return: the reply, or None when the kind is unknown or no corpus
        phrase reaches a ``synonyms.compare`` score of 0.3
    """
    kind = pre_compare[1]
    if kind == 'content':
        return pre_compare[0]
    if kind == 'corpus':
        for phrase in pre_compare[0].split(','):
            if synonyms.compare(msg, phrase, seg=True) >= 0.3:
                return pre_compare[2]
    return None
def remove_ambiguity(context, possible_entities, kb):
    """Choose the candidate entity most similar to ``context``.

    :param context: text the entity appeared in
    :param possible_entities: non-empty list of candidate entities
    :param kb: unused
    :return: the only candidate when there is one, otherwise the first
        candidate with the highest ``synonyms.compare`` score
        (``ignore=True``)
    """
    if len(possible_entities) == 1:
        return possible_entities[0]
    return max(
        possible_entities,
        key=lambda entity: synonyms.compare(context, entity, ignore=True))
def replace_ambiguous(context, possible_entities):
    """Choose the candidate entity most similar to ``context``.

    :param context: text the entity appeared in
    :param possible_entities: list of candidate entities
    :return: ``''`` for an empty list, the only candidate when there is
        one, otherwise the first candidate with the highest
        ``synonyms.compare`` score
    """
    candidate_count = len(possible_entities)
    if candidate_count == 0:
        return ''
    if candidate_count == 1:
        return possible_entities[0]

    # ignore=True makes the comparison skip out-of-vocabulary words.
    scores = [synonyms.compare(context, entity, ignore=True)
              for entity in possible_entities]
    best_index = scores.index(max(scores))
    return possible_entities[best_index]
def test_similarity(self):
    """Print sentence-similarity scores for three fixed sample pairs.

    Nothing is asserted; each ``synonyms.compare`` score (with
    segmentation) is only printed for manual inspection.
    """
    cases = (
        ("旗帜引领方向", "道路决定命运", "旗帜引领方向 vs 道路决定命运:"),
        ("旗帜引领方向", "旗帜指引道路", "旗帜引领方向 vs 旗帜指引道路:"),
        ("发生历史性变革", "发生历史性变革", "发生历史性变革 vs 发生历史性变革:"),
    )
    for sen1, sen2, label in cases:
        print(label, synonyms.compare(sen1, sen2, seg=True))
def analyze(self, data):
    """Count, for each item, how many later items are similar to it.

    The first element of every entry in ``data`` is compared with the
    first element of every entry after it (``synonyms.compare`` with
    segmentation). Each comparison scoring above 0.5 increments the
    earlier item's count, which starts at 1, and is printed.

    :param data: sequence of indexable entries; element 0 is the text
    :return: dict mapping each text to its count. The original built this
        dict but never returned it.
    """
    result = {}
    for position, item in enumerate(data, start=1):
        tomerge = item[0]
        result[tomerge] = 1
        for others in data[position:]:
            to_cmp = others[0]
            score = synonyms.compare(tomerge, to_cmp, seg=True)
            if score > 0.5:
                result[tomerge] += 1
                print('{}-{}-{}'.format(tomerge, to_cmp, score))
    return result
def match_judge(keyword, article):
    """Tell whether ``keyword`` matches an article's tags, title or summary.

    The keyword is cut into pieces with ``jieba.cut(cut_all=True)``. The
    pieces are compared, in this order, with the article's tags, the
    pieces of its title and the pieces of its summary. The first
    ``synonyms.compare`` score above 0.7 ends the search.

    :param keyword: search keyword
    :param article: article object providing ``id``, title and summary
    :return: True on the first match, False when nothing matches
    """
    def pieces(text):
        return "/".join(jieba.cut(text, cut_all=True)).split('/')

    def close(left, right):
        return synonyms.compare(left, right, seg=True) > 0.7

    keyword_list = pieces(keyword)
    tags = models.Tag.objects.filter(contact_id=article.id)

    # Tags first.
    if any(close(key_word, tag.标签)
           for key_word in keyword_list for tag in tags):
        return True

    # The title is only segmented once the tags have failed.
    title_pieces = pieces(article.文章标题)
    if any(close(key_word, piece)
           for key_word in keyword_list for piece in title_pieces):
        return True

    summary_pieces = pieces(article.文章摘要)
    return any(close(key_word, piece)
               for key_word in keyword_list for piece in summary_pieces)
def _try(self, plot): if not self._filter_confession(plot): return 0 max_sim = 0 subs = plot.split(',') for sub in subs: if len(sub) < 1: continue sim = synonyms.compare(self.confession_key, plot, seg=True) if sim > max_sim: max_sim = sim return max_sim
def webCompare(SourceDocString1):
    """Compare a document with web pages found through a Bing site search.

    Up to three keywords extracted with ``synonyms.keywords`` are searched
    on Bing, restricted to one site. From the first eight result links
    those on that site are opened, and the text of their
    ``div#content-txt>p`` elements is collected per link. The first five
    collected pages are scored against the document with
    ``synonyms.compare`` (``seg=True, ignore=True``).

    :param SourceDocString1: the document text
    :return: dict mapping similarity score to page link for the three
        highest scores, best first
    """
    # Extract up to three keywords for the search.
    keywords = synonyms.keywords(SourceDocString1, topK=3)
    print("文本关键词为", keywords)
    site = "m.51test.net"
    # join() tolerates fewer than three keywords; indexing did not.
    query_url = ("https://cn.bing.com/search?q=" + " "
                 + " ".join(keywords[:3]) + " site:" + site)

    site_contentdict = {}
    browser = webdriver.Edge(EdgeChromiumDriverManager().install())
    try:
        browser.get(query_url)
        search_links = []
        for anchor in browser.find_elements_by_css_selector("h2>a")[0:8]:
            href = anchor.get_attribute("href")
            # Skip results that are not on the target site.
            if isMatch("https://" + site, href) == True:
                search_links.append(href)

        for link in search_links:
            browser.get(link)
            paragraphs = browser.find_elements_by_css_selector(
                "div#content-txt>p")
            texts = (p.get_attribute("textContent") for p in paragraphs)
            # Text is collected per page; the original kept appending to
            # one string, so each page also carried the earlier pages.
            site_contentdict[link] = "".join(
                text for text in texts if len(text) > 0)
    finally:
        # Always release the browser, including when a step above fails.
        browser.quit()

    # Score each page directly against its link, which removes the
    # original's reverse lookup from page text to link.
    scored = {}
    for link, text in list(site_contentdict.items())[0:5]:
        rel = synonyms.compare(SourceDocString1, text, seg=True, ignore=True)
        print(rel)
        scored[rel] = link

    ranked = sorted(scored.items(), key=lambda pair: pair[0], reverse=True)
    print(ranked)
    return dict(ranked[0:3])