Example #1
 def test_swap_sent(self):
     print("test_swap_sent")
     s1 = synonyms.compare("教学", "老师")
     s2 = synonyms.compare("老师", "教学")
     print('"教学", "老师": %s ' % s1)
     print('"老师", "教学": %s ' % s2)
     assert s1 == s2, "Scores should be the same after swapping the sentences"
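compare is the only API exercised on this page. A minimal sketch of the call, assuming the published synonyms interface (the score is a float in [0, 1]; seg controls whether jieba segmentation runs first):

import synonyms

# word-to-word comparison; symmetric, as the test above asserts
score = synonyms.compare("教学", "老师")
assert 0.0 <= score <= 1.0
assert score == synonyms.compare("老师", "教学")

# sentence-to-sentence comparison with explicit segmentation
score2 = synonyms.compare("旗帜引领方向", "旗帜指引道路", seg=True)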
Example #2
    def list_projects_chat(cls,
                           search_query,
                           page_no=None,
                           page_size=None,
                           default_max_score=0.4,
                           privacy="public"):
        # fall back to the first page and a nominal page size when the
        # caller omits pagination arguments (otherwise the arithmetic
        # below raises a TypeError)
        page_no = page_no or 1
        page_size = page_size or 10
        start = (page_no - 1) * page_size
        end = page_no * page_size

        # all_apps = cls.get_all()
        all_apps = cls.repo.read(query={"privacy": privacy})
        # all_apps = cls.read(query={"privacy": privacy})

        # score each app against the search query
        for app in all_apps:
            name_score = synonyms.compare(search_query, app.name, seg=True)
            description_score = synonyms.compare(search_query,
                                                 app.description,
                                                 seg=True)
            app.score = (name_score + description_score) / 2
        # drop apps scoring below default_max_score
        apps = list(
            filter(lambda app: app.score >= default_max_score, all_apps))

        count = len(apps)
        apps = sorted(apps, key=lambda item: -item.score)
        return Objects(objects=apps[start:end],
                       count=count,
                       page_no=page_no,
                       page_size=page_size)
Example #3
    def test_similarity(self):
        '''
        Generate sentence similarity
        '''
        sen1 = "旗帜引领方向"
        sen2 = "道路决定命运"
        r = synonyms.compare(sen1, sen2, seg=True)
        print("旗帜引领方向 vs 道路决定命运:", r)
        # assert r == 0.0, "the similarity should be zero"

        sen1 = "旗帜引领方向"
        sen2 = "旗帜指引道路"
        r = synonyms.compare(sen1, sen2, seg=True)
        print("旗帜引领方向 vs 旗帜指引道路:", r)
        # assert r > 0, "the similarity should be bigger than zero"

        sen1 = "发生历史性变革"
        sen2 = "发生历史性变革"
        r = synonyms.compare(sen1, sen2, seg=True)
        print("发生历史性变革 vs 发生历史性变革:", r)
        # assert r > 0, "the similarity should be bigger than zero"

        sen1 = "骨折"
        sen2 = "巴赫"
        r = synonyms.compare(sen1, sen2, seg=True)
        print("%s vs %s" % (sen1, sen2), r)

        sen1 = "你们好呀"
        sen2 = "大家好"
        r = synonyms.compare(sen1, sen2, seg=False)
        print("%s vs %s" % (sen1, sen2), r)
Example #4
def if_has_relation(words):
    """Check whether a relation word appears in the sentence.

    :param words: the segmented words of the sentence
    :return: (index, relation_word); index is 0 on a match, -1 otherwise
    """
    # Is only a single relation ever abstracted from one sentence?
    # The method below assumes exactly one.
    relation_word = ""
    index = -1
    for word in words:
        for key in relations:
            if synonyms.compare(word,
                                key) > 0.98 and key not in wrong_relation:
                index = 0
                relation_word = key
                break
        if index == -1:
            for values in relations.values():
                for relation in values:
                    if synonyms.compare(
                            word, relation
                    ) > 0.98 and relation not in wrong_relation:
                        index = 0
                        relation_word = word
                        break
                if index != -1:
                    break
        if index != -1:
            break
    return index, relation_word
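if_has_relation reads two module-level globals, relations and wrong_relation, that are not shown in this example. A hypothetical sketch of the shapes the loops assume (names and contents are illustrative only, not from the original source):

import synonyms

# relations maps a canonical relation word to its variant forms
relations = {
    "父亲": ["爸爸", "父亲大人"],
    "母亲": ["妈妈", "母亲大人"],
}
# candidate words that must never count as relations
wrong_relation = ["关系"]

index, relation_word = if_has_relation(["我", "的", "爸爸"])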
Example #5
    def simCal(self, word, entities, flag):
        """Compute the similarity between a word and the words of a dictionary.

        Intended score: shared-character count / min(|A|, |B|), plus cosine
        similarity.
        :param word: str
        :param entities: List
        :return: (entity, score, flag) tuples sorted by score, descending
        """
        import synonyms as sy
        a = len(word)
        scores = []
        for entity in entities:
            b = len(entity)
            temp = []
            try:
                if not np.isnan(sy.compare(word, entity)):
                    temp.append(sy.compare(word, entity))
            except Exception:
                pass
            # edit distance normalized by the combined length
            score3 = 1 - self.editDistanceDP(word, entity) / (a + b)
            if score3 > 0.5:
                temp.append(score3)

            if not temp:  # nothing qualified; avoid dividing by zero
                continue
            score = sum(temp) / len(temp)
            if score >= 0.7:
                scores.append((entity, score, flag))

        scores.sort(key=lambda k: k[1], reverse=True)
        return scores
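self.editDistanceDP is referenced above but not shown. Presumably it is a standard Levenshtein dynamic program; a sketch along those lines, as a method of the same class (not the original implementation):

    def editDistanceDP(self, s1, s2):
        # dp[i][j] = edits needed to turn s1[:i] into s2[:j]
        m, n = len(s1), len(s2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(m + 1):
            dp[i][0] = i
        for j in range(n + 1):
            dp[0][j] = j
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                cost = 0 if s1[i - 1] == s2[j - 1] else 1
                dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                               dp[i][j - 1] + 1,         # insertion
                               dp[i - 1][j - 1] + cost)  # substitution
        return dp[m][n]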
Example #6
    def matchBestTriple(self, des):
        '''match the best triple list by synonyms'''

        # analyse the question's triples
        des_triples = self.extor.triples_main(des)

        data = self.collect.find()

        # best document score seen so far
        domMaxScore = 0
        domTitle = ''

        for each in data:

            # read the triple list from the document
            triple_list = each["triples"]

            # running score for this document
            scoreSum = 0

            # score every triple extracted from the question
            for eachTriple in des_triples:

                # per-triple scores; keep the best match in the document
                relaScore = 0
                n1Score = 0
                n2Score = 0

                for Triple in triple_list:

                    # score the relation description
                    relation_score = sy.compare(Triple[1], eachTriple[1], seg=True)
                    if relation_score > relaScore:
                        relaScore = relation_score

                    # similarity of the two nodes
                    node1_score = sy.compare(Triple[2], eachTriple[2], seg=True)
                    node2_score = sy.compare(Triple[0], eachTriple[0], seg=True)
                    if node1_score > n1Score:
                        n1Score = node1_score
                    if node2_score > n2Score:
                        n2Score = node2_score

                single_triple_score = 0.6 * relaScore + 0.2 * n1Score + 0.2 * n2Score
                scoreSum += single_triple_score

            if scoreSum > domMaxScore:
                domMaxScore = scoreSum
                domTitle = each['title']

        # fetch the triple list of the best-matching document
        bestDom = self.collect.find_one({'title': domTitle})
        bestTriple = bestDom["triples"]

        return bestTriple
Example #7
    def testSenSimilarity(self):
        '''
        Generate sentence similarity
        '''
        sen1 = "旗帜引领方向"
        sen2 = "道路决定命运"
        assert synonyms.compare(sen1,
                                sen2) == 0.0, "the similarity should be zero"

        sen1 = "发生历史性变革"
        sen2 = "取得历史性成就"
        assert synonyms.compare(
            sen1, sen2) > 0, "the similarity should be bigger than zero"
Example #8
def getTop10(Allwords, Allbag_file):
    '''
    :param Allwords:    2-D array of the user's keywords
    :param Allbag_file: product-category file
    :return: the top ten keyword categories
    '''
    user_words = Allwords
    all_bag = []
    with open(Allbag_file, 'r') as f:
        text = f.read()
        all_bag = text.split(',')

    bag_num = dict()
    bag_len = len(all_bag)
    for i in range(bag_len):
        num = 0
        for word in user_words:
            num = num + synonyms.compare(word, all_bag[i]) / bag_len
        bag_num[all_bag[i]] = num

    top_items = sorted(bag_num.items(), key=lambda e: e[1], reverse=True)[:10]

    re_list = []
    for one in top_items:
        re_list.append(one[0])
    return re_list
Example #9
    def NLP(self, context, score, cla_id):
        # sentiment scoring with SnowNLP
        # keyword extraction with jieba.analyse
        # synonym similarity comparison with synonyms

        tag_words = jae.extract_tags(context, topK=5, withWeight=True, allowPOS=())
        Predict_Score = SnowNLP(context).sentiments
        Final_Tag_Score = {}  # the comment's five keywords and their sentiment scores

        for word, weight in tag_words:
            Rank_Dict = dict()
            for origin_tag in self.Taglist:
                Rank_Dict[origin_tag] = synonyms.compare(word, origin_tag, seg=True)

            # .items() must be called, not passed as a bound method
            Sorted_Rank_Dict = sorted(Rank_Dict.items(), key=lambda x: x[1], reverse=True)
            Most_Similar_Tag = Sorted_Rank_Dict[0][0]
            Similar_Score = Sorted_Rank_Dict[0][1]
            Final_Tag_Score[Most_Similar_Tag] = score * Predict_Score * Similar_Score * weight

        # fold the feature scores extracted from the comment into the course's features
        origin_feat_vector = self.DataGetter.Get_Cla_Feats({'cla_id': cla_id})
        for tag, score in Final_Tag_Score.items():
            index = self.Taglist.index(tag)
            origin_feat_vector[index] += 0.2 * score

        new_info = {'cla_id': cla_id, 'feats': origin_feat_vector}
        self.DataGetter.Update_ClaFeats(new_info)
Example #10
 def test_eval_synonyms(self):
     logging.info("test_eval_synonyms")
     from_ = os.path.join(curdir, os.path.pardir, "data",
                          "simtrain_to05sts.txt")
     to_ = os.path.join(curdir, os.path.pardir, "data", "synonyms_eval.txt")
     if os.path.exists(to_): os.remove(to_)
     append_line_to_file(
         to_,
         "# [synonyms](https://github.com/huyingxi/Synonyms) v(%s) similarity evaluation \n"
         % synonyms.__version__)
     append_line_to_file(
         to_, "evaluation data: https://github.com/IAdmireu/ChineseSTS \n \n")
     append_line_to_file(to_, "score range [0,5]; higher means more similar \n")
     append_line_to_file(
         to_, "| %s | %s | %s | %s | %s | %s | \n" %
         ("sentence A ID", "sentence A", "sentence B ID", "sentence B",
          "labeled score", "predicted score"))
     append_line_to_file(to_, "| --- | --- | --- | --- | --- | --- | \n")
     with open(from_, "r") as fin:
         for x in tqdm(list(fin.readlines())):
             o = x.strip().split("\t")
             if len(o) == 5:
                 yid, yc, zid, zc, yzs = o
                 syn_score = synonyms.compare(yc, zc) * 5.0
                 # print("%s | %s => %s | %s" % (yc, zc, yzs, syn_score))
                 append_line_to_file(
                     to_, "| %s | %s | %s | %s | %s | %.3f | \n" %
                     (yid, yc, zid, zc, yzs, syn_score))
Example #11
def json_rewrite_statement():
    text = request.args.get('text')
    print('text:', text)

    key = 'json_rewrite_statement' + str(text)
    if cache.get(key) is None:
        print('creating a new cache entry')

        # tnlp = libs.Nlp()
        # keywords = tnlp.dnn(text=text)
        rs = libs.RewritStatement()
        # text = "python中for循环输出列表索引与对应的值 - tanlangqie的博客"
        t = rs.text(text)
        rnk = synonyms.compare(text, t, seg=True)
        items = {
            'text': text,
            'new_text': t,
            'relevant': rnk
        }
        cache.set(key, items)
    else:
        print('cache hit')
        items = cache.get(key)
    print('json_rewrite_statement', items)
    return jsonify(items)
Example #12
 def matrix_dis(self, keys):
     start = time.time()
     keys = keys[:20]
     sim, k = 0, 1
     matrix = dict()
     x_max = np.max(self.weight_list)
     x_min = np.min(self.weight_list)
     count_list = dict()
     for i in range(len(self.keylist)):
         count_list[i] = (self.weight_list[i] - x_min) / (x_max - x_min)
     start2 = time.time()
     for i in range(len(self.keylist)):
         for j in range(len(keys)):
             start3 = time.time()
             matrix[(i, j)] = synonyms.compare(
                 self.keylist[i], keys[j], seg=False) * (1 + count_list[i])
             # print(time.time()-start3)
     # print(time.time()-start2)
     while matrix:
         max_couple = max(matrix, key=matrix.get)
         i, j = max_couple[0], max_couple[1]
         k += 1
         simil = matrix.get(max_couple)
         sim += simil  # if simil > 0.2 else 0
         for t1 in range(len(keys)):
             if (i, t1) in matrix:
                 matrix.pop((i, t1))
         for t2 in range(len(self.keylist)):
             if t2 != i and (t2, j) in matrix:
                 matrix.pop((t2, j))
     # print("total compare time", time.time() - start)
     return sim / k
Example #13
def similar_scores(word1, word2):
    """
    Compute the semantic similarity of two words from their word-vector
    representations.
    :return: similarity score of word1 and word2 (1.0 at most)
    """
    import synonyms
    return synonyms.compare(word1, word2)
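A usage sketch; the printed values depend entirely on the word vectors shipped with the library:

print(similar_scores("教学", "老师"))  # a float in [0, 1]
print(similar_scores("骨折", "巴赫"))  # unrelated words should score low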
Example #14
def find_answer(input_sentence, file_url):
    try:
        CorpusFile = open(file_url, 'r')
    except IOError:
        print("Could not open file")
        return -1
    pairs = []
    while True:
        lines = CorpusFile.readlines(10000)
        if not lines:
            break
        for line in lines:
            question = line.split()[0]
            ans = line.split()[1]
            pairs.append((question, ans))
    CorpusFile.close()

    print('inputSentence=', input_sentence, "pairs[0]", pairs[0])
    # rank the question/answer pairs by similarity to the input sentence
    pairs.sort(key=lambda x: -synonyms.compare(input_sentence, x[0], seg=True))

    # join the answers of the five best-matching questions
    ans = ""
    for i in range(5):
        ans += pairs[i][1]
        if i != 4:
            ans += '\n'
    return ans
Example #15
def string_align(word, content_list):
    new_list = []
    list_to_set = set()  # put the contents into a set for automatic dedup
    # branch on whether each item is a string, empty, a list, or an empty list
    for i in content_list:
        if i == '':
            pass
        elif isinstance(i, list) and len(i) > 0:
            for j in i:
                list_to_set.add(j)
        elif not isinstance(i, list):
            list_to_set.add(i)
        else:
            pass
    new_list = list(list_to_set)
    # drop near-duplicates (a bubble-sort-like pairwise pass)
    for i in range(0, len(new_list)):
        for j in range(i + 1, len(new_list)):
            if synonyms.compare(new_list[i],
                                new_list[j]) > 0.5 or Levenshtein.ratio(
                                    new_list[i], new_list[j]) > 0.8:
                # keep the shorter of the two similar strings
                if len(new_list[i]) > len(new_list[j]):
                    new_list[i] = new_list[j]
                else:
                    new_list[j] = new_list[i]
    new_list = list(set(new_list))
    return new_list
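A usage sketch, assuming the python-Levenshtein package provides Levenshtein.ratio; the word parameter is unused by the function, and output order is unspecified because the values round-trip through sets:

import synonyms
import Levenshtein

merged = string_align(None, ["老师", "教师", "", ["老师", "讲师"]])
print(merged)  # near-duplicates collapse to the shorter variant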
Example #16
def visit_baidu(msg):
    try:
        jieba.load_userdict("dict.txt")
        n_flags = ["ns", "n", "nr", "nz", "nt", "nw", "t"]
        words = pseg.cut(msg, use_paddle=True)
        task = []
        for word, flag in words:
            print(word, flag)
            if flag in n_flags:
                task.append(word)
        print("segmentation result:", task)
        headers = {"User-Agent": "Mozilla/5.0 "}

        if len(task) == 1:
            word = task[0]
            api = 'https://baike.baidu.com/search/word?word=' + word
            r = requests.get(api, headers=headers, timeout=10)
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'html.parser')
            content = soup.findAll(name='meta', attrs={"name": "description"})
            content = content[0]['content']
            return content

        if len(task) > 1:
            word = task[0]
            target = ""
            for i in task:
                target = target + i
            target = target.replace(word, "")
            print(target)
            api = 'https://baike.baidu.com/search/word?word=' + word
            r = requests.get(api, headers=headers, timeout=10)
            r.encoding = r.apparent_encoding
            soup = BeautifulSoup(r.text, 'html.parser')
            names = []
            for i in soup.select(".basicInfo-item.name"):
                if i.text.replace("\xa0", ""):
                    names.append(i.text.replace("\xa0", ""))
            values = []
            for i in soup.select(".basicInfo-item.value"):
                if i.text.replace("\xa0", ""):
                    values.append(i.text.replace("\xa0", ""))
            info_items = []
            # zip guards against name/value lists of different lengths
            for name, value in zip(names, values):
                info_items.append({"name": name, "value": value})
            similarity = -1
            result = "Nothing was found~~"
            for item in info_items:
                compare = synonyms.compare(target, item["name"], seg=False)
                if compare > similarity:
                    similarity = compare
                    print(item["name"])
                    result = item["value"]
            return "Found the following information for you:\r" + result
    except Exception as e:
        print(e)
        return "Sorry, something went wrong and nothing was found~"
Example #17
def similarityComparison():
    # synonym handling; a proper synonym lexicon is still missing
    wb = load_workbook(path)
    sheet = wb.active
    for row in sheet["E2:E{}".format(sheet.max_row)]:
        for cell in row:
            for row2 in sheet["E3:E{}".format(sheet.max_row)]:
                for cell2 in row2:
                    # assumed completion of the unfinished compare() call:
                    # score each pair of column-E cells
                    r = synonyms.compare(cell.value, cell2.value, seg=True)
Example #18
 def get_similarity(self, word1, word2):
     try:
         v1 = synonyms.v[word1]
         v2 = synonyms.v[word2]
         # cosine similarity of the two word vectors
         similar = np.dot(v1, v2) / (np.linalg.norm(v1) *
                                     np.linalg.norm(v2))
         return similar
     except Exception:
         # fall back to synonyms.compare for out-of-vocabulary words
         return synonyms.compare(word1, word2, seg=False)
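The expression above is plain cosine similarity. A tiny self-contained check of the formula with made-up vectors:

import numpy as np

v1 = np.array([1.0, 0.0])
v2 = np.array([1.0, 1.0])
cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
assert abs(cos - 1 / np.sqrt(2)) < 1e-9  # cos(45°) ≈ 0.7071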
Example #19
def get_api_list(get_type, search_query, user_ID, page_no=default_page_no,
                 page_size=default_page_size, default_max_score=0.4):
    start = (page_no - 1) * page_size
    end = page_no * page_size

    # apis = []
    if get_type == ApiGetType.all:
        # fetch everything
        if search_query:
            # apis = Api.objects.search_text(search_query).order_by('$text_score')
            apis = search(search_query)
        else:
            apis = api_business.get()  # paginated
        return apis.order_by('-create_time')[start:end]

    elif get_type in (ApiGetType.favor, ApiGetType.used, ApiGetType.star):
        # fetch a specific subset
        user = UserBusiness.get_by_user_ID(user_ID=user_ID)
        apis = user[get_type + "_apis"]
        if search_query:
            # search first, then filter the matches against apis
            match_apis = search(search_query)
            final_apis = list(filter(lambda match_api: match_api in apis, match_apis))
            apis = final_apis
        return custom_sort(apis)[start:end]

    elif get_type == ApiGetType.chat:
        # chatbot
        apis = api_business.get_all().order_by('-create_time')
        # score each api against the query
        apis_score = []
        for api in apis:
            api_json = api.to_mongo()
            apis_score.append({
                **api_json,
                "score": synonyms.compare(search_query, api.keyword, seg=True)
            })

            # TODO: could be switched to an object later
            # for api in apis:
            #     api.score = synonyms.compare(search_query, api.keyword, seg=True)
            # api_json = api.to_mongo()
            # apis_score.append({
            #     **api_json,
            #     "score": synonyms.compare(search_query, api.keyword, seg=True)
            # })
        apis_score = sorted(apis_score, key=lambda item: -item["score"])
        # highest score
        max_score = apis_score[0]["score"]
        if max_score < default_max_score:
            raise Warning(ErrorMessage.no_match_apis)
        else:
            apis = apis_score
            return apis[start:end]
    else:
        raise Error(ErrorMessage.error_get_type)
Example #20
def similarmatch():   # similar words match using synonyms
    keywords['entity'] = ''
    for i in range(testnum):
        keywords['entity'][i] = []
        for kw in keywords['keywords'][i].split(','):
            for j in range(len(entities)):
                # if the score exceeds the threshold, select it as a similar word
                if synonyms.compare(entities['name'][j], kw, seg=False) > threshold:
                    keywords['entity'][i].append(entities['name'][j])
    nermatch()
    return keywords
Example #21
def compare_chs_sentence(sen1, sen2, threshold=0):
    """
    Check whether the two input Chinese sentences are similar; if they are
    not, the caller may change the mutation strategy.
    :param sen1: the original sentence
    :param sen2: the mutated sentence
    :param threshold: similarity above this value counts as similar
    :return: True if the two sentences are similar, False otherwise
    """
    score = synonyms.compare(sen1, sen2)
    return score > threshold
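A usage sketch of the threshold check; the 0.5 cutoff is illustrative, not from the original code:

if compare_chs_sentence("旗帜引领方向", "旗帜指引道路", threshold=0.5):
    print("similar enough; keep this mutation")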
Example #22
def similarity(db, keywords):
    fp = open("D:\\git\\bigdata_test\\bigdata_test\\wordresult.txt", 'r')
    sentences = fp.readlines()
    st = sentences[0].split(',')
    st_list = []  # vocabulary list
    sim1 = 0.01  # similarity of the current candidate
    sim2 = 0.01  # best similarity found so far
    sent1 = 0  # closest word found

    for i in range(len(st)):
        # strip a UTF-8 BOM if one leaked into the data
        st_list.append(st[i].replace("\ufeff", "").strip())

    stlen = len(st_list)
    sentences1 = [[] for i in range(stlen)]
    print(type(st_list))
    print(st_list)
    print(type(sentences1))
    for a in range(0, stlen):
        sentences1[a].append(st_list[a])

    # an earlier word2vec variant, kept for reference:
    # model = word2vec.Word2Vec(sentences1, min_count=1, size=500)
    # y1 = model.most_similar(keywords)  # words closest in the model
    # keywords1 = y1[0][0]
    # sim = y1[0][1]
    # if keywords == keywords1:
    #     keywords1 = y1[1][0]
    #     sim = y1[1][1]
    print(sentences1)
    keywords1 = keywords
    sentlen = len(sentences1)
    for i in range(sentlen):
        sent = sentences1[i][0]
        if keywords1 != sent:
            sim1 = synonyms.compare(keywords1, sent, seg=False)
            if sim1 > sim2:
                sim2 = sim1
                sent1 = sent

    fp.close()
    return sent1, sim2
Example #23
def comp():
    sen1 = "海尔CXW-200"
    data_list = []
    with open("data.txt", "r", encoding="utf-8") as f:
        for line in f:
            data_list.append(line.strip("\n"))
    with open("output.txt", "w", encoding="utf-8") as f:
        for sen2 in data_list:
            r = synonyms.compare(sen1, sen2, seg=True)
            f.write(sen2 + "," + str(r) + "\n")
    print("success")
Example #24
                def get_reply_content(pre_compare, msg):
                    if pre_compare[1] == 'content':
                        return pre_compare[0]
                    elif pre_compare[1] == 'corpus':
                        ori_list = pre_compare[0].split(',')

                        for x in ori_list:
                            point = synonyms.compare(msg, x, seg=True)
                            if point >= 0.3:
                                return pre_compare[2]
                        return None
Example #25
def remove_ambiguity(context, possible_entities, kb):
    if len(possible_entities) == 1:
        return possible_entities[0]
    score = []
    for e in possible_entities:
        # ignore=True: skip out-of-vocabulary words instead of scoring them
        score.append(synonyms.compare(context, e, ignore=True))
    # return the entity most similar to the context
    return possible_entities[score.index(max(score))]
Example #26
def replace_ambiguous(context, possible_entities):
    if len(possible_entities) == 0:
        return ''
    if len(possible_entities) == 1:
        return possible_entities[0]
    score = []
    for e in possible_entities:
        # ignore: whether to skip OOV (out-of-vocabulary) words, i.e. words
        # seen at test time but not in training; when False, a random vector
        # is generated for them
        score.append(synonyms.compare(context, e, ignore=True))
    # return the most similar entity; score.index() gives its position
    return possible_entities[score.index(max(score))]
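A usage sketch of the disambiguation above (the entity strings are illustrative; the winner depends on the shipped word vectors):

context = "他在苹果公司上班"
candidates = ["苹果公司", "苹果(水果)"]
print(replace_ambiguous(context, candidates))  # likely "苹果公司"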
Example #27
    def test_similarity(self):
        '''
        Generate sentence similarity
        '''
        sen1 = "旗帜引领方向"
        sen2 = "道路决定命运"
        r = synonyms.compare(sen1, sen2, seg=True)
        print("旗帜引领方向 vs 道路决定命运:", r)
        # assert r == 0.0, "the similarity should be zero"

        sen1 = "旗帜引领方向"
        sen2 = "旗帜指引道路"
        r = synonyms.compare(sen1, sen2, seg=True)
        print("旗帜引领方向 vs 旗帜指引道路:", r)
        # assert r > 0, "the similarity should be bigger than zero"

        sen1 = "发生历史性变革"
        sen2 = "发生历史性变革"
        r = synonyms.compare(sen1, sen2, seg=True)
        print("发生历史性变革 vs 发生历史性变革:", r)
Example #28
 def analyze(self, data):
     index = 0
     result = {}
     for item in data:
         tomerge = item[0]
         index += 1
         result[tomerge] = 1
         for others in data[index:]:
             to_cmp = others[0]
             score = synonyms.compare(tomerge, to_cmp, seg=True)
             if score > 0.5:
                 result[tomerge] += 1
                 print('{}-{}-{}'.format(tomerge, to_cmp, score))
     return result  # the tally is otherwise discarded
Example #29
def match_judge(keyword, article):
    keyword_list = "/".join(jieba.cut(keyword, cut_all=True)).split('/')
    tags = models.Tag.objects.filter(contact_id=article.id)

    # check the tags
    for key_word in keyword_list:
        for tag in tags:
            if synonyms.compare(key_word, tag.标签, seg=True) > 0.7:
                return True

    # check the title words (文章标题 is the model's title field)
    title_words = "/".join(jieba.cut(article.文章标题, cut_all=True)).split('/')
    for key_word in keyword_list:
        for title_word in title_words:
            if synonyms.compare(key_word, title_word, seg=True) > 0.7:
                return True

    # check the abstract words (文章摘要 is the model's abstract field)
    abstract_words = "/".join(jieba.cut(article.文章摘要, cut_all=True)).split('/')
    for key_word in keyword_list:
        for abstract_word in abstract_words:
            if synonyms.compare(key_word, abstract_word, seg=True) > 0.7:
                return True
    return False
Example #30
    def _try(self, plot):
        if not self._filter_confession(plot):
            return 0
        max_sim = 0
        subs = plot.split(',')
        for sub in subs:
            if len(sub) < 1:
                continue
            # compare each comma-separated clause, not the whole plot
            sim = synonyms.compare(self.confession_key, sub, seg=True)
            if sim > max_sim:
                max_sim = sim

        return max_sim
Example #31
def webCompare(SourceDocString1):  # compare against web sources
    keywords = synonyms.keywords(SourceDocString1, topK=3)  # extract three keywords to search for
    print("document keywords:", keywords)
    site = "m.51test.net"  # search site, tentatively 51Test
    browser = webdriver.Edge(EdgeChromiumDriverManager().install())  # use the Edge browser
    browser.get("https://cn.bing.com/search?q=" + " " + keywords[0] + " " +
                keywords[1] + " " + keywords[2] + " site:" + site)  # Bing search over the three keywords
    search_links = []  # links of the search results
    resultcontent = []  # text content of the search results
    r = []  # list of similarity scores
    rstelmt = []  # result element list
    results = dict()  # dict used for output (note the trailing s!)
    key_list2 = []
    value_list2 = []
    site_contentdict = {}
    output = {}
    result = browser.find_elements_by_css_selector("h2>a")  # grab the result anchors
    for i in result[0:8]:  # pull links out of the top results
        if isMatch("https://" + site,
                   i.get_attribute("href")):  # skip items from other sites
            search_links.append(i.get_attribute("href"))  # collect the link
    for j in search_links:  # open each result link and extract its content
        browser.get(j)
        sitestr = ""  # reset per page so text does not accumulate across sites
        rstelmt = browser.find_elements_by_css_selector(
            "div#content-txt>p")  # result elements
        for k in rstelmt:  # text can only be read from the current page, hence the nested loop
            if len(k.get_attribute("textContent")) > 0:  # skip empty content
                sitestr += k.get_attribute("textContent")
        resultcontent.append(sitestr)  # this site's content
        site_contentdict[j] = sitestr
    browser.quit()  # close the browser
    for l in resultcontent[0:5]:  # the slice taken here is too small
        rel = synonyms.compare(SourceDocString1, l, seg=True, ignore=True)
        print(rel)
        results[rel] = l  # map each score to its text
        print(results)
        r.append(rel)  # sentence comparison
    ressorted = sorted(results.items(), key=lambda x: x[0], reverse=True)
    for key, value in site_contentdict.items():  # build a reverse lookup into the main dict
        key_list2.append(key)
        value_list2.append(value)
    print(ressorted)
    for m in ressorted[0:3]:
        val = m[1]  # look up the texts behind the top-three scores
        output[m[0]] = key_list2[value_list2.index(
            val)]  # add score: link to the output dict
    return output  # TODO: the return value still needs revising