# Assumed module-level setup (defined elsewhere in this project):
#   jieba_cut    - word segmentation / TF-IDF / cosine-similarity helpers
#   filter_tags  - strips HTML tags from article content
#   get_text     - fetches a URL and returns (resolved_url, parsed_page)
#   req_timeout, get_page_count - request timeout and pages-to-crawl settings
#   ua           - a fake_useragent.UserAgent() instance (for ua.random)
import re

import requests
from lxml import etree


def find_max_similar(data_col, db_obj):  # find the most similar page for each article
    headersParameters = {  # HTTP headers sent with each request, to masquerade as a browser
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': ua.random
    }
    for data in data_col:
        try:
            rid = data[0]  # article id
            title = data[1]  # article title
            title = re.sub('-', '_', re.sub(r'\(.*?\)', '', title)).split('_')
            if len(title[0]) < 5:
                title = "".join(title)
            else:
                title = title[0]
            content = data[2]  # article content
            content = filter_tags(content)
            # print('title:', title)
            # strips literal '\n' sequences; use '\n' instead if real newlines are meant
            content = content.replace(r'\n', '')
            # print('content:', content)
            tmp_url = 'http://www.baidu.com/s?wd="%s"' % title  # quoted search, first results page
            test = requests.get(tmp_url, headers=headersParameters,
                                timeout=req_timeout)
            root = etree.HTML(test.content)
            tmp = root.xpath("//h3[@class='t']/a")
            if not tmp:
                db_obj.update_compare_flag(rid, '2(crawl)')  # Baidu search returned no results
                continue
            summary = root.xpath(
                "//div[@class='c-abstract']|//div[@class='c-abstract c-abstract-en']"
            )
            res1 = jieba_cut.cut_word(content)
            parse_article_summary = ''
            try:
                for item in range(6):
                    # excerpt up to 25 chars on either side of a top keyword
                    tmp1 = re.search('.{0,25}%s.{0,25}' % res1[item][0],
                                     content.strip(), re.DOTALL)
                    if tmp1:
                        parse_article_summary = "...".join(
                            [parse_article_summary, tmp1.group(0)])
            except Exception:  # fewer than 6 keywords / no page content
                pass
            try:
                db_obj.insert_Article_List_Summary(parse_article_summary, rid)
            except Exception:
                pass
            # collect the "related searches" terms
            related_search = root.xpath("//div[@id='rs']/table/tr/th/a/text()")
            for j, i in enumerate(tmp):
                try:
                    url = i.get('href')
                    res_url, content_txt = get_text(url)  # may time out
                    res2 = jieba_cut.cut_word(content_txt.text)
                    vectors = jieba_cut.tf_idf(res1=res1, res2=res2)
                    similarity = jieba_cut.run(vector1=vectors[0],
                                               vector2=vectors[1])
                    # print(url, ':', similarity)
                    ciping_url = res2  # word-frequency pairs for this result page
                    summary_url = summary[j].xpath('string(.)')  # Baidu result snippet
                    max_simil_url = res_url
                    max_simil = similarity
                    content_url = content_txt.text
                    meta_description = content_txt.meta_description
                    meta_keywords = content_txt.meta_keywords
                    meta_title = content_txt.title
                    try:
                        article_summary = ''
                        try:
                            for item in range(6):
                                tmp = re.search(
                                    '.{0,25}%s.{0,25}' % ciping_url[item][0],
                                    content_url.strip(),
                                    re.DOTALL)  # excerpt around a top keyword
                                if tmp:
                                    article_summary = "...".join(
                                        [article_summary, tmp.group(0)])
                        except Exception:  # no page content
                            pass
                        tmp_ci = ''
                        for ci in ciping_url:
                            tmp_ci = "".join([
                                tmp_ci, ci[0], ":",
                                str(round(ci[1], 2)), '|'
                            ])
                        db_obj.insert_Compare_List(
                            rid, max_simil_url, summary_url, meta_description,
                            content_url, article_summary, tmp_ci,
                            round(max_simil, 2), meta_title,
                            ",".join(meta_keywords)
                        )  # record the article together with the most similar page found via search
                        db_obj.update_compare_flag(rid)  # flag the article as processed
                        for related_txt in related_search:  # store the related-search terms
                            db_obj.insert_Compare_Related(rid, related_txt)
                    except Exception as e:
                        print(e)
                        print('3: failed id:', rid, 'title:', meta_title,
                              'description:', meta_description,
                              'keywords:', meta_keywords)
                        print('article summary:', parse_article_summary)
                except Exception:
                    continue
            for _ in range(1, get_page_count):  # how many result pages to follow
                try:  # breaks when there is no next page
                    next_page = root.xpath(
                        "//div[@id='page']/a[@class='n']/@href")[-1]
                    next_page = 'http://www.baidu.com' + next_page
                    test = requests.get(next_page,
                                        headers=headersParameters,
                                        timeout=req_timeout)
                    root = etree.HTML(test.content)
                    # print(next_page)
                    tmp = root.xpath("//h3[@class='t']/a")
                    summary = root.xpath(
                        "//div[@class='c-abstract']|//div[@class='c-abstract c-abstract-en']"
                    )
                    for j, i in enumerate(tmp):
                        try:
                            url = i.get('href')
                            res_url, content_txt = get_text(url)
                            res2 = jieba_cut.cut_word(content_txt.text)
                            vectors = jieba_cut.tf_idf(res1=res1, res2=res2)
                            similarity = jieba_cut.run(vector1=vectors[0],
                                                       vector2=vectors[1])
                            ciping_url = res2
                            summary_url = summary[j].xpath(
                                'string(.)')  # Baidu result snippet
                            max_simil_url = res_url
                            max_simil = similarity
                            content_url = content_txt.text
                            meta_description = content_txt.meta_description
                            meta_keywords = content_txt.meta_keywords
                            meta_title = content_txt.title
                            try:
                                article_summary = ''
                                try:
                                    for item in range(6):
                                        tmp = re.search(
                                            '.{0,25}%s.{0,25}' %
                                            ciping_url[item][0],
                                            content_url.strip(),
                                            re.DOTALL)  # excerpt around a top keyword
                                        if tmp:
                                            article_summary += tmp.group(0)
                                except Exception:  # no page content
                                    pass
                                tmp_ci = ''
                                for ci in ciping_url:
                                    tmp_ci = "".join([
                                        tmp_ci, ci[0], ":",
                                        str(round(ci[1], 2)), '|'
                                    ])
                                # print('meta:', meta_title, 'meta1++', type(meta_keywords))
                                db_obj.insert_Compare_List(
                                    rid, max_simil_url, summary_url,
                                    meta_description, content_url,
                                    article_summary, tmp_ci,
                                    round(max_simil, 2), meta_title,
                                    ",".join(meta_keywords)
                                )  # record the article together with the most similar page found via search
                                db_obj.update_compare_flag(rid)  # flag the article as processed
                                for related_txt in related_search:  # store the related-search terms
                                    db_obj.insert_Compare_Related(
                                        rid, related_txt)
                            except Exception as e:
                                print(e)
                                print('3: failed id:', rid, 'title:',
                                      meta_title, 'description:',
                                      meta_description, 'keywords:',
                                      meta_keywords)
                                print('article summary:',
                                      parse_article_summary)
                        except Exception:
                            continue
                except Exception:
                    break
            """
            print('Baidu snippet:', summary_url)
            print('article content:', content_url)
            print('article content summary:', article_summary)
            print('article meta_description:', meta_description)
            print('Baidu word frequencies:', ciping_url)
            print('max similarity:', max_simil)
            print('max similarity url:', max_simil_url)
            print('article id:', rid)
            print('article title:', title)
            """
            # record to table
        except Exception as e:
            print('4:', e)
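
# Illustrative sketch (not part of the original module): the jieba_cut
# helpers used above are assumed to work roughly like this -- cut_word()
# yields (word, weight) pairs, tf_idf() projects two such lists onto a
# shared vocabulary, and run() returns their cosine similarity. All names
# and signatures below are assumptions for illustration only.
import math


def _to_vectors(res1, res2):
    # Build aligned weight vectors over the union vocabulary of both texts.
    w1, w2 = dict(res1), dict(res2)
    vocab = sorted(set(w1) | set(w2))
    return ([w1.get(w, 0.0) for w in vocab],
            [w2.get(w, 0.0) for w in vocab])


def _cosine(vector1, vector2):
    # cos(v1, v2) = (v1 . v2) / (|v1| * |v2|); 0.0 when either vector is empty.
    dot = sum(a * b for a, b in zip(vector1, vector2))
    norm = (math.sqrt(sum(a * a for a in vector1)) *
            math.sqrt(sum(b * b for b in vector2)))
    return dot / norm if norm else 0.0
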
# NOTE: this second definition shadows the richer find_max_similar above;
# it is a simpler variant that only keeps the single best-scoring match.
def find_max_similar(data_col, db_obj):  # find the maximum similarity
    headersParameters = {  # HTTP headers sent with each request, to masquerade as a browser
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'Mozilla/6.1 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }
    try:
        for data in data_col:
            max_simil = 0  # running maximum similarity
            max_simil_url = ''
            rid = data[0]  # article id
            title = data[1]  # article title
            title = re.sub('-', '_', re.sub(r'\(.*?\)', '', title)).split('_')
            if len(title[0]) < 5:  # was `tmp[0]` (a NameError: tmp is not defined yet)
                title = "".join(title)
            else:
                title = title[0]
            content = data[2]  # article content
            print('title:', title)
            print('content:', content)
            # strips literal '\n' sequences; use '\n' instead if real newlines are meant
            content = content.replace(r'\n', '')
            tmp_url = 'http://www.baidu.com/s?wd="%s"' % title  # quoted search, first results page
            test = requests.get(tmp_url, headers=headersParameters,
                                timeout=req_timeout)
            root = etree.HTML(test.content)
            tmp = root.xpath("//h3[@class='t']/a")
            res1 = jieba_cut.cut_word(content)
            for i in tmp:
                try:
                    url = i.get('href')
                    record_url, res2 = get_text(url)  # may time out
                except Exception:
                    continue
                res2 = jieba_cut.cut_word(res2)
                vectors = jieba_cut.tf_idf(res1=res1, res2=res2)
                similarity = jieba_cut.run(vector1=vectors[0],
                                           vector2=vectors[1])
                # print(url, ':', similarity)
                if max_simil <= similarity:
                    max_simil_url = record_url
                    max_simil = similarity
            for _ in range(1, get_page_count):  # how many result pages to follow
                try:  # breaks when there is no next page
                    next_page = root.xpath(
                        "//div[@id='page']/a[@class='n']/@href")[-1]
                    next_page = 'http://www.baidu.com' + next_page
                    test = requests.get(next_page,
                                        headers=headersParameters,
                                        timeout=req_timeout)
                    root = etree.HTML(test.content)
                    # print(next_page)
                    tmp = root.xpath("//h3[@class='t']/a")
                    for i in tmp:
                        url = i.get('href')
                        try:
                            record_url, res2 = get_text(url)
                        except Exception:
                            continue
                        res2 = jieba_cut.cut_word(res2)
                        vectors = jieba_cut.tf_idf(res1=res1, res2=res2)
                        similarity = jieba_cut.run(vector1=vectors[0],
                                                   vector2=vectors[1])
                        # print(url, ':', similarity)
                        if max_simil <= similarity:
                            max_simil_url = record_url
                            max_simil = similarity
                except Exception:
                    break
            """
            print('max similarity:', max_simil)
            print('max similarity url:', max_simil_url)
            print('article id:', rid)
            print('article title:', title)
            """
            # record to table
            try:
                db_obj.insert_Compare_List(rid, max_simil_url, max_simil)  # record the article and its most similar page
                db_obj.update_compare_flag(rid)  # flag the article as processed
            except Exception:
                pass
    except Exception as e:
        print(e)
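
# Illustrative usage sketch (hypothetical; the stub class and sample row are
# not part of the original code). data_col rows are assumed to be
# (rid, title, content) tuples, and db_obj is assumed to expose the
# insert_*/update_* methods called above. Note that running this performs
# live HTTP requests against baidu.com.
if __name__ == '__main__':

    class _StubDB:
        def insert_Compare_List(self, rid, url, simil):
            print('insert_Compare_List:', rid, url, simil)

        def update_compare_flag(self, rid, flag=None):
            print('update_compare_flag:', rid, flag)

    sample = [(1, 'example-article-title', 'Example article body text ...')]
    find_max_similar(sample, _StubDB())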