def get_details(self, url, muti, name, page, index_str, referer_url):
    resp = self.session.get(url, headers={
        "Referer": referer_url,
        "Cookie": self.get_cookie_string()
    })
    info(resp.headers.get("Set-Cookie", ""))
    resp.encoding = "utf8"
    if not self.check_if_need_relogin(resp):
        return
    bs_data = bs_4(resp.text, "lxml")
    uid, sid = Parser.get_uid_and_sid(bs_data)
    if not any((uid, sid)):
        error("uid&sid lost")
        return
    details, person, notice, img = Cursor.get_model_details(uid, sid)
    try:
        # person: create on first sight, otherwise update and reuse the uid
        person_datas, username = Parser.get_person_data(bs_data)
        if not person:
            person_datas.update({
                "uid": uid,
                "user": username,
                "regtime": Parser.get_reg_date(bs_data, "1999-01-01"),
            })
            person = Cursor.create_person(person_datas)
        else:
            Cursor.update_person(person_datas, uid)
            person = person[0].uid
        # notice: same create-or-reuse pattern, keyed by sid
        if not notice:
            notice = Cursor.create_notice({"sid": sid})
        else:
            notice = notice[0].sid
        # images: download only when the record does not exist yet
        detailImages = None
        detailContent = Parser.get_detail_content(bs_data)
        if not img:
            urls = Parser.get_img_urls(bs_data)
            img = Cursor.create_img({
                "sid": sid,
                "img": urls,
                "detail": detailContent
            })
            detailImages = self.save_pics(urls, sid)
        else:
            img = img[0].sid
        # details: create with the related keys, or update in place
        current_year = moment.now().year
        real_up_time = Parser.get_up_time(bs_data, current_year)
        details_datas = Parser.get_details(bs_data, current_year, real_up_time, muti)
        if not details:
            details_datas.update({
                "sid": sid,
                "user": person,
                "area": muti["area"],
                "title": muti["title"],
                "detailurl": url,
                "img": img,
                "notice": notice,
            })
            details = Cursor.create_details(details_datas)
            self.make_msg(details, detailContent, detailImages, sid, username)
        else:
            Cursor.update_details(details_datas, sid)
        short_msg = f'[{name}:{page}:{index_str}]-{real_up_time}- {muti["title"]}'
        success(short_msg) if not details else warning(short_msg)
    except KeyboardInterrupt:
        exit()
    except Exception as e:
        error(f"[run-->__get_details]: {e}")
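# --- Hedged illustration (not part of the project) --------------------------
# get_details() above follows a create-or-update ("upsert") pattern for each
# related record (person, notice, img, details): look the record up, create it
# from the parsed data when missing, otherwise update it and keep only its key.
# The minimal sketch below shows that control flow in isolation; the dict-backed
# store and the name upsert_record are hypothetical, not part of the Cursor API.
def upsert_record(store, key, parsed_data):
    # store: plain dict standing in for a database table keyed by uid/sid
    existing = store.get(key)
    if existing is None:
        store[key] = dict(parsed_data, key=key)  # create branch
    else:
        existing.update(parsed_data)             # update branch
    return key                                   # callers keep only the key

# usage sketch:
#   store = {}
#   upsert_record(store, "uid-1", {"user": "demo"})   # creates the row
#   upsert_record(store, "uid-1", {"user": "demo2"})  # updates it in place
# ----------------------------------------------------------------------------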
def clean_log(self, resp, lens=100):
    # Collapse all whitespace in the page's visible text and truncate it for logging.
    return (" ".join(bs_4(resp.text, "lxml").text.split()))[:lens] + "..."
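# Hedged usage sketch for clean_log (the URL and response are placeholders):
# it is handy for logging a readable, bounded snippet of a fetched page
# instead of dumping the raw HTML.
#   resp = self.session.get("https://example.com/some/page")
#   info(self.clean_log(resp))            # first 100 chars of visible text + "..."
#   info(self.clean_log(resp, lens=200))  # longer snippet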
def get_html(self, url_p="", dic_p={}, type_p='rg', chartset_p='utf-8', timeout_p=10):
    chartset_get = "n/a"  # character encoding detected for the fetched data
    headers_p = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"}
    txt = "nothing"
    # fetch the page source
    try:
        # requests.get mode
        if type_p == 'rg':
            html = requests.get(url=url_p, timeout=timeout_p, headers=headers_p)
            chartset_get = self.get_encodings_from_content(html.text)  # re-detect the encoding from the charset declared in the content
            if self.code_standard_is(chartset_p=chartset_get):
                print("<<source encoding detection [passed]>>")
                if chartset_get.lower() == "iso-8859-1":
                    print("rg mode, special handling for declared iso-8859-1 encoding")
                    try:
                        txt = html.content.decode("GBK")
                    except Exception:
                        txt = html.content.decode("gb2312")
                else:
                    print("rg mode, decoding with detected encoding " + chartset_get)
                    txt = html.content.decode(chartset_get)
            else:
                print("<<source encoding detection [failed]>>")
                txt = ""
            print("<<<rg>>> done:", " ", "source encoding:", chartset_get)
            html.close()
        # requests.get mode, returning raw bytes only
        if type_p == 'rg_byte':
            txt = b""
            html = requests.get(url=url_p, timeout=timeout_p, headers=headers_p)
            chartset_get = self.get_encodings_from_content(html.text)  # re-detect the encoding from the charset declared in the content
            txt = html.content
            print("<<<rg_byte>>> done:", " ", "source encoding:", chartset_get)
            html.close()
        # requests session POST mode
        if type_p == 'rp':
            conn_p = requests.session()
            rep_p = conn_p.post(url=url_p, data=dic_p, timeout=timeout_p, headers=headers_p)
            txt = rep_p.content
            chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore"))
            if self.code_standard_is(chartset_p=chartset_get):
                print("<<source encoding detection [passed]>>")
                txt = txt.decode(chartset_get, "ignore")
            else:
                print("<<source encoding detection [failed]>>")
                code_is = chardet.detect(txt)
                if code_is.get("encoding"):  # chardet may report encoding=None
                    chartset_get = code_is["encoding"]
                    txt = txt.decode(code_is["encoding"], "ignore")
            print("<<<rp>>> done:", " ", "source encoding:", chartset_get)
        # urllib GET mode
        if type_p == 'ug':
            html = urllib.request.urlopen(url=url_p)
            txt = html.read()
            chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore"))  # try decoding to read the encoding declared inside the content
            if self.code_standard_is(chartset_p=chartset_get):
                print("<<source encoding detection [passed]>>")
                txt = txt.decode(chartset_get, "ignore")
            else:
                # fall back to chardet detection
                print("<<source encoding detection [failed]>>")
                code_is = chardet.detect(txt)
                if code_is.get("encoding"):
                    chartset_get = code_is["encoding"]
                    txt = txt.decode(code_is["encoding"], "ignore")
            print("<<<ug>>> done:", " ", "source encoding:", chartset_get)
        # urllib POST mode
        if type_p == 'up':
            # url-encode the dict into a usable request body
            data_p = urllib.parse.urlencode(dic_p).encode('utf-8')
            # build a Request carrying our URL, data and headers
            request = urllib.request.Request(url_p, data_p, headers_p)
            # send the request
            txt = urllib.request.urlopen(request).read()
            chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore"))  # try decoding to read the encoding declared inside the content
            if self.code_standard_is(chartset_p=chartset_get):
                print("<<source encoding detection [passed]>>")
                txt = txt.decode(chartset_get, "ignore")
            else:
                # fall back to chardet detection
                print("<<source encoding detection [failed]>>")
                code_is = chardet.detect(txt)
                if code_is.get("encoding"):
                    chartset_get = code_is["encoding"]
                    txt = txt.decode(code_is["encoding"], "ignore")
            print("<<<up>>> done:", " ", "source encoding:", chartset_get)
        # shared session mode
        if type_p == 'ss':
            res_addr = self.session.get(url_p, timeout=timeout_p, headers=headers_p)
            res_addr.encoding = chardet.detect(res_addr.content)["encoding"]
            txt = bs_4(res_addr.text, "lxml")
            print("<<<ss>>> done:", " ", "source encoding:", chartset_get)
        # Selenium mode (to be improved)
        if type_p == 'se':
            self.driver.get(url_p)
            js = "var q=document.body.scrollTop=100000"
            self.driver.execute_script(js)
            self.driver.implicitly_wait(30)  # implicit wait: returns as soon as the page has loaded, up to 30 seconds
            txt = self.driver.page_source
            chartset_get = self.get_encodings_from_content(txt)
            print("<<<se>>> done:", " ", "source encoding:", chartset_get)
        # login mode (to be implemented)
        if type_p == 'lg':
            print("<<<lg>>> done:", " ", "source encoding:", chartset_get)
    except Exception as e:
        print("HTML fetch failed", e)
        html = requests.get(url=url_p, headers=headers_p)
        chartset_get = "n/a"
        print("last-resort handling, decoding with the default " + chartset_p + " encoding")
        try:
            txt = html.content.decode(chartset_p)
        except Exception:
            txt = html.content.decode("gbk")
        html.close()
    return txt, chartset_get  # return the html text plus the detected encoding
def crawler_sogou(self, searchUrl, keyword, plant, data_if=0):
    # shared MySQL connection (global interface)
    conn = Conn_mysql(
        host=config.dic_config["host_mysql"],
        port=int(config.dic_config["port_mysql"]),
        user=config.dic_config["user_mysql"],
        passwd=config.dic_config["pwd_mysql"],
        db="lqab_basedata_" + config.dic_config["name_mysql_after"]
    )
    chartset_code = ""
    # fetch the search result page with the custom HtmlSource helper
    source = HtmlSource()
    html, chartset_code = source.get_html(url_p=searchUrl, type_p='rg')
    # parse the HTML string with BeautifulSoup and the lxml parser
    soup = bs_4(html, 'lxml')
    # extract the result list (note the difference between find and findAll)
    result_list = soup.find("div", id="main").find("div", class_="results").findAll("div", class_="vrwrap")
    dic_t = {}
    url = ""
    i = 1
    for result in result_list:
        print("\n\n<", i, "> processing result")
        # print(result)
        content = b""
        dic_t[i] = {"title": "", "url": "", "summary": "", "snapshot": "", "content": "", "chartset": ""}
        try:
            if result.find("h3", class_="vrTitle") is not None:
                title = result.find("h3", class_="vrTitle").get_text()
            else:
                title = ''
            dic_t[i]["title"] = title
            if result.find("div", class_="fb") is not None and result.find("div", class_="fb").find("a") is not None:
                snapshot = result.find("div", class_="fb").find("a").get("href")
            else:
                snapshot = ''
            dic_t[i]["snapshot"] = snapshot
            content, chartset_code = source.get_html(url_p=snapshot, type_p='rg_byte')
            if result.find("h3") is not None:
                url = "https://www.sogou.com" + result.find("h3").find("a").get("href")
            else:
                url = ''
            dic_t[i]["url"] = url
            if content.strip() == b"":  # compare against bytes: rg_byte returns bytes
                content, chartset_code = source.get_html(url_p=url, type_p='rg')
            dic_t[i]["chartset"] = chartset_code
            if result.find(class_="str_info") is not None:
                summary = result.find(class_="str_info").get_text()
            elif result.find(class_="str-text-info") is not None:
                summary = result.find(class_="str-text-info").get_text()
            else:
                summary = ''
            dic_t[i]["summary"] = summary
            if content == b"":
                content, chartset_code = source.get_html(url_p=url, type_p='rg_byte')
            if content != b"":
                # decode with the detected encoding
                if source.code_standard_is(chartset_p=chartset_code):
                    print("<<source encoding detection [passed]>>")
                    try:
                        content = content.decode(chartset_code)
                    except Exception as e:
                        print("decode error on second fetch:", e)
                        chartset_code += "_fail"
                        content = str(content)
                else:
                    chartset_code += "_fail"
                    content = str(content)
                dic_t[i]["chartset"] = chartset_code
                dic_t[i]["content"] = content  # plain-text page content
        except Exception as e:
            print(e)
        time.sleep(1)  # pause one second between results
        i += 1
    conn.close()
    return dic_t
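# --- Hedged refactoring sketch (not part of the project) --------------------
# The repeated "find(...) is not None" checks in the crawler loops could be
# wrapped in small helpers so a missing node simply yields an empty string.
# safe_text and safe_href are hypothetical names used only for illustration.
def safe_text(node, *args, **kwargs):
    found = node.find(*args, **kwargs) if node is not None else None
    return found.get_text() if found is not None else ""

def safe_href(node, *args, **kwargs):
    found = node.find(*args, **kwargs) if node is not None else None
    return found.get("href") if found is not None else ""

# usage sketch:
#   title = safe_text(result, "h3", class_="vrTitle")
#   snapshot = safe_href(result.find("div", class_="fb"), "a")
# ----------------------------------------------------------------------------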
def crawler_360(self, searchUrl, keyword, plant, data_if=0):
    # shared MySQL connection (global interface)
    conn = Conn_mysql(
        host=config.dic_config["host_mysql"],
        port=int(config.dic_config["port_mysql"]),
        user=config.dic_config["user_mysql"],
        passwd=config.dic_config["pwd_mysql"],
        db="lqab_basedata_" + config.dic_config["name_mysql_after"]
    )
    chartset_code = ""
    # fetch the search result page with the custom HtmlSource helper
    source = HtmlSource()
    html, chartset_code = source.get_html(url_p=searchUrl, type_p='rg')
    # parse the HTML string with BeautifulSoup and the lxml parser
    soup = bs_4(html, 'lxml')
    # extract the result list (note the difference between find and findAll)
    result_list = soup.find("div", id="container").find("ul", class_="result").findAll("li")
    dic_t = {}
    i = 1
    for result in result_list:
        print("\n\n<", i, "> processing result")
        content = b""
        dic_t[i] = {"title": "", "url": "", "summary": "", "snapshot": "", "content": "", "chartset": ""}
        if result.get("data-urlfp") is not None:
            title = result.find("h3").get_text()
            dic_t[i]["title"] = title
            url = result.find("h3").find("a").get("href")
            dic_t[i]["url"] = url
            if result.find("p", class_="res-desc") is not None:
                summary = result.find("p", class_="res-desc").get_text()
            else:
                summary = ''
            dic_t[i]["summary"] = summary
            if result.find("p", class_="res-linkinfo").find("a", class_="m") is not None:
                snapshot = result.find("p", class_="res-linkinfo").find("a", class_="m").get("href")
            else:
                snapshot = ''
            dic_t[i]["snapshot"] = snapshot
            if content.strip() == b"" and summary != '':  # compare against bytes
                content, chartset_code = source.get_html(url_p=url, type_p='rg')
            dic_t[i]["chartset"] = chartset_code
            if content == b"" and url != '':
                content, chartset_code = source.get_html(url_p=url, type_p='rg_byte')
            if content != b"":
                # decode with the detected encoding
                if source.code_standard_is(chartset_p=chartset_code):
                    print("<<source encoding detection [passed]>>")
                    try:
                        content = content.decode(chartset_code)
                    except Exception as e:
                        print("decode error on second fetch:", e)
                        chartset_code += "_fail"
                        content = str(content)
                else:
                    chartset_code += "_fail"
                    content = str(content)
                dic_t[i]["chartset"] = chartset_code
                dic_t[i]["content"] = content  # plain-text page content
            time.sleep(1)  # pause one second between results
        if i > 20:
            break  # circuit breaker: stop after 20 entries to guard against long lists of empty items
        else:
            i += 1
    conn.close()
    return dic_t
def crawler_baidu(self, searchUrl, keyword, plant, data_if=0):
    # shared MySQL connection (global interface)
    conn = Conn_mysql(
        host=config.dic_config["host_mysql"],
        port=int(config.dic_config["port_mysql"]),
        user=config.dic_config["user_mysql"],
        passwd=config.dic_config["pwd_mysql"],
        db="lqab_basedata_" + config.dic_config["name_mysql_after"]
    )
    chartset_code = ""
    # fetch the search result page with the custom HtmlSource helper
    source = HtmlSource()
    html, chartset_code = source.get_html(url_p=searchUrl, type_p='rg')
    # parse the HTML string with BeautifulSoup and the lxml parser
    soup = bs_4(html, 'lxml')
    # extract the result list (note the difference between find and findAll)
    result_list = soup.find("div", id="content_left").findAll("div", srcid="1599", tpl="se_com_default")
    dic_t = {}
    i = 1
    for result in result_list:
        print("\n\n<", i, "> processing result")
        # print(result)
        content = b""
        dic_t[i] = {"title": "", "url": "", "summary": "", "snapshot": "", "content": "", "chartset": ""}
        title = result.find("h3").get_text()
        dic_t[i]["title"] = title
        try:
            snapshot = result.find("div", class_="f13").find("a", class_="m").get("href")
            dic_t[i]["snapshot"] = snapshot
            content, chartset_code = source.get_html(url_p=snapshot, type_p='rg_byte')
        except Exception as e:
            print(e)
        url = result.find("h3").find("a").get("href")
        dic_t[i]["url"] = url
        if content.strip() == b"":  # compare against bytes: rg_byte returns bytes
            content, chartset_code = source.get_html(url_p=url, type_p='rg')
        dic_t[i]["chartset"] = chartset_code
        summary = result.find("div", class_="c-abstract").get_text()
        dic_t[i]["summary"] = summary
        if content == b"":
            content, chartset_code = source.get_html(url_p=url, type_p='rg_byte')
        if content != b"":
            # decode with the detected encoding
            if source.code_standard_is(chartset_p=chartset_code):
                print("<<source encoding detection [passed]>>")
                try:
                    content = content.decode(chartset_code)
                except Exception as e:
                    print("decode error on second fetch:", e)
                    chartset_code += "_fail"
                    content = str(content)
            else:
                chartset_code += "_fail"
                content = str(content)
            dic_t[i]["chartset"] = chartset_code
            dic_t[i]["content"] = content  # plain-text page content
        time.sleep(1)  # pause one second between results
        i += 1
    conn.close()
    return dic_t
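# Hedged usage sketch (the crawler instance and search URL are placeholders):
# all three crawler_* methods return the same dict-of-dicts keyed by the
# 1-based result index, so callers can consume them uniformly.
#   results = crawler.crawler_baidu(search_url, keyword, plant)
#   for idx, item in results.items():
#       print(idx, item["title"], item["url"], item["chartset"])
#       # item["content"] holds the fetched page text, or "" if the fetch failed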