Example #1
    def get_details(self, url, muti, name, page, index_str, referer_url):
        resp = self.session.get(url,
                                headers={
                                    "Referer": referer_url,
                                    "Cookie": self.get_cookie_string()
                                })
        info(resp.headers.get("Set-Cookie", ""))
        resp.encoding = "utf8"
        if not self.check_if_need_relogin(resp):
            return

        bs_data = bs_4(resp.text, "lxml")
        uid, sid = Parser.get_uid_and_sid(bs_data)

        if not any((uid, sid)):
            error("uid&sid lost")
            return

        details, person, notice, img = Cursor.get_model_details(uid, sid)
        try:
            person_datas, username = Parser.get_person_data(bs_data)
            if not person:
                person_datas.update({
                    "uid": uid,
                    "user": username,
                    "regtime": Parser.get_reg_date(bs_data, "1999-01-01"),
                })
                person = Cursor.create_person(person_datas)

            else:
                Cursor.update_person(person_datas, uid)
                person = person[0].uid

            if not notice:
                notice = Cursor.create_notice({"sid": sid})
            else:
                notice = notice[0].sid

            detailImages = None
            detailContent = Parser.get_detail_content(bs_data)
            if not img:
                urls = Parser.get_img_urls(bs_data)
                img = Cursor.create_img({
                    "sid": sid,
                    "img": urls,
                    "detail": detailContent
                })
                detailImages = self.save_pics(urls, sid)
            else:
                img = img[0].sid
            current_year = moment.now().year
            real_up_time = Parser.get_up_time(bs_data, current_year)
            details_datas = Parser.get_details(bs_data, current_year,
                                               real_up_time, muti)
            if not details:
                details_datas.update({
                    "sid": sid,
                    "user": person,
                    "area": muti["area"],
                    "title": muti["title"],
                    "detailurl": url,
                    "img": img,
                    "notice": notice,
                })
                details = Cursor.create_details(details_datas)
                self.make_msg(details, detailContent, detailImages, sid,
                              username)
            else:
                Cursor.update_details(details_datas, sid)

            short_msg = f'[{name}:{page}:{index_str}]-{real_up_time}- {muti["title"]}'
            if not details:
                success(short_msg)
            else:
                warning(short_msg)

        except KeyboardInterrupt:
            exit()
        except Exception as e:
            error(f"[run-->__get_details]: {e}")
Example #2
    def clean_log(self, resp, lens=100):
        # Collapse the page text to single-spaced words and truncate to `lens` characters for compact logging.
        return (" ".join(bs_4(resp.text, "lxml").text.split()))[:lens] + "..."
Example #3
    def get_html(self, url_p="", dic_p=None, type_p='rg', chartset_p='utf-8', timeout_p=10):

        dic_p = dic_p if dic_p is not None else {}  # avoid a mutable default argument
        chartset_get = "n/a"  # character encoding detected for the fetched page
        headers_p = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"}
        txt = "nothing"
                
        # fetch the page source
        try:

            # requests.get method
            if (type_p == 'rg'):

                html = requests.get(url=url_p, timeout=timeout_p, headers=headers_p)
                chartset_get = self.get_encodings_from_content(html.text)  # re-detect the encoding from the charset declared in the page text
                
                if (self.code_standard_is(chartset_p=chartset_get)):

                    print("<<source encoding detection [passed]>>")

                    if (chartset_get.lower() == "iso-8859-1"):

                        print("rg mode, special handling for a declared iso-8859-1 encoding")
                        try:
                            txt = html.content.decode("GBK")
                        except UnicodeDecodeError:
                            txt = html.content.decode("gb2312")

                    else:

                        print("rg mode, decoding with the detected encoding " + chartset_get)
                        txt = html.content.decode(chartset_get)

                else:

                    print("<<source encoding detection [failed]>>")

                    txt = ""

                print("<<<rg>>> step:", " ", "source encoding:", chartset_get)
                html.close()
            
            # requests.get method, returning raw bytes only
            if (type_p == 'rg_byte'):
                txt = b""
                html = requests.get(url=url_p, timeout=timeout_p, headers=headers_p)
                chartset_get = self.get_encodings_from_content(html.text)  # re-detect the encoding from the charset declared in the page text
                txt = html.content
                print("<<<rg_byte>>> step:", " ", "source encoding:", chartset_get)
                html.close()
            
            # POST via a requests session
            if (type_p == 'rp'):

                conn_p = requests.session()
                rep_p = conn_p.post(url=url_p, data=dic_p, timeout=timeout_p, headers=headers_p)
                txt = rep_p.content
                chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore"))
                if (self.code_standard_is(chartset_p=chartset_get)):
                    print("<<source encoding detection [passed]>>")
                    txt = txt.decode(chartset_get, "ignore")
                else:
                    print("<<source encoding detection [failed]>>")
                    code_is = chardet.detect(txt)
                    if (code_is.get("encoding")):
                        chartset_get = code_is["encoding"]
                        txt = txt.decode(code_is["encoding"], "ignore")

                print("<<<rp>>> step:", " ", "source encoding:", chartset_get)
                
            # urllib GET method
            if (type_p == 'ug'):

                html = urllib.request.urlopen(url=url_p)
                txt = html.read()
                chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore"))  # tentative decode to read the encoding declared in the page
                if (self.code_standard_is(chartset_p=chartset_get)):
                    print("<<source encoding detection [passed]>>")
                    txt = txt.decode(chartset_get, "ignore")
                else:
                    # fall back to byte-level detection
                    print("<<source encoding detection [failed]>>")
                    code_is = chardet.detect(txt)
                    if (code_is.get("encoding")):
                        chartset_get = code_is["encoding"]
                        txt = txt.decode(code_is["encoding"], "ignore")

                print("<<<ug>>> step:", " ", "source encoding:", chartset_get)
            
            # urllib POST method
            if (type_p == 'up'):

                # encode the parameter dict into a POST body
                data_p = urllib.parse.urlencode(dic_p).encode('utf-8')
                # build a Request from the URL, data and headers
                request = urllib.request.Request(url_p, data_p, headers_p)
                # send the request
                txt = urllib.request.urlopen(request).read()
                chartset_get = self.get_encodings_from_content(txt.decode(chartset_p, "ignore"))  # tentative decode to read the encoding declared in the page
                if (self.code_standard_is(chartset_p=chartset_get)):
                    print("<<source encoding detection [passed]>>")
                    txt = txt.decode(chartset_get, "ignore")
                else:
                    # fall back to byte-level detection
                    print("<<source encoding detection [failed]>>")
                    code_is = chardet.detect(txt)
                    if (code_is.get("encoding")):
                        chartset_get = code_is["encoding"]
                        txt = txt.decode(code_is["encoding"], "ignore")

                print("<<<up>>> step:", " ", "source encoding:", chartset_get)

                
            # session method
            if (type_p == 'ss'):
                res_addr = self.session.get(url_p, timeout=timeout_p, headers=headers_p)
                res_addr.encoding = chardet.detect(res_addr.content)["encoding"]
                txt = bs_4(res_addr.text, "lxml")
                print("<<<ss>>> step:", " ", "source encoding:", chartset_get)

            # Selenium method, still to be refined
            if (type_p == 'se'):
                self.driver.get(url_p)
                js = "var q=document.body.scrollTop=100000"
                self.driver.execute_script(js)
                self.driver.implicitly_wait(30)  # implicit wait of up to 30 seconds; returns as soon as the page data has loaded
                txt = self.driver.page_source
                chartset_get = self.get_encodings_from_content(txt)
                print("<<<se>>> step:", " ", "source encoding:", chartset_get)

            # login method, still to be implemented
            if (type_p == 'lg'):
                print("<<<lg>>> step:", " ", "source encoding:", chartset_get)

        except Exception as e:

            print("html fetch failed", e)

            html = requests.get(url=url_p, headers=headers_p)
            chartset_get = "n/a"
            print("last-resort handling: decoding the output with the default " + chartset_p + " encoding")
            try:
                txt = html.content.decode(chartset_p)
            except (UnicodeDecodeError, LookupError):
                txt = html.content.decode("gbk")

            html.close()

        return txt, chartset_get  # return the html as text together with the detected encoding
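A condensed sketch of the decode strategy get_html applies in its requests-based branches, assuming the requests and chardet packages: prefer the charset declared in the page, otherwise fall back to chardet. The fetch_text helper and its regex are illustrative, not part of the original class:

import re
import requests
import chardet

def fetch_text(url, timeout=10):
    resp = requests.get(url, timeout=timeout)
    # Look for a charset declaration near the top of the document.
    m = re.search(rb'charset=["\']?([\w-]+)', resp.content[:2048], re.IGNORECASE)
    if m:
        encoding = m.group(1).decode("ascii", "ignore")
    else:
        # No declared charset: guess from the raw bytes.
        encoding = chardet.detect(resp.content)["encoding"] or "utf-8"
    return resp.content.decode(encoding, "ignore"), encoding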
Example #4
    def crawler_sogou(self,searchUrl,keyword,plant,data_if=0):
        
        # shared database connection (global interface)
        conn = Conn_mysql(
            host=config.dic_config["host_mysql"],
            port=int(config.dic_config["port_mysql"]),
            user=config.dic_config["user_mysql"],
            passwd=config.dic_config["pwd_mysql"],
            db="lqab_basedata_" + config.dic_config["name_mysql_after"]
        )
        chartset_code = ""
        # fetch the page with the custom source-fetching helper
        source = HtmlSource()

        html, chartset_code = source.get_html(url_p=searchUrl, type_p='rg')
        # parse the html string with BeautifulSoup; 'lxml' still works on xPath under the hood
        soup = bs_4(html, 'lxml')
        # parse the result list with BeautifulSoup; note the difference between find and findAll
        result_list = soup.find("div", id="main").find("div", class_="results").findAll("div", class_="vrwrap")
        
        dic_t = {}
        url = ""
        i = 1
        for result in result_list:
        
            print ("\n\n<",i,">步子处理。")
            #print(result)
            content = b""
            dic_t[i] = {"title":"","url":"","summary":"","snapshot":"","content":"","chartset":""}
            
            try:
                
                if(result.find("h3",class_="vrTitle")!=None):
                    title = result.find("h3",class_="vrTitle").get_text()
                else:
                    title = ''
                dic_t[i]["title"] = title
            
                if(result.find("div",class_="fb")!=None):
                
                    if(result.find("div",class_="fb").find("a")!=None):
                        snapshot = result.find("div",class_="fb").find("a").get("href")
                    else:
                        snapshot=''
                else:
                    snapshot=''
                dic_t[i]["snapshot"] = snapshot
                content,chartset_code = source.get_html(url_p=snapshot,type_p='rg_byte')
                
                if(result.find("h3")!=None):
                    url = "https://www.sogou.com"+result.find("h3").find("a").get("href")
                else:
                    url=''
                
                dic_t[i]["url"] = url
            
                if (not content.strip()):  # works whether content is str or bytes
                    content,chartset_code = source.get_html(url_p=url,type_p='rg')
                    dic_t[i]["chartset"] = chartset_code
                
                if(result.find(class_="str_info")!=None):
                    summary = result.find(class_="str_info").get_text()
                else:
                    if(result.find(class_="str-text-info")!=None):
                        summary = result.find(class_="str-text-info").get_text()
                    else:
                        summary = ''
                dic_t[i]["summary"] = summary
            
                if (content == b""):
                    content,chartset_code = source.get_html(url_p=url,type_p='rg_byte')
            
                if (content != b""):
            
                    # decode using the detected encoding
                    if (source.code_standard_is(chartset_p=chartset_code)):
                        print("<<source encoding detection [passed]>>")

                        try:
                            content = content.decode(chartset_code)
                        except Exception as e:
                            print("second fetch decode error:", e)
                            chartset_code += "_fail"
                            content = str(content)

                    else:

                        chartset_code += "_fail"
                        content = str(content)
                    
                dic_t[i]["chartset"] = chartset_code
                dic_t[i]["content"] = content  # 提取纯文本
            
            except Exception as e:
                
                print (e)
                
            time.sleep(1)  # pause one second between results
            i += 1
            
        conn.close()
        
        return dic_t
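A small helper in the spirit of the repeated `result.find(...) != None` checks above; `safe_text` is a hypothetical convenience, not part of the original crawler:

from bs4 import BeautifulSoup

def safe_text(node, *args, default="", **kwargs):
    # Return the text of the first matching child, or a default if it is missing.
    found = node.find(*args, **kwargs) if node is not None else None
    return found.get_text() if found is not None else default

result = BeautifulSoup('<div class="vrwrap"><h3 class="vrTitle">Title</h3></div>', "lxml").find("div", class_="vrwrap")
print(safe_text(result, "h3", class_="vrTitle"))  # "Title"
print(safe_text(result, "div", class_="fb"))      # "" because the node is absent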
Example #5
    def crawler_360(self,searchUrl,keyword,plant,data_if=0):
        
        # shared database connection (global interface)
        conn = Conn_mysql(
            host=config.dic_config["host_mysql"],
            port=int(config.dic_config["port_mysql"]),
            user=config.dic_config["user_mysql"],
            passwd=config.dic_config["pwd_mysql"],
            db="lqab_basedata_" + config.dic_config["name_mysql_after"]
        )
        chartset_code = ""
        # fetch the page with the custom source-fetching helper
        source = HtmlSource()
        html, chartset_code = source.get_html(url_p=searchUrl, type_p='rg')
        # parse the html string with BeautifulSoup; 'lxml' still works on xPath under the hood
        soup = bs_4(html, 'lxml')
        # parse the result list with BeautifulSoup; note the difference between find and findAll
        result_list = soup.find("div", id="container").find("ul", class_="result").findAll("li")
        dic_t = {}
        i = 1
        for result in result_list:
        
            print ("\n\n<",i,">步子处理。")
            content = b""
            
            dic_t[i] = {"title":"","url":"","summary":"","snapshot":"","content":"","chartset":""}
            
            if(result.get("data-urlfp") != None):
            
                title = result.find("h3").get_text()
                dic_t[i]["title"] = title
                
                url = result.find("h3").find("a").get("href")
                dic_t[i]["url"] = url
                
                if(result.find("p", class_="res-desc")!=None):
                    summary = result.find("p", class_="res-desc").get_text()
                else:
                    summary = ''
                dic_t[i]["summary"] = summary
                
                if(result.find("p", class_="res-linkinfo").find("a", class_="m")!=None):
                    snapshot = result.find("p", class_="res-linkinfo").find("a", class_="m").get("href")
                else:
                    snapshot=''
                dic_t[i]["snapshot"] = snapshot
                
                if (not content.strip() and summary != ''):  # works whether content is str or bytes
                    content,chartset_code = source.get_html(url_p=url,type_p='rg')
                    dic_t[i]["chartset"] = chartset_code

                if (content == b"" and url != ''):
                    content,chartset_code = source.get_html(url_p=url,type_p='rg_byte')
            
                if (content != b""):
            
                    # decode using the detected encoding
                    if (source.code_standard_is(chartset_p=chartset_code)):
                        print("<<source encoding detection [passed]>>")
                    
                        try:
                            content = content.decode(chartset_code)
                        except Exception as e:
                            print ("二次爬取编码错误:",e)
                            chartset_code += "_fail" 
                            content = str(content)
                
                    else:
                
                        chartset_code += "_fail" 
                        content = str(content)
                    
            dic_t[i]["chartset"] = chartset_code
            dic_t[i]["content"] = content  # 提取纯文本
            
            time.sleep(1)  # pause one second between results
            if (i > 20):
                break  # circuit breaker: stop after 20 results
            else:
                i += 1
            
        conn.close()
        
        return dic_t
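The per-result pacing and the `i > 20` cut-off above can also be written with enumerate and islice; a hedged sketch, not the original code:

import time
from itertools import islice

def crawl_results(results, limit=20, delay=1.0):
    out = {}
    for i, result in enumerate(islice(results, limit), start=1):
        out[i] = {"title": getattr(result, "text", str(result))}
        time.sleep(delay)  # stay polite between page fetches
    return out

print(crawl_results(["a", "b", "c"], limit=2, delay=0))  # {1: {'title': 'a'}, 2: {'title': 'b'}}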
Example #6
    def crawler_baidu(self, searchUrl, keyword, plant, data_if=0):

        # shared database connection (global interface)
        conn = Conn_mysql(
            host=config.dic_config["host_mysql"],
            port=int(config.dic_config["port_mysql"]),
            user=config.dic_config["user_mysql"],
            passwd=config.dic_config["pwd_mysql"],
            db="lqab_basedata_" + config.dic_config["name_mysql_after"]
        )
        chartset_code = ""
        # fetch the page with the custom source-fetching helper
        source = HtmlSource()
        html, chartset_code = source.get_html(url_p=searchUrl, type_p='rg')
        # parse the html string with BeautifulSoup; 'lxml' still works on xPath under the hood
        soup = bs_4(html, 'lxml')
        # parse the result list with BeautifulSoup; note the difference between find and findAll
        result_list = soup.find("div", id="content_left").findAll("div", srcid="1599", tpl="se_com_default")
        dic_t = {}
        i = 1
        for result in result_list:

            print("\n\n<", i, "> processing result.")
            #print(result)
            content = b""
            dic_t[i] = {"title": "", "url": "", "summary": "", "snapshot": "", "content": "", "chartset": ""}

            title = result.find("h3").get_text()
            dic_t[i]["title"] = title

            try:

                snapshot = result.find("div", class_="f13").find("a", class_="m").get("href")
                dic_t[i]["snapshot"] = snapshot
                content, chartset_code = source.get_html(url_p=snapshot, type_p='rg_byte')

            except Exception as e:
                print(e)

            url = result.find("h3").find("a").get("href")
            dic_t[i]["url"] = url
            if (not content.strip()):  # works whether content is str or bytes
                content, chartset_code = source.get_html(url_p=url, type_p='rg')
                dic_t[i]["chartset"] = chartset_code

            summary = result.find("div", class_="c-abstract").get_text()
            dic_t[i]["summary"] = summary

            if (content == b""):
                content, chartset_code = source.get_html(url_p=url, type_p='rg_byte')

            if (content != b""):

                # decode using the detected encoding
                if (source.code_standard_is(chartset_p=chartset_code)):
                    print("<<source encoding detection [passed]>>")

                    try:
                        content = content.decode(chartset_code)
                    except Exception as e:
                        print("second fetch decode error:", e)
                        chartset_code += "_fail"
                        content = str(content)

                else:

                    chartset_code += "_fail"
                    content = str(content)

            dic_t[i]["chartset"] = chartset_code
            dic_t[i]["content"] = content  # store the extracted text

            time.sleep(1)  # pause one second between results
            i += 1

        conn.close()

        return dic_t