def get_proxy(cls):
    """Scrape the data5u (wuyou) proxy site and insert each parsed proxy into the database."""
    downloader = HtmlDownloader()
    proxy_getter_logger.info("Start get data5u proxy")
    try:
        page = downloader.download(url=cls.url)
        soup = BeautifulSoup(page, features="html.parser")
        # Each <ul class="l2"> is one proxy row; its <li> cells hold the fields.
        for row in soup.find_all("ul", class_="l2"):
            cells = row.find_all("li")
            anonymity = TRANSPARENT if cells[2].string == u"透明" else ANONYMOUS
            # NOTE(review): cells[2] feeds both the anonymity check above and
            # the protocol check below — one of the two indices is probably
            # wrong; confirm against the site's actual markup.
            protocol = (HTTPS_PROTOCOL
                        if cells[2].string == "https" else HTTP_PROTOCOL)
            proxy_dict = {
                "country": "China",
                "ip": unicode(cells[0].string),
                "port": int(cells[1].string),
                "area": unicode(cells[5].string),
                "type": anonymity,
                "protocol": protocol,
                "is_valid": False,
            }
            proxy_getter_logger.info(proxy_dict)
            Client.insert(proxy_dict)
        proxy_getter_logger.info("End get data5u proxy successfully")
    except Exception as e:
        proxy_getter_logger.warn(
            "End get data5u proxy unsuccessfully, because %s" % e)
 def get_proxy(cls):
     """
     爬取免费IP代理库网站html,解析后获得代理将代理加入数据库
     """
     request = HtmlDownloader()
     proxy_getter_logger.info("Start get Jiangxianli proxy")
     try:
         for page in range(1, 3):
             url = cls.url + "?page=" + str(page)
             html = request.download(url=url)
             # 从html中提取proxy
             soup = BeautifulSoup(html, features="html.parser")
             tbody = soup.find("tbody")
             tr_list = tbody.find_all("tr")
             for tr in tr_list:
                 td_list = tr.find_all("td")
                 proxy_dict = {
                     "country":
                     "China",
                     "ip":
                     unicode(td_list[1].string),
                     "port":
                     int(td_list[2].string),
                     "area":
                     unicode(td_list[5].string),
                     "type":
                     TRANSPARENT if td_list[3] == u"透明" else ANONYMOUS,
                     "protocol":
                     HTTP_PROTOCOL
                     if td_list[4].string == "HTTP" else HTTPS_PROTOCOL,
                     "is_valid":
                     False
                 }
                 proxy_getter_logger.info(proxy_dict)
                 Client.insert(proxy_dict)
         proxy_getter_logger.info("End get Jiangxianli proxy successfully")
     except Exception as e:
         proxy_getter_logger.warn(
             "End get Jiangxianli proxy unsuccessfully, because %s" % e)
# Beispiel #3 (example marker left over from the source listing this code was pasted from)
    def get_proxy(cls):
        """Scrape the Xicidaili proxy site and insert the parsed proxies into the database.

        Crawls pages 1-2 of both the transparent and the high-anonymity
        listings; every data row of the ``ip_list`` table becomes one proxy
        record. Any exception aborts the run and is logged as a warning.
        """
        request = HtmlDownloader()
        proxy_getter_logger.info("Start get Xicidaili proxy")

        def _scrape_listing(base_url, proxy_type):
            # Download pages 1-2 of one listing and insert every parsed row.
            for page in range(1, 3):
                url = base_url + str(page)
                proxy_getter_logger.info("get url %s" % url)
                html = request.download(url=url)
                # Extract the proxies from the html.
                soup = BeautifulSoup(html, features="html.parser")
                table = soup.find("table", id="ip_list")
                # Skip the header row of the table.
                for tr in table.find_all("tr")[1:]:
                    td_list = tr.find_all("td")
                    proxy_dict = {
                        "country": "China",
                        "ip": unicode(td_list[1].string),
                        "port": int(td_list[2].string),
                        # The area cell only sometimes wraps the name in <a>.
                        "area": unicode(td_list[3].find("a").string
                                        if td_list[3].find("a") else ""),
                        "type": proxy_type,
                        "protocol": (HTTPS_PROTOCOL
                                     if td_list[5].string == "HTTPS"
                                     else HTTP_PROTOCOL),
                        "is_valid": False,
                    }
                    proxy_getter_logger.info(proxy_dict)
                    Client.insert(proxy_dict)
                # Pause 2s between pages to dodge the site's anti-scraping ban.
                time.sleep(2)

        try:
            _scrape_listing(cls.transparent_url, TRANSPARENT)
            _scrape_listing(cls.anonymous_url, ANONYMOUS)
            proxy_getter_logger.info("End get Xicidaili proxy successfully")
        except Exception as e:
            proxy_getter_logger.warn(
                "End get Xicidaili proxy unsuccessfully, because %s" % e)
    def get_proxy(cls):
        """Scrape the Xiladaili proxy site and insert the parsed proxies into the database.

        Crawls pages 1-2 of both the transparent and the high-anonymity
        listings. Each table row holds "ip:port" in its first cell.
        """
        request = HtmlDownloader()
        proxy_getter_logger.info("Start get Xiladaili proxy")

        def _scrape_listing(base_url, proxy_type):
            # Download pages 1-2 of one listing and insert every parsed row.
            for page in range(1, 3):
                url = base_url + str(page)
                html = request.download(url=url)
                # Extract the proxies from the html.
                soup = BeautifulSoup(html, features="html.parser")
                tbody = soup.find("tbody")
                for tr in tbody.find_all("tr"):
                    td_list = tr.find_all("td")
                    # First cell is "ip:port"; split it once instead of twice.
                    address = td_list[0].string.split(":")
                    proxy_dict = {
                        "country": "China",
                        "ip": unicode(address[0]),
                        "port": int(address[1]),
                        "area": unicode(td_list[3].string),
                        "type": proxy_type,
                        "protocol": (HTTP_PROTOCOL
                                     if td_list[1].string == u"HTTP代理"
                                     else HTTPS_PROTOCOL),
                        "is_valid": False,
                    }
                    proxy_getter_logger.info(proxy_dict)
                    Client.insert(proxy_dict)

        try:
            _scrape_listing(cls.transparent_url, TRANSPARENT)
            _scrape_listing(cls.anonymous_url, ANONYMOUS)
            proxy_getter_logger.info("End get Xiladaili proxy successfully")
        except Exception as e:
            proxy_getter_logger.warn(
                "End get Xiladaili proxy unsuccessfully, because %s" % e)