Example #1
0
    def download(self, url):
        """Fetch *url* and return the response body as text.

        First tries a direct request.  On failure, retries up to
        RETRY_TIME times, each attempt choosing randomly between the
        local IP (empty proxies dict) and one valid HTTP proxy from the
        database.  Returns an empty unicode string when every attempt
        fails.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=TIMEOUT)
            # Let chardet guess the encoding; many pages declare none.
            response.encoding = chardet.detect(response.content)['encoding']
            if response.ok:
                return response.text
            else:
                raise ConnectionError
        except ConnectionError:
            for _ in range(RETRY_TIME):
                # Candidate proxy settings: the local IP (empty dict) plus,
                # when available, one valid HTTP proxy from the database.
                proxies_list = [{}]
                condition_dict = {"is_valid": True, "protocol": HTTP_PROTOCOL}
                proxy_list = Client.select(1, condition_dict)
                if proxy_list:
                    # Reuse the row we already fetched instead of issuing a
                    # second, redundant select (which could return a
                    # different row than the one just tested).
                    proxy = proxy_list[0]
                    proxies_list.append(
                        {"http": "http://%s:%s" % (proxy.ip, proxy.port)})

                # Randomly choose between the local IP and the DB proxy.
                proxies = random.choice(proxies_list)
                try:
                    response = requests.get(url,
                                            headers=self.headers,
                                            proxies=proxies,
                                            timeout=TIMEOUT)
                except requests.RequestException:
                    # A failed attempt should consume one retry, not
                    # abort the whole retry loop.
                    continue
                response.encoding = chardet.detect(
                    response.content)['encoding']
                if response.ok:
                    return response.text
            else:
                # All retries exhausted without a successful response.
                return unicode()
Example #2
0
 def run(self):
     """Drain the proxy queue, validating each proxy and syncing the DB.

     Proxies that pass validation are promoted (is_valid=True) if not
     already valid; proxies that fail are deleted.
     """
     while True:
         try:
             candidate = self.queue.get_nowait()
         except Empty:
             # Queue exhausted -- this worker is done.
             break
         # Identifying key for DB updates/deletes on this proxy.
         key = {
             "ip": candidate.ip,
             "port": candidate.port,
             "protocol": candidate.protocol
         }
         scheme = "http" if candidate.protocol == HTTP_PROTOCOL else "https"
         if self.check_proxy(candidate):
             if not candidate.is_valid:
                 # Promote a previously-unvalidated proxy.
                 Client.update(key, {"is_valid": True})
             self.logger.info(
                 "ProxyCheck: %s://%s:%d validation pass" %
                 (scheme, candidate.ip, candidate.port))
         else:
             # Dead proxy: drop it from the database.
             Client.delete(key)
             self.logger.info(
                 "ProxyCheck: %s://%s:%d validation fail" %
                 (scheme, candidate.ip, candidate.port))
     self.logger.info("end")
 def get_proxy(cls):
     """Scrape the data5u proxy site and insert parsed proxies into the DB.

     Each proxy is stored with is_valid=False; a separate checker
     promotes it after validation.  Any scraping/parsing error is
     logged and swallowed (best-effort crawl).
     """
     request = HtmlDownloader()
     proxy_getter_logger.info("Start get data5u proxy")
     try:
         html = request.download(url=cls.url)
         soup = BeautifulSoup(html, features="html.parser")
         ul_list = soup.find_all("ul", class_="l2")
         for ul in ul_list:
             li_list = ul.find_all("li")
             # NOTE(review): li_list[2] is consulted for both "type" and
             # "protocol" below; the protocol usually lives in a different
             # column -- confirm against the live page layout.
             proxy_dict = {
                 "country": "China",
                 "ip": unicode(li_list[0].string),
                 "port": int(li_list[1].string),
                 "area": unicode(li_list[5].string),
                 "type":
                 TRANSPARENT if li_list[2].string == u"透明" else ANONYMOUS,
                 "protocol":
                 HTTPS_PROTOCOL
                 if li_list[2].string == "https" else HTTP_PROTOCOL,
                 "is_valid": False
             }
             proxy_getter_logger.info(proxy_dict)
             Client.insert(proxy_dict)
         proxy_getter_logger.info("End get data5u proxy successfully")
     except Exception as e:
         # .warning() -- .warn() is a deprecated alias.
         proxy_getter_logger.warning(
             "End get data5u proxy unsuccessfully, because %s" % e)
Example #4
0
 def get(self):
     """Return the first valid HTTPS proxy that passes a live check.

     Falls back to a failure message dict when no stored proxy
     survives validation.
     """
     condition_dict = {"is_valid": True, "protocol": HTTPS_PROTOCOL}
     for candidate in Client.select(0, condition_dict):
         scheme = "http" if candidate.protocol == HTTP_PROTOCOL else "https"
         if not check_proxy(candidate):
             # Dead proxy: log and move on to the next stored one.
             api_logger.info(
                 "ProxyCheck: %s://%s:%d validation fail" %
                 (scheme, candidate.ip, candidate.port))
             continue
         api_logger.info(
             "ProxyCheck: %s://%s:%d validation pass" %
             (scheme, candidate.ip, candidate.port))
         api_logger.info(
             "Return %s://%s:%d" %
             (scheme, candidate.ip, candidate.port))
         return {
             "message": "Get a https proxy successfully",
             "ip": candidate.ip,
             "port": candidate.port,
             "protocol": "https"
         }
     # No proxy survived validation.
     api_logger.info("Return None")
     return {
         "message": "Get a https proxy unsuccessfully",
     }
 def get_proxy(cls):
     """Scrape the Jiangxianli free-proxy site (pages 1-2) into the DB.

     Parsed proxies are inserted with is_valid=False for later
     validation.  Any scraping/parsing error is logged and swallowed
     (best-effort crawl).
     """
     request = HtmlDownloader()
     proxy_getter_logger.info("Start get Jiangxianli proxy")
     try:
         for page in range(1, 3):
             url = cls.url + "?page=" + str(page)
             html = request.download(url=url)
             # Extract one proxy per table row.
             soup = BeautifulSoup(html, features="html.parser")
             tbody = soup.find("tbody")
             tr_list = tbody.find_all("tr")
             for tr in tr_list:
                 td_list = tr.find_all("td")
                 proxy_dict = {
                     "country": "China",
                     "ip": unicode(td_list[1].string),
                     "port": int(td_list[2].string),
                     "area": unicode(td_list[5].string),
                     # BUG FIX: the original compared the <td> Tag object
                     # itself with a string (always False), classifying
                     # every proxy as ANONYMOUS; compare the cell text.
                     "type":
                     TRANSPARENT if td_list[3].string == u"透明" else ANONYMOUS,
                     "protocol":
                     HTTP_PROTOCOL
                     if td_list[4].string == "HTTP" else HTTPS_PROTOCOL,
                     "is_valid": False
                 }
                 proxy_getter_logger.info(proxy_dict)
                 Client.insert(proxy_dict)
         proxy_getter_logger.info("End get Jiangxianli proxy successfully")
     except Exception as e:
         # .warning() -- .warn() is a deprecated alias.
         proxy_getter_logger.warning(
             "End get Jiangxianli proxy unsuccessfully, because %s" % e)
Example #6
0
 def put_queue(self):
     """Feed every unchecked (is_valid=False) proxy into the raw queue."""
     unchecked = Client.select(count=0, condition_dict={"is_valid": False})
     for candidate in unchecked:
         self.raw_proxy_queue.put(candidate)
Example #7
0
    def get_proxy(cls):
        """Scrape the Xicidaili proxy site (transparent + anonymous lists).

        Crawls pages 1-2 of each listing, parses the #ip_list table, and
        inserts every proxy with is_valid=False.  A 2s pause between
        pages avoids the site's anti-crawling measures.  Errors are
        logged and swallowed (best-effort crawl).
        """
        request = HtmlDownloader()
        proxy_getter_logger.info("Start get Xicidaili proxy")

        def _crawl(base_url, proxy_type):
            # Crawl pages 1-2 of one listing; the two listings only
            # differ in URL and the proxy "type" constant, so the
            # previously duplicated loop bodies are shared here.
            for page in range(1, 3):
                url = base_url + str(page)
                proxy_getter_logger.info("get url %s" % url)
                html = request.download(url=url)
                soup = BeautifulSoup(html, features="html.parser")
                table = soup.find("table", id="ip_list")
                for tr in table.find_all("tr")[1:]:  # [1:] skips header row
                    td_list = tr.find_all("td")
                    proxy_dict = {
                        "country": "China",
                        "ip": unicode(td_list[1].string),
                        "port": int(td_list[2].string),
                        "area": unicode(td_list[3].find("a").string
                                        if td_list[3].find("a") else ""),
                        "type": proxy_type,
                        "protocol": HTTPS_PROTOCOL
                        if td_list[5].string == "HTTPS" else HTTP_PROTOCOL,
                        "is_valid": False
                    }
                    proxy_getter_logger.info(proxy_dict)
                    Client.insert(proxy_dict)
                # Pause 2s so the proxy site does not block us.
                time.sleep(2)

        try:
            _crawl(cls.transparent_url, TRANSPARENT)
            _crawl(cls.anonymous_url, ANONYMOUS)
            proxy_getter_logger.info("End get Xicidaili proxy successfully")
        except Exception as e:
            # .warning() -- .warn() is a deprecated alias.
            proxy_getter_logger.warning(
                "End get Xicidaili proxy unsuccessfully, because %s" % e)
    def get_proxy(cls):
        """Scrape the Xiladaili proxy site (transparent + anonymous lists).

        Crawls pages 1-2 of each listing and inserts every parsed proxy
        with is_valid=False.  Errors are logged and swallowed
        (best-effort crawl).
        """
        request = HtmlDownloader()
        proxy_getter_logger.info("Start get Xiladaili proxy")

        def _crawl(base_url, proxy_type):
            # Crawl pages 1-2 of one listing; the two listings only
            # differ in URL and the proxy "type" constant, so the
            # previously duplicated loop bodies are shared here.
            for page in range(1, 3):
                url = base_url + str(page)
                html = request.download(url=url)
                soup = BeautifulSoup(html, features="html.parser")
                tbody = soup.find("tbody")
                for tr in tbody.find_all("tr"):
                    td_list = tr.find_all("td")
                    # First cell carries "ip:port" -- split it once
                    # instead of twice.
                    parts = td_list[0].string.split(":")
                    proxy_dict = {
                        "country": "China",
                        "ip": unicode(parts[0]),
                        "port": int(parts[1]),
                        "area": unicode(td_list[3].string),
                        "type": proxy_type,
                        "protocol": HTTP_PROTOCOL
                        if td_list[1].string == u"HTTP代理" else HTTPS_PROTOCOL,
                        "is_valid": False
                    }
                    proxy_getter_logger.info(proxy_dict)
                    Client.insert(proxy_dict)

        try:
            _crawl(cls.transparent_url, TRANSPARENT)
            _crawl(cls.anonymous_url, ANONYMOUS)
            proxy_getter_logger.info("End get Xiladaili proxy successfully")
        except Exception as e:
            # .warning() -- .warn() is a deprecated alias.
            proxy_getter_logger.warning(
                "End get Xiladaili proxy unsuccessfully, because %s" % e)
 def put_queue(self):
     """Feed every validated (is_valid=True) proxy into the valid queue."""
     checked = Client.select(count=0, condition_dict={"is_valid": True})
     for candidate in checked:
         self.valid_proxy_queue.put(candidate)