def download(self, url):
    """Fetch *url* and return its html text, decoded per detected encoding.

    First performs a direct request; on ConnectionError it retries up to
    RETRY_TIME times, each attempt choosing at random between the local IP
    (an empty ``proxies`` dict) and one valid HTTP proxy taken from the
    database.  Returns an empty unicode string when every attempt fails.
    """
    try:
        response = requests.get(url, headers=self.headers, timeout=TIMEOUT)
        # Sites often mis-declare their charset; trust chardet instead.
        response.encoding = chardet.detect(response.content)['encoding']
        if response.ok:
            return response.text
        else:
            raise ConnectionError
    except ConnectionError:
        for retry_count in range(RETRY_TIME):
            # {} means "no proxy" -- i.e. use the local IP directly.
            proxies_list = [{}]
            # Also offer one valid HTTP proxy from the database, if any.
            condition_dict = {"is_valid": True, "protocol": HTTP_PROTOCOL}
            # BUG FIX: the original called Client.select twice (once for
            # the truthiness check, once for [0]); reuse the first result.
            proxy_list = Client.select(1, condition_dict)
            if proxy_list:
                proxy = proxy_list[0]
                proxies_list.append(
                    {"http": "http://%s:%s" % (proxy.ip, proxy.port)})
            # Randomly pick between the local IP and the db proxy.
            proxies = random.choice(proxies_list)
            try:
                response = requests.get(url, headers=self.headers,
                                        proxies=proxies, timeout=TIMEOUT)
            except requests.exceptions.RequestException:
                # BUG FIX: the original let errors from the retry request
                # propagate, aborting the remaining retries.
                continue
            response.encoding = chardet.detect(
                response.content)['encoding']
            if response.ok:
                return response.text
            # BUG FIX: the original returned unicode() here, inside the
            # loop, so a single failed retry ended the whole loop and
            # RETRY_TIME was never honoured.  Fall through to retry.
        return unicode()
def run(self):
    """Consume proxies from the work queue and validate each one.

    A proxy that passes the live check is marked valid in the database
    (only if it was not valid already); one that fails is deleted.  The
    loop terminates when the queue is exhausted.
    """
    while True:
        try:
            proxy = self.queue.get_nowait()
        except Empty:
            break
        # Values shared by both branches below.
        scheme = "http" if proxy.protocol == HTTP_PROTOCOL else "https"
        condition_dict = {
            "ip": proxy.ip,
            "port": proxy.port,
            "protocol": proxy.protocol
        }
        if self.check_proxy(proxy):
            if not proxy.is_valid:
                # Promote a freshly verified proxy to valid.
                Client.update(condition_dict, {"is_valid": True})
                self.logger.info(
                    "ProxyCheck: %s://%s:%d validation pass"
                    % (scheme, proxy.ip, proxy.port))
        else:
            # Dead proxy: remove it from the pool entirely.
            Client.delete(condition_dict)
            self.logger.info(
                "ProxyCheck: %s://%s:%d validation fail"
                % (scheme, proxy.ip, proxy.port))
    self.logger.info("end")
def get_proxy(cls):
    """Crawl the data5u (无忧代理) site's html, parse the proxy entries
    and insert each one into the database.
    """
    request = HtmlDownloader()
    proxy_getter_logger.info("Start get data5u proxy")
    try:
        html = request.download(url=cls.url)
        soup = BeautifulSoup(html, features="html.parser")
        # Each proxy row is a <ul class="l2"> whose <li> cells hold the fields.
        ul_list = soup.find_all("ul", class_="l2")
        for ul in ul_list:
            li_list = ul.find_all("li")
            proxy_dict = {
                "country": "China",
                "ip": unicode(li_list[0].string),
                "port": int(li_list[1].string),
                "area": unicode(li_list[5].string),
                # NOTE(review): li_list[2] is consulted both for the
                # anonymity type (u"透明") here and for the protocol
                # ("https") just below -- one of the two indexes looks
                # wrong; verify against the site's actual column layout.
                "type":
                TRANSPARENT if li_list[2].string == u"透明" else ANONYMOUS,
                "protocol":
                HTTPS_PROTOCOL
                if li_list[2].string == "https" else HTTP_PROTOCOL,
                # Inserted unverified; a checker thread validates later.
                "is_valid": False
            }
            proxy_getter_logger.info(proxy_dict)
            Client.insert(proxy_dict)
        proxy_getter_logger.info("End get data5u proxy successfully")
    except Exception as e:
        proxy_getter_logger.warn(
            "End get data5u proxy unsuccessfully, because %s" % e)
def get(self):
    """API handler: return the first valid https proxy from the database
    that still passes a live check, or a failure message when none does.
    """
    condition_dict = {"is_valid": True, "protocol": HTTPS_PROTOCOL}
    proxy_list = Client.select(0, condition_dict)
    for proxy in proxy_list:
        scheme = "http" if proxy.protocol == HTTP_PROTOCOL else "https"
        if not check_proxy(proxy):
            # Stale entry; log and try the next candidate.
            api_logger.info(
                "ProxyCheck: %s://%s:%d validation fail"
                % (scheme, proxy.ip, proxy.port))
            continue
        api_logger.info(
            "ProxyCheck: %s://%s:%d validation pass"
            % (scheme, proxy.ip, proxy.port))
        api_logger.info(
            "Return %s://%s:%d" % (scheme, proxy.ip, proxy.port))
        return {
            "message": "Get a https proxy successfully",
            "ip": proxy.ip,
            "port": proxy.port,
            "protocol": "https"
        }
    # No candidate survived the live check (or the select was empty).
    api_logger.info("Return None")
    return {
        "message": "Get a https proxy unsuccessfully",
    }
def get_proxy(cls):
    """Crawl the Jiangxianli (免费IP代理库) site's html, parse the proxy
    entries and insert each one into the database.
    """
    request = HtmlDownloader()
    proxy_getter_logger.info("Start get Jiangxianli proxy")
    try:
        for page in range(1, 3):
            url = cls.url + "?page=" + str(page)
            html = request.download(url=url)
            # Extract proxies from the html: one <tr> per proxy.
            soup = BeautifulSoup(html, features="html.parser")
            tbody = soup.find("tbody")
            tr_list = tbody.find_all("tr")
            for tr in tr_list:
                td_list = tr.find_all("td")
                proxy_dict = {
                    "country": "China",
                    "ip": unicode(td_list[1].string),
                    "port": int(td_list[2].string),
                    "area": unicode(td_list[5].string),
                    # BUG FIX: the original compared the Tag object
                    # td_list[3] (not its .string) to u"透明", which can
                    # never be True, so every proxy was stored as
                    # ANONYMOUS.  Sibling crawlers all use .string.
                    "type":
                    TRANSPARENT
                    if td_list[3].string == u"透明" else ANONYMOUS,
                    "protocol":
                    HTTP_PROTOCOL
                    if td_list[4].string == "HTTP" else HTTPS_PROTOCOL,
                    # Inserted unverified; a checker thread validates later.
                    "is_valid": False
                }
                proxy_getter_logger.info(proxy_dict)
                Client.insert(proxy_dict)
        proxy_getter_logger.info("End get Jiangxianli proxy successfully")
    except Exception as e:
        proxy_getter_logger.warn(
            "End get Jiangxianli proxy unsuccessfully, because %s" % e)
def put_queue(self):
    """Load every not-yet-validated proxy from the database into the
    raw-proxy queue for the checker threads to consume."""
    unchecked = Client.select(count=0, condition_dict={"is_valid": False})
    for raw_proxy in unchecked:
        self.raw_proxy_queue.put(raw_proxy)
def get_proxy(cls):
    """Crawl the Xicidaili (西刺代理) site's html, parse the proxy
    entries and insert each one into the database.

    Pages 1-2 of both the transparent and the high-anonymity listings
    are crawled.  The two listings share an identical page layout, so
    the original copy-pasted loop is factored into one local helper.
    """
    request = HtmlDownloader()

    def crawl_pages(base_url, proxy_type):
        # Parse pages 1-2 of one listing: rows live in the #ip_list
        # table; the first <tr> is the header and is skipped.
        for page in range(1, 3):
            url = base_url + str(page)
            proxy_getter_logger.info("get url %s" % url)
            html = request.download(url=url)
            soup = BeautifulSoup(html, features="html.parser")
            table = soup.find("table", id="ip_list")
            for tr in table.find_all("tr")[1:]:
                td_list = tr.find_all("td")
                proxy_dict = {
                    "country": "China",
                    "ip": unicode(td_list[1].string),
                    "port": int(td_list[2].string),
                    # The area cell may lack an <a>; store "" then.
                    "area": unicode(td_list[3].find("a").string
                                    if td_list[3].find("a") else ""),
                    "type": proxy_type,
                    "protocol":
                    HTTPS_PROTOCOL
                    if td_list[5].string == "HTTPS" else HTTP_PROTOCOL,
                    # Inserted unverified; a checker thread validates later.
                    "is_valid": False
                }
                proxy_getter_logger.info(proxy_dict)
                Client.insert(proxy_dict)
            # Pause 2s between pages to avoid anti-crawler blocking.
            time.sleep(2)

    proxy_getter_logger.info("Start get Xicidaili proxy")
    try:
        crawl_pages(cls.transparent_url, TRANSPARENT)
        crawl_pages(cls.anonymous_url, ANONYMOUS)
        proxy_getter_logger.info("End get Xicidaili proxy successfully")
    except Exception as e:
        proxy_getter_logger.warn(
            "End get Xicidaili proxy unsuccessfully, because %s" % e)
def get_proxy(cls):
    """Crawl the Xiladaili (西拉代理) site's html, parse the proxy
    entries and insert each one into the database.

    Pages 1-2 of both the transparent and the high-anonymity listings
    are crawled.  The two listings share an identical page layout, so
    the original copy-pasted loop is factored into one local helper.
    """
    request = HtmlDownloader()

    def crawl_pages(base_url, proxy_type):
        # Parse pages 1-2 of one listing: one <tr> per proxy.
        for page in range(1, 3):
            url = base_url + str(page)
            html = request.download(url=url)
            soup = BeautifulSoup(html, features="html.parser")
            tbody = soup.find("tbody")
            for tr in tbody.find_all("tr"):
                td_list = tr.find_all("td")
                # ip and port share one cell, e.g. u"1.2.3.4:8080";
                # split once instead of twice as the original did.
                ip_port = td_list[0].string.split(":")
                proxy_dict = {
                    "country": "China",
                    "ip": unicode(ip_port[0]),
                    "port": int(ip_port[1]),
                    "area": unicode(td_list[3].string),
                    "type": proxy_type,
                    "protocol":
                    HTTP_PROTOCOL
                    if td_list[1].string == u"HTTP代理" else HTTPS_PROTOCOL,
                    # Inserted unverified; a checker thread validates later.
                    "is_valid": False
                }
                proxy_getter_logger.info(proxy_dict)
                Client.insert(proxy_dict)

    proxy_getter_logger.info("Start get Xiladaili proxy")
    try:
        crawl_pages(cls.transparent_url, TRANSPARENT)
        crawl_pages(cls.anonymous_url, ANONYMOUS)
        proxy_getter_logger.info("End get Xiladaili proxy successfully")
    except Exception as e:
        proxy_getter_logger.warn(
            "End get Xiladaili proxy unsuccessfully, because %s" % e)
def put_queue(self):
    """Feed every proxy currently marked valid into the valid-proxy
    queue so the checker threads can re-verify it."""
    for checked in Client.select(count=0,
                                 condition_dict={"is_valid": True}):
        self.valid_proxy_queue.put(checked)