def get_proxy(cls):
    """Crawl the 无忧代理 (data5u) proxy site, parse the HTML for proxies, and insert them into the database."""
    request = HtmlDownloader()
    proxy_getter_logger.info("Start get data5u proxy")
    try:
        html = request.download(url=cls.url)
        soup = BeautifulSoup(html, features="html.parser")
        ul_list = soup.find_all("ul", class_="l2")
        for ul in ul_list:
            li_list = ul.find_all("li")
            proxy_dict = {
                "country": "China",
                "ip": unicode(li_list[0].string),
                "port": int(li_list[1].string),
                "area": unicode(li_list[5].string),
                "type": TRANSPARENT if li_list[2].string == u"透明" else ANONYMOUS,
                # Anonymity and protocol are separate columns; the protocol cell follows the anonymity cell
                "protocol": HTTPS_PROTOCOL if li_list[3].string == "https" else HTTP_PROTOCOL,
                "is_valid": False
            }
            proxy_getter_logger.info(proxy_dict)
            Client.insert(proxy_dict)
        proxy_getter_logger.info("End get data5u proxy successfully")
    except Exception as e:
        proxy_getter_logger.warn(
            "End get data5u proxy unsuccessfully, because %s" % e)
def get_proxy(cls):
    """Crawl the 免费IP代理库 (Jiangxianli) proxy site, parse the HTML for proxies, and insert them into the database."""
    request = HtmlDownloader()
    proxy_getter_logger.info("Start get Jiangxianli proxy")
    try:
        for page in range(1, 3):
            url = cls.url + "?page=" + str(page)
            html = request.download(url=url)
            # Extract proxies from the html
            soup = BeautifulSoup(html, features="html.parser")
            tbody = soup.find("tbody")
            tr_list = tbody.find_all("tr")
            for tr in tr_list:
                td_list = tr.find_all("td")
                proxy_dict = {
                    "country": "China",
                    "ip": unicode(td_list[1].string),
                    "port": int(td_list[2].string),
                    "area": unicode(td_list[5].string),
                    # Compare the cell text, not the Tag object itself
                    "type": TRANSPARENT if td_list[3].string == u"透明" else ANONYMOUS,
                    "protocol": HTTP_PROTOCOL if td_list[4].string == "HTTP" else HTTPS_PROTOCOL,
                    "is_valid": False
                }
                proxy_getter_logger.info(proxy_dict)
                Client.insert(proxy_dict)
        proxy_getter_logger.info("End get Jiangxianli proxy successfully")
    except Exception as e:
        proxy_getter_logger.warn(
            "End get Jiangxianli proxy unsuccessfully, because %s" % e)
def get_proxy(cls):
    """Crawl the 西刺代理 (Xicidaili) proxy site, parse the HTML for proxies, and insert them into the database."""
    request = HtmlDownloader()
    proxy_getter_logger.info("Start get Xicidaili proxy")
    try:
        # Crawl transparent proxies
        for page in range(1, 3):
            # Build the page url
            url = cls.transparent_url + str(page)
            proxy_getter_logger.info("get url %s" % url)
            html = request.download(url=url)
            # Extract proxies from the html
            soup = BeautifulSoup(html, features="html.parser")
            table = soup.find("table", id="ip_list")
            for tr in table.find_all("tr")[1:]:
                td_list = tr.find_all("td")
                proxy_dict = {
                    "country": "China",
                    "ip": unicode(td_list[1].string),
                    "port": int(td_list[2].string),
                    "area": unicode(td_list[3].find("a").string
                                    if td_list[3].find("a") else ""),
                    "type": TRANSPARENT,
                    "protocol": HTTPS_PROTOCOL if td_list[5].string == "HTTPS" else HTTP_PROTOCOL,
                    "is_valid": False
                }
                proxy_getter_logger.info(proxy_dict)
                Client.insert(proxy_dict)
            # Sleep 2s between pages to avoid the site's anti-crawling measures
            time.sleep(2)
        # Crawl high-anonymity proxies
        for page in range(1, 3):
            # Build the page url
            url = cls.anonymous_url + str(page)
            proxy_getter_logger.info("get url %s" % url)
            html = request.download(url=url)
            # Extract proxies from the html
            soup = BeautifulSoup(html, features="html.parser")
            table = soup.find("table", id="ip_list")
            for tr in table.find_all("tr")[1:]:
                td_list = tr.find_all("td")
                proxy_dict = {
                    "country": "China",
                    "ip": unicode(td_list[1].string),
                    "port": int(td_list[2].string),
                    "area": unicode(td_list[3].find("a").string
                                    if td_list[3].find("a") else ""),
                    "type": ANONYMOUS,
                    "protocol": HTTPS_PROTOCOL if td_list[5].string == "HTTPS" else HTTP_PROTOCOL,
                    "is_valid": False
                }
                proxy_getter_logger.info(proxy_dict)
                Client.insert(proxy_dict)
            # Sleep 2s between pages to avoid the site's anti-crawling measures
            time.sleep(2)
        proxy_getter_logger.info("End get Xicidaili proxy successfully")
    except Exception as e:
        proxy_getter_logger.warn(
            "End get Xicidaili proxy unsuccessfully, because %s" % e)
def get_proxy(cls):
    """Crawl the 西拉代理 (Xiladaili) proxy site, parse the HTML for proxies, and insert them into the database."""
    request = HtmlDownloader()
    proxy_getter_logger.info("Start get Xiladaili proxy")
    try:
        # Crawl transparent proxies
        for page in range(1, 3):
            url = cls.transparent_url + str(page)
            html = request.download(url=url)
            # Extract proxies from the html
            soup = BeautifulSoup(html, features="html.parser")
            tbody = soup.find("tbody")
            tr_list = tbody.find_all("tr")
            for tr in tr_list:
                td_list = tr.find_all("td")
                proxy_dict = {
                    "country": "China",
                    "ip": unicode(td_list[0].string.split(":")[0]),
                    "port": int(td_list[0].string.split(":")[1]),
                    "area": unicode(td_list[3].string),
                    "type": TRANSPARENT,
                    "protocol": HTTP_PROTOCOL if td_list[1].string == u"HTTP代理" else HTTPS_PROTOCOL,
                    "is_valid": False
                }
                proxy_getter_logger.info(proxy_dict)
                Client.insert(proxy_dict)
        # Crawl high-anonymity proxies
        for page in range(1, 3):
            url = cls.anonymous_url + str(page)
            html = request.download(url=url)
            # Extract proxies from the html
            soup = BeautifulSoup(html, features="html.parser")
            tbody = soup.find("tbody")
            tr_list = tbody.find_all("tr")
            for tr in tr_list:
                td_list = tr.find_all("td")
                proxy_dict = {
                    "country": "China",
                    "ip": unicode(td_list[0].string.split(":")[0]),
                    "port": int(td_list[0].string.split(":")[1]),
                    "area": unicode(td_list[3].string),
                    "type": ANONYMOUS,
                    "protocol": HTTP_PROTOCOL if td_list[1].string == u"HTTP代理" else HTTPS_PROTOCOL,
                    "is_valid": False
                }
                proxy_getter_logger.info(proxy_dict)
                Client.insert(proxy_dict)
        proxy_getter_logger.info("End get Xiladaili proxy successfully")
    except Exception as e:
        proxy_getter_logger.warn(
            "End get Xiladaili proxy unsuccessfully, because %s" % e)
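# ---------------------------------------------------------------------------
# Minimal sketch of the module-level context the get_proxy() methods above
# rely on. Everything in this block is an assumption for illustration only:
# the real project defines HtmlDownloader, Client, proxy_getter_logger, the
# protocol/anonymity constants, and the getter classes (with their url /
# transparent_url / anonymous_url attributes) elsewhere, possibly with
# different names, values, and URLs.
import logging

proxy_getter_logger = logging.getLogger("proxy_getter")

# Hypothetical constant values; only the names appear in the methods above.
TRANSPARENT, ANONYMOUS = 0, 1
HTTP_PROTOCOL, HTTPS_PROTOCOL = 0, 1


class XiladailiProxyGetter(object):
    """Assumed shape of a getter class: page URLs as class attributes and
    get_proxy() exposed as a classmethod, as in the methods above."""

    # Assumed page URLs; the real ones are configured in the project.
    transparent_url = "http://www.xiladaili.com/putong/"
    anonymous_url = "http://www.xiladaili.com/gaoni/"

    @classmethod
    def get_proxy(cls):
        pass  # body as shown above


# A scheduler task could then call each getter in turn, e.g.:
#     for getter in (XiladailiProxyGetter, ...):
#         getter.get_proxy()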