class CrawlProxy(object):
    """Crawl free proxies from ip.jiangxianli.com, verify each one, and
    persist the proxies that pass the connectivity check to MySQL."""

    def __init__(self):
        self.mysql = MysqlClient()    # storage backend for verified proxies
        self.verify = VerifyProxy()   # connectivity / anonymity checker

    def get_free_proxy(self, page_num=None):
        """Crawl listing pages, verify every proxy row, save the good ones.

        :param page_num: number of pages to crawl; when None (the default)
            the original interactive behavior is kept and the count is read
            from stdin. Passing an int makes the method scriptable.
        :return: None — results go to MySQL / stdout.
        """
        # Backward compatible: the original implementation always prompted.
        n = int(input("请输入需爬取的页数:")) if page_num is None else int(page_num)
        for i in range(0, n):
            url = f'https://ip.jiangxianli.com/?page={i}'
            # BUG FIX: a timeout so one dead server cannot hang the crawl forever.
            response = requests.get(url, timeout=10).text
            html = etree.HTML(response)
            content = html.xpath('//*[@class="layui-table"]/tbody/tr')
            time.sleep(1)  # throttle: be polite to the free-proxy site
            for j in content:
                scheme_cell = j.xpath('./td[4]/text()')
                ip_cell = j.xpath('./td[1]/text()')
                port_cell = j.xpath('./td[2]/text()')
                anon_cell = j.xpath('./td[3]/text()')
                # BUG FIX: skip malformed rows instead of raising IndexError
                # on an empty xpath result.
                if not (scheme_cell and ip_cell and port_cell and anon_cell):
                    continue
                scheme = scheme_cell[0].lower()
                ip = ip_cell[0]
                port = port_cell[0]
                Anonymous_degrees = anon_cell[0]
                verify_result = self.verify.verify_proxy(
                    scheme, ip, port, Anonymous_degrees)
                # verify_proxy reports status as the string '1' on success.
                if verify_result['status'] == '1':
                    proxy = {
                        "scheme": scheme,
                        "ip": ip,
                        "port": port,
                        "Anonymous_degrees": Anonymous_degrees,
                        "status": verify_result["status"],
                        "response_time": verify_result["response_time"]
                    }
                    self.mysql.add_proxy(proxy)
                    print(f"代理{ip}链接测试已通过,已保存Mysql")
                else:
                    print(f'代理{ip}链接测试未通过')
class CrawlProxy(object):
    """Crawl free proxies from kuaidaili.com, verify each one, and persist
    the proxies that pass the connectivity check to MySQL."""

    def __init__(self):
        self.mysql = MysqlClient()    # storage backend for verified proxies
        self.verify = VerifyProxy()   # connectivity checker

    def get_page(self, url, charset):
        """Fetch ``url`` and return its body decoded with ``charset``."""
        response = requests.get(url, headers=header)
        response.encoding = charset
        return response.text

    def crawl_ip(self, page_num=3):
        """Crawl proxy listing pages, verify every row, save the good ones.

        :param page_num: number of listing pages to crawl (1-based).
        :return: None — results go to MySQL / stdout.
        """
        start_url = 'https://www.kuaidaili.com/free/inha/{}/'
        # BUG FIX: the range previously started at 141 (a leftover resume
        # offset), so with the default page_num=3 the range was EMPTY and
        # the method crawled nothing. Pages start at 1.
        urls = [start_url.format(page) for page in range(1, page_num + 1)]
        for url in urls:
            print('crawl:', url)
            html = self.get_page(url, 'gb2312')
            if not html:
                continue
            d = PyQuery(html)
            for tr in d('table tbody tr').items():
                scheme = tr.find('td:nth-child(4)').text().lower()
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                print(scheme, ip, port)
                verify_result = self.verify.verify_proxy(scheme, ip, port)
                # verify_proxy reports status as the string '1' on success.
                if verify_result["status"] == '1':
                    proxy = {
                        "scheme": scheme,
                        "ip": ip,
                        "port": port,
                        "status": verify_result["status"],
                        "response_time": verify_result["response_time"],
                    }
                    # Store in the database.
                    self.mysql.add_proxy(proxy)
                    print('代理', ip, '连通测试已通过,已保存 Mysql')
                else:
                    print('代理', ip, '连通测试未通过')
class CrawlProxy(object): def __init__(self): self.mysql = MysqlClient() self.verify = VerifyProxy() def get_page(self, url, charset): response = requests.get(url, headers=header) response.encoding = charset return response.text def crawl_ip(self, page_num=3): """ 获取代理 ip3366 :param page_num: :return: """ verify_result = 0 response_time = time.strftime('%Y-%m-%d', time.localtime(time.time())) proxy = [] start_url = 'https://www.kuaidaili.com/free/inha/{}/' urls = [start_url.format(page) for page in range(1, page_num + 1)] for url in urls: print('crawl:', url) html = self.get_page(url, 'gb2312') if html: d = PyQuery(html) trs = d('table tbody tr').items() for tr in trs: scheme = tr.find('td:nth-child(4)').text().lower() ip = tr.find('td:nth-child(1)').text() port = tr.find('td:nth-child(2)').text() print(scheme, ip, port) #print(response_time) proxy = { "scheme": scheme, "ip": ip, "port": port, "status": verify_result, "response_time": response_time, } self.mysql.add_proxy(proxy) '''