Example #1
0
class CrawlProxy(object):
    """Crawl free proxies from ip.jiangxianli.com, verify each one, and
    persist the proxies that pass verification to MySQL."""

    def __init__(self):
        self.mysql = MysqlClient()   # project-local MySQL persistence client
        self.verify = VerifyProxy()  # project-local proxy connectivity checker

    def get_free_proxy(self):
        """Prompt interactively for a page count, crawl that many listing
        pages, verify every proxy row, and store the ones that pass.

        Raises:
            ValueError: if the interactive input is not an integer.
        """
        n = int(input("请输入需爬取的页数:"))
        for i in range(n):
            url = f'https://ip.jiangxianli.com/?page={i}'
            # timeout keeps a dead host from blocking the crawl indefinitely
            response = requests.get(url, timeout=10).text
            html = etree.HTML(response)
            rows = html.xpath('//*[@class="layui-table"]/tbody/tr')
            time.sleep(1)  # throttle: be polite to the listing site
            for row in rows:
                # Extract the cells first and skip malformed rows — indexing
                # [0] on an empty xpath result used to raise IndexError.
                scheme_cell = row.xpath('./td[4]/text()')
                ip_cell = row.xpath('./td[1]/text()')
                port_cell = row.xpath('./td[2]/text()')
                degree_cell = row.xpath('./td[3]/text()')
                if not (scheme_cell and ip_cell and port_cell and degree_cell):
                    continue
                scheme = scheme_cell[0].lower()
                ip = ip_cell[0]
                port = port_cell[0]
                anonymous_degrees = degree_cell[0]
                verify_result = self.verify.verify_proxy(
                    scheme, ip, port, anonymous_degrees)
                if verify_result['status'] == '1':
                    proxy = {
                        "scheme": scheme,
                        "ip": ip,
                        "port": port,
                        "Anonymous_degrees": anonymous_degrees,
                        "status": verify_result["status"],
                        "response_time": verify_result["response_time"]
                    }
                    self.mysql.add_proxy(proxy)
                    print(f"代理{ip}链接测试已通过,已保存Mysql")
                else:
                    print(f'代理{ip}链接测试未通过')
Example #2
0
class CrawlProxy(object):
    """Crawl free proxies from kuaidaili.com, verify each one, and persist
    the proxies that pass verification to MySQL."""

    def __init__(self):
        self.mysql = MysqlClient()   # project-local MySQL persistence client
        self.verify = VerifyProxy()  # project-local proxy connectivity checker

    def get_page(self, url, charset):
        """Fetch *url* with the module-level headers and return the body
        decoded with *charset*."""
        # timeout keeps a dead host from blocking the crawl indefinitely
        response = requests.get(url, headers=header, timeout=10)
        response.encoding = charset
        return response.text

    def crawl_ip(self, page_num=3):
        """Crawl proxy listing pages 1..page_num, verify each scraped
        proxy, and store the ones that pass.

        :param page_num: number of listing pages to crawl (default 3)
        :return: None
        """
        start_url = 'https://www.kuaidaili.com/free/inha/{}/'
        # BUG FIX: the range previously started at 141 (leftover debug
        # value), so with the default page_num=3 no page was ever crawled.
        urls = [start_url.format(page) for page in range(1, page_num + 1)]
        for url in urls:
            print('crawl:', url)
            html = self.get_page(url, 'gb2312')  # site serves gb2312 pages
            if not html:
                continue
            doc = PyQuery(html)
            for tr in doc('table tbody tr').items():
                # Column layout: 1=IP, 2=port, 4=scheme (http/https).
                scheme = tr.find('td:nth-child(4)').text().lower()
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                print(scheme, ip, port)
                verify_result = self.verify.verify_proxy(scheme, ip, port)

                if verify_result["status"] == '1':
                    proxy = {
                        "scheme": scheme,
                        "ip": ip,
                        "port": port,
                        "status": verify_result["status"],
                        "response_time": verify_result["response_time"],
                    }
                    # Persist the verified proxy.
                    self.mysql.add_proxy(proxy)
                    print('代理', ip, '连通测试已通过,已保存 Mysql')
                else:
                    print('代理', ip, '连通测试未通过')
Example #3
0
class CrawlProxy(object):
    """Crawler that scrapes free proxies from kuaidaili.com and stores
    them via MySQL."""

    def __init__(self):
        # Project-local clients: DB persistence and proxy verification.
        self.mysql = MysqlClient()
        self.verify = VerifyProxy()

    def get_page(self, url, charset):
        """Fetch *url* with the module-level ``header`` and return the
        body decoded with *charset*.

        :param url: page URL to fetch
        :param charset: encoding to decode the response with (e.g. 'gb2312')
        :return: decoded response body as str
        """
        # timeout keeps a dead host from blocking the crawl indefinitely
        response = requests.get(url, headers=header, timeout=10)
        response.encoding = charset
        return response.text

    def crawl_ip(self, page_num=3):
        """Crawl proxy listing pages from kuaidaili.com and store every
        scraped row to MySQL.

        :param page_num: number of listing pages to crawl (default 3)
        :return: no return statement appears in the visible code
        """
        # NOTE(review): placeholder status stored for every proxy; the
        # visible code never updates it (sibling variants store the string
        # '1' from a verification step, not the int 0) — confirm what the
        # DB layer expects.
        verify_result = 0
        # Crawl date (YYYY-MM-DD) — NOT a latency measurement, despite the
        # "response_time" key it is stored under below.
        response_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        # Unused initial value; reassigned to a dict per scraped row below.
        proxy = []
        start_url = 'https://www.kuaidaili.com/free/inha/{}/'
        # Build listing-page URLs 1..page_num.
        urls = [start_url.format(page) for page in range(1, page_num + 1)]
        for url in urls:
            print('crawl:', url)
            # Site serves gb2312-encoded pages.
            html = self.get_page(url, 'gb2312')
            if html:
                d = PyQuery(html)
                trs = d('table tbody tr').items()
                for tr in trs:
                    # Column layout: 1=IP, 2=port, 4=scheme (http/https).
                    scheme = tr.find('td:nth-child(4)').text().lower()
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    print(scheme, ip, port)
                    #print(response_time)
                    proxy = {
                        "scheme": scheme,
                        "ip": ip,
                        "port": port,
                        "status": verify_result,
                        "response_time": response_time,
                    }
                    # Persist every scraped row (no verification here).
                    self.mysql.add_proxy(proxy)
                    # NOTE(review): the triple-quote below opens a string
                    # that is not closed in this excerpt — the method
                    # continues past the visible chunk.
                    '''