Example #1
 def crawl_cnip(self):
     """
     中国IP代理:http://cn-proxy.com/
     该网站无法访问
     """
     headers = {
         "Accept":
         "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
         "Accept-Encoding": "gzip, deflate",
         "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
         "DNT": "1",
         "Host": "cn-proxy.com",
         "Referer": "https://www.google.com/",
         "Upgrade-Insecure-Requests": "1",
     }
     start_url = 'http://cn-proxy.com/'
     html = downloader(url=start_url, method='GET', headers=headers)
     if html:
         doc = pq(html)
         trs = doc('.sortable tbody tr').items()
         for tr in trs:
             ip = tr.find('td:nth-child(1)').text().strip()
             port = tr.find('td:nth-child(2)').text().strip()
             result = ":".join([ip, port])
             yield result
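
All of these snippets are methods of a crawler class, and the listing omits the shared scaffolding they depend on. A minimal sketch of that assumed context: the imports the methods use (re, base64, pyquery, lxml) plus a downloader helper built on requests. The names mirror the calls in the examples, but the implementation itself is an assumption, not part of the original:

 # Assumed shared scaffolding (not part of the original listing).
 import re
 import base64

 import requests
 from pyquery import PyQuery as pq
 from lxml import etree

 def downloader(url, method='GET', headers=None, timeout=10):
     """ Fetch a page and return its text, or None on any failure. """
     try:
         resp = requests.request(method, url, headers=headers, timeout=timeout)
         if resp.status_code == 200:
             return resp.text
     except requests.RequestException:
         pass
     return None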
Example #2
 def crawl_haidaili(self):
     """ IP海:http://www.iphai.com/ """
     start_url = 'http://www.iphai.com/'
     html = downloader(url=start_url, method='GET')
     if html:
         doc = pq(html)
         trs = doc('.table-responsive table tr:gt(0)').items()
         for tr in trs:
             ip = tr.find('td:nth-child(1)').text().strip()
             port = tr.find('td:nth-child(2)').text().strip()
             result = ":".join([ip, port])
             yield result
Example #3
 def crawl_nima(self):
     """ 尼玛代理:http://www.nimadaili.com/putong/ """
     start_url = 'http://www.nimadaili.com/putong/{}/'
     urls = [start_url.format(page) for page in range(1, 6)]
     ip_port_pattern = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+:\d+)</td>')
     for url in urls:
         html = downloader(url=url, method='GET')
         if html:
             ip_port_list = ip_port_pattern.findall(html)
             for result in ip_port_list:
                 # print(result)
                 yield result
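
crawl_nima pulls ip:port strings straight out of the HTML with a regex, so a caller may want to sanity-check candidates before pooling them. A small sketch using the standard-library ipaddress module; the helper name is ours, not from the original:

 # Hypothetical helper (not in the original code): validate a scraped
 # "ip:port" string before accepting it into a proxy pool.
 import ipaddress

 def is_valid_proxy(candidate):
     ip, _, port = candidate.partition(':')
     try:
         ipaddress.IPv4Address(ip)        # raises ValueError on a bad IP
         return 0 < int(port) <= 65535    # a port must fit in 16 bits
     except ValueError:
         return False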
Example #4
 def crawl_proxylist(self):
     """ 老外:https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1 """
     start_url = 'https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'
     html = downloader(url=start_url, method="GET")
     if html:
         ip_port_pattern = re.compile(
             r'<td>(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)</td>')
         ip_port_list = ip_port_pattern.findall(html)
         for ip, port in ip_port_list:
             result = ":".join([ip.strip(), port.strip()])
             # print(result)
             yield result
Example #5
 def crawl_xila(self):
     """ 西拉代理:http://www.xiladaili.com/gaoni/2/ """
     start_url = 'http://www.xiladaili.com/gaoni/{}/'
     for page in range(1, 6):
         html = downloader(url=start_url.format(page), method="GET")
         if html:
             doc = pq(html)
             trs = doc('.fl-table tbody tr').items()
             for tr in trs:
                 ip_port = tr.find('td:nth-child(1)').text().strip()
                 # print(ip_port)
                 yield ip_port
Example #6
 def crawl_kuai(self):
     """ 快代理:https://www.kuaidaili.com/free/inha/1/ """
     start_url = 'https://www.kuaidaili.com/free/inha/{}/'
     for page in range(1, 4):
         html = downloader(url=start_url.format(page), method='GET')
         if html:
             ip_pattern = re.compile(r'<td data-title="IP">(.*?)</td>')
             port_pattern = re.compile(r'<td data-title="PORT">(\d+)</td>')
             ip_list = ip_pattern.findall(html)
             port_list = port_pattern.findall(html)
             for ip, port in zip(ip_list, port_list):
                 result = ip.strip() + ':' + port.strip()
                 yield result
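
crawl_kuai pairs two independent findall() results with zip(), which silently misaligns if the page ever renders an IP cell without a matching PORT cell. A defensive alternative (a sketch, assuming the two cells are adjacent in the markup, as the original per-column patterns suggest) captures both fields in one pattern, compiled once outside the loop:

 # Sketch: one pattern keeps each IP paired with the port from the same row.
 pair_pattern = re.compile(
     r'<td data-title="IP">\s*(\d+\.\d+\.\d+\.\d+)\s*</td>\s*'
     r'<td data-title="PORT">\s*(\d+)\s*</td>')
 for ip, port in pair_pattern.findall(html):
     yield ip + ':' + port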
Example #7
 def crawl_wuyou(self):
     """ 无忧代理:http://www.data5u.com/ 可用率极低 """
     start_url = 'http://www.data5u.com/'
     html = downloader(url=start_url, method='GET')
     if html:
         ip_port_pattern = re.compile(
             r'<li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?">(\d+)</li>',
             re.S)
         ip_port_list = ip_port_pattern.findall(html)  # a list of (ip, port) tuples
         for ip, port in ip_port_list:
             result = ":".join([ip.strip(), port.strip()])
             # print(result)
             yield result
Example #8
 def crawl_xiaohuan(self):
     """ 小幻代理:https://ip.ihuan.me/ """
     start_url = 'https://ip.ihuan.me/'
     html = downloader(url=start_url, method='GET')
     if html:
         doc = pq(html)
         trs = doc('.table tbody tr').items()
         for tr in trs:
             ip = tr.find('td:nth-child(1) a').text().strip()
             port = tr.find('td:nth-child(2)').text().strip()
             result = ":".join([ip, port])
             # print(result)
             yield result
Example #9
 def crawl_kaixin(self):
     """ 开心代理:http://www.kxdaili.com/dailiip.html """
     start_url = 'http://www.kxdaili.com/dailiip/1/{}.html'
     for page in range(1, 4):
         html = downloader(url=start_url.format(page), method='GET')
         if html:
             doc = pq(html)
             trs = doc('.active tbody tr').items()
             for tr in trs:
                 ip = tr.find('td:nth-child(1)').text().strip()
                 port = tr.find('td:nth-child(2)').text().strip()
                 result = ':'.join([ip, port])
                 # print(result)
                 yield result
Example #10
 def crawl_ip3366(self, page_count=6):
     """ 云代理爬虫:http://www.ip3366.net/free/?stype=1 """
     start_url = 'http://www.ip3366.net/free/?stype=1&page={}'
     urls = [start_url.format(page) for page in range(1, page_count)]
     for url in urls:
         html = downloader(url=url, method='GET')
         if html:
             doc = pq(html)
             trs = doc('#list table tbody tr').items()
             for tr in trs:
                 ip = tr.find('td:nth-child(1)').text().strip()
                 port = tr.find('td:nth-child(2)').text().strip()
                 result = ":".join([ip, port])
                 yield result
Example #11
 def crawl_xicidaili(self):
     """ 西刺代理:https://www.xicidaili.com/ 封IP """
     start_url = 'https://www.xicidaili.com/nn/{}'
     for page in range(1, 4):
         html = downloader(url=start_url.format(page), method='GET')
         if html:
             ip_pattern = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
             port_pattern = re.compile(r'<td>(\d+)</td>')
             ip_list = ip_pattern.findall(html)
             port_list = port_pattern.findall(html)
             for ip, port in zip(ip_list, port_list):
                 result = ip.strip() + ':' + port.strip()
                 # print(result)
                 yield result
Example #12
 def crawl_baibian(self):
     """ 百变IP:https://www.baibianip.com/home/free.html """
     start_url = 'https://www.baibianip.com/home/free.html'
     html = downloader(url=start_url, method='GET')
     ip_pattern = re.compile(r"\('(.*)'\); </script></td>", re.S)
     port_pattern = re.compile(r'<td> (\d+) </td>')
     if html:
         ip_list = ip_pattern.findall(html)  # note: findall's second argument is a start position, not a flags value
         port_list = port_pattern.findall(html)
         for ips, port in zip(ip_list, port_list):
             ip = self.baibian_js(ips)
             results = ip + ':' + port
             # print(results)
             yield results
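
The baibian_js helper is referenced but not shown in this listing. Judging by ip_pattern, each IP is produced by an inline script on the page, so the helper presumably evaluates that JavaScript. A purely hypothetical sketch of the mechanism using PyExecJS; the script body and entry-point name are placeholders, not the site's real code:

 import execjs

 # Placeholder only: the real decode routine would have to be extracted
 # from a <script> block in the page's own HTML.
 PAGE_DECODE_SCRIPT = 'function decode(s) { return s; }'

 def baibian_js(self, payload):
     ctx = execjs.compile(PAGE_DECODE_SCRIPT)
     return ctx.call('decode', payload)  # 'decode' is an assumed entry point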
Example #13
 def crawl_free(self):
     """ 免费代理库:http://ip.jiangxianli.com/ """
     start_url = 'http://ip.jiangxianli.com/?page={}'
     urls = [start_url.format(page) for page in range(1, 4)]
     for url in urls:
         html = downloader(url=url, method='GET')
         if html:
             doc = pq(html)
             trs = doc('.table tbody tr').items()
             for tr in trs:
                 ip = tr.find('td:nth-child(2)').text().strip()
                 port = tr.find('td:nth-child(3)').text().strip()
                 result = ":".join([ip, port])
                 # print(result)
                 yield result
Example #14
 def crawl_cnip(self):
     """
     中国IP代理:https://cn-proxy.com/
     :return:
     """
     start_url = 'https://cn-proxy.com/'
     html = downloader(url=start_url, method='GET')
     if html:
         doc = pq(html)
         trs = doc('.sortable tbody tr').items()
         for tr in trs:
             ip = tr.find('td:nth-child(1)').text().strip()
             port = tr.find('td:nth-child(2)').text().strip()
             result = ":".join([ip, port])
             # print(result)
             yield result
Example #15
 def crawl_daili66(self, page_count=4):
     """ 代理66:http://www.66ip.cn/index.html  封IP,少量几页"""
     start_url = 'http://www.66ip.cn/{}.html'
     urls = [start_url.format(page) for page in range(1, page_count)]
     for url in urls:
         html = downloader(url=url, method='GET')
         if html:
             doc = pq(html)
             # :gt(n) keeps rows with index greater than n, :lt(n) keeps rows
             # below it; pyquery counts from 0, so tr:gt(0) skips the header row.
             trs = doc('.containerbox table tr:gt(0)').items()
             for tr in trs:
                 ip = tr.find('td:nth-child(1)').text().strip()
                 port = tr.find('td:nth-child(2)').text().strip()
                 data = ":".join([ip, port])
                 # print("代理:{}".format(data))
                 yield data
Example #16
 def crawl_89ip(self):
     """ 89IP代理:http://www.89ip.cn/ """
     start_url = 'http://www.89ip.cn/index_{}.html'
     for page in range(1, 6):
         if page == 1:
             url = 'http://www.89ip.cn/'
         else:
             url = start_url.format(page)
         html = downloader(url=url, method='GET')
         if html:
             doc = pq(html)
             trs = doc('.layui-table tbody tr').items()
             for tr in trs:
                 ip = tr.find('td:nth-child(1)').text().strip()
                 port = tr.find('td:nth-child(2)').text().strip()
                 result = ":".join([ip, port])
                 # print(result)
                 yield result
Example #17
 def crawl_sunjs(self):
     """ sunjs proxy: https://www.sunjs.com/proxy/list.html """
     start_url = 'https://www.sunjs.com/proxy/list.html'
     ip_list = []
     html = downloader(url=start_url, method='GET')
     if html:
         selector = etree.HTML(html)
         decode_pattern = re.compile(r'decode\("(.*?)"\)', re.S)
         decode_data_list = decode_pattern.findall(html)
         port_list = selector.xpath('//td[@data-title="PORT"]/text()')
         for data in decode_data_list:
             first_decode = self.run_decode_js(data)
             ip_bytes = base64.b64decode(first_decode)
             ip = str(ip_bytes, encoding='utf-8')
             ip_list.append(ip)

         for ip, port in zip(ip_list, port_list):
             results = ip + ':' + port.strip()
             # print(results)
             yield results
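
Taken together, these crawl_* generators form the fetcher layer of a proxy pool. A typical consumer discovers them by name and drains each one into a deduplicated set; a sketch, assuming the methods above live on a class called Crawler:

 def collect_proxies(crawler):
     """ Drain every crawl_* generator into one deduplicated set. """
     proxies = set()
     for name in dir(crawler):
         if name.startswith('crawl_'):
             proxies.update(getattr(crawler, name)())
     return proxies

 # Usage:
 # proxies = collect_proxies(Crawler())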