All of the functions below are methods of a proxy-crawler class: each is a generator that yields proxies as 'ip:port' strings, and each assumes a shared get_page fetch helper plus the project's imports (re, time, random, json, datetime, lxml's etree, and pyquery's pq).

Example #1
 def crawl_daxiang(self):
     url = 'http://vxer.daili666api.com/ip/?tid=555397563436240&num=1000&filter=on'
     html = get_page(url)
     if html:
         urls = html.split('\n')
         for ip in urls:
             yield ip.replace('\r', '')
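Every example in this listing leans on the shared get_page helper, which is not shown. A minimal sketch of what it plausibly does, assuming requests and the options-as-extra-headers signature visible in Examples #7 and #9 (an assumption, not the project's actual code):

import requests

def get_page(url, options=None):
    # Hypothetical stand-in for the project's fetch helper: return the
    # response body on HTTP 200, or None on any failure.
    headers = {'User-Agent': 'Mozilla/5.0'}
    headers.update(options or {})
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            resp.encoding = 'utf-8'
            return resp.text
    except requests.RequestException:
        pass
    return None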
Example #2
 def crawl_gaonidaili_free(self):
     start_url = "http://www.xiladaili.com/api/?uuid=ee4bcc3648ad4df4973ea79146d30278&num=100&place=%E4%B8%AD%E5%9B%BD&category=1&protocol=1&sortby=0&repeat=1&format=3&position=1"
     res = get_page(start_url)
     if res:
         proxies = re.findall(r"\d+\.\d+\.\d+\.\d+:\d+", res)
         for proxy in proxies:
             yield proxy
Example #3
 def crawl_nimadaili(self):
     urls = [
         "http://www.nimadaili.com/http/{}/",
         "http://www.nimadaili.com/gaoni/{}/",
         "http://www.nimadaili.com/https/{}/"
     ]
     try:
         for url in urls:
             for page in range(100):
                 html = get_page(url.format(str(page)))
                 if html:
                     html_tree = etree.HTML(html)
                     for ip in html_tree.xpath(
                             '/html/body/div/div[1]/div[2]/table/tbody/tr/td[1]/text()'
                     ):
                         yield ip
                 else:
                     print(
                         "\033[1;31;40m 泥马代理网站  ---->  爬取网站为空已准备跳过! \033[0m"
                     )
                     return 0
                 time.sleep(2)
     except Exception:
         print("\033[1;41;97m nimadaili  ---->  page structure changed, update this crawler! \033[0m")
         return 0
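Absolute paths like /html/body/div/div[1]/... break the moment the site inserts one more wrapper div. A more tolerant variant keyed on the table itself; the selector is my suggestion, not verified against nimadaili's live markup:

from lxml import etree

html = """
<html><body><div><div>
  <table><tbody>
    <tr><td>1.2.3.4:8080</td><td>HTTP</td></tr>
    <tr><td>5.6.7.8:3128</td><td>HTTPS</td></tr>
  </tbody></table>
</div></div></body></html>
"""
# A relative row match survives extra wrapper divs that would
# silently break an absolute /html/body/... path.
for ip in etree.HTML(html).xpath('//table//tr/td[1]/text()'):
    print(ip)  # 1.2.3.4:8080, 5.6.7.8:3128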
Example #4
 def crawl_goubanjia(self):
     try:
         url = "http://www.goubanjia.com/"
         html = get_page(url)
         if html:
             tree = etree.HTML(html)
             proxy_list = tree.xpath('//td[@class="ip"]')
             # The site hides decoy digits in the markup, so a naive scrape picks up extra digits or dots;
             # anything inside <p style="display:none;"> has to be filtered out.
             xpath_str = """.//*[not(contains(@style, 'display: none'))
                             and not(contains(@style, 'display:none'))
                             and not(contains(@class, 'port'))
                             ]/text()
                         """
             for each_proxy in proxy_list:
                 try:
                     # The ':' sits bare under the td while the ip fragments live in
                     # div/span/p nodes, so join the visible text for the ip, then grab the port.
                     ip_addr = ''.join(each_proxy.xpath(xpath_str))
                     port = each_proxy.xpath(
                         ".//span[contains(@class, 'port')]/text()")[0]
                     yield '{}:{}'.format(ip_addr, port)
                 except Exception:
                     pass
         else:
             print("\033[1;31;40m 够搬家代理网站  ---->  爬取网站为空已准备跳过! \033[0m")
             return 0
     except Exception:
         print("\033[1;41;97m goubanjia  ---->  page structure changed, update this crawler! \033[0m")
         return 0
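The display:none filter in Example #4 is easy to sanity-check offline. A toy reproduction of the obfuscation (the markup is invented for illustration; goubanjia's real pages differ):

from lxml import etree

# Decoy digits hidden with display:none, real port in span.port --
# the same trick crawl_goubanjia has to strip out.
html = '''
<table><tr><td class="ip">
  <span>12</span><p style="display:none;">99</p><span>3.45</span>
  <div style="display: none;">888</div><span>.67.89</span>
  <span class="port">8080</span>
</td></tr></table>
'''
td = etree.HTML(html).xpath('//td[@class="ip"]')[0]
visible = td.xpath(""".//*[not(contains(@style, 'display: none'))
                        and not(contains(@style, 'display:none'))
                        and not(contains(@class, 'port'))]/text()""")
port = td.xpath(".//span[contains(@class, 'port')]/text()")[0]
print('{}:{}'.format(''.join(visible), port))  # -> 123.45.67.89:8080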
Example #5
 def crawl_superfastip(self, page_count=10):
     try:
         url_list = [
             "http://www.superfastip.com/welcome/freeip/{}".format(page)
             for page in range(1, page_count + 1)
         ]
         for url in url_list:
             html = get_page(url)
             if html:
                 html_tree = etree.HTML(html)
                 ip_list = html_tree.xpath(
                     "/html/body/div[3]/div/div/div[2]/div/table/tbody//tr/td[1]/text()"
                 )
                 port_list = html_tree.xpath(
                     "/html/body/div[3]/div/div/div[2]/div/table/tbody//tr/td[2]/text()"
                 )
                 ip_lists = zip(ip_list, port_list)
                 for ip in ip_lists:
                     yield ":".join(ip)
             else:
                 print("\033[1;31;40m 极速代理网站  ---->  爬取网站为空已准备跳过! \033[0m")
                 return 0
     except Exception:
         print("\033[1;41;97m superfastip  ---->  page structure changed, update this crawler! \033[0m")
         return 0
Example #6
 def crawl_iphai(self):
     start_url = 'http://www.iphai.com/free/wg'
     html = get_page(start_url)
     if html:
         find_tr = re.compile('<tr>(.*?)</tr>', re.S)
         trs = find_tr.findall(html)
         for s in range(1, len(trs)):
             find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
             re_ip_address = find_ip.findall(trs[s])
             find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
             re_port = find_port.findall(trs[s])
             find_protocol = re.compile(
                 r'<td>(.*?)</td>\s+<td>(.*?)</td>\s+<td>(.*?)</td>\s+<td>(.*?)</td>',
                 re.S)
             re_pro = find_protocol.findall(trs[s])
             if re_pro[0][3].strip() == '':
                 re_pro = ["HTTP"]
             else:
                 re_pro = ["HTTPS"]
             for address, port, protocol in zip(re_ip_address, re_port,
                                                re_pro):
                 address_port = address + ':' + port
                 tmp = (address_port, protocol)
                 yield tmp
Example #7
 def crawl_xicidaili(self):
     for i in range(1, 3):
         start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
         headers = {
             'Accept':
             'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
             'Cookie':
             '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
             'Host': 'www.xicidaili.com',
             'Referer': 'http://www.xicidaili.com/nn/3',
             'Upgrade-Insecure-Requests': '1',
         }
         html = get_page(start_url, options=headers)
         if html:
             find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
             trs = find_trs.findall(html)
             for tr in trs:
                 find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                 re_ip_address = find_ip.findall(tr)
                 find_port = re.compile(r'<td>(\d+)</td>')
                 re_port = find_port.findall(tr)
                 for address, port in zip(re_ip_address, re_port):
                     address_port = address + ':' + port
                     print(address_port)
                     yield address_port.replace(' ', '')
Example #8
 def crawl_mayidaili(self):
     try:
         d1 = datetime.datetime(2018, 12, 31)
         year = int(time.strftime("%Y", time.localtime()))
         month = int(time.strftime("%m", time.localtime()))
         day = int(time.strftime("%d", time.localtime()))
         d2 = datetime.datetime(year, month, day)
         start = (d2 - d1).days
         for i in range((1325 + (start)), 1325 + start + 1):
             html = get_page(
                 "http://www.mayidaili.com/share/view/{name}/".format(
                     name=i))
             if html:
                 doc = pq(html)
                 iptables = doc('body > div:nth-child(4) > p').text()
                 iptable_list = iptables.split("#")
                 for i in iptable_list:
                     if not i.isalpha():
                         result = re.search(r'(\d+)', i)
                         start = result.start()
                         ips = i[start:]
                         yield ips.replace(' ', '')
                     else:
                         pass
             else:
                 print("\033[1;31;40m 蚂蚁代理网站  ---->  爬取网站为空已准备跳过! \033[0m")
                 return 0
     except Exception:
         print("\033[1;41;97m mayidaili  ---->  page structure changed, update this crawler! \033[0m")
         return 0
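The page id in Example #8 encodes the calendar: mayidaili appears to publish one share page per day, so the id is a fixed offset (1325) plus the days elapsed since 2018-12-31. The same arithmetic, more directly with datetime.date:

import datetime

# Equivalent of the d1/d2 dance in crawl_mayidaili: today's share-page
# id is 1325 plus the number of days since the 2018-12-31 baseline.
base = datetime.date(2018, 12, 31)
page_id = 1325 + (datetime.date.today() - base).days
print("http://www.mayidaili.com/share/view/{}/".format(page_id))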
Example #9
 def crawl_data5u(self):
     start_url = 'http://www.data5u.com/free/gngn/index.shtml'
     headers = {
         'Accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
         'Accept-Encoding':
         'gzip, deflate',
         'Accept-Language':
         'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
         'Cache-Control':
         'max-age=0',
         'Connection':
         'keep-alive',
         'Cookie':
         'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
         'Host':
         'www.data5u.com',
         'Referer':
         'http://www.data5u.com/free/index.shtml',
         'Upgrade-Insecure-Requests':
         '1',
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
     }
     html = get_page(start_url, options=headers)
     if html:
         ip_address = re.compile(
             r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>',
             re.S)
         re_ip_address = ip_address.findall(html)
         for address, port in re_ip_address:
             result = address + ':' + port
             yield result.replace(' ', '')
Example #10
 def crawl_89ip(self):
     start_url = 'http://www.89ip.cn/apijk/?&tqsl=1000&sxa=&sxb=&tta=&ports=&ktip=&cf=1'
     html = get_page(start_url)
     if html:
         find_ips = re.compile(r'(\d+\.\d+\.\d+\.\d+:\d+)', re.S)
         ip_ports = find_ips.findall(html)
         for address_port in ip_ports:
             yield address_port
Example #11
 def crawl_66ip(self):
     start_url = "http://www.66ip.cn/nmtq.php?getnum=300&isp=0&anonymoustype=2&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip"
     res = get_page(start_url)
     if res:
         proxies = re.findall(r"(\d+.*?)<br />", res)
         for proxy in proxies:
             yield proxy
         time.sleep(random.randint(1, 4))
Example #12
 def crawl_daxiang(self):
     url = 'http://vtp.daxiangdaili.com/ip/?tid=556488478479034&num=1000'
     data = get_page(url)
     if data:
         data = data.split('\r\n')
         for proxy in data:
             if proxy:
                 yield proxy
Example #13
 def crawl_ipjai(self):
     url = 'http://www.iphai.com/free/ng'
     html = get_page(url)
     if html:
         ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>', re.S)
         re_ip_address = ip_address.findall(html)
         for address, port in re_ip_address:
             result = address + ':' + port
             yield result.replace(' ', '')
Example #14
 def crawl_ip181(self):
     start_url = 'http://www.ip181.com/'
     html = get_page(start_url)
     ip_address = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
     # \s* matches the whitespace (including newlines) between the cells
     re_ip_address = ip_address.findall(html)
     for address, port in re_ip_address:
         result = address + ':' + port
         yield result.replace(' ', '')
Example #15
 def crawl_premproxy(self):
     for i in ['China-01', 'China-02', 'China-03', 'China-04', 'Taiwan-01']:
         start_url = 'https://premproxy.com/proxy-by-country/{}.htm'.format(i)
         html = get_page(start_url)
         if html:
             ip_address = re.compile('<td data-label="IP:port ">(.*?)</td>')
             re_ip_address = ip_address.findall(html)
             for address_port in re_ip_address:
                 yield address_port.replace(' ', '')
Example #16
 def crawl_ip3366(self):
     for page in range(1, 4):
         start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
         html = get_page(start_url)
         ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
         # \s* matches the whitespace (including newlines) between the cells
         re_ip_address = ip_address.findall(html)
         for address, port in re_ip_address:
             result = address + ':' + port
             yield result.replace(' ', '')
Example #17
 def crawl_kxdaili(self):
     for i in range(1, 11):
         start_url = 'http://www.kxdaili.com/ipList/{}.html#ip'.format(i)
         html = get_page(start_url)
         ip_address = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
         # \s* matches the whitespace (including newlines) between the cells
         re_ip_address = ip_address.findall(html)
         for address, port in re_ip_address:
             result = address + ':' + port
             yield result.replace(' ', '')
Example #18
 def crawl_66ip(self):
     start_url = 'http://www.66ip.cn/nmtq.php?getnum=500&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip'
     html = get_page(start_url)
     a = html.split("var mediav_ad_height = '60';")[-1].split(
         '<script type="text/javascript" src="http://www.66ip.cn/ggg/jquery.min.js"></script>')[0].split('</div>')[
         0].split('</script>')[-1]
     b = a.split('<br />')
     for i in b:
         result = i.strip()
         yield result.replace(' ', '')
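The chained split calls above depend on exact marker strings (the mediav_ad_height script, the jquery script tag) and break whenever the ad markup changes. A more tolerant variant of the same extraction, my suggestion rather than the original author's:

import re

def extract_66ip(html):
    # Hypothetical alternative: pull ip:port pairs straight out of the
    # page instead of slicing around fragile script/div markers.
    for proxy in re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html):
        yield proxy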
Example #19
 def crawl_ip181(self):
     start_url = 'http://www.ip181.com/'
     text = get_page(start_url)
     try:
         a = json.loads(text)['RESULT']
         for item in a:
             result = item['ip'] + ':' + item['port']
             yield result.replace(' ', '')
     except Exception:
         return None
Example #20
 def crawl_yqie(self):
     start_url = 'http://ip.yqie.com/ipproxy.htm'
     html = get_page(start_url)
     ip_address = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>',
                             re.S)
     # \s* matches the whitespace (including newlines) between the cells
     re_ip_address = ip_address.findall(html)
     for address, port in re_ip_address:
         result = address + ':' + port
         yield result.replace(' ', '')
Example #21
 def crawl_mianfei(self):
     for i in range(1, 4):
         start_url = 'http://ip.jiangxianli.com/?page={}'.format(i)
         html = get_page(start_url)
         if html:
             find_ips = re.compile(r'(\d+\.\d+\.\d+\.\d+:\d+)', re.S)
             re_ip_address = find_ips.findall(html)
             re_ip_address = list(set(re_ip_address))
             for eve_ip_address in re_ip_address:
                 yield eve_ip_address
Example #22
 def crawl_data5u(self):
     start_url = 'http://www.data5u.com/'
     html = get_page(start_url)
     if html:
         doc = pq(html)
         uls = doc('body ul>li>ul.l2').items()
         for ul in uls:
             host = ul.find('span:nth-child(1)').text()
             port = ul.find('span:nth-child(2)').text()
             yield ':'.join([host, port])
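The :nth-child selectors in Example #22 can be exercised against a static snippet. A toy check (markup invented to mirror the selectors; data5u's real layout differs):

from pyquery import PyQuery as pq

# Minimal markup mirroring the span pairs crawl_data5u reads.
doc = pq('<div class="l2"><span>1.2.3.4</span><span>8080</span></div>')
host = doc.find('span:nth-child(1)').text()
port = doc.find('span:nth-child(2)').text()
print(':'.join([host, port]))  # -> 1.2.3.4:8080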
Example #23
 def crawl_89ip(self):
     start_url = "http://www.89ip.cn/index_{}.html"
     for i in range(1, 7):
         res = get_page(start_url.format(i))
         if res:
             doc = pq(res)
             trs = doc("tbody tr").items()
             for tr in trs:
                 ip = tr.find("td:nth-child(1)").text()
                 port = tr.find("td:nth-child(2)").text()
                 yield ":".join([ip, port])
             time.sleep(random.randint(1, 4))
Example #24
 def crawl_89ip(self):
     start_url = 'http://www.89ip.cn/tqdl.html?api=1&num=100&port=&address=&isp='
     text = get_page(start_url)
     try:
         for i in re.findall(
                 r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}:[0-9]{1,5}',
                 text):
             yield i
     except Exception:
         return None
Example #25
 def crawl_89ip(self):
     for i in range(1, 10):
         start_url = 'http://www.89ip.cn/index_{}.html'.format(i)
         html = get_page(start_url)
         if html:
             ip_address = re.compile(
                 r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>', re.S)
             # \s* matches the whitespace (including newlines) between the cells
             re_ip_address = ip_address.findall(html)
             for address, port in re_ip_address:
                 result = address.strip() + ':' + port.strip()
                 yield result
Example #26
 def crawl_xroxy(self):
     for i in ['CN', 'TW']:
         start_url = 'http://www.xroxy.com/proxylist.php?country={}'.format(i)
         html = get_page(start_url)
         if html:
             ip_address1 = re.compile(r"title='View this Proxy details'>\s*(.*).*")
             re_ip_address1 = ip_address1.findall(html)
             ip_address2 = re.compile("title='Select proxies with port number .*'>(.*)</a>")
             re_ip_address2 = ip_address2.findall(html)
             for address, port in zip(re_ip_address1, re_ip_address2):
                 address_port = address + ':' + port
                 yield address_port.replace(' ', '')
Example #27
 def crawl_kuaidaili(self):
     for i in range(1, 4):
         start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
         html = get_page(start_url)
         if html:
             ip_address = re.compile('<td data-title="IP">(.*?)</td>')
             re_ip_address = ip_address.findall(html)
             port = re.compile('<td data-title="PORT">(.*?)</td>')
             re_port = port.findall(html)
             for address, port in zip(re_ip_address, re_port):
                 address_port = address + ':' + port
                 yield address_port.replace(' ', '')
Example #28
 def crawl_data5u(self):
     for i in ['gngn', 'gnpt']:
         start_url = 'http://www.data5u.com/free/{}/index.shtml'.format(i)
         html = get_page(start_url)
         ip_address = re.compile(
             r' <ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;"><li class=".*">(.*?)</li></span>'
         )
         # \s* matches the whitespace (including newlines) between the cells
         re_ip_address = ip_address.findall(html)
         for address, port in re_ip_address:
             result = address + ':' + port
             yield result.replace(' ', '')
Example #29
 def crawl_xicidaili(self):
     for page in range(1, 4):
         start_url = 'http://www.xicidaili.com/wt/{}'.format(page)
         html = get_page(start_url)
         ip_address = re.compile(
             r'<td class="country"><img src="http://fs.xicidaili.com/images/flag/cn.png" alt="Cn" /></td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>'
         )
         # \s* matches the whitespace (including newlines) between the cells
         re_ip_address = ip_address.findall(html)
         for address, port in re_ip_address:
             result = address + ':' + port
             yield result.replace(' ', '')
Example #30
 def crawl_ip3366(self):
     start_url = "http://www.ip3366.net/?stype=1&page={}"
     for i in range(1, 11):
         res = get_page(start_url.format(i))
         if res:
             doc = pq(res)
             trs = doc("tbody tr").items()
             for tr in trs:
                 ip = tr.find("td:nth-child(1)").text()
                 port = tr.find("td:nth-child(2)").text()
                 yield ":".join([ip, port])
             time.sleep(random.randint(1, 4))
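Taken together, every crawl_* method is a generator yielding 'ip:port' strings, which makes the collection step uniform. A self-contained sketch of how a pool might drain them all; the Crawler class and the dispatch-by-name-prefix are assumptions about the surrounding project, with a stub method standing in for the real crawlers:

import re

class Crawler:
    # Stub standing in for the class that hosts the crawl_* methods above.
    def crawl_static(self):
        for line in "1.2.3.4:8080\n5.6.7.8:3128".split('\n'):
            yield line

def collect(crawler):
    # Drain every generator method named crawl_* and deduplicate,
    # keeping only well-formed ip:port strings.
    pattern = re.compile(r'^\d+\.\d+\.\d+\.\d+:\d+$')
    proxies = set()
    for name in dir(crawler):
        if name.startswith('crawl_'):
            proxies.update(p for p in getattr(crawler, name)() if pattern.match(p))
    return proxies

print(collect(Crawler()))  # {'1.2.3.4:8080', '5.6.7.8:3128'}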