コード例 #1
0
 def crawl_xiaoshu(self):
     """Yield today's proxies from xsdaili.com as ``ip:port`` strings.

     The landing page is fetched and its first article link (located by an
     absolute XPath) is inspected. The digits in the first space-separated
     token of the link text are joined with ``-`` and compared with today's
     date; the linked page is scraped only when they match.

     Nothing is yielded when ``get_html`` returns ``None`` for either page,
     when the expected link, its ``href`` or its text is missing, or when
     the link is not dated today.
     """
     url = "http://www.xsdaili.com"
     rep = get_html(url)
     if rep is None:
         return
     html = etree.HTML(rep)
     links = html.xpath(
         '/html/body/div[5]/div/div[2]/div/div/div/div[2]/div/div[2]/div[1]/div[1]/a'
     )
     # The absolute XPath is brittle: end the generator quietly instead of
     # raising IndexError when the page layout no longer matches.
     if not links:
         return
     index = links[0]
     hrefs = index.xpath('./@href')
     titles = index.xpath('./text()')
     if not hrefs or not titles:
         return
     url_c = url + hrefs[0]
     index_date = titles[0].split(" ")[0]
     date = '-'.join(re.findall(r'\d+', index_date))
     today = str(datetime.datetime.now().date())
     if date != today:
         return
     rep = get_html(url_c)
     if rep is None:
         return
     # Each proxy appears on the article page as "<br> ip:port ...".
     for result in re.findall(r'<br>\s*(\d+\.\d+\.\d+\.\d+:\d+).*?', rep):
         yield result
コード例 #2
0
 def crawl_hai(self):
     """Yield ``ip:port`` strings from the iphai.com free proxy table.

     The first two cells of every table row are read and stripped of
     surrounding whitespace. Rows without text in either cell are skipped,
     and nothing is yielded when ``get_html`` returns ``None``.
     """
     page = get_html("http://www.iphai.com/free/ng")
     if page is None:
         return
     tree = etree.HTML(page)
     for row in tree.xpath('/html/body/div[2]/div[2]/table/tr'):
         ip_cells = row.xpath('./td[1]/text()')
         port_cells = row.xpath('./td[2]/text()')
         if not (ip_cells and port_cells):
             continue
         yield ip_cells[0].strip() + ":" + port_cells[0].strip()
コード例 #3
0
 def crawl_66ip(self):
     """Yield ``ip:port`` strings from 66ip.cn area index pages 1 to 5.

     Pages for which ``get_html`` returns ``None`` are skipped. After each
     yielded proxy the generator sleeps for half a second once it is
     resumed.
     """
     row_pattern = re.compile(r'<tr><td>([\d\.]+)</td><td>(\d+)</td><td>')
     for area in range(1, 6):
         page = get_html(
             "http://www.66ip.cn/areaindex_{}/1.html".format(area))
         if page is None:
             continue
         for host, port in row_pattern.findall(page):
             yield host + ":" + port
             # Throttle before producing the next proxy.
             time.sleep(0.5)
コード例 #4
0
 def crawl_kaixin(self):
     """Yield ``ip:port`` strings from kxdaili.com listing pages 1 to 4.

     The first two ``<td>`` cells of each table row are captured with a
     regular expression and joined with a colon. Pages for which
     ``get_html`` returns ``None`` are skipped. After each yielded proxy
     the generator sleeps for half a second once it is resumed.
     """
     row_pattern = re.compile(
         r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
     for page_no in range(1, 5):
         page = get_html(
             "http://www.kxdaili.com/dailiip/1/{}.html".format(page_no))
         if page is None:
             continue
         for first_cell, second_cell in row_pattern.findall(page):
             yield first_cell + ":" + second_cell
             # Throttle before producing the next proxy.
             time.sleep(0.5)
コード例 #5
0
 def crawl_free(self):
     """Yield ``ip:port`` strings from ip.jiangxianli.com pages 1 to 3.

     Each matched row starts with a numeric cell, followed by the address
     cell and the port cell. Pages for which ``get_html`` returns ``None``
     are skipped. After each yielded proxy the generator sleeps for half a
     second once it is resumed.
     """
     row_pattern = re.compile(
         r'<tr>\s*<td>\d+</td>\s*<td>([\d\.]+)</td>\s*<td>(\d+)<'
     )
     for page_no in range(1, 4):
         page = get_html("http://ip.jiangxianli.com/?page={}".format(page_no))
         if page is None:
             continue
         for host, port in row_pattern.findall(page):
             yield host + ":" + port
             # Throttle before producing the next proxy.
             time.sleep(0.5)
コード例 #6
0
 def crawlVpn_66ip(self):
     """Print proxy rows found on the 66ip.cn index page.

     Every matched ``(ip, port, third-column)`` tuple is printed. For rows
     whose third column does not end with the character "市", the
     ``ip:port`` string is printed as well. Nothing is yielded or returned,
     and nothing is printed when ``get_html`` returns ``None``.
     """
     url = "http://www.66ip.cn/index.html"
     rep = get_html(url)
     if rep is None:
         return
     row_pattern = re.compile(
         r'<tr><td>([\d\.]+)</td><td>(\d+)</td><td>(.*)</td>')
     for result in row_pattern.findall(rep):
         print(result)
         # endswith() also copes with an empty third column, where
         # indexing with [-1] would raise IndexError.
         if not result[2].endswith("市"):
             ip_port = result[0] + ":" + result[1]
             print(ip_port)
コード例 #7
0
 def crawl_89ip(self):
     """Yield ``ip:port`` strings from 89ip.cn index pages 1 to 3.

     Rows of the ``layui-table`` table are read; the first two cells are
     stripped and joined with a colon. Rows without text in either cell and
     pages for which ``get_html`` returns ``None`` are skipped. After each
     yielded proxy the generator sleeps for half a second once it is
     resumed.
     """
     for page_no in range(1, 4):
         page = get_html("http://www.89ip.cn/index_{}.html".format(page_no))
         if page is None:
             continue
         tree = etree.HTML(page)
         for row in tree.xpath('//table[@class="layui-table"]/tbody/tr'):
             ip_cells = row.xpath('./td[1]/text()')
             port_cells = row.xpath('./td[2]/text()')
             if not (ip_cells and port_cells):
                 continue
             yield ip_cells[0].strip() + ":" + port_cells[0].strip()
             # Throttle before producing the next proxy.
             time.sleep(0.5)
コード例 #8
0
 def crawl_kuaidaili(self):
     """Yield today's proxies from kuaidaili.com free pages 1 and 2.

     A row is used only when its first, second and last cells all contain
     text and the first space-separated token of the last cell equals
     today's date in ``YYYY-MM-DD`` form. Pages for which ``get_html``
     returns ``None`` are skipped. After each yielded proxy the generator
     sleeps for half a second once it is resumed.
     """
     for page_no in range(1, 3):
         page = get_html(
             "https://www.kuaidaili.com/free/inha/{}/".format(page_no))
         if page is None:
             continue
         tree = etree.HTML(page)
         for row in tree.xpath('//*[@id="list"]/table/tbody/tr'):
             ip_cells = row.xpath('./td[1]/text()')
             port_cells = row.xpath('./td[2]/text()')
             stamp_cells = row.xpath('./td[last()]/text()')
             if not (ip_cells and port_cells and stamp_cells):
                 continue
             # Keep only the date part of the last cell.
             row_day = stamp_cells[0].split(" ")[0]
             if row_day != str(datetime.datetime.now().date()):
                 continue
             yield ip_cells[0] + ":" + port_cells[0]
             # Throttle before producing the next proxy.
             time.sleep(0.5)
コード例 #9
0
 def crawl_yundaili(self):
     """Yield today's proxies from ip3366.net free pages 1 to 3.

     A row is used only when its first, second and last cells all contain
     text and the date part of the last cell, parsed as ``%Y/%m/%d``,
     equals today's date. Spaces are removed from the yielded ``ip:port``
     string. Pages for which ``get_html`` returns ``None`` are skipped.
     After each yielded proxy the generator sleeps for half a second once
     it is resumed.
     """
     for page_no in range(1, 4):
         page = get_html(
             "http://www.ip3366.net/free/?page={}".format(page_no))
         if page is None:
             continue
         tree = etree.HTML(page)
         for row in tree.xpath('//*[@id="list"]/table/tbody/tr'):
             ip_cells = row.xpath('./td[1]/text()')
             port_cells = row.xpath('./td[2]/text()')
             stamp_cells = row.xpath('./td[last()]/text()')
             if not (ip_cells and port_cells and stamp_cells):
                 continue
             day_text = stamp_cells[0].split(" ")[0]
             row_day = datetime.datetime.strptime(day_text, '%Y/%m/%d').date()
             if row_day != datetime.datetime.now().date():
                 continue
             proxy = ip_cells[0] + ":" + port_cells[0]
             yield proxy.replace(' ', '')
             # Throttle before producing the next proxy.
             time.sleep(0.5)
コード例 #10
0
 def crawl_xici(self):
     """Yield today's proxies from xicidaili.com "nn" pages 1 and 2.

     Only ``<tr>`` rows carrying a ``class`` attribute are read. A row is
     used when its second, third and last cells all contain text and the
     date part of the last cell, prefixed with ``"20"`` and parsed as
     ``%Y-%m-%d``, equals today's date. Pages for which ``get_html``
     returns ``None`` are skipped. After each yielded proxy the generator
     sleeps for half a second once it is resumed.
     """
     for page_no in range(1, 3):
         page = get_html("https://www.xicidaili.com/nn/{}".format(page_no))
         if page is None:
             continue
         tree = etree.HTML(page)
         for row in tree.xpath('//*[@id="ip_list"]/tr[@class]'):
             ip_cells = row.xpath('./td[2]/text()')
             port_cells = row.xpath('./td[3]/text()')
             stamp_cells = row.xpath('./td[last()]/text()')
             if not (ip_cells and port_cells and stamp_cells):
                 continue
             # "20" is prepended so the text parses with a four-digit year.
             day_text = "20" + stamp_cells[0].split(" ")[0]
             row_day = datetime.datetime.strptime(day_text, "%Y-%m-%d").date()
             if row_day != datetime.datetime.now().date():
                 continue
             yield ip_cells[0] + ":" + port_cells[0]
             # Throttle before producing the next proxy.
             time.sleep(0.5)