Example 1
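All of these snippets are methods of a proxy-pool / request-scheduling class, so the module-level imports never appear. A minimal set that would let them run, assuming `logger` is a plain `logging` logger and `req_e` aliases `requests.exceptions` (both assumptions; neither is defined in the examples):

import os
import random
import logging

import requests
import requests.exceptions as req_e  # assumed alias used in Example 5
import pandas as pd
from lxml import etree

logger = logging.getLogger(__name__)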
 def fetch_xici(self, num):
     """抓取http://www.xicidaili.com/,质量10%"""
     page = 1
     proxyes = []
     while len(proxyes) <= num and page <= 2:
         url = "http://www.xicidaili.com/nn/%s" % page
         req = requests.get(url, headers=self.headers)
         html = req.text
         selector = etree.HTML(html)
         tbody = selector.xpath('//tr[@class]')
         for line in tbody:
             tds = line.xpath('td/text()')
             ip = tds[0]
             port = tds[1]
             speed = line.xpath('td[7]/div/@title')[0][:-1]    # e.g. "0.5秒" -> "0.5"
             latency = line.xpath('td[8]/div/@title')[0][:-1]  # strip the trailing '秒' (seconds)
             # print('%s,%s,%s,%s' % (ip, port, speed, latency))
             if float(speed) < 3 and float(latency) < 1:
                 proxy = "%s:%s" % (ip, port)
                 proxy_dict = {'http': proxy, 'https': proxy}
                 valid_res = self.proxy_vaild(proxy_dict)
                 if valid_res[0]:
                     proxyes.append(valid_res[1])
         logger.info('Scraped xicidaili page %d: %d valid proxies' % (page, len(proxyes)))
         page += 1
     return proxyes
Example 2
    def scheduler(self):
        """根据状态码实现调度"""
        deal_code = ['10000', '10001', '10003', '10004', '10016', '10020', '10021', '10022', '10023']
        pass_code = ['20800', '20801', '20802', '20803', '20003']
        self.status_dict = {
            "10000": self.status_ok,
            "10001": self.status_change_key,
            "10003": self.status_change_key,
            "10004": self.status_change_user_agent,
            "10010": self.status_change_proxy,
            "10016": self.status_change_user_agent,
            "10020": self.status_change_key,
            "10021": self.status_change_proxy,
            "10022": self.status_change_proxy,
            "10023": self.status_change_key,
            "20003": self.status_pass
        }
        status = self.respond['status']  # read but otherwise unused
        infocode = self.respond['infocode']
        print(infocode)

        if infocode in deal_code:
            return self.status_dict[infocode]()
        elif infocode in pass_code:
            self.status_pass()
        else:
            print(infocode)
            logger.info(infocode)
            self.status_invalid_request()
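For context, a sketch of how the dispatcher above might be driven. The `client` instance and the sample payload are assumptions; only the dispatch table appears in the source:

# Hypothetical driver: 'respond' mirrors an AMap-style API reply.
client.respond = {'status': '0', 'infocode': '10001'}
client.scheduler()  # '10001' is in deal_code, so status_change_key() runs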
Example 3
 def get_duration(self):
     respond = self.process()
     if respond is None:
         logger.info(u'URL whose response is None: %s' % self.req_url)
     elif respond['duration'] is None:
         logger.info(u'URL whose duration is None: %s' % self.req_url)
     else:
         duration = respond['duration']
         return int(duration)
Example 4
 def process(self):
     if self.change_num < len(self.key_list):
         self.key_dict['key'] = self.key_list[self.change_num]
         self.change_num += 1
         logger.info("========已更换Key=========")
         print("========已更换Key=========")
         return self.key_dict
     else:
         logger.info("========Key已用完,随机选取Key=========")
         self.key_dict['key'] = random.choice(self.key_list)
         return self.key_dict
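A sketch of the rotation behaviour, assuming an instance whose `key_list`, `key_dict`, and `change_num` start out as below (the constructor is not shown in the examples):

rotator.key_list = ['key-a', 'key-b']  # hypothetical API keys
rotator.key_dict = {'key': None}
rotator.change_num = 0
rotator.process()  # -> {'key': 'key-a'}, first unused key
rotator.process()  # -> {'key': 'key-b'}, second unused key
rotator.process()  # -> list exhausted; random.choice over key_list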
Example 5
 def proxy_vaild(self, proxy_dict):
     url = "http://ip.chinaz.com/getip.aspx"  #用来测试IP是否可用的url
     try:
         r = requests.get(url,
                          proxies=proxy_dict,
                          headers=self.headers,
                          timeout=3,
                          allow_redirects=False)
         if r.status_code == 200:
             print(r.text)
             return (True, proxy_dict)
         else:
             logger.info('_______%s invalid proxy________' % r.status_code)
             return (False, )
     except (req_e.ReadTimeout, req_e.ConnectTimeout, req_e.ProxyError,
             req_e.ConnectionError, req_e.ChunkedEncodingError):
         logger.info('_______connection timed out; invalid proxy________')
         return (False, )
Example 6
 def fetch_66ip(self, num):
     """抓取http://www.66ip.cn/,质量25%"""
     proxyes = []
     url = "http://www.66ip.cn/nmtq.php?getnum=100&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip"
     req = requests.get(url, headers=self.headers)
     html = req.text
     urls = html.split("</script>")[1].split("<br />")
     for u in urls[:-1]:
         if u.strip():
             proxy = u.strip()
             proxy_dict = {'http': proxy, 'https': proxy}
             valid_res = self.proxy_vaild(proxy_dict)
             if valid_res[0]:
                 proxyes.append(valid_res[1])
             if len(proxyes) >= num:
                 break
     logger.info('Scraped 66ip: %d valid proxies' % len(proxyes))
     return proxyes
Example 7
 def fetch_new_proxyes(self, num):
     crawls = [
         self.fetch_ip181, self.fetch_66ip, self.fetch_xici,
         self.fetch_kxdaili
     ]
     valid_proxyes = []
     if os.path.exists('proxy.csv'):
         local_proxyes = self.read_proxy_file('proxy.csv')
         for proxy in local_proxyes:
             valid_res = self.proxy_vaild(proxy)
             if valid_res[0]:
                 valid_proxyes.append(valid_res[1])
     if len(valid_proxyes) < num:
         demand_num = num - len(valid_proxyes)
         logger.info('_______%s valid proxies on hand, %s more needed________' %
                     (len(valid_proxyes), demand_num))
         for crawl in crawls:
             new_proxyes = crawl(demand_num)
             logger.info('_______fetched %s new proxies________' % len(new_proxyes))
             valid_proxyes += new_proxyes
             demand_num -= len(new_proxyes)
             if demand_num <= 0:
                 break
         # Save whatever was collected, even if no crawler met the full demand.
         logger.info('_______proxy fetching finished, %s in total________' %
                     len(valid_proxyes))
         self.save_proxy(valid_proxyes)
     return valid_proxyes
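A hypothetical driver tying the pieces together; the class name `ProxyPool` and its no-argument constructor are assumptions, since neither appears in the examples:

pool = ProxyPool()  # hypothetical class holding the fetch_* methods
proxies = pool.fetch_new_proxyes(10)  # reuse proxy.csv first, then crawl
print('%d proxies ready' % len(proxies))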
Example 8
 def fetch_ip181(self, num):
     """抓取http://www.ip181.com/,10分钟更新100个,质量55%"""
     proxyes = []
     url = 'http://www.ip181.com/'
     req = requests.get(url, headers=self.headers)
     html = req.text
     selector = etree.HTML(html)
     tbody = selector.xpath('//tr')
     for line in tbody[1:]:  # skip the header row
         tds = line.xpath('td/text()')
         ip = tds[0]
         port = tds[1]
         latency = tds[4].split(' ')[0]
         if float(latency) < 0.5:
             proxy = "%s:%s" % (ip, port)
             proxy_dict = {'http': proxy, 'https': proxy}
             valid_res = self.proxy_vaild(proxy_dict)
             if valid_res[0]:
                 proxyes.append(valid_res[1])
             if len(proxyes) >= num:
                 break
     logger.info('Scraped ip181: %d valid proxies' % len(proxyes))
     return proxyes
Example 9
 def fetch_kxdaili(self, num):
     """抓取http://www.kxdaili.com/,质量 5%"""
     page = 1
     proxyes = []
     while len(proxyes) <= num and page <= 10:
         url = "http://www.kxdaili.com/dailiip/1/%d.html" % page
         req = requests.get(url, headers=self.headers)
         html = req.text
         selector = etree.HTML(html)
         tbody = selector.xpath('//tr')
         for line in tbody[1:]:  # skip the header row, whose cells have no text
             tds = line.xpath('td/text()')
             ip = tds[0]
             port = tds[1]
             latency = tds[4].split(' ')[0]
             if float(latency) < 0.5:
                 proxy = "%s:%s" % (ip, port)
                 proxy_dict = {'http': proxy, 'https': proxy}
                 valid_res = self.proxy_vaild(proxy_dict)
                 if valid_res[0]:
                     proxyes.append(valid_res[1])
         logger.info('Scraped kxdaili page %d: %d valid proxies' % (page, len(proxyes)))
         page += 1
     return proxyes
Example 10
 def save_proxy(self, res_list):
     # res_list is a list of {'http': ..., 'https': ...} dicts; pandas
     # writes them to proxy.csv including its default integer index column.
     df = pd.DataFrame(res_list)
     df.to_csv('proxy.csv')
     logger.info('_______proxies saved________')
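`read_proxy_file`, called in Example 7, is not among the examples. A minimal sketch that would round-trip the CSV written by `save_proxy` above, assuming the default pandas index column is present in the file:

 def read_proxy_file(self, path):
     # Drop the pandas index column written by save_proxy, then turn
     # each row back into a {'http': ..., 'https': ...} proxy dict.
     df = pd.read_csv(path, index_col=0)
     return df.to_dict('records')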