def _start_crawl(self):
     for url_dict in self.urls:
         logger.info("开始爬取 [ " + self.website + " ] :::> [ " +
                     url_dict['type'] + " ]")
         has_more = True
         url = None
         while has_more:
             if 'page' in url_dict and '{}' in url_dict['url']:
                 url = url_dict['url'].format(str(url_dict['page']))
                 url_dict['page'] = url_dict['page'] + 1
             else:
                 url = url_dict['url']
                 has_more = False
             html = etree.HTML(request_page(url))
             ul_list = html.xpath(
                 "//div[@class='wlist'][2]//ul[@class='l2']")
             for ul in ul_list:
                 # take the first text node for each field, or None when absent
                 ip = (ul.xpath("./span[1]/li/text()") or [None])[0]
                 port = (ul.xpath("./span[2]/li/text()") or [None])[0]
                 schema = (ul.xpath("./span[4]/li/text()") or [None])[0]
                 proxy = IPProxy(schema=strip(schema),
                                 ip=strip(ip),
                                 port=strip(port))
                 if proxy._check_format():
                     self.queue.push(proxy)
             # xpath() returns a list, never None; stop paging on an empty page
             if not ul_list:
                 has_more = False
 def _start_crawl(self):
     for url_dict in self.urls:
         logger.info("开始爬取 [ " + self.website + " ] :::> [ " +
                     url_dict['type'] + " ]")
         has_more = True
         url = None
         while has_more:
             if 'page' in url_dict and '{}' in url_dict['url']:
                 url = url_dict['url'].format(str(url_dict['page']))
                 url_dict['page'] = url_dict['page'] + 1
             else:
                 url = url_dict['url']
                 has_more = False
             html = etree.HTML(request_page(url))
             tr_list = html.xpath(
                 "//div[@id='main-content']//table/tr[position()>1]")
             for tr in tr_list:
                 ip = (tr.xpath("./td[1]/text()") or [None])[0]
                 port = (tr.xpath("./td[2]/text()") or [None])[0]
                 schema = (tr.xpath("./td[4]/text()") or [None])[0]
                 proxy = IPProxy(schema=strip(schema),
                                 ip=strip(ip),
                                 port=strip(port))
                 if proxy._check_format():
                     self.queue.push(proxy)
             # xpath() returns a list, never None; stop paging on an empty page
             if not tr_list:
                 has_more = False
 def _start_crawl(self):
     for url_dict in self.urls:
         logger.info("开始爬取 [ " + self.website + " ] :::> [ " +
                     url_dict['type'] + " ]")
         has_more = True
         url = None
         while has_more:
             if 'page' in url_dict and '{}' in url_dict['url']:
                 url = url_dict['url'].format(str(url_dict['page']))
                 url_dict['page'] = url_dict['page'] + 1
             else:
                 url = url_dict['url']
                 has_more = False
             html = etree.HTML(request_page(url))
             tr_list = html.xpath(
                 "//table[@id='ip_list']//tr[@class!='subtitle']")
             for tr in tr_list:
                 ip = (tr.xpath("./td[2]/text()") or [None])[0]
                 port = (tr.xpath("./td[3]/text()") or [None])[0]
                 schema = (tr.xpath("./td[6]/text()") or [None])[0]
                 # guard against a missing schema cell before calling .lower()
                 if schema and schema.lower() in ("http", "https"):
                     proxy = IPProxy(schema=strip(schema),
                                     ip=strip(ip),
                                     port=strip(port))
                     if proxy._check_format():
                         self.queue.push(proxy)
             # xpath() returns a list, never None; stop paging on an empty page
             if not tr_list:
                 has_more = False
 def _start_crawl(self):
     for url_dict in self.urls:
         logger.info("开始爬取 [ " + self.website + " ] :::> [ " +
                     url_dict['type'] + " ]")
         has_more = True
         url = None
         while has_more:
             if 'page' in url_dict and '{}' in url_dict['url']:
                 url = url_dict['url'].format(str(url_dict['page']))
                 url_dict['page'] = url_dict['page'] + 1
             else:
                 url = url_dict['url']
                 has_more = False
             html = etree.HTML(request_page(url))
             tr_list = html.xpath(
                 "//table[@class='table table-bordered table-striped']/tbody/tr"
             )
             for tr in tr_list:
                 ip = tr.xpath("./td[@data-title='IP']/text()")[0] if len(
                     tr.xpath("./td[@data-title='IP']/text()")) else None
                 port = tr.xpath(
                     "./td[@data-title='PORT']/text()")[0] if len(
                         tr.xpath(
                             "./td[@data-title='PORT']/text()")) else None
                 schema = tr.xpath(
                     "./td[@data-title='类型']/text()")[0] if len(
                         tr.xpath(
                             "./td[@data-title='类型']/text()")) else None
                 proxy = IPProxy(schema=strip(schema),
                                 ip=strip(ip),
                                 port=strip(port))
                 if proxy._check_format():
                     self.queue.push(proxy)
             # xpath() returns a list, never None; stop paging on an empty page
             if not tr_list:
                 has_more = False
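A small helper could factor out the "first text node or None" XPath pattern that every crawler above repeats for ip, port and schema. This is only a sketch; first_text is a hypothetical name and is not part of the original project.

def first_text(node, xpath):
    # xpath() returns a list of text nodes; take the first, or None when empty
    result = node.xpath(xpath)
    return result[0] if result else None

# e.g. inside a crawler loop:
#   ip = first_text(tr, "./td[1]/text()")
#   port = first_text(tr, "./td[2]/text()")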
            self.server.rpush(
                key,
                json.dumps(self._serialize_proxy(proxy), ensure_ascii=False))

    def pop(self, schema='http', timeout=0):
        """Pop a proxy"""
        if timeout > 0:
            p = self.server.blpop(
                PROXIES_REDIS_FORMATTER.format(schema.lower()), timeout)
            if isinstance(p, tuple):
                p = p[1]
        else:
            p = self.server.lpop(PROXIES_REDIS_FORMATTER.format(
                schema.lower()))
        if p:
            p = self._deserialize_proxy(p)
            self.server.srem(PROXIES_REDIS_EXISTED, p._get_url())
            return p

    def _is_existed(self, proxy):
        # sadd returns 0 when the member is already in the set,
        # so a 0 result means this proxy URL has been seen before
        added = self.server.sadd(PROXIES_REDIS_EXISTED, proxy._get_url())
        return added == 0


if __name__ == '__main__':
    r = redis.StrictRedis(host='localhost', port=6379)
    queue = FifoQueue(r)
    proxy = IPProxy('http', '218.66.253.144', '80')
    queue.push(proxy)
    proxy = queue.pop(schema='http')
    print(proxy._get_url())
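The push() and pop() methods above rely on _serialize_proxy() and _deserialize_proxy(), which are not shown in this excerpt. A minimal sketch of what the serializer might look like, assuming the IPProxy attributes mirror the fields that proxy_from_dict() below expects, is:

def _serialize_proxy(self, proxy):
    # Hypothetical sketch; the actual field set lives in the IPProxy class
    return {'schema': proxy.schema, 'ip': proxy.ip, 'port': proxy.port,
            'used_total': proxy.used_total,
            'success_times': proxy.success_times,
            'continuous_failed': proxy.continuous_failed,
            'created_time': proxy.created_time}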
Example #6
def proxy_from_dict(d):
    return IPProxy(schema=d['schema'], ip=d['ip'], port=d['port'], used_total=d['used_total'],
                   success_times=d['success_times'], continuous_failed=d['continuous_failed'],
                   created_time=d['created_time'])
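Assuming the stored JSON carries exactly these fields, deserialization is little more than json.loads() followed by this factory. A usage sketch (the created_time format shown here is an assumption):

import json

raw = ('{"schema": "http", "ip": "218.66.253.144", "port": "80", '
       '"used_total": 0, "success_times": 0, "continuous_failed": 0, '
       '"created_time": "2019-01-01 00:00:00"}')
proxy = proxy_from_dict(json.loads(raw))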