def _start_crawl(self):
    """Crawl every configured URL of this site, parse proxy rows and queue valid proxies.

    For url_dicts that carry a ``page`` counter and a ``{}`` placeholder in
    their URL, successive pages are fetched until a page yields no rows.
    Non-paginated URLs are fetched exactly once.
    """
    def first_text(node, xp):
        # xpath() returns a (possibly empty) list; take the first hit or None.
        vals = node.xpath(xp)
        return vals[0] if vals else None

    for url_dict in self.urls:
        logger.info("开始爬取 [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
        has_more = True
        while has_more:
            if 'page' in url_dict and '{}' in url_dict['url']:
                # Paginated source: substitute the current page number and advance it.
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            ul_list = html.xpath("//div[@class='wlist'][2]//ul[@class='l2']")
            for ul in ul_list:
                ip = first_text(ul, "./span[1]/li/text()")
                port = first_text(ul, "./span[2]/li/text()")
                schema = first_text(ul, "./span[4]/li/text()")
                proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                if proxy._check_format():
                    self.queue.push(proxy)
            # BUG FIX: xpath() never returns None, so the original
            # ``if ul_list is None`` could never fire and pagination looped
            # forever.  An empty row list means the site is exhausted.
            if not ul_list:
                has_more = False
def _start_crawl(self):
    """Crawl every configured URL of this site, parse proxy table rows and queue valid proxies.

    For url_dicts that carry a ``page`` counter and a ``{}`` placeholder in
    their URL, successive pages are fetched until a page yields no rows.
    Non-paginated URLs are fetched exactly once.
    """
    def first_text(node, xp):
        # xpath() returns a (possibly empty) list; take the first hit or None.
        vals = node.xpath(xp)
        return vals[0] if vals else None

    for url_dict in self.urls:
        logger.info("开始爬取 [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
        has_more = True
        while has_more:
            if 'page' in url_dict and '{}' in url_dict['url']:
                # Paginated source: substitute the current page number and advance it.
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            tr_list = html.xpath("//div[@id='main-content']//table/tr[position()>1]")
            for tr in tr_list:
                ip = first_text(tr, "./td[1]/text()")
                port = first_text(tr, "./td[2]/text()")
                schema = first_text(tr, "./td[4]/text()")
                proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                if proxy._check_format():
                    self.queue.push(proxy)
            # BUG FIX: xpath() never returns None, so the original
            # ``if tr_list is None`` could never fire and pagination looped
            # forever.  An empty row list means the site is exhausted.
            if not tr_list:
                has_more = False
def _start_crawl(self):
    """Crawl every configured URL of this site, parse the ip_list table and queue valid proxies.

    Only rows whose schema cell reads http/https (case-insensitive) are
    queued.  For url_dicts that carry a ``page`` counter and a ``{}``
    placeholder in their URL, successive pages are fetched until a page
    yields no rows.
    """
    def first_text(node, xp):
        # xpath() returns a (possibly empty) list; take the first hit or None.
        vals = node.xpath(xp)
        return vals[0] if vals else None

    for url_dict in self.urls:
        logger.info("开始爬取 [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
        has_more = True
        while has_more:
            if 'page' in url_dict and '{}' in url_dict['url']:
                # Paginated source: substitute the current page number and advance it.
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            tr_list = html.xpath("//table[@id='ip_list']//tr[@class!='subtitle']")
            for tr in tr_list:
                ip = first_text(tr, "./td[2]/text()")
                port = first_text(tr, "./td[3]/text()")
                schema = first_text(tr, "./td[6]/text()")
                # BUG FIX: guard against a missing schema cell — the original
                # called schema.lower() unconditionally and raised
                # AttributeError when schema was None.
                if schema is not None and schema.lower() in ("http", "https"):
                    proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                    if proxy._check_format():
                        self.queue.push(proxy)
            # BUG FIX: xpath() never returns None, so the original
            # ``if tr_list is None`` could never fire and pagination looped
            # forever.  An empty row list means the site is exhausted.
            if not tr_list:
                has_more = False
def _start_crawl(self):
    """Crawl every configured URL of this site, parse data-title table cells and queue valid proxies.

    For url_dicts that carry a ``page`` counter and a ``{}`` placeholder in
    their URL, successive pages are fetched until a page yields no rows.
    Non-paginated URLs are fetched exactly once.
    """
    def first_text(node, xp):
        # xpath() returns a (possibly empty) list; take the first hit or None.
        vals = node.xpath(xp)
        return vals[0] if vals else None

    for url_dict in self.urls:
        logger.info("开始爬取 [ " + self.website + " ] :::> [ " + url_dict['type'] + " ]")
        has_more = True
        while has_more:
            if 'page' in url_dict and '{}' in url_dict['url']:
                # Paginated source: substitute the current page number and advance it.
                url = url_dict['url'].format(str(url_dict['page']))
                url_dict['page'] = url_dict['page'] + 1
            else:
                url = url_dict['url']
                has_more = False
            html = etree.HTML(request_page(url))
            tr_list = html.xpath(
                "//table[@class='table table-bordered table-striped']/tbody/tr")
            for tr in tr_list:
                ip = first_text(tr, "./td[@data-title='IP']/text()")
                port = first_text(tr, "./td[@data-title='PORT']/text()")
                schema = first_text(tr, "./td[@data-title='类型']/text()")
                proxy = IPProxy(schema=strip(schema), ip=strip(ip), port=strip(port))
                if proxy._check_format():
                    self.queue.push(proxy)
            # BUG FIX: xpath() never returns None, so the original
            # ``if tr_list is None`` could never fire and pagination looped
            # forever.  An empty row list means the site is exhausted.
            if not tr_list:
                has_more = False
        # NOTE(review): this chunk begins mid-method — the enclosing
        # ``def push`` header lies above the visible source; the rpush call
        # below is that method's tail, serializing the proxy into the Redis
        # list keyed by its schema.
        self.server.rpush(
            key, json.dumps(self._serialize_proxy(proxy), ensure_ascii=False))

    def pop(self, schema='http', timeout=0):
        """Pop a proxy of the given schema from the Redis-backed queue.

        With ``timeout`` > 0 a blocking BLPOP is used (which yields a
        ``(key, value)`` tuple); otherwise a non-blocking LPOP.  A popped
        proxy is also removed from the dedup set so it can be re-added later.
        Returns the deserialized proxy, or a falsy value when the queue is
        empty.
        """
        if timeout > 0:
            p = self.server.blpop(
                PROXIES_REDIS_FORMATTER.format(schema.lower()), timeout)
            if isinstance(p, tuple):
                # blpop returns (list_key, value); keep only the value.
                p = p[1]
        else:
            p = self.server.lpop(PROXIES_REDIS_FORMATTER.format(
                schema.lower()))
        if p:
            p = self._deserialize_proxy(p)
            # Drop from the "existed" set so the same proxy may be queued again.
            self.server.srem(PROXIES_REDIS_EXISTED, p._get_url())
        return p

    def _is_existed(self, proxy):
        """Return True if the proxy was already queued (SADD added no new member)."""
        added = self.server.sadd(PROXIES_REDIS_EXISTED, proxy._get_url())
        return added == 0


if __name__ == '__main__':
    # Ad-hoc smoke test against a local Redis instance.
    r = redis.StrictRedis(host='localhost', port=6379)
    queue = FifoQueue(r)
    proxy = IPProxy('http', '218.66.253.144', '80')
    queue.push(proxy)
    proxy = queue.pop(schema='http')
    print(proxy._get_url())
def proxy_from_dict(d):
    """Reconstruct an IPProxy from its serialized dict form.

    Raises KeyError if any expected field is missing from ``d``.
    """
    fields = ('schema', 'ip', 'port', 'used_total', 'success_times',
              'continuous_failed', 'created_time')
    return IPProxy(**{name: d[name] for name in fields})