def test_canonicalize_parse_url(self):
    # parse_url() wraps urlparse and is used in link extractors
    self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
                     "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
    self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
                     'http://www.example.com/caf%E9-con-leche.htm')
    self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
                     "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
def download_chapter(chapter_url, chapter_name, path, *args, **kwargs) -> None:
    _path_2 = os.sep.join([path, chapter_name])
    if not os.path.exists(_path_2):
        os.makedirs(_path_2)
    response = requests.get(chapter_url)
    text = response.text
    # Pull the signing parameters the image API expects out of the page's inline JS.
    cid = re.findall('var DM5_CID=(.+?);', text)[0].strip()
    mid = re.findall('var DM5_MID=(.+?);', text)[0].strip()
    dt = re.findall('var DM5_VIEWSIGN_DT="(.+?)";', text)[0].strip()
    sign = re.findall('var DM5_VIEWSIGN="(.+?)";', text)[0].strip()
    page_count = int(re.findall('var DM5_IMAGE_COUNT=(.+?);', text)[0].strip())
    page = 1
    while page <= page_count:
        js_api = (f'{chapter_url}chapterfun.ashx?cid={cid}&page={page}&key=&language=1'
                  f'&gtk=6&_cid={cid}&_mid={mid}&_dt={dt}&_sign={sign}')
        headers = {'referer': HOST}
        ret = requests.get(js_api, headers=headers)
        js_code = ret.text
        # The API returns JavaScript that evaluates to the image URLs for this page.
        image_urls = execjs.eval(js_code)
        img_url = image_urls[0]
        img_name = wurl.parse_url(img_url).path.split('/')[-1]
        try:
            download_picture(img_url, img_name, _path_2)
        except Exception as e:
            print(f'Chapter <{chapter_name}>: failed to download image {img_name}: {e}')
        page += 1
    print(f'Chapter "{chapter_name}" downloaded successfully!')
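# Illustrative sketch (assumes the PyExecJS package, imported as execjs above):
# execjs.eval() evaluates a JavaScript expression and converts the result into
# a native Python object, which is why image_urls can be indexed like a list.
import execjs

urls = execjs.eval("['http://example.invalid/1.jpg', 'http://example.invalid/2.jpg']")
print(urls[0])  # http://example.invalid/1.jpg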
def report_stat(self, **kw):
    req = self.request
    worker = req.hostname
    args = req.args
    kwargs = req.kwargs
    host = kw.pop('hostname', '')
    url = kw.pop('url', '')
    if not host and url:
        try:
            host = parse_url(url).hostname
        except Exception:
            host = ''
    fields = kw
    data = {
        "measurement": "crawl_stat",
        "tags": {
            "project": args[0],
            "job": args[1],
            "page": args[2],
            "host": host,
            "worker": worker,
            "batch_id": kwargs.get('batch_id')
        },
        "time": datetime.now(),
        "fields": fields
    }
    # self.logger.info(u'stat report %s', data)
    try:
        self.influx_stat.write_points([data])
    except Exception:
        self.logger.exception('stat report failed')
def match(self, url):
    _m = None
    if self.pattern:
        _m = self.pattern.findall(url)
    parsed_url = urltool.parse_url(url)
    if (_m and _m[0] == url) or \
            url == self.url or \
            parsed_url.scheme == self.scheme or \
            parsed_url.path == self.path or \
            parsed_url.query == self.query or \
            parsed_url.params == self.params or \
            parsed_url.netloc == self.domain or \
            parsed_url.fragment == self.fragment:
        return url
def url_add(index, url=''):
    """append index to url path

    >>> url_add(27, 'http://blog.com/show/12')
    'http://blog.com/show/12/27'
    >>> url_add('delete', 'http://blog.com/show/12?kwarg=foo')
    'http://blog.com/show/12/delete?kwarg=foo'
    """
    if not url:
        url = request.url
    parsed = parse_url(url)
    new_path = f"{parsed.path}/{index}"
    return parsed._replace(path=new_path).geturl()
def url_inc(inc=1, url=''):
    """increase index in url path by specified value

    >>> url_inc(1, 'http://blog.com/show/12?key=value')
    'http://blog.com/show/13?key=value'
    >>> url_inc(-5, 'http://blog.com/show/12?key=value')
    'http://blog.com/show/7?key=value'
    """
    if not url:
        url = request.url
    parsed = parse_url(url)
    path, index = parsed.path.rsplit('/', 1)
    new_path = f"{path}/{int(index) + inc}"
    return parsed._replace(path=new_path).geturl()
def url_index(index, url=''):
    """replace url index with specified index

    >>> url_index(27, 'http://blog.com/show/12')
    'http://blog.com/show/27'
    >>> url_index(-55, 'http://blog.com/show/12?kwarg=foo')
    'http://blog.com/show/-55?kwarg=foo'
    """
    if not url:
        url = request.url
    parsed = parse_url(url)
    path, cur_index = parsed.path.rsplit('/', 1)
    new_path = f"{path}/{index}"
    return parsed._replace(path=new_path).geturl()
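# Illustrative sketch (not part of the original module): the url helpers above
# and url_get_index below assume parse_url returns a urllib.parse-style
# ParseResult, i.e. a named tuple supporting _replace() and geturl().
# With the stdlib urlparse standing in for the project's parse_url:
from urllib.parse import urlparse

parsed = urlparse('http://blog.com/show/12?key=value')
path, index = parsed.path.rsplit('/', 1)          # '/show', '12'
assert parsed._replace(path=f"{path}/{int(index) + 1}").geturl() == \
    'http://blog.com/show/13?key=value'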
def process_response(self, response):
    settings = response.spider.settings
    fakes = settings.HTTP_PROXY_FAKE_STATUS
    domain = parse_url(response.url).netloc
    if not response.spider.settings.HTTP_PROXY_ENABLE:
        return response
    if response.request.proxy and response.status != 200 \
            and response.status not in fakes:
        # Remember that this proxy failed for this domain.
        proxy = extract_ip_port(response.request.proxy)
        if proxy not in self.invalid_pool:
            self.invalid_pool[proxy] = set()
        self.logger.debug(f'Proxy {proxy} is invalid for {domain}.')
        self.invalid_pool[proxy].add(domain)
    elif response.request.proxy and (response.status == 200
                                     or response.status in fakes):
        # The proxy worked: clear it for this domain and return it to the pool.
        proxy = extract_ip_port(response.request.proxy)
        if proxy in self.invalid_pool:
            self.invalid_pool[proxy].discard(domain)
        self.proxy_pool.add(proxy)
    return response
def get_proxy_by_api(self, request):
    domain = parse_url(request.url).netloc

    def _get_from_pool():
        # Pop proxies until one is found that is not marked invalid for this domain.
        while self.proxy_pool:
            proxy = self.proxy_pool.pop()
            if proxy not in self.invalid_pool or \
                    domain not in self.invalid_pool.get(proxy):
                return proxy

    proxy = _get_from_pool()
    if not proxy:
        self.logger.debug('No proxy in proxy pool. Getting some.')
        while 1:
            spider = request.spider
            req = amipy.Request(spider, spider.settings.HTTP_PROXY_API,
                                delay=0, ignore=True)
            crawler = spider.binding_hub._crawler
            looper = spider.binding_hub.looper
            coro = crawler.requesters[req.down_type].crawl(req)
            resp = looper.run_coroutine(coro)
            if not resp:
                self.logger.error('Getting HTTP proxy by api failed.')
                continue
            _results = [i.strip() for i in resp.text().split('\n')]
            results = [is_proxy_valid(i)[0] for i in _results if is_proxy_valid(i)]
            self.proxy_pool.update(results)
            self.logger.debug(
                f'Got {len(results)} http proxies from HTTP_PROXY_API.')
            proxy = _get_from_pool()
            if not proxy:
                continue
            break
    return proxy
def process_request(self, request):
    spider = request.spider
    if not request.obey_robots_txt:
        if not spider.settings.ROBOTS_TXT_OBEY:
            return request
    _purl = parse_url(request.url)
    netloc = _purl.netloc
    if netloc not in self.rparser:
        # No cached parser for this host yet; hosts without a robots.txt are
        # remembered in self.rubbish so they are only fetched once.
        if netloc in self.rubbish:
            return request
        robots_url = f"{_purl.scheme}://{netloc}/robots.txt"
        req = amipy.Request(spider, robots_url)
        crawler = spider.binding_hub._crawler
        looper = spider.binding_hub.looper
        coro = crawler.requesters[req.down_type].crawl(req)
        resp = looper.run_coroutine(coro)
        if resp.status != 200:
            self.logger.debug(
                f'[{resp.status}] There is no robots.txt for "{netloc}".')
            self.rubbish.add(netloc)
            return request
        else:
            self.logger.debug(
                f'[{resp.status}] Found robots.txt for "{netloc}".')
            _parser = robotparser.RobotFileParser(robots_url)
            _parser.parse(resp.text().splitlines())
            self.rparser[netloc] = _parser
    else:
        _parser = self.rparser[netloc]
    ua = spider.settings.ROBOTS_USER_AGENT
    if _parser.can_fetch(ua, request.url):
        return request
    else:
        self.logger.debug(f'Forbidden by robots.txt of "{netloc}". '
                          f'Request: {request}')
        raise DropRequest
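# Illustrative sketch of the stdlib robotparser API the middleware relies on:
# RobotFileParser.parse() takes the robots.txt body split into lines, and
# can_fetch() answers per user agent and absolute URL.
from urllib import robotparser

_parser = robotparser.RobotFileParser()
_parser.parse("User-agent: *\nDisallow: /private/".splitlines())
print(_parser.can_fetch('amipy', 'http://example.com/private/page'))  # False
print(_parser.can_fetch('amipy', 'http://example.com/public/page'))   # True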
def url_get_index(url=''):
    if not url:
        url = request.url
    parsed = parse_url(url)
    path, cur_index = parsed.path.rsplit('/', 1)
    return cur_index
def _proxy_invalid(self, proxy, url):
    domain = parse_url(url).netloc
    if proxy in self.invalid_pool:
        if domain in self.invalid_pool[proxy]:
            return True
    return False
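# Illustrative sketch (names taken from the proxy methods above, values
# hypothetical): invalid_pool maps a proxy address to the set of domains it has
# failed for, while proxy_pool is a flat set of currently usable proxies.
from urllib.parse import urlparse

invalid_pool = {'1.2.3.4:8080': {'example.com'}}
proxy_pool = {'5.6.7.8:3128'}

def proxy_invalid(proxy, url):
    domain = urlparse(url).netloc
    return proxy in invalid_pool and domain in invalid_pool[proxy]

print(proxy_invalid('1.2.3.4:8080', 'http://example.com/a'))  # True
print(proxy_invalid('1.2.3.4:8080', 'http://other.org/'))     # False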