Example #1
 def test_canonicalize_parse_url(self):
     # parse_url() wraps urlparse and is used in link extractors
     self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
                                       "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
     self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
                                       'http://www.example.com/caf%E9-con-leche.htm')
     self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
                                       "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
Example #2
def download_chapter(chapter_url, chapter_name, path, *args, **kwargs) -> None:
    _path_2 = os.sep.join([path, chapter_name])
    if not os.path.exists(_path_2):
        os.makedirs(_path_2)
    response = requests.get(chapter_url)
    text = response.text
    # Pull the DM5 viewer variables embedded in the chapter page.
    cid = re.findall('var DM5_CID=(.+?);', text)[0].strip()
    mid = re.findall('var DM5_MID=(.+?);', text)[0].strip()
    dt = re.findall('var DM5_VIEWSIGN_DT="(.+?)";', text)[0].strip()
    sign = re.findall('var DM5_VIEWSIGN="(.+?)";', text)[0].strip()
    page_count = int(re.findall('var DM5_IMAGE_COUNT=(.+?);', text)[0].strip())
    page = 1
    while page <= page_count:
        # chapterfun.ashx returns JavaScript that evaluates to the page's image URLs.
        js_api = f'{chapter_url}chapterfun.ashx?cid={cid}&page={page}&key=&language=1&gtk=6&_cid={cid}&_mid={mid}&_dt={dt}&_sign={sign}'
        headers = {
            'referer': HOST
        }
        ret = requests.get(js_api, headers=headers)
        js_code = ret.text
        image_urls = execjs.eval(js_code)
        img_url = image_urls[0]
        img_name = wurl.parse_url(img_url).path.split('/')[-1]
        try:
            download_picture(img_url, img_name, _path_2)
        except Exception as e:
            print(f'Chapter <{chapter_name}>: image {img_name} failed to download: {e}')
        page += 1
    print(f'Chapter "{chapter_name}" downloaded successfully!')
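download_picture() and HOST are defined elsewhere in the original module. A hypothetical sketch of such a helper, assuming it simply streams the image to disk with requests (the referer parameter here is an addition for self-containment):

import os
import requests


def download_picture(img_url, img_name, dir_path, referer=None):
    # Hypothetical helper: fetch a single image and write it into dir_path.
    headers = {'referer': referer} if referer else {}
    resp = requests.get(img_url, headers=headers, timeout=30)
    resp.raise_for_status()
    with open(os.path.join(dir_path, img_name), 'wb') as fh:
        fh.write(resp.content)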
Example #3
    def report_stat(self, **kw):
        req = self.request
        worker = req.hostname
        args = req.args
        kwargs = req.kwargs
        host = kw.pop('hostname', '')
        url = kw.pop('url', '')
        if not host and url:
            try:
                host = parse_url(url).hostname
            except Exception:
                host = ''
        fields = kw

        data = {
            "measurement": "crawl_stat",
            "tags": {
                "project": args[0],
                "job": args[1],
                "page": args[2],
                "host": host,
                "worker": worker,
                "batch_id": kwargs.get('batch_id')
            },
            "time": datetime.now(),
            "fields": fields
        }
        # self.logger.info(u'stat report %s', data)
        try:
            self.influx_stat.write_points([data])
        except Exception:
            self.logger.exception('Error reporting stat')
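self.influx_stat is not shown; it is presumably an InfluxDB client whose write_points() accepts a list of point dicts like the one built above. A minimal sketch, assuming the influxdb Python client and placeholder connection values:

# Minimal sketch, assuming the `influxdb` package; host/database are placeholders.
from influxdb import InfluxDBClient

influx_stat = InfluxDBClient(host='localhost', port=8086, database='crawl_stats')
influx_stat.write_points([{
    "measurement": "crawl_stat",
    "tags": {"project": "demo", "job": "job-1", "page": "list", "host": "example.com"},
    "fields": {"pages": 10, "items": 42},
}])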
Example #4
 def match(self, url):
     _m = None
     if self.pattern:
         _m = self.pattern.findall(url)
     parsed_url = urltool.parse_url(url)
     if (_m and _m[0] == url) or \
         url == self.url or \
         parsed_url.scheme == self.scheme or \
         parsed_url.path == self.path or \
         parsed_url.query == self.query or \
         parsed_url.params == self.params or \
         parsed_url.netloc == self.domain or \
         parsed_url.fragment == self.fragment:
         return url
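match() returns the url as soon as any single component agrees, which makes it a very permissive matcher. Assuming urltool.parse_url behaves like urllib.parse.urlparse, these are the six components it compares:

# Assuming urltool.parse_url is equivalent to urllib.parse.urlparse.
from urllib.parse import urlparse

p = urlparse('https://example.com/path;params?q=1#frag')
print(p.scheme, p.netloc, p.path, p.params, p.query, p.fragment)
# -> https example.com /path params q=1 frag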
Example #5
def url_add(index, url=''):
    """
    add index

    >>> url_index(27, 'http://blog.com/show/12')
    'http://blog.com/show/12/27'
    >>> url_index('delete', 'http://blog.com/show/12?kwarg=foo')
    'http://blog.com/show/12/delete'
    """
    if not url:
        url = request.url
    parsed = parse_url(url)
    new_path = f"{parsed.path}/{index}"
    return parsed._replace(path=new_path).geturl()
Example #6
def url_inc(inc=1, url=''):
    """
    increase index in url path by specified value

    >>> url_inc(1, 'http://blog.com/show/12?key=value')
    'http://blog.com/show/13?key=value'
    >>> url_inc(-5, 'http://blog.com/show/12?key=value')
    'http://blog.com/show/7?key=value'
    """
    if not url:
        url = request.url
    parsed = parse_url(url)
    path, index = parsed.path.rsplit('/', 1)
    new_path = f"{path}/{int(index) + inc}"
    return parsed._replace(path=new_path).geturl()
Example #7
def url_index(index, url=''):
    """
    replace url index with specified index

    >>> url_index(27, 'http://blog.com/show/12')
    'http://blog.com/show/27'
    >>> url_index(-55, 'http://blog.com/show/12?kwarg=foo')
    'http://blog.com/show/-55?kwarg=foo'
    """
    if not url:
        url = request.url
    parsed = parse_url(url)
    path, cur_index = parsed.path.rsplit('/', 1)
    new_path = f"{path}/{index}"
    return parsed._replace(path=new_path).geturl()
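url_add(), url_inc() and url_index() all rely on parse_url() returning a urlparse-style result with _replace() and geturl(); a quick standalone check under that assumption:

# Assuming parse_url is equivalent to urllib.parse.urlparse here.
from urllib.parse import urlparse

parsed = urlparse('http://blog.com/show/12?key=value')
path, index = parsed.path.rsplit('/', 1)  # '/show', '12'
print(parsed._replace(path=f"{path}/{int(index) + 1}").geturl())
# -> http://blog.com/show/13?key=value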
Example #8
 def process_response(self, response):
     settings = response.spider.settings
     if not settings.HTTP_PROXY_ENABLE:
         return response
     fakes = settings.HTTP_PROXY_FAKE_STATUS
     domain = parse_url(response.url).netloc
     if response.request.proxy and response.status != 200 \
             and response.status not in fakes:
         proxy = extract_ip_port(response.request.proxy)
         if proxy not in self.invalid_pool:
             self.invalid_pool[proxy] = set()
         self.logger.debug(f'Proxy {proxy} is invalid for {domain}.')
         self.invalid_pool[proxy].add(domain)
     elif response.request.proxy and (response.status == 200
                                      or response.status in fakes):
         proxy = extract_ip_port(response.request.proxy)
         if proxy in self.invalid_pool:
             self.invalid_pool[proxy].discard(domain)
         self.proxy_pool.add(proxy)
     return response
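extract_ip_port() is not shown in this snippet; a hypothetical version that pulls the host:port pair out of a proxy URL could look like this:

# Hypothetical sketch of the extract_ip_port helper used above.
from urllib.parse import urlparse


def extract_ip_port(proxy_url):
    # 'http://1.2.3.4:8080' -> '1.2.3.4:8080'
    return urlparse(proxy_url).netloc


print(extract_ip_port('http://1.2.3.4:8080'))  # -> 1.2.3.4:8080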
Example #9
    def get_proxy_by_api(self, request):
        domain = parse_url(request.url).netloc

        def _get_from_pool():
            while self.proxy_pool:
                proxy = self.proxy_pool.pop()
                if proxy not in self.invalid_pool or\
                        (domain not in self.invalid_pool.get(proxy)):
                    return proxy
                else:
                    continue

        proxy = _get_from_pool()
        if not proxy:
            self.logger.debug('No proxy in proxy pool. Getting some.')
            while 1:
                spider = request.spider
                req = amipy.Request(spider,
                                    spider.settings.HTTP_PROXY_API,
                                    delay=0,
                                    ignore=True)
                crawler = spider.binding_hub._crawler
                looper = spider.binding_hub.looper
                coro = crawler.requesters[req.down_type].crawl(req)
                resp = looper.run_coroutine(coro)
                if not resp:
                    self.logger.error(
                        'Getting Http proxy by api failed.')
                    continue
                _results = [i.strip() for i in resp.text().split('\n')]
                results = [
                    is_proxy_valid(i)[0] for i in _results if is_proxy_valid(i)
                ]
                self.proxy_pool.update(results)
                self.logger.debug(
                    f'Got {len(results)} http proxies from HTTP_PROXY_API.')
                proxy = _get_from_pool()
                if not proxy:
                    continue
                break
        return proxy
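is_proxy_valid() is also defined elsewhere; from its use above it must return something truthy and indexable (the ip:port string in element 0) for lines that look like proxies. A hypothetical regex-based sketch:

import re

# Hypothetical sketch of is_proxy_valid: returns [ip:port] for proxy-looking
# lines, else an empty list.
_PROXY_RE = re.compile(r'^(\d{1,3}(?:\.\d{1,3}){3}:\d{2,5})$')


def is_proxy_valid(line):
    m = _PROXY_RE.match(line.strip())
    return [m.group(1)] if m else []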
Example #10
 def process_request(self, request):
     spider = request.spider
     if not request.obey_robots_txt:
         if not spider.settings.ROBOTS_TXT_OBEY:
             return request
     _purl = parse_url(request.url)
     netloc = _purl.netloc
     if netloc not in self.rparser:
         if netloc in self.rubbish:
             return request
         robots_url = f"{_purl.scheme}://{netloc}/robots.txt"
         req = amipy.Request(spider, robots_url)
         crawler = spider.binding_hub._crawler
         looper = spider.binding_hub.looper
         coro = crawler.requesters[req.down_type].crawl(req)
         resp = looper.run_coroutine(coro)
         if resp.status != 200:
             self.logger.debug(
                 f'[{resp.status}] There is no robots.txt for "{netloc}".')
             self.rubbish.add(netloc)
             return request
         else:
             self.logger.debug(
                 f'[{resp.status}] Found robots.txt for "{netloc}".')
             _parser = robotparser.RobotFileParser(robots_url)
             _parser.parse(resp.text().splitlines())
             self.rparser[netloc] = _parser
     else:
         _parser = self.rparser[netloc]
     ua = spider.settings.ROBOTS_USER_AGENT
     if _parser.can_fetch(ua, request.url):
         return request
     else:
         self.logger.debug(f'Forbidden by robots.txt of "{netloc}". '
                           f'Request: {request}')
         raise DropRequest
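The robots.txt handling above is standard urllib.robotparser usage: one RobotFileParser per netloc, fed the fetched body via parse(), then queried with can_fetch():

# Standard-library usage mirrored by the middleware above.
from urllib import robotparser

parser = robotparser.RobotFileParser('https://example.com/robots.txt')
parser.parse('User-agent: *\nDisallow: /private/'.splitlines())
print(parser.can_fetch('*', 'https://example.com/private/page'))  # -> False
print(parser.can_fetch('*', 'https://example.com/public/page'))   # -> True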
Example #11
def url_get_index(url=''):
    if not url:
        url = request.url
    parsed = parse_url(url)
    path, cur_index = parsed.path.rsplit('/', 1)
    return cur_index
Example #12
 def _proxy_invalid(self, proxy, url):
     domain = parse_url(url).netloc
     if proxy in self.invalid_pool:
         if domain in self.invalid_pool[proxy]:
             return True
         return False
     return False