Example #1
    def parseItem(self, response):
        base = get_base_url(response)       
        item = MirrorItem()
        meta = {}

        item['item'] = response.url
        yield item

        for img in response.xpath('//img/@src'):
            img = urljoin_rfc(base, img.extract())
            item['item'] = img
            yield item

        for js in response.xpath('//script/@src'):
            js = urljoin_rfc(base, js.extract())
            item['item'] = js
            yield item

        for css in response.xpath('//link/@href'):
            if url_has_any_extension(css.extract(), {'.css'}):
                css = urljoin_rfc(base, css.extract())
                yield Request(url=css, meta=meta, callback=self.parseStyle)
            else:
                item['item'] = css
                yield item
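
urljoin_rfc has long been deprecated in favour of the standard library's urljoin; below is a rough modern equivalent of the image and script loops above, meant as a spider callback (a sketch only, reusing the example's MirrorItem item class, not the original project's code):

from urllib.parse import urljoin

from scrapy.utils.response import get_base_url

def parse_item(self, response):
    # Sketch: same mirroring idea as parseItem above, with urllib.parse.urljoin
    # replacing the deprecated urljoin_rfc helper. MirrorItem is assumed to be
    # the example project's item class.
    base = get_base_url(response)
    item = MirrorItem()
    item['item'] = response.url
    yield item
    for src in response.xpath('//img/@src | //script/@src').getall():
        item['item'] = urljoin(base, src)
        yield item
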
Example #2
 def parse_html(self, response, lru):
     lrulinks = []
     # handle redirects
     realdepth = response.meta['depth']
     if 300 < response.status < 400:
         redir_url = response.headers['Location']
         if redir_url.startswith('/'):
             redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
         elif redir_url.startswith('./') or not redir_url.startswith('http'):
             redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
         links = [{'url': redir_url}]
         response.meta['depth'] -= 1
     else:
         try:
             links = self.link_extractor.extract_links(response)
         except Exception as e:
             self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR)
             links = []
             self.errors += 1
     for link in links:
         try:
             url = link.url
         except AttributeError:
             url = link['url']
         try:
             lrulink = url_to_lru_clean(url)
          except ValueError as e:
             self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR)
             continue
         lrulinks.append((url, lrulink))
         if self._should_follow(response.meta['depth'], lru, lrulink) and \
                 not url_has_any_extension(url, self.ignored_exts):
             yield self._request(url)
Example #3
 def parse_html(self, response, lru):
     lrulinks = []
     # handle redirects
     realdepth = response.meta['depth']
     if 300 < response.status < 400:
         redir_url = response.headers['Location']
         if redir_url.startswith('/'):
             redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                   redir_url)
         elif redir_url.startswith(
                 './') or not redir_url.startswith('http'):
             redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                   redir_url[1:])
         links = [{'url': redir_url}]
         response.meta['depth'] -= 1
     else:
         links = self.link_extractor.extract_links(response)
     for link in links:
         try:
             url = link.url
         except AttributeError:
             url = link['url']
         try:
             lrulink = url_to_lru_clean(url)
          except ValueError as e:
             self.log("Error converting URL to LRU: %s" % e, log.ERROR)
             continue
         lrulinks.append(lrulink)
         if self._should_follow(response.meta['depth'], lru, lrulink) and \
                 not url_has_any_extension(url, self.ignored_exts):
             yield self._request(url)
Example #4
    def _link_allowed(self, link):
        if not _is_valid_url(link.url):
            # invalid URL
            return False
        if self.allow_res and not _matches(link.url, self.allow_res):
            # not matched by the allow rules
            return False
        if self.deny_res and _matches(link.url, self.deny_res):
            # matched by the deny rules
            return False

        # parse the URL
        parsed_url = urlparse(link.url)
        if self.allow_domains and not url_is_from_any_domain(
                parsed_url, self.allow_domains):
            # not in the allowed domains
            return False
        if self.deny_domains and url_is_from_any_domain(
                parsed_url, self.deny_domains):
            # in the denied domains
            return False
        if self.deny_extensions and url_has_any_extension(
                parsed_url, self.deny_extensions):
            # extension is in the deny list
            return False
        # allowed
        return True
Example #5
    def _link_allowed(self, link):
        _matches = lambda url, regexs: any(r.search(url) for r in regexs)
        _is_valid_url = lambda url: url.split('://', 1)[
            0] in {'http', 'https', 'file', 'ftp'}

        if not _is_valid_url(link.url):
            self.logger.warning(f"Not allowed: {link.url} // no valid url")
            return False
        if self.allow_res and not _matches(link.url, self.allow_res):
            self.logger.warning(
                f"Not allowed: {link.url} // does not match whitelist")
            return False
        if self.deny_res and _matches(link.url, self.deny_res):
            self.logger.warning(
                f"Not allowed: {link.url} // matches blacklist")
            return False
        parsed_url = urlparse(link.url)
        if self.allow_domains and not url_is_from_any_domain(
                parsed_url, self.allow_domains):
            self.logger.warning(
                f"Not allowed: {link.url} // domain not listed as allowed")
            return False
        if self.deny_domains and url_is_from_any_domain(
                parsed_url, self.deny_domains):
            self.logger.warning(
                f"Not allowed: {link.url} // domain is listed as denied")
            return False
        if self.deny_extensions and url_has_any_extension(
                parsed_url, self.deny_extensions):
            self.logger.warning(
                f"Not allowed: {link.url} // extension is denied")
            return False
        if self.restrict_text and not _matches(link.text, self.restrict_text):
            return False
        return True
Example #6
 def parse_html(self, response, lru):
     depth = response.meta['depth']
     lrulinks = []
     for link in self.link_extractor.extract_links(response):
         try:
             lrulink = url_to_lru(link.url)
          except ValueError as e:
             self.log("Error converting URL to LRU: %s" % e, log.ERROR)
             continue
         lrulinks.append(lrulink)
         if self._should_follow(depth, lru, lrulink) and \
                 not url_has_any_extension(link.url, self.ignored_exts):
             yield Request(link.url, callback=self.parse)
Example #7
 def _link_allowed(self, link):
     if not _is_valid_url(link.url):
         return False
     if not self._check_link_res(link, self.allow_res, self.deny_res):
         return False
     parsed_url = urlparse(link.url)
     if not self._check_link_domains(parsed_url, self.allow_domains, self.deny_domains):
         return False
     if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
         return False
     if self.restrict_text and not _matches(link.text, self.restrict_text):
         return False
     return True
Example #8
 def parse_html(self, response, lru):
     depth = response.meta['depth']
     lrulinks = []
     for link in self.link_extractor.extract_links(response):
         try:
             lrulink = url_to_lru(link.url)
          except ValueError as e:
             self.log("Error converting URL to LRU: %s" % e, log.ERROR)
             continue
         lrulinks.append(lrulink)
         if self._should_follow(depth, lru, lrulink) and \
                 not url_has_any_extension(link.url, self.ignored_exts):
             yield Request(link.url, callback=self.parse)
Example #9
 def test_url_has_any_extension(self):
     deny_extensions = {'.' + e for e in arg_to_iter(IGNORED_EXTENSIONS)}
     self.assertTrue(url_has_any_extension("http://www.example.com/archive.tar.gz", deny_extensions))
     self.assertTrue(url_has_any_extension("http://www.example.com/page.doc", deny_extensions))
     self.assertTrue(url_has_any_extension("http://www.example.com/page.pdf", deny_extensions))
     self.assertFalse(url_has_any_extension("http://www.example.com/page.htm", deny_extensions))
     self.assertFalse(url_has_any_extension("http://www.example.com/", deny_extensions))
     self.assertFalse(url_has_any_extension("http://www.example.com/page.doc.html", deny_extensions))
Example #10
 def _link_allowed(self, link):
     if not _is_valid_url(link.url):
         return False
     if self.allow_res and not _matches(link.url, self.allow_res):
         return False
     if self.deny_res and _matches(link.url, self.deny_res):
         return False
     parsed_url = urlparse(link.url)
     if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):
         return False
     if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):
         return False
     if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
         return False
     return True
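
In everyday Scrapy code these checks are configured through the LinkExtractor constructor rather than called directly; a minimal usage sketch (the regexes and domains here are placeholders):

from scrapy.linkextractors import LinkExtractor

# Each argument feeds one of the checks in _link_allowed: allow/deny are compiled
# into allow_res/deny_res, the domain lists go through url_is_from_any_domain,
# and deny_extensions is checked with url_has_any_extension.
extractor = LinkExtractor(
    allow=r'/articles/',
    deny=r'/login',
    allow_domains=['example.com'],
    deny_extensions=['pdf', 'zip'],
)
# links = extractor.extract_links(response)  # response: an HtmlResponse
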
Example #11
 def _link_allowed(self, link):
     if not _is_valid_url(link.url):
         return False
     if self.allow_res and not _matches(link.url, self.allow_res):
         return False
     if self.deny_res and _matches(link.url, self.deny_res):
         return False
     parsed_url = urlparse(link.url)
     if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):
         return False
     if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):
         return False
     if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
         return False
     return True
Example #12
 def _link_allowed(self, link):
     parsed_url = urlparse(link.url)
     allowed = _is_valid_url(link.url)
     if self.allow_res:
         allowed &= _matches(link.url, self.allow_res)
     if self.deny_res:
         allowed &= not _matches(link.url, self.deny_res)
     if self.allow_domains:
         allowed &= url_is_from_any_domain(parsed_url, self.allow_domains)
     if self.deny_domains:
         allowed &= not url_is_from_any_domain(parsed_url, self.deny_domains)
     if self.deny_extensions:
         allowed &= not url_has_any_extension(parsed_url, self.deny_extensions)
     if allowed and self.canonicalize:
         link.url = canonicalize_url(parsed_url)
     return allowed
Example #13
 def _link_allowed(self, link):
     parsed_url = urlparse(link.url)
     allowed = _is_valid_url(link.url)
     if self.allow_res:
         allowed &= _matches(link.url, self.allow_res)
     if self.deny_res:
         allowed &= not _matches(link.url, self.deny_res)
     if self.allow_domains:
         allowed &= url_is_from_any_domain(parsed_url, self.allow_domains)
     if self.deny_domains:
         allowed &= not url_is_from_any_domain(parsed_url, self.deny_domains)
     if self.deny_extensions:
         allowed &= not url_has_any_extension(parsed_url, self.deny_extensions)
     if allowed and self.canonicalize:
         link.url = canonicalize_url(parsed_url)
     return allowed
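
Unlike the early-return variants above, this version evaluates every predicate and, when requested, rewrites link.url through w3lib's canonicalize_url so that equivalent URLs compare equal after filtering. For reference, a small illustration of what that helper does (a sketch, not this project's code):

from w3lib.url import canonicalize_url

# canonicalize_url lowercases the scheme and host, sorts the query string and
# drops the fragment by default.
print(canonicalize_url('http://Example.com/a?b=2&a=1#frag'))
# -> 'http://example.com/a?a=1&b=2'
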
Example #14
 def parse_html(self, response, lru):
     lrulinks = []
     # handle redirects
     realdepth = response.meta['depth']
     if 300 < response.status < 400:
         redir_url = response.headers['Location']
         if redir_url.startswith('/'):
             redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                   redir_url)
         elif redir_url.startswith('../'):
             lrustart = lru[:lru.rfind('|p:')]
             while redir_url.startswith('../'):
                 lrustart = lrustart[:lrustart.rfind('|p:')]
                 redir_url = redir_url[3:]
             redir_url = "%s/%s" % (lru_to_url(lrustart + '|'), redir_url)
         elif redir_url.startswith(
                 './') or not redir_url.startswith('http'):
             redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                   redir_url[1:])
         links = [{'url': redir_url}]
         response.meta['depth'] -= 1
     else:
         try:
             links = self.link_extractor.extract_links(response)
         except Exception as e:
             self.log(
                 "ERROR: links extractor crashed on %s: %s %s" %
                 (response, type(e), e), logging.ERROR)
             links = []
             self.errors += 1
     for link in links:
         try:
             url = link.url
         except AttributeError:
             url = link['url']
         try:
             lrulink = url_to_lru_clean(url, TLDS_TREE)
         except (ValueError, IndexError) as e:
             self.log("Error converting URL %s to LRU: %s" % (url, e),
                      logging.ERROR)
             continue
         lrulinks.append((url, lrulink))
         if self._should_follow(response.meta['depth'], lrulink) and \
                 not url_has_any_extension(url, self.ignored_exts):
             yield self._request(url)
     response.meta['depth'] = realdepth
     yield self._make_html_page(response, lru, lrulinks)
Example #15
    def parseStyle(self, response):
        base = get_base_url(response)
        item = MirrorItem()
        meta = {}

        item['item'] = response.url
        yield item

        if response.selector.re(r'url\((.*?)\)'):
            for src in response.selector.re(r'url\((.*?)\)'):
                src = src.strip("'").strip('"')
                if 'data:' not in src:
                    src = urljoin_rfc(base, src)
                    if url_has_any_extension(src, {'.css'}):
                        yield Request(url=src, meta=meta, callback=self.parseStyle)
                    else:
                        item['item'] = src
                        yield item
Example #16
 def _link_allowed(self, link):
     if not _is_valid_url(link.url):  # check the scheme prefix (http, https, etc.)
         return False
     if self.allow_res and not _matches(
             link.url, self.allow_res):  # if allow patterns are set, the URL must match one of them
         return False
     if self.deny_res and _matches(link.url, self.deny_res):  # likewise, reject URLs that match a deny pattern
         return False
     parsed_url = urlparse(link.url)  # split the URL into its components
     if self.allow_domains and not url_is_from_any_domain(
             parsed_url, self.allow_domains):  # not in allow_domains
         return False
     if self.deny_domains and url_is_from_any_domain(
             parsed_url, self.deny_domains):  # in deny_domains
         return False
     if self.deny_extensions and url_has_any_extension(
             parsed_url, self.deny_extensions):  # extension is in the ignored extensions
         return False
     if self.restrict_text and not _matches(
             link.text, self.restrict_text):  # link text must match restrict_text
         return False
     return True
Example #17
 def _has_extension(self, url: str) -> bool:
     return url_has_any_extension(url, self.file_extensions)
Example #18
 def _has_extension(self, url):
     # type: (str) -> bool
     return url_has_any_extension(url, self.file_extensions)
Example #19
def _extract_link_dicts(selector: Selector,
                        base_url: str,
                        only_urls: bool = False):
    """
    Extract dicts with link information::

    {
        'url': '<absolute URL>',
        'attrs': {
            '<attribute name>': '<value>',
            ...
        },
        'inside_text': '<text inside link>',
        # 'before_text': '<text preceding this link>',
    }

    If only_urls is true, extract only links as strings.

    Note that ``base_url`` argument must contain page base URL, which can be
    different from page URL. Use w3lib.html.get_base_url to get it::

        from w3lib.html import get_base_url
        base_url = get_base_url(html[:4096], page_url)
        links = list(extract_link_dicts(Selector(html), base_url))

    If you're using Scrapy, and Response object is available, then
    scrapy.utils.response.get_base_url should be faster::

        from scrapy.utils.response import get_base_url
        base_url = get_base_url(response)
        links = list(extract_link_dicts(response.selector, base_url))

    """
    selector.remove_namespaces()

    for a in selector.xpath('//a'):
        link = {}  # type: Dict

        attrs = a.root.attrib
        if 'href' not in attrs:
            continue

        href = strip_html5_whitespace(attrs['href'])
        if 'mailto:' in href:
            continue

        js_link = extract_js_link(href)
        if js_link:
            href = js_link
            link['js'] = True

        if href.startswith(('tel:', 'skype:', 'fb:', 'javascript:')):
            continue

        url = urljoin(base_url, href)
        if url_has_any_extension(url, _IGNORED):
            continue

        if only_urls:
            yield url

        else:
            link['url'] = url
            link['attrs'] = dict(attrs)

            link_text = a.xpath('normalize-space()').extract_first(default='')
            img_link_text = a.xpath('./img/@alt').extract_first(default='')
            link['inside_text'] = ' '.join([link_text, img_link_text]).strip()

            # TODO: fix before_text and add after_text
            # link['before_text'] = a.xpath('./preceding::text()[1]').extract_first(default='').strip()[-100:]

            yield link
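
A minimal driver for the extractor above, assuming it is pasted into the same module; the module-level helpers it expects (_IGNORED and extract_js_link) are stubbed here purely for illustration and are not the original module's definitions:

from parsel import Selector
from scrapy.linkextractors import IGNORED_EXTENSIONS

_IGNORED = {'.' + ext for ext in IGNORED_EXTENSIONS}  # assumed definition

def extract_js_link(href):
    return None  # stub: the real helper presumably recovers URLs from javascript: hrefs

html = '<body><a href="/about">About us</a> <a href="report.pdf">Report</a></body>'
urls = list(_extract_link_dicts(Selector(text=html), 'http://example.com/',
                                only_urls=True))
# With these stubs this yields ['http://example.com/about']; the .pdf link is
# dropped by the url_has_any_extension(url, _IGNORED) check.
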
Example #20
 def test_url_has_any_extension(self):
     self.assertTrue(url_has_any_extension("http://www.example.com/page.doc", IGNORED_EXTENSIONS))
     self.assertTrue(url_has_any_extension("http://www.example.com/page.pdf", IGNORED_EXTENSIONS))
     self.assertFalse(url_has_any_extension("http://www.example.com/page.htm", IGNORED_EXTENSIONS))
     self.assertFalse(url_has_any_extension("http://www.example.com/", IGNORED_EXTENSIONS))
     self.assertFalse(url_has_any_extension("http://www.example.com/page.doc.html", IGNORED_EXTENSIONS))
Example #21
 def _has_extension(self, url: str) -> bool:
     return url_has_any_extension(url, self.file_extensions)