Example #1
0
    def _process_links(self, links):
        """Filter extracted links and hand the survivors to the base class.

        A link is kept when its URL passes ``_is_valid_url`` and every
        configured rule (``allow_res``, ``deny_res``, ``allow_domains``,
        ``deny_domains``); a rule whose attribute is falsy is not applied.
        When ``self.canonicalize`` is truthy, each kept link has its ``url``
        replaced by ``canonicalize_url(url)`` before the list is passed to
        ``BaseSgmlLinkExtractor._process_links``, whose result is returned.
        """
        def keep(candidate):
            # Same rule order as the configuration attributes listed above.
            url = candidate.url
            if not _is_valid_url(url):
                return False
            if self.allow_res and not _matches(url, self.allow_res):
                return False
            if self.deny_res and _matches(url, self.deny_res):
                return False
            if self.allow_domains and not url_is_from_any_domain(
                    url, self.allow_domains):
                return False
            if self.deny_domains and url_is_from_any_domain(
                    url, self.deny_domains):
                return False
            return True

        kept = [candidate for candidate in links if keep(candidate)]

        if self.canonicalize:
            for candidate in kept:
                candidate.url = canonicalize_url(candidate.url)

        return BaseSgmlLinkExtractor._process_links(self, kept)
Example #2
0
    def extract_links(self, response):
        """Extract links from *response* and apply the configured filters.

        When ``self.restrict_xpaths`` is set, only the HTML selected by those
        XPath expressions (concatenated in order) is scanned via
        ``self._extract_links``; otherwise extraction is delegated to
        ``BaseSgmlLinkExtractor.extract_links``.  The links are then filtered
        by ``_is_valid_url`` and the allow/deny regex and domain rules, and
        canonicalized when ``self.canonicalize`` is truthy.

        Returns the list of remaining links.
        """
        if not self.restrict_xpaths:
            extracted = BaseSgmlLinkExtractor.extract_links(self, response)
        else:
            selector = HtmlXPathSelector(response)
            fragments = [
                fragment
                for xpath_expr in self.restrict_xpaths
                for fragment in selector.select(xpath_expr).extract()
            ]
            extracted = self._extract_links(
                ''.join(fragments), response.url, response.encoding)

        # Build the filter chain lazily, then materialize it once.
        pipeline = (link for link in extracted if _is_valid_url(link.url))
        if self.allow_res:
            pipeline = (link for link in pipeline
                        if _matches(link.url, self.allow_res))
        if self.deny_res:
            pipeline = (link for link in pipeline
                        if not _matches(link.url, self.deny_res))
        if self.allow_domains:
            pipeline = (link for link in pipeline
                        if url_is_from_any_domain(link.url, self.allow_domains))
        if self.deny_domains:
            pipeline = (link for link in pipeline
                        if not url_is_from_any_domain(link.url, self.deny_domains))
        links = list(pipeline)

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        return links
Example #3
0
    def _process_links(self, links):
        """Filter links and keep those whose customer id is not ignored.

        Links are first filtered by ``_is_valid_url`` and the configured
        allow/deny regex and domain rules.  The customer id is then read from
        the seventh ``'/'``-separated piece of each URL; the link is kept only
        when ``self._ignore_identifier`` returns a falsy value for that id.
        Links whose URL has fewer than seven pieces carry no id and are
        skipped instead of raising ``IndexError``.

        Kept links are canonicalized when ``self.canonicalize`` is truthy and
        finally passed to ``BaseSgmlLinkExtractor._process_links``, whose
        result is returned.
        """
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [link for link in links if _matches(link.url, self.allow_res)]
        if self.deny_res:
            links = [link for link in links if not _matches(link.url, self.deny_res)]
        if self.allow_domains:
            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
        if self.deny_domains:
            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]

        new_links = []
        for link in links:
            segments = link.url.split('/')
            if len(segments) <= 6:
                # URL too short to contain a customer id: nothing to check.
                continue
            customer_id = segments[6]
            if not self._ignore_identifier(customer_id):
                log.msg("Found CustomerId: " + customer_id, level=log.DEBUG)
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Example #4
0
    def _link_allowed(self, link):
        """Return True when *link* passes every configured filter.

        The raw URL is checked with ``_is_valid_url`` and the allow/deny
        regex lists first; only then is it parsed and checked against the
        allowed domains, denied domains and denied extensions.  A filter
        whose attribute is falsy is not applied.
        """
        url = link.url

        # Invalid URL, not matched by the allow rules, or matched by a deny
        # rule.
        rejected_by_pattern = (
            not _is_valid_url(url)
            or (self.allow_res and not _matches(url, self.allow_res))
            or (self.deny_res and _matches(url, self.deny_res))
        )
        if rejected_by_pattern:
            return False

        # Parse the URL once for the domain and extension checks.
        parsed_url = urlparse(url)

        # Outside the allowed domains, inside a denied domain, or carrying a
        # denied extension.
        rejected_by_location = (
            (self.allow_domains
             and not url_is_from_any_domain(parsed_url, self.allow_domains))
            or (self.deny_domains
                and url_is_from_any_domain(parsed_url, self.deny_domains))
            or (self.deny_extensions
                and url_has_any_extension(parsed_url, self.deny_extensions))
        )
        # Nothing rejected the link: it is allowed.
        return not rejected_by_location
    def _link_allowed(self, link):
        _matches = lambda url, regexs: any(r.search(url) for r in regexs)
        _is_valid_url = lambda url: url.split('://', 1)[
            0] in {'http', 'https', 'file', 'ftp'}

        if not _is_valid_url(link.url):
            self.logger.warning(f"Not allowed: {link.url} // no valid url")
            return False
        if self.allow_res and not _matches(link.url, self.allow_res):
            self.logger.warning(
                f"Not allowed: {link.url} // does not match whitelist")
            return False
        if self.deny_res and _matches(link.url, self.deny_res):
            self.logger.warning(
                f"Not allowed: {link.url} // matches blacklist")
            return False
        parsed_url = urlparse(link.url)
        if self.allow_domains and not url_is_from_any_domain(
                parsed_url, self.allow_domains):
            self.logger.warning(
                f"Not allowed: {link.url} // domain not listed as allowed")
            return False
        if self.deny_domains and url_is_from_any_domain(
                parsed_url, self.deny_domains):
            self.logger.warning(
                f"Not allowed: {link.url} // domain is listed as denied")
            return False
        if self.deny_extensions and url_has_any_extension(
                parsed_url, self.deny_extensions):
            self.logger.warning(
                f"Not allowed: {link.url} // extension is denied")
            return False
        if self.restrict_text and not _matches(link.text, self.restrict_text):
            return False
        return True
Example #6
0
    def _process_links(self, links):
        """Filter links and rewrite the kept ones to product-review URLs.

        Links are first filtered by ``_is_valid_url`` and the configured
        allow/deny regex and domain rules.  The ASIN is then read from the
        sixth ``'/'``-separated piece of each URL; when
        ``self._ignore_identifier`` returns a falsy value for it, the link's
        URL is replaced by the product-reviews URL built from that ASIN and
        the link is kept.  Links whose URL has fewer than six pieces carry no
        ASIN and are skipped instead of raising ``IndexError``.

        Kept links are canonicalized when ``self.canonicalize`` is truthy and
        finally passed to ``BaseSgmlLinkExtractor._process_links``, whose
        result is returned.
        """
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [link for link in links if _matches(link.url, self.allow_res)]
        if self.deny_res:
            links = [link for link in links if not _matches(link.url, self.deny_res)]
        if self.allow_domains:
            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
        if self.deny_domains:
            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]

        new_links = []
        for link in links:
            segments = link.url.split('/')
            if len(segments) <= 5:
                # URL too short to contain an ASIN: nothing to rewrite.
                continue
            asin = segments[5]
            if not self._ignore_identifier(asin):
                log.msg("Found ASIN: " + asin, level=log.DEBUG)
                link.url = "http://www.amazon.com/product-reviews/" + asin + "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0"
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Example #7
0
    def matches(self, url):
        """Return True when *url* satisfies the domain and regex rules.

        The URL must come from ``self.allow_domains`` (when set), must not
        come from ``self.deny_domains`` (when set), must be found by at least
        one ``self.allow_res`` pattern (when set) and by none of the
        ``self.deny_res`` patterns.
        """
        outside_allowed = self.allow_domains and not url_is_from_any_domain(
            url, self.allow_domains)
        if outside_allowed:
            return False
        inside_denied = self.deny_domains and url_is_from_any_domain(
            url, self.deny_domains)
        if inside_denied:
            return False

        if self.allow_res:
            is_allowed = any(pattern.search(url) for pattern in self.allow_res)
        else:
            # No allow patterns configured: everything is allowed.
            is_allowed = True

        if self.deny_res:
            is_denied = any(pattern.search(url) for pattern in self.deny_res)
        else:
            is_denied = False

        return is_allowed and not is_denied
Example #8
0
    def matches(self, url):
        """Tell whether *url* is accepted by the domain and regex rules.

        Returns False as soon as one rule rejects the URL: not from an
        allowed domain, from a denied domain, found by no allow pattern, or
        found by a deny pattern.  Rules whose attribute is falsy are skipped.
        """
        if self.allow_domains:
            if not url_is_from_any_domain(url, self.allow_domains):
                return False
        if self.deny_domains:
            if url_is_from_any_domain(url, self.deny_domains):
                return False

        if self.allow_res and not any(rx.search(url) for rx in self.allow_res):
            return False
        if self.deny_res and any(rx.search(url) for rx in self.deny_res):
            return False
        return True
Example #9
0
    def __call__(self, requests):
        """Filter domains

        Return a generator over *requests* that keeps a request only when its
        URL is from one of the ``self.allow`` domains (when ``self.allow`` is
        set) and from none of the ``self.deny`` domains (when ``self.deny``
        is set).
        """
        processed = (req for req in requests)

        # Each filter consumes the previous stage so that the allow and deny
        # rules combine; iterating ``requests`` again in the deny stage would
        # silently drop the allow rule.
        if self.allow:
            processed = (req for req in processed
                         if url_is_from_any_domain(req.url, self.allow))
        if self.deny:
            processed = (req for req in processed
                         if not url_is_from_any_domain(req.url, self.deny))

        return processed
Example #10
0
    def __call__(self, requests):
        """Filter domains

        Lazily yields the requests that pass both rules: URL from a domain in
        ``self.allow`` (skipped when ``self.allow`` is falsy) and URL not from
        a domain in ``self.deny`` (skipped when ``self.deny`` is falsy).
        """
        processed = (req for req in requests)

        # Chain the stages: the deny filter must read the output of the allow
        # filter, otherwise setting both would apply only the deny rule.
        if self.allow:
            processed = (req for req in processed
                            if url_is_from_any_domain(req.url, self.allow))
        if self.deny:
            processed = (req for req in processed
                            if not url_is_from_any_domain(req.url, self.deny))

        return processed
Example #11
0
 def _link_allowed(self, link):
     """Return True when *link* is not rejected by any configured filter.

     The raw URL goes through ``_is_valid_url`` and the allow/deny regexes;
     the parsed URL then goes through the domain and extension filters.
     Filters whose attribute is falsy are skipped.
     """
     url = link.url
     rejected_by_pattern = (
         not _is_valid_url(url)
         or (self.allow_res and not _matches(url, self.allow_res))
         or (self.deny_res and _matches(url, self.deny_res))
     )
     if rejected_by_pattern:
         return False

     # Parsing is deferred until the cheaper string checks have passed.
     parsed_url = urlparse(url)
     rejected_by_location = (
         (self.allow_domains
          and not url_is_from_any_domain(parsed_url, self.allow_domains))
         or (self.deny_domains
             and url_is_from_any_domain(parsed_url, self.deny_domains))
         or (self.deny_extensions
             and url_has_any_extension(parsed_url, self.deny_extensions))
     )
     return not rejected_by_location
Example #12
0
 def _link_allowed(self, link):
     """Decide whether *link* may be followed.

     Returns False when the URL fails ``_is_valid_url``, misses the allow
     regexes, hits a deny regex, lies outside the allowed domains, lies
     inside a denied domain, or has a denied extension; True otherwise.
     Unset filters are not applied.
     """
     url = link.url
     passes_patterns = (
         _is_valid_url(url)
         and (not self.allow_res or _matches(url, self.allow_res))
         and not (self.deny_res and _matches(url, self.deny_res))
     )
     if not passes_patterns:
         return False

     parsed_url = urlparse(url)
     passes_domains = (
         (not self.allow_domains
          or url_is_from_any_domain(parsed_url, self.allow_domains))
         and not (self.deny_domains
                  and url_is_from_any_domain(parsed_url, self.deny_domains))
     )
     if not passes_domains:
         return False

     has_denied_extension = self.deny_extensions and url_has_any_extension(
         parsed_url, self.deny_extensions)
     return not has_denied_extension
Example #13
0
 def _link_allowed(self, link):
     """Return whether *link* passes all filters, canonicalizing it if so.

     Every configured filter is evaluated (there is no early exit) and the
     link is allowed only when all of them agree.  When the link is allowed
     and ``self.canonicalize`` is truthy, ``link.url`` is replaced by
     ``canonicalize_url(parsed_url)``.
     """
     parsed_url = urlparse(link.url)

     # One verdict per active filter, collected in evaluation order.
     verdicts = [_is_valid_url(link.url)]
     if self.allow_res:
         verdicts.append(_matches(link.url, self.allow_res))
     if self.deny_res:
         verdicts.append(not _matches(link.url, self.deny_res))
     if self.allow_domains:
         verdicts.append(url_is_from_any_domain(parsed_url, self.allow_domains))
     if self.deny_domains:
         verdicts.append(
             not url_is_from_any_domain(parsed_url, self.deny_domains))
     if self.deny_extensions:
         verdicts.append(
             not url_has_any_extension(parsed_url, self.deny_extensions))

     allowed = all(verdicts)
     if allowed and self.canonicalize:
         link.url = canonicalize_url(parsed_url)
     return allowed
Example #14
0
 def _link_allowed(self, link):
     """Check *link* against every filter and canonicalize it when allowed.

     The verdict starts from ``_is_valid_url`` and is cleared by any
     configured filter that rejects the URL; all configured filters are
     evaluated regardless of earlier verdicts.  An allowed link has its
     ``url`` replaced by ``canonicalize_url(parsed_url)`` when
     ``self.canonicalize`` is truthy.
     """
     url = link.url
     parsed_url = urlparse(url)

     allowed = _is_valid_url(url)
     if self.allow_res and not _matches(url, self.allow_res):
         allowed = False
     if self.deny_res and _matches(url, self.deny_res):
         allowed = False
     if self.allow_domains and not url_is_from_any_domain(
             parsed_url, self.allow_domains):
         allowed = False
     if self.deny_domains and url_is_from_any_domain(
             parsed_url, self.deny_domains):
         allowed = False
     if self.deny_extensions and url_has_any_extension(
             parsed_url, self.deny_extensions):
         allowed = False

     if allowed and self.canonicalize:
         link.url = canonicalize_url(parsed_url)
     return allowed
 def process_spider_output(self, response, result, spider):
     """Yield the items of *result*, dropping requests with a foreign feed URL.

     Anything that is not a ``Request`` is passed through.  A ``Request`` is
     dropped only when its ``meta['feed_url']`` is truthy and is not from any
     of ``self.domains``.
     """
     for entry in result:
         if isinstance(entry, Request):
             feed_url = entry.meta.get('feed_url')
             if feed_url and not url_is_from_any_domain(feed_url, self.domains):
                 continue
         yield entry
Example #16
0
def filter_by_domains(ls, exclude=False):
    """Build a URL predicate from ``(key, op, val)`` filter triples.

    Only ``('domain', 'under', val)`` triples are used; any other triple is
    reported with ``log.warning`` and ignored.  The returned callable takes a
    URL and returns ``url_is_from_any_domain(url, domains) ^ exclude``, so
    ``exclude=True`` inverts the match.
    """
    domains = []
    for key, op, val in ls:
        if key == 'domain' and op == 'under':
            domains.append(val)
        else:
            log.warning(f'Unknown filter {key} {op}')

    def predicate(u):
        return url_is_from_any_domain(u, domains) ^ exclude

    return predicate
Example #17
0
    def _process_links(self, links):
        """Apply the URL filters in sequence, then delegate to the base class.

        The stages run in a fixed order: ``_is_valid_url``, allow regexes,
        deny regexes, allowed domains, denied domains.  A stage whose
        configuration attribute is falsy is skipped.  Remaining links are
        canonicalized when ``self.canonicalize`` is truthy and passed to
        ``BaseSgmlLinkExtractor._process_links``, whose result is returned.
        """
        # (enabled, accepts-url) pairs, one per filtering stage.
        stages = (
            (True, lambda url: _is_valid_url(url)),
            (self.allow_res, lambda url: _matches(url, self.allow_res)),
            (self.deny_res, lambda url: not _matches(url, self.deny_res)),
            (self.allow_domains,
             lambda url: url_is_from_any_domain(url, self.allow_domains)),
            (self.deny_domains,
             lambda url: not url_is_from_any_domain(url, self.deny_domains)),
        )
        for enabled, accepts in stages:
            if enabled:
                links = [link for link in links if accepts(link.url)]

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        return BaseSgmlLinkExtractor._process_links(self, links)
    def matches(self, url):
        """Return whether *url* passes the domain and ranged-regex rules.

        The URL is rejected when it is not from ``self.allow_domains`` (if
        set) or is from ``self.deny_domains`` (if set).  Otherwise the allow
        and deny verdicts come from ``qualified_range_match`` called with the
        respective regexes and range; with no allow regexes the URL counts as
        allowed, with no deny regexes it counts as not denied.

        The verdicts are printed to stdout before returning
        ``allowed and not denied``.
        """
        if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
            return False
        if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
            return False

        if self.allow_res:
            allowed = qualified_range_match(url, self.allow_res, self.allow_range)
        else:
            allowed = True

        if self.deny_res:
            denied = qualified_range_match(url, self.deny_res, self.deny_range)
        else:
            denied = False

        # Parenthesised single-argument form: valid on Python 3 and prints the
        # same text under Python 2's print statement.
        print("matches(%s) allowed=%s, denied=%s" % (url, allowed, denied))
        return allowed and not denied
Example #19
0
 def process_request(self, request, spider):
     """Attach the stored ``Authorization`` header to matching requests.

     Does nothing when the instance has no truthy ``auth`` attribute or the
     request already carries an ``Authorization`` header.  While
     ``self.domain_unset`` is truthy, the hostname of the current request is
     stored as ``self.domain``.  The header is set when ``self.domain`` is
     falsy or the request URL is from that domain.
     """
     credentials = getattr(self, 'auth', None)
     if not credentials or b'Authorization' in request.headers:
         return

     hostname = urlparse_cached(request).hostname
     if self.domain_unset:
         # First authenticated request pins the domain.
         self.domain = hostname
         self.domain_unset = False

     applies = not self.domain or url_is_from_any_domain(
         request.url, [self.domain])
     if applies:
         request.headers[b'Authorization'] = credentials
Example #20
0
    def test_url_is_from_any_domain(self):
        """Check ``url_is_from_any_domain`` on hosts, case, ports and non-HTTP URLs."""
        js_url = "javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29"

        # (url, domains, expected result)
        cases = [
            ("http://www.wheele-bin-art.co.uk/get/product/123", ["wheele-bin-art.co.uk"], True),
            ("http://www.wheele-bin-art.co.uk/get/product/123", ["art.co.uk"], False),
            ("http://wheele-bin-art.co.uk/get/product/123", ["wheele-bin-art.co.uk"], True),
            ("http://wheele-bin-art.co.uk/get/product/123", ["art.co.uk"], False),
            ("http://www.Wheele-Bin-Art.co.uk/get/product/123", ["wheele-bin-art.CO.UK"], True),
            ("http://www.Wheele-Bin-Art.co.uk/get/product/123", ["WHEELE-BIN-ART.CO.UK"], True),
            ("http://192.169.0.15:8080/mypage.html", ["192.169.0.15:8080"], True),
            ("http://192.169.0.15:8080/mypage.html", ["192.169.0.15"], False),
            (js_url, ["testdomain.com"], False),
            (js_url + ".testdomain.com", ["testdomain.com"], False),
        ]
        for url, domains, expected in cases:
            check = self.assertTrue if expected else self.assertFalse
            check(url_is_from_any_domain(url, domains))
Example #21
0
    def test_url_is_from_any_domain(self):
        """Exercise ``url_is_from_any_domain`` URL by URL.

        Each entry lists a URL, the domains it must be reported as coming
        from, and the domains it must not.
        """
        js_url = 'javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29'

        expectations = [
            ('http://www.wheele-bin-art.co.uk/get/product/123',
             ['wheele-bin-art.co.uk'], ['art.co.uk']),
            ('http://wheele-bin-art.co.uk/get/product/123',
             ['wheele-bin-art.co.uk'], ['art.co.uk']),
            ('http://www.Wheele-Bin-Art.co.uk/get/product/123',
             ['wheele-bin-art.CO.UK', 'WHEELE-BIN-ART.CO.UK'], []),
            ('http://192.169.0.15:8080/mypage.html',
             ['192.169.0.15:8080'], ['192.169.0.15']),
            (js_url, [], ['testdomain.com']),
            (js_url+'.testdomain.com', [], ['testdomain.com']),
        ]
        for url, matching, non_matching in expectations:
            for domain in matching:
                self.assertTrue(url_is_from_any_domain(url, [domain]))
            for domain in non_matching:
                self.assertFalse(url_is_from_any_domain(url, [domain]))
Example #22
0
    def matches(self, url):
        """Return whether *url* passes the domain and ranged-regex rules.

        Domain rules are checked first: the URL must be from
        ``self.allow_domains`` and not from ``self.deny_domains`` (each rule
        is skipped when its attribute is falsy).  The allow and deny verdicts
        then come from ``qualified_range_match`` with the matching regexes
        and range; missing allow regexes mean "allowed", missing deny regexes
        mean "not denied".

        The verdicts are printed to stdout before returning
        ``allowed and not denied``.
        """
        if self.allow_domains and not url_is_from_any_domain(
                url, self.allow_domains):
            return False
        if self.deny_domains and url_is_from_any_domain(
                url, self.deny_domains):
            return False

        if self.allow_res:
            allowed = qualified_range_match(url, self.allow_res,
                                            self.allow_range)
        else:
            allowed = True

        if self.deny_res:
            denied = qualified_range_match(url, self.deny_res, self.deny_range)
        else:
            denied = False

        # Call form of print: the statement form is a SyntaxError on Python 3,
        # while this prints identical text on Python 2 as well.
        print("matches(%s) allowed=%s, denied=%s" % (url, allowed, denied))
        return allowed and not denied
Example #23
0
 def _link_allowed(self, link):
     """Return True when *link* passes every configured filter.

     The raw URL is checked first, then the parsed URL, then the link text
     against ``self.restrict_text``.  Filters whose attribute is falsy are
     skipped.
     """
     url = link.url

     # Validity check, then the ``allow`` and ``deny`` regex lists.
     fails_patterns = (
         not _is_valid_url(url)
         or (self.allow_res and not _matches(url, self.allow_res))
         or (self.deny_res and _matches(url, self.deny_res))
     )
     if fails_patterns:
         return False

     # Parse the URL for the domain and extension filters.
     parsed_url = urlparse(url)
     fails_location = (
         # Not within allow_domains.
         (self.allow_domains
          and not url_is_from_any_domain(parsed_url, self.allow_domains))
         # Within deny_domains.
         or (self.deny_domains
             and url_is_from_any_domain(parsed_url, self.deny_domains))
         # Extension is one of the ignored ones.
         or (self.deny_extensions
             and url_has_any_extension(parsed_url, self.deny_extensions))
     )
     if fails_location:
         return False

     # Link text must match restrict_text when it is configured.
     fails_text = self.restrict_text and not _matches(
         link.text, self.restrict_text)
     return not fails_text
Example #24
0
    def _process_links(self, links):
        """Filter links and point the kept ones at their product-review page.

        After the ``_is_valid_url`` check and the allow/deny regex and domain
        rules, the ASIN is taken from the sixth ``'/'``-separated piece of
        each URL.  A link is kept, with its URL rewritten to the
        product-reviews URL for that ASIN, when ``self._ignore_identifier``
        returns a falsy value for the ASIN.  URLs with fewer than six pieces
        have no ASIN and are skipped rather than raising ``IndexError``.

        Kept links are canonicalized when ``self.canonicalize`` is truthy and
        then passed to ``BaseSgmlLinkExtractor._process_links``, whose result
        is returned.
        """
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [
                link for link in links if _matches(link.url, self.allow_res)
            ]
        if self.deny_res:
            links = [
                link for link in links if not _matches(link.url, self.deny_res)
            ]
        if self.allow_domains:
            links = [
                link for link in links
                if url_is_from_any_domain(link.url, self.allow_domains)
            ]
        if self.deny_domains:
            links = [
                link for link in links
                if not url_is_from_any_domain(link.url, self.deny_domains)
            ]

        new_links = []
        for link in links:
            segments = link.url.split('/')
            if len(segments) <= 5:
                # URL too short to contain an ASIN: nothing to rewrite.
                continue
            asin = segments[5]
            if not self._ignore_identifier(asin):
                log.msg("Found ASIN: " + asin, level=log.DEBUG)
                link.url = "http://www.amazon.com/product-reviews/" + asin + "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0"
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Example #25
0
    def _process_links(self, links):
        """Filter links, keeping those whose customer id is not ignored.

        After the ``_is_valid_url`` check and the allow/deny regex and domain
        rules, the customer id is taken from the seventh ``'/'``-separated
        piece of each URL and the link is kept when
        ``self._ignore_identifier`` returns a falsy value for it.  URLs with
        fewer than seven pieces have no id and are skipped rather than
        raising ``IndexError``.

        Kept links are canonicalized when ``self.canonicalize`` is truthy and
        then passed to ``BaseSgmlLinkExtractor._process_links``, whose result
        is returned.
        """
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [
                link for link in links if _matches(link.url, self.allow_res)
            ]
        if self.deny_res:
            links = [
                link for link in links if not _matches(link.url, self.deny_res)
            ]
        if self.allow_domains:
            links = [
                link for link in links
                if url_is_from_any_domain(link.url, self.allow_domains)
            ]
        if self.deny_domains:
            links = [
                link for link in links
                if not url_is_from_any_domain(link.url, self.deny_domains)
            ]

        new_links = []
        for link in links:
            segments = link.url.split('/')
            if len(segments) <= 6:
                # URL too short to contain a customer id: nothing to check.
                continue
            customer_id = segments[6]
            if not self._ignore_identifier(customer_id):
                log.msg("Found CustomerId: " + customer_id, level=log.DEBUG)
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Example #26
0
    def parse_item(self, response):
        """Record the URL of *response* and stop the spider at the page limit.

        Responses with a 3xx status, or whose URL is not from
        ``self.allowed_domains``, are ignored.  Otherwise the URL (without
        trailing slashes) is added to ``self.scraped_urls``; progress is
        logged every 100 URLs and ``CloseSpider('page_limit')`` is raised
        once ``self.limit`` URLs have been collected.
        """
        # Don't log redirects
        if 300 <= response.status < 400:
            return None

        if not url_is_from_any_domain(response.url, self.allowed_domains):
            return None

        self.scraped_urls.add(response.url.rstrip('/'))

        scraped_count = len(self.scraped_urls)
        if scraped_count % 100 == 0:
            self.log("%s URLs scraped" % (scraped_count,), level=log.INFO)
        if scraped_count >= self.limit:
            raise CloseSpider('page_limit')
Example #27
0
    def test_url_is_from_any_domain(self):
        """Check domain matching with and without ``www``, and on a ``javascript:`` URL."""
        http_urls = (
            'http://www.wheele-bin-art.co.uk/get/product/123',
            'http://wheele-bin-art.co.uk/get/product/123',
        )
        for url in http_urls:
            self.assertTrue(url_is_from_any_domain(url, ['wheele-bin-art.co.uk']))
            self.assertFalse(url_is_from_any_domain(url, ['art.co.uk']))

        js_url = 'javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29'
        for candidate in (js_url, js_url+'.testdomain.com'):
            self.assertFalse(url_is_from_any_domain(candidate, ['testdomain.com']))
Example #28
0
    def test_url_is_from_any_domain(self):
        """Verify ``url_is_from_any_domain`` for matching, partial and non-HTTP cases."""
        def assert_from(url, domain):
            self.assertTrue(url_is_from_any_domain(url, [domain]))

        def assert_not_from(url, domain):
            self.assertFalse(url_is_from_any_domain(url, [domain]))

        with_www = 'http://www.wheele-bin-art.co.uk/get/product/123'
        assert_from(with_www, 'wheele-bin-art.co.uk')
        assert_not_from(with_www, 'art.co.uk')

        without_www = 'http://wheele-bin-art.co.uk/get/product/123'
        assert_from(without_www, 'wheele-bin-art.co.uk')
        assert_not_from(without_www, 'art.co.uk')

        js_url = 'javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29'
        assert_not_from(js_url, 'testdomain.com')
        assert_not_from(js_url+'.testdomain.com', 'testdomain.com')
Example #29
0
    def test_url_is_from_any_domain(self):
        """Table-driven check of ``url_is_from_any_domain``."""
        js_url = "javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29"

        # (url, domains, expected result)
        cases = (
            ("http://www.wheele-bin-art.co.uk/get/product/123", ["wheele-bin-art.co.uk"], True),
            ("http://www.wheele-bin-art.co.uk/get/product/123", ["art.co.uk"], False),
            ("http://wheele-bin-art.co.uk/get/product/123", ["wheele-bin-art.co.uk"], True),
            ("http://wheele-bin-art.co.uk/get/product/123", ["art.co.uk"], False),
            (js_url, ["testdomain.com"], False),
            (js_url + ".testdomain.com", ["testdomain.com"], False),
        )
        for url, domains, expected in cases:
            outcome = url_is_from_any_domain(url, domains)
            if expected:
                self.assertTrue(outcome)
            else:
                self.assertFalse(outcome)
Example #30
0
 def _check_link_domains(self, parsed_url, allow_domains, deny_domains):
     """Return True when *parsed_url* satisfies both domain lists.

     The URL must be from one of *allow_domains* (when that list is truthy)
     and from none of *deny_domains* (when that list is truthy).
     """
     outside_allowed = allow_domains and not url_is_from_any_domain(
         parsed_url, allow_domains)
     if outside_allowed:
         return False
     inside_denied = deny_domains and url_is_from_any_domain(
         parsed_url, deny_domains)
     return not inside_denied