def _process_links(self, links):
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [
            link for link in links if _matches(link.url, self.allow_res)
        ]
    if self.deny_res:
        links = [
            link for link in links if not _matches(link.url, self.deny_res)
        ]
    if self.allow_domains:
        links = [
            link for link in links
            if url_is_from_any_domain(link.url, self.allow_domains)
        ]
    if self.deny_domains:
        links = [
            link for link in links
            if not url_is_from_any_domain(link.url, self.deny_domains)
        ]
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
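# The _process_links/_link_allowed variants in this section assume two
# module-level helpers, _matches and _is_valid_url. A minimal sketch of them,
# matching the inline lambda definitions in one of the _link_allowed variants
# further below (the accepted scheme set may vary by Scrapy version):
_matches = lambda url, regexs: any(r.search(url) for r in regexs)
_is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}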
def extract_links(self, response):
    if self.restrict_xpaths:
        hxs = HtmlXPathSelector(response)
        html_slice = ''.join(
            ''.join(html_fragm for html_fragm in hxs.select(xpath_expr).extract())
            for xpath_expr in self.restrict_xpaths
        )
        links = self._extract_links(html_slice, response.url, response.encoding)
    else:
        links = BaseSgmlLinkExtractor.extract_links(self, response)
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    return links
def _process_links(self, links):
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
    new_links = []
    for link in links:
        CustomerId = link.url.split('/')[6]
        if not self._ignore_identifier(CustomerId):
            log.msg("Found CustomerId: " + CustomerId, level=log.DEBUG)
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def _link_allowed(self, link):
    if not _is_valid_url(link.url):  # invalid URL
        return False
    if self.allow_res and not _matches(link.url, self.allow_res):  # not in the allow rules
        return False
    if self.deny_res and _matches(link.url, self.deny_res):  # in the deny rules
        return False
    # parse the URL
    parsed_url = urlparse(link.url)
    if self.allow_domains and not url_is_from_any_domain(
            parsed_url, self.allow_domains):  # not in the allowed domains
        return False
    if self.deny_domains and url_is_from_any_domain(
            parsed_url, self.deny_domains):  # in the denied domains
        return False
    if self.deny_extensions and url_has_any_extension(
            parsed_url, self.deny_extensions):  # has a denied file extension
        return False
    # allowed
    return True
def _link_allowed(self, link):
    _matches = lambda url, regexs: any(r.search(url) for r in regexs)
    _is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}
    if not _is_valid_url(link.url):
        self.logger.warning(f"Not allowed: {link.url} // no valid url")
        return False
    if self.allow_res and not _matches(link.url, self.allow_res):
        self.logger.warning(f"Not allowed: {link.url} // does not match whitelist")
        return False
    if self.deny_res and _matches(link.url, self.deny_res):
        self.logger.warning(f"Not allowed: {link.url} // matches blacklist")
        return False
    parsed_url = urlparse(link.url)
    if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):
        self.logger.warning(f"Not allowed: {link.url} // domain not listed as allowed")
        return False
    if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):
        self.logger.warning(f"Not allowed: {link.url} // domain is listed as denied")
        return False
    if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
        self.logger.warning(f"Not allowed: {link.url} // extension is denied")
        return False
    if self.restrict_text and not _matches(link.text, self.restrict_text):
        return False
    return True
def _process_links(self, links):
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
    new_links = []
    for link in links:
        ASIN = link.url.split('/')[5]
        if not self._ignore_identifier(ASIN):
            log.msg("Found ASIN: " + ASIN, level=log.DEBUG)
            link.url = "http://www.amazon.com/product-reviews/" + ASIN + "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0"
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def matches(self, url):
    if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
        return False
    if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
        return False
    allowed = [regex.search(url) for regex in self.allow_res] if self.allow_res else [True]
    denied = [regex.search(url) for regex in self.deny_res] if self.deny_res else []
    return any(allowed) and not any(denied)
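# Self-contained illustration of the allow/deny regex semantics used by
# matches() above (a standalone sketch with made-up patterns, not the
# extractor's actual configuration):
import re

allow_res = [re.compile(r'/product/')]
deny_res = [re.compile(r'\.pdf$')]

def demo_matches(url):
    # Empty allow list means "allow everything"; empty deny list denies nothing.
    allowed = any(r.search(url) for r in allow_res) if allow_res else True
    denied = any(r.search(url) for r in deny_res) if deny_res else False
    return allowed and not denied

assert demo_matches('http://example.com/product/1')
assert not demo_matches('http://example.com/product/manual.pdf')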
def __call__(self, requests):
    """Filter requests by allowed/denied domains."""
    processed = (req for req in requests)
    # Chain each filter off `processed`, not `requests`, so that the allow
    # filter is not discarded when a deny list is also configured.
    if self.allow:
        processed = (req for req in processed
                     if url_is_from_any_domain(req.url, self.allow))
    if self.deny:
        processed = (req for req in processed
                     if not url_is_from_any_domain(req.url, self.deny))
    return processed
def _link_allowed(self, link):
    if not _is_valid_url(link.url):
        return False
    if self.allow_res and not _matches(link.url, self.allow_res):
        return False
    if self.deny_res and _matches(link.url, self.deny_res):
        return False
    parsed_url = urlparse(link.url)
    if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):
        return False
    if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):
        return False
    if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
        return False
    return True
def _link_allowed(self, link):
    parsed_url = urlparse(link.url)
    allowed = _is_valid_url(link.url)
    if self.allow_res:
        allowed &= _matches(link.url, self.allow_res)
    if self.deny_res:
        allowed &= not _matches(link.url, self.deny_res)
    if self.allow_domains:
        allowed &= url_is_from_any_domain(parsed_url, self.allow_domains)
    if self.deny_domains:
        allowed &= not url_is_from_any_domain(parsed_url, self.deny_domains)
    if self.deny_extensions:
        allowed &= not url_has_any_extension(parsed_url, self.deny_extensions)
    if allowed and self.canonicalize:
        link.url = canonicalize_url(parsed_url)
    return allowed
def process_spider_output(self, response, result, spider):
    for r in result:
        if not isinstance(r, Request):
            yield r
            continue
        feed_url = r.meta.get('feed_url')
        if not feed_url or url_is_from_any_domain(feed_url, self.domains):
            yield r
def filter_by_domains(ls, exclude=False):
    domains = []
    for key, op, val in ls:
        if key != 'domain' or op != 'under':
            log.warning(f'Unknown filter {key} {op}')
            continue
        domains.append(val)
    return lambda u: url_is_from_any_domain(u, domains) ^ exclude
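# Example use of filter_by_domains() above (hypothetical filter values;
# assumes url_is_from_any_domain from scrapy.utils.url is in scope):
pred = filter_by_domains([('domain', 'under', 'example.com')])
assert pred('http://shop.example.com/item')  # subdomain is "under" example.com
assert not pred('http://other.org/item')
# exclude=True inverts the predicate via the XOR in the returned lambda:
assert filter_by_domains([('domain', 'under', 'example.com')],
                         exclude=True)('http://other.org/item')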
def _process_links(self, links):
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [link for link in links if _matches(link.url, self.allow_res)]
    if self.deny_res:
        links = [link for link in links if not _matches(link.url, self.deny_res)]
    if self.allow_domains:
        links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
    if self.deny_domains:
        links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def matches(self, url):
    if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
        return False
    if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
        return False
    if self.allow_res:
        allowed = qualified_range_match(url, self.allow_res, self.allow_range)
    else:
        allowed = True
    if self.deny_res:
        denied = qualified_range_match(url, self.deny_res, self.deny_range)
    else:
        denied = False
    print("matches(%s) allowed=%s, denied=%s" % (url, allowed, denied))
    return allowed and not denied
def process_request(self, request, spider):
    auth = getattr(self, 'auth', None)
    if auth and b'Authorization' not in request.headers:
        domain = urlparse_cached(request).hostname
        if self.domain_unset:
            self.domain = domain
            self.domain_unset = False
        if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
            request.headers[b'Authorization'] = auth
def test_url_is_from_any_domain(self): url = "http://www.wheele-bin-art.co.uk/get/product/123" self.assertTrue(url_is_from_any_domain(url, ["wheele-bin-art.co.uk"])) self.assertFalse(url_is_from_any_domain(url, ["art.co.uk"])) url = "http://wheele-bin-art.co.uk/get/product/123" self.assertTrue(url_is_from_any_domain(url, ["wheele-bin-art.co.uk"])) self.assertFalse(url_is_from_any_domain(url, ["art.co.uk"])) url = "http://www.Wheele-Bin-Art.co.uk/get/product/123" self.assertTrue(url_is_from_any_domain(url, ["wheele-bin-art.CO.UK"])) self.assertTrue(url_is_from_any_domain(url, ["WHEELE-BIN-ART.CO.UK"])) url = "http://192.169.0.15:8080/mypage.html" self.assertTrue(url_is_from_any_domain(url, ["192.169.0.15:8080"])) self.assertFalse(url_is_from_any_domain(url, ["192.169.0.15"])) url = "javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29" self.assertFalse(url_is_from_any_domain(url, ["testdomain.com"])) self.assertFalse(url_is_from_any_domain(url + ".testdomain.com", ["testdomain.com"]))
def test_url_is_from_any_domain(self):
    url = 'http://www.wheele-bin-art.co.uk/get/product/123'
    self.assertTrue(url_is_from_any_domain(url, ['wheele-bin-art.co.uk']))
    self.assertFalse(url_is_from_any_domain(url, ['art.co.uk']))

    url = 'http://wheele-bin-art.co.uk/get/product/123'
    self.assertTrue(url_is_from_any_domain(url, ['wheele-bin-art.co.uk']))
    self.assertFalse(url_is_from_any_domain(url, ['art.co.uk']))

    url = 'http://www.Wheele-Bin-Art.co.uk/get/product/123'
    self.assertTrue(url_is_from_any_domain(url, ['wheele-bin-art.CO.UK']))
    self.assertTrue(url_is_from_any_domain(url, ['WHEELE-BIN-ART.CO.UK']))

    url = 'http://192.169.0.15:8080/mypage.html'
    self.assertTrue(url_is_from_any_domain(url, ['192.169.0.15:8080']))
    self.assertFalse(url_is_from_any_domain(url, ['192.169.0.15']))

    url = 'javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29'
    self.assertFalse(url_is_from_any_domain(url, ['testdomain.com']))
    self.assertFalse(url_is_from_any_domain(url + '.testdomain.com', ['testdomain.com']))
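# A sketch of url_is_from_any_domain consistent with the assertions above:
# host comparison is case-insensitive, subdomains match, the port stays part
# of the host (so '192.169.0.15:8080' matches only when the port is given),
# and scheme-only URLs such as javascript: never match. Scrapy's real
# implementation lives in scrapy.utils.url and, as the _link_allowed variants
# above rely on, also accepts an already-parsed URL.
from urllib.parse import urlparse

def url_is_from_any_domain(url, domains):
    host = urlparse(url).netloc.lower()
    if not host:
        return False  # e.g. javascript: URLs have no network location
    domains = [d.lower() for d in domains]
    return any(host == d or host.endswith('.' + d) for d in domains)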
def matches(self, url):
    if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
        return False
    if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
        return False
    if self.allow_res:
        allowed = qualified_range_match(url, self.allow_res, self.allow_range)
    else:
        allowed = True
    if self.deny_res:
        denied = qualified_range_match(url, self.deny_res, self.deny_range)
    else:
        denied = False
    print("matches(%s) allowed=%s, denied=%s" % (url, allowed, denied))
    return allowed and not denied
def _link_allowed(self, link):
    if not _is_valid_url(link.url):  # check for a valid scheme prefix (http etc.)
        return False
    if self.allow_res and not _matches(
            link.url, self.allow_res):  # if allow patterns are set, reject URLs matching none of them
        return False
    if self.deny_res and _matches(link.url, self.deny_res):  # likewise, reject URLs matching a deny pattern
        return False
    parsed_url = urlparse(link.url)  # parse the URL into its components
    if self.allow_domains and not url_is_from_any_domain(
            parsed_url, self.allow_domains):  # not in allow_domains
        return False
    if self.deny_domains and url_is_from_any_domain(
            parsed_url, self.deny_domains):  # in deny_domains
        return False
    if self.deny_extensions and url_has_any_extension(
            parsed_url, self.deny_extensions):  # has an ignored file extension
        return False
    if self.restrict_text and not _matches(
            link.text, self.restrict_text):  # link text does not match restrict_text
        return False
    return True
def _process_links(self, links):
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [
            link for link in links if _matches(link.url, self.allow_res)
        ]
    if self.deny_res:
        links = [
            link for link in links if not _matches(link.url, self.deny_res)
        ]
    if self.allow_domains:
        links = [
            link for link in links
            if url_is_from_any_domain(link.url, self.allow_domains)
        ]
    if self.deny_domains:
        links = [
            link for link in links
            if not url_is_from_any_domain(link.url, self.deny_domains)
        ]
    new_links = []
    for link in links:
        ASIN = link.url.split('/')[5]
        if not self._ignore_identifier(ASIN):
            log.msg("Found ASIN: " + ASIN, level=log.DEBUG)
            link.url = "http://www.amazon.com/product-reviews/" + ASIN + "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0"
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def _process_links(self, links):
    links = [link for link in links if _is_valid_url(link.url)]
    if self.allow_res:
        links = [
            link for link in links if _matches(link.url, self.allow_res)
        ]
    if self.deny_res:
        links = [
            link for link in links if not _matches(link.url, self.deny_res)
        ]
    if self.allow_domains:
        links = [
            link for link in links
            if url_is_from_any_domain(link.url, self.allow_domains)
        ]
    if self.deny_domains:
        links = [
            link for link in links
            if not url_is_from_any_domain(link.url, self.deny_domains)
        ]
    new_links = []
    for link in links:
        CustomerId = link.url.split('/')[6]
        if not self._ignore_identifier(CustomerId):
            log.msg("Found CustomerId: " + CustomerId, level=log.DEBUG)
            new_links.append(link)
    links = new_links
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = BaseSgmlLinkExtractor._process_links(self, links)
    return links
def parse_item(self, response):
    # Don't log redirects
    if 300 <= response.status < 400:
        return None
    if not url_is_from_any_domain(response.url, self.allowed_domains):
        return None
    self.scraped_urls.add(response.url.rstrip('/'))
    l = len(self.scraped_urls)
    if l % 100 == 0:
        self.log("%s URLs scraped" % (l,), level=log.INFO)
    if l >= self.limit:
        raise CloseSpider('page_limit')
def test_url_is_from_any_domain(self):
    url = 'http://www.wheele-bin-art.co.uk/get/product/123'
    self.assertTrue(url_is_from_any_domain(url, ['wheele-bin-art.co.uk']))
    self.assertFalse(url_is_from_any_domain(url, ['art.co.uk']))

    url = 'http://wheele-bin-art.co.uk/get/product/123'
    self.assertTrue(url_is_from_any_domain(url, ['wheele-bin-art.co.uk']))
    self.assertFalse(url_is_from_any_domain(url, ['art.co.uk']))

    url = 'javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29'
    self.assertFalse(url_is_from_any_domain(url, ['testdomain.com']))
    self.assertFalse(url_is_from_any_domain(url + '.testdomain.com', ['testdomain.com']))
def test_url_is_from_any_domain(self): url = "http://www.wheele-bin-art.co.uk/get/product/123" self.assertTrue(url_is_from_any_domain(url, ["wheele-bin-art.co.uk"])) self.assertFalse(url_is_from_any_domain(url, ["art.co.uk"])) url = "http://wheele-bin-art.co.uk/get/product/123" self.assertTrue(url_is_from_any_domain(url, ["wheele-bin-art.co.uk"])) self.assertFalse(url_is_from_any_domain(url, ["art.co.uk"])) url = "javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29" self.assertFalse(url_is_from_any_domain(url, ["testdomain.com"])) self.assertFalse(url_is_from_any_domain(url + ".testdomain.com", ["testdomain.com"]))
def _check_link_domains(self, parsed_url, allow_domains, deny_domains):
    if allow_domains and not url_is_from_any_domain(parsed_url, allow_domains):
        return False
    if deny_domains and url_is_from_any_domain(parsed_url, deny_domains):
        return False
    return True
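# Usage sketch for _check_link_domains() above (hypothetical `extractor`
# instance; passing a parsed result works because url_is_from_any_domain
# accepts parsed URLs as well as strings, as the _link_allowed variants
# above rely on):
from urllib.parse import urlparse

parsed = urlparse('http://docs.example.com/page')
extractor._check_link_domains(parsed, ['example.com'], [])  # True: subdomain of an allowed domain
extractor._check_link_domains(parsed, [], ['example.com'])  # False: domain is denied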