def parseItem(self, response):
    base = get_base_url(response)
    item = MirrorItem()
    meta = {}
    item['item'] = response.url
    yield item
    for img in response.xpath('//img/@src'):
        img = urljoin_rfc(base, img.extract())
        item['item'] = img
        yield item
    for js in response.xpath('//script/@src'):
        js = urljoin_rfc(base, js.extract())
        item['item'] = js
        yield item
    for css in response.xpath('//link/@href'):
        # extensions must be an iterable of dotted suffixes, e.g. ['.css']
        if url_has_any_extension(css.extract(), ['.css']):
            css = urljoin_rfc(base, css.extract())
            yield Request(url=css, meta=meta, callback=self.parseStyle)
        else:
            item['item'] = css.extract()
            yield item
def parse_html(self, response, lru):
    lrulinks = []
    # handle redirects
    realdepth = response.meta['depth']
    if 300 < response.status < 400:
        redir_url = response.headers['Location']
        if redir_url.startswith('/'):
            redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
        elif redir_url.startswith('./') or not redir_url.startswith('http'):
            redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
        links = [{'url': redir_url}]
        response.meta['depth'] -= 1
    else:
        try:
            links = self.link_extractor.extract_links(response)
        except Exception as e:
            self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR)
            links = []
            self.errors += 1
    for link in links:
        try:
            url = link.url
        except AttributeError:
            url = link['url']
        try:
            lrulink = url_to_lru_clean(url)
        except ValueError as e:
            self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR)
            continue
        lrulinks.append((url, lrulink))
        if self._should_follow(response.meta['depth'], lru, lrulink) and \
                not url_has_any_extension(url, self.ignored_exts):
            yield self._request(url)
def parse_html(self, response, lru):
    lrulinks = []
    # handle redirects
    realdepth = response.meta['depth']
    if 300 < response.status < 400:
        redir_url = response.headers['Location']
        if redir_url.startswith('/'):
            redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
        elif redir_url.startswith('./') or not redir_url.startswith('http'):
            redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
        links = [{'url': redir_url}]
        response.meta['depth'] -= 1
    else:
        links = self.link_extractor.extract_links(response)
    for link in links:
        try:
            url = link.url
        except AttributeError:
            url = link['url']
        try:
            lrulink = url_to_lru_clean(url)
        except ValueError as e:
            self.log("Error converting URL to LRU: %s" % e, log.ERROR)
            continue
        lrulinks.append(lrulink)
        if self._should_follow(response.meta['depth'], lru, lrulink) and \
                not url_has_any_extension(url, self.ignored_exts):
            yield self._request(url)
def _link_allowed(self, link):
    if not _is_valid_url(link.url):  # invalid URL
        return False
    if self.allow_res and not _matches(link.url, self.allow_res):  # matches none of the allow rules
        return False
    if self.deny_res and _matches(link.url, self.deny_res):  # matches a deny rule
        return False
    # parse the URL
    parsed_url = urlparse(link.url)
    if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):  # not in the allowed domains
        return False
    if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):  # in the denied domains
        return False
    if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):  # extension is in the deny list
        return False
    # allowed
    return True
def _link_allowed(self, link):
    _matches = lambda url, regexs: any(r.search(url) for r in regexs)
    _is_valid_url = lambda url: url.split('://', 1)[0] in {'http', 'https', 'file', 'ftp'}
    if not _is_valid_url(link.url):
        self.logger.warning(f"Not allowed: {link.url} // no valid url")
        return False
    if self.allow_res and not _matches(link.url, self.allow_res):
        self.logger.warning(f"Not allowed: {link.url} // does not match whitelist")
        return False
    if self.deny_res and _matches(link.url, self.deny_res):
        self.logger.warning(f"Not allowed: {link.url} // matches blacklist")
        return False
    parsed_url = urlparse(link.url)
    if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):
        self.logger.warning(f"Not allowed: {link.url} // domain not listed as allowed")
        return False
    if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):
        self.logger.warning(f"Not allowed: {link.url} // domain is listed as denied")
        return False
    if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
        self.logger.warning(f"Not allowed: {link.url} // extension is denied")
        return False
    if self.restrict_text and not _matches(link.text, self.restrict_text):
        return False
    return True
def parse_html(self, response, lru):
    depth = response.meta['depth']
    lrulinks = []
    for link in self.link_extractor.extract_links(response):
        try:
            lrulink = url_to_lru(link.url)
        except ValueError as e:
            self.log("Error converting URL to LRU: %s" % e, log.ERROR)
            continue
        lrulinks.append(lrulink)
        if self._should_follow(depth, lru, lrulink) and \
                not url_has_any_extension(link.url, self.ignored_exts):
            yield Request(link.url, callback=self.parse)
def _link_allowed(self, link):
    if not _is_valid_url(link.url):
        return False
    if not self._check_link_res(link, self.allow_res, self.deny_res):
        return False
    parsed_url = urlparse(link.url)
    if not self._check_link_domains(parsed_url, self.allow_domains, self.deny_domains):
        return False
    if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
        return False
    if self.restrict_text and not _matches(link.text, self.restrict_text):
        return False
    return True
def test_url_has_any_extension(self):
    deny_extensions = {'.' + e for e in arg_to_iter(IGNORED_EXTENSIONS)}
    self.assertTrue(url_has_any_extension("http://www.example.com/archive.tar.gz", deny_extensions))
    self.assertTrue(url_has_any_extension("http://www.example.com/page.doc", deny_extensions))
    self.assertTrue(url_has_any_extension("http://www.example.com/page.pdf", deny_extensions))
    self.assertFalse(url_has_any_extension("http://www.example.com/page.htm", deny_extensions))
    self.assertFalse(url_has_any_extension("http://www.example.com/", deny_extensions))
    self.assertFalse(url_has_any_extension("http://www.example.com/page.doc.html", deny_extensions))
def _link_allowed(self, link):
    if not _is_valid_url(link.url):
        return False
    if self.allow_res and not _matches(link.url, self.allow_res):
        return False
    if self.deny_res and _matches(link.url, self.deny_res):
        return False
    parsed_url = urlparse(link.url)
    if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):
        return False
    if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):
        return False
    if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):
        return False
    return True
def _link_allowed(self, link):
    parsed_url = urlparse(link.url)
    allowed = _is_valid_url(link.url)
    if self.allow_res:
        allowed &= _matches(link.url, self.allow_res)
    if self.deny_res:
        allowed &= not _matches(link.url, self.deny_res)
    if self.allow_domains:
        allowed &= url_is_from_any_domain(parsed_url, self.allow_domains)
    if self.deny_domains:
        allowed &= not url_is_from_any_domain(parsed_url, self.deny_domains)
    if self.deny_extensions:
        allowed &= not url_has_any_extension(parsed_url, self.deny_extensions)
    if allowed and self.canonicalize:
        link.url = canonicalize_url(parsed_url)
    return allowed
def parse_html(self, response, lru):
    lrulinks = []
    # handle redirects
    realdepth = response.meta['depth']
    if 300 < response.status < 400:
        redir_url = response.headers['Location']
        if redir_url.startswith('/'):
            redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
        elif redir_url.startswith('../'):
            lrustart = lru[:lru.rfind('|p:')]
            while redir_url.startswith('../'):
                lrustart = lrustart[:lrustart.rfind('|p:')]
                redir_url = redir_url[3:]
            redir_url = "%s/%s" % (lru_to_url(lrustart + '|'), redir_url)
        elif redir_url.startswith('./') or not redir_url.startswith('http'):
            redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
        links = [{'url': redir_url}]
        response.meta['depth'] -= 1
    else:
        try:
            links = self.link_extractor.extract_links(response)
        except Exception as e:
            self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), logging.ERROR)
            links = []
            self.errors += 1
    for link in links:
        try:
            url = link.url
        except AttributeError:
            url = link['url']
        try:
            lrulink = url_to_lru_clean(url, TLDS_TREE)
        except (ValueError, IndexError) as e:
            self.log("Error converting URL %s to LRU: %s" % (url, e), logging.ERROR)
            continue
        lrulinks.append((url, lrulink))
        if self._should_follow(response.meta['depth'], lrulink) and \
                not url_has_any_extension(url, self.ignored_exts):
            yield self._request(url)
    response.meta['depth'] = realdepth
    yield self._make_html_page(response, lru, lrulinks)
def parseStyle(self, response):
    base = get_base_url(response)
    item = MirrorItem()
    meta = {}
    item['item'] = response.url
    yield item
    if response.selector.re(r'url\((.*?)\)'):
        for src in response.selector.re(r'url\((.*?)\)'):
            src = src.strip("'").strip('"')
            if 'data:' not in src:
                src = urljoin_rfc(base, src)
                # extensions must be an iterable of dotted suffixes, e.g. ['.css']
                if url_has_any_extension(src, ['.css']):
                    yield Request(url=src, meta=meta, callback=self.parseStyle)
                else:
                    item['item'] = src
                    yield item
def _link_allowed(self, link):
    if not _is_valid_url(link.url):  # check for an http-style scheme prefix
        return False
    if self.allow_res and not _matches(link.url, self.allow_res):  # if allow patterns are set, reject URLs that match none of them
        return False
    if self.deny_res and _matches(link.url, self.deny_res):  # likewise, reject URLs that match a deny pattern
        return False
    parsed_url = urlparse(link.url)  # parse the URL into its components
    if self.allow_domains and not url_is_from_any_domain(parsed_url, self.allow_domains):  # reject if not in allow_domains
        return False
    if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):  # reject if in deny_domains
        return False
    if self.deny_extensions and url_has_any_extension(parsed_url, self.deny_extensions):  # reject if the extension is ignored
        return False
    if self.restrict_text and not _matches(link.text, self.restrict_text):  # reject if the link text does not match restrict_text
        return False
    return True
def _has_extension(self, url: str) -> bool:
    return url_has_any_extension(url, self.file_extensions)
def _has_extension(self, url):
    # type: (str) -> bool
    return url_has_any_extension(url, self.file_extensions)
def _extract_link_dicts(selector: Selector, base_url: str, only_urls: bool = False):
    """
    Extract dicts with link information::

        {
            'url': '<absolute URL>',
            'attrs': {'<attribute name>': '<value>', ...},
            'inside_text': '<text inside link>',
            # 'before_text': '<text preceding this link>',
        }

    If only_urls is true, extract only links as strings.

    Note that ``base_url`` argument must contain page base URL, which can be
    different from page URL. Use w3lib.html.get_base_url to get it::

        from w3lib.html import get_base_url
        base_url = get_base_url(html[:4096], page_url)
        links = list(extract_link_dicts(Selector(html), base_url))

    If you're using Scrapy, and Response object is available, then
    scrapy.utils.response.get_base_url should be faster::

        from scrapy.utils.response import get_base_url
        base_url = get_base_url(response)
        links = list(extract_link_dicts(response.selector, base_url))
    """
    selector.remove_namespaces()

    for a in selector.xpath('//a'):
        link = {}  # type: Dict

        attrs = a.root.attrib
        if 'href' not in attrs:
            continue

        href = strip_html5_whitespace(attrs['href'])
        if 'mailto:' in href:
            continue

        js_link = extract_js_link(href)
        if js_link:
            href = js_link
            link['js'] = True

        if href.startswith(('tel:', 'skype:', 'fb:', 'javascript:')):
            continue

        url = urljoin(base_url, href)
        if url_has_any_extension(url, _IGNORED):
            continue

        if only_urls:
            yield url
        else:
            link['url'] = url
            link['attrs'] = dict(attrs)

            link_text = a.xpath('normalize-space()').extract_first(default='')
            img_link_text = a.xpath('./img/@alt').extract_first(default='')
            link['inside_text'] = ' '.join([link_text, img_link_text]).strip()

            # TODO: fix before_text and add after_text
            # link['before_text'] = a.xpath('./preceding::text()[1]').extract_first(default='').strip()[-100:]

            yield link
def test_url_has_any_extension(self):
    self.assertTrue(url_has_any_extension("http://www.example.com/page.doc", IGNORED_EXTENSIONS))
    self.assertTrue(url_has_any_extension("http://www.example.com/page.pdf", IGNORED_EXTENSIONS))
    self.assertFalse(url_has_any_extension("http://www.example.com/page.htm", IGNORED_EXTENSIONS))
    self.assertFalse(url_has_any_extension("http://www.example.com/", IGNORED_EXTENSIONS))
    self.assertFalse(url_has_any_extension("http://www.example.com/page.doc.html", IGNORED_EXTENSIONS))
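All of the snippets above lean on w3lib's url_has_any_extension helper. As a quick orientation, here is a minimal standalone sketch of the call pattern they rely on (the URLs and the deny_extensions set are made up for illustration): the helper accepts either a URL string or an already-parsed URL plus an iterable of dotted extensions, and returns a bool.

from urllib.parse import urlparse

from w3lib.url import url_has_any_extension

deny_extensions = {'.pdf', '.zip', '.css'}  # hypothetical deny list

# Works with a plain URL string...
print(url_has_any_extension("http://example.com/report.pdf", deny_extensions))            # True
# ...and with an already-parsed URL, which is how the _link_allowed snippets call it.
print(url_has_any_extension(urlparse("http://example.com/index.html"), deny_extensions))  # False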