# Excerpted methods from wpull's HTML scraper. Module-level imports
# (itertools, re, wpull.url) and helpers (CSSScraper, LinkInfo,
# get_heading_encoding, parse_refresh, to_str) come from the surrounding
# wpull codebase and are not reproduced here.
def scrape(self, request, response):
    if not self.is_html(request, response):
        return

    content_file = response.body.content_file
    encoding = get_heading_encoding(response)

    tree = self.parse(content_file, encoding, request.url_info.url)
    root = tree.getroot()

    if root is None:
        return

    linked_urls = set()
    inline_urls = set()
    link_infos = self.iter_links(root)

    if 'Refresh' in response.fields:
        link = parse_refresh(response.fields['Refresh'])

        if link:
            link_info = LinkInfo(
                None, '_refresh', None,
                to_str(link),
                False, True,
                None,
                'refresh'
            )
            link_infos = itertools.chain(link_infos, [link_info])

    for scraped_link in link_infos:
        if self._only_relative:
            if scraped_link.base_link or '://' in scraped_link.link:
                continue

        if not self._is_accepted(scraped_link.tag):
            continue

        base_url = root.base_url

        if scraped_link.base_link:
            base_url = wpull.url.urljoin(base_url, scraped_link.base_link)

        url = wpull.url.urljoin(
            base_url, scraped_link.link, allow_fragments=False
        )

        # Browsers seem to tolerate URLs with newlines
        url = url.replace('\n', '').replace('\r', '')

        if scraped_link.inline:
            inline_urls.add(url)
        if scraped_link.linked:
            linked_urls.add(url)

    if self._robots and self._robots_cannot_follow(root):
        linked_urls.clear()

    return {
        'inline_urls': inline_urls,
        'linked_urls': linked_urls,
        'base_url': to_str(root.base_url),
        'encoding': to_str(root.getroottree().docinfo.encoding),
    }
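# The ``parse_refresh`` helper used above (and again in
# ``iter_links_meta_element`` below) extracts the URL from a Refresh
# value such as "5; url=http://example.com/". The sketch below is an
# illustrative reimplementation of that parsing, not wpull's actual
# parse_refresh; the name ``parse_refresh_sketch`` is made up here.
import re


def parse_refresh_sketch(text):
    '''Return the URL from a Refresh value, or None if there is none.'''
    if text is None:
        return None

    match = re.search(r'''url\s*=\s*(["']?)(.+?)\1\s*$''', text, re.IGNORECASE)

    if match:
        return match.group(2).strip()


assert parse_refresh_sketch('5; url=http://example.com/') == 'http://example.com/'
assert parse_refresh_sketch("0; URL='/next.html'") == '/next.html'
assert parse_refresh_sketch('5') is None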
@classmethod
def iter_links_element(cls, element):
    '''Iterate an HTML element.'''
    # reference: lxml.html.HtmlMixin.iterlinks()
    # NOTE: to_str is needed because on Python 2, only byte strings
    # are returned from lxml
    attrib = element.attrib
    tag = element.tag

    if tag == 'link':
        iterable = cls.iter_links_link_element(element)
    elif tag == 'meta':
        iterable = cls.iter_links_meta_element(element)
    elif tag in ('object', 'applet'):
        iterable = cls.iter_links_object_element(element)
    elif tag == 'param':
        iterable = cls.iter_links_param_element(element)
    elif tag == 'style':
        iterable = cls.iter_links_style_element(element)
    else:
        iterable = cls.iter_links_plain_element(element)

    for link_info in iterable:
        yield link_info

    if 'style' in attrib:
        for link in CSSScraper.scrape_urls(attrib['style']):
            yield LinkInfo(
                element, element.tag, 'style',
                to_str(link),
                True, False,
                None,
                'css'
            )
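# Illustrative usage of the dispatch above. ``HTMLScraper`` is assumed to
# be the enclosing class of these methods. A ``style`` attribute yields a
# CSS link in addition to whatever the per-tag rules find.
import lxml.html

example_element = lxml.html.fromstring(
    '<a href="/page.html" style="background: url(bg.png)">link</a>'
)

for example_info in HTMLScraper.iter_links_element(example_element):
    print(example_info.link, example_info.inline, example_info.linked)
# Expected: '/page.html' as a linked URL from the generic attribute rules,
# then 'bg.png' as an inline URL scraped from the style attribute.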
@classmethod
def iter_links_plain_element(cls, element):
    '''Iterate any element for links using generic rules.'''
    for attrib_name, link in cls.iter_links_by_attrib(element):
        inline = cls.is_link_inline(element.tag, attrib_name)
        linked = cls.is_html_link(element.tag, attrib_name)
        yield LinkInfo(
            element, element.tag, attrib_name,
            to_str(link),
            inline, linked,
            None,
            'plain'
        )
@classmethod
def iter_links_param_element(cls, element):
    '''Iterate a ``param`` element.'''
    valuetype = element.get('valuetype', '')

    if valuetype.lower() == 'ref' and 'value' in element.attrib:
        yield LinkInfo(
            element, element.tag, 'value',
            to_str(element.get('value')),
            True, False,
            None,
            'plain'
        )
@classmethod
def iter_links_style_element(cls, element):
    '''Iterate a ``style`` element.'''
    if element.text:
        link_iter = itertools.chain(
            CSSScraper.scrape_imports(element.text),
            CSSScraper.scrape_urls(element.text)
        )
        for link in link_iter:
            yield LinkInfo(
                element, element.tag, None,
                to_str(link),
                True, False,
                None,
                'css'
            )
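# ``CSSScraper.scrape_imports`` and ``CSSScraper.scrape_urls`` are not
# shown in this module. Rough regex-based stand-ins are sketched below;
# the real CSSScraper is likely more thorough (comments, escapes, and
# unquoted @import url(...) forms, for instance).
import re


def scrape_urls_sketch(text):
    '''Yield URLs from url(...) tokens. Stand-in for CSSScraper.scrape_urls.'''
    for match in re.finditer(r'''url\(\s*['"]?([^'")]+)['"]?\s*\)''', text):
        yield match.group(1).strip()


def scrape_imports_sketch(text):
    '''Yield URLs from quoted @import rules. Stand-in only.'''
    for match in re.finditer(r'''@import\s+['"]([^'"]+)['"]''', text):
        yield match.group(1)


example_css = "@import 'base.css'; body { background: url(\"img/bg.png\") }"
assert list(scrape_imports_sketch(example_css)) == ['base.css']
assert list(scrape_urls_sketch(example_css)) == ['img/bg.png']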
@classmethod
def iter_links_meta_element(cls, element):
    '''Iterate the ``meta`` element for links.

    This function handles refresh URLs.
    '''
    if element.get('http-equiv', '').lower() == 'refresh':
        content_value = element.get('content')
        link = parse_refresh(content_value)

        if link:
            yield LinkInfo(
                element, element.tag, 'http-equiv',
                to_str(link),
                False, True,
                None,
                'refresh'
            )
@classmethod
def iter_links_link_element(cls, element):
    '''Iterate a ``link`` element for URLs.

    This function handles stylesheets and icons in addition to
    standard scraping rules.
    '''
    rel = element.get('rel', '')
    inline = 'stylesheet' in rel or 'icon' in rel

    for attrib_name, link in cls.iter_links_by_attrib(element):
        yield LinkInfo(
            element, element.tag, attrib_name,
            to_str(link),
            inline, not inline,
            None,
            'plain'
        )
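# Quick check of the rel handling above (illustrative usage; HTMLScraper
# is the assumed enclosing class): rel="stylesheet" or rel="icon" marks
# the URL as an inline page requisite, anything else stays a linked URL.
import lxml.html

stylesheet_link = lxml.html.fromstring('<link rel="stylesheet" href="main.css">')
alternate_link = lxml.html.fromstring('<link rel="alternate" href="feed.xml">')

assert next(HTMLScraper.iter_links_link_element(stylesheet_link)).inline
assert not next(HTMLScraper.iter_links_link_element(alternate_link)).inline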
@classmethod
def iter_links_object_element(cls, element):
    '''Iterate ``object`` and ``embed`` elements.

    This function also looks at ``codebase`` and ``archive`` attributes.
    '''
    base_link = to_str(element.get('codebase', None))

    if base_link:
        # lxml returns codebase as inline
        yield LinkInfo(
            element, element.tag, 'codebase',
            base_link,
            True, False,
            None,
            'plain'
        )

    for attribute in ('code', 'src', 'classid', 'data'):
        if attribute in element.attrib:
            yield LinkInfo(
                element, element.tag, attribute,
                to_str(element.get(attribute)),
                True, False,
                base_link,
                'plain'
            )

    if 'archive' in element.attrib:
        for match in re.finditer(r'[^ ]+', element.get('archive')):
            value = match.group(0)
            yield LinkInfo(
                element, element.tag, 'archive',
                to_str(value),
                True, False,
                base_link,
                'list'
            )
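# Worked example for the rules above (illustrative usage; HTMLScraper is
# the assumed enclosing class, and the LinkInfo field names are inferred
# from the positional arguments used throughout this module):
import lxml.html

object_element = lxml.html.fromstring(
    '<object codebase="http://example.com/plugins/" '
    'data="player.swf" archive="a.jar b.jar"></object>'
)

for object_info in HTMLScraper.iter_links_object_element(object_element):
    print(object_info.attrib, object_info.link, object_info.base_link)
# Expected rows:
#   codebase  http://example.com/plugins/  None
#   data      player.swf                   http://example.com/plugins/
#   archive   a.jar                        http://example.com/plugins/
#   archive   b.jar                        http://example.com/plugins/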
def test_to_str(self):
    self.assertEqual('hi', to_str(b'hi'))
    self.assertEqual(['hi'], to_str([b'hi']))
    self.assertEqual({'hi': 'hello'}, to_str({b'hi': b'hello'}))
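# The test above pins down to_str's contract: decode byte strings,
# recursing through lists and dicts. A minimal sketch that satisfies it,
# assuming UTF-8 as the default encoding (the real to_str may handle
# more container types and encodings):
def to_str_sketch(instance, encoding='utf-8'):
    '''Recursively decode byte strings to text. Sketch only.'''
    if isinstance(instance, bytes):
        return instance.decode(encoding)
    elif isinstance(instance, list):
        return [to_str_sketch(item, encoding) for item in instance]
    elif isinstance(instance, dict):
        return {
            to_str_sketch(key, encoding): to_str_sketch(value, encoding)
            for key, value in instance.items()
        }
    return instance


assert to_str_sketch({b'hi': [b'hello']}) == {'hi': ['hello']}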