def scrape(self, request, response, link_type=None):
    '''Collect the links found in the body of a response.

    Returns ``None`` when ``is_supported`` rejects the request/response
    pair, or when a truthy ``link_type`` other than
    ``LinkType.javascript`` is requested. Otherwise returns a
    ``ScrapeResult`` built from the set of ``LinkContext`` objects that
    were collected and the encoding used to read the body.
    '''
    unsupported = not self.is_supported(request=request, response=response)

    if unsupported or (link_type and link_type != LinkType.javascript):
        return

    found = set()
    document_url = request.url_info.url
    encoding = self._encoding_override or detect_response_encoding(response)

    try:
        with wpull.util.reset_file_offset(response.body):
            processed = self.iter_processed_links(
                response.body, encoding, document_url, context=True
            )

            # Filled one item at a time (not with a comprehension) so the
            # links gathered before a decoding failure are still returned.
            for url, hint in processed:
                is_inline = is_likely_inline(url)
                # A context of ``True`` is recorded as "no link type".
                resolved_type = None if hint is True else hint

                found.add(LinkContext(
                    url,
                    inline=is_inline,
                    linked=not is_inline,
                    link_type=resolved_type,
                ))
    except UnicodeError as error:
        _logger.warning(__(
            _('Failed to read document at ‘{url}’: {error}'),
            url=request.url_info.url, error=error
        ))

    return ScrapeResult(found, encoding)
def iter_links_script_element(self, element):
    '''Yield the links of a ``script`` element.

    Links reported by the JavaScript scraper for the element text come
    first; they are produced only when a scraper is configured and the
    element has text. Everything ``iter_links_plain_element`` yields for
    the same element follows.
    '''
    scraper = self.javascript_scraper

    if scraper and element.text:
        for url, hint in scraper.scrape_links(element.text, context=True):
            is_inline = is_likely_inline(url)

            yield LinkInfo(
                element=element,
                tag=element.tag,
                attrib=None,
                link=url,
                inline=is_inline,
                linked=not is_inline,
                base_link=None,
                value_type='script',
                # A context of ``True`` is recorded as "no link type".
                link_type=None if hint is True else hint,
            )

    yield from self.iter_links_plain_element(element)
def scrape(self, request, response, link_type=None):
    '''Collect the links found in the body of a response.

    Returns ``None`` when ``is_supported`` rejects the request/response
    pair, or when a truthy ``link_type`` other than
    ``LinkType.javascript`` is requested. Otherwise returns a
    ``ScrapeResult`` built from the set of ``LinkContext`` objects that
    were collected and the encoding used to read the body.

    A ``UnicodeError`` raised while reading the document is logged as a
    warning; the links collected up to that point are still returned.
    '''
    if not self.is_supported(request=request, response=response):
        return

    if link_type and link_type != LinkType.javascript:
        return

    link_contexts = set()
    base_url = request.url_info.url
    encoding = self._encoding_override or detect_response_encoding(response)

    try:
        with wpull.util.reset_file_offset(response.body):
            link_iter = self.iter_processed_links(
                response.body, encoding, base_url, context=True
            )

            for link, context in link_iter:
                inline = is_likely_inline(link)
                # A context of ``True`` is recorded as "no link type".
                # A separate local is used so the ``link_type`` parameter
                # is not overwritten inside the loop.
                context_type = None if context is True else context

                link_contexts.add(LinkContext(
                    link,
                    inline=inline,
                    linked=not inline,
                    link_type=context_type,
                ))
    except UnicodeError as error:
        # The placeholders are filled in before logging: passing ``url``
        # and ``error`` as keyword arguments to ``warning()`` is rejected
        # by the standard logging API and would raise ``TypeError`` here
        # instead of emitting the message.
        _logger.warning(
            _("Failed to read document at ‘{url}’: {error}").format(
                url=request.url_info.url, error=error
            )
        )

    return ScrapeResult(link_contexts, encoding)
def iter_links_plain_element(self, element):
    '''Yield a ``LinkInfo`` for every attribute link of an element.

    Attributes listed in ``LINK_ATTRIBUTES`` are classified with
    ``is_link_inline`` and ``is_html_link``. For any other attribute the
    inline flag comes from ``is_likely_inline`` applied to the link
    itself, and the link counts as linked exactly when it is not inline.
    '''
    for name, url in self.iter_links_by_attrib(element):
        if name not in self.LINK_ATTRIBUTES:
            # Not a known link attribute: classify from the link itself.
            is_inline = is_likely_inline(url)
            is_linked = not is_inline
        else:
            is_inline = self.is_link_inline(element.tag, name)
            is_linked = self.is_html_link(element.tag, name)

        kind = identify_link_type(url)

        yield LinkInfo(
            element=element,
            tag=element.tag,
            attrib=name,
            link=url,
            inline=is_inline,
            linked=is_linked,
            base_link=None,
            value_type='plain',
            link_type=kind,
        )