Exemple #1
0
    def scrape(self, request, response, link_type=None):
        '''Scrape the response body for links.

        Args:
            request: Request whose ``url_info.url`` is handed to
                ``iter_processed_links`` as the base URL.
            response: Response whose ``body`` is read.
            link_type: Optional filter. Any truthy value other than
                ``LinkType.javascript`` makes the method return without
                scraping.

        Returns:
            ``None`` when ``is_supported`` rejects the request/response
            pair or the ``link_type`` filter does not match. Otherwise a
            ``ScrapeResult`` built from the set of collected
            ``LinkContext`` objects and the encoding that was used.

        A ``UnicodeError`` raised while reading is logged as a warning;
        whatever was collected before it is still returned.
        '''
        unsupported = not self.is_supported(request=request,
                                            response=response)
        if unsupported or (link_type
                           and link_type != LinkType.javascript):
            return

        collected = set()
        document_url = request.url_info.url
        encoding = self._encoding_override or \
            detect_response_encoding(response)

        try:
            with wpull.util.reset_file_offset(response.body):
                found = self.iter_processed_links(
                    response.body, encoding, document_url, context=True)

                for url, hint in found:
                    is_inline = is_likely_inline(url)
                    # A context of ``True`` carries no specific link type.
                    collected.add(LinkContext(
                        url,
                        inline=is_inline,
                        linked=not is_inline,
                        link_type=None if hint is True else hint,
                    ))
        except UnicodeError as error:
            message = __(
                _('Failed to read document at ‘{url}’: {error}'),
                url=request.url_info.url, error=error
            )
            _logger.warning(message)

        return ScrapeResult(collected, encoding)
Exemple #2
0
    def iter_links_script_element(self, element):
        '''Yield ``LinkInfo`` objects for a ``script`` element.

        When a JavaScript scraper is configured and the element has text,
        the links scraped from that text are yielded first with
        ``value_type='script'``. The links produced by
        ``iter_links_plain_element`` are yielded afterwards in every case.
        '''
        scraper = self.javascript_scraper

        if scraper and element.text:
            for url, hint in scraper.scrape_links(element.text, context=True):
                is_inline = is_likely_inline(url)

                yield LinkInfo(
                    element=element,
                    tag=element.tag,
                    attrib=None,
                    link=url,
                    inline=is_inline,
                    linked=not is_inline,
                    base_link=None,
                    value_type='script',
                    # A context of ``True`` carries no specific link type.
                    link_type=None if hint is True else hint,
                )

        for plain_link in self.iter_links_plain_element(element):
            yield plain_link
Exemple #3
0
    def scrape(self, request, response, link_type=None):
        """Scrape the response body for links.

        Args:
            request: Request whose ``url_info.url`` is handed to
                ``iter_processed_links`` as the base URL.
            response: Response whose ``body`` is read.
            link_type: Optional filter. Any truthy value other than
                ``LinkType.javascript`` makes the method return without
                scraping.

        Returns:
            ``None`` when ``is_supported`` rejects the request/response pair
            or the ``link_type`` filter does not match. Otherwise a
            ``ScrapeResult`` built from the set of collected ``LinkContext``
            objects and the encoding that was used.

        A ``UnicodeError`` raised while reading is logged as a warning;
        whatever was collected before it is still returned.
        """
        if not self.is_supported(request=request, response=response):
            return
        if link_type and link_type != LinkType.javascript:
            return

        link_contexts = set()
        base_url = request.url_info.url
        encoding = self._encoding_override or detect_response_encoding(response)

        try:
            with wpull.util.reset_file_offset(response.body):
                for link, context in self.iter_processed_links(response.body, encoding, base_url, context=True):
                    inline = is_likely_inline(link)
                    # A context of ``True`` carries no specific link type.
                    # A separate local keeps the ``link_type`` filter
                    # argument from being overwritten.
                    found_type = None if context is True else context

                    link_contexts.add(LinkContext(link, inline=inline, linked=not inline, link_type=found_type))

        except UnicodeError as error:
            # The standard ``logging`` methods reject keyword arguments such
            # as ``url``/``error`` with a ``TypeError``, so the placeholders
            # are filled in before the message is handed to the logger.
            _logger.warning(
                _("Failed to read document at ‘{url}’: {error}").format(url=request.url_info.url, error=error)
            )

        return ScrapeResult(link_contexts, encoding)
Exemple #4
0
    def iter_links_plain_element(self, element):
        '''Yield a ``LinkInfo`` for each link from ``iter_links_by_attrib``.

        Attributes listed in ``LINK_ATTRIBUTES`` are classified with
        ``is_link_inline`` and ``is_html_link`` using the element tag and
        the attribute name. For any other attribute the link itself is
        checked with ``is_likely_inline`` and ``linked`` is the opposite
        of ``inline``.
        '''
        for name, url in self.iter_links_by_attrib(element):
            if name not in self.LINK_ATTRIBUTES:
                is_inline = is_likely_inline(url)
                is_linked = not is_inline
            else:
                is_inline = self.is_link_inline(element.tag, name)
                is_linked = self.is_html_link(element.tag, name)

            yield LinkInfo(
                element=element,
                tag=element.tag,
                attrib=name,
                link=url,
                inline=is_inline,
                linked=is_linked,
                base_link=None,
                value_type='plain',
                link_type=identify_link_type(url),
            )