Example #1
    def _extract_links(self, selector, response_url, response_encoding,
                       base_url):
        links = []
        # hacky way to get the underlying lxml parsed document
        for el, attr, attr_val in self._iter_links(selector.root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                if self.strip:
                    attr_val = strip_html5_whitespace(attr_val)
                attr_val = urljoin(base_url, attr_val)
            except ValueError:
                continue  # skipping bogus links
            else:
                url = self.process_attr(attr_val)
                if url is None:
                    continue
            url = to_native_str(url, encoding=response_encoding)
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(url,
                        _collect_string_content(el) or u'',
                        nofollow=rel_has_nofollow(el.get('rel')))

            # split the url on '#' to get the fragment, if it exists
            url_split_on_fragments = url.split('#')

            # if the url has a fragment, add it to the link.
            # without this, link.fragment is never populated, even if
            # keep_fragments is True in the arguments supplied to
            # canonicalize_url above
            if len(url_split_on_fragments) > 1:
                link.fragment = url_split_on_fragments[-1]

            links.append(link)

        return self._deduplicate_if_needed(links)
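As a point of comparison, the standard library already exposes this split: urllib.parse.urldefrag returns the URL and its fragment in one call. A minimal sketch of the same fragment handling (a simplification, not the extractor's actual code):

    from urllib.parse import urldefrag

    def split_fragment(url):
        # urldefrag('http://example.com/p#frag') -> ('http://example.com/p', 'frag');
        # the fragment is '' when the URL has none, so no length check is needed
        return urldefrag(url)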
Example #2
    def _extract_links(self, selector, response_url, response_encoding, base_url):
        if selector.root is None:
            return []

        links = []
        # hacky way to get the underlying lxml parsed document
        for el, attr, attr_val in self._iter_links(selector.root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                if self.strip:
                    attr_val = strip_html5_whitespace(attr_val)
                attr_val = urljoin(base_url, attr_val)
            except ValueError:
                continue  # skipping bogus links
            else:
                url = self.process_attr(attr_val)
                if url is None:
                    continue
            url = to_native_str(url, encoding=response_encoding)
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(url, _collect_string_content(el) or u'',
                        nofollow=rel_has_nofollow(el.get('rel')))
            links.append(link)
        return self._deduplicate_if_needed(links)
Example #3
 def _extract_links(self, selector, response_url, response_encoding,
                    base_url):
     '''
     Pretty much the same function, just added 'ignore' to to_native_str()
     '''
     links = []
     # hacky way to get the underlying lxml parsed document
     for el, attr, attr_val in self._iter_links(selector.root):
         # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
         try:
             # Remove leading and trailing white spaces
             # https://www.w3.org/TR/2014/REC-html5-20141028/infrastructure.html#strip-leading-and-trailing-whitespace
             attr_val = attr_val.strip()
             # Join base url and collected link
             attr_val = urljoin(base_url, attr_val)
         except ValueError:
             continue  # skipping bogus links
         else:
             url = self.process_attr(attr_val)
             if url is None:
                 continue
         # added 'ignore' to encoding errors
         url = to_native_str(url,
                             encoding=response_encoding,
                             errors='ignore')
         # to fix relative links after process_value
         url = urljoin(response_url, url)
         link = Link(url,
                     _collect_string_content(el) or u'',
                     nofollow=rel_has_nofollow(el.get('rel')))
         links.append(link)
     return self._deduplicate_if_needed(links)
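to_native_str here comes from Scrapy's compatibility helpers (backed by w3lib); on Python 3 it decodes bytes to str. A rough sketch of the behaviour that the errors='ignore' flag changes, under that assumption (to_native_str_sketch is a hypothetical stand-in):

    def to_native_str_sketch(text, encoding='utf-8', errors='strict'):
        # on Python 3 the native string type is str: decode bytes,
        # pass str values through unchanged
        if isinstance(text, bytes):
            return text.decode(encoding, errors)
        return text

    # errors='strict' raises UnicodeDecodeError on undecodable bytes,
    # while errors='ignore' silently drops them:
    assert to_native_str_sketch(b'caf\xe9', 'utf-8', 'ignore') == 'caf'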
Example #4
    def _extract_links(self, selector, response_url, response_encoding,
                       base_url):
        links = []
        # hacky way to get the underlying lxml parsed document
        for el, attr, attr_val in self._iter_links(selector.root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                attr_val = urljoin(base_url, attr_val)
            except ValueError:
                continue  # skipping bogus links
            else:
                url = self.process_attr(attr_val)
                if url is None:
                    continue
            if isinstance(url, unicode):  # Python 2 text type
                url = url.encode(response_encoding)
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(url,
                        _collect_string_content(el) or u'',
                        nofollow=rel_has_nofollow(el.get('rel')))
            links.append(link)

        return unique_list(links, key=lambda link: link.url) \
                if self.unique else links
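unique_list is Scrapy's order-preserving deduplication helper (from scrapy.utils.python), keyed on link.url here so that two links to the same address collapse into one. A minimal sketch of the assumed behaviour:

    def unique_list_sketch(items, key=lambda x: x):
        # keep only the first occurrence of each key, preserving input order
        seen = set()
        result = []
        for item in items:
            k = key(item)
            if k not in seen:
                seen.add(k)
                result.append(item)
        return result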
Example #5
 def _extract_links(self, selector, response_url, response_encoding, base_url):
     '''
     Pretty much the same function, just added 'ignore' to to_native_str()
     '''
     links = []
     # hacky way to get the underlying lxml parsed document
     for el, attr, attr_val in self._iter_links(selector.root):
         # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
         try:
             attr_val = urljoin(base_url, attr_val)
         except ValueError:
             continue  # skipping bogus links
         else:
             url = self.process_attr(attr_val)
             if url is None:
                 continue
         # added 'ignore' to encoding errors
         url = to_native_str(url, encoding=response_encoding,
                             errors='ignore')
         # to fix relative links after process_value
         url = urljoin(response_url, url)
         link = Link(url, _collect_string_content(el) or u'',
                     nofollow=rel_has_nofollow(el.get('rel')))
         links.append(link)
     return self._deduplicate_if_needed(links)
Example #6
 def test_rel_has_nofollow(self):
     assert rel_has_nofollow('ugc nofollow') is True
     assert rel_has_nofollow('ugc,nofollow') is True
     assert rel_has_nofollow('ugc') is False
     assert rel_has_nofollow('nofollow') is True
     assert rel_has_nofollow('nofollowfoo') is False
     assert rel_has_nofollow('foonofollow') is False
     assert rel_has_nofollow('ugc,  ,  nofollow') is True
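The assertions above pin down the matching rules: rel values are split on whitespace and commas, runs of separators collapse, and only an exact 'nofollow' token counts ('nofollowfoo' does not). A sketch of an implementation that satisfies every assertion (assumed, not necessarily Scrapy's exact code):

    def rel_has_nofollow(rel):
        # rel is None when the tag carries no rel attribute at all
        if rel is None:
            return False
        # treat commas as whitespace, then match whole tokens only;
        # str.split() with no argument collapses runs of separators
        return 'nofollow' in rel.replace(',', ' ').split()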
Example #7
 def unknown_starttag(self, tag, attrs):
     # remember the document base URL declared by a <base href="..."> tag
     if tag == 'base':
         self.base_url = dict(attrs).get('href')
     if self.scan_tag(tag):
         for attr, value in attrs:
             if self.scan_attr(attr):
                 url = self.process_value(value)
                 if url is not None:
                     link = Link(url=url,
                                 nofollow=rel_has_nofollow(dict(attrs).get('rel')))
                     self.links.append(link)
                     self.current_link = link
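unknown_starttag is a callback from the old sgmllib-based extractor (Python 2 era). A rough modern analogue using html.parser, showing the same tag/attribute scan in self-contained form; the anchor-only filtering below is a simplification, not the original extractor's scan_tag/scan_attr logic:

    from html.parser import HTMLParser

    class LinkCollector(HTMLParser):
        def __init__(self):
            super().__init__()
            self.links = []  # (url, nofollow) pairs

        def handle_starttag(self, tag, attrs):
            attrs = dict(attrs)
            if tag == 'a' and attrs.get('href') is not None:
                rel = attrs.get('rel') or ''
                nofollow = 'nofollow' in rel.replace(',', ' ').split()
                self.links.append((attrs['href'], nofollow))

    collector = LinkCollector()
    collector.feed('<a rel="nofollow" href="http://example.com/">x</a>')
    # collector.links == [('http://example.com/', True)]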
Example #8
    def _extract_links(self, selector, response_url, response_encoding, base_url):
        links = []
        # hacky way to get the underlying lxml parsed document
        for el, attr, attr_val in self._iter_links(selector.root):
            # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
            try:
                attr_val = urljoin(base_url, attr_val)
            except ValueError:
                continue  # skipping bogus links
            else:
                url = self.process_attr(attr_val)
                if url is None:
                    continue
            if isinstance(url, unicode):  # Python 2 text type
                url = url.encode(response_encoding)
            # to fix relative links after process_value
            url = urljoin(response_url, url)
            link = Link(url, _collect_string_content(el) or u'',
                        nofollow=rel_has_nofollow(el.get('rel')))
            links.append(link)

        return unique_list(links, key=lambda link: link.url) \
                if self.unique else links