def _extract_links(self, selector, response_url, response_encoding, base_url):
    """Extract ``Link`` objects from *selector*.

    URLs are made absolute against *base_url* (and *response_url* after
    ``process_attr``), optionally whitespace-stripped, and the ``#fragment``
    part — when present — is recorded on the link so that callers using
    ``keep_fragments`` actually see it.

    :param selector: parsel/Scrapy selector whose ``.root`` is the lxml tree
    :param response_url: URL of the response, used to re-absolutize links
    :param response_encoding: encoding used to coerce URLs to native str
    :param base_url: base URL for resolving relative attribute values
    :returns: list of extracted links, deduplicated if configured
    """
    links = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            if self.strip:
                attr_val = strip_html5_whitespace(attr_val)
            attr_val = urljoin(base_url, attr_val)
        except ValueError:
            continue  # skipping bogus links
        else:
            url = self.process_attr(attr_val)
            if url is None:
                continue
        url = to_native_str(url, encoding=response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(url, _collect_string_content(el) or u'',
                    nofollow=rel_has_nofollow(el.get('rel')))
        # split url on '#' to get the fragment, if it exists
        url_split_on_fragments = url.split('#')
        # BUG FIX: str.split always returns a non-empty list, so the old
        # `if not url_split_on_fragments` branch could never run and
        # link.fragment was never populated even with keep_fragments=True.
        # A fragment exists only when the split produced more than one part.
        if len(url_split_on_fragments) > 1:
            link.fragment = url_split_on_fragments[-1]
        # BUG FIX: the link was never collected, so the method always
        # returned an empty list.
        links.append(link)
    return self._deduplicate_if_needed(links)
def _extract_links(self, selector, response_url, response_encoding, base_url):
    """Collect ``Link`` objects from *selector*, resolving every candidate
    attribute value against *base_url* and then *response_url*.

    Returns an empty list when the selector has no parsed root; otherwise
    returns the (optionally deduplicated) list of links.
    """
    root = selector.root
    if root is None:
        return []
    collected = []
    # hacky way to get the underlying lxml parsed document
    for element, attribute, raw_value in self._iter_links(root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            if self.strip:
                raw_value = strip_html5_whitespace(raw_value)
            absolute = urljoin(base_url, raw_value)
        except ValueError:
            # skipping bogus links
            continue
        url = self.process_attr(absolute)
        if url is None:
            continue
        url = to_native_str(url, encoding=response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        collected.append(
            Link(url, _collect_string_content(element) or u'',
                 nofollow=rel_has_nofollow(element.get('rel'))))
    return self._deduplicate_if_needed(collected)
def _extract_links(self, selector, response_url, response_encoding, base_url):
    """Variant of link extraction that strips attribute whitespace and
    passes ``errors='ignore'`` to ``to_native_str`` so undecodable bytes
    in a URL are dropped rather than raising.
    """
    found = []
    # hacky way to get the underlying lxml parsed document
    for node, attr_name, attr_value in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            # Remove leading and trailing white spaces
            # https://www.w3.org/TR/2014/REC-html5-20141028/infrastructure.html#strip-leading-and-trailing-whitespace
            # Join base url and collected link
            joined = urljoin(base_url, attr_value.strip())
        except ValueError:
            continue  # skipping bogus links
        url = self.process_attr(joined)
        if url is None:
            continue
        # added 'ignore' to encoding errors
        url = to_native_str(url, encoding=response_encoding, errors='ignore')
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        found.append(
            Link(url, _collect_string_content(node) or u'',
                 nofollow=rel_has_nofollow(node.get('rel'))))
    return self._deduplicate_if_needed(found)
def _extract_links(self, selector, response_url, response_encoding, base_url):
    """Python-2-era link extraction: unicode URLs are encoded with the
    response encoding before being re-absolutized. Deduplicates by URL
    when ``self.unique`` is set.
    """
    results = []
    # hacky way to get the underlying lxml parsed document
    for el, attr, attr_val in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            joined = urljoin(base_url, attr_val)
        except ValueError:
            # skipping bogus links
            continue
        url = self.process_attr(joined)
        if url is None:
            continue
        if isinstance(url, unicode):
            url = url.encode(response_encoding)
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        results.append(Link(url, _collect_string_content(el) or u'',
                            nofollow=rel_has_nofollow(el.get('rel'))))
    if not self.unique:
        return results
    return unique_list(results, key=lambda lnk: lnk.url)
def _extract_links(self, selector, response_url, response_encoding, base_url):
    """Link extraction variant that tolerates undecodable URL bytes by
    passing ``errors='ignore'`` to ``to_native_str``; otherwise behaves
    like the standard extractor.
    """
    extracted = []
    # hacky way to get the underlying lxml parsed document
    for element, attribute, value in self._iter_links(selector.root):
        # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
        try:
            absolute_value = urljoin(base_url, value)
        except ValueError:
            continue  # skipping bogus links
        url = self.process_attr(absolute_value)
        if url is None:
            continue
        # added 'ignore' to encoding errors
        url = to_native_str(url, encoding=response_encoding, errors='ignore')
        # to fix relative links after process_value
        url = urljoin(response_url, url)
        link = Link(url, _collect_string_content(element) or u'',
                    nofollow=rel_has_nofollow(element.get('rel')))
        extracted.append(link)
    return self._deduplicate_if_needed(extracted)
def test_rel_has_nofollow(self):
    """rel_has_nofollow must match 'nofollow' only as a whole token,
    whether the rel attribute is space- or comma-separated."""
    cases = [
        ('ugc nofollow', True),
        ('ugc,nofollow', True),
        ('ugc', False),
        ('nofollow', True),
        ('nofollowfoo', False),
        ('foonofollow', False),
        ('ugc, , nofollow', True),
    ]
    for rel, expected in cases:
        assert rel_has_nofollow(rel) is expected
def unknown_starttag(self, tag, attrs):
    """SGML parser callback for an opening tag.

    Records the document base URL from a ``<base href=...>`` tag, and for
    scannable tags collects a ``Link`` from each scannable attribute whose
    processed value is a URL.

    :param tag: lowercase tag name
    :param attrs: list of ``(attr_name, attr_value)`` pairs
    """
    # Hoisted: the original rebuilt dict(attrs) once for <base> and again
    # for every matched attribute inside the loop.
    attrs_dict = dict(attrs)
    if tag == 'base':
        self.base_url = attrs_dict.get('href')
    if self.scan_tag(tag):
        for attr, value in attrs:
            if self.scan_attr(attr):
                url = self.process_value(value)
                if url is not None:
                    link = Link(url=url,
                                nofollow=rel_has_nofollow(attrs_dict.get('rel')))
                    self.links.append(link)
                    self.current_link = link