def test_safe_download_url(self):
    """safe_download_url() collapses '..' path traversal but preserves a plain trailing-slash path."""
    cases = [
        ('http://www.scrapy.org/../', 'http://www.scrapy.org/'),
        ('http://www.scrapy.org/../../images/../image', 'http://www.scrapy.org/image'),
        ('http://www.scrapy.org/dir/', 'http://www.scrapy.org/dir/'),
    ]
    for url, expected in cases:
        self.assertEqual(safe_download_url(url), expected)
def adapt(self, text, htmlpage=None):
    """Clean *text*, resolve it against the page's base URL and return a safe download URL.

    Returns *text* untouched when no *htmlpage* is given, and None when *text* is None.
    """
    if htmlpage is None:
        return text
    if text is None:
        return None
    # Fall back to utf-8 if the page object carries no encoding attribute.
    enc = getattr(htmlpage, 'encoding', 'utf-8')
    raw = text.encode(enc)
    # Unquote HTML entities, drop disallowed characters, then normalize the URL.
    target = strip_url(disallowed.sub('', unquote_markup(raw, encoding=enc)))
    raw_base = get_base_url(htmlpage).encode(enc)
    base = strip_url(unquote_markup(raw_base, encoding=enc))
    return safe_download_url(urljoin(base, target))
def adapt(self, text, htmlpage):
    """Encode *text* in the page encoding, join it with the page base URL and return a safe URL."""
    encoded = text.encode(htmlpage.encoding)
    absolute = urljoin_rfc(get_base_url(htmlpage), encoded)
    return safe_download_url(unquote_markup(absolute))
def adapt(self, text, htmlpage):
    """Resolve *text* against the page base URL (both in the page encoding) and return a safe URL."""
    enc = htmlpage.encoding
    absolute = urljoin(get_base_url(htmlpage).encode(enc), text.encode(enc))
    return safe_download_url(unquote_markup(absolute, encoding=enc))
def adapt(self, text, htmlpage=None):
    """Resolve *text* against the page base URL and return a safe download URL.

    When *htmlpage* is None, *text* is returned unchanged.
    """
    if htmlpage is None:
        return text
    enc = htmlpage.encoding
    absolute = urljoin(get_base_url(htmlpage).encode(enc), text.encode(enc))
    return safe_download_url(unquote_markup(absolute, encoding=enc))