def test_non_ascii_percent_encoding_in_query_arguments(self):
    self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=£500&a=5&z=3"),
                     u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
    self.assertEqual(canonicalize_url(b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
                     "http://www.example.com/do?a=5&price=%C2%A3500&z=3")
    self.assertEqual(canonicalize_url(b"http://www.example.com/do?price(\xc2\xa3)=500&a=1"),
                     "http://www.example.com/do?a=1&price%28%C2%A3%29=500")
def test_typical_usage(self):
    self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
                     "http://www.example.com/do?a=1&b=2&c=3")
    self.assertEqual(canonicalize_url("http://www.example.com/do?c=1&b=2&a=3"),
                     "http://www.example.com/do?a=3&b=2&c=1")
    self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"),
                     "http://www.example.com/do?a=1")
def test_spaces(self):
    self.assertEqual(canonicalize_url("http://www.example.com/do?q=a space&a=1"),
                     "http://www.example.com/do?a=1&q=a+space")
    self.assertEqual(canonicalize_url("http://www.example.com/do?q=a+space&a=1"),
                     "http://www.example.com/do?a=1&q=a+space")
    self.assertEqual(canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
                     "http://www.example.com/do?a=1&q=a+space")
def test_canonicalize_idns(self):
    self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'),
                     'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')
    # Japanese (+ reordering query parameters)
    self.assertEqual(
        canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
        'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')
def test_normalize_percent_encoding_in_query_arguments(self):
    self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
                     "http://www.example.com/do?k=b%A3")
    self.assertEqual(
        canonicalize_url("http://www.example.com/do?k=r%c3%a9sum%c3%a9"),
        "http://www.example.com/do?k=r%C3%A9sum%C3%A9")
def test_quoted_slash_and_question_sign(self):
    self.assertEqual(
        canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
        "http://foo.com/AC%2FDC+rocks%3F/?yeah=1",
    )
    self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
                     "http://foo.com/AC%2FDC/")
def test_normalize_percent_encoding_in_paths(self):
    self.assertEqual(
        canonicalize_url("http://www.example.com/r%c3%a9sum%c3%a9"),
        "http://www.example.com/r%C3%A9sum%C3%A9",
    )

    # non-UTF8 encoded sequences: they should be kept untouched, only upper-cased
    # 'latin1'-encoded sequence in path
    self.assertEqual(
        canonicalize_url("http://www.example.com/a%a3do"),
        "http://www.example.com/a%A3do",
    )
    # 'latin1'-encoded path, UTF-8 encoded query string
    self.assertEqual(
        canonicalize_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9"),
        "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9",
    )
    # 'latin1'-encoded path and query string
    self.assertEqual(
        canonicalize_url("http://www.example.com/a%a3do?q=r%e9sum%e9"),
        "http://www.example.com/a%A3do?q=r%E9sum%E9",
    )
def test_canonicalize_parse_url(self):
    # parse_url() wraps urlparse and is used in link extractors
    self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
                     "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
    self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
                     'http://www.example.com/caf%E9-con-leche.htm')
    self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
                     "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
def test_canonicalize_urlparsed(self):
    # canonicalize_url() can be passed an already urlparse'd URL
    self.assertEqual(canonicalize_url(urlparse(u"http://www.example.com/résumé?q=résumé")),
                     "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
    self.assertEqual(canonicalize_url(urlparse('http://www.example.com/caf%e9-con-leche.htm')),
                     'http://www.example.com/caf%E9-con-leche.htm')
    self.assertEqual(canonicalize_url(urlparse("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
                     "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
def test_canonicalize_url_unicode_query_string_wrong_encoding(self):
    # trying to encode with wrong encoding
    # fallback to UTF-8
    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?currency=€", encoding='latin1'),
                     "http://www.example.com/r%C3%A9sum%C3%A9?currency=%E2%82%AC")
    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='latin1'),
                     "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F")
def test_port_number(self):
    self.assertEqual(
        canonicalize_url("http://www.example.com:8888/do?a=1&b=2&c=3"),
        "http://www.example.com:8888/do?a=1&b=2&c=3")
    # trailing empty ports are removed
    self.assertEqual(
        canonicalize_url("http://www.example.com:/do?a=1&b=2&c=3"),
        "http://www.example.com/do?a=1&b=2&c=3")
def parse_url(self, url: URL) -> str:
    # Keep the query string if it might identify a feed.
    # Wikipedia, for example, uses query strings to differentiate feeds.
    if any(key in url.query for key in self.valid_keys):
        return canonicalize_url(str(url))
    # Canonicalizing the URL is about 4x slower, but worth it to prevent
    # duplicate requests.
    return canonicalize_url(url_query_cleaner(str(url)))
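# Usage sketch (not from the original source): a self-contained version of the
# dedupe-key idea above, using only the standard library plus w3lib.
# VALID_KEYS and dedupe_key are hypothetical names standing in for
# self.valid_keys and the method itself.
from urllib.parse import parse_qs, urlsplit

from w3lib.url import canonicalize_url, url_query_cleaner

VALID_KEYS = {"feed", "rss", "atom"}  # assumed feed-identifying query keys

def dedupe_key(url: str) -> str:
    query_keys = parse_qs(urlsplit(url).query, keep_blank_values=True).keys()
    if any(key in query_keys for key in VALID_KEYS):
        # The query string may identify a feed (e.g. on Wikipedia); keep it.
        return canonicalize_url(url)
    # Otherwise drop the query before canonicalizing, so near-duplicate URLs
    # collapse to a single key.
    return canonicalize_url(url_query_cleaner(url))

print(dedupe_key("https://en.wikipedia.org/w/index.php?title=X&feed=atom"))
print(dedupe_key("https://example.com/page?utm_source=news&id=1"))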
def test_remove_fragments(self):
    self.assertEqual(
        canonicalize_url(u"http://*****:*****@www.example.com/do?a=1#frag"),
        u"http://*****:*****@www.example.com/do?a=1")
    self.assertEqual(
        canonicalize_url(u"http://*****:*****@www.example.com/do?a=1#frag",
                         keep_fragments=True),
        u"http://*****:*****@www.example.com/do?a=1#frag")
def test_non_ascii_percent_encoding_in_paths(self):
    self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
                     "http://www.example.com/a%20do?a=1")
    self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"),
                     "http://www.example.com/a%20%20do?a=1")
    self.assertEqual(canonicalize_url(u"http://www.example.com/a do£.html?a=1"),
                     "http://www.example.com/a%20do%C2%A3.html?a=1")
    self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
                     "http://www.example.com/a%20do%C2%A3.html?a=1")
def request_fingerprint(self, request):
    fp = hashlib.sha1()
    fp.update(to_bytes(request.method))
    # if 'url-from' in request.meta: fp.update(to_bytes(canonicalize_url(request.meta['url-from'])))
    fp.update(to_bytes(canonicalize_url(request.url)))
    fp.update(request.body or b'')
    return fp.hexdigest()
def test_canonicalize_url_unicode_query_string(self):
    # default encoding for path and query is UTF-8
    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé"),
                     "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

    # passed encoding will affect query string
    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé", encoding='latin1'),
                     "http://www.example.com/r%C3%A9sum%C3%A9?q=r%E9sum%E9")
    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='cp1251'),
                     "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%EE%F1%F1%E8%FF")
def main():
    total = 0
    time = 0
    time_file_uri_to_path = 0
    time_safe_url_string = 0
    time_canonicalize_url = 0

    tar = tarfile.open("sites.tar.gz")
    urls = []
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()
        response = HtmlResponse(url="local", body=html, encoding='utf8')
        links = response.css('a::attr(href)').extract()
        urls.extend(links)

    for url in urls:
        start_file_uri_to_path = timer()
        file_uri_to_path(url)
        end_file_uri_to_path = timer()
        time_file_uri_to_path += (end_file_uri_to_path - start_file_uri_to_path)
        time += (end_file_uri_to_path - start_file_uri_to_path)

        start_safe_url_string = timer()
        safe_url_string(url)
        end_safe_url_string = timer()
        time_safe_url_string += (end_safe_url_string - start_safe_url_string)
        time += (end_safe_url_string - start_safe_url_string)

        start_canonicalize_url = timer()
        canonicalize_url(url)
        end_canonicalize_url = timer()
        time_canonicalize_url += (end_canonicalize_url - start_canonicalize_url)
        time += (end_canonicalize_url - start_canonicalize_url)

        # any_to_uri(url)  # Error on Python 2: KeyError: u'\u9996'
        total += 1

    print("\nTotal number of items extracted = {0}".format(total))
    print("Time spent on file_uri_to_path = {0}".format(time_file_uri_to_path))
    print("Time spent on safe_url_string = {0}".format(time_safe_url_string))
    print("Time spent on canonicalize_url = {0}".format(time_canonicalize_url))
    print("Total time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} items/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
def test_canonicalize_url_idna_exceptions(self):
    # missing DNS label
    self.assertEqual(
        canonicalize_url(u"http://.example.com/résumé?q=résumé"),
        "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

    # DNS label too long
    self.assertEqual(
        canonicalize_url(u"http://www.{label}.com/résumé?q=résumé".format(
            label=u"example" * 11)),
        "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
            label=u"example" * 11))
def test_canonicalize_url_idempotence(self):
    for url, enc in [(u'http://www.bücher.de/résumé?q=résumé', 'utf8'),
                     (u'http://www.example.com/résumé?q=résumé', 'latin1'),
                     (u'http://www.example.com/résumé?country=Россия', 'cp1251'),
                     (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'iso2022jp')]:
        canonicalized = canonicalize_url(url, encoding=enc)

        # if we canonicalize again, we get the same result
        self.assertEqual(canonicalize_url(canonicalized, encoding=enc), canonicalized)

        # without encoding, an already canonicalized URL is canonicalized identically
        self.assertEqual(canonicalize_url(canonicalized), canonicalized)
def test_keep_blank_values(self):
    self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2",
                                      keep_blank_values=False),
                     "http://www.example.com/do?a=2")
    self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
                     "http://www.example.com/do?a=2&b=")
    self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2",
                                      keep_blank_values=False),
                     "http://www.example.com/do?a=2")
    self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2"),
                     "http://www.example.com/do?a=2&b=&c=")
    self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
                     'http://www.example.com/do?1750%2C4=')
def test_canonicalize_url_idna_exceptions(self):
    # missing DNS label
    self.assertEqual(
        canonicalize_url(u"http://.example.com/résumé?q=résumé"),
        "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

    # DNS label too long
    self.assertEqual(
        canonicalize_url(
            u"http://www.{label}.com/résumé?q=résumé".format(
                label=u"example" * 11)),
        "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
            label=u"example" * 11))
def test_canonicalize_url_idna_exceptions(self):
    # missing DNS label
    self.assertEqual(
        canonicalize_url("http://.example.com/résumé?q=résumé"),
        "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9",
    )
    # DNS label too long
    self.assertEqual(
        canonicalize_url(
            f"http://www.{'example' * 11}.com/résumé?q=résumé"),
        f"http://www.{'example' * 11}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9",
    )
def test_normalize_percent_encoding_in_paths(self):
    self.assertEqual(canonicalize_url("http://www.example.com/r%c3%a9sum%c3%a9"),
                     "http://www.example.com/r%C3%A9sum%C3%A9")

    # non-UTF8 encoded sequences: they should be kept untouched, only upper-cased
    # 'latin1'-encoded sequence in path
    self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
                     "http://www.example.com/a%A3do")
    # 'latin1'-encoded path, UTF-8 encoded query string
    self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9"),
                     "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
    # 'latin1'-encoded path and query string
    self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%e9sum%e9"),
                     "http://www.example.com/a%A3do?q=r%E9sum%E9")
def _process_links(self, links):
    links = [x for x in links if self._link_allowed(x)]
    if self.canonicalize:
        for link in links:
            link.url = canonicalize_url(link.url)
    links = self.link_extractor._process_links(links)
    return links
def canonicalize(url, remove_parameters=('utm_medium', 'utm_source', 'utm_campaign',
                                         'utm_term', 'utm_content')):
    """Canonicalize URL."""
    try:
        curl = url_query_cleaner(canonicalize_url(url, keep_blank_values=False,
                                                  keep_fragments=False),
                                 parameterlist=remove_parameters, remove=True)
        return canonicalize_url(curl, keep_blank_values=False, keep_fragments=False)
    except Exception as e:
        logger.warning('Failed to canonicalize url %r: %s', url, e)
        return None
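# For illustration (hedged, not part of the original module): with the helper
# above, tracking parameters are stripped and the remaining query arguments
# are sorted, so both variants below should collapse to the same canonical form.
print(canonicalize("http://example.com/page?utm_source=news&b=2&a=1"))
print(canonicalize("http://example.com/page?a=1&utm_campaign=x&b=2"))
# both are expected to print: http://example.com/page?a=1&b=2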
def _parse_links(self, response):
    # Extract the links from the page and turn relative URLs into absolute ones.
    l = LinkLoader(html.html_to_unicode(response))
    l.add_xpath(xpath='//a/@href', re_patten=r'/subject/[0-9]+/$|/tag/.*')
    # To extract links matching several different rules, call add_xpath() multiple times:
    # l.add_xpath(xpath, re_patten)
    # l.add_xpath(xpath, re_patten)
    # Finally, calling get() returns all matching links as a list of URLs.
    links = l.get()
    base = urlparse.urlparse(response.url)
    domain = '://'.join((base.scheme, base.netloc))
    for url in links:
        # Scrapy's built-in link extractor already implements the parts below;
        # turning relative URLs into absolute ones could also be done in a middleware.
        component = urlparse.urlparse(url)
        # Drop URLs whose host differs from the host of the response URL;
        # Scrapy's default offsite spider middleware then ensures that
        # URLs outside the allowed domains are not crawled.
        if (component.netloc) and (component.netloc != base.netloc):
            continue
        # Check whether the URL is already a complete (absolute) URL.
        if domain not in url:
            url = urlparse.urljoin(domain, url)
        # Normalize the URL, e.g. removing the fragment (#...) part.
        url = canonicalize_url(url)
        # Set the crawl priority of the request.
        priority = 5 if self.item_url.search(url) else 0
        # If no callback is explicitly given and only an errback is specified,
        # Scrapy raises an error, so both are passed here.
        yield Request(url=url, callback=self.parse, errback=self.error_back,
                      priority=priority)
def load_products(response):
    """Load a ProductItem from the product page response."""
    loader = ProductItemLoader(item=ProductItem(), response=response)

    url = url_query_cleaner(response.url, ['snr'], remove=True)
    url = canonicalize_url(url)
    loader.add_value('url', url)

    publisher = response.xpath('//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][6]/div[2]/a[1]/span[2]/text()')
    # .xpath() returns a SelectorList (possibly empty), never None, so check
    # truthiness to decide whether publisher/developer live in row 5 or row 6.
    if not publisher:
        loader.add_xpath('developer', '//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][5]/div[2]/a[1]/span[2]/text()')
        loader.add_xpath('publisher', '//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][5]/div[2]/a[2]/span[2]/text()')
    else:
        loader.add_xpath('developer', '//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][6]/div[2]/a[1]/span[2]/text()')
        loader.add_xpath('publisher', '//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][6]/div[2]/a[2]/span[2]/text()')

    loader.add_xpath('release_date', '//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][4]/div[2]/text()')
    loader.add_css('app_name', '.header__title ::text')
    loader.add_css('specs', '.game-features__title ::text')
    loader.add_css('genre', '.product-details__data span a.un ::text')

    try:
        price = response.css('.module-buy__info > meta:nth-child(2) ::attr(content)').extract_first()
        price_disc = price
    except Exception:
        price = None
        price_disc = price
    if price is None:
        price = '0.00'
        price_disc = price
    loader.add_value('price', price)
    loader.add_value('discount_price', price_disc)
    loader.add_css('rating', 'div.average-rating:nth-child(1) > meta:nth-child(4) ::attr(content)')
    return loader.load_item()
def cleanup_url(url):
    parsed = urlparse(url)
    url = parsed.netloc + parsed.path + "?" + parsed.query
    url = canonicalize_url(url)
    if url.endswith('/'):
        url = url[:-1]
    return url
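# A hedged usage note for cleanup_url above: dropping the scheme and any
# trailing slash means the http/https and slash/no-slash variants of the same
# page map to one key, e.g. both calls below should return
# 'www.example.com/path':
print(cleanup_url("https://www.example.com/path/"))
print(cleanup_url("http://www.example.com/path"))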
def custom_request_fingerprint(self, request, include_headers=None, remove_scheme=None):
    """
    Overridden because some URLs can have a wrong encoding (when they come
    from the selenium driver).
    Changes: encode the URL to UTF-8, and optionally make the fingerprint
    scheme-agnostic.
    """
    # If use_anchors, anchors in the URL matter, since each anchor defines a
    # different webpage and content (special js_rendering).
    url_for_finger_print = canonicalize_url(request.url) if not self.use_anchors else request.url
    # Scheme agnosticism: strip the scheme from the text form of the URL
    # before encoding (re.sub with a str pattern cannot be applied to bytes).
    if remove_scheme:
        match_capture_any_scheme = r'(https?)(.*)'
        url_for_finger_print = re.sub(match_capture_any_scheme, r"\2", url_for_finger_print)
    url_for_hash = url_for_finger_print.encode('utf-8')

    if include_headers:
        include_headers = tuple(to_bytes(h.lower()) for h in sorted(include_headers))
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache or not remove_scheme:
        # Since it is called from the same function, we need to ensure we
        # compute the fingerprint that takes the scheme into account; avoid
        # caching in that case.
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(url_for_hash))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
def __init__(self, tag="a", attr="href", unique=False, process_value=None, strip=True, canonicalized=False): warnings.warn( "BaseSgmlLinkExtractor is deprecated and will be removed in future releases. " "Please use scrapy.linkextractors.LinkExtractor", ScrapyDeprecationWarning, stacklevel=2, ) SGMLParser.__init__(self) self.scan_tag = tag if callable(tag) else lambda t: t == tag self.scan_attr = attr if callable(attr) else lambda a: a == attr self.process_value = ( lambda v: v) if process_value is None else process_value self.current_link = None self.unique = unique self.strip = strip if canonicalized: self.link_key = lambda link: link.url else: self.link_key = lambda link: canonicalize_url(link.url, keep_fragments=True)
def build_key(slug, params):
    url = reverse('api:lookup-by-slug', kwargs={'slug': slug})
    params = {key: value for key, value in params.items() if value}
    # Using the page slug as a redis hash tag ensures the keys related to the
    # same page end up on the same node, preventing delete_many from failing
    # because the keys could otherwise be stored across different nodes.
    return f'{{{slug}}}' + canonicalize_url(url + '?' + urlencode(params))
def request_fingerprint(self, request, include_headers=None, keep_fragments=False):
    if include_headers:
        include_headers = tuple(
            self.to_bytes(h.lower()) for h in sorted(include_headers))
    cache = _fingerprint_cache.setdefault(request, {})
    cache_key = (include_headers, keep_fragments)
    if cache_key not in cache:
        fp = hashlib.sha1()
        fp.update(self.to_bytes(request.method))
        fp.update(self.to_bytes(
            canonicalize_url(request.url, keep_fragments=keep_fragments)))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[cache_key] = fp.hexdigest()
    return cache[cache_key]
def test_safe_characters_unicode(self):
    # urllib.quote uses a mapping cache of encoded characters. When parsing
    # an already percent-encoded url, it will fail if that url was not
    # percent-encoded as utf-8; that's why canonicalize_url must always
    # convert the urls to string. The following test asserts that
    # functionality.
    self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
                     'http://www.example.com/caf%E9-con-leche.htm')
def __init__(self, tag="a", attr="href", process=None, unique=False, strip=True, canonicalized=False): self.scan_tag = tag if callable(tag) else lambda t: t == tag self.scan_attr = attr if callable(attr) else lambda a: a == attr self.process_attr = process if callable(process) else lambda v: v self.unique = unique self.strip = strip if canonicalized: self.link_key = lambda link: link.url else: self.link_key = lambda link: canonicalize_url(link.url, keep_fragments=True)
def request_fingerprint(request, include_headers=None):
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource
    the request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example are cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers use the
    include_headers argument, which is a list of Request headers to include.
    """
    if include_headers:
        include_headers = tuple(to_bytes(h.lower())
                                for h in sorted(include_headers))
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url)))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
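# For illustration (a hedged sketch, not part of the original module): the two
# URLs from the docstring above differ only in query-argument order, so
# canonicalize_url maps them to the same string and the two requests get the
# same fingerprint.
from w3lib.url import canonicalize_url

assert (canonicalize_url("http://www.example.com/query?id=111&cat=222")
        == canonicalize_url("http://www.example.com/query?cat=222&id=111")
        == "http://www.example.com/query?cat=222&id=111")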
def __init__(self, tag="a", attr="href", unique=False, process_value=None, strip=True, canonicalized=False): warnings.warn( "BaseSgmlLinkExtractor is deprecated and will be removed in future releases. " "Please use scrapy.linkextractors.LinkExtractor", ScrapyDeprecationWarning, stacklevel=2, ) SGMLParser.__init__(self) self.scan_tag = tag if callable(tag) else lambda t: t == tag self.scan_attr = attr if callable(attr) else lambda a: a == attr self.process_value = (lambda v: v) if process_value is None else process_value self.current_link = None self.unique = unique self.strip = strip if canonicalized: self.link_key = lambda link: link.url else: self.link_key = lambda link: canonicalize_url(link.url, keep_fragments=True)
def test_quoted_slash_and_question_sign(self):
    self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
                     "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
    self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
                     "http://foo.com/AC%2FDC/")
def test_domains_are_case_insensitive(self):
    self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
                     "http://www.example.com/")
def test_append_missing_path(self):
    self.assertEqual(canonicalize_url("http://www.example.com"),
                     "http://www.example.com/")
def test_port_number(self):
    self.assertEqual(canonicalize_url("http://www.example.com:8888/do?a=1&b=2&c=3"),
                     "http://www.example.com:8888/do?a=1&b=2&c=3")
    # trailing empty ports are removed
    self.assertEqual(canonicalize_url("http://www.example.com:/do?a=1&b=2&c=3"),
                     "http://www.example.com/do?a=1&b=2&c=3")
def test_dont_convert_safe_characters(self):
    # don't convert safe characters to their percent-encoded representation
    self.assertEqual(canonicalize_url(
        "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
        "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
def test_cookies(settings):
    # 64K of headers is over the Twisted limit, so if these headers were sent
    # to Splash the request would fail.
    BOMB = 'x' * 64000

    class LuaScriptSpider(ResponseSpider):
        """ Cookies must be sent to the website, not to Splash """
        custom_settings = {
            'SPLASH_COOKIES_DEBUG': True,
            'COOKIES_DEBUG': True,
        }

        def start_requests(self):
            # cookies set without Splash should still be sent to a remote
            # website. FIXME: this is not the case.
            yield scrapy.Request(self.url + "/login", self.parse,
                                 cookies={'x-set-scrapy': '1'})

        def parse(self, response):
            yield SplashRequest(self.url + "#egg", self.parse_1,
                                endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT},
                                cookies={'x-set-splash': '1'})

        def parse_1(self, response):
            yield {'response': response}
            yield SplashRequest(self.url + "#foo", self.parse_2,
                                endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT})

        def parse_2(self, response):
            yield {'response': response}
            yield scrapy.Request(self.url, self.parse_3)

        def parse_3(self, response):
            # Splash (Twisted) drops requests with huge http headers,
            # but this one should work, as cookies are not sent
            # to Splash itself.
            yield {'response': response}
            yield SplashRequest(self.url + "#bar", self.parse_4,
                                endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT},
                                cookies={'bomb': BOMB})

        def parse_4(self, response):
            yield {'response': response}

    def _cookie_dict(har_cookies):
        return {c['name']: c['value'] for c in har_cookies}

    items, url, crawler = yield crawl_items(LuaScriptSpider, ManyCookies,
                                            settings)
    assert len(items) == 4

    # cookie should be sent to the remote website, not to Splash
    resp = items[0]['response']
    splash_request_headers = resp.request.headers
    cookies = resp.data['args']['cookies']
    print(splash_request_headers)
    print(cookies)
    assert _cookie_dict(cookies) == {
        # 'login': '******',  # FIXME
        'x-set-splash': '1'
    }
    assert splash_request_headers.get(b'Cookie') is None

    # a new cookie should also be sent to the remote website, not to Splash
    resp2 = items[1]['response']
    splash_request_headers = resp2.request.headers
    headers = resp2.data['args']['headers']
    cookies = resp2.data['args']['cookies']
    assert canonicalize_url(headers['Referer']) == canonicalize_url(url)
    assert _cookie_dict(cookies) == {
        # 'login': '******',
        'x-set-splash': '1',
        'sessionid': 'ABCD'
    }
    print(splash_request_headers)
    print(headers)
    print(cookies)
    assert splash_request_headers.get(b'Cookie') is None

    # TODO/FIXME: Cookies fetched when working with Splash should be picked up
    # by Scrapy
    resp3 = items[2]['response']
    splash_request_headers = resp3.request.headers
    cookie_header = splash_request_headers.get(b'Cookie')
    assert b'x-set-scrapy=1' in cookie_header
    assert b'login=1' in cookie_header
    assert b'x-set-splash=1' in cookie_header
    # assert b'sessionid=ABCD' in cookie_header  # FIXME

    # cookie bomb shouldn't cause problems
    resp4 = items[3]['response']
    splash_request_headers = resp4.request.headers
    cookies = resp4.data['args']['cookies']
    assert _cookie_dict(cookies) == {
        # 'login': '******',
        'x-set-splash': '1',
        'sessionid': 'ABCD',
        'bomb': BOMB,
    }
    assert splash_request_headers.get(b'Cookie') is None
def test_sorting(self):
    self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
                     "http://www.example.com/do?a=50&b=2&b=5&c=3")
def test_canonicalize_url_unicode_path(self):
    self.assertEqual(canonicalize_url(u"http://www.example.com/résumé"),
                     "http://www.example.com/r%C3%A9sum%C3%A9")
def test_normalize_percent_encoding_in_query_arguments(self):
    self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
                     "http://www.example.com/do?k=b%A3")
    self.assertEqual(canonicalize_url("http://www.example.com/do?k=r%c3%a9sum%c3%a9"),
                     "http://www.example.com/do?k=r%C3%A9sum%C3%A9")
def test_urls_with_auth_and_ports(self):
    self.assertEqual(canonicalize_url(u"http://*****:*****@www.example.com:81/do?now=1"),
                     u"http://*****:*****@www.example.com:81/do?now=1")
def test_canonicalize_idns(self):
    self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'),
                     'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')
    # Japanese (+ reordering query parameters)
    self.assertEqual(canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
                     'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')
def _get_fingerprint(self, url):
    return self.fingerprint_function(canonicalize_url(url))
def test_remove_fragments(self):
    self.assertEqual(canonicalize_url(u"http://*****:*****@www.example.com/do?a=1#frag"),
                     u"http://*****:*****@www.example.com/do?a=1")
    self.assertEqual(canonicalize_url(u"http://*****:*****@www.example.com/do?a=1#frag",
                                      keep_fragments=True),
                     u"http://*****:*****@www.example.com/do?a=1#frag")