Example #1
 def test_non_ascii_percent_encoding_in_query_arguments(self):
     self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=£500&a=5&z=3"),
                                       u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
     self.assertEqual(canonicalize_url(b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
                                       "http://www.example.com/do?a=5&price=%C2%A3500&z=3")
     self.assertEqual(canonicalize_url(b"http://www.example.com/do?price(\xc2\xa3)=500&a=1"),
                                       "http://www.example.com/do?a=1&price%28%C2%A3%29=500")
Example #2
 def test_typical_usage(self):
     self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
                                       "http://www.example.com/do?a=1&b=2&c=3")
     self.assertEqual(canonicalize_url("http://www.example.com/do?c=1&b=2&a=3"),
                                       "http://www.example.com/do?a=3&b=2&c=1")
     self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"),
                                       "http://www.example.com/do?a=1")
Example #3
 def test_spaces(self):
     self.assertEqual(canonicalize_url("http://www.example.com/do?q=a space&a=1"),
                                       "http://www.example.com/do?a=1&q=a+space")
     self.assertEqual(canonicalize_url("http://www.example.com/do?q=a+space&a=1"),
                                       "http://www.example.com/do?a=1&q=a+space")
     self.assertEqual(canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
                                       "http://www.example.com/do?a=1&q=a+space")
Example #4
 def test_canonicalize_idns(self):
     self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'),
                      'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')
     # Japanese (+ reordering query parameters)
     self.assertEqual(
         canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
         'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')
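The host conversion above is plain IDNA ("punycode") encoding. As a minimal sketch, the same transformation can be reproduced with Python's built-in idna codec (assuming w3lib follows the stdlib IDNA 2003 rules here):

    # Encode a non-ASCII hostname the way the test above expects
    print("www.bücher.de".encode("idna").decode("ascii"))
    # -> www.xn--bcher-kva.de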
Example #5
    def test_normalize_percent_encoding_in_query_arguments(self):
        self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
                         "http://www.example.com/do?k=b%A3")

        self.assertEqual(
            canonicalize_url("http://www.example.com/do?k=r%c3%a9sum%c3%a9"),
            "http://www.example.com/do?k=r%C3%A9sum%C3%A9")
Example #6
 def test_quoted_slash_and_question_sign(self):
     self.assertEqual(
         canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
         "http://foo.com/AC%2FDC+rocks%3F/?yeah=1",
     )
     self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
                      "http://foo.com/AC%2FDC/")
Example #7
    def test_normalize_percent_encoding_in_paths(self):
        self.assertEqual(
            canonicalize_url("http://www.example.com/r%c3%a9sum%c3%a9"),
            "http://www.example.com/r%C3%A9sum%C3%A9",
        )

        # non-UTF8 encoded sequences: they should be kept untouched, only upper-cased
        # 'latin1'-encoded sequence in path
        self.assertEqual(
            canonicalize_url("http://www.example.com/a%a3do"),
            "http://www.example.com/a%A3do",
        )

        # 'latin1'-encoded path, UTF-8 encoded query string
        self.assertEqual(
            canonicalize_url(
                "http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9"),
            "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9",
        )

        # 'latin1'-encoded path and query string
        self.assertEqual(
            canonicalize_url("http://www.example.com/a%a3do?q=r%e9sum%e9"),
            "http://www.example.com/a%A3do?q=r%E9sum%E9",
        )
Example #8
 def test_canonicalize_parse_url(self):
     # parse_url() wraps urlparse and is used in link extractors
     self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
                                       "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
     self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
                                       'http://www.example.com/caf%E9-con-leche.htm')
     self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
                                       "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
Example #9
 def test_canonicalize_urlparsed(self):
     # canonicalize_url() can be passed an already urlparse'd URL
     self.assertEqual(canonicalize_url(urlparse(u"http://www.example.com/résumé?q=résumé")),
                                       "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
     self.assertEqual(canonicalize_url(urlparse('http://www.example.com/caf%e9-con-leche.htm')),
                                       'http://www.example.com/caf%E9-con-leche.htm')
     self.assertEqual(canonicalize_url(urlparse("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
                                       "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
Example #10
    def test_canonicalize_url_unicode_query_string_wrong_encoding(self):
        # when the requested encoding cannot represent the text,
        # canonicalize_url falls back to UTF-8
        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?currency=€", encoding='latin1'),
                                          "http://www.example.com/r%C3%A9sum%C3%A9?currency=%E2%82%AC")

        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='latin1'),
                                          "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F")
Example #11
 def test_port_number(self):
     self.assertEqual(
         canonicalize_url("http://www.example.com:8888/do?a=1&b=2&c=3"),
         "http://www.example.com:8888/do?a=1&b=2&c=3")
     # trailing empty ports are removed
     self.assertEqual(
         canonicalize_url("http://www.example.com:/do?a=1&b=2&c=3"),
         "http://www.example.com/do?a=1&b=2&c=3")
    def parse_url(self, url: URL) -> str:
        # Keep the query strings if they might be feed strings.
        # Wikipedia for example uses query strings to differentiate feeds.
        if any(key in url.query for key in self.valid_keys):
            return canonicalize_url(str(url))

        # Canonicalizing the URL is about 4x slower, but worth it to prevent duplicate requests.
        return canonicalize_url(url_query_cleaner(str(url)))
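The cheaper path above relies on w3lib's url_query_cleaner, which by default keeps only an allow-list of query parameters. A minimal standalone sketch (the URL and keys are illustrative):

    from w3lib.url import url_query_cleaner

    url = "http://www.example.com/feed?page=2&format=rss"
    # with no allow-list, the whole query string is dropped
    print(url_query_cleaner(url))                # http://www.example.com/feed
    # with an allow-list, only the listed parameters survive
    print(url_query_cleaner(url, ["format"]))    # http://www.example.com/feed?format=rss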
Example #13
 def test_remove_fragments(self):
     self.assertEqual(
         canonicalize_url(u"http://*****:*****@www.example.com/do?a=1#frag"),
         u"http://*****:*****@www.example.com/do?a=1")
     self.assertEqual(
         canonicalize_url(u"http://*****:*****@www.example.com/do?a=1#frag",
                          keep_fragments=True),
         u"http://*****:*****@www.example.com/do?a=1#frag")
Example #14
 def test_non_ascii_percent_encoding_in_paths(self):
     self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
                                       "http://www.example.com/a%20do?a=1")
     self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"),
                                       "http://www.example.com/a%20%20do?a=1")
     self.assertEqual(canonicalize_url(u"http://www.example.com/a do£.html?a=1"),
                                       "http://www.example.com/a%20do%C2%A3.html?a=1")
     self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
                                       "http://www.example.com/a%20do%C2%A3.html?a=1")
    def request_fingerprint(self, request):

        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        # hash the originating URL only when it is present in meta
        if 'url-from' in request.meta:
            fp.update(to_bytes(canonicalize_url(request.meta['url-from'])))
        fp.update(to_bytes(canonicalize_url(request.url)))
        fp.update(request.body or b'')

        return fp.hexdigest()
Example #16
    def test_canonicalize_url_unicode_query_string(self):
        # default encoding for path and query is UTF-8
        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé"),
                                          "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

        # a passed encoding affects the query string only (the path stays UTF-8)
        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé", encoding='latin1'),
                                          "http://www.example.com/r%C3%A9sum%C3%A9?q=r%E9sum%E9")

        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='cp1251'),
                                          "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%EE%F1%F1%E8%FF")
def main():
    total = 0
    time = 0
    time_file_uri_to_path = 0
    time_safe_url_string = 0
    time_canonicalize_url = 0

    tar = tarfile.open("sites.tar.gz")
    urls = []

    for member in tar.getmembers():
        f = tar.extractfile(member)
        if f is None:  # skip directory entries in the tarball
            continue
        html = f.read()
        response = HtmlResponse(url="local", body=html, encoding='utf8')

        links = response.css('a::attr(href)').extract()
        urls.extend(links)

    for url in urls:
        start_file_uri_to_path = timer()
        file_uri_to_path(url)
        end_file_uri_to_path = timer()
        time_file_uri_to_path += (end_file_uri_to_path -
                                  start_file_uri_to_path)
        time += (end_file_uri_to_path - start_file_uri_to_path)

        start_safe_url_string = timer()
        safe_url_string(url)
        end_safe_url_string = timer()
        time_safe_url_string += (end_safe_url_string - start_safe_url_string)
        time += (end_safe_url_string - start_safe_url_string)

        start_canonicalize_url = timer()
        canonicalize_url(url)
        end_canonicalize_url = timer()
        time_canonicalize_url += (end_canonicalize_url -
                                  start_canonicalize_url)
        time += (end_canonicalize_url - start_canonicalize_url)

        # any_to_uri(url) # Error on Python 2: KeyError: u'\u9996'

        total += 1

    print("\nTotal number of items extracted = {0}".format(total))
    print("Time spent on file_uri_to_path = {0}".format(time_file_uri_to_path))
    print("Time spent on safe_url_string = {0}".format(time_safe_url_string))
    print("Time spent on canonicalize_url = {0}".format(time_canonicalize_url))
    print("Total time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} items/second\n".format(
        float(total / time)),
                bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
Example #18
    def test_canonicalize_url_idna_exceptions(self):
        # missing DNS label
        self.assertEqual(
            canonicalize_url(u"http://.example.com/résumé?q=résumé"),
            "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

        # DNS label too long
        self.assertEqual(
            canonicalize_url(u"http://www.{label}.com/résumé?q=résumé".format(
                label=u"example" * 11)),
            "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".
            format(label=u"example" * 11))
Example #19
    def test_canonicalize_url_idempotence(self):
        for url, enc in [(u'http://www.bücher.de/résumé?q=résumé', 'utf8'),
                         (u'http://www.example.com/résumé?q=résumé', 'latin1'),
                         (u'http://www.example.com/résumé?country=Россия', 'cp1251'),
                         (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'iso2022jp')]:
            canonicalized = canonicalize_url(url, encoding=enc)

            # if we canonicalize again, we get the same result
            self.assertEqual(canonicalize_url(canonicalized, encoding=enc), canonicalized)

            # even without the encoding argument, an already canonicalized URL is unchanged
            self.assertEqual(canonicalize_url(canonicalized), canonicalized)
Example #20
    def test_keep_blank_values(self):
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2", keep_blank_values=False),
                                          "http://www.example.com/do?a=2")
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
                                          "http://www.example.com/do?a=2&b=")
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2", keep_blank_values=False),
                                          "http://www.example.com/do?a=2")
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2"),
                                          "http://www.example.com/do?a=2&b=&c=")

        self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
                                           'http://www.example.com/do?1750%2C4=')
Example #21
    def test_canonicalize_url_idna_exceptions(self):
        # missing DNS label
        self.assertEqual(
            canonicalize_url("http://.example.com/résumé?q=résumé"),
            "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9",
        )

        # DNS label too long
        self.assertEqual(
            canonicalize_url(
                f"http://www.{'example' * 11}.com/résumé?q=résumé"),
            f"http://www.{'example' * 11}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9",
        )
Example #22
 def _process_links(self, links):
     links = [x for x in links if self._link_allowed(x)]
     if self.canonicalize:
         for link in links:
             link.url = canonicalize_url(link.url)
     links = self.link_extractor._process_links(links)
     return links
Example #23
def canonicalize(url,
                 remove_parameters=('utm_medium', 'utm_source', 'utm_campaign',
                                    'utm_term', 'utm_content')):
    """Canonicalize URL."""
    try:
        curl = url_query_cleaner(canonicalize_url(url,
                                                  keep_blank_values=False,
                                                  keep_fragments=False),
                                 parameterlist=remove_parameters,
                                 remove=True)
        return canonicalize_url(curl,
                                keep_blank_values=False,
                                keep_fragments=False)
    except Exception as e:
        logger.warning('Failed to canonicalize URL %r: %s', url, e)
        return None
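A hypothetical call to the helper above: tracking parameters are removed, the remaining keys are sorted, and the fragment is dropped (the URL is made up):

    print(canonicalize("http://www.example.com/page?utm_source=news&b=2&a=1#top"))
    # -> http://www.example.com/page?a=1&b=2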
Example #24
    def _parse_links(self, response):
        # Extract the links from the page
        # and resolve relative URLs into absolute ones
        l = LinkLoader(html.html_to_unicode(response))
        l.add_xpath(xpath='//a/@href', re_patten=r'/subject/[0-9]+/$|/tag/.*')
        # To extract links matching several different rules, call add_xpath() multiple times:
        # l.add_xpath(xpath, re_patten)
        # l.add_xpath(xpath, re_patten)
        # Finally, call get() to retrieve all matching links as a list of URLs
        links = l.get()

        base = urlparse.urlparse(response.url)
        domain = '://'.join((base.scheme, base.netloc))
        for url in links:
            # Scrapy's built-in link extractors already implement the steps below;
            # resolving relative URLs could also be done in a middleware
            component = urlparse.urlparse(url)
            # Drop URLs whose host differs from the host of the response URL;
            # Scrapy's default offsite spider middleware then ensures that
            # URLs outside the allowed domains are not crawled
            if (component.netloc) and (component.netloc != base.netloc):
                continue
            # Check whether the URL is already absolute
            if domain not in url:
                url = urlparse.urljoin(domain, url)
            # Normalize the URL, e.g. strip the fragment
            url = canonicalize_url(url)
            # Set the crawl priority of the request
            priority = 5 if self.item_url.search(url) else 0
            # Specifying only an errback without an explicit callback
            # raised an error, so the callback is set explicitly
            yield Request(url=url,
                          callback=self.parse,
                          errback=self.error_back,
                          priority=priority)
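A standalone sketch of the join-then-canonicalize step in the loop above (using Python 3's urllib.parse instead of the Python 2 urlparse module; the URLs are illustrative):

    from urllib.parse import urljoin
    from w3lib.url import canonicalize_url

    base = "http://example.com/tag/fiction/"
    print(canonicalize_url(urljoin(base, "/subject/1234/#reviews")))
    # -> http://example.com/subject/1234/  (fragment removed by default)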
Example #25
def load_products(response):
    """Load a ProductItem from the product page response."""
    loader = ProductItemLoader(item=ProductItem(), response=response)
    url = url_query_cleaner(response.url, ['snr'], remove=True)
    url = canonicalize_url(url)
    loader.add_value('url', url)
    publisher = response.xpath('//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][6]/div[2]/a[1]/span[2]/text()')
    if not publisher:  # xpath() returns an empty SelectorList, never None
        loader.add_xpath('developer','//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][5]/div[2]/a[1]/span[2]/text()')
        loader.add_xpath('publisher','//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][5]/div[2]/a[2]/span[2]/text()')
    else:
        loader.add_xpath('developer','//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][6]/div[2]/a[1]/span[2]/text()')
        loader.add_xpath('publisher','//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][6]/div[2]/a[2]/span[2]/text()')
    loader.add_xpath('release_date','//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][4]/div[2]/text()')
    loader.add_css('app_name', '.header__title ::text')
    loader.add_css('specs', '.game-features__title ::text')
    loader.add_css('genre', '.product-details__data span a.un ::text')

    try:
        price = response.css('.module-buy__info > meta:nth-child(2) ::attr(content)').extract_first()
        price_disc = price
    except Exception:
        price = None
        price_disc = price

    if price is None:
        price = '0.00'
        price_disc = price
    loader.add_value('price', price)
    loader.add_value('discount_price', price_disc)
    

    loader.add_css('rating', 'div.average-rating:nth-child(1) > meta:nth-child(4) ::attr(content)')
    
    return loader.load_item()
Example #26
def cleanup_url(url):
    parsed = urlparse(url)
    url = parsed.netloc + parsed.path
    if parsed.query:  # avoid a dangling '?' when there is no query string
        url += "?" + parsed.query
    url = canonicalize_url(url)
    if url.endswith('/'):
        url = url[:-1]
    return url
Example #27
    def custom_request_fingerprint(self, request, include_headers=None, remove_scheme=None):
        """
        Overridden given that some URL can have a wrong encoding (when it is comes from selenium driver) changes: encode.('utf-8) & in order to be no scheme compliant
        """

        # If use_anchors, anchors in URL matters since each anchor define a different webpage and content (special js_rendering)
        url_for_finger_print = canonicalize_url(request.url) if not self.use_anchors else request.url
        url_for_hash = url_for_finger_print.encode('utf-8')

        # scheme agnosticism
        if remove_scheme:
            # url_for_hash is bytes at this point, so a bytes pattern is required
            match_capture_any_scheme = rb'(https?)(.*)'
            url_for_hash = re.sub(match_capture_any_scheme, rb"\2", url_for_hash)

        if include_headers:
            include_headers = tuple(to_bytes(h.lower())
                                    for h in sorted(include_headers))
        cache = _fingerprint_cache.setdefault(request, {})

        if include_headers not in cache or not remove_scheme:
            # When the scheme is kept, skip the cache and recompute so the
            # fingerprint takes the scheme into account
            fp = hashlib.sha1()
            fp.update(to_bytes(request.method))
            fp.update(to_bytes(url_for_hash))
            fp.update(request.body or b'')
            if include_headers:
                for hdr in include_headers:
                    if hdr in request.headers:
                        fp.update(hdr)
                        for v in request.headers.getlist(hdr):
                            fp.update(v)
            cache[include_headers] = fp.hexdigest()
        return cache[include_headers]
Example #28
 def __init__(self,
              tag="a",
              attr="href",
              unique=False,
              process_value=None,
              strip=True,
              canonicalized=False):
     warnings.warn(
         "BaseSgmlLinkExtractor is deprecated and will be removed in future releases. "
         "Please use scrapy.linkextractors.LinkExtractor",
         ScrapyDeprecationWarning,
         stacklevel=2,
     )
     SGMLParser.__init__(self)
     self.scan_tag = tag if callable(tag) else lambda t: t == tag
     self.scan_attr = attr if callable(attr) else lambda a: a == attr
     self.process_value = (
         lambda v: v) if process_value is None else process_value
     self.current_link = None
     self.unique = unique
     self.strip = strip
     if canonicalized:
         self.link_key = lambda link: link.url
     else:
         self.link_key = lambda link: canonicalize_url(link.url,
                                                       keep_fragments=True)
Example #29
 def build_key(slug, params):
     url = reverse('api:lookup-by-slug', kwargs={'slug': slug})
     params = {key: value for key, value in params.items() if value}
     # using the page slug as a redis hash tag ensures that keys related to
     # the same page land on the same node, preventing delete_many from
     # failing because the keys could be stored across different nodes
     return f'{{{slug}}}' + canonicalize_url(url + '?' + urlencode(params))
Example #30
 def request_fingerprint(self,
                         request,
                         include_headers=None,
                         keep_fragments=False):
     if include_headers:
         include_headers = tuple(
             self.to_bytes(h.lower()) for h in sorted(include_headers))
     cache = _fingerprint_cache.setdefault(request, {})
     cache_key = (include_headers, keep_fragments)
     if cache_key not in cache:
         fp = hashlib.sha1()
         fp.update(self.to_bytes(request.method))
         fp.update(
             self.to_bytes(
                 canonicalize_url(request.url,
                                  keep_fragments=keep_fragments)))
         fp.update(request.body or b'')
         if include_headers:
             for hdr in include_headers:
                 if hdr in request.headers:
                     fp.update(hdr)
                     for v in request.headers.getlist(hdr):
                         fp.update(v)
         cache[cache_key] = fp.hexdigest()
     return cache[cache_key]
Example #31
 def test_safe_characters_unicode(self):
     # urllib.quote uses a mapping cache of encoded characters. When parsing
     # an already percent-encoded URL, it will fail if that URL was not
     # percent-encoded as UTF-8; that's why canonicalize_url must always
     # convert the URLs to string. The following test asserts that
     # functionality.
     self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
                                        'http://www.example.com/caf%E9-con-leche.htm')
Example #32
 def __init__(self, tag="a", attr="href", process=None, unique=False,
              strip=True, canonicalized=False):
     self.scan_tag = tag if callable(tag) else lambda t: t == tag
     self.scan_attr = attr if callable(attr) else lambda a: a == attr
     self.process_attr = process if callable(process) else lambda v: v
     self.unique = unique
     self.strip = strip
     if canonicalized:
         self.link_key = lambda link: link.url
     else:
         self.link_key = lambda link: canonicalize_url(link.url,
                                                       keep_fragments=True)
Example #33
def request_fingerprint(request, include_headers=None):
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example is cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers, use the
    include_headers argument, which is a list of Request headers to include.

    """
    if include_headers:
        include_headers = tuple(to_bytes(h.lower())
                                 for h in sorted(include_headers))
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url)))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
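A minimal usage sketch (assuming Scrapy's Request class is available alongside the function above) confirming that the two equivalent URLs from the docstring get the same fingerprint:

    from scrapy import Request

    fp1 = request_fingerprint(Request("http://www.example.com/query?id=111&cat=222"))
    fp2 = request_fingerprint(Request("http://www.example.com/query?cat=222&id=111"))
    assert fp1 == fp2  # query order is normalized by canonicalize_url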
Example #34
 def test_domains_are_case_insensitive(self):
     self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
                                       "http://www.example.com/")
Example #35
 def test_append_missing_path(self):
     self.assertEqual(canonicalize_url("http://www.example.com"),
                                       "http://www.example.com/")
Example #36
 def test_dont_convert_safe_characters(self):
     # don't convert safe characters to their percent-encoded representation
     self.assertEqual(canonicalize_url(
         "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
         "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
def test_cookies(settings):

    # 64K of headers is over Twisted's limit, so if these headers
    # were sent to Splash, the request would fail.
    BOMB = 'x' * 64000

    class LuaScriptSpider(ResponseSpider):
        """ Cookies must be sent to website, not to Splash """
        custom_settings = {
            'SPLASH_COOKIES_DEBUG': True,
            'COOKIES_DEBUG': True,
        }

        def start_requests(self):
            # cookies set without Splash should still be
            # sent to the remote website. FIXME: this is not the case.
            yield scrapy.Request(self.url + "/login", self.parse,
                                 cookies={'x-set-scrapy': '1'})

        def parse(self, response):
            yield SplashRequest(self.url + "#egg", self.parse_1,
                                endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT},
                                cookies={'x-set-splash': '1'})

        def parse_1(self, response):
            yield {'response': response}
            yield SplashRequest(self.url + "#foo", self.parse_2,
                                endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT})

        def parse_2(self, response):
            yield {'response': response}
            yield scrapy.Request(self.url, self.parse_3)

        def parse_3(self, response):
            # Splash (Twisted) drops requests with huge http headers,
            # but this one should work, as cookies are not sent
            # to Splash itself.
            yield {'response': response}
            yield SplashRequest(self.url + "#bar", self.parse_4,
                                endpoint='execute',
                                args={'lua_source': DEFAULT_SCRIPT},
                                cookies={'bomb': BOMB})

        def parse_4(self, response):
            yield {'response': response}


    def _cookie_dict(har_cookies):
        return {c['name']: c['value'] for c in har_cookies}

    items, url, crawler = yield crawl_items(LuaScriptSpider, ManyCookies,
                                            settings)
    assert len(items) == 4

    # cookie should be sent to remote website, not to Splash
    resp = items[0]['response']
    splash_request_headers = resp.request.headers
    cookies = resp.data['args']['cookies']
    print(splash_request_headers)
    print(cookies)
    assert _cookie_dict(cookies) == {
        # 'login': '******',   # FIXME
        'x-set-splash': '1'
    }
    assert splash_request_headers.get(b'Cookie') is None

    # new cookie should be also sent to remote website, not to Splash
    resp2 = items[1]['response']
    splash_request_headers = resp2.request.headers
    headers = resp2.data['args']['headers']
    cookies = resp2.data['args']['cookies']
    assert canonicalize_url(headers['Referer']) == canonicalize_url(url)
    assert _cookie_dict(cookies) == {
        # 'login': '******',
        'x-set-splash': '1',
        'sessionid': 'ABCD'
    }
    print(splash_request_headers)
    print(headers)
    print(cookies)
    assert splash_request_headers.get(b'Cookie') is None

    # TODO/FIXME: Cookies fetched when working with Splash should be picked up
    # by Scrapy
    resp3 = items[2]['response']
    splash_request_headers = resp3.request.headers
    cookie_header = splash_request_headers.get(b'Cookie')
    assert b'x-set-scrapy=1' in cookie_header
    assert b'login=1' in cookie_header
    assert b'x-set-splash=1' in cookie_header
    # assert b'sessionid=ABCD' in cookie_header  # FIXME

    # cookie bomb shouldn't cause problems
    resp4 = items[3]['response']
    splash_request_headers = resp4.request.headers
    cookies = resp4.data['args']['cookies']
    assert _cookie_dict(cookies) == {
        # 'login': '******',
        'x-set-splash': '1',
        'sessionid': 'ABCD',
        'bomb': BOMB,
    }
    assert splash_request_headers.get(b'Cookie') is None
Example #38
 def test_sorting(self):
     self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
                                       "http://www.example.com/do?a=50&b=2&b=5&c=3")
Example #39
 def test_canonicalize_url_unicode_path(self):
     self.assertEqual(canonicalize_url(u"http://www.example.com/résumé"),
                                       "http://www.example.com/r%C3%A9sum%C3%A9")
Example #40
 def test_urls_with_auth_and_ports(self):
     self.assertEqual(canonicalize_url(u"http://*****:*****@www.example.com:81/do?now=1"),
                                       u"http://*****:*****@www.example.com:81/do?now=1")
Example #41
 def _get_fingerprint(self, url):
     return self.fingerprint_function(canonicalize_url(url))