Example #1
 def test_spaces(self):
     self.assertEqual(canonicalize_url("http://www.example.com/do?q=a space&a=1"),
                                       "http://www.example.com/do?a=1&q=a+space")
     self.assertEqual(canonicalize_url("http://www.example.com/do?q=a+space&a=1"),
                                       "http://www.example.com/do?a=1&q=a+space")
     self.assertEqual(canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
                                       "http://www.example.com/do?a=1&q=a+space")
Example #2
 def test_non_ascii_percent_encoding_in_query_arguments(self):
     self.assertEqual(canonicalize_url(u"http://www.example.com/do?price=\xa3500&a=5&z=3"),
                                       u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
     self.assertEqual(canonicalize_url(b"http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
                                       "http://www.example.com/do?a=5&price=%C2%A3500&z=3")
     self.assertEqual(canonicalize_url(b"http://www.example.com/do?price(\xc2\xa3)=500&a=1"),
                                       "http://www.example.com/do?a=1&price%28%C2%A3%29=500")
Example #3
 def test_typical_usage(self):
     self.assertEqual(canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
                                       "http://www.example.com/do?a=1&b=2&c=3")
     self.assertEqual(canonicalize_url("http://www.example.com/do?c=1&b=2&a=3"),
                                       "http://www.example.com/do?a=3&b=2&c=1")
     self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"),
                                       "http://www.example.com/do?a=1")
Example #4
    def test_canonicalize_url_unicode_query_string_wrong_encoding(self):
        # trying to encode with the wrong encoding
        # falls back to UTF-8
        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?currency=€", encoding='latin1'),
                                          "http://www.example.com/r%C3%A9sum%C3%A9?currency=%E2%82%AC")

        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='latin1'),
                                          "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F")
Example #5
 def test_canonicalize_urlparsed(self):
     # canonicalize_url() can be passed an already urlparse'd URL
     self.assertEqual(canonicalize_url(urlparse(u"http://www.example.com/résumé?q=résumé")),
                                       "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
     self.assertEqual(canonicalize_url(urlparse('http://www.example.com/caf%e9-con-leche.htm')),
                                       'http://www.example.com/caf%E9-con-leche.htm')
     self.assertEqual(canonicalize_url(urlparse("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
                                       "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
Example #6
 def test_canonicalize_parse_url(self):
     # parse_url() wraps urlparse and is used in link extractors
     self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
                                       "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
     self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
                                       'http://www.example.com/caf%E9-con-leche.htm')
     self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
                                       "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
Example #7
 def test_non_ascii_percent_encoding_in_paths(self):
     self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
                                       "http://www.example.com/a%20do?a=1"),
     self.assertEqual(canonicalize_url("http://www.example.com/a %20do?a=1"),
                                       "http://www.example.com/a%20%20do?a=1"),
     self.assertEqual(canonicalize_url(u"http://www.example.com/a do£.html?a=1"),
                                       "http://www.example.com/a%20do%C2%A3.html?a=1")
     self.assertEqual(canonicalize_url(b"http://www.example.com/a do\xc2\xa3.html?a=1"),
                                       "http://www.example.com/a%20do%C2%A3.html?a=1")
Example #8
    def test_canonicalize_url_unicode_query_string(self):
        # default encoding for path and query is UTF-8
        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé"),
                                          "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

        # passed encoding will affect query string
        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?q=résumé", encoding='latin1'),
                                          "http://www.example.com/r%C3%A9sum%C3%A9?q=r%E9sum%E9")

        self.assertEqual(canonicalize_url(u"http://www.example.com/résumé?country=Россия", encoding='cp1251'),
                                          "http://www.example.com/r%C3%A9sum%C3%A9?country=%D0%EE%F1%F1%E8%FF")
Example #9
    def test_canonicalize_url_idempotence(self):
        for url, enc in [(u'http://www.bücher.de/résumé?q=résumé', 'utf8'),
                         (u'http://www.example.com/résumé?q=résumé', 'latin1'),
                         (u'http://www.example.com/résumé?country=Россия', 'cp1251'),
                         (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'iso2022jp')]:
            canonicalized = canonicalize_url(url, encoding=enc)

            # if we canonicalize again, we get the same result
            self.assertEqual(canonicalize_url(canonicalized, encoding=enc), canonicalized)

            # without encoding, already canonicalized URL is canonicalized identically
            self.assertEqual(canonicalize_url(canonicalized), canonicalized)
Example #10
    def test_keep_blank_values(self):
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2", keep_blank_values=False),
                                          "http://www.example.com/do?a=2")
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
                                          "http://www.example.com/do?a=2&b=")
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2", keep_blank_values=False),
                                          "http://www.example.com/do?a=2")
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&c&a=2"),
                                          "http://www.example.com/do?a=2&b=&c=")

        self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
                                           'http://www.example.com/do?1750%2C4=')
Example #11
    def test_canonicalize_url_idna_exceptions(self):
        # missing DNS label
        self.assertEqual(
            canonicalize_url(u"http://.example.com/résumé?q=résumé"),
            "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

        # DNS label too long
        self.assertEqual(
            canonicalize_url(
                u"http://www.{label}.com/résumé?q=résumé".format(
                    label=u"example"*11)),
            "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
                    label=u"example"*11))
Example #12
    def process_spider_output(self, response, result, spider):
        """Process spider output."""
        if not getattr(spider, "do_last_modified_check", False):
            return result
        last_modified_header = response.headers.get("Last-Modified", None)
        if last_modified_header is None:
            # We don't need to store the links, since the page has no
            # Last-Modified header.
            return result

        source_url = canonicalize_url(response.request.url)
        try:
            url_last_modified = UrlLastModified.objects.get(
                url=source_url,
                scanner=self.get_scanner_object(spider)
            )
        except UrlLastModified.DoesNotExist:
            # We never stored the URL for the original request: this
            # shouldn't happen.
            return result

        logging.debug("Updating links for %s" % url_last_modified)

        # Clear existing links
        url_last_modified.links.clear()

        # Update links
        for r in result:
            if isinstance(r, Request):
                if spider.is_offsite(r) or spider.is_excluded(r):
                    continue
                target_url = canonicalize_url(r.url)
                # Get or create a URL last modified object
                try:
                    link = UrlLastModified.objects.get(
                        url=target_url,
                        scanner=self.get_scanner_object(spider)
                    )
                except UrlLastModified.DoesNotExist:
                    # Create new link
                    link = UrlLastModified(
                        url=target_url,
                        last_modified=None,
                        scanner=self.get_scanner_object(spider)
                    )
                    link.save()
                # Add the link to the URL last modified object
                url_last_modified.links.add(link)
                logging.debug("Added link %s" % link)
        return result
Example #13
def canonicalizer(url):
    """
    Source: https://github.com/scrapy/scrapy
    
    Normalizes the URL.
    """
    return canonicalize_url(url)
Example #14
def process_mp_page_url(url):
    """
    >>> process_mp_page_url('http://www3.lrs.lt/pls/inter/'
    ...                     'w5_smn_akt_new.seim_nar_proj'
    ...                     '?p_start=2012-11-16&p_end=&p_kad_ses='
    ...                     '&p_asm_id=7198&p_grup_id=8&p_forma=')
    'http://www3.lrs.lt/pls/inter/w5_smn_akt_new.seim_nar_proj?p_asm_id=7198&p_end=&p_forma=&p_grup_id=8&p_kad_ses=&p_no=1&p_rus=&p_start=2012-11-16'
    >>> process_mp_page_url('http://www3.lrs.lt/pls/inter/'
    ...                     'w5_smn_akt_new.seim_nar_proj'
    ...                     '?p_start=2012-11-16&p_end=&p_kad_ses='
    ...                     '&p_asm_id=7198&p_grup_id=8&p_forma=&p_no=2')
    'http://www3.lrs.lt/pls/inter/w5_smn_akt_new.seim_nar_proj?p_asm_id=7198&p_end=&p_forma=&p_grup_id=8&p_kad_ses=&p_no=2&p_rus=&p_start=2012-11-16'
    """  # noqa
    parts = urlparse(url)
    qs = parse_qs(parts.query, keep_blank_values=True)
    if 'p_no' not in qs:
        qs['p_no'] = 1
    if 'p_rus' not in qs:
        qs['p_rus'] = ''
    new_url = urlunparse([
        parts.scheme,
        parts.netloc,
        parts.path,
        parts.params,
        urlencode(qs, doseq=True),
        parts.fragment,
    ])

    return canonicalize_url(new_url)
Example #15
def fingerprint_request(req, arguments=None):
    """
    from scrapy
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same resource
    and are equivalent (i.e. they should return the same response).

    """
    cache = _fingerprint_cache.setdefault(req, {})
    url = canonicalize_url(req.url)
    ignore_headers = ['Connection', 'User-Agent', 'Referer', ]
    if url not in cache:
        fp = hashlib.sha1()
        fp.update(str(url))
        fp.update(str(req.method))
        fp.update(str(req.body))
        if arguments:
            for name, value in arguments.iteritems():
                fp.update("%s%s" % (name, value))

        for name, value in req.headers.iteritems():
            if name in ignore_headers:
                continue
            fp.update("%s%s" % (name, value))
        cache[url] = fp.hexdigest()
    return cache[url]
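A self-contained sketch of the property the docstring describes, assuming w3lib's canonicalize_url: the two example URLs normalize to the same string, so hashing the canonical form yields one fingerprint for both.

import hashlib
from w3lib.url import canonicalize_url

u1 = "http://www.example.com/query?id=111&cat=222"
u2 = "http://www.example.com/query?cat=222&id=111"

# both URLs canonicalize to the same string ...
assert canonicalize_url(u1) == canonicalize_url(u2)

# ... so a hash over the canonical URL identifies the resource, not the spelling
fp1 = hashlib.sha1(canonicalize_url(u1).encode("utf-8")).hexdigest()
fp2 = hashlib.sha1(canonicalize_url(u2).encode("utf-8")).hexdigest()
assert fp1 == fp2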
Example #16
def request_fingerprint(request, include_headers=None):
	"""
	Calculate request fingerprint on a modified request.
	"""
	if include_headers:
		include_headers = tuple([h.lower() for h in sorted(include_headers)])
	cache = _fingerprint_cache.setdefault(request, {})
	if include_headers not in cache:
		fp = hashlib.sha1()
		fp.update(request.method)
		# hack here #
		# * filter incrementing 'requestid' from url
		rewritten_url = canonicalize_url(request.url)
		#rewritten_url = re.sub(r'(?:requestid=\d+)', '', rewritten_url)
		fp.update(rewritten_url)
		# * ignore sessionid from xhr post body
		parsed_body = request.body or ''
		#parsed_body = re.sub(r'(httpSessionId=.*?\n)', '', parsed_body)
		#parsed_body = re.sub(r'(scriptSessionId=.*?\n)', '', parsed_body)
		fp.update(parsed_body or '')
		# end hack #
		if include_headers:
			for hdr in include_headers:
				if hdr in request.headers:
					fp.update(hdr)
					for v in request.headers.getlist(hdr):
						fp.update(v)
		cache[include_headers] = fp.hexdigest()
	return cache[include_headers]
Example #17
    def extract_links(self, response):
        if self.restrict_xpaths:
            hxs = HtmlXPathSelector(response)
            html_slice = ''.join(''.join(html_fragm for html_fragm in hxs.select(xpath_expr).extract()) \
                for xpath_expr in self.restrict_xpaths)
            links = self._extract_links(html_slice, response.url, response.encoding)
        else:
            links = BaseSgmlLinkExtractor.extract_links(self, response)

        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [link for link in links if _matches(link.url, self.allow_res)]
        if self.deny_res:
            links = [link for link in links if not _matches(link.url, self.deny_res)]
        if self.allow_domains:
            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
        if self.deny_domains:
            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        return links
Example #18
    def extract_links(self, response):
        xs = HtmlXPathSelector(response)
        base_url = xs.select('//base/@href').extract()
        base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url

        links = []
        for location in self.locations:
            if isinstance(location, basestring):
                selectors = xs.select(location)
            elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
                selectors = [location] if isinstance(location, HtmlXPathSelector) else location
            else:
                continue

            for selector in selectors:
                links.extend(self.extract_from_selector(selector, response.encoding))

        seen, ret = set(), []
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response.encoding)
            if self.unique:
                if link.url in seen:
                    continue
                else:
                    seen.add(link.url)
            if self.canonicalize:
                link.url = canonicalize_url(link.url)
            ret.append(link)

        return ret
Example #19
    def _process_links(self, links):
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [link for link in links if _matches(link.url, self.allow_res)]
        if self.deny_res:
            links = [link for link in links if not _matches(link.url, self.deny_res)]
        if self.allow_domains:
            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
        if self.deny_domains:
            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]

        new_links = []
        for link in links:
            CustomerId = link.url.split('/')[6]
            if not self._ignore_identifier(CustomerId):
                log.msg("Found CustomerId: "+CustomerId,level=log.DEBUG)
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Example #20
def extract_norm_uri(uric):
    #print uric
    if "\\u" in uric:
        uric=uric.decode('unicode-escape')
    norm = URIRef(canonicalize_url(uric.replace("<","").replace(">","")))
    #print norm
    return norm
Example #21
    def _process_links(self, links):
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [link for link in links if _matches(link.url, self.allow_res)]
        if self.deny_res:
            links = [link for link in links if not _matches(link.url, self.deny_res)]
        if self.allow_domains:
            links = [link for link in links if url_is_from_any_domain(link.url, self.allow_domains)]
        if self.deny_domains:
            links = [link for link in links if not url_is_from_any_domain(link.url, self.deny_domains)]

        new_links = []
        for link in links:
            ASIN = link.url.split('/')[5]
            if not self._ignore_identifier(ASIN):
                log.msg("Found ASIN: "+ASIN,level=log.DEBUG)
                link.url = "http://www.amazon.com/product-reviews/"+ASIN+"/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0"
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Example #22
    def test_normalize_percent_encoding_in_paths(self):
        self.assertEqual(canonicalize_url("http://www.example.com/r%c3%a9sum%c3%a9"),
                                          "http://www.example.com/r%C3%A9sum%C3%A9")

        # non-UTF8 encoded sequences: they should be kept untouched, only upper-cased
        # 'latin1'-encoded sequence in path
        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
                                          "http://www.example.com/a%A3do")

        # 'latin1'-encoded path, UTF-8 encoded query string
        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9"),
                                          "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")

        # 'latin1'-encoded path and query string
        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do?q=r%e9sum%e9"),
                                          "http://www.example.com/a%A3do?q=r%E9sum%E9")
Example #23
    def test_process_spider_output(self):
        res = Response('http://scrapytest.org')
        req_url = 'http://scrapytest.org/?last=1&first=2'
        reqs = [Request(req_url)]

        out = list(self.mw.process_spider_output(res, reqs, self.spider))
        self.assertEquals(out[0].url, canonicalize_url(req_url))
Example #24
 def _process_links(self, links):
     links = [x for x in links if self._link_allowed(x)]
     if self.canonicalize:
         for link in links:
             link.url = canonicalize_url(urlparse(link.url))
     links = self.link_extractor._process_links(links)
     return links
Example #25
def request_fingerprint(request, include_headers=None):
    """
    Return the request fingerprint.
    
    This was taken from scrapy.utils, but we added canonicalization for the Referer URL.
    
    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:
    
    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example is cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers, use the
    include_headers argument, which is a list of Request headers to include.

    """
    if include_headers:
        include_headers = tuple([h.lower() for h in sorted(include_headers)])
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(request.method)
        fp.update(canonicalize_url(request.url))
        fp.update(request.body or '')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        if v != '':
                            v = canonicalize_url(v)
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
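A simplified, self-contained illustration of why headers are ignored by default (this is not the function above, just the idea, and simple_fingerprint is a hypothetical helper): two requests that differ only in a session cookie hash to the same value when only the method, the canonical URL and the body feed the digest.

import hashlib
from w3lib.url import canonicalize_url

def simple_fingerprint(method, url, body=b"", headers=None):
    # headers are deliberately left out of the digest, as in the function above
    fp = hashlib.sha1()
    fp.update(method.encode("utf-8"))
    fp.update(canonicalize_url(url).encode("utf-8"))
    fp.update(body)
    return fp.hexdigest()

a = simple_fingerprint("GET", "http://www.example.com/members/offers.html",
                       headers={"Cookie": "sessionid=abc123"})
b = simple_fingerprint("GET", "http://www.example.com/members/offers.html",
                       headers={"Cookie": "sessionid=zzz999"})
assert a == b  # the random session cookie does not change the fingerprint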
Example #26
 def test_safe_characters_unicode(self):
     # urllib.quote uses a mapping cache of encoded characters. when parsing
     # an already percent-encoded url, it will fail if that url was not
     # percent-encoded as utf-8, that's why canonicalize_url must always
     # convert the urls to string. the following test asserts that
     # functionality.
     self.assertEqual(canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
                                        'http://www.example.com/caf%E9-con-leche.htm')
Example #27
 def get_stored_links(self, url, spider):
     """Return the links that have been stored for this URL."""
     url = canonicalize_url(url)
     try:
         url_last_modified = UrlLastModified.objects.get(
             url=url, scanner=self.get_scanner_object(spider))
         return url_last_modified.links.all()
     except UrlLastModified.DoesNotExist:
         return []
Example #28
 def get_stored_last_modified_object(self, url, spider):
     """Return the UrlLastModified object for the given URL."""
     url = canonicalize_url(url)
     try:
         url_last_modified = UrlLastModified.objects.get(
             url=url, scanner=self.get_scanner_object(spider))
         return url_last_modified
     except UrlLastModified.DoesNotExist:
         return None
Example #29
 def _pagination_urls(self, response):
     return [
         url for url in
         unique(
             canonicalize_url(url, keep_fragments=True)
             for url in autopager.urls(response)
         )
         if self.link_extractor.matches(url)
         ]
Example #30
 def process_spider_output(self, response, result, spider):
     disabled = getattr(spider, 'urlfilter_disabled', False)
     for r in result:
         if isinstance(r, Request) and not disabled:
             curl = canonicalize_url(r.url)
             # only assign if different to avoid re-calculating fingerprint
             if curl != r.url: 
                 r.url = curl
         yield r
Example #31
 def handle_form(self, url, form, meta):
     action = canonicalize_url(urljoin(url, form.action))
     if not self.link_extractor.matches(action):
         return
     if (meta['form'] == 'search' and
             self.settings.getbool('CRAZY_SEARCH_ENABLED') and
             action not in self.handled_search_forms and
             len(self.handled_search_forms) <
             self.settings.getint('MAX_DOMAIN_SEARCH_FORMS')):
         self.logger.debug('Found a search form at %s', url)
         self.handled_search_forms.add(action)
         for request_kwargs in search_form_requests(
                 url, form, meta,
                 search_terms=self.search_terms,
                 extra_search_terms=self.extra_search_terms):
             request_kwargs['meta'] = {'is_search': True}
             request_kwargs['cls'] = \
                 SplashFormRequest if self.use_splash else FormRequest
             yield request_kwargs
Example #32
def request_fingerprint(request, include_headers=None):
    """
    Return the request fingerprint.
    
    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:
    
    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs, both point to the same resource
    and are equivalent (i.e. they should return the same response).

    Another example is cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers, use the
    include_headers argument, which is a list of Request headers to include.

    """
    if include_headers:
        include_headers = tuple([h.lower() for h in sorted(include_headers)])
    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(request.method)
        fp.update(canonicalize_url(request.url))
        fp.update(request.body or '')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
Example #33
 def _link_allowed(self, link):
     parsed_url = urlparse(link.url)
     allowed = _is_valid_url(link.url)
     if self.allow_res:
         allowed &= qualified_range_match(link.url, self.allow_res,
                                          self.allow_range)
     if self.deny_res:
         allowed &= not qualified_range_match(link.url, self.deny_res,
                                              self.deny_range)
     if self.allow_domains:
         allowed &= url_is_from_any_domain(parsed_url, self.allow_domains)
     if self.deny_domains:
         allowed &= not url_is_from_any_domain(parsed_url,
                                               self.deny_domains)
     if self.deny_extensions:
         allowed &= not url_has_any_extension(parsed_url,
                                              self.deny_extensions)
     if allowed and self.canonicalize:
         link.url = canonicalize_url(parsed_url)
     return allowed
Example #34
    def parse_cat(self, response):
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        products = hxs.select(
            '//div[@class="bd"]/ol/li/ul/li/h4/a/@href').extract()

        next_page = hxs.select(
            '//div[@class="pagenav"]/a[@class="next"]/@href').extract()
        if next_page:
            yield Request(url=canonicalize_url(
                urljoin_rfc(self.URL_BASE, next_page[0])),
                          callback=self.parse_cat)

        products = hxs.select(
            '//div[@class="bd"]/ol/li/ul/li/h4/a/@href').extract()

        for product in products:
            url = urljoin_rfc(get_base_url(response), product)
            yield Request(url, callback=self.parse_product)
Example #35
    def parse(self, response):
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        pages = set(hxs.select('//*[@class="pages"]//a/@href').extract())

        for page in pages:
            yield Request(urljoin_rfc(base_url, page))

        cat_name = hxs.select(
            "//div[@class='page-title category-title']/h1/text()").extract()

        # Dive in product, if it is
        products = hxs.select(
            "//div[@class='category-products']/ul/li/h2/a/@href").extract()
        if products:
            for product in products:
                yield Request(
                    url=canonicalize_url(product),
                    meta={"cat_name": cat_name},
                    callback=self.parse_product)
Example #36
    def parse_products(self, response):
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # Fill up the Product model fields
        #identifier =
        url = canonicalize_url(response.url)
        name = hxs.select("//div[@class='product-name']/h1/text()"
            ).extract()[0].strip()
        price = hxs.select(
            "//div[@class='pricing']/div[@class='m-price']/span/text()"
            ).extract()
        if not price:
            price = ""
        sku = hxs.select("//div[@class='product-name']/p/text()"
            ).extract()[0].split(" ")[1]
        #metadata =
        category = response.meta["cat_name"]
        image_url = hxs.select("//div[@class='main_image']/a/img/@src"
            ).extract()
        if not image_url:
            image_url = hxs.select(
                "//div[@class='product-img-box']/p/img/@src").extract()
            if not image_url:
                image_url = ""
        #brand =
        #shipping_cost =

        l = ProductLoader(response=response, item=Product())
        #l.add_value('identifier', identifier)
        l.add_value('url', url)
        l.add_value('name', name)
        l.add_value('price', price)
        l.add_value('sku', sku)
        #l.add_value('metadata', metadata)
        l.add_value('category', category)
        l.add_value('image_url', image_url)
        #l.add_value('brand', brand)
        #l.add_value('shipping_cost', shipping_cost)
        yield l.load_item()
Example #37
    def _process_links(self, links):
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [
                link for link in links if _matches(link.url, self.allow_res)
            ]
        if self.deny_res:
            links = [
                link for link in links if not _matches(link.url, self.deny_res)
            ]
        if self.allow_domains:
            links = [
                link for link in links
                if url_is_from_any_domain(link.url, self.allow_domains)
            ]
        if self.deny_domains:
            links = [
                link for link in links
                if not url_is_from_any_domain(link.url, self.deny_domains)
            ]

        new_links = []
        for link in links:
            ASIN = link.url.split('/')[5]
            if not self._ignore_identifier(ASIN):
                log.msg("Found ASIN: " + ASIN, level=log.DEBUG)
                link.url = "http://www.amazon.com/product-reviews/" + ASIN + "/ref%3Ddp_top_cm_cr_acr_txt?ie=UTF8&showViewpoints=0"
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Example #38
    def _process_links(self, links):
        links = [link for link in links if _is_valid_url(link.url)]

        if self.allow_res:
            links = [
                link for link in links if _matches(link.url, self.allow_res)
            ]
        if self.deny_res:
            links = [
                link for link in links if not _matches(link.url, self.deny_res)
            ]
        if self.allow_domains:
            links = [
                link for link in links
                if url_is_from_any_domain(link.url, self.allow_domains)
            ]
        if self.deny_domains:
            links = [
                link for link in links
                if not url_is_from_any_domain(link.url, self.deny_domains)
            ]

        new_links = []
        for link in links:
            CustomerId = link.url.split('/')[6]
            if not self._ignore_identifier(CustomerId):
                log.msg("Found CustomerId: " + CustomerId, level=log.DEBUG)
                new_links.append(link)

        links = new_links

        if self.canonicalize:
            for link in links:
                link.url = canonicalize_url(link.url)

        links = BaseSgmlLinkExtractor._process_links(self, links)
        return links
Example #39
    def parse_full(self, response):
        hxs = HtmlXPathSelector(response)

        cats = hxs.select(
                '//*[@id="tContent"]/div/div/div[@class="column"]'
                '/ul/li/a/@href').extract()
        if cats:
            for cat in cats:
                yield Request(
                        url=canonicalize_url(cat),
                        callback=self.parse_full)

        next_page = hxs.select(
                '//*[@id="bottompagination"]/div/a[@class="lnext"]/@href'
                ).extract()
        if next_page:
            if len(next_page)>1:
                yield Request(
                        url=canonicalize_url(next_page[1]),
                        callback=self.parse_full)
            else:
                yield Request(
                        url=canonicalize_url(next_page[0]),
                        callback=self.parse_full)

        products = hxs.select(
                '//div[@class="productBlock clearfix " '
                'or @class="productBlock clearfix topmrgn"]')
        if products:
            for product in products:
                brand = product.select(
                    'div/div/div[@class="brandTop"]/text()').extract()[0]
                title = product.select(
                    'div/div[@id="productTitle"]/h2/a/text()'
                ).extract()[0]
                name = ' '.join((brand, title))

                url = product.select('div/div[@id="productTitle"]/h2/a/@href').extract()[0]

                price = product.select(
                    'div[@id="productRight"]/ul/li[@class="price"]'
                    '/span[@class="value"]/text()').extract()
                if not price:
                    price = product.select(
                        'div[@id="productRight"]/ul'
                        '/li[@class="discountPrice"]'
                        '/span[@class="value"]/text()').extract()
                if not price:
                    price = product.select(
                        'div[@id="productRight"]/ul'
                        '/li[@class="map youPay"]'
                        '/span[@class="value"]/text()').extract()
                if not price:
                    price_label = product.select(
                        'div/ul/li/span[@class="label"]//text()'
                    ).extract()
                    if price_label and 'Savings' not in price_label[0]:
                        price = product.select(
                            'div/ul/li/span[@class="value"]/text()'
                        ).extract()
                if not price:
                    price = ''
                else:
                    price = price[0]

                if price:
                    loader = ProductLoader(item=Product(), selector=product)
                    loader.add_value('url', url)
                    loader.add_value('name', name)
                    loader.add_value('price', price)
                    yield loader.load_item()
                else:
                    # parse product page if price not found
                    yield Request(
                        url=url,
                        callback=self.parse_product)
Example #40
 def test_remove_fragments(self):
     self.assertEqual(canonicalize_url(u"http://*****:*****@www.example.com/do?a=1#frag"),
                                       u"http://*****:*****@www.example.com/do?a=1")
     self.assertEqual(canonicalize_url(u"http://*****:*****@www.example.com/do?a=1#frag", keep_fragments=True),
                                       u"http://*****:*****@www.example.com/do?a=1#frag")
Example #41
 def test_dont_convert_safe_characters(self):
     # don't convert safe characters to their percent-encoded representation
     self.assertEqual(canonicalize_url(
         "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"),
         "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html")
Example #42
def url_fingerprint(url):
    url = canonicalize_url(url, keep_fragments=True)
    return hashlib.sha1(url.encode()).hexdigest()
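This variant passes keep_fragments=True, so URLs that differ only in their #fragment keep distinct fingerprints; Example #44 below omits the flag, and canonicalize_url then strips fragments (see test_remove_fragments in Example #40), collapsing such URLs. A quick sketch, assuming w3lib's canonicalize_url and using a hypothetical fingerprint_of helper that makes the flag explicit:

import hashlib
from w3lib.url import canonicalize_url

def fingerprint_of(url, keep_fragments=True):
    canonical = canonicalize_url(url, keep_fragments=keep_fragments)
    return hashlib.sha1(canonical.encode()).hexdigest()

plain = "http://www.example.com/do?a=1"
with_frag = "http://www.example.com/do?a=1#frag"

# fragments kept: the two URLs get different fingerprints
assert fingerprint_of(plain) != fingerprint_of(with_frag)
# fragments dropped (canonicalize_url's default): they collapse to one fingerprint
assert fingerprint_of(plain, keep_fragments=False) == fingerprint_of(with_frag, keep_fragments=False)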
Example #43
 def test_urls_with_auth_and_ports(self):
     self.assertEqual(canonicalize_url(u"http://*****:*****@www.example.com:81/do?now=1"),
                                       u"http://*****:*****@www.example.com:81/do?now=1")
Example #44
def url_fingerprint(url):
    fp = hashlib.sha1()
    fp.update(canonicalize_url(url))
    return fp.hexdigest()
Example #45
 def url_sha1(self, url):
     fp = hashlib.sha1()
      # the URL's components are re-ordered, e.g. some URLs have the same query parameters but in a different order
     fp.update(canonicalize_url(url).encode("utf-8"))
     url_sha1 = fp.hexdigest()
     return url_sha1
Example #46
 def test_canonicalize_url(self):
     # simplest case
     self.assertEqual(canonicalize_url("http://www.example.com/"),
                                       "http://www.example.com/")
Example #47
 def test_append_missing_path(self):
     self.assertEqual(canonicalize_url("http://www.example.com"),
                                       "http://www.example.com/")
Example #48
 def test_sorting(self):
     self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
                                       "http://www.example.com/do?a=50&b=2&b=5&c=3")
Example #49
 def test_domains_are_case_insensitive(self):
     self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
                                       "http://www.example.com/")
Example #50
    def has_been_modified(self, request, response, spider):
        """Return whether the response was modified since last seen.

        We check against the database here.
        If the response has been modified, we update the database.
        If there is no stored last modified date, we save one.
        """
        # Check the Last-Modified header to see if the content has been
        # updated since the last time we checked it.
        last_modified_header = response.headers.get("Last-Modified", None)
        if last_modified_header is not None:
            last_modified_header_date = datetime.datetime.fromtimestamp(
                mktime_tz(parsedate_tz(last_modified_header.decode('utf-8'))),
                tz=pytz.utc)
        else:
            last_modified_header_date = None

        if last_modified_header_date is None and request.method == 'GET':
            content_type_header = response.headers.get("Content-Type",
                                                       None).decode('utf-8')
            if content_type_header.startswith("text/html"):
                body_html = None
                try:
                    body_html = html.fromstring(response.body)
                except Exception:
                    logging.info(
                        'Error occurred while trying to extract string from response body.'
                    )

                meta_dict = {}
                if body_html is not None:
                    meta_dict = {
                        list(el.values())[0]: list(el.values())[1]
                        for el in body_html.findall('head/meta')
                    }
                if 'last-modified' in meta_dict:
                    lm = meta_dict['last-modified']
                    try:
                        last_modified_header_date = arrow.get(lm).datetime
                    except Exception:
                        logging.error(
                            "Date format error on last modified: {0}".format(lm))

        # lastmod comes from a sitemap.xml file
        sitemap_lastmod_date = request.meta.get("lastmod", None)
        if sitemap_lastmod_date is None:
            last_modified = last_modified_header_date
            logging.debug("Using header's last-modified date: %s" %
                          last_modified)
        else:
            if last_modified_header_date is None:
                # No Last-Modified header, use the lastmod from the sitemap
                last_modified = sitemap_lastmod_date
                logging.debug("Using lastmod from sitemap %s" % last_modified)
            else:
                # Take the most recent of the two
                logging.debug(
                    "Taking most recent of (header) %sand (sitemap) %s" %
                    (last_modified_header_date, sitemap_lastmod_date))
                last_modified = max(last_modified_header_date,
                                    sitemap_lastmod_date)
                logging.debug("Last modified %s" % last_modified)

        if last_modified is not None:
            # Check against the database
            canonical_url = canonicalize_url(response.url)
            try:
                url_last_modified = UrlLastModified.objects.get(
                    url=canonical_url, scanner=self.get_scanner_object(spider))
                stored_last_modified = url_last_modified.last_modified
                logging.info("Comparing header %s against stored %s" %
                             (last_modified, stored_last_modified))
                if (stored_last_modified is not None
                        and last_modified == stored_last_modified):
                    return False
                else:
                    # Update last-modified date in database
                    url_last_modified.last_modified = last_modified
                    url_last_modified.save()
                    return True
            except UrlLastModified.DoesNotExist:
                logging.debug("No stored Last-Modified header found.")
                url_last_modified = UrlLastModified(
                    url=canonical_url,
                    last_modified=last_modified,
                    scanner=self.get_scanner_object(spider))
                logging.debug("Saving new last-modified value {}".format(
                    url_last_modified))
                url_last_modified.save()
                return True
        else:
            # If there is no Last-Modified header, we have to assume it has
            # been modified.
            logging.debug('No Last-Modified header found at all.')
            return True
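The header-to-datetime step used above, as a standalone sketch (assuming pytz is installed, as the middleware already requires): parsedate_tz/mktime_tz turn the RFC 2822 date from a Last-Modified header into an aware UTC datetime that can be compared against the stored value.

import datetime
from email.utils import mktime_tz, parsedate_tz

import pytz

header = b"Wed, 21 Oct 2015 07:28:00 GMT"
last_modified = datetime.datetime.fromtimestamp(
    mktime_tz(parsedate_tz(header.decode("utf-8"))), tz=pytz.utc)
print(last_modified)  # 2015-10-21 07:28:00+00:00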
Example #51
    def test_canonicalize_url(self):
        # simplest case
        self.assertEqual(canonicalize_url("http://www.example.com/"),
                         "http://www.example.com/")

        # always return a str
        assert isinstance(canonicalize_url(u"http://www.example.com"), str)

        # append missing path
        self.assertEqual(canonicalize_url("http://www.example.com"),
                         "http://www.example.com/")
        # typical usage
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?a=1&b=2&c=3"),
            "http://www.example.com/do?a=1&b=2&c=3")
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?c=1&b=2&a=3"),
            "http://www.example.com/do?a=3&b=2&c=1")
        self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"),
                         "http://www.example.com/do?a=1")

        # sorting by argument values
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
            "http://www.example.com/do?a=50&b=2&b=5&c=3")

        # using keep_blank_values
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?b=&a=2",
                             keep_blank_values=False),
            "http://www.example.com/do?a=2")
        self.assertEqual(canonicalize_url("http://www.example.com/do?b=&a=2"),
                         "http://www.example.com/do?a=2&b=")
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?b=&c&a=2",
                             keep_blank_values=False),
            "http://www.example.com/do?a=2")
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?b=&c&a=2"),
            "http://www.example.com/do?a=2&b=&c=")

        self.assertEqual(canonicalize_url(u'http://www.example.com/do?1750,4'),
                         'http://www.example.com/do?1750%2C4=')

        # spaces
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?q=a space&a=1"),
            "http://www.example.com/do?a=1&q=a+space")
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?q=a+space&a=1"),
            "http://www.example.com/do?a=1&q=a+space")
        self.assertEqual(
            canonicalize_url("http://www.example.com/do?q=a%20space&a=1"),
            "http://www.example.com/do?a=1&q=a+space")

        # normalize percent-encoding case (in paths)
        self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
                         "http://www.example.com/a%A3do"),
        # normalize percent-encoding case (in query arguments)
        self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
                         "http://www.example.com/do?k=b%A3")

        # non-ASCII percent-encoding in paths
        self.assertEqual(canonicalize_url("http://www.example.com/a do?a=1"),
                         "http://www.example.com/a%20do?a=1"),
        self.assertEqual(
            canonicalize_url("http://www.example.com/a %20do?a=1"),
            "http://www.example.com/a%20%20do?a=1"),
        self.assertEqual(
            canonicalize_url("http://www.example.com/a do\xc2\xa3.html?a=1"),
            "http://www.example.com/a%20do%C2%A3.html?a=1")
        # non-ASCII percent-encoding in query arguments
        self.assertEqual(
            canonicalize_url(
                u"http://www.example.com/do?price=\xa3500&a=5&z=3"),
            u"http://www.example.com/do?a=5&price=%C2%A3500&z=3")
        self.assertEqual(
            canonicalize_url(
                "http://www.example.com/do?price=\xc2\xa3500&a=5&z=3"),
            "http://www.example.com/do?a=5&price=%C2%A3500&z=3")
        self.assertEqual(
            canonicalize_url(
                "http://www.example.com/do?price(\xc2\xa3)=500&a=1"),
            "http://www.example.com/do?a=1&price%28%C2%A3%29=500")

        # urls containing auth and ports
        self.assertEqual(
            canonicalize_url(u"http://*****:*****@www.example.com:81/do?now=1"),
            u"http://*****:*****@www.example.com:81/do?now=1")

        # remove fragments
        self.assertEqual(
            canonicalize_url(u"http://*****:*****@www.example.com/do?a=1#frag"),
            u"http://*****:*****@www.example.com/do?a=1")
        self.assertEqual(
            canonicalize_url(u"http://*****:*****@www.example.com/do?a=1#frag",
                             keep_fragments=True),
            u"http://*****:*****@www.example.com/do?a=1#frag")

        # don't convert safe characters to their percent-encoded representation
        self.assertEqual(
            canonicalize_url(
                "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"
            ),
            "http://www.simplybedrooms.com/White-Bedroom-Furniture/Bedroom-Mirror:-Josephine-Cheval-Mirror.html"
        )

        # urllib.quote uses a mapping cache of encoded characters. when parsing
        # an already percent-encoded url, it will fail if that url was not
        # percent-encoded as utf-8, that's why canonicalize_url must always
        # convert the urls to string. the following test asserts that
        # functionality.
        self.assertEqual(
            canonicalize_url(u'http://www.example.com/caf%E9-con-leche.htm'),
            'http://www.example.com/caf%E9-con-leche.htm')

        # domains are case insensitive
        self.assertEqual(canonicalize_url("http://www.EXAMPLE.com/"),
                         "http://www.example.com/")

        # quoted slash and question sign
        self.assertEqual(
            canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
            "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
        self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
                         "http://foo.com/AC%2FDC/")
Example #52
    def parse_full(self, response):
        meta = response.meta.copy()
        meta['dont_redirect'] = True
        meta['dont_merge_cookies'] = True

        items_number = response.xpath(
            '//div[contains(@class, "pagination")]//span[contains(@class, "bold")]/text()'
        ).re(r'\d+')

        if items_number:
            # .re() returns strings, so compare the counts numerically
            if int(items_number[0]) > int(items_number[1]):
                return

        need_retry = False

        brands = response.xpath('//dl[@class="brandsList"]//a/@href').extract()
        for brand in brands:
            yield (Request(brand, callback=self.parse_full))

        cats = response.xpath(
            '//li[@data-selenium="category"]//@href').extract()
        if cats:
            for cat in cats:
                meta['try'] = 0
                yield Request(url=canonicalize_url(cat),
                              callback=self.parse_full,
                              meta=meta,
                              errback=lambda failure, url=canonicalize_url(
                                  cat), metadata=meta: self.bsm_retry_download(
                                      failure, url, metadata, self.parse_full))

        products = response.xpath(
            '//div[contains(@class, "item") and contains(@class, "clearfix")]')
        if products:
            for product in products:
                try:
                    brand = product.xpath(
                        './/span[@itemprop="brand"]/text()').extract()[0]
                except IndexError:
                    brand = ''
                try:
                    title = product.xpath(
                        './/span[@itemprop="name"]/text()').extract()[0]
                except IndexError:
                    continue
                name = ' '.join((brand, title))

                url = product.xpath('.//a[@itemprop="url"]/@href').extract()[0]

                price = ''.join(
                    product.xpath('.//*[contains(@class, "price")]/text()').
                    extract()).strip()

                identifier = product.xpath(
                    './/input[@name="sku"]/@value').extract()
                if identifier:
                    identifier = identifier[0]
                    id_part = product.xpath(
                        './/input[@name="is"]/@value').extract()
                    if id_part:
                        identifier = identifier + '-' + id_part[0]
                else:
                    self.log('No identifier found for %s on %s' %
                             (name, response.url))
                    continue

                if not price:
                    for data in response.xpath(
                            '//div/@data-itemdata').extract():
                        json_data = json.loads(data)
                        if json_data['sku'] in identifier.split('-'):
                            price = json_data['price']
                            break

                sku = product.xpath(
                    './/p[contains(@class, "skus")]//span[@class="sku"]/text()'
                ).extract()
                if sku:
                    sku = sku[-1]
                else:
                    sku = ''

                image_url = product.xpath(
                    'div/a[@name="image"]/img/@src').extract()
                if not image_url:
                    image_url = product.xpath(
                        'div[@class="img-zone zone"]//img/@data-src').extract(
                        )
                if not image_url:
                    image_url = product.xpath(
                        'div[@class="img-zone zone"]//img/@src').extract()
                if image_url:
                    image_url = response.urljoin(image_url[0])
                else:
                    image_url = ''
                category = response.xpath('//ul[@id="breadcrumbs"]/li/a/text()'
                                          ).extract()[-1].strip()
                if category.lower() == "home":
                    category = response.xpath(
                        '//ul[@id="breadcrumbs"]/li[@class="last"]/text()'
                    ).extract()[-1].strip()

                if identifier:
                    if not price:
                        price = '0.0'

                    loader = AxeMusicProductLoader(item=Product(),
                                                   selector=product)
                    loader.add_value('url', url)
                    loader.add_value('identifier', identifier)
                    loader.add_value('sku', sku)
                    loader.add_value('image_url', image_url)
                    if brand:
                        loader.add_value('brand', brand)
                    loader.add_value('category', category)
                    loader.add_value('name', name)
                    loader.add_value('price', price)

                    if url not in self.product_pages and loader.get_output_value(
                            'price') > 0:
                        item = loader.load_item()
                        if item['identifier'].endswith('-REG'):
                            item['identifier'] = item['identifier'].replace(
                                '-REG', '')
                        yield item
                    self.product_pages.add(url)
        elif not cats:
            need_retry = True

        pages = response.xpath(
            '//div[contains(@class, "pagination-zone")]//a/@href').extract()
        for page_url in pages:
            meta['try'] = 0
            yield Request(callback=self.parse_full,
                          url=canonicalize_url(page_url),
                          meta=meta)

        if need_retry:
            retry = response.meta.get('try', 0)
            if retry < 15:
                meta = response.meta.copy()
                meta['try'] = retry + 1
                self.log("Try %d. retrying to download %s" %
                         (meta['try'], response.url))
                yield Request(url=response.url,
                              callback=self.parse_full,
                              dont_filter=True,
                              meta=meta)
Example #53
def hash_url(url):
    url = canonicalize_url(url)
    sha1 = hashlib.sha1()
    sha1.update(url)
    return sha1.hexdigest()
Example #54
 def test_canonicalize_url_unicode_path(self):
     self.assertEqual(canonicalize_url(u"http://www.example.com/résumé"),
                      "http://www.example.com/r%C3%A9sum%C3%A9")
Example #55
 def test_quoted_slash_and_question_sign(self):
     self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
                      "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")
     self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC/"),
                      "http://foo.com/AC%2FDC/")
Example #56
 def test_normalize_percent_encoding_in_paths(self):
     self.assertEqual(canonicalize_url("http://www.example.com/a%a3do"),
                                       "http://www.example.com/a%A3do"),
Example #57
 def test_return_str(self):
     assert isinstance(canonicalize_url(u"http://www.example.com"), str)
     assert isinstance(canonicalize_url(b"http://www.example.com"), str)
Example #58
 def test_normalize_percent_encoding_in_query_arguments(self):
     self.assertEqual(canonicalize_url("http://www.example.com/do?k=b%a3"),
                                       "http://www.example.com/do?k=b%A3")
Example #59
 def __call__(self, requests):
     """Canonicalize all requests' urls"""
     return (x.replace(url=canonicalize_url(x.url)) for x in requests)
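The callable above rewrites every request URL via Request.replace(url=...). A self-contained sketch of the same idea, using a hypothetical FakeRequest stand-in instead of a real Scrapy Request so the snippet runs on its own:

from dataclasses import dataclass

from w3lib.url import canonicalize_url

@dataclass(frozen=True)
class FakeRequest:
    """Hypothetical stand-in for scrapy.Request supporting replace(url=...)."""
    url: str

    def replace(self, url):
        return FakeRequest(url=url)

def canonicalize_requests(requests):
    """Canonicalize all requests' urls."""
    return (x.replace(url=canonicalize_url(x.url)) for x in requests)

reqs = [FakeRequest("http://www.example.com/do?b=2&a=1")]
print(next(canonicalize_requests(reqs)).url)  # http://www.example.com/do?a=1&b=2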
Example #60
    def parse(self, response):
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        meta = response.meta.copy()

        cats = hxs.select('//*[@id="tContent"]/div/div/div[@class="column"]'
                          '/ul/li/a/@href').extract()

        pages = hxs.select(
            '//div[contains(@class, "pagination-zone")]//a/@href').extract()
        for page_url in pages:
            yield Request(callback=self.parse,
                          url=canonicalize_url(page_url),
                          errback=lambda failure, url=canonicalize_url(
                              page_url), metadata=meta: self.retry_download(
                                  failure, url, metadata, self.parse))

        products = hxs.select(
            '//div[contains(@class, "item") and contains(@class, "clearfix")]')
        if products:
            for product in products:
                try:
                    brand = product.select(
                        './/span[@itemprop="brand"]/text()').extract()[0]
                except IndexError:
                    brand = ''
                title = product.select(
                    './/span[@itemprop="name"]/text()').extract()[0]
                name = ' '.join((brand, title))

                url = product.select(
                    './/a[@itemprop="url"]/@href').extract()[0]

                identifier = product.select(
                    './/input[@name="sku"]/@value').extract().pop()

                price = 0
                for data in hxs.select('//div/@data-itemdata').extract():
                    json_data = json.loads(data)
                    if json_data['sku'] == identifier:
                        price = json_data['price']
                        break

                if not price:
                    price = product.select(
                        './/div[@class="price-zone"]/div[@class="atc-price"]'
                        '//strong[contains(@class, "price")]/text()').extract(
                        )

                try:
                    sku = product.select(
                        './/p[contains(@data-selenium, "skus")]//span[@class="sku"]/text()'
                    ).extract()[-1]
                except:
                    sku = ''
                image_url = product.select(
                    './/a[@class="itemImg"]/img/@data-src').extract(
                    ) or product.select(
                        './/a[@class="itemImg"]/img/@src').extract()
                if image_url:
                    image_url = urljoin_rfc(base_url, image_url[0])
                else:
                    image_url = ''

                category = hxs.select('//ul[@id="breadcrumbs"]/li/a/text()'
                                      ).extract()[-1].strip()
                if category.lower() == "home":
                    category = hxs.select(
                        '//ul[@id="breadcrumbs"]/li[@class="last"]/text()'
                    ).extract()[-1].strip()

                bushnell_product = self.bushnell_products.get(
                    sku.upper().strip(), None)
                if bushnell_product:
                    category = bushnell_product['Class']
                    log.msg(
                        'Extracts category "%s" from bushnell file, URL: %s' %
                        (category, response.url))

                if url not in self.urls_list:
                    if price:
                        self.urls_list.append(url)
                        loader = ProductLoader(item=Product(),
                                               selector=product)
                        loader.add_value('url', url)
                        loader.add_value('identifier', identifier)
                        loader.add_value('sku', sku)
                        loader.add_value('image_url', image_url)
                        loader.add_value('brand', brand)
                        loader.add_value('category', category)
                        loader.add_value('name', name)
                        loader.add_value('price', price)
                        product = loader.load_item()
                        yield self._get_reviews_url(product)
                    else:
                        # parse product page if price not found
                        meta = {
                            'name': name,
                            'brand': brand,
                            'category': category,
                            'identifier': identifier,
                            'image_url': image_url,
                            'sku': sku
                        }
                        yield Request(
                            url=url,
                            callback=self.parse_product,
                            meta=meta,
                            errback=lambda failure, url=url,
                            metadata=meta: self.retry_download(
                                failure, url, metadata, self.parse_product))
        elif not cats:
            retry = response.meta.get('try', 0)
            if retry < 15:
                meta = response.meta.copy()
                meta['try'] = retry + 1
                yield Request(
                    url=response.url,
                    dont_filter=True,
                    callback=self.parse,
                    errback=lambda failure, url=response.url, metadata=meta:
                    self.retry_download(failure, url, metadata, self.parse))