Code example #1
File: test_utils_url.py Project: 447327642/scrapy
 def test_canonicalize_parse_url(self):
     # parse_url() wraps urlparse and is used in link extractors
     self.assertEqual(canonicalize_url(parse_url(u"http://www.example.com/résumé?q=résumé")),
                                       "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
     self.assertEqual(canonicalize_url(parse_url('http://www.example.com/caf%e9-con-leche.htm')),
                                       'http://www.example.com/caf%E9-con-leche.htm')
     self.assertEqual(canonicalize_url(parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
                                       "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
Code example #2
 def test_canonicalize_parse_url(self):
     # parse_url() wraps urlparse and is used in link extractors
     self.assertEqual(
         canonicalize_url(
             parse_url(u"http://www.example.com/résumé?q=résumé")),
         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
     self.assertEqual(
         canonicalize_url(
             parse_url('http://www.example.com/caf%e9-con-leche.htm')),
         'http://www.example.com/caf%E9-con-leche.htm')
     self.assertEqual(
         canonicalize_url(
             parse_url("http://www.example.com/a%a3do?q=r%c3%a9sum%c3%a9")),
         "http://www.example.com/a%A3do?q=r%C3%A9sum%C3%A9")
Code example #3
File: archip.py Project: miemiekurisu/icscraw
 def parse_prod_list(self, response):
     print 'parse_prod_list'
     urlstr = "%s://%s%s"
     parsedurl = surl.parse_url(get_base_url(response))
     for grid in response.css('.clearfix .grid_link'):
         item = ArchItem()
         item['path_url'] = []
         item['path_name'] = []
         item['main_bread'] = []
         item['path_url'].extend(response.meta['item']['path_url'])
         item['path_name'].append(response.meta['item']['path_name'])
         item['main_bread'].append(response.meta['item']['main_bread'])
         prdlnk = grid.xpath('a/@href').extract()[0]
         realurl = urlstr % (parsedurl.scheme, parsedurl.netloc, prdlnk)
         item['path_url'].append(realurl)
         item['path_name'].append(
             [grid.css('.name-product').xpath('text()').extract()])
         item['main_bread'].append(
             response.xpath('//*[@itemprop="title"]/text()').extract())
         print "=======================PD:", item['path_url']
         yield scrapy.Request(item['path_url'][-1],
                              meta={'item': item},
                              callback=self.parse_content)
         #return item
     nxtpg = response.css('.btn-pag-right').xpath('@href').extract()
     if len(nxtpg) > 0:
         nextpage = urlstr % (parsedurl.scheme, parsedurl.netloc, nxtpg[0])
         yield scrapy.Request(nextpage,
                              meta={'item': response.meta['item']},
                              callback=self.parse_prod_list)
Code example #4
File: utils.py Project: usakey/Any
def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
        encoding=None):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent encode paths and query arguments. non-ASCII characters are
      percent-encoded using UTF-8 (RFC-3986)
    - normalize all spaces in query arguments to '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless keep_blank_values is True)
    - remove duplicate query arguments
    - remove fragments (unless keep_fragments is True)

    The url passed can be a str or unicode, while the url returned is always a
    str.

    This builds on scrapy.utils.url.canonicalize_url to remove duplicate arguments.
    """

    scheme, netloc, path, params, query, fragment = parse_url(url)
    keyvals = urlparse.parse_qsl(query, keep_blank_values)
    keyvals = list(set(keyvals))
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    path = safe_url_string(_unquotepath(path)) or '/'
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
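A hypothetical call to the wrapper above, illustrating the duplicate-argument removal it adds on top of Scrapy's canonicalize_url (Python 2, to match the urlparse/urllib calls in the body):

url = "http://Example.com/a?b=2&a=1&b=2#frag"
print(canonicalize_url(url))
# -> "http://example.com/a?a=1&b=2"
#    duplicate pairs collapsed, keys sorted, host lowercased, fragment dropped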
Code example #5
File: shop.py Project: lygntx/scrapyc
    def parse_selloffer_jsonp(self,response):

        fixedcontent = self._regex.sub(r"\\\\", response.body)
        rep = json.loads(fixedcontent[len(self.jsonp_callback)+1:-1].decode("GBK"))
        if rep["hasError"] == True:
            self.log("[pase_jsonp] Error:%s %s"%(rep["message"],response.url),level=scrapy.log.ERROR)
            return
        content = rep["content"]["offerResult"]["html"]
        tree=lxml.html.fromstring(content)
        #pdb.set_trace()
        #parse shop
        for href in tree.xpath('//li[@class="sm-offerShopwindow"]//a[@class="sm-previewCompany sw-mod-previewCompanyInfo"]/@href'):
            if not href.startswith("http://"):
                continue 
            shop_url = href+"/"
            self.log('[pase_jsonp] found shop %s from %s'%(shop_url,response.url),level=scrapy.log.INFO)
            yield ShopItem(url=shop_url,insert_time=str(datetime.datetime.now()))

        #nextpage
        scheme, netloc, path, params, query, fragment = parse_url(response.url)
        qs = parse_query(query)
        try:
            totalPage = int(qs.get('totalPage'))
            beginPage = int(qs.get('beginPage'))
        except Exception, e:
            self.log("[pase_jsonp] %s"%e,level=scrapy.log.ERROR)
            return
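The slicing above strips the JSONP wrapper, i.e. the callback name plus the surrounding parentheses, before the body is parsed as JSON; a minimal sketch with a hypothetical callback name and an ASCII payload (the spider itself decodes GBK):

import json

jsonp_callback = "cb"
body = 'cb({"hasError": false, "content": {"offerResult": {"html": "<ul></ul>"}}})'
rep = json.loads(body[len(jsonp_callback) + 1:-1])  # drop the 'cb(' prefix and the ')' suffix
print(rep["hasError"])                              # False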
Code example #6
File: shop.py Project: lygntx/scrapyc
    def parse_selloffer(self,response):
        ''' parse like this :http://s.1688.com/selloffer/offer_search.htm?.....'''
        #parse category
        for href in response.xpath('//a/@href').extract():
            if not href.startswith("http://s.1688.com/selloffer/offer_search.htm?"):
                continue 
            yield scrapy.Request(href)

        #parse shop
        for item in self._get_shop_byxpath(response,'//li[@class="sm-offerShopwindow"]//a[@class="sm-previewCompany sw-mod-previewCompanyInfo"]/@href'):
            yield item
        
        #next page    
        scheme, netloc, path, params, query, fragment = parse_url(response.url)
        qs = parse_query(query)
        keywords = qs.get('keywords')
        if not keywords:
            return
        totalPage= response.xpath('//li[@id="breadCrumbText"]/em/text()').extract()
        if len(totalPage ) == 1:
            totalPage = int(totalPage[0])/60 +1
        else:
            totalPage = 50

        
        jsonrpc_url='http://s.1688.com/selloffer/rpc_offer_search.jsonp?descendOrder=true&onlineStatus=yes&isOnlyAlipay=true&sortType=booked&uniqfield=userid&n=y&filt=y&from=marketSearch&async=true&asyncCount=60&startIndex=0&qrwRedirectEnabled=false&offset=0&isWideScreen=false&controls=_template_%3Aofferresult%2Cshopwindow%2CshopwindowOfferResult.vm%7C_moduleConfig_%3AshopwindowResultConfig%7CpageSize%3A60%7C_name_%3AofferResult%7Coffset%3A0&token=959352873'+'&callback=%(callback)s&beginPage=%(beginPage)d&totalPage=%(totalPage)d&keywords=%(keywords)s'%{"callback":self.jsonp_callback,"beginPage":1,"keywords":keywords,"totalPage":totalPage}

        
        yield scrapy.Request(jsonrpc_url)
Code example #7
File: shop.py Project: lygntx/scrapyc
    def parse_caigou(self,response):
        ''' parse like this :http://s.1688.com/caigou/offer_search.htm?.....'''
        #parse category
        for href in response.xpath('//a/@href').extract():
            if not href.startswith("http://s.1688.com/caigou/offer_search.htm?"):
                continue 
            #yield IndexItem(url=href,insert_time=str(datetime.datetime.now()))
            #yield scrapy.Request(href)

        for item in self._get_shop_byxpath(response,'//li[@class="sm-offerItem"]/div[@class="sm-offerItem-alitalk"]//a[2]/@href'):
            yield item

        #next page    
        scheme, netloc, path, params, query, fragment = parse_url(response.url)
        qs = parse_query(query)
        keywords = qs.get('keywords')
        if not keywords:
            return
        totalPage= response.xpath('//*[@id="content"]/div[1]/div[1]/div[1]/span/em/text()').extract()
        if len(totalPage ) == 1:
            totalPage = int(totalPage[0])/60 +1
        else:
            totalPage = 10

        
        jsonrpc_url='http://s.1688.com/caigou/rpc_offer_search.jsonp?n=y&async=true&asyncCount=60&startIndex=0&qrwRedirectEnabled=false&offset=0&isWideScreen=false&controls=_template_%3Aofferresult%2CjicaiOfferResult.vm%7C_moduleConfig_%3AshopwindowResultConfig%7C_name_%3AofferResult&token=237250634'+'&callback=%(callback)s&beginPage=%(beginPage)d&totalPage=%(totalPage)d&keywords=%(keywords)s'%{"callback":self.jsonp_callback,"beginPage":1,"keywords":keywords,"totalPage":totalPage}

        
        yield scrapy.Request(jsonrpc_url)
Code example #8
File: one.py Project: yupengyan/scrapyc
    def parse(self, response):
        self.log("Crawled (%d) <GET %s>" % (response.status, response.url),
                 level=scrapy.log.INFO)
        if response.status != 200:
            yield response.request
            return
        if not isinstance(response, scrapy.http.HtmlResponse):
            return
        depth = response.meta.get("depth", 1)
        for href in response.xpath("//a/@href").extract():
            href = href.strip()

            if href.startswith("javascript:") or href.startswith(
                    "rtsp:") or href.startswith("ftp:"):
                continue
            scheme, netloc, path, params, query, fragment = parse_url(href)
            if path:
                suffix = path.split('.')[-1]
                # suffix never includes the dot, so list entries must not either
                if suffix in [
                        "png", "jpg", "gif", "rar", "zip", "mp3", "pdf",
                        "doc", "txt", "docx", "swf", "mp4"
                ]:
                    continue
            abs_url = urljoin_rfc(response.url, href)
            yield UrlItem(url=abs_url, fromurl=response.url)
            if depth < 1:
                depth += 1
                yield scrapy.Request(abs_url, meta={"depth": depth})
Code example #9
File: shop.py Project: lygntx/scrapyc
    def parse_index(self,response):
 
        #parse category
        for href in response.xpath('//*[@id="hotwordpanel"]//li/@data-url').extract()+response.xpath('//*[@id="hotwordpanel"]//a/@href').extract():
            if not href.startswith("http://"):
                continue 
            scheme, netloc, path, params, query, fragment = parse_url(href)
            if netloc.startswith("shop") or path.endswith("creditdetail.htm"):
                shop_url = "%s://%s/"%(scheme,netloc)
                yield ShopItem(url=shop_url,insert_time=str(datetime.datetime.now()))
                self.log('[parse_index] found shop %s'%(shop_url),level=scrapy.log.INFO)
            elif netloc == "detail.1688.com":
                yield GoodsItem(url=href,insert_time=str(datetime.datetime.now()))
            elif netloc == "go.1688.com" and 'supplier' in path:
                yield IndexItem(url=href,insert_time=str(datetime.datetime.now()))
                yield scrapy.Request(href)


        #parse shop
        for href in response.xpath('//*[@id="listbody"]/div[@class="supplier-list"]/div[@class="supplier"]/div[@class="title p-margin"]//a/@href').extract():
            if not href.startswith("http://"):
                continue 
            scheme, netloc, path, params, query, fragment = parse_url(href)
            shop_url = "%s://%s/"%(scheme,netloc)
            self.log('[parse_index] found shop %s from %s'%(shop_url,response.url),level=scrapy.log.INFO)
            yield ShopItem(url=shop_url,insert_time=str(datetime.datetime.now()))


        
        #next page    
        scheme, netloc, path, params, query, fragment = parse_url(response.url)
        qs = parse_query(query)
        pageStart = int(qs.get('pageStart',1))
        pageCount = int(qs.get('pageCount',0))
        if not pageCount:
            pageCount = response.xpath('//*[@id="pageCount"]/@value').extract()
            if len(pageCount) > 0:
                try:
                    pageCount = int(pageCount[0])
                except Exception, e:
                    pageCount = 1000
Code example #10
 def start_requests(self):
     with open('urls_to_scrape_full_urls.csv', 'r') as read_file:
         for u in read_file.readlines():
             host = parse_url(u.strip()).netloc.lower()
             if not host:
                 continue
             self.allowed_domains.append(host)
             self.extractor.allow_domains.add(host)
             yield scrapy.Request(u.strip(),
                                  callback=self.parse,
                                  errback=self.errback_httpbin,
                                  dont_filter=True)
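A minimal illustration of the netloc extraction used above, assuming parse_url is the Scrapy helper tested in code example #1 (a cached urlparse); a line that is not a URL yields an empty netloc and is skipped by the `if not host` check:

from scrapy.utils.url import parse_url

for line in ["https://Shop.Example.COM/catalog\n", "not-a-url\n"]:
    host = parse_url(line.strip()).netloc.lower()
    print(repr(host))  # 'shop.example.com', then ''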
Code example #11
def my_canonicalize_url(url):
    url = canonicalize_url(url)
    scheme, netloc, path, params, query, fragment = parse_url(url)
    # www.hmrc.gov.uk and hmrc.gov.uk are the same
    if netloc == 'hmrc.gov.uk':
        netloc = 'www.hmrc.gov.uk'
    # Fix manuals links with multiple slashes
    path = re.sub(r'^/+', '/', path)
    # Fix customs.hmrc with session tokens in path (!?)
    if params and path.endswith('.portal'):
        params = ''
    url = urlparse.urlunparse((scheme, netloc, path, params, query, fragment))
    return url
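Two hypothetical inputs for the helper above; both normalize to the same form because the bare host is rewritten to www.hmrc.gov.uk and repeated leading slashes in the path are collapsed:

print(my_canonicalize_url('http://hmrc.gov.uk//manuals/example'))
print(my_canonicalize_url('http://www.hmrc.gov.uk/manuals/example'))
# both -> 'http://www.hmrc.gov.uk/manuals/example'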
Code example #12
File: archip.py Project: miemiekurisu/icscraw
    def parse(self, response):
        urlstr = "%s://%s%s"
        print 'start'
        parsedurl = surl.parse_url(get_base_url(response))
        # col-categories-selected
        for catsel in response.css('.col-categories-selected'):
            item = ArchItem()
            pathurl = []
            pathurl.append(urlstr % (parsedurl.scheme, parsedurl.netloc, catsel.xpath('a/@href').extract()[0]))
            item['path_url'] = pathurl
            item['path_name'] = catsel.xpath('a/span/text()').extract()
            item['main_bread'] = [catsel.xpath('//*[@itemprop="title"]/text()').extract()]
            print "!!!!!!!!!!!!!", item['path_url']
            yield scrapy.Request(item['path_url'][-1], meta={'item': item}, callback=self.pg2parse)
Code example #13
 def handle_list(self, response: Response) -> Request:
     torrents = response.css(
         'td.desc-top a[type="application/x-bittorrent"]::attr(href)'
     ).extract()
     if len(torrents) < 1:
         return
     for torrent in torrents:
         url = parse_url(torrent)
         if url.netloc == 'www.nyaa.se':
             continue
         request = DownloadRequest(url=torrent,
                                   callback=self.handle_item,
                                   dont_filter=True)
         request.meta['from_url'] = response.url
         yield request
Code example #14
    def init_allowed_domains(self):
        if (not hasattr(self, 'allowed_domains')) or (hasattr(
                self, 'allowed_domains') and (not self.allowed_domains)):
            self.allowed_domains = []
            dict_locations = dict(self.available_locations)
            for url in self.start_urls:
                domain = parse_url(
                    dict_locations[self.location]).netloc.replace('www.', '')
                if domain not in self.allowed_domains:
                    self.allowed_domains.append(domain)
        elif hasattr(self, 'allowed_domains') and not isinstance(
                self.allowed_domains, list):
            self.allowed_domains = list(self.allowed_domains)

        if 'bazaarvoice.com' not in self.allowed_domains:
            self.allowed_domains.append('bazaarvoice.com')
Code example #15
File: archip.py Project: miemiekurisu/icscraw
    def pg2parse(self, response):
        print 'pg2parse'
        parsedurl = surl.parse_url(get_base_url(response))
        urlstr = "%s://%s%s"
        for subcat in response.css('.cont-sub-content').xpath('a'):
            item = ArchItem()
            item['path_url'] = []
            item['path_name'] = []
            item['main_bread'] = []
            item['path_url'].extend(response.meta['item']['path_url'])
            item['path_name'].append(response.meta['item']['path_name'])
            item['main_bread'].append(response.meta['item']['main_bread'])
            realurl = urlstr % (parsedurl.scheme, parsedurl.netloc, subcat.xpath('@href').extract()[0])
            item['path_url'].append(realurl)
            item['path_name'].append(subcat.xpath('*/h2[@itemprop="name"]/text()').extract()[0])
            item['main_bread'].append(response.xpath('//*[@itemprop="title"]/text()').extract())
            print "!!!!!!!!!!!!!==========", item['path_url']
            yield scrapy.Request(item['path_url'][-1], meta={'item': item}, callback=self.parse_prod_list)
Code example #16
File: shop.py Project: lygntx/scrapyc
    def parse_jinpai(self, response):

        div = response.xpath('//*[@id="box_doc"]/div[1]/div/div[1]')
        for href in div.xpath("//a/@href").extract():
            if not href.startswith("http://"):
                continue
            scheme, netloc, path, params, query, fragment = parse_url(href)

            if netloc.startswith("shop") or path.endswith("creditdetail.htm"):
                yield ShopItem(url="%s://%s/"%(scheme,netloc),insert_time=str(datetime.datetime.now()))
            elif netloc == "detail.1688.com":
                yield GoodsItem(url=href,insert_time=str(datetime.datetime.now()))
            elif netloc == "go.1688.com" and 'supplier' in path:
                yield IndexItem(url=href,insert_time=str(datetime.datetime.now()))
                yield scrapy.Request(href)


        pass
Code example #17
File: archip.py Project: miemiekurisu/icscraw
 def parse(self, response):
     urlstr = "%s://%s%s"
     print 'start'
     parsedurl = surl.parse_url(get_base_url(response))
     #col-categories-selected
     for catsel in response.css('.col-categories-selected'):
         item = ArchItem()
         pathurl = []
         pathurl.append(urlstr % (parsedurl.scheme, parsedurl.netloc,
                                  catsel.xpath('a/@href').extract()[0]))
         item['path_url'] = pathurl
         item['path_name'] = catsel.xpath('a/span/text()').extract()
         item['main_bread'] = [
             catsel.xpath('//*[@itemprop="title"]/text()').extract()
         ]
         print "!!!!!!!!!!!!!", item['path_url']
         yield scrapy.Request(item['path_url'][-1],
                              meta={'item': item},
                              callback=self.pg2parse)
Code example #18
    def parse(self, response):
        hidden = lambda id: response.xpath(
            '/html/body/input[@id="{}"]/@data-value'.format(id)).extract_first(
            )

        total_pages = int(hidden('quantidadeTotalPaginas').replace('.', ''))

        hashfragment = OrderedDict([
            ('pagina', None),
            ('semente', self.seed or hidden('semente')),
        ])

        formdata = OrderedDict([
            ('tipoOferta', '1'),
            ('paginaAtual', None),
            ('pathName', parse_url(response.url).path),
            ('hashFragment', ''),
        ])

        headers = {'X-Requested-With': 'XMLHttpRequest'}
        url = 'https://www.zapimoveis.com.br/Busca/RetornarBuscaAssincrona/'

        from_page = self.start
        if self.count:
            to_page = min(self.start + self.count - 1, total_pages)
        else:
            to_page = total_pages

        self.crawler.stats.set_value('total_pages', total_pages)
        self.crawler.stats.set_value('selected_pages',
                                     max(0, to_page - from_page + 1))

        for page in range(from_page, to_page + 1):
            hashfragment['pagina'] = formdata['paginaAtual'] = str(page)
            formdata['hashFragment'] = json.dumps(hashfragment,
                                                  separators=(',', ':'))
            yield FormRequest(url,
                              headers=headers,
                              formdata=formdata,
                              callback=self.parse_busca)
Code example #19
File: one.py Project: lygntx/scrapyc
    def parse(self, response):
        self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status != 200 :
            yield response.request 
            return
        if not isinstance(response,scrapy.http.HtmlResponse):
            return
        depth = response.meta.get("depth",1) 
        for href in response.xpath("//a/@href").extract():
            href = href.strip()

            if href.startswith("javascript:")  or href.startswith("rtsp:") or  href.startswith("ftp:"):
                continue
            scheme, netloc, path, params, query, fragment = parse_url(href)
            if path:
                suffix = path.split('.')[-1]
                # suffix never includes the dot, so list entries must not either
                if suffix in ["png", "jpg", "gif", "rar", "zip", "mp3", "pdf", "doc", "txt", "docx", "swf", "mp4"]:
                    continue
            abs_url =urljoin_rfc(response.url,href)
            yield UrlItem(url=abs_url,fromurl=response.url)
            if depth < 10:
                depth += 1
                yield scrapy.Request(abs_url,meta={"depth":depth})
Code example #20
File: archip.py Project: miemiekurisu/icscraw
 def pg2parse(self, response):
     print 'pg2parse'
     parsedurl = surl.parse_url(get_base_url(response))
     urlstr = "%s://%s%s"
     for subcat in response.css('.cont-sub-content').xpath('a'):
         item = ArchItem()
         item['path_url'] = []
         item['path_name'] = []
         item['main_bread'] = []
         item['path_url'].extend(response.meta['item']['path_url'])
         item['path_name'].append(response.meta['item']['path_name'])
         item['main_bread'].append(response.meta['item']['main_bread'])
         realurl = urlstr % (parsedurl.scheme, parsedurl.netloc,
                             subcat.xpath('@href').extract()[0])
         item['path_url'].append(realurl)
         item['path_name'].append(
             subcat.xpath('*/h2[@itemprop="name"]/text()').extract()[0])
         item['main_bread'].append(
             response.xpath('//*[@itemprop="title"]/text()').extract())
         print "!!!!!!!!!!!!!==========", item['path_url']
         yield scrapy.Request(item['path_url'][-1],
                              meta={'item': item},
                              callback=self.parse_prod_list)
Code example #21
File: archip.py Project: miemiekurisu/icscraw
 def parse_prod_list(self, response):
     print 'parse_prod_list'
     urlstr = "%s://%s%s"
     parsedurl = surl.parse_url(get_base_url(response))
     for grid in response.css('.clearfix .grid_link'):
         item = ArchItem()
         item['path_url'] = []
         item['path_name'] = []
         item['main_bread'] = []
         item['path_url'].extend(response.meta['item']['path_url'])
         item['path_name'].append(response.meta['item']['path_name'])
         item['main_bread'].append(response.meta['item']['main_bread'])
         prdlnk = grid.xpath('a/@href').extract()[0]
         realurl = urlstr % (parsedurl.scheme, parsedurl.netloc, prdlnk)
         item['path_url'].append(realurl)
         item['path_name'].append([grid.css('.name-product').xpath('text()').extract()])
         item['main_bread'].append(response.xpath('//*[@itemprop="title"]/text()').extract())
         print "=======================PD:", item['path_url']
         yield scrapy.Request(item['path_url'][-1], meta={'item': item}, callback=self.parse_content)
         #return item
     nxtpg = response.css('.btn-pag-right').xpath('@href').extract()
     if len(nxtpg) > 0:
         nextpage = urlstr % (parsedurl.scheme, parsedurl.netloc, nxtpg[0])
         yield scrapy.Request(nextpage, meta={'item': response.meta['item']}, callback=self.parse_prod_list)
Code example #22
File: distrelec_it.py Project: oceancloud82/scraping
    def parse_products_list(self, response):
        hxs = HtmlXPathSelector(response)

        url_base_parts = parse_url(response.url)
        base_url = '://'.join([url_base_parts.scheme, url_base_parts.netloc])

        categories = hxs.select("//div[@class='catNav']//a/@href").extract()

        for category_url in categories:
            url = urljoin(get_base_url(response), category_url)

            r = Request(url, callback=self.parse_products_list)
            yield r

        search_form = hxs.select("//form[@name='luceneSearchForm'][@method='post']")
        if search_form:
            form_url = search_form.select("@action").extract()[0]
            form_url = urljoin(get_base_url(response), form_url)
            formdata = self._get_formdata(search_form)
            for page_number in set(hxs.select("//table[@class='searchresultsArticlesNavigation']//td[contains(., 'Pagina')]//a/@onclick").re(r'submitPageNumber\((.*)\)')):
                data = formdata.copy()
                data['pageNumber'] = page_number
                data['urlToCall'] = ''
                if 'onlyOnStock' in data:
                    del(data['onlyOnStock'])
                if 'onlyPriceReduced' in data:
                    del(data['onlyPriceReduced'])
                if 'pageSize' in data:
                    del(data['pageSize'])
                if 'trackArtNr' in data:
                    del(data['trackArtNr'])
                if 'trackPosition' in data:
                    del(data['trackPosition'])
                if 'trackProductTitle' in data:
                    del(data['trackProductTitle'])
                r = FormRequest(form_url, formdata=data, callback=self.parse_products_list)
                yield r

        category = hxs.select("//table[@class='LinksKatalogNav']//td[1]/a/h1/text()").extract()
        products = hxs.select("//td[@id='productListCell']/table/tr[td[contains(@class, 'document')]]")
        for p in products:
            name = p.select(".//input[contains(@name, '.shortDescription')]/@value").extract()[0]
            url = p.select(".//input[contains(@name, '.uriPath')]/@value").extract()[0]
            url = urljoin(base_url, url)
            brand = p.select(".//input[contains(@name, '.vendor')]/@value").extract()[0]
            price = p.select(".//input[contains(@name, '.price')]/@value").extract()[0]
            image_url = urljoin(get_base_url(response), p.select(".//input[contains(@name, '.img')]/@value").extract()[0])
            sku = p.select(".//input[contains(@name, '.type')]/@value").extract()[0]
            identifier = p.select(".//input[contains(@name, '.artNr')]/@value").extract()[0]
            stock = p.select(".//input[contains(@name, '.stockValue')]/@value").extract()[0]
            if not stock:
                stock = 0

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', url)
            loader.add_value('category', category)
            loader.add_value('brand', brand)
            loader.add_value('image_url', image_url)
            loader.add_value('sku', sku)
            loader.add_value('identifier', identifier)
            loader.add_value('stock', stock)
            loader.add_value('price', price)

            price = extract_price(price)

            if price < Decimal(100.00):
                loader.add_value('shipping_cost', '10.00')

            yield loader.load_item()
Code example #23
File: url_cleaning.py Project: yuelinsoft/Crawler
def url_has_any_extension(url, extensions):
    return posixpath.splitext(parse_url(url).path)[1].lower()[1:] in extensions
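A hypothetical call to the helper above; the extension of the URL path is lower-cased and compared without its leading dot, so the extensions argument should contain dot-free values:

MEDIA_EXTENSIONS = {'jpg', 'png', 'pdf'}

print(url_has_any_extension('http://example.com/files/Report.PDF?dl=1', MEDIA_EXTENSIONS))  # True
print(url_has_any_extension('http://example.com/files/report', MEDIA_EXTENSIONS))           # False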