Beispiel #1
0
 def test_url_query_cleaner(self):
     """url_query_cleaner keeps only the whitelisted query parameters."""
     query_url = "product.html?id=200&foo=bar&name=wired"
     self.assertEqual('product.html?id=200',
                      url_query_cleaner(query_url, 'id'))
     self.assertEqual('product.html?id=200&name=wired',
                      url_query_cleaner(query_url, ['id', 'name']))
     # A fragment after the query string is dropped as well.
     self.assertEqual('product.html?id=200&foo=bar',
                      url_query_cleaner(query_url + "#id20", ['id', 'foo']))
Beispiel #2
0
    def parse_products(self, response):
        """Yield one Product item per listing on a category page.

        The dealer name has no loader field, so it is attached to the
        built item as ``metadata`` after loading.
        """
        # First breadcrumb entry is the site root; skip it.
        category = response.css('.breadcrumbs').xpath(
            './/a/text()').extract()[1:]
        products = response.css('.listing_item')
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            image_url = product.css('.listing_item_image').xpath(
                'img/@src').extract_first()
            # extract_first() may return None; only keep real images
            # (fixes a TypeError and the `not x in y` idiom).
            if image_url and 'noimage' not in image_url:
                loader.add_value('image_url', image_url)
            url = product.css('.listing_item_name').xpath(
                '@href').extract_first()
            url = url_query_cleaner(response.urljoin(url))
            # Last path segment doubles as identifier and SKU.
            sku = url.split('/')[-1]
            loader.add_value('identifier', sku)
            loader.add_value('sku', sku)

            loader.add_value('url', url)
            loader.add_xpath('name', './/a[@class="listing_item_name"]/text()')
            loader.add_xpath(
                'price', './/span[@class="listing_item_basic_price"]/text()')
            loader.add_value('category', category)
            shipping_cost = product.css('.listing_item_delivery_costs').xpath(
                'text()').extract_first()
            loader.add_value('shipping_cost', extract_price_eu(shipping_cost))
            # Guard against a missing availability node (extract_first -> None).
            availability = product.css(
                '.listing_item_availability').xpath(
                    'text()').extract_first() or ''
            if 'Non disponibile' in availability:
                loader.add_value('stock', 0)
            item = loader.load_item()
            dealer = product.css('.listing_item_merchant_name').xpath(
                'img/@alt').extract_first()
            item['metadata'] = {'Dealer': dealer}
            yield item
Beispiel #3
0
 def parse_job_detail(self, response):
     """Parse a job-detail page and yield a populated JobItem.

     Only yields when both a title and a company name are found.
     """
     hxs = HtmlXPathSelector(response)

     title = hxs.select("//span[@class='header']/text()").extract_unquoted()
     company = hxs.select("//span[@class='fontblacklarge']/b/text()").extract_unquoted()

     if title and company:
         city = hxs.select("//span[@class='fontblacklarge']/b[2]/text()").extract_unquoted()
         # Category was forwarded by the listing page via request meta.
         category = response.request.meta['category']
         published_date = hxs.select("//span[@class='fontblacklarge']/../../following-sibling::tr[2]/td/b/text()").extract_unquoted()

         item=JobItem()
         images_url = hxs.select("//span[@class='fontblacklarge']/../following-sibling::td[2]/img/@src").extract()

         if images_url:
             # Download the ad image before filling the loader.
             item.load_image(self.get_base_url(response, images_url[0]))

         loader = JobLoader(item)
         loader.add_value('title', title)
         loader.add_value('company', company)
         loader.add_value('category', category)
         loader.add_value('city', city)
         # Keep only the query parameters that identify the posting.
         loader.add_value('details_url', url_query_cleaner(response.url, ('najdi', 'id')))
         loader.add_value('published_date', published_date)
         loader.add_value('id', self.generate_id(response.url, ('najdi', 'id')))
         loader.add_value('content', response.body_as_unicode())
         loader.add_value('source', self.name)
         loader.add_value('source_label', self.label)

         yield loader.load_item()
Beispiel #4
0
 def parse_job_detail(self, response):
     """Parse a job-detail page and yield a populated JobItem.

     Only yields when both a title and a company name are found.
     """
     hxs = HtmlXPathSelector(response)

     title = hxs.select("//div[@id='businessmyaccountjobadpreviewmain']/div/font/text()").extract_unquoted()
     company = hxs.select("//li[@class='propertiesleft' and contains(text(),'Podjetje:')]/following-sibling::li/text()").extract_unquoted()

     if title and company:
         city = hxs.select("//li[@class='propertiesleft' and contains(text(),'Regija in kraj dela:')]/following-sibling::li/text()").extract_unquoted()
         # The XPath contains non-ASCII text, hence the explicit encode
         # for the Python 2 selector.
         category = hxs.select(u"//li[@class='propertiesleft2' and contains(text(),'Področje dela:')]/following-sibling::li/text()".encode('utf-8')).extract_unquoted()
         images_url = hxs.select("//img[@id='mainimage']/@src").extract()
         item=JobItem()

         if images_url:
             # Download the ad image before filling the loader.
             item.load_image(self.get_base_url(response, images_url[0]))

         loader = JobLoader(item)
         loader.add_value('title', title)
         loader.add_value('company', company)
         loader.add_value('category', category)
         loader.add_value('city', city)
         # Strip the query string so the detail URL is stable.
         loader.add_value('details_url', url_query_cleaner(response.url))
         loader.add_value('published_date', hxs.select("//li[@class='dates']/text()").re(r".*:\s+(.*)"))
         loader.add_value('id', self.generate_id(response.url))
         loader.add_value('content', response.body_as_unicode())
         loader.add_value('source', self.name)
         loader.add_value('source_label', self.label)

         yield loader.load_item()
Beispiel #5
0
    def parse_job(self, response):
        """Parse a job-listing page: follow pagination, then yield a
        detail-page Request with a pre-filled JobItem for each posting.
        """
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//ul[@class='pagination']/li[@class='selected']//following-sibling::li[1]/a/@href").extract()

        if next_page:
            next_page = self.get_base_url(response, next_page[0])
            # Avoid re-requesting the page we are already on.
            if next_page != self.get_base_url(response, response.request.url):
                yield Request(url=next_page, callback=self.parse_job, meta={'category': response.request.meta['category']})

        for job in hxs.select("//ul[@id='newJobs']/li"):
            name = job.select("p[@class='jobTitle']/a/text()").extract_unquoted()
            company = job.select("strong/text()").extract_unquoted()

            if name and company:
                detail_url = job.select("p[@class='jobTitle']/a/@href").extract()
                detail_url = self.get_base_url(response, detail_url[0])

                if detail_url:
                    images_url = job.select("div[@class='jobImgDiv']/img/@src").extract()
                    item = JobItem()

                    item['title'] = name
                    item['company'] = company
                    item['category'] = response.request.meta['category']
                    item['summary'] =  job.select("p[2]/text()").extract_unquoted()
                    # Strip the query string so the detail URL is stable.
                    item['details_url'] = url_query_cleaner(detail_url)
                    item['published_date'] = job.select("span[1]/text()").re(r".*:\s(.*)")

                    if images_url:
                        item.load_image(self.get_base_url(response, images_url[0]))

                    yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
Beispiel #6
0
 def parse_product(self, response):
     """Parse a product page; yields one product per variant when a JS
     "stockMatrix" is embedded in the page, otherwise the single
     base product.
     """
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)
     loader = ProductLoader(item=Product(), selector=hxs)
     loader.add_xpath('category', '//li[@typeof="v:Breadcrumb"]/a[@href!="/"]/text()')
     # Brand only appears inside an inline script block.
     brand = hxs.select('//script[@type="text/javascript"]/text()').re('brand: *\"(.+)\"')
     loader.add_value('brand', brand)
     loader.add_xpath('image_url', '//div[@id="amp-originalImage"]/img/@src')
     loader.add_value('url', url_query_cleaner(response.url))
     loader.add_xpath('name', '//input[@name="speedtrapProductDisplayName"]/@value')
     item = loader.load_item()
     if hxs.select('//ul[@class="productOptionsList"]/li[contains(@class, "skuAttribute")]'):
         # The variant matrix is a JS literal; quote nulls and strip
         # newlines so json.loads() accepts it.
         data = hxs.select('//script[contains(text(),"stockMatrix =")]/text()')[0].extract()
         data = data.replace('\n', '').replace('null', '"null"')
         data = re.search('stockMatrix = (.*?);', data, re.DOTALL)
         data = json.loads(data.group(1)) if data else []
         for i, variant in enumerate(data):
             # Attribute values precede the 'sku...' entry; availability
             # and price follow it (sku_idx + 1 / sku_idx + 2).
             sku = [elem for elem in variant if elem.startswith('sku')][0]
             sku_idx = variant.index(sku)
             product = Product(item)
             product['name'] = item['name'] + ' - ' + ' '.join(variant[:sku_idx]).title()
             product['identifier'] = '{}-{}'.format(response.meta.get('row').get('PRODUCT_NUMBER'), i)
             product['sku'] = product['identifier']
             product['price'] = variant[sku_idx + 2]
             product['stock'] = 1 if 'Available#Delivery' in variant[sku_idx + 1] else 0
             yield product
         return
     loader.add_value('identifier', response.meta.get('row').get('PRODUCT_NUMBER'))
     loader.add_value('sku', response.meta.get('row').get('PRODUCT_NUMBER'))
     loader.add_xpath('price', '//input[@name="speedtrapPrice"]/@value')
     stock = 1 if hxs.select('//meta[@property="product:availability"]/@content[.="In Stock"]') else 0
     loader.add_value('stock', stock)
     yield loader.load_item()
Beispiel #7
0
 def closing_parse_simple(self, response):
     """Post-process items produced by the base spider.

     Products get their shipping cost removed and their URL reduced to
     the identifying query parameters; other items pass through as-is.
     """
     parent = super(Bike24Spider, self)
     for item in parent.closing_parse_simple(response):
         if isinstance(item, Product):
             if 'shipping_cost' in item:
                 del item['shipping_cost']
             # Normalize URL
             item['url'] = url_query_cleaner(
                 item['url'],
                 parameterlist=('content', 'product'),
                 sep=';')
         yield item
 def test_url_query_cleaner(self):
     """Cover keeping, duplicates, valueless params and remove mode."""
     clean = url_query_cleaner
     self.assertEqual(
         clean("product.html?id=200&foo=bar&name=wired", ['id']),
         'product.html?id=200')
     self.assertEqual(
         clean("product.html?&id=200&&foo=bar&name=wired", ['id']),
         'product.html?id=200')
     self.assertEqual(
         clean("product.html?foo=bar&name=wired", ['id']),
         'product.html')
     self.assertEqual(
         clean("product.html?id=200&foo=bar&name=wired", ['id', 'name']),
         'product.html?id=200&name=wired')
     self.assertEqual(
         clean("product.html?id&other=3&novalue=", ['id']),
         'product.html?id')
     self.assertEqual(
         clean("product.html?d=1&e=b&d=2&d=3&other=other", ['d'],
               unique=False),
         'product.html?d=1&d=2&d=3')
     self.assertEqual(
         clean("product.html?id=200&foo=bar&name=wired#id20", ['id', 'foo']),
         'product.html?id=200&foo=bar')
     self.assertEqual(
         clean("product.html?id=200&foo=bar&name=wired", ['id'], remove=True),
         'product.html?foo=bar&name=wired')
     self.assertEqual(
         clean("product.html?id=2&foo=bar&name=wired", ['id', 'foo'],
               remove=True),
         'product.html?name=wired')
     self.assertEqual(
         clean("product.html?id=2&foo=bar&name=wired", ['id', 'footo'],
               remove=True),
         'product.html?foo=bar&name=wired')
    def parse_post(self, response):
        """Collect image URLs from every post on the page and yield a
        single ImageItem holding all of them.
        """
        image_urls = []
        hxs = HtmlXPathSelector(response)

        # scrap each row in the table
        posts = hxs.select('//div[@class="l_post"]')

        for post in posts:
            # only one image at most in fact
            images = post.select('.//div[@class="p_content"]/img/@src')
            for image in images:
                url = image.extract()
                url = urljoin_rfc(response.url, url)
                # Empty parameter list: strip the whole query string.
                url = url_query_cleaner(url, [])
                print 'url is %s' % url
                image_urls.append(url)
            # End of For
        #End of For
        item = ImageItem()
        item['image_urls'] = image_urls
        yield item
Beispiel #10
0
 def parse_product(self, response):
     """Scrape a single product page into a Product item.

     The ProductID query parameter is the canonical identifier/SKU; the
     URL is normalized down to ProductName and ProductID.
     """
     loader = ProductLoader(item=Product(), response=response)
     loader.add_xpath('name', '//h2[@class="product-title"]/text()')
     identifier = url_query_parameter(response.url, 'ProductID')
     loader.add_value('identifier', identifier)
     loader.add_value('sku', identifier)
     url = url_query_cleaner(response.url, ('ProductName', 'ProductID'))
     loader.add_value('url', url)
     loader.add_xpath(
         'price',
         '//div[contains(@class, "product-details")]//span[@class="price"]/text()'
     )
     image_url = response.xpath(
         '//img[@class="prodImg"]/@src').extract_first()
     # extract_first() may return None; urljoin(None) would raise.
     if image_url:
         loader.add_value('image_url', response.urljoin(image_url))
     # Raw string so the \d escape is a regex token, not a (deprecated)
     # string escape.
     stock = response.xpath('//div[@id="MasterCopy_Instock"]/h4/text()').re(
         r'\d+')
     if stock:
         loader.add_value('stock', stock[0])
     else:
         loader.add_value('stock', 0)
     yield loader.load_item()
Beispiel #11
0
    def parse_job(self, response):
        """Parse a job-listing page: follow pagination, then yield a
        detail-page Request with a pre-filled JobItem for each ad.
        """
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//div[@class='PagedList-pager']//ul/li[contains(@class, 'PagedList-currentPage')]/following-sibling::li[1]/a/@href").extract()

        if next_page:
            next_page = self.get_base_url(response, next_page[0])

            # Avoid re-requesting the page we are already on.
            if next_page != self.get_base_url(response, response.request.url):
                yield Request(url=next_page, callback=self.parse_job)

        # Category is the selected option of the search form's filter.
        category = hxs.select("//form[@id='searchForm']//select[@name='wfid']/option[@selected='selected']/text()").extract()
        informations = hxs.select("//table[@class='job-add-listing']//tr//div[@class='job-add-item-inner']")

        for information in informations:
            name    = information.select("h2/a/text()").extract()
            company = information.select("p[3]/strong/text()").extract()

            if name and company:
                detail_url = information.select("h2/a/@href").extract()
                detail_url = self.get_base_url(response, detail_url[0])

                if detail_url:
                    images_url = information.select("div[contains(@class,'city-logo')]/img/@src").extract()
                    item = JobItem()

                    item['title'] = name
                    item['company'] = company
                    item['category'] = category
                    item['summary'] = information.select("p[2]/text()").extract()
                    item['city'] = information.select("p[1]/text()").extract()
                    # Strip the query string so the detail URL is stable.
                    item['details_url'] = url_query_cleaner(detail_url)
                    item['published_date'] = information.select("div[contains(@class,'city-logo')]/div/text()").extract()

                    if images_url:
                        item.load_image(self.get_base_url(response, images_url[0]))

                    yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
Beispiel #12
0
    def parse_job(self, response):
        """Parse a job-listing page: follow pagination, then yield a
        detail-page Request with a pre-filled JobItem for each row.
        """
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//span[@class='stevilke']/a[contains(@class, 'active')]/following-sibling::a[1]/@href").extract()

        if next_page:
            next_page = self.get_base_url(response, next_page[0])
            # Avoid re-requesting the page we are already on.
            if next_page != self.get_base_url(response, response.request.url):
                yield Request(url=next_page, callback=self.parse_job)

        # Categories are the labels of the checked sector checkboxes.
        category = hxs.select(
            "//input[@type='checkbox' and contains(@class, 'iskalnik_kriteriji_tip_sektor') and @checked]" +
            "//following-sibling::label[1]/text()"
        ).extract_unquoted()

        for job in hxs.select("//tr[@class='bg_oglas_dm']"):
            name = job.select("td[@class='ena']/div/a/b/text()").extract_unquoted()
            company = job.select("td[@class='dva']/a/text()").extract_unquoted()

            if name and company:
                detail_url = job.select("td[@class='ena']/div/a/@href").extract()
                detail_url = self.get_base_url(response, detail_url[0])

                if detail_url:
                    images_url = job.select("td[@class='stiri']//img/@src").extract()
                    item = JobItem()

                    item['title'] = name
                    item['company'] = company
                    item['category'] = category
                    item['city'] = job.select("td[@class='tri']/a/text()").extract_unquoted()
                    # Strip the query string so the detail URL is stable.
                    item['details_url'] = url_query_cleaner(detail_url)
                    item['published_date'] = job.select("td[@class='stiri']//div[2]/text()").re(r"\s+(\d{2}.\d{2}.\d{4})\s+")

                    if images_url:
                        item.load_image(self.get_base_url(response, images_url[0]))

                    yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
Beispiel #13
0
    def parse(self, response):
        """Parse a paginated listing: follow the AJAX next page and yield
        a detail-page Request for every job row.

        Pagination is an ASP.NET postback, so the next page is fetched by
        POSTing __EVENTTARGET/__EVENTARGUMENT back to the same URL.
        """
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select("//div[contains(@class, 'pagination')]/span/following-sibling::a[1]/@href").re(r"'(.*)','(.*)'")

        if next_page:
            yield Request(
                url=response.url,
                method='POST',
                headers= {
                    'content-type'     : 'application/x-www-form-urlencoded; charset=utf-8',
                    'x-requested-with' : 'XMLHttpRequest',
                    'x-microsoftajax'  : 'Delta=true'
                },
                body=self.build_formdata({
                    '__EVENTTARGET'  : next_page[0],
                    '__EVENTARGUMENT': next_page[1],
                }),
                callback=self.parse
            )

        for add in hxs.select("//div[@class='cc-gv']/table/tbody//tr"):
            name = add.select("td[1]/a/text()").extract_unquoted()
            company = add.select("td[2]/text()").extract_unquoted()

            if name and company:
                detail_url = add.select("td[1]/a/@href").extract()
                detail_url = self.get_base_url(response, detail_url[0])

                if detail_url:
                    item = JobItem()
                    item['title'] = name
                    item['company'] = company
                    item['published_date'] = add.select("td[3]/text()").extract_unquoted()
                    # BUG FIX: ('IDEPD') is just a parenthesised string, not
                    # a tuple — the missing comma made the parameter list the
                    # bare string. Use a real one-element tuple.
                    item['details_url'] = url_query_cleaner(detail_url, ('IDEPD',))
                    item['city'] = add.select("td[4]/text()").extract_unquoted()

                    yield Request(url=detail_url, callback=self.parse_job_detail, meta={'item': item})
Beispiel #14
0
 def generate_id(self, detail_url, parameterlist=()):
     """Build a stable item id from the spider name and a cleaned URL.

     Returns the hex MD5 digest of ``<spider name><cleaned url>``.
     """
     cleaned = url_query_cleaner(detail_url, parameterlist)
     fingerprint = (self.name + cleaned).encode('utf-8')
     return hashlib.md5(fingerprint).hexdigest()
Beispiel #15
0
    def parse_product(self, response):
        """Parse a product page, yielding the base product or one item
        per dropdown option.

        Deduplicates via ``self.id_seen``; during a "simple" run only
        identifiers listed in ``self.matched_identifiers`` are yielded.
        """
        identifier = response.xpath(
            '//form[@id="pdAddToCart"]//input[@name="product"]/@value'
        ).extract()
        if not identifier:
            # No add-to-cart form: nothing to scrape here.
            return

        loader = ProductLoader(item=Product(), response=response)
        # Normalize URL
        product_url = url_query_cleaner(response.url,
                                        parameterlist=('content', 'product'),
                                        sep=';')
        loader.add_value('url', product_url)
        loader.add_value('identifier', identifier)
        sku = response.xpath(
            '//table[@class="table-bordered table-striped table-product-datasheet"]'
            '//td[text()="Item Code:"]/following-sibling::td[1]/text()'
        ).extract()
        if sku:
            loader.add_value('sku', sku[0])
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')

        price = response.xpath(
            '//div[@class="box-price js-price"]/span[@itemprop="price"]/text()'
        ).extract()
        if price:
            # Convert European formatting ('.' thousands, ',' decimals).
            price = extract_price(price[0].strip().replace('.', '').replace(
                ',', '.'))
            loader.add_value('price', price)
        else:
            loader.add_value('price', '0.0')

        image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
        if image_url:
            loader.add_value('image_url', image_url[0])

        brand = response.xpath(
            '//table[@class="table-bordered table-striped table-product-datasheet"]'
            '//td[text()="Manufacturer:"]/following-sibling::td[1]/text()'
        ).extract()
        if brand:
            loader.add_value('brand', brand[0])

        category = response.xpath(
            '//ul[@class="nav"]//li[contains(@class,"item-active")]/a/text()'
        ).extract()
        if category:
            loader.add_value('category', category)

        availability = response.xpath(
            '//*[@id="js-availability-label"]/text()').extract()
        if availability and 'unknown' in availability[0].lower():
            loader.add_value('stock', 0)

        product = loader.load_item()
        options = response.xpath(
            '//div[@class="input-group input-group-select"]/select')
        if not options:
            # No variants: yield the base product once, honouring
            # simple-run filtering and the seen-id cache.
            if not (getattr(self, 'simple_run', False) and (hasattr(self, 'matched_identifiers')) \
               and (product['identifier'] not in self.matched_identifiers)):

                if not product['identifier'] in self.id_seen:
                    self.id_seen.append(product['identifier'])
                    yield product

            return

        for sel in options:
            opt = ''
            select_name = sel.xpath('@name').extract()
            if select_name:
                opt = select_name[0].replace('opt_', '')
            # value="-2" appears to be the placeholder option; skip it.
            for option in sel.xpath('option[@value!="-2"]'):
                item = Product(product)
                opt_id = option.xpath('@value').extract()
                if opt_id:
                    item['identifier'] += '-' + opt + '-' + opt_id[0]
                    item['stock'] = 1
                    # data-av == '100' clears the stock flag.
                    opt_stock = option.xpath('@data-av').extract()
                    if opt_stock and opt_stock[0] == '100':
                        item['stock'] = 0
                    opt_name = option.xpath('text()').extract()
                    if opt_name:
                        item['name'] += ' - ' + opt_name[0]

                    if getattr(self, 'simple_run', False) and (hasattr(self, 'matched_identifiers')) \
                       and (item['identifier'] not in self.matched_identifiers):
                        continue

                    if not item['identifier'] in self.id_seen:
                        self.id_seen.append(item['identifier'])
                        yield item
Beispiel #16
0
    def parse_product(self, response):
        """Parse a product page; yield one product per variant when a JS
        "stockMatrix" is embedded, otherwise the single base product.
        """

        loader = ProductLoader(item=Product(), response=response)
        # First breadcrumb element is the site root; skip it.
        categories = response.xpath(
            '//ul[@class="breadcrumbList"]/li[@itemprop="itemListElement"]//span[@itemprop="name"]/text()'
        ).extract()[1:]
        loader.add_value('category', categories)
        # Brand only appears inside an inline script block.
        brand = response.xpath('//script[@type="text/javascript"]/text()').re(
            'brand: *\"(.+)\"')
        loader.add_value('brand', brand)
        loader.add_xpath('image_url',
                         '//div[@id="amp-originalImage"]/img/@src')
        loader.add_value('url', url_query_cleaner(response.url))
        loader.add_xpath(
            'name', '//input[@name="speedtrapProductDisplayName"]/@value')
        identifier = response.xpath('//text()').re("productId: '(.*)'")[0]
        loader.add_value('identifier', identifier)
        sku = response.xpath('//span[@id="productEAN"]/text()').extract()
        sku = sku[-1].strip() if sku else ''
        loader.add_value('sku', sku)
        loader.add_xpath('price', '//input[@name="speedtrapPrice"]/@value')
        stock = 1 if response.xpath(
            '//meta[@property="product:availability"]/@content[.="In Stock"]'
        ) else 0
        loader.add_value('stock', stock)
        loader.add_value('shipping_cost', 3.99)
        item = loader.load_item()

        options = response.xpath(
            '//ul[@class="productOptionsList"]/li[contains(@class, "skuAttribute")]'
        )
        if options:
            # The variant matrix is a JS literal; quote nulls and strip
            # newlines so json.loads() accepts it.
            data = response.xpath(
                '//script[contains(text(),"stockMatrix =")]/text()'
            )[0].extract()
            data = data.replace('\n', '').replace('null', '"null"')
            data = re.search('stockMatrix = (.*?);', data, re.DOTALL)
            data = json.loads(data.group(1)) if data else []
            for i, variant in enumerate(data):
                # Attribute values precede the 'sku...' entry; availability
                # and price follow it (sku_idx + 1 / sku_idx + 2).
                sku = [elem for elem in variant if elem.startswith('sku')][0]
                sku_idx = variant.index(sku)
                product = Product(item)
                product['name'] = item['name'] + ' - ' + ' '.join(
                    variant[:sku_idx]).title()
                product['identifier'] += '-' + sku
                product['price'] = extract_price(str(variant[sku_idx + 2]))
                if not ('Available#Delivery' in variant[sku_idx + 1]
                        or 'In stock#' in variant[sku_idx + 1]
                        or 'Low stock#' in variant[sku_idx + 1]):
                    product['stock'] = 0

                # Variant image is looked up by the first attribute value.
                image_code = response.xpath(
                    '//li[input[@value="' + variant[0] +
                    '"]]/input[@class="colourImageUrl"]/@value').extract()
                if image_code:
                    image_url = 'http://media.very.co.uk/i/very/' + image_code[
                        0]
                    product['image_url'] = image_url

                yield product
        else:
            yield item
Beispiel #17
0
    def parse_product(self, response):
        """Parse a product page from its embedded "product/data" JSON.

        Falls back to the Redux-state parser when only
        ``window.__WML_REDUX_INITIAL_STATE__`` is present, and retries the
        request (up to 20 times) when neither data blob is found. Variant
        products are re-requested one by one via their ``usItemId``.
        """
        data = response.xpath(
            '//script/text()[contains(., "product/data")]').extract_first()
        rdata = response.xpath(
            '//script/text()[contains(., "window.__WML_REDUX_INITIAL_STATE__")]'
        ).extract_first()
        if not data:
            if rdata:
                for product in self.parse_product_rdata(response):
                    yield product
            else:
                retries = response.meta.get('retries', 0)
                if retries < 20:
                    self.logger.warning('No product data on %s. Retrying.' %
                                        response.url)
                    yield Request(response.url,
                                  self.parse_product,
                                  meta={'retries': retries + 1},
                                  dont_filter=True)
                else:
                    self.logger.warning(
                        'No product data found on %s. Gave up retrying' %
                        response.url)
            return

        data = json.loads(
            re.search('product/data",[ \n]*({.+})', data).group(1))

        loader = ProductLoader(item=Product(), response=response)

        product_id = response.xpath(
            '//form[@name="SelectProductForm"]/input[@name="product_id"]/@value'
        ).extract()
        if product_id:
            identifier = product_id[0]
        else:
            # Fall back to the last path segment of the cleaned URL.
            identifier = url_query_cleaner(response.url).split('/')[-1]
        identifier = identifier.split('?')[0]

        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)

        # Python 2: filter/map return lists; this drops empty text nodes.
        name = filter(
            lambda n: n,
            map(unicode.strip,
                response.xpath('//h1[@itemprop="name"]//text()').extract()))
        if not name:
            name = filter(
                lambda n: n,
                map(
                    unicode.strip,
                    response.xpath(
                        '//h1[contains(@class,"product-name")]//text()').
                    extract()))
        if name:
            loader.add_value('name', name[0].strip())
        #loader.add_xpath('name', '//option[@selected and not(@disabled)]/text()')

        loader.add_css('brand', 'a.product-brand span::text')

        categories = response.xpath(
            '//div[@itemprop="breadcrumb"]//span[@itemprop="title"]/text()'
        ).extract()
        if not categories:
            categories = response.xpath(
                '//div[@itemprop="breadcrumb"]//span[@itemprop="name"]/text()'
            ).extract()
        if categories:
            if 'Home' in categories:
                categories.remove('Home')
            loader.add_value('category', categories)
        elif 'category' in response.meta:
            loader.add_value('category', response.meta['category'])

        loader.add_value('url', response.url)

        # Try progressively less specific price locations.
        price = response.xpath('//@data-product-price').extract_first()
        if price:
            price = [price] if price else None
        if not price:
            price = response.xpath(
                '//div[@id="WM_PRICE"]//*[contains(@class,"camelPrice")]/span/text()'
            ).extract()
        if not price:
            price = response.xpath(
                '//div[@class="onlinePriceMP"]//*[contains(@class,"camelPrice")]/span/text()'
            ).extract()
        if not price:
            price = response.xpath(
                '//div[@itemprop="offers"]/div[contains(@class, "product-price")]//*[@itemprop="price"][1]//text()'
            ).extract()
        if not price:
            price = response.xpath(
                '//div[@class="col5"]//div[contains(@class,"product-buying-table-row")][1]//div[contains(@class,"price-display")][1]//text()'
            ).extract()
        if not price:
            price = response.xpath('//*[@itemprop="price"]//text()').extract()

        price = ''.join(price).strip() if price else 0

        loader.add_value('price', price)

        stock = response.xpath(
            '//meta[@itemprop="availability"]/@content').extract_first()
        if not stock or stock != 'InStock':
            loader.add_value('stock', 0)

        image = response.xpath(
            '//div[@class="LargeItemPhoto215"]//img/@src').extract()
        if not image:
            image = response.xpath(
                '//div[contains(@class,"product-images")][1]//img/@src'
            ).extract()
        if image:
            loader.add_value('image_url', image[0])

        try:
            loader.add_value(
                'shipping_cost',
                data['buyingOptions']['shippingPrice']['displayPrice'])
        except KeyError:
            loader.add_css('shipping_cost', 'h2.js-shipping-primary-msg::text')

        if not data or not data.get('variantInformation'):
            yield loader.load_item()
            return

        # A 'selected' query parameter marks a variant page we requested
        # ourselves (see the loop at the bottom).
        if url_query_parameter(response.url, 'selected'):
            if response.css('div.product-buying-table').xpath(
                    './/div[contains(., "Information unavailable")]'
            ) or price == 0:
                # Variant page not fully usable; retry a few times.
                retries = response.meta.get('retries', 0)
                if retries < 9:
                    yield Request(response.url,
                                  self.parse_product,
                                  meta={'retries': retries + 1},
                                  dont_filter=True)
                return
            for option in data['variantInformation']['variantTypes']:
                try:
                    loader.add_value('name', option['selectedValue'])
                except KeyError:
                    pass
            yield loader.load_item()
            return

        for variant in data['variantInformation']['variantProducts']:
            try:
                option_id = variant['buyingOptions']['usItemId']
            except KeyError:
                continue
            url = '/'.join(response.url.split('/')[:-1])
            url += '/%s' % option_id
            yield Request(add_or_replace_parameter(url, 'selected', 'True'),
                          self.parse_product)
Beispiel #18
0
    def parse_product(self, response):
        """Parse a product page, yielding the base product or one item
        per <select> option.

        Option surcharges are added onto the base price and stock is
        cleared when the option's data-av attribute is '100'.
        Deduplicates via ``self.id_seen``; during a "simple" run only
        identifiers listed in ``self.matched_identifiers`` are yielded.
        """
        if not isinstance(response, HtmlResponse):
            return

        identifier = response.xpath(
            '//form[@id="pdAddToCart"]//input[@name="product"]/@value'
        ).extract()
        if not identifier:
            # No add-to-cart form: nothing to scrape here.
            return

        base_url = get_base_url(response)

        loader = ProductLoader(item=Product(), response=response)
        # Normalize URL
        product_url = url_query_cleaner(response.url,
                                        parameterlist=('content', 'product'),
                                        sep=';')
        loader.add_value('url', product_url)
        loader.add_value('identifier', identifier[0])
        sku = response.xpath(
            '//td[text()="Item Code:"]/following-sibling::td[1]/text()'
        ).extract()
        if sku:
            loader.add_value('sku', sku[0])
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        price = response.xpath('//span[@itemprop="price"]/text()').extract()
        if price:
            # Convert European formatting ('.' thousands, ',' decimals).
            price = extract_price(price[0].strip().replace('.', '').replace(
                ',', '.'))
            loader.add_value('price', self.convert_to_pounds(str(price)))
        else:
            loader.add_value('price', '0.0')

        image_url = response.xpath('//img[@itemprop="image"]/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        brand = response.xpath(
            '//td[text()="Manufacturer:"]/following-sibling::td[1]/text()'
        ).extract()
        if brand:
            loader.add_value('brand', brand[0])

        category = response.xpath(
            '//main//span[@class="text-title"]/text()').extract()
        if category:
            loader.add_value('category', category[0].split(':')[0].strip())

        availability = response.xpath(
            '//div[@class="pd-availability"]/span[contains(text(),"Delivery")]/text()'
        ).extract()
        if availability and 'unknown' in availability[0].lower():
            loader.add_value('stock', 0)

        product = loader.load_item()
        options = response.xpath('//form[@id="pdAddToCart"]//select')
        if not options:
            # No variants: yield the base product once, honouring
            # simple-run filtering and the seen-id cache.
            if not (getattr(self, 'simple_run', False) and (hasattr(self, 'matched_identifiers')) \
               and (product['identifier'] not in self.matched_identifiers)):

                if not product['identifier'] in self.id_seen:
                    self.id_seen.append(product['identifier'])
                    yield product

            return

        for sel in options:
            opt = ''
            select_name = sel.xpath('@name').extract()
            if select_name:
                opt = select_name[0].replace('opt_', '')
            for option in sel.xpath('option[@value!="-2"]'):
                item = Product(product)
                opt_id = option.xpath('@value').extract()
                if opt_id:
                    item['identifier'] += '-' + opt + '-' + opt_id[0]
                    item['stock'] = 1
                    # BUG FIX: xpath() returns a SelectorList, which never
                    # equals '100', so the out-of-stock branch could never
                    # fire; extract the attribute value before comparing.
                    opt_stock = option.xpath('@data-av').extract()
                    if opt_stock and opt_stock[0] == '100':
                        item['stock'] = 0
                    opt_name = option.xpath('text()').extract()
                    if opt_name:
                        item['name'] += ' - ' + opt_name[0]
                    opt_surcharge = option.xpath('@data-surcharge').extract()
                    if opt_surcharge:
                        item['price'] += extract_price(opt_surcharge[0])

                    if getattr(self, 'simple_run', False) and (hasattr(self, 'matched_identifiers')) \
                       and (item['identifier'] not in self.matched_identifiers):
                        continue

                    if not item['identifier'] in self.id_seen:
                        self.id_seen.append(item['identifier'])
                        yield item