Esempio n. 1
0
 def parse_product(self, response):
     """Build a product from the item carried in ``response.meta``.

     The item stored under ``response.meta['item']`` seeds the loader.  Its
     price and category are then replaced with values read from the page,
     and stock is set to 0 unless the offer availability is exactly
     ``'inStock'``.

     Yields:
         The single item produced by the loader.
     """
     seed_item = response.meta['item']
     schema = SpiderSchema(response).get_product()
     breadcrumbs = response.css('a.GTM-breadcumb::text').extract()

     product_loader = ProductLoaderEU(Product(), response=response)
     product_loader.add_value(None, seed_item)

     offer = schema['offers']['properties']
     product_loader.replace_value('price', offer['price'])
     # The first breadcrumb entry is left out of the category path.
     product_loader.replace_value('category', breadcrumbs[1:])

     in_stock = offer['availability'] == 'inStock'
     if not in_stock:
         product_loader.replace_value('stock', 0)
     yield product_loader.load_item()
Esempio n. 2
0
 def parse_products(self, response):
     """Parse one page of the JSON product listing and request the next.

     The response body is decoded as JSON and products are read from
     ``data[0]['result']``.  An empty result list ends pagination: nothing
     is yielded.  Otherwise one item per product is yielded, followed by a
     POST request for the next page with the offset advanced by 200.
     """
     payload = json.loads(response.body)
     if not payload[0]['result']:
         return

     for entry in payload[0]['result']:
         loader = ProductLoader(item=Product(), response=response)
         title = entry['name']['sv']
         loader.add_value('name', title)
         images = entry['images']
         if images:
             loader.add_value('image_url', images[0])
         loader.add_value('url', entry['url']['sv'])
         loader.add_value('identifier', entry['uid'])
         # The SKU is whatever self.re_sku finds inside the product name.
         loader.add_value('sku', self.re_sku.findall(title))
         loader.add_value('price', entry['price']['current']['SEK'])
         if not entry['isBuyable']:
             loader.add_value('stock', 0)
         yield loader.load_item()

     next_offset = response.meta['offset'] + 200
     request_body = self.post_data.replace('{}', str(next_offset))
     yield scrapy.Request(
         self.post_url,
         method='POST',
         body=request_body,
         callback=self.parse_products,
         meta={'offset': next_offset},
         dont_filter=True,
     )
Esempio n. 3
0
 def parse_product(self, response):
     """Extract a single product from a product detail page.

     Identifier, name, price and image are read with CSS/XPath selectors.
     A literal 0 is added as an extra price value after the scraped one
     (presumably a fallback for pages without a price -- confirm against
     the loader's output processor).  The SKU is the longest
     ``self.re_sku`` match found in the product name.  Stock is set to 0
     unless the ``availability`` link href contains ``instock``
     (case-insensitive).
     """
     product_loader = ProductLoader(Product(), response=response)
     product_loader.add_css('identifier', 'input.qs-cart-pid::attr(value)')
     product_loader.add_xpath('identifier', '//script/text()', re='product_id=(.+)"')
     product_loader.add_value('url', response.url)
     product_loader.add_css('name', 'h1.product-description-header::text')
     product_loader.add_css('price', 'input.qs-cart-price::attr(value)')
     product_loader.add_value('price', 0)

     product_name = product_loader.get_output_value('name')
     sku_candidates = self.re_sku.findall(product_name)
     if sku_candidates:
         product_loader.add_value('sku', max(sku_candidates, key=len))

     product_loader.add_css('image_url', 'div.product-images ::attr(src)')

     availability = response.xpath(
         '//link[@itemprop="availability"]/@href').extract_first()
     in_stock = bool(availability) and 'instock' in availability.lower()
     if not in_stock:
         product_loader.add_value('stock', 0)
     yield product_loader.load_item()
Esempio n. 4
0
    def parse_product(response):
        """Extract one product from a product page and yield it.

        Identifier, name, image, price and SKU are read from the page; the
        category is the last three entries of ``response.meta['category']``.

        NOTE(review): the signature has no ``self`` although the block is
        indented like a method -- confirm it is used as a static or nested
        function.
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)
        loader = ProductLoader(item=Product(), selector=selector)

        picture_src = response.css('.picture').xpath('img/@src').extract_first()
        identifier = response.xpath('//@data-productid').extract_first()
        title = response.xpath('//h1[@itemprop="name"]/text()').extract_first()

        loader.add_value('identifier', identifier)
        loader.add_value('name', title)
        if picture_src:
            absolute_image = urljoin_rfc(page_base, picture_src)
            loader.add_value('image_url', absolute_image)

        # Joining the non-whitespace / word matches drops any whitespace.
        price_tokens = response.xpath('//span[@itemprop="price"]/text()').re('\S+')
        sku_tokens = response.xpath('//span[@itemprop="sku"]/text()').re('\w+')
        loader.add_value('sku', ''.join(sku_tokens))
        loader.add_value('price', ''.join(price_tokens))
        loader.add_value('url', response.url)
        loader.add_value('category', response.meta['category'][-3:])
        yield loader.load_item()
Esempio n. 5
0
    def parse_item(self, response):
        """Parse a product page and yield its product item(s).

        A page with exactly one price (or a price found in the
        ``product-price-<id>`` element) yields a single item.  Otherwise
        each row of the ``super-product-table`` is yielded as its own item
        sharing the page-level url, image, category, brand, stock and
        shipping cost.  Pages without a name, an identifier or a usable
        price are logged and skipped.
        """
        hxs = HtmlXPathSelector(response)

        name = hxs.select("//div[@class='product-name']/h1/text()").extract()
        if not name:
            self.log('No name on %s' % response.url)
            return

        identifiers = response.xpath(
            "//input[@type='hidden'][@name='product']/@value").extract()
        if not identifiers:
            # Without the hidden product input there is nothing to key the
            # item on; skip the page instead of failing with IndexError.
            self.log('No identifier on %s' % response.url)
            return
        identifier = identifiers[0]

        product_image = hxs.select('//*[@id="ma-zoom1"]/img/@src').extract()
        if product_image:
            product_image = urljoin_rfc(get_base_url(response), product_image[0])
        category = ''.join(
            hxs.select('//div[@class="breadcrumbs"]/ul/li[2]/a/text()').extract())

        # Cell following the spec-table header whose text contains a label.
        spec_cell = ('//table[@id="product-attribute-specs-table"]'
                     '//th[@class="label" and contains(text(), "%s")]'
                     '/following-sibling::td')

        shipping = hxs.select(spec_cell % 'Spese Spedizione' + '/text()').extract()
        if not shipping:
            shipping = hxs.select(spec_cell % 'Shipping Cost' + '/text()').extract()
        if shipping:
            shipping_cost = shipping[0].strip()
            if shipping_cost == 'Gratis':
                shipping_cost = '0.0'
            else:
                shipping_cost = extract_price_eu(shipping[0])
                # An implausibly large EU-style result is re-parsed with the
                # other price parser.
                if shipping_cost >= Decimal(1000):
                    shipping_cost = extract_price(shipping[0])
        else:
            shipping_cost = None

        brand = hxs.select(spec_cell % 'Marca' + '/a/@title').extract()
        if not brand:
            brand = hxs.select(spec_cell % 'Marca' + '/text()').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath(
            'sku',
            'normalize-space(substring-after(//li[contains(text(),"Codice:")]/text(), ":"))')
        loader.add_value('url', response.url)
        loader.add_value('image_url', product_image)
        loader.add_value('category', category)
        if brand:
            loader.add_value('brand', brand[0].strip())

        # The class test must be relative to the availability node: an
        # absolute '//@class' would match any element in the document.
        availability = response.xpath('//p[contains(@class, "availability")]')
        in_stock = availability.xpath(
            './/@class[contains(., "instock") or contains(., "in-stock")]')
        loader.add_value('stock', 1 if in_stock else 0)

        if shipping_cost is not None:
            loader.add_value('shipping_cost', shipping_cost)

        price = response.xpath(
            '//div[@class="product-shop"]//span[@itemprop="price"]/text()').extract()
        used_fallback = False
        if not price:
            price = response.xpath(
                '//*[@id="product-price-{}"]//text()'.format(identifier)
            ).re(r'[\d,.]+')
            used_fallback = True

        # A fallback price is accepted whenever it exists; an itemprop price
        # only when it is unambiguous (exactly one match).
        if price and (used_fallback or len(price) == 1):
            loader.add_value('identifier', identifier)
            loader.add_value('name', name)
            loader.add_value('price', price[0])
            yield loader.load_item()
            return

        table = response.xpath('//table[@id="super-product-table"]')
        if not table:
            self.log('No correct price found on %s' % response.url)
            self.log('Price is %s' % price)
            return

        base_item = loader.load_item()

        for row in table.xpath('tbody/tr[td/input]'):
            # Always seed from the page-level item so values from one row
            # can never leak into the next one.
            row_loader = ProductLoader(item=Product(base_item), selector=row)
            row_loader.replace_xpath('name', 'td[1]/text()')
            row_loader.replace_xpath('identifier', 'td/div/span/@id', re=r'\d+')
            row_loader.replace_xpath(
                'price',
                './/span[contains(@id, "product-price")]//text()',
                re=r'\S+')
            yield row_loader.load_item()
Esempio n. 6
0
 def parse_product(self, response):
     """Parse a product page, expanding bundle options into variants.

     Nothing is yielded when no price is found.  Without a
     ``bundle-option`` select the base item is yielded as is.  Otherwise
     one item is yielded per combination of non-empty options: its name is
     built from the option texts, its identifier and SKU from the option
     values joined with ``-``, and its price is the base price plus every
     option's ``data-extra-cost``.
     """
     base_loader = ProductLoaderEU(item=Product(), response=response)
     product_id = response.xpath('//@data-id').extract_first()
     base_loader.add_value('identifier', product_id)
     base_loader.add_xpath('url', '//link[@rel="canonical"]/@href')
     base_loader.add_xpath('name', '(//h1/text())[1]')
     base_loader.add_css('price', '.price-including-tax .price ::text')
     if not base_loader.get_output_value('price'):
         return

     base_loader.add_value('sku', product_id)
     base_loader.add_value('category', response.meta.get('category'))
     image_src = response.xpath('//img[@id="image"]/@src').extract_first()
     if image_src:
         base_loader.add_value('image_url', response.urljoin(image_src))
     base_loader.add_xpath('brand', '//strong[text()="Brand:"]/following-sibling::a/text()')
     base_loader.add_xpath('brand', '//img[contains(@src, "/brands/")]/@title')

     available = response.css('.in-stock').xpath(
         'div[@itemprop="availability"][not (contains(., "Ikke"))]').extract()
     if not available:
         # Unavailable products are stored with no stock and a zero price.
         base_loader.add_value('stock', 0)
         base_loader.replace_value('price', 0)
     base_item = base_loader.load_item()

     selects = response.xpath('//select[@id="bundle-option"]')
     if not selects:
         yield base_item
         return

     choices_per_select = [
         select.xpath('.//option[@value!=""]') for select in selects
     ]
     for combination in itertools.product(*choices_per_select):
         variant_loader = ProductLoader(item=Product(), response=response)
         variant_loader.add_value(None, base_item)
         variant_loader.replace_value('name', '')
         variant_id = ''
         variant_price = base_item['price']
         for choice in combination:
             variant_id += '-' + choice.xpath('@value').extract_first()
             variant_loader.add_value('name',  choice.xpath('text()').extract_first())
             if choice.xpath('@disabled'):
                 variant_loader.replace_value('stock', 0)
             surcharge = choice.xpath('@data-extra-cost').extract_first()
             if surcharge:
                 variant_price += Decimal(surcharge)
         variant_id = variant_id.strip('-')
         variant_loader.replace_value('price', variant_price)
         variant_loader.replace_value('identifier', variant_id)
         variant_loader.replace_value('sku', variant_id)
         yield variant_loader.load_item()
         
Esempio n. 7
0
    def parse_product(self, response):
        """Parse a product page: the main product plus offer/accessory rows.

        Nothing is extracted when the page contains the ``nbPagesPerPage``
        select or the ``titre_image_niv1`` heading (presumably listing or
        category pages -- confirm).  Otherwise the main product is yielded
        when it has a name, and every row of the offer and accessory blocks
        is either followed as a request (when it links to its own page) or
        yielded as an item built from the row itself.
        """
        hxs = HtmlXPathSelector(response)
        pages = hxs.select('//select[@name="nbPagesPerPage"]')
        cat_text = hxs.select('//h2[@class="titre_image titre_image_niv1"]')
        if pages or cat_text:
            return

        base_url = get_base_url(response)

        # Last breadcrumb title, or None when the page has no breadcrumb.
        breadcrumb = hxs.select(
            '//div[@id="chemin_os"]//a/span[@itemprop="title"]/text()'
        ).extract()
        category = breadcrumb[-1] if breadcrumb else None

        main_ref = hxs.select(
            '//div[@id="ficheProduitPied"]//span[@class="reference"]/text()'
        ).re(r'R\xe9f. (.*)')
        name = response.xpath(
            '//div[@id="ficheProduitPied"]/div[@id="fichetitre"]/text()'
        ).extract()
        if not name or not name[0].strip():
            name = response.xpath('//span[@itemprop="name"]/text()').extract()
        price = ''.join(
            response.xpath(
                '//div[@id="ficheProduitPied"]//*[@class="prix"]/text()'
            ).re(r'\S+'))

        if name:
            identifier = remove_punctuation_and_spaces(name[0]).lower()
            image_url = response.xpath(
                '//div[@id="ficheProduitPied"]//img/@src').extract()
            image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''

            l = ProductLoader(
                item=Product(),
                selector=response.xpath('//div[@id="ficheProduitPied"]'))
            l.add_value('identifier', identifier)
            l.add_value('name', name)
            if category:
                l.add_value('category', category)
            l.add_xpath('sku', '//div[@id="ligne_achat"]//text()', re=r':(.+)')
            l.add_value('stock', 1)
            l.add_value('url', response.url)
            l.add_value('price', price)
            l.add_value('image_url', image_url)
            yield l.load_item()

        products = hxs.select(
            '//div[@id="bloc_offre"]/div/div[@class="bloc_cadre_pied"]/form[@class="mini_fiche_ligne"]'
        )
        products += hxs.select(
            '//div[@id="bloc_accessoire"]/div/div[@class="bloc_cadre_pied"]/form[@class="mini_fiche_ligne"]'
        )
        for p in products:
            p_url = p.select('.//div[@class="ligne_titre"]/a/@href').extract()
            if p_url:
                # The row links to a dedicated page: parse it there instead.
                yield Request(urljoin_rfc(base_url, p_url[0]),
                              callback=self.parse_product)
                continue

            name = p.select(
                './/div[@class="colonne_1"]/div[@class="ligne_titre"]/span[@class="titre_descriptif"]/strong/text()'
            )
            if not name:
                name = p.select(
                    './/div[@class="colonne_1"]/div[@class="ligne_titre"]/a/span[@class="titre_descriptif"]/strong/text()'
                )
            if not name:
                # A row without a title cannot be identified; skip it rather
                # than abort the remaining rows with an IndexError.
                continue
            name = name[0].extract().strip()
            name = name.replace('- OFFRE SPECIALE !', '').strip()

            price = "".join(
                p.select(
                    './/div[@class="lignebeige"]/div[@class="wrapperPrix"]/div/div/div/b/text()'
                ).re(r'([0-9\,\. ]+)')).strip()
            identifier = remove_punctuation_and_spaces(name).lower()
            image_url = p.select('.//div/img/@src').extract()
            if image_url:
                image_url = urljoin_rfc(base_url, image_url[0])

            sku = ''
            p_ref = p.select('.//span[@class="reference"]//text()').re(r'(\d+)')
            if main_ref and p_ref and p_ref[0] == main_ref[0]:
                # The row is the main product itself: take the SKU from the
                # page-level purchase line (absolute XPath on purpose).
                sku_cells = p.select('//div[@id="ligne_achat"]/table/tr/td/text()')
                p_sku = sku_cells.extract()
                if p_sku:
                    parts = p_sku[0].strip().split(': ')
                    if len(parts) > 1:
                        sku = parts[1]
                    else:
                        # No ': ' separator: fall back to the third
                        # whitespace-separated token, when there is one.
                        tokens = sku_cells.re(r'\S+')
                        if len(tokens) > 2:
                            sku = tokens[2]

            l = ProductLoader(item=Product(), response=response)
            l.add_value('identifier', identifier)
            l.add_value('name', name)
            if category:
                l.add_value('category', category)
            l.add_value('sku', sku)
            l.add_value('stock', 1)
            l.add_value('url', response.url)
            l.add_value('price', price)
            l.add_value('image_url', image_url)
            yield l.load_item()
Esempio n. 8
0
    def parse_product(self, response):
        """Parse a product page and yield the product or its option variants.

        The base item is built from identifier, SKU, name, url, price,
        category and image.  When the page lists attribute options, one
        copy of the base item is yielded per option: the option value is
        appended to the identifier, the option label to the name, and any
        amount in parentheses inside the label is added to the price.
        Without options the base item itself is yielded.
        """
        selector = HtmlXPathSelector(response)
        # The result is not used below; the call is kept as in the original.
        base_url = get_base_url(response)
        loader = ProductLoader(item=Product(), selector=selector)

        product_id = response.xpath('//input[@id="products-id"]/@value').extract_first()
        loader.add_value('identifier', product_id)

        model = response.xpath('//span[@itemprop="model"]/text()').extract_first()
        loader.add_value('sku', model)

        title = response.xpath('//h2/span[@itemprop="name"]/text()').extract_first()
        if not title:
            title = response.xpath('//h1/text()').extract_first()
        loader.add_value('name', title)

        loader.add_value('url', response.url)

        price = response.xpath('//span[@itemprop="price"]/@content').extract_first()
        if price:
            price = price.replace('.', ',')
        else:
            # Fall through progressively looser sources, ending at 0.
            price = response.xpath('//span[@itemprop="price"]/text()').extract_first()
            if not price:
                price = response.css('div.current-price-container').xpath(
                    'br/following::text()').extract_first()
            if not price:
                price = response.css(
                    'div.current-price-container ::text').extract_first()
            if not price:
                price = 0
        loader.add_value('price', price)

        breadcrumbs = selector.select(
            '//div[@id="breadcrumb_navi"]/span/a/span/text()').extract()
        if len(breadcrumbs) > 2:
            # First and last breadcrumb entries are left out.
            category = breadcrumbs[1:-1]
        else:
            category = ''
        loader.add_value('category', category)

        image_src = response.xpath('//img[@itemprop="image"]/@src').extract_first()
        if image_src:
            loader.add_value('image_url', response.urljoin(image_src))

        item = loader.load_item()

        option_labels = response.css('fieldset.attributes div div label')
        if not option_labels:
            yield item
            return

        for label in option_labels:
            variant = deepcopy(item)
            variant['identifier'] += '-' + label.xpath('.//input/@value').extract_first()
            label_text = ' '.join(label.xpath('text()').extract()).strip()
            if '(' in label_text:
                # Text after the last '(' is the surcharge; text before the
                # first '(' is the option name.
                surcharge = extract_price(label_text.rpartition('(')[2])
                label_text = label_text.partition('(')[0].strip()
                variant['price'] += surcharge
            variant['name'] += ' ' + label_text
            yield variant
Esempio n. 9
0
    def parse_product(self, response):
        """Yield one item per entry of the ``valStaffelSelection`` list.

        A base item (identifier, cleaned url, name, category, image, brand)
        is built first and is not yielded itself.  Each list entry copies
        it and overrides identifier, url, price and SKU, and adds the entry
        label to the name.  The brand is the last breadcrumb text.
        """
        base_loader = ProductLoader(Product(), response=response)
        product_id = response.css('input.productId::attr(value)').extract_first()
        base_loader.add_value('identifier', product_id)
        base_loader.add_value('url', url_query_cleaner(response.url))
        base_loader.add_css('name', '.title h1::text')

        breadcrumbs = response.css('.breadcrumbs a::text').extract()
        base_loader.add_value('category', breadcrumbs[2:])

        image_src = response.css('.productDetail1 .image img::attr(src)').extract_first()
        if image_src:
            base_loader.add_value('image_url', response.urljoin(image_src))
        base_loader.add_value('brand', breadcrumbs[-1])
        base_item = base_loader.load_item()

        for entry in response.xpath('//div[@id="valStaffelSelection"]//li'):
            entry_loader = ProductLoader(Product(), selector=entry)
            entry_loader.add_value(None, base_item)

            entry_id = base_item['identifier'] + '-' + entry.xpath(
                'input/@value').extract_first()
            entry_loader.replace_value('identifier', entry_id)

            # The entry's class attribute is used as the url query string.
            entry_url = base_item['url'] + '?' + entry.xpath('@class').extract_first()
            entry_loader.replace_value('url', entry_url)

            entry_loader.add_css('name', 'span.label::text')
            # The last price text node of the entry is the one that is used.
            price_texts = entry.css('div.price::text').extract()
            entry_loader.replace_value('price', price_texts[-1])
            entry_loader.replace_value('sku', entry_id)
            yield entry_loader.load_item()
Esempio n. 10
0
    def parse(self, response):
        """Follow pagination links and yield one item per listed product.

        Product data comes from the ``data-*`` attributes of each
        ``ul#products > li`` element; the category comes from the page
        breadcrumbs (all entries after the first).
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)

        pagination_links = selector.select('//ul[@class="pagination"]//a/@href').extract()
        for href in pagination_links:
            yield Request(urljoin(page_base, href))

        for node in selector.select('//ul[@id="products"]/li'):
            item_loader = ProductLoader(item=Product(), selector=node)
            item_loader.add_xpath('identifier', './/@data-id')

            # First link of the entry, with its query string removed.
            first_href = node.select('.//a/@href').extract()[0]
            item_loader.add_value('url', urljoin(page_base, first_href.split('?')[0]))

            item_loader.add_xpath('name', './/@data-name')
            # Joining single non-whitespace characters strips all whitespace.
            price_chars = node.select('.//@data-price').re('\S')
            item_loader.add_value('price', ''.join(price_chars))
            item_loader.add_xpath('sku', './/@data-id')
            item_loader.add_xpath(
                'category',
                '//ol[@id="breadcrumbs"]/li[position()>1]/a/span/text()')
            item_loader.add_xpath('image_url', './/@src')
            item_loader.add_xpath('brand', './/@data-brand')
            yield item_loader.load_item()
Esempio n. 11
0
 def parse_product(self, response):
     """Extract one product; pages without an identifier are skipped.

     The identifier doubles as the SKU.  The brand comes from the
     ``brand`` meta tag, falling back to the first ``"manufacturer"``
     value found in the page scripts (decoded with ``unicode-escape``).
     When neither is present the brand value passed to the loader is
     whatever the meta lookup returned.
     """
     product_loader = ProductLoader(Product(), response=response)
     product_id = response.xpath('//input[@id="prodid"]/@value').extract_first()
     if not product_id:
         self.logger.warning('No identifier for %s' % response.url)
         return

     product_loader.add_value('identifier', product_id)
     product_loader.add_value('url', response.url)
     product_loader.add_css('name', 'div.infotitle h1::text')
     product_loader.add_css('price', '.inline.price::text')
     product_loader.add_value('sku', product_id)

     photo_src = response.css('.photo::attr(src)').extract_first()
     if photo_src:
         product_loader.add_value('image_url', response.urljoin(photo_src))

     brand = response.xpath('//meta[@itemprop="brand"]/@content').extract_first()
     if not brand:
         manufacturers = response.xpath('//script/text()').re(
             '"manufacturer":"(.*?)"')
         if manufacturers:
             brand = manufacturers[0].decode('unicode-escape')
     product_loader.add_value('brand', brand)
     yield product_loader.load_item()
Esempio n. 12
0
    def parse_product(self, response):
        """Parse a product page and yield the product or its options.

        The base item takes its category from the third breadcrumb entry
        (the fourth when the third is ``All Categories``).  When a
        ``.price-public`` value is present, metadata with the recommended
        retail price is attached (empty string when it does not exceed the
        selling price).  If the page offers a ``product_id`` select, one
        item per option with a value is yielded instead of the base item.
        """
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)
        product_loader.add_css('name', '.heading-title-text::text')

        categories = response.css('.breadcrumb a::text').extract()[2:]
        category = categories.pop(0).strip() if categories else ''
        if category == 'All Categories':
            # Skip the placeholder entry; guard against it being the last
            # breadcrumb so pop() cannot raise on an empty list.
            category = categories.pop(0) if categories else ''
        product_loader.add_value('category', category)

        product_loader.add_xpath(
            'brand', '//*[@id="product-header-order-brand"]//img/@alt')
        product_loader.add_xpath('image_url',
                                 '//meta[@property="og:image"]/@content')
        product_loader.add_xpath('identifier',
                                 '//input[@name="product_id"]/@value')
        price = response.css('.price::text').extract_first()
        if price:
            product_loader.add_value('price', price.replace(' ', ''))
        # Extra price value of 0 (presumably a fallback when no price was
        # scraped -- confirm against the loader's output processor).
        product_loader.add_value('price', 0)
        stock = response.xpath('//script/text()').re('availability.+')
        if stock and 'InStock' not in stock[0]:
            product_loader.add_value('stock', 0)
        product_loader.add_xpath('sku', '//input[@name="product_id"]/@value')
        item = product_loader.load_item()

        metadata = CRCMeta()
        rrp = response.css('.price-public::text').extract_first()
        if rrp:
            rrp = extract_price(rrp)
            metadata['rrp'] = rrp if float(rrp) > float(item['price']) else ''
            item['metadata'] = metadata

        options = response.xpath('//select[@name="product_id"]/option')
        if not options:
            yield item
            return

        for opt in options:
            identifier = opt.xpath('@value').extract_first()
            if not identifier:
                continue
            option_loader = ProductLoader(item=Product(), selector=opt)
            option_loader.add_value(None, item)
            option_loader.replace_value('identifier', identifier)
            option_loader.replace_value('sku', identifier)
            option_loader.add_xpath('name', 'text()')
            option_price = response.xpath(
                '//div[@data-value="%s"]' % identifier
            ).css(
                '.alltricks-ChildSelector-customOptionPrice::text'
            ).extract_first()
            if option_price:
                # Same guard as for the base price: an option without its
                # own price element keeps the price copied from the base
                # item instead of failing on None.
                option_loader.replace_value('price',
                                            option_price.replace(' ', ''))
            stock_label = opt.xpath('@data-stock-label').extract_first()
            if stock_label == 'Out of stock':
                option_loader.replace_value('stock', 0)
            option_item = option_loader.load_item()
            option_item['metadata'] = metadata
            yield option_item
Esempio n. 13
0
    def parse_product(self, response):
        """Yield the product on this page, then request its variations.

        The item is always yielded.  A shipping cost of 4.99 is added when
        the loaded price is non-zero and at most 49.99.  Unless the response
        was itself produced by a variation request (``options_crawled`` in
        ``response.meta``), one form request is issued per primary option,
        or per primary/secondary pair when a second variation select
        exists; those responses come back to this same callback.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        loader = ProductLoader(item=Product(), selector=hxs)

        identifier = hxs.select(
            '//span[@class="ProductNo DisplayBlock SmallTopMargin"]/text()'
        ).re(r'Artikel-Nr\.: (.*)')
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)

        name = hxs.select('//h1[@itemprop="name"]/text()').extract()
        loader.add_value('name', name[0])

        loader.add_value('url', response.url)

        price = response.xpath('//span[@itemprop="price"]/text()').extract()
        price = price[0] if price else '0.00'
        loader.add_value('price', price)

        price = loader.get_output_value('price')
        # Compare Decimal with Decimal: a float threshold is only an
        # approximation of 49.99 and mixes numeric types.
        if price and Decimal(price) <= Decimal('49.99'):
            loader.add_value('shipping_cost', '4.99')

        category = hxs.select(
            '//a[@class="BreadcrumbItem"]/span/text()').extract()
        # First and last breadcrumb entries are left out.
        category = ' > '.join(category[1:-1] if len(category) > 2 else '')
        loader.add_value('category', category)

        image_url = hxs.select(
            '//div[@class="ProductImage"]//img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

        yield loader.load_item()

        if response.meta.get('options_crawled', False):
            # This response already is a variation; do not expand it again.
            log.msg('Option found: ' + response.url)
            return

        primary_options = hxs.select(
            '//select[@id="SelectedVariation0"]/option/@value').extract()
        # Does not depend on the primary option, so query it only once.
        secondary_options = hxs.select(
            '//select[@id="SelectedVariation1"]/option/@value').extract()

        for primary_option in primary_options:
            if secondary_options:
                selections = [[primary_option, secondary_option]
                              for secondary_option in secondary_options]
            else:
                selections = [primary_option]
            for selection in selections:
                formdata = {
                    'ChangeAction': 'SelectSubProduct',
                    'SelectedVariation': selection
                }
                yield FormRequest(response.url,
                                  dont_filter=True,
                                  formdata=formdata,
                                  meta={'options_crawled': True},
                                  callback=self.parse_product)
Esempio n. 14
0
    def parse_product(self, response):
        """Extract one product, with promotion dates and a timestamp.

        Pages that are not product pages (no ``body#product`` node and no
        ``body id="product"`` text in the raw body) are ignored.  Promotion
        start and end dates are stored in the metadata only when exactly
        two dates in ``dd-mm-YYYY`` format are found.  The extraction
        timestamp is always stored.
        """
        if (not response.xpath('//body[@id="product"]')
                and 'body id="product"' not in response.body):
            return

        promo_dates = response.xpath(
            '//div[@class="pl_promoinfo_product_promo"]/span[@class="date"]/text()'
        ).extract()
        promo_start = promo_end = None
        try:
            parsed_dates = [
                datetime.datetime.strptime(d, '%d-%m-%Y') for d in promo_dates
            ]
            promo_start, promo_end = parsed_dates
        except ValueError:
            # Deliberate best effort: a malformed date or a number of dates
            # other than two simply leaves the promotion period unset.
            pass

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('identifier',
                         '//input[@id="product_page_product_id"]/@value')
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        price = response.xpath(
            '//span[@id="our_price_display"]/text()').extract_first()
        if price:
            # extract_first() returns None when the price span is missing;
            # calling .replace() on it would abort the whole callback.
            loader.add_value('price', price.replace(' ', ''))
        loader.add_xpath('sku', '//span[@itemprop="sku"]/text()')
        loader.add_xpath('sku',
                         '//script/text()',
                         re="productReference='(.+?)'")

        category = response.css('.navigation_page ::attr(title)').extract()
        main_category = response.meta.get('category')
        # Prefix the category passed in by the caller unless the page
        # breadcrumb already starts with it.
        if not category or category[0].strip() != main_category:
            category = [main_category] + category
        loader.add_value('category', category)

        loader.add_xpath('image_url', '//img[@id="bigpic"]/@src')
        loader.add_xpath('brand', '//a[@itemprop="brand"]/span/text()')
        if not response.css('.primary_block .avail3'):
            loader.add_value('stock', 0)

        metadata = SonaeMeta()
        if promo_start and promo_end:
            metadata['promo_start'] = promo_start.strftime('%Y-%m-%d')
            metadata['promo_end'] = promo_end.strftime('%Y-%m-%d')
        metadata['extraction_timestamp'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M')

        item = loader.load_item()
        item['metadata'] = metadata
        yield item
Esempio n. 15
0
 def parse_product(self, response):
     """Yield one item per offer row, or the plain product without offers.

     Url, category, image and brand are shared by all items.  Each row of
     the ``.puu-ofrs`` table becomes an item whose name is the base name
     plus the first cell text and whose identifier/SKU is the base
     identifier plus the first number found in that cell.  When there are
     no rows and the page has an identifier, a single item is yielded
     from the page-level name and price.
     """
     base_loader = ProductLoader(response=response, item=Product())
     base_identifier = response.xpath('//input[@id="productID"]/@value').extract_first()
     base_loader.add_value('url', response.url)
     base_name = response.css('.puu-led h1::text').extract_first()
     category = response.css('.puu-rbn span::text').extract()
     base_loader.add_value('category', category[2:])
     image_url = response.css('.puu-vsl img::attr(src)').extract_first()
     if image_url:
         # urljoin() of a missing image would return the page URL itself
         # and store it as the image.
         base_loader.add_value('image_url', response.urljoin(image_url))
     base_loader.add_css('brand', '.puu-brand ::text')
     base_product = base_loader.load_item()

     options = response.css('.puu-ofrs tr')
     for option in options:
         label = option.xpath('td[1]/text()')
         option_loader = ProductLoader(selector=option, item=Product(base_product))
         option_loader.replace_value('name', base_name + ' ' + label.extract_first())
         identifier = base_identifier + '-' + label.re(r'\d+')[0]
         option_loader.replace_value('identifier', identifier)
         option_loader.replace_value('sku', identifier)
         option_loader.replace_css('price', '.puu-prc::text')
         yield option_loader.load_item()
     if options or not base_identifier:
         return

     # No offer rows: complete and emit the page-level product.
     base_loader.add_css('name', '.puu-prd h1::text')
     base_loader.add_value('identifier', base_identifier)
     base_loader.add_value('sku', base_identifier)
     base_loader.add_css('price', '.puu-prc::text')
     yield base_loader.load_item()
Esempio n. 16
0
    def parse_product(self, response):
        """Extract one product, mostly from values embedded in page scripts.

        Identifier, name, price, brand and SKU are taken from regular
        expression matches over the script texts; the SKU is the last
        ``ArtNbr`` match.  The category is at most the last three
        breadcrumb links after dropping the first and the last one.
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)
        loader = ProductLoader(item=Product(), selector=selector)
        scripts = '//script/text()'

        image_srcs = selector.select('//img[@property="image"]/@src').extract()
        identifiers = selector.select(scripts).re("'productId': *(.+),")
        names = selector.select(scripts).re("'name': *\"(.+)\"")
        loader.add_value('identifier', identifiers)
        loader.add_value('name', names)
        if image_srcs:
            loader.add_value('image_url', urljoin_rfc(page_base, image_srcs[0]))

        prices = selector.select(scripts).re("'price': *\"(.+)\"")
        art_numbers = selector.select(scripts).re('"ArtNbr":"(.+?)"')
        loader.add_value('sku', art_numbers[-1])
        loader.add_value('price', prices)
        loader.add_value('url', response.url)

        breadcrumb_links = selector.select(
            '//div[@class="breadcrumb gridle_container"]/a/text()').extract()
        loader.add_value('category', breadcrumb_links[1:-1][-3:])

        brands = selector.select(scripts).re("'brand': *\"(.+)\"")
        loader.add_value('brand', brands)
        yield loader.load_item()
Esempio n. 17
0
 def parse_product(self, response):
     """Scrape one product page and yield a single ``Product`` item.

     The numeric ``product_id`` found in inline script text is used as both
     identifier and sku. The category is whatever the requesting callback
     stored under ``response.meta['category']``.

     :param response: the product page response.
     :returns: a generator yielding one loaded item.
     """
     loader = ProductLoader(item=Product(), response=response)
     # Raw string: '\d' in a plain literal is an invalid escape sequence.
     identifier = response.xpath('//script/text()').re(r'product_id: (\d+)')
     loader.add_value('identifier', identifier)
     loader.add_value('sku', identifier)
     loader.add_value('url', response.url)
     name = response.css('.product_display_padding').xpath(
         '*[position()<3]//text()').extract()
     loader.add_value('name', name)
     loader.add_xpath('price', '//span[@id="p_price"]/text()')
     loader.add_value('category', response.meta.get('category'))
     loader.add_xpath('image_url', '//meta[@property="og:image"]/@content')
     if name:
         # The first text node of the name block is used as the brand; skip
         # it when nothing was extracted instead of raising IndexError.
         loader.add_value('brand', name[0])
     if response.xpath('//meta[@property="og:availability"]/@content').re(
             'out *of *stock'):
         loader.add_value('stock', 0)
     yield loader.load_item()
Esempio n. 18
0
    def parse_product(self, response):
        """Scrape a product page and yield one item per package option.

        The page-level fields (identifier, url, name, image, brand) are
        loaded once into a base item. Each row of the
        ``product-option-packages`` table then produces its own item that
        starts from the base fields, adds the row label to the name and
        replaces identifier, sku and price with the row's values. Every
        yielded item carries the page's promotion text in its metadata.

        :param response: the product page response.
        :returns: a generator yielding one item per option row.
        """
        base_loader = ProductLoader(Product(), response=response)
        base_identifier = response.xpath('//input[@name="product"]/@value').extract_first()
        base_loader.add_value('identifier', base_identifier)
        base_loader.add_value('url', response.url)
        base_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        image_url = response.xpath('//img[@id="image"]/@src').extract_first()
        if image_url:
            base_loader.add_value('image_url', response.urljoin(image_url))
        base_loader.add_xpath('brand', '//*[@itemprop="brand"]/text()')
        # Kept under its own name: every option must be seeded from the
        # page-level fields, never from the previous option's item.
        base_item = base_loader.load_item()

        promotion = response.xpath('//div[@id="advantages-of-registering-popup"]//p[contains(text(), "korting")]/text()').extract()
        promotion = promotion[0].strip() if promotion else ''

        for option in response.xpath('//table[@id="product-option-packages"]/tbody/tr'):
            loader = ProductLoader(Product(), selector=option)
            loader.add_value(None, base_item)
            option_identifier = option.xpath('.//@data-id').extract_first()
            loader.replace_value('identifier', option_identifier)
            loader.add_xpath('name', './/label/text()')
            price = option.css('.price::text').extract()
            # The last '.price' text node of the row is the one used.
            loader.replace_value('price', price[-1])
            loader.replace_value('sku', option_identifier)

            metadata = SpecSaversMeta()
            metadata['promotion'] = promotion
            option_item = loader.load_item()
            option_item['metadata'] = metadata
            yield option_item
Esempio n. 19
0
 def parse_product(self, response):
     """Scrape one product page, then follow links to sibling products.

     When the page has no ``data-product_id`` attribute on the
     ``product_description`` div, the same URL is re-requested with
     ``parse_cat`` as callback and nothing else is produced. Otherwise one
     item is yielded, followed by a request for every sibling link found
     in the collection slider and in the "siblings" blocks.
     """
     selector = HtmlXPathSelector(response)
     base_url = get_base_url(response)
     loader = ProductLoader(item=Product(), selector=selector)

     product_ids = selector.select(
         '//div[@id="product_description"]/@data-product_id').extract()
     if not product_ids:
         # No product id on the page: hand the URL over to parse_cat.
         yield Request(response.url, dont_filter=True, callback=self.parse_cat)
         return
     identifier = product_ids[0]

     loader.add_value('identifier', identifier)
     loader.add_xpath('sku', '//script/text()', re='"prdref","(.+)"')
     loader.add_value('url', response.url)
     loader.add_xpath('name', '//h1//text()', re='.+')
     loader.add_xpath('name', '//div/text()', re='Couleur.*:(.+)')
     loader.add_xpath('category', '//nav[@id="breadcrumb"]//a[position()>1]/span/text()')

     # Glue together the price text nodes belonging to this product id and
     # strip every whitespace character from the result.
     price_xpath = '//div[@class="product_container"]//div[@class="product-price"]/span[@data-product_id="%s"]//text()'
     price_text = ''.join(selector.select(price_xpath % identifier).extract())
     loader.add_value('price', ''.join(price_text.split()))

     loader.add_xpath('image_url', '//script/text()', re='"prdparam-image_url","(.+)"')
     can_add_to_cart = selector.select('//input[contains(@id, "addToCart")]')
     if not can_add_to_cart:
         loader.add_value('stock', '0')
     yield loader.load_item()

     sibling_links = (
         selector.select('//div[@id="slider_collection-container"]//a/@href').extract()
         + selector.select('//div[contains(@class, "siblings")]//a/@href').extract()
     )
     for href in sibling_links:
         yield Request(urljoin(base_url, href), callback=self.parse_product)
Esempio n. 20
0
 def parse_product(self, response):
     """Scrape one product page and yield a single ``Product`` item.

     The identifier (also used as sku) is the number sitting between the
     last ``-`` and ``.html`` in the page URL. If the URL does not have
     that shape, nothing is yielded.

     :param response: the product page response.
     :returns: a generator yielding at most one loaded item.
     """
     # Raw string: '\d' and '\.' are invalid escapes in a plain literal.
     match = re.search(r'-(\d+)\.html', response.url)
     if match is None:
         # Without the numeric id there is no identifier to key the product
         # on; skip the page instead of failing with AttributeError.
         return
     identifier = match.group(1)

     loader = ProductLoader(Product(), response=response)
     loader.add_value('identifier', identifier)
     loader.add_value('url', response.url)
     loader.add_css('name', 'div.titles h1 ::text')
     loader.add_css('price', '.rprice .value::text')
     loader.add_value('sku', identifier)
     loader.add_xpath('category',
                      '//div[@id="path"]//a[position()>1]/text()')
     loader.add_css('image_url', 'div#image img::attr(src)')
     loader.add_css('brand', 'h1 .brand-name::text')
     yield loader.load_item()
Esempio n. 21
0
 def parse_product(self, response):
     """Scrape one product page and yield a single ``Product`` item.

     Pages containing an ``<h5>`` with the text "under varemerket" are
     skipped. The last breadcrumb (after dropping the first) becomes the
     product name and up to three crumbs before it become the category.
     A shipping cost of 49 is added when the loaded price is below 1000.
     """
     skip_marker = response.xpath('//h5[contains(., "under varemerket")]')
     if skip_marker:
         return

     # Prefer the article id; fall back to the sku id when it is missing.
     product_id = response.xpath('//input[@id="articleId"]/@value').extract_first()
     if not product_id:
         product_id = response.xpath('//input[@id="skuId"]/@value').extract_first()

     loader = ProductLoader(Product(), response=response)
     for field in ('identifier', 'sku'):
         loader.add_value(field, product_id)
     loader.add_value('url', response.url)

     trail = response.css('.breadcrumbs a::text').extract()[1:]
     product_name = trail.pop()
     loader.add_value('name', product_name)
     loader.add_value('category', trail[-3:])

     loader.add_xpath('price', '//h3[@itemprop="price"]/@content')
     loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
     loader.add_css('brand', '.product-hero-brand img::attr(alt)')

     below_threshold = loader.get_output_value('price') < 1000
     if below_threshold:
         loader.add_value('shipping_cost', 49)
     yield loader.load_item()
Esempio n. 22
0
    def parse_products(self, response):
        """Yield one item per ``.listing_item`` element on a listing page.

        The category (all breadcrumb links but the first) is shared by every
        item on the page. The last path segment of the cleaned product URL
        is used as both identifier and sku. The merchant image's ``alt``
        text is stored in the item's metadata under ``'Dealer'``.

        :param response: the listing page response.
        :returns: a generator of loaded items.
        """
        category = response.css('.breadcrumbs').xpath(
            './/a/text()').extract()[1:]
        products = response.css('.listing_item')
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            image_url = product.css('.listing_item_image').xpath(
                'img/@src').extract_first()
            # extract_first() gives None when the listing has no <img>; a
            # bare substring test on it would raise TypeError.
            if image_url and 'noimage' not in image_url:
                loader.add_value('image_url', image_url)
            url = product.css('.listing_item_name').xpath(
                '@href').extract_first()
            url = url_query_cleaner(response.urljoin(url))
            sku = url.split('/')[-1]
            loader.add_value('identifier', sku)
            loader.add_value('sku', sku)

            loader.add_value('url', url)
            loader.add_xpath('name', './/a[@class="listing_item_name"]/text()')
            loader.add_xpath(
                'price', './/span[@class="listing_item_basic_price"]/text()')
            loader.add_value('category', category)
            shipping_cost = product.css('.listing_item_delivery_costs').xpath(
                'text()').extract_first()
            loader.add_value('shipping_cost', extract_price_eu(shipping_cost))
            # A missing availability node is treated as "no out-of-stock
            # notice" rather than crashing on None.
            availability = product.css('.listing_item_availability').xpath(
                'text()').extract_first() or ''
            if 'Non disponibile' in availability:
                loader.add_value('stock', 0)
            item = loader.load_item()
            dealer = product.css('.listing_item_merchant_name').xpath(
                'img/@alt').extract_first()
            item['metadata'] = {'Dealer': dealer}
            yield item