Exemple #1
0
 def parse_product(self, response):
     """Load the product on this page and, unless it is out of stock,
     submit the ``orderform`` form with ``parse_shipping`` as callback.

     The stock figure is read from the ``product_avail = N;`` script.
     When it is ``'0'`` the item is yielded directly; otherwise a copy of
     the item travels in the form request's meta.
     """
     loader = ProductLoader(Product(), response=response)
     loader.add_xpath('identifier', '//input[@name="productid"]/@value')
     loader.add_value('url', response.url)
     loader.add_css('name', '.descr::text')
     loader.add_css('price', 'span.currency::text')
     loader.add_value('sku', response.meta['sku'])

     thumbnail = response.css('img#product_thumbnail::attr(src)')
     thumbnail_src = thumbnail.extract_first()
     if thumbnail_src:
         loader.add_value('image_url', response.urljoin(thumbnail_src))
     loader.add_value('brand', response.meta['brand'])

     # Indexing raises IndexError when the availability script is absent.
     availability = response.css('.quantity script::text').re(
         r'product_avail = (\d+);')
     stock = availability[0]
     loader.add_value('stock', stock)

     item = loader.load_item()
     if stock != '0':
         # One cookie jar per product identifier.
         shipping_meta = {
             'cookiejar': item['identifier'],
             'item': Product(item),
         }
         yield FormRequest.from_response(
             response,
             formname='orderform',
             meta=shipping_meta,
             cookies=self.cookies,
             callback=self.parse_shipping,
             dont_filter=True)
     else:
         yield item
Exemple #2
0
    def parse_product_base(self, response):
        """Parse a product page into a single ``Product`` item.

        Returns ``None`` when the page has no product title, otherwise the
        loaded item. The brand is the first entry of ``self.brands`` that
        occurs in the category or that the product name starts with
        (case-insensitive). ``stock`` is set to 0 only when an availability
        label is present and is not "In Stock".
        """
        base_url = get_base_url(response)

        breadcrumb = response.css('div.breadcrumb a span::text').extract()
        # The last breadcrumb entry is used as the category.
        category = breadcrumb[-1].strip() if breadcrumb else ''

        name = response.css('div.primary_block h1::text').extract_first()
        if name is None:
            # No title on the page: nothing to load. An explicit check
            # replaces the former bare ``except:`` so that unrelated
            # errors are no longer silently swallowed.
            return
        name = name.strip()

        product_brand = ''
        name_lower = name.lower()
        category_lower = category.lower()
        for brand in self.brands:
            brand_lower = brand.lower()
            if (brand_lower in category_lower
                    or name_lower.startswith(brand_lower)):
                product_brand = brand
                break

        product_url = urljoin_rfc(base_url, response.url)

        images = response.xpath(
            '//div[@id="image-block"]//img[@itemprop="image"]/@src').extract()

        product_id = re.search(r'var id_product\D+(\d+)', response.body)

        prices = response.xpath(
            '//span[@id="our_price_display"]//text()').extract()
        price = prices[-1] if prices else '0.00'

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', product_url)
        loader.add_value('name', name)
        loader.add_value('brand', product_brand)
        if images:
            # As before, the last matching image wins; a page without an
            # image no longer aborts the whole product with IndexError.
            loader.add_value('image_url', urljoin_rfc(base_url, images[-1]))
        loader.add_value('price', price.replace(' ', '').replace(',', '.'))
        loader.add_value('category', category)
        loader.add_xpath('sku', '//p[@id="product_reference"]/span/text()')

        if product_id:
            loader.add_value('identifier', product_id.group(1))
        else:
            # Fall back to the hidden form field when the script variable
            # is not present in the page body.
            loader.add_xpath('identifier',
                             '//form//input[@name="id_product"]/@value')

        stock = response.xpath(
            '//span[@id="availability_value"]/text()').extract_first()
        if stock and stock.title() != 'In Stock':
            loader.add_value('stock', 0)

        return loader.load_item()
Exemple #3
0
    def parse_product(self, response):
        """Yield one product with its promotion text attached as metadata.

        The category is fixed to 'Kontaktlinser'. Identifier and SKU both
        come from the first ``ecomm_prodid`` value in the page body.
        """
        base_url = get_base_url(response)  # NOTE(review): never used below

        title_parts = response.xpath(
            '//h1[@class="product-view__title"]/span/text()').extract()
        name = ' '.join([part.strip() for part in title_parts])

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_xpath(
            'price',
            '//div[contains(@class, "product-view__total-price")]/@data-price')
        # NOTE(review): this reads the image's @alt, not @src -- confirm
        # that the site really carries the URL there.
        image_alts = response.xpath('//img[@itemprop="image"]/@alt').extract()
        if image_alts:
            loader.add_value('image_url', 'http:' + image_alts[0])
        loader.add_xpath(
            'brand',
            '//div[@class="product-view__brand brand"]/img[@class="brand__image"]/@alt')
        loader.add_value('category', 'Kontaktlinser')
        loader.add_value('url', response.url)
        # Raises IndexError when the page body carries no ecomm_prodid.
        product_id = re.findall(r'"ecomm_prodid":"(\d+)","', response.body)[0]
        for field in ('identifier', 'sku'):
            loader.add_value(field, product_id)

        metadata = SpecSaversMeta()
        promo_texts = response.xpath(
            '//section[contains(@class, "product-view--product-page")]//figcaption[@class="splash__inner"]//text()'
        ).extract()
        # Blank fragments are dropped; no fragments at all gives ''.
        stripped = [text.strip() for text in promo_texts]
        metadata['promotion'] = ' '.join(
            [text for text in stripped if text != ''])

        item = loader.load_item()
        item['metadata'] = metadata
        yield item
Exemple #4
0
    def parse_product(self, response):
        """Yield one item per option row, or a single price-0 item when the
        page has no option rows.

        The base identifier/SKU is the first four-digit run in the URL;
        each option row overrides identifier, SKU, price and name.
        """
        base_loader = ProductLoader(Product(), response=response)
        # Raises AttributeError when the URL has no four-digit run.
        url_code = re.search(r'\d\d\d\d', response.url).group(0)
        base_loader.add_value('identifier', url_code)
        base_loader.add_value('sku', url_code)
        base_loader.add_value('url', response.url)
        base_loader.add_xpath('name', '//header[@class="prodCat"]/h1/text()')

        # Linked breadcrumbs minus the first one, plus the unlinked last entry.
        breadcrumb = response.css('.bread li a::text').extract()[1:]
        breadcrumb.extend(
            response.css('.bread li:last-child::text').extract())
        base_loader.add_value('category', breadcrumb)

        image_href = response.css('.detimg a::attr(href)').extract_first()
        if image_href:
            base_loader.add_value('image_url', response.urljoin(image_href))
        base_item = base_loader.load_item()

        rows = response.css('.tbl').xpath('.//*[@class="tr"]')
        if rows:
            for row in rows:
                row_loader = ProductLoader(Product(), selector=row)
                # Start from the base item's fields, then override per row.
                row_loader.add_value(None, base_item)
                row_code = row.xpath('.//input/@name').extract_first()
                row_loader.replace_value('identifier', row_code)
                row_loader.replace_value('sku', row_code)
                # The "now" price is loaded first; the plain cell text is
                # appended after it.
                row_loader.replace_css('price', '.tc-price .pr-now::text')
                row_loader.add_css('price', '.tc-price::text')
                row_loader.replace_css('name', '.tc-title::text')
                yield row_loader.load_item()
        else:
            base_item['price'] = 0
            yield base_item
Exemple #5
0
    def parse_product(self, response):
        """Yield the items ``parse_options`` produces for the main product
        article (if present) and for every ``bdp-item`` article.

        A single loader carrying url, brand and category is shared by all
        calls; only its ``image_url`` is replaced before each one.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        loader = ProductLoader(selector=hxs, item=Product())
        loader.add_value('url', response.url)
        loader.add_xpath(
            'brand', './/dt[text()="Brand"]/following-sibling::dd[1]/text()')
        loader.add_xpath('category',
                         './/div[contains(@class, "breadcrumbs")]//a/text()')

        def options_for(node, image_xpath, script_xpath):
            # Point the shared loader at this node's first image (raises
            # IndexError if none), then hand its scripts to parse_options.
            image_src = node.select(image_xpath).extract()[0]
            loader.replace_value('image_url', urljoin(base_url, image_src))
            scripts = node.select(script_xpath).extract()
            return self.parse_options(node, base_url, loader, scripts)

        if hxs.select('//article[@id="product"]'):
            main_items = options_for(
                hxs,
                './/div[@id="amplienceContent"]//img/@src',
                '//script[@type="text/javascript"]/text()[contains(., "productData")]')
            for item in main_items:
                yield item

        for article in hxs.select('//article[@class="bdp-item"]'):
            article_items = options_for(
                article,
                './/a[contains(@id, "mainImage")]/img/@src',
                './div/div[1]//script[@type="text/javascript"]/text()')
            for item in article_items:
                yield item
Exemple #6
0
    def parse_product(self, response):
        """Yield one product, or nothing when the page has no bold title.

        Identifier and SKU prefer the ``data-flix-ean`` / ``data-flix-mpn``
        script attributes and fall back to the text next to the "Model"
        label. Stock is set to 0 unless the Central Warehouse text contains
        'Available'.
        """
        flix = '//script[@type="text/javascript"]/@data-flix-%s'
        titles = response.xpath('//td/div[@align="center"]/b/text()').extract()
        if not titles:
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', titles[0].strip(' ,'))
        loader.add_value('url', response.url)

        # Keep only EAN values that are not blank.
        identifier = [ean for ean in response.xpath(flix % 'ean').extract()
                      if ean.strip()]
        if not (identifier and identifier[0]):
            identifier = response.xpath(
                '//b[contains(text(), "Model :")]/../text()[1]').extract()

        sku = response.xpath(flix % 'mpn').extract()
        if not (sku and sku[0]):
            sku = response.xpath(
                '//b[contains(text(), "Model")]/../text()[1]').extract()

        loader.add_value('identifier', identifier)
        loader.add_value('sku', sku)
        loader.add_value(
            'price',
            re.findall(u'POST.+?> *&#8364;(.+?) *<', response.body))
        loader.add_xpath('category', '//h8//a[position()>1]/text()')
        loader.add_xpath('brand', flix % 'brand')

        warehouse_text = response.xpath(
            '//button[@value="Central Warehouse"]/../text()').extract_first()
        if not (warehouse_text and 'Available' in warehouse_text):
            loader.add_value('stock', 0)

        item = loader.load_item()
        if response.xpath('//img[@alt="Exdisplay"]'):
            item['metadata'] = {'Ex Display': 'Ex Display'}

        yield item
Exemple #7
0
 def parse_doors(self, response):
     """Yield one product per ``itemprop="offers"`` block on the page.

     Identifiers are taken positionally from the ``ecomm_prodid`` array
     found in the page scripts: the i-th offer block gets the i-th id
     (IndexError if the array is missing or shorter than the offer list).
     A block without its own link falls back to the canonical URL. Stock
     is set to 0 unless an InStock availability link is present, and a
     shipping cost of 36 is added when the loaded price is below 750.
     """
     import ast  # function-scope: only this callback needs it

     url = response.xpath('//link[@rel="canonical"]/@href').extract()
     category = response.xpath(
         '//p[@class="breadcrumbs"]/a[position()>1]/text()').extract()
     ids = response.xpath('//script/text()').re(r'ecomm_prodid.*(\[.+\])')
     # The array text comes from the scraped page, i.e. untrusted input.
     # Parse it as a plain literal instead of executing it with eval();
     # literal lists of strings/numbers evaluate to the same value.
     ids = ast.literal_eval(ids[0])
     for i, product in enumerate(
             response.xpath('//div[@itemprop="offers"]')):
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('name', './/h3[@itemprop="name"]/a/text()[1]')
         loader.add_value('identifier', ids[i])
         loader.add_value('sku', ids[i])
         loader.add_xpath('price', './/span[@itemprop="price"]/text()')
         local_url = product.xpath(
             './/h3[@itemprop="name"]/a/@href').extract()
         if local_url:
             local_url = response.urljoin(local_url[0])
         else:
             local_url = url
         loader.add_value('url', local_url)
         image_url = product.xpath('.//a/img/@src').extract()
         if image_url:
             # An offer without an image used to abort the whole page
             # with IndexError; now it simply gets no image_url.
             loader.add_value('image_url', response.urljoin(image_url[0]))
         loader.add_value('category', category)
         if not product.xpath(
                 'link[@itemprop="availability"][@href="http://schema.org/InStock"]'
         ):
             loader.add_value('stock', 0)
         if loader.get_output_value('price') < 750:
             loader.add_value('shipping_cost', 36)
         yield loader.load_item()
Exemple #8
0
    def parse_product(self, response):
        """Yield one product built from the product-view page.

        The category is fixed to 'Kontaktlinser'. Identifier and SKU both
        come from the first ``ecomm_prodid`` value in the page body.
        """
        base_url = get_base_url(response)  # NOTE(review): never used below

        title_parts = response.xpath(
            '//h1[@class="product-view__title"]/span/text()').extract()
        name = ' '.join([part.strip() for part in title_parts])

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_xpath(
            'price',
            '//div[contains(@class, "product-view__total-price")]/@data-price')
        # NOTE(review): this reads the image's @alt, not @src -- confirm
        # that the site really carries the URL there.
        image_alts = response.xpath('//img[@itemprop="image"]/@alt').extract()
        if image_alts:
            loader.add_value('image_url', 'http:' + image_alts[0])
        loader.add_xpath(
            'brand',
            '//div[@class="product-view__brand brand"]/img[@class="brand__image"]/@alt'
        )
        loader.add_value('category', 'Kontaktlinser')
        loader.add_value('url', response.url)
        # Raises IndexError when the page body carries no ecomm_prodid.
        product_id = re.findall(r'"ecomm_prodid":"(\d+)","', response.body)[0]
        for field in ('identifier', 'sku'):
            loader.add_value(field, product_id)

        yield loader.load_item()
Exemple #9
0
    def parse_product(self, response):
        """Yield one item per option row of the summary table.

        Name and SKU come from the page's structured product data. The
        identifier is the first class token of the row, the price is the
        row's ``data-price``, and stock is 0 unless the row's class
        contains 'instock'.
        """
        data = SpiderSchema(response).get_product()

        rows = response.xpath(
            '//div[@class="summary-container"]/table//tr[not(th)]')
        for row in rows:
            loader = ProductLoader(item=Product(), response=response)

            label = row.xpath(
                './/td[contains(@class,"optionscol")]/text()')[0].extract()
            full_name = u'{} - {}'.format(data['name'], label)
            row_class = row.xpath('@class')
            row_id = row_class[0].extract().split(' ')[0]
            row_price = row.xpath('@data-price').extract()

            loader.add_value('name', full_name)
            loader.add_value('url', response.url)
            loader.add_value('sku', data['sku'])
            loader.add_value('identifier', row_id)
            if 'image' not in data:
                # No image in the structured data: use the page's meta tag.
                loader.add_xpath('image_url',
                                 '//meta[@itemprop="og:image"]/@content')
            else:
                loader.add_value('image_url', data['image'])
            if not row_class.re('instock'):
                loader.add_value('stock', 0)
            loader.add_value('price', row_price)
            loader.add_css('category',
                           'div.product_meta span.posted_in a::text')

            yield loader.load_item()
Exemple #10
0
 def parse_product(self, response):
     """Yield one product scraped from the raw page body with regexes.

     Nothing is yielded when no ``tier_price_total`` value is found. The
     first such value, quantized to two decimals, is the price; a shipping
     cost of '4.98' is added when the loaded price is below 70.
     """
     body = response.body
     label = '<span class="technical_label">%s:</span><a href.+?>(.+?)</a'

     loader = ProductLoader(item=Product(), response=response)
     loader.add_value('identifier', re.findall(r'product_id.+?(\d+)', body))
     loader.add_xpath('url', '//link[@rel="canonical"]/@href')
     loader.add_value('name', re.findall('"name":"(.+?)"', body))

     tier_prices = re.findall(r'tier_price_total".+?([\d.]+)', body)
     if not tier_prices:
         return
     loader.add_value(
         'price', Decimal(tier_prices[0]).quantize(Decimal('.01')))
     loader.add_value('sku', re.findall(r'product_id.+?(\d+)', body))

     # "Lenstype" is preferred; "Producttype" is the fallback label.
     category = re.findall(label % 'Lenstype', body)
     if not category:
         category = re.findall(label % 'Producttype', body)
     loader.add_value('category', category)

     loader.add_value(
         'image_url',
         re.findall(r'<img src="(\S+media/catalog/product\S+)"', body))
     loader.add_value('brand', re.findall(label % 'Merk', body))

     if loader.get_output_value('price') < 70:
         loader.add_value('shipping_cost', '4.98')
     yield loader.load_item()
Exemple #11
0
 def parse_product(self, response):
     """Yield one product from the page.

     The name is all ``<h1>`` text with whitespace collapsed; the category
     is the breadcrumb trail minus its first entry. Stock is set to 0
     unless the default in-stock span contains "in Stock".
     """
     loader = ProductLoader(Product(), response=response)
     loader.add_value(
         'identifier',
         response.xpath('//script/text()').re(r'ecomm_prodid: *(\d+),'))
     loader.add_value('url', response.url)

     heading = ''.join(response.xpath('//h1//text()').extract())
     loader.add_value('name', ' '.join(heading.split()))
     loader.add_css('price', 'span.GBP::attr(content)')
     loader.add_xpath('sku', '//span[@id="js-product-reference"]/@data-ref')

     crumbs = response.xpath(
         '//div[contains(@class, "breadcrumb")]//a/span/text()').extract()
     loader.add_value('category', crumbs[1:])

     zoom_links = response.xpath(
         '//a[@class="product__image__zoom-link"]/@href').extract()
     if zoom_links:
         image_url = response.urljoin(zoom_links[0])
     else:
         image_url = ''
     loader.add_value('image_url', image_url)

     brand_texts = response.xpath(
         '//span[@class="product-content__title--brand"]/text()').extract()
     if brand_texts:
         brand = brand_texts[0].strip()
     else:
         brand = ''
     loader.add_value('brand', brand)

     in_stock = response.xpath(
         '//span[@id="js-product-in-stock-default" and contains(text(), "in Stock")]'
     )
     if not in_stock:
         loader.add_value('stock', 0)
     yield loader.load_item()
Exemple #12
0
 def parse_product(self, response):
     """Yield one item per row of the product table.

     Page-level fields are loaded once into a base item and copied into
     every row's loader. Each row then gets its own SKU (the text in
     parentheses in the product code), an identifier of the form
     ``<sku>-<md5 of base name + row name>``, and its own price.
     """
     base_loader = ProductLoader(Product(), response=response)
     base_loader.add_value('url', response.url)
     base_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
     page_code = response.xpath(
         '//div[@itemprop="description"]/div/div[last()]/text()'
     ).extract_first()
     base_loader.add_value('identifier', page_code)
     base_loader.add_value('sku', page_code)

     crumbs = response.css('.breadcrumbs a::text').extract()[1:]
     crumbs.extend(
         response.css('.breadcrumbs li:last-of-type::text').extract())
     base_loader.add_value('category', crumbs)

     main_image = response.css(
         'img.gallery-main-image::attr(src)').extract_first()
     if main_image:
         base_loader.add_value('image_url', response.urljoin(main_image))
     if not response.css('.in-stock'):
         base_loader.add_value('stock', 0)
     base_item = base_loader.load_item()

     for row in response.css('table.product-table tbody tr'):
         row_loader = ProductLoader(Product(), selector=row)
         row_loader.add_value(None, base_item)

         row_sku = row.css('span.product-code::text').re(r'\((.+)\)')[0]
         row_name = row.css('span.product-name::text').extract_first()
         digest = hashlib.md5(base_item['name'] + row_name).hexdigest()
         row_loader.replace_value('identifier', '-'.join((row_sku, digest)))
         row_loader.replace_value('sku', row_sku)

         row_loader.add_css('price', 'span.product-price-rrp')
         # The last text node of the price cell then replaces the
         # price collected so far.
         price_cell = row.css('td.product-price')
         row_loader.replace_value(
             'price', price_cell.xpath('text()[last()]').extract_first())

         # Append the row name unless the base name already contains it.
         if row_name not in base_item['name']:
             row_loader.add_value('name', row_name)
         yield row_loader.load_item()
         
         
Exemple #13
0
    def parse_product(self, response):
        """Yield the product on this page, or requests for its options.

        When an option selector exists but no option is selected, every
        ``data-href`` it carries is requested (query string cleaned) with
        this same callback and no item is produced. Otherwise a single
        item is loaded from the page.
        """
        selects = response.css('.pg_select')
        if selects and not selects.xpath('option[@selected]'):
            for href in selects.xpath('.//@data-href').extract():
                yield Request(response.urljoin(url_query_cleaner(href)),
                              self.parse_product)
            return

        loader = ProductLoader(Product(), response=response)
        form_sku = response.xpath(
            '//div[@id="content"]//input[@name="sku"]/@value').extract_first()
        for field in ('identifier', 'sku'):
            loader.add_value(field, form_sku)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//strong[@itemprop="name"]/text()')

        # Price candidates are collected in this order.
        for price_css in ('div.show h5 ::text',
                          '.nowPrice ::text',
                          '.typicalPrice h5 ::text'):
            loader.add_css('price', price_css)

        details = response.xpath('//input[@name="productDetailsDTO"]/@value')
        category_paths = details.re('"category":"(.+?)"')
        if category_paths:
            loader.add_value('category', category_paths[0].split('/'))

        gallery_href = response.css(
            'ul#galleryImages a::attr(href)').extract_first()
        if gallery_href:
            loader.add_value('image_url', response.urljoin(gallery_href))

        loader.add_xpath(
            'brand',
            '//span[@itemprop="brand"]//span[@itemprop="name"]/text()')
        if response.css('div#content p.oos'):
            loader.add_value('stock', 0)
        yield loader.load_item()
Exemple #14
0
 def parse_product(self, response):
     """Yield the base product, or one item per variation.

     Pages without a ``.variations`` element produce the base product
     only. Otherwise each entry of the form's ``data-product_variations``
     JSON becomes a copy of the base product with the option labels
     appended to its name and its own price and identifier.
     """
     product_ids = response.xpath('//div[@itemscope]/@id').re('product-(.+)')

     loader = ProductLoader(item=Product(), response=response)
     loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
     loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
     loader.add_xpath('url', '//link[@rel="canonical"]/@href')
     loader.add_value(
         'category', response.css('.breadcrumb a::text').extract()[1:])
     loader.add_value('brand', response.meta['brand'])
     loader.add_xpath('image_url', '//div/@data-original-img')
     loader.add_value('identifier', product_ids)
     product = loader.load_item()

     if response.css('.variations'):
         raw_variations = response.xpath(
             '//form/@data-product_variations').extract_first()
         for variation in json.loads(raw_variations):
             variation_loader = ProductLoader(item=Product(product),
                                              response=response)
             option_values = variation['attributes'].values()
             # Start from the base name, then append each option's label.
             variation_loader.replace_value('name', product['name'])
             for option_value in option_values:
                 variation_loader.add_xpath(
                     'name', '//option[@value="%s"]/text()' % option_value)
             variation_loader.replace_value('price',
                                            variation['display_price'])
             variation_loader.replace_value('identifier',
                                            variation['variation_id'])
             yield variation_loader.load_item()
     else:
         yield product
Exemple #15
0
 def parse_product(self, response):
     """Yield the product on this page, retrying while it has no identifier.

     If the stock-code span is empty the same request is re-issued
     (bypassing the duplicate filter) with a ``retries`` counter in its
     meta. Once the counter exceeds 9 the page is given up on with a
     warning and no item is produced, since it would have no identifier.
     """
     identifier = response.css('span#thisstkcode::text').extract_first()
     if not identifier:
         retries = response.meta.get('retries', 0)
         if retries > 9:
             # Retry budget exhausted: stop here instead of looping forever.
             self.logger.warning('No identifier found on %s', response.url)
             return
         self.logger.debug('Retry %s to get identifier', response.url)
         # Copy so the counter is set on the new request only.
         meta = dict(response.meta)
         meta['retries'] = retries + 1
         # dont_filter must be a keyword argument; the previous positional
         # string 'dont_filter=True' made Request.replace() raise TypeError.
         yield response.request.replace(dont_filter=True, meta=meta)
         return

     loader = ProductLoader(Product(), response=response)
     loader.add_value('identifier', identifier)
     loader.add_value('sku', identifier)
     loader.add_value('url', response.url)
     loader.add_xpath('name', '//h1/text()')
     price = response.css('span.prodPrice').xpath(
         './/span[@itemprop="price"]/text()').extract_first()
     loader.add_value('price', price)
     # The first breadcrumb entry is skipped.
     category = response.css('.breadcrumbs span::text').extract()[1:]
     loader.add_value('category', category)
     loader.add_css('image_url', '.main-product-photo::attr(href)')
     loader.add_css('brand', 'span#thisbrand::text')
     loader.add_css('stock', 'input#data-stock-qty::attr(value)')
     yield loader.load_item()
Exemple #16
0
 def parse_product(self, response):
     """Yield the product on this page, or requests for a product range.

     A page that links to ``#product-range`` only produces one request
     per range entry, with this same callback. Otherwise a single
     "Lakeland" product is loaded; a shipping cost of 2.99 is set when
     its price is below 30.
     """
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)

     if hxs.select('//a[@href="#product-range"]'):
         range_links = hxs.select(
             '//section[contains(@class, "product-range")]//div/a/@href'
         ).extract()
         for href in range_links:
             yield Request(urljoin(base_url, href),
                           callback=self.parse_product)
         return

     loader = ProductLoader(item=Product(), selector=hxs)
     loader.add_xpath('name', '//h1[@class="fn c-both"]/text()')
     # NOTE(review): the second tuple entry, '0', is also passed to
     # add_xpath as an expression -- presumably a default price; confirm.
     loader.add_xpath('price',
                      ('//span[@class="cta now-price"]/text()', '0'))
     if not hxs.select('//select[@id="quantity"]'):
         loader.add_value('stock', 0)

     # Drop the first two and the last breadcrumb entries, then the two
     # generic section names if present.
     crumbs = hxs.select(
         '//section[@class="breadcrumbs"]//a/text()').extract()[2:-1]
     for generic in ('in the kitchen', 'baking'):
         if generic in crumbs:
             crumbs.remove(generic)
     loader.add_value('category', crumbs)

     loader.add_value('brand', "Lakeland")
     for field in ('identifier', 'sku'):
         loader.add_xpath(field, '//meta[@name="productcode"]/@content')
     loader.add_xpath('image_url', '//img[@class="main-image"]/@src')
     loader.add_value('url', response.url)

     product = loader.load_item()
     # A missing price counts as 30, i.e. no shipping cost is set.
     if product.get('price', 30) < 30:
         product['shipping_cost'] = 2.99
     yield product
Exemple #17
0
    def parse_product(self, response):
        """Yield the product only if it matches the searched SKU and brand.

        The page is skipped when its part number (stripped, upper-cased)
        differs from ``meta['sku']`` or when its brand (upper-cased,
        stripped) is not in ``meta['brands']``. A shipping cost of 5 is
        set when the price is below 50.
        """
        brand = response.meta['brand']
        brands = response.meta['brands']

        loader = ProductLoader(Product(), response=response)

        wanted_sku = response.meta['sku']
        sku = response.css('.part-number strong::text').extract_first()
        if not (sku and sku.strip().upper() == wanted_sku):
            return

        brand_cells = response.xpath(
            '//tr[th[contains(text(), "Brand")]]/td[contains(@class, "data")]/text()'
        ).extract()
        page_brand = brand_cells[0]
        if page_brand.upper().strip() not in brands:
            return

        loader.add_value('identifier', sku)
        loader.add_value('url', response.url)
        loader.add_css('name', '.product-name .h1::text')
        loader.add_xpath(
            'price', '//span[contains(@id, "price-excluding-tax")]/text()')
        loader.add_value('sku', sku)
        loader.add_value(
            'category', response.css('.breadcrumbs a::text').extract()[1:])
        loader.add_css('image_url', 'img#image-main::attr(src)')
        # The brand stored is the one passed in meta, not the page's text.
        loader.add_value('brand', brand)
        if response.css('.availability .out-of-stock'):
            loader.add_value('stock', 0)

        item = loader.load_item()
        if item['price'] < 50:
            item['shipping_cost'] = 5
        yield item
Exemple #18
0
    def parse_product(self, response):
        """Yield one product from a lens page.

        The name is the heading plus the model name when one is present.
        The price prefers the quantity-1 row of the tiered price box and
        falls back to the ``itemprop="price"`` meta tag. Identifier and
        SKU both come from ``productsId`` in the page body.
        """
        base_url = get_base_url(response)  # NOTE(review): never used below

        headings = response.xpath(
            '//div[@class="lensname"]/h1/text()').extract()
        name = headings[0].strip()
        models = response.xpath(
            '//div[@class="lensname"]/span[@class="name-model"]/text()'
        ).extract()
        if models:
            name = ' '.join((name, models[0]))

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)

        price = response.xpath(
            '//div[@id="tiered_box_red"]//tr[td[text()="1"]]/td/strong/text()'
        ).extract()
        if not price:
            meta_prices = response.xpath(
                '//meta[@itemprop="price"]/@content').extract()
            price = meta_prices[0]
        loader.add_value('price', price)

        image_srcs = response.xpath('//img[@itemprop="image"]/@src').extract()
        if image_srcs:
            loader.add_value('image_url', image_srcs[0])
        loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        loader.add_value(
            'category',
            response.xpath('//div[@id="prodBreadCrumbs"]/a/text()').extract())
        loader.add_value('url', response.url)

        # Raises IndexError when the page body carries no productsId.
        product_id = re.findall(r'productsId = "(\d+)";', response.body)[0]
        for field in ('identifier', 'sku'):
            loader.add_value(field, product_id)

        yield loader.load_item()
Exemple #19
0
 def parse_product(self, response):
     """Request every variant page, then yield the product on this page.

     Variant data is an escaped JSON object embedded in a script. Each of
     the three ``VariantNSelected`` values that is set (and not 'N/A') is
     appended to the name and recorded in the metadata under its header;
     a value containing 'size' also sets ``metadata['size']``.
     """
     raw_data = response.xpath('//script/text()').re('{\\\\"Variants.+}')[0]
     data = json.loads(raw_data.replace('\\"', '"'))
     for variant in data['Variants']:
         variant_url = response.urljoin(variant['ProductPLU'])
         yield Request(make_variant_url(variant_url), self.parse_product)

     loader = ProductLoader(item=Product(), response=response)
     plu = response.xpath('//input[@id="ProductPLU"]/@value').extract_first()
     for field in ('identifier', 'sku'):
         loader.add_value(field, plu)
     loader.add_value('url', response.url)
     loader.add_xpath('name', '(//h1[@itemprop="name"]/text())[1]')

     metadata = {}
     for number in (1, 2, 3):
         selected = data['Variant%dSelected' % number]
         if not selected or selected == 'N/A':
             continue
         loader.add_value('name', selected)
         metadata[data['Variant%dHeader' % number]] = selected
         if 'size' in selected.lower():
             # Everything after the first five characters is the size.
             metadata['size'] = selected[5:].strip()

     prices = response.css('.price-value .currency::text').extract()
     # The last matching price is used; IndexError if there is none.
     loader.add_value('price', prices[-1])
     crumbs = response.css('.breadcrumb a::text').extract()
     loader.add_value('category', crumbs[1:])
     loader.add_css('image_url', '.product-image::attr(src)')
     loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
     loader.add_value('shipping_cost',  '7.95')

     availability = response.css(
         '.product-stock-widget::attr(ng-init)').re(r'AvailableOnline: (\w+)')
     if availability[0] != 'true':
         loader.add_value('stock', 0)

     item = loader.load_item()
     item['metadata'] = metadata
     yield item
Exemple #20
0
    def parse_product(self, response):
        """Yield one item per option combination whose SKU is in ``self.skus``.

        The product id is the trailing ``p<digits>`` of the query-cleaned
        URL. Option data is fetched from the site's
        ``/ajax/get_product_options/<id>`` endpoint with ``urlopen``: once
        to list the attributes, and once per combination of attribute
        values to get that combination's selection (SKU, price, title).
        Each yielded item carries ``AndrewJamesMeta`` metadata with the
        ASIN looked up in ``self.skus``.
        """
        base_sku = response.xpath('//@data-ref').extract_first()
        # Raises AttributeError when the URL does not end in p<digits>.
        identifier = re.search('p(\d+)$',
                               url_query_cleaner(response.url)).group(1)
        url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(
            identifier)
        # NOTE(review): urlopen is a synchronous call made from inside the
        # callback -- confirm this is acceptable for this spider.
        data = json.load(urlopen(url))
        attributes = [attr['values'] for attr in data['attributes']]
        if [] in attributes:
            # Some attribute came back with no values: query again with the
            # first value of the first attribute selected. Presumably the
            # later attributes depend on that choice -- verify.
            url = add_or_replace_parameter(url, 'attributes[1]',
                                           attributes[0][0]['value_id'])
            data = json.load(urlopen(url))
            attributes = [attr['values'] for attr in data['attributes']]
        # Every combination of one value per attribute.
        variants = itertools.product(*attributes)
        for variant in variants:
            # Rebuild the endpoint URL from scratch for this combination.
            url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(
                identifier)
            for idx, option in enumerate(variant):
                url = add_or_replace_parameter(
                    url, 'attributes[{0}]'.format(idx + 1), option['value_id'])
            data = json.load(urlopen(url))
            # Indexing .values() relies on it returning a list (Python 2).
            selection = data['selection'].values()[0]
            sku = selection['reference'].strip()
            # An empty reference falls back to the page-level SKU, but only
            # while that SKU has not been emitted yet.
            if not sku and base_sku not in self.skus_found:
                sku = base_sku
            if sku not in self.skus.keys():
                continue
            # Duplicates are only logged; the item is still yielded.
            if sku in self.skus_found:
                self.logger.info('Duplicated SKU is found: %s' % sku)
            self.skus_found.add(sku)

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('sku', sku)
            loader.add_value('identifier', selection['product_id'])
            loader.add_xpath('name', '//span[@id="js-product-title"]/text()')
            loader.add_value('name', [option['value'] for option in variant])
            # NOTE(review): replace_value presumably discards the two name
            # values added just above, leaving only the selection title.
            loader.replace_value('name', selection['title'])
            loader.add_value('url', response.url)
            loader.add_value('price', selection['price_inc'])
            category = response.css('div.breadcrumb a::attr(title)').extract()
            loader.add_value('category', category[1:])
            try:
                # First image of each value of the last attribute.
                image_url = [
                    attr['images'][0]['image']
                    for attr in data['attributes'][-1]['values']
                ]
            except IndexError:
                # Only IndexError is handled: fall back to the page image.
                image_url = response.xpath(
                    '//div[@id="js-product-image"]//@src').extract()
            loader.add_value('image_url', response.urljoin(image_url[0]))
            loader.add_value('brand', "Andrew James")
            item = loader.load_item()

            metadata = AndrewJamesMeta()
            metadata['asin'] = self.skus[sku]['ASIN']
            item['metadata'] = metadata
            yield item
Exemple #21
0
    def parse_product(self, response):
        """Yield one product, or nothing when ``itemsArray`` is missing.

        The brand prefers the schema.org manufacturer span over the brand
        span. The category comes from ``meta['category']`` or, failing
        that, the second-to-last breadcrumb link. The price is the text of
        the first ``threeColTd`` pricing cell up to any '(' with the pound
        sign removed, or 0 when that text is empty.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)  # NOTE(review): never used below

        brand = response.xpath(
            '//span[@itemprop="http://schema.org/manufacturer"]/text()'
        ).extract_first()
        if not brand:
            brand = response.xpath(
                '//span[@itemprop="http://schema.org/brand"]/text()'
            ).extract_first()

        identifier = hxs.select('//input[@id="itemsArray"]/@value').extract()
        if not identifier:
            return

        mpn_texts = response.xpath('//*[@itemprop="mpn"]/text()').extract()
        sku = mpn_texts[0].strip()

        product_loader = ProductLoader(item=Product(), selector=hxs)
        main_image = response.css(
            'img#productMainImage::attr(src)').extract_first()
        if main_image:
            product_loader.add_value('image_url',
                                     response.urljoin(main_image))

        category = response.meta.get('category', '')
        if not category:
            crumbs = hxs.select(
                '//div[@id="breadcrumb"]/ul/li/a/text()').extract()
            category = crumbs[-2].strip()
        product_loader.add_value('category', category)

        # The name is passed as the list of whitespace-separated tokens.
        name_tokens = response.xpath(
            '//div[@id="product"]//h1//text()').re(r'\S+')
        product_loader.add_value('name', name_tokens)

        product_loader.add_xpath('url', 'link[@rel="canonical"]/@href')
        product_loader.add_value('url', response.url)
        # The last itemsArray value is used as the identifier.
        product_loader.add_value('identifier', identifier.pop())

        product_loader.add_value('brand', brand)
        product_loader.add_value('sku', sku)

        price_cell = ''.join(
            hxs.select(
                '//table[contains(@class, "pricing")]//td[@class="threeColTd"][1]/text()'
            ).extract())
        price = price_cell.strip().split('(')[0].strip()
        price = price.replace(u'\xa3', '')
        if not price:
            product_loader.add_value('price', 0)
        else:
            amount = extract_price(price).quantize(Decimal('.01'))
            product_loader.add_value('price', amount)

        stock_digits = response.css('span.availability::text').re(r'\d+')
        product_loader.add_value(
            'stock', stock_digits[0] if stock_digits else 0)

        yield product_loader.load_item()
Exemple #22
0
    def parse_product(self, response):
        """Yield the base product of a page, then one item per variant.

        The base item is built from the page's structured data and markup.
        When the raw body contains a ``variants:`` JavaScript line that can
        be decoded, every entry in it is emitted as a copy of the base item
        with its own name, SKU, identifier and price.

        Nothing is yielded when the structured data cannot be read or has
        no ``name`` key.
        """
        try:
            pdata = SpiderSchema(response).get_product()
        except Exception:
            self.logger.error('No structured product data on %s' %response.url)
            return

        # First line of the raw body that mentions the variants object.
        js_line = ''
        for line in response.body.split('\n'):
            if 'variants:' in line:
                js_line = line
                break

        options = None
        if js_line:
            # The capture group is optional, so it is None when no "};"
            # follows "variants:" on that line; skip decoding in that case
            # instead of failing on None[:-2].
            variants_js = re.search(r'variants:(.*};)?', js_line).group(1)
            if variants_js:
                # Drop the trailing "};" before decoding the JS literal.
                options = demjson.decode(variants_js[:-2].strip())

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_css('sku', 'span.pd_productVariant::text')
        product_loader.add_xpath('identifier', '//input[@name="productId"]/@value')
        product_loader.add_value('url', response.url)
        try:
            product_loader.add_value('name', pdata['name'])
        except KeyError:
            return
        # First and last breadcrumb links are left out of the category path.
        category = response.xpath('//*[@id="breadcrumb"]//a/text()').extract()[1:-1]
        product_loader.add_value('category', category)
        img = response.xpath('//meta[@property="og:image"]/@content').extract()
        if img:
            product_loader.add_value('image_url', response.urljoin(img.pop()))
        price = response.xpath('//p[@class="productOfferPrice"]/text()').extract()[0]
        product_loader.add_value('price', price)
        # Prices below 45 carry a 3.5 shipping cost.
        if product_loader.get_output_value('price') < 45:
            product_loader.add_value('shipping_cost', '3.5')
        brand = response.xpath('//*[@id="brandHeader"]/a/@href').extract()
        if brand:
            # Strip the "/en/" prefix and the last character of the link;
            # anything still containing "/" is not used as a brand.
            brand = brand[0].replace('/en/', '')[:-1]
            if '/' not in brand:
                product_loader.add_value('brand', brand)
        stock = response.xpath('//link[@itemprop="availability"]/@href').extract_first()
        if stock != 'http://schema.org/InStock':
            product_loader.add_value('stock', 0)
        product = product_loader.load_item()

        yield product

        if options:
            for option_key, option in options.items():
                option_name = option_key.replace('_', ' ')
                option_product = Product(product)
                option_product['name'] = product['name'] + ' ' + option_name
                option_product['sku'] = option['productCode']
                option_product['identifier'] = option['variantId']
                option_product['price'] = extract_price(option['nowPrice'])
                yield option_product
Exemple #23
0
 def parse_simple_product(self, response):
     """Yield one product built from the fields of a product page.

     The hidden ``product`` input supplies both the identifier and the
     SKU, and the first breadcrumb link is left out of the category path.
     """
     product_id_xpath = '//input[@name="product"]/@value'
     breadcrumb_links = response.css('div.breadcrumbs a::text').extract()

     loader = ProductLoader(Product(), response=response)
     loader.add_xpath('identifier', product_id_xpath)
     loader.add_value('url', response.url)
     loader.add_css('name', 'div.product-name h1::text')
     loader.add_css('price', 'li.bigPrice span.price::text')
     loader.add_xpath('sku', product_id_xpath)
     loader.add_value('category', breadcrumb_links[1:])
     loader.add_css('image_url', 'img#image::attr(src)')
     yield loader.load_item()
Exemple #24
0
    def parse_product(self, response):
        """Scrape one product page and queue its variant pages.

        For every variant option on the page the ``variant-form`` is
        re-submitted with that option's value, with this method as the
        callback.  If the SKU on the page differs from the ``skuId`` query
        parameter, the page is re-requested under its own SKU and no item
        is produced for this response.  Otherwise an item is yielded, but
        only when a price was found.
        """
        if response.url.endswith('page-not-found.page'):
            return
        formdata = {}
        for inp in response.xpath('//form[@id="variant-form"]//input'):
            formdata[inp.xpath('@name').extract_first()] = inp.xpath(
                '@value').extract_first()
        if not formdata:
            self.logger.warning('No data on %s' % response.url)
            return
        # Inputs without a name attribute were collected under the None key.
        # Drop it if present; a plain ``del`` would raise KeyError whenever
        # every input of the form has a name.
        formdata.pop(None, None)
        options = response.css('.vContainer .variantDataElement')
        for option in options:
            formdata[option.xpath('@name').extract_first()] = option.xpath(
                '@data-variant-value').extract_first()
            yield FormRequest.from_response(
                response,
                formxpath='//form[@id="variant-form"]',
                formdata=formdata,
                callback=self.parse_product)

        loader = ProductLoader(item=Product(), response=response)
        sku = response.xpath('//input[@id="skuIdVal"]/@value').extract_first()
        if sku != url_query_parameter(response.url, 'skuId'):
            # The URL does not point at the SKU shown on the page: fetch the
            # page again with the query string reduced to the right skuId.
            url = add_or_replace_parameter(url_query_cleaner(response.url),
                                           'skuId', sku)
            yield Request(url, self.parse_product)
            return
        loader.add_value('identifier', sku)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@id="productLabel"]//text()')
        loader.add_css('price', '.current-price ::text')
        loader.add_value('sku', sku)
        category = response.xpath(
            '//div[@id="breadcrumb"]//li//span[@itemprop="title"]/text()'
        ).extract()
        # Up to three breadcrumb titles, ending just before the last one.
        loader.add_value('category', category[-4:-1])
        image_url = response.xpath(
            '//img[@itemprop="image"]/@src').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        loader.add_xpath(
            'brand', '//div[@itemprop="brand"]//span[@itemprop="name"]/text()')
        loader.add_value('shipping_cost', 3)
        # Stock is set to 0 when the page has no add-to-basket element.
        if not response.css('.add-to-basket'):
            loader.add_value('stock', 0)
        if loader.get_output_value('price'):
            yield loader.load_item()
 def parse_frames(self, response):
     """Yield one product per (option column, size row) of each price table.

     A table is located through its header cell: either a ``td`` whose text
     is "Code" or one wrapping a ``span`` whose text is "CODE".  The row
     just above the header row supplies the option names, and the rows
     below it supply the sizes, up to the next header row.

     Identifiers are made unique against both this response and
     ``self.ids_seen`` by appending "-d" until no clash remains.
     """
     base_url = get_base_url(response)
     # ``margin`` is the column of the first SKU cell; it differs between
     # the two header layouts.
     products = response.xpath('//tr/td[text()="Code"][1]')
     if products:
         margin = 3
     else:
         products = response.xpath('//tr/td[span/text()="CODE"][1]')
         margin = 2
     if not products:
         self.log('No products found on %s' % response.url)
         return

     thumbnails = response.xpath(
         '//img[not (contains(@alt, "Doors"))]/@src[contains(., "images-thumb")]'
     ).extract()
     # The same (first) thumbnail is attached to every product of the page.
     image_url = urljoin(base_url, thumbnails[0]) if thumbnails else None
     # Last URL path segment, cut at the first "_" and then the first ".".
     page_key = response.url.split('/')[-1].split('_')[0].split('.')[0]

     identifiers = set()
     for product in products:
         option_cells = product.xpath(
             './../preceding-sibling::tr[1]/td[position()>1]')
         for idx, option in enumerate(option_cells):
             name = option.xpath('.//text()').extract()
             # Each option owns two adjacent columns: SKU, then price.
             sku_column = idx * 2 + margin
             for size in product.xpath('./../following-sibling::tr'):
                 # A new header row marks the start of the next table.
                 if size.xpath(
                         'td[(text()="Code") or (span/text()="CODE")]'):
                     break
                 # Only rows whose first cell contains " x" are size rows.
                 if not size.xpath('./td[1][contains(.//text(), " x")]'):
                     continue
                 loader = ProductLoader(item=Product(), selector=size)
                 loader.add_value('name', name)
                 size_name = size.xpath('td[1]/text()').extract()
                 loader.add_value('name', size_name)
                 loader.add_xpath('sku', 'td[%d]/text()' % sku_column)
                 loader.add_xpath('price',
                                  'td[%d]/text()' % (sku_column + 1))
                 sku = loader.get_output_value('sku')
                 if not sku:
                     continue
                 # SKU, then the numbers of the size text, then the page key.
                 identifier = sku + '-' + '-'.join(
                     re.findall(r'\d+', size_name[0]))
                 identifier += '-' + page_key
                 while identifier in identifiers or identifier in self.ids_seen:
                     identifier += '-d'
                 identifiers.add(identifier)
                 self.ids_seen.append(identifier)
                 loader.add_value('identifier', identifier)
                 loader.add_value('url', response.url)
                 if image_url:
                     loader.add_value('image_url', image_url)
                 yield loader.load_item()
Exemple #26
0
 def parse_product(self, response):
     """Yield one product, choosing its SKU heuristically.

     The SKU starts out as the ``product_id`` identifier and is replaced,
     in order of preference, by:

     1. the last word of the name, when it equals the shortest meta
        keyword (case-insensitively);
     2. the shortest meta keyword, when it contains no lowercase letters;
     3. the last word of the name, when it contains no lowercase letters;
     4. the longest digit-bearing token of the name, unless that token
        contains "(" (in which case the identifier is kept).
     """
     loader = ProductLoader(Product(), response=response)
     identifier = response.xpath('//input[@name="product_id"]/@value').extract_first()
     loader.add_value('identifier', identifier)
     loader.add_value('sku', identifier)
     loader.add_value('url', response.url)
     loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
     loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
     category = response.xpath('//div[@id="ProductBreadcrumb"]//a/text()').extract()[1:]
     loader.add_value('category', category)
     loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
     loader.add_xpath('brand', '//div[@itemtype="http://schema.org/Organization"]/meta[@itemprop="name"]/@content')
     if not response.xpath('//link[@itemprop="availability"]/@href[contains(., "InStock")]'):
         loader.add_value('stock', 0)

     sku = identifier
     name = loader.get_output_value('name')
     # Last whitespace-free run of the name, without surrounding brackets.
     name_end = re.search(r'\S+$', name).group(0).strip(' ()')
     # A page without a keywords meta tag yields None; treat it as an empty
     # keyword list so the 'none' fallback below is actually reachable.
     raw_keywords = response.xpath('//meta[@name="keywords"]/@content').extract_first() or ''
     # Strip before filtering so whitespace-only entries cannot survive as
     # empty keywords (an empty keyword would otherwise become the SKU).
     keywords = [word.strip() for word in raw_keywords.split(',')]
     keywords = [word for word in keywords if word]
     shortest_keyword = min(keywords, key=len) if keywords else 'none'
     from_name = re.findall(r'\S*\d+\S*', name)
     if shortest_keyword.lower() == name_end.lower():
         sku = name_end
     elif shortest_keyword.upper() == shortest_keyword:
         sku = shortest_keyword
     elif name_end.upper() == name_end:
         sku = name_end
     elif from_name:
         sku = max(from_name, key=len)
         if '(' in sku:
             sku = identifier
     loader.replace_value('sku', sku)
     yield loader.load_item()
Exemple #27
0
    def parse_product(self, response):
        """Yield the product of a detail page when its price is above zero."""
        hxs = HtmlXPathSelector(response)

        # Non-blank text nodes of the regular price, minus the first one.
        # A list comprehension is used instead of ``filter(...)[1:]``, which
        # only works where ``filter`` returns a list (Python 2).
        price = [
            text for text in
            hxs.select("//span[@class='regular-price']//text()").extract()
            if text.strip()
        ][1:]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('price', price)
        loader.add_value('url', response.url)
        loader.add_xpath('name', "//div[@class='product-name']//h1//text()")
        # Breadcrumb links between the first and the last entry.
        loader.add_xpath(
            'category',
            "//div[@class='breadcrumbs']//li[position() > 1 and position() < last()]/a/text()"
        )
        brand = hxs.select(
            "//div[@class='product-shop']/div[@class='product-name']/a[@class='brand']/text()"
        ).extract()
        loader.add_value('brand', brand)
        loader.add_value('shipping_cost', 0)
        loader.add_xpath('sku', '//li/span[text()="SKU:"]/../text()')
        loader.add_xpath(
            'identifier',
            "//div[@class='product-view']//input[@name='product']/@value")
        image_urls = hxs.select(
            '//img[contains(@class, "gallery-image")]/@src').extract()
        # Use the first gallery image whose src is shorter than 1024
        # characters.
        for image_url in image_urls:
            if len(image_url) < 1024:
                loader.add_value('image_url', image_url)
                break
        product = loader.load_item()
        if product['price'] > 0:
            yield product
Exemple #28
0
 def parse_product(self, response):
     """Yield one item per ``stockMatrix`` variant, or a single plain item.

     Category, brand, image, URL and name are shared by every item.  When
     the page lists SKU attributes, each row of the ``stockMatrix`` script
     variable becomes its own item, identified by the feed row's
     ``PRODUCT_NUMBER`` plus the row position.  Otherwise a single item is
     built with price and stock read from the page itself.
     """
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)

     loader = ProductLoader(item=Product(), selector=hxs)
     loader.add_xpath('category', '//li[@typeof="v:Breadcrumb"]/a[@href!="/"]/text()')
     loader.add_value(
         'brand',
         hxs.select('//script[@type="text/javascript"]/text()').re('brand: *\"(.+)\"'))
     loader.add_xpath('image_url', '//div[@id="amp-originalImage"]/img/@src')
     loader.add_value('url', url_query_cleaner(response.url))
     loader.add_xpath('name', '//input[@name="speedtrapProductDisplayName"]/@value')
     item = loader.load_item()

     sku_attributes = hxs.select(
         '//ul[@class="productOptionsList"]/li[contains(@class, "skuAttribute")]')
     if not sku_attributes:
         # No options on the page: finish the shared loader and emit it.
         product_number = response.meta.get('row').get('PRODUCT_NUMBER')
         loader.add_value('identifier', product_number)
         loader.add_value('sku', product_number)
         loader.add_xpath('price', '//input[@name="speedtrapPrice"]/@value')
         if hxs.select('//meta[@property="product:availability"]/@content[.="In Stock"]'):
             stock = 1
         else:
             stock = 0
         loader.add_value('stock', stock)
         yield loader.load_item()
         return

     script = hxs.select('//script[contains(text(),"stockMatrix =")]/text()')[0].extract()
     # Bare nulls are quoted so that every cell decodes to a string.
     script = script.replace('\n', '').replace('null', '"null"')
     found = re.search('stockMatrix = (.*?);', script, re.DOTALL)
     if found:
         matrix = json.loads(found.group(1))
     else:
         matrix = []

     for position, variant in enumerate(matrix):
         # Cells before the "sku..." cell are the option values; the two
         # cells after it hold availability and price.
         sku_cell = [elem for elem in variant if elem.startswith('sku')][0]
         sku_at = variant.index(sku_cell)
         option_label = ' '.join(variant[:sku_at]).title()
         product = Product(item)
         product['name'] = item['name'] + ' - ' + option_label
         product['identifier'] = '{}-{}'.format(
             response.meta.get('row').get('PRODUCT_NUMBER'), position)
         product['sku'] = product['identifier']
         product['price'] = variant[sku_at + 2]
         if 'Available#Delivery' in variant[sku_at + 1]:
             product['stock'] = 1
         else:
             product['stock'] = 0
         yield product
Exemple #29
0
    def parse_product(self, response):
        """Yield the product shown in the ``prod-box`` block of the page.

        When no image is found under ``section#one`` the same URL is
        re-requested with ``parse_category`` as callback and no item is
        produced.  Shipping cost is 2 for prices below 20, 4.99 for prices
        below 40, and left unset otherwise.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        product_box = hxs.select('//div[@class="prod-box"]')
        breadcrumb = hxs.select('//ul[@class="breadcrumbs"]')[0]

        loader = ProductLoader(selector=product_box, item=Product())
        loader.add_value('url', response.url)
        # The brand is the breadcrumb entry right after the "Brands" link.
        loader.add_value(
            'brand',
            breadcrumb.select(
                './/a[contains(text(), "Brands")]/../following-sibling::li[1]/a/text()'
            ).extract())
        crumb_labels = breadcrumb.select('.//a/text()').extract()
        loader.add_value(
            'category',
            [label for label in crumb_labels if "Brand" not in label])

        image_sources = hxs.select('//section[@id="one"]//@src').extract()
        if not image_sources:
            yield Request(response.url, callback=self.parse_category, dont_filter=True)
            return
        loader.add_value('image_url', urljoin(base_url, image_sources[0]))
        loader.add_xpath('name', './h1/text()')
        for field in ('identifier', 'sku'):
            loader.add_xpath(field, '//*/@prodref')
        in_stock = product_box.select('//*[text()="In Stock" or text()="Low Stock"]')
        if not in_stock:
            loader.add_value('stock', 0)
        loader.add_xpath('price', './/span[@class="product-price"]/text()')

        product = loader.load_item()
        # First price ceiling the product falls under decides the shipping.
        for ceiling, shipping_cost in ((20, 2), (40, 4.99)):
            if product['price'] < ceiling:
                product['shipping_cost'] = shipping_cost
                break
        yield product
Exemple #30
0
 def parse_product(self, response):
     """Yield one item per available row of the page's ``stockMatrix``.

     Fields shared by all options (URL, name, SKU, category, image, brand)
     are loaded once and copied into every option item.  Each matrix row is
     consumed in order: one name part per ``.skuAttribute`` element on the
     page, then the identifier, the stock text and the price.  Rows whose
     stock text starts with "Unavailable" are skipped.
     """
     loader = ProductLoader(item=Product(), response=response)
     loader.add_xpath('url', '//link[@rel="canonical"]/@href')
     loader.add_xpath('name', '//span[@id="productName"]//text()')
     loader.add_xpath('sku', '//span[@id="productEAN"]/text()[last()]')
     loader.add_xpath('category', '//div[@id="breadcrumb"]/ul/li[position()>1]/a/span/text()')
     loader.add_css('image_url', '.productImageItem ::attr(href)')
     brand = response.css('.brand ::text').extract_first()
     if brand != "null":
         loader.add_value('brand', brand)
     item = loader.load_item()

     p = re.compile('stockMatrix = (.+?);', re.DOTALL)
     data = response.xpath('//script/text()').re(p)
     options = json.loads(data[0])
     # The number of option attributes is a property of the page, not of
     # the individual matrix row, so it is counted once.
     attribute_count = len(response.css('.skuAttribute'))
     for option in options:
         loader = ProductLoader(item=Product(), response=response)
         loader.add_value(None, item)
         opt_iter = iter(option)
         opt_name = ''
         # The built-in next() is used instead of the Python-2-only
         # ``iterator.next()`` method.
         for _ in range(attribute_count):
             opt_name = next(opt_iter)
             loader.add_value('name', opt_name)
         # The colour image is looked up by the last attribute value read.
         colour_url = response.xpath('//input[@class="colourImageUrl"][@name="%s"]/@value' %opt_name).extract_first()
         if colour_url:
             loader.replace_value('image_url', 'http://media.littlewoods.com/i/littlewoods/%s?$1064x1416_standard$' %colour_url)
         loader.replace_value('identifier', next(opt_iter))
         stock = next(opt_iter)
         if stock.startswith('Unavailable'):
             continue
         loader.replace_value('stock', int('Out of stock' not in stock))
         loader.replace_value('price', next(opt_iter))
         yield loader.load_item()