Esempio n. 1
0
    def parse_option(self, response):
        """Build one product item from a JSON option payload.

        The base item is taken from ``response.meta['item']`` and copied into
        a fresh loader; the JSON body then overrides identifier, sku, price,
        url and stock, and contributes an extra ``name`` value.  The
        struck-through price, when present, is stored under the
        ``Promotions`` metadata key (an empty string otherwise).

        Yields:
            The loaded item with its ``metadata`` attached.
        """
        base_item = response.meta['item']
        payload = json.loads(response.body)

        option_loader = ProductLoader(item=Product(), response=response)
        option_loader.add_value(None, base_item)
        # The payload sku serves as both identifier and sku.
        option_loader.replace_value('identifier', payload['sku'])
        option_loader.replace_value('sku', payload['sku'])

        # 'productPrices' is a markup snippet, parsed on its own.
        prices_markup = Selector(text=payload['productPrices'])
        price_group = prices_markup.css('.price-group')
        current_price_parts = price_group.xpath('span/span//text()').extract()
        option_loader.replace_value('price', ''.join(current_price_parts))
        struck_price_parts = prices_markup.css(
            '.price-group p span.text--strikethrough ::text').extract()
        option_loader.replace_value('url', payload['url'])

        # In stock unless the inactive status indicator is present.
        stock_markup = Selector(text=payload['stock'])
        inactive_marker = stock_markup.css('.stock-indicator__status--inactive')
        option_loader.replace_value('stock', 0 if inactive_marker else 1)

        title_markup = Selector(text=payload['navigationTitle'])
        sub_title = title_markup.css('.sub-title::text').extract_first()
        option_loader.add_value('name', sub_title)
        product = option_loader.load_item()

        promo_price = extract_price(''.join(struck_price_parts))
        promo_meta = MetaData()
        promo_meta['Promotions'] = promo_price or ''
        product['metadata'] = promo_meta
        yield product
    def parse_sub(self, response):
        """Walk a product listing page.

        For each listed product whose numeric id is present in
        ``self.previous_crawl_data``, an item is rebuilt from the stored data
        with price (and stock, when marked out of stock) refreshed from the
        listing.  Every other product with a link is requested and handed to
        ``self.parse_product``.  Finally, each distinct pagination link is
        requested with this method as callback.
        """
        selector = HtmlXPathSelector(response)
        page_base = get_base_url(response)

        listing = selector.select('//ol[@id="products-list"]/li')
        for entry in listing:
            id_matches = entry.select(
                './/span[contains(@id, "product-price-")]/@id'
            ).re(r'product-price-(\d+)')
            product_id = id_matches[0] if id_matches else ''

            if not product_id or product_id not in self.previous_crawl_data:
                # Unknown product: crawl its page, if the listing links to one.
                links = entry.select(
                    './/h2[@class="product-name"]/a/@href').extract()
                if links:
                    yield Request(urljoin_rfc(page_base, links[0]),
                                  callback=self.parse_product)
                continue

            self.log('>>> CACHED PRODUCT => %s' % product_id)
            stored_fields = self.previous_crawl_data[product_id]
            cached_loader = ProductLoader(item=Product(**stored_fields),
                                          selector=entry)
            price_xpath = (
                './/span[@id="product-price-%s"]//text()' % product_id)
            cached_loader.replace_xpath('price', price_xpath,
                                        re=r'([\d.,]+)')
            sold_out = entry.select(
                './/p[@class="availability out-of-stock"]')
            if sold_out:
                cached_loader.replace_value('stock', 0)
            yield cached_loader.load_item()

        # set() collapses duplicated pagination links.
        page_links = set(
            selector.select('//div[@class="pages"]//a/@href').extract())
        for link in page_links:
            yield Request(urljoin_rfc(page_base, link),
                          callback=self.parse_sub)
Esempio n. 3
0
    def parse_product(self, response):
        """Build a product item from a feed row plus its product page.

        Identifier, sku, brand, category and name come from
        ``response.meta['row']``; image, price and stock come from the page.
        When the page embeds both a ``Product.Config`` object and a
        ``care_attribs`` object, the child product whose ``attrib_sku`` value
        equals the row's product code overrides price and stock.

        Yields:
            The loaded item with ``cost_price`` and ``ean`` metadata attached.
        """
        row = response.meta['row']
        product_code = row['Unique product code']

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('identifier', product_code)
        loader.add_value('sku', product_code)
        loader.add_value('url', response.url)
        loader.add_value('brand', row['Brand'])
        loader.add_value('category', row['Category'])
        loader.add_value('name', row['Product name'])

        image_url = response.xpath(
            '//p[contains(@class, "product-image")]/a/@href').extract()
        loader.add_value('image_url', image_url[0] if image_url else '')

        price = response.xpath(
            '//div[@class="add_to_cart"]//span[@class="regular-price"]'
            '//span/text()').extract()
        if not price:
            price = response.xpath(
                '//div[@class="add_to_cart"]//p[@class="special-price"]'
                '//span[@class="price"]/text()').extract()
        if price:
            loader.add_value('price', price[0])
        else:
            # Neither price node matched; the option config below may still
            # supply a price, so keep going instead of raising IndexError.
            self.log('No page price found for %s' % response.url)

        if response.xpath('//p[@class="availability out-of-stock"]'):
            loader.add_value('stock', 0)

        options_config = re.search(
            r'var spConfig = new Product.Config\((.*)\)', response.body)
        sku_config = re.search('var care_attribs = (.*);', response.body)
        # Both embedded objects are required to map child products to codes.
        if options_config and sku_config:
            product_data = json.loads(options_config.group(1))
            sku_data = json.loads(sku_config.group(1))

            # Every child product id referenced by any attribute option.
            child_ids = set()
            for attr in product_data['attributes'].values():
                for option in attr['options']:
                    child_ids.update(option['products'])

            # Sorted only to make the scan order deterministic.
            for identifier in sorted(child_ids):
                child_sku = sku_data[identifier]['attrib_sku']['value']
                if child_sku != product_code:
                    continue
                loader.replace_value(
                    'price',
                    product_data['childProducts'][identifier]['finalPrice'])
                if not product_data['stockInfo'][identifier]['stockQty']:
                    loader.replace_value('stock', 0)
                break

        item = loader.load_item()

        metadata = EbeddingMeta()
        metadata['cost_price'] = row['Cost price']
        metadata['ean'] = row['EAN']
        item['metadata'] = metadata

        yield item
Esempio n. 4
0
    def parse_product(self, response):
        """Build a single product item from the page's meta tags.

        Name, image, price and availability are read from ``og:``/``product:``
        meta tags; identifier and sku are the ``PRODUCT_NUMBER`` of
        ``response.meta['row']``.  If the page offers selectable options, the
        first one (ignoring "Please Choose" entries) is appended to the name.
        Exactly one item is yielded.
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), selector=hxs)
        main_name = hxs.select(
            '//meta[@property="og:title"]/@content')[0].extract()
        loader.add_value('name', main_name)
        loader.add_value('url', response.url)

        # The feed's product number doubles as identifier and sku.
        product_number = response.meta.get('row').get('PRODUCT_NUMBER')
        loader.add_value('identifier', product_number)
        loader.add_xpath('image_url', '//meta[@property="og:image"]/@content')

        price = hxs.select(
            '//meta[@property="product:price:amount"]/@content').extract()
        if price:
            loader.add_value('price', format_price(Decimal(price[0])))
        else:
            loader.add_value('price', '0.00')
        loader.add_value('sku', product_number)
        loader.add_xpath(
            'brand',
            '//div[@itemprop="brand"]/div[@class="Value"]/a/span/text()')
        loader.add_value('shipping_cost', '3.99')

        in_stock = hxs.select(
            '//meta[@property="og:availability" and @content="instock"]')
        if not in_stock:
            loader.add_value('stock', 0)

        # Skip the first breadcrumb entry.
        for category in hxs.select(
                '//div[@id="ProductBreadcrumb"]/ul/li/a/text()')[1:].extract():
            loader.add_value('category', category)

        options = hxs.select(
            '//div[@class="productOptionViewSelect"]/select'
            '/option[not(contains(text(),"Please Choose"))]/text()'
        ).extract()
        if options:
            # Only the first option is reflected in the name.
            loader.replace_value('name',
                                 '{} {}'.format(main_name, options[0]))
        yield loader.load_item()
Esempio n. 5
0
    def parse_product(self, response):
        """Build product items from the page's embedded ecommerce data.

        Name, price and brand are pulled with regular expressions from the
        ``ecommerce`` script data in the response body.  One item is yielded
        per size option (``select_size``), each with its own identifier, name
        and price; a page without options yields a single item carrying the
        main price and the ``PRODUCT_NUMBER`` from ``response.meta['row']``.
        """
        hxs = HtmlXPathSelector(response)

        body = response.body
        main_name = re.search(
            r"ecommerce.*name': '(.*?)'", body, re.DOTALL).group(1)
        main_price = re.search(
            r"ecommerce.*price': '(.*?)'", body, re.DOTALL).group(1)
        brand = re.search(
            r"ecommerce.*?brand': '(.*?)'", body, re.DOTALL).group(1)

        product_number = response.meta.get('row').get('PRODUCT_NUMBER')

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('name', main_name)
        loader.add_value('url', response.url)
        # The price is the value extracted from the page data, not the URL.
        loader.add_value('price', main_price)
        loader.add_xpath('image_url', '//meta[@property="og:image"]/@content')
        loader.add_value('identifier', product_number)
        loader.add_value('sku', product_number)
        loader.add_value('brand', brand)
        # Skip the first breadcrumb entry.
        for category in hxs.select(
                '//div[@id="breadcrumb"]/ul[@id="crumbs"]/li/a/text()'
        )[1:].extract():
            loader.add_value('category', category)

        options = hxs.select(
            '//select[@name="ProductID" and @id="select_size"]/option')
        for option in options:
            identifier = option.select('./@value')[0].extract()
            loader.replace_value('identifier', identifier)

            # Labels look like "<name> - <price>".  Split on the last
            # separator so names containing " - " still parse, and fall back
            # to the main price when the label has no price part.
            label = option.select('./text()')[0].extract().strip()
            option_name, separator, option_price = label.rpartition(' - ')
            if not separator:
                option_name, option_price = label, main_price
            loader.replace_value('name',
                                 '{} {}'.format(main_name, option_name))
            loader.replace_value('price', option_price)

            yield loader.load_item()

        if not options:
            yield loader.load_item()
    def parse_product(self, response):
        """Build product items from a product page.

        Three shapes are handled, in this order:

        * custom options (``product-custom-option`` select): one item per
          option with a non-empty value, priced at base price plus the
          option's ``price`` attribute;
        * ``Product.Config`` script data: one item per child product, whose
          label and price are accumulated over every attribute option that
          lists that child;
        * otherwise a single item for the page itself.

        An item whose loaded price is empty or zero is marked out of stock.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        product_identifier = hxs.select(
            '//input[@name="product"]/@value')[0].extract()
        product_name = hxs.select(
            '//div[@class="product-name"]/span/text()')[0].extract().strip()

        # Prefer the special (discounted) price over the regular one.
        base_price = response.xpath(
            '//p[@class="special-price"]/span[@class="price"]/text()'
        ).extract()
        if not base_price:
            base_price = response.xpath(
                '//span[@class="regular-price"]/span[@class="price"]/text()'
            ).extract()
        base_price = extract_price(base_price[0]) if base_price else 0

        image_url = hxs.select('//img[@id="image-main"]/@src').extract()
        image_url = urljoin_rfc(base_url, image_url[0]) if image_url else ''
        category = hxs.select(
            '//span[@typeof="v:Breadcrumb"]/a/text()').extract()
        category = category[-1] if category else ''
        brand = hxs.select(
            '//ul[@id="productDetailsList"]'
            '/li[contains(text(),"Manufactured")]/text()'
        ).re('Manufactured by: (.*)')

        options = hxs.select(
            '//select[@class=" required-entry product-custom-option"]/option')
        data_config = response.xpath('//script/text()').re(
            r'new Product.Config\((.+)\);')

        if options:
            for option in options:
                option_value = option.select('./@value').extract()
                if not option_value or option_value[0] == '':
                    # Placeholder entry without a value.
                    continue
                option_name = option.select('./text()').extract()[0]
                # Drop the "+<pound sign>..." surcharge suffix from the label.
                option_name = option_name.split(u'+\xa3')[0].strip()
                # A missing price attribute counts as no surcharge.
                surcharge = option.select('@price').extract()
                surcharge = extract_price(surcharge[0]) if surcharge else 0

                loader = ProductLoader(response=response, item=Product())
                loader.add_value(
                    'identifier', product_identifier + "-" + option_value[0])
                loader.add_value('sku', product_identifier)
                loader.add_value('price', base_price + surcharge)
                # Same scraped brand as the other two paths.
                loader.add_value('brand', brand)
                loader.add_value('url', response.url)
                loader.add_value('name', product_name + " " + option_name)
                loader.add_value('image_url', image_url)
                loader.add_value('category', category)
                if not loader.get_output_value('price'):
                    loader.add_value('stock', 0)
                yield loader.load_item()
            return

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', product_identifier)
        loader.add_value('sku', product_identifier)
        loader.add_value('url', response.url)
        loader.add_value('name', product_name)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('category', category)
        loader.add_value('price', base_price)
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)
        item = loader.load_item()

        if not data_config:
            yield item
            return

        attributes = json.loads(data_config[0])['attributes']
        # child product id -> accumulated label and price over all attributes
        variants = {}
        for attribute in sorted(attributes):
            for option in attributes[attribute]['options']:
                for product in option['products']:
                    option_price = extract_price(option['price'])
                    if product in variants:
                        variants[product]['label'] += ' ' + option['label']
                        variants[product]['price'] += option_price
                    else:
                        variants[product] = {
                            'label': option['label'],
                            'price': option_price,
                        }

        for product, variant in variants.items():
            loader = ProductLoader(item=Product(), response=response)
            # Start from the page-level item, then specialise it.
            loader.add_value(None, item)
            loader.add_value('name', variant['label'])
            loader.replace_value('identifier',
                                 product_identifier + '-' + product)
            loader.replace_value('sku', product)
            loader.replace_value('price', base_price + variant['price'])
            yield loader.load_item()
Esempio n. 7
0
    def parse_product(self, response):
        """Build product items from a product page, one per select option.

        Non-HTML responses are ignored.  The base price is the first match of
        sale price, special price or plain price text.  For prices below 100
        the spider's ``shipping_cost`` is applied.  When the page has option
        entries, one item is yielded per option, with the option's
        "( +amount" surcharge (if any) added to the base price; otherwise a
        single item is yielded.
        """
        if not isinstance(response, HtmlResponse):
            return

        prices_xpath = (
            '//div[@id="productGeneral"]/form//h2[@id="productPrices"]')
        free_shipping_from = Decimal('100')

        options = response.xpath('//div[@class="optionValues"]/select/option')
        name = response.xpath(
            '//div[@id="productGeneral"]/form//h1[@id="productName"]/text()'
        ).extract()
        # Plain text of the name, for composing option names below.
        name_text = name[0] if name else u''

        # First matching price wins: sale, then special, then plain text.
        price_text = u''
        for price_path in (
                prices_xpath + '/span[@class="productSalePrice"]/text()',
                prices_xpath + '/span[@class="productSpecialPrice"]/text()',
                prices_xpath + '/text()'):
            found = response.xpath(price_path).extract()
            if found:
                price_text = found[0]
                break
        price_text = price_text.replace(',', '').strip()

        in_stock = response.xpath(
            '//div[@id="cartAdd"]/input[@class="cssButton button_in_cart"]')
        category = response.meta['category'].replace(u'/', u' > ')

        details = response.xpath('//ul[@id="productDetailsList"]/li/text()')
        brand = details.re(r'Manufactured by: (.*)')
        gtin_code = details.re(r'GTIN: (.*)')
        model_code = details.re(r'Model: (.*)')

        image_url = response.xpath(
            '//div[@class="MagicToolboxContainer"]/a/img/@src').extract()
        if image_url:
            image_url = response.urljoin(image_url[0])

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('price', price_text)
        # Fall back to zero when the loader produced no price.
        base_price = Decimal(loader.get_output_value('price') or 0)
        if base_price < free_shipping_from:
            loader.add_value('shipping_cost', self.shipping_cost)
        if not in_stock:
            loader.add_value('stock', 0)
        loader.add_value('category', category)
        loader.add_value('brand', brand)
        identifier = response.xpath(
            '//input[@name="products_id"]/@value')[0].extract()

        # GTIN is preferred as sku; the model code is the fallback.
        sku_candidates = gtin_code or model_code
        if sku_candidates:
            loader.add_value('sku', sku_candidates[0])
        loader.add_value('url', response.url)
        loader.add_value('image_url', image_url)
        loader.add_value('name', name)
        loader.add_value('identifier', identifier)

        if not options:
            yield loader.load_item()
            return

        for option in options:
            option_id = option.xpath('@value')[0].extract()
            option_name = option.xpath('text()')[0].extract()

            option_price = base_price
            surcharge = re.search(r'\( \+(.*)', option_name)
            if surcharge:
                amount = re.search(r'([\.\d]+)',
                                   surcharge.group(1).replace(',', ''))
                if amount:
                    option_price = base_price + Decimal(amount.group(1))

            # The loader is shared by all options, so price and shipping are
            # set on every iteration; otherwise a surcharged option's values
            # would carry over to the options that follow it.
            loader.replace_value('identifier', identifier + u'_' + option_id)
            loader.replace_value('name',
                                 u'{} {}'.format(name_text, option_name))
            loader.replace_value('price', option_price)
            if option_price < free_shipping_from:
                loader.replace_value('shipping_cost', self.shipping_cost)
            else:
                loader.replace_value('shipping_cost', Decimal('0.00'))
            yield loader.load_item()