def parse_frames(self, response):
     """Yield one product per (option column, size row) of each price table.

     A table is located through its header cell: either a ``td`` whose text
     is ``Code`` or a ``td`` whose ``span`` text is ``CODE``.  The row just
     before the header row supplies the option names (every cell after the
     first); the rows after it supply the sizes, up to the next header row.

     :param response: the page response being parsed.
     :return: generator of loaded product items.
     """
     base_url = get_base_url(response)
     # ``margin`` is the 1-based index of the first sku cell; it depends on
     # which of the two header layouts matched.
     margin = 3
     products = response.xpath('//tr/td[text()="Code"][1]')
     if not products:
         margin = 2
         products = response.xpath('//tr/td[span/text()="CODE"][1]')
     if not products:
         self.log('No products found on %s' % response.url)
         return
     # Suffix shared by every identifier of this page: last URL path
     # segment, cut at the first "_" and then at the first ".".
     page_id = response.url.split('/')[-1].split('_')[0].split('.')[0]
     identifiers = []
     image_url = response.xpath(
         '//img[not (contains(@alt, "Doors"))]/@src[contains(., "images-thumb")]'
     ).extract()
     for product in products:
         options = product.xpath(
             './../preceding-sibling::tr[1]/td[position()>1]')
         for idx, option in enumerate(options):
             name = option.xpath('.//text()').extract()
             # Each option owns two adjacent cells: sku, then price.
             sku_column = idx * 2 + margin
             for size in product.xpath('./../following-sibling::tr'):
                 if size.xpath(
                         'td[(text()="Code") or (span/text()="CODE")]'):
                     # Header of the next table: this table is finished.
                     break
                 if not size.xpath('./td[1][contains(.//text(), " x")]'):
                     continue
                 loader = ProductLoader(item=Product(), selector=size)
                 loader.add_value('name', name)
                 size_name = size.xpath('td[1]/text()').extract()
                 loader.add_value('name', size_name)
                 loader.add_xpath('sku', 'td[%d]/text()' % sku_column)
                 loader.add_xpath('price',
                                  'td[%d]/text()' % (sku_column + 1))
                 sku = loader.get_output_value('sku')
                 if not sku:
                     continue
                 # The row filter above looks at descendant text, so the
                 # cell may have no direct text node at all.
                 dimensions = (re.findall(r'\d+', size_name[0])
                               if size_name else [])
                 identifier = sku + '-' + '-'.join(dimensions)
                 identifier += '-' + page_id
                 # Keep identifiers unique within the page and the spider.
                 while identifier in identifiers or identifier in self.ids_seen:
                     identifier += '-d'
                 identifiers.append(identifier)
                 self.ids_seen.append(identifier)
                 loader.add_value('identifier', identifier)
                 loader.add_value('url', response.url)
                 if image_url:
                     loader.add_value('image_url',
                                      urljoin(base_url, image_url[0]))
                 yield loader.load_item()
Example #2
0
    def parse_product(self, response):
        """Build a single product item from a product page.

        The identifier is the extracted sku when present, otherwise the last
        URL path segment without its extension.  Pages whose identifier was
        already recorded in ``self.seen_ids`` yield nothing, and neither do
        pages without a product name.  A page without a price still yields
        an item, with ``stock`` set to 0.

        :param response: the product page response.
        :return: generator yielding at most one product item.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        loader = ProductLoader(item=Product(), response=response)
        # NOTE: ``sku`` is the list returned by extract(); when it is not
        # empty that list itself is used (and remembered) as the identifier.
        sku = hxs.select('//p/span[@itemprop="sku"]/text()').extract()
        identifier = sku
        if not sku:
            identifier = response.url.split('/')[-1].split('.')[0]
        loader.add_value('identifier', identifier)
        loader.add_value('sku', sku)
        if identifier in self.seen_ids:
            return
        name = hxs.select('//h1[@class="first"]/span[@itemprop="name"]/text()'
                          ).extract()
        if not name:
            self.log('No product name found on %s' % response.url)
            return
        self.seen_ids.append(identifier)
        name = name[0].strip()
        try:
            loader.add_value('name', name)
        except Exception:
            # Fall back to an explicitly decoded name when the loader
            # rejects the raw value.
            loader.add_value('name', name.decode('utf-8', 'replace'))
        category = hxs.select('//ol[@class="breadcrumb"]//a/text()').extract()
        # Drop the first breadcrumb, then keep at most the last three.
        loader.add_value('category', ' > '.join(category[1:][-3:]))
        image_url = hxs.select('//a[@class="lightbox"]/img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        loader.add_value('url', response.url)

        price = hxs.select(
            '//span[@class="price-big orange"]/text()').extract()
        if price:
            loader.add_value('price', price[0])
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)
        yield loader.load_item()
Example #3
0
 def parse_product(self, response):
     """Build a product item from a product page.

     The identifier is the part of the URL between ``p-`` and ``.html``;
     pages whose URL does not have that shape are logged and skipped.
     A shipping cost of 35.00 is added when the price is below 400.

     :param response: the product page response.
     :return: generator yielding at most one product item.
     """
     id_match = re.search(r'p-(.*)\.html', response.url)
     if id_match is None:
         self.log('No product identifier in URL %s' % response.url)
         return
     identifier = id_match.group(1)

     sel = Selector(response)
     price = sel.re(re.compile(r"jsProductPrice = '(.*)';"))
     # The first breadcrumb link is dropped.
     categories = sel.xpath('//div[@id="navBreadCrumb"]/a/text()')[1:].extract()
     brand = sel.xpath('//span[@class="product_manufacturer"]/text()').re('Manufactured by: (.*)')
     brand = brand[0].strip() if brand else ''
     sku = sel.xpath('//span[@class="product_model"]/text()').re('Ref: (.*)')
     sku = sku[0].strip() if sku else ''
     image_url = response.xpath('//div[@id="replace_image_zoom"]//img[@class="zoom_pic"]/@src').extract()
     if image_url:
         image_url = response.urljoin(image_url[0])
     name = sel.xpath('//h1[@class="productGeneral"]/text()').extract()

     loader = ProductLoader(item=Product(), response=response)
     loader.add_value('identifier', identifier)
     loader.add_value('sku', sku)
     loader.add_value('name', name)
     loader.add_value('price', price)
     # Re-read the price through the loader so the comparison below uses
     # the loader's output value rather than the raw regex matches.
     price = loader.get_output_value('price')
     if price and Decimal(price) < Decimal('400.0'):
         loader.add_value('shipping_cost', Decimal('35.00'))
     loader.add_value('url', response.url)
     if image_url:
         loader.add_value('image_url', image_url)
     for category in categories:
         loader.add_value('category', category)
     loader.add_value('brand', brand)
     yield loader.load_item()
Example #4
0
 def parse_product(self, response):
     """Build a product item by running regular expressions on the raw body.

     Pages without a ``tier_price_total`` price yield nothing.  A shipping
     cost of 4.98 is added when the price is below 70.

     NOTE(review): the patterns are ``str`` and are applied to
     ``response.body``, so the body is assumed to be a ``str`` here.

     :param response: the product page response.
     :return: generator yielding at most one product item.
     """
     body = response.body
     prices = re.findall(r'tier_price_total".+?([\d.]+)', body)
     if not prices:
         return

     def technical_value(label):
         # Link text following the "technical_label" span for ``label``.
         return re.findall(
             '<span class="technical_label">%s:</span><a href.+?>(.+?)</a'
             % label, body)

     # The same matches feed both the identifier and the sku.
     product_ids = re.findall(r'product_id.+?(\d+)', body)

     loader = ProductLoader(item=Product(), response=response)
     loader.add_value('identifier', product_ids)
     loader.add_xpath('url', '//link[@rel="canonical"]/@href')
     loader.add_value('name', re.findall(r'"name":"(.+?)"', body))
     price = Decimal(prices[0]).quantize(Decimal('.01'))
     loader.add_value('price', price)
     loader.add_value('sku', product_ids)
     # "Lenstype" wins; "Producttype" is only consulted when it is absent.
     category = technical_value('Lenstype') or technical_value('Producttype')
     loader.add_value('category', category)
     loader.add_value(
         'image_url',
         re.findall(r'<img src="(\S+media/catalog/product\S+)"', body))
     loader.add_value('brand', technical_value('Merk'))
     if loader.get_output_value('price') < 70:
         loader.add_value('shipping_cost', '4.98')
     yield loader.load_item()
Example #5
0
 def parse_product(self, response):
     """Build a product item and derive its sku heuristically.

     The sku starts as the ``product_id`` form value and is replaced, in
     order of preference, by:

     1. the last word of the name, when it equals the shortest meta keyword
        (case-insensitively);
     2. the shortest meta keyword, when it has no lower-case letters;
     3. the last word of the name, when it has no lower-case letters;
     4. the longest digit-containing word of the name, unless that word
        contains ``(``.

     :param response: the product page response.
     :return: generator yielding one product item.
     """
     loader = ProductLoader(Product(), response=response)
     identifier = response.xpath('//input[@name="product_id"]/@value').extract_first()
     loader.add_value('identifier', identifier)
     loader.add_value('sku', identifier)
     loader.add_value('url', response.url)
     loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
     loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
     # The first breadcrumb link is dropped.
     category = response.xpath('//div[@id="ProductBreadcrumb"]//a/text()').extract()[1:]
     loader.add_value('category', category)
     loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
     loader.add_xpath('brand', '//div[@itemtype="http://schema.org/Organization"]/meta[@itemprop="name"]/@content')
     if not response.xpath('//link[@itemprop="availability"]/@href[contains(., "InStock")]'):
         loader.add_value('stock', 0)

     sku = identifier
     # A missing name or keywords tag must not abort the whole item.
     name = loader.get_output_value('name') or ''
     last_word = re.search(r'\S+$', name)
     name_end = last_word.group(0).strip(' ()') if last_word else ''
     raw_keywords = response.xpath('//meta[@name="keywords"]/@content').extract_first() or ''
     # Strip first, then filter: a whitespace-only keyword would otherwise
     # become '' and win the "shortest keyword" selection.
     keywords = [word.strip() for word in raw_keywords.split(',')]
     keywords = [word for word in keywords if word]
     shortest_keyword = min(keywords, key=len) if keywords else 'none'
     from_name = re.findall(r'\S*\d+\S*', name)
     if shortest_keyword.lower() == name_end.lower():
         sku = name_end
     elif shortest_keyword.upper() == shortest_keyword:
         sku = shortest_keyword
     elif name_end and name_end.upper() == name_end:
         # ``name_end`` may be empty (no name, or a name ending in "()");
         # an empty string must never be used as the sku.
         sku = name_end
     elif from_name:
         sku = max(from_name, key=len)
         if '(' in sku:
             sku = identifier
     loader.replace_value('sku', sku)
     yield loader.load_item()
Example #6
0
 def parse_doors(self, response):
     """Yield one product per ``itemprop="offers"`` block of a listing page.

     Identifiers come from the ``ecomm_prodid`` list literal found in an
     inline script and are matched to the offer blocks by position.
     Items priced below 750 get a shipping cost of 36; items without an
     ``InStock`` availability link get ``stock`` 0.

     :param response: the listing page response.
     :return: generator of product items.
     """
     import ast

     url = response.xpath('//link[@rel="canonical"]/@href').extract()
     category = response.xpath(
         '//p[@class="breadcrumbs"]/a[position()>1]/text()').extract()
     ids = response.xpath('//script/text()').re(r'ecomm_prodid.*(\[.+\])')
     if not ids:
         self.log('No product ids found on %s' % response.url)
         return
     # SECURITY: this text comes straight from the remote page, so it is
     # parsed as a literal instead of being executed with eval().
     ids = ast.literal_eval(ids[0])
     for i, product in enumerate(
             response.xpath('//div[@itemprop="offers"]')):
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('name', './/h3[@itemprop="name"]/a/text()[1]')
         loader.add_value('identifier', ids[i])
         loader.add_value('sku', ids[i])
         loader.add_xpath('price', './/span[@itemprop="price"]/text()')
         local_url = product.xpath(
             './/h3[@itemprop="name"]/a/@href').extract()
         if local_url:
             local_url = response.urljoin(local_url[0])
         else:
             # Fall back to the page's canonical link(s).
             local_url = url
         loader.add_value('url', local_url)
         image_url = product.xpath('.//a/img/@src').extract()
         if image_url:
             loader.add_value('image_url', response.urljoin(image_url[0]))
         loader.add_value('category', category)
         if not product.xpath(
                 'link[@itemprop="availability"][@href="http://schema.org/InStock"]'
         ):
             loader.add_value('stock', 0)
         if loader.get_output_value('price') < 750:
             loader.add_value('shipping_cost', 36)
         yield loader.load_item()
Example #7
0
    def parse_product(self, response):
        """Follow facet links and build a product item from meta tags.

        Every ``.facet-nav`` link is requested with this same callback.
        The item is only yielded when an identifier was found, so pages
        that are not product pages produce requests only.

        :param response: the page response.
        :return: generator of follow-up requests and at most one item.
        """
        for url in response.css('.facet-nav a::attr(href)').extract():
            yield Request(response.urljoin(url), self.parse_product)

        xpath = '//meta[@property="%s"]/@content'
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('identifier', xpath % 'product:retailer_part_no')
        loader.add_xpath('url', '//link[@rel="canonical"]/@href')
        loader.add_xpath('name', xpath % 'og:title')
        loader.add_xpath('price', xpath % 'product:price:amount')
        loader.add_xpath('sku', xpath % 'product:retailer_part_no')
        category = response.xpath(
            '//ul[@itemprop="breadcrumb"]//a/text()').extract()
        # Drop the generic crumbs and the last crumb, tolerating pages
        # whose breadcrumb trail is shorter or missing.
        for crumb in ('Home', 'Products'):
            if crumb in category:
                category.remove(crumb)
        if category:
            category.pop()
        loader.add_value('category', category[-3:])
        loader.add_xpath('image_url', xpath % 'og:image')
        loader.add_xpath('brand', xpath % 'product:brand')
        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', '3.99')

        item = loader.load_item()
        if item.get('identifier'):
            yield item
Example #8
0
 def parse_category(self, response):
     """Yield the products listed on a category page, then its filter pages.

     Each ``div`` typed ``Product`` becomes one item.  Filter pages (one per
     five-character filter input id) are only requested when the URL has no
     ``pn`` query parameter and does not already look like a filtered
     category URL.

     :param response: the category page response.
     :return: generator of product items followed by requests.
     """
     last_crumb = response.css('li.last::text').extract()
     for node in response.xpath('//div[@typeof="Product"]'):
         link = node.xpath('.//*[@property="url"]/@href').extract_first()

         loader = ProductLoader(Product(), selector=node)
         for field in ('identifier', 'sku'):
             loader.add_xpath(field, './/*[@property="url"]/@sku')
         loader.add_value('url', response.urljoin(link))
         loader.add_xpath('name', './/*[@property="url"]/text()')
         loader.add_xpath('price', './/*[@property="price"]/text()')
         # Page-wide breadcrumb links first, then the trailing crumb text.
         loader.add_xpath('category', '//li[@typeof="v:Breadcrumb"]/a/text()')
         loader.add_value('category', last_crumb)
         loader.add_xpath('image_url', './/*[@property="image"]/@content')

         if loader.get_output_value('price') < 50:
             loader.add_value('shipping_cost', '9.95')
         out_of_stock = product_button = node.xpath(
             './/button[starts-with(@id, "outOfStock")]')
         if out_of_stock:
             loader.add_value('stock', 0)
         yield loader.load_item()

     if url_query_parameter(response.url, 'pn') or re.search('/cat_.+/.', response.url):
         return
     filter_ids = response.css('ul.filters input::attr(id)').re('^\S{5}$')
     for filter_id in filter_ids:
         yield Request(response.url + '/' + filter_id, self.parse_category)
Example #9
0
    def parse_product(self, response):
        """Build a product item from a product page.

        Pages without a "Product Code" yield nothing.  The category comes
        from ``response.meta['product']`` when present, otherwise from
        ``self.get_category`` with the site category as a fallback.
        Shipping of 4.95 is added for prices below 50, and ``stock`` is
        '1' when a price was found and '0' otherwise.

        :param response: the product page response; its ``meta`` carries
            the category information.
        :return: generator yielding at most one product item.
        """
        product_code = response.xpath('//div[@id="habtat-sku"]/text()').re(
            'Product Code: (\d+)')
        if not product_code:
            return

        loader = ProductLoader(item=Product(), response=response)
        for field in ('identifier', 'sku'):
            loader.add_value(field, product_code)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//div[@class="product-name"]/h1//text()')
        loader.add_xpath('name', '//div/text()', re='Colour.*:(.+)')

        meta = response.meta
        if 'product' in meta:
            category = meta['product']['category'].split(' > ')
        else:
            fallback_category = meta['website_categories']
            candidates = meta['category']
            keywords = meta.get('kwrds', '')
            category = self.get_category(
                loader.get_output_value('name'), candidates, keywords)
            if not category:
                category = fallback_category.split(' > ')
        loader.add_value('category', category)

        loader.add_xpath(
            'price',
            '//div[@class="price-info"]//span[contains(@id, "product-price")]//span/text()'
        )
        # The special price is only consulted when the regular one is absent.
        if not loader.get_output_value('price'):
            loader.add_xpath(
                'price',
                '//p[@class="special-price"]/span[@class="price"]/text()')
        price = loader.get_output_value('price')
        if price and Decimal(price) < 50.0:
            loader.add_value('shipping_cost', '4.95')

        images = response.xpath(
            '//div[@class="product-img-box"]/div/a/@href').extract()
        if images:
            loader.add_value('image_url', response.urljoin(images[0]))

        loader.add_value('stock', '1' if price else '0')

        yield loader.load_item()
Example #10
0
    def parse_product(self, response):
        """Yield the displayed product and request every option combination.

        The identifier is ``<item id>-<product id>`` taken from two hidden
        inputs; a page missing either one is logged and skipped.  The item
        is yielded only if its identifier is not yet in
        ``self.identifiers``.  After that, one ajax request per combination
        of the select options is issued with ``self.parse_option`` as the
        callback.

        :param response: the product page response; ``meta['category']``
            is required.
        :return: generator of at most one item followed by requests.
        """
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('category', response.meta['category'])
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        option_name = response.css('.label-select-container').xpath(
            './/option[@selected]/text()').extract()
        loader.add_value('name', option_name)
        item_identifier = response.xpath(
            '//input[@id="item_details_item_id"]/@value').extract_first()
        product_identifier = response.xpath(
            '//input[@id="item_details_product_id"]/@value').extract_first()
        if not item_identifier or not product_identifier:
            # Without both parts no identifier can be built; stop here
            # instead of failing on ``None`` concatenation below.
            self.logger.warning('No identifier on %s' % response.url)
            return
        identifier = item_identifier + '-' + product_identifier
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)
        loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
        order_code = response.css('.order-code').xpath(
            'text()').extract_first()
        sku = [order_code.strip()] if order_code is not None else []
        sku.extend(response.css('.order-code span::text').extract())
        loader.add_value('sku', ' '.join(sku))
        loader.add_xpath('image_url', '//img[@id="imageMain"]/@src')
        loader.add_css('brand', '.sku_kc_brand_id_ ::text')
        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', '2.99')
        stock = response.xpath(
            '//meta[@itemprop="availability"]/@content').extract_first()
        # A missing availability tag is treated as an unknown status.
        stock = (stock or '').replace(' ', '').lower()
        if stock not in self.instock:
            loader.add_value('stock', 0)
            if stock not in self.outofstock:
                self.logger.warning('Undefined stock status for %s' %
                                    response.url)
        item = loader.load_item()
        if item['identifier'] not in self.identifiers:
            self.identifiers.add(item['identifier'])
            yield item

        attributes = []
        options = []
        for attribute in response.css('.label-select-container select'):
            attribute_name = attribute.xpath('@id').extract_first()
            # The select id carries the item id as a suffix; strip it.
            attribute_name = attribute_name.replace('_%s' % item_identifier,
                                                    '')
            attributes.append(attribute_name)
            options.append(attribute.xpath('option/@value').extract())

        # Identical for every variant request, so build them once.
        base_url = 'http://www.kiddicare.com/ajax.get_exact_product.php?instart_disable_injection=true&item_id=%s' % item_identifier
        meta = dict(response.meta, sku=sku, attributes=attributes)
        for variant in itertools.product(*options):
            url = base_url
            for attribute_name, option in zip(attributes, variant):
                url += '&attributes[%s]=%s' % (attribute_name, option)
            url = url.replace('+', '%2B')
            yield Request(url, self.parse_option, meta=meta)
Example #11
0
 def parse_price_from_cart(self, response):
     """Replace the price of a pending product with the one shown in the cart.

     The product comes from ``response.meta['product']``.  Its shipping
     cost is set to 9.9 when the new price is below 200 and to 0 otherwise.

     :param response: the cart page response.
     :return: generator yielding the updated product item.
     """
     pending_product = response.meta['product']
     loader = ProductLoader(item=pending_product, response=response)
     cart_price_xpath = (
         '//td[@class="right"]/div[@class="prodetail-price"][1]/text()')
     loader.replace_xpath('price', cart_price_xpath)
     if loader.get_output_value('price') < 200:
         shipping_cost = 9.9
     else:
         shipping_cost = 0
     loader.replace_value('shipping_cost', shipping_cost)
     yield loader.load_item()
Example #12
0
    def parse_node(self, response, node):
        """Build a product item from one feed node.

        Nodes whose id is not a key of ``self.id_code_map`` are ignored.
        The name is the parent title (or the plain title) followed by
        material, colour and size when present.  When the description
        states a pack size, the price is multiplied by it and the item is
        passed on to ``self.parse_pack_price`` through a request to the
        product URL instead of being yielded directly.

        :param response: the feed response (not read here).
        :param node: selector for a single feed entry.
        :return: generator yielding one item or one request.
        """
        identifier = node.xpath(
            './*[local-name()="id"]/text()').extract_first()
        if identifier not in self.id_code_map:
            return
        product_code = self.id_code_map[identifier]
        loader = ProductLoader(item=Product(), selector=node)
        size = node.xpath('./*[local-name()="size"]/text()').extract()
        color = node.xpath('./*[local-name()="color"]/text()').extract()
        material = node.xpath('./*[local-name()="material"]/text()').extract()
        name = node.xpath('./*[local-name()="parent_title"]/text()').extract()
        if not name:
            name = node.xpath('./title/text()').extract()
        if not name:
            self.log('ERROR: no title for feed item %r' % identifier)
            return
        name = name[0]
        if material:
            name += u' {}'.format(material[0])
        if color:
            name += u' {}'.format(color[0])
        if size:
            name += u' {}'.format(size[0])
        price = node.xpath('./*[local-name()="price"]/text()').extract_first()
        pack_size = node.xpath('./description/text()').re(
            r'Pack Size m: *([\d.]+)')
        if pack_size:
            price = extract_price(price) * extract_price(pack_size[0])

        loader.add_value('name', name)
        loader.add_xpath('url', './link/text()')
        loader.add_xpath('image_url', './*[local-name()="image_link"]/text()')
        loader.add_value('identifier', identifier)
        loader.add_value('price', price)
        loader.add_xpath(
            'shipping_cost',
            './*[local-name()="shipping"]/*[local-name()="price"]/text()')
        loader.add_xpath('brand', './*[local-name()="brand"]/text()')
        loader.add_xpath('category',
                         './*[local-name()="google_product_category"]/text()')
        loader.add_xpath('sku', './*[local-name()="mpn"]/text()')
        stock = node.xpath('./*[local-name()="availability"]/text()').extract()
        if stock and stock[0] == 'out of stock':
            loader.add_value('stock', 0)

        item = loader.load_item()

        if product_code in self.cost_prices:
            try:
                cost_price = Decimal(self.cost_prices[product_code])
            except (ArithmeticError, TypeError, ValueError):
                # Decimal() signals an unparsable value with one of these.
                self.log('ERROR: unable to set cost price for item %r' % item)
            else:
                item['metadata'] = {'cost_price': str(cost_price)}

        if pack_size:
            yield Request(loader.get_output_value('url'),
                          self.parse_pack_price,
                          meta={'item': item})
        else:
            yield item
Example #13
0
 def parse_product(self, response):
     """Build product items from the ``.nosto_product`` block of a page.

     Without configurable options a single item is yielded, priced at the
     list price; when the sales price differs it is stored under
     ``metadata['SalesPrice']``.  With exactly one option attribute, one
     item per option is yielded with the option's price deltas applied.
     Pages with more than one option attribute are logged and skipped.

     :param response: the product page response.
     :return: generator of product items.
     """
     loader = ProductLoader(item=Product(), response=response)
     css = '.nosto_product .%s ::text'
     loader.add_css('identifier', css % 'product_id')
     loader.add_css('sku', css % 'product_id')
     for field in ('url', 'name', 'image_url', 'brand'):
         loader.add_css(field, css % field)
     list_price = response.css(css % 'list_price').extract_first()
     sales_price = response.css(css % 'price').extract_first()
     loader.add_value('price', list_price)
     # Missing nodes are treated as empty text rather than crashing.
     availability = response.css(css % 'availability').extract_first() or ''
     if 'InStock' not in availability:
         loader.add_value('stock', 0)
     category = response.css(css % 'category').extract_first() or ''
     loader.add_value('category', category.split('/')[-1])

     options_data = response.xpath('//script/text()').re(
         'Product.Config.({.+})')
     attributes = {}
     if options_data:
         attributes = json.loads(options_data[0])['attributes']
     if len(attributes) > 1:
         self.log('More than one options attributes found on %s' %
                  response.url)
         return
     if not attributes:
         item = loader.load_item()
         if sales_price and sales_price != list_price:
             item['metadata'] = {'SalesPrice': Decimal(sales_price)}
         yield item
         return

     # Exactly one attribute is left; take it without relying on
     # ``dict.values()`` being indexable.
     attribute = next(iter(attributes.values()))
     price = loader.get_output_value('price')
     name = loader.get_output_value('name')
     # NOTE(review): assumes a sales price is present whenever the page
     # has options -- confirm against real pages.
     sales_price = Decimal(sales_price)
     for option in attribute['options']:
         option_id = option['products'][0]
         option_list_price = price + Decimal(option['oldPrice'])
         option_sales_price = sales_price + Decimal(option['price'])
         loader.replace_value('price', option_list_price)
         loader.replace_value('name', name + ' ' + option['label'])
         loader.replace_value('identifier', option_id)
         loader.replace_value('sku', option_id)
         loader.replace_xpath(
             'image_url',
             '//li[@id="simple-product-image-%s"]/a/@href' % option_id)
         item = loader.load_item()
         if option_list_price != option_sales_price:
             item['metadata'] = {'SalesPrice': option_sales_price}
         yield item
Example #14
0
    def parse_product(self, response):
        """Yield the main product and one item per entry of ``variants:``.

        Structured data supplies the name; pages without it, without a
        name, or without an offer price yield nothing.  Variants are read
        from the first body line containing ``variants:`` and decoded with
        ``demjson``; each one overrides name, sku, identifier and price of
        a copy of the main product.

        :param response: the product page response.
        :return: generator of product items.
        """
        try:
            pdata = SpiderSchema(response).get_product()
        except Exception:
            self.logger.error('No structured product data on %s' %response.url)
            return

        options = None
        js_line = next(
            (line for line in response.body.split('\n')
             if 'variants:' in line),
            '')
        if js_line:
            # The captured text ends with "};"; those two characters are
            # cut off before decoding.  Lines without "};" carry nothing
            # that can be decoded.
            variants = re.search(r'variants:(.*};)', js_line)
            if variants:
                options = demjson.decode(variants.group(1)[:-2].strip())

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_css('sku', 'span.pd_productVariant::text')
        product_loader.add_xpath('identifier', '//input[@name="productId"]/@value')
        product_loader.add_value('url', response.url)
        try:
            product_loader.add_value('name', pdata['name'])
        except KeyError:
            return
        # First and last breadcrumb links are dropped.
        category = response.xpath('//*[@id="breadcrumb"]//a/text()').extract()[1:-1]
        product_loader.add_value('category', category)
        img = response.xpath('//meta[@property="og:image"]/@content').extract()
        if img:
            product_loader.add_value('image_url', response.urljoin(img.pop()))
        price = response.xpath('//p[@class="productOfferPrice"]/text()').extract_first()
        if price is None:
            self.logger.error('No price on %s' % response.url)
            return
        product_loader.add_value('price', price)
        if product_loader.get_output_value('price') < 45:
            product_loader.add_value('shipping_cost', '3.5')
        brand = response.xpath('//*[@id="brandHeader"]/a/@href').extract()
        if brand:
            # Turn the brand link into a bare name; links that still
            # contain a "/" afterwards are not used.
            brand = brand[0].replace('/en/', '')[:-1]
            if '/' not in brand:
                product_loader.add_value('brand', brand)
        stock = response.xpath('//link[@itemprop="availability"]/@href').extract_first()
        if stock != 'http://schema.org/InStock':
            product_loader.add_value('stock', 0)
        product = product_loader.load_item()

        yield product

        if options:
            for k, val in options.items():
                option_name = k.replace('_', ' ')
                option_product = Product(product)
                option_product['name'] = product['name'] + ' ' + option_name
                option_product['sku'] = val['productCode']
                option_product['identifier'] = val['variantId']
                option_product['price'] = extract_price(val['nowPrice'])
                yield option_product
Example #15
0
    def parse_product(self, response):
        """Build the product item, or one item per option combination.

        Shipping is ``self.shipping_cost`` when set, overridden by 33 for
        the "Doors, Joinery & Windows" category and 20 for "Flooring".
        When the page carries a ``Product.Config(...)`` script, every
        combination of its attribute options becomes its own item, priced
        at the base price plus the option prices; otherwise the plain
        product is yielded.

        :param response: the product page response.
        :return: generator of product items.
        """
        category = response.xpath(
            '//div[@class="breadcrumbs"]//li[position()>1]/a/@title').extract(
            )
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('identifier', '//input[@name="product"]/@value')
        loader.add_xpath('sku', '//meta[@itemprop="sku"]/@content')
        loader.add_xpath('url', '//link[@rel="canonical"]/@href')
        loader.add_xpath('name', '//div[@itemprop="name"]/h1/text()')
        loader.add_xpath(
            'price', '//meta[@property="og:product:price:amount"]/@content')
        loader.add_xpath(
            'price',
            '//span[@id="product-price-%s"]//span[@class="price"]/text()' %
            loader.get_output_value('identifier'))
        loader.add_value('category', category)
        loader.add_xpath('image_url',
                         '//div[@class="product-img-box"]//img/@src')
        loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        if self.shipping_cost:
            loader.add_value('shipping_cost', self.shipping_cost)
        if not response.xpath('//*[@class="availability in-stock"]'):
            loader.add_value('stock', 0)
        product = loader.load_item()

        if 'Doors, Joinery & Windows' in category:
            product['shipping_cost'] = Decimal('33')
        elif 'Flooring' in category:
            product['shipping_cost'] = Decimal('20')

        config = response.xpath('//script/text()').re(
            r'Product.Config\((.+)\);')
        if not config:
            yield product
            return

        data = json.loads(config[0])
        baseprice = Decimal(data['basePrice'])
        attributes = data['attributes']
        options = [attributes[attribute_id]['options']
                   for attribute_id in attributes]
        for variant in itertools.product(*options):
            item = Product(product)
            price = baseprice
            for option in variant:
                item['identifier'] += '-' + option['id']
                item['name'] += ' ' + option['label'].strip()
                price += Decimal(option['price'])
            if variant:
                # The 1.2 factor (presumably VAT -- confirm) is applied
                # once to the summed price, not once per option, so it no
                # longer compounds when several attributes are combined.
                price *= Decimal('1.2')
            item['price'] = price
            yield item
Example #16
0
    def parse_product(self, response):
        """Build a product item, fetching the price from the cart if needed.

        The price is assembled from the digit groups of the strike-price
        block.  When none are found, an add-to-cart request is issued and
        ``self.parse_price_from_cart`` completes the item; otherwise the
        item is yielded directly.

        :param response: the product page response.
        :return: generator yielding one item or one request.
        """
        loader = ProductLoader(item=Product(), response=response)
        loader.add_css('name', '.pro-des::text')

        # Digit groups joined with dots.
        digit_groups = response.xpath(
            '//div[@class="price-strike"]/div/span//text()').re('\d+')
        price = '.'.join(digit_groups)
        loader.add_value('price', price)

        loader.add_xpath('sku', '//div[@class="short-desc"]/span//text()')

        # The identifier combines the product id and the price id.
        productid = response.xpath(
            '//input[@id="selectedProductIdd"]/@value').extract()[0]
        priceid = response.xpath('//input[@id="priceId"]/@value').extract()[0]
        loader.add_value('identifier', '-'.join((productid, priceid)))

        loader.add_xpath(
            'category',
            "//div[@class='bread']//li[position() > 1]//text()[not(contains(., '>'))]"
        )
        loader.add_xpath('image_url', "//meta[@property='og:image']/@content")
        loader.add_value('url', response.url)
        loader.add_xpath('brand', '//div[@class="added-item"]/h2/text()')

        if loader.get_output_value('price') < 200:
            shipping_cost = 9.9
        else:
            shipping_cost = 0
        loader.add_value('shipping_cost', shipping_cost)
        product = loader.load_item()

        if price:
            yield product
            return

        storeid = response.xpath(
            '//input[@id="storeId"]/@value').extract()[0]
        cart_url = 'http://www.courts.com.sg/home/addtocart.html?isAdd=true&newProduct=true&productId=%s&selectedCurrency=SGD&quantity=1&cartId=na&addQuantity=true&newQuantity=1&shippingOption=&shippingCity=&deliveryOption=&shippingDate=&cityId=&title=&inventorysensible=yes&priceId=%s&storeId=%s'
        request_meta = {
            'product': Product(product),
            'dont_merge_cookies': True
        }
        yield Request(cart_url % (productid, priceid, storeid),
                      callback=self.parse_price_from_cart,
                      meta=request_meta)
Example #17
0
    def parse_product(self, response):
        """Request every variant of a product and yield the displayed one.

        For each variant element the variant form is re-submitted with that
        variant's value, using this same callback.  If the page's sku does
        not match the ``skuId`` query parameter, the page is re-requested
        with the right ``skuId`` and no item is produced from this
        response.  An item is only yielded when a price was found.

        :param response: the product page response.
        :return: generator of requests and at most one product item.
        """
        if response.url.endswith('page-not-found.page'):
            return
        formdata = {}
        for inp in response.xpath('//form[@id="variant-form"]//input'):
            formdata[inp.xpath('@name').extract_first()] = inp.xpath(
                '@value').extract_first()
        if not formdata:
            self.logger.warning('No data on %s' % response.url)
            return
        # Inputs without a ``name`` attribute end up under the key None;
        # drop that entry when it exists.
        formdata.pop(None, None)
        options = response.css('.vContainer .variantDataElement')
        for option in options:
            formdata[option.xpath('@name').extract_first()] = option.xpath(
                '@data-variant-value').extract_first()
            r = FormRequest.from_response(
                response,
                formxpath='//form[@id="variant-form"]',
                formdata=formdata,
                callback=self.parse_product)
            yield r

        loader = ProductLoader(item=Product(), response=response)
        sku = response.xpath('//input[@id="skuIdVal"]/@value').extract_first()
        if sku != url_query_parameter(response.url, 'skuId'):
            url = add_or_replace_parameter(url_query_cleaner(response.url),
                                           'skuId', sku)
            yield Request(url, self.parse_product)
            return
        loader.add_value('identifier', sku)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@id="productLabel"]//text()')
        loader.add_css('price', '.current-price ::text')
        loader.add_value('sku', sku)
        category = response.xpath(
            '//div[@id="breadcrumb"]//li//span[@itemprop="title"]/text()'
        ).extract()
        # Up to three breadcrumbs preceding the last one.
        loader.add_value('category', category[-4:-1])
        image_url = response.xpath(
            '//img[@itemprop="image"]/@src').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        loader.add_xpath(
            'brand', '//div[@itemprop="brand"]//span[@itemprop="name"]/text()')
        loader.add_value('shipping_cost', 3)
        # No add-to-basket element means the product cannot be ordered.
        if not response.css('.add-to-basket'):
            loader.add_value('stock', 0)
        if loader.get_output_value('price'):
            yield loader.load_item()
Example #18
0
    def parse_product(self, response):
        """Scrape a product page and emit one item per selectable option.

        A base item is loaded from page-level inputs and the last three
        breadcrumb entries.  When the page has no ``js-simple-selector``
        options, the base item is yielded only if it has an identifier.
        Otherwise every option yields a copy of the base item whose
        identifier, price, image and stock are replaced by the option's
        own values and whose name gets the option text appended.
        """
        breadcrumb = response.xpath(
            '//li[@class="blockBreadcrumb__item"]/a/text()').extract()[-3:]

        base_loader = ProductLoader(item=Product(), response=response)
        base_loader.add_xpath('identifier',
                              '//input[@name="simpleSku"]/@value')
        base_loader.add_xpath('sku', '//input[@id="configSku"]/@value')
        base_loader.add_value('url', response.url)
        base_loader.add_xpath('name',
                              '//h1[contains(@class, "__heading")]/text()')
        base_loader.add_xpath('name',
                              '//input[@name="simpleSku"]/../span/text()')
        base_loader.add_xpath('image_url',
                              '//div[@class="layoutImage"]//img/@src')
        base_loader.add_xpath('price', '//input[@id="price"]/@value')
        base_loader.add_xpath('brand', '//input[@id="brand"]/@value')
        base_loader.add_value('category', breadcrumb)
        base_loader.add_xpath('stock', '//@data-instock')
        base_item = base_loader.load_item()

        variant_options = response.xpath(
            '//select[@id="js-simple-selector"]/option')
        if not variant_options:
            # Single-variant page: emit the base item if it is identifiable.
            if base_loader.get_output_value('identifier'):
                yield base_item
            return

        # Per-variant blocks are looked up by the option's simple SKU.
        price_xpath = ('//div[@data-simple-sku="%s"]'
                       '//span[contains(@class, "actualPrice")]/text()')
        image_xpath = ('//div[@data-simple-sku="%s"]'
                       '/a[contains(@class, "link_selected")]'
                       '/@data-product-image')
        for variant_option in variant_options:
            variant_loader = ProductLoader(item=Product(base_item),
                                           selector=variant_option)
            variant_loader.replace_xpath('identifier', './@value')
            variant_loader.add_xpath('name', './text()')
            variant_id = variant_loader.get_output_value('identifier')
            variant_loader.replace_value(
                'price', response.xpath(price_xpath % variant_id).extract())
            variant_loader.replace_value(
                'image_url',
                response.xpath(image_xpath % variant_id).extract())
            variant_loader.replace_xpath('stock', './@data-instock')
            yield variant_loader.load_item()
Example #19
0
 def parse_products(self, response):
     """Parse a product listing page.

     Yields a request back to this callback for every link found under
     "Filter by Manufacturers" and for the "next" pagination link.  For
     each ``.grid`` element it yields the loaded item when the listing
     shows a price, or otherwise a request to ``self.parse_product`` with
     the partial item in ``meta['product']``.

     Product identifiers come from the ``ecomm_prodid`` array in the page
     scripts and are matched to ``.grid`` elements by position.  If that
     array is missing the method stops after the manufacturer requests.
     """
     for filter_url in response.css(
             '.leftoption :contains("Filter by Manufacturers")').xpath(
                 'following-sibling::*//a/@href').extract():
         yield Request(response.urljoin(filter_url),
                       callback=self.parse_products)
     # Escape the bare "<" that follows "Estimated" before re-parsing the
     # body (presumably it confuses the HTML parser otherwise).
     text = re.sub('Estimated *<', 'Estimated &lt;', response.body)
     selector = Selector(text=text)
     category = selector.css('.crumword').xpath(
         './/*[@itemprop="title"]/text()').extract()
     try:
         identifiers = selector.xpath('//script/text()').re(
             r'ecomm_prodid: *\[(.+)\]')[0].replace("'", '').split(',')
     except IndexError:
         # No identifier array in the scripts: nothing to scrape here.
         return
     next_page_url = response.xpath(
         '//div[@class="pagination"]/a[@class="next"]/@href').extract()
     if next_page_url:
         yield Request(response.urljoin(next_page_url[0]),
                       callback=self.parse_products)
     for num, product in enumerate(selector.css('.grid')):
         loader = ProductLoader(item=Product(), selector=product)
         identifier = identifiers[num]
         loader.add_value('identifier', identifier)
         url = product.xpath('@href').extract_first()
         loader.add_value('url', response.urljoin(url))
         name = product.css('.gridname').xpath('text()').extract()
         loader.add_value('name', name)
         price = product.css('.gridPriceVat').xpath('text()').extract()
         if not price:
             price = 0
         loader.add_value('price', price)
         loader.add_value('sku', identifier)
         loader.add_value('category', category)
         image_url = product.css('.gridimage').xpath('.//@src').extract()
         loader.add_value('image_url', image_url)
         if price and loader.get_output_value('price') < 200:
             loader.add_value('shipping_cost', '4.99')
         # A missing stock label used to raise AttributeError on None and
         # abort the whole page; treat it as "not in stock" instead.
         stock_text = product.css('.pItemStock').xpath(
             'text()').extract_first() or ''
         if 'in stock' not in stock_text.strip().lower():
             loader.add_value('stock', 0)
         item = loader.load_item()
         if price:
             yield item
         else:
             # No listing price: fetch the product page to complete it.
             yield Request(response.urljoin(url),
                           self.parse_product,
                           meta={'product': Product(item)})
Example #20
0
    def parse_category(self, response):
        """Parse a category listing, follow its products and the next page.

        For every schema product that has a ``sku`` a request to
        ``self.parse_product`` is yielded with the partially loaded item in
        ``meta['item']``.  If at least one such product was found, a request
        for the following listing page is yielded as well, carrying the
        category in ``meta['category']``.
        """
        try:
            data = SpiderSchema(response).get_products()
        except Exception:
            # Best effort: a page without usable schema data ends the crawl
            # of this listing instead of raising.
            self.logger.debug('No product schema data on %s', response.url)
            return

        # The breadcrumb is page-level, so resolve it once rather than per
        # product; fall back to the category handed over by the caller.
        category = response.css('a.GTM-breadcumb::text').extract(
        )[1:] or response.meta.get('category')

        has_products = False
        for product in data:
            if not product.get('sku'):
                continue
            has_products = True
            loader = ProductLoader(Product(), response=response)
            loader.add_value('identifier', product['sku'])
            loader.add_value('url', product['url'][0])
            loader.add_value('name', product['name'])
            loader.add_value('sku', product['sku'])
            loader.add_value('category', category)
            loader.add_value('image_url', product['image'])
            loader.add_value('brand', product['brand'])
            if product['offers']['properties']['availability'] != 'in stock':
                loader.add_value('stock', 0)
            yield Request(loader.get_output_value('url'),
                          self.parse_product,
                          meta={'item': Product(loader.load_item())})
        if not has_products:
            # An empty page marks the end of the pagination.
            return

        page = url_query_parameter(response.url, 'page')
        if page:
            url = add_or_replace_parameter(response.url, 'page', int(page) + 1)
        else:
            # First (non-ajax) page: switch to the ajax listing endpoint.
            id_families = response.xpath(
                '//input[@data-key="idFamilies"]/@value').extract_first()
            if id_families:
                url = add_or_replace_parameter(
                    'https://www.pccomponentes.pt/listado/ajax?page=0&order=price-desc',
                    'idFamilies[]', id_families)
            elif response.url.endswith('/novedades/'):
                return
            elif response.url.endswith('/'):
                url = response.url + 'ajax?page=0&order=price-desc'
            else:
                return

        yield Request(url, self.parse_category, meta={'category': category})
 def parse_treatment(self, response):
     """Yield one product per size row below the first "Code" header cell.

     Only the first four rows following the header row are considered, and
     rows without a SKU in the second column are skipped.  Once all rows
     have been processed ``self.treatment`` is set to ``True``.  If the
     page has no "Code" cell a message is logged and nothing is yielded.
     """
     code_cells = response.xpath('//tr/td[(text()="Code")][1]')
     if not code_cells:
         # Previously an IndexError; an unexpected layout is just skipped.
         self.log('No products found on %s' % response.url)
         return
     header_cell = code_cells[0]
     for size in header_cell.xpath(
             './../following-sibling::tr[position()<5]'):
         loader = ProductLoader(item=Product(), selector=size)
         size_name = size.xpath('td[1]/text()').extract()
         loader.add_value('name', size_name)
         loader.add_xpath('sku', 'td[2]/text()')
         loader.add_xpath('price', 'td[3]/text()')
         if not loader.get_output_value('sku'):
             continue
         loader.add_xpath('identifier', 'td[2]/text()')
         loader.add_value('url', response.url)
         yield loader.load_item()
     # The loop never breaks, so this always runs once the rows are
     # exhausted (the original expressed it as a for/else).
     self.treatment = True
Example #22
0
 def parse_product(self, response):
     """Load a single product from the page's schema data and yield it.

     Nothing is yielded when the schema helper returns no product.  A
     shipping cost of 9.95 is added for prices below 50, and stock is set
     to 0 unless the offer's availability is the schema.org InStock URL.
     """
     schema_product = SpiderSchema(response).get_product()
     if not schema_product:
         return

     item_loader = ProductLoader(Product(), response=response)
     sku = schema_product['sku']
     item_loader.add_value('identifier', sku)
     item_loader.add_value('url', response.url)
     item_loader.add_value('name', schema_product['name'])
     offer = schema_product['offers']['properties']
     item_loader.add_value('price', offer['price'])
     item_loader.add_value('sku', sku)
     item_loader.add_xpath('category',
                           '//a[@id="breadCrumbDetails"]/text()')
     item_loader.add_value('image_url', schema_product['image'])

     needs_shipping = item_loader.get_output_value('price') < 50
     if needs_shipping:
         item_loader.add_value('shipping_cost', '9.95')

     in_stock = offer['availability'] == 'http://schema.org/InStock'
     if not in_stock:
         item_loader.add_value('stock', 0)

     yield item_loader.load_item()
Example #23
0
    def parse_products(self, response):
        """Parse a product list and request the reviews of every product.

        Each ``.productList .product`` element is loaded into an item, given
        a ``ToyMonitorMeta`` with an empty review list (plus a "3 for 2"
        promotion when the image URL contains ``3for2``), and handed to
        ``self.parse_reviews`` through a Bazaarvoice batch request carrying
        the item in ``meta['product']`` and ``meta['offset'] = 0``.
        """
        category = response.xpath(
            '//div[@id="breadcrumb"]//span[@itemprop="name"]/text()').extract(
            )[2:]
        for product in response.css('.productList .product'):
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('identifier', '@id', re='product-(.+)')
            loader.add_xpath('url', './/@href')
            brand = product.xpath('.//h3/em/text()').extract_first()
            name = product.xpath('.//h3/span/text()').extract_first()
            # A name starting in lower case gets the brand put in front of
            # it.  Guard against a missing/empty name, which used to raise
            # on ``name[0]``.
            if name and name[0].islower():
                loader.add_value('name', brand)
            loader.add_value('name', name)
            loader.add_css('price', '.productPrice dd:last-child::text')
            loader.add_xpath('sku', '@id', re='product-(.+)')
            loader.add_value('category', category)
            loader.add_css('image_url', '.productMainImage img::attr(src)')
            image_url = loader.get_output_value('image_url')
            promotion = None
            if image_url and '3for2' in image_url:
                promotion = '3 for 2'
            loader.add_value('brand', brand)
            loader.add_value('shipping_cost', '3.99')
            # A missing stock element used to raise on ``None.title()``;
            # it is now treated as out of stock.
            stock = (product.css('.productStock dd').extract_first()
                     or '').title()
            if 'In Stock' not in stock and 'Low Stock' not in stock:
                loader.add_value('stock', 0)
            item = loader.load_item()

            metadata = ToyMonitorMeta()
            metadata['reviews'] = []
            if promotion:
                metadata['promotions'] = promotion
            item['metadata'] = metadata

            # The numeric product id in the URL keys the reviews API.
            prod_id = re.findall(r"/(\d+).prd", item['url'])[0]
            reviews_url = "http://api.bazaarvoice.com/data/batch.json?passkey=35w0b6mavcfmefkhv3fccjwcc&apiversion=5.5&displaycode=17045-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A" + prod_id + "&filter.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&sort.q0=isfeatured%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_reviewcomments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&filter_comments.q0=contentlocale%3Aeq%3Aen_GB%2Cen_IE%2Cen_US&limit.q0=100&offset.q0=0&limit_comments.q0=3&callback=bv_1111_57043"

            yield Request(reviews_url,
                          meta={
                              'product': item,
                              'offset': 0
                          },
                          callback=self.parse_reviews)
Example #24
0
 def parse_product(self, response):
     """Scrape a product page and emit one item per option combination.

     A base item is loaded from the page.  The rows of
     ``table.variabletable`` whose first cell text is listed in
     ``self.options_to_extract`` each contribute a set of ``<option>``
     elements; every combination of those options yields a copy of the
     base item with the option values appended to the identifier/SKU, the
     option labels appended to the name and any "(Add ...)" surcharge
     added to the price.

     When no combination exists (a matching row has no selectable
     options) the base item is yielded on its own.  When no row matches at
     all, ``itertools.product()`` produces a single empty combination, so
     the loop emits the base item exactly once.
     """
     loader = ProductLoader(Product(), response=response)
     loader.add_value('url', response.url)
     category = response.css('div.treemenu a::text').extract()[1:]
     loader.add_value('category', category)
     loader.add_css('image_url',
                    'div#mainimage_holder img::attr(data-zoom-image)')
     identifier = response.xpath(
         '//input[@name="fproduct_id"]/@value').extract_first()
     loader.add_value('identifier', identifier)
     loader.add_value('sku', identifier)
     loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
     loader.add_css('price', 'li.shelfBnormalprice::text')
     if loader.get_output_value('price') < 100:
         loader.add_value('shipping_cost', 10)
     item = loader.load_item()

     attributes = [
         attr for attr in response.css('table.variabletable tr')
         if attr.xpath('td[1]/text()').extract_first()
         in self.options_to_extract
     ]
     options = [
         attr.xpath('td/select/option[not(contains(.,"Please Select"))]')
         for attr in attributes
     ]
     # Materialise the combinations: an itertools.product object is always
     # truthy, so testing the bare iterator could never reach the fallback.
     variants = list(itertools.product(*options))
     if not variants:
         yield item
         return

     for variant in variants:
         loader = ProductLoader(Product(), response=response)
         loader.add_value(None, item)
         identifier = item['identifier']
         price = item['price']
         for option in variant:
             identifier += '-' + option.xpath('@value').extract_first()
             # Option labels look like "Name (Add <surcharge>)".
             name_and_price = option.xpath(
                 'text()').extract_first().split('(Add')
             loader.add_value('name', name_and_price[0])
             if len(name_and_price) > 1:
                 price += extract_price(name_and_price[1])
         loader.replace_value('identifier', identifier)
         loader.replace_value('sku', identifier)
         loader.replace_value('price', price)
         if price >= 100:
             loader.replace_value('shipping_cost', 0)
         yield loader.load_item()
Example #25
0
    def parse_product(self, response):
        """Load name, price, image, brand and category; yield if priced.

        The identifier is the last path segment of the response URL.  The
        item is only yielded when the loader produced a price.
        """
        page = HtmlXPathSelector(response)
        title_parts = page.select('//h1[@class="product-info-head"]/div[1]/text()').extract()
        title = ''.join(title_parts).strip()

        item_loader = ProductLoader(item=Product(), selector=page)
        item_loader.add_value('name', title)
        item_loader.add_xpath('price', ".//span[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" inline price bold productInfo-orgPrice product-info-price-current \")]/text()")

        images = page.select(".//div[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" productPage_image_default \")]/img[1][not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" photo \")]/@src").extract()
        if images:
            # The scheme is prepended to the scraped image source.
            item_loader.add_value('image_url', 'http:' + images[0])

        item_loader.add_xpath('brand', ".//dl[not(@id)][not(@class)][not(@style)]/dd[1][not(@id)][not(@class)][not(@style)]/text()")

        crumb_parts = page.select(".//nav[not(@id)][not(@style)][contains(concat(' ',normalize-space(@class),' '),\" breadcrumbs module small \")]/div[2][not(@id)][not(@class)][not(@style)]/a[1][not(@id)][not(@class)][not(@style)]//text()").extract()
        if crumb_parts:
            item_loader.add_value('category', ''.join(crumb_parts).strip())

        item_loader.add_value('url', response.url)
        item_loader.add_value('identifier', response.url.rsplit('/', 1)[-1])

        if not item_loader.get_output_value('price'):
            return
        yield item_loader.load_item()
Example #26
0
 def parse_product(self, response):
     """Load a single product from its page and yield it.

     The identifier is the run of digits preceding ``_BQ`` in the URL; a
     URL without it is logged and skipped.  The category is the fixed
     value "Bedroom" followed by the last breadcrumb link, if any.  A
     shipping cost of 5 is added for prices below 50.
     """
     id_match = re.search(r'(\d+)_BQ', response.url)
     if not id_match:
         # Previously an AttributeError on ``None.group``.
         self.logger.warning('No product id in %s', response.url)
         return

     loader = ProductLoader(Product(), response=response)
     loader.add_value('identifier', id_match.group(1))
     loader.add_value('url', response.url)
     loader.add_css('name', '.product-summary h1.product-title::text')
     loader.add_css('price', '.product-price::attr(content)')
     loader.add_css('sku', 'dl.product-code dd::text')
     loader.add_value('category', 'Bedroom')
     # Slice instead of indexing so a page without breadcrumb links simply
     # adds no second category level rather than raising IndexError.
     category = response.css('.breadcrumb').xpath(
         './/li/a/text()').extract()[-1:]
     loader.add_value('category', category)
     image_url = response.css('.main-img img::attr(src)').extract_first()
     if image_url:
         loader.add_value('image_url', response.urljoin(image_url))
     loader.add_xpath('brand',
                      '//th[text()="Brand"]/following-sibling::td/text()')
     if loader.get_output_value('price') < 50:
         loader.add_value('shipping_cost', 5)
     yield loader.load_item()
Example #27
0
 def parse_product(self, response):
     """Load a single product from schema data plus page markup.

     Nothing is yielded when the schema helper returns no product or when
     the cleaned URL does not end in ``/<digits>`` (the identifier).  The
     price is read from the ``product_priceExVAT`` span; a shipping cost
     of 5.25 is added for prices below 90.
     """
     schema = SpiderSchema(response)
     pdata = schema.get_product()
     if not pdata:
         return

     id_match = re.search(r'/(\d+)$', url_query_cleaner(response.url))
     if not id_match:
         # Previously an AttributeError on ``None.group``.
         self.logger.warning('No product id in %s', response.url)
         return

     loader = ProductLoader(Product(), response=response)
     loader.add_value('identifier', id_match.group(1))
     loader.add_value('url', response.url)
     loader.add_value('name', pdata['name'])
     loader.add_xpath('price', '//span[@id="product_priceExVAT"]/text()')
     loader.add_value('sku', pdata['productID'])
     category = response.css('p.breadcrumb a::text').extract()[-3:]
     loader.add_value('category', category)
     loader.add_value('image_url', pdata.get('image'))
     # The brand is optional in the schema data, like the image above.
     brand = pdata.get('brand')
     if brand and brand.get('properties'):
         loader.add_value('brand', brand['properties']['name'])
     if loader.get_output_value('price') < 90:
         loader.add_value('shipping_cost', '5.25')
     yield loader.load_item()
Example #28
0
    def parse_product(self, response):
        """Load a product and store it in ``self.products`` under its SKU.

        Nothing is yielded: the loaded item is appended to
        ``self.products[sku]`` (presumably a ``defaultdict(list)`` - verify
        against the spider's initialisation).  Pages without a name are
        ignored.

        A SKU-specific shipping cost from ``self.shipping_costs`` is added
        first when available, followed by the flat 19.99; which of the two
        ends up in the item depends on the loader's output processor.
        """
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('sku', '//script/@data-flix-sku')
        shipping_cost = self.shipping_costs.get(loader.get_output_value('sku'),
                                                None)
        if shipping_cost:
            loader.add_value('shipping_cost', extract_price(shipping_cost))

        loader.add_xpath('identifier',
                         '//input[contains(@id, "SKUID")]/@value')
        name = response.xpath('//h1/text()').extract() or response.xpath(
            '//h2[@itemprop="name"]/text()').extract()
        if not name:
            return
        name = name[0]
        loader.add_value('name', name)
        loader.add_xpath('price', '//span[@class="TotalPrice"]/text()')
        categories = response.xpath(
            '//a[@class="CMSBreadCrumbsLink"]/text()').extract()
        if not categories:
            categories = ''
        loader.add_value('category', categories)

        # Pick the first listed brand that the product name starts with
        # (case-insensitively via title-casing); empty if none matches.
        titled_name = name.title()
        for brand in response.xpath(
                '//div[@title="Brand"]/following-sibling::div//span/@title'
        ).extract():
            if titled_name.startswith(brand.title()):
                break
        else:
            brand = ''
        loader.add_value('brand', brand)
        loader.add_value('shipping_cost', 19.99)
        if 'In stock' not in response.xpath(
                '//span[@class="stock available"]/text()').extract():
            loader.add_value('stock', 0)

        product = loader.load_item()
        self.products[product['sku']].append(product)
Example #29
0
 def parse_item(self, response):
     """Yield one product per ``buy_box internals`` block on the page.

     All products share the canonical URL, the zoom image (when the page
     has one) and the breadcrumb category.  Stock is set to 0 unless the
     block carries a schema.org InStock availability link, and a shipping
     cost of 36 is added for prices below 750.
     """
     url = response.xpath('//link[@rel="canonical"]/@href').extract()
     image_url = response.xpath('//a[@id="zoom1"]/@href').extract()
     if image_url:
         # Indexing unconditionally raised IndexError on pages without a
         # zoom link and lost every product on the page.
         image_url = response.urljoin(image_url[0])
     category = response.xpath(
         '//p[@class="breadcrumbs"]/a[position()>1]/text()').extract()
     for product in response.xpath('//div[@class="buy_box internals"]'):
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('name', 'label[@itemprop="name"]/text()[1]')
         loader.add_xpath('identifier', 'input[@name="product[]"]/@value')
         loader.add_xpath('sku', 'input[@name="product[]"]/@value')
         loader.add_xpath('price', 'label/meta[@itemprop="price"]/@content')
         loader.add_value('url', url)
         if image_url:
             loader.add_value('image_url', image_url)
         loader.add_value('category', category)
         if not product.xpath(
                 'link[@itemprop="availability"][@href="http://schema.org/InStock"]'
         ):
             loader.add_value('stock', 0)
         if loader.get_output_value('price') < 750:
             loader.add_value('shipping_cost', 36)
         yield loader.load_item()
Example #30
0
    def parse_product(self, response):
        base_url = get_base_url(response)

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)
        name = ''.join(
            response.xpath(
                '//h1[@class="PrpdocutName"]//text()').extract()).strip()
        product_loader.add_value('name', name)
        brand = response.xpath(
            '//span[@class="parent_product_manufacture_logo"]/img/@alt'
        ).extract()
        brand = brand[0].strip() if brand else ''
        product_loader.add_value('brand', brand)
        identifier = response.xpath(
            '//input[@name="products_id"]/@value').extract()
        if not identifier:
            identifier = re.findall('custom_product_id=(\d+)', response.body)
        product_loader.add_value('identifier', identifier[0])
        product_loader.add_value('sku', identifier[0])
        category = response.xpath(
            '//div[@class="breadcrumb"]//span[@itemprop="title"]/text()'
        ).extract()[1:-1]
        product_loader.add_value('category', category)

        image_url = response.xpath(
            '//span[@class="image_container"]/img/@src').extract()
        if image_url:
            image_url = response.urljoin(image_url[0])
            product_loader.add_value('image_url', image_url)

        product = product_loader.load_item()

        options = response.xpath(
            '//table[@id="product_price_list"]//tr[not(contains(@class, "HeadingRow"))]'
        )
        if options:
            for option in options:
                prod = Product(product)
                product_loader = ProductLoader(item=prod, response=response)
                option_name = option.xpath(
                    'td/div[@class="subproduct_name"]/text()').extract()
                if option_name:
                    option_name = name + ' ' + option_name[0].strip()
                    product_loader.add_value('name', option_name)
                identifier = option.xpath(
                    './/input[@name="sub_products_id[]"]/@value').extract()
                if not identifier:
                    identifier = option.xpath(
                        './/input[@name="email_me_products_id"]/@value'
                    ).extract()
                if not identifier:
                    identifier = option.xpath(
                        './/input[@name="products_id"]/@value').extract()

                if identifier:
                    product_loader.add_value(
                        'identifier',
                        product['identifier'] + '-' + identifier[0])
                else:
                    log.msg(' >>>>>> Possible wrong identifier: ' +
                            response.url)

                sku = product_loader.get_output_value('identifier')
                product_loader.add_value('sku', sku)
                price = option.xpath(
                    './/span[@class="productSpecialPrice"]/text()').extract()
                if not price:
                    price = option.xpath(
                        './/span[@class="listing-price"]/text()').extract()
                price = price[0] if price else 0
                product_loader.add_value('price', price)
                in_stock = option.xpath(
                    './/span[@class="instock" and text()="In Stock"]').extract(
                    )
                if not in_stock or not product_loader.get_output_value(
                        'price'):
                    product_loader.add_value('stock', 0)
                if product_loader.get_output_value('price') < 70:
                    product_loader.add_value('shipping_cost', Decimal('9.90'))
                yield product_loader.load_item()
        else:
            log.msg(' >>>>> ERROR: NO OPTIONS' + response.url)
            #if product['price'] < 70:
            #    product['shipping_cost'] = Decimal('9.90')
            '''