Esempio n. 1
0
    def parse_product(self, response):
        """Yield a Product for the page, or one Product per dropdown option.

        When the page has an ``ekm-productoptions-dropdown-option`` select,
        every ``<option>`` becomes its own item named "<name> <option text>".
        If the dropdown has an ``onchange`` handler, the option price is
        looked up in the page body (the ``_EKM_PRODUCTPRICE`` assignment that
        follows the ``== '<value>'`` test); otherwise, or when no such
        assignment is found, the main page price is used.  Items whose loader
        produces no price are not yielded.

        :param response: product page response; non-HTML responses are ignored.
        :raises IndexError: if the product name or the main price is missing.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        name = hxs.select(u'//div[@class="product-right"]//div[@class="pp-name"]/h1/text()').extract()[0].strip()
        main_price = hxs.select(u'//div[@class="product-right"]//div[@class="pp-price"]/span/span/text()').extract()[0]
        product_options = hxs.select(u'//select[@class="ekm-productoptions-dropdown-option"]')

        if not product_options:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            loader.add_value('name', name)
            loader.add_value('price', main_price)
            if loader.get_output_value('price'):
                yield loader.load_item()
            return

        # Always bind the flag: it used to be assigned only when the onchange
        # select existed, which raised UnboundLocalError on the other pages.
        set_option_price = bool(product_options.select(u'../select[@onchange]'))
        body = response.body.replace('\xc2', ' ') if set_option_price else None

        for option in product_options.select(u'./option'):
            name_with_option = name + u' %s' % option.select(u'./text()').extract()[0].strip()
            option_value = option.select(u'./@value').extract()[0]

            price = main_price
            if set_option_price:
                # Escape the value so regex metacharacters in it are literal.
                match = re.search(
                    r"== '%s'.*?_EKM_PRODUCTPRICE.*?= '([\d\.]+?)'" % re.escape(option_value),
                    body, re.DOTALL)
                # No price rule for this option: keep the main page price.
                if match:
                    price = match.group(1)

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', name_with_option)
            loader.add_value('price', price)
            loader.add_value('url', response.url)
            if loader.get_output_value('price'):
                yield loader.load_item()
Esempio n. 2
0
    def parse_product(self, response):
        """Yield one Product per entry of the options list, or one for the page.

        The special price is preferred over the regular price.  For every
        ``options-list`` entry the option label is appended to the product
        name and the option surcharge (if any) is added to the base price.
        Items whose loader produces no price are not yielded.

        :param response: product page response; non-HTML responses are ignored.
        :raises IndexError: if the name, the price or an option label is missing.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        name = hxs.select(u'//div[@class="product-name"]/h1/text()').extract()[0]
        price_texts = hxs.select(u'//p[@class="special-price"]/span[@class="price"]/text()').extract()
        if not price_texts:
            price_texts = hxs.select(u'//span[@class="regular-price"]/span[@class="price"]/text()').extract()
        base_price = price_texts[0]

        option_nodes = hxs.select(u'//ul[@class="options-list"]/li')
        if not option_nodes:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value("url", response.url)
            loader.add_value("name", name)
            loader.add_value("price", base_price)
            if loader.get_output_value("price"):
                yield loader.load_item()
            return

        # The pound sign is removed before the decimal arithmetic below.
        base_amount = Decimal(base_price.replace(u"\xa3", u""))
        for option_node in option_nodes:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value("url", response.url)

            label = option_node.select(u'./span[@class="label"]/label/text()').extract()[0]
            loader.add_value("name", name + u" %s" % label)

            surcharge_texts = option_node.select(u'./span[@class="label"]/label/span/span/text()').extract()
            surcharge_text = surcharge_texts[0].replace(u"\xa3", u"") if surcharge_texts else u""
            if surcharge_text:
                surcharge = Decimal(surcharge_text)
            else:
                surcharge = Decimal("0.00")

            loader.add_value("price", base_amount + surcharge)
            if loader.get_output_value("price"):
                yield loader.load_item()
Esempio n. 3
0
    def parse_product(self, response):
        """Yield the product item, or one price Request per product option.

        Without options the loaded item is yielded directly.  With options,
        a copy of the item (name extended with the option label) is attached
        to a ``remote.php`` request whose callback is ``self.parse_price``.

        :param response: product page response.
        :raises IndexError: if options exist but the ``product_id`` input,
            an option label or an option value is missing.
        """
        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        xpath_fields = (
            ('name', u'//h2/text()'),
            ('price', u'//em[contains(@class,"ProductPrice")]/text()'),
            ('sku', u'//span[@class="VariationProductSKU"]/text()'),
            ('category', u'//div[@id="ProductBreadcrumb"]/ul/ul/li[2]/a/text()'),
            ('image_url', u'//div[@class="ProductThumbImage"]/a/img/@src'),
            ('brand', u'//div[@class="Value"]/a/text()'),
        )
        for field, xpath in xpath_fields:
            product_loader.add_xpath(field, xpath)
        product_loader.add_value('shipping_cost', '')

        options = hxs.select(u'//div[@class="DetailRow"]//ul/li/label/input/../..')
        if not options:
            yield product_loader.load_item()
            return

        product_id = hxs.select(u'//input[@name="product_id"]/@value').extract()[0]
        base_product = product_loader.load_item()
        for opt in options:
            label = opt.select(u'.//input/../text()[2]').extract()
            if not label:
                # Alternative markup: the label is split over two spans.
                label = opt.select(u'concat(.//input/../span[1]/text(),.//input/../span[2]/text())').extract()
            option_value = opt.select(u'.//input/@value').extract()

            product = Product(base_product)
            product['name'] = (product['name'] + ' ' + label[0].strip()).strip()
            price_url = ('http://www.midwestunlimited.com/remote.php' +
                         '?w=GetVariationOptions&productId=' + product_id +
                         '&options=' + option_value[0])
            yield Request(price_url, meta={'product': product}, callback=self.parse_price)
Esempio n. 4
0
 def parse_products(self, response):
     """Yield one Product per listing entry of the ``area-2`` section.

     Two layouts are supported: a grid of ``grid-25`` divs and, when no
     grid entries exist, table rows with class ``prd first``.  Prices are
     passed through ``self._encode_price`` before being loaded.  Entries
     for which no price text can be found are skipped.

     :param response: listing page response.
     """
     hxs = HtmlXPathSelector(response)
     products = hxs.select('//*[@id="area-2"]//div[@class="grid-25"]')
     if products:
         for product in products:
             price = product.select('div/div/p[@class="prd-amount"]/strong/text()').extract()
             if not price:
                 # Used to raise IndexError and abort the whole page.
                 continue
             loader = ProductLoader(item=Product(), selector=product)
             loader.add_xpath('url', 'div/h3/a/@href')
             # Prefer the abbr title when the link text carries one.
             if product.select('div/h3/a/abbr/@title'):
                 loader.add_xpath('name', 'div/h3/a/abbr/@title')
             else:
                 loader.add_xpath('name', 'div/h3/a/text()')
             loader.add_value('price', self._encode_price(price[0]))
             yield loader.load_item()
         return

     for product in hxs.select('//*[@id="area-2"]//tr[@class="prd first"]'):
         price = product.select('td/p/strong/text()').extract()
         if not price:
             price = product.select('td/div/p/strong/text()').extract()
         if not price:
             # Previously ``price`` stayed unbound (NameError on the first
             # row) or kept the previous row's value (wrong price emitted).
             continue
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('url', 'td/h3/a/@href')
         loader.add_xpath('name', 'td/h3/a/text()')
         loader.add_value('price', self._encode_price(price[0]))
         yield loader.load_item()
Esempio n. 5
0
    def parse_product(self, response):
        """Yield Products from a single-product page or a multi-product page.

        A page with a ``productSpecification`` table yields one item (name
        from the first table row, sku/identifier from the second row).
        Otherwise every ``productInformation`` block yields one item; there
        the manufacturer code is appended to the name unless the name already
        contains it, and the sku is the first run of digits in the first
        paragraph.  Blocks without a link are skipped.

        :param response: product page response; non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        prod_name = hxs.select('//div[@id="productSpecification"]/div/table/tr[1]/td[2]/text()').extract()
        if prod_name:
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('url', urljoin_rfc(base_url, response.url))
            loader.add_value('name', prod_name[0])
            sku = hxs.select('//div[@id="productSpecification"]/div/table/tr[2]/td[2]/text()').extract()
            if sku:
                loader.add_value('sku', sku[0])
                loader.add_value('identifier', sku[0])
            price = ''.join(hxs.select('//div[@id="productAddToCart"]/div/b/text()').extract())
            if price:
                loader.add_value('price', price)
            yield loader.load_item()
            return

        # Several products listed on one page.
        for prod in hxs.select('//div[@class="productInformation"]'):
            href = prod.select('./a/@href').extract()
            if not href:
                # The link used to be indexed before this check (IndexError).
                continue
            url = urljoin_rfc(base_url, href[0])
            mpn = ''.join([code for code in prod.select('p/text()').extract()
                           if 'Manufacturer Code' in code]).strip().split(':')[-1]

            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('url', url)

            name = prod.select('./a/text()').extract()
            if name:
                if mpn not in name[0]:
                    loader.add_value('name', ' '.join((name[0], mpn)))
                else:
                    loader.add_value('name', name[0])

            sku = prod.select('./p[1]').extract()
            if sku:
                match = re.search(r'(\d+)', sku[0])
                # A paragraph without digits used to raise AttributeError.
                if match:
                    loader.add_value('sku', match.group(1))
                    loader.add_value('identifier', match.group(1))

            # Drop anything from the first "(" onwards in the price text.
            price = ''.join(prod.select('./p/b/text()').extract()).split('(')[0]
            if price:
                loader.add_value('price', price)
            yield loader.load_item()
Esempio n. 6
0
    def parse_product(self, response):
        """Follow variant links, or yield the product / its option requests.

        If the page links to variant pages (``ropetable``/``dbitable`` links,
        or table links that start with the current URL) those are requested
        with this same callback and nothing else is produced.  Otherwise the
        product is loaded; for radio-style options or a ``<select>`` one
        ``remote.php`` request per option is yielded (callback
        ``self.parse_price``) carrying a copy of the item with the option
        label appended to its name.  Without options, or without a
        ``product_id`` input to build the request from, the plain item is
        yielded.

        :param response: product page response; ``response.meta['category']``
            must be set.
        """
        hxs = HtmlXPathSelector(response)

        options = hxs.select(u'//table[@class="ropetable" or @class="dbitable"]//td/a/@href').extract()
        if not options:
            options = hxs.select(u'//table//a/@href').extract()
            options = [o for o in options if o.startswith(response.url.rstrip('/'))]
        if options:
            for url in options:
                yield Request(url, meta=response.meta, callback=self.parse_product)
            return

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h1/text()')
        product_loader.add_xpath('price', u'//em[contains(@class,"ProductPrice")]/text()')
        product_loader.add_xpath('sku', u'//div[@id="sku"]/text()')
        product_loader.add_value('category', response.meta['category'])
        product_loader.add_xpath('image_url', u'//div[@class="ProductThumbImage"]//img/@src')
        product_loader.add_xpath('brand', u'//div[@class="DetailRow"]/div/a/text()')
        product_loader.add_xpath('shipping_cost', u'//div[@class="DetailRow"]/div[contains(text(),"Shipping")]/../div[2]/text()')

        options = hxs.select(u'//div[@class="productAttributeList"]//ul/li/label/input/../../..')
        options2 = hxs.select(u'//div[@class="productAttributeList"]//select')
        # FIXME http://www.ropeandrescue.com/conterra-tac-longbow-ranger-rescue-pack/
        # checkbox support?
        product_ids = hxs.select(u'//input[@name="product_id"]/@value').extract()
        product_orig = product_loader.load_item()

        # The id is only needed for option requests; it used to be indexed
        # unconditionally, raising IndexError on pages without the input.
        if not product_ids or not (options or options2):
            yield product_orig
            return

        details_url = ('http://www.ropeandrescue.com/remote.php' +
                       '?w=getProductAttributeDetails&product_id=' + product_ids[0])

        if options:
            for opt in options:
                names = opt.select(u'.//input/../span/text()').extract()
                values = opt.select(u'.//input/@value').extract()
                value_names = opt.select(u'.//input/@name').extract()
                # zip stops at the shortest list instead of raising
                # IndexError when labels and inputs do not line up.
                for name, value, value_name in zip(names, values, value_names):
                    product = Product(product_orig)
                    product['name'] = (product['name'] + ' ' + name.strip()).strip()
                    yield Request(details_url + '&' + urllib.quote(value_name) + '=' + value,
                                  meta={'product': product}, callback=self.parse_price)
        else:
            names = options2.select(u'./option[@value!=""]/text()').extract()
            values = options2.select(u'./option[@value!=""]/@value').extract()
            value_name = options2.select(u'./@name').extract()[0]
            for name, value in zip(names, values):
                product = Product(product_orig)
                product['name'] = (product['name'] + ' ' + name.strip()).strip()
                yield Request(details_url + '&' + urllib.quote(value_name) + '=' + value,
                              meta={'product': product}, callback=self.parse_price)
Esempio n. 7
0
    def parse_product(self, response):
        """Yield products of the page whose SKU matches the searched SKU.

        The searched SKU is read from ``response.meta["sku"]`` and is the
        value stored in the item's ``sku`` field.  For pages with a
        ``skuIdSelection`` dropdown each option is a candidate: its price is
        the last "-"-separated part of the option text when that part starts
        with "$", otherwise the main price, and its SKU is resolved with
        ``self._get_sku``.  Pages without the dropdown are matched against
        the "Mfg Part Number(s)" list.  Nothing is yielded when the main name
        or the main price is missing.

        :param response: product page response with ``meta["sku"]`` set.
        """
        hxs = HtmlXPathSelector(response)
        search_sku = response.meta["sku"]

        main_name = hxs.select('//span[@id="mainProductName"]/text()').extract()
        main_price = hxs.select("//dd[@class='price']/text()").extract()
        # Both lists are indexed below, so either one missing is fatal; the
        # old ``and`` only returned when both were missing.
        if not main_name or not main_price:
            return
        main_name = main_name[0].strip()
        main_price = main_price[0].strip()
        dec = hxs.select("//dd[@class='price']/span/text()").extract()
        if dec:
            # The nested span text is appended to the price text.
            main_price += dec[0]

        skus = []
        sku_text = hxs.select("//strong[text()='Mfg Part Number(s):']/../text()").extract()
        if sku_text:
            skus += [sku.strip() for sku in sku_text[0].split(", ")]

        sub_products = hxs.select('//select[@id="skuIdSelection"]/option')
        if not sub_products:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value("url", response.url)
            loader.add_value("name", main_name)
            loader.add_value("price", main_price)
            if skus:
                loader.add_value("sku", search_sku)
                if any(sku in search_sku for sku in skus):
                    yield loader.load_item()
            return

        for p in sub_products:
            p_parts = p.select(".//text()").extract()[0].split("-")
            last_part = p_parts[-1].strip()
            price = last_part if last_part.startswith("$") else main_price

            sku_id = p.select(".//@value").extract()[0]
            sub_product_node = hxs.select('//input[@name="skuId" and @value="%s"]/../div' % sku_id)
            sku = None
            if sub_product_node:
                sku = self._get_sku(sub_product_node.select(".//text()").extract()[0], skus)

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value("url", response.url)
            # NOTE(review): the last part is dropped from the name even when
            # it is not a price - confirm that this is intended.
            loader.add_value("name", main_name + " " + "".join(p_parts[:-1]).strip())
            loader.add_value("price", price)
            if sku:
                loader.add_value("sku", search_sku)
                if sku in search_sku:
                    yield loader.load_item()
Esempio n. 8
0
    def parse_product(self, response):
        """Yield one Product per ``Item`` block of a product details page.

        The item name is the page title followed by the direct text of the
        ``Item`` block (when it has any); prices are collected from both
        price XPaths.  Pages without a title produce nothing.

        :param response: product page response; non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        title = hxs.select('//td[@class="ProductDetails"]/h1/text()').extract()
        if not title:
            return

        base_name = title[0].strip()
        page_url = urljoin_rfc(get_base_url(response), response.url)
        for item in hxs.select('//div[@class="Item"]'):
            variant_text = ''.join(item.select('./text()').extract())
            if variant_text:
                full_name = base_name + ' ' + variant_text.strip()
            else:
                full_name = base_name

            loader = ProductLoader(item=Product(), selector=item)
            loader.add_value('url', page_url)
            loader.add_value('name', full_name)
            for price_xpath in ('.//span[@class="price"]/text()',
                                './div[@class="price"]/span/text()'):
                loader.add_xpath('price', price_xpath)
            yield loader.load_item()
Esempio n. 9
0
 def parse(self, response):
     """Yield one Product per spotlight box of the shop home page.

     Two markup depths are tried in order; the first one whose name XPath
     matches supplies name, price and (when a link is present) url.  An
     item is yielded for every box, even if neither layout matched.

     :param response: shop home page response.
     """
     BASE_URL = 'http://www.virginmobile.com/vm/'
     # (name xpath, price xpath, link xpath) for each supported layout.
     layouts = (
         ('div/div/div/div/div/div/div/div/div/div[@class="inner"]/text()',
          'div/div/div/div/div/div/p/span/text()',
          'div/div/div/div/div/p/a/@href'),
         ('div/div/div/div/div/div/div/div/div/div/div[@class="inner"]/text()',
          'div/div/div/div/div/div/div/p/span/text()',
          'div/div/div/div/div/div/p/a/@href'),
     )
     hxs = HtmlXPathSelector(response)
     for product in hxs.select('//div[@class="webapp_shophome_3col_spotlight"]'):
         loader = ProductLoader(item=Product(), selector=product)
         for name_xpath, price_xpath, link_xpath in layouts:
             if not product.select(name_xpath):
                 continue
             loader.add_xpath('name', name_xpath)
             loader.add_xpath('price', price_xpath)
             link = product.select(link_xpath)
             if link:
                 loader.add_value('url', urljoin_rfc(BASE_URL, link.extract()[0],
                                                     response.encoding))
             break
         yield loader.load_item()
Esempio n. 10
0
    def parse_page(self, response):
        """Request the next listing page and yield the products of this one.

        The next page is taken from ``self.products_nextpage_xpath`` and the
        product nodes from ``self.products_xpath`` (the first matched node is
        skipped).  A message is logged when the page did not produce exactly
        30 products.

        :param response: listing page response.
        """
        base_url = get_base_url(response)
        base_url_func = functools.partial(urljoin_rfc, base_url)
        hxs = HtmlXPathSelector(response)

        # Pagination.
        if self.products_nextpage_xpath:
            next_links = hxs.select(self.products_nextpage_xpath).extract()
            if next_links:
                yield Request(urljoin_rfc(base_url, next_links[0]), callback=self.parse_page)

        # Products; the first matched node is skipped.
        if self.products_xpath:
            product_nodes = hxs.select(self.products_xpath)[1:]
        else:
            product_nodes = []

        product_count = 0
        for node in product_nodes:
            loader = ProductLoader(selector=node, item=Product())
            loader.add_xpath('price', ".//div[@class='storeitem_price']/span[@class='storeitem_firstprice']/text()", comas2dots)
            # Identifier and sku both come from the number in the link href.
            for field in ('identifier', 'sku'):
                loader.add_xpath(field, "./div/a/@href", first, re="\-(\d+)\.html")
            loader.add_xpath('url', "./div/a/@href", first, base_url_func)
            for name_xpath in ("./div/a/b/text()", "./div/a/text()"):
                loader.add_xpath('name', name_xpath)

            yield loader.load_item()
            product_count += 1

        if product_count != 30:
            log.msg("Less than 30 products in %s %s" % (response.url, product_count))
Esempio n. 11
0
    def parse_products(self, response):
        """Yield one Product per row of the search result table.

        Rows lacking a name, a link or a price are logged as errors and
        skipped.  Links are resolved with ``self._urljoin``.

        :param response: search result page response.
        """
        hxs = HtmlXPathSelector(response)
        rows = hxs.select("//div[@class='box-caracteristic-search']/div[@class='table-wrap']/form/table/tbody/tr")
        for row in rows:
            names = row.select("td[@class='prd-details']/h3/a/text()").extract()
            if not names:
                logging.error("ERROR! No name! %s" % response.url)
                continue
            name = names[0]

            links = row.select("td[@class='prd-details']/h3/a/@href").extract()
            if not links:
                logging.error("ERROR! NOT FOUND URL! URL: %s. NAME: %s" % (response.url, name))
                continue
            url = self._urljoin(response, links[0])

            prices = row.select("td[@class='prd-amount-details']/div/p[@class='prd-amount']/strong/text()").extract()
            if not prices:
                logging.error("ERROR! NOT FOUND PRICE! URL: %s. NAME: %s" % (response.url, name))
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', name)
            loader.add_value('url', url)
            loader.add_value('price', prices[0])
            yield loader.load_item()
Esempio n. 12
0
    def parse_products(self, response):
        """Request the next listing page and yield the products of this one.

        The next-page target is built with ``self._get_products_url``.
        Products without a price node get the price "0.0".

        :param response: listing page response.
        :raises IndexError: if a product block has no direct link.
        """
        hxs = HtmlXPathSelector(response)

        next_links = hxs.select('//div[@id="center-main"]//a[@class="right-arrow"]/@href')
        if next_links:
            next_url = self._get_products_url(response, next_links[0].extract())
            yield Request(next_url, callback=self.parse_products)

        for product in hxs.select('//div[@id="center-main"]//div[@class="details"]'):
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath("name", "a/text()")
            loader.add_xpath("sku", 'div[@class="sku"]/span/text()')

            # The descendant axis is used because a few prices sit deeper
            # in the markup (under the description div).
            price_nodes = product.select('.//div[@class="price-row"]/span[@class="price-value"]/span/text()')
            loader.add_value("price", price_nodes[0].extract() if price_nodes else "0.0")

            href = product.select("a/@href")[0].extract()
            loader.add_value("url", urljoin_rfc(get_base_url(response), href))

            yield loader.load_item()
Esempio n. 13
0
File: zachys.py Progetto: 0--key/lib
    def parse_product(self, response):
        """Yield one Product per priced packaging type of each search result.

        For every ``prices`` block of a result, the cells labelled "Item",
        "Bottle" and "Case" are looked up (restricted to ``sale`` cells when
        the block has any) and the text of the following cell is the price.
        The item name is "<result name> <type>".

        :param response: search result response; non-HTML responses are
            ignored.
        :raises IndexError: if a result has no link or no link text.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        product_types = (u'Item', u'Bottle', u'Case')
        base_url = get_base_url(response)
        for product in hxs.select(u'//dl[@class="search_result"]'):
            url = product.select(u'./dt/a[@class="#"]/@href').extract()[0]
            url = urljoin_rfc(base_url, url)
            name = product.select(u'./dt/a[@class="#"]/text()').extract()[0].strip()
            for option in product.select(u'.//dd[@class="prices"]'):
                # Build the cell test in two steps; the last "%s" is left in
                # place for the product type.
                price_xpath = u'.//td[%s]/following-sibling::td[1]/text()'
                if option.select(u'.//td[@class="sale"]'):
                    price_xpath %= u'@class="sale" and %s'
                price_xpath %= u'contains(text(),"%s")'
                for product_type in product_types:
                    price = option.select(price_xpath % product_type)
                    if not price:
                        continue
                    loader = ProductLoader(item=Product(), selector=product)
                    loader.add_value('url', url)
                    # The scraped name is data, not a format template: using
                    # it as one broke on names containing "%".
                    loader.add_value('name', u'%s %s' % (name, product_type))
                    loader.add_value('price', price.extract())
                    yield loader.load_item()
Esempio n. 14
0
    def parse_product(self, response):
        """Yield the product of the page and re-request it for each option.

        On the first visit of a page with an ``mpv_itemalst`` dropdown, the
        ASP.NET form is re-submitted once per option (flagged with
        ``meta['requested']`` so those responses do not fan out again).  The
        product of the current response is then loaded, its name extended by
        the selected option text, and yielded if a price was found (the offer
        price first, ``mpv_prc`` as fallback).

        :param response: product page response; non-HTML responses are ignored.
        :raises IndexError: if the name is missing, or options exist but
            none is marked as selected.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        name = hxs.select(u'//div[@class="datac2"]//h1[@class="mpv_desc"]/text()').extract()[0].strip()
        multiple_options = hxs.select(u'//select[@class="mpv_itemalst"]//option')

        if multiple_options and u'requested' not in response.meta:
            formname = u'aspNetForm'
            for option in multiple_options:
                option_value = option.select(u'./@value').extract()[0]
                formdata = {
                    u'ctl00$MainContent$ItemAList': option_value,
                    u'__EVENTTARGET': u'ctl00$MainContent$ItemAList',
                    u'__EVENTARGUMENT': u'',
                }
                yield FormRequest.from_response(
                    response,
                    formname=formname,
                    formdata=formdata,
                    meta={u'requested': True},
                    dont_click=True,
                    callback=self.parse_product)

        if multiple_options:
            selected_text = multiple_options.select(u'../option[@selected]/text()').extract()[0]
            name += u' %s' % selected_text.strip()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        loader.add_xpath('price', u'//div[@class="datac2"]//span[@class="offerprc"]/text()')
        if not loader.get_output_value('price'):
            loader.add_xpath('price', u'//span[@class="mpv_prc"]/text()')
        if loader.get_output_value('price'):
            yield loader.load_item()
Esempio n. 15
0
    def parse(self, response):
        """Yield the products of a listing page and request the next page.

        Names, links and prices are extracted as three parallel lists and
        combined positionally; whitespace and the pound sign are stripped
        from the price text.

        :param response: listing page response; non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Product details straight from the product list.
        titles = hxs.select('//h4/a/@title').extract()
        links = hxs.select('//h4/a/@href').extract()
        price_texts = hxs.select('//td[@class="ProductPrice"]/h4/text()').extract()

        for title, link, price_text in zip(titles, links, price_texts):
            product_url = urljoin_rfc(base_url, link)
            if not product_url:
                continue
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('url', product_url)
            loader.add_value('name', title)
            loader.add_value('price', price_text.strip().strip(u'\xa3'))
            yield loader.load_item()

        # Pagination.
        next_links = hxs.select('//a[@class="NextPage"]/@href').extract()
        if next_links:
            yield Request(urljoin_rfc(base_url, next_links[0]))
Esempio n. 16
0
 def parse_page(self, response):
     """Yield the products of a category page and follow-up requests.

     When the page has pagination links, the last one is requested if
     ``self._is_next`` accepts it; when it has none, every category link
     of the ``refineCat`` sidebar list is requested instead.

     :param response: category page response.
     :raises IndexError: if an ``item`` block has no title link.
     """
     site_root = 'http://www.dolphinmusic.co.uk/'
     hxs = HtmlXPathSelector(response)

     for product in hxs.select('//div[@class="item"]'):
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('name', 'h2/a/text()')
         href = product.select('h2/a/@href').extract()[0]
         loader.add_value('url', urljoin_rfc(site_root, href, response.encoding))
         loader.add_xpath('price', 'div[@class="pricing"]/p[@class="price"]/text()')
         yield loader.load_item()

     page_links = hxs.select('//*[@id="categoryMain"]/div[@class="pagination"]/ul/li/a/@href').extract()
     if page_links:
         last_link = page_links[-1]
         if self._is_next(last_link):
             yield Request(urljoin_rfc(site_root, last_link, response.encoding),
                           callback=self.parse_page)
         return

     # No pagination: descend into the sidebar categories.
     for href in hxs.select('//*[@id="sidebar"]/ul[@id="refineCat"]/li/a/@href').extract():
         yield Request(urljoin_rfc(site_root, href, response.encoding),
                       callback=self.parse_page)
Esempio n. 17
0
    def parse_product(self, response):
        """Yield one Product per ``sku-details`` block of a product page.

        The item name is the page title followed by the block's
        ``sku-description`` text (when present).  The price comes from the
        block's ``price`` span, falling back to its ``special-price`` span;
        when neither exists the item is yielded without a price.

        :param response: product page response; non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        name = hxs.select('//h1[@class="item"]/span/text()').extract()
        if not name:
            return

        base_name = name[0].strip()
        url = urljoin_rfc(get_base_url(response), response.url)
        # A page-level loader used to be built here and immediately
        # overwritten inside the loop; it was never loaded, so it is gone.
        for item in hxs.select('//div[@class="sku-details"]'):
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('url', url)

            n = base_name
            sku = ''.join(item.select('.//span[@class="sku-description"]//text()').extract())
            if sku:
                n += ' ' + sku.strip()
            loader.add_value('name', n)

            price = item.select('./span[@class="price"]/text()').extract()
            if not price:
                price = item.select('./span[@class="special-price"]/text()').extract()
            # The fallback list used to be indexed unchecked (IndexError
            # when a block has neither price span).
            if price:
                loader.add_value('price', price[0])
            yield loader.load_item()
Esempio n. 18
0
 def parse_product(self, response):
     """Yield the single Product described by the ``ProductDetails`` div.

     :param response: product page response.
     """
     product_loader = ProductLoader(item=Product(), response=response)
     product_loader.add_xpath("name", '//div[@id="ProductDetails"]//h2/text()')
     product_loader.add_value("url", response.url)
     remaining_fields = (
         ("price", '//div[@id="ProductDetails"]//em[contains(@class,"ProductPrice")]/text()'),
         ("sku", '//div[@id="ProductDetails"]//span[contains(@class,"VariationProductSKU")]/text()'),
     )
     for field, xpath in remaining_fields:
         product_loader.add_xpath(field, xpath)
     yield product_loader.load_item()
Esempio n. 19
0
    def parse_product(self, response):
        """Yield one Product per combination of the page's select options.

        Every ``input-box`` select contributes a group of
        ``(option name, price text)`` pairs, skipping its first option; the
        groups are combined by ``multiply``, and each resulting pair extends
        the product name and is added to the product price.

        :param response: product page response.
        """
        hxs = HtmlXPathSelector(response)

        def fix_options(o):
            # ``o`` is an option label split on "$": [name] when the label
            # has no price, [name, price, ...] otherwise.  An explicit
            # length check replaces the former bare ``except:``.
            if len(o) > 1:
                return (o[0], o[1].replace(',', ''))
            return (o[0], '0')

        opt_groups = []
        for option in hxs.select(u'//div[@class="input-box"]//select'):
            opt_list = option.select(u'./option[position() != 1]/text()').extract()
            # "+$12" and "$12" are treated alike before splitting on "$".
            opt_list = [o.replace('+$', '$').split('$') for o in opt_list]
            opt_groups.append([fix_options(o) for o in opt_list])

        for opt_name, opt_price in multiply(opt_groups):
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('url', response.url)
            product_loader.add_xpath('name', u'//h2[@class="title"]/text()')
            product_loader.add_xpath('price', u'//span[contains(@class,"sale-price")]/text()')
            product_loader.add_xpath('sku', u'substring-after(//span[contains(@class,"meta-sku")]/text(),":")')
            product_loader.add_xpath('category', u'//ul[@class="breadcrumb"]/li[2]/a/@title')
            product_loader.add_xpath('image_url', u'//div[@class="teaser-large"]/img/@src')
            product_loader.add_xpath('brand', u'substring-after(//div[@class="product-meta"]/span[contains(text(),"Manufacturer:")]/text(),":")')
            product_loader.add_value('shipping_cost', '')

            product = product_loader.load_item()
            product['name'] = (product['name'] + ' ' + opt_name).strip()
            product['price'] = product['price'] + Decimal(opt_price)
            yield product
Esempio n. 20
0
    def parse_products(self, response):
        """Follow brand and sub-category links and yield listed products.

        Brand links are followed only until ``self.brand_crawled`` has been
        set.  Links not starting with "http" are joined with ``base_url``.

        NOTE(review): ``base_url`` is not defined in this method; presumably
        it is a module-level name - confirm.

        :param response: category or product listing response.
        """
        hxs = HtmlXPathSelector(response)

        if not self.brand_crawled:
            for href in hxs.select('//*[@class="infoBox-categories"]//a/@href').extract():
                target = href if re.search('^http', href) else urljoin_rfc(base_url, href)
                yield Request(target, callback=self.parse_products)
            self.brand_crawled = True

        # Sub-category links, if this is a category page.
        for href in hxs.select('//div[@id="catView"]//a/@href').extract():
            target = href if re.search('^http', href) else urljoin_rfc(base_url, href)
            yield Request(target, callback=self.parse_products)

        # Products, if this is a product listing.
        for product in hxs.select('//div[@id="productView"]/ul/li[@class="product"]'):
            loader = ProductLoader(item=Product(), selector=product)
            for field, xpath in (('name', './/h2/a/text()'),
                                 ('price', './/h3/a/text()'),
                                 ('url', './/h2/a/@href')):
                loader.add_xpath(field, xpath)
            yield loader.load_item()
Esempio n. 21
0
    def parse_page(self, response):
        """Follow the configured next-page link and emit the listed products.

        The method is driven entirely by spider attributes:
        ``products_nextpage_xpath`` (optionally narrowed by
        ``products_nextpage_re``) locates the next page, ``products_xpath``
        selects the product nodes, and ``product_name``, ``product_url`` and
        ``product_price`` hold the XPaths handed to the loader.
        """
        selector = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        make_absolute = functools.partial(urljoin_rfc, base_url)

        # Pagination: only the first candidate link is followed.
        if self.products_nextpage_xpath:
            candidates = selector.select(self.products_nextpage_xpath)
            if self.products_nextpage_re:
                candidates = candidates.re(self.products_nextpage_re)
            else:
                candidates = candidates.extract()
            if candidates:
                yield Request(urljoin_rfc(base_url, candidates[0]),
                              callback=self.parse_page)

        # Products.
        if not self.products_xpath:
            return
        for node in selector.select(self.products_xpath):
            loader = ProductLoader(selector=node, item=Product())
            for xpath in self.product_name or ():
                loader.add_xpath('name', xpath)
            for xpath in self.product_url or ():
                # ``first`` is defined elsewhere; the second extra argument
                # joins the value with the page base URL.
                loader.add_xpath('url', xpath, first, make_absolute)
            for xpath in self.product_price or ():
                loader.add_xpath('price', xpath, comas2dots)
            yield loader.load_item()
Esempio n. 22
0
 def parse_categories(self, response):
     """Descend through category pages and emit products from leaf pages.

     A page that lists subcategories yields one request per subcategory
     link. A page without subcategories is treated as a product listing:
     every search-result block becomes an item and the "Next>" link, when
     present, is followed with this same callback.
     """
     hxs = HtmlXPathSelector(response)
     sub_categories = hxs.select('//div[@class="section_190"]/a/@href').extract()
     if sub_categories:
         # Reuse the links extracted above instead of running the same
         # XPath query a second time.
         for url in sub_categories:
             yield Request(url, callback=self.parse_categories)
         return

     for product in hxs.select('//div[@class="list_search_result"]'):
         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('name', 'div[@class="list_search_detail"]/'
                                  'div[@class="list_search_info"]/p/a/'
                                  'span/text()')
         loader.add_xpath('url', 'div[@class="list_search_detail"]/'
                                 'div[@class="list_search_info"]/p/a/@href')
         loader.add_xpath('price', 'div[@class="list_search_detail"]/'
                                   'div[@class="list_search_actionblock"]/'
                                   'p/span[@class="list_search_price"]/text()')
         yield loader.load_item()

     next_page = hxs.select('//div[@class="formfloatright"]/'
                            'strong/a[text()="Next>"]/@href').extract()
     if next_page:
         # The last matching link is the one that is followed.
         yield Request(next_page[-1], callback=self.parse_categories)
Esempio n. 23
0
    def parse(self, response):
        """Emit the product of a detail page when it matches the searched name.

        The searched name is read from ``response.meta['name']`` with ``+``
        replaced by spaces. The item is emitted only when every searched word
        occurs among the words of the page title and the loaded name does not
        contain "apparelsave".
        """
        hxs = HtmlXPathSelector(response)
        product = hxs.select('//table[@class="buybox"]')
        if not product:
            return

        loader = ProductLoader(item=Product(), selector=product)
        titles = product.select('.//h1[@class="stylename"]/text()').extract()
        if not titles:
            return

        title = titles[0]
        wanted = response.meta['name'].lower().replace('+', ' ')
        log.msg(title.lower() + ' - ' + wanted)

        title_words = title.lower().strip().split(' ')
        missing = [word for word in wanted.split(' ')
                   if word not in title_words]
        if missing:
            return

        price_parts = product.select(
            './/span[@class="price"]/span/text()').re(r'([0-9\,\. ]+)')
        loader.add_value('name', title)
        loader.add_value('url', response.url)
        loader.add_value('price', "".join(price_parts).strip())
        loader.add_value('sku', response.meta['sku'])

        if 'apparelsave' not in loader.get_output_value('name').lower():
            yield loader.load_item()
Esempio n. 24
0
    def parse(self, response):
        """Emit the first listed product whose name covers the searched words.

        The product name is assembled as "<brand> <name> <style>". The words
        of ``response.meta['name']`` must all occur in it. Iteration stops
        after the first item that is actually emitted.
        """
        title_link = './/div[@class="productBrandTitleColor"]/a'

        def span_text(node, span_class):
            # Joined, stripped text of one <span> inside the title link.
            xpath = title_link + '/span[@class="%s"]/text()' % span_class
            return "".join(node.select(xpath).extract()).strip()

        def numeric_text(node, xpath):
            # Joined digit/separator runs found under the given XPath.
            return "".join(node.select(xpath).re(r'([0-9\,\. ]+)')).strip()

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        for cell in hxs.select('//div[@class="productCellWrapper"]'):
            loader = ProductLoader(item=Product(), selector=cell)
            brand = span_text(cell, 'brand')
            style = span_text(cell, 'styleName color')
            name = span_text(cell, 'styleName name')
            name = brand + ' ' + name + ' ' + style

            name_words = name.lower().split(' ')
            wanted_words = response.meta['name'].lower().split()
            if [w for w in wanted_words if w not in name_words]:
                continue

            url = cell.select(title_link + '/@href').extract()[0]
            # Prefer the sale price; fall back to the plain price text.
            price = (numeric_text(
                cell, './/div[@class="price"]/span[@class="salePrice"]/text()')
                or numeric_text(cell, './/div[@class="price"]/text()'))
            loader.add_value('name', name)
            loader.add_value('url', urljoin_rfc(base_url, url))
            loader.add_value('price', price)
            loader.add_value('sku', response.meta['sku'])

            if 'apparelsave' not in loader.get_output_value('name').lower():
                yield loader.load_item()
                break
            """
Esempio n. 25
0
    def parse(self, response):
        """Emit listed products, then follow subcategories and the next page.

        A product row with a quantity input is emitted directly. A row
        without one is assumed to have subproducts and its detail page is
        requested with ``parse_sub``. ``response.meta['level']`` (default 1)
        controls how deep in the nested ``ul/li`` menu the subcategory links
        are looked up.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        link_xpath = u'.//td[@class="productListingNewName"]/b/a'

        for row in hxs.select(u'//tr[contains(@class,"product-item")]'):
            product_loader = ProductLoader(item=Product(), selector=row)
            product_loader.add_xpath('name', link_xpath + u'/text()')

            raw_price = row.select(
                u'.//span[@class="js_price_tax"]/text()').extract()[0]
            # "1.234,56" -> "1234.56"
            product_loader.add_value(
                'price', raw_price.strip().replace('.', '').replace(',', '.'))

            href = row.select(link_xpath + u'/@href').extract()[0]
            product_url = urljoin_rfc(base_url, href)
            product_loader.add_value('url', product_url)

            # If quantity field is not present on page, there are subproducts
            if row.select(u'.//input[@name="products_qty"]').extract():
                yield product_loader.load_item()
            else:
                yield Request(product_url, callback=self.parse_sub)

        level = response.meta.get('level', 1)
        nesting = u'/'.join([u'ul/li'] * level)
        sub_xpath = u'//div[@class="box-content"]/' + nesting + '/a/@href'
        for href in hxs.select(sub_xpath).extract():
            yield Request(urljoin_rfc(base_url, href),
                          meta={'level': level + 1})

        next_links = hxs.select(u'//li[@class="page-next"]/a/@href').extract()
        if next_links:
            yield Request(urljoin_rfc(base_url, next_links[0]),
                          meta={'level': level})
Esempio n. 26
0
 def parse_products(self, response):
     """Emit listed products, or request their option pages, then paginate.

     Products whose price block mentions "From" or "Now" are priced per
     option, so their detail page is requested with ``parse_options``.
     All other products are emitted directly from the listing. When there
     is no "Next" link, the subcategory links of the navigation are
     followed instead.
     """
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)

     for product in hxs.select('//li[@class="item" or @class="item lastItem"]'):
         name = product.select('div/h3/a/span/text()').extract()[0]
         url = product.select('div/h3/a/@href').extract()
         if url:
             url = urljoin_rfc(base_url, url[0])
         options_from = ''.join(product.select(
             'div/p[@class="price money"]/span/abbr/text()').extract()).strip()
         options_now = ''.join(product.select(
             'div/p[@class="price money"]/text()').extract()).strip()

         if ('From' in options_from) or ('Now' in options_now):
             # Without a link ``url`` is still the empty list; handing that
             # to Request would raise and abort the rest of the page.
             if url:
                 yield Request(url, callback=self.parse_options,
                               meta={'name': name})
             continue

         loader = ProductLoader(item=Product(), selector=product)
         loader.add_value('name', name)
         loader.add_value('url', url)
         price = product.select(
             'div/p[@class="price money"]/span/span/text()').extract()
         if not price:
             price = product.select(
                 'div/p[@class="price money"]/ins/span/text()').extract()
             if not price:
                 price = ['']
         loader.add_value('price', price[0])
         yield loader.load_item()

     # u'\xbb' is the right-pointing double angle quotation mark.
     next_page = hxs.select(
         u'//a[@rel="nofollow" and span/text()="Next \xbb"]/@href').extract()
     if next_page:
         yield Request(urljoin_rfc(base_url, next_page[0]),
                       callback=self.parse_products)
     else:
         sub_categories = hxs.select(
             '//*[@id="categoryNavigation"]/li/ul/li/a/@href').extract()
         for sub_category in sub_categories:
             yield Request(urljoin_rfc(base_url, sub_category),
                           callback=self.parse_products)
Esempio n. 27
0
    def parse_product(self, response):
        """Build one product item from a product detail page.

        The brand is taken from the "BrandName" link when there is one.
        Otherwise the first side-bar brand that the product name starts with
        is used, and as a last resort the middle " - " separated part of the
        page title.
        """
        hxs = HtmlXPathSelector(response)
        price_xpath = u'//span[@class="productDetailSelling"]/text()'

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h1[@class="productDetailHeader"]/text()')
        if hxs.select(price_xpath):
            product_loader.add_xpath('price', price_xpath)
        else:
            product_loader.add_value('price', '')
        product_loader.add_xpath('sku', u'//input[@type="hidden" and (@name="hidProductId" or @name="inv")]/@value')
        product_loader.add_xpath('category', u'//td[@class="smallPrint"]/a[position()=2 and contains(text(),"Products")]/../a[3]/text()')

        images = hxs.select(u'//a[@class="smallPrint" and @rel="lightbox"]/@href').extract()
        if images:
            product_loader.add_value(
                'image_url', urljoin_rfc(get_base_url(response), images[0]))

        if hxs.select(u'//a[contains(@href,"BrandName")]/@href'):
            product_loader.add_xpath('brand', u'substring-after(//a[contains(@href,"BrandName")]/@href,"=")')
        else:
            side_bar_brands = [
                text.strip() for text in
                hxs.select(u'//strong[@class="sideBarText"]/text()').extract()]
            # First side-bar brand that prefixes the product name, if any.
            matched = next(
                (candidate for candidate in side_bar_brands
                 if product_loader.get_output_value('name').startswith(candidate)),
                None)
            if matched is not None:
                product_loader.add_value('brand', matched)
            else:
                product_loader.add_xpath('brand', u'normalize-space(substring-before(substring-after(//title/text(), " - "), " - "))')

        yield product_loader.load_item()
Esempio n. 28
0
    def parse_product(self, response):
        """Emit one item per entry of the products list on the page.

        Non-HTML responses are ignored. The SKU is stored only when the
        product code found on the page is one of ``self.skus``. A pack size,
        when present, is appended to the name as " x<size>u.".
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        products = hxs.select(u'//ol[@id="products-list" and @class="products-list"]//li[contains(@class,"item")]')
        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            url = product.select(u'.//h2[@class="product-name"]/a/@href').extract()
            product_loader.add_value('url', urljoin_rfc(base_url, url[0]))

            codes = product.select(u'.//small[child::b[contains(text(),"Product Code:")]]/text()').extract()
            # Previously an absent code left ``sku`` as an empty list, and
            # ``[] in self.skus`` raises TypeError when ``self.skus`` is a
            # set or dict. Use None for "no code" and skip the lookup.
            sku = codes[0].strip()[3:] if codes else None
            if sku is not None and sku in self.skus:
                product_loader.add_value('sku', sku)

            name = product.select(u'.//h2[@class="product-name"]/a/text()').extract()[0].strip()
            pack_size = product.select(u'.//small[child::b[contains(text(),"Pack Size:")]]/text()').extract()
            if pack_size:
                name += u' x' + pack_size[0].strip() + u'u.'
            product_loader.add_value('name', name)

            price = product.select(u'.//div[@class="price-box"]/span[contains(@class,"regular-price")]/span[@class="price"]/text()').re(u'[\d\.,]+')
            # Drop thousands separators.
            product_loader.add_value('price', price[0].replace(',', ''))
            yield product_loader.load_item()
Esempio n. 29
0
    def parse_products(self, hxs, response):
        """Emit the products of a promo listing.

        A product without a listed price is resolved through its own page
        (``parse_products2``). If a listed price does not start with "$",
        the whole page is requested again -- at most three times, counted in
        ``response.meta['ret']`` -- and processing of this response stops.
        """
        price_xpath = './/p[@class="para3"]/text()'
        price_re = 'Our Price: (.*)'

        for cell in hxs.select('//div[@class="productList clear"]//div[starts-with(@class, "promoCell")]'):
            loader = ProductLoader(item=Product(), selector=cell)

            # Collapse the name fragments into one single-spaced string.
            fragments = cell.select('.//p[@class="para1"]//text()').extract()
            name = re.sub(' +', ' ', ' '.join([part.strip() for part in fragments]))

            loader.add_xpath('url', './/a[starts-with(@class, "border")]/@href')
            loader.add_value('name', name)
            loader.add_xpath('sku', './/p[@class="border"]/text()', re='Item: (.*)')
            loader.add_xpath('price', price_xpath, re=price_re)

            if not loader.get_output_value('price'):
                yield Request(loader.get_output_value('url'), callback=self.parse_products2)
                continue

            shown_price = cell.select(price_xpath).re(price_re)[0]
            if not shown_price.startswith('$'):
                retries = response.meta.get('ret', 0)
                if retries < 3:
                    yield Request(response.url, dont_filter=True,
                                  meta={'ret': retries + 1})
                    return

            yield loader.load_item()
Esempio n. 30
0
 def parse_products(self, response):
     """Emit the featured products of a page and follow the last paging link.

     Only tables containing a "featuredProductLinks" name are treated as
     products. The sale price is preferred over the variant price, and
     prices are converted from "1.234,56" to "1234.56" notation.
     """
     def read_price(node, span_class):
         xpath = ('tr/td/div/div[@class="featuredProductPrice"]'
                  '/span/span[@class="%s"]/text()' % span_class)
         text = ''.join(node.select(xpath).extract())
         return text.replace('.', '').replace(',', '.')

     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)

     for table in hxs.select('//div/div/table'):
         name = ''.join(table.select(
             'tr/td/div[@class="featuredProductLinks"]/a/text()').extract())
         if not name:
             continue
         loader = ProductLoader(item=Product(), selector=table)
         brand = ''.join(table.select(
             'tr/td/div[@class="featuredMIS"]/a/text()').extract())
         loader.add_value('name', ' '.join((brand, name)))
         links = table.select(
             'tr/td/div[@class="featuredProductLinks"]/a/@href').extract()
         loader.add_value('url', urljoin_rfc(base_url, links[0]))
         loader.add_value(
             'price',
             read_price(table, 'SalePrice1') or read_price(table, 'variantprice1'))
         yield loader.load_item()

     paging_links = hxs.select(
         '//div[@class="pagingdiv"]/a[not(@class)]/@href').extract()
     if paging_links:
         yield Request(urljoin_rfc(base_url, paging_links[-1]),
                       callback=self.parse_products)
Esempio n. 31
0
    def parse_product(self, response):
        """Load a product and emit it, or request details for each option.

        When ``self.get_options(response)`` reports options (first element of
        the first option is truthy), one form request per option is yielded.
        Each carries a deep copy of the product with the option suffix
        appended to identifier and name, and ``parse_option`` receives it in
        ``meta['item']``. Otherwise the product itself is yielded.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)

        # identifier / sku
        product_ids = hxs.select(
            '//form[@id="productDetailsAddToCartForm"]'
            '/input[@name="product_id"]/@value').extract()
        if not product_ids:
            log.msg('### No product ID at ' + response.url, level=log.INFO)
        else:
            loader.add_value('identifier', product_ids[0])
            loader.add_value('sku', product_ids[0])

        # name
        titles = hxs.select(
            '//div[@id="ProductDetails"]/div/h1/text()').extract()
        if not titles:
            log.msg('### No name at ' + response.url, level=log.INFO)
        else:
            loader.add_value('name', titles[0].strip())

        # price: first whitespace-separated token, 0 when absent
        price_texts = hxs.select(
            '//span[@class="ProductDetailsPriceIncTax"]/text()').extract()
        if price_texts:
            loader.add_value('price', extract_price(price_texts[0].split()[0]))
        else:
            loader.add_value('price', 0)

        # image_url
        images = hxs.select(
            '//div[@class="ProductThumbImage"]//img[1]/@src').extract()
        if images:
            loader.add_value('image_url', images[0])

        # brand: first known brand the title starts with
        for brand in self.brands:
            if titles and titles[0].startswith(brand):
                loader.add_value('brand', brand)
                break

        # category: second breadcrumb entry
        crumbs = hxs.select(
            '//div[@id="ProductBreadcrumb"]/ul/li/a/text()').extract()
        if len(crumbs) > 1:
            loader.add_value('category', crumbs[1])

        # stock: "Call for pricing" is recorded as stock 0
        call_for_pricing = hxs.select(
            '//div[@class="ProductPriceWrap"]//em[text()="Call for pricing"]')
        loader.add_value('stock', 0 if call_for_pricing else 1)

        # options
        product = loader.load_item()
        options = self.get_options(response)
        if not options[0][0]:
            yield product
            return

        base_identifier = product['identifier']
        base_name = product['name']
        for option in options:
            item = copy.deepcopy(product)
            item['identifier'] = base_identifier + option[0]
            item['name'] = base_name + ' ' + option[1]
            option[2].update({'w': 'getProductAttributeDetails'})
            request = FormRequest.from_response(response,
                                                formnumber=1,
                                                formdata=option[2],
                                                meta={'item': item},
                                                callback=self.parse_option)
            yield request.replace(url='http://sxpro.co.uk/remote.php')
Esempio n. 32
0
    def parse_product(self, response):
        """Build one product item from a product page and emit it once.

        Pages without an identifier are logged and skipped. Identifiers that
        were already emitted (tracked in ``self.id_seen``) are logged as
        duplicates and not emitted again. Promotional text and the features
        description are stored under the ``metadata`` field.
        """
        def fragments_to_text(fragments):
            # Turn <br> into newlines, then keep only the text content of
            # each non-blank HTML fragment.
            return '\n'.join([
                lxml.html.fromstring(
                    chunk.replace('<br>', '\n').strip()).text_content()
                for chunk in fragments if len(chunk.strip()) > 0
            ])

        def is_new(candidate):
            # True when the item carries an identifier that has not been
            # emitted yet; the identifier is then recorded in id_seen.
            if not candidate.get('identifier', None):
                log.msg('### No product ID at ' + response.url, level=log.INFO)
                return False
            if candidate['identifier'] in self.id_seen:
                log.msg('### Duplicate product ID at ' + response.url,
                        level=log.INFO)
                return False
            self.id_seen.append(candidate['identifier'])
            return True

        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)

        # identifier / sku
        ids = hxs.select('//span[@itemprop="identifier"]/text()').extract()
        if not ids:
            log.msg('### No product ID at ' + response.url, level=log.INFO)
            return
        loader.add_value('identifier', ids[0].strip())
        loader.add_value('sku', ids[0])

        # name
        name = ''
        titles = hxs.select('//span[@itemprop="name"]/h1/text()').extract()
        if titles:
            name = titles[0].strip()
            loader.add_value('name', name)
        else:
            log.msg('### No name at ' + response.url, level=log.INFO)

        # price (stays 0 for the shipping rule below when none is found)
        price = 0
        price_texts = hxs.select('//span[@itemprop="price"]/text()').extract()
        if price_texts:
            price = extract_price(price_texts[0].strip().replace(',', ''))
            loader.add_value('price', price)

        # stock: either marker counts as in stock
        stock_texts = hxs.select('//td[strong="In Stock: "]/text()').extract()
        label_texts = hxs.select(
            '//td[span/@itemprop="identifier"]/preceding-sibling::td/text()'
        ).extract()
        in_stock = ('yes' in ''.join(stock_texts).lower()
                    or 'availability' in ''.join(label_texts).lower())
        loader.add_value('stock', 1 if in_stock else 0)

        # image_url, with a layout-based fallback
        images = (hxs.select('//img[@itemprop="image"]/@src').extract()
                  or hxs.select('//td[@width="350"]/img/@src').extract())
        if images:
            loader.add_value('image_url', urljoin(response.url, images[0]))

        # brand / category
        loader.add_value('brand', 'Le Creuset')
        loader.add_value('category', 'Le Creuset')

        # shipping_cost: tiered by price, none from 50 upwards
        if price < 20:
            loader.add_value('shipping_cost', 2.49)
        elif price < 50:
            loader.add_value('shipping_cost', 5.95)

        # promotional
        promotional = []
        price_cell_text = ''.join(hxs.select(
            '//td[strong/font//span/@itemprop="price"]/text()').extract())
        savings = re.findall(r'\(Save - \d+%\)', price_cell_text)
        if savings:
            promotional.append(savings[0])
        banners = hxs.select('//td[@bgcolor="#C00000"]').extract()
        if banners:
            promotional.append(fragments_to_text(banners).strip())

        # features
        feature_nodes = hxs.select(
            '//td[strong="Features:"]/span[@itemprop="description"]').extract()
        features = fragments_to_text(feature_nodes) if feature_nodes else ''

        loader.add_value('metadata', {
            'promotional': promotional,
            'features': features
        })

        product = loader.load_item()

        # No options currently.
        options = None
        if not options:
            if is_new(product):
                yield product
            return

        # process options
        # No options currently.
        for sel in options[0:1]:  ###
            item = copy.deepcopy(product)
            values = sel.select('.//label/input/@value').extract()
            if values:
                item['identifier'] += '-' + values[0]
                item['name'] = name + ' - ' + values[0]
            if is_new(item):
                yield item
Esempio n. 33
0
    def parse_product(self, response):
        """Build one product item from a product page and emit it.

        The identifier comes from the hidden "sn" input or, failing that,
        from the "/sn/<id>" part of the URL. It is appended to
        ``self.crawled_ids`` before the item is yielded. Products priced
        below 50.00 get a shipping cost of 4.95.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        box = '//div[@class="mainprodbox"]'

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('name', hxs.select(
            '//tr[@class="mainprodboxtitle"]/td/h1/text()').extract())
        loader.add_value('url', response.url)

        # No "Buy" button means out of stock.
        if not hxs.select(box + '//input[@type="image" and @alt="Buy"]'):
            loader.add_value('stock', 0)

        prices = (
            hxs.select(box + '//span[@class="buyprice"]/text()').extract()
            or hxs.select(
                '//table[@class="buybox"]//td[@align="right" and @class="buyboxlightblue"]/text()'
            ).extract())
        loader.add_value('price', prices or '0.00')

        loaded_price = loader.get_output_value('price')
        if Decimal(loaded_price or '0.00') < Decimal('50.00'):
            loader.add_value('shipping_cost', '4.95')

        loader.add_value('brand', hxs.select(
            '//img[contains(@src,"logo")]/@alt').extract())

        for category in hxs.select(
                '//tr[@class="mainprodboxtitle"]/td/a/text()').extract():
            loader.add_value('category', category)

        loader.add_value('sku', hxs.select(
            box + '//td[@class="text_small"]/text()').re('Product Code: (.*)'))

        hidden_ids = hxs.select(
            box + '//input[@type="hidden" and @name="sn"]/@value').extract()
        if hidden_ids:
            identifier = hidden_ids[0]
        else:
            url_match = re.search('/sn/(.*)', response.url)
            identifier = url_match.group(1) if url_match else None
        loader.add_value('identifier', identifier)

        images = hxs.select(
            box + '//a[contains(@href,"popup")]/img/@src').extract()
        if images:
            loader.add_value('image_url', urljoin_rfc(base_url, images[0]))

        self.crawled_ids.append(identifier)

        yield loader.load_item()
Esempio n. 34
0
    def parse_product(self, response):
        """Yield one item per option row of every variant block on the page.

        A base item (name, url, brand, categories, sku, image, shipping cost)
        is loaded once and deep-copied for each option. The base item itself
        is never yielded: a page without variant blocks only produces a log
        line.
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_value('url', response.url)
        # The brand is handed over by the requesting callback through meta.
        loader.add_value('brand', response.meta.get('brand'))
        categories = hxs.select(
            '//div[@id="breadcrumbs"]/div[@class="crumbs"]/span/a/span/text()'
        ).extract()
        # The first two breadcrumb entries are not stored as categories.
        for category in categories[2:]:
            loader.add_value('category', category)

        sku = hxs.select('//meta[@itemprop="sku"]/@content').extract()
        loader.add_value('sku', sku)

        image_url = hxs.select(
            '//div[@id="product-image"]//img/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), image_url[0]))

        # The loaded product name doubles as the base of every identifier.
        identifier = loader.get_output_value('name')

        # Default shipping cost; raised per option below for cheap items.
        loader.add_value('shipping_cost', '0.00')

        item = loader.load_item()

        variants = response.xpath('//div[@class="variant"]')
        if variants:
            for variant in variants:
                # Every table row of the variant block is treated as an option.
                options = variant.select('.//tr')
                variant_name = variant.select(
                    './/div[@class="title"]/h4/text()')[0].extract().strip()
                for option in options:
                    # NOTE(review): the encode/format/decode round trip through
                    # latin-1 looks like a Python 2 str/unicode workaround --
                    # confirm before touching it.
                    option_name = option.select('.//td[@class="name"]/text()')[
                        0].extract().strip().encode('latin-1')
                    option_item = deepcopy(item)
                    option_item['identifier'] = '{}-{}-{}'.format(
                        identifier, variant_name,
                        option_name).decode('latin-1')
                    # The option name is left out of the item name when it
                    # equals the variant name (case-insensitively).
                    option_item['name'] += ' {} {}'.format(
                        variant_name, option_name
                        if option_name.lower() != variant_name.lower() else
                        '').decode('latin-1')
                    option_item['name'] = option_item['name'].strip()
                    # The price is read from the variant block, not from the
                    # option row, so all options of a variant share it. The
                    # "now" price is preferred over the plain price span.
                    price = variant.xpath(
                        './/span[@class="now"]/text()').extract_first(
                        ) or variant.css('p.price span::text').extract_first()
                    option_item['price'] = extract_price(
                        price) if price else Decimal('0.00')
                    # Items under 30.00 replace the default shipping cost.
                    if Decimal(option_item['price']) < Decimal('30.00'):
                        option_item['shipping_cost'] = '1.99'
                    # Stock is only written (as 0) when the row has no
                    # "stock instock" cell; otherwise the field is left unset.
                    stock = option.select('.//td[@class="stock instock"]')
                    if not stock:
                        option_item['stock'] = 0
                    # The first image inside the variant block replaces the
                    # page-level image.
                    option_item['image_url'] = variant.select(
                        './/img/@src')[0].extract()
                    yield option_item
        else:
            self.log('PRODUCT WITHOUT OPTIONS: ' + response.url)
Esempio n. 35
0
    def parse(self, response):
        """Emit one product per contract tariff offered for a device.

        Two page layouts are handled: tariff ``<form id="command">`` blocks
        (everything except "Pay As You Go.") and ``<li class="visible">``
        entries carrying ``data-*`` attributes. For the second layout only
        entries whose memory size occurs in ``response.meta['device_name']``
        and whose colour occurs in the URL are emitted. Each product gets a
        ``TelecomsMeta`` record under ``metadata``.
        """
        def build_metadata(monthly_cost, tariff_name, duration):
            # ``operator``, ``channel`` and ``net_gen`` are not defined in
            # this method; they come from an enclosing scope.
            record = TelecomsMeta()
            record['device_name'] = response.meta['device_name']
            record['monthly_cost'] = monthly_cost
            record['tariff_name'] = tariff_name
            record['contract_duration'] = duration
            record['operator'] = operator
            record['channel'] = channel
            record['network_generation'] = net_gen
            return record

        hxs = HtmlXPathSelector(response)

        # Layout 1: tariff forms.
        name = ' '.join(
            hxs.select('//div[contains(@class, "deviceTitle")]/text()')
            .extract()).strip()
        forms = hxs.select(
            '//form[@id="command" and div[contains(@class, "planName")]/text()!="Pay As You Go."]'
        )
        for form in forms:
            loader = ProductLoader(selector=form, item=Product())
            tariff_name = form.select(
                'div[contains(@class, "planName")]/text()').extract()[0]
            monthly_cost = form.select(
                'div//div[contains(@class, "priceColumn")]/div[contains(@class, "price")]/text()'
            ).extract()[0]
            months_text = form.select(
                'div//li[contains(text(), "months")]/text()').extract()[0]
            duration = months_text.split(u' ')[0].replace(u'\xa0months', '')
            product_code = form.select(
                'input[@name="productCode"]/@value').extract()[0]
            tariff_code = form.select(
                'input[@name="packageCode"]/@value').extract()[0]

            loader.add_value(
                'identifier', product_code + '-' +
                tariff_code.replace('ContractD', '') + '-' + str(duration))
            loader.add_value(
                'name', response.meta['device_name'] + ' - ' + tariff_name)
            loader.add_value('url', response.url)
            loader.add_value('brand', name.split()[0])
            upfront = form.select(
                'div//div[contains(@class, "upfrontPrice")]/span/text()'
            ).extract()[0]
            loader.add_value('price', upfront)
            images = hxs.select(
                '//div[@class="devicePicturePanel"]/div/a/img/@src').extract()
            if images:
                loader.add_value(
                    'image_url',
                    urljoin_rfc(get_base_url(response), images[0]))

            product = loader.load_item()
            # The pound sign is stripped from the monthly cost.
            product['metadata'] = build_metadata(
                monthly_cost.replace(u'\u00a3', ''), tariff_name, duration)
            yield product

        # Layout 2: list entries with data-* attributes.
        entries = hxs.select('//li[@class="visible"]')
        if not entries:
            return
        name = hxs.select(
            '//h1[@class="main-title section"]/text()').extract()[0]
        for entry in entries:
            mem_size = entry.select('@data-memory').extract()[0]
            colour = entry.select('@data-colour').extract()[0]
            if mem_size not in response.meta['device_name']:
                continue
            if colour not in response.url.replace('_', ' '):
                continue

            loader = ProductLoader(selector=entry, item=Product())
            tariff_name = entry.select('@data-planname').extract()[0]
            monthly_cost = entry.select('@data-monthly-cost').extract()[0]
            duration = entry.select(
                'div/div/p[contains(text(), "month contract")]/em/text()'
            ).extract()[0]
            plan_link = entry.select(
                'div/div[@class="links"]/a[@class="chevron-link cta"]/@href'
            ).extract()[0]
            tariff_code = re.search('ContractD(\w+)', plan_link).group(1)

            loader.add_value('identifier', tariff_code + '-' + str(duration))
            loader.add_value(
                'name', response.meta['device_name'] + ' - ' + tariff_name)
            loader.add_value('url', response.url)
            loader.add_value('brand', name.split()[0])
            upfront = entry.select('@data-upfront-cost').extract()[0]
            loader.add_value('price', upfront)
            images = hxs.select(
                '//a[contains(@class, "product-imag") and @data-colour="'
                + colour + '"]/img/@src').extract()
            if images:
                loader.add_value(
                    'image_url',
                    urljoin_rfc(get_base_url(response), images[0]))

            product = loader.load_item()
            product['metadata'] = build_metadata(
                monthly_cost, tariff_name, duration)
            yield product
    def parse(self, response):
        """Crawl a listing page: follow categories, yield products, paginate.

        Yields, in order:

        * one ``self._proxyRequest`` result per sub-category link found in
          the two menu groups selected below;
        * one loaded item per product tile that ends up with both a name
          and a price;
        * one ``self._proxyRequest`` result for the "Suivant" (next page)
          link, when present.

        Non-HTML responses are logged and ignored.
        """
        if not isinstance(response, HtmlResponse):
            self.log('ERROR: BAD HtmlResponse!!! URL:{}'.format(response.url))
            return
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # logic to find categories
        # find subcats for Outilage Jardin
        categories = hxs.select(
            '//div[contains(@class,"bg_U15 menugroup") and contains(@alt,"Jardin") and contains(@alt,"Outillage")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
        ).extract()
        # find subcats for Aspirateurs
        categories += hxs.select(
            '//div[contains(@class,"bg_U4 menugroup") and contains(@alt,"Entretien") and contains(@alt,"maison")]//div[@class="jsGroup"]//ul[@class="tree"]//a/@href'
        ).extract()

        for url in categories:
            yield self._proxyRequest(urljoin_rfc(base_url, url))

        # products new logic
        products = hxs.select(
            u'//div[@id="productList"]//div[contains(@class,"plProductView")]')
        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_xpath(
                'url', './/a[contains(@class,"plPrName")]/@href')
            product_loader.add_xpath(
                'name', './/a[contains(@class,"plPrName")]/text()')
            # The category is a page-level heading, hence the absolute path.
            product_loader.add_xpath(
                'category', '//div[@class="productListTitle"]/h1/text()')
            product_loader.add_xpath(
                'image_url',
                './/div[contains(@class, "plProductImg")]//img/@data-src')
            product_loader.add_xpath('sku', './@data-sku')
            product_loader.add_xpath(
                'identifier',
                './/input[contains(@name, "ProductPostedForm.ProductId")]/@value'
            )

            # The price is split in the markup: the text of the "priceM" div
            # and the digits of a nested <sup>, joined here with a dot.
            price = product.select(
                u'.//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/text()'
            ).extract()
            if price:
                # Must be relative (".//"): an absolute "//" path is
                # evaluated against the whole document and would return the
                # first product's decimals for every product on the page.
                decimals = product.select(
                    u'.//div[contains(@class,"priceContainer")]/div[contains(@class,"priceM")]/sup/text()'
                ).re(r'(\d+)')
                if decimals:
                    price = price[0] + '.' + decimals[0]
            # When no decimals were found ``price`` is still the extracted
            # list (possibly empty); the loader receives it unchanged.
            product_loader.add_value('price', price)
            if product_loader.get_output_value(
                    'name') and product_loader.get_output_value('price'):
                yield product_loader.load_item()

        # pagination
        next_page = hxs.select(
            u'//ul[@class="PaginationButtons"]//a[contains(text(),"Suivant")]/@href'
        ).extract()
        if next_page:
            yield self._proxyRequest(urljoin_rfc(base_url, next_page[0]))
Esempio n. 37
0
    def parse_list(self, response):
        """Parse a category listing page.

        Yields, in order:

        * a request for the "limit=all" variant of the page when the
          currently selected limiter differs from it;
        * one request per sub-category link (same callback, same meta);
        * only when the page has no sub-category links: for each product
          tile either a loaded item (a numeric id could be read from the
          tile's ``product-price-*`` element) or a request to
          ``self.parse_options`` carrying the partially loaded item in
          ``meta['item']``.

        Raises:
            IndexError: when a product tile has no name link or no price
                text (this mirrors the previous behaviour).
        """
        # To list all products if they are not all already listed
        limiter_selected = response.xpath(
            '//div[@class="limiter"]/select/option[@selected]/@value').extract()
        limiter_all = response.xpath(
            '//div[@class="limiter"]/select/option[contains(@value, "limit=all")]/@value'
        ).extract()
        if (limiter_all and limiter_selected
                and limiter_selected[0] != limiter_all[0]):
            yield Request(response.urljoin(limiter_all[0]),
                          callback=self.parse_list,
                          meta=response.meta)

        sub_category_urls = response.xpath(
            '//div[@class="category-item-center"]'
            '//span[@class="product-name"]/a/@href').extract()
        for url in sub_category_urls:
            yield Request(response.urljoin(url),
                          callback=self.parse_list,
                          meta=response.meta)

        # Products are only scraped from leaf pages.
        if sub_category_urls:
            return

        # Page-level values: identical for every product, so read them once.
        product_brand = response.meta.get('brand', '')
        # The first breadcrumb entry is dropped.
        product_category = [
            crumb.strip() for crumb in response.xpath(
                '//div[contains(@class, "breadcrumbs")]//li[contains(@class, '
                '"category")]/a/text()').extract()
        ][1:]

        products = response.xpath(
            '//ul[contains(@class, "products-grid")]/li[contains(@class, "item")]'
        )
        for product_xs in products:
            product_name = ''.join(
                product_xs.xpath(
                    './/*[contains(@class, "product-name")]//text()').
                extract()).strip()
            product_url = response.urljoin(
                product_xs.xpath(
                    './/*[contains(@class, "product-name")]//a/@href').
                extract()[0])
            # The last number in the price box is the one that is used.
            product_price = extract_price_eu(
                product_xs.xpath('.//*[@class="price-box"]//text()').re(
                    r'[\d\.,]+')[-1])
            product_image_url = [
                response.urljoin(src) for src in product_xs.xpath(
                    './/*[contains(@class, "product-image")]//img/@src').
                extract()
            ]
            product_out_of_stock = bool(
                product_xs.xpath(
                    './/*[contains(@class, "availability") and contains(@class, "out-of-stock")]'
                ))
            product_shipping_cost = '0.00' if product_price >= self.free_shipping_over else '5.00'

            # An explicit emptiness check replaces the former bare
            # ``except:``, which also hid unrelated errors.
            id_matches = product_xs.xpath(
                './/*[contains(@id, "product-price-")]/@id').re(r'(\d+)')
            product_identifier = id_matches[0] if id_matches else None

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', product_name)
            loader.add_value('url', product_url)
            loader.add_value('price', product_price)
            loader.add_value('shipping_cost', product_shipping_cost)
            loader.add_value('image_url', product_image_url)
            loader.add_value('brand', product_brand)
            # The brand, when known, takes the place of the breadcrumb path.
            loader.add_value('category', product_brand or product_category)
            if product_out_of_stock:
                loader.add_value('stock', 0)
            if product_identifier is not None:
                loader.add_value('identifier', product_identifier)
                loader.add_value('sku', product_identifier)
                yield loader.load_item()
            else:
                # No id on the tile: continue on the product page.
                item = loader.load_item()
                yield Request(item['url'],
                              meta={'item': item},
                              callback=self.parse_options)
Esempio n. 38
0
    def parse_item(self, response):
        """Build one product per option combination, then request stock data.

        Reads ``response.meta['client_product']`` (keys used here:
        ``'Item Number'``, ``'Wayfair'`` and optionally ``'Brand'``).

        Flow:

        1. If the product title is missing, the same URL is re-requested
           (retry counter kept in ``meta['retry']``) and nothing else is
           produced for this response.
        2. Options are collected from drop-downs and from visual option
           tiles; when several option groups exist, their cartesian product
           is used.
        3. One product is loaded per option (or a single product when the
           page has no options).
        4. A single request to ``self.ajax_stock_url`` is yielded with all
           collected products in ``meta['product']``; the items themselves
           are not yielded here.

        Raises:
            IndexError: when an expected node (``<h1>`` text, price,
                ``sku`` input, option attribute) or the transaction id is
                missing from the page.
        """
        meta = response.meta

        categories = response.css(
            '.ProductDetailBreadcrumbs-item::text').extract()
        sku = meta['client_product']['Item Number']

        image_url = response.xpath(
            '//div[contains(@class, "main-carousel")]//a/@data-original-src'
        ).extract()
        if not image_url:
            image_url = response.xpath(
                '//img[contains(@class, "ProductDetailImagesBlock-carousel-image")]/@src'
            ).extract()

        prod_id = response.xpath('//input[@name="sku"]/@value').extract()
        prod_id = prod_id[0] if prod_id else ''

        title = response.xpath(
            '//h1/span[contains(@class, "ProductDetailInfoBlock-header-title")]/text()'
        ).extract()
        if not title:
            # No title on the page: re-request the same URL, then give up.
            retry = meta.get('retry', 0)
            if retry <= 10:
                meta['retry'] = retry + 1
                self.log('ERROR >>> No name found, retry URL: ' + response.url)
                yield Request(response.url,
                              dont_filter=True,
                              callback=self.parse_item,
                              meta=meta)
            else:
                self.log('ERROR >>> Gave up retrying URL: ' + response.url)
            return

        name = title[0]
        name += response.xpath('//h1/text()').extract()[-1].strip()
        brand = meta['client_product'].get('Brand', '')

        products_collected = []
        sku_list = []

        # ``option_elements`` is a list of option groups; each group is a
        # list of dicts with the keys 'identifier', 'sku', 'desc', 'cost'.
        options = []
        dropdown_options = response.xpath(
            '//select[contains(@class, "stdselect")]/option[@value!="XXXXXXXXXX"]'
        )
        option_elements = []
        if dropdown_options:
            for dropdown_option in dropdown_options:
                option = {}
                option['identifier'] = dropdown_option.xpath(
                    '@value').extract()[0]
                option['sku'] = ''
                option['desc'] = dropdown_option.xpath(
                    './/text()').extract()[0]
                # Prefer the @cost attribute; otherwise look for a "+$N"
                # surcharge inside the option label.
                cost = dropdown_option.xpath('@cost').extract() or re.findall(
                    r'\+\$([\d.]+)', option['desc'])
                option['cost'] = cost[0] if cost else '0'
                options.append(option)
            option_elements.append(options)
        else:
            dropdown_elements = response.xpath(
                '//div[@class="pdinfoblock"]/div[@class="fl"]//select')
            for dropdown_options in dropdown_elements:
                options = []
                for dropdown_option in dropdown_options.xpath(
                        'option[@value!="XXXXXXXXXX"]'):
                    option = {}
                    option['identifier'] = dropdown_option.xpath(
                        '@value').extract()[0]
                    option['sku'] = ''
                    # Only the part of the label before the first '-' is kept.
                    option['desc'] = dropdown_option.xpath(
                        './/text()').extract()[0].split('-')[0]
                    option['cost'] = dropdown_option.xpath(
                        '@cost').extract()[0]
                    options.append(option)
                option_elements.append(options)

        image_options = response.css('.option_select_wrap .visual_option_wrap')
        if image_options:
            options = []
            for image_option in image_options:
                option = {}
                option['identifier'] = image_option.xpath(
                    '@data-pi-id').extract()[0]
                option['sku'] = ''
                option['desc'] = image_option.xpath('@data-name').extract()[0]
                option['cost'] = image_option.xpath('@data-cost').extract()[0]
                options.append(option)
            option_elements.append(options)

        if option_elements:
            if len(option_elements) > 1:
                # Several option groups: one product per combination. The
                # leading ' - ' in 'desc' and 'identifier' is kept on
                # purpose, because it is part of the emitted identifier.
                combined_options = list(itertools.product(*option_elements))
                options = []
                for combined_option in combined_options:
                    final_option = {}
                    for option in combined_option:
                        final_option['desc'] = final_option.get(
                            'desc', '') + ' - ' + option['desc']
                        final_option['cost'] = final_option.get(
                            'cost', 0) + float(option['cost'])
                        final_option['identifier'] = final_option.get(
                            'identifier', '') + ' - ' + option['identifier']
                    options.append(final_option)
            else:
                options = option_elements[0]

            # Client rows sharing this product's 'Wayfair' value;
            # self.hhe_df is used like a pandas DataFrame here.
            products_matched = self.hhe_df[self.hhe_df['Wayfair'] ==
                                           meta['client_product']['Wayfair']]

            for option in options:

                price = response.xpath(
                    '//*[@class="dynamic_sku_price"]/span/text()').extract()[0]
                # NOTE(review): self.option_price is defined elsewhere;
                # presumably it returns base price + option cost as a
                # Decimal (it is subtracted from a Decimal below).
                option_price_value = self.option_price(price,
                                                       str(option['cost']))

                # SKU not unique: match the correct client product sku
                if not products_matched.empty and products_matched.count(
                )['Wayfair'] > 1:
                    # Pick the client row whose 'Wayfair Cost' is closest to
                    # this option's price. ``None`` marks "no candidate
                    # yet"; using Decimal(0) for that would let an exact
                    # match (difference 0) be overwritten by the next row.
                    best_diff = None
                    current_sku = sku
                    for i, row in products_matched.iterrows():
                        wf_price = Decimal(row['Wayfair Cost'].replace(
                            '$', '').strip())
                        price_diff = abs(option_price_value - wf_price)
                        if best_diff is None or price_diff < best_diff:
                            current_sku = str(row['Item Number'])
                            best_diff = price_diff

                    sku = current_sku

                product_loader = ProductLoader(item=Product(),
                                               response=response)
                product_loader.add_value('name', name + ' ' + option['desc'])
                product_loader.add_value('sku', sku)
                identifier = response.xpath(
                    '//input[@name="sku"]/@value').extract()[0]
                product_loader.add_value(
                    'identifier', identifier + '-' + option['identifier'])
                product_loader.add_value('brand', brand)
                product_loader.add_value('category', categories)
                if image_url:
                    product_loader.add_value('image_url', image_url[0])
                product_loader.add_value('url', response.url)

                product_loader.add_value('price', option_price_value)
                product = product_loader.load_item()

                metadata = HouseholdEssentialsMeta()
                metadata['reviews'] = []
                product['metadata'] = metadata

                products_collected.append(product)
                sku_list.append(product['identifier'])

        else:
            # No options on the page: a single product.
            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('name', name)
            product_loader.add_value('sku', sku)
            product_loader.add_xpath('identifier',
                                     '//input[@name="sku"]/@value')
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', categories)
            if image_url:
                product_loader.add_value('image_url', image_url[0])

            price = response.xpath(
                '//span[@data-id="dynamic-sku-price"]/text()').extract_first()

            product_loader.add_value('price', price)

            product_loader.add_value('url', response.url)

            product = product_loader.load_item()

            metadata = HouseholdEssentialsMeta()
            metadata['reviews'] = []
            product['metadata'] = metadata

            products_collected.append(product)
            sku_list.append(product['identifier'])

        # NOTE(review): '(.*)' is greedy, so the capture runs to the last
        # '",' on the same line of the body - confirm that is intended.
        transaction_id = re.findall(r'"transactionID":"(.*)",',
                                    response.body)[0]
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Referer': response.url,
            'X-Requested-With': 'XMLHttpRequest'
        }

        params = urlencode({
            'bpss': 'yes',
            'skulist': '~^~'.join(sku_list),
            'kitmode': '0',
            'postalcode': '67346',
            '_txid': transaction_id
        })

        # The collected products travel in meta to self.parse_stock.
        yield Request(self.ajax_stock_url + '?' + params,
                      headers=headers,
                      dont_filter=True,
                      meta={
                          'product': products_collected,
                          'prod_id': prod_id,
                          'prod_url': response.url
                      },
                      callback=self.parse_stock)
Esempio n. 39
0
    def parse_product(self, response):
        """Scrape every product table on a page, following product lists.

        Yields a request per link found in ``div.product_list`` and one
        item per product, or per priced ``<option>`` when a product has a
        drop-down and no general price. Items whose sku is listed in
        ``INVALID_PRODUCTS`` are dropped. Non-HTML responses are ignored.

        Raises:
            IndexError: when products exist but none of the three category
                sources yields any text.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        prod_lists = hxs.select(
            '//div[@class="product_list"]/div/h3/a/@href').extract()
        for url in prod_lists:
            yield Request(urljoin_rfc(base_url, url))

        products = hxs.select(
            u'//table[child::tr[child::td[@colspan="2" and child::h2]]]')
        if products:
            # Category: page heading, else first text of #frag, else the
            # last breadcrumb text. Only "nothing matched" (IndexError)
            # triggers the next fallback.
            try:
                category = hxs.select('//div[@class="page-heading"]/h1/text()'
                                      ).extract()[0].strip()
            except IndexError:
                try:
                    category = hxs.select(
                        '//div[@id="frag"]//text()').extract()[0].strip()
                except IndexError:
                    category = hxs.select(
                        '//p[@class="text_breadcrumbs"]//text()').extract(
                        ).pop()
        for product in products:
            try:
                image_url = urljoin_rfc(
                    base_url,
                    product.select('.//img/@src').extract()[0])
            except (IndexError, ValueError):
                # No <img> in the table, or a src that cannot be joined.
                image_url = ''
            multiple_options = product.select(u'.//select/option')
            general_price = product.select(
                u'.//span[@class="actlarge"]/text()').extract()
            general_price = general_price[0] if general_price else None
            if not general_price:
                # Fall back to the carriage-inclusive price found in the
                # text. NOTE(review): the division by 1.2 presumably strips
                # 20% VAT - confirm.
                general_price = product.select(u'.//*/text()').re(
                    u'Price inc UK Mainland Carriage.*?\:.*?\xa3([\d\.,]*)')
                general_price = str(round(float(general_price[0]) /
                                          1.2, 2)) if general_price else None
                log.msg(u'Product with: Price inc UK Mainland Carriage')
            if multiple_options and general_price:
                options_text = u' '.join(
                    product.select(u'.//select/option/text()').extract())
                if u'\xa3' in options_text:
                    log.msg(
                        u'Product with both option and general price: [%s]' %
                        response.url)
            name = product.select(u'.//h2/text()')[0].extract().strip()
            name_complete = ''.join(product.select(u'.//h2//text()').extract())
            if 'special offer' in name.lower():
                # Cut the "special offer" suffix, but only when the
                # remaining text still carries the "Ref:" marker.
                special_offer_starts_at = name.lower().index('special offer')
                new_name = name[:special_offer_starts_at].strip()
                if 'ref:' in new_name.lower():
                    self.log("Found special offer")
                    self.log("Before: '%s'" % name)
                    self.log("After: '%s'" % new_name)
                    name = new_name.replace(u'  (Ref', u' \xa0(Ref')

            # "(Ref: XYZ)" in the heading supplies the sku / identifier
            # base; it is the same for every option, so search only once.
            ref_match = re.search(r'\(Ref:\s*([^\)]+)\)', name_complete, re.I)

            if multiple_options and not general_price:
                # One item per option whose label contains a pound price.
                idx = 1
                for option in multiple_options:
                    option_text = option.select(u'./text()')[0].extract()

                    price = re.search(u'\xa3([\d\.,]+)', option_text)
                    if not price:
                        continue
                    price = price.group(1)
                    # NOTE(review): here the comma becomes a dot, while the
                    # single-price branch below removes the comma for the
                    # same pattern - confirm which one is intended.
                    regex = r'[\d]{1,2},[\d]{2}'
                    if re.search(regex, price):
                        price = price.replace(',', '.')

                    loader = ProductLoader(item=Product(), selector=product)
                    loader.add_value('name',
                                     name + u' %s' % option_text.strip())
                    loader.add_value('category', category)
                    loader.add_value('image_url', image_url)
                    loader.add_value('url', response.url)
                    loader.add_value('price', price)
                    if ref_match:
                        # sku suffix: the option label lower-cased, without
                        # 'code', up to its first '-'; when that is empty a
                        # running ".incN" counter is used instead.
                        optsku = option_text.strip().lower().replace(
                            'code', '').strip('-. ').split('-')[0]
                        if optsku:
                            loader.add_value('sku',
                                             ref_match.group(1) + optsku)
                        else:
                            loader.add_value(
                                'sku',
                                ref_match.group(1) + ".inc" + str(idx))
                            idx += 1
                        loader.add_value('identifier',
                                         loader.get_output_value('sku'))

                    if loader.get_output_value('sku') not in INVALID_PRODUCTS:
                        yield loader.load_item()
            else:
                if not general_price:
                    continue
                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('url', response.url)
                loader.add_value('name', name)
                loader.add_value('category', category)
                loader.add_value('image_url', image_url)
                regex = r'[\d]{1,2},[\d]{2}'
                if re.search(regex, general_price):
                    general_price = general_price.replace(',', '')
                loader.add_value('price', general_price)
                if ref_match:
                    loader.add_value('sku', ref_match.group(1))
                    loader.add_value('identifier',
                                     loader.get_output_value('sku'))

                if loader.get_output_value('sku') not in INVALID_PRODUCTS:
                    yield loader.load_item()
Esempio n. 40
0
    def parse_product(self, response):
        """Yield the product on this page, or one item per child product.

        When the page embeds a non-empty ``spConfig`` JSON blob, name, base
        price, image and id are taken from it and one item is yielded for
        every entry of its ``childProducts``. Otherwise those values are
        scraped from the markup and a single item is yielded.

        The first name seen for an identifier is stored in
        ``self.products_ids`` and reused for later items with that
        identifier.
        """
        hxs = HtmlXPathSelector(response)

        def keep_first_seen_name(item):
            # Reuse the stored name for a known identifier, else store it.
            if item['identifier'] in self.products_ids:
                item['name'] = self.products_ids[item['identifier']]
            else:
                self.products_ids[item['identifier']] = item['name']
            return item

        brand_texts = hxs.select(
            '//span[@class="title-designer-info"]/a/text()').extract()
        brand = brand_texts[0] if brand_texts else ''

        config_match = re.search(
            'var spConfig = new Product.Config\((.*})\);', response.body)
        options = None
        if config_match:
            options = json.loads(config_match.group(1))

        if options:
            product_name = options['productName']
            price = options['basePrice']
            image_url = options['imageUrl']
            identifier = options['productId']
        else:
            product_name = hxs.select(
                '//span[@itemprop="name"]/text()')[0].extract()
            price_texts = hxs.select(
                '//form//p[@class="special-price"]/span[@class="price"]/text()'
            ).extract()
            if not price_texts:
                price_texts = hxs.select(
                    '//form//span[@class="regular-price"]/span[@class="price"]/text()'
                ).extract()
            # Drop the '.' characters, then turn ',' into '.'.
            price = price_texts[0].replace('.', '').replace(',', '.')
            image_url = hxs.select('//img[@id="image-main"]/@src')[0].extract()
            identifier = hxs.select(
                '//input[@name="product"]/@value')[0].extract()

        # The parent product is loaded in both cases, but only yielded when
        # there is no option config.
        parent_loader = ProductLoader(item=Product(), selector=hxs)
        parent_loader.add_value('url', response.url)
        parent_loader.add_value('name', product_name)
        parent_loader.add_value('brand', brand)
        parent_loader.add_value('image_url', image_url)
        parent_loader.add_value('identifier', identifier)
        category = response.meta.get('category') or ''
        parent_loader.add_value('category', category)
        parent_loader.add_value('sku', identifier)
        parent_loader.add_value(
            'price', re.search('([\d\.]+)', price).group(1))
        parent_price = float(parent_loader.get_output_value('price'))
        parent_loader.add_value('shipping_cost',
                                self.get_shipping_cost(parent_price))

        if not options:
            yield keep_first_seen_name(parent_loader.load_item())
            return

        # Collect, per child product id, the labels of the options it has.
        labels_by_child = {}
        for attribute in options['attributes'].values():
            for choice in attribute['options']:
                for child_id in choice['products']:
                    labels_by_child.setdefault(child_id, []).append(
                        choice['label'])
        option_names = {
            child_id: ' '.join(labels)
            for child_id, labels in labels_by_child.items()
        }

        for child_id, child in options.get('childProducts').iteritems():
            child_loader = ProductLoader(item=Product(), selector=hxs)
            child_loader.add_value('url', response.url)
            child_loader.add_value(
                'name', '%s %s' % (product_name, option_names[child_id]))
            child_loader.add_value('image_url', child['imageUrl'])
            child_loader.add_value('identifier', child_id)
            # Children share the parent's id as their sku.
            child_loader.add_value('sku', identifier)
            child_loader.add_value('brand', brand)
            child_loader.add_value('category', category)
            child_loader.add_value('price', child['finalPrice'])
            child_price = float(child_loader.get_output_value('price'))
            child_loader.add_value('shipping_cost',
                                   self.get_shipping_cost(child_price))
            yield keep_first_seen_name(child_loader.load_item())
Esempio n. 41
0
    def parse_products(self, response):
        """Yield items from one page of the JSON product API, then paginate.

        ``response.meta`` must carry ``u_id``, ``u_cat`` and ``offset``.
        Products with a falsy ``price`` are skipped. Promotion start/end
        dates are carried over from ``self.meta_df`` when it has a row for
        the product id. As long as a page contains products, the page at
        ``offset + 12`` is requested with this same callback.
        """
        products = json.loads(response.body)['response']['products']
        if not products:
            return

        u_id = response.meta['u_id']
        u_cat = response.meta['u_cat']
        offset = response.meta['offset']

        for entry in products:
            if not entry['price']:
                continue

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', entry['id'])
            loader.add_value('name', entry['title'])
            loader.add_value('sku', entry['id'])
            # Strip spaces, drop the '.' characters, turn ',' into '.'.
            raw_price = entry['price']['value']
            loader.add_value(
                'price',
                raw_price.replace(' ', '').replace('.', '').replace(',', '.'))
            loader.add_value(
                'image_url',
                response.urljoin(entry['featured_image']['source']))
            loader.add_value('url', entry['url'])
            loader.add_value('brand', entry['brand']['name'])
            if entry['variants'][0]['inventory_quantity'] == '0':
                loader.add_value('stock', 0)
            loader.add_value('category', entry['category'])

            # Flags derived from the product's tag titles.
            on_promo = False
            online_only = False
            for tag in entry['tags']:
                tag_title = tag['title']
                if u'promo' in tag_title.lower():
                    on_promo = True
                if u"PromoçãoOnline" in tag_title.title().replace(' ', ''):
                    online_only = True

            # Metadata stored for this id on a previous run, if any.
            have_previous = (self.meta_df is not None
                             and not self.meta_df.empty
                             and entry['id'] in self.meta_df.index)
            if have_previous:
                prev_meta = self.meta_df.loc[entry['id']]
            else:
                prev_meta = {}
            promo_start = prev_meta.get('promo_start')
            promo_end = prev_meta.get('promo_end')

            metadata = SonaeMeta()
            today = datetime.datetime.now().strftime('%Y-%m-%d')
            metadata['extraction_timestamp'] = datetime.datetime.now(
            ).strftime('%Y-%m-%d %H:%M')

            if on_promo:
                # Keep the start of a still-open promotion, else start today.
                if promo_start and not promo_end:
                    metadata['promo_start'] = promo_start
                else:
                    metadata['promo_start'] = today
                metadata['promo_end'] = ''
            elif promo_start:
                # A promotion seen earlier is over: close it today unless
                # it already has an end date.
                metadata['promo_start'] = promo_start
                metadata['promo_end'] = promo_end if promo_end else today

            if online_only:
                metadata['exclusive_online'] = 'Yes'

            item = loader.load_item()
            item['metadata'] = metadata
            yield item

        next_offset = offset + 12
        next_url = ('http://www.phonehouse.pt/api.php/getProducts/' + u_id +
                    '/' + u_cat + '/' + str(next_offset))
        yield scrapy.Request(next_url,
                             callback=self.parse_products,
                             meta={
                                 'u_id': u_id,
                                 'u_cat': u_cat,
                                 'offset': next_offset
                             })
Esempio n. 42
0
    def parse_product(self, response):
        """Yield one product item, with promotion metadata, from a product page.

        Name, price, image, stock flag, categories, brand and identifier are
        read from ``response``. Any earlier metadata row stored for the
        identifier in ``self.meta_df`` supplies the previous ``promo_start`` /
        ``promo_end`` values, which are carried forward or closed depending on
        whether an old-price element is present on the page.
        """
        loader = ProductLoader(item=Product(), response=response)
        metadata = SonaeMeta()

        loader.add_xpath('image_url', '//img[contains(@class, "product-detail-img-main")]/@src')
        loader.add_value('url', response.url)

        title = response.xpath('//h1/text()').extract()[0].strip()
        #name_desc = ''.join(hxs.select('//span[@class="infoDet"]/text()').extract()).strip()
        #l.add_value('name', name + ' ' + name_desc)
        loader.add_value('name', title)

        raw_price = response.xpath('//span[@class="item-price"]/text()').extract()[0]
        # Remove every whitespace character from the price text before parsing.
        compact_price = ''.join(raw_price.strip().split())
        loader.add_value('price', extract_price(compact_price))

        # An "Indisponível" button marks the product as out of stock.
        if response.xpath(u'//div[@class="product-btns-panel"]/button[contains(text(), "Indisponível")]'):
            loader.add_value('stock', 0)

        # The first breadcrumb entry is skipped.
        for crumb in response.xpath('//ol[@class="breadcrumb"]/li/a/text()').extract()[1:]:
            loader.add_value('category', crumb)

        brand_texts = response.xpath('//div[h1]/h3/text()').extract()
        if brand_texts:
            loader.add_value('brand', brand_texts[0])

        # Weight-based shipping cost estimation, currently disabled:
        # weight = response.xpath('//div[h2[contains(text(), "Peso")]]/p/text()').extract()
        # if not weight:
        #     weight = response.xpath('//tr[td[contains(text(), "Peso")]]/td/@txt').extract()
        #
        # weight = extract_price(weight[0]) if weight else 0
        # shipping = 0
        # if weight>=0.5 and weight<3:
        #     shipping = 2
        # if weight>=3 and weight<5:
        #     shipping = 4
        # if weight>=5 and weight<10:
        #     shipping = 5
        # if weight>=10 and weight<20:
        #     shipping = 10
        # if weight>=20:
        #     shipping = 15
        #
        # if shipping:
        #     l.add_value('shipping_cost', shipping)

        product_id = response.xpath('//input[@name="Id"]/@value').extract()[0]
        loader.add_value('identifier', product_id)
        loader.add_value('sku', product_id)

        history = self.meta_df
        if history is not None and not history.empty and product_id in history.index:
            prev_meta = history.loc[product_id]
        else:
            prev_meta = {}

        promo = response.xpath('//span[@class="item-old-price"]/span[@class="item-old-price"]/text()')
        promo_start = prev_meta.get('promo_start')
        promo_end = prev_meta.get('promo_end')
        today = datetime.datetime.now().strftime('%Y-%m-%d')

        metadata['extraction_timestamp'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
        if promo:
            # Keep the stored start date only while that promotion is still open.
            still_running = promo_start and not promo_end
            metadata['promo_start'] = promo_start if still_running else today
            metadata['promo_end'] = ''
        elif promo_start:
            # No promotion now: close a previously recorded one if still open.
            metadata['promo_start'] = promo_start
            metadata['promo_end'] = promo_end or today

        item = loader.load_item()
        item['metadata'] = metadata
        yield item
Esempio n. 43
0
    def parse_product(self, response):
        """Parse a product page and yield the product or a stock lookup request.

        The identifier is the last ``-``-separated token of the final URL path
        segment, with the extension removed. Pages whose identifier is already
        in ``self.seen_ids`` are skipped. When the page has ``<option>``
        elements whose class contains the identifier, one item per option is
        prepared and passed to ``self.parse_stock`` through a GETSTOCK request;
        otherwise the single product is yielded directly.
        """
        log.msg(response.url)
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        identifier = response.url.split('/')[-1].split('-')[-1].split('.')[0]
        log.msg('Identifier: %s' % identifier)
        log.msg(repr(self.seen_ids))
        if identifier in self.seen_ids:
            return
        self.seen_ids.append(identifier)
        loader.add_value('identifier', identifier)

        sku = hxs.select('//p[@class="pmeta"]/text()').re(r'(\d+)')
        loader.add_value('sku', sku)

        # The first text node of the heading is the name; a second one, when
        # present, is extra text appended to the final item name.
        name = hxs.select('//div[@class="prod-box"]/h1//text()').extract()
        extra_data = name[1].strip() if len(name) > 1 else ''
        loader.add_value('name', name[0])

        # price
        price = re.sub(
            '[\r\n\t]+', ' ',
            hxs.select(
                '//h5[@class="product-price"]//div[contains(@id,"StaticPrice")]/span/text()[normalize-space()]'
            )[0].extract())
        loader.add_value('price', price)

        # image_url
        image_url = hxs.select('//img[@class="product-image"]/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

        # brand
        loader.add_value('brand', 'Le Creuset')

        # category: the first two breadcrumb links are skipped
        category = hxs.select('//ul[@class="breadcrumbs"]')[0].select(
            './/a/text()').extract()
        loader.add_value('category', ' > '.join(category[2:]))

        # shipping_cost: tiered on the parsed price; none from 40.00 upwards
        price = Decimal(loader.get_output_value('price'))
        if price < 20.00:
            loader.add_value('shipping_cost', '2.00')
        elif price < 40.00:
            loader.add_value('shipping_cost', '4.99')

        product = loader.load_item()

        options = hxs.select('.//select/option[contains(@class,"%s")]' %
                             identifier)
        if not options:
            # Strip so that an empty extra_data does not leave a trailing
            # space, matching what the per-option branch below does.
            product['name'] = ('%s %s' % (product['name'], extra_data)).strip()
            yield product
            return

        sid = hxs.select(
            '//input[@type="hidden" and @name="SID"]/@value')[0].extract()
        stock_url = 'http://www.hartsofstur.com/cgi-bin/st000001.pl?ACTION=GETSTOCK&REF=%(identifier)s&SID=%(sid)s&timestamp=%(timestamp)s'
        items = []
        for option in options:
            item = copy.deepcopy(product)
            option_name = option.select('./text()')[0].extract().strip()
            option_identifier = option.select('./@class').re(r'_(\d+)_')[0]
            # Record option ids as seen too, so they are skipped if met again.
            self.seen_ids.append(option_identifier)
            item['identifier'] = "%s_%s" % (identifier,
                                            option_identifier.strip())
            item['name'] = ('%s %s %s' % (item['name'], option_name,
                                          extra_data)).strip()
            items.append(item)

        yield Request(stock_url % {
            'identifier': identifier,
            'sid': sid,
            'timestamp': int(time.time())
        },
                      meta={'items': items},
                      callback=self.parse_stock)
    def parse_product(self, response):
        """Yield one product per displayable SKU found in the page's productJson.

        The ``var productJson = ...;`` script is decoded with ``demjson``. On a
        decode-type error the response is handed to ``self.retry``; when the
        script is missing altogether nothing is yielded. Each SKU uses its own
        price and colour image when it has them and otherwise falls back to
        the product-level price and main image. SKUs without any usable price,
        or whose availability contains ``NO_DISPLAY``, are skipped.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        try:
            data = re.findall(
                "var productJson = (.*);",
                hxs.select(
                    '//script[contains(text(), "var productJson = ")]/text()').
                extract().pop(),
                flags=re.DOTALL)
            data_json = demjson.decode(data[0].replace("\n", "").replace(
                "[,{", "[{"))
        except (TypeError, ValueError, XMLSyntaxError,
                demjson.JSONDecodeError):
            request = self.retry(response, "Unknown error on " + response.url)
            if request:
                yield request
            return
        except IndexError:
            # No productJson script (or no match inside it): nothing to parse.
            return

        name = data_json['productTitle']
        # Product-level price; stays None when the page does not provide one,
        # so the per-SKU check below never reads an unbound name.
        default_price = None
        if 'price' in data_json['priceData']:
            default_price = extract_price(data_json['priceData']['price'])
        identifier = data_json['productId']
        sku = identifier

        category = " > ".join(
            hxs.select(
                '//div[@id="breadcrumbs"]/div/div/a[text()!="Home"]/text()').
            extract())

        # Map colour id -> image URL for colours that carry their own image.
        colors = {}
        for color in data_json['availableColors']:
            if 'mainImageURL' in color:
                colors[color['id']] = color['mainImageURL']

        default_img = data_json['mainImageURL']

        warranty = hxs.select(
            '//select[@id="warranty_0"]/option/text()').extract()
        warranty_price = ''
        if warranty:
            warranty_match = re.search(r'Replacement - .([\d\.]+)',
                                       warranty[-1])
            warranty_price = warranty_match.group(1) if warranty_match else ''
        biw_metadata = BIWMeta()
        biw_metadata['warranty'] = warranty_price

        for item in data_json['skus']:
            # Resolve price and image per SKU so that values from one SKU do
            # not leak into the next one.
            price = extract_price(item['price']) if item['price'] else default_price
            if not price:
                continue
            img = colors.get(item['colorId'], default_img)

            # Placeholder size/colour labels add nothing to the item name.
            if item.get('size', '').lower() == 'one size':
                item['size'] = ''
            if item.get('color', '').lower() == 'one color':
                item['color'] = ''
            itemname = "%s %s %s" % (name, item.get('color', ''),
                                     item.get('size', ''))

            if "NO_DISPLAY" in item['avail']:
                continue
            stock = 1 if "IN_STOCK" in item['avail'] else 0

            product = Product()
            product['category'] = category
            product['sku'] = sku
            product['url'] = response.url
            product['stock'] = stock
            product['metadata'] = biw_metadata

            if img:
                product['image_url'] = urljoin_rfc(base_url, img)

            loader = ProductLoader(item=product, response=response)
            loader.add_value('identifier',
                             "%s-%s" % (identifier, item['sku_id']))
            loader.add_value('name', itemname)
            loader.add_value('price', price)
            yield loader.load_item()
Esempio n. 45
0
    def parse_product(self, response):
        """Parse a product page into an item, a sellers request or a reviews request.

        Depending on the spider flags read below (``pixmania_direct``,
        ``only_buybox``, ``collect_reviews``, ``use_main_id_as_sku``,
        ``append_brand_to_name``, ``full_category_path``) the method either
        follows the "all sellers" link with the partially loaded item, requests
        the reviews feed for the finished item, or yields the item directly.
        """
        hxs = HtmlXPathSelector(response)

        product_name = hxs.select(
            '//span[@itemprop="name"]/text()').extract()[0].strip()
        # The brand is optional: use an empty string when the page has none,
        # without a bare except that would hide unrelated errors.
        brand_texts = hxs.select(
            '//span[@itemprop="brand"]/text()').extract()
        product_brand = brand_texts[0].strip() if brand_texts else ''
        image_url = hxs.select('//img[@itemprop="image"]/@src').extract()[0]
        category = hxs.select(
            '//div[@class="breadcrumb"]//a/span[@itemprop="title"]/text()'
        ).extract()[-1]
        categories = hxs.select(
            '//ul[@class="simple"]/li/a/span[@itemprop="title"]/text()'
        ).extract()
        categories = categories[1:] if categories else []
        identifier = response.url.split('/')[-1].split('-')[0]
        price = hxs.select(
            '//div[@itemprop="offers"]//ins[@itemprop="price"]/text()'
        ).extract()[0]
        product_url = response.url
        stock = hxs.select(
            '//div[contains(@class, "productDetail")]//div[contains(@class, "availability")]//strong[contains(@class, "available")]/i[@class="icon-ok"]'
        ).extract()
        sellers_url = hxs.select(
            '//a[contains(@href, "all_seller")]/@href').extract()

        # Try the known dealer-name locations in order; first match wins.
        dealer = hxs.select(
            '//p[contains(@class, "sellby")]/a/strong/text()').extract()
        if not dealer:
            dealer = hxs.select(
                '//p[contains(@class, "sellby")]/strong/text()').extract()
        if not dealer:
            dealer = hxs.select(
                '//section[@class="col3"]//p/strong/text()').extract()

        dealer = dealer[0].strip() if dealer else 'Pixmania.com'

        if self.pixmania_direct and dealer != 'Pixmania.com':
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', identifier)
        if self.use_main_id_as_sku:
            loader.add_value('sku', identifier)
        if self.append_brand_to_name:
            loader.add_value('name', product_brand + ' ' + product_name)
        else:
            loader.add_value('name', product_name)
        if not self.full_category_path:
            loader.add_value('category', category)
        else:
            loader.add_value('category', categories)
        loader.add_value('brand', product_brand)
        loader.add_value('url', product_url)
        loader.add_value('image_url', image_url)

        if not stock:
            loader.add_value('stock', 0)

        if not self.only_buybox and not self.pixmania_direct and sellers_url:
            # Price and dealer are left for parse_sellers to fill in.
            item = loader.load_item()
            yield Request(sellers_url[0].strip(),
                          callback=self.parse_sellers,
                          meta={'product': item})
        else:
            loader.add_value('price', self._encode_price(price))
            loader.add_value('dealer', 'Pix - ' + dealer)
            item = loader.load_item()
            item['identifier'] += '-' + dealer
            if self.collect_reviews:
                reviews_url = add_or_replace_parameter(
                    self.reviews_url, 'filter.q0',
                    'productid:eq:%s' % identifier)
                reviews_url = add_or_replace_parameter(reviews_url,
                                                       'offset.q0', '0')
                yield Request(reviews_url,
                              meta={'products': [item]},
                              callback=self.parse_reviews)
            else:
                yield item
    def parse_item(self, response):
        '''Parse page of particular product.

        With one or two option selects in the add-to-cart form, one POST to
        remote.php is issued per value combination (each select also
        contributes an empty value) and handled by ``self.handle_json_response``.
        With any other number of selects the product is yielded directly from
        the page data. Nothing is yielded when the add-to-cart form is absent.
        '''

        hxs = HtmlXPathSelector(response)
        product_category = hxs.select(
            "//div[@id='ProductBreadcrumb']/ul/li/a/text()").extract()[1]
        product_name = hxs.select(
            "//div[@id='ProductDetails']//h1/text()").extract()[0]
        product_price = hxs.select(
            "//span[@class='ProductDetailsPriceIncTax']/text()").extract()
        product_id = hxs.select(
            "//input[@name='product_id']/@value").extract()[0]
        product_brand = response.meta['brand']
        product_image = hxs.select(
            "//div[@class='ProductThumbImage']/a/img/@src").extract()[0]

        product_details = hxs.select(
            "//form[@id='productDetailsAddToCartForm']").extract()
        options_to_check = []
        possible_options = {}
        product_option_attributes = []

        # A missing price is reported as price 0.00 with stock status 0.
        if product_price:
            product_price = product_price[0].encode('utf-8')
            product_price = re.findall(r'[0-9,]+\.[0-9]{2}', product_price)[0]
            product_price = float(product_price.replace(',', ''))
            stock_status = 1
        else:
            product_price = 0.00
            stock_status = 0

        if not product_details:
            return

        product_options = hxs.select(
            "//form[@id='productDetailsAddToCartForm']//div[@class='productOptionViewSelect']"
        )

        # Compiled once instead of on every loop iteration.
        value_re = re.compile(r'value="(.+?)"')
        title_re = re.compile(r'>(.+?)<\/option>')

        for num, product_option in enumerate(product_options):
            product_option_attributes.append(
                product_option.select(
                    "select[@class='validation']/@name").extract()[0])

            option_nodes = product_option.select(
                "*/option[not(@value='') and not(contains(text(), 'None'))]"
            ).extract()
            option_values = [''.join(value_re.findall(node))
                             for node in option_nodes]
            option_titles = [''.join(title_re.findall(node))
                             for node in option_nodes]

            # The trailing '' adds an "unselected" value for this option.
            possible_options[str(num + 1)] = option_values + ['']
            options_to_check.extend(
                {title: value}
                for title, value in zip(option_titles, option_values))

        # On the website there only products with 0, 1 and 2 numbers of options
        option_count = len(product_option_attributes)
        if option_count == 2:
            combinations = [(first, second)
                            for first in possible_options.get('1')
                            for second in possible_options.get('2')]
        elif option_count == 1:
            combinations = [(only,) for only in possible_options.get('1')]
        else:
            combinations = None

        if combinations is None:
            l = ProductLoader(item=Product(), response=response)

            l.add_value('price', product_price)
            l.add_value('stock', stock_status)
            l.add_value('identifier', product_id)
            l.add_value('category', product_category)
            l.add_value('url', response.url)
            l.add_value('name', product_name)
            l.add_value('image_url', product_image)
            l.add_value('brand', product_brand)

            yield l.load_item()
            return

        # Meta entries shared by every option request.
        common_meta = {
            'num_options': option_count,
            'options': options_to_check,
            'product_name': product_name,
            'product_id': product_id,
            'stock_status': stock_status,
            'product_url': response.url,
            'product_image': product_image,
            'category': product_category,
            'product_brand': product_brand
        }

        for chosen in combinations:
            post_data = {'actions': 'add', 'product_id': product_id}
            post_data.update(zip(product_option_attributes, chosen))
            post_data['w'] = 'getProductAttributeDetails'

            meta = dict(common_meta)
            meta['value_01'] = chosen[0]
            if option_count == 2:
                meta['value_02'] = chosen[1]

            yield FormRequest('http://www.sxpro.co.uk/remote.php',
                              formdata=post_data,
                              method='POST',
                              callback=self.handle_json_response,
                              meta=meta,
                              dont_filter=True)
Esempio n. 47
0
    def parse_item(self, response):
        """Yield one item per configurable option, or a single item without options.

        Options come from the ``"productConfig"`` JSON embedded in the page
        body and their labels from the ``"options"`` list of the attributes
        JSON. If either cannot be parsed the page is treated as a simple
        product and priced from the regular-price element.
        """

        hxs = HtmlXPathSelector(response)

        name = response.meta['name']
        url = response.url

        sku = hxs.select(
            "//p[@itemprop='identifier']/@content").extract()[0].replace(
                'sku:', '').strip()
        brand = ''.join(
            hxs.select("//span[@itemprop='brand']/text()").extract())
        image_url = ''.join(
            hxs.select("//div[@class='product-img-box']//img[@id='image']/@src"
                       ).extract())
        # The first breadcrumb link is skipped.
        categories = hxs.select("//div[@class='breadcrumbs']//a")[1:]
        categories = [
            category.select("./span/text()").extract()[0]
            for category in categories
        ]

        # NOTE(review): this XPath has no leading "//", so it is evaluated
        # relative to the selector root - confirm it can match at all.
        availability = hxs.select(
            "span[@itemprop='availability']/text()").extract()
        if availability:
            stock = 1 if availability[0].strip() == 'In stock' else 0
        else:
            # No availability element: fall back to scanning the page text.
            stock = 0 if 'out of stock' in response.body.lower() else 1

        if not categories:
            categories = response.meta['categories_tmp']

        try:
            options = json.loads(
                re.findall('\"productConfig\":(.+?),"productAttributes',
                           response.body)[0])
            options_names = json.loads(
                '{' +
                re.findall('{\"attributes\":.*(\"options.+?}]})',
                           response.body)[0])['options']
            # Map each product id to the label of the option listing it.
            options_names = dict((k, options_name['label'])
                                 for options_name in options_names
                                 for k in options_name['products'])
        except Exception:
            logging.error('No options found')
            options = None

        if options:
            price_text_re = re.compile('>(.+?)<')

            for option_id in options:
                option = options[option_id]

                l = ProductLoader(item=Product(), response=response)

                saving_price = price_text_re.findall(
                    option['saving_price'])[0].encode('ascii', 'ignore')
                retail_price = price_text_re.findall(
                    option['retail_price'])[0].encode('ascii', 'ignore')
                option_price = round(
                    float(retail_price) - float(saving_price), 2)
                sku_tmp = sku + '-' + str(option_id)

                # An option with no known label keeps the plain product name
                # rather than failing on str + None.
                option_name = options_names.get(option_id)
                option_full_name = (name + ' ' + option_name
                                    if option_name else name)

                # A stock-alert URL is offered only for unavailable options.
                option_stock = 0 if 'stockAlertUrl' in option else 1

                l.add_value('image_url', image_url)
                l.add_value('url', url)
                l.add_value('price', option_price)
                l.add_value('stock', option_stock)
                l.add_value('brand', brand)
                l.add_value('identifier', sku_tmp)
                l.add_value('sku', sku_tmp)
                l.add_value('name', option_full_name)

                for category in categories:
                    l.add_value('category', category)

                yield l.load_item()

        else:

            l = ProductLoader(item=Product(), response=response)

            # Drop the leading character of the price text and any square
            # brackets before handing it to the loader.
            price = hxs.select(
                "//span[@class='regular-price']/span[@class='price']/text()"
            ).extract()[0].strip()[1:].replace('[', '').replace(']', '')

            l.add_value('image_url', image_url)
            l.add_value('url', url)
            l.add_value('price', price)
            l.add_value('stock', stock)
            l.add_value('brand', brand)
            l.add_value('identifier', sku)
            l.add_value('sku', sku)
            l.add_value('name', name)

            for category in categories:
                l.add_value('category', category)

            yield l.load_item()
Esempio n. 48
0
    def parse_product_list(self, response):
        """Handle both listing pages and product pages with one callback.

        If the page contains product links (anchors inside
        ``h2.product-name``), each link is requested with this same callback.
        Otherwise the page is treated as a product page: option groups are
        collected from ``<select>`` elements and from the
        ``spConfig = new Product.Config(...)`` JavaScript, and one item is
        yielded per entry produced by ``multiply(opt_groups)``. In both cases
        the "next" pagination link is followed when present.
        """
        hxs = HtmlXPathSelector(response)

        #cats = hxs.select(u'//div[@id="RightColumn"]/table/tr/td/center/div[@class="contentsName"]/a/@href').extract()
        products = hxs.select('//h2[@class="product-name"]/a/@href').extract()
        if products:
            for url in products:
                #if url.split('.')[-1].lower() not in ('htm', 'html'):
                # Contains links to PDFs as well
                #    continue
                #url = urljoin_rfc(get_base_url(response), url)
                yield Request(url, callback=self.parse_product_list)
        else:
            # One list per option group; each entry is built by
            # self.fix_options (defined outside this block).
            opt_groups = []
            # def fix_options(what, o):
            #     try:
            #         return (what + ':' + o[0], o[1].replace(',', ''))
            #     except:
            #         return (what + ':' + o[0], '0')

            option_names = hxs.select(
                '//fieldset[@class="product-options"]/dl/dt/label/text()'
            ).extract()
            for i, option in enumerate(
                    hxs.select(
                        '//select[contains(@class, "product-custom-option") or contains(@class, "required-entry")]'
                    )):
                # The option label is looked up by the select's position.
                what = option_names[i].strip().replace(':', '')
                # NOTE(review): the first option text is dropped here while
                # option_ids below is not sliced - confirm that the two lists
                # line up when they are paired by map().
                opt_list = option.select(
                    u'./option[@value!="PleaseSelect" and @value!="Please Select" and text()!=""]/text()'
                ).extract()[1:]
                option_ids = option.select(
                    u'./option[@value!="PleaseSelect" and @value!="Please Select" and @value!=""]/@value'
                ).extract()
                # Each option text is split on '+' into [label, price] (price
                # defaults to '0') and the option id is appended.
                opt_list = map(lambda x, y: x + [y], [
                    o.split('+') if len(o.split('+')) > 1 else o.split('+') +
                    ['0'] for o in opt_list
                ], option_ids)
                # NOTE(review): this emptiness test relies on map() returning
                # a list (Python 2 behaviour).
                if opt_list:
                    opt_groups.append(
                        [self.fix_options(what, o) for o in opt_list])

            # Extract option from JavaScript code
            try:
                js_options = ''
                for line in response.body.split('\n'):
                    if "spConfig = new Product.Config(" in line:
                        js_options = line.split(
                            'spConfig = new Product.Config(')[1].split(');')[0]
                # Raises when no spConfig line was found (js_options is '');
                # the bare except below turns that into a log message.
                json_options = json.loads(js_options)

                for item in json_options['attributes'].iteritems():
                    options = item[-1]['options']
                    option_ids = []
                    opt_list = []
                    for option in options:
                        option_ids.append(option['id'])
                        opt_list.append(option['label'] + '+' +
                                        option['price'])

                    # NOTE(review): `i` is whatever the <select> loop above
                    # left behind, so every JSON attribute gets the label of
                    # the last select; when no select matched, `i` is unbound
                    # and the resulting error is swallowed by the except below.
                    what = option_names[i].strip().replace(':', '')
                    opt_list = map(lambda x, y: x + [y], [
                        o.split('+')
                        if len(o.split('+')) > 1 else o.split('+') + ['0']
                        for o in opt_list
                    ], option_ids)
                    opt_groups.append(
                        [self.fix_options(what, o) for o in opt_list])
            except:
                # NOTE(review): bare except - any failure above is reported as
                # "No JSON options".
                log.msg('No JSON options: ' + response.url)

            # With more than four option groups the options are discarded.
            if len(opt_groups) > 4:
                self.log("WARNING: Too many options, using base price only")
                opt_groups = []

            # multiply() is defined elsewhere; presumably it yields one
            # (name, price, id) tuple per combination of option groups -
            # TODO confirm, including what it yields for an empty list.
            for opt_name, opt_price, opt_id in multiply(opt_groups):
                product_loader = ProductLoader(item=Product(), selector=hxs)
                '''
                if not hxs.select(u'//div[@class="buybox"]'):
                    self.log("WARNING: NOT A PRODUCT")
                    return
                '''

                product_loader.add_value('url', response.url)
                product_loader.add_xpath('name', u'//h1/text()')

                # Price source priority: "Sale Price" table row, then
                # td > span.price, then div.itemRegPrice.
                if hxs.select(
                        '//tr[td/text()="Sale Price"]/td[text()!="Sale Price"]/text()'
                ):  #FIXME: fix the other prices
                    product_loader.add_xpath(
                        'price',
                        u'//tr[td/text()="Sale Price"]/td[text()!="Sale Price"]/text()'
                    )
                elif hxs.select('//td/span[@class="price"]/text()'):
                    product_loader.add_xpath(
                        'price', u'//td/span[@class="price"]/text()')
                else:
                    product_loader.add_xpath(
                        'price',
                        u'//div[@class="itemRegPrice"]/span/font/text()')

                sku = hxs.select('//tr[th/text()="MPN"]/td/text()').extract()
                sku = sku[0] if sku else ''
                product_loader.add_value('sku', sku)
                product_loader.add_xpath(
                    'category',
                    u'//div[@class="breadcrumbs"]/ul/li[contains(@class, "category")]/a/text()'
                )
                product_loader.add_xpath(
                    'image_url',
                    u'//div[@class="product-img-box"]//div[@class="prolabel-wrapper"]/a/img/@src'
                )
                #            product_loader.add_xpath('brand', u'substring-after(//div[@class="product-meta"]/span[contains(text(),"Manufacturer:")]/text(),":")')
                product_loader.add_value('shipping_cost', '')
                identifier = hxs.select(
                    '//input[@name="product"]/@value').extract()[0]
                # The option id, when there is one, is appended to the
                # product identifier.
                if opt_id:
                    product_loader.add_value('identifier',
                                             identifier + '-' + opt_id)
                else:
                    product_loader.add_value('identifier', identifier)

                product = product_loader.load_item()
                product['name'] = (product['name'] + ' ' + opt_name).strip()

                # The option price is added on top of the page price.
                if not 'price' in product:
                    product['price'] = Decimal(0)
                    self.log('ERROR price is not set, setting to default 0')
                else:
                    product['price'] = product['price'] + Decimal(opt_price)

                yield product

        # Follow pagination with this same callback.
        next = hxs.select('//a[@class="next i-next"]/@href').extract()
        if next:
            yield Request(next[0], callback=self.parse_product_list)
Esempio n. 49
0
    def parse(self, response):
        """Follow category and pagination links and yield the listed products.

        Every link in the section navigation and in the pagination block is
        requested with this same callback. Each ``<article>`` whose class
        contains ``product`` becomes one item; articles without a name, URL or
        price are skipped (the latter two with an error log entry).
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        category_links = hxs.select(
            "//nav[contains(@class, 'section_nav')]/ul/li//a/@href").extract()
        for link in category_links:
            yield Request(urljoin_rfc(base_url, link), callback=self.parse)

        page_links = hxs.select("//ul[@class='pagination']//a/@href").extract()
        for link in page_links:
            yield Request(urljoin_rfc(base_url, link), callback=self.parse)

        # The current category is the same for every product on the page, so
        # it is looked up once and reused below.
        category = hxs.select('//span[@class="current"]/text()').extract()[0]

        for item in hxs.select("//article[contains(@class, 'product')]"):
            name = item.select(
                ".//div/header[@class='productTitle']/a/text()").extract()
            if not name:
                continue
            # Collapse runs of whitespace inside the name.
            name = re.sub(r"\s+", " ", name[0].strip())

            # Prefer the hidden sFUPID input; otherwise take the id from the
            # description paragraph ("desc_<id>"), else use an empty string.
            identifier = item.select(
                'div/div/input[@name="sFUPID"]/@value').extract()
            if identifier:
                identifier = identifier[0]
            else:
                identifier = item.select(
                    'div[@class="productAdditional"]/p/@id').extract()
                if identifier:
                    identifier = identifier[0].split('desc_')[-1]
                else:
                    identifier = ''

            url = item.select(
                ".//div/div/header[@class='productTitle']/a/@href").extract()
            if not url:
                logging.error("ERROR! NO URL! URL: %s. NAME: %s" %
                              (response.url, name))
                continue
            url = url[0]

            price = item.select(
                ".//div//span[@class='currentPrice']/ins/text()").extract()
            if not price:
                logging.error("ERROR! NO PRICE! URL: %s. NAME: %s" %
                              (response.url, name))
                continue
            price = price[0].strip()

            l = ProductLoader(item=Product(), selector=item)
            l.add_value('name', name)
            l.add_value('url', url)
            l.add_value('identifier', identifier)
            l.add_value('price', price)
            l.add_value('category', category)
            l.add_xpath('image_url', 'a/img/@src')
            yield l.load_item()
Esempio n. 50
0
    def parse_cat(self, response):
        """Crawl a category listing page.

        Yields requests for sub-categories, per-product option pages and
        pagination links (all handled by this same callback), plus one item
        for every listed product that has no separate options page.

        A page showing neither sub-categories nor product rows is requested
        again, up to 10 times; the attempt count travels in
        ``response.meta['retry']``.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        subcats = hxs.select(
            '//div[contains(@class,"category-fourgrid") or contains(@class,"sub-category-grid")]//a/@href'
        ).extract()
        productsxs = hxs.select(
            '//div[contains(@class,"product-list-row") and div[contains(@class, "product-info")]]'
        )

        if not subcats and not productsxs:
            # Nothing usable on the page: retry the very same request.
            retry = int(response.meta.get('retry', 0))
            if retry < 10:
                retry += 1
                new_req = response.request.copy()
                new_req.meta['retry'] = retry
                new_req.dont_filter = True
                yield new_req
            return

        for url in subcats:
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_cat,
                          meta=response.meta)

        for productxs in productsxs:
            product_options_link = productxs.select(
                './/div[@class="form-row"]/a/@href').extract()
            if product_options_link:
                # The row links to a page listing its options; crawl that
                # page instead of emitting the row itself.
                yield Request(urljoin_rfc(base_url, product_options_link[0]),
                              callback=self.parse_cat,
                              meta=response.meta)
                continue

            product_url = productxs.select(
                './/h3[@class="product-name"]/a/@href').extract()
            if not product_url:
                # A single malformed row must not abort the whole page.
                log.msg('Skipping product without URL: %s' % response.url)
                continue

            loader = ProductLoader(item=Product(), selector=productxs)
            loader.add_value(
                'price', ''.join(
                    productxs.select(
                        './/div[@class="price"]//text()').extract()))
            if productxs.select(
                    './/img[@alt="In stock" or contains(@alt,"days delivery") or contains(@alt,"Day Delivery") or contains(@alt,"Hour Delivery")]'
            ):
                loader.add_value('stock', 1)
            else:
                loader.add_value('stock', 0)
            loader.add_xpath('identifier', './/p[@class="code"]/text()')
            loader.add_value('url', urljoin_rfc(base_url, product_url[0]))
            loader.add_xpath('name',
                             './/h3[@class="product-name"]/a/text()')
            loader.add_value('category', response.meta.get('category'))
            loader.add_value(
                'sku',
                self.map_sku(''.join(
                    productxs.select(
                        './/p[@class="code"]/text()').extract())))
            img = productxs.select(
                './/div[@class="primaryImageDiv"]//img/@src').extract()
            if img:
                loader.add_value(
                    'image_url',
                    urljoin_rfc(base_url,
                                img[0].replace('/medium/', '/large/')))
            loader.add_xpath('brand', './/img[@class="brand-image"]/@alt')

            # The brand image may be absent, in which case the loader has
            # no output value for it.
            brand = (loader.get_output_value('brand') or '').strip().upper()
            if brand in self.ignore_brands:
                log.msg('Ignoring %s product: %s' % (brand, response.url))
                # Skip only this product; the remaining rows and the
                # pagination links below must still be processed.
                continue

            item = self.add_shipping_cost(loader.load_item())

            if item.get('identifier', '').strip():
                yield item

        for url in hxs.select('//ul[@class="pager"]//a/@href').extract():
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_cat,
                          meta=response.meta)
Esempio n. 51
0
    def parse_product(self, response):
        """Parse a product page and yield the product or one item per variant.

        Pages without a brand or a product title are ignored. When the page
        lists variants (``tr.variant`` rows inside ``#variantselector``) one
        item is yielded per variant, its identifier suffixed with the
        variant's input value; otherwise the base product is yielded.
        Identifiers already present in ``self.idents`` are not yielded again.
        """
        hxs = HtmlXPathSelector(response)

        try:
            brand_name = hxs.select(
                '//*[@itemprop="brand"]/text()').extract()[0]
            name = hxs.select('//h1[@class="product-title"]/span/text()'
                              ).extract()[-1].strip()
        except IndexError:
            # Brand or title missing: nothing to scrape on this page.
            return

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_value('name', brand_name + ' ' + name)
        product_loader.add_xpath(
            'sku',
            u'//div[@class="additional-product-no" and contains(text(), "Manufacturer Item no.")]',
            re=r'Manufacturer Item no\. (.*)')
        product_loader.add_xpath('identifier', u'//body/@data-vw-id')

        price = hxs.select(
            '//div[contains(@class, "artikel-detail")]//*[@itemprop="price"]/text()'
        ).extract()
        price = price[0] if price else ''

        # Drop '.' and turn ',' into '.', i.e. "1.234,56" becomes "1234.56".
        product_loader.add_value('price',
                                 price.replace('.', '').replace(',', '.'))

        breadcrumb = hxs.select(
            '//nav[@id="breadcrumb"]//a/text()').extract()
        if breadcrumb:
            product_loader.add_value('category', breadcrumb[-1])

        product_loader.add_value('brand', brand_name.strip())

        image_src = hxs.select('//img[@itemprop="image"]/@src').extract()
        if image_src:
            # The previous code joined against an undefined ``base_url``;
            # the resulting NameError was silently swallowed and no image
            # was ever stored. Resolve against the page URL instead.
            product_loader.add_value(
                'image_url', urljoin_rfc(response.url, image_src[0]))

        product = product_loader.load_item()
        options = hxs.select(
            '//div[@id="variantselector"]//tr[@class="variant"]')
        if options:
            for opt in options:
                p = Product(product)
                try:
                    p['name'] = p['name'] + ' ' + opt.select(
                        u'.//td[2]/label/text()').extract()[0]
                except IndexError:
                    # No option name extension
                    pass
                p['identifier'] = p['identifier'] + '-' + opt.select(
                    u'.//input/@value').extract()[0]
                if p['identifier'] not in self.idents:
                    self.idents.append(p['identifier'])
                    yield p
        else:
            if product['identifier'] not in self.idents:
                self.idents.append(product['identifier'])
                yield product
Esempio n. 52
0
    def parse_product(self, response):
        """Parse a product page and yield the product or its options.

        A page without a product name is requested again while
        ``response.meta['retry']`` is at most 3; in either case parsing of
        that response stops. When the page contains a
        ``super-product-table`` one item is yielded per table row,
        otherwise the single product is yielded.
        """
        name = response.xpath('//span[@itemprop="name"]/text()').extract()
        if not name:
            retry = response.meta.get('retry', 0)
            if retry <= 3:
                yield Request(response.url,
                              dont_filter=True,
                              callback=self.parse_product,
                              # keep the category so the retried page is
                              # still filed under it
                              meta={
                                  'retry': retry + 1,
                                  'category': response.meta.get('category', '')
                              })
            else:
                import logging
                logging.getLogger(__name__).warning(
                    'Giving up on %s: product name not found', response.url)
            # Never continue parsing a page that has no product name.
            return
        name = name[0].strip()

        url = response.url
        l = ProductLoader(item=Product(), response=response)
        l.add_value('name', name)

        # Prefer the discounted price, fall back to the regular one.
        price = response.xpath(
            '//p[@class="special-price"]/span[@class="price"]/text()').extract(
            )
        if price:
            price = price[0]
        else:
            price = response.xpath(
                '//span[@class="regular-price"]/span[@class="price"]/text()'
            ).extract()
            if price:
                price = price[0]
        l.add_value('price', price)

        sku = response.xpath(
            '//div[@class="product-shop--sku"]/h4/span/text()').extract()
        if sku:
            l.add_value('sku', sku[0])

        identifier = response.css(
            'div.nosto_product span.product_id::text').extract(
            ) or response.xpath('//input[@id="entity_id"]/@value').extract()
        l.add_value('identifier', identifier[0])

        l.add_value('category', response.meta.get('category', ''))

        image_url = response.xpath(
            '//span[@class="image_url"]/text()').extract()
        l.add_value('image_url', image_url)
        l.add_value('url', url)
        l.add_xpath('brand', '//span[@class="brand"]/text()')

        out_of_stock = response.xpath(
            '//div[contains(@class, "availability-box")]/p[contains(@class, "out-of-stock")]'
        )
        if out_of_stock:
            l.add_value('stock', 0)

        item = l.load_item()

        options = response.xpath('//table[@id="super-product-table"]/tbody/tr')
        if options:
            for option in options:
                option_item = deepcopy(item)
                option_item['name'] = option.xpath('td[1]/text()').extract()[0]
                price = option.xpath(
                    'td//span[@class="price"]/text()').extract()
                price = extract_price(price[0]) if price else 0
                option_item['price'] = price
                identifier = option.xpath('td//input/@name').re(r'\[(.*)\]')
                if not identifier:
                    # Rows without an input are marked out of stock; their
                    # id is taken from the price span instead.
                    identifier = option.xpath('td//span/@id').re(
                        r'product-price-(.*)')
                    option_item['stock'] = 0

                option_item['identifier'] += '-' + identifier[0]
                yield option_item
        else:
            yield item
Esempio n. 53
0
    def parse_product(self, response):
        """Parse a product page and yield one item per purchasable variant.

        Handled in order:

        * HTTP 405 (anti-bot captcha): the originally requested URL is
          requested again, giving up once ``meta['retries']`` reaches 9.
        * Pages embedding a JSON description in the ``data`` attribute of
          ``div.v2-product-subproducts``: one item per sub product, or a
          single item when there are none.
        * Other pages: ``sub-products`` blocks, ``<option data-name>``
          entries, or a single product.

        Every yielded item carries a ``FragranceDirectMeta`` under
        ``metadata``.
        """
        if response.status == 405:
            # ``redirect_urls`` only exists when the request was redirected;
            # otherwise the response URL is the one that was requested.
            redirect_urls = response.meta.get('redirect_urls') or [response.url]
            url = redirect_urls[0]
            retries = response.meta.get('retries', 0)
            if retries >= 9:
                self.logger.error(
                    'Gave up retrying avoid antibot captcha for %s' % url)
                return
            self.logger.debug('DistilNetworks antibot captcha. Retrying %s' %
                              url)
            yield response.request.replace(dont_filter=True,
                                           url=url,
                                           meta={
                                               'retries': retries + 1,
                                               'dont_merge_cookies': True
                                           })
            return

        if response.url in self.old_urls:
            self.old_urls.remove(response.url)

        category_xpath = "//div[@id='breadcrumb']//li[position() > 1 and position() < last()]//text()"

        options_data = response.xpath(
            "//div[@class='v2-product-subproducts']//@data").extract()
        if options_data:
            options_data = json.loads(options_data[0])

            product_name = options_data['name']
            if not options_data.get('sku', 0):
                return

            sub_products = options_data['sub_products']
            if sub_products:
                # Iterate the sub products themselves. Looping over
                # ``options_data`` walks the dict's keys and fails on the
                # first subscript.
                # NOTE(review): assumes a list of dicts; a mapping of
                # id -> dict is tolerated too. Confirm the real shape.
                if isinstance(sub_products, dict):
                    sub_products = list(sub_products.values())

                for sub_option in sub_products:
                    loader = ProductLoader(item=Product(),
                                           response=response)
                    price = extract_price(
                        sub_option['prices']['price']['amount'])

                    loader.add_value('url', response.url)

                    option_name = sub_option['option1']
                    # unicode template, like the HTML branch below, so that
                    # non-ASCII names do not break formatting on Python 2
                    loader.add_value(
                        'name',
                        u"{product} {option}".format(product=product_name,
                                                     option=option_name))
                    loader.add_value('stock',
                                     sub_option['stock']['is_in_stock'])

                    loader.add_xpath('category', category_xpath)
                    loader.add_xpath(
                        'brand',
                        "//div[@class='v2-gallery-block']//img/@alt")

                    shipping_cost = self._shipping_cost_for(price)

                    # Add shipping cost to product price
                    loader.add_value('shipping_cost', shipping_cost)
                    loader.add_value('price', price + shipping_cost)

                    loader.add_value('sku', sub_option['sku'])
                    loader.add_value('identifier', sub_option['sku'])

                    # The image path is a value, not an XPath expression.
                    loader.add_value(
                        'image_url',
                        sub_option['main_image']['large_path'])

                    yield self._attach_metadata(response, loader.load_item())
            else:
                loader = ProductLoader(item=Product(), response=response)
                price = extract_price(
                    options_data['prices']['price']['amount'])

                # NOTE(review): 'price' is added here and again below with
                # the shipping cost; which value ends up in the item depends
                # on ProductLoader's processors. Confirm the intent.
                loader.add_value('price', price)
                loader.add_value('url', response.url)

                loader.add_value('name', product_name)
                loader.add_value('stock',
                                 options_data['stock']['is_in_stock'])

                loader.add_xpath('category', category_xpath)
                loader.add_xpath(
                    'brand', "//div[@class='v2-gallery-block']//img/@alt")

                shipping_cost = self._shipping_cost_for(price)

                # Add shipping cost to product price
                loader.add_value('shipping_cost', shipping_cost)
                loader.add_value('price', price + shipping_cost)

                loader.add_value('sku', options_data['sku'])
                loader.add_value('identifier', options_data['sku'])

                loader.add_value('image_url',
                                 options_data['main_image']['large_path'])

                yield self._attach_metadata(response, loader.load_item())
            return

        product_name = response.xpath(
            "//h1[@class='fn']//text()").extract()[0]
        options = response.xpath(
            "//div[contains(@class, 'sub-products')]/div")
        sku = ''.join(
            response.xpath(
                "//form[@name='notifications']//input[@name='p']/@value").
            extract())
        if options:
            for sub_option_2 in options:
                sku_option = ''.join(
                    sub_option_2.xpath("./label/@data-sub-sku").extract())

                loader = ProductLoader(item=Product(), response=response)
                price = extract_price(
                    sub_option_2.xpath("./label/@data-subprice").extract()
                    [0])
                if not price:
                    # fall back to the page-level price
                    price = extract_price(''.join(
                        response.xpath(
                            '//p[@class="price-info"]//span[@class="Price"]/text()'
                        ).extract()).strip())

                # NOTE(review): 'price' is added twice (see below); confirm
                # which value ProductLoader is expected to keep.
                loader.add_value('price', price)
                loader.add_value('url', response.url)

                option_name = sub_option_2.xpath(
                    "./label/@data-option").extract()[0]
                loader.add_value(
                    'name',
                    u"{product} {option}".format(product=product_name,
                                                 option=option_name))

                stock = ''.join(
                    sub_option_2.xpath(
                        "./label/@data-stock").extract()).strip().lower()
                if stock in ['limited', 'in stock']:
                    stock = '1'
                else:
                    stock = '0'
                loader.add_value('stock', stock)

                loader.add_xpath('category', category_xpath)
                loader.add_xpath('brand',
                                 "//a[@class='product-brand']//img/@alt")

                shipping_cost = self._shipping_cost_for(price)

                # Add shipping cost to product price
                loader.add_value('shipping_cost', shipping_cost)
                loader.add_value('price', price + shipping_cost)

                loader.add_value('sku', sku_option)
                loader.add_value('identifier',
                                 '{}_{}'.format(sku, sku_option))

                # NOTE(review): this selects a child *element* named
                # data-image-large; an attribute was presumably meant.
                # Left unchanged pending confirmation against the site.
                img = ''.join(
                    sub_option_2.xpath("./data-image-large").extract())
                if not img:
                    img = ''.join(
                        response.xpath(
                            "//img/@data-original-large").extract())
                if img:
                    # only emit an image URL when one was actually found
                    loader.add_value('image_url', 'http:' + img)

                yield self._attach_metadata(response, loader.load_item())
            return

        options = response.xpath('//option[@data-name]')
        if options:
            for opt in options:
                loader = ProductLoader(item=Product(), response=response)
                product_image_json = opt.xpath('@data-image').extract()
                if product_image_json:
                    product_image_data = json.loads(product_image_json[0])
                    loader.add_value('image_url',
                                     product_image_data['default'])

                product_stock = opt.xpath('@data-stock').extract()[0]
                if product_stock == 'Out of Stock':
                    loader.add_value('stock', 0)

                option_name = opt.xpath('@data-name').extract()[0]
                loader.add_value('name', product_name + ' ' + option_name)

                price_data = json.loads(
                    opt.xpath('@data-price').extract()[0])
                loader.add_value('price', price_data['price'])

                option_sku = opt.xpath('@value').extract()[0]
                loader.add_value('sku', option_sku)
                loader.add_value('identifier', sku + '_' + option_sku)

                loader.add_xpath('category', category_xpath)
                loader.add_xpath('brand',
                                 "//a[@class='product-brand']//img/@alt")

                loader.add_value('url', response.url)

                # Read the price back through the loader's processors.
                price = loader.get_output_value('price')
                shipping_cost = self._shipping_cost_for(price)

                # Add shipping cost to product price
                # NOTE(review): second 'price' value on this loader; confirm
                # which one ProductLoader is expected to keep.
                loader.add_value('shipping_cost', shipping_cost)
                loader.add_value('price', price + shipping_cost)

                yield self._attach_metadata(response, loader.load_item())
            return

        if not sku:
            return

        loader = ProductLoader(item=Product(), response=response)
        price = ''.join(
            response.xpath(
                '//p[@class="price-info"]//span[@class="Price"]/text()'
            ).extract()).strip()
        if price == '':
            price = ''.join(
                response.xpath(
                    "//span[@class='Price ']//span[@class='Price-integer' or @class='Price-decimal']//text()"
                ).extract())
        if price == '':
            self.log("Error! No price! URL: {}".format(response.url))
            return
        price = extract_price(price)
        loader.add_value('url', response.url)

        loader.add_value('name', product_name)

        stock = ''.join(
            response.xpath("//span[@class='stock-level']//text()").
            extract()).strip()

        if stock.lower() in ['limited', 'in stock']:
            stock = '1'
        else:
            stock = '0'

        loader.add_value('stock', stock)

        loader.add_xpath('category', category_xpath)
        loader.add_xpath('brand', "//a[@class='product-brand']//img/@alt")

        shipping_cost = self._shipping_cost_for(price)

        # Add shipping cost to product price
        loader.add_value('shipping_cost', shipping_cost)
        loader.add_value('price', price + shipping_cost)

        loader.add_xpath(
            'sku',
            "//form[@name='notifications']//input[@name='p']/@value")
        loader.add_xpath(
            'identifier',
            "//form[@name='notifications']//input[@name='p']/@value")

        loader.add_xpath('image_url', "//img/@data-original-large")

        yield self._attach_metadata(response, loader.load_item())

    def _shipping_cost_for(self, price):
        """Return the shipping surcharge: 2.95 when price is below 10, else 0."""
        if price < 10:
            return extract_price('2.95')
        return 0

    def _attach_metadata(self, response, product):
        """Set ``product['metadata']`` (promotion text, VAT-exclusive price) and return product."""
        promotion = response.xpath(
            "//div[@id='product-offer-tab']//h3//text()").extract()
        metadata = FragranceDirectMeta()
        if promotion:
            metadata['promotion'] = promotion[0]
        if product.get('price'):
            # price divided by 1.2
            metadata['price_exc_vat'] = Decimal(
                product['price']) / Decimal('1.2')
        product['metadata'] = metadata
        return product
Esempio n. 54
0
    def parse_product(self, response):
        """Parse a product page, following its option variants once.

        A 500/504 response whose URL contains a port number is requested
        again without the port, at most 3 times (``meta['retry']``). Pages
        without a product id are handed to ``self.retry``. When the page
        embeds ``PRODUCT_METADATA_JSON`` and ``meta['check_options']`` is
        not False, every id in its attribute lookup is requested as its own
        product page. The identifier of each yielded item is appended to
        ``self.new_ids``.
        """
        retry_count = response.meta.get('retry', 0)
        # Both error codes share the retry cap. The previous condition,
        # ``a == 504 or a == 500 and retry < 3``, applied the cap to 500
        # only because ``and`` binds tighter than ``or``.
        if response.status in (500, 504) and retry_count < 3:
            port_number = re.findall(r':\d+', response.url)
            if port_number:
                meta = response.meta.copy()
                meta['retry'] = retry_count + 1
                new_url = re.sub(r":\d+", "", response.url)
                log.msg('ERROR >>> Redirect, port number in url : ' + new_url)
                yield Request(new_url,
                              dont_filter=True,
                              callback=self.parse_product,
                              meta=meta)
                return

        hxs = HtmlXPathSelector(response)

        identifier = hxs.select(
            '//span[@itemprop="productID"]/text()').extract()

        if not identifier:
            request = self.retry(
                response,
                "ERROR >>> No identifier for product URL: " + response.url)
            if request:
                yield request
            return

        identifier = identifier[0]

        json_data = re.findall("PRODUCT_METADATA_JSON = (.*);", response.body)
        check_options = response.meta.get('check_options', True)
        if json_data and check_options:
            json_data = demjson.decode(
                json_data[0])['attributeDefinition']['attributeLookup']
            for value in json_data.values():
                # Option pages live at the same URL with the product id
                # swapped; they must not fan out into options again.
                option_url = response.url.replace(identifier, str(value))
                yield Request(option_url,
                              callback=self.parse_product,
                              meta={
                                  'check_options': False,
                                  'dont_retry': True
                              })

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)
        price = hxs.select(
            '//div[@class="pricingReg"]/span[@itemprop="price"]/text()'
        ).extract()
        if not price:
            price = hxs.select('//span[@id="ajaxPrice"]/text()').extract()

        price = price[0] if price else 0
        loader.add_value('price', price)
        loader.add_xpath('name', '//h1[@class="product-title__title"]/text()')
        image_url = hxs.select('//img[@id="mainImage"]/@src').extract()
        if image_url:
            loader.add_value('image_url', image_url[0])

        categories = []
        json_breadcrumb = re.findall("var BREADCRUMB_JSON = (.*);",
                                     response.body)
        if json_breadcrumb:
            json_breadcrumb = demjson.decode(json_breadcrumb[0])
            categories = json_breadcrumb['bcEnsightenData'][
                'contentSubCategory'].split('>')
        loader.add_value('category', categories)

        brand = hxs.select('//h2[@itemprop="brand"]/text()').extract()
        brand = brand[0].strip() if brand else ''
        loader.add_value('brand', brand)

        sku = response.xpath('//script/text()').re('"modelNumber":"(.+?)"')
        loader.add_value('sku', sku)

        # No price, or an explicit out-of-stock banner, means stock 0.
        if not loader.get_output_value(
                'price') or 'OUT OF STOCK ONLINE' in response.body.upper():
            loader.add_value('stock', 0)

        item = loader.load_item()

        discontinued = hxs.select('//span[@class="discontinuedItem show"]')
        if discontinued:
            item['price'] = 0
            item['stock'] = 0

        if item['identifier']:
            self.new_ids.append(item['identifier'])
            yield item
Esempio n. 55
0
    def parse_product(self, response):
        """Parse a product page, then follow its alternative-colour pages.

        Pages without an ``itemprop="name"`` heading are delegated to
        ``self.parse_dvf``; if that yields nothing the failure is recorded
        in ``self.errors``. Values passed in ``response.meta`` (``url``,
        ``sku``, ``brand``, ``category``) take precedence over the values
        scraped from the page.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        meta = response.meta

        name = hxs.select('//h1[@itemprop="name"]/text()').extract()
        if not name:
            item = self.parse_dvf(response)
            if not item:
                self.errors.append("Name not found on " + response.url)
            else:
                yield item
            return

        l = ProductLoader(item=Product(), response=response)

        # The brand heading may be absent; do not crash, just leave the
        # name unprefixed.
        page_brand = hxs.select('//h2[@itemprop="brand"]/a/text()').extract()
        brand = page_brand[0] if page_brand else ''
        product_name = name.pop()
        l.add_value('name',
                    brand + ' ' + product_name if brand else product_name)

        url = meta.get('url') if meta.get('url', None) else response.url
        l.add_value('url', url)

        # Try progressively weaker sources for the identifier.
        identifier = hxs.select(
            '//input[@id="productId" and @value!=""]/@value').extract()
        if not identifier:
            identifier = hxs.select('//*[@itemprop="sku"]/@content').extract()
        if not identifier:
            identifier = re.findall("product/([^/]*)/", url)

        if identifier:
            identifier = identifier[0]
        l.add_value('identifier', identifier)

        sku = meta.get('sku', None)
        if not sku:
            sku = hxs.select('//meta[@itemprop="sku"]/@content').extract()
            sku = sku[0] if sku else ''
        l.add_value('sku', sku)

        brand = meta.get('brand') if meta.get('brand', None) else brand
        l.add_value('brand', brand)

        image_url = hxs.select('//img[@id="medium-image"]/@src').extract()
        if image_url:
            l.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        l.add_value('category', meta.get('category'))
        price = hxs.select('//span[@itemprop="price"]/text()').extract()
        if price:
            price = extract_price(price[0])
        else:
            price = 0
        l.add_value('price', price)

        out_of_stock = hxs.select(
            '//div[@class="sold-out-message"]/span/text()').extract()
        if out_of_stock:
            l.add_value('stock', 0)

        yield l.load_item()

        colors = hxs.select(
            '//div[@id="alternative-colors"]/a/@href').extract()
        for color in colors:
            # ``color`` is already the href string; indexing it would keep
            # only its first character.
            yield Request(urljoin_rfc(base_url, color),
                          callback=self.parse_product)
Esempio n. 56
0
    def parse(self, response):
        """Build and return a single product item from a product page.

        Identifier, SKU and price come from the '#'-separated
        ``data-weight`` attribute of the size selector when present,
        otherwise from the ``data-sku`` block. Stock is 1 when a price was
        found and 0 otherwise; a shipping cost of 5.95 is recorded for
        prices below 40. ``metadata`` holds the whitespace-joined text of
        the second paragraph of ``#product-offers``.
        """
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)

        # Pages on which the second size entry is read instead of the first.
        second_entry_urls = (
            'http://www.thebodyshop.fr/parfums/best-sellers/eau-de-toilette-white-musk.aspx',
        )

        weights = hxs.select(
            '//ul[@class="size-selector"]//label[1]/@data-weight').extract()
        if weights:
            chosen = weights[1] if response.url in second_entry_urls else weights[0]
            fields = chosen.split('#')
            code = fields[4]
            price = extract_price(fields[5])
        else:
            code = ''.join(hxs.select("//div[@data-sku]/@data-sku").extract())
            price_text = ''.join(
                hxs.select("//div[@data-sku]//p[contains(concat('',@class,''), 'price ')]//text()").extract()
            )
            price = extract_price(price_text)
        loader.add_value('identifier', code)
        loader.add_value('sku', code)
        loader.add_value('price', price)

        titles = hxs.select('//h1[@class="title"]/@title').extract()
        if not titles:
            log.msg('### No name at '+response.url, level=log.INFO)
        else:
            loader.add_value('name', titles[0].strip())

        # A product counts as in stock as soon as it has a price.
        loader.add_value('stock', 1 if price else 0)

        images = hxs.select('//img[@class="product"]/@src').extract()
        if images:
            loader.add_value('image_url',
                             urljoin(response.url, images[0].strip()))

        loader.add_value('brand', 'THE BODY SHOP')

        # Every breadcrumb link except the first becomes a category level.
        crumbs = hxs.select(
            '//nav[@id="breadcrumb_product"]/ul/li/a/text()').extract()
        for crumb in crumbs[1:]:
            loader.add_value('category', crumb)

        if price < 40:
            loader.add_value('shipping_cost', 5.95)

        product = loader.load_item()

        offer_texts = hxs.select(
            "//div[@id='product-offers']/p[2]//text()").extract()
        stripped = (text.strip() for text in offer_texts)
        product['metadata'] = ' '.join([text for text in stripped if text])

        return product
Esempio n. 57
0
    def parse_product(self, response):
        """Parse an eBay item page and yield its product(s).

        Pages that contain a ``ResultSetItems`` block are delegated to
        ``self.parse``.  Otherwise a base product is built from the page
        (name, identifier, category, dealer, brand, image, price, shipping
        cost and optionally stock).  When the page embeds a variations map,
        one product per variation is yielded; if not, the base product is
        yielded as is.

        If the variations map is present but cannot be decoded, the request
        is re-issued up to 10 times (counter kept in
        ``response.meta['retry_no']``) and nothing else is yielded.

        Reads ``self.just_last_category``, ``self.id_as_sku`` and
        ``self._extract_stock_amount``.
        """
        if response.xpath('//div[@id="ResultSetItems"]'):
            for x in self.parse(response):
                yield x
            return

        first_name = ' '.join(
            response.xpath('//*[@id="itemTitle"]/text()').extract()).strip()
        if not first_name:
            return

        # Last path segment of the URL, without the query string.
        identifier = response.url.split('?')[0].split('/')[-1]

        try:
            # The first breadcrumb entry is skipped.
            category = response.xpath(
                '//ul[@itemtype="http://schema.org/Breadcrumblist"]')[0].xpath(
                    './/span[@itemprop="name"]/text()').extract()[1:]
        except IndexError:
            # No breadcrumb list on the page.
            category = []
        if category and self.just_last_category:
            category = category[-1]

        seller_id = ''.join(
            response.xpath('.//*[contains(@class, "si-content")]'
                           '//a/*[@class="mbg-nw"]/text()').extract())

        # The attribute label is localised; take the first label that yields
        # any non-blank text.  ``//text()`` already covers nested h2/h3
        # children, so no separate queries are needed for those.
        brand = []
        for brand_label in ('Brand', 'Marke', 'Hersteller', 'Marque'):
            texts = response.xpath(
                '//*[@class="attrLabels" and contains(text(), "%s")]'
                '/following-sibling::*[1]//text()' % brand_label).extract()
            brand = [s for s in texts if s.strip() != '']
            if brand:
                break

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('name', first_name)
        product_loader.add_value('identifier', identifier)
        if self.id_as_sku:
            product_loader.add_value('sku', identifier)
        product_loader.add_value('category', category)
        product_loader.add_value('dealer', 'eBay - ' + seller_id)
        product_loader.add_value('brand', brand)
        product_loader.add_xpath('image_url', '//img[@id="icImg"]/@src')
        product_loader.add_value('url', response.url)

        # Price: visible price elements first, then the embedded JSON.
        price = None
        for price_xpath in ('//*[@id="prcIsum"]/text()',
                            '//*[@id="mm-saleDscPrc"]/text()'):
            found = response.xpath(price_xpath).extract()
            if found:
                price = found[0].strip()
                break
        if price is None:
            for price_key in ('binPrice', 'bidPrice'):
                # Lazy and confined to the quoted value: a greedy ``.*``
                # here leaves only the last character for the capture group
                # and can run on into later JSON fields.
                match = re.search(
                    r'"%s":"[^"]*?([\d\.,]+)",' % price_key, response.body)
                if match:
                    price = match.group(1)
                    break
        if price is None:
            self.log('No price found => %s' % response.url)
            return
        product_loader.add_value('price', self.extract_price(price))

        # shipping cost (best effort: a missing or unparsable value is skipped)
        shipping = response.xpath(
            '//*[@id="shippingSection"]//td/div/text()').extract()
        if shipping and shipping[0]:
            shipping_cost = shipping[0]
            if 'free' in shipping_cost.lower():
                product_loader.add_value('shipping_cost', 0)
            else:
                try:
                    product_loader.add_value(
                        'shipping_cost', self.extract_price(shipping_cost))
                except Exception:
                    self.log('Could not parse shipping cost %r => %s' %
                             (shipping_cost, response.url))

        # stock amount
        if self._extract_stock_amount:
            in_stock = ''.join(
                response.xpath('//*[@id="qtySubTxt"]//text()').extract())
            if 'More than' in in_stock:
                stock = 11
            else:
                # First of the longest digit runs in the quantity text.
                numbers = re.findall(r'\d+', in_stock)
                stock = max(numbers, key=len) if numbers else ''
            if stock:
                product_loader.add_value('stock', stock)

        product_ = product_loader.load_item()

        options_variations = []

        sel = Selector(text=response.body.replace('&quot;', ''))
        try:
            json_var_map = unicode(
                sel.xpath('//*/text()').re(r'("menuItemMap":{.*}.*),'
                                           '"unavailableVariationIds"')[0])
        except IndexError:
            # No variations map on this page: single product.
            json_var_map = None

        if json_var_map is not None:
            try:
                variations = demjson.decode(
                    '{' +
                    re.sub(r',"unavailableVariationIds".*', '', json_var_map) +
                    '}')

                menu_map = variations['menuItemMap']

                for key, variation in variations['itemVariationsMap'].items():
                    if not variation['traitValuesMap']:
                        continue
                    # Translate each trait's value id into its display name.
                    new_variation = {}
                    for option, value in variation['traitValuesMap'].items():
                        new_variation[option] = \
                            menu_map[str(value)]['displayName']
                    options_variations.append({
                        'price': variation['price'],
                        'values': new_variation,
                        'stock': variation['quantityAvailable'],
                        'identifier': '%s:%s' % (identifier, key),
                    })
            except Exception:
                # The map was found but could not be decoded or walked;
                # fetch the page again instead of emitting a partial result.
                retry_no = int(response.meta.get('retry_no', 0)) + 1
                if retry_no <= 10:
                    self.log('Retrying No. %s => %s' %
                             (retry_no, response.url))
                    req = response.request.copy()
                    req.meta['retry_no'] = retry_no
                    req.dont_filter = True
                    yield req
                else:
                    self.log('Gave up retrying => %s' % response.url)
                return

        if not options_variations:
            yield product_
            return

        for model in options_variations:
            option_names = ' '.join(
                opt_name.strip().lower()
                for opt_name in model['values'].values())
            new_product = Product(product_)
            new_product['name'] = first_name + ' ' + option_names
            new_product['identifier'] = model['identifier']
            new_product['price'] = self.extract_price(model['price'])
            new_product['stock'] = model['stock']

            yield new_product
Esempio n. 58
0
    def parse_product(self, response):
        """Parse a product page and yield the product or its line items.

        A base item is built from the page-level fields.  If the page lists
        ``lineItem`` blocks, one copy of the base item is yielded per block
        with its own price, name, SKU and a ``<base id>-<line id>``
        identifier; blocks without an id input are skipped.  Without line
        items the base item itself is yielded.

        Identifiers already recorded in ``self.ids`` are not yielded again.
        """
        hxs = HtmlXPathSelector(response)

        # Price locations, tried in order; the first one with text wins.
        price_xpaths = (
            '//div[@class="price pos1priceDisplayStyleOverride"]/text()',
            '//p[@itemprop="price"]/text()',
            '//span[@itemprop="price"]/text()',
        )

        def first_price(node, prefix=''):
            # Return the first price text under `node`, or None.
            for xpath in price_xpaths:
                found = node.select(prefix + xpath).extract()
                if found:
                    return found[0]
            return None

        def short_sku(node, xpath):
            # Return the code following '#' in the short-SKU text, or ''.
            found = node.select(xpath).re(r'#(\w+).')
            return found[0] if found else ''

        loader = ProductLoader(item=Product(), response=response)
        identifier = hxs.select('//input[@name="itemId"]/@value').extract()[0]
        loader.add_value('identifier', identifier)

        page_sku = short_sku(hxs, '//span[@id="MpsShortSku"]/text()')
        loader.add_value('sku', page_sku)

        brand = hxs.select('//span[@class="product-designer"]/text()').extract()
        loader.add_value('brand', brand[0].strip() if brand else '')

        name = ''.join(hxs.select('//h1[@itemprop="name"]/text()').extract()).strip()
        loader.add_value('name', name)

        loader.add_value('url', response.url)

        image_url = hxs.select('//div[@class="img-wrap"]/img/@src').extract()
        if image_url:
            loader.add_value('image_url', image_url[0])
        loader.add_value('category', response.meta.get('category', ''))

        loader.add_value('price', first_price(hxs) or 0)

        out_of_stock = hxs.select('//div[@class="cannotorder"]')
        if out_of_stock:
            loader.add_value('stock', 0)

        base_item = loader.load_item()
        sub_items = hxs.select('//div[@class="lineItem"]')
        if not sub_items:
            if base_item['identifier'] not in self.ids:
                self.ids.append(base_item['identifier'])
                yield base_item
            return

        for sub_item in sub_items:
            item = deepcopy(base_item)
            item['price'] = extract_price(first_price(sub_item, '.') or '0')
            item['name'] = sub_item.select('.//h6/text()').extract()[-1].strip()
            # Look the SKU up inside this line item; querying the whole page
            # here would give every line item the page's first SKU.  Fall
            # back to the page-level SKU when the block has none of its own.
            item['sku'] = short_sku(
                sub_item, './/span[@id="MpsShortSku"]/text()') or page_sku

            line_id = sub_item.select('.//div/input[contains(@id, "prod")]/@value').extract()
            if not line_id:
                continue
            item['identifier'] = item['identifier'] + '-' + line_id[0]
            if item['identifier'] not in self.ids:
                self.ids.append(item['identifier'])
                yield item
Esempio n. 59
0
    def parse_product(self, response):
        """Parse a product page and request its first page of reviews.

        Nothing is yielded when no price text can be found.  Otherwise a
        product is loaded (``sku`` and ``brand`` come from ``response.meta``)
        and passed on in the meta of a request for
        ``self._get_reviews_url(item, 1)``, handled by
        ``self.parse_product_reviews``.

        The shipping cost is taken from the JSON embedded in the
        ``product/data`` script when available, otherwise from the shipping
        message on the page.
        """
        hxs = HtmlXPathSelector(response)

        price = ''.join(
            hxs.select(
                '//div[contains(@class, "js-product-offer-summary")]//div[contains(@class, "price-display")]//text()'
            ).extract())
        if not price:
            price = ''.join(
                response.xpath(
                    '//div[@itemprop="offers"]//div[@itemprop="price"][1]//text()'
                ).extract())
        if not price:
            price = ''.join(
                response.xpath(
                    '//span[contains(@class, "hide-content-m")]/span[@data-tl-id="Price-ProductOffer"]//text()'
                ).extract())
        if not price:
            # Some products are not available online and these have no price
            return

        # The embedded JSON is only needed for the shipping cost, which has
        # a fallback of its own, so a missing or malformed script must not
        # abort the whole callback.
        data = {}
        script = response.xpath(
            '//script/text()[contains(., "product/data")]').extract_first()
        if script:
            match = re.search('product/data",[ \n]*({.+})', script)
            if match:
                try:
                    data = json.loads(match.group(1))
                except ValueError:
                    data = {}

        stock_status = 0 if 'out of stock' in price.lower() else 1

        name_parts = hxs.select(
            '//h1[contains(@itemprop, "name")]//text()').extract()
        product_name = [
            part for part in (text.strip() for text in name_parts) if part
        ]

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', product_name)
        loader.add_value('identifier',
                         re.search(r'/(\d+)$', response.url).group(1))
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('brand', response.meta['brand'])
        categories = hxs.select(
            '//ol[contains(@class, "breadcrumb-list")]//li//a/span/text()'
        ).extract()
        loader.add_value('category', [c.strip() for c in categories])
        loader.add_value('url', response.url)
        loader.add_xpath(
            'image_url',
            '//img[contains(@class, "js-product-primary-image")]/@src')
        try:
            display_price = \
                data['buyingOptions']['shippingPrice']['displayPrice']
        except (KeyError, TypeError):
            # Key absent, or an intermediate node is null / not a mapping.
            loader.add_css('shipping_cost',
                           'h2.js-shipping-primary-msg::text')
        else:
            loader.add_value('shipping_cost', display_price)

        loader.add_value('price', price)
        if not stock_status:
            loader.add_value('stock', 0)
        item = loader.load_item()
        item['metadata'] = {}

        yield Request(self._get_reviews_url(item, 1),
                      meta={
                          'product': item,
                          'page': 1
                      },
                      callback=self.parse_product_reviews)
Esempio n. 60
0
    def parse_product_main(cls, response, self_product_ids,
                           self_matched_identifiers):
        """Parse a product page and yield one product per purchasable option.

        When no price can be extracted and the page is not marked as
        discontinued, the same URL is requested again (at most 20 retries,
        counted in ``response.meta['retry']``) and no product is yielded.

        When a price is found, a base product is loaded and expanded into
        one ``Product`` per option, using the first applicable layout:
        a single id in the quick-buy form, an option drop-down with
        optgroups (plus its enabled root-level options), a flat option
        drop-down, or ``input[name="id"]`` elements.

        Args:
            cls: used for the retry callback and for ``extract_rrp``
                (presumably this is a classmethod; the decorator is not
                visible here -- TODO confirm).
            response: the product page response.
            self_product_ids: mapping of identifier to product name.  The
                first name stored for an identifier is reused for every
                later product with that identifier.
            self_matched_identifiers: collection supporting ``add``; every
                yielded identifier is added to it.
        """
        # log.msg(">>>>>>>>>>>>>>> PARSE PRODUCT >>>")
        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h1[@id="productTitle"]/text()')
        # Price: nested bem-product-price blocks first, then unit-price.
        price = response.xpath(
            u'//div[contains(@class, "bem-product-price")]/div[contains(@class, "bem-product-price")]//text()'
        ).re(r'[\d,.]+')
        if not price:
            price = response.xpath(
                u'//*[contains(@class, "unit-price")]/text()').re(r'[\d,.]+')
        if price:
            price = price[0]
        else:
            price = '0.0'
        price = extract_price(price)
        if not price:
            # No usable price: retry the page unless it is discontinued or
            # the retry budget is exhausted.
            discontinued = bool(
                response.xpath(
                    '//div[contains(@class, "discontinuedProduct")]'))
            retry = int(response.meta.get('retry', 0))
            if (not discontinued) and retry < 20:
                meta = response.meta.copy()
                meta['retry'] = retry + 1
                yield Request(
                    response.url,
                    meta=meta,
                    dont_filter=True,
                    callback=lambda r: cls.parse_product_main(
                        r, self_product_ids, self_matched_identifiers))
        if price:
            product_loader.add_value('price', price)
            product_loader.add_xpath(
                'category', u'//ul[@id="breadcrumbs"]/li[2]/div/a/@title')
            product_loader.add_xpath(
                'image_url', u'concat("http:", //img[@itemprop="image"]/@src)')
            product_loader.add_xpath(
                'brand', u'//span[@itemprop="manufacturer"]/text()')
            # product_loader.add_xpath('shipping_cost', '')
            product = product_loader.load_item()

            metadata = CRCMeta()
            metadata['rrp'] = cls.extract_rrp(response)
            product['metadata'] = metadata

            identifier = response.xpath(
                '//*[@id="quickBuyBox"]/form/input[@name="id"]/@value'
            ).extract()
            if identifier:
                # single option product
                prod = Product(product)
                prod['name'] = prod['name'] + ' ' + response.xpath(
                    u'normalize-space(//*[@id="quickBuyBox"]/form/div[@class="option-text"]/text())'
                ).extract()[0]
                prod['name'] = prod['name'].strip()
                prod['identifier'] = identifier[0]
                prod['sku'] = identifier[0]
                # Reuse the name first recorded for this identifier, or
                # record this one.
                if prod['identifier'] in self_product_ids:
                    prod['name'] = self_product_ids[prod['identifier']]
                else:
                    self_product_ids[prod['identifier']] = prod['name']
                out_of_stock = response.xpath(
                    '//span[contains(@class, "out-of-stock")]')
                if not out_of_stock:
                    out_of_stock = response.xpath(
                        '//div[@id="productAvailabilityMessage" and contains(@class, "out-of-stock")]'
                    )
                if out_of_stock:
                    prod['stock'] = 0

                self_matched_identifiers.add(prod['identifier'])
                yield prod
            else:
                # multiple options product
                option_groups = response.xpath(
                    u'//select[@id="productOptionDropDown2"]/optgroup')
                if option_groups:
                    # Grouped options: the optgroup label becomes part of
                    # the product name.
                    for option_group in option_groups:
                        label = option_group.xpath('@label').extract()[0]
                        options = option_group.xpath(u'option')
                        for option in options:
                            value = option.xpath(u'./@value').extract()[0]
                            if not value:
                                continue
                            prod = Product(product)
                            opt_name = option.xpath(
                                u'normalize-space(./text())').extract()[0]
                            # Cut the "- Out of stock" suffix off the
                            # option text.
                            last_pos = opt_name.find('- Out of stock')
                            if last_pos == -1:
                                last_pos = len(opt_name)
                            prod['name'] = prod[
                                'name'] + ' ' + label + ' ' + opt_name[:
                                                                       last_pos].strip(
                                                                       )
                            prod['name'] = prod['name'].strip()
                            prod['identifier'] = value
                            prod['sku'] = value
                            # An option whose class starts with "out" is
                            # treated as out of stock.
                            stock = option.xpath('./@class').extract()
                            stock = stock[0] if stock else ''
                            if stock.startswith('out'):
                                prod['stock'] = 0
                            if prod['identifier'] in self_product_ids:
                                prod['name'] = self_product_ids[
                                    prod['identifier']]
                            else:
                                self_product_ids[
                                    prod['identifier']] = prod['name']
                            self_matched_identifiers.add(prod['identifier'])
                            yield prod
                    # root options
                    # NOTE(review): `label` below is whatever the last
                    # optgroup iteration above left behind, so root-level
                    # options are named with that group's label -- confirm
                    # whether this is intended.
                    options = response.xpath(
                        u'//select[@id="productOptionDropDown2"]/option[not(@disabled)]'
                    )
                    for option in options:
                        value = option.xpath(u'./@value').extract()[0]
                        if not value:
                            continue
                        prod = Product(product)
                        opt_name = option.xpath(
                            u'normalize-space(./text())').extract()[0]
                        last_pos = opt_name.find('- Out of stock')
                        if last_pos == -1:
                            last_pos = len(opt_name)
                        prod['name'] = prod[
                            'name'] + ' ' + label + ' ' + opt_name[:
                                                                   last_pos].strip(
                                                                   )
                        prod['name'] = prod['name'].strip()
                        prod['identifier'] = value
                        prod['sku'] = value
                        stock = option.xpath('./@class').extract()
                        stock = stock[0] if stock else ''
                        if stock.startswith('out'):
                            prod['stock'] = 0
                        if prod['identifier'] in self_product_ids:
                            prod['name'] = self_product_ids[prod['identifier']]
                        else:
                            self_product_ids[prod['identifier']] = prod['name']
                        self_matched_identifiers.add(prod['identifier'])
                        yield prod
                else:
                    # Flat drop-down without optgroups.
                    options = response.xpath(
                        u'//select[@id="productOptionDropDown2"]//option')
                    if options:
                        for option in options:
                            value = option.xpath(u'./@value').extract()[0]
                            if not value:
                                continue
                            prod = Product(product)
                            opt_name = option.xpath(
                                u'normalize-space(./text())').extract()[0]
                            last_pos = opt_name.find('- Out of stock')
                            if last_pos == -1:
                                last_pos = len(opt_name)
                            prod['name'] = prod[
                                'name'] + ' ' + opt_name[:last_pos].strip()
                            prod['name'] = prod['name'].strip()
                            prod['identifier'] = value
                            prod['sku'] = value
                            stock = option.xpath('./@class').extract()
                            stock = stock[0] if stock else ''
                            if stock.startswith('out'):
                                prod['stock'] = 0
                            if prod['identifier'] in self_product_ids:
                                prod['name'] = self_product_ids[
                                    prod['identifier']]
                            else:
                                self_product_ids[
                                    prod['identifier']] = prod['name']
                            self_matched_identifiers.add(prod['identifier'])
                            yield prod
                    else:
                        # No drop-down at all: options are id inputs whose
                        # element id is the identifier and whose
                        # data-colour / data-size attributes extend the name.
                        options = response.xpath('//input[@name="id"]')
                        for option in options:
                            value = option.xpath(u'./@id').extract()
                            if not value:
                                continue
                            prod = Product(product)
                            prod['name'] = prod['name'] + ' ' + ' '.join(
                                option.xpath(
                                    u'./@data-colour|./@data-size').extract())
                            prod['name'] = prod['name'].strip()
                            prod['identifier'] = value[0].strip()
                            prod['sku'] = value[0].strip()
                            # In stock only when the class contains
                            # "in-stock".
                            stock = 'in-stock' in option.xpath(
                                '@class').extract()[0]
                            if not stock:
                                prod['stock'] = 0
                            if prod['identifier'] in self_product_ids:
                                prod['name'] = self_product_ids[
                                    prod['identifier']]
                            else:
                                self_product_ids[
                                    prod['identifier']] = prod['name']
                            self_matched_identifiers.add(prod['identifier'])
                            yield prod