Esempio n. 1
0
 def parse_products(self, response):
     """Yield one product item per listing inside the ``area-2`` element.

     Grid cells (``div`` with class ``grid-25``) are parsed when present;
     otherwise table rows (``tr`` with class ``prd first``) are parsed.
     A listing with no price text is skipped, so one incomplete entry
     neither aborts the page nor picks up the previous entry's price.
     """
     hxs = HtmlXPathSelector(response)
     products = hxs.select('//*[@id="area-2"]//div[@class="grid-25"]')
     if products:
         for product in products:
             price = product.select(
                 'div/div/p[@class="prd-amount"]/strong/text()').extract()
             if not price:
                 continue
             loader = ProductLoader(item=Product(), selector=product)
             loader.add_xpath('url', 'div/h3/a/@href')
             # Use the abbr title when the link has one, else the link text.
             if product.select('div/h3/a/abbr/@title'):
                 loader.add_xpath('name', 'div/h3/a/abbr/@title')
             else:
                 loader.add_xpath('name', 'div/h3/a/text()')
             loader.add_value('price', self._encode_price(price[0]))
             yield loader.load_item()
     else:
         products = hxs.select('//*[@id="area-2"]//tr[@class="prd first"]')
         for product in products:
             # The price is looked up directly in the cell first, then
             # inside a nested div.
             price = (product.select('td/p/strong/text()').extract()
                      or product.select('td/div/p/strong/text()').extract())
             if not price:
                 continue
             loader = ProductLoader(item=Product(), selector=product)
             loader.add_xpath('url', 'td/h3/a/@href')
             loader.add_xpath('name', 'td/h3/a/text()')
             loader.add_value('price', self._encode_price(price[0]))
             yield loader.load_item()
Esempio n. 2
0
    def parse_page(self, response):
        """Follow navigation and next-page links, and yield listed products.

        Every link in the ``nav`` list and the pager's ``next`` link (taken
        from its ``data-query`` attribute) is requested with this same
        callback. Each ``li`` under the ``products`` container becomes one
        product item.
        """
        base_url = get_base_url(response)
        base_url_func = functools.partial(urljoin_rfc, base_url)
        hxs = HtmlXPathSelector(response)

        for url in hxs.select("//ul[@id='nav']//a/@href").extract():
            yield Request(urljoin_rfc(base_url, url), callback=self.parse_page)

        # next page
        next_query = hxs.select(
            "//div[@class='pagerLine']//a[@class='next']/@data-query").extract()
        if next_query:
            yield Request(urljoin_rfc(base_url, next_query[0]),
                          callback=self.parse_page)

        # products
        for z in hxs.select("//div[@class='products']//li"):
            loader = ProductLoader(selector=z, item=Product())
            # identifier and sku both come from the articleNumber query
            # parameter of the product URL.
            loader.add_xpath('identifier', "@data-product-url", first,
                             re=r"articleNumber=(\d+)")
            loader.add_xpath('sku', "@data-product-url", first,
                             re=r"articleNumber=(\d+)")
            loader.add_xpath('url', "@data-product-url", first, base_url_func)
            # Brand text and product title are both added to 'name'.
            loader.add_xpath('name', ".//div[@class='detailsInnerWrap']/span[@class='brand']/text()")
            loader.add_xpath('name', ".//div[@class='detailsInnerWrap']/a[starts-with(@class, 'name')]/text()")
            # Prefer the text inside ins, then any text of the price
            # paragraph.
            # NOTE(review): the last fallback looks redundant, since text
            # inside del is already matched by the //text() query before it.
            price = z.select(".//p[@class='price']/ins//text()") \
                    or z.select(".//p[@class='price']//text()") \
                    or z.select(".//p[@class='price']/del//text()")

            # Decimal comma -> dot, and drop non-breaking spaces.
            price = ''.join(price.extract()).replace(',', '.').replace(u'\xa0', '')
            loader.add_value('price', price)

            yield loader.load_item()
Esempio n. 3
0
    def parse_product(self, response):
        """Yield the page's product when its code matches the searched SKU.

        The name is taken from the ``productname`` font element; failing
        that, from the ``.htm`` file name in the URL with the searched SKU
        appended in parentheses. Text following "PURCHASE OPTIONS: " is
        appended to the name when a name was found. The item is yielded only
        if the page's ``product_code`` (hyphens removed) starts with
        ``response.meta['sku']``.
        """
        search_sku = response.meta['sku']
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)

        name_xpaths = (u'//font[contains(@class,"productname")]/big/text()',
                       u'//font[contains(@class,"productname")]/text()')
        main_name = None
        for name_xpath in name_xpaths:
            found = hxs.select(name_xpath).extract()
            if found:
                main_name = found[0].strip()
                break

        if not main_name:
            # Fall back to the page's file name from the URL.
            match = re.search(r'.*/(.*)\.htm', response.url)
            if match:
                main_name = match.group(1) + u' (%s)' % search_sku
            else:
                main_name = None

        options = hxs.select(u'//td//text()').re(u'PURCHASE OPTIONS: (.*)')
        # Only extend an existing name: main_name is None when neither the
        # page nor the URL produced one, and None cannot be concatenated.
        if options and main_name:
            main_name += u' %s' % options[0].strip()

        loader.add_value('name', main_name)
        loader.add_xpath('price', u'//td//font[contains(@class,"pricecolor") and not(ancestor::table[contains(@id,"related")])]/text()')
        loader.add_value('sku', search_sku)

        sku = hxs.select(u'//span[@class="product_code"]/text()').extract()
        if sku and sku[0].replace('-', '').startswith(search_sku):
            yield loader.load_item()
Esempio n. 4
0
    def parse_products(self, hxs, response):
        """Yield products from a table whose columns are found by header text.

        The positions of the "Model", "Description" and "Price" header cells
        are computed first. Every row with an ``orderinfo`` cell at the model
        position (other than the header itself) is then read. Rows with no
        price, or with a missing or blank name, are skipped.
        """
        # Parenthesised so the line is valid both as a Python 2 print
        # statement and as a Python 3 function call.
        print(response.encoding)

        def header_position(title):
            # Number of cells preceding the header cell, plus one.
            return hxs.select('count(//td[starts-with(@class, "orderinfo")'
                              ' and text()="%s"]/preceding-sibling::*) + 1'
                              % title).extract()

        model_pos = header_position('Model')
        description_pos = header_position('Description')
        price_pos = header_position('Price')
        if not (model_pos and description_pos and price_pos):
            return

        # Keep only the text before the '.' of each extracted number.
        model_pos = model_pos[0].split('.')[0]
        description_pos = description_pos[0].split('.')[0]
        price_pos = price_pos[0].split('.')[0]

        cell = './/td[starts-with(@class, "orderinfo") and position()=%s]'
        products = hxs.select('//td[starts-with(@class, "orderinfo") and position()=%s'
                              ' and not(text()="Model")]/..' % model_pos)
        for product in products:
            loader = ProductLoader(selector=product, item=Product())
            # Default to the page URL; use the model cell's link if it has one.
            url = response.url
            model_url = product.select((cell + '//a/@href') % model_pos).extract()
            if model_url:
                url = urljoin_rfc(get_base_url(response), model_url[0])

            loader.add_value('url', url)
            loader.add_xpath('name', (cell + '/text()') % description_pos)
            loader.add_xpath('price', (cell + '//text()') % price_pos)

            price = loader.get_output_value('price')
            name = loader.get_output_value('name')
            # name may be None when the description cell has no text, so it
            # is checked before .strip() is called on it.
            if not price or not name or not name.strip():
                continue

            yield loader.load_item()
Esempio n. 5
0
    def parse(self, response):
        """Yield the products on the page, then requests for subcategories.

        Each ``prelement`` div becomes one product. Entries lacking a price
        text or a link are skipped so that a single incomplete entry cannot
        abort the callback before the subcategory requests are produced.
        The menu depth to read is taken from ``response.meta['level']``
        (default 1) and passed on, incremented, to the next requests.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        for item in hxs.select(u'//div[@class="prelement"]'):
            price = item.select(u'.//p[@class="prpri"]/text()').extract()
            url = item.select(u'.//a/@href').extract()
            if not price or not url:
                continue

            product_loader = ProductLoader(item=Product(), selector=item)
            product_loader.add_xpath('name', u'.//a/text()')

            # lstrip() takes a set of characters, not a prefix: it drops the
            # leading run of characters found in 'Pris: DKK ' and stops at
            # the first other character (for example a digit).
            price = price[0].strip().lstrip('Pris: DKK ')
            # Remove '.' and turn ',' into '.' (presumably thousands and
            # decimal separators).
            price = price.replace('.', '').replace(',', '.')
            product_loader.add_value('price', price)

            product_loader.add_value('url', urljoin_rfc(base_url, url[0]))

            yield product_loader.load_item()

        level = response.meta.get('level', 1)
        # One 'ul/li' step per menu level below the parent of pMenul0.
        sub_url = u'//ul[@id="pMenul0"]/../' + u'/'.join([u'ul/li'] * level) + '/a/@href'
        for subcategory in hxs.select(sub_url).extract():
            yield Request(urljoin_rfc(base_url, subcategory),
                          meta={'level': level+1})
Esempio n. 6
0
    def parse(self, response):
        """Yield the products on the page, then requests for subcategories.

        Each ``Description_ProductList`` cell becomes one product; its price
        is read from a ``Price_Productlist`` span under the cell's
        grandparent. Entries lacking a price text or a link are skipped so
        that one incomplete entry cannot abort the callback before the
        subcategory requests are produced. The menu depth to read comes from
        ``response.meta['level']`` (default 1).
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        for item in hxs.select(u'//td[@class="Description_ProductList"]'):
            price = item.select(u'../..//span[@class="Price_Productlist"]/text()').extract()
            url = item.select(u'.//a/@href').extract()
            if not price or not url:
                continue

            product_loader = ProductLoader(item=Product(), selector=item)
            product_loader.add_xpath('name', u'.//a/@title')

            # rstrip() takes a set of characters, not a suffix: it drops the
            # trailing run of ' ', 'D' and 'K' characters.
            price = price[0].strip().rstrip(' DKK')
            price = price.replace('.', '').replace(',', '.')
            # Placeholder text shown instead of an amount (Danish, roughly
            # "call for price"); recorded as a zero price.
            if price == u'Ring for pris!':
                price = 0
            product_loader.add_value('price', price)

            product_loader.add_value('url', urljoin_rfc(base_url, url[0]))

            yield product_loader.load_item()

        level = response.meta.get('level', 1)
        # One 'table/tr/td' step per menu level below the menu table's parent.
        sub_url = u'//table[@id="ProductMenu_Table"]/../' + u'/'.join([u'table/tr/td'] * level) + '/a/@href'
        for subcategory in hxs.select(sub_url).extract():
            yield Request(urljoin_rfc(base_url, subcategory),
                          meta={'level': level+1})
Esempio n. 7
0
 def parse_page(self, response):
     """Yield the products listed on the page, then follow-up requests.

     When pagination links exist, the last one is requested if
     ``self._is_next`` accepts it. When there are none, every link in the
     ``refineCat`` sidebar list is requested instead.
     """
     site_root = 'http://www.dolphinmusic.co.uk/'
     selector = HtmlXPathSelector(response)

     for entry in selector.select('//div[@class="item"]'):
         item_loader = ProductLoader(selector=entry, item=Product())
         item_loader.add_xpath('name', 'h2/a/text()')
         href = entry.select('h2/a/@href').extract()[0]
         item_loader.add_value(
             'url', urljoin_rfc(site_root, href, response.encoding))
         item_loader.add_xpath(
             'price', 'div[@class="pricing"]/p[@class="price"]/text()')
         yield item_loader.load_item()

     pager_links = selector.select(
         '//*[@id="categoryMain"]/div[@class="pagination"]/ul/li/a/@href').extract()
     if pager_links:
         # Only the last pagination link is a candidate for "next".
         last_link = pager_links[-1]
         if self._is_next(last_link):
             yield Request(urljoin_rfc(site_root, last_link, response.encoding),
                           callback=self.parse_page)
     else:
         category_links = selector.select(
             '//*[@id="sidebar"]/ul[@id="refineCat"]/li/a/@href').extract()
         for href in category_links:
             yield Request(urljoin_rfc(site_root, href, response.encoding),
                           callback=self.parse_page)
Esempio n. 8
0
    def parse_product(self, response):
        """Yield the product on this page and requests for its other options.

        Non-HTML responses and pages without a product heading produce
        nothing. When the page has an option drop-down and was not itself
        reached through an option request (no ``requested`` key in
        ``response.meta``), the form named ``aspNetForm`` is re-submitted
        once per option value. The selected option's label, when there is
        one, is appended to the product name. An item is yielded only if a
        price was found.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        name = hxs.select(u'//div[@class="datac2"]//h1[@class="mpv_desc"]/text()').extract()
        if not name:
            return
        name = name[0].strip()

        multiple_options = hxs.select(u'//select[@class="mpv_itemalst"]//option')
        if multiple_options and u'requested' not in response.meta:
            for option in multiple_options:
                value = option.select(u'./@value').extract()
                if not value:
                    # An option without a value attribute cannot be posted.
                    continue
                formdata = {u'ctl00$MainContent$ItemAList': value[0],
                            u'__EVENTTARGET': u'ctl00$MainContent$ItemAList',
                            u'__EVENTARGUMENT': u''}
                # The 'requested' flag stops the option pages from fanning
                # out again.
                yield FormRequest.from_response(response, formname=u'aspNetForm',
                                                formdata=formdata,
                                                meta={u'requested': True},
                                                dont_click=True,
                                                callback=self.parse_product)

        if multiple_options:
            selected = multiple_options.select(u'../option[@selected]/text()').extract()
            if selected:
                name += u' %s' % selected[0].strip()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('name', name)
        # Offer price first; fall back to the regular price span.
        loader.add_xpath('price', u'//div[@class="datac2"]//span[@class="offerprc"]/text()')
        if not loader.get_output_value('price'):
            loader.add_xpath('price', u'//span[@class="mpv_prc"]/text()')
        if loader.get_output_value('price'):
            yield loader.load_item()
Esempio n. 9
0
    def parse_pagination(self, response):
        """Yield the listed products, then requests for the other result pages.

        Page links are taken from the first ``listPaging`` div: every link
        that is neither the selected page nor a "Next 10"/"Previous 10"
        link is requested first, followed by the "Next 10" link if present.
        """
        site_root = 'http://www.dv247.com/'
        selector = HtmlXPathSelector(response)

        for entry in selector.select('//div[@class="listItem clearfix"]'):
            item_loader = ProductLoader(selector=entry, item=Product())
            # The name is the concatenation of all link text in the entry.
            item_loader.add_value(
                'name', ''.join(entry.select('.//a//text()').extract()))
            href = entry.select('.//a/@href')[0].extract()
            item_loader.add_value('url', urljoin_rfc(site_root, href))
            item_loader.add_xpath('price', './/li[@class="price"]/text()')
            yield item_loader.load_item()

        # next page
        paging_blocks = selector.select('//div[@class="listPaging"]')
        if paging_blocks:
            pager = paging_blocks[0]
            next_ten = pager.select('.//a[text()="Next 10"]/@href').extract()
            page_links = pager.select('.//a[not(@class="selectedpage") and not(text()="Next 10") and not(text()="Previous 10")]/@href').extract()

            for link in page_links:
                yield Request(urljoin_rfc(site_root, link),
                              callback=self.parse_pagination)

            if next_ten:
                yield Request(urljoin_rfc(site_root, next_ten[0]),
                              callback=self.parse_pagination)
Esempio n. 10
0
    def parse_products(self, response):
        """Request the next results page, then yield the products listed here.

        The next-page URL is built by ``self._get_products_url`` from the
        ``right-arrow`` link. Each ``details`` div under ``center-main``
        becomes one product; "0.0" is used as the price when the price span
        is absent.
        """
        selector = HtmlXPathSelector(response)

        arrow = selector.select('//div[@id="center-main"]//a[@class="right-arrow"]/@href')
        if arrow:
            next_url = self._get_products_url(response, arrow[0].extract())
            yield Request(next_url, callback=self.parse_products)

        for details in selector.select('//div[@id="center-main"]//div[@class="details"]'):
            item_loader = ProductLoader(selector=details, item=Product())
            item_loader.add_xpath("name", "a/text()")
            item_loader.add_xpath("sku", 'div[@class="sku"]/span/text()')

            # The descendant search (.//) is used because the price row is
            # not always a direct child of the details div.
            amounts = details.select('.//div[@class="price-row"]/span[@class="price-value"]/span/text()')
            item_loader.add_value(
                "price", amounts[0].extract() if amounts else "0.0")

            href = details.select("a/@href")[0].extract()
            item_loader.add_value(
                "url", urljoin_rfc(get_base_url(response), href))

            yield item_loader.load_item()
Esempio n. 11
0
    def parse(self, response):
        """Yield the products on the page, then requests for subcategories.

        Pages whose URL is in ``self.junk_urls`` are ignored. Each
        ``item_wrapper`` div becomes one product. Entries lacking a price
        text or a name link are skipped so that one incomplete entry cannot
        abort the callback before the subcategory requests are produced.
        The menu depth to read comes from ``response.meta['level']``
        (default 1).
        """
        if response.url in self.junk_urls:
            return

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        for item in hxs.select(u'//div[@class="item_wrapper"]'):
            # Only the last text node of the price div is used.
            price = item.select(u'.//div[@class="price"]/text()[last()]').extract()
            url = item.select(u'.//div[@class="name"]/a/@href').extract()
            if not price or not url:
                continue

            product_loader = ProductLoader(item=Product(), selector=item)
            product_loader.add_xpath('name', u'.//div[@class="name"]/a/text()')

            # lstrip() takes a set of characters, not a prefix: it drops the
            # leading run of 'K', 'r', '.' and ' ' characters.
            price = price[0].strip().lstrip('Kr. ')
            price = price.replace('.', '').replace(',', '.')
            product_loader.add_value('price', price)

            product_loader.add_value('url', urljoin_rfc(base_url, url[0]))

            yield product_loader.load_item()

        level = response.meta.get('level', 1)
        # One 'ul/li' step per menu level below the shopnav div.
        sub_url = u'//div[@id="shopnav"]/' + u'/'.join([u'ul/li'] * level) + '/a/@href'
        for subcategory in hxs.select(sub_url).extract():
            yield Request(urljoin_rfc(base_url, subcategory),
                          meta={'level': level+1})
Esempio n. 12
0
 def parse_products(self, response):
     """Yield the products in ``products-list``, then request the next page.

     Three price locations are tried in turn. For the last one the single
     match is used, or the second match when there are several. Entries
     for which none of the locations matches are skipped.
     """
     hxs = HtmlXPathSelector(response)
     shop = 'div[@class="product-shop left"]/div/div/'

     for product in hxs.select('//*[@id="products-list"]/li'):
         price = None
         nodes = (product.select(shop + 'p/span/span/text()')
                  or product.select(shop + 'span/text()'))
         if nodes:
             price = nodes[0]
         else:
             nodes = product.select(shop + 'p/span/text()')
             if len(nodes) == 1:
                 price = nodes[0]
             elif nodes:
                 price = nodes[1]
         if price is None:
             # No price anywhere: skip instead of raising IndexError, which
             # would also lose the remaining products and the next page.
             continue

         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('name', 'div[@class="product-details left"]/h2/a/text()')
         # NOTE(review): the selector node itself (not its extracted text)
         # is handed to the loader; confirm the price processor expects that.
         loader.add_value('price', price)
         loader.add_xpath('url', 'div[@class="product-details left"]/h2/a/@href')
         yield loader.load_item()

     next_page = hxs.select('//div[@class="right-nav right"]/a/@href').extract()
     if next_page:
         # The href is requested as-is (presumably already absolute).
         yield Request(next_page[0], callback=self.parse_products)
Esempio n. 13
0
    def parse(self, response):
        """Collect priced search results and request the first one kept.

        A result is kept when it has a price and either nothing was kept
        before or its price is lower than that of the last kept result.
        The URL of the first kept result is then requested with
        ``self.parse_mfrgids`` as callback; the kept loaders travel in the
        request meta as ``cur_prod`` and ``next_prods``.
        """
        selector = HtmlXPathSelector(response)

        last_kept = None
        kept = []
        for result in selector.select('//div[@id="atfResults"]//div[starts-with(@id, "result_")]'):
            entry = ProductLoader(selector=result, item=Product())

            # Name: text of the span inside the title link, else the link text.
            entry.add_xpath('name', './/h3/a/span/text()')
            if not entry.get_output_value('name'):
                entry.add_xpath('name', './/h3/a/text()')

            entry.add_xpath('url', './/h3/a/@href')

            # Price: text after '$' in the list link, else the newPrice block.
            entry.add_xpath('price', './/ul/li/a/span/text()', re='\$(.*)')
            if not entry.get_output_value('price'):
                entry.add_xpath('price', './/div[@class="newPrice"]//span[contains(@class,"price")]/text()')

            entry.add_value('sku', response.meta['sku'])
            entry.add_value('identifier', response.meta['sku'].lower())

            if not entry.get_output_value('price'):
                continue
            if last_kept is not None and not (last_kept.get_output_value('price') >
                                              entry.get_output_value('price')):
                continue
            last_kept = entry
            kept.append(entry)

        if not kept:
            return

        cur_prod, next_prods = kept[0], kept[1:]
        request_meta = {'mfrgid': response.meta['mfrgid'],
                        'name': response.meta['name'],
                        'cur_prod': cur_prod,
                        'next_prods': next_prods}
        yield Request(cur_prod.get_output_value('url'),
                      callback=self.parse_mfrgids,
                      meta=request_meta, dont_filter=True)
Esempio n. 14
0
    def parse(self, response):
        """Yield products or sub-product requests, then follow-up requests.

        Each ``product-item`` row with a quantity input is yielded as a
        product; a row without one is followed to ``self.parse_sub``. Rows
        lacking a link, and product rows lacking a price text, are skipped
        so that one incomplete row cannot abort the callback before the
        subcategory and next-page requests are produced. The menu depth to
        read comes from ``response.meta['level']`` (default 1).
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        for item in hxs.select(u'//tr[contains(@class,"product-item")]'):
            url = item.select(u'.//td[@class="productListingNewName"]/b/a/@href').extract()
            if not url:
                continue
            url = urljoin_rfc(base_url, url[0])

            # If quantity field is not present on page, there are subproducts
            if not item.select(u'.//input[@name="products_qty"]').extract():
                yield Request(url, callback=self.parse_sub)
                continue

            price = item.select(u'.//span[@class="js_price_tax"]/text()').extract()
            if not price:
                continue

            product_loader = ProductLoader(item=Product(), selector=item)
            product_loader.add_xpath('name', u'.//td[@class="productListingNewName"]/b/a/text()')
            # Remove '.' and turn ',' into '.' (presumably thousands and
            # decimal separators).
            price = price[0].strip().replace('.', '').replace(',', '.')
            product_loader.add_value('price', price)
            product_loader.add_value('url', url)
            yield product_loader.load_item()

        level = response.meta.get('level', 1)
        # One 'ul/li' step per menu level below the box-content div.
        sub_url = u'//div[@class="box-content"]/' + u'/'.join([u'ul/li'] * level) + '/a/@href'
        for subcategory in hxs.select(sub_url).extract():
            yield Request(urljoin_rfc(base_url, subcategory),
                          meta={'level': level+1})

        next_url = hxs.select(u'//li[@class="page-next"]/a/@href').extract()
        if next_url:
            # Pagination stays on the same menu level.
            yield Request(urljoin_rfc(base_url, next_url[0]),
                          meta={'level': level})
Esempio n. 15
0
    def parse_product(self, response):
        """Yield one product per ``Item`` div on a product details page.

        Nothing is produced for non-HTML responses or when the
        ``ProductDetails`` heading is missing. Each item's name is the page
        heading followed by the item's own direct text, when it has any.
        """
        if not isinstance(response, HtmlResponse):
            return
        selector = HtmlXPathSelector(response)

        headings = selector.select('//td[@class="ProductDetails"]/h1/text()').extract()
        if not headings:
            return

        title = headings[0].strip()
        page_url = urljoin_rfc(get_base_url(response), response.url)

        for variant in selector.select('//div[@class="Item"]'):
            item_loader = ProductLoader(selector=variant, item=Product())
            item_loader.add_value('url', page_url)

            own_text = ''.join(variant.select('./text()').extract())
            full_name = title + ' ' + own_text.strip() if own_text else title
            item_loader.add_value('name', full_name)

            # Both possible price locations are fed to the loader.
            for price_xpath in ('.//span[@class="price"]/text()',
                                './div[@class="price"]/span/text()'):
                item_loader.add_xpath('price', price_xpath)

            yield item_loader.load_item()
Esempio n. 16
0
    def parse_product(self, response):
        """Yield the product described by the page's ``itemprop`` markup."""
        page = HtmlXPathSelector(response)
        item_loader = ProductLoader(selector=page, item=Product())
        item_loader.add_value('url', response.url)

        field_xpaths = (
            ('name', '//h1[@itemprop="name"]/text()'),
            ('price', '//span[@itemprop="price"]/text()'),
        )
        for field, xpath in field_xpaths:
            item_loader.add_xpath(field, xpath)

        yield item_loader.load_item()
Esempio n. 17
0
 def parse_product(self, response):
     """Yield the product, identified and named by its part number.

     The part number is read from the "Part number" row of the
     ``product-information`` table and appended to the name passed in
     ``response.meta['name']``.
     """
     page = HtmlXPathSelector(response)
     item_loader = ProductLoader(response=response, item=Product())

     part_number_bits = page.select(
         '//*[@id="product-information"]/table/tr[th/text()="Part number"]/td/span/text()').extract()
     part_number = ''.join(part_number_bits).strip()
     item_loader.add_value('identifier', part_number)

     base_name = response.meta['name'].strip()
     item_loader.add_value('name', ' '.join((base_name, part_number)))

     item_loader.add_value('url', response.url)
     item_loader.add_xpath('price', '//*[@id="product-price"]/p[@class="no-vat"]/text()')
     yield item_loader.load_item()
Esempio n. 18
0
 def parse_product(self, response):
     """Yield the product on the page, tagged with ``response.meta['sku']``.

     The price is the first decimal number (digits, a dot, digits) found
     in the ``product_price`` span text.
     """
     page = HtmlXPathSelector(response)

     item_loader = ProductLoader(selector=page, item=Product())
     item_loader.add_xpath('name', '//h1[@id="product_description"]/text()')

     # NOTE(review): the pattern requires a fractional part, so a price
     # without decimals yields no match and the [0] lookup raises.
     amounts = page.select('//p[@id="product_price"]/span/text()').re('(\d+(?:\.\d+))')
     item_loader.add_value('price', amounts[0])

     item_loader.add_value('sku', response.meta['sku'])
     item_loader.add_value('url', response.url)
     yield item_loader.load_item()
Esempio n. 19
0
 def parse_products(self, hxs, response):
     """Yield one product per element two levels above a ``product_name`` heading."""
     name_link = './/h3[@class="product_name"]/a'
     for container in hxs.select('//h3[@class="product_name"]/../..'):
         item_loader = ProductLoader(selector=container, item=Product())
         item_loader.add_xpath('name', name_link + '/text()')
         href = container.select(name_link + '/@href').extract()[0]
         item_loader.add_value(
             'url', urljoin_rfc(get_base_url(response), href))
         item_loader.add_xpath('price', './/p[@class="price"]/text()')
         yield item_loader.load_item()
Esempio n. 20
0
 def parse_product(self, response):
     """Yield the product on the page.

     The price text under ``productdesc`` has its '.' removed and ','
     turned into '.'; only the part after the last ':' is kept.
     """
     page = HtmlXPathSelector(response)
     item_loader = ProductLoader(response=response, item=Product())
     item_loader.add_xpath('name', '//*[@id="header"]/text()')
     item_loader.add_value('url', response.url)

     raw_price = ''.join(
         page.select('//*[@id="productdesc"]/font/font/text()').extract())
     amount = raw_price.replace('.', '').replace(',', '.')
     if amount:
         # Keep only what follows the last colon.
         amount = amount.split(':')[-1]
     item_loader.add_value('price', amount)

     yield item_loader.load_item()
Esempio n. 21
0
    def parse_product(self, response):
        """Yield the product found in the ``productDetail`` block.

        The SKU is the list of digit groups following "Ref. Code: " in the
        first node matched by ``//p[1]`` inside that block.
        """
        page = HtmlXPathSelector(response)
        detail = '//div[@id="productDetail"]'

        item_loader = ProductLoader(item=Product(), response=response)
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('name', detail + '//h1[@class="productDetailTitle"]/text()')
        item_loader.add_xpath('price', detail + '//span[contains(@class,"price")]/text()')

        first_paragraph = page.select(detail + '//p[1]')[0]
        item_loader.add_value('sku', first_paragraph.re('Ref\. Code: (\d+)'))

        yield item_loader.load_item()
Esempio n. 22
0
 def parse_product(self, response):
     """Yield the product on the page, tagged with ``response.meta['sku']``.

     Name and price are read from the ``prodTITLE`` and ``prodDETAILS``
     blocks by the loader, which is built directly from the response.
     """
     product_loader = ProductLoader(item=Product(), response=response)
     product_loader.add_xpath('name', '//div[@id="prodTITLE"]//h1/text()')
     product_loader.add_xpath('price', '//div[@id="prodDETAILS"]//span[@class="price"]/text()')
     product_loader.add_value('sku', response.meta['sku'])
     product_loader.add_value('url', response.url)
     yield product_loader.load_item()
Esempio n. 23
0
 def parse(self, response):
     """Return the product for this page if its URL is in ``self.products``.

     ``self.products`` is indexed by URL and supplies the SKU. ``None`` is
     returned for any URL it does not contain.
     """
     # The 'in' operator replaces dict.has_key(), which Python 3 removed.
     if response.url not in self.products:
         return None

     loader = ProductLoader(item=Product(), response=response)
     loader.add_value('sku', self.products[response.url])
     loader.add_value('url', response.url)
     loader.add_xpath('name', '//*[@id="feature_content_info"]/h1/text()')
     loader.add_xpath('price', '//*[@id="productBuy"]/p/span/text()')
     return loader.load_item()
Esempio n. 24
0
    def parse(self, response):
        """Yield the product described by the page's ``itemprop`` markup.

        The loader is built directly from the response; name and price are
        read from the elements carrying ``itemprop="name"`` and
        ``itemprop="price"``.
        """
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_xpath('price', '//*[@itemprop="price"]/text()')
        loader.add_value('url', response.url)

        yield loader.load_item()
Esempio n. 25
0
 def parse_products(self, hxs, response):
     """Yield one product per ``SearchGrid`` row containing a product link.

     The name is the link text in the second cell and the price is the
     text of the third cell.
     """
     detail_link = './/a[contains(@href, "productdetail.aspx")]'
     rows = hxs.select('//table[@class="SearchGrid"]//td/a[contains(@href, "productdetail.aspx")]/../..')
     for row in rows:
         item_loader = ProductLoader(selector=row, item=Product())
         href = row.select(detail_link + '/@href').extract()[0]
         item_loader.add_value(
             'url', urljoin_rfc(get_base_url(response), href))
         item_loader.add_xpath('name', './/td[position() = 2]//a[contains(@href, "productdetail.aspx")]/text()')
         item_loader.add_xpath('price', './/td[position() = 3]//text()')
         yield item_loader.load_item()
Esempio n. 26
0
    def parse_product(self, response):
        """Yield the product on the page.

        The SKU is the text that sits beside the bold "Model #:" label,
        that is, the direct text of the label's parent element.
        """
        page = HtmlXPathSelector(response)
        item_loader = ProductLoader(item=Product(), response=response)
        item_loader.add_xpath('name', '//h1[@id="partNameId"]/text()')
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('price', '//font[@class="txt-purchaseprice20blue"]/text()')

        model_bits = page.select('//b[contains(text(), "Model #:")]/../text()').extract()
        item_loader.add_value('sku', ''.join(model_bits).strip())

        yield item_loader.load_item()
Esempio n. 27
0
    def parse(self, response):
        """Yield the product on the page when a description is present.

        All text nodes of the ``ProductDetail1_lblDescription`` span are
        passed to the loader as the name. Pages without any produce
        nothing.
        """
        page = HtmlXPathSelector(response)

        name_parts = page.select('//span[@id="ProductDetail1_lblDescription"]//text()').extract()
        if not name_parts:
            return

        item_loader = ProductLoader(response=response, item=Product())
        item_loader.add_value('name', name_parts)
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('price', '//*[@class="yourPriceText"]//text()')
        item_loader.add_value('sku', response.meta['sku'])
        yield item_loader.load_item()
Esempio n. 28
0
File: svh24.py Progetto: 0--key/lib
    def parse_product(self, response):
        """Yield the product on the page, tagged with ``response.meta['sku']``.

        The first ``itemprop="price"`` text is used as the price, with ','
        replaced by '.'.
        """
        page = HtmlXPathSelector(response)
        item_loader = ProductLoader(selector=page, item=Product())

        item_loader.add_value('url', response.url)
        item_loader.add_xpath('name', u'//h1[@itemprop="name"]/text()')

        price_texts = page.select(u'//span[@itemprop="price"]/text()').extract()
        item_loader.add_value('price', price_texts[0].replace(',', '.'))

        item_loader.add_value('sku', response.meta['sku'])
        yield item_loader.load_item()
Esempio n. 29
0
File: amazon.py Progetto: 0--key/lib
    def parse_product(self, response):
        """Yield the product on the page, tagged with ``response.meta['sku']``.

        The first ``priceLarge`` text in the ``buying`` block is used as
        the price, with ',' replaced by '.'.
        """
        page = HtmlXPathSelector(response)
        item_loader = ProductLoader(selector=page, item=Product())

        item_loader.add_value('url', response.url)
        item_loader.add_xpath('name', u'//div[@class="buying"]/h1[@class="parseasinTitle"]/span[@id="btAsinTitle"]/text()')

        price_texts = page.select(u'//div[@class="buying"]/table[@class="product"]//b[@class="priceLarge"]/text()').extract()
        large_price = price_texts[0]
        item_loader.add_value('price', large_price.replace(',', '.'))

        item_loader.add_value('sku', response.meta['sku'])
        yield item_loader.load_item()
Esempio n. 30
0
 def parse_products(self, hxs, response):
     """Yield one product per parent element of a ``titre_pdt`` div.

     The URL comes from the link in the ``img_pdt`` div and the price is
     the concatenated text of the ``prix_normal`` link.
     """
     root = get_base_url(response)
     containers = hxs.select('//div[@id="titre_pdt"]/..')
     for container in containers:
         item_loader = ProductLoader(item=Product(), selector=container)
         item_loader.add_xpath('name', './/h2/text()')

         href = container.select('.//div[@id="img_pdt"]/a/@href').extract()[0]
         item_loader.add_value('url', urljoin_rfc(root, href))

         price_bits = container.select(".//a[@class='prix_normal']//text()").extract()
         item_loader.add_value('price', u''.join(price_bits))

         yield item_loader.load_item()
Esempio n. 31
0
    def parse_product(self, response):
        """Yield the product on the page, identified by the feed row.

        ``response.meta['row']['PRODUCT_NUMBER']`` is used as both
        identifier and SKU. The categories are the breadcrumb link texts
        without the first one. Stock is set to 0 unless the page carries an
        ``availability`` link whose href contains "InStock".
        """
        hxs = HtmlXPathSelector(response)
        row = response.meta['row']

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('identifier', row['PRODUCT_NUMBER'])
        loader.add_value('sku', row['PRODUCT_NUMBER'])
        loader.add_xpath('brand', '//span[@itemprop="brand"]/text()')
        # The first breadcrumb entry is dropped.
        categories = hxs.select(
            '//a[@class="breadcrumb"]/text()').extract()[1:]
        loader.add_value('category', categories)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_xpath('price', '//span[@id="price_container"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('image_url', '//img[@id="product_image"]/@src')
        in_stock = hxs.select(
            '//link[@itemprop="availability" and contains(@href, "InStock")]')
        if not in_stock:
            loader.add_value('stock', 0)
        yield loader.load_item()
Esempio n. 32
0
    def parse_products(self, hxs, response):
        """Yield products from the paged listing and from the ``multi`` table.

        In both layouts the SKU is derived from the product's relative link
        by ``self._get_sku``.
        """
        # Paged listing: one div per product.
        for block in hxs.select('//div[starts-with(@id, "productData-")]'):
            item_loader = ProductLoader(selector=block, item=Product())
            item_loader.add_xpath('name', './/a[@class="pagedLink"]/text()')
            item_loader.add_xpath('price', './/div[@class="pagedPriceSale"]/text()')
            href = block.select('.//a[@class="pagedLink"]/@href').extract()[0]
            item_loader.add_value(
                'url', urljoin_rfc(get_base_url(response), href))
            item_loader.add_value('sku', self._get_sku(href))

            yield item_loader.load_item()

        # Table layout: one row per product, link in the second cell.
        second_cell_link = './/td[position() = 2]/a'
        for table_row in hxs.select('//table[@id="multi"]//td[@id="multi-product3"]/..'):
            item_loader = ProductLoader(selector=table_row, item=Product())
            item_loader.add_xpath('name', second_cell_link + '/text()')
            href = table_row.select(second_cell_link + '/@href').extract()[0]
            item_loader.add_value(
                'url', urljoin_rfc(get_base_url(response), href))
            item_loader.add_value('sku', self._get_sku(href))
            item_loader.add_xpath('price', './/td[@id="multi-price2"]//text()')
            yield item_loader.load_item()
Esempio n. 33
0
    def parse_product(self, response):
        """Yield the product on the page, identified by the feed row.

        ``response.meta['row']['PRODUCT_NUMBER']`` is used as both
        identifier and SKU. The categories are the ``crumb`` list link
        texts. Stock is set to 0 when the page carries an ``availability``
        meta element whose content contains "OutOfStock".
        """
        hxs = HtmlXPathSelector(response)
        row = response.meta['row']

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('identifier', row['PRODUCT_NUMBER'])
        loader.add_value('sku', row['PRODUCT_NUMBER'])
        loader.add_xpath('brand', '//span[@class="brand"]/text()')
        categories = hxs.select('//ul[@class="crumb"]/li/a/text()').extract()
        loader.add_value('category', categories)
        loader.add_xpath('name', '//h2[@itemprop="name"]/text()')
        loader.add_xpath('price', '//span[@class="list_price"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('image_url', '//img[@id="imyimage"]/@src')
        out_of_stock = hxs.select(
            '//meta[@itemprop="availability" and contains(@content, "OutOfStock")]'
        )
        if out_of_stock:
            loader.add_value('stock', 0)
        yield loader.load_item()
Esempio n. 34
0
    def parse_product(self, response):
        """Collect product details, then post to the mini-cart endpoint.

        The loaded item is stored under ``meta['product']`` of a copy of
        the incoming meta, and a form POST adding one unit of the product
        to the cart is yielded with ``self.parse_price`` as callback.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), response=response)

        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@class="prod_name"]/text()')

        # Anything other than an "EN STOCK" label is recorded as stock 0.
        stock_text = ''.join(
            hxs.select('//span[contains(@class, "prod_stock_text")]/text()')
            .extract())
        if 'EN STOCK' not in stock_text.upper():
            loader.add_value('stock', 0)

        for category in response.meta['categories']:
            loader.add_value('category', category)

        loader.add_xpath('brand',
                         '//li[span[contains(text(), "Fabricante")]]/text()')
        loader.add_value('shipping_cost', 6.99)
        loader.add_xpath(
            'sku', u'//li[span[contains(text(), "Cod. Artículo")]]/text()')

        loader.add_value(
            'identifier',
            hxs.select(
                '//input[@id="JS_google_remarketing__prodid"]/@value'
            ).extract())

        images = hxs.select('//ul[@class="etalage"]//img/@src').extract()
        if images:
            loader.add_value('image_url', images[0])
        product = loader.load_item()

        request_meta = response.meta.copy()
        request_meta['product'] = product

        ajax_headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'X-Requested-With': 'XMLHttpRequest'
        }
        form_fields = {
            'product_id': product['identifier'],
            'quantity': '1',
            'view_minicart': 'front_cart/v_modal_add_cart'
        }
        yield FormRequest(
            'http://www.mytelecom.es/es/ajax_cart_update/only_minicart:true',
            formdata=form_fields,
            headers=ajax_headers,
            dont_filter=True,
            meta=request_meta,
            callback=self.parse_price)
Esempio n. 35
0
    def parse(self, response):
        """Yield the first parts-table row when its SKU matches the search.

        Only the first ``<tr onmouseout=...>`` row of ``tblParts`` is
        considered.  The item is yielded when the SKU scraped from that row
        equals the lower-cased ``response.meta['sku']``.
        """
        hxs = HtmlXPathSelector(response)

        rows = hxs.select('//table[@id="tblParts"]//tr[@onmouseout]')
        if not rows:
            return

        row = rows[0]
        # Read the link from the same row the loader is bound to; querying
        # the whole row list could return a link belonging to a later row.
        hrefs = row.select('./td[3]/div/a/@href').extract()
        if not hrefs:
            return

        loader = ProductLoader(item=Product(), selector=row)
        loader.add_xpath('name', './td[3]/div/a//text()')
        loader.add_value('url', urljoin_rfc(get_base_url(response), hrefs[0]))
        # Raw string so the backslash reaches the regex engine unchanged.
        loader.add_xpath('price', './td[6]//text()', re=r'\$(.*)')
        loader.add_xpath('sku', './td[3]/div/a[2]/text()')
        if loader.get_output_value('sku') == response.meta['sku'].lower():
            yield loader.load_item()
Esempio n. 36
0
    def parse(self, response):
        """Yield the product whose link contains the searched SKU.

        Two layouts are handled: a single-product page
        (``single_product clearfix``) and a thumbnail listing
        (``ThumbContainer1``).  In the listing, the last entry whose link
        contains ``response.meta['sku']`` is the one yielded.  Nothing is
        yielded when neither layout is present or no link matches.
        """
        hxs = HtmlXPathSelector(response)
        product = hxs.select('//div[@class="single_product clearfix"]')
        is_listing = False
        if not product:
            product = hxs.select('//div[@class="ThumbContainer1"]')
            is_listing = True
        if not product:
            return

        pr = None
        if is_listing:
            for prd in product:
                loader = ProductLoader(item=Product(), selector=prd)
                url = prd.select(
                    './/div[@class="ThumbImage"]//a/@href').extract()[0]
                if response.meta['sku'] not in url:
                    continue
                loader.add_value('url', url)
                loader.add_xpath(
                    'name', './/span[@class="underline"]/strong/text()')
                # Some names are wrapped in an extra <i> element.
                if not loader.get_output_value('name'):
                    loader.add_xpath(
                        'name', './/span[@class="underline"]/strong/i/text()')
                loader.add_value('sku', response.meta['sku'])
                loader.add_xpath('price', './/span[@class="Label1"]/text()')
                pr = loader
        else:
            loader = ProductLoader(item=Product(), selector=product)
            url = product.select(
                './/div[@class="product_image"]/a/@href').extract()[0]
            if response.meta['sku'] in url:
                loader.add_value('url', url)
                loader.add_xpath('name', './/li[@class="product_title"]/text()')
                loader.add_value('sku', response.meta['sku'])
                # The second text node of the price div is used as the price.
                price = product.select(
                    './/div[@class="price"]/text()').extract()[1]
                loader.add_value('price', price)
                pr = loader

        if pr:
            yield pr.load_item()
Esempio n. 37
0
    def parse_product(self, response):
        """Load a product from its detail page.

        The identifier is read from the hidden ``product_page_product_id``
        input, falling back to the part of the last URL segment before the
        first ``-``.  The price has its comma replaced by a dot and all
        whitespace removed; it is 0 when no price is displayed.
        """
        hxs = HtmlXPathSelector(response)

        id_values = hxs.select(
            '//input[@id="product_page_product_id"]/@value').extract()
        if id_values:
            identifier = id_values[0]
        else:
            identifier = response.url.split('/')[-1].split('-')[0]

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('identifier', identifier)
        product_loader.add_xpath('name', '//div[@id="product_title"]/h1/text()')
        product_loader.add_value('category', response.meta['category'])
        product_loader.add_xpath('brand', '//a[@class="brand_image"]/@title')
        product_loader.add_xpath(
            'sku', '//h2[@id="product_reference"]/span/text()')
        product_loader.add_value('url', response.url)

        price_texts = hxs.select(
            '//span[@id="our_price_display"]/text()').extract()
        price = (''.join(price_texts[0].replace(',', '.').split())
                 if price_texts else 0)
        product_loader.add_value('price', price)

        product_loader.add_xpath('image_url',
                                 '//div[@id="image-block"]/img/@src')
        yield product_loader.load_item()
Esempio n. 38
0
 def parse(self, response):
     """Yield listing entries whose lower-cased name equals the searched name.

     The expected name comes from ``response.meta['name']`` and the SKU
     stored on each yielded item from ``response.meta['sku']``.
     """
     hxs = HtmlXPathSelector(response)
     for product in hxs.select('//li[@class="item first"]'):
         # Evaluate against the current node, not the whole result list,
         # otherwise every iteration would see the first entry's name.
         names = product.select(
             'h2[@class="product-name"]/a/text()').extract()
         if not names or names[0].lower() != response.meta['name']:
             continue

         loader = ProductLoader(item=Product(), selector=product)
         loader.add_xpath('name', 'h2[@class="product-name"]/a/text()')
         loader.add_xpath('url', 'h2[@class="product-name"]/a/@href')
         loader.add_xpath(
             'price',
             'div[@class="price-box"]/span[@class="price-excluding-tax"]/span[@class="price"]/text()'
         )
         loader.add_value('sku', response.meta['sku'])
         yield loader.load_item()
Esempio n. 39
0
    def parse_node(self, response, node):
        """Turn one XML feed node into a product.

        Returns ``None`` for non-XML responses.  The identifier (also used
        as SKU) is the path segment following ``product/`` in the node's
        ``product-url``.  An empty ``Product()`` is returned when the
        loaded price is falsy.
        """
        if not isinstance(response, XmlResponse):
            return

        product_id = node.select(u'./product-url/text()').re(
            r'product/([^/]+)/')[0]

        loader = ProductLoader(item=Product(), selector=node)
        loader.add_value(
            'url', node.select(u'./product-url/text()').extract()[0])
        loader.add_xpath('name', u'./title/text()')
        # The feed uses a decimal comma.
        raw_price = node.select(u'./price/text()').extract()[0]
        loader.add_value('price', raw_price.replace(',', '.'))
        loader.add_xpath('category', u'merchant-category/text()')
        loader.add_xpath('brand', u'brand/text()')
        loader.add_xpath('image_url', u'image-url/text()')
        loader.add_value('sku', product_id)
        loader.add_value('identifier', product_id)

        if not loader.get_output_value('price'):
            return Product()
        return loader.load_item()
0
 def parse_product(self, response):
     """Load a product from the page's microdata.

     The identifier is the text between the last ``-`` of the URL and the
     following ``.``.  When no price is found, the product is skipped
     (with a log line) if the page says it is discontinued; otherwise it
     is emitted with stock 0 and price 0.
     """
     loader = ProductLoader(item=Product(), response=response)
     product_id = response.url.split('-')[-1].split('.')[0]
     loader.add_value('identifier', product_id)
     loader.add_xpath('name', '//meta[@itemprop="name"]/@content')
     loader.add_xpath('price', '//meta[@itemprop="price"]/@content')

     breadcrumbs = response.xpath(
         '//div[@id="prodBreadCrumbs"]//a/text()').extract()
     for crumb in breadcrumbs:
         loader.add_value('category', crumb)

     if loader.get_output_value('price') is None:
         notice = response.xpath(
             '//span[@class="markup-blu markup-lg"]/text()')
         if notice and 'discontinued' in notice.extract()[0].lower():
             self.log('delisted product {}'.format(response.url))
             return
         loader.add_value('stock', 0)
         loader.add_value('price', 0)

     loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
     loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
     loader.add_value('url', response.url)
     yield loader.load_item()
Esempio n. 41
0
 def parse_product(self, response):
     """Load a product from its detail page.

     The identifier comes from the hidden ``product`` input, or from the
     ``product=`` part of the URL when the input is absent.  The price is
     taken from the first source that yields anything: the price span,
     then the two labelled fallbacks in the description.  Dots are removed
     and commas become dots before it is stored.
     """
     hxs = HtmlXPathSelector(response)
     loader = ProductLoader(item=Product(), response=response)

     identifier = (
         hxs.select('//input[@name="product"]/@value').extract()
         or re.search(r'product=(.*)', response.url).groups()[0])
     loader.add_value('identifier', identifier)
     loader.add_xpath('name', '//*[@id="header"]/text()')
     loader.add_xpath('brand', '//a[@class="hilight"]/text()')
     loader.add_value('url', response.url)

     # ``or`` short-circuits, so each fallback runs only if needed.
     price_parts = (
         hxs.select(
             '//*[@id="productdata"]//span[@class="price"]/text()').extract()
         or hxs.select(
             '//*[@id="productdesc"]//font/text()').re(u"Værdi: (.+)")
         or hxs.select(
             '//*[@id="productdesc"]//font/text()').re(
                 u"Vejl. udsalgspris: (.+)"))
     loader.add_value(
         'price', ''.join(price_parts).replace('.', '').replace(',', '.'))

     loader.add_xpath('sku', '//input[@name="product"]/@value')
     loader.add_xpath('category', u'//span[@id="productnavgroup"]/a[1]/text()')

     images = hxs.select(u'//img[@id="productimg"]/@src').extract()
     if images:
         loader.add_value(
             'image_url', urljoin_rfc(get_base_url(response), images[0]))
     yield loader.load_item()
    def parse_product(self, response):
        """Load a product from its detail page.

        The SKU is read from the "SKU" row of the specs table; when that
        row is empty, five digits captured from the "Barcode" row are used
        instead.  The brand is fixed and the category comes from
        ``response.meta['category']``.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        # XPath for the value cell next to a given label in the specs table.
        spec_value = ('//table[@id="product-attribute-specs-table"]'
                      '//th[@class="label" and contains(text(), "%s")]'
                      '/following-sibling::*/text()')
        sku = (hxs.select(spec_value % 'SKU').extract()
               or hxs.select(spec_value % 'Barcode').re(r'(\d\d\d\d\d)\d$'))

        loader = ProductLoader(response=response, item=Product())
        loader.add_xpath(
            'price',
            '//div[@class="price-box"]/span[@class="price-excluding-tax"]'
            '/span[@class="price"]/text()')
        loader.add_xpath('identifier', '//input[@name="product"]/@value')
        loader.add_value('sku', sku)
        loader.add_value('brand', 'Draper')
        loader.add_value('url', urljoin_rfc(base_url, response.url))
        loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')
        loader.add_xpath(
            'image_url',
            '//div[@class="product-img-box"]//img[@id="image"]/@src')
        loader.add_value('category', response.meta['category'])

        yield loader.load_item()
Esempio n. 43
0
    def parse_cat(self, response):
        """Yield one product per ``product-info`` div on a category page.

        The identifier is read from a preceding sibling ``<input>`` whose
        name contains ``product_id``; links are made absolute against the
        page's base URL.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        for info in hxs.select('//div[@class="product-info"]'):
            href = info.select(
                './/a[@class="product-title"]/@href').extract()[0]

            item_loader = ProductLoader(item=Product(), selector=info)
            item_loader.add_xpath(
                'identifier',
                'preceding-sibling::input[contains(@name, "product_id")]/@value'
            )
            item_loader.add_xpath(
                'name', './/a[@class="product-title"]/text()')
            item_loader.add_value('url', urljoin_rfc(base_url, href))
            item_loader.add_xpath(
                'price', './/span[@class="price"]/span[@id]/text()')
            item_loader.add_xpath(
                'sku',
                './/p[@class="sku"]//span[contains(@id,"product_code")]/text()'
            )
            yield item_loader.load_item()
Esempio n. 44
0
    def parse_product(self, response):
        """Yield a product, or one stock-check request per product option.

        Pages without a ``selection[]`` dropdown produce a single item
        scraped from the page.  Pages with options produce one POST to the
        stock-check endpoint per option; the partially filled loader rides
        along in ``request.meta['item']`` and no price is set here
        (presumably ``parse_product_price`` supplies it - confirm).
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        variants = hxs.select('//select[@name="selection[]"]/option')
        if not variants:
            # Simple product: everything is on the page.
            simple = ProductLoader(item=Product(), response=response)
            simple.add_value('url', response.url)
            simple.add_xpath('sku', '//input[@name="prodid"]/@value')
            simple.add_xpath('name', '//h1/text()')
            simple.add_xpath('price', '//span[@id="ourprice"]/text()')
            yield simple.load_item()
            return

        log.msg('Found %d options for this product' % len(variants))
        stock_url = urljoin_rfc(base_url, 'cmsplus/store-stockcheck.php')

        # Form fields shared by every option's request.
        prodid = hxs.select('//input[@name="prodid"]/@value').extract()[0]
        versionids = hxs.select(
            '//input[@name="verids"]/@value').extract()[0]
        presel = hxs.select('//input[@name="presel"]/@value').extract()[0]
        curtime = "%s" % int(round(time.time() * 1000))  # epoch millis
        name = hxs.select('//h1/text()').extract()[0]
        sku = hxs.select('//input[@name="prodid"]/@value').extract()[0]

        for variant in variants:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('url', response.url)
            variant_id = variant.select('@value').extract()[0]
            loader.add_value('sku', '%s_%s' % (sku, variant_id))
            variant_label = variant.select('text()').extract()[0]
            loader.add_value('name', '%s Type: %s' % (name, variant_label))

            form = {
                'prodid': prodid,
                'versionids': versionids,
                'presel': presel,
                'var[]': variant_id,
                'firstrun': '1',
                'curtime': curtime
            }
            request = FormRequest(url=stock_url,
                                  formdata=form,
                                  callback=self.parse_product_price)
            request.meta['item'] = loader
            yield request
Esempio n. 45
0
    def parse_product(self, response):
        """Load a product and attach a price-dependent shipping cost.

        The displayed price has its dots removed and its comma turned into
        a dot before being stored.  The identifier is whatever follows
        ``id=`` in the URL.  Shipping cost is chosen from fixed price
        bands using the value returned by ``extract_price``.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)

        raw_name = hxs.select('//span[@class="ArTit"]//text()').extract()[0]
        loader.add_value('name', " ".join(raw_name.split()))
        loader.add_xpath(
            'sku', '//span[@id="MainContent_ngpArticolo_lblARCd_AR"]/text()')

        price = hxs.select(
            '//span[@id="MainContent_ngpArticolo_lblPrezzoScontato"]/text()'
        )[0].extract()
        price = price.replace('.', '').replace(',', '.')
        loader.add_value('price', price)

        loader.add_xpath(
            'brand',
            '//span[@id="MainContent_ngpArticolo_lblARMarcaDescrizione"]/text()'
        )
        loader.add_xpath(
            'category',
            '//span[@id="MainContent_ngpArticolo_lblCd_ARGruppo2"]/text()')

        # The gallery image is either an <img> or an <input> with a src.
        image_nodes = (hxs.select('//div[@id="gallery"]/img/@src')
                       or hxs.select('//div[@id="gallery"]/input/@src'))
        image_src = image_nodes[0].extract()
        if not image_src.strip().endswith('noimage.png'):
            loader.add_value('image_url', urljoin_rfc(base_url, image_src))

        if hxs.select('//div[@class="art-light-red"]'):
            loader.add_value('stock', 0)
        loader.add_value('url', response.url)
        loader.add_value('identifier', response.url.split('id=')[1])

        price = extract_price(price)

        # (exclusive upper price bound, shipping cost); first match wins.
        shipping_bands = (
            (Decimal(100), '15.00'),
            (Decimal(251), '30.00'),
            (Decimal(751), '40.00'),
            (Decimal(1000), '60.00'),
        )
        for upper_bound, cost in shipping_bands:
            if price < upper_bound:
                shipping_cost = cost
                break
        else:
            shipping_cost = '100.00'
        loader.add_value('shipping_cost', shipping_cost)

        yield loader.load_item()
Esempio n. 46
0
    def parse_product(self, response):
        """Yield bundle variants (when present) followed by the base product.

        When the page embeds a ``Product.Bundle(...)`` JSON blob, the
        options pre-selected through hidden inputs are folded into a base
        name and price.  One item is then yielded for each selection of
        every *other* option, provided its loaded price is truthy.  The
        plain product is yielded afterwards if a price can be found on the
        page.  Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        bundle_json = hxs.select(u'//script').re(r'Product\.Bundle\((.*)\)')

        if bundle_json:
            bundle_options = json.loads(bundle_json[0])['options']
            hidden_inputs = hxs.select(
                u'//div[@class="input-box"]//input[@type="hidden"]')

            name = hxs.select(
                u'//div[@class="product-name"]/h1/text()').extract()[0].strip()
            price = Decimal(0)

            preselected = set()
            for hidden in hidden_inputs:
                option_key = hidden.select(u'./@name').re(
                    r'bundle_option\[(.*)\]')[0]
                selection_key = hidden.select(u'./@value').extract()[0]
                chosen = bundle_options[option_key]['selections'][selection_key]
                name += u' %s' % chosen['name'].strip()
                price += Decimal(chosen['price']).quantize(Decimal('0.01'))
                # Record the option *key* so the set difference below
                # really removes the options already folded into the base.
                preselected.add(option_key)

            remaining_keys = set(bundle_options.keys()).difference(preselected)
            for option_key in remaining_keys:
                selections = bundle_options[option_key]['selections']
                for selection in selections.values():
                    selection_price = Decimal(
                        selection['price']).quantize(Decimal('0.01'))

                    loader = ProductLoader(item=Product(), selector=hxs)
                    loader.add_value('url', response.url)
                    loader.add_value(
                        'name', name + u' %s' % selection['name'].strip())
                    loader.add_value('price', price + selection_price)
                    if loader.get_output_value('price'):
                        yield loader.load_item()

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_xpath('name', u'//div[@class="product-name"]/h1/text()')
        loader.add_xpath(
            'price',
            u'//span[@class="regular-price"]/span[@class="price"]/text()')
        if not loader.get_output_value('price'):
            # No regular price: fall back to the minimal / "from" price.
            loader.add_xpath('price', u'//div[@class="price-box"]//p[@class="minimal-price" or @class="price-from"]/span[@class="price"]/text()')
        if loader.get_output_value('price'):
            yield loader.load_item()
    def parse_product(self, response):
        """Complete the product carried in meta, unless its brand is ignored.

        The loader starts from ``response.meta['product']`` and adds fields
        scraped from the page.  Products whose upper-cased brand is listed
        in ``self.ignore_brands`` are logged and dropped; all others are
        passed through ``self.add_shipping_cost`` before being yielded.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=response.meta['product'], selector=hxs)
        loader.add_xpath('identifier', '//input[@name="product"]/@value')
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1/*[@itemprop="name"]//text()')
        loader.add_xpath(
            'sku', '//tr/th[contains(text(), "Model Number")]/../td/text()')

        loader.add_value('category', response.meta.get('category'))

        img = hxs.select('//img[@id="product-image-zoom-img"]/@src').extract()
        if img:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), img[0]))

        loader.add_xpath(
            'brand', '//tr/th[contains(text(), "Brand Name")]/../td/text()')

        # A page without a "Brand Name" row has no brand value; treat that
        # as an empty brand instead of failing on ``None.strip()``.
        brand = (loader.get_output_value('brand') or '').strip().upper()
        if brand in self.ignore_brands:
            log.msg('Ignoring %s product: %s' % (brand, response.url))
            return
        yield self.add_shipping_cost(loader.load_item())
Esempio n. 48
0
    def parse_product(self, response):
        """Load a product; queue it for a cart lookup when it can be bought.

        Error pages are logged and skipped.  Pages without breadcrumbs are
        retried through ``self.retry``.  When an "add" button is present
        the item is appended, with its form data, to
        ``self.collect_products`` rather than yielded; otherwise the item
        is yielded directly.
        """
        hxs = HtmlXPathSelector(response)
        # skip error page
        if hxs.select('//div[@class="portlet-msg-error"]'):
            self.log('[WARNING] Error page when loading url: %s' %
                     response.url)
            return

        breadcrumbs = hxs.select(
            '//ul[contains(@class, "breadcrumbs")]/li/span/a/text()'
        ).extract()
        if not breadcrumbs:
            # retry
            yield self.retry(response,
                             "Error getting category from: %s" % response.url)
            return

        l = ProductLoader(item=Product(), response=response)
        l.add_xpath('name', '//div[contains(@class, "description")]/h1/text()')
        l.add_value('url', response.url)
        sku = hxs.select(
            '//div[contains(@class, "description-panel")]/span[contains(text(), "Ref. ")]/text()'
        ).extract()
        sku = sku[0].strip().replace('Ref. ', '') if sku else ''
        l.add_value('sku', sku)
        l.add_value('identifier', sku)
        l.add_value('brand', '')
        l.add_xpath('image_url', '//img[@id="current-zoomed"]/@src')
        # The last breadcrumb is used as the category.
        l.add_value('category', breadcrumbs[-1])
        price = hxs.select(
            '//div[@class="price"]/span[@class="amount"]/text()').extract()
        # Drop thousands dots and turn the decimal comma into a dot.
        price = price[0].replace('.', '').replace(',', '.') if price else 0
        # ``price`` is an already extracted value, not an XPath expression,
        # so it must be stored with add_value.
        l.add_value('price', price)
        item = l.load_item()

        add_button = hxs.select(
            '//div[@class="add-section"]/a[contains(@class, "btn-green")]')
        if add_button:
            formdata = {'product': sku, 'quantity': '1'}

            product = {'item': item, 'formdata': formdata}
            self.collect_products.append(product)
        else:
            yield item
Esempio n. 49
0
    def parse_product(self, response):
        """Follow option links, or load the product when there are none.

        Pages exposing ``itemprop="url"`` links only yield requests for
        those links, handled by this same callback.  Otherwise a product is
        loaded; breadcrumb entries from the seventh onwards that are not
        already contained in the name are added to the ``name`` field.  The
        item is yielded only when an identifier was found.
        """
        hxs = HtmlXPathSelector(response)

        option_links = hxs.select('//a[@itemprop="url"]/@href').extract()
        if option_links:
            for link in option_links:
                yield Request(response.urljoin(link),
                              callback=self.parse_product)
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@itemprop="name"]//text()')

        crumbs = hxs.select(
            '//div[@class="breadcrumbs"]/ul/li/a/span/text()').extract()
        for crumb in crumbs[6:]:
            if crumb not in loader.get_output_value('name'):
                loader.add_value('name', crumb)

        loader.add_xpath('identifier',
                         '//meta[@itemprop="productID"]/@content')
        loader.add_xpath('price', '//span[@itemprop="price"]/text()')
        loader.add_css('price', '.price ::text')
        loader.add_value('url', response.url)
        loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')

        if not loader.get_output_value('identifier'):
            return
        yield loader.load_item()
Esempio n. 50
0
 def parse_products(self, hxs, response):
     """Yield url/name/price items for every product link container.

     Containers are first looked up inside the ``v65-productDisplay``
     table; when none are found there, the grandparents of any
     ``productnamecolor`` link are used instead.
     """
     name_link = './/a[contains(@class, "productnamecolor")]'
     containers = (
         hxs.select(
             '//table[@class="v65-productDisplay"]//a[contains(@class, "productnamecolor")]/..'
         )
         or hxs.select('//a[contains(@class, "productnamecolor")]/../..'))

     for container in containers:
         item_loader = ProductLoader(item=Product(), selector=container)
         item_loader.add_xpath('url', name_link + '/@href')
         item_loader.add_xpath('name', name_link + '/text()')
         item_loader.add_xpath(
             'price',
             './/font[contains(@class, "colors_productprice")]/text()')
         yield item_loader.load_item()
    def parse_products(self, response):
        """Crawl brand and sub-category links and yield listed products.

        Brand links are followed until ``self.brand_crawled`` has been set,
        which happens once all brand requests of a page have been yielded.
        Sub-category links are always followed.  Links not starting with
        ``http`` are resolved against the page's base URL.  Each product
        uses its special price when one is shown, else the regular one.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        if not self.brand_crawled:
            brand_links = hxs.select(
                '//*[@class="infoBox-categories"]//a/@href').extract()
            for link in brand_links:
                if re.search('^http', link):
                    target = link
                else:
                    target = urljoin_rfc(base_url, link)
                yield Request(target, callback=self.parse_products)
            self.brand_crawled = True

        # Nested sub-category pages.
        subcategory_links = hxs.select(
            '//div[@id="catView"]//a/@href').extract()
        for link in subcategory_links:
            if re.search('^http', link):
                target = link
            else:
                target = urljoin_rfc(base_url, link)
            yield Request(target, callback=self.parse_products)

        # Product listing.
        special_price = './/h3/a/span[@class="productSpecialPrice"]/text()'
        for entry in hxs.select(
                '//div[@id="productView"]/ul/li[@class="product"]'):
            loader = ProductLoader(item=Product(), selector=entry)
            loader.add_xpath('name', './/h2/a/text()')
            if entry.select(special_price):
                price_xpath = special_price
            else:
                price_xpath = './/h3/a/text()'
            loader.add_xpath('price', price_xpath)
            loader.add_xpath('url', './/h2/a/@href')
            yield loader.load_item()
Esempio n. 52
0
 def parse(self, response):
     """Yield the products in the grid, then a request for the next page.

     Each cell of the ``grid`` table becomes one item with name, url and
     price.  When a "next" link exists it is resolved against the page's
     base URL and requested without an explicit callback.
     """
     hxs = HtmlXPathSelector(response)
     title_link = 'table/tr/td/div[@class="ttl g-std"]/a'

     for cell in hxs.select('//table[@class="grid"]/tr/td'):
         item_loader = ProductLoader(item=Product(), selector=cell)
         item_loader.add_xpath('name', title_link + '/@title')
         item_loader.add_xpath('url', title_link + '/@href')
         item_loader.add_xpath(
             'price',
             'table/tr/td/div/table/tr/td/span[@itemprop="price"]/text()')
         yield item_loader.load_item()

     next_links = hxs.select('//td[@class="next"]/a/@href').extract()
     if next_links:
         yield Request(urljoin_rfc(get_base_url(response), next_links[0]))
Esempio n. 53
0
    def parse_product(self, response):
        """Load a product, attach empty review metadata, then parse reviews.

        The price comes from the ``data-prix-origine`` attribute when
        present, else from the visible price text, else it is 0.  The last
        breadcrumb entry is left out of the categories.  The loaded item is
        stored in ``response.meta['product']`` and everything produced by
        ``self.parse_review`` for this response is re-yielded.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(response=response, item=Product())

        loader.add_value('url', response.url)
        loader.add_xpath('identifier', '//div[@id="dv_ref"]/@title')
        loader.add_xpath('sku', '//div[@id="dv_ref"]/@title')

        prices = (
            hxs.select(
                '//span[contains(@class, "prix")]/@data-prix-origine'
            ).extract()
            or hxs.select(
                '//div[@class="fa-infos-prix"]/div//span[contains(@class, "prix")]/text()'
            ).extract())
        if prices:
            loader.add_value('price', prices[0])
        else:
            loader.add_value('price', 0)

        loader.add_xpath('name', '//h1[@itemprop="name"]//text()')
        breadcrumb = hxs.select(
            '//div[@class="breadcrumb"]/span/a/span[@itemprop="title"]/text()'
        ).extract()
        loader.add_value('category', breadcrumb[:-1])

        image_src = ''.join(
            hxs.select('//img[@itemprop="image"]/@src').extract())
        if image_src:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), image_src))

        loader.add_value('brand', response.meta.get('brand'))

        available = hxs.select(
            '//div[contains(@class, "text-dispo") and contains(text(), "En stock")]'
        )
        loader.add_value('stock', '1' if available else '0')

        product = loader.load_item()
        review_meta = KeterMeta()
        review_meta['reviews'] = []
        product['metadata'] = review_meta

        response.meta['product'] = product
        for result in self.parse_review(response):
            yield result
Esempio n. 54
0
    def parse_product(self, response):
        """Load a product from its detail page.

        ``identifier`` and ``sku`` are both the ``PRODUCT_NUMBER`` of the
        ``row`` mapping in ``response.meta``.  The brand is fixed and the
        first breadcrumb entry is left out of the categories.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)  # computed but not used below

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_xpath('name', './/h1[@itemprop="name"]/text()')
        loader.add_value('url', response.url)
        loader.add_value(
            'identifier', response.meta.get('row').get('PRODUCT_NUMBER'))
        loader.add_xpath(
            'image_url',
            '//div[@class="product"]//img[@id="product-image"]/@src')
        loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
        loader.add_value(
            'sku', response.meta.get('row').get('PRODUCT_NUMBER'))

        loader.add_value('brand', 'Poundstretcher')

        crumbs = hxs.select(
            '//li[contains(@itemtype,"Breadcrumb")]/a/span/text()')
        for crumb in crumbs[1:].extract():
            loader.add_value('category', crumb)

        yield loader.load_item()
 def parse_product(self, response):
     """Load a product, or log an error and stop when no price is shown.

     The price is the second whitespace-separated token of the first
     ``titolo-cat`` text node.  The identifier is the fifth ``/``-separated
     piece of the ``og:url`` meta content.
     """
     hxs = HtmlXPathSelector(response)
     loader = ProductLoader(item=Product(), selector=hxs)
     loader.add_xpath('name', '//span[@class="testo16-titolo"]/text()')
     loader.add_xpath('sku', '//span[@id="shownIntxt"]/text()')

     price_nodes = hxs.select('//span[@class="titolo-cat"]/text()')
     if not price_nodes:
         log.msg('ERROR: No price url: ' + response.url)
         return
     loader.add_value('price', price_nodes.extract()[0].split()[1])

     crumbs = hxs.select('//div[@class="testo11-nero"]/strong/text()').extract()
     loader.add_value('category', crumbs[-1])
     loader.add_xpath('image_url', '//a[@id="foto_visualizzata_link"]/img/@src')

     canonical = hxs.select('//meta[@property="og:url"]/@content').extract()[0]
     loader.add_value('identifier', canonical.split('/')[4])
     loader.add_value('url', response.url)
     yield loader.load_item()
Esempio n. 56
0
    def parse(self, response):
        """Yield the lowest-priced search result, if any result has a price.

        Results without a (truthy) loaded price are skipped.  Among the
        rest, the first one with the smallest price is kept.  The SKU comes
        from ``response.meta['sku']``.
        """
        hxs = HtmlXPathSelector(response)

        cheapest = None
        results = hxs.select(
            '//div[@id="atfResults"]//div[starts-with(@id, "result_")]')
        for result in results:
            candidate = ProductLoader(item=Product(), selector=result)
            candidate.add_xpath('name', './/h3[@class="title"]/a/text()')
            candidate.add_xpath('url', './/h3[@class="title"]/a/@href')
            candidate.add_xpath('price', './/td[@class="toeOurPrice"]/a/text()')
            candidate.add_value('sku', response.meta['sku'])

            price = candidate.get_output_value('price')
            if not price:
                continue
            if cheapest is None or cheapest.get_output_value('price') > price:
                cheapest = candidate

        if cheapest:
            yield cheapest.load_item()
Esempio n. 57
0
    def parse(self, response):
        """Yield every product box on the page, then follow the "Next" link.

        The last matching "Next" href is requested as-is, with this method
        as callback.
        """
        hxs = HtmlXPathSelector(response)
        desc_area = 'span[@class="prod-desc-area"]/'

        for box in hxs.select('//a[@class="prod-box"]'):
            item_loader = ProductLoader(item=Product(), selector=box)
            item_loader.add_xpath(
                'name',
                desc_area + 'span[@class="prod-name-row"]/strong/text()')
            item_loader.add_xpath('url', '@href')
            item_loader.add_xpath(
                'price', desc_area + 'span[@class="price-prod"]/text()')
            yield item_loader.load_item()

        next_links = hxs.select(
            '//*[@id="sli_pagination_footer"]/span/a[text()="Next"]/@href'
        ).extract()
        if next_links:
            yield Request(next_links[-1], callback=self.parse)
Esempio n. 58
0
    def parse_product(self, response):
        """Complete the product carried in meta, skipping prescription items.

        Nothing is yielded for pages marked as prescription-only or for
        pages without a product GUID.  Otherwise the loader starts from
        ``response.meta['product']``, adds the scraped fields, and the item
        is passed through ``self.add_shipping_cost`` before being yielded.
        The brand comes from the brand link; when that is empty the first
        word of the product name is used, if there is one.
        """
        hxs = HtmlXPathSelector(response)

        prescription = response.xpath(
            '//ul[@itemprop="description"]/li[contains(text(), "Prescription required")]'
        )
        if not prescription:
            prescription = response.xpath(
                '//strong[text()="Prescription only"]')
        if prescription:
            return

        loader = ProductLoader(item=response.meta['product'], selector=hxs)
        identifier = response.css(
            '#MainProduct_Product_ProductGUID ::attr(value)').extract(
            ) or response.xpath(
                '//span[@class="hdnProductGuid"]/text()').extract()
        if not identifier:
            return
        loader.add_value('identifier', identifier)
        loader.add_xpath('url', '//link[@rel="canonical"]/@href')
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1/text()')
        loader.add_xpath('sku', '//label[@class="prodCodeSize"]/b/text()')

        # First and last breadcrumb entries are left out of the category.
        category = response.css('.breadcrumb').xpath(
            '//span[@itemprop="title"]/text()').extract()[1:-1]
        loader.add_value('category', category)

        img = response.css('.slider-main img::attr(src)').extract(
        ) or response.css('.img-responsive ::attr(src)').extract()
        if img:
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), img[0]))

        brand = ''.join(
            response.xpath('//a[@class="abrandPLink"]//text()').extract())
        brand = brand.replace('See More ', '').replace(' Products',
                                                       '').strip()
        if not brand:
            # Fall back to the first word of the name; a missing or blank
            # name simply leaves the brand unset instead of raising.
            name_words = (loader.get_output_value('name') or '').split()
            brand = name_words[0] if name_words else ''
        if brand:
            loader.add_value('brand', brand)
        yield self.add_shipping_cost(loader.load_item())
Esempio n. 59
0
    def parse(self, response):
        """Yield the product on this page if its model number matches the query.

        Nothing is yielded when the page shows the "no results" message, when
        it has no "Mfr. Model #" value, or when that value differs
        (case-insensitively) from ``response.meta['sku']``.
        """
        model_xpath = '//td[text()="Mfr. Model #"]/following-sibling::td/text()'

        selector = HtmlXPathSelector(response)
        no_results = selector.select('//div[@id="noResultsMsg"]')
        if no_results or not selector.select(model_xpath):
            return

        item_loader = ProductLoader(item=Product(), response=response)
        item_loader.add_xpath('name', '//div[@id="PageTitle"]/h1//text()')
        item_loader.add_value('url', response.url)
        item_loader.add_xpath(
            'price',
            '//td[@class="tdrightalign"]/strong[starts-with(text(), "$")]/text()'
        )
        item_loader.add_xpath('sku', model_xpath)

        # Only accept the page when it is for the SKU that was searched.
        found_sku = item_loader.get_output_value('sku')
        if found_sku.lower() == response.meta['sku'].lower():
            yield item_loader.load_item()
Esempio n. 60
0
    def parse_product(self, response):
        """Build a product item from a product page and yield it.

        Nothing is yielded when the body contains the server-error marker or
        when the product identifier cannot be found in the inline rating
        script call.  Otherwise a single item is yielded: it is seeded from
        ``response.meta['product']``, filled from the page, and passed
        through ``self.add_shipping_cost``.
        """
        if 'Server is encountered an error' in response.body:
            return
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=response.meta['product'], selector=hxs)

        # re.search returns None when the script call is absent, so check
        # the match before calling .group().
        identifier_match = re.search(
            r"shopee.product.rating.showProductRating\(.*'([^']+)','ecshopfx_rating_container'\);",
            response.body)
        if identifier_match is None:
            return
        loader.add_value('identifier', identifier_match.group(1))
        if not loader.get_output_value('identifier'):
            return
        loader.add_xpath('sku',
                         '//span[@id="ecshopfx_product_serial_value"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h3[@id="producttitle"]/text()')

        # Every breadcrumb registered by the page script except the last one.
        loader.add_value(
            'category',
            re.findall(
                r"shopee.breadcrumb.addToBreadCrumbs\('breadcrumb_container','([^']+)'",
                response.body.decode('utf8'))[:-1])

        # The image path is derived from the SKU, so it is skipped when no
        # SKU was extracted.
        sku = loader.get_output_value('sku')
        if sku:
            image_path = ('/elko/upload/images/products/ecshop_zoom_' +
                          sku + '.jpg')
            loader.add_value('image_url',
                             urljoin_rfc(get_base_url(response), image_path))

        loader.add_xpath(
            'brand',
            'normalize-space(//td[contains(text(),"Framlei")]/following-sibling::td/text())'
        )
        if not loader.get_output_value('brand'):
            # Fall back to the first word of the name; a missing or blank
            # name leaves the brand unset instead of raising.
            name = loader.get_output_value('name')
            name_words = name.split() if name else []
            if name_words:
                loader.add_value('brand', name_words[0])
        yield self.add_shipping_cost(loader.load_item())