def parse_products(self, response):
        """Walk a product listing page and schedule one request per product.

        For every product tile a ``ProductLoader`` is pre-filled with the
        name, URL and (when present) the listing price, and handed to
        ``parse_product`` through ``meta['loader']``.  If the page has a
        " Next Page " link, the listing callback is scheduled again for it.

        Tiles without a link are skipped and tiles without a price are
        followed without one, so that a single malformed tile does not
        abort the rest of the page (including the pagination request).

        :param response: the listing page response.
        :returns: generator of ``Request`` objects.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        products = hxs.select(
            '//div[@id="productListing"]//h5[a[contains(@class, "product-name")]]/..'
        )
        for product in products:
            hrefs = product.select('.//a/@href').extract()
            if not hrefs:
                # Nothing to follow for this tile.
                continue
            # Resolve against the page base, exactly as done for the
            # pagination link below; absolute hrefs come back unchanged.
            url = urljoin_rfc(base_url, hrefs[0])

            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('name', './/a/text()')
            loader.add_value('url', url)

            # Both price lookups start from the parent of the tile node
            # ('..'); the regular price wins over the special price.
            price = product.select(
                '..//div[@class="product-buttons"]//span[@class="sellPrice"]/text()'
            ).extract()
            if not price:
                price = product.select(
                    '..//div[@class="product-buttons"]//div[@class="productSpecialPrice"]/span/text()'
                ).extract()
            if price:
                loader.add_value('price', price[0])

            yield Request(url,
                          callback=self.parse_product,
                          meta={'loader': loader})

        next_page = hxs.select('//a[@title=" Next Page "]/@href').extract()
        if next_page:
            yield Request(urljoin_rfc(base_url, next_page[0]),
                          callback=self.parse_products)
Exemple #2
0
    def parse_product(self, response):
        """Extract a single product from its detail page.

        Redirected responses are skipped.  The identifier is taken from the
        ``p-<digits>.html`` part of the URL; pages without such an identifier
        or without a heading name are logged and skipped instead of raising.
        The brand is guessed by matching the entries of the "manufacturers"
        drop-down against the product name.

        :param response: the product page response; ``meta['category']`` is
            copied into the item when present.
        :returns: generator yielding at most one loaded item.
        """
        hxs = HtmlXPathSelector(response)

        redirected_urls = response.meta.get('redirect_urls', None)
        if redirected_urls:
            log.msg('Skips product, redirected url: ' +
                    str(redirected_urls[0]))
            return

        identifier = re.search(r'p-(\d+)\.html', response.url)
        if identifier is None:
            log.msg('Product without identifier: ' + response.url)
            return

        names = hxs.select(
            u'//td[@class="pageHeading" and @valign="top" and not(@align)]/text()'
        ).extract()
        if not names:
            log.msg('Product without name: ' + response.url)
            return
        name = names[0]

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_value('identifier', identifier.group(1))
        product_loader.add_value('name', name)

        # Regular price sits directly in the right-aligned heading cell;
        # fall back to the special-price span when that text is empty.
        price = ''.join(
            hxs.select(
                u'//td[@class="pageHeading" and @valign="top" and @align="right"]/text()'
            ).extract()).strip()
        if not price:
            price = ''.join(
                hxs.select(
                    u'//td[@class="pageHeading" and @valign="top" and @align="right"]/span[@class="productSpecialPrice"]/text()'
                ).extract())
        product_loader.add_value('price', price)

        product_loader.add_xpath(
            'sku',
            u'//td[@class="pageHeading" and @valign="top" and not(@align)]/span[@class="smallText"]/text()',
            re=r'\[(.*)\]')
        product_loader.add_value('category', response.meta.get('category'))

        image_url = hxs.select(
            u'//a[contains(@href,"images") and child::img]/@href').extract()
        if image_url:
            image_url = urljoin_rfc(get_base_url(response), image_url[0])
            product_loader.add_value('image_url', image_url)

        brands = hxs.select(
            '//form[@name="manufacturers"]/select/option/text()').extract()
        lowered_name = name.lower()
        for brand in brands:
            if '..' in brand:
                # Label was cut off with '..': the last word is partial, so
                # compare only the words before it.
                candidate = ' '.join(brand.split()[:-1])
            else:
                candidate = brand
            # An empty candidate (e.g. a one-word truncated label) would be
            # a substring of every name, so it must not count as a match.
            if candidate.strip() and candidate.lower() in lowered_name:
                product_loader.add_value('brand', brand.replace('..', ''))
                break

        yield product_loader.load_item()
Exemple #3
0
    def parse_product(self, response):
        """Extract a single product from a microdata-annotated detail page.

        The identifier is searched in three places, in order: the hidden
        ``products_id`` form input, the "tell a friend" link, and finally
        the raw response body.  Pages where none of them yields an
        identifier are logged and skipped.  The item is only yielded when
        its loaded price is greater than zero.

        :param response: the product page response.
        :returns: generator yielding at most one loaded item.
        """
        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//div[@itemprop="name"]/text()')

        price = hxs.select(u'//span[@itemprop="price"]/text()').extract()
        product_loader.add_value('price', price[0] if price else '0')

        # Identifier fallbacks, cheapest and most reliable first.
        product_id = hxs.select(
            u'//form//input[@type="hidden" and @name="products_id"]/@value'
        ).extract()
        if not product_id:
            product_id = hxs.select(
                '//div[@id="productTellFriendLink"]/a/@href').re(
                    'products_id=(.*)')
        if not product_id:
            product_id = re.findall(r'products_id=(.*)" class',
                                    response.body)
        if not product_id:
            log.msg('Product without identifier: ' + response.url)
            return

        product_loader.add_value('identifier', product_id[0])

        sku = hxs.select(u'//span[@itemprop="identifier"]/text()').extract()
        if sku:
            product_loader.add_value('sku', sku[0])

        product_loader.add_xpath('category',
                                 u'//div[@id="navBreadCrumb"]/a[2]/text()')

        img = hxs.select(u'//div[@id="productMainImage"]//img/@src').extract()
        if img:
            img = urljoin_rfc(get_base_url(response), img[0])
            product_loader.add_value('image_url', img)

        brand = hxs.select('//li[@itemprop="brand"]/text()').extract()
        if brand:
            brand = brand[0].replace('Manufactured by: ', '')
            product_loader.add_value('brand', brand)

        product = product_loader.load_item()
        # Use .get so an item whose price field was never populated is
        # dropped quietly instead of raising on the missing key.
        # NOTE(review): assumes Product behaves like a scrapy Item here.
        if (product.get('price') or 0) > 0:
            yield product
        """
Exemple #4
0
    def parse_product(self, response):
        """Extract a single product from its detail page.

        Pages without a price in the price box, or without a
        ``sku:``-prefixed ``productID`` meta tag, produce no item.  The
        brand is derived from the loaded product name via
        ``self._get_brand_from_name``.

        :param response: the product page response.
        :returns: generator yielding at most one loaded item.
        """
        hxs = HtmlXPathSelector(response)

        product_loader = AxeMusicProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name',
                                 u'//div[@class="product-name"]/h1/text()')

        price = hxs.select(
            u'//div[@class="price-box"]//span[@class="price"]/text()'
        ).extract()
        if not price:
            return
        product_loader.add_value('price', price[0].strip())

        product_loader.add_xpath(
            'sku', u'//div[@class="sku"]/span[@class="value"]/text()')
        product_loader.add_xpath(
            'category', u'//div[@class="breadcrumbs"]/ul/li[2]/a/span/text()')

        img = hxs.select('//img[@id="image-main"]/@src').extract()
        if img:
            img = urljoin_rfc(get_base_url(response), img[0])
            product_loader.add_value('image_url', img)

        identifier = hxs.select('//meta[@itemprop="productID"]/@content').re(
            'sku:(.*)')
        if not identifier:
            # No usable identifier: skip the page the same way a missing
            # price is handled above, rather than failing on an empty list.
            return
        product_loader.add_value('identifier', identifier[0])

        product_loader.add_value(
            'brand',
            self._get_brand_from_name(product_loader.get_output_value('name')))

        # Stock detection (availability text containing 'OUT OF STOCK')
        # is currently disabled.

        yield product_loader.load_item()
Exemple #5
0
    def parse_product(self, response):
        """Extract a product, or one item per variant row, from a detail page.

        A base item is loaded from the page-level markup.  When the page has
        ``Multi-Child_Background`` table rows, each row is emitted as its own
        item copied from the base one, with sku/identifier taken from the
        first cell, name from the second, and price from the fourth (falling
        back to the third) cell.  Otherwise the base item itself is emitted.
        Items are only yielded when they have an identifier and a price
        that parses to a number greater than zero.

        :param response: the product page response.
        :returns: generator of items.
        """
        # Function-scope import keeps this block self-contained.
        from decimal import Decimal, InvalidOperation

        def first_text(selector, xpath):
            """Return the first stripped match of *xpath*, or None."""
            matches = selector.select(xpath).extract()
            return matches[0].strip() if matches else None

        def is_positive(value):
            """Tell whether *value* parses as a number greater than zero."""
            try:
                return Decimal(str(value)) > 0
            except (InvalidOperation, ValueError, TypeError):
                return False

        hxs = HtmlXPathSelector(response)

        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//span[@itemprop="name"]/text()')
        price = hxs.select(u'//form[@id="vCSS_mainform"]//span[@itemprop="price"]/text()').extract()
        product_loader.add_value('price', price[0] if price else u'0')
        product_loader.add_xpath('sku', u'//span[@class="product_code"]/text()')
        product_loader.add_xpath('identifier', u'//span[@class="product_code"]/text()')
        product_loader.add_xpath('category', u'//td[@class="vCSS_breadcrumb_td"]//a[position()=2]/@title')
        product_loader.add_xpath('image_url', u'concat("http://lamusic.ca",//img[@id="product_photo"]/@src)')
        product_loader.add_xpath('brand', u'//meta[@itemprop="manufacturer"]/@content')
        # Stock detection based on the "Availability:" label is currently
        # disabled, so that label is no longer queried.
        if hxs.select(u'//img[@class="vCSS_img_icon_free_shipping"]'):
            product_loader.add_value('shipping_cost', '0')

        product = product_loader.load_item()
        options = hxs.select(u'//tr[@class="Multi-Child_Background"]')
        if options:
            for opt in options:
                code = first_text(opt, u'./td[1]/text()')
                if not code:
                    # Row without a product code cannot be identified.
                    continue

                # Preferred price cell first, then the looser fallbacks.
                raw_price = (
                    first_text(opt, u'./td[4]//span[@itemprop="price"]/text()')
                    or first_text(opt, u'./td[4]//span/text()')
                    or first_text(opt, u'./td[3]//span[contains(text(), "$")]/text()')
                )
                if not raw_price:
                    continue
                option_price = raw_price.strip().replace('$', '').replace(',', '')
                # The price is stored as text, so it has to be parsed before
                # the "> 0" check; comparing the string itself to 0 never
                # filters anything.
                if not is_positive(option_price):
                    continue

                p = Product(product)
                p['sku'] = code
                p['identifier'] = code
                option_name = first_text(opt, u'./td[2]/text()')
                if option_name:
                    p['name'] = option_name
                p['price'] = option_price
                yield p
        elif product.get('identifier') and is_positive(product.get('price')):
            yield product