Exemple #1
0
    def parse_product(self, response):
        """Yield one item for every priced product found on the page.

        When the page has ``//form/div[@id="price"]`` nodes, each node is
        loaded as its own product; otherwise the page is loaded as a single
        product.  Entries for which the price regex finds nothing are not
        emitted.  Non-HTML responses produce nothing.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        price_blocks = hxs.select(u'//form/div[@id="price"]')

        if price_blocks:
            # Multi-product page: one item per price block.
            for block in price_blocks:
                loader = ProductLoader(item=Product(), selector=block)
                loader.add_xpath('name', u'./h4/text()')
                loader.add_value('url', response.url)
                amount = block.select(u'.//p[@class="price1"]/text()').re('\xa3(.*[0-9])')
                if amount:
                    loader.add_value('price', amount)
                    yield loader.load_item()
            return

        # Single-product page.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', u'//div[@class="product"]/h1/text()')
        amount = hxs.select(u'//div[@class="product"]//p[@class="price1"]/text()').re(u'\xa3(.*)')
        if amount:
            loader.add_value('price', amount)
            yield loader.load_item()
Exemple #2
0
    def browse_and_parse(self, response):
        """Follow category and pagination links and yield this page's products.

        Emitted, in this order:

        * a ``Request`` for every link of the ``navColumnOne`` block that is
          not yet recorded in ``self.navig_url_set``;
        * a ``Request`` for the 'Neste' pagination link, when present;
        * one item per row of the product listing table, plus one item for
          the page itself when it has a ``productPrices`` heading.

        Every href is resolved against the page base URL.  Rows whose link,
        name or price cannot be read are skipped, so a single malformed row
        does not abort the rest of the page.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        def to_price(text):
            # Keep the text before the first '-', take its second
            # space-separated token and rewrite e.g. '1.234,50' as '1234.50'.
            # Returns None when there is no second token.
            tokens = text.split("-")[0].split(" ")
            if len(tokens) < 2:
                return None
            return tokens[1].replace('.', '').replace(',', '.')

        for subcat_href in hxs.select('//div[@id="navColumnOne"]//a/@href').extract():
            subsubcat_url = urlparse.urljoin(base_url, subcat_href)
            if subsubcat_url not in self.navig_url_set:
                self.navig_url_set.add(subsubcat_url)
                yield Request(subsubcat_url, callback=self.browse_and_parse)

        next_page = hxs.select("//div[@id='productListing']//div[@id='productsListingListingTopLinks']//a[contains(., 'Neste')]/@href").extract()
        if next_page:
            # Resolve the href like the category links above; a relative
            # href cannot be requested as-is.
            yield Request(urlparse.urljoin(base_url, next_page[0]), callback=self.browse_and_parse)

        # parse product listing in this page, if any
        for tr in hxs.select('//div[@id="productListing"]//tr[@class="productListing-even" or @class="productListing-odd"]'):
            hrefs = tr.select(".//td[2]//a/@href").extract()
            names = tr.select(".//td[2]//a/text()").extract()
            price_texts = tr.select(".//td[3]/text()").extract()
            if not (hrefs and names and price_texts):
                continue
            price = to_price(price_texts[0])
            if price is None:
                continue

            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('url', urlparse.urljoin(base_url, hrefs[0]))
            product_loader.add_value('name', names[0])
            product_loader.add_value('price', price)

            yield product_loader.load_item()

        # edge case: product listing page with a single product
        product_price = hxs.select('//h2[@id="productPrices"]/text()').extract()
        if product_price:
            price = to_price(product_price[0])
            if price is not None:
                product_loader = ProductLoader(item=Product(), response=response)

                product_loader.add_xpath('name', '//h1[@id="productName"]/text()')
                product_loader.add_value('url', response.url)
                product_loader.add_value('price', price)

                yield product_loader.load_item()
Exemple #3
0
    def parse_product(self, response):
        """Yield the product when its part number equals the requested one.

        ``response.meta`` must carry ``sku`` and ``mfrgid``.  The item is
        emitted only if the 'Part No.' text on the page equals ``mfrgid``
        and a price was loaded.  Non-HTML responses produce nothing.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), response=response)

        # The second price location is consulted only when the first one
        # gave no output value.
        loader.add_xpath('price', u'//div[@id="conv-box"]//dd[@class="amount"]/text()')
        if not loader.get_output_value('price'):
            loader.add_xpath('price', u'//dl[@class="ssa-price-dl"]/dd[@class="ssa-price"]/text()')
        loader.add_value('url', response.url)
        loader.add_value('sku', response.meta['sku'])
        loader.add_value('identifier', response.meta['sku'].lower())

        titles = hxs.select(u'//div[@class="right-column-left"]/div[@class="title"]/h2/text()').extract()
        loader.add_value('name', titles[0].strip())

        part_numbers = hxs.select(u'//div[@class="title"]/h2/span/text()').re('Part No. (.*)')
        part_number = part_numbers[0]
        if part_number != response.meta['mfrgid']:
            return
        if loader.get_output_value('price'):
            yield loader.load_item()
Exemple #4
0
    def parse_product(self, response):
        """Yield one item per entry of the ``product-list`` on the page.

        When the list is empty and ``response.meta`` has no truthy ``retry``
        flag, the same URL is requested once more with empty cookies,
        ``dont_merge_cookies`` set and the ``retry`` flag raised.
        Non-HTML responses produce nothing.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        entries = hxs.select('//ul[@class="product-list"]/li')

        for entry in entries:
            loader = ProductLoader(item=Product(), selector=entry)

            # The name is the <strong> text, followed by the link's own
            # text when there is any.
            title = entry.select('.//h2/a/strong/text()').extract()[0]
            suffix = entry.select('.//h2/a/text()').extract()
            if suffix:
                title = title + ' ' + suffix[0]
            loader.add_value('name', title)

            hrefs = entry.select('.//h2/a/@href').extract()
            loader.add_value('url', urljoin_rfc(get_base_url(response), hrefs[0]))
            loader.add_xpath('price', u'.//p/strong/text()', re='\xa3(.*)')
            yield loader.load_item()

        if entries or response.meta.get('retry'):
            return

        yield Request(response.url, callback=self.parse_product, dont_filter=True,
                      cookies={}, meta={'dont_merge_cookies': True, 'retry': True})
Exemple #5
0
    def parse_product(self, response):
        """Yield the product, or one item per row of its options table.

        The first row of the ``spec-with-options`` table is always dropped
        (presumably a header row -- confirm against the site).  With no
        remaining rows the page is loaded as a single product; otherwise
        each row yields an item named '<product title> <option text>'.
        Non-HTML responses produce nothing.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        option_rows = hxs.select(u'//div[@id="spec-with-options"]//table//tr')[1:]
        title = hxs.select('//div[@id="product-title"]/text()').extract()[0]

        if option_rows:
            for row in option_rows:
                loader = ProductLoader(item=Product(), selector=row)
                label = row.select('./td[position()=2]/text()').extract()[0]
                loader.add_value('name', title + ' ' + label)
                loader.add_value('url', response.url)
                loader.add_xpath('price', './/div[@class="price-now"]/span/text()', re='\xa3(.*)')
                yield loader.load_item()
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', title)
        loader.add_value('url', response.url)
        loader.add_xpath('price', u'//div[@class="price-now"]/span[contains(@id,"product-price")]/text()',
                         re='\xa3(.*)')
        yield loader.load_item()
Exemple #6
0
    def parse_product(self, response):
        """Yield one item per priced product cell of the ``rightcol`` block.

        A cell qualifies when its own text contains a pound sign or when it
        has an ``<h1>`` child.  The URL is the cell's link when it has one,
        otherwise the page URL.  Cells without any price are skipped.
        Non-HTML responses produce nothing.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        cells = hxs.select(u'//div[@class="rightcol"]//td[contains(child::text(),"\xa3")] | //div[@class="rightcol"]//td[child::h1]')

        for cell in cells:
            loader = ProductLoader(item=Product(), selector=cell)
            loader.add_xpath('name', './a/text()')
            loader.add_xpath('name', './h1/text()')

            hrefs = cell.select('./a/@href').extract()
            if hrefs:
                link = urljoin_rfc(get_base_url(response), hrefs[0])
            else:
                link = response.url
            loader.add_value('url', link)

            # Price from the cell text first, then from the dedicated span.
            amount = (cell.select('./text()').re('\xa3(.*)')
                      or cell.select('.//span[@id="_EKM_PRODUCTPRICE"]/text()').extract())
            if amount:
                loader.add_value('price', amount)
                yield loader.load_item()
Exemple #7
0
    def parse_product(self, response):
        """Yield one item per product table that has both a SKU and a link.

        The SKU text is stripped of '.', '-' and spaces; its lower-cased
        form is used as the item name, and the header link text is used
        instead when the stripped SKU is empty.  Non-HTML responses produce
        nothing.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        for node in hxs.select('//div[@id="mtbody"]//table//table//a/img/../..'):
            loader = ProductLoader(item=Product(), selector=node)
            loader.add_xpath('price', './/span[@class="pricetext"]/text()',
                             re=r'.*\$(.*[0-9])')

            sku_texts = node.select('.//span[@class="sku"]/text()').extract()
            if not sku_texts:
                continue
            cleaned_sku = re.sub(r'[.\- ]', '', sku_texts[0])
            loader.add_value('sku', cleaned_sku)
            if not cleaned_sku:
                loader.add_xpath('name', './/span[@class="cellheader"]/a/text()')
            else:
                loader.add_value('name', cleaned_sku.lower())

            hrefs = node.select('.//span[@class="cellheader"]/a/@href').extract()
            if not hrefs:
                continue
            loader.add_value('url', urljoin_rfc(get_base_url(response), hrefs[0]))
            yield loader.load_item()
Exemple #8
0
    def parse_product(self, response):
        """Yield one item per ``product_listing`` block that has a link.

        The SKU is the text between parentheses in the ``prod_number`` span
        with dashes removed; its lower-cased form is used as the item name.
        When no SKU can be read, the SKU is left empty and the name falls
        back to the title attribute of the product link.  Listings without
        a link are skipped.  Non-HTML responses produce nothing.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # products
        for product in hxs.select('//div[@class="product_listing"]'):
            url = product.select('.//span[@class="prod_name"]/a/@href').extract()
            if not url:
                # Nothing to point the item at; skip instead of raising.
                continue

            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_xpath('price', './/span[@class="prod_our_price"]/strong/text()',
                                     re=r'.*\$(.*[0-9])')

            # Guard the lookup so that a missing SKU reaches the name
            # fallback below instead of raising IndexError.
            sku_match = product.select('.//span[@class="prod_number"]/text()').re(r'\((.*)\)')
            sku = re.sub(r'[\-]', '', sku_match[0]) if sku_match else ''
            product_loader.add_value('sku', sku)
            if sku:
                product_loader.add_value('name', sku.lower())
            else:
                product_loader.add_xpath('name', './/span[@class="prod_name"]/a/@title')

            product_loader.add_value('url', urljoin_rfc(base_url, url[0]))
            yield product_loader.load_item()
Exemple #9
0
    def parse_product(self, response):
        """Request the next listing page and yield the products of this one.

        A product is emitted only when the loader ends up with both a name
        and a price output value.  Non-HTML responses produce nothing.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # pages
        next_links = hxs.select(u'//div[@style="float:left;padding-right:8px;"]/a[child::img]/@href').extract()
        if next_links:
            yield Request(urljoin_rfc(base_url, next_links[0]), callback=self.parse_product)

        for tile in hxs.select(u'//div[contains(@class,"itemGrid")]'):
            loader = ProductLoader(item=Product(), selector=tile)

            href = tile.select(u'.//a[@class="oesLink"]/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(base_url, href))

            # Name = the link's <span> text followed by the link's own text.
            span_text = tile.select(u'.//a[@class="oesLink"]/span/text()').extract()[0]
            link_text = tile.select(u'.//a[@class="oesLink"]/text()').extract()[0]
            loader.add_value('name', span_text + ' ' + link_text)

            loader.add_xpath('price', u'.//span[@class="PlistOfferPrice"]/text()',
                             re=u'\$(.*)')
            loader.add_xpath('price', u'.//div[@class="pricing"]/span/div/span/text()',
                             re=u'\$(.*)')

            if loader.get_output_value('name') and loader.get_output_value('price'):
                yield loader.load_item()
Exemple #10
0
    def parse_product(self, response):
        """Yield the product(s) shown on a single- or multi-product page.

        A page with a table cell whose text is 'Item#' is treated as a
        multi-product page: every ``Multi-Child_Background`` row yields an
        item (sku from the first cell, name from the second, price from the
        bold text of the row's cells).  Any other page yields the single
        product of the ``v65-product-parent`` table, but only when a price
        node is present.
        """
        hxs = HtmlXPathSelector(response)

        # detect multiple product page
        item_header = hxs.select("//td[text()='Item#']")

        if item_header:
            for row in item_header.select("../../tr[@class='Multi-Child_Background']"):
                loader = ProductLoader(Product(), row)
                cells = row.select("td")
                loader.add_value('sku', cells[0].select("text()").extract())
                loader.add_value('name', cells[1].select("text()").extract())
                loader.add_value('price', cells.select("b/text()").extract())
                loader.add_value('url', response.url)

                yield loader.load_item()
            return

        product_table = hxs.select('//table[@id="v65-product-parent"]')[0]
        price_nodes = product_table.select(".//font[@class='pricecolor colors_productprice']/text()")

        # Unavailable products are still online but have no price
        if not price_nodes:
            return

        loader = ProductLoader(selector=product_table, item=Product())
        loader.add_xpath('name', './/font[@class="productnamecolorLARGE colors_productname"]/text()')
        loader.add_value('url', response.url)
        loader.add_value('price', price_nodes.extract())
        code_parts = hxs.select('.//span[@class="product_code"]/text()').extract()
        loader.add_value('sku', ''.join(code_parts).strip())

        yield loader.load_item()
Exemple #11
0
    def parse_option_price(self, response):
        """Yield the item for one product option.

        The name and URL are taken from ``response.meta``; the price is the
        text after the pound sign in the ``webPriceLabel`` block.
        """
        loader = ProductLoader(item=Product(), response=response)

        for field in ('name', 'url'):
            loader.add_value(field, response.meta[field])
        loader.add_xpath('price', u'//div[@class="webPriceLabel"]/text()',
                         re=u'\xa3(.*)')
        yield loader.load_item()
Exemple #12
0
    def parse_product(self, response):
        """Yield the product described by the ``top_product_info_block``.

        The URL is the response URL; name, sku and price are read from the
        title heading, the data list and the low-price paragraph
        respectively.
        """
        # The loader evaluates the XPaths against the response itself, so no
        # separate selector is needed here.
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@id="top_product_info_block_product_title_text"]/text()')
        loader.add_xpath('sku', '//ul[@id="top_product_info_block_product_data_list"]/li/strong/text()')
        loader.add_xpath('price', '//p[@id="top_product_info_block_product_data_new_low_price"]/text()')
        yield loader.load_item()
Exemple #13
0
    def parse_product(self, response):
        """Return the product item of an HTML page, or None otherwise.

        The name comes from the ``itemprop="name"`` heading and the price
        from the ``club`` price span (text after the dollar sign, up to the
        last digit).
        """
        if isinstance(response, HtmlResponse):
            loader = ProductLoader(item=Product(), response=response)
            loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
            loader.add_xpath('price', '//div[@class="club"]/span[@itemprop="Price"]/text()',
                             re=r'.*\$(.*[0-9])')
            loader.add_value('url', response.url)
            return loader.load_item()
Exemple #14
0
 def parse_products(self, hxs, base_url):
     """Yield one item per ``productResultInfo`` block under ``hxs``.

     Product links are resolved against ``base_url``; the price value is
     all text found under the ``variantprice`` span, joined with spaces.
     """
     for box in hxs.select('//div[@class="productResultInfo"]'):
         loader = ProductLoader(Product(), box)
         loader.add_xpath('name', './/a[@class="ProductNameText"]/text()')

         href = box.select('.//a[@class="ProductNameText"]/@href').extract()[0]
         loader.add_value('url', urljoin_rfc(base_url, href))

         price_parts = box.select('.//span[@class="variantprice"]//text()').extract()
         loader.add_value('price', ' '.join(price_parts))

         loader.add_xpath('sku', './/p[contains(@class, "productSKU")]/text()')
         yield loader.load_item()
Exemple #15
0
    def parse_product(self, response):
        """Return the product item of an HTML page, or None otherwise.

        ``response.meta['sku']`` is used both as the item name and as its
        sku.  The price is collected from the sale-price span and then from
        the item-price span of the ``yourPrice`` block.
        """
        if not isinstance(response, HtmlResponse):
            return None

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', response.meta['sku'])
        for span_class in ('salePriceContent', 'itemPriceContent'):
            loader.add_xpath(
                'price',
                '//div[@class="yourPrice"]/span[@class="' + span_class + '"]/text()',
                re=r'.*\$(.*)')
        loader.add_value('url', response.url)
        loader.add_value('sku', response.meta['sku'])
        return loader.load_item()
Exemple #16
0
    def parse_product(self, response):
        """Load name, price and URL of the product page into an item.

        Returns the loaded item for HTML responses and None for anything
        else.
        """
        if not isinstance(response, HtmlResponse):
            return None

        name_xpath = '//h1[@itemprop="name"]/text()'
        price_xpath = '//div[@class="club"]/span[@itemprop="Price"]/text()'

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', name_xpath)
        # Keep what follows the dollar sign, up to the last digit.
        loader.add_xpath('price', price_xpath, re=r'.*\$(.*[0-9])')
        loader.add_value('url', response.url)
        return loader.load_item()
    def parse_item(self, response):
        """Yield the product, or one price request per option combination.

        For every ``ProductDetails`` content block:

        * when ``Options`` generates combinations for the block's option
          select, a ``FormRequest`` to ``json_api_url`` is yielded for each
          one, handled by ``self._parse_item_json`` and carrying the item
          name, url, ids, image, brand and category in ``request.meta``;
        * otherwise the product is loaded directly from the page.

        NOTE(review): each combination is assumed to be an iterable of
        ``(form_field, (value_id, label))`` pairs, as implied by the
        indexing below -- confirm against ``Options.gen``.
        """
        hxs = HtmlXPathSelector(response)
        image_url = hxs.select(
            '//div[@class="ProductThumbImage"]/a/@href').extract()
        image_url = image_url[0] if image_url else ''
        brand = hxs.select('//h4[@class="BrandName"]/a/text()').extract()
        brand = brand[0] if brand else ''
        # Same for every item and request produced from this page.
        category = response.meta.get('category')
        url = response.url

        items = hxs.select(
            "//div[@id='ProductDetails']/div[@class='BlockContent']")
        for item in items:
            title = item.select(
                './/div[@class="ProductDetailsGrid"]//h1/text()').extract()[0]
            product_id = item.select(
                ".//input[@type='hidden' and @name='product_id']/@value"
            ).extract()[0]
            select_el = item.select(
                ".//div[@class='productOptionViewSelect']/select")
            options = list(Options(select_el).gen())

            if not options:
                l = ProductLoader(item=Product(), response=response)
                l.add_value('identifier', product_id)
                l.add_value('name', title)
                l.add_value('url', url)
                l.add_value('image_url', image_url)
                l.add_value('category', category)
                l.add_value('brand', brand)
                l.add_xpath(
                    'price',
                    '//div[contains(@class, "PriceRow")]/div/span/text()')
                yield l.load_item()
                continue

            for option in options:
                # Form payload: the shared API arguments plus the chosen
                # value of each option field and the product id.
                item_options = json_api_request_args.copy()
                item_options.update({x[0]: x[1][0] for x in option})
                item_options['product_id'] = product_id

                request = FormRequest(url=json_api_url,
                                      formdata=item_options,
                                      callback=self._parse_item_json)
                request.meta['item_name'] = title + " " + " ".join(
                    [x[1][1] for x in option])
                request.meta['item_url'] = url
                request.meta['subtype_id'] = "-".join(
                    [x[1][0] for x in option])
                request.meta['product_id'] = product_id
                request.meta['image_url'] = image_url
                request.meta['brand'] = brand
                request.meta['category'] = category
                yield request
Exemple #18
0
    def parse_product(self, response):
        """Yield the product, or one item per configurable option.

        The base item is loaded from the page (the special price is
        preferred over the regular one, and 0 is used when neither is
        found); its sku comes from ``response.meta['row']['SKU']``.  When
        the page body embeds a ``spConfig`` JSON configuration, one copy of
        the base item is yielded per configured product id, with the id
        appended to the identifier, the option labels appended to the name
        and the option price deltas added to the price.  Otherwise the base
        item itself is yielded.
        """
        hxs = HtmlXPathSelector(response)

        row = response.meta['row']

        name = hxs.select('//span[@itemprop="name"]/text()').extract()[0].strip()
        price = hxs.select('//p[@class="special-price"]/span[@class="price"]/text()').extract()
        if not price:
            price = hxs.select('//span[@class="regular-price"]/span[@class="price"]/text()').extract()
        price = price[0] if price else 0

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('url', response.url)
        loader.add_value('sku', row['SKU'])
        loader.add_value('price', price)
        identifier = hxs.select('//input[@name="productId"]/@value').extract()
        if not identifier:
            identifier = hxs.select('//input[@name="product"]/@value').extract()

        loader.add_value('identifier', identifier)
        loader.add_xpath('brand', '//tr[th/text()="Brand"]/td/text()')
        loader.add_xpath('image_url', '//a[@id="shoe-spin"]/img/@src')
        categories = hxs.select('//li[@typeof="v:Breadcrumb"]/a/text()').extract()
        loader.add_value('category', categories)
        in_stock = hxs.select('//div[@class="offer"]//p[@class="availability in-stock"]')
        if not in_stock:
            loader.add_value('stock', 0)
        item = loader.load_item()

        options_config = re.search(r'var spConfig = new Product.Config\((.*)\)', response.body)
        if not options_config:
            yield item
            return

        product_data = json.loads(options_config.group(1))
        # Per configured product id: the ' - '-prefixed chain of its option
        # labels and the sum of its option price deltas.
        products = {}
        prices = {}
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    products[product] = ' - '.join((products.get(product, ''), option['label']))
                    prices[product] = prices.get(product, 0) + extract_price(option['price'])

        for option_id, option_name in products.iteritems():
            option_item = deepcopy(item)
            option_item['identifier'] = option_item['identifier'] + '-' + option_id
            # Drop the trailing ' (...)' part of the label chain; a chain
            # without one is used whole instead of raising IndexError.
            label = re.findall(r'(.*) \(', option_name)
            option_item['name'] = option_item['name'] + (label[0] if label else option_name)
            option_item['price'] = option_item['price'] + prices[option_id]
            if 'IN STOCK' not in option_name.upper():
                option_item['stock'] = 0
            yield option_item
    def parse_product(self, response):
        """Yield the product item, or a follow-up/retry request for it.

        * If the page links to a special-price page, that page is requested
          with this same callback and nothing else is produced.
        * If name or price cannot be found, the URL is requested again
          (bypassing the duplicate filter) while the per-URL counter kept
          in ``self.retry_urls`` is at most 100; beyond that it gives up.
        * Otherwise the item is loaded and yielded.

        ``sku``, ``mfrgid`` and ``search_q`` are copied from
        ``response.meta`` onto every request issued here.  Non-HTML
        responses produce nothing.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        forwarded_keys = ('sku', 'mfrgid', 'search_q')

        special_links = hxs.select(
            "//td[@class='tdcf10bk']/a/@href").extract()
        if special_links:
            target = urljoin_rfc(get_base_url(response), special_links[0])
            follow_up = Request(url=target, callback=self.parse_product)
            for key in forwarded_keys:
                follow_up.meta[key] = response.meta[key]
            yield follow_up
            return

        product_loader = ProductLoader(item=Product(), response=response)

        names = hxs.select("//h1/font/b/text()").extract()
        prices = hxs.select("//font[@color='#990000']/b/text()").extract()

        if names and prices:
            product_loader.add_value('name', names[0])
            product_loader.add_value('price', prices[0])
            product_loader.add_value('url', response.url)
            product_loader.add_value('sku', response.meta['sku'].lower())
            product_loader.add_xpath('identifier',
                                     '//form/input[@name="PID"]/@value')
            yield product_loader.load_item()
            return

        # Page did not parse: retry, up to the limit.
        attempts = self.retry_urls.get(response.url, 0) + 1
        if attempts > 100:
            self.log("ERROR MAX retry count reached (100), giving up...")
            return

        self.log(
            "ERROR parsing HTML, adding to retry queue (#{})".format(attempts))
        self.retry_urls[response.url] = attempts
        retry = Request(url=response.url,
                        callback=self.parse_product,
                        dont_filter=True)
        for key in forwarded_keys:
            retry.meta[key] = response.meta[key]
        yield retry
Exemple #20
0
    def parse_product(self, response):
        """Yield the compound product's items, or else the single product.

        Everything produced by ``self._parse_compound_product`` is collected
        first; when it produced anything, only those items are yielded.
        Otherwise the page is loaded as one product.
        """
        compound_items = list(self._parse_compound_product(response))
        for entry in compound_items:
            yield entry
        if compound_items:
            return

        single = ProductLoader(response=response, item=Product())
        single.add_xpath('name', '//h1[@itemprop="Name"]//text()')
        single.add_xpath('price', '//input[@name="price"]/@value')
        single.add_value('url', response.url)
        single.add_xpath('sku', '//span[@itemprop="model"]/text()')
        yield single.load_item()
Exemple #21
0
    def browse_and_parse(self, response):
        """Follow category and pagination links and yield this page's products.

        Emitted, in this order:

        * a ``Request`` for every link of the ``navColumnOne`` block that is
          not yet recorded in ``self.navig_url_set``;
        * a ``Request`` for the 'Neste' pagination link, when present;
        * one item per row of the product listing table, plus one item for
          the page itself when it has a ``productPrices`` heading.

        Every href is resolved against the page base URL.  Rows whose link,
        name or price cannot be read are skipped, so a single malformed row
        does not abort the rest of the page.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        def to_price(text):
            # Keep the text before the first '-', take its second
            # space-separated token and rewrite e.g. '1.234,50' as '1234.50'.
            # Returns None when there is no second token.
            tokens = text.split("-")[0].split(" ")
            if len(tokens) < 2:
                return None
            return tokens[1].replace('.', '').replace(',', '.')

        for subcat_href in hxs.select(
                '//div[@id="navColumnOne"]//a/@href').extract():
            subsubcat_url = urlparse.urljoin(base_url, subcat_href)
            if subsubcat_url not in self.navig_url_set:
                self.navig_url_set.add(subsubcat_url)
                yield Request(subsubcat_url, callback=self.browse_and_parse)

        next_page = hxs.select(
            "//div[@id='productListing']//div[@id='productsListingListingTopLinks']//a[contains(., 'Neste')]/@href"
        ).extract()
        if next_page:
            # Resolve the href like the category links above; a relative
            # href cannot be requested as-is.
            yield Request(urlparse.urljoin(base_url, next_page[0]),
                          callback=self.browse_and_parse)

        # parse product listing in this page, if any
        for tr in hxs.select(
                '//div[@id="productListing"]//tr[@class="productListing-even" or @class="productListing-odd"]'
        ):
            hrefs = tr.select(".//td[2]//a/@href").extract()
            names = tr.select(".//td[2]//a/text()").extract()
            price_texts = tr.select(".//td[3]/text()").extract()
            if not (hrefs and names and price_texts):
                continue
            price = to_price(price_texts[0])
            if price is None:
                continue

            product_loader = ProductLoader(item=Product(), response=response)
            product_loader.add_value('url',
                                     urlparse.urljoin(base_url, hrefs[0]))
            product_loader.add_value('name', names[0])
            product_loader.add_value('price', price)

            yield product_loader.load_item()

        # edge case: product listing page with a single product
        product_price = hxs.select(
            '//h2[@id="productPrices"]/text()').extract()
        if product_price:
            price = to_price(product_price[0])
            if price is not None:
                product_loader = ProductLoader(item=Product(),
                                               response=response)

                product_loader.add_xpath('name',
                                         '//h1[@id="productName"]/text()')
                product_loader.add_value('url', response.url)
                product_loader.add_value('price', price)

                yield product_loader.load_item()
Exemple #22
0
    def parse_product(self, response):
        """Yield the product, or one item per row of its price table.

        When the ``grpChld`` table has ``r1`` rows, each row yields an item:
        the name is taken from the row's ``c1`` cell (text after the first
        '-') and the price from the ``c3`` cell of the node that directly
        follows the row.  Without such rows the page is loaded as a single
        product.  Non-HTML responses produce nothing.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        variant_rows = hxs.select(
            u'//table[@class="grpChld"]//tr[@class="r1"]')

        if variant_rows:
            for row in variant_rows:
                loader = ProductLoader(item=Product(), selector=row)
                loader.add_xpath('name',
                                 u'./td[@class="c1"]/text()',
                                 re=u'.*?-[\xa0]*(.*)')
                loader.add_value('url', response.url)
                loader.add_xpath(
                    'price',
                    u'./following-sibling::node()[1]/td[@class="c3"]/span/text()',
                    re=u'\xa3(.*)')
                yield loader.load_item()
            return

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', u'//div[@class="det"]/h1/text()')
        loader.add_value('url', response.url)
        loader.add_xpath(
            'price',
            u'//div[@class="addBsk"]/div[@class="pri"]/b/text()',
            re=u'\xa3(.*)')
        yield loader.load_item()
Exemple #23
0
    def load_products(self, response):
        """Request the next listing page and yield the products of this one.

        For each entry of the ``products-list`` the special price is used
        when the entry has a ``special-price`` paragraph, and the regular
        price otherwise.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        next_link = hxs.select('//div[@class="pager"]//a[@class="next i-next"]')
        if next_link:
            href = next_link.select("./@href").extract()[0]
            yield Request(urlparse.urljoin(base_url, href),
                          callback=self.load_products)

        special_price_xpath = './/div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text()'
        regular_price_xpath = './/div[@class="price-box"]//span[@class="regular-price"]/span[@class="price"]/text()'

        for box in hxs.select('//ol[@id="products-list"]/li'):
            loader = ProductLoader(item=Product(), selector=box)

            loader.add_xpath('name', './/h2[@class="product-name"]/a/text()')
            loader.add_xpath('url', './/h2[@class="product-name"]/a/@href')

            on_offer = box.select('.//p[@class="special-price"]')
            loader.add_xpath(
                'price',
                special_price_xpath if on_offer else regular_price_xpath)

            yield loader.load_item()
Exemple #24
0
    def parse_product(self, response):
        """Yield the product of the page, or record an error when it has no price.

        The value of the ``detail`` input is used both as sku and as
        identifier.  Brand is always 'LEGO' and shipping cost always 69;
        stock is set to 0 when no 'skladem.png' image is found in the detail
        table.  A missing image or a missing price is recorded in
        ``self.errors`` rather than raised.

        NOTE(review): the ``.pop()`` lookups for name, category, sku and
        price presumably raise IndexError when the node is absent -- confirm
        that this is the intended failure mode.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name = hxs.select(
            '//td[@id="workspace"]/h1/a/text()').pop().extract().strip()

        category = hxs.select(
            '//div[@class="odkazy_cesta"]/a/text()').pop().extract().strip()

        sku = hxs.select('//input[@name="detail"]/@value').extract().pop()

        price = self.parse_price(
            hxs.select(
                '//table[@id="detail_tabulka2"]/tr/th[contains(text(), "Cena s DPH")]/following-sibling::td/descendant-or-self::text()'
            ).pop().extract())

        stock = hxs.select(
            '//table[@id="detail_tabulka2"]/tr/td//img[contains(@src, "skladem.png")]'
        )

        page_url = urljoin(base_url, response.url)

        if not price:
            self.errors.append("No price set for url: '%s'" % page_url)
            return

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', page_url)
        loader.add_value('name', name)
        try:
            # The processor indexes the first match, so a page without an
            # image surfaces here as IndexError.
            loader.add_xpath('image_url',
                             '//td[@id="detail_foto"]/div/a/img/@src',
                             Compose(lambda v: urljoin(base_url, v[0])))
        except IndexError:
            self.errors.append("No image set for url: '%s'" % page_url)
        loader.add_value('price', price)
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        loader.add_value('brand', 'LEGO')
        loader.add_value('shipping_cost', 69)
        if not stock:
            loader.add_value('stock', 0)
        yield self.load_item_with_metadata(loader.load_item())
Exemple #25
0
    def parse_product(self, response):
        """Yield the product when it matches the requested SKU and maker id.

        Two page layouts are handled: a result list (only the first
        ``summaryboxsearch`` entry is considered) and, when no such entry
        exists, a single product page. In both cases the item is yielded
        only if ``response.meta['mfrgid']`` occurs in the manufacturer id
        shown on the page and at least one space-separated token of the SKU
        occurs in the product name.

        Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        def matches_request(loader, site_mfrgid):
            """Return True when the loaded product matches the request meta."""
            # A missing name must not crash on .lower(); it simply never matches.
            name = (loader.get_output_value('name') or '').lower()
            sku_words = (loader.get_output_value('sku') or '').lower().split(' ')
            # Built as a list (not filter()) so the emptiness test below also
            # works on Python 3, where a filter object is always truthy.
            words_in_name = [w for w in sku_words if w != '' and w in name]
            if not site_mfrgid:
                return False
            shown_mfrgid = site_mfrgid[0].strip().lower()
            wanted_mfrgid = response.meta['mfrgid'].lower()
            return wanted_mfrgid in shown_mfrgid and bool(words_in_name)

        products = hxs.select('//div[@class="summaryboxsearch"]')
        if products:
            product = products[0]  # extract only the first product
            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_xpath(
                'price', u'.//span[@class="floatl sli_price"]/text()')
            product_loader.add_xpath('url',
                                     u'.//p[@class="mtext nobreak"]/a/@title')
            product_loader.add_value('sku', response.meta['sku'])
            product_loader.add_value('identifier',
                                     response.meta['sku'].lower())
            product_loader.add_xpath('name',
                                     u'.//p[@class="mtext nobreak"]/a/text()')
            site_mfrgid = product.select(
                './/span[@class="floatl sli_grid_code"]/text()').extract()
            if matches_request(product_loader, site_mfrgid):
                yield product_loader.load_item()
            return

        # No result list on the page: parse it as a single product page.
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath('price',
                                 u'//p[@class="strong"]/span/text()')
        product_loader.add_value('url', response.url)
        product_loader.add_value('sku', response.meta['sku'])
        product_loader.add_value('identifier',
                                 response.meta['sku'].lower())
        product_loader.add_xpath(
            'name', u'//div[@class="indentl orderbox"]//h1/text()')
        site_mfrgid = hxs.select(
            '//div[@class="indentl orderbox"]/div[@class="floatl"]/p/strong/text()'
        ).extract()
        if matches_request(product_loader, site_mfrgid):
            yield product_loader.load_item()
Exemple #26
0
    def parse_product(self, response):
        """Yield one item per purchasable variant on a product page.

        When the page has a ``super-product-table``, every table row becomes
        an item named "<product name> <first cell text>". Otherwise a single
        item is built from the page-level price box. Non-HTML responses are
        ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        page = HtmlXPathSelector(response)
        base_name = page.select(u'//div[@class="product-name fn"]/h1/text()').extract()[0]
        option_rows = page.select(u'//table[@id="super-product-table"]//tr')

        if option_rows:
            for row in option_rows:
                row_loader = ProductLoader(item=Product(), selector=row)
                option_label = row.select(u'./td[position()=1]/text()').extract()[0]
                row_loader.add_value('name', base_name + ' ' + option_label)
                row_loader.add_value('url', response.url)
                # Regular price first, then the special price if present.
                row_loader.add_xpath('price', u'./td[position()=2]/div[@class="price-box"]/span[@class="regular-price"]/span[@class="price"]/text()',
                                     re=u'\xa3(.*)')
                row_loader.add_xpath('price', u'./td[position()=2]/div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text()',
                                     re=u'\xa3(.*)')
                yield row_loader.load_item()
            return

        # Simple product: a single price box for the whole page.
        single_loader = ProductLoader(item=Product(), response=response)
        single_loader.add_value('name', base_name)
        single_loader.add_value('url', response.url)
        single_loader.add_xpath('price', u'//div[@class="price-box"]/span[contains(@id,"product-price")]/span[@class="price"]/text()',
                                re='\xa3(.*[0-9])')
        single_loader.add_xpath('price', u'//div[@class="price-box"]/p[@class="special-price"]/span[@class="price"]/text()',
                                re='\xa3(.*[0-9])')
        yield single_loader.load_item()
Exemple #27
0
    def parse_product(self, response):
        """Build one LEGO product item from a product detail page.

        The SKU text (with a leading "lego" removed) doubles as the
        identifier. If ``self.parse_price`` returns a falsy value, no item
        is yielded and a message is appended to ``self.errors``.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name_texts = hxs.select(
            '//div[@id="product-detail"]/h2/text()').extract()
        name = name_texts.pop().strip()

        category_nodes = hxs.select(
            '//div[@id="product-detail"]/p[@id="zarazeni"]/a/text()')
        category = category_nodes.pop().extract().strip()

        sku_nodes = hxs.select(
            '//div[@class="content"]/p/strong[contains(text(), "d produktu")]/following-sibling::text()'
        )
        sku = sku_nodes[0].extract().strip()
        if sku.startswith('lego'):
            sku = sku[4:]

        price_nodes = hxs.select(
            '//div[@class="content"]/p/strong[@class="price"]/big/text()')
        price = self.parse_price(price_nodes.pop().extract())

        in_stock_marker = hxs.select(
            '//div[@class="content"]/p/strong[@class="price"][contains(text(), "Dostupnost: Skladem")]'
        )

        if not price:
            self.errors.append("No price set for url: '%s'" %
                               urljoin(base_url, response.url))
            return

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', urljoin(base_url, response.url))
        loader.add_value('name', name)
        loader.add_xpath('image_url',
                         '//table[@id="pictures"]/tr/td[1]/a/img/@src',
                         Compose(lambda v: urljoin(base_url, v[0])))
        loader.add_value('price', price)
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        loader.add_value('brand', 'LEGO')
        # Shipping cost is only recorded for prices below 2500.
        if int(price) < 2500:
            loader.add_value('shipping_cost', 89)
        if not in_stock_marker:
            loader.add_value('stock', 0)
        yield self.load_item_with_metadata(loader.load_item())
Exemple #28
0
    def parse_product(self, response):
        """Yield one item per row of the ``list`` table, skipping the first row.

        Each item carries the absolute product URL, the link text as name and
        the numeric part of the ``prixPromo`` paragraph as price. Non-HTML
        responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)

        # The first <tr> is dropped by the slice.
        for row in hxs.select(u'//table[@class="list"]//tr')[1:]:
            loader = ProductLoader(item=Product(), selector=row)
            hrefs = row.select(u'.//h3/a/@href').extract()
            absolute_url = urljoin_rfc(get_base_url(response), hrefs[0])
            loader.add_value('url', absolute_url)
            loader.add_xpath('name', u'.//h3/a/text()')
            loader.add_xpath('price', u'.//p[@class="prixPromo"]/text()',
                             re=u'([\d\.]+)')
            yield loader.load_item()
Exemple #29
0
    def parse_category(self, response):
        """Follow pagination links and yield one item per listed product.

        For every ``li`` whose class contains "item", the name, URL, price,
        numeric product id, image and page-level category title are loaded.
        The price text has "." removed and "," replaced by ".".

        If an entry lacks a name, URL or price, the problem is logged and
        parsing of the page stops at that entry.
        """
        hxs = HtmlXPathSelector(response)

        for page in hxs.select("//div[@class='pages']/ol/li/a/@href").extract():
            yield Request(page, callback=self.parse_category)

        for item in hxs.select('//li[contains(@class, "item")]/div'):
            name = item.select("h2[@class='product-name']/a/text()").extract()
            if not name:
                logging.error("NO NAME! %s" % response.url)
                # NOTE(review): this stops the whole page, not just this
                # entry - confirm that `continue` is not what was intended.
                return

            url = item.select("h2[@class='product-name']/a/@href").extract()
            if not url:
                logging.error("NO URL! %s" % response.url)
                return

            # The special price is preferred when both are present in
            # document order, because only the first match is used.
            price = item.select(
                "div[@class='price-box']/p[@class='special-price']/span[@class='price']/text() | "
                "div[@class='price-box']/span[@class='regular-price']/span[@class='price']/text()"
            ).extract()
            if not price:
                logging.error("NO PRICE! %s" % response.url)
                return

            identifier = item.select(
                './/*[contains(@id, "product-price-")]/@id').re(
                    r'product-price-(\d+)')

            # The image path is relative to the item node. The loader is
            # bound to the whole response, so a relative add_xpath would be
            # evaluated from the document root and never match; select on
            # the item node instead.
            image_url = item.select(
                'a[@class="product-image"]/img/@src').extract()

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', identifier)
            loader.add_value('name', name[0])
            loader.add_value('url', url[0])
            if image_url:
                loader.add_value('image_url', image_url)
            loader.add_xpath(
                'category',
                '//div[contains(@class, "category-title")]/h1/text()')
            loader.add_value(
                'price', price[0].replace(".", "").replace(",", "."))
            yield loader.load_item()
Exemple #30
0
    def parse_product(self, response):
        """Build a product item from a detail page.

        The form action of ``form1`` is used as identifier; when it is
        absent the result of ``self.retry_request(response)`` is yielded
        instead of an item. The item is yielded only if
        ``self.exclude_word`` does not occur in the loaded name.
        """
        loader = ProductLoader(item=Product(), selector=response)
        loader.add_value('url', response.url)

        image_sources = response.xpath('//img[@itemprop="image"]/@src').extract()
        if image_sources:
            loader.add_value('image_url', response.urljoin(image_sources[0]))

        form_actions = response.xpath('//form[@name="form1"]/@action').extract()
        if not form_actions:
            yield self.retry_request(response)
            return
        loader.add_value('identifier', form_actions[0])

        price_nodes = response.xpath(
            '//*[@class="price"]/*[@class="mainPrice"]/text()')
        loader.add_value('price', price_nodes[0].extract())

        stock_numbers = response.xpath(
            '//div[@class="stockLevel"]//text()').re(r'(\d+)')
        if stock_numbers:
            loader.add_value('stock', stock_numbers[0])

        # Prefer the itemprop brand; fall back to the hidden producerName input.
        brand_values = (
            response.xpath('//*[@itemprop="brand"]/@content').extract()
            or response.xpath(
                '//div[@class="hidden"]/input[@class="producerName"]/@value'
            ).extract()
        )
        if brand_values:
            loader.add_value('brand', brand_values[0].strip())

        if 'category' not in response.meta:
            # No category passed by the caller: reuse the brand.
            loader.add_value('category', loader.get_output_value('brand'))
        elif response.meta['category'] != 'Car tyres':
            loader.add_value('category', response.meta['category'])
        else:
            # For 'Car tyres' the category is read from the "Type:" entry.
            type_texts = response.xpath(
                '//dt[contains(text(), "Type:")]/following-sibling::dd/text()'
            ).extract()
            if type_texts:
                loader.add_value('category', type_texts[0].strip())

        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_xpath('sku', '//*[@itemprop="sku"]/@content')

        if self.exclude_word not in loader.get_output_value('name'):
            yield loader.load_item()
Exemple #31
0
    def parse_product(self, response):
        """Yield one item (url, name, price) per entry of ``product-listing-2``.

        Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        entries = hxs.select(u'//div[@class="product-listing-2"]/div[contains(@class,"rec")]')

        for entry in entries:
            loader = ProductLoader(item=Product(), selector=entry)
            href = entry.select(u'.//div[@class="description"]/h2/a/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), href))
            loader.add_xpath('name', u'.//div[@class="description"]/h2/a/text()')
            # Keep only what follows the dollar sign.
            loader.add_xpath('price', u'.//span[@class="prod-price"]/text()',
                             re=u'\$(.*)')
            yield loader.load_item()
Exemple #32
0
    def parse_product(self, response):
        """Yield one item per centred table row that links with target ``_top``.

        Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        rows = hxs.select(u'//tr[@align="center" and child::td[child::a[@target="_top"]]]')

        for row in rows:
            loader = ProductLoader(item=Product(), selector=row)
            href = row.select(u'.//a[@target="_top" and child::span]/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), href))
            loader.add_xpath('name', u'.//a[@target="_top"]/span/text()')
            # Price is the digits/dots/commas following the pound sign.
            loader.add_xpath('price', u'.//span[contains(@class,"price")]/text()',
                             re=u'\xa3([\d\.,]+)')
            yield loader.load_item()
Exemple #33
0
    def parse_product(self, response):
        """Yield one item per ``shopprods`` block, with 5 added to the price."""
        hxs = HtmlXPathSelector(response)

        for block in hxs.select('//div[@class="shopprods"]'):
            loader = ProductLoader(item=Product(), selector=block)
            loader.add_xpath('name', './/p/strong/a/text()')

            href = block.select('.//p/strong/a/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), href))

            listed_price = block.select('.//span[@class="price"]/text()').extract()[0]
            # The stored price is the listed price plus a fixed 5.
            loader.add_value('price', str(Decimal(listed_price) + Decimal(5)))
            yield loader.load_item()
Exemple #34
0
    def parse_product(self, response):
        """Yield one item (url, name, price) per ``li`` of the compare form list.

        Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        list_items = hxs.select(u'//form[@name="frmCompare"]//ul[@class="ProductList "]//li')

        for list_item in list_items:
            loader = ProductLoader(item=Product(), selector=list_item)
            href = list_item.select(u'.//div[@class="ProductDetails"]/strong/a/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), href))
            loader.add_xpath('name', u'.//div[@class="ProductDetails"]/strong/a/text()')
            # Keep only what follows the dollar sign.
            loader.add_xpath('price', u'.//div[@class="ProductPriceRating"]/em/text()',
                             re=u'\$(.*)')
            yield loader.load_item()
Exemple #35
0
 def parse_products(self, response):
     """Follow pager links and yield one LEGO item per listed product.

     Requests for all non-selected pager links are yielded first, then one
     item per ``product`` div, each passed through
     ``self.load_item_with_metadata``.
     """
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)

     # Pagination: every pager entry that is not the selected one.
     pager_links = hxs.select(
         '//ul[@class="pager"]/li[@class!="selected"]/a/@href').extract()
     for link in pager_links:
         yield Request(urljoin_rfc(base_url, link),
                       callback=self.parse_products)

     # Products on this page share the listing heading as category.
     category = hxs.select('//*[@id="listing_h1"]/h1/text()').extract()
     for product in hxs.select(
             '//*[@id="listing_products2"]/div[@class="product"]'):
         loader = ProductLoader(item=Product(), selector=product)

         title = product.select(
             './/div[@class="product_title"]/h2/a/text()').extract()[0].strip()
         title_href = product.select(
             './/div[@class="product_title"]/h2/a/@href')
         loader.add_value('url', urljoin_rfc(base_url, title_href.extract()[0]))
         loader.add_value('name', title)
         loader.add_xpath('image_url',
                          './/div[@class="product_image"]/a/img/@src',
                          Compose(lambda v: urljoin(base_url, v[0])))

         # Only the part before the first non-breaking space is parsed.
         price_text = product.select(
             './/span[@class="price"]/text()').extract()[0]
         loader.add_value('price',
                          extract_price_eu(price_text.split(u'\xa0')[0]))

         sku_texts = product.select(
             './/table/tr[1]/td[2]/strong/text()').extract()
         if sku_texts:
             loader.add_value('sku', sku_texts[0])

         # Identifier is the trailing "-p<digits>" number of the product link.
         loader.add_value('identifier', title_href.re(r"-p([\d]+)$")[0])
         loader.add_value('brand', 'LEGO')

         stock_texts = product.select(
             './/table//span[@class="skladom"]/text()').extract()
         stock_match = None
         if stock_texts:
             stock_match = re.search(r"\b([\d]+)\b", stock_texts[0])
         if stock_match:
             loader.add_value('stock', stock_match.group(1))

         if category:
             loader.add_value('category', category[0])
         yield self.load_item_with_metadata(loader.load_item())
Exemple #36
0
    def parse_product(self, response):
        """Yield the product, or one item/request per option of ``buysSelect``.

        Without options the base item is yielded as is. For every option a
        copy of the base item gets the option label appended to its name and
        the option value appended to its identifier. Options carrying an
        ``onclick`` attribute are resolved through a form request to
        ``self.parse_options``; the others are yielded directly.

        Nothing is yielded when the page has no product code.
        """
        base_url = get_base_url(response)

        image_sources = response.xpath('//img[@id="mainImg"]/@src').extract()

        product_loader = ProductLoader(item=Product(), response=response)
        codes = response.xpath('//div[@class="productCode"]/span/text()').extract()
        if not codes:
            return

        product_code = codes[0]
        product_loader.add_value('identifier', product_code)
        product_loader.add_xpath('name', '//h1/text()')
        if image_sources:
            product_loader.add_value('image_url', response.urljoin(image_sources[0]))
        product_loader.add_value('sku', product_code)
        price_texts = response.xpath('//div[@class="prodRightWrapper"]//div[@class="price"]/text()').extract()
        product_loader.add_value('price', price_texts[0].strip())
        product_loader.add_value('url', response.url)
        # First and last breadcrumb entries are dropped by the slice.
        breadcrumb = response.xpath('//div[@id="breadCrumbWrapper"]//div[@itemprop="title"]/text()').extract()
        product_loader.add_value('category', breadcrumb[1:-1])
        product_loader.add_value('brand', '')
        item = product_loader.load_item()

        options_url = "http://www.careco.co.uk/ajaxTwoDimSelect/"

        options = response.xpath('//select[@class="buysSelect"]/option[@value!=""]')
        if not options:
            yield item
            return

        for option in options:
            option_item = deepcopy(item)

            # The option text is "<label> £<price>"; keep the label part.
            label = option.xpath('text()').extract()[0].split(u'\xa3')[0].strip()
            option_item['name'] += ' ' + label

            option_value = option.xpath('@value').extract()[0]
            option_item['identifier'] += '-' + option_value

            option_price = option.xpath('text()').re(u'\xa3\d+\.\d+')
            if option_price:
                option_item['price'] = extract_price(option_price[0])

            if not option.xpath('@onclick'):
                yield option_item
                continue

            formdata = {
                'FS': item['identifier'],
                'CODE': option.xpath('@value').extract()[0],
            }
            yield FormRequest(options_url, dont_filter=True, formdata=formdata,
                              callback=self.parse_options,
                              meta={'item': option_item})
Exemple #37
0
    def parse_product(self, response):
        """Yield one item with price, name, url and SKU from a product page.

        The SKU is whatever follows the closing tag of the label whose text
        starts with "Manufacturers".
        """
        hxs = HtmlXPathSelector(response)

        price_nodes = hxs.select('//span[@class="price"]/text()')
        loader = ProductLoader(response=response, item=Product())

        if price_nodes:
            # The last price node on the page is used.
            # NOTE(review): the selector itself (not its extracted text) is
            # passed to the loader - confirm the price processor accepts it.
            loader.add_value('price', price_nodes[-1])

        loader.add_xpath('name', '//div[@class="product_l"]/h2/text()')
        loader.add_value('url', response.url)

        label_html = hxs.select("//label[starts-with(text(), 'Manufacturers')]").extract()[0]
        closing_tag = '/label>'
        # Slice off everything up to and including the closing tag text.
        after_label = label_html[label_html.find(closing_tag) + len(closing_tag):]
        loader.add_value('sku', after_label.strip())

        yield loader.load_item()
Exemple #38
0
    def parse_product(self, response):
        """Yield one item per ``shopprods`` block, with 5 added to the price."""
        hxs = HtmlXPathSelector(response)

        for block in hxs.select('//div[@class="shopprods"]'):
            loader = ProductLoader(item=Product(), selector=block)
            loader.add_xpath('name', './/p/strong/a/text()')

            link = block.select('.//p/strong/a/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), link))

            shown_price = block.select(
                './/span[@class="price"]/text()').extract()[0]
            # A fixed 5 is added on top of the price shown on the page.
            total = Decimal(shown_price) + Decimal(5)
            loader.add_value('price', str(total))
            yield loader.load_item()
Exemple #39
0
    def parse_product(self, response):
        """Return an item named after the requested SKU with the page price.

        Both the sale price and the item price spans are fed into the
        ``price`` field, sale price first. Returns ``None`` for non-HTML
        responses.
        """
        if not isinstance(response, HtmlResponse):
            return

        requested_sku = response.meta['sku']
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', requested_sku)

        price_xpaths = (
            '//div[@class="yourPrice"]/span[@class="salePriceContent"]/text()',
            '//div[@class="yourPrice"]/span[@class="itemPriceContent"]/text()',
        )
        for price_xpath in price_xpaths:
            # Keep whatever follows the last dollar sign in the text.
            loader.add_xpath('price', price_xpath, re='.*\$(.*)')

        loader.add_value('url', response.url)
        loader.add_value('sku', response.meta['sku'])
        return loader.load_item()
Exemple #40
0
    def parse_product(self, response):
        """Yield one item (url, name, price) per ``listitem`` div.

        Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        for entry in hxs.select(u'//div[@class="listitem"]'):
            loader = ProductLoader(item=Product(), selector=entry)

            href = entry.select(u'.//div[@class="heading"]/a[child::span[@class="ProductListHead"]]/@href').extract()[0]
            loader.add_value('url', urljoin_rfc(get_base_url(response), href))

            heading = entry.select(u'.//div[@class="heading"]/a/span[@class="ProductListHead"]/text()').extract()[0]
            loader.add_value('name', heading.strip())

            # Keep only what follows the pound sign.
            loader.add_xpath('price', u'.//span[@class="price"]/span[@class="ProductListItem"]/text()',
                             re=u'\xa3(.*)')
            yield loader.load_item()
Exemple #41
0
    def parse_product(self, response):
        """Build one LEGO product item from a product detail page.

        The SKU comes from ``self.get_sku_from_text`` applied to the name and
        the identifier from the ``product`` input value. If
        ``self.parse_price`` returns a falsy value, no item is yielded and a
        message is appended to ``self.errors``.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name_texts = hxs.select(
            '//div[@class="product-name"]/h1/text()').extract()
        name = name_texts.pop().strip()

        # Last breadcrumb link; a value starting with "Domů" is blanked.
        breadcrumb_links = hxs.select(
            '//div[@class="breadcrumbs"]/ul/li/a/text()')
        category = breadcrumb_links.pop().extract().strip()
        if category.startswith(u'Dom\u016f'):
            category = ""

        sku = self.get_sku_from_text(name) or ""

        pid = hxs.select('//input[@name="product"]/@value').pop().extract()

        price_fragments = hxs.select(
            '//span[contains(@id, "product-price")]/descendant-or-self::text()'
        ).extract()
        price = self.parse_price("".join(price_fragments))

        if not price:
            self.errors.append("No price set for url: '%s'" %
                               urljoin(base_url, response.url))
            return

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', urljoin(base_url, response.url))
        loader.add_value('name', name)
        loader.add_xpath('image_url',
                         '//div[@class="product-img-box"]/p/img/@src',
                         Compose(lambda v: urljoin(base_url, v[0])))
        loader.add_value('price', price)
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('identifier', pid)
        loader.add_value('brand', 'LEGO')
        loader.add_value('shipping_cost', 59)
        yield self.load_item_with_metadata(loader.load_item())
Exemple #42
0
    def parse_product(self, response):
        """Yield the product only if its "Model#" equals the requested maker id.

        The second text node of the ``form-field`` div containing the
        "Model#" label is compared (after stripping) with
        ``response.meta['mfrgid']``. Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        requested_sku = response.meta['sku']

        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('price', u'//span[@class="price"]/span[@class="price" and contains(@id, "sec_discounted_price")]/text()')
        loader.add_value('url', response.url)
        loader.add_value('sku', requested_sku)
        loader.add_value('identifier', requested_sku.lower())
        loader.add_xpath('name', u'//div[@class="product-info"]/h1[@class="mainbox-title"]/text()')

        model_texts = hxs.select(u'//div[@class="form-field" and child::label[contains(text(),"Model#")]]/text()').extract()
        if len(model_texts) <= 1:
            # Fewer than two text nodes: nothing to compare against.
            return

        if model_texts[1].strip() == response.meta['mfrgid']:
            yield loader.load_item()
Exemple #43
0
 def parse_products(self, hxs):
     """Yield one item per element whose id starts with ``product_``.

     ``hxs`` is the selector to search; url, name, price and SKU are read
     relative to each matched element.
     """
     for node in hxs.select('//div[starts-with(@id, "product_")]'):
         loader = ProductLoader(Product(), node)
         loader.add_xpath('url', './/span[@class="description"]/a/@href')
         loader.add_xpath('name', './/span[@class="description"]/a/b/text()')
         loader.add_xpath('price', './/div[@class="our_price"]/text()')
         # The SKU is captured from the "Model #:" text of the description.
         loader.add_xpath('sku', './/span[@class="description"]', re='Model #:[\s(]*([\S^)]*)')
         yield loader.load_item()
Exemple #44
0
 def parse_products(self, response):
     """Follow pager links and yield one LEGO item per ``productBody`` div.

     The SKU is the first standalone run of digits in the product name.
     Items priced below 15 get a shipping cost of 2.69. Every item is
     passed through ``self.load_item_with_metadata``.
     """
     hxs = HtmlXPathSelector(response)
     base_url = get_base_url(response)

     # Pagination: every pager entry that is not the selected one.
     pager_links = hxs.select(
         '//ul[@class="pager"]/li[@class!="selected"]/a/@href').extract()
     for link in pager_links:
         yield Request(urljoin_rfc(base_url, link),
                       callback=self.parse_products)

     # The last breadcrumb link text is used as category for all products.
     category = hxs.select('//*[@id="wherei"]/p//a/text()').extract()
     for product in hxs.select('//div[@class="productBody"]'):
         loader = ProductLoader(item=Product(), selector=product)

         title = product.select(
             './/div[@class="productTitleContent"]/a/text()'
         ).extract()[0].strip()
         href = product.select(
             './/div[@class="productTitleContent"]/a/@href').extract()[0]
         loader.add_value('url', urljoin_rfc(base_url, href))
         loader.add_value('name', title)
         loader.add_xpath('image_url',
                          './/div[@class="img_box"]/a/img[1]/@src',
                          Compose(lambda v: urljoin(base_url, v[0])))

         # Only the part before the first non-breaking space is parsed.
         price_text = product.select(
             './/div[@class="productPrice"]/span[contains(@itemprop, "price")]/text()'
         ).extract()[0]
         price = extract_price_eu(price_text.split(u'\xa0')[0])
         loader.add_value('price', price)

         sku_match = re.search(r"\b([\d]+)\b", title)
         if sku_match:
             loader.add_value('sku', sku_match.group(1))

         image_rel = product.select(
             './/div[@class="img_box"]/a/img[1]/@rel').extract()[0]
         loader.add_value('identifier', image_rel)
         loader.add_value('brand', 'LEGO')

         # Presence of a "stock_no" div marks the product as out of stock.
         if product.select('.//div[@class="stock_no"]').extract():
             loader.add_value('stock', 0)
         if category:
             loader.add_value('category', category[-1])
         if price < 15:
             loader.add_value('shipping_cost', 2.69)
         yield self.load_item_with_metadata(loader.load_item())
Exemple #45
0
    def parse_product(self, response):
        """Yield one item per container of a bold text holding a pound sign.

        The container is the grandparent of the ``<b>`` element. Containers
        whose price text does not match the price pattern are skipped.
        Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        for container in hxs.select(u'//b[contains(text(), "\xa3")]/../..'):
            loader = ProductLoader(item=Product(), selector=container)
            loader.add_xpath('name', './b/font/text()')
            loader.add_value('url', response.url)

            # Amount after the pound sign, up to the last digit.
            amounts = container.select(u'.//b[contains(text(), "\xa3")]/text()').re('\xa3(.*[0-9])')
            if amounts:
                loader.add_value('price', amounts)
                yield loader.load_item()
Exemple #46
0
    def parse_product(self, response):
        """Yield one item with name, url, price and SKU from a product page.

        The SKU is the joined, stripped ``product_code`` text with its first
        three characters removed.
        """
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(response=response, item=Product())

        loader.add_xpath(
            'name',
            '//font[@class="productnamecolorLARGE colors_productname"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath(
            'price', '//font[@class="pricecolor colors_productprice"]/text()')

        code_parts = hxs.select(
            '//span[@class="product_code"]/text()').extract()
        product_code = ''.join(code_parts).strip()
        # Drop the three leading characters of the product code.
        loader.add_value('sku', product_code[3:])

        yield loader.load_item()
 def parse_product(self, response):
     """Yield products that match the requested search term or maker id.

     Result-table rows are accepted when their first-cell text equals
     ``response.meta['search_q']`` or their manufacturer text equals
     ``response.meta['mfrgid']`` (case-insensitive), and a price is present.
     When the table has no rows and a product title exists, the page is
     parsed as a single product and yielded only if its manufacturer text
     equals the requested maker id. Non-HTML responses are ignored.
     """
     if not isinstance(response, HtmlResponse):
         return
     hxs = HtmlXPathSelector(response)

     rows = hxs.select('//table[@width="86%"]/tr')
     for row in rows:
         row_sku = row.select('./form/td[1]/b/text()').extract()
         if not row_sku:
             continue

         maker_texts = row.select('./form/td[2]/font[contains(text(),"Manufacturer")]/b/text()').extract()
         maker_matches = False
         if maker_texts:
             maker_matches = maker_texts[0].lower() == response.meta['mfrgid'].lower()

         if not (row_sku[0] == response.meta['search_q'] or maker_matches):
             continue

         price = "".join(row.select("./form/td[3]/font/b/text()").re(r'([0-9\,\. ]+)')).strip()
         if not price:
             continue

         description = row.select('./form/td[2]/text()').extract()[0]
         row_loader = ProductLoader(item=Product(), response=response)
         # Cut a trailing "...Regularly $<amount>" note from the description.
         if '...Regularly' in description:
             description = re.sub('\.{3}Regularly.*?\$.*$', '', description)
         row_loader.add_value('price', price)
         row_loader.add_value('url', response.url)
         row_loader.add_value('sku', response.meta['sku'])
         row_loader.add_value('identifier', response.meta['sku'].lower())
         row_loader.add_value('name', response.meta['sku'] + ' '  + description)
         yield row_loader.load_item()

     titles = hxs.select(u'//h1[@class="big product_title"]/text()').extract()
     if rows or not titles:
         return

     # No result rows but a product title: single product page.
     page_loader = ProductLoader(item=Product(), response=response)
     title = titles[0]
     if '...Regularly' in title:
         title = re.sub('\.{3}Regularly.*?\$.*$', '', title)
     page_loader.add_value('name', title)
     page_loader.add_xpath('price', u'//dt[@id="prod_price"]//span[@class="small"]/strong[@class="big"]/text()',
                           re='\$(.*)')
     page_loader.add_value('sku', response.meta['sku'])
     page_loader.add_value('identifier', response.meta['sku'].lower())
     page_loader.add_value('url', response.url)

     page_maker = hxs.select(u'//span[@class="small" and contains(text(),"Manufacturer")]/following-sibling::strong[1]/text()').extract()
     if not page_maker:
         return
     if page_maker[0].lower().strip() == response.meta['mfrgid'].strip().lower():
         yield page_loader.load_item()
Exemple #48
0
    def parse_product(self, response):
        """Scrape the featured product and the product grid of a listing page.

        Yields one item for the featured product and one for every grid entry,
        skipping any entry that has no link.  Non-HTML responses are ignored.

        When either the grid or the featured block is missing, the same URL is
        requested again, at most three times; the attempt count travels in
        ``response.meta['retries']``.
        """
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # (tag, class) of the nodes that may carry the price, in priority order.
        price_nodes = (('span', 'newprice'), ('div', 'price'))

        def first_price(node):
            """Return the first price found under *node*, or None."""
            for tag, css_class in price_nodes:
                found = node.select(
                    u'.//' + tag + '[@class="' + css_class + '"]/text()'
                ).re(u'([0-9,.]+)')
                if found:
                    # Commas become points (presumably a decimal comma on
                    # this site -- confirm).
                    return found[0].replace(',', '.')
            return None

        def build_item(node, link_xpath, name_xpath):
            """Return the loaded item for *node*, or None when it has no link."""
            href = node.select(link_xpath).extract()
            if not href:
                return None
            loader = ProductLoader(item=Product(), selector=node)
            loader.add_value('url', urljoin_rfc(base_url, href[0]))
            loader.add_xpath('name', name_xpath)
            price = first_price(node)
            if price:
                loader.add_value('price', price)
            return loader.load_item()

        featured_product = hxs.select(u'//div[@class="featuredProduct"]')
        item = build_item(featured_product,
                          u'.//div[@class="fDescription"]/a/@href',
                          u'.//div[@class="fDescription"]/a/strong/text()')
        if item is not None:
            yield item

        products = hxs.select(u'//div[contains(@class,"productsRow")]/div[contains(@class,"productItem")]')
        for product in products:
            item = build_item(product,
                              u'.//div[@class="prodDecription"]/a/@href',
                              u'.//div[@class="prodDecription"]/a/text()')
            if item is not None:
                yield item

        if not products or not featured_product:
            retries = response.meta.get('retries', 0)
            if retries < 3:
                # Log only when a retry is actually scheduled.
                log.msg('Retrying url: %s' % response.url, level=log.WARNING)
                yield Request(response.url, dont_filter=True, meta={'retries': retries + 1})
Exemple #49
0
    def parse_product(self, response):
        """Build a product item (brand fixed to ``LEGO``) from a detail page.

        When the page has no ``itemprop="name"`` text, the product is requested
        again through the ``detail.jsp?pName=`` form of the URL and nothing is
        yielded for this response.

        The category is the second-to-last breadcrumb link; it is left unset
        when the breadcrumb has fewer than two links.  Stock is set to 0 only
        when the stock-status text contains ``OUT OF STOCK``.

        Raises:
            IndexError: if the page has no ``itemprop="productID"`` text.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        names = hxs.select('//*[@itemprop="name"]/text()').extract()
        if not names:
            fallback_url = response.url.replace(
                'hamleys.com/',
                'hamleys.com/detail.jsp?pName=').replace('.ir', '')
            yield Request(fallback_url, callback=self.parse_product)
            return
        name = names[-1].strip()

        stock_text = ''.join(
            hxs.select('//li[@class="stockStatus"]/span/text()').extract())
        out_of_stock = 'OUT OF STOCK' in stock_text.upper()

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', urljoin(base_url, response.url))
        loader.add_value('name', name)
        loader.add_xpath('image_url', '//img[@class="productMain"]/@src',
                         TakeFirst())
        loader.add_xpath('price',
                         '//div[@class="productprice "]/text()',
                         Join(''),
                         re="([.0-9]+)")

        crumbs = hxs.select(
            '//div[@class="pagetopnav"]/ul[contains(@class, "crumb")]/li/a/text()'
        ).extract()
        # A short breadcrumb must not cost us the whole product.
        if len(crumbs) >= 2:
            loader.add_value('category', crumbs[-2])

        # SKU: trailing run of three or more digits in the product name.
        loader.add_value('sku', name, re=r' (\d\d\d+)\s*$')
        loader.add_value('brand', 'LEGO')

        # NOTE(review): a missing product code still raises here; presumably
        # an item without identifier is unusable downstream -- confirm.
        identifier = hxs.select(
            '//*[@itemprop="productID"]/text()').extract()[0].replace(
                'Code: ', '')
        loader.add_value('identifier', identifier)

        if out_of_stock:
            loader.add_value('stock', 0)

        yield loader.load_item()
Exemple #50
0
    def parse_product(self, response):
        """Yield the product described by the page, then follow grouped items.

        Identifier, SKU and name come from the schema data returned by
        ``SpiderSchema(response).get_product()``; the rest is read from the
        page markup.  Shipping is ``0.0`` when the loaded price is at least
        45.0 and ``4.95`` otherwise.  Every link in the grouped-items table is
        requested with this same callback.
        """
        product_data = SpiderSchema(response).get_product()
        loader = ProductLoader(item=Product(), response=response)

        product_id = product_data['productID']
        for field in ('identifier', 'sku'):
            loader.add_value(field, product_id)
        loader.add_value('name', product_data['name'])

        # In stock only when the page shows no out-of-stock marker AND the
        # schema offer advertises InStock.
        if response.css('.product-shop .out-of-stock'):
            stock = 0
        elif 'InStock' in product_data['offers']['properties']['availability']:
            stock = 1
        else:
            stock = 0
        loader.add_value('stock', stock)

        breadcrumb = response.css('.breadcrumbs').xpath(
            './/li/a/text()').extract()
        loader.add_value('category', breadcrumb[1:])  # first crumb is dropped
        loader.add_value('url', response.url)
        loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
        loader.add_xpath(
            'brand',
            '//th[@class="label" and contains(text(), "Brand")]'
            '/following-sibling::td/text()')

        # Price boxes are tried in this order; the first non-empty match wins.
        price = None
        for box in ('minimal-price', 'regular-price', 'special-price'):
            price = response.css(
                '.product-shop .price-box .%s .price' % box
            ).xpath('text()').re_first(r'[\d\.,]+')
            if price:
                break
        loader.add_value('price', price)

        free_shipping = loader.get_output_value('price') >= Decimal('45.0')
        loader.add_value('shipping_cost', '0.0' if free_shipping else '4.95')

        yield loader.load_item()

        grouped_links = response.css(
            '.grouped-items-table-wrapper .name-wrapper').xpath('a/@href')
        for url in grouped_links.extract():
            yield Request(url, callback=self.parse_product)
Exemple #51
0
    def parse_product(self, response):
        """Yield a product item, or one price request per product variant.

        Pages without a ``variantdiv`` block produce a single item directly.
        Otherwise one request per colour/size combination is sent to
        ``parse_option_price``, carrying the variant name and the product URL
        in ``meta``.  The first ``<option>`` of each select is skipped.
        Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        base_name = hxs.select(u'//div[@class="ProductTopTitle"]/h1/text()').extract()
        variant_box = hxs.select('//div[@class="variantdiv"]')

        if not variant_box:
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', base_name)
            loader.add_value('url', response.url)
            loader.add_xpath('price', u'//div[@class="webPriceLabel"]/text()',
                             re=u'\xa3(.*)')
            yield loader.load_item()
            return

        colors = variant_box.select(u'.//select[contains(@id,"Color")]/option/@value').extract()
        sizes = variant_box.select(u'.//select[contains(@id,"Size")]/option/@value').extract()

        # Each variant is (query parameters, text appended to the base name).
        if colors and sizes:
            variants = [({'Colour': color, 'Size': size}, size + ' ' + color)
                        for color in colors[1:]
                        for size in sizes[1:]]
        elif colors:
            variants = [({'Colour': color}, color) for color in colors[1:]]
        elif sizes:
            variants = [({'Size': size}, size) for size in sizes[1:]]
        else:
            variants = []

        for params, suffix in variants:
            request = Request(response.url + '?' + urlencode(params),
                              callback=self.parse_option_price,
                              dont_filter=True)
            request.meta['name'] = base_name[0] + ' ' + suffix
            request.meta['url'] = response.url
            yield request
Exemple #52
0
    def parse_product(self, response):
        """Yield one item per page fragment that shows a pound-sign price.

        A fragment is the grandparent of any ``<b>`` whose text contains a
        pound sign.  Fragments whose price text does not match the pattern are
        skipped.  Non-HTML responses are ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        selector = HtmlXPathSelector(response)

        for row in selector.select(u'//b[contains(text(), "\xa3")]/../..'):
            amount = row.select(
                u'.//b[contains(text(), "\xa3")]/text()').re('\xa3(.*[0-9])')
            if not amount:
                continue

            loader = ProductLoader(item=Product(), selector=row)
            loader.add_xpath('name', './b/font/text()')
            loader.add_value('url', response.url)
            loader.add_value('price', amount)
            yield loader.load_item()
Exemple #53
0
    def parse_product(self, response):
        """Yield one item per entry of the ``product-list`` on a listing page.

        The name is brand, link text and ``listData`` text joined by spaces.
        Entries without a link or link text are skipped; a missing brand or
        ``listData`` text is simply left out of the name, so one incomplete
        entry no longer aborts the rest of the page.  Non-HTML responses are
        ignored.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        for product in hxs.select(u'//ul[@class="product-list"]/li'):
            href = product.select(u'.//div[@class="listItemLink"]/a/@href').extract()
            title = product.select(u'.//div[@class="listItemLink"]/a/text()').extract()
            if not href or not title:
                # Not a usable product tile: nothing to link to or to name.
                continue

            brand = product.select(u'.//div[@class="listBrand"]/text()').extract()
            details = product.select(u'.//div[@class="listData"]/text()').extract()
            name = ' '.join(part[0] for part in (brand, title, details) if part)

            product_loader = ProductLoader(item=Product(), selector=product)
            product_loader.add_value('url', urljoin_rfc(base_url, href[0]))
            product_loader.add_value('name', name)
            product_loader.add_xpath('price', u'.//span[@class="salePrice"]/span/text()',
                                     re=u'\\$(.*)')
            yield product_loader.load_item()
Exemple #54
0
    def parse_product(self, response):
        """Yield one item per ``product_listing`` block on the page.

        The price comes from the "our price" node, falling back to the sale
        price; blocks with neither are logged and skipped.  The identifier is
        the numeric ``id=`` parameter of the product link, and the category is
        taken from ``response.meta``.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        id_regex = re.compile(r'id=(\d+)')
        price_pattern = r'.*\$(.*[0-9])'
        our_price_xpath = './/span[@class="prod_our_price"]/strong/text()'
        sale_price_xpath = './/span[@class="prod_sale_price"]/span/text()'

        for listing in hxs.select('//div[@class="product_listing"]'):
            loader = ProductLoader(item=Product(), selector=listing)

            # NOTE(review): the price is added again via add_value below;
            # this first add looks redundant -- confirm before removing.
            loader.add_xpath('price', our_price_xpath, re=price_pattern)

            price = (listing.select(our_price_xpath).re(price_pattern)
                     or listing.select(sale_price_xpath).re(price_pattern))
            if not price:
                self.log('NO PRICE => %s' % response.url)
                continue
            loader.add_value('price', price[0])

            # SKU is the parenthesised part of the product number, dashes removed.
            sku_match = listing.select(
                './/span[@class="prod_number"]/text()').re(r'\((.*)\)')
            loader.add_value('sku', sku_match[0].replace('-', ''))

            loader.add_xpath('name', './/span[@class="prod_name"]/a/@title')

            href = listing.select('.//span[@class="prod_name"]/a/@href').extract()
            loader.add_value('identifier', id_regex.search(href[0]).group(1))
            loader.add_value('url', urljoin_rfc(base_url, href[0]))

            image = listing.select('.//img/@src').extract()
            if image:
                loader.add_value('image_url', urljoin_rfc(base_url, image[0]))

            loader.add_value('category', response.meta.get('category'))

            yield loader.load_item()
Exemple #55
0
    def parse_product(self, response):
        """Build a product item (brand fixed to ``LEGO``) from a detail page.

        Name, identifier and price are read from the page's ``itemprop``
        markup; the category is the last breadcrumb link.  When the price meta
        tag is absent or empty, no item is yielded and a message is appended
        to ``self.errors`` instead.

        Raises:
            IndexError: if the name, identifier or product-code node is missing.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        name = hxs.select('//*[@itemprop="name"]/text()').extract().pop().strip()

        categories = [text.strip() for text in
                      hxs.select('//ul[@id="breadcrumbs"]/li/a/text()').extract()]
        if categories:
            category = categories[-1]
            if category.startswith(u"\xbb"):
                # Drop the leading marker and the character after it.
                category = category[2:]
        else:
            category = ''

        pid = hxs.select('//*[@itemprop="identifier"]/text()').pop().extract().strip()

        sku = hxs.select(u'//th[contains(text(), "K\xf3d produktu")]/following-sibling::td[1]/text()').extract().pop().strip()

        # Take the last match like the former ``.pop()``, but tolerate a page
        # without the tag so the "no price" branch below is reachable.
        price_values = hxs.select('//meta[@itemprop="price"]/@content').extract()
        price = price_values[-1] if price_values else None

        stock = hxs.select('//meta[@itemprop="availability" and @content="in_stock"]')

        if price:
            loader = ProductLoader(response=response, item=Product())
            loader.add_value('url', urljoin(base_url, response.url))
            loader.add_value('name', name)
            loader.add_xpath('image_url', '//img[@itemprop="image"]/@src', Compose(lambda v: urljoin(base_url, v[0])))
            loader.add_value('price', price)
            loader.add_value('category', category)
            loader.add_value('sku', sku)
            loader.add_value('identifier', pid)
            loader.add_value('brand', 'LEGO')
            # Shipping is charged only below the 3000 price threshold.
            if Decimal(price) < Decimal('3000'):
                loader.add_value('shipping_cost', 95)
            if not stock:
                loader.add_value('stock', 0)
            yield self.load_item_with_metadata(loader.load_item())
        else:
            self.errors.append("No price set for url: '%s'" % urljoin(base_url, response.url))