Ejemplo n.º 1
0
    def parse_item(self, response):
        """Parse a product page and yield a single ``Product`` item.

        The product name and the displayed price are read from the page. When
        an eco-tax amount is shown it is subtracted from the price. If the
        name or the price cannot be found, an error is logged and nothing is
        yielded.

        :param response: the downloaded product page.
        :return: generator yielding at most one loaded item.
        """
        url = response.url
        hxs = HtmlXPathSelector(response)

        name = hxs.select(
            "//div[@id='primary_block']/div[@id='pb-left-column']/h2/text()"
        ).extract()
        if not name:
            logging.error("NO NAME! %s" % url)
            return
        name = name[0]

        price = hxs.select(
            "//p[@class='price']/span[@class='our_price_display']/span/text()"
        ).extract()
        if not price:
            logging.error("NO PRICE! %s" % url)
            return
        price = Decimal(extract_price2uk(price[0]))

        eco_tax = hxs.select(
            "//p[@class='price-ecotax']/span/text()").extract()
        if eco_tax:
            # Non-ASCII characters are dropped before the amount is parsed.
            eco_tax_text = eco_tax[0].encode('ascii', 'ignore')
            print("Found eco tax %s" % eco_tax_text)
            price -= Decimal(extract_price2uk(eco_tax_text))

        loader = ProductLoader(item=Product(), response=response)
        # The name is passed through unchanged: on Python 2, str() applied to
        # a unicode name with non-ASCII characters raises UnicodeEncodeError.
        loader.add_value('identifier', name)
        loader.add_value('name', name)
        loader.add_value('url', url)
        loader.add_value('price', unicode(price))
        yield loader.load_item()
Ejemplo n.º 2
0
    def parse_item(self, response):
        """Build one ``Product`` from a product page and yield it.

        Name and price come from the page markup; an eco-tax amount, when
        present, is deducted from the price. A page without a name or without
        a price is reported through ``logging.error`` and produces no item.

        :param response: the downloaded product page.
        :return: generator yielding at most one loaded item.
        """
        url = response.url
        hxs = HtmlXPathSelector(response)

        name = hxs.select("//div[@id='primary_block']/div[@id='pb-left-column']/h2/text()").extract()
        if not name:
            logging.error("NO NAME! %s" % url)
            return
        name = name[0]

        price = hxs.select("//p[@class='price']/span[@class='our_price_display']/span/text()").extract()
        if not price:
            logging.error("NO PRICE! %s" % url)
            return
        price = Decimal(extract_price2uk(price[0]))

        eco_tax = hxs.select("//p[@class='price-ecotax']/span/text()").extract()
        if eco_tax:
            # Non-ASCII characters are stripped before the amount is parsed.
            eco_tax_text = eco_tax[0].encode("ascii", "ignore")
            print("Found eco tax %s" % eco_tax_text)
            price -= Decimal(extract_price2uk(eco_tax_text))

        loader = ProductLoader(item=Product(), response=response)
        # Do not coerce the name with str(): on Python 2 that raises
        # UnicodeEncodeError as soon as the name has a non-ASCII character.
        loader.add_value("identifier", name)
        loader.add_value("name", name)
        loader.add_value("url", url)
        loader.add_value("price", unicode(price))
        yield loader.load_item()
Ejemplo n.º 3
0
 def _get_item_price(self, item):
     """Extract the item price and pass it through ``extract_price2uk``.

     Lookup order: the ``prcIsum`` element, then the ``mm-saleDscPrc``
     element, then a ``binPrice`` entry searched for in the raw response
     body. When ``self._converted_price`` is truthy, the ``prcIsumConv``
     text replaces the value if it is present.

     Returns ``None`` when no price is found (a message is appended to
     ``self.errors``), when the price text is empty, or when
     ``self._check_valid_currency`` is callable and rejects the characters
     that precede the first digit of the price.
     """
     price_xpaths = (
         '//*[@id="prcIsum"]/text()',
         '//*[@id="mm-saleDscPrc"]/text()',
     )
     for xpath in price_xpaths:
         try:
             price = item.select(xpath).extract()[0].strip()
         except IndexError:
             continue
         break
     else:
         # Neither element matched: fall back to the raw page body.
         try:
             price = re.search(r'"binPrice":".*[\$\xA3]([\d\.,]+)",',
                               item.response.body).groups()[0]
         except AttributeError:
             self.errors.append("Price not found for " + item.response.url)
             return None

     # Converted price
     if self._converted_price:
         converted = item.select(
             u'//div[@id="prcIsumConv"]/span/text()').extract()
         if converted:
             price = converted[0]

     if not price:
         return None

     if callable(self._check_valid_currency):
         # Everything in front of the first digit is treated as the currency.
         prefix = []
         for char in price:
             if char.isdigit():
                 break
             prefix.append(char)
         if not self._check_valid_currency(''.join(prefix)):
             return None

     return extract_price2uk(price)
Ejemplo n.º 4
0
 def parse_product(self, response):
     """Yield a copy of the product carried in ``response.meta``.

     The copy's ``price`` is set from the ``.pdetails .pproductpriceVAT``
     text when the page has one; otherwise the copy is yielded unchanged.
     """
     item = response.meta['product'].copy()
     vat_price = response.css(
         '.pdetails .pproductpriceVAT::text').extract_first()
     if not vat_price:
         yield item
         return
     item['price'] = extract_price2uk(vat_price)
     yield item
Ejemplo n.º 5
0
    def parse_product(self, response):
        """Collect product data and request the stock page for it.

        Reads the ``var context = {...}`` JSON found in the page scripts (used
        for the image URL), fills a ``ProductLoader`` with name, URL, brand,
        category, price, image, SKU and identifier, then yields a request to
        ``STOCK_URL + identifier`` that carries the loader in
        ``meta['loader']`` for ``self.parse_availability``.

        The ``[0]`` lookups raise ``IndexError`` when the context script, the
        price or the product id is missing from the page.
        """
        context_matches = response.xpath('//script/text()').re('var context = ({.+?});')
        context = json.loads(context_matches[0])
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)

        loader = ProductLoader(selector=hxs, item=Product())
        loader.add_xpath('name', '//span[contains(@id, "ProductTitle")]/text()')

        # Keep scheme, host and path; query string and fragment are blanked.
        url_parts = list(urlsplit(response.url))[:3]
        loader.add_value('url', urlunsplit(url_parts + ['', '']))

        for field in ('brand', 'category'):
            loader.add_value(field, 'Lego')

        raw_price = hxs.select('//div[contains(@class, "prodprice")]/span/text()').extract()[0]
        loader.add_value('price', extract_price2uk(raw_price))

        first_media = context['pdpProduct']['additionalMedia'][0]
        loader.add_value('image_url', urljoin(base_url, first_media['url']))

        loader.add_xpath('sku', '//span[@itemprop="model"]/text()')

        identifier = hxs.select('//span[@itemprop="productid"]/text()').extract()[0]
        loader.add_value('identifier', identifier)

        # The loader travels with the request so the stock callback can
        # finish the item.
        yield Request(STOCK_URL + identifier,
                      callback=self.parse_availability,
                      meta={'loader': loader})
Ejemplo n.º 6
0
    def __construct_product(self,
                            item,
                            meta=None,
                            use_seller_id_in_identifier=None):
        """
        Start building a product from a scraped ``item`` dict.

        :param item: dict with at least ``identifier`` (when
            ``self.use_amazon_identifier`` is set), ``name`` and ``price``;
            ``seller_identifier`` is read when present.
        :param meta: optional dict; its ``item`` or ``search_item`` entry is
            used as the search item, otherwise ``self.current_search_item``.
        :param use_seller_id_in_identifier: when ``None`` it defaults to the
            truthiness of ``self.all_sellers``.

        NOTE(review): the visible part of this method ends right after the
        price is extracted; ``price``, ``optional_fields``,
        ``fields_from_search_item`` and ``synonym_fields`` are not used in it,
        so the snippet is presumably truncated.
        """
        # Default the seller-id flag from the spider configuration.
        if use_seller_id_in_identifier is None:
            if self.all_sellers:
                use_seller_id_in_identifier = True
            else:
                use_seller_id_in_identifier = False

        # Pick the search item: meta['item'], then meta['search_item'], then
        # the spider's current search item.
        if meta and 'item' in meta:
            search_item = meta['item']
        elif meta and 'search_item' in meta:
            search_item = meta['search_item']
        else:
            search_item = self.current_search_item

        # The loader is created against an empty HTML response; values are
        # added explicitly below.
        loader = AmazonProductLoader(item=AmazonProduct(),
                                     response=HtmlResponse(''))
        necessary_fields = ['name']
        optional_fields = ['sku', 'image_url', 'brand', 'stock']
        fields_from_search_item = ['sku', 'category', 'brand', 'identifier']

        # Maps a key of `item` to the loader field it should be stored under.
        synonym_fields = {
            'vendor': 'dealer',
        }

        # Identifier comes from the scraped item or from the search item,
        # depending on the spider configuration.
        identifier = item[
            'identifier'] if self.use_amazon_identifier else search_item.get(
                'identifier')
        # Despite the attribute name, the prefix added here is a colon.
        if self.semicolon_in_identifier and \
                identifier and \
                self.use_amazon_identifier and \
                not identifier.startswith(':'):
            identifier = ':' + identifier

        # Optionally make the identifier seller-specific.
        if identifier and use_seller_id_in_identifier and item.get(
                'seller_identifier'):
            identifier += ':' + item['seller_identifier']

        loader.add_value('identifier', identifier)

        # A missing necessary field raises KeyError here.
        for field in necessary_fields:
            loader.add_value(field, item[field])

        if item['price'] is not None:
            try:
                # A tuple/list price is reduced to its first element (this
                # also overwrites item['price']).
                if type(item['price']) == tuple or type(item['price']) == list:
                    item['price'] = item['price'][0]
                # Decimal values are kept as-is; anything else is parsed.
                price = extract_price2uk(item['price']) if not isinstance(
                    item['price'], Decimal) else item['price']
            except Exception, e:
                # Log the offending value, then propagate the error.
                self.log('ERROR: extracting price => PRICE: %s' %
                         repr(item['price']))
                raise e
Ejemplo n.º 7
0
 def get_option(self, response):
     """Fill in price, identifier and name of the product held in meta.

     The response body is JSON. Its ``unformattedPrice`` becomes the price,
     and a non-empty ``combinationid`` is appended to ``meta['id']`` (joined
     with ``-``) to form the identifier. The product object from
     ``response.meta`` is updated in place and yielded.
     """
     meta = response.meta
     product = meta['product']
     data = json.loads(response.body)

     product['price'] = extract_price2uk(data['unformattedPrice'])

     combination_id = data['combinationid']
     identifier = meta['id']
     if combination_id:
         identifier = identifier + '-' + combination_id
     product['identifier'] = identifier

     product['name'] = meta['name']
     yield product
Ejemplo n.º 8
0
    def parse_product(self, response):
        """Parse a product page and yield the product, or a retry request.

        When the product title cannot be read, the same URL is requested
        again, up to 10 times (tracked in ``meta['retry']``). A product is
        only yielded when a price is present; otherwise a message is appended
        to ``self.errors``.

        :param response: product page response; ``meta`` may carry ``brand``
            and ``retry``.
        :return: generator of at most one item or one retry ``Request``.
        """
        base_url = get_base_url(response)
        try:
            name = response.css(
                '.content-fiche-produit h1::text').extract_first().strip()
        except Exception:
            # Typically extract_first() returned None (no title on the page).
            # ``Exception`` instead of a bare ``except`` so that SystemExit
            # and KeyboardInterrupt are not swallowed.
            retry = int(response.meta.get('retry', 0))
            if retry < 10:
                new_meta = response.meta.copy()
                new_meta['retry'] = retry + 1
                yield Request(response.url,
                              meta=new_meta,
                              callback=self.parse_product,
                              dont_filter=True)
            return

        # The category is the second-to-last breadcrumb link; a breadcrumb
        # with fewer than two links yields an empty category instead of
        # raising IndexError.
        breadcrumbs = response.css('#breadcrumb a::text').extract()
        category = breadcrumbs[-2] if len(breadcrumbs) >= 2 else ""

        sku = response.css('.content-fiche-produit p::text').re(
            u'Référence (\d+)')

        pid = response.css('.content-fiche-produit p::text').re(u'Ref (\d+)')

        price = response.css('.new-price ::text').extract_first()

        # NOTE: stock is computed but not loaded; the add_value call below is
        # disabled.
        stock = bool(
            response.xpath(
                '//p[contains(@class, "in-stock")]/text()').extract())
        if not stock:
            stock = 'DISPONIBLE' in ''.join(
                response.xpath('//p[contains(@class, "availability")]//text()'
                               ).extract()).upper()

        if not price:
            self.errors.append("No price set for url: '%s'" %
                               urljoin(base_url, response.url))
            return

        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', urljoin(base_url, response.url))
        loader.add_value('name', name)
        loader.add_css('image_url', '#image ::attr(src)')
        loader.add_value('price', extract_price2uk(price))
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('identifier', pid)
        loader.add_value('brand', response.meta.get("brand", ""))
        #loader.add_value('stock', int(stock))
        yield loader.load_item()
Ejemplo n.º 9
0
    def parse_search(self, response):
        """Parse a hotel search results page.

        Logs the number of hotels announced by the page, yields one item per
        result that has a name, a URL and a parsable price, and finally
        yields whatever ``self.get_city_request()`` returns.

        Results with a missing name, URL or price are logged and skipped
        (previously the missing value was indexed and raised ``IndexError``).

        :param response: the search results page.
        """
        hxs = HtmlXPathSelector(response)

        count_el = hxs.select(
            "//table[@id='ctl05_myContainer']/tr[3]/td[2]/div[@id='ctl05_ctl12']/h1/text()"
        ).extract()

        # The last matching text node decides the reported count.
        count = '0'
        for el in count_el:
            m = re.search(r"[\d]+", el)
            count = m.group(0) if m else '0'
        logging.error("Found %s hotels" % count)

        hotels = hxs.select(
            "//div[@id='divResults']/div[@class='accomodation grey'] | \
                                  //div[@id='divResults']/div[@class='accomodation']"
        )
        for hotel in hotels:
            name = hotel.select("div[1]/h4/a/text()").extract()
            if not name:
                logging.error("No name")
                continue
            name = name[0]

            url = hotel.select("a[1]/@href").extract()
            if not url:
                logging.error("No url %s" % name)
                continue
            url = url[0]

            price = hotel.select(
                "div[@class='price']/span[@class='sum2']/text()").extract()
            if not price:
                logging.error("No price")
                continue
            price = extract_price2uk(price[0])
            if price is None:
                print("No price %s" % name)
                continue
            # NOTE(review): `nights` is not defined in this method; presumably
            # a module-level value - confirm.
            price = int(price) * nights

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', name.encode('ascii', 'replace'))
            loader.add_value('identifier', name.encode('ascii', 'replace'))
            loader.add_value('url', url)
            loader.add_value('price', price)
            yield loader.load_item()

        yield self.get_city_request()
Ejemplo n.º 10
0
    def parse_product(self, response):
        """Parse a product page and yield the product or its option variants.

        Without option select boxes the main product is yielded. With option
        boxes, one copy of the main product is yielded per product id found
        in the ``spConfig`` JSON, with the id appended to the identifier and
        the option labels appended to the name. If option boxes exist but no
        ``spConfig`` is found, nothing is yielded.
        """
        price_texts = response.css('div.pprice .price ::text').extract()
        if price_texts:
            price, stock = extract_price2uk(price_texts[0]), 1
        else:
            price, stock = 0, 0
        availability = response.xpath('//*[contains(@class, "availability") and contains(@class, "in-stock")]')
        if not availability:
            stock = 0

        identifier = response.xpath('//input[@name="product"]/@value').extract()
        sku_matches = response.xpath('//div[@class="product-name"]/*[@class="sku_prd"]/text()').re(r'Product Code:(.*)')
        sku = [unicode.strip(code) for code in sku_matches]

        # Breadcrumb names, blanks removed, without the first and last entry.
        breadcrumb_texts = response.xpath('//*[@itemtype="http://schema.org/BreadcrumbList"]'
                                          '//*[contains(@itemprop, "name")]/text()').extract()
        stripped_texts = [unicode.strip(text) for text in breadcrumb_texts]
        category = [text for text in stripped_texts if text][1:-1]

        name = response.xpath('//div[@class="product-name"]/h1/text()').extract()

        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('stock', stock)
        loader.add_xpath('brand', '//div[@class="product-essential"]//a[@class="man_img"]/@title')
        loader.add_value('identifier', identifier)
        loader.add_value('sku', sku)
        loader.add_xpath('image_url', '//meta[@property="og:image"]/@content')
        loader.add_value('shipping_cost', 0)
        loader.add_value('url', response.url)
        loader.add_value('category', category)
        main_product = loader.load_item()

        if not response.xpath('//div[@id="product-options-wrapper"]//select'):
            yield main_product
            return

        product_config = re.findall(string=response.body, pattern=r'var spConfig = new Product.Config\((.*)?\);')
        if not product_config:
            return

        product_data = json.loads(product_config[0])
        # Product id -> all option labels that apply to it, each one
        # prefixed with ' - '.
        option_names = {}
        for attribute in product_data['attributes'].itervalues():
            for option in attribute['options']:
                for product_id in option['products']:
                    option_names[product_id] = option_names.get(product_id, '') + ' - ' + option['label']

        for product_id, option_name in option_names.iteritems():
            variant = Product(main_product)
            variant['identifier'] += '_' + product_id
            variant['name'] += option_name
            yield variant
Ejemplo n.º 11
0
    def parse_product(self, response):
        """Parse a product page and yield it when the brand is Lego.

        Pages whose brand label does not title-case to ``Lego`` yield
        nothing. Price and image are optional; stock is 0 when the
        out-of-stock image is present and 1 otherwise. The SKU is the text
        between the last ``(`` and the following ``)`` of the name.

        The ``[0]`` lookups raise ``IndexError`` when the brand label or the
        title is missing.
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(selector=hxs, item=Product())

        brand = fix_spaces(
            hxs.select('//label[@class="jda-brand-name"]/text()').extract()[0])
        if brand.title() != 'Lego':
            return

        name = fix_spaces(
            hxs.select('//div[@id="right-side"]//h1/text()').extract()[0])
        loader.add_value('name', name)
        loader.add_value('url', response.url)

        price_texts = hxs.select(
            '//div[@id="price"]//dd[@class="ours"]/text()').extract()
        if price_texts:
            loader.add_value('price', extract_price2uk(price_texts[0]))

        image_srcs = hxs.select(
            '//div[@id="bubble-wrapper"]//img/@src').extract()
        if image_srcs:
            loader.add_value('image_url', urljoin(base_url, image_srcs[0]))

        for field in ('category', 'brand'):
            loader.add_value(field, 'Lego')

        out_of_stock = hxs.select(
            '//*[@id="product-out-of-stock"]/a/img').extract()
        loader.add_value('stock', 0 if out_of_stock else 1)

        loader.add_xpath('identifier', '//input[@name="productId_0"]/@value')

        if ')' in name:
            loader.add_value('sku', name.split('(')[-1].split(')')[0])

        yield loader.load_item()
Ejemplo n.º 12
0
    def parse_search(self, response):
        """Parse a hotel search results page.

        Logs the hotel count shown on the page, yields one item for each
        result with a name, a URL and a parsable price, then yields the
        return value of ``self.get_city_request()``.

        A result lacking its name, URL or price is logged and skipped; before
        this fix the empty list was indexed and raised ``IndexError``.

        :param response: the search results page.
        """
        hxs = HtmlXPathSelector(response)

        count_el = hxs.select("//table[@id='ctl05_myContainer']/tr[3]/td[2]/div[@id='ctl05_ctl12']/h1/text()").extract()

        # The last matching text node decides the reported count.
        count = '0'
        for el in count_el:
            m = re.search(r"[\d]+", el)
            count = m.group(0) if m else '0'
        logging.error("Found %s hotels" % count)

        hotels = hxs.select("//div[@id='divResults']/div[@class='accomodation grey'] | \
                                  //div[@id='divResults']/div[@class='accomodation']")
        for hotel in hotels:
            name = hotel.select("div[1]/h4/a/text()").extract()
            if not name:
                logging.error("No name")
                continue
            name = name[0]

            url = hotel.select("a[1]/@href").extract()
            if not url:
                logging.error("No url %s" % name)
                continue
            url = url[0]

            price = hotel.select("div[@class='price']/span[@class='sum2']/text()").extract()
            if not price:
                logging.error("No price")
                continue
            price = extract_price2uk(price[0])
            if price is None:
                print("No price %s" % name)
                continue
            # NOTE(review): `nights` is not defined in this method; presumably
            # a module-level value - confirm.
            price = int(price)*nights

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', name.encode('ascii', 'replace'))
            loader.add_value('identifier', name.encode('ascii', 'replace'))
            loader.add_value('url', url)
            loader.add_value('price', price)
            yield loader.load_item()

        yield self.get_city_request()
Ejemplo n.º 13
0
 def variant_sum(self, response):
     """Walk the option levels of a product, then yield the final variant.

     The response body is JSON. While it contains a non-empty ``variables``
     mapping, one follow-up request is yielded per variable id, with the
     chosen value recorded in the URL and its label appended to
     ``meta['option_name']``. When there are no variables, the product from
     ``meta`` is copied, given price, name and identifier, and yielded.
     """
     data = json.loads(response.body)
     variables = data.get('variables')

     if not variables:
         # Leaf of the option tree: emit the finished product.
         item = Product(response.meta['product'])
         item['price'] = extract_price2uk(data['price'])
         item['name'] = fix_spaces(' '.join((response.meta['product_name'], response.meta['option_name'])))
         item['identifier'] = response.meta['product_id'] + '-' + data['id']
         yield item
         return

     option_type_id = url_query_parameter(response.url, 'optionId')
     for variable_id in variables:
         url = add_or_replace_parameter(response.url, 'variableIds[%s]' % option_type_id, variable_id)
         meta = response.meta.copy()
         meta['product'] = Product(response.meta['product'])
         remaining = response.meta.get('next_options')[:]
         if remaining:
             # More option levels to resolve: ask for the next one.
             url = add_or_replace_parameter(url, 'optionId', remaining.pop(0))
             meta['next_options'] = list(remaining)
         else:
             # Last level: drop 'sMenu' from the URL and the optionId
             # parameter.
             url = url.replace('sMenu', '')
             url = url_query_cleaner(url, ('optionId',), remove=True)
         meta['option_name'] = response.meta['option_name'] + ' ' + variables[variable_id]
         yield Request(url, self.variant_sum, meta=meta)
Ejemplo n.º 14
0
    def parse_product(response):
        """Parse a product page and yield one ``Product`` per option combination.

        Option groups are read from the ``perms[...]`` blocks in the page
        body, the base product is filled through a ``ProductLoader``, and for
        every ``(price, name)`` pair produced by ``multiply(opt_groups)`` a
        copy of the base product is yielded with the option name appended to
        name and identifier and the option price (scaled by the discount
        ratio) added to the price.

        NOTE(review): the signature has no ``self`` although the block is
        indented like a method; presumably it is a static or nested function
        - confirm.
        """
        hxs = HtmlXPathSelector(response)

        # Collect the lines between a line starting with "perms['" and a line
        # starting with "];", one group per such block.
        opt_groups = []
        inside = False
        lst = ''
        for line in response.body.split('\n'):
            if line.startswith('perms[\''):
                inside = True
                lst = ''
            elif line.startswith('];'):
                if lst:
                    # NOTE(review): eval() runs on text taken from the
                    # downloaded page, i.e. on untrusted input. Consider a
                    # literal-only parser such as ast.literal_eval.
                    opts = eval('[' + lst + ']')
                    # XXX http://www.thesleepshop.co.uk/acatalog/4ft6_Double_Kyoto_Memphis_Futon.html#a11717
                    # second option has "Deluxe Mattress" twice with different additional price
                    # however price calculation ignores second addition price (uses first value)
                    filtered_opts = []
                    for price, name in opts:
                        # Keep only the first occurrence of each option name.
                        if not [name for pn in filtered_opts if pn[1] == name]:
                            filtered_opts.append([price, name])
                    opt_groups.append(filtered_opts)
                inside = False
            elif inside:
                lst += line

        # Identifier is the suffix of the first form input named "Q_...";
        # raises IndexError when there is none.
        identifier = hxs.select(
            '//form//input[contains(@name, "Q_")]/@name').re(r'Q_(.*)$')[0]

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', response.url)
        product_loader.add_xpath('name', u'//h3[@class="product"]/text()')
        product_loader.add_xpath('name', u'//span[@class="product"]/text()')
        product_loader.add_value('sku', identifier)
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('category', response.meta.get('category'))

        # Price candidates are added in this order: discounted price, regular
        # price, empty string. Presumably the loader's output processor keeps
        # the first usable value - confirm against ProductLoader.
        product_loader.add_css('price', '.discprice::text')
        price_reg = response.xpath(
            '//div[@id="price_inside"]//span//text()').extract_first(
            ) or response.xpath(
                '//div[@id="price_inside"]//span/@ppraw').extract_first()
        price_reg = extract_price2uk(price_reg)
        product_loader.add_value('price', price_reg)
        product_loader.add_value('price', '')

        # Ratio of the loaded price to the regular price; applied to option
        # surcharges below. NOTE(review): this fails if price_reg is zero or
        # not numeric.
        discount = product_loader.get_output_value('price') / price_reg

        # Image: slideshow first, then the plain product image. img[0] raises
        # IndexError when neither is found.
        img = hxs.select(
            u'//div[@class="slides_control"]/a/img/@src').extract()
        if not img:
            img = hxs.select(
                u'//div[@class="image_product"]//img/@src').extract()
        product_loader.add_value('image_url',
                                 urljoin_rfc(get_base_url(response), img[0]))

        # Brand logo: image next to the product heading, then inside it.
        brand_logo = hxs.select(
            u'//h3[@class="product"]/../img/@src').extract()
        if not brand_logo:
            brand_logo = hxs.select(
                u'//h3[@class="product"]/img/@src').extract()

        # Logo file name -> brand name.
        # NOTE(review): 'fainsborough' and 'healt beds' look like typos;
        # confirm before changing, since these values are emitted as data.
        brands = {
            '6thsense.jpg': '6th sense',
            'bentley.gif': 'bentley',
            'birlea.gif': 'birlea',
            'blank.gif': '',
            'brand': '',
            'Breasley.gif': 'breasley',
            'buoyant.jpg': 'buoyant',
            'cro.gif': 'cro',
            'cumfilux.gif': 'cumfilux',
            'dt.gif': 'dt',
            'dunlopillo.gif': 'dunlopillo',
            'durabeds.gif': 'durabeds',
            'easycomfort.gif': 'easy comfort',
            'friendship_mill.gif': 'friendship mill',
            'Furmanac.gif': 'furmanac',
            'gainsborough.gif': 'fainsborough',
            'gleneagle.gif': 'gleneagle',
            'harlequin.gif': 'harlequin',
            'harmony.gif': 'harmony',
            'healthbeds.gif': 'healt beds',
            'highgate.gif': 'highgate',
            'hypnos.gif': 'hypnos',
            'jay-be.gif': 'jay be',
            'julianbowenlogo.jpg': 'julian bowen',
            'kaymed.gif': 'kaymed',
            'komfi.gif': 'komfi',
            'kyoto.gif': 'kyoto',
            'limelight.gif': 'limelight',
            'metalbeds.gif': 'metalbeds',
            'millbrook.gif': 'millbrook',
            'myers.gif': 'myers',
            'nd.gif': 'newdesign',
            'nestledown.gif': 'nestledown',
            'obc.gif': 'original bedstead',
            'Protectabed.gif': 'protectabed',
            'rauch.gif': 'rauch',
            'relaxsan.gif': 'relaxsan',
            'relyon.gif': 'relyon',
            'rest_assured.gif': 'rest assured',
            'richman.gif': 'richman',
            'sealy.gif': 'sealy',
            'shakespeare.gif': 'shakespeare',
            'silentnight.gif': 'silentnight',
            'sleepeezee.gif': 'sleepeezee',
            'sleepshaper.gif': 'sleepshaper',
            'sleepyvalley.gif': 'sleepyvalley',
            'slumberland.gif': 'slumberland',
            'staples.gif': 'staples',
            'steens.gif': 'steens',
            'swanglen.gif': 'swanglen',
            'sweetdreams.gif': 'sweetdreams',
            'tss.gif': 'the sleep shop',
            'verona.jpg': 'verona',
            'welcome.gif': 'welcome furniture',
        }
        # Unknown logos fall back to remove_extension() of the logo value.
        # brand_logo[0] raises IndexError when no logo was found.
        product_loader.add_value(
            'brand', brands.get(brand_logo[0],
                                remove_extension(brand_logo[0])))
        product = product_loader.load_item()
        # multiply() is defined elsewhere; presumably it yields one
        # (price, name) pair per combination of the option groups - confirm.
        for opt_price, opt_name in multiply(opt_groups):
            prod = Product(product)
            prod['name'] = (prod['name'] + ' ' + opt_name).strip()
            try:
                # Base price plus the discounted option surcharge, rounded to
                # two decimal places.
                prod['price'] = (Decimal(prod['price']) +
                                 Decimal(opt_price) * discount).quantize(
                                     Decimal('1.00'))
            except TypeError:
                # Any TypeError in the calculation results in a zero price.
                prod['price'] = Decimal(0)
            prod['identifier'] = prod['identifier'] + ':' + opt_name
            yield prod
Ejemplo n.º 15
0
            bushnell_product = self.bushnell_products.get(
                loader.get_output_value('sku').upper().strip(), None)
            if bushnell_product:
                category = bushnell_product['Class']
                self.log('Extracts category "%s" from bushnell file, URL: %s' %
                         (category, loader.get_output_value('url')))

        if category:
            if isinstance(category, list):
                for cat in category:
                    loader.add_value('category', cat)
            else:
                loader.add_value('category', category)
        else:
            loader.add_value('category', '')

        if item.get('shipping_cost', None):
            loader.add_value(
                'shipping_cost',
                extract_price2uk(item['shipping_cost'])
                if not isinstance(item['shipping_cost'], Decimal) else
                item['shipping_cost'])

        for synonym_field, field in synonym_fields.items():
            if synonym_field in item:
                value = item[synonym_field]
                loader.add_value(field, value)

        product = loader.load_item()
        return product
Ejemplo n.º 16
0
    def parse(self, response):
        """Follow category links and yield the products listed on the page.

        Every link in the left-hand navigation is requested with this same
        callback. Products are read from two layouts: the ``productsBox``
        grid and the ``cartTbl`` list. Entries lacking a name, URL or price
        are logged and skipped; in the list layout the joined price text must
        also be accepted by ``extract_price2uk``.

        :param response: a category or listing page.
        :return: generator of ``Request`` objects and loaded items.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        categories = hxs.select("//td[@class='columnLeft']/div[@id='nav']//a/@href").extract()
        for category in categories:
            url = urljoin_rfc(base_url, category)
            yield Request(url, callback=self.parse)

        items_table = hxs.select("//table[@class='productsBox']/tr/td[@class='newProducts']")
        for item in items_table:
            name = item.select("h2/a/text()").extract()
            if not name:
                logging.error("%s - ERROR! NO NAME!" % response.url)
                continue
            name = name[0]
            url = item.select("h2/a/@href").extract()
            if not url:
                logging.error("%s - ERROR! NO URL!" % response.url)
                continue
            url = urljoin_rfc(base_url, url[0])
            price = item.select(".//div[@class='price']/text()").extract()
            if not price:
                logging.error("%s - %s - ERROR! NO PRICE!" % (response.url, name))
                continue
            # The last text node of the price block is used.
            price = price[-1]
            loader = ProductLoader(item=Product(), response=response)
            # The name is used unchanged as identifier: on Python 2, str() on
            # a unicode name with non-ASCII characters raises
            # UnicodeEncodeError.
            loader.add_value("identifier", name)
            loader.add_value("name", name)
            loader.add_value("url", url)
            loader.add_value("price", price)
            yield loader.load_item()

        items_list = hxs.select("//table[@class='cartTbl']/tr")
        for item in items_list:
            name = item.select("td[2]/a/text()").extract()
            if not name:
                logging.error("%s - ERROR! NO NAME!" % response.url)
                continue
            name = name[0]
            url = item.select("td[2]/a/@href").extract()
            if not url:
                logging.error("%s - ERROR! NO URL!" % response.url)
                continue
            url = urljoin_rfc(base_url, url[0])
            price = item.select("td[2]/text()").extract()
            if not price:
                logging.error("%s - %s - ERROR! NO PRICE!" % (response.url, name))
                continue
            # All text nodes of the cell are joined and parsed as one price.
            price = " ".join(price)
            if not extract_price2uk(price):
                logging.error("%s - %s - ERROR! NO PRICE!" % (response.url, name))
                continue
            loader = ProductLoader(item=Product(), response=response)
            # See above: no str() coercion, to stay safe with non-ASCII names.
            loader.add_value("identifier", name)
            loader.add_value("name", name)
            loader.add_value("url", url)
            loader.add_value("price", price)
            yield loader.load_item()
Ejemplo n.º 17
0
    def parse_products(self, response):
        """Parse a product listing page and follow pagination.

        For each listed product the name, URL, price and image are read.
        Products whose add-to-cart link exposes an id are yielded directly;
        the others are sent to ``self.parse_identifier`` with the partially
        loaded item in ``meta['product']``. The page-level breadcrumb text is
        stored in the ``brand`` field. Products that cannot be parsed are
        skipped with a logged warning instead of being dropped silently.

        :param response: a listing page.
        :return: generator of items and ``Request`` objects.
        """
        import logging

        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # The breadcrumb is page-level, so it is read once rather than once
        # per product. It stays None when the page has no breadcrumb.
        category_texts = hxs.select('//span[@class="navigation_page"]'
                                    '/text()').extract()
        category = category_texts.pop().strip() if category_texts else None

        for product in hxs.select('//ul[@id="product_list"]/li'):
            try:
                identifier = product\
                    .select('.//a[contains(@class, "ajax_add_to_cart_button")]/@href')\
                    .re(r'^.*&id_product=(\d+)&token')
                name = product.select('.//h3/a/@title').extract().pop().strip()
                url = urljoin_rfc(
                    base_url,
                    product.select('.//h3/a/@href').extract().pop().strip())
                price = extract_price2uk(
                    product.select(
                        './/div[@class="content_price"]'
                        '/*[@class="price"]/text()').extract().pop().strip())
                image = urljoin_rfc(
                    base_url,
                    product.select('.//a[@class="product_img_link"]'
                                   '/img/@src').extract().pop().strip())
            except Exception as exc:
                # Best effort: a product with a missing mandatory field is
                # skipped, but the reason is now recorded.
                logging.warning("Skipping product on %s: %r",
                                response.url, exc)
                continue

            loader = ProductLoader(response=response, item=Product())
            loader.add_value('name', name)
            if identifier:
                loader.add_value('identifier', identifier)
            loader.add_value('url', url)
            loader.add_value('brand', category)
            loader.add_value('price', price)
            loader.add_value('image_url', image)
            item = loader.load_item()

            if identifier:
                yield item
            else:
                # No id on the listing: resolve it from the product page.
                yield Request(url,
                              meta={'product': item},
                              callback=self.parse_identifier)

        next_page = hxs.select(
            '//div[@id="pagination"]'
            '//li[@id="pagination_next"]/a/@href').extract()

        if next_page:
            yield Request(urljoin_rfc(base_url,
                                      next_page.pop().strip()),
                          callback=self.parse_products)
Ejemplo n.º 18
0
    def parse_product(self, response):
        """Parse a product page and yield the product or its option variants.

        A base product is loaded from the page: name (first ``<h1>`` text),
        price, stock, identifier/sku (the ``product_id`` input), image, url,
        breadcrumb categories (all but the last link) and brand.

        * If the page has no ``<h1>`` text nothing is yielded.
        * If no price is displayed the price is ``Decimal(0)`` and the stock
          is ``0``; otherwise the stock is ``1``.
        * If the page has no relevant option ``<select>`` boxes (VAT and
          delivery selectors are ignored) the base product is yielded as is.
        * Otherwise one item is yielded per combination of option values:
          option names are appended to the name, option surcharges
          (``(+£x)`` in the option label) are added to the price and the
          sorted option ids are appended to the identifier.

        NOTE(review): ``price + option_price`` assumes ``extract_price2uk``
        returns something that can be added to a ``Decimal`` -- confirm.
        """
        hxs = HtmlXPathSelector(response)

        name = hxs.select('//h1/text()').extract()
        if not name:
            return
        name = name[0]
        identifier = hxs.select(
            '//input[@name="product_id"]/@value').extract()[0]
        price = hxs.select(
            '//div[@class="price"]/div[@id="myoc-lpu"]/text()').extract()
        if price:
            price = extract_price2uk(price[0])
            stock = 1
        else:
            # No price on the page: report a zero price and no stock.
            price = Decimal(0)
            stock = 0

        loader = ProductLoader(selector=hxs, item=Product())
        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('stock', stock)
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_xpath('image_url', '//a[@class="thumbnail"]/img/@src')
        loader.add_value('url', response.url)
        loader.add_value('shipping_cost', 0)
        # The last breadcrumb link is skipped on purpose ([:-1]).
        for category in hxs.select(
                '//ul[@class="breadcrumb"]/li/a/text()')[:-1].extract():
            loader.add_value('category', category)
        loader.add_xpath('brand', '//li[contains(text(), "Brand")]/a/text()')
        product = loader.load_item()

        option_boxes = hxs.select(
            '//select[@class="form-control" and contains(@id, "option")\
                        and not(contains(./option/., "V.A.T."))\
                        and not(contains(./option/., "VAT"))\
                        and not(contains(./option/., "Delivery"))]')
        if not option_boxes:
            yield product
            return

        # Matches the "(+£12.34)" surcharge suffix of an option label.
        surcharge_re = re.compile(u'\(\+\xa3(.*)\)')

        # option id -> {'name': cleaned label, 'price': Decimal surcharge}
        options_dict = {}
        # One list of option ids per <select> box, in page order.
        options = []
        for option_box in option_boxes:
            option_group = []
            for option in option_box.select(
                    './option[@value!="" and not(contains(.,"VAT Exempt"))]'):
                option_id = option.select('./@value')[0].extract()
                option_name = option.select('./text()')[0].extract()
                surcharge = surcharge_re.search(option_name)
                option_price = Decimal(
                    surcharge.group(1)) if surcharge else Decimal('0.00')

                # Strip the VAT prefix and the surcharge from the label.
                option_name = re.sub('VAT Payable ?-? ?', '', option_name)
                option_name = surcharge_re.sub('', option_name).strip()
                options_dict[option_id] = {
                    'name': option_name,
                    'price': option_price
                }
                option_group.append(option_id)
            options.append(option_group)

        for combination in itertools.product(*options):
            option_name = ' '.join(
                options_dict[option_id]['name'] for option_id in combination)
            option_price = sum(
                options_dict[option_id]['price'] for option_id in combination)
            # Sorted so the identifier does not depend on <select> order.
            option_identifier = '-'.join(sorted(combination))

            # Yield an independent copy per combination: re-yielding the one
            # shared item would let later iterations overwrite the fields of
            # items that were already handed to the engine.
            variant = Product(product)
            variant['identifier'] = '-'.join((identifier, option_identifier))
            variant['price'] = price + option_price
            variant['name'] = fix_spaces(' '.join((name, option_name)))
            yield variant
Ejemplo n.º 19
0
    def parse(self, response):
        """Crawl category links and yield the products listed on the page.

        Every link in the left-hand navigation is followed with this same
        callback.  Products are then read from two possible layouts:

        * the ``productsBox`` grid, where the last text node of the price
          ``div`` is used as the price;
        * the ``cartTbl`` list, where all text nodes of the second cell are
          joined and the row is skipped if ``extract_price2uk`` finds no
          price in them.

        Rows missing a name, url or price are logged and skipped.  The
        product name doubles as the identifier.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        def build_item(name, url, price):
            # Shared by both layouts.  The identifier is the name itself;
            # it is deliberately NOT wrapped in str(), which raises
            # UnicodeEncodeError for non-ASCII names under Python 2.
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('identifier', name)
            loader.add_value('name', name)
            loader.add_value('url', url)
            loader.add_value('price', price)
            return loader.load_item()

        categories = hxs.select(
            "//td[@class='columnLeft']/div[@id='nav']//a/@href").extract()
        for category in categories:
            yield Request(urljoin_rfc(base_url, category),
                          callback=self.parse)

        # Layout 1: product grid.
        items_table = hxs.select(
            "//table[@class='productsBox']/tr/td[@class='newProducts']")
        for item in items_table:
            name = item.select("h2/a/text()").extract()
            if not name:
                logging.error("%s - ERROR! NO NAME!", response.url)
                continue
            name = name[0]

            url = item.select("h2/a/@href").extract()
            if not url:
                logging.error("%s - ERROR! NO URL!", response.url)
                continue
            url = urljoin_rfc(base_url, url[0])

            price = item.select(".//div[@class='price']/text()").extract()
            if not price:
                logging.error("%s - %s - ERROR! NO PRICE!",
                              response.url, name)
                continue
            # The last text node of the price div is taken as the price.
            yield build_item(name, url, price[-1])

        # Layout 2: product list.
        items_list = hxs.select("//table[@class='cartTbl']/tr")
        for item in items_list:
            name = item.select("td[2]/a/text()").extract()
            if not name:
                logging.error("%s - ERROR! NO NAME!", response.url)
                continue
            name = name[0]

            url = item.select("td[2]/a/@href").extract()
            if not url:
                logging.error("%s - ERROR! NO URL!", response.url)
                continue
            url = urljoin_rfc(base_url, url[0])

            price = item.select("td[2]/text()").extract()
            if not price:
                logging.error("%s - %s - ERROR! NO PRICE!",
                              response.url, name)
                continue
            # The price is somewhere in the cell's loose text nodes.
            price = " ".join(price)
            if not extract_price2uk(price):
                logging.error("%s - %s - ERROR! NO PRICE!",
                              response.url, name)
                continue
            yield build_item(name, url, price)
Ejemplo n.º 20
0
    def load_item(self, item, name, identifier, price, response):
        """Populate and return a ``ProductLoader`` for one listing page.

        :param item: selector of the listing page; its ``response.url`` is
            used as the product url.
        :param name: product name to store.
        :param identifier: product identifier to store.
        :param price: price to store; when ``None`` the price is obtained
            from ``self._get_item_price(item)``.
        :param response: response whose ``meta['item_meta']`` dict supplies
            the ``self._match_fields`` values and, optionally, the brand.
        :returns: the filled ``ProductLoader`` (``load_item()`` is left to
            the caller).

        The brand comes from ``item_meta`` when present, otherwise from the
        first item-specifics row (labels "Brand", "Marke", "Hersteller",
        "Marque") that yields non-blank text.  Stock is only extracted when
        ``self._extract_stock_amount`` is set.
        """
        import logging

        breadcrumb = item.select(
            '//td[@id="vi-VR-brumb-lnkLst"]//span[@itemprop="name"]/text()'
        ).extract()
        # The last breadcrumb entry is used as the category.
        category = breadcrumb[-1] if breadcrumb else ''
        seller_id = ''.join(
            item.select('.//*[contains(@class, "si-content")]'
                        '//a/*[@class="mbg-nw"]/text()').extract())

        brand = response.meta['item_meta'].get('brand')
        if not brand:
            # (label, path below the value cell), tried in this order until
            # one of them yields non-blank text.
            brand_sources = (
                ('Brand', '//text()'),
                ('Brand', '/h2/text()'),
                ('Brand', '/h3/text()'),
                ('Marke', '//text()'),
                ('Hersteller', '//text()'),
                ('Marque', '//text()'),
            )
            for label, value_path in brand_sources:
                brand_xpath = (
                    '//*[@class="attrLabels" and contains(text(), "%s")]'
                    '/following-sibling::*[1]%s' % (label, value_path))
                # A real list (not a lazy filter object) so the emptiness
                # checks below behave the same on Python 2 and Python 3.
                brand = [text for text in item.select(brand_xpath).extract()
                         if text.strip() != '']
                if brand:
                    break

        product_loader = ProductLoader(item=Product(), selector=item)
        for field in self._match_fields:
            product_loader.add_value(
                field, response.meta['item_meta'].get(field, None))
        product_loader.add_value('name', name)
        product_loader.add_value('category', category)
        product_loader.add_value('dealer', 'eBay - ' + seller_id)
        product_loader.add_value('identifier', identifier)
        if brand:
            # Scraped brands arrive as a list of text nodes; a brand taken
            # from item_meta is stored as given.
            if isinstance(brand, list):
                product_loader.add_value('brand', brand[0])
            else:
                product_loader.add_value('brand', brand)
        product_loader.add_xpath('image_url', '//img[@id="icImg"]/@src')
        product_loader.add_value('url', item.response.url)
        price = price if price is not None else self._get_item_price(item)
        product_loader.add_value('price', price)

        # stock amount
        if self._extract_stock_amount:
            stock = ''
            try:
                in_stock = ''.join(
                    item.select('//*[@id="qtySubTxt"]//text()').extract())
                if 'More than' in in_stock:
                    # "More than N available" is recorded as 11.
                    stock = 11
                else:
                    # Keep the longest run of digits (first one on ties).
                    digit_runs = re.findall(r"\d+", in_stock)
                    if digit_runs:
                        stock = max(digit_runs, key=len)
            except Exception:
                # Stock is best-effort: never lose the product over it, but
                # leave a trace instead of swallowing the error silently.
                logging.exception("Could not extract stock amount")
            if stock:
                product_loader.add_value('stock', stock)

        # shipping cost
        try:
            shipping_cost = item.select(
                '//*[@id="shippingSection"]//td/div/text()').extract()[0]
            if shipping_cost:
                if 'free' in shipping_cost.lower():
                    product_loader.add_value('shipping_cost', 0)
                else:
                    product_loader.add_value('shipping_cost',
                                             extract_price2uk(shipping_cost))
        except IndexError:
            # No shipping section on the page: leave shipping_cost unset.
            pass

        return product_loader
Ejemplo n.º 21
0
    def parse_product(self, response):
        """Parse a product page; yield the product or its variation requests.

        The base product (name, price, brand, sku, image, url, identifier,
        shipping cost, stock) is loaded from the page.  Stock is ``1`` when
        a price is displayed and ``0`` (with price ``0``) otherwise.

        If the page has no ``.product-variations select`` elements the base
        product is yielded.  Otherwise one GET ``FormRequest`` is yielded
        per non-empty option of the first select, handled by
        ``self.variant_sum``.  Each request carries in ``meta``: a copy of
        the base product (``product``), ``option_name``, ``product_name``,
        ``product_id`` and ``next_options`` (the ``data-variations-menu``
        ids of the selects that remain after the one used for the url).
        """
        base_url = get_base_url(response)
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(selector=hxs, item=Product())

        brand = hxs.select(
            '//section[contains(@class, "product-variations")]'
            '/div/div[2]/span/@data-img-attributes').extract()
        if not brand:
            brand = hxs.select(
                '//u[./a/@href="#product-details"]'
                '/preceding-sibling::div[last()]'
                '//@data-img-attributes').extract()
        if brand:
            # The brand is the alt text inside the attribute.  When there is
            # no alt="..." fall back to the same empty value used when the
            # attribute is missing, instead of raising IndexError.
            alt_match = re.search(r'alt="(.*)"', brand[0])
            brand = alt_match.group(1) if alt_match else []

        sku = hxs.select(
            '//h1[contains(@class, "product-title")]'
            '/following-sibling::p/text()').extract()
        # The sku is whatever follows '#'; tolerate a missing paragraph.
        sku = re.findall(r'\#(.*)', sku[0]) if sku else []

        image_url = hxs.select('//a[@class="thumbnail"]/@href').extract()
        if image_url:
            image_url = urljoin(base_url, image_url[0])

        price = hxs.select('//span[@class="price-price"]/text()').extract()
        if price:
            price = extract_price2uk(price[0])
            stock = 1
        else:
            price = 0
            stock = 0

        product_id = response.xpath(
            '//input[@name="product_id"]/@value').extract_first()
        name = response.xpath(
            '//h1[contains(@class, "product-title")]/text()').extract()[0]

        loader.add_value('name', name)
        loader.add_value('price', price)
        loader.add_value('brand', brand)
        loader.add_value('sku', sku)
        loader.add_value('image_url', image_url)
        loader.add_value('url', response.url)
        loader.add_value('identifier', product_id)
        loader.add_value('shipping_cost', 0)
        loader.add_value('stock', stock)
        product = loader.load_item()

        selects = response.css('.product-variations select')
        if not selects:
            yield product
            return

        selected_ids = response.xpath(
            '//@data-selected-variable-ids').extract_first()
        # extract_first() returns None when the attribute is absent, and
        # json.loads(None) would raise TypeError.
        selected_ids = json.loads(selected_ids) if selected_ids else {}

        # Options of the first select drive the requests below.
        options = selects.pop(0).xpath('option[@value!=""]')
        get_sum_url = 'http://www.bigbrandbeds.co.uk/admin/controller/ProductVariations/getVariationData?productId=%s'
        get_menu_url = 'http://www.bigbrandbeds.co.uk/admin/controller/ProductVariations/getVariationsMenuData?productId=%s&optionId=%s'

        # Start from the pre-selected variables that have a value.
        if selected_ids:
            form = {'variableIds[%s]' % var_id: str(selected_ids[var_id])
                    for var_id in selected_ids if selected_ids[var_id]}
        else:
            form = dict()

        if selects:
            # More selects follow: ask for the next select's menu data.
            url = get_menu_url % (
                product_id,
                selects.pop(0).xpath('@data-variations-menu').extract_first())
        else:
            # Single select: ask for the variation data directly.
            url = get_sum_url % product_id

        # Menu ids of the selects still left after the pops above; the same
        # for every request, so extracted once.
        if selects:
            remaining_menus = selects.xpath('@data-variations-menu').extract()
        else:
            remaining_menus = []

        for option in options:
            option_type_id = option.xpath(
                '../@data-variations-menu').extract_first()
            if option_type_id:
                form['variableIds[%s]' % option_type_id] = option.xpath(
                    '@value').extract_first()
            option_name = option.xpath('text()').extract_first()
            request = FormRequest(url, formdata=form,
                                  method="GET",
                                  callback=self.variant_sum,
                                  dont_filter=True)
            request.meta['product'] = Product(product)
            request.meta['option_name'] = option_name
            request.meta['product_name'] = name
            request.meta['product_id'] = product_id
            # Each request gets its own list object.
            request.meta['next_options'] = list(remaining_menus)
            yield request