Beispiel #1
0
 def _populate_from_html(self, response, product):
     """Fill ``product`` with fields scraped from the page in ``response``.

     Reads the reseller id from the URL (the digits following ``/sku``),
     then title, brand, price and image URL from the markup, appends the
     specification markup to the description and finally asks
     ``_get_model_from_title`` to process the product.

     A non-empty price that is not yet a ``Price`` is converted to an IDR
     ``Price`` when it contains ``Rp``; otherwise it is left as scraped and
     the URL is logged.

     NOTE(review): ``cond_set``, ``cond_replace`` and ``cond_replace_value``
     are defined outside this block; they are presumably conditional
     setters working on the given value(s) -- confirm their exact rules.
     """
     # The complete match list is handed over, just like the selector
     # results below.
     cond_set(product, 'reseller_id',
              re.findall(r'\/sku(\d+)', response.url))
     cond_set(product, 'title',
              response.css('[itemprop=name]::text').extract())
     cond_set(product, 'brand',
              response.css('#ctl00_content_lnkBrand::text').extract())
     cond_set(product, 'price',
              response.css('[itemprop=price]::text').extract())

     raw_price = product.get('price', '')
     if raw_price and not isinstance(raw_price, Price):
         if 'Rp' not in raw_price:
             self.log('Unrecognized currency at %s' % response.url)
         else:
             # Drop the currency marker and the comma separators so only
             # the bare number is passed on.
             product['price'] = Price(
                 price=raw_price.lower().replace(
                     'rp', '').replace(',', '').strip(),
                 priceCurrency='IDR'
             )

     cond_replace(product, 'image_url',
                  response.css('#prodMedia img::attr(src)').extract())

     # Only the first specification block is used; it is appended to any
     # description the product already carries.
     specs = response.css('.spesifications').extract()
     specs = specs[0] if specs else ''
     description = product.get('description', '') + specs.strip()
     cond_replace_value(product, 'description', description)
     self._get_model_from_title(product)
Beispiel #2
0
 def _unify_price(self, product):
     price = product.get('price')
     if not price:
         return
     price_match = re.search('\$ *([, 0-9]+(?:\.[, 0-9]+)?)', price)
     if price_match:
         price = price_match.group(1)
         price = ''.join(re.split('[ ,]+', price))
     cond_replace_value(product, 'price', Price('USD', price))
Beispiel #3
0
    def _price_from_html(self, response, product):
        """Read the price microdata of the page into ``product['price']``.

        The ``content`` attribute of the price element inside
        ``.product-price-bol`` is tried first, then the one inside the
        ``offer_price`` span. The currency comes from the first
        ``priceCurrency`` element and defaults to ``EUR``. The price is
        wrapped in a ``Price`` only when, after turning commas into dots,
        it looks like a plain decimal number.

        NOTE(review): ``cond_replace`` / ``cond_set`` are defined outside
        this block; the second lookup is presumably a fallback that only
        applies when the first one found nothing -- confirm.
        """
        css = '.product-price-bol [itemprop=price]::attr(content)'
        cond_replace(product, 'price', response.css(css).extract())
        cond_set(
            product,
            'price',
            response.xpath(
                "//span[@class='offer_price']/meta[@itemprop='price']/@content"
            ).extract())

        currency = response.css('[itemprop=priceCurrency]::attr(content)')
        currency = currency.extract()[0] if currency else 'EUR'

        # ``or ''`` also covers a 'price' key that is present but None.
        price = (product.get('price') or '').replace(',', '.')
        if price and re.match(r' *\d+\.?\d* *\Z', price):
            cond_replace_value(product, 'price', Price(currency, price))
    def _populate_from_html(self, response, product):
        """Fill ``product`` with fields scraped from the page in ``response``.

        Sets image URL, title, brand, delivery flags, price, model, reseller
        id (first run of digits following a slash in the URL) and
        description, and strips the fragment from ``product['url']``.

        The price is turned into a GBP ``Price`` when a pound amount is
        found either in the scraped price or, failing that, in the last
        entry of the ``pricing`` list; otherwise the scraped value is left
        as it is.

        Raises:
            KeyError: if ``product`` has no ``'url'`` entry.
        """
        cond_set(product, 'image_url',
                 response.css('.largeimage::attr(src)').extract())
        cond_set(product, 'title',
                 response.css('.productname::text').extract())
        cond_set(product, 'brand',
                 response.css('.productbrand [itemprop=name]::text').extract())

        delivery_opts = [bool(opt.css('.available'))
                         for opt in response.css('.deliverycallout li')]
        # Summing booleans counts the available options and does not rely
        # on filter() returning a list.
        opt_len = sum(delivery_opts)
        if opt_len:
            # Guard the index: a page listing a single delivery option
            # would otherwise raise IndexError here.
            second_available = len(delivery_opts) > 1 and delivery_opts[1]
            cond_set_value(product, 'is_in_store_only',
                           second_available and opt_len == 1)
        else:
            # NOTE(review): no available delivery option marks the product
            # as *not* out of stock -- confirm this is intended.
            cond_set_value(product, 'is_out_of_stock', False)

        cond_set(product, 'price',
                 response.css('[itemprop=price]::text').extract(),
                 unicode.strip)
        cond_set(product, 'model',
                 response.css('[itemprop=model]::text').extract())

        reseller_id = re.findall(r"\/(\d+)", response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, "reseller_id", reseller_id)

        # Pound sign followed by an amount; the regex part is a raw string
        # so its escapes reach the regex engine untouched.
        pound_amount = u'\xa3' + r' *\d[\d, .]*'
        # A missing price is treated as empty text so the regex calls
        # below always receive a string.
        price = product.get("price") or u''
        if not re.findall(pound_amount, price):
            fallback = response.xpath(
                "//ul[contains(@class, 'pricing')]/li[last()]/span/text()"
            ).extract()
            # An empty extraction must not leave a list behind: re.findall
            # would raise TypeError on it.
            price = fallback[0].strip() if fallback else u''
        price = re.findall(pound_amount, price)
        if price:
            price = re.sub(u'[\xa3, ]+', '', price[0])
            cond_replace_value(product, 'price',
                               Price(priceCurrency='GBP', price=price))

        xpath = '//div[@id="pdpTab1"]/node()[normalize-space()]'
        cond_set_value(product, 'description',
                       response.xpath(xpath).extract(), ''.join)
        # Drop the URL fragment, if any.
        product['url'] = product['url'].rsplit('#', 1)[0]
Beispiel #5
0
    def parse_product_old(self, response):
        """Populate the product carried in ``response.meta`` and return it.

        Description, title, URL and image come from the page markup and its
        Open Graph metadata; locale, price (USD) and brand come from
        dedicated elements of the page.

        Returns:
            The product object taken from ``response.meta['product']``.
        """
        prod = response.meta['product']
        # populate_from_open_graph is not available because the page has
        # no type=product.
        metadata = _extract_open_graph_metadata(response)
        description = response.xpath(
            '//p[@itemprop="description"]//text()'
        ).extract()
        if description:
            cond_set_value(prod, 'description', description[0])
        else:
            cond_set_value(prod, 'description', metadata.get('description'))
        cond_set_value(prod, 'title', metadata.get('title'))
        cond_replace_value(prod, 'url', metadata.get('url'))

        # Remove the thumbnail query as a suffix. str.rstrip() would treat
        # the argument as a set of characters and could eat the end of the
        # real file name; a missing image must not crash either.
        img_url = metadata.get('image')
        thumbnail_suffix = '?$browse_thumbnail$'
        if img_url and img_url.endswith(thumbnail_suffix):
            img_url = img_url[:-len(thumbnail_suffix)]
        cond_set_value(prod, 'image_url', img_url)

        locale = response.xpath(
            '//meta[@name="gwt:property"]/@content'
        ).re(r'locale=\s*(.*)')
        if locale:
            cond_set_value(prod, 'locale', locale[0])

        price_text = response.xpath(
            '//span[@itemprop="price"]//span[contains(@class,"price-sales")]//text()'
        ).extract()
        price = None
        if price_text:
            # Commas belong to the number (thousands separators); they are
            # matched here and removed afterwards so that "1,299.00" is
            # not cut down to "1".
            amounts = re.findall(r'[\d.,]+', price_text[0])
            if amounts:
                price = amounts[0].replace(",", "")
        # Only the sale-price markup is read here; without a usable value
        # the product keeps whatever price it already had.
        if price:
            prod['price'] = Price(
                priceCurrency='USD',
                price=price
            )

        brand = response.xpath(
            '//meta[@itemprop="brand"]/@content'
        ).extract()
        cond_set(prod, 'brand', brand)
        return prod
    def _populate_from_html(self, response, product):
        """Fill ``product`` with fields scraped from the page in ``response``.

        Sets price, title, brand, image URL, stock flag, reseller id (digits
        between a slash and the next slash or question mark of the URL) and
        description, then delegates to ``_populate_related_products`` and
        ``_populate_buyer_reviews``. Finally a scraped price is converted to
        a GBP ``Price``.

        NOTE(review): ``_itemprop`` is defined outside this block; it is
        presumably a microdata lookup helper -- confirm its return types.
        """
        # Pound sign followed by the amount; the regex part is a raw string
        # so its escapes reach the regex engine untouched.
        cond_set(product, 'price', response.css('.price span::text').re(
            u'\u00a3' + r'([\d, .]+)'))
        cond_set(product, 'title', _itemprop(response, 'model'), unicode.strip)
        cond_set(product, 'brand',
                 _itemprop(_itemprop(response, 'brand', False), 'name'),
                 unicode.strip)
        cond_set(product, 'image_url', _itemprop(response, 'image', False)
                 .css('img::attr(src)').extract())
        # A placeholder picture is treated as "no image".
        image = product.get('image_url')
        if image and image.endswith('noImage.gif'):
            del product['image_url']
        cond_set_value(product, 'is_out_of_stock',
                       response.css('.stockMessaging::text').re(
                           'out of stock|Discontinued product'),
                       bool)

        reseller_id = re.findall(r"\/([0-9]+)[\/\?]", response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, "reseller_id", reseller_id)

        details = response.css('.prodDetailsContainer').xpath(
            'node()[normalize-space()]')
        # Nodes containing a form are left out of the description.
        details = [d.extract() for d in details if not d.css('form')]
        if details:
            cond_set_value(product, 'description', details, conv=''.join)
        self._populate_related_products(response, product)
        self._populate_buyer_reviews(response, product)

        price = product.get('price', None)
        if price == 0:
            del product['price']
        elif price:
            # Strip every comma and space. The former pattern ', ' only
            # matched a comma directly followed by a space, so "1,299.00"
            # kept its thousands separator.
            price = re.sub('[, ]', '', price)
            cond_replace_value(product, 'price', Price(priceCurrency='GBP',
                                                       price=price))
 def _unify_price(self, product):
     """Run ``product['price']`` through the USD price converter.

     The converter is built by ``unify_price`` / ``unify_decimal``, which
     are defined outside this block; its result replaces the price via
     ``cond_replace_value``.

     Raises:
         KeyError: if ``product`` has no ``'price'`` entry.
     """
     raw_price = product['price']
     decimal_rule = unify_decimal(', ', '.')
     to_usd = unify_price(['USD'], {'$': 'USD'}, decimal_rule, 'USD')
     cond_replace_value(product, 'price', to_usd(raw_price))
Beispiel #8
0
 def _unify_price(self, product):
     """Run the UTF-8 encoded ``product['price']`` through the converter.

     The converter is built by ``unify_price`` from ``valid_currency_codes``
     and ``CURRENCY_SIGNS`` (all defined outside this block); its result
     replaces the price via ``cond_replace_value``.

     Raises:
         KeyError: if ``product`` has no ``'price'`` entry.
     """
     encoded_price = product['price'].encode('utf-8')
     decimal_rule = unify_decimal(', ', '.')
     convert = unify_price(valid_currency_codes, CURRENCY_SIGNS,
                           decimal_rule)
     cond_replace_value(product, 'price', convert(encoded_price))
 def _unify_price(self, product):
     price = product.get('price')
     if not price:
         return
     cond_replace_value(product, 'price', Price('USD', price))
Beispiel #10
0
    def _populate_from_html(self, response, product):
        """Fill ``product`` with fields scraped from the page in ``response``.

        Sets title, model, price (USD), image URL, description, brand,
        canonical URL (query string removed), reseller id (digits between a
        slash and a dot in the URL), locale and stock flag, and delegates to
        ``_populate_related_products`` and ``_buyer_reviews_from_html``.

        Brand and locale are additionally read from the ``quillMData`` JSON
        embedded in the page when that script is present.

        NOTE(review): ``_itemprop`` and ``_strip_non_ascii`` are defined
        outside this block -- confirm their return types.
        """
        cond_set(product, 'title', _itemprop(response, 'name'), unicode.strip)
        product['title'] = _strip_non_ascii(product.get('title', ''))
        cond_set(product, 'model', _itemprop(response, 'model'),
                 lambda s: s.replace(u'\xa0 Model # ', ''))
        cond_set(product, 'price', _itemprop(response, 'price'))
        cond_set(product, 'image_url',
                 response.css('.skuImageSTD::attr(src)').extract(),
                 lambda url: urljoin(response.url, url))
        xpath = '//div[@id="divDescription"]/div[@class="qOverflow"]' \
                '/node()[normalize-space()]'
        cond_set_value(product, 'description',
                       response.xpath(xpath).extract(), ''.join)

        # First fallback: the whole description tab.
        if not product.get("description"):
            desc = response.xpath("//div[@id='SkuTabDescription']").extract()
            if desc:
                product["description"] = desc[0]
        # Candidate brands are those known brands the title starts with.
        cond_set(
            product, 'brand',
            filter(
                product.get('title', '').startswith,
                response.meta.get('brands', [])))
        # Second fallback: every non-blank node of the description div.
        if product.get('description', '') == '':
            xpath = '//div[@id="divDescription"]/node()[normalize-space()]'
            product['description'] = ''.join(response.xpath(xpath).extract())
        self._populate_related_products(response, product)

        price = product.get('price')
        if price:
            if price.startswith('$'):
                price = re.sub('[$ ,]+', '', price)
                product['price'] = Price(priceCurrency='USD', price=price)
            else:
                self.log('Incorrect price format %s at %s' %
                         (price, response.url))
                product['price'] = None
        self._buyer_reviews_from_html(response, product)
        cond_replace_value(product, 'url', response.url.split('?', 1)[0])

        reseller_id = re.findall(r"\/(\d+)\.", response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, "reseller_id", reseller_id)

        data_script = re.findall(r'quillMData\s=\s(.*)</script>',
                                 response.body_as_unicode())
        if data_script:
            j = json.loads(data_script[0])
            brand = j.get('brandName')
            if brand:
                # NOTE(review): brand[0] is the first element of a list but
                # only the first character of a string -- confirm the type
                # of 'brandName'.
                cond_set_value(product, 'brand', brand[0])

            locale = j['culturecode']
            cond_set_value(product, 'locale', locale)
        else:
            # Without this guard a page lacking the script raised
            # IndexError and the stock detection below never ran.
            self.log('quillMData not found at %s' % response.url)

        if not product.get("is_out_of_stock"):
            red_span = response.xpath('//span[@class="red"]/text()').extract()
            if red_span:
                s = 'Currently out of stock'
                if s in red_span[0]:
                    cond_set_value(product, 'is_out_of_stock', True)
            else:
                cond_set_value(product, 'is_out_of_stock', False)
 def _unify_price(self, product):
     """Run the UTF-8 encoded ``product['price']`` through the GBP converter.

     The converter is built by ``unify_price`` / ``unify_decimal`` with
     ``SYM_GBP`` (all defined outside this block); its result replaces the
     price via ``cond_replace_value``.

     Raises:
         KeyError: if ``product`` has no ``'price'`` entry.
     """
     encoded_price = product['price'].encode('utf-8')
     decimal_rule = unify_decimal(', ', '.')
     to_gbp = unify_price(['GBP'], {SYM_GBP: 'GBP'}, decimal_rule, 'GBP')
     cond_replace_value(product, 'price', to_gbp(encoded_price))
Beispiel #12
0
    def parse_product_new(self, response):
        """Populate the product carried in ``response.meta`` and return it.

        Starts from ``populate_from_open_graph``, fixes the locale to
        ``en_US`` and then reads title, price (USD), brand, description,
        the in-store-only flag and the recommended related products from
        the page.

        Returns:
            The product object taken from ``response.meta['product']``.
        """
        prod = response.meta['product']
        populate_from_open_graph(response, prod)

        prod['locale'] = 'en_US'

        og_title = response.xpath(
            '//meta[@property="og:title"]/@content'
        ).extract()
        if og_title:
            cond_set_value(prod, 'title', og_title[0].capitalize())

        # The sale price wins; the regular price is only looked up when no
        # sale price was found.
        price_texts = response.xpath(
            '//div[@class="sales-price-container"]'
            '/span[contains(@class, "salesprice")]/text()'
        ).extract() or response.xpath(
            '//div[@class="product-price"]/span/text()'
        ).extract()
        if price_texts and '$' in price_texts[0]:
            amount = price_texts[0].strip().replace('$', '')
            amount = amount.replace(',', '').strip()
            prod['price'] = Price(priceCurrency='USD', price=amount)

        cond_set(prod, 'brand', response.xpath(
            '//meta[@itemprop="brand"]/@content'
        ).extract())

        # The description is read again from the page because the one in
        # the metadata may be wrong.
        page_description = response.xpath(
            '//p[@itemprop="description"]/text()'
        ).extract()
        if page_description:
            cond_replace_value(prod, 'description',
                               page_description[0].strip())

        product_messages = response.xpath(
            '//li[@class="product-message"]'
        ).extract()
        prod['is_in_store_only'] = bool(product_messages)

        related_divs = response.xpath(
            '//div[@id="relatedProducts"]/div[contains(@class, '
            '"recommendations")]//div[@itemprop="isRelatedTo"]'
        )
        seen_names = set()
        recommendations = []
        for related in related_divs:
            link = related.xpath('.//a[@itemprop="url"]/@href').extract()
            name = related.xpath(
                './/meta[@itemprop="name"]/@content').extract()
            if not (name and link):
                continue
            # The site can recommend the same item more than once; entries
            # are de-duplicated on the full list of extracted names.
            name_key = tuple(name)
            if name_key in seen_names:
                continue
            seen_names.add(name_key)
            recommendations.append(
                RelatedProduct(title=name[0].strip().capitalize(),
                               url=link[0].strip()))
        prod['related_products'] = {'recommended': recommendations}
        return prod