Example #1
0
    def _populate_from_html(self, response, product):
        """Fill ``product`` with data scraped from the product page markup.

        Fields are handled in this order: image URL, Open Graph data (via
        ``_populate_from_open_graph_product``), price, brand, title,
        description and reseller id. ``self._unify_price`` runs last.

        NOTE(review): ``cond_set`` / ``cond_set_value`` are project helpers;
        presumably they only fill fields that are still empty, which is why
        the order of the calls below is kept as is -- confirm.

        :param response: page response exposing ``css()`` and ``url``.
        :param product: item being populated; modified in place.
        """
        image_sources = response.css('[itemprop=image]::attr(src)').extract()
        cond_set(product, 'image_url', image_sources,
                 lambda url: urlparse.urljoin(response.url, url))

        _populate_from_open_graph_product(response, product)

        price_texts = response.css('.currentPrice ins::text').extract()
        cond_set(product, 'price', price_texts, unicode.strip)

        brand_texts = response.css('[itemprop=brand]::text').extract()
        cond_set(product, 'brand', brand_texts)
        if not product.get('brand'):
            # Record pages where no brand could be found.
            dump_url_to_file(response.url)

        title_texts = response.css('[itemprop=name]::text').extract()
        cond_set(product, 'title', title_texts)

        long_desc = response.css('#longDesc article').extract()
        cond_set_value(product, 'description',
                       long_desc[0] if long_desc else None)

        # The reseller id is the run of digits directly before "-pdt" in
        # the page URL.
        id_matches = re.findall("(\d+)-pdt", response.url)
        cond_set_value(product, 'reseller_id',
                       id_matches[0] if id_matches else None)

        self._unify_price(product)
Example #2
0
 def _scrape_product_links(self, response):
     """Yield a ``(request, product)`` pair for every product box found.

     Each product is pre-populated from its listing box and attached to
     the request meta under ``'product'``. The request is routed to
     ``self.parse_product``.

     :param response: listing page response.
     """
     for product_box in self._fetch_product_boxes(response):
         product_url = urlparse.urljoin(
             response.url, self._link_from_box(product_box))

         item = SiteProductItem()
         self._populate_from_box(response, product_box, item)
         if not product.get('brand') if False else not item.get('brand'):
             dump_url_to_file(response.url)

         request_meta = response.meta.copy()
         request_meta['product'] = item

         # Rotate the shared user-agent list: take the head and move it
         # to the tail so consecutive requests use different agents.
         agent = USER_AGENT_LIST.pop(0)
         USER_AGENT_LIST.append(agent)

         product_request = Request(
             product_url, callback=self.parse_product, meta=request_meta)
         product_request.headers.setdefault('User-Agent', agent)
         yield product_request, item
Example #3
0
    def parse_product(self, response):
        """Populate the product stored in ``response.meta`` from its page.

        Sets title, brand (guessed from the title when missing), price,
        image URL, description, stock flag, reseller id, locale and model.

        :param response: product page response; ``response.meta['product']``
            holds the item to fill in.
        :returns: the result of ``self._request_related_products(response)``
            when ``self.fetch_related_products`` is truthy, otherwise the
            populated product.
        """
        product = response.meta['product']
        cond_set(product, 'title',
                 response.css('#pdpProduct h1::text').extract(),
                 lambda s: string.strip(s, ' \n'))

        if not product.get('brand'):
            brand = guess_brand_from_first_words(
                (product.get('title') or '').strip())
            if brand:
                product['brand'] = brand

        if not product.get('brand'):
            dump_url_to_file(response.url)

        if product.get('price') is None:
            currency = response.css('.currency::text').extract()
            currency = currency[0] if currency else ''
            price = response.css('.actualprice .price::text').re(r'\d+')
            price = price[0] if price else ''
            cond_set_value(product, 'price', currency + price)

        # The stored price may still be missing or None at this point;
        # treat that as "no pound sign" instead of raising on `in None`.
        raw_price = product.get('price') or ''
        if u'£' not in raw_price:
            self.log('Invalid price at: %s' % response.url, level=ERROR)
        else:
            product['price'] = Price(
                price=raw_price.replace(u'£', '').strip(),
                priceCurrency='GBP')

        cond_set(product, 'image_url',
                 response.css('#mainimage.photo::attr(src)').extract(),
                 lambda url: urlparse.urljoin(response.url, url))
        cond_set(product, 'description',
                 response.css('.fullDetails').extract(), _inner_html)
        cond_set(product, 'is_out_of_stock',
                 response.css('#globalDeliveryGrey[style="display:block;"]'),
                 bool)

        # A single scalar (or None) goes through cond_set_value; cond_set is
        # used everywhere else with a list of candidate values.
        reseller_id = re.findall(r'partNumber/(\d+)', response.url)
        cond_set_value(product, 'reseller_id',
                       reseller_id[0] if reseller_id else None)

        # Hardcoded
        cond_set_value(product, 'locale', 'en-GB')

        # NOTE(review): the trailing "." in the pattern matches any character,
        # so the captured group never includes the last character of the
        # text node -- confirm this is intended (vs. an escaped "\.").
        cond_set(
            product, 'model',
            response.xpath('//div[@class="fullDetails"]/ul/li/text()').re(
                r'EAN:\s(.*).'))

        if self.fetch_related_products:
            return self._request_related_products(response)
        return product
Example #4
0
    def _populate_from_html(self, response, product):
        """Fill ``product`` from the item page markup.

        After the hardcoded fields are applied, this sets title, price,
        marketplace (when a seller name is present), image URL, description,
        canonical URL, brand, model and reseller id, in that order.

        :param response: item page response exposing ``css()``, ``xpath()``
            and ``url``.
        :param product: item being populated; modified in place.
        """
        self._populate_hardcoded_fields(product)

        title_texts = response.css('#itemTitle::text').extract()
        cond_set(product, 'title', title_texts)

        price_texts = response.css(
            '[itemprop=price]::text , #mm-saleDscPrc::text').extract()
        cond_set(product, 'price', price_texts, self._unify_price)

        seller_names = response.xpath(
            '//div[@class="mbg"]/a/span/text()').extract()
        if seller_names:
            # The marketplace entry carries whatever price is set so far.
            product["marketplace"] = [{
                "name": seller_names[0].strip(),
                "price": product.get("price")
            }]

        image_sources = response.css('[itemprop=image]::attr(src)').extract()
        cond_replace(product, 'image_url', image_sources)

        description_nodes = response.xpath(
            '//*[@id="vi-desc-maincntr"]/node()[normalize-space()]').extract()
        cond_set_value(product, 'description', description_nodes, ''.join)

        canonical_links = response.css('[rel=canonical]::attr(href)').extract()
        cond_replace(product, 'url', canonical_links)

        # Brand and model live in the same attribute table; only the label
        # text differs between the two lookups.
        attribute_xpath = ('//td[@class="attrLabels" and '
                           'contains(text(), "%s")]'
                           '/following-sibling::td/span/text()')

        cond_set(product, 'brand',
                 response.xpath(attribute_xpath % 'Brand:').extract())
        if not product.get('brand'):
            dump_url_to_file(response.url)

        cond_set(product, 'model',
                 response.xpath(attribute_xpath % 'Model:').extract())

        id_matches = re.findall("-\/([^\/&?\.\s]+)", response.url)
        cond_set_value(product, 'reseller_id',
                       id_matches[0] if id_matches else None)
Example #5
0
    def parse_product(self, response):
        """Populate the product stored in ``response.meta`` and return it.

        Collects buyer reviews (star icons counted per review block), title,
        price, description, image URL, locale, brand, URL, UPC, related
        products and the stock flag.

        NOTE(review): ``is_empty(seq, default)`` is a project helper;
        presumably it returns the first element of ``seq`` or ``default``
        when ``seq`` is empty -- confirm.

        :param response: product page response.
        :returns: the populated product item.
        """
        prod = response.meta['product']
        reviews = response.xpath('//div[@id="review_loading"]/'
                                 'following::div[contains(@id, "review_")]')
        if reviews:
            total = len(reviews)
            stars = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
            rating_sum = 0

            for review in reviews:
                stars_count = len(review.xpath('./i[@class="icon-star"]'))
                if stars_count == 0:
                    # Blocks without star icons are not counted as reviews.
                    total -= 1
                    continue
                stars[stars_count] += 1
                rating_sum += stars_count

            # All matched blocks may have been discarded above.
            avg = float(rating_sum) / float(total) if total else float(0)
            prod['buyer_reviews'] = BuyerReviews(total, avg, stars)
        else:
            prod['buyer_reviews'] = ZERO_REVIEWS_VALUE

        title = response.xpath(
            '//h2[@class="product_name product_title"]/span[@itemprop="name"]/text()'
        ).extract()
        cond_set(prod, 'title', title)

        price_text = is_empty(
            response.xpath(
                '//div[@class="price"]/text() |'
                ' //div[@class="price"]/div/text()').extract(), "")
        # Each match is a (character before the number, number) tuple.
        price = is_empty(re.findall(r'(.?)(\d+.\d+)', price_text), 0)
        if price:
            priceCurrency = self.convert_currency[is_empty(price, "")]
            prod["price"] = Price(priceCurrency=priceCurrency, price=price[1])

        des = response.xpath(
            '//div[@class="clearfix text_box margin_after bg_white"]'
            '| //div[@class="wide_page"]').extract()
        if len(des) < 1:
            des = response.xpath(
                "//span[contains(@class,'product_smallprint')]").extract()
        cond_set(prod, 'description', des)

        img_url = response.xpath('//img[@id="product_image"]/@src').extract()
        cond_set(prod, 'image_url', img_url)

        cond_set(prod, 'locale', ['en-US'])

        cond_set(prod, 'brand', ['NO BRAND'])
        if not prod.get('brand'):
            dump_url_to_file(response.url)

        prod['url'] = unicode(response.url)

        cond_set(
            prod, 'upc',
            response.xpath("//script[contains(text(),'window.product = ')]").
            re(r"'id' : \"(\d+)\""))

        items = response.xpath(
            '//a[contains(@class,"g-med")] | //a[contains(@class,"g-large")]')
        related = []
        for item in items:
            name = item.xpath('.//img/@title').extract()
            link = item.xpath('.//@href').extract()
            if name and link:
                name = is_empty(name, "")
                # Previously `(link, "")`, which stored a (list, str) tuple
                # as the URL instead of the first extracted href.
                link = is_empty(link, "")
                related.append(RelatedProduct(title=name, url=link))

        prod['related_products'] = {'Similar Products': related}

        available = response.xpath(
            '//meta[@property="og:price:availability"]/@content').extract()
        if 'preorder' in available:
            prod['is_out_of_stock'] = True
        elif 'instock' in available:
            prod['is_out_of_stock'] = False

        return prod
Example #6
0
    def parse_product(self, response):
        """Populate the product from its page and chain follow-up requests.

        Sets title, brand, UPC, price, image URL, reseller id, description
        (only when ``self.DO_DESCRIPTION`` is truthy), locale and the
        out-of-stock flag. When ``self._extract_rr_parms`` yields a payload,
        a Reevoo review request is stored in ``response.meta['reevoo']`` and
        a request built from ``self.SCRIPT_URL`` and the payload is returned.

        :param response: product page response;
            ``response.meta['product']`` holds the item to fill in.
        :returns: a ``Request`` handled by ``self._parse_rr_json`` when a
            payload is available, otherwise the populated product.
        """
        product = response.meta['product']

        cond_set(
            product, 'title',
            response.xpath("//section[@itemscope]/h1"
                           "/span[@itemprop='name']/text()").extract())

        cond_set(
            product, 'brand',
            response.xpath("//section[@itemscope]/h1"
                           "/span[@itemprop='brand']/text()").extract())

        if not product.get('brand'):
            dump_url_to_file(response.url)

        cond_set(
            product, 'upc',
            response.xpath("//section[@itemscope]/meta[@itemprop='identifier']"
                           "/@content").extract())

        price = response.xpath(
            "//section[@itemscope]/div[contains(@class,'productDetail')]"
            "/section[contains(@class,'description')]"
            "/div/div[contains(@class,'productPrices')]"
            "/span[@itemprop='price']/ins/text()").re(FLOATING_POINT_RGEX)

        if price:
            product['price'] = Price(price=price[0], priceCurrency='GBP')

        cond_set(
            product, 'image_url',
            response.xpath(
                "//section[@itemscope]/descendant::section[@class='productMedias']"
                "/div[@id='currentView']/a/img/@src").extract())

        reseller_id = re.findall(r"(\d+)-pdt", response.url)
        reseller_id = reseller_id[0] if reseller_id else None
        cond_set_value(product, "reseller_id", reseller_id)

        if self.DO_DESCRIPTION:
            cond_set(product, 'description',
                     response.xpath("//section[@id='longDesc']").extract())

        cond_set_value(product, 'locale', "en-GB")

        out_of_stock = response.xpath(
            "//div[contains(@class,'productDetail')]"
            "/section[@class='col3']/div[@class='nested']"
            "/strong/text()").re(r"Out of stock")
        if out_of_stock:
            product['is_out_of_stock'] = True

        payload = self._extract_rr_parms(response)
        if not payload:
            # Check before reading payload['p']: previously the lookup ran
            # first, so an empty payload raised and this branch was
            # unreachable.
            self.log("No {rr} payload at %s" % response.url, DEBUG)
            return product

        productid = payload['p']
        # Overrides any UPC taken from the page markup above.
        product['upc'] = productid

        review_url = (
            'http://mark.reevoo.com/reevoomark/en-GB/product?sku={sku}'
            '&trkref=PCG').format(sku=productid)
        reevoo_meta = response.meta.copy()
        # Let the review callback see 404 responses as well.
        reevoo_meta['handle_httpstatus_list'] = [404]
        reevoo_request = Request(url=review_url,
                                 callback=self._parse_reevoo,
                                 meta=reevoo_meta)
        response.meta['reevoo'] = reevoo_request

        # Copied after the assignment above so the prepared review request
        # travels along with the next request.
        rr_meta = response.meta.copy()
        rr_url = urlparse.urljoin(self.SCRIPT_URL,
                                  "?" + urllib.urlencode(payload))
        return Request(rr_url, self._parse_rr_json, meta=rr_meta)
Example #7
0
    def parse_product(self, response):
        """Populate the product from embedded page data, falling back to markup.

        The page body is searched for a ``page_products`` object literal,
        which is converted to JSON and parsed. When parsing yields data, the
        price, stock flag, title, image URL, URL and brand come from it;
        otherwise they are scraped from the markup. Description and buyer
        reviews always come from the markup first.

        NOTE(review): ``is_empty(seq, default)`` is a project helper;
        presumably it returns the first element of ``seq`` or ``default``
        when ``seq`` is empty -- confirm.

        :param response: product page response;
            ``response.meta['product']`` holds the item to fill in.
        :returns: a ``Request`` for related products (handled by
            ``self._related_parse``) when the needed ids are found,
            otherwise the populated product.
        """
        product = response.meta['product']
        # Decode once; the text is searched several times below.
        body = response.body_as_unicode()

        # An explicit "" default keeps the concatenation valid when the
        # pattern is absent; the resulting "}" then fails JSON parsing and
        # takes the markup fallback instead of raising.
        raw_data = is_empty(
            re.findall("page_products\'\:\s+([^\}]*)", body), "") + "}"

        try:
            data = json.loads(raw_data.strip().replace("'", "\""))
        except ValueError:
            data = {}

        product["description"] = is_empty(
            response.xpath(
                "//div[contains(@class, 'prd-description')]").extract())

        average = is_empty(
            response.xpath(
                "//span[contains(@class, 'b-rating-average')]/text()").extract(
                ))
        total = is_empty(
            response.xpath("//h2[@class='b-ttl-2']/span/text()").re(
                FLOATING_POINT_RGEX))
        if average and total:
            product["buyer_reviews"] = BuyerReviews(num_of_reviews=total,
                                                    average_rating=average,
                                                    rating_by_star={})

        if data:
            product["price"] = Price(price=data["prod_price"],
                                     priceCurrency=data["currency"])

            product["is_out_of_stock"] = not bool(int(data["stock_available"]))
            product["title"] = data["prod_name"]
            product["image_url"] = data["prod_image_url"]
            product["url"] = data["prod_url"]
            if not product["description"]:
                product["description"] = data["description"]
            product["brand"] = data["brand"]

        else:
            price = is_empty(
                response.xpath("//span[contains(@class, 'price')]/text()").re(
                    FLOATING_POINT_RGEX), None)
            if price:
                product["price"] = Price(price=price, priceCurrency="GBP")
            product["title"] = is_empty(
                response.xpath(
                    "//h1[contains(@class, 'b-ttl-main')]/text()").extract())
            product["image_url"] = is_empty(
                response.xpath(
                    "//*[@id='cart-form']/div[2]/div[1]/div/div/a/@href").
                extract())
            product["url"] = response.url
            product["brand"] = is_empty(
                response.xpath("//span[@itemprop='brand']/text()").extract())

        if not product.get('brand'):
            dump_url_to_file(response.url)

        cond_set_value(product, 'locale', "en-GB")

        if "You May Also Like" in body:
            catId = is_empty(re.findall("cat_id\'\:\s+(\d+)", body))
            sid = is_empty(re.findall("sid\'\:\s+\"([^\"]*)", body))
            if catId and sid and "item_id" in data:
                url = "http://www.rakuten.co.uk/api/recommendation?" \
                    "category_id=%s" \
                    "&item_id=%s" \
                    "&shop_id=%s" % (catId, data["item_id"], sid)
                return Request(url=url,
                               callback=self._related_parse,
                               meta={"product": product})
        return product
Example #8
0
    def parse_product(self, response):
        """Populate the product from its page; generator of results.

        Sets URL, locale, title, image URL, price, description, brand,
        stock flag and model. When the page embeds a
        ``window.ebuyer.config`` object, a recommendations request (built
        from ``self.SCRIPT_URL``) is yielded first; the populated product is
        always yielded last.

        :param response: product page response;
            ``response.meta['product']`` holds the item to fill in.
        """
        product = response.meta['product']

        product['url'] = response.url
        product['locale'] = 'en_GB'

        titles = response.xpath('//h1[@class="product-title"]/text()').extract()
        if titles:
            product['title'] = titles[0].strip()

        image_sources = response.xpath('//img[@itemprop="image"]/@src').extract()
        if image_sources:
            product['image_url'] = urlparse.urljoin(
                response.url, image_sources[0])

        prices = response.xpath('//span[@itemprop="price"]/text()').re(
            FLOATING_POINT_RGEX)
        if prices:
            product['price'] = Price(price=prices[0], priceCurrency='GBP')

        # Prefer the dedicated description block; fall back to the list
        # marked with itemprop="description".
        descriptions = (
            response.xpath('//div[@class="product-description"]').extract()
            or response.xpath('//ul[@itemprop="description"]').extract())
        if descriptions:
            product['description'] = descriptions[0].strip()

        brand_names = response.xpath('//img[@itemprop="logo"]/@alt').extract()
        if brand_names:
            product['brand'] = brand_names[0]

        if not product.get('brand'):
            dump_url_to_file(response.url)

        availability = response.xpath(
            '//p[@itemprop="availability"]/@content').extract()
        if availability:
            product['is_out_of_stock'] = availability[0] != 'in_stock'

        skus = response.xpath('//strong[@itemprop="sku"]/text()').extract()
        if skus:
            product['model'] = skus[0]

        config_matches = re.findall(r'window.ebuyer.config\s=\s(.*);',
                                    response.body_as_unicode())
        if config_matches:
            config = json.loads(config_matches[0])
            api_key = config['richRelevance']['apiKey']
            item_id = config['product']['id']
            session_id = config['sessionId']
            query = urllib.urlencode({
                'a': api_key,
                'p': item_id,
                's': session_id,
                'pt': '|item_page.recs_1|item_page.recs_2',
                'l': 1,
            })
            request_meta = response.meta.copy()
            request_meta['item_id'] = item_id
            yield Request(self.SCRIPT_URL + query,
                          callback=self.get_recommended_id,
                          meta=request_meta)
        yield product
Example #9
0
    def parse_product(self, response):
        """Populate the product from its page, chaining review/recommendation requests.

        On the first pass (``response.meta['reviewed']`` falsy) a page with a
        reviews link returns a request for the review HTML straight away,
        carrying the scraped average/total and this response in its meta;
        the remaining fields are not filled in on that pass.
        NOTE(review): presumably ``populate_reviews`` re-enters this method
        with ``reviewed`` set -- confirm.

        Otherwise sets title, brand, price, description, image URL, locale,
        reseller id, URL, stock flag and limited-stock info.

        :param response: product page response;
            ``response.meta['product']`` holds the item to fill in.
        :returns: a ``Request`` (reviews or recommendations) or the populated
            product.
        """
        reviewed = response.meta.get('reviewed')
        prod = response.meta['product']

        # if there was no any request for item review try to send it
        if not reviewed:
            revs_a = response.xpath('//a[@class="read_reviews_action"]')
            if revs_a:
                avg = revs_a.xpath(
                    './/span[@itemprop="ratingValue"]/text()').extract()
                total = revs_a.xpath(
                    './/span[@itemprop="ratingCount"]/text()').extract()
                rev_url = response.url + '/reviewhtml/all'
                meta = response.meta.copy()
                meta['avg'] = avg
                meta['total'] = total
                meta['initial_response'] = response
                return Request(rev_url,
                               callback=self.populate_reviews,
                               meta=meta)
            else:
                cond_set_value(prod, 'buyer_reviews', ZERO_REVIEWS_VALUE)

        title = response.xpath(
            '//div[@class="product-summary"]/h1/text()').extract()
        cond_set(prod, 'title', title)

        # Keep the brand as a plain value while deciding on the fallback.
        # It used to be wrapped in a one-element list, which is always
        # truthy, so the title-based guess below could never run.
        brand = is_empty(
            re.findall(r'"manufacturer":\s"(.*)",', response.body), None)
        if not brand and prod.get('title'):
            brand = guess_brand_from_first_words(prod['title'])
        if brand:
            cond_set(prod, 'brand', [brand])

        if not prod.get('brand'):
            dump_url_to_file(response.url)

        price = response.xpath(
            '//p[@class="new-price"]/meta[@itemprop="price"]/@content'
        ).extract()
        priceCurrency = response.xpath(
            '//p[@class="new-price"]/meta[@itemprop="priceCurrency"]/@content'
        ).extract()
        if (price and priceCurrency
                and re.match(r"\d+(.\d+){0,1}", price[0])):
            prod["price"] = Price(priceCurrency=priceCurrency[0],
                                  price=price[0])
        else:
            # Missing or non-numeric price: fall back to a zero GBP price.
            prod["price"] = Price(priceCurrency="GBP", price=0.00)

        des = response.xpath('//div[@class="productDescription"]').extract()
        cond_set(prod, 'description', des)

        img_url = response.xpath(
            '//div[@class="product-images"]/img/@src').extract()
        cond_set(prod, 'image_url', img_url)

        cond_set(prod, 'locale', ['en-US'])

        if not prod.get("reseller_id"):
            reseller_id = response.xpath(
                './/*[@itemprop="sku"]/text()').extract()
            cond_set(prod, 'reseller_id', reseller_id)

        prod['url'] = response.url

        available = response.xpath(
            '//form[contains(@id,"addToCartForm")]/input[@type="submit"]/@value'
        ).extract()

        if available and 'Email when back in stock' in available[0]:
            cond_set(prod, 'is_out_of_stock', [True])

        if available and 'Last few in store' in available[0]:
            lim = LimitedStock(is_limited=True, items_left=[1])
            cond_set(prod, 'limited_stock', [lim])

        prod_id = re.findall(r'"id":\s"(.*)",', response.body)
        if prod_id:
            recomm_url = self.RECOMM_URL.format(prod_id=prod_id[0])
            return Request(recomm_url,
                           callback=self.populate_recommendations,
                           meta=response.meta.copy())

        return prod
Example #10
0
    def parse_product(self, response):
        """Populate the product and chain review / related-product requests.

        The method is entered twice per product:

        * first pass (``response.meta['after_reviews']`` falsy): scrape the
          product fields, then return a buyer-reviews request when a product
          id could be found;
        * second pass: reuse ``jsessionid`` and ``product_id`` from
          ``response.meta`` and request related products.

        :param response: product page response (first pass) or the response
            handed back after reviews were populated (second pass);
            ``response.meta['product']`` holds the item.
        :returns: a ``Request`` for reviews or related products, or the
            product when neither can be requested.
        """
        product = response.meta['product']
        # case when we parse first response of product as usual
        if not response.meta.get('after_reviews'):

            cond_set(product, 'title', response.xpath(
                "//div[@class='description']/h1[@itemprop='name']/text()"
            ).extract())

            cond_set(product, 'brand', response.xpath(
                "//div[@class='product-media-top']/"
                "img[@id='product_brand_img']/@alt"
            ).extract())

            if not product.get('brand'):
                dump_url_to_file(response.url)

            cond_set(product, 'image_url', response.xpath(
                "//div[@class='product-media-top']/noscript"
                "/a[@id='product_image_ref']/img/@src").extract())

            price = response.xpath(
                "//p[@id='product_price']/span[@itemprop='price']"
                "/text()").re(FLOATING_POINT_RGEX)
            if price:
                product['price'] = Price(
                    price=price[0], priceCurrency='GBP')

            cond_set(product, 'description', response.xpath(
                "//div[@id='product_details_container']"
                "/div[@class='description']"
            ).extract())

            # Last path segment of the product URL (before any query string).
            reseller_id = re.findall(r"\/([a-z\d]+)(?:$|\?)",
                                     product.get('url', ''))
            reseller_id = reseller_id[0] if reseller_id else None
            cond_set_value(product, "reseller_id", reseller_id)

            stock_status = response.xpath(
                '//link[@itemprop="availability"]/@href'
            ).extract()
            if stock_status:
                # NOTE(review): an "OutOfStock" availability sets
                # `is_in_store_only`, not `is_out_of_stock` -- confirm this
                # mapping is intended.
                product['is_in_store_only'] = 'OutOfStock' in stock_status[0]

            cond_set_value(product, 'locale', "en-GB")

            # try to extract some data for additional request for
            # recommendations
            jsessionid = response.xpath(
                '//input[@id="jsessionid_value_V1_MR_rr"]/@value'
            ).extract()
            product_id = response.xpath(
                '//input[@id="product_value_v1_th_rr"]/@value'
            )
            product_id = product_id or response.css(
                '[itemprop=productID]::text'
            )
            product_id = product_id.extract()
            product_id = product_id[0] if product_id else None

            # for reviews and model(may be another than for recommendations)
            prod_id = re.findall(r"'ecomm_prodid':\s'(.*)'", response.body)
            if prod_id:
                prod_id = prod_id[0].strip()
                product['model'] = prod_id
            if prod_id or product_id:
                # populate buyer reviews
                rev_url = self.REVS_BASE.format(prod_id=prod_id or product_id)
                meta = response.meta.copy()
                meta['jsessionid'] = jsessionid
                meta['product_id'] = product_id
                return Request(rev_url, callback=self.populate_buyer_reviews,
                               meta=meta)
            else:
                self.log('Could not scrape buyer reviews '
                         '(product id could not be scraped)')

        # case when we use this function second time after populating
        # buyer reviews
        else:
            jsessionid = response.meta.get('jsessionid')
            product_id = response.meta.get('product_id')

        # Both branches above leave `jsessionid` and `product_id` bound.
        if jsessionid and product_id:
            scheme = 'V1_MR_rr'
            url = self.generate_related_url(jsessionid, product_id, scheme)
            return Request(url, callback=self.populate_related,
                           meta={'product': product,
                                 'jsessionid': jsessionid,
                                 'product_id': product_id},
                           dont_filter=True)
        return product