Beispiel #1
0
    def parse_review(self, response):
        """Collect the reviews of one review page and follow pagination.

        Each ``div.pr-review-wrap`` on the page is turned into a ``Review``
        item and appended to ``product['metadata']['reviews']``, where
        ``product`` comes from ``response.meta['product']``.

        Yields a ``Request`` for the next review page (same meta, same
        callback) when the page links to one, otherwise yields the product.
        """
        hxs = HtmlXPathSelector(response)
        product = response.meta['product']

        for review in hxs.select(u'//div[@class="pr-review-wrap"]'):
            loader = ReviewLoader(item=Review(),
                                  selector=review,
                                  date_format=u'%m/%d/%Y')
            loader.add_xpath(
                'date',
                u'.//div[contains(@class, "pr-review-author-date")]/text()')

            # Free-text parts are optional: a missing node yields an empty
            # string rather than an IndexError that would abort the page.
            comments = review.select(
                u'.//p[@class="pr-comments"]/text()').extract()
            comments = comments[0] if comments else u''
            bottom_line = review.select(
                u'.//div[@class="pr-review-bottom-line-wrapper"]/p/text()[2]'
            ).extract()
            bottom_line = bottom_line[0] if bottom_line else ''

            # Query the tag lists relative to this review. Querying the page
            # selector would hand every review the tags of all reviews.
            pros = review.select(
                './/div[contains(@class,"pr-attribute-pros")]//li/text()'
            ).extract()
            cons = review.select(
                './/div[contains(@class,"pr-attribute-cons")]//li/text()'
            ).extract()
            best_uses = review.select(
                './/div[contains(@class,"pr-attribute-bestuses")]//li/text()'
            ).extract()

            loader.add_value(
                'full_text',
                u'%s\nBottom Line: %s\nPros: %s\nCons: %s\nBest Uses: %s\n' %
                (comments, bottom_line, u', '.join(pros), u', '.join(cons),
                 u', '.join(best_uses)))

            # The rating text is parsed as a float and truncated to an int.
            rating_text = review.select(
                u'.//span[contains(@class,"pr-rating")]/text()').extract()[0]
            loader.add_value('rating', int(float(rating_text)))
            loader.add_value('url', response.url)

            product['metadata']['reviews'].append(loader.load_item())

        next_url = hxs.select(
            u'//span[@class="pr-page-next"]/a/@href').extract()
        if next_url:
            yield Request(next_url[0],
                          meta=response.meta,
                          callback=self.parse_review)
        else:
            yield product
Beispiel #2
0
 def create_review_loader(self, response, data):
     """Build a ``Review`` item from the recognised entries of ``data``.

     Entries of ``data`` whose key is one of the known review field names
     are added to a ``ReviewLoader`` created with the ``%d/%m/%Y`` date
     format; entries with any other key are ignored.

     Returns the result of the loader's ``load_item()``.
     """
     accepted = ('url', 'date', 'rating', 'product_url', 'sku', 'full_text')
     review_loader = ReviewLoader(
         item=Review(), response=response, date_format=u'%d/%m/%Y')
     for name, value in data.items():
         if name not in accepted:
             continue
         review_loader.add_value(name, value)
     return review_loader.load_item()
    def process_product_reviews(self, response):
        """Scrape one product-review page and follow unvisited review pages.

        Rows of the table inside ``div.boxproductinfo`` are turned into
        ``Review`` items (date when one can be matched, text, url) and
        appended to ``product['metadata']['reviews']``; the product is read
        from ``response.meta['product']``.

        The set ``response.meta['visited_reviews']`` (created on first use)
        records the URLs already processed.  The first pricing-box link that
        contains ``"productreviews"`` and has not been visited is requested
        with this method as callback; when there is none the product is
        yielded.
        """
        hxs = HtmlXPathSelector(response)
        visited_reviews = response.meta.get('visited_reviews', set())
        product = response.meta['product']
        visited_reviews.add(response.url)
        base_url = get_base_url(response)
        for review_box in hxs.select(
                '//div[@class="boxproductinfo"]//table//tr'):
            texts = review_box.select("./td/div[2]/text()").extract()
            if not texts:
                # Row without a review body (presumably a header or spacer
                # row): skip it instead of failing with IndexError.
                continue

            loader = ReviewLoader(item=Review(),
                                  selector=review_box,
                                  date_format='%d/%m/%Y')
            # The date is looked up in the second span text, when present.
            spans = review_box.select(
                "./td/div[1]//p//span/text()").extract()
            date = review_date_format.findall(spans[1]) if len(spans) > 1 else []
            if date:
                loader.add_value('date', date[0])
            loader.add_value('full_text', texts[0].strip('" \r\n"'))
            loader.add_value('url', response.url)
            product['metadata']['reviews'].append(loader.load_item())

        for link in hxs.select(
                '//table[@class="pricingbox"]//a/@href').extract():
            next_page = urlparse.urljoin(base_url, link)
            if "productreviews" in link and next_page not in visited_reviews:
                yield Request(next_page,
                              callback=self.process_product_reviews,
                              meta={
                                  'product': product,
                                  'visited_reviews': visited_reviews
                              })
                # Only one follow-up page is scheduled per response; the
                # product is yielded by the last page of the chain.
                return
        yield product
Beispiel #4
0
    def parse_review(self, response, product=None):
        """Collect the reviews of one review page and follow pagination.

        ``product`` may be passed directly; when it is falsy it is read from
        ``response.meta['product']``.  Each ``div.pr-review-main`` becomes a
        ``Review`` item appended to ``product['metadata']['reviews']``.  The
        rating is the number of matching ``pr-star`` spans in the review.

        Yields the product when there is no usable next-page link, otherwise
        a ``Request`` for the next page with this method as callback.
        """
        hxs = HtmlXPathSelector(response)
        if not product:
            product = response.meta['product']

        for review in hxs.select('//div[@class="pr-review-main"]'):
            loader = ReviewLoader(item=Review(),
                                  selector=review,
                                  date_format=u'%Y-%m-%d')
            loader.add_xpath('date',
                             './/span[@itemprop="dtreviewed"]/@datetime')

            loader.add_xpath('full_text',
                             './/div[@class="pr-review-infos-title"]/text()')
            loader.add_xpath('full_text',
                             './/div[@class="pr-comments"]/text()')
            loader.add_value('product_url', product['url'])
            loader.add_value('url', product['url'])
            loader.add_value('sku', product['sku'])
            stars = review.select(
                './/div[@class="pr-stars pr-stars-small"]/span[contains(@class, "pr-star")]'
            ).extract()
            loader.add_value('rating', len(stars))
            product['metadata']['reviews'].append(loader.load_item())

        next_href = hxs.select(
            '//span[@class="pr-page-next"]/a[@href!="#"]/@href').extract()
        if not next_href:
            yield product
            return

        # Always carry the product in the follow-up meta: when it was passed
        # in as an argument, response.meta need not contain it, and the next
        # page is parsed without the ``product`` argument.
        meta = dict(response.meta)
        meta['product'] = product
        yield Request(urljoin_rfc(get_base_url(response), next_href[0]),
                      callback=self.parse_review,
                      meta=meta)
Beispiel #5
0
    def parse_review(self, response):
        """Turn every ``ul.liste-avis > li`` entry into a ``Review`` item.

        The items are appended to ``product['metadata']['reviews']`` for the
        product found in ``response.meta['product']``, which is yielded at
        the end.  No pagination is followed.
        """
        hxs = HtmlXPathSelector(response)
        product = response.meta['product']
        rating_xpath = './/a[starts-with(@class, "note-produit note-produit-")]/@class'

        for review in hxs.select('//ul[@class="liste-avis"]/li'):
            loader = ReviewLoader(item=Review(),
                                  selector=review,
                                  date_format=u'%m/%d/%Y')
            # The first 8 characters are parsed as dd/mm/yy and re-emitted in
            # the loader's mm/dd/YYYY format.
            date_review = ''.join(
                review.select('.//a[@href="#"]/strong/text()').extract())
            date_review = datetime.datetime.strptime(date_review[:8],
                                                     "%d/%m/%y").date()
            date_review = date_review.strftime("%m/%d/%Y")
            loader.add_value('date', date_review)

            loader.add_value(
                'full_text',
                normalize_space(' '.join(
                    review.select('.//text()').extract())))
            loader.add_value('product_url', product['url'])
            loader.add_value('url', product['url'])
            loader.add_value('sku', product['sku'])

            # Read the rating from this review first; querying only the page
            # selector gives every review the first rating on the page.  The
            # page-wide match is kept as a fallback for reviews that carry no
            # rating link of their own.
            rating_classes = (review.select(rating_xpath).extract()
                              or hxs.select(rating_xpath).extract())
            if rating_classes:
                # The rating is the last character of the class attribute.
                loader.add_value('rating', rating_classes[0][-1])
            product['metadata']['reviews'].append(loader.load_item())

        yield product
Beispiel #6
0
    def parse_review(self, response):
        """Collect the reviews of one review page and follow pagination.

        Each ``tr.singlereview`` becomes a ``Review`` item appended to
        ``product['metadata']['reviews']`` (product taken from
        ``response.meta['product']``).

        When a "Next" pagination link carrying a ``curpage=`` value exists, a
        request for the corresponding AJAX review page is yielded; otherwise
        the product is yielded.
        """
        hxs = HtmlXPathSelector(response)
        product = response.meta['product']

        for review in hxs.select(u'//tr[@class="singlereview"]'):
            item = Review()
            date = review.select(u'.//div[contains(@class,"ltbodytext")]/text()').extract()[0]
            date = date.split('/')
            # Swap the first two slash-separated fields of the date.
            item['date'] = date[1] + '/' + date[0] + '/' + date[2]

            title = review.select(u'.//p[@class="subtitle"]/text()').extract()
            title = title[0] if title else ''
            text = review.select(u'.//div[@class="bodytext"]/p/text()').extract()
            text = text[0] if text else ''
            item['full_text'] = title + '\n' + text

            item['rating'] = int(float(review.select(u'.//div[contains(@class,"rating_avg_sm")]/text()').extract()[0]))

            item['url'] = response.url

            product['metadata']['reviews'].append(item)

        next_url = hxs.select(u'//div[contains(@class,"pagination")]/a[contains(text(),"Next")]/@href').extract()
        # Diagnostic only: this is normal flow, so it is not logged as an error.
        logging.debug('Next review page links: %s', next_url)

        next_page = None
        if next_url and 'curpage=' in next_url[0]:
            next_page = next_url[0].split('curpage=')[1]

        if next_page is not None:
            yield Request('http://www.wayfair.com/ajax/view_reviews_action.php?prsku=%s&rvpg=%s&rvso=0' % (
                product['sku'], next_page), meta=response.meta, callback=self.parse_review)
        else:
            yield product
Beispiel #7
0
    def parse_review_page(self, response):
        """Parse one page of reviews and continue with the next page, if any.

        The HTML to parse is obtained from ``self._extract_html(response)``.
        Every ``div.BVRRReviewDisplayStyle5`` yields a ``Review`` item
        (rating, url, date, text) appended to the ``metadata.reviews`` list
        of the product stored in ``response.meta``.  The date text is read as
        ``%d %B %Y`` and re-emitted as ``%m/%d/%Y``.

        Yields a ``Request`` for the next page when a next-page link is
        present, else the product.
        """
        item_ = response.meta.get('product', '')
        hxs = HtmlXPathSelector(text=self._extract_html(response))

        for review_node in hxs.select(
                '//div[@class="BVRRReviewDisplayStyle5"]'):
            review_loader = ReviewLoader(item=Review(),
                                         response=response,
                                         date_format='%m/%d/%Y')

            rating_xpath = ".//span[contains(@class,'BVRRRatingNumber')]/text()"
            rating = review_node.select(rating_xpath).extract()[0]

            date_xpath = ".//span[contains(@class,'BVRRValue BVRRReviewDate')]/text()"
            raw_date = review_node.select(date_xpath).extract()[0]

            # The second matching text node is used as the review text.
            text_xpath = ".//span[contains(@class,'BVRRReviewText')]/text()"
            review_text = review_node.select(text_xpath)[1].extract()

            review_loader.add_value('rating', rating)
            review_loader.add_value('url', response.url)
            parsed_date = datetime.strptime(raw_date, '%d %B %Y')
            review_loader.add_value('date', parsed_date.strftime('%m/%d/%Y'))
            review_loader.add_value('full_text', review_text)

            item_['metadata']['reviews'].append(review_loader.load_item())

        next_links = hxs.select(
            '//span[@class="BVRRPageLink BVRRNextPage"]/a/@data-bvjsref'
        ).extract()
        if not next_links:
            yield item_
        else:
            yield Request(next_links[0],
                          callback=self.parse_review_page,
                          meta={'product': item_})
Beispiel #8
0
    def parse_review(self, response):
        """Parse one page of reviews and request the next page, if any.

        The product comes from ``response.meta['product']``.  When the page
        holds no review blocks the product is yielded immediately.
        Otherwise each block becomes a ``Review`` item (date, title + summary
        + content text, product url, sku, rating) appended to
        ``product['metadata']['reviews']``.

        A "Next page" link leads to a new ``Request`` (unfiltered, same
        meta); without one the product is yielded.
        """
        hxs = HtmlXPathSelector(response)

        review_nodes = hxs.select(
            u'//div[contains(@id,"BVRRDisplayContentReviewID")]')
        product = response.meta['product']

        if not review_nodes:
            yield product
            return

        for node in review_nodes:
            loader = ReviewLoader(item=Review(),
                                  selector=node,
                                  date_format=u'%m/%d/%Y')

            # The last matching title attribute holds a Y-m-d date, which is
            # re-formatted to the loader's m/d/Y format.
            raw_dates = node.select(
                u'.//span[contains(@class, "BVRRReviewDate")]/span[@class="value-title"]/@title'
            ).extract()
            if raw_dates:
                parsed = time.strptime(raw_dates[-1], u'%Y-%m-%d')
                loader.add_value('date', time.strftime(u'%m/%d/%Y', parsed))

            titles = node.select(
                u'.//span[@class="BVRRValue BVRRReviewTitle"]/text()'
            ).extract()
            title = titles[0] if titles else u'Untitled'

            summary_lines = node.select(
                './/div[@class="BVRRReviewDisplayStyle3Summary"]//text()[normalize-space()]'
            ).extract()
            content_lines = node.select(
                u'.//div[@class="BVRRReviewDisplayStyle3Content"]//text()[normalize-space()]'
            ).extract()
            text = '\n'.join(summary_lines) + '\n' + '\n'.join(content_lines)

            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('product_url', product['url'])
            loader.add_value('url', product['url'])
            loader.add_value('sku', product.get('sku') or '')
            loader.add_xpath(
                'rating',
                u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@class="BVRRNumber BVRRRatingNumber"]/text()'
            )
            product['metadata']['reviews'].append(loader.load_item())

        next_links = hxs.select(
            u'.//a[contains(text(),"Next page")]/@data-bvjsref').extract()
        if next_links:
            yield Request(urljoin_rfc(get_base_url(response), next_links[0]),
                          meta=response.meta,
                          callback=self.parse_review,
                          dont_filter=True)
        else:
            yield product
Beispiel #9
0
    def parse_review(self, response):
        """Parse the reviews embedded in the page and follow the next page.

        Every ``div`` whose id starts with ``BVRRDisplayContentReviewID_``
        becomes a ``Review`` item appended to
        ``product['metadata']['reviews']`` (product from
        ``response.meta['product']``).  The full text is the optional title,
        the review text and optional "Pros:" / "Cons:" lines, newline
        separated.

        Yields a ``Request`` handled by ``self.parse_review_js`` when a
        next-page reference exists, else the product.
        """
        hxs = HtmlXPathSelector(response)
        product = response.meta['product']

        review_nodes = hxs.select(
            u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')
        for node in review_nodes:
            review_loader = ReviewLoader(item=Review(),
                                         selector=node,
                                         date_format="%B %d, %Y")

            # The second text node of the date span is used as the date.
            date_texts = node.select(
                u'.//span[contains(@class,"BVRRReviewDate")]/text()'
            ).extract()
            review_loader.add_value('date', date_texts[1])

            titles = node.select(
                u'.//span[contains(@class,"BVRRCustomFullTitle")]/text()'
            ).extract()
            body = ' '.join(
                node.select(
                    u'.//span[contains(@class,"BVRRReviewText")]/text()'
                ).extract())
            pros = node.select(
                u'.//span[contains(@class,"BVRRReviewProTags")]/span/text()'
            ).extract()
            cons = node.select(
                u'.//span[contains(@class,"BVRRReviewConTags")]/span/text()'
            ).extract()

            # Assemble the newline-separated sections of the full text.
            sections = []
            if titles:
                sections.append(titles[0])
            sections.append(body)
            if pros:
                sections.append('Pros: ' + ', '.join(pros))
            if cons:
                sections.append('Cons: ' + ', '.join(cons))
            review_loader.add_value('full_text', '\n'.join(sections))

            # The rating is the first whitespace-separated token of the
            # image's title attribute.
            rating_title = node.select(
                u'.//img[@class="BVImgOrSprite"]/@title').extract()[0]
            review_loader.add_value('rating', rating_title.split()[0])
            review_loader.add_value('url', response.url)

            product['metadata']['reviews'].append(review_loader.load_item())

        next_refs = hxs.select(
            u'//a[contains(@name,"BV_TrackingTag_Review_Display_NextPage")]/@data-bvjsref'
        ).extract()
        if not next_refs:
            yield product
        else:
            yield Request(next_refs[0],
                          meta=response.meta,
                          callback=self.parse_review_js)
Beispiel #10
0
    def parse_review(self, response):
        """Parse one page of reviews, then paginate or fetch the price.

        Every ``div`` whose id starts with ``BVRRDisplayContentReviewID_``
        becomes a ``Review`` item appended to
        ``product['metadata']['reviews']`` (product from
        ``response.meta['product']``).

        While a next-page link exists, a ``Request`` for it is yielded with
        this method as callback.  On the last page a regional-price request
        is yielded instead, built from the last path segment of the product
        url and handled by ``self.parse_price``.
        """
        hxs = HtmlXPathSelector(response)
        product = response.meta['product']

        review_divs = hxs.select(
            u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')
        for review_div in review_divs:
            review_loader = ReviewLoader(item=Review(),
                                         selector=review_div,
                                         date_format="%B %d, %Y")
            review_loader.add_xpath(
                'date', u'.//span[contains(@class,"BVRRReviewDate")]/text()')

            heading = review_div.select(
                u'.//span[contains(@class,"BVRRReviewTitle")]/text()'
            ).extract()
            body = ' '.join(
                review_div.select(
                    u'.//span[contains(@class,"BVRRReviewText")]/text()'
                ).extract())
            full_text = heading[0] + '\n' + body if heading else body

            pros = review_div.select(
                u'.//span[contains(@class,"BVRRReviewProTags")]/span/text()'
            ).extract()
            cons = review_div.select(
                u'.//span[contains(@class,"BVRRReviewConTags")]/span/text()'
            ).extract()
            # Tag lists are appended only when they are non-empty.
            for prefix, tags in (('\nPros: ', pros), ('\nCons: ', cons)):
                if tags:
                    full_text += prefix + ', '.join(tags)

            review_loader.add_value('full_text', full_text)
            review_loader.add_xpath(
                'rating',
                u'.//span[contains(@class,"BVRRRatingNumber")]/text()')
            review_loader.add_value('url', response.url)

            product['metadata']['reviews'].append(review_loader.load_item())

        next_links = hxs.select(
            u'//div[contains(@class,"BVRRNextPage")]/a/@href').extract()
        if next_links:
            yield Request(next_links[0],
                          meta=response.meta,
                          callback=self.parse_review)
            return

        # No further review pages: continue with the price lookup.
        product_ref = product['url'].rsplit('/', 1)[-1]
        price_url = ('http://www.homedepot.ca/async-fetch-regional-price?storeId=9999&pnList='
                     + product_ref)
        yield Request(price_url,
                      meta=response.meta,
                      callback=self.parse_price)
Beispiel #11
0
    def parse_review(self, response):
        """Parse one page of reviews and request the next page via a form post.

        ``response.meta['products']`` is the list of products the reviews
        belong to; parsed reviews are attached to the first product only.
        When the page holds no reviews, or there is no enabled "next" button,
        every product in the list is yielded.  Otherwise a ``FormRequest``
        for the next page is yielded, with ``current_page`` tracked in meta.
        """
        hxs = HtmlXPathSelector(response)

        reviews = hxs.select(u'//div[contains(@class, "review-item")]')
        products = response.meta['products']

        if not reviews:
            for product in products:
                yield product
            return

        # Reviews are stored on the first product.  An empty product list is
        # tolerated consistently: the sku is blank and nothing is appended,
        # instead of failing on ``products[0]`` after the review was built.
        first_product = products[0] if products else None
        if first_product is not None:
            sku = first_product.get('sku') or ''
        else:
            sku = ''

        for review in reviews:
            loader = ReviewLoader(item=Review(), selector=review, date_format=u'%m/%d/%Y')

            review_id = review.select('@data-review-id').extract()[0]
            loader.add_value('review_id', review_id)

            # "Written on <Month d, YYYY>" is converted to m/d/Y.
            date = review.select(u'.//div[@class="item-author"]//text()').re(r'Written on (.*)')[0].strip()
            date = time.strptime(date, u'%B %d, %Y')
            date = time.strftime(u'%m/%d/%Y', date)

            loader.add_value('date', date)

            title = review.select(u'./h2/a/text()').extract()
            title = title[0] if title else u'Untitled'
            text = ' '.join([s.strip().replace('\n', '') for s in review.select(u'.//div[@class="item-text"]//text()').extract() if s.strip()])
            text = re.sub(' {2,}', ' ', text)
            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('product_url', response.meta['product_url'])
            loader.add_value('url', response.url)
            loader.add_value('sku', sku)
            # The rating is the star bar width in percent divided by 20.
            # Floor division keeps it an integer under true division too.
            rating = review.select(u'./div[@class="item-rating"]/div[contains(@class, "stars")]/div/@style').re(r'width: (\d+)%;')[0]
            loader.add_value('rating', int(rating) // 20)
            if first_product is not None:
                first_product['metadata']['reviews'].append(loader.load_item())

        has_next = hxs.select('//div[contains(@class, "next-button") and not(contains(@class, "disabled"))]')
        if has_next:
            identifier = hxs.select('//input[@name="identifier"]/@value').extract()[0]
            next_page = int(response.meta.get('current_page', 1)) + 1
            meta = response.meta.copy()
            meta['current_page'] = next_page
            req = FormRequest(
                response.url.split('?')[0],
                headers={'Accept': 'application/json, text/javascript, */*; q=0.01'},
                formdata={'identifier': identifier,
                          'page': str(next_page),
                          'page_size': '10',
                          'sort': 'newest'},
                callback=self.parse_review,
                meta=meta)
            yield req
        else:
            for product in products:
                yield product
Beispiel #12
0
    def parse_reviews(self, response):
        """Parse one batch of reviews from a JSONP response and paginate.

        The response body is a JSONP call; the wrapper is stripped and the
        payload decoded as JSON.  Each entry of
        ``BatchedResults.q0.Results`` becomes a ``Review`` item appended to
        ``product['metadata']['reviews']``.  ``product``, ``sku`` and (for
        pagination) ``offset`` are read from ``response.meta``.

        A batch of exactly 100 reviews triggers a request for the next 100;
        otherwise the product is yielded, but only if its price is truthy.
        """
        product = response.meta['product']
        sku = response.meta['sku']
        # Drop everything up to the first "(" and the closing ")" / ");" of
        # the JSONP wrapper so that plain JSON remains.
        body = response.body.strip().partition('(')[-1].replace('});',
                                                                '}').replace(
                                                                    '})', '}')
        json_body = json.loads(body)

        reviews = json_body['BatchedResults']['q0']['Results']
        for review in reviews:
            review_loader = ReviewLoader(item=Review(),
                                         response=response,
                                         date_format="%B %d, %Y")
            # Fractional seconds and anything after them are cut off before
            # parsing the timestamp.
            review_date = datetime.datetime.strptime(
                review['SubmissionTime'].split('.')[0], '%Y-%m-%dT%H:%M:%S')
            review_loader.add_value('date', review_date.strftime("%B %d, %Y"))

            title = review['Title']
            # A null review text is treated as empty so the concatenations
            # below cannot fail on None.
            text = review['ReviewText'] or u''

            if title:
                # ``title`` is a string here, not a list of matches, so it is
                # used whole; indexing it would keep only its first character.
                full_text = title + '\n' + text
            else:
                full_text = text

            pros = review['Pros']
            cons = review['Cons']
            if pros:
                full_text += '\nPros: ' + ', '.join(pros)
            if cons:
                full_text += '\nCons: ' + ', '.join(cons)

            review_loader.add_value('full_text', full_text)
            rating = review['Rating']
            review_loader.add_value('rating', rating)
            review_loader.add_value('url', product['url'])

            product['metadata']['reviews'].append(review_loader.load_item())

        if len(reviews) == 100:
            # A full batch suggests more reviews: ask for the next 100.
            offset = response.meta['offset'] + 100
            next_reviews = 'http://api.bazaarvoice.com/data/batch.json?passkey=asiwwvlu4jk00qyffn49sr7tb&apiversion=5.4&displaycode=1235-en_gb&resource.q0=reviews&filter.q0=isratingsonly%3Aeq%3Afalse&filter.q0=productid%3Aeq%3A' + sku + '&filter.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&sort.q0=rating%3Adesc&stats.q0=reviews&filteredstats.q0=reviews&include.q0=authors%2Cproducts%2Ccomments&filter_reviews.q0=contentlocale%3Aeq%3Aen_AU%2Cen_CA%2Cen_DE%2Cen_GB%2Cen_IE%2Cen_NZ%2Cen_US&limit.q0=100&offset.q0=' + str(
                offset) + '&limit_comments.q0=3&callback=bv182_28795'
            request = Request(next_reviews,
                              meta={
                                  'product': product,
                                  'offset': offset,
                                  'sku': sku
                              },
                              callback=self.parse_reviews)
            yield request
        else:
            # Products without a truthy price are not yielded.
            if product['price']:
                yield product
Beispiel #13
0
    def parse_review(self, response):
        """Parse reviews embedded in a JavaScript response and paginate.

        The review markup is cut out of the ``var materials={...}``
        assignment in the response body, unescaped and parsed as HTML.
        Parsed reviews are attached to the first entry of
        ``response.meta['products']``.

        All products are yielded when the markup cannot be located, when it
        holds no reviews, or when there is no "Next page" link; otherwise a
        ``Request`` for the next page is yielded (unfiltered, same meta).
        """
        products = response.meta['products']

        match = re.search('var materials={.*?(<div.*?)"},.initializers', response.body, re.DOTALL)
        if match is None:
            # No embedded markup: hand the products on rather than failing
            # with AttributeError on the missing match.
            for product in products:
                yield product
            return

        # Undo the JavaScript string escaping: "\n" sequences first, then any
        # remaining backslash-escaped character.
        html = match.group(1)
        html = re.sub(r'\\n', r'\n', html)
        html = re.sub(r'\\(.)', r'\1', html)

        hxs = HtmlXPathSelector(text=html)

        reviews = hxs.select(u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')

        if not reviews:
            for product in products:
                yield product
            return

        # Reviews are stored on the first product.  An empty product list is
        # tolerated consistently: blank sku, nothing appended.
        first_product = products[0] if products else None
        if first_product is not None:
            sku = first_product.get('sku') or ''
        else:
            sku = ''

        for review in reviews:
            loader = ReviewLoader(item=Review(), selector=review, date_format=u'%m/%d/%Y')

            date = review.select(u'.//span[@class="BVRRValue BVRRReviewDate"]/text()').extract()[0]
            date = time.strptime(date, u'%B %d, %Y')
            date = time.strftime(u'%m/%d/%Y', date)

            loader.add_value('date', date)

            title = review.select(u'.//span[@class="BVRRValue BVRRReviewTitle"]/text()').extract()
            title = title[0] if title else u'Untitled'
            text = review.select(u'.//span[@class="BVRRReviewText"]/text()').extract()
            text = text[0] if text else u'No text supplied.'
            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('product_url', response.meta['product_url'])
            # The numeric suffix of the element id serves as the review id.
            review_id = review.select('@id').re(r'ReviewID_(\d+)$')[0]
            loader.add_value('review_id', review_id)
            loader.add_value('url', response.meta['product_url'])
            loader.add_value('sku', sku)
            loader.add_xpath('rating', u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@class="BVRRNumber BVRRRatingNumber"]/text()')
            if first_product is not None:
                first_product['metadata']['reviews'].append(loader.load_item())

        next_page = hxs.select(u'.//a[contains(text(),"Next page")]/@data-bvjsref').extract()
        if not next_page:
            for product in products:
                yield product
            return

        yield Request(urljoin_rfc(get_base_url(response), next_page[0]),
                      meta=response.meta,
                      callback=self.parse_review,
                      dont_filter=True)
Beispiel #14
0
    def parse_review(self, response):
        """Parse reviews embedded in a JavaScript response and paginate.

        The review markup is cut out of the ``var materials={...}``
        assignment in the response body, unescaped and parsed as HTML.  Each
        review becomes a ``Review`` item appended to
        ``product['metadata']['reviews']`` (product from
        ``response.meta['product']``).

        Pagination is driven by page numbers: the last page number is read
        once and cached in ``response.meta['last_page']``, and the next page
        is requested from ``response.meta['reviews_url']`` plus ``&page=N``.
        The product is yielded when the markup, the reviews or the current
        page marker are missing, or when the last page has been reached.
        """
        product = response.meta['product']

        match = re.search('var materials={.*?(<div.*?)"},.initializers', response.body, re.DOTALL)
        if match is None:
            # No embedded markup: yield the product rather than failing with
            # AttributeError on the missing match.
            yield product
            return

        # Undo the JavaScript string escaping: "\n" sequences first, then any
        # remaining backslash-escaped character.
        html = match.group(1)
        html = re.sub(r'\\n', r'\n', html)
        html = re.sub(r'\\(.)', r'\1', html)

        hxs = HtmlXPathSelector(text=html)

        reviews = hxs.select(u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')

        if not reviews:
            yield product
            return

        for review in reviews:
            loader = ReviewLoader(item=Review(), selector=review, date_format=u'%d/%m/%Y')

            date = review.select(u'.//span[@class="BVRRValue BVRRReviewDate"]/text()').extract()[0]
            date = time.strptime(date, u'%B %d, %Y')
            date = time.strftime(u'%d/%m/%Y', date)

            loader.add_value('date', date)

            title = review.select(u'.//span[@class="BVRRValue BVRRReviewTitle"]/text()').extract()
            title = title[0] if title else u'Untitled'
            text = review.select(u'.//span[@class="BVRRReviewText"]/text()').extract()
            text = text[0] if text else u'No text supplied.'
            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('url', response.meta['product_url'])
            loader.add_xpath('rating', u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@class="BVRRNumber BVRRRatingNumber"]/text()')
            product['metadata']['reviews'].append(loader.load_item())

        cur_page = hxs.select(u'//span[@class="BVRRPageLink BVRRPageNumber BVRRSelectedPageNumber"]/text()').extract()
        if not cur_page:
            yield product
            return
        cur_page = int(cur_page[0])

        if 'last_page' not in response.meta:
            page_numbers = hxs.select(u'//span[@class="BVRRPageLink BVRRPageNumber"]/a/text()').extract()
            # Without any other page link the current page is the last one;
            # indexing the empty list would raise IndexError.
            response.meta['last_page'] = int(page_numbers[-1]) if page_numbers else cur_page

        if cur_page < response.meta['last_page']:
            url = response.meta['reviews_url'] + u'&page=%s' % str(cur_page + 1)
            yield Request(url, meta=response.meta, callback=self.parse_review)
        else:
            yield product
    def parse_review(self, response):
        """Parse one page of reviews from the ``productReviews`` table.

        Each review block becomes a ``Review`` item (date, rating, url, title
        + text) appended to ``product['metadata']['reviews']`` (product from
        ``response.meta['product']``).  The product is yielded right away
        when the page holds no review blocks.

        The review date is tried against several formats; when none of them
        matches, the review is stored without a date.

        Yields a ``Request`` for the "Next" link when one with an ``href``
        exists, else the product.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        product = response.meta['product']

        reviews = hxs.select(
            u'//table[@id="productReviews"]//div[@style="margin-left:0.5em;"]')

        if not reviews:
            yield product
            return

        date_formats = (u'%B %d, %Y', u'%d %b %Y', u'%d %B %Y')

        for review in reviews:
            loader = ReviewLoader(item=Review(),
                                  selector=review,
                                  date_format=u'%d/%m/%Y')
            date = review.select(u'.//nobr/text()')[0].extract()
            res = None
            for fmt in date_formats:
                try:
                    res = time.strptime(date, fmt)
                except ValueError:
                    continue
                break
            if res is not None:
                # Only format a date that was actually parsed; handing None
                # to time.strftime fails.
                loader.add_value('date', time.strftime(u'%d/%m/%Y', res))

            rating = review.select(u'.//text()').re(
                u'([\d\.]+) out of 5 stars')[0]
            rating = int(float(rating))
            loader.add_value('rating', rating)
            loader.add_value('url', response.url)

            title = review.select(u'.//b/text()')[0].extract()
            text = ''.join([
                s.strip() for s in review.select(
                    u'div[@class="reviewText"]/text()').extract()
            ])
            loader.add_value('full_text', u'%s\n%s' % (title, text))

            product['metadata']['reviews'].append(loader.load_item())

        # The page is parsed with BeautifulSoup only here, where the "Next"
        # link is looked up, so pages without reviews skip that work.
        soup = BeautifulSoup(response.body)
        next_page = soup.find('a', text=re.compile('Next'))
        if next_page and next_page.parent.get('href'):
            # The match is the link's text node; its parent carries the href.
            next_page = next_page.parent['href']
            yield Request(urljoin_rfc(base_url, next_page),
                          meta=response.meta,
                          callback=self.parse_review)
        else:
            yield product
Beispiel #16
0
    def parse_review(self, response):
        """Parse one page of reviews from a JavaScript assignment and paginate.

        The body is expected to end in ``= <payload>;``; the payload is
        decoded with ``demjson`` and the ``'r'`` entry of every element is
        one review.  Each review becomes a ``Review`` item appended to
        ``product['metadata']['reviews']`` (product from
        ``response.meta['product']``).

        The product is yielded, and pagination stops, when the response
        status is not 200, the payload cannot be located, or the payload
        holds no reviews.  Otherwise ``response.meta['cur_page']`` is
        incremented and the next page is requested from
        ``response.meta['reviews_url'] % page``.
        """
        payload = re.search(u'= (.*);$', response.body, re.DOTALL)

        product = response.meta['product']

        if response.status != 200 or not payload:
            yield product
            return

        reviews = [entry.get('r') for entry in demjson.decode(payload.group(1))]
        if not reviews:
            # An empty page means there is nothing left to fetch.  Without
            # this stop the next page would be requested unconditionally.
            yield product
            return

        for review in reviews:
            # NOTE(review): ``review`` is a decoded dict, not a selector; it
            # is passed through unchanged because only add_value is used.
            loader = ReviewLoader(item=Review(),
                                  selector=review,
                                  date_format=u'%m/%d/%Y')

            loader.add_value('review_id', review['id'])

            date_review = datetime.strptime(review.get('d'), "%m/%d/%Y").date()
            date_review = date_review.strftime("%m/%d/%Y")

            loader.add_value('date', date_review)

            title = review['h']
            text = review['p']

            # Optional tag groups, keyed by their display name.
            review_data = {}
            if review.get('g'):
                for data in review['g']:
                    # Unicode formatting instead of str(): values may hold
                    # non-ASCII text, which str() cannot encode on Python 2.
                    review_data[data['n']] = u', '.join(
                        u'%s' % value for value in data['v'])
            fields = [u'Pros', u'Cons', u'Best Uses']
            text += u'\n'
            for field in fields:
                if review_data.get(field):
                    text += u'%s:\n%s\n' % (field, review_data.get(field))
            if review.get('b'):
                if review['b']['k'] == 'Yes':
                    text += u'Yes, I would recommend this to a friend.'
                else:
                    text += u'No, I would not recommend this to a friend.'

            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('product_url', response.meta['product_url'])
            loader.add_value('url', response.meta['product_url'])
            loader.add_value('rating', review['r'])
            product['metadata']['reviews'].append(loader.load_item())

        cur_page = response.meta['cur_page']

        url = response.meta['reviews_url'] % str(cur_page + 1)
        response.meta['cur_page'] += 1
        yield Request(url, meta=response.meta, callback=self.parse_review)
Beispiel #17
0
    def parse_review(self, response):
        """Parse one page of reviews from a JavaScript assignment and paginate.

        The part of the body after ``'] = '`` is decoded with
        ``load_js_objects``; the ``'r'`` entry of each element is one review.
        Each review becomes a ``Review`` item (date, full text, rating, url)
        appended to ``product['metadata']['reviews']`` (product from
        ``response.meta['product']``).

        The product is yielded once ``review_n`` equals ``review_pages`` in
        ``response.meta``; until then ``review_n`` is incremented and the
        next review page is requested.
        """
        product = response.meta['product']

        body = response.body.split('] = ')[1]
        body = body.rstrip(';')
        data = load_js_objects(body)

        bottom = {
            "yes": "Yes, I would recommend this to a friend",
            "no": "No, I would not recommend this to a friend"
        }

        for r in data:
            r = r['r']
            review = Review()

            # 'db' starts with YYYY-MM-DD; it is re-ordered to DD/MM/YYYY.
            date = r['db']
            review['date'] = date[8:10] + '/' + date[5:7] + '/' + date[:4]

            comments = r['p']
            header = r['h']

            # Each tag group gets its own list.  The "bestuses" group is
            # stored under the same key that is formatted below; it used to
            # be assigned to a misspelled name and was always empty.
            tags = {'pros': [], 'cons': [], 'bestuses': []}
            for short in r.get('g', []):
                k = short['k']
                if k in tags:
                    tags[k] = short['v']

            try:
                bottom_line = bottom[r['b']['k'].lower()]
            except (KeyError, TypeError, AttributeError):
                # No recommendation, or not one of the known answers.
                bottom_line = ''

            review[
                'full_text'] = u'%s\n%s\nBottom Line: %s\nPros: %s\nCons: %s\nBest Uses: %s\n' % (
                    header, comments, bottom_line, u', '.join(tags['pros']),
                    u', '.join(tags['cons']), u', '.join(tags['bestuses']))

            review['rating'] = r['r']

            review['url'] = response.url
            product['metadata']['reviews'].append(review)
        # XXX maybe there is a better way to yield product after all review have been fetched
        if response.meta['review_n'] == response.meta['review_pages']:
            yield product
        else:
            response.meta['review_n'] = response.meta['review_n'] + 1
            yield Request(review_url(response.meta['review_sku'],
                                     response.meta['review_n']),
                          meta=response.meta,
                          callback=self.parse_review)
Beispiel #18
0
    def parse_review(self, response):
        """Parse a Yotpo review payload and attach the reviews to the product.

        The response body is JSON; the HTML fragment under
        ``[0]['result']`` is parsed for review boxes. Reviews without body
        text are skipped. Always yields the product from ``response.meta``.
        """
        # French month names and the two-digit numbers substituted for them
        # before the date string is parsed.
        french_months = (
            (u'janvier', '01'), (u'f\xe9vrier', '02'), (u'mars', '03'),
            (u'avril', '04'), (u'mai', '05'), (u'juin', '06'),
            (u'juillet', '07'), (u'ao\xfbt', '08'), (u'septembre', '09'),
            (u'octobre', '10'), (u'novembre', '11'), (u'd\xe9cembre', '12'),
        )

        product = response.meta['product']

        payload = json.loads(response.body)
        widget_html = payload[0]['result']
        selector = HtmlXPathSelector(text=widget_html)

        review_boxes = selector.select(
            '//div[contains(@class, "yotpo-reviews")]/div[contains(@class, "yotpo-regular-box")]'
        )
        for box in review_boxes:
            loader = ReviewLoader(item=Review(),
                                  selector=box,
                                  date_format=u'%m/%d/%Y')

            raw_date = box.select(
                './/label[contains(@class, "yotpo-review-date")]/text()'
            ).extract()[0]
            for month_name, month_number in french_months:
                raw_date = raw_date.replace(month_name, month_number)
            parsed_date = datetime.datetime.strptime(raw_date,
                                                     "%d/%m/%y").date()
            loader.add_value('date', parsed_date.strftime("%m/%d/%Y"))

            # Title first, body text second: both feed 'full_text'.
            loader.add_xpath(
                'full_text',
                './/div[contains(@class, "content-title")]/text()')
            body_parts = box.select(
                './/div[contains(@class, "content-review")]/text()').extract()
            content = ''.join(body_parts).strip()

            if content:
                loader.add_value('full_text', content)
                loader.add_value('product_url', product['url'])
                loader.add_value('url', product['url'])
                loader.add_value('sku', product['sku'])
                # The rating is the number of star <span> elements.
                stars = box.select(
                    './/span[@class="yotpo-review-stars"]/span').extract()
                loader.add_value('rating', len(stars))
                product['metadata']['reviews'].append(loader.load_item())

        yield product
Beispiel #19
0
    def parse_review(self, response):
        """Read the reviews listed under ``bvseo-reviewsSection``.

        Each ``itemprop="review"`` element becomes one review item appended
        to ``product['metadata']['reviews']``. The product from
        ``response.meta`` is yielded exactly once, whether or not any
        reviews were found.
        """
        selector = HtmlXPathSelector(response)

        review_nodes = selector.select(
            '//div[@id="bvseo-reviewsSection"]/div[@itemprop="review"]')
        product = response.meta['product']

        if review_nodes:
            for node in review_nodes:
                loader = ReviewLoader(item=Review(),
                                      selector=node,
                                      date_format=u'%m/%d/%Y')

                # Page carries YYYY-MM-DD; the loader is given MM/DD/YYYY.
                published = node.select(
                    './meta[@itemprop="datePublished"]/@content').extract()[0]
                loader.add_value(
                    'date',
                    time.strftime(u'%m/%d/%Y',
                                  time.strptime(published, u'%Y-%m-%d')))

                name_parts = node.select(
                    './span[@itemprop="name"]/text()').extract()
                title = ''.join(name_parts) or u'Untitled'

                description_parts = node.select(
                    './span[@itemprop="description"]/text()').extract()
                text = ''.join(description_parts).strip() or u'No text supplied.'

                loader.add_value('full_text', u'%s\n%s' % (title, text))
                loader.add_value('product_url', product['url'])
                loader.add_value('url', product['url'])
                loader.add_value('sku', product.get('sku') or '')

                rating_values = node.select(
                    './/span[@itemprop="ratingValue"]/text()').extract()
                loader.add_value('rating', rating_values[0])

                product['metadata']['reviews'].append(loader.load_item())

        yield product
Beispiel #20
0
    def parse_review(self, response):
        """Scrape one page of BVRR reviews and follow pagination.

        Reviews are appended to ``product['metadata']['reviews']``. Yields a
        request for the next page when a "next page" link exists, otherwise
        yields the product from ``response.meta``.
        """
        selector = HtmlXPathSelector(response)
        product = response.meta['product']

        review_nodes = selector.select(
            u'//div[starts-with(@id,"BVRRDisplayContentReviewID_")]')
        for node in review_nodes:
            loader = ReviewLoader(item=Review(),
                                  selector=node,
                                  date_format='%d %B %Y')

            title_parts = node.select(
                u'.//span[contains(@class,"BVRRReviewTitle")]/text()').extract()
            body_parts = node.select(
                u'.//span[contains(@class,"BVRRReviewText")]/text()').extract()
            text = ' '.join(body_parts)
            if title_parts:
                # Title (when present) goes on its own first line.
                text = title_parts[0] + '\n' + text
            loader.add_value('full_text', text)

            loader.add_xpath(
                'date',
                u'.//span[contains(@class,"BVRRReviewDate") and position()=1]/text()')

            # The first whitespace-separated token of the image title is
            # used as the rating.
            rating_titles = node.select(
                u'.//div[@class="BVRRRatingNormalImage"]/img/@title').extract()
            loader.add_value('rating', rating_titles[0].split()[0])

            loader.add_value('url', response.url)
            product['metadata']['reviews'].append(loader.load_item())

        next_links = selector.select(
            u'//span[contains(@class,"BVRRNextPage")]/a/@href').extract()
        if not next_links:
            yield product
        else:
            yield Request(next_links[0],
                          meta=response.meta,
                          callback=self.parse_review)
Beispiel #21
0
    def parse_reviews(self, response):
        """Build review items from a JSON batch response.

        Reads ``BatchedResults -> q2 -> Results`` from the JSON body, turns
        every entry into a review item appended to
        ``product['metadata']['reviews']`` and finally yields the product
        taken from ``response.meta``.
        """
        product = response.meta['product']
        json_body = json.loads(response.body)

        reviews = json_body['BatchedResults']['q2']['Results']
        for review in reviews:
            review_loader = ReviewLoader(item=Review(),
                                         response=response,
                                         date_format="%B %d, %Y")
            # Drop fractional seconds (and anything after them) before parsing.
            review_date = datetime.datetime.strptime(
                review['SubmissionTime'].split('.')[0], '%Y-%m-%dT%H:%M:%S')
            review_loader.add_value('date', review_date.strftime("%B %d, %Y"))

            title = review['Title']
            # A null review text would otherwise break the concatenation below.
            text = review['ReviewText'] or u''

            if title:
                # The title is a plain string here, so use it whole; indexing
                # it with [0] kept only its first character.
                full_text = title + '\n' + text
            else:
                full_text = text

            # NOTE(review): the joins below assume Pros/Cons are lists of
            # strings when present -- confirm against the API payload.
            pros = review['Pros']
            cons = review['Cons']
            if pros:
                full_text += '\nPros: ' + ', '.join(pros)
            if cons:
                full_text += '\nCons: ' + ', '.join(cons)

            review_loader.add_value('full_text', full_text)
            rating = review['Rating']
            review_loader.add_value('rating', rating)
            review_loader.add_value('url', response.url)

            product['metadata']['reviews'].append(review_loader.load_item())

        yield product
    def parse_review(self, response):
        """Collect reviews from the ``boxproductinfo`` table and paginate.

        Non-HTML responses are ignored (nothing is yielded). When the page
        has no review rows, or no "Next" link, the product from
        ``response.meta`` is yielded; otherwise a request for the next page
        is yielded with the product passed along in ``meta``.
        """
        if not isinstance(response, HtmlResponse):
            return

        selector = HtmlXPathSelector(response)

        rows = selector.select(u'//div[@class="boxproductinfo"]/table/tr')
        product = response.meta['product']

        if not rows:
            yield product
            return

        for row in rows:
            loader = ReviewLoader(item=Review(),
                                  selector=row,
                                  date_format=u'%d/%m/%Y')
            # First dd/mm/yyyy pattern found in the row's span text.
            found_dates = row.select(u'./td/div/p/span/text()').re(
                u'(\d{2}/\d{2}/\d{4})')
            loader.add_value('date', found_dates[0])
            loader.add_xpath('full_text', u'./td/div[2]/text()')
            loader.add_value('url', response.url)

            product['metadata']['reviews'].append(loader.load_item())

        next_links = selector.select(
            u'//h4/a[contains(text(),"Next")]/@href').extract()

        if not next_links:
            yield product
        else:
            next_url = urljoin_rfc(get_base_url(response), next_links[0])
            yield Request(next_url,
                          meta={'product': product},
                          callback=self.parse_review)
Beispiel #23
0
class RubbermaidSpider(BaseSpider):
    """Spider for rubbermaid.com category pages.

    Product data is read from the Scrapy response; reviews are read from the
    same page rendered in a PhantomJS browser, after clicking the review
    pagination button until it is no longer displayed (or a click limit is
    reached).
    """
    name = 'keter-rubbermaid.com'
    allowed_domains = ['rubbermaid.com']

    start_urls = [
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=shed-accessories',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=VerticalSheds',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=HorizontalSheds',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=DeckBoxesPatioBenches',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=GarageOrganization&SubCatId=ResinCabinets',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=GarageOrganization&SubCatId=FastTrackGarageOrganizationSystem'
    ]

    # Seconds passed to the browser as page-load and script timeout.
    browser_timeout = 60
    # Maximum number of "load more" clicks per product page.
    max_review_pages = 25
    # Seconds to sleep after each "load more" click.
    load_more_wait = 20

    def __init__(self, *args, **kwargs):
        """Create the spider and the browser used for review pages."""
        super(RubbermaidSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

        self._browser = PhantomJS.create_browser()

        self._browser.set_page_load_timeout(self.browser_timeout)
        self._browser.set_script_timeout(self.browser_timeout)

    def spider_closed(self):
        """Quit the browser; connected to the ``spider_closed`` signal."""
        self._browser.quit()

    def parse(self, response):
        """Yield a product request for every link in ``foodStorageBlock``."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        for url in hxs.select(
                '//div[@id="foodStorageBlock"]//a/@href').extract():
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product)

    def _load_page_with_all_reviews(self, url):
        """Open *url* in the browser, expand the review list, return the HTML.

        The pagination button is clicked while it is displayed, at most
        ``max_review_pages`` times. Any exception raised while looking for
        or clicking the button is logged and ends the expansion; whatever
        the browser holds at that point is returned.
        """
        self.log('>> BROWSER => GET < %s />' % url)
        self._browser.get(url)
        self.log('>> OK')

        self.log('>> BROWSER => Looking for more reviews ...')
        try:
            load_more_button = self._browser.find_element_by_xpath(
                '//div[@class="bv-content-pagination"]//button')
            more_reviews = load_more_button.is_displayed()
            max_pages = self.max_review_pages
            while more_reviews and max_pages:
                self.log('>> More reviews found...')
                load_more_button.click()
                self.log('>> BROWSER => CLICK "Load more"')
                time.sleep(self.load_more_wait)
                self.log('>> OK')
                # Look the button up again after each click.
                load_more_button = self._browser.find_element_by_xpath(
                    '//div[@class="bv-content-pagination"]//button')
                more_reviews = load_more_button.is_displayed()
                max_pages -= 1
            self.log('>> No more reviews...')
        except Exception as e:
            self.log('>> ERROR FOUND => %s' % e)

        return self._browser.page_source

    def parse_product(self, response):
        """Build the product item with its reviews and yield it."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        shipping_cost = hxs.select(
            './/a[contains(text(), "Delivery Surcharge")]//../..//td[2]//span/text()'
        ).extract()
        if not shipping_cost:
            shipping_cost = hxs.select(
                './/td[contains(text(), "Shipping Surcharge")]//..//td[2]//span/text()'
            ).extract()

        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@id="ProductNameH1"]/text()')
        loader.add_value(
            'category',
            hxs.select('//div[@class="breadcrum"]/div/a/text()').extract()[-1])
        loader.add_xpath(
            'identifier',
            '//form//input[@id="hdnProdId" or @name="hdnProdId"]/@value')
        price = hxs.select(
            './/td[contains(text(), "Price:")]//..//td[2]//span/text()'
        ).extract()
        if price:
            loader.add_value('price', price[0])
        else:
            loader.add_value('price', 0)

        # Shipping cost stays best-effort, but a failure is now logged
        # instead of being swallowed by a bare ``except``.
        if shipping_cost:
            try:
                loader.add_value('shipping_cost', shipping_cost[0].strip())
            except Exception as e:
                self.log('>> Could not set shipping cost => %s' % e)

        # The SKU is the second text node next to the first <td><strong>;
        # skip it when that node is absent rather than raising IndexError.
        item = hxs.select('//td/strong')
        if item:
            sku_texts = item[0].select('../text()').extract()
            if len(sku_texts) > 1:
                loader.add_value('sku', sku_texts[1].strip('#() '))

        image_url = hxs.select(
            '//div[@id="divImageBlock"]//img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))

        loader.add_value('brand', 'Rubbermaid')

        product = loader.load_item()

        # The SKU is only set when found above; upper-casing it
        # unconditionally raised KeyError for products without one.
        if product.get('sku'):
            product['sku'] = product['sku'].upper()

        metadata = KeterMeta()
        metadata['brand'] = 'Rubbermaid'
        metadata['reviews'] = []
        product['metadata'] = metadata

        page_source = self._load_page_with_all_reviews(response.url)
        hxs = HtmlXPathSelector(text=page_source)

        for review in hxs.select(
                '//ol[contains(@class, "bv-content-list-Reviews")]//li[contains(@class, "bv-content-review")]'
        ):
            review_loader = ReviewLoader(item=Review(),
                                         selector=review,
                                         date_format='%m/%d/%Y')

            review_loader.add_xpath(
                'date',
                u'.//div[@class="bv-content-datetime"][1]//meta[@itemprop="dateCreated"]/@content'
            )
            review_loader.add_xpath(
                'full_text', u'.//div[@itemprop="reviewBody"]/p/text()')
            review_loader.add_xpath(
                'rating',
                u'.//abbr[contains(@class, "bv-rating-stars-on")][1]/@title')
            review_loader.add_value('url', response.url)

            product['metadata']['reviews'].append(review_loader.load_item())

        yield product
Beispiel #24
0
    def parse_review(self, response):
        """Parse one page of ``rating-box`` reviews and request the next page.

        A page without review boxes ends the crawl for this product: the
        product from ``response.meta`` is yielded. Otherwise each box is
        turned into a review item, ``meta['page']`` is incremented, and a
        request is yielded for the URL built from ``meta['reviews_url']``
        with ``meta['product_id']`` and the incremented page value plus one.
        """
        selector = HtmlXPathSelector(response)

        review_boxes = selector.select(u'//div[@class="rating-box"]')
        product = response.meta['product']

        if not review_boxes:
            yield product
            return

        # French month name -> month number (1-based).
        month_names = (
            u'janvier, f\xe9vrier, mars, avril, mai, juin, juillet, ao\xfbt, septembre, octobre, novembre, d\xe9cembre'
            .split(', '))
        month_numbers = dict(
            (name, index) for index, name in enumerate(month_names, 1))

        for box in review_boxes:
            loader = ReviewLoader(item=Review(),
                                  selector=box,
                                  date_format=u'%m/%d/%Y')

            raw_date = box.select(u'.//footer/p/text()').extract()[0]

            # Swap the first matching month name for its number and drop
            # the ' - ' separator so the string parses as "day month year".
            for name, index in month_numbers.items():
                if name in raw_date:
                    raw_date = raw_date.replace(name, str(index))
                    raw_date = raw_date.replace(' - ', '')
                    break
            parsed_date = time.strptime(raw_date, u'%d %m %Y')
            loader.add_value('date', time.strftime(u'%m/%d/%Y', parsed_date))

            headings = box.select(u'.//article/header/h3/text()').extract()
            title = headings[0] if headings else u'Untitled'

            # Sub-ratings first (two text nodes per line), then every
            # non-blank paragraph text node of the article.
            pieces = []
            for note in box.select('.//div[@class="infos-note"]/p'):
                note_texts = note.select(
                    './/text()[normalize-space()]').extract()
                pieces.append(u'{} {}\n'.format(*note_texts))
            paragraph_texts = box.select(
                './/article//p//text()[normalize-space()]').extract()
            for paragraph in paragraph_texts:
                pieces.append(u'{}\n'.format(paragraph.strip()))
            text = u''.join(pieces) or u'No text supplied.'

            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('product_url', response.meta['product_url'])
            loader.add_value('url', response.meta['product_url'])
            loader.add_value('sku', product.get('sku') or '')
            loader.add_xpath('rating',
                             u'.//span[@itemprop="ratingValue"]/text()')
            product['metadata']['reviews'].append(loader.load_item())

        meta = response.meta
        reviews_url = meta.get('reviews_url')
        meta['page'] += 1
        # The URL uses the already-incremented page value plus one.
        next_page = str(meta.get('page') + 1)
        next_url = reviews_url.format(meta.get('product_id'), next_page)
        yield Request(next_url,
                      meta=meta,
                      callback=self.parse_review,
                      dont_filter=True)
Beispiel #25
0
    def parse_review(self, response):
        """Parse reviews embedded in a BVRR JavaScript payload and paginate.

        The review HTML is cut out of the ``var materials={...}`` script in
        the response body and unescaped. Every review found is appended to
        the first product in ``response.meta['products']``. All products are
        yielded once there is nothing left to fetch: no review markup, no
        reviews, no selected page number, or the last page has been reached.
        Otherwise a request for the next page is yielded.
        """
        products = response.meta['products']

        match = re.search('var materials={.*?(<div.*?)"},.initializers',
                          response.body, re.DOTALL)
        if match is None:
            # No embedded review markup: hand the products back instead of
            # failing on ``None.group``.
            for product in products:
                yield product
            return

        # Undo the JavaScript string escaping: literal "\n" first, then any
        # other backslash-escaped character.
        html = match.group(1)
        html = re.sub(r'\\n', r'\n', html)
        html = re.sub(r'\\(.)', r'\1', html)

        hxs = HtmlXPathSelector(text=html)

        reviews = hxs.select(
            u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')
        if not reviews:
            for product in products:
                yield product
            return

        # Reviews are attached to the first product only; tolerate an empty
        # product list instead of raising IndexError.
        first_product = products[0] if products else None
        sku = (first_product.get('sku') or '') if first_product else ''

        for review in reviews:
            loader = ReviewLoader(item=Review(),
                                  selector=review,
                                  date_format=u'%m/%d/%Y')

            review_id = review.select("@id").re(
                r'BVRRDisplayContentReviewID_(\d+)')[0]
            loader.add_value('review_id', review_id)

            date = review.select(
                u'.//span[contains(@class, "BVRRValue BVRRReviewDate")]/text()'
            ).extract()[0]
            date = time.strptime(date, u'%B %d, %Y')
            date = time.strftime(u'%m/%d/%Y', date)

            loader.add_value('date', date)

            title = review.select(
                u'.//span[@class="BVRRValue BVRRReviewTitle summary"]/text()'
            ).extract()
            if not title:
                title = u'Untitled'
            else:
                title = title[0]

            # Read pros/cons from the current review; reading them from
            # ``reviews[0]`` gave every review the first review's pros/cons.
            pros_cons_text = u' '.join(
                review.select(
                    u'.//div[@class="BVRRReviewProsConsContainer"]//text()').
                extract())
            text = review.select(
                u'.//span[@class="BVRRReviewText"]/text()').extract()
            if text:
                text = text[0]
            else:
                text = u'No text supplied.'
            extra_information = u' '.join(
                review.select(
                    u'.//div[@class="BVRRContextDataContainer"]//text()').
                extract())
            text = '%s\n%s\n%s' % (pros_cons_text, text, extra_information)
            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('product_url', response.meta['product_url'])
            loader.add_value('url', response.meta['product_url'])
            loader.add_value('sku', sku)
            loader.add_xpath(
                'rating',
                u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@itemprop="ratingValue"]/text()'
            )
            if first_product is not None:
                first_product['metadata']['reviews'].append(
                    loader.load_item())

        cur_page = hxs.select(
            u'//span[@class="BVRRPageLink BVRRPageNumber BVRRSelectedPageNumber"]/text()'
        ).extract()
        if not cur_page:
            for product in products:
                yield product
            return
        cur_page = int(cur_page[0])

        if 'last_page' not in response.meta:
            page_links = hxs.select(
                u'//span[@class="BVRRPageLink BVRRPageNumber"]/a/text()'
            ).extract()
            # With no other page links the current page is the last one.
            response.meta['last_page'] = (int(page_links[-1])
                                          if page_links else cur_page)

        if cur_page < response.meta['last_page']:
            url = response.meta['reviews_url'] + u'&page=%s' % str(cur_page +
                                                                   1)
            yield Request(url, meta=response.meta, callback=self.parse_review)
        else:
            for product in products:
                yield product