Beispiel #1
0
    def parse_review(self, response):
        """Collect the reviews on this page and continue with the next page.

        The product is read from ``response.meta['product']``. One loaded
        ``Review`` per review container is appended to
        ``product['metadata']['reviews']``. Yields a ``Request`` for the next
        review page when a "Next page" link exists, otherwise the product.
        """
        page = HtmlXPathSelector(response)

        review_nodes = page.select(
            u'//div[contains(@id,"BVRRDisplayContentReviewID")]')
        product = response.meta['product']

        # A page without review containers ends the crawl for this product.
        if not review_nodes:
            yield product
            return

        for node in review_nodes:
            loader = ReviewLoader(item=Review(),
                                  selector=node,
                                  date_format=u'%m/%d/%Y')

            # The page exposes the date as YYYY-MM-DD in a title attribute;
            # it is re-formatted to the loader's date_format before loading.
            raw_dates = node.select(
                u'.//span[contains(@class, "BVRRReviewDate")]/span[@class="value-title"]/@title'
            ).extract()
            if raw_dates:
                parsed = time.strptime(raw_dates.pop(), u'%Y-%m-%d')
                loader.add_value('date', time.strftime(u'%m/%d/%Y', parsed))

            titles = node.select(
                u'.//span[@class="BVRRValue BVRRReviewTitle"]/text()').extract(
                )
            title = titles[0] if titles else u'Untitled'

            summary = '\n'.join(
                node.select(
                    './/div[@class="BVRRReviewDisplayStyle3Summary"]//text()[normalize-space()]'
                ).extract())
            content = '\n'.join(
                node.select(
                    u'.//div[@class="BVRRReviewDisplayStyle3Content"]//text()[normalize-space()]'
                ).extract())
            text = summary + '\n' + content

            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('product_url', product['url'])
            loader.add_value('url', product['url'])
            loader.add_value('sku', product.get('sku') or '')
            loader.add_xpath(
                'rating',
                u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@class="BVRRNumber BVRRRatingNumber"]/text()'
            )
            product['metadata']['reviews'].append(loader.load_item())

        next_links = page.select(
            u'.//a[contains(text(),"Next page")]/@data-bvjsref').extract()
        if next_links:
            # dont_filter=True is passed so the request bypasses the
            # scheduler's duplicate filter.
            yield Request(urljoin_rfc(get_base_url(response), next_links[0]),
                          meta=response.meta,
                          callback=self.parse_review,
                          dont_filter=True)
        else:
            yield product
Beispiel #2
0
    def parse_review(self, response):
        """Collect the reviews on this page and follow the next-page link.

        The product is read from ``response.meta['product']``. One loaded
        ``Review`` per ``pr-review-wrap`` container is appended to
        ``product['metadata']['reviews']``. Yields a ``Request`` for the next
        page when one is linked, otherwise the product.
        """
        hxs = HtmlXPathSelector(response)
        product = response.meta['product']

        for review in hxs.select(u'//div[@class="pr-review-wrap"]'):
            loader = ReviewLoader(item=Review(),
                                  selector=review,
                                  date_format=u'%m/%d/%Y')
            loader.add_xpath(
                'date',
                u'.//div[contains(@class, "pr-review-author-date")]/text()')

            # A review without a comment paragraph yields empty text instead
            # of raising IndexError and aborting the whole page.
            comments = review.select(
                u'.//p[@class="pr-comments"]/text()').extract()
            comments = comments[0] if comments else u''

            bottom_line = review.select(
                u'.//div[@class="pr-review-bottom-line-wrapper"]/p/text()[2]'
            ).extract()
            bottom_line = bottom_line[0] if bottom_line else ''

            # These must be queried relative to ``review``: running the same
            # './/' XPaths on the page-level selector would collect the tags
            # of every review on the page into each review.
            pros = review.select(
                './/div[contains(@class,"pr-attribute-pros")]//li/text()'
            ).extract()
            cons = review.select(
                './/div[contains(@class,"pr-attribute-cons")]//li/text()'
            ).extract()
            best_uses = review.select(
                './/div[contains(@class,"pr-attribute-bestuses")]//li/text()'
            ).extract()

            loader.add_value(
                'full_text',
                u'%s\nBottom Line: %s\nPros: %s\nCons: %s\nBest Uses: %s\n' %
                (comments, bottom_line, u', '.join(pros), u', '.join(cons),
                 u', '.join(best_uses)))

            # The rating text is converted via float first and truncated to
            # an int; it is only added when the page provides one.
            rating = review.select(
                u'.//span[contains(@class,"pr-rating")]/text()').extract()
            if rating:
                loader.add_value('rating', int(float(rating[0])))
            loader.add_value('url', response.url)

            product['metadata']['reviews'].append(loader.load_item())

        next_url = hxs.select(
            u'//span[@class="pr-page-next"]/a/@href').extract()
        if next_url:
            yield Request(next_url[0],
                          meta=response.meta,
                          callback=self.parse_review)
        else:
            yield product
Beispiel #3
0
    def parse_review(self, response):
        """Collect the reviews on this page, then paginate or fetch the price.

        The product is read from ``response.meta['product']``. One loaded
        ``Review`` per review container is appended to
        ``product['metadata']['reviews']``. Yields a ``Request`` for the next
        review page when one is linked; otherwise yields a ``Request`` to the
        regional-price endpoint, handled by ``self.parse_price``.
        """
        page = HtmlXPathSelector(response)
        product = response.meta['product']

        review_nodes = page.select(
            u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')
        for node in review_nodes:
            loader = ReviewLoader(item=Review(),
                                  selector=node,
                                  date_format="%B %d, %Y")
            loader.add_xpath(
                'date', u'.//span[contains(@class,"BVRRReviewDate")]/text()')

            titles = node.select(
                u'.//span[contains(@class,"BVRRReviewTitle")]/text()').extract(
                )
            body = ' '.join(
                node.select(
                    u'.//span[contains(@class,"BVRRReviewText")]/text()'
                ).extract())

            # Newline-separated sections: optional title, body, then the
            # optional pros / cons tag lists.
            sections = [body]
            if titles:
                sections.insert(0, titles[0])

            pro_tags = node.select(
                u'.//span[contains(@class,"BVRRReviewProTags")]/span/text()'
            ).extract()
            con_tags = node.select(
                u'.//span[contains(@class,"BVRRReviewConTags")]/span/text()'
            ).extract()
            if pro_tags:
                sections.append('Pros: ' + ', '.join(pro_tags))
            if con_tags:
                sections.append('Cons: ' + ', '.join(con_tags))

            loader.add_value('full_text', '\n'.join(sections))
            loader.add_xpath(
                'rating',
                u'.//span[contains(@class,"BVRRRatingNumber")]/text()')
            loader.add_value('url', response.url)

            product['metadata']['reviews'].append(loader.load_item())

        next_links = page.select(
            u'//div[contains(@class,"BVRRNextPage")]/a/@href').extract()
        if next_links:
            yield Request(next_links[0],
                          meta=response.meta,
                          callback=self.parse_review)
            return

        # No more review pages: the last path segment of the product URL is
        # passed to the price endpoint as the pnList value.
        price_url = (
            'http://www.homedepot.ca/async-fetch-regional-price?storeId=9999&pnList='
            + product['url'].split('/')[-1])
        yield Request(price_url,
                      meta=response.meta,
                      callback=self.parse_price)
Beispiel #4
0
    def parse_review(self, response):
        """Parse reviews embedded in a JavaScript payload and paginate.

        The review markup is pulled out of the ``var materials={...}`` literal
        in the response body and unescaped before being parsed as HTML.
        Reviews are appended to the first entry of
        ``response.meta['products']``. Yields a ``Request`` for the next page
        when a "Next page" link exists; otherwise yields every product.

        Nothing is yielded when the products list is empty. When the payload
        cannot be located or holds no reviews, the products are yielded
        unchanged.
        """
        products = response.meta['products']
        if not products:
            # Nothing to attach reviews to and nothing to yield.
            return

        match = re.search('var materials={.*?(<div.*?)"},.initializers',
                          response.body, re.DOTALL)
        if match is None:
            # Unexpected payload: emit the products rather than losing them
            # to an AttributeError on the missing match.
            for product in products:
                yield product
            return

        # Undo the JavaScript string escaping: literal "\n" sequences first,
        # then any remaining backslash-escaped character.
        html = re.sub(r'\\n', r'\n', match.group(1))
        html = re.sub(r'\\(.)', r'\1', html)

        hxs = HtmlXPathSelector(text=html)

        reviews = hxs.select(u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')
        if not reviews:
            for product in products:
                yield product
            return

        # All reviews are stored on (and take their sku from) the first product.
        first_product = products[0]

        for review in reviews:
            loader = ReviewLoader(item=Review(), selector=review, date_format=u'%m/%d/%Y')

            # The page shows dates like "January 5, 2014"; they are converted
            # to the loader's date_format. A missing date is left unset.
            date = review.select(u'.//span[@class="BVRRValue BVRRReviewDate"]/text()').extract()
            if date:
                parsed = time.strptime(date[0], u'%B %d, %Y')
                loader.add_value('date', time.strftime(u'%m/%d/%Y', parsed))

            title = review.select(u'.//span[@class="BVRRValue BVRRReviewTitle"]/text()').extract()
            title = title[0] if title else u'Untitled'
            text = review.select(u'.//span[@class="BVRRReviewText"]/text()').extract()
            text = text[0] if text else u'No text supplied.'

            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('product_url', response.meta['product_url'])
            review_id = review.select('@id').re(r'ReviewID_(\d+)$')
            if review_id:
                loader.add_value('review_id', review_id[0])
            loader.add_value('url', response.meta['product_url'])
            loader.add_value('sku', first_product.get('sku') or '')
            loader.add_xpath('rating', u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@class="BVRRNumber BVRRRatingNumber"]/text()')
            first_product['metadata']['reviews'].append(loader.load_item())

        next_page = hxs.select(u'.//a[contains(text(),"Next page")]/@data-bvjsref').extract()
        if next_page:
            yield Request(urljoin_rfc(get_base_url(response), next_page[0]),
                          meta=response.meta,
                          callback=self.parse_review,
                          dont_filter=True)
        else:
            for product in products:
                yield product
Beispiel #5
0
    def parse_review(self, response):
        """Parse reviews embedded in a JavaScript payload and paginate by number.

        The review markup is pulled out of the ``var materials={...}`` literal
        in the response body and unescaped before being parsed as HTML.
        Reviews are appended to ``response.meta['product']``. The last page
        number is cached in ``response.meta['last_page']`` the first time it
        is seen. Yields a ``Request`` for page ``cur_page + 1`` while more
        pages remain, otherwise the product.
        """
        product = response.meta['product']

        match = re.search('var materials={.*?(<div.*?)"},.initializers',
                          response.body, re.DOTALL)
        if match is None:
            # Unexpected payload: emit the product rather than losing it to
            # an AttributeError on the missing match.
            yield product
            return

        # Undo the JavaScript string escaping: literal "\n" sequences first,
        # then any remaining backslash-escaped character.
        html = re.sub(r'\\n', r'\n', match.group(1))
        html = re.sub(r'\\(.)', r'\1', html)

        hxs = HtmlXPathSelector(text=html)

        reviews = hxs.select(u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')
        if not reviews:
            yield product
            return

        for review in reviews:
            loader = ReviewLoader(item=Review(), selector=review, date_format=u'%d/%m/%Y')

            # The page shows dates like "January 5, 2014"; they are converted
            # to the loader's date_format. A missing date is left unset.
            date = review.select(u'.//span[@class="BVRRValue BVRRReviewDate"]/text()').extract()
            if date:
                parsed = time.strptime(date[0], u'%B %d, %Y')
                loader.add_value('date', time.strftime(u'%d/%m/%Y', parsed))

            title = review.select(u'.//span[@class="BVRRValue BVRRReviewTitle"]/text()').extract()
            title = title[0] if title else u'Untitled'
            text = review.select(u'.//span[@class="BVRRReviewText"]/text()').extract()
            text = text[0] if text else u'No text supplied.'

            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('url', response.meta['product_url'])
            loader.add_xpath('rating', u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@class="BVRRNumber BVRRRatingNumber"]/text()')
            product['metadata']['reviews'].append(loader.load_item())

        cur_page = hxs.select(u'//span[@class="BVRRPageLink BVRRPageNumber BVRRSelectedPageNumber"]/text()').extract()
        if not cur_page:
            # No pager on the page: there is nothing further to fetch.
            yield product
            return
        cur_page = int(cur_page[0])

        if 'last_page' not in response.meta:
            page_links = hxs.select(u'//span[@class="BVRRPageLink BVRRPageNumber"]/a/text()').extract()
            # Without any other page links the current page is the last one;
            # indexing the empty list would raise and lose the product.
            response.meta['last_page'] = int(page_links[-1]) if page_links else cur_page

        if cur_page < response.meta['last_page']:
            url = response.meta['reviews_url'] + u'&page=%s' % str(cur_page + 1)
            yield Request(url, meta=response.meta, callback=self.parse_review)
        else:
            yield product
Beispiel #6
0
    def parse_review(self, response):
        """Parse the review HTML carried in a JSON response and yield the product.

        The HTML is taken from the ``result`` key of the first element of the
        JSON body. Every review box with non-empty content is loaded and
        appended to ``product['metadata']['reviews']``; boxes without content
        are skipped. The product is yielded once at the end.
        """
        product = response.meta['product']

        payload = json.loads(response.body)
        hxs = HtmlXPathSelector(text=payload[0]['result'])

        # French month names as they appear in the date label, mapped to the
        # two-digit month number substituted before parsing.
        french_months = ((u'janvier', '01'), (u'f\xe9vrier', '02'),
                         (u'mars', '03'), (u'avril', '04'),
                         (u'mai', '05'), (u'juin', '06'),
                         (u'juillet', '07'), (u'ao\xfbt', '08'),
                         (u'septembre', '09'), (u'octobre', '10'),
                         (u'novembre', '11'), (u'd\xe9cembre', '12'))

        review_boxes = hxs.select(
            '//div[contains(@class, "yotpo-reviews")]/div[contains(@class, "yotpo-regular-box")]'
        )
        for box in review_boxes:
            loader = ReviewLoader(item=Review(),
                                  selector=box,
                                  date_format=u'%m/%d/%Y')

            raw_date = box.select(
                './/label[contains(@class, "yotpo-review-date")]/text()'
            ).extract()[0]
            for name, number in french_months:
                raw_date = raw_date.replace(name, number)
            parsed = datetime.datetime.strptime(raw_date, "%d/%m/%y").date()
            loader.add_value('date', parsed.strftime("%m/%d/%Y"))

            loader.add_xpath(
                'full_text',
                './/div[contains(@class, "content-title")]/text()')
            body_fragments = box.select(
                './/div[contains(@class, "content-review")]/text()').extract()
            body = ''.join(body_fragments).strip()
            if not body:
                # Reviews without body text are not recorded.
                continue

            loader.add_value('full_text', body)
            loader.add_value('product_url', product['url'])
            loader.add_value('url', product['url'])
            loader.add_value('sku', product['sku'])

            # The rating is the number of star <span> elements found.
            stars = box.select(
                './/span[@class="yotpo-review-stars"]/span').extract()
            loader.add_value('rating', len(stars))
            product['metadata']['reviews'].append(loader.load_item())

        yield product
Beispiel #7
0
    def parse_review(self, response, product=None):
        """Collect the reviews on this page and follow the next-page link.

        :param response: the review page response.
        :param product: product to attach reviews to; when falsy, the product
            is read from ``response.meta['product']``.

        One loaded ``Review`` per ``pr-review-main`` container is appended to
        ``product['metadata']['reviews']``. Yields a ``Request`` for the next
        page when one is linked (ignoring ``#`` placeholders), otherwise the
        product.
        """
        hxs = HtmlXPathSelector(response)
        if not product:
            product = response.meta['product']

        for review in hxs.select('//div[@class="pr-review-main"]'):
            loader = ReviewLoader(item=Review(),
                                  selector=review,
                                  date_format=u'%Y-%m-%d')
            loader.add_xpath('date',
                             './/span[@itemprop="dtreviewed"]/@datetime')

            loader.add_xpath('full_text',
                             './/div[@class="pr-review-infos-title"]/text()')
            loader.add_xpath('full_text',
                             './/div[@class="pr-comments"]/text()')
            loader.add_value('product_url', product['url'])
            loader.add_value('url', product['url'])
            loader.add_value('sku', product['sku'])
            # The rating is the number of star <span> elements found.
            loader.add_value(
                'rating',
                len(
                    review.select(
                        './/div[@class="pr-stars pr-stars-small"]/span[contains(@class, "pr-star")]'
                    ).extract()))
            product['metadata']['reviews'].append(loader.load_item())

        next_page = hxs.select(
            '//span[@class="pr-page-next"]/a[@href!="#"]/@href').extract()
        if not next_page:
            yield product
        else:
            # The follow-up callback receives only the response, so the
            # product must travel in meta even when it was handed to this
            # call as an argument and is not already in response.meta.
            meta = dict(response.meta)
            meta['product'] = product
            yield Request(urljoin_rfc(get_base_url(response), next_page[0]),
                          callback=self.parse_review,
                          meta=meta)
Beispiel #8
0
    def parse_review(self, response):
        """Collect the reviews on this page and follow the next-page link.

        The product is read from ``response.meta['product']``. One loaded
        ``Review`` per review container is appended to
        ``product['metadata']['reviews']``. Yields a ``Request`` for the next
        page when one is linked, otherwise the product.
        """
        page = HtmlXPathSelector(response)
        product = response.meta['product']

        review_nodes = page.select(
            u'//div[starts-with(@id,"BVRRDisplayContentReviewID_")]')
        for node in review_nodes:
            loader = ReviewLoader(item=Review(),
                                  selector=node,
                                  date_format='%d %B %Y')

            titles = node.select(
                u'.//span[contains(@class,"BVRRReviewTitle")]/text()').extract()
            body_parts = node.select(
                u'.//span[contains(@class,"BVRRReviewText")]/text()').extract()
            full_text = ' '.join(body_parts)
            if titles:
                full_text = titles[0] + '\n' + full_text
            loader.add_value('full_text', full_text)

            loader.add_xpath(
                'date',
                u'.//span[contains(@class,"BVRRReviewDate") and position()=1]/text()')

            # The rating is the first whitespace-separated token of the
            # rating image's title attribute.
            rating_titles = node.select(
                u'.//div[@class="BVRRRatingNormalImage"]/img/@title').extract()
            loader.add_value('rating', rating_titles[0].split()[0])

            loader.add_value('url', response.url)
            product['metadata']['reviews'].append(loader.load_item())

        next_links = page.select(
            u'//span[contains(@class,"BVRRNextPage")]/a/@href').extract()
        if not next_links:
            yield product
        else:
            yield Request(next_links[0],
                          meta=response.meta,
                          callback=self.parse_review)
    def parse_review(self, response):
        """Collect reviews from the product-info table and follow "Next".

        Nothing is yielded for responses that are not ``HtmlResponse``
        instances. Otherwise the product is read from
        ``response.meta['product']``, one loaded ``Review`` per table row is
        appended to ``product['metadata']['reviews']``, and either a
        ``Request`` for the next page or the product is yielded.
        """
        if not isinstance(response, HtmlResponse):
            return

        hxs = HtmlXPathSelector(response)

        reviews = hxs.select(u'//div[@class="boxproductinfo"]/table/tr')
        product = response.meta['product']

        if not reviews:
            yield product
            return

        for review in reviews:
            loader = ReviewLoader(item=Review(),
                                  selector=review,
                                  date_format=u'%d/%m/%Y')
            # A row without a dd/mm/yyyy date gets no date value; indexing
            # the empty match list would raise, abort the callback and lose
            # the product.
            dates = review.select(u'./td/div/p/span/text()').re(
                r'(\d{2}/\d{2}/\d{4})')
            if dates:
                loader.add_value('date', dates[0])
            loader.add_xpath('full_text', u'./td/div[2]/text()')
            loader.add_value('url', response.url)

            product['metadata']['reviews'].append(loader.load_item())

        next_page = hxs.select(
            u'//h4/a[contains(text(),"Next")]/@href').extract()

        if next_page:
            next_page = urljoin_rfc(get_base_url(response), next_page[0])
            yield Request(next_page,
                          meta={'product': product},
                          callback=self.parse_review)
        else:
            yield product
Beispiel #10
0
    def parse_review(self, response):
        """Collect the reviews on this page and request the following page.

        The product is read from ``response.meta['product']``. A page without
        any ``rating-box`` element yields the product and stops. Otherwise
        each review is loaded and appended to
        ``product['metadata']['reviews']``, ``meta['page']`` is incremented,
        and a ``Request`` built from ``meta['reviews_url']`` is yielded.
        """
        page = HtmlXPathSelector(response)

        review_nodes = page.select(u'//div[@class="rating-box"]')
        product = response.meta['product']

        if not review_nodes:
            yield product
            return

        # French month name -> month number (1-based).
        month_names = (
            u'janvier, f\xe9vrier, mars, avril, mai, juin, juillet, ao\xfbt, septembre, octobre, novembre, d\xe9cembre'
            .split(', '))
        month_numbers = dict(
            (name, index) for index, name in enumerate(month_names, 1))

        for node in review_nodes:
            loader = ReviewLoader(item=Review(),
                                  selector=node,
                                  date_format=u'%m/%d/%Y')

            raw_date = node.select(u'.//footer/p/text()').extract()[0]

            # Swap the first month name found for its number, and strip the
            # ' - ' separator, so the text parses as '%d %m %Y'.
            for name, number in month_numbers.items():
                if name in raw_date:
                    raw_date = raw_date.replace(name, str(number))
                    raw_date = raw_date.replace(' - ', '')
                    break
            parsed = time.strptime(raw_date, u'%d %m %Y')
            loader.add_value('date', time.strftime(u'%m/%d/%Y', parsed))

            titles = node.select(u'.//article/header/h3/text()').extract()
            title = titles[0] if titles else u'Untitled'

            # One line per rating paragraph (its first two text nodes),
            # followed by one line per stripped text node of the article.
            chunks = [
                u'{} {}\n'.format(
                    *rating.select('.//text()[normalize-space()]').extract())
                for rating in node.select('.//div[@class="infos-note"]/p')
            ]
            article_lines = node.select(
                './/article//p//text()[normalize-space()]').extract()
            chunks.extend(u'{}\n'.format(line.strip())
                          for line in article_lines)
            text = ''.join(chunks)
            if not text:
                text = u'No text supplied.'

            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('product_url', response.meta['product_url'])
            loader.add_value('url', response.meta['product_url'])
            loader.add_value('sku', product.get('sku') or '')
            loader.add_xpath('rating',
                             u'.//span[@itemprop="ratingValue"]/text()')
            product['metadata']['reviews'].append(loader.load_item())

        reviews_url = response.meta.get('reviews_url')
        meta = response.meta
        meta['page'] += 1
        # ``meta`` is response.meta itself, so the page number put in the URL
        # is the already-incremented counter plus one.
        url_page = str(response.meta.get('page') + 1)
        next_url = reviews_url.format(response.meta.get('product_id'),
                                      url_page)
        yield Request(next_url,
                      meta=meta,
                      callback=self.parse_review,
                      dont_filter=True)
Beispiel #11
0
    def parse_review(self, response):
        """Parse reviews embedded in a JavaScript payload and paginate by number.

        The review markup is pulled out of the ``var materials={...}`` literal
        in the response body and unescaped before being parsed as HTML.
        Reviews are appended to the first entry of
        ``response.meta['products']``. The last page number is cached in
        ``response.meta['last_page']`` the first time it is seen. Yields a
        ``Request`` for page ``cur_page + 1`` while more pages remain;
        otherwise yields every product.

        Nothing is yielded when the products list is empty. When the payload
        cannot be located or holds no reviews, the products are yielded
        unchanged.
        """
        products = response.meta['products']
        if not products:
            # Nothing to attach reviews to and nothing to yield.
            return

        match = re.search('var materials={.*?(<div.*?)"},.initializers',
                          response.body, re.DOTALL)
        if match is None:
            # Unexpected payload: emit the products rather than losing them
            # to an AttributeError on the missing match.
            for product in products:
                yield product
            return

        # Undo the JavaScript string escaping: literal "\n" sequences first,
        # then any remaining backslash-escaped character.
        html = re.sub(r'\\n', r'\n', match.group(1))
        html = re.sub(r'\\(.)', r'\1', html)

        hxs = HtmlXPathSelector(text=html)

        reviews = hxs.select(
            u'//div[starts-with(@id, "BVRRDisplayContentReviewID_")]')
        if not reviews:
            for product in products:
                yield product
            return

        # All reviews are stored on (and take their sku from) the first product.
        first_product = products[0]

        for review in reviews:
            loader = ReviewLoader(item=Review(),
                                  selector=review,
                                  date_format=u'%m/%d/%Y')

            review_id = review.select("@id").re(
                r'BVRRDisplayContentReviewID_(\d+)')
            if review_id:
                loader.add_value('review_id', review_id[0])

            # The page shows dates like "January 5, 2014"; they are converted
            # to the loader's date_format. A missing date is left unset.
            date = review.select(
                u'.//span[contains(@class, "BVRRValue BVRRReviewDate")]/text()'
            ).extract()
            if date:
                parsed = time.strptime(date[0], u'%B %d, %Y')
                loader.add_value('date', time.strftime(u'%m/%d/%Y', parsed))

            title = review.select(
                u'.//span[@class="BVRRValue BVRRReviewTitle summary"]/text()'
            ).extract()
            title = title[0] if title else u'Untitled'

            # Read pros/cons from the current review; taking them from
            # reviews[0] would copy the first review's pros/cons into every
            # review on the page.
            pros_cons_text = u' '.join(
                review.select(
                    u'.//div[@class="BVRRReviewProsConsContainer"]//text()'
                ).extract())
            text = review.select(
                u'.//span[@class="BVRRReviewText"]/text()').extract()
            text = text[0] if text else u'No text supplied.'
            extra_information = u' '.join(
                review.select(
                    u'.//div[@class="BVRRContextDataContainer"]//text()').
                extract())
            text = '%s\n%s\n%s' % (pros_cons_text, text, extra_information)

            loader.add_value('full_text', u'%s\n%s' % (title, text))
            loader.add_value('product_url', response.meta['product_url'])
            loader.add_value('url', response.meta['product_url'])
            loader.add_value('sku', first_product.get('sku') or '')
            loader.add_xpath(
                'rating',
                u'.//div[@id="BVRRRatingOverall_Review_Display"]//span[@itemprop="ratingValue"]/text()'
            )
            first_product['metadata']['reviews'].append(loader.load_item())

        cur_page = hxs.select(
            u'//span[@class="BVRRPageLink BVRRPageNumber BVRRSelectedPageNumber"]/text()'
        ).extract()
        if not cur_page:
            # No pager on the page: there is nothing further to fetch.
            for product in products:
                yield product
            return
        cur_page = int(cur_page[0])

        if 'last_page' not in response.meta:
            page_links = hxs.select(
                u'//span[@class="BVRRPageLink BVRRPageNumber"]/a/text()'
            ).extract()
            # Without any other page links the current page is the last one;
            # indexing the empty list would raise and lose the products.
            response.meta['last_page'] = (int(page_links[-1])
                                          if page_links else cur_page)

        if cur_page < response.meta['last_page']:
            url = response.meta['reviews_url'] + u'&page=%s' % str(cur_page +
                                                                   1)
            yield Request(url, meta=response.meta, callback=self.parse_review)
        else:
            for product in products:
                yield product