def parse_product_rev_page(self, response):
        """
        Parse one page of a product's reviews.

        Yields a ``Review`` item for every review block that carries a star
        rating, followed by a request for the reviewer's page when a reviewer
        id could be extracted.  After all reviews, yields a request for the
        successor page while ``response.meta['page']`` is below the page count
        read from the pager.

        ``response.meta`` must contain ``'id'`` (the product id) and ``'page'``
        (the current page number).
        """
        log.msg('Parsing product reviews: %s p%d' % (response.meta['id'], response.meta['page']), level=log.INFO, spider=self)

        hxs = HtmlXPathSelector(response)
        product_id = response.meta['id']

        # yield reviews and members posting them
        for rev in hxs.select('//table[@id="productReviews"]//td/div'):
            star_rating = rev.select('.//span/span[contains(@class, "swSprite")]/@title').re(star_rating_re)
            if not star_rating:
                # It is probably a manufacturer response, not a review
                continue

            member_id = only_elem_or_default(rev.select('.//div[contains(text(), "By")]/following-sibling::div/a[contains(@href, "profile")]/@href').re(member_url_id_re))
            if member_id:
                member_id = str(member_id)

            # A missing comment link means the review has no comments yet.
            n_comments = only_elem_or_default(rev.select('.//div//div/a/text()').re(r'Comments\s+\((\d+)\)'), '0')

            review = SingleValItemLoader(item=Review(), response=response)
            review.add_value('starRating', star_rating)
            review.add_value('id', rev.select('.//span[@class="tiny"]/a[contains(text(), "Permalink")]/@href').re(review_id_re))
            review.add_value('productId', product_id)
            review.add_value('memberId', member_id)
            review.add_value('helpful', rev.select('div[contains(text(), "helpful")]/text()').re(r'\d+'))
            review.add_value('title', rev.select('.//span[contains(span/@class, "swSprite")]/following-sibling::span//b/text()').extract())
            review.add_value('date', rev.select('div/span/nobr/text()').extract())
            # For the two flags below the matched nodes themselves are handed to the loader.
            review.add_value('verifiedPurchase', rev.select('.//span[contains(@class, "crVerifiedStripe")]'))
            review.add_value('reviewTxt', rev.select('text()').extract())
            review.add_value('nComments', n_comments)
            review.add_value('vine', rev.select('.//span/b[contains(text(), "Customer review from the Amazon Vine Program")]'))
            yield review.load_item()

            # Follow the reviewer only when an id was actually extracted;
            # otherwise the request would be built from a missing id.
            if member_id:
                yield self._item_page_request(member_id, MEMBER_TYPE, referrer=product_id)

        # request subsequent pages to be downloaded
        # The link immediately preceding the "Next" link holds the number of review pages;
        # when there is no pager the product has a single page.
        n_pages_xpath = '(//table[@class="CMheadingBar"])[1]//span[@class="paging"]//a[following-sibling::a[1][starts-with(text(),"Next")]]/text()'
        n_pages = int(only_elem_or_default(hxs.select(n_pages_xpath).re(r'\d+'), default='1'))
        if response.meta['page'] < n_pages:
            yield self._successor_page_request(response)
    def parse_member_rev_page(self, response):
        """
        Parse one page of a member's reviews.

        Review headers (which link to the product) and review bodies are
        selected separately and paired positionally with ``zip``, so any
        surplus element on either side is ignored.

        Yields a ``Review`` item for every body that carries a star rating,
        followed by a request for the reviewed product's page when a product
        id could be extracted.  Finally yields a request for the successor
        page when the pager text matches a number followed by ``|``.

        ``response.meta`` must contain ``'id'`` (the member id) and ``'page'``
        (the current page number).
        """

        log.msg('Parsing member reviews: %s p%d' % (response.meta['id'], response.meta['page']), level=log.INFO, spider=self)

        hxs = HtmlXPathSelector(response)
        member_id = response.meta['id']

        # yield each review
        rev_body_elems = hxs.select('//table//td[not(@width)]//table//tr[not(@valign)]/td[@class="small"]/div')
        rev_header_elems = hxs.select('//table//td[not(@width)]//table//tr[@valign]/td[@align][2]//table[@class="small"]')
        for rev_header, rev_body in zip(rev_header_elems, rev_body_elems):
            star_rating = rev_body.select('.//span/img[contains(@title, "stars")]/@title').re(star_rating_re)
            if not star_rating:
                # The review is probably a manufacturer response and not an actual review
                continue

            product_id = only_elem_or_default(rev_header.select('.//b/a/@href').re(product_url_id_re))
            if product_id:
                product_id = str(product_id)

            # A missing comment link means the review has no comments yet.
            n_comments = only_elem_or_default(rev_body.select('.//div/a/text()').re(r'Comments\s+\((\d+)\)'), '0')

            # populating review data
            review = SingleValItemLoader(item=Review(), response=response)
            review.add_value('starRating', star_rating)
            review.add_value('productId', product_id)
            review.add_value('memberId', member_id)
            review.add_value('id', rev_body.select('div/a[contains(text(), "Permalink")]/@href').re(review_id_re))
            review.add_value('helpful', rev_body.select('div[contains(text(), "helpful")]/text()').re(r'\d+'))
            review.add_value('title', rev_body.select('div/span[contains(img/@alt, "stars")]/following-sibling::b[1]/text()').extract())
            review.add_value('date', rev_body.select('div/nobr/text()').extract())
            # For the two flags below the matched nodes themselves are handed to the loader.
            review.add_value('verifiedPurchase', rev_body.select('.//span[contains(@class, "crVerifiedStripe")]'))
            review.add_value('reviewTxt', rev_body.select('text()').extract())
            review.add_value('nComments', n_comments)
            review.add_value('vine', rev_body.select('.//span/b[contains(text(), "Customer review from the Amazon Vine Program")]'))

            yield review.load_item()

            # Follow the product only when an id was actually extracted;
            # otherwise the request would be built from a missing id.
            if product_id:
                yield self._item_page_request(product_id, PROD_TYPE, referrer=member_id)

        # make request for subsequent pages
        if hxs.select('//table//table//td[@class="small"]/b/text()').re(r'(\d+)\s+\|'):
            yield self._successor_page_request(response)
    def parse_product_details_page(self, response):
        """
        Extract product details from a product page.

        Yields, in order: a ``Product`` item; a request for a page of products
        in the same category (the sub-category link is preferred, the
        best-sellers link is the fallback) when such a link was found; a
        request for the manufacturer's page when a manufacturer link was
        found; and the request for the product's reviews page.

        ``response.meta['id']`` must hold the product id.
        """
        log.msg('Parsing product info: %s' % response.meta['id'], level=log.INFO, spider=self)
        hxs = HtmlXPathSelector(response)
        product_id = response.meta['id']

        # Every field is tried against several page layouts; the first XPath that matches wins.
        name = hxs.select('//body//span[@id="btAsinTitle"]/text()').extract() or \
               hxs.select('//body//div[@id="title_feature_div"]//h1/text()').extract()
        if not name:
            # Last resort: the part of the page title before the first colon.
            name = hxs.select('//head/title/text()').re(r'(?:Amazon:\s+)?([^:]+)')
        price = hxs.select('//body//span[@id="actualPriceValue"]//text()').re(price_re) or \
                hxs.select('//body//div[@id="price"]//span[contains(@class, "a-color-price")]/text()').re(price_re) or \
                hxs.select('//body//div[@id="priceBlock"]//span[@class="priceLarge"]/text()').re(price_re)

        manufact_node = hxs.select('(//body//div[@class="buying" and h1[contains(@class, "parseasinTitle")]]//a)[1]') or \
                        hxs.select('(//body//div[@id="brandByline_feature_div"]//a[@id="brand"])[1]') or \
                        hxs.select('(//body//span[@class="contributorNameTrigger"]//a)[1]')
        manufact, manufact_links = [], []
        if manufact_node:
            manufact = manufact_node.select('./text()').extract()
            manufact_links = manufact_node.select('./@href').extract()

        # Average rating and review count; the two layouts differ in how deep the star span sits.
        avg_stars, n_reviews = None, None
        stars_xpath = './span[contains(@title, "star")]/@title'
        reviews_t = hxs.select('//body//div[@id="centerCol"]//div[@id="averageCustomerReviews"]')
        if not reviews_t:
            stars_xpath = './/span[contains(@title, "star")]/@title'
            reviews_t = hxs.select('(//body//*[self::div[@class="buying"] or self::form[@id="handleBuy"]]//span[@class="crAvgStars"])[1]')
        if reviews_t:
            avg_stars = reviews_t.select(stars_xpath).re(star_rating_re)
            n_reviews = reviews_t.select('./a[contains(@href, "product-reviews")]/text()').re(number_digit_grpd)

        # lower-case() is an XPath 2.0 function and is not part of XPath 1.0, so the
        # case-insensitive "see top" match is done with the XPath 1.0 translate() function.
        see_top_link = ('a[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "see top")'
                        ' and (contains(@href, "/best-sellers") or contains(@href, "/bestsellers"))]/@href')
        # (XPath of the sales-rank container, axis prefix used to look up the "see top" link in it)
        rank_layouts = (
            ('//body//li[@id="SalesRank"]', ''),
            ('//body//div[@id="detailBullets"]//span[contains(b/text(), "Amazon Best Sellers Rank")]', ''),
            ('//body//tr[@id="SalesRank"]', './/'),
        )
        sales_rank, cat, sub_cat_rank, sub_cat = [None]*4
        best_sellers_href, sub_cat_href = [], []
        for parent_xpath, link_axis in rank_layouts:
            parent_node = hxs.select(parent_xpath)
            if not parent_node:
                continue
            # Take the first (rank, category) pair; an unexpected number of regex
            # results must not abort the whole product with an unpacking error.
            rank_and_cat = parent_node.select('.//text()').re(sales_rank_re)
            if len(rank_and_cat) >= 2:
                sales_rank, cat = rank_and_cat[:2]
            best_sellers_href = parent_node.select(link_axis + see_top_link).extract()
            sub_cat_node = parent_node.select('.//li[@class="zg_hrsr_item"][1]')
            if sub_cat_node:
                # The last link of the ladder is the most specific sub-category.
                sub_cat_rank = sub_cat_node.select('./span[@class="zg_hrsr_rank"]/text()').re(number_digit_grpd)
                sub_cat = sub_cat_node.select('(./span[@class="zg_hrsr_ladder"]//a)[position()=last()]/text()').extract()
                sub_cat_href = sub_cat_node.select('(./span[@class="zg_hrsr_ladder"]//a)[position()=last()]/@href').extract()
            # Only the first layout that is present on the page is used.
            break

        # yield product details
        product = SingleValItemLoader(item=Product(), response=response)
        product.add_value('id', product_id)
        product.add_value('name', name)
        product.add_value('price', price)
        product.add_value('avgStars', avg_stars)
        product.add_value('nReviews', n_reviews)
        product.add_value('salesRank', sales_rank)
        product.add_value('cat', cat)
        product.add_value('subCatRank', sub_cat_rank)
        product.add_value('subCat', sub_cat)
        product.add_value('manufact', manufact)
        yield product.load_item()

        # yield same category and same manufacturer products
        same_cat_href = only_elem_or_default(sub_cat_href or best_sellers_href)
        if same_cat_href:
            yield Request(urljoin(response.url, same_cat_href), callback=self.parse_product_category_page,
                          meta={'id': product_id, 'type': PROD_TYPE, 'referrer': product_id})
        manufact_href = only_elem_or_default(manufact_links)
        if manufact_href:
            yield Request(urljoin(response.url, manufact_href), callback=self.parse_product_manufact_page,
                          meta={'id': product_id, 'type': PROD_TYPE, 'referrer': product_id})

        # yield the product reviews page.
        yield self._rev_page_request(product_id, PROD_TYPE)
Esempio n. 4
0
    def parse_member_rev_page(self, response):
        """
        Parse one page of a member's reviews.

        Review headers (which link to the product) and review bodies are
        selected separately and paired positionally with ``zip``, so any
        surplus element on either side is ignored.

        Yields a ``Review`` item for every body that carries a star rating,
        followed by a request for the reviewed product's page when a product
        id could be extracted.  Finally yields a request for the successor
        page when the pager text matches a number followed by ``|``.

        ``response.meta`` must contain ``'id'`` (the member id) and ``'page'``
        (the current page number).
        """

        log.msg('Parsing member reviews: %s p%d' %
                (response.meta['id'], response.meta['page']),
                level=log.INFO,
                spider=self)

        hxs = HtmlXPathSelector(response)
        member_id = response.meta['id']

        # yield each review
        rev_body_elems = hxs.select(
            '//table//td[not(@width)]//table//tr[not(@valign)]/td[@class="small"]/div'
        )
        rev_header_elems = hxs.select(
            '//table//td[not(@width)]//table//tr[@valign]/td[@align][2]//table[@class="small"]'
        )
        for rev_header, rev_body in zip(rev_header_elems, rev_body_elems):
            star_rating = rev_body.select(
                './/span/img[contains(@title, "stars")]/@title').re(
                    star_rating_re)
            if not star_rating:
                # The review is probably a manufacturer response and not an actual review
                continue

            product_id = only_elem_or_default(
                rev_header.select('.//b/a/@href').re(product_url_id_re))
            if product_id:
                product_id = str(product_id)

            # A missing comment link means the review has no comments yet.
            n_comments = only_elem_or_default(
                rev_body.select('.//div/a/text()').re(r'Comments\s+\((\d+)\)'),
                '0')

            # populating review data
            review = SingleValItemLoader(item=Review(), response=response)
            review.add_value('starRating', star_rating)
            review.add_value('productId', product_id)
            review.add_value('memberId', member_id)
            review.add_value(
                'id',
                rev_body.select('div/a[contains(text(), "Permalink")]/@href').
                re(review_id_re))
            review.add_value(
                'helpful',
                rev_body.select('div[contains(text(), "helpful")]/text()').re(
                    r'\d+'))
            review.add_value(
                'title',
                rev_body.select(
                    'div/span[contains(img/@alt, "stars")]/following-sibling::b[1]/text()'
                ).extract())
            review.add_value('date',
                             rev_body.select('div/nobr/text()').extract())
            # For the two flags below the matched nodes themselves are handed to the loader.
            review.add_value(
                'verifiedPurchase',
                rev_body.select(
                    './/span[contains(@class, "crVerifiedStripe")]'))
            review.add_value('reviewTxt', rev_body.select('text()').extract())
            review.add_value('nComments', n_comments)
            review.add_value(
                'vine',
                rev_body.select(
                    './/span/b[contains(text(), "Customer review from the Amazon Vine Program")]'
                ))

            yield review.load_item()

            # Follow the product only when an id was actually extracted;
            # otherwise the request would be built from a missing id.
            if product_id:
                yield self._item_page_request(product_id,
                                              PROD_TYPE,
                                              referrer=member_id)

        # make request for subsequent pages
        if hxs.select('//table//table//td[@class="small"]/b/text()').re(
                r'(\d+)\s+\|'):
            yield self._successor_page_request(response)
Esempio n. 5
0
    def parse_product_rev_page(self, response):
        """
        Parse one page of a product's reviews.

        Yields a ``Review`` item for every review block that carries a star
        rating, followed by a request for the reviewer's page when a reviewer
        id could be extracted.  After all reviews, yields a request for the
        successor page while ``response.meta['page']`` is below the page count
        read from the pager.

        ``response.meta`` must contain ``'id'`` (the product id) and ``'page'``
        (the current page number).
        """

        log.msg('Parsing product reviews: %s p%d' %
                (response.meta['id'], response.meta['page']),
                level=log.INFO,
                spider=self)

        hxs = HtmlXPathSelector(response)
        product_id = response.meta['id']

        # yield reviews and members posting them
        for rev in hxs.select('//table[@id="productReviews"]//td/div'):
            star_rating = rev.select(
                './/span/span[contains(@class, "swSprite")]/@title').re(
                    star_rating_re)
            if not star_rating:
                # It is probably a manufacturer response, not a review
                continue

            member_id = only_elem_or_default(
                rev.select(
                    './/div[contains(text(), "By")]/following-sibling::div/a[contains(@href, "profile")]/@href'
                ).re(member_url_id_re))
            if member_id:
                member_id = str(member_id)

            # A missing comment link means the review has no comments yet.
            n_comments = only_elem_or_default(
                rev.select('.//div//div/a/text()').re(r'Comments\s+\((\d+)\)'),
                '0')

            review = SingleValItemLoader(item=Review(), response=response)
            review.add_value('starRating', star_rating)
            review.add_value(
                'id',
                rev.select(
                    './/span[@class="tiny"]/a[contains(text(), "Permalink")]/@href'
                ).re(review_id_re))
            review.add_value('productId', product_id)
            review.add_value('memberId', member_id)
            review.add_value(
                'helpful',
                rev.select('div[contains(text(), "helpful")]/text()').re(
                    r'\d+'))
            review.add_value(
                'title',
                rev.select(
                    './/span[contains(span/@class, "swSprite")]/following-sibling::span//b/text()'
                ).extract())
            review.add_value('date',
                             rev.select('div/span/nobr/text()').extract())
            # For the two flags below the matched nodes themselves are handed to the loader.
            review.add_value(
                'verifiedPurchase',
                rev.select('.//span[contains(@class, "crVerifiedStripe")]'))
            review.add_value('reviewTxt', rev.select('text()').extract())
            review.add_value('nComments', n_comments)
            review.add_value(
                'vine',
                rev.select(
                    './/span/b[contains(text(), "Customer review from the Amazon Vine Program")]'
                ))
            yield review.load_item()

            # Follow the reviewer only when an id was actually extracted;
            # otherwise the request would be built from a missing id.
            if member_id:
                yield self._item_page_request(member_id,
                                              MEMBER_TYPE,
                                              referrer=product_id)

        # request subsequent pages to be downloaded
        # The link immediately preceding the "Next" link holds the number of review pages;
        # when there is no pager the product has a single page.
        n_pages_xpath = '(//table[@class="CMheadingBar"])[1]//span[@class="paging"]//a[following-sibling::a[1][starts-with(text(),"Next")]]/text()'
        n_pages = int(
            only_elem_or_default(hxs.select(n_pages_xpath).re(r'\d+'),
                                 default='1'))
        if response.meta['page'] < n_pages:
            yield self._successor_page_request(response)
Esempio n. 6
0
    def parse_product_details_page(self, response):
        """
        Extract product details from a product page.

        Yields, in order: a ``Product`` item; a request for a page of products
        in the same category (the sub-category link is preferred, the
        best-sellers link is the fallback) when such a link was found; a
        request for the manufacturer's page when a manufacturer link was
        found; and the request for the product's reviews page.

        ``response.meta['id']`` must hold the product id.
        """
        log.msg('Parsing product info: %s' % response.meta['id'],
                level=log.INFO,
                spider=self)
        hxs = HtmlXPathSelector(response)
        product_id = response.meta['id']

        # Every field is tried against several page layouts; the first XPath that matches wins.
        name = hxs.select('//body//span[@id="btAsinTitle"]/text()').extract() or \
               hxs.select('//body//div[@id="title_feature_div"]//h1/text()').extract()
        if not name:
            # Last resort: the part of the page title before the first colon.
            name = hxs.select('//head/title/text()').re(
                r'(?:Amazon:\s+)?([^:]+)')
        price = hxs.select('//body//span[@id="actualPriceValue"]//text()').re(price_re) or \
                hxs.select('//body//div[@id="price"]//span[contains(@class, "a-color-price")]/text()').re(price_re) or \
                hxs.select('//body//div[@id="priceBlock"]//span[@class="priceLarge"]/text()').re(price_re)

        manufact_node = hxs.select('(//body//div[@class="buying" and h1[contains(@class, "parseasinTitle")]]//a)[1]') or \
                        hxs.select('(//body//div[@id="brandByline_feature_div"]//a[@id="brand"])[1]') or \
                        hxs.select('(//body//span[@class="contributorNameTrigger"]//a)[1]')
        manufact, manufact_links = [], []
        if manufact_node:
            manufact = manufact_node.select('./text()').extract()
            manufact_links = manufact_node.select('./@href').extract()

        # Average rating and review count; the two layouts differ in how deep the star span sits.
        avg_stars, n_reviews = None, None
        stars_xpath = './span[contains(@title, "star")]/@title'
        reviews_t = hxs.select(
            '//body//div[@id="centerCol"]//div[@id="averageCustomerReviews"]')
        if not reviews_t:
            stars_xpath = './/span[contains(@title, "star")]/@title'
            reviews_t = hxs.select(
                '(//body//*[self::div[@class="buying"] or self::form[@id="handleBuy"]]//span[@class="crAvgStars"])[1]'
            )
        if reviews_t:
            avg_stars = reviews_t.select(stars_xpath).re(star_rating_re)
            n_reviews = reviews_t.select(
                './a[contains(@href, "product-reviews")]/text()').re(
                    number_digit_grpd)

        # lower-case() is an XPath 2.0 function and is not part of XPath 1.0, so the
        # case-insensitive "see top" match is done with the XPath 1.0 translate() function.
        see_top_link = (
            'a[contains(translate(text(), "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"), "see top")'
            ' and (contains(@href, "/best-sellers") or contains(@href, "/bestsellers"))]/@href'
        )
        # (XPath of the sales-rank container, axis prefix used to look up the "see top" link in it)
        rank_layouts = (
            ('//body//li[@id="SalesRank"]', ''),
            ('//body//div[@id="detailBullets"]//span[contains(b/text(), "Amazon Best Sellers Rank")]',
             ''),
            ('//body//tr[@id="SalesRank"]', './/'),
        )
        sales_rank, cat, sub_cat_rank, sub_cat = [None] * 4
        best_sellers_href, sub_cat_href = [], []
        for parent_xpath, link_axis in rank_layouts:
            parent_node = hxs.select(parent_xpath)
            if not parent_node:
                continue
            # Take the first (rank, category) pair; an unexpected number of regex
            # results must not abort the whole product with an unpacking error.
            rank_and_cat = parent_node.select('.//text()').re(sales_rank_re)
            if len(rank_and_cat) >= 2:
                sales_rank, cat = rank_and_cat[:2]
            best_sellers_href = parent_node.select(
                link_axis + see_top_link).extract()
            sub_cat_node = parent_node.select(
                './/li[@class="zg_hrsr_item"][1]')
            if sub_cat_node:
                # The last link of the ladder is the most specific sub-category.
                sub_cat_rank = sub_cat_node.select(
                    './span[@class="zg_hrsr_rank"]/text()').re(
                        number_digit_grpd)
                sub_cat = sub_cat_node.select(
                    '(./span[@class="zg_hrsr_ladder"]//a)[position()=last()]/text()'
                ).extract()
                sub_cat_href = sub_cat_node.select(
                    '(./span[@class="zg_hrsr_ladder"]//a)[position()=last()]/@href'
                ).extract()
            # Only the first layout that is present on the page is used.
            break

        # yield product details
        product = SingleValItemLoader(item=Product(), response=response)
        product.add_value('id', product_id)
        product.add_value('name', name)
        product.add_value('price', price)
        product.add_value('avgStars', avg_stars)
        product.add_value('nReviews', n_reviews)
        product.add_value('salesRank', sales_rank)
        product.add_value('cat', cat)
        product.add_value('subCatRank', sub_cat_rank)
        product.add_value('subCat', sub_cat)
        product.add_value('manufact', manufact)
        yield product.load_item()

        # yield same category and same manufacturer products
        same_cat_href = only_elem_or_default(sub_cat_href or best_sellers_href)
        if same_cat_href:
            yield Request(urljoin(response.url, same_cat_href),
                          callback=self.parse_product_category_page,
                          meta={
                              'id': product_id,
                              'type': PROD_TYPE,
                              'referrer': product_id
                          })
        manufact_href = only_elem_or_default(manufact_links)
        if manufact_href:
            yield Request(urljoin(response.url, manufact_href),
                          callback=self.parse_product_manufact_page,
                          meta={
                              'id': product_id,
                              'type': PROD_TYPE,
                              'referrer': product_id
                          })

        # yield the product reviews page.
        yield self._rev_page_request(product_id, PROD_TYPE)