Exemple #1
0
 def __init__(self, *args, **kwargs):
     """Set up the spider for a single ASIN.

     Reads ``self.mysql_manager``, ``self.spider_conf``, ``self.amazon_kind``
     and ``self.project_conf``; presumably these are provided by the parent
     class or as class attributes -- confirm against the class definition.

     Keyword Args:
         asin: identifier of the product whose reviews are scraped (required).
         send_mq: optional flag, defaults to 0.

     Raises:
         KeyError: if ``asin`` is not passed as a keyword argument.
     """
     # super() already binds the instance; passing ``self`` again would push
     # it into the parent's first positional parameter.
     super(AmazonReviewsSpider, self).__init__(*args, **kwargs)
     self.asin = kwargs['asin']
     # If the send_mq argument is present and truthy, a message will be sent
     # to 'load' to fetch the scraped reviews.
     self.send_mq = kwargs.get('send_mq', 0)
     self.last_review_in_db = get_latest_user_review_date(
         self.mysql_manager, self.spider_conf['source_id'],
         self.amazon_kind, self.asin)
     self.incremental = get_incremental(self.mysql_manager,
                                        self.spider_conf['source_id'],
                                        self.amazon_kind, self.asin)
     self.update_incremental_kind = self.project_conf.getboolean(
         "OUTPUT", "update_incremental_kind")
Exemple #2
0
    def parse_reviews(self, response):
        """Yield the reviews of one listing page, then request the next page.

        On the first page of a product (no ``last_date_db`` in
        ``response.meta``) the product, its ``bol_id`` and its ``ean`` are
        yielded as well, and the date of the latest stored review is looked
        up. Follow-up pages receive that date through ``response.meta``.

        Pagination stops when the page yields no reviews, or when the stored
        date is later than the date of the last review on the page.
        """
        reviews_xpath = "//li[@itemprop='review']"
        pros_xpath = ".//li[contains(@class, 'review-pros-and-cons__attribute--pro')]//text()"
        cons_xpath = ".//li[contains(@class, 'review-pros-and-cons__attribute--con')]//text()"

        product = response.meta['product']

        if 'last_date_db' not in response.meta:
            bol_id = response.meta['bol_id']
            ean = response.meta.get('ean', None)
            yield product
            yield bol_id
            yield ean

            last_review_in_db = get_latest_user_review_date(
                self.mysql_manager, self.spider_conf['source_id'],
                bol_id["ID_kind"], bol_id["ID_value"])
        else:
            last_review_in_db = response.meta['last_date_db']

        review_items = get_review_items_from_microdata(self, 'USER', response,
                                                       product, reviews_xpath,
                                                       pros_xpath, cons_xpath)

        if not review_items:
            return

        for review in review_items:
            yield review

        # Incremental scraping: ``review`` is the last review of this page.
        date = review['TestDateText']
        last_date_in_page = None
        if date:
            last_date_in_page = dateparser.parse(date, ["%Y-%m-%d"])

        # Only stop when both dates are known; comparing a datetime with None
        # would raise TypeError and abort the crawl of this product.
        if (last_review_in_db is not None
                and last_date_in_page is not None
                and last_review_in_db > last_date_in_page):
            return

        offset = get_query_parameter(response.url, 'offset')
        if not offset:
            offset = self.default_offset
        offset = int(offset) + self.limit

        next_page_url = set_query_parameter(response.url, 'offset', offset)
        next_page_url = set_query_parameter(next_page_url, 'limit', self.limit)
        request = Request(next_page_url, callback=self.parse_reviews)
        request.meta['use_proxy'] = True
        request.meta['last_date_db'] = last_review_in_db
        request.meta['product'] = product
        yield request
Exemple #3
0
    def parse_product(self, response):
        """Build the product (and its MPN id, if any) and request its reviews.

        Yields the ``ProductItem``, a ``ProductIdItem`` of kind ``MPN`` when a
        model number is found on the page, and finally a request for the first
        page of reviews with ``response.meta`` passed along.
        """
        category = response.meta['category']
        productid = None
        product = ProductItem()

        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category['category_path']
        name = self.extract(response.xpath('//h1[@class="product_title"]/text()'))
        product['PicURL'] = self.extract(response.xpath('//img[@id="mainImage"]/@src'))
        product['source_internal_id'] = self.extract(response.xpath('//span[@id="product_internet_number"]/text()'))
        manu = self.extract(response.xpath('//span[@itemprop="brand"]/text()'))
        mpn = self.extract(response.xpath('//span[@itemprop="model"]/text()'))
        if manu:
            product["ProductManufacturer"] = manu
            # Prefer "<brand> <model>" as the name; fall back to
            # "<brand> <page title>" when no model number is available.
            name = manu + ' ' + name
            if mpn:
                name = manu + ' ' + mpn
        product['ProductName'] = name
        yield product

        if mpn:
            productid = ProductIdItem()
            productid['ProductName'] = product["ProductName"]
            productid['source_internal_id'] = product['source_internal_id']
            productid['ID_kind'] = "MPN"
            productid['ID_value'] = mpn
            yield productid

        response.meta['product'] = product

        # Without an MPN there is no id to look up; subscripting the missing
        # id would raise TypeError, so no stored date is assumed instead.
        last_user_review = None
        if productid:
            response.meta['product_id'] = productid
            last_user_review = incremental_utils.get_latest_user_review_date(
                self.mysql_manager, self.spider_conf['source_id'],
                productid["ID_kind"], productid['ID_value'])

        response.meta['last_user_review'] = last_user_review

        yield Request(
            'http://homedepot.ugc.bazaarvoice.com/1999aa/{0}/reviews.djs?format=embeddedhtml&page=1&sort=submissionTime'.format(
                product["source_internal_id"]
            ), callback=self.parse_reviews, meta=response.meta, errback=self.errback)
Exemple #4
0
    def start_requests(self):
        """Yield one review-page request per ASIN in ``self.asins``.

        Each request carries the ASIN and the date of the latest stored user
        review for that ASIN in its ``meta``.
        """
        query = (
            "select pi.id_value from review.product_id pi "
            "join review.products p on pi.prdid = p.id "
            "join mamboinput.alascore a on p.al_id = a.al_id "
            "where pi.kind = 7 and p.source_id =  %s "
            "and TIMESTAMPDIFF(MONTH, p.updatetime,now()) < 4 "
            "order by a.alascore desc, a.rank"
        )

        # NOTE(review): the result of this select is not used here, while the
        # loop below reads self.asins -- confirm where self.asins is filled.
        self.mysql_manager.execute_select(query, self.spider_conf['source_id'])

        for asin in self.asins:
            request = Request(url=self.start_url_format % asin,
                              callback=self.parse_reviews)
            request.meta.update({
                'asin': asin,
                'last_review_in_db': get_latest_user_review_date(
                    self.mysql_manager, self.spider_conf['source_id'],
                    self.amazon_kind, asin),
            })
            yield request
    def parse_product(self, response):
        """Yield the product, its MPN id, and the first reviews-page request.

        The product is initialised from XPaths via ``init_item_by_xpaths``;
        the picture URL falls back to the image's ``data-src`` attribute when
        ``src`` is empty. ``response.meta`` is enriched with the product, its
        id, the latest stored review date and ``incremental=True`` before it
        is handed to the reviews request.
        """
        meta = response.meta
        category = meta['category']

        product = self.init_item_by_xpaths(response, "product", {
            "ProductName": "//*[contains(@class,'pdp-prod-name')]//text()",
            "PicURL": "//img[@class='primary-image']/@src",
            "source_internal_id": "//*[@itemprop='productID']/text()",
        })

        product['OriginalCategoryName'] = category['category_path']
        product["ProductManufacturer"] = "Panasonic"
        product["TestUrl"] = response.url
        if not product["PicURL"]:
            fallback_picture = response.xpath(
                "//img[@class='primary-image']/@data-src")
            product["PicURL"] = self.extract(fallback_picture)

        yield product

        # The source's own product id doubles as the MPN value.
        internal_id = product["source_internal_id"]
        product_id = ProductIdItem()
        product_id['source_internal_id'] = internal_id
        product_id['ProductName'] = product["ProductName"]
        product_id['ID_kind'] = "MPN"
        product_id['ID_value'] = internal_id
        yield product_id

        meta['product'] = product
        meta['product_id'] = product_id

        meta['last_user_review'] = incremental_utils.get_latest_user_review_date(
            self.mysql_manager, self.spider_conf['source_id'],
            product_id["ID_kind"], product_id['ID_value'])
        meta['incremental'] = True

        reviews_url = (
            'http://panasonic.reviews.bazaarvoice.com/9203-en_us/{0}/reviews.djs?format=embeddedhtml&page=1&sort=submissionTime'
            .format(product["source_internal_id"]))
        yield Request(reviews_url, callback=self.parse_reviews, meta=meta)
    def parse_product(self, response):
        """Yield the product, its ids, and the new user reviews on the page.

        A ``ProductIdItem`` is yielded for the MPN and for the UPC when the
        page lists them. The last id found (UPC if present, otherwise MPN) is
        used to look up the date of the latest stored user review; only
        reviews dated after it are yielded. When the page lists no id, or no
        stored date is available, every dated review is yielded.
        """
        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = self.extract_all(
            response.xpath('//div[@class="product-breadcrumbs"]//li//text()'),
            ' > ')
        product['ProductName'] = self.extract(
            response.xpath('//h1[@id]/text()'))
        product['source_internal_id'] = self.extract(
            response.xpath(
                '//th[contains(text(),"SKU")]/parent::tr/td/text()'))
        product['PicURL'] = self.extract(
            response.xpath('//img[@id="productmain"]/@src'))
        product["ProductManufacturer"] = self.extract(
            response.xpath(
                '//th[contains(text(),"Manufacturer")]/parent::tr/td//text()'))
        yield product

        # Order matters: the last id found is the one used for the lookup.
        id_xpaths = (
            ("MPN", '//th[contains(text(),"Mfg")]/parent::tr/td/text()'),
            ("UPC", '//th[text()="UPC"]/parent::tr/td/text()'),
        )
        product_id = None
        for id_kind, id_xpath in id_xpaths:
            id_value = self.extract(response.xpath(id_xpath))
            if not id_value:
                continue
            product_id = ProductIdItem()
            product_id['source_internal_id'] = product["source_internal_id"]
            product_id['ProductName'] = product["ProductName"]
            product_id['ID_kind'] = id_kind
            product_id['ID_value'] = id_value
            yield product_id

        # Pages without MPN and UPC have no id to look up; subscripting the
        # missing id would raise TypeError.
        last_user_review = None
        if product_id is not None:
            last_user_review = incremental_utils.get_latest_user_review_date(
                self.mysql_manager, self.spider_conf['source_id'],
                product_id["ID_kind"], product_id['ID_value'])

        for review in response.xpath('//article[@id]'):
            # Keep only the part before the first space of data-created.
            date_raw = self.extract(
                review.xpath('.//@data-created')).split(' ')[0]
            date_formatted = date_format(date_raw, "%m/%d/%Y")
            if not date_formatted:
                continue

            date_parsed = dateparser.parse(date_formatted,
                                           date_formats=['%Y-%m-%d'])
            if not date_parsed:
                continue
            if last_user_review is not None and not last_user_review < date_parsed:
                # Not newer than the latest stored review: skip it.
                continue

            user_review = ReviewItem()
            user_review['DBaseCategoryName'] = "USER"
            user_review['ProductName'] = product['ProductName']
            user_review['TestUrl'] = product['TestUrl']
            user_review['TestDateText'] = date_formatted
            user_review['SourceTestRating'] = self.extract(
                review.xpath('./@class')).strip('s')
            user_review['Author'] = self.extract(
                review.xpath('.//div/span/em/text()'))
            user_review['TestTitle'] = self.extract(
                review.xpath('.//h4/text()'))
            user_review['TestSummary'] = self.extract_all(
                review.xpath('.//p[@id]/text()'))
            user_review['source_internal_id'] = product[
                'source_internal_id']
            yield user_review
Exemple #7
0
    def parse_reviews(self, response):
        """Yield the reviews of one Bazaarvoice page and the next-page request.

        Inputs are read from ``response.meta``: ``product``, ``product_id``,
        ``brand``, ``parse_product``, ``last_user_review`` and ``incremental``
        (defaults to True). When no product is supplied and product parsing is
        enabled, the product and its id are built from the response and
        yielded first.

        Reviews attributed to another product or another source are parsed but
        not yielded. In incremental mode the method returns as soon as a
        review older than ``last_user_review`` is met, without requesting the
        next page.

        Raises:
            Exception: if product parsing is enabled but ``self.default_kind``
                is not set.
        """
        product = response.meta.get('product', None)
        product_id = response.meta.get('product_id', None)
        brand = response.meta.get('brand', None)
        request_parse_product = response.meta.get('parse_product', None)
        parse_product = self.parse_BV_product

        if request_parse_product is not None:
            parse_product = request_parse_product

        if parse_product and not self.default_kind:
            raise Exception(
                "Parsing product from template but kind not defined")

        if not product and parse_product:
            product = self._parse_product(response, brand=brand)
            product_id = self.product_id(product)
            product_id["ID_kind"] = self.default_kind
            product_id['ID_value'] = product['source_internal_id']
            response.meta['product'] = product
            yield product
            yield product_id

        next_page_xpath = '(//*[contains(@class,"BVRRNextPage")])[1]/a/@href'
        last_user_review = response.meta.get('last_user_review', None)
        incremental = response.meta.get('incremental', True)
        if not last_user_review:
            if product_id:
                last_user_review = incremental_utils.get_latest_user_review_date(
                    self.mysql_manager, self.spider_conf['source_id'],
                    product_id["ID_kind"], product_id['ID_value'])

        review_list_xpath = '//*[contains(@class,"BVRRContentReview")]'
        from_another_product_xpath = ".//*[contains(@class,'BVDI_SU BVDI_SUAttribution')]"
        from_another_source_xpath = ".//*[contains(@class, 'BVRRSyndicatedContentAttribution')]"

        review_list = response.xpath(review_list_xpath)
        if not review_list:
            # No reviews on this page: nothing to yield and no pagination.
            return

        for review_selector in review_list:
            from_another_source = review_selector.xpath(
                from_another_source_xpath)
            from_another_product = review_selector.xpath(
                from_another_product_xpath)
            review = self.parse_review(response, review_selector)
            if not from_another_product and not from_another_source:
                yield review

            if last_user_review and incremental:
                current_user_review = datetime.strptime(
                    review['TestDateText'], '%Y-%m-%d')
                if current_user_review < last_user_review:
                    return

        # Paginate once per page, and only after every review passed the
        # incremental check; building the request inside the loop emitted it
        # once per review and before the stop condition could apply.
        next_page_url = self.extract(response.xpath(next_page_xpath))
        next_page_url = get_full_url(response.url, next_page_url)
        if next_page_url:
            request = Request(url=next_page_url,
                              callback=self.parse_reviews)
            request.meta['last_user_review'] = last_user_review
            request.meta['product'] = product
            request.meta['product_id'] = product_id
            # Carry the flag forward so a non-incremental crawl is not
            # silently switched back to the default on the next page.
            request.meta['incremental'] = incremental
            yield request