def get_product_task(prdid):
    """Fetch the stored crawl task for product *prdid* from redis.

    Returns the decoded task (as produced by ``json.loads``), or ``None``
    when no task is stored or the stored payload is not valid JSON.
    """
    cli = get_cli()
    d = cli.get(KEY_PRODUCT_TASK.format(p=prdid))
    if d is None:
        return None
    try:
        return json.loads(d)
    except ValueError as e:
        # json.loads raises ValueError on malformed payloads; the old code
        # caught bare Exception and fell off the end, returning None only
        # implicitly. Keep the same log line but make the fallback explicit.
        logger.info('get task: {p} {d} {e}'.format(
            p=prdid, d=repr(d), e=repr(e)))
        return None
def get_product_task(prdid):
    """Look up the task stored for *prdid*; None if absent or undecodable."""
    raw = get_cli().get(KEY_PRODUCT_TASK.format(p=prdid))
    if raw is None:
        return None
    try:
        return json.loads(raw)
    except Exception as e:
        # Decode failure is logged and the function returns None implicitly.
        logger.info('get task: {p} {d} {e}'.format(
            p=prdid, d=repr(raw), e=repr(e)))
def next_product_url():
    """Block until a product whose previous crawl has expired is available,
    then return the Amazon customer-reviews URL for it.

    Polls the redis product queue every 5 seconds while it is empty.
    Products crawled within the last CRAWL_PRODUCT_TIMEOUT days are skipped.
    """
    cli = get_cli()
    while True:
        item = cli.lpop(KEY_PRODUCTS)
        if item is None:
            logger.info('no product in queue ...')
            time.sleep(5)
            continue
        try:
            asin = json.loads(item)['asin']
            if product_is_timeout(asin):
                # Claim the product: record the task, clear stale reviews.
                set_product_task(asin)
                del_product_reviews(asin)
                logger.info('next product : ' + asin)
                return 'http://www.amazon.com/ss/customer-reviews/' + asin
            logger.info('product : ' + asin +
                        ' is not timeout, {day} days'.format(
                            day=CRAWL_PRODUCT_TIMEOUT))
        except Exception as e:
            # Bad queue entries are logged and skipped; the loop keeps going.
            logger.exception(repr(e))
def next_product_url():
    """Pop products off the redis queue until one is due for a re-crawl,
    register it as the active task, and return its review-page URL."""
    cli = get_cli()
    while True:
        popped = cli.lpop(KEY_PRODUCTS)
        if popped is None:
            # Queue drained -- wait a bit before polling again.
            logger.info('no product in queue ...')
            time.sleep(5)
            continue
        # One broad try: any failure (bad JSON, redis hiccup) is logged
        # and the loop moves on to the next queue entry.
        try:
            asin = json.loads(popped)['asin']
            if not product_is_timeout(asin):
                msg = ('product : ' + asin +
                       ' is not timeout, {day} days'.format(
                           day=CRAWL_PRODUCT_TIMEOUT))
                logger.info(msg)
                continue
            set_product_task(asin)
            del_product_reviews(asin)
            logger.info('next product : ' + asin)
            return 'http://www.amazon.com/ss/customer-reviews/' + asin
        except Exception as e:
            logger.exception(repr(e))
def parse(self, response):
    """Parse an Amazon customer-reviews page.

    Yields one item dict per review on the page, then yields a Request for
    the next page of this product (or, when there is none, for the next
    product from the queue).
    """
    logger.info('fetch : ' + response.url)
    # The ASIN is the last path segment before the query string.
    prdid = response.url.split('?')[0].split('/')[-1]

    review = f_xpath(response, '//table[@id="productReviews"]/tr/td')
    if review is None:
        # No review table: schedule the next product instead.
        yield scrapy.http.Request(url=next_product_url(), callback=self.parse)
        # BUG FIX: without this return the generator kept running and
        # crashed below on xpath_extract(review, ...) with review=None.
        return

    rids = xpath_extract(review, './a/@name')
    details = xpath(review, './div')

    for i in xrange(min(len(rids), len(details))):
        rdetail = details[i]
        divs = xpath(rdetail, './div')
        # max of len(divs) is 7 (0-6):
        #   0: number of helpful review   1: star, helpful text, date
        #   2: reviewer, reviewer from    3: from
        #   4: free product               5: reviewText
        #   6: helpful?
        d = self.empty_item()
        d['prdid'] = prdid
        d['rid'] = rids[i]
        d['text'] = ' '.join(
            xpath_extract(rdetail, './div[@class="reviewText"]/text()'))

        # Classify each sub-div by marker text; order of divs varies, so
        # dispatch on content rather than position.
        while len(divs) > 0:
            div = divs[0]
            divs = divs[1:]
            text = div.extract()
            if 'people found the following review helpful' in text:
                d['num_help_review'] = self.parse_num_help_review(div)
                continue
            if 'out of' in text and 'stars' in text and '</nobr>' in text:
                d['star'], d['help_text'], d['date'] = \
                    self.parse_star_help_date(div)
                continue
            if 'By' in text and 'See all my reviews' in text:
                d['reviewer'], d['reviewer_from'] = \
                    self.parse_reviewer_from(div)
                continue
            if 'This review is from' in text:
                d['rfrom'] = self.parse_from(div)
                break
        yield d

    next_url = self.next_page(response) or next_product_url()
    # see http://doc.scrapy.org/en/latest/topics/request-response.html
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
def parse(self, response):
    """Parse one Amazon customer-reviews page.

    Emits an item per review found in the page's review table, then emits
    a follow-up Request: the product's next review page if one exists,
    otherwise the next product pulled from the queue.
    """
    logger.info( 'fetch : ' + response.url )
    # ASIN == last path segment, query string stripped first.
    prdid = response.url.split( '?' )[ 0 ].split( '/' )[ -1 ]

    review = f_xpath( response, '//table[@id="productReviews"]/tr/td' )
    if review is None:
        # Page has no review table: hand off to the next product.
        yield scrapy.http.Request( url = next_product_url(),
                                   callback = self.parse )
        # BUG FIX: previously execution fell through and the generator
        # raised when xpath_extract() was called with review = None.
        return

    rids = xpath_extract( review, './a/@name' )
    details = xpath( review, './div' )

    for i in xrange( min( len( rids ), len( details ) ) ):
        rdetail = details[ i ]
        divs = xpath( rdetail, './div' )
        # max of len( divs ) is 7, ( 0 - 6 )
        # 0 : number of helpful review
        # 1 : star, helpful text, date
        # 2 : reviewer, reviewer from
        # 3 : from
        # 4 : free product
        # 5 : reviewText
        # 6 : helpful?
        d = self.empty_item()
        d[ 'prdid' ] = prdid
        d[ 'rid' ] = rids[ i ]
        d[ 'text' ] = ' '.join( xpath_extract(
            rdetail, './div[@class="reviewText"]/text()' ) )

        # The sub-divs are recognized by marker text, not position.
        while len( divs ) > 0:
            div = divs[ 0 ]
            divs = divs[ 1: ]
            text = div.extract()
            if 'people found the following review helpful' in text:
                d[ 'num_help_review' ] = self.parse_num_help_review( div )
                continue
            if 'out of' in text and 'stars' in text and '</nobr>' in text:
                d[ 'star' ], d[ 'help_text' ], d[ 'date' ] = \
                    self.parse_star_help_date( div )
                continue
            if 'By' in text and 'See all my reviews' in text:
                d[ 'reviewer' ], d[ 'reviewer_from' ] = \
                    self.parse_reviewer_from( div )
                continue
            if 'This review is from' in text:
                d[ 'rfrom' ] = self.parse_from( div )
                break
        yield d

    next_url = self.next_page( response ) or next_product_url()
    # see http://doc.scrapy.org/en/latest/topics/request-response.html
    yield scrapy.http.Request( url = next_url, callback = self.parse,
                               dont_filter = True )