def parse(self, response):
    """Parse an Amazon product-review page.

    Yields one item dict per review found on the page, then a follow-up
    Request for the next review page (or the next product when this
    product has no further pages).
    """
    logger.info('fetch : ' + response.url)

    # Product id is the last path segment of the URL, query string stripped.
    prdid = response.url.split('?')[0].split('/')[-1]

    review = f_xpath(response, '//table[@id="productReviews"]/tr/td')
    if review is None:
        # No review table on this page: schedule the next product and stop.
        # BUG FIX: the original fell through after this yield and crashed
        # below on xpath_extract(review, ...) with review == None.
        yield scrapy.http.Request(url=next_product_url(),
                                  callback=self.parse)
        return

    rids = xpath_extract(review, './a/@name')
    details = xpath(review, './div')

    # Guard against mismatched counts between anchors and detail divs.
    length = min(len(rids), len(details))
    for i in xrange(length):
        rdetail = details[i]
        divs = xpath(rdetail, './div')
        # max of len( divs ) is 7, ( 0 - 6 )
        # 0 : number of helpful review
        # 1 : star, helpful text, date
        # 2 : reviewer, reviewer from
        # 3 : from
        # 4 : free product
        # 5 : reviewText
        # 6 : helpful?

        d = self.empty_item()
        d['prdid'] = prdid
        d['rid'] = rids[i]
        d['text'] = ' '.join(
            xpath_extract(rdetail, './div[@class="reviewText"]/text()'))

        # Consume the sub-divs front-to-back, dispatching on marker text;
        # order varies per review, so each branch matches by content.
        while len(divs) > 0:
            div = divs[0]
            divs = divs[1:]
            text = div.extract()

            if 'people found the following review helpful' in text:
                d['num_help_review'] = self.parse_num_help_review(div)
                continue
            if 'out of' in text and 'stars' in text and '</nobr>' in text:
                d['star'], d['help_text'], d['date'] = \
                    self.parse_star_help_date(div)
                continue
            if 'By' in text and 'See all my reviews' in text:
                d['reviewer'], d['reviewer_from'] = \
                    self.parse_reviewer_from(div)
                continue
            if 'This review is from' in text:
                # "This review is from" is always the last field we need.
                d['rfrom'] = self.parse_from(div)
                break

        yield d

    # Follow pagination; fall back to the next product when exhausted.
    next_url = self.next_page(response) or next_product_url()
    # see http://doc.scrapy.org/en/latest/topics/request-response.html
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)
def _parse_desc(self, p):
    """Return the offer description under *p* as one whitespace-trimmed string."""
    container = f_xpath(p, './div[@class="offer-descriptions"]')
    fragments = xpath_extract(
        container, './/div[@class="offer-description"]/text()')
    return ' '.join(fragment.strip() for fragment in fragments)
def parse(self, response):
    """Parse an Amazon product-review page.

    Yields one item dict per review found on the page, then a follow-up
    Request for the next review page (or the next product when this
    product has no further pages).
    """
    logger.info('fetch : ' + response.url)

    # Product id is the last path segment of the URL, query string stripped.
    prdid = response.url.split('?')[0].split('/')[-1]

    review = f_xpath(response, '//table[@id="productReviews"]/tr/td')
    if review is None:
        # No review table on this page: schedule the next product and stop.
        # BUG FIX: the original fell through after this yield and crashed
        # below on xpath_extract(review, ...) with review == None.
        yield scrapy.http.Request(url=next_product_url(),
                                  callback=self.parse)
        return

    rids = xpath_extract(review, './a/@name')
    details = xpath(review, './div')

    # Guard against mismatched counts between anchors and detail divs.
    length = min(len(rids), len(details))
    for i in xrange(length):
        rdetail = details[i]
        divs = xpath(rdetail, './div')
        # max of len( divs ) is 7, ( 0 - 6 )
        # 0 : number of helpful review
        # 1 : star, helpful text, date
        # 2 : reviewer, reviewer from
        # 3 : from
        # 4 : free product
        # 5 : reviewText
        # 6 : helpful?

        d = self.empty_item()
        d['prdid'] = prdid
        d['rid'] = rids[i]
        d['text'] = ' '.join(
            xpath_extract(rdetail, './div[@class="reviewText"]/text()'))

        # Consume the sub-divs front-to-back, dispatching on marker text;
        # order varies per review, so each branch matches by content.
        while len(divs) > 0:
            div = divs[0]
            divs = divs[1:]
            text = div.extract()

            if 'people found the following review helpful' in text:
                d['num_help_review'] = self.parse_num_help_review(div)
                continue
            if 'out of' in text and 'stars' in text and '</nobr>' in text:
                d['star'], d['help_text'], d['date'] = \
                    self.parse_star_help_date(div)
                continue
            if 'By' in text and 'See all my reviews' in text:
                d['reviewer'], d['reviewer_from'] = \
                    self.parse_reviewer_from(div)
                continue
            if 'This review is from' in text:
                # "This review is from" is always the last field we need.
                d['rfrom'] = self.parse_from(div)
                break

        yield d

    # Follow pagination; fall back to the next product when exhausted.
    next_url = self.next_page(response) or next_product_url()
    # see http://doc.scrapy.org/en/latest/topics/request-response.html
    yield scrapy.http.Request(url=next_url, callback=self.parse,
                              dont_filter=True)