def parse(self, response):
    # Parse a Dianping shop "review_all" page: on a blocked/missing page,
    # retry the same URL with fresh fabricated cookies and a random
    # User-Agent; otherwise yield one DianpingcommentItem per review and,
    # from page 1 only, schedule requests for every remaining page.
    #
    # NOTE(review): another method named `parse` appears later in this file;
    # if both live in the same spider class the later definition overrides
    # this one — confirm which is actually registered.
    # print response.body
    if '页面不存在' in response.body:
        # Anti-bot / "page does not exist" response: retry with a freshly
        # fabricated cookie jar (timestamp-derived session id) so the retry
        # looks like a new visitor.
        Cookie = {
            '_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % (int(time.time())),
            'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
            's_ViewType': '10',
            'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
        }
        # `header` and `ua_list` come from module scope; mutating `header`
        # in place affects every request that shares the same dict.
        header['User-Agent'] = random.choice(ua_list)
        yield Request(response.url, callback=self.parse, headers=header,
                      cookies=Cookie, dont_filter=True)
    else:
        sel = Selector(response)
        # One <li> per review in the "reviews-items" list.
        detail_list = sel.xpath('//div[@class="reviews-items"]/ul/li')
        if detail_list:
            for detail in detail_list:
                item = DianpingcommentItem()
                comment_id = detail.xpath(
                    './div//span[@class="actions"]/a/@data-id').extract()
                if comment_id:
                    comment_id = comment_id[0]
                # NOTE(review): when no @data-id matches, comment_id is
                # stored as an empty list rather than a string — confirm
                # downstream consumers tolerate that.
                item['comment_id'] = comment_id
                # Shop id is embedded in the URL: .../shop/<id>/review...
                shop_id = ''.join(
                    re.findall('com/shop/(.*?)/review', response.url))
                item['shop_id'] = shop_id
                href = ''.join(
                    detail.xpath('./a/@href').extract()).strip().replace(
                        '\n', '')
                name = ''.join(
                    detail.xpath(
                        './div[@class="main-review"]/div/a[@class="name"]/text()'
                    ).extract()).strip().replace('\n', '')
                # print href
                item['user_name'] = name
                # User id is the member-profile href minus its '/member/'
                # prefix.
                user_id = href.replace('/member/',
                                       '').strip().replace('\n', '')
                item['user_id'] = user_id
                # Overall rating is encoded in a CSS class name; try the
                # review-rank span first, then the shop-info fallback.
                total_score = ''.join(
                    detail.xpath(
                        './div[@class="main-review"]/div[@class="review-rank"]/span[1]/@class'
                    ).extract()).strip().replace('\n', '')
                if not total_score:
                    total_score = ''.join(
                        detail.xpath(
                            './div[@class="content"]/p[@class="shop-info"]/span[1]/@class'
                        ).extract()).strip().replace('\n', '')
                # Strip the class-name prefix/suffix so only the digits
                # remain (e.g. 'sml-rank-stars sml-str45' -> '45').
                total_score = total_score.replace('sml-rank-stars sml-str',
                                                  '').replace(' star', '')
                if total_score:
                    # NOTE(review): under Python 2 this is floor division,
                    # so e.g. 45 / 10 -> 4 and the half-star is lost —
                    # confirm that is intended.
                    total_score = int(total_score) / 10
                item['total_score'] = total_score
                # Sub-scores: the last character of each span text is taken
                # as the numeric score, the rest as its label.
                scores = detail.xpath(
                    './div[@class="main-review"]//span[@class="score"]/span/text()'
                ).extract()
                if scores:
                    if len(scores) == 3:
                        score1 = scores[0].replace('\n', '').replace(' ', '')
                        score2 = scores[1].replace('\n', '').replace(' ', '')
                        score3 = scores[2].replace('\n', '').replace(' ', '')
                        score1_name = score1[:-1]
                        score1 = score1[-1:]
                        item['score1_name'] = score1_name
                        item['score1'] = score1
                        score2_name = score2[:-1]
                        score2 = score2[-1:]
                        item['score2_name'] = score2_name
                        item['score2'] = score2
                        score3_name = score3[:-1]
                        score3 = score3[-1:]
                        item['score3_name'] = score3_name
                        item['score3'] = score3
                else:
                    # No sub-scores on the page: blank labels, zero values.
                    item['score1_name'] = ''
                    item['score2_name'] = ''
                    item['score3_name'] = ''
                    item['score1'] = 0
                    item['score2'] = 0
                    item['score3'] = 0
                # Review text may live in either the visible or the
                # collapsed ("Hide") container.
                comment_txt = ''.join(
                    detail.xpath(
                        './div[@class="main-review"]/div[@class="review-words"]/text()|./div[@class="main-review"]/div[@class="review-words Hide"]/text()'
                    ).extract()).strip().replace('\n', '')
                item['comment_text'] = comment_txt
                comment_dt = ''.join(
                    detail.xpath(
                        './div[@class="main-review"]//span[@class="time"]/text()'
                    ).extract()).strip().replace('\n', '')
                if comment_dt:
                    # Drop the "updated on" prefix and whitespace noise,
                    # then keep only the text before the first NBSP.
                    comment_dt = comment_dt.replace(u'更新于', '')
                    comment_dt = comment_dt.replace('\n', '').replace(
                        '\r', '').replace('\t', '').strip()
                    comment_dt = comment_dt.split(u'\xa0')
                    if comment_dt:
                        comment_dt = comment_dt[0]
                        # Normalize short dates: 'MM-DD' gets a hard-coded
                        # '2017-' year (NOTE(review): stale for later
                        # crawls — confirm), 'YY-MM-DD' gets a '20' prefix.
                        if len(comment_dt) == 5:
                            comment_dt = '2017-' + comment_dt
                        elif len(comment_dt) == 8:
                            comment_dt = '20' + comment_dt
                        # Drop any trailing time-of-day component.
                        if ' ' in comment_dt:
                            comment_dt = comment_dt.split(' ')[0]
                else:
                    # Fallback location used by the alternate page layout.
                    comment_dt = ''.join(
                        detail.xpath(
                            './div[@class="content"]/div[@class="misc-info"]/span/a[@class="time"]/text()'
                        ).extract()).strip().replace('\n', '')
                item['comment_dt'] = comment_dt
                contribution = ''.join(
                    detail.xpath(
                        './div[@class="pic"]/p[@class="contribution"]/span/@title'
                    ).extract()).strip().replace('\n', '')
                contribution = contribution.replace('贡献值', '').strip()
                item['user_contrib_val'] = contribution
                # try:
                #     db_insert.insert('t_hh_dianping_shop_comments', **item)
                # except:
                #     pass
                yield item
            # Earlier "NextPage"-link pagination, kept for reference:
            # next_page = sel.xpath('//a[@class="NextPage"]/@href')
            # if next_page:
            #     next_page = ''.join(next_page.extract())
            #     next_page = urljoin(response.url, next_page)
            #     print next_page
            #     Cookie = {'_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % (int(time.time())),
            #               'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
            #               's_ViewType': '10',
            #               'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
            #               }
            #     header['User-Agent'] = random.choice(ua_list)
            #     yield Request(next_page, errback=self.parse_failure,
            #                   callback=self.parse, headers=header, cookies=Cookie, dont_filter=True, )
            # Current pagination: only page 1 fans out; the last PageLink's
            # data-pg attribute is read as the total page count and pages
            # p2..pN are all requested up front.
            page_now = response.url.split('review_all/p')
            # NOTE(review): assumes the URL contains 'review_all/p<n>';
            # otherwise page_now[1] raises IndexError — confirm callers
            # always pass such URLs.
            if int(page_now[1]) == 1:
                next_page = sel.xpath(
                    '//a[@class="PageLink"][last()]/@data-pg').extract()
                if next_page:
                    print next_page
                    next_page = ''.join(next_page)
                    for i in xrange(2, int(next_page) + 1):
                        # Rebuild the page URL from the base captured above.
                        next_page = '%sreview_all/p%s' % (page_now[0], i)
                        # print next_page
                        # Fresh fabricated cookies per scheduled page.
                        Cookie = {
                            '_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % (int(time.time())),
                            'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
                            's_ViewType': '10',
                            'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
                        }
                        header['User-Agent'] = random.choice(ua_list)
                        print next_page
                        yield Request(
                            next_page,
                            callback=self.parse,
                            headers=header,
                            cookies=Cookie,
                            dont_filter=True, )
        else:
            # No review items found: dump the body for debugging.
            print response.body
def parse(self, response):
    # Parse a Dianping shop review page in the older "comment-list" layout:
    # retry blocked pages with fresh fabricated cookies, yield one
    # DianpingcommentItem per review, then follow the single "NextPage" link.
    #
    # NOTE(review): a method with the same name `parse` appears earlier in
    # this file; if both are in the same spider class this later definition
    # overrides the earlier one — confirm which is intended.
    # print response.body
    if '页面不存在' in response.body:
        # Anti-bot / "page does not exist" response: retry with a fabricated
        # timestamp-derived cookie set. NOTE(review): unlike the other
        # `parse` variant, no new User-Agent is chosen before this retry —
        # confirm whether that is deliberate.
        Cookie = {
            '_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % (int(time.time())),
            'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
            's_ViewType': '10',
            'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
        }
        yield Request(response.url, errback=self.parse_failure,
                      callback=self.parse, headers=header, cookies=Cookie,
                      dont_filter=True)
    else:
        sel = Selector(response)
        # One <li> per review in the "comment-list" container.
        detail_list = sel.xpath('//div[@class="comment-list"]/ul/li')
        if detail_list:
            for detail in detail_list:
                item = DianpingcommentItem()
                # In this layout the review id is the <li>'s own data-id.
                comment_id = ''.join(detail.xpath('@data-id').extract())
                item['comment_id'] = comment_id
                # Shop id is embedded in the URL: .../shop/<id>/review...
                shop_id = ''.join(
                    re.findall('com/shop/(.*?)/review', response.url))
                item['shop_id'] = shop_id
                href = ''.join(
                    detail.xpath(
                        './div[@class="pic"]/p[@class="name"]/a/@href').
                    extract()).strip().replace('\n', '')
                name = ''.join(
                    detail.xpath(
                        './div[@class="pic"]/p[@class="name"]/a/text()').
                    extract()).strip().replace('\n', '')
                # print href
                item['user_name'] = name
                # User id is the member-profile href minus '/member/'.
                user_id = href.replace('/member/',
                                       '').strip().replace('\n', '')
                item['user_id'] = user_id
                # Overall rating is encoded in a CSS class name; try the
                # user-info span first, then the shop-info fallback.
                total_score = ''.join(
                    detail.xpath(
                        './div[@class="content"]/div[@class="user-info"]/span[1]/@class'
                    ).extract()).strip().replace('\n', '')
                if not total_score:
                    total_score = ''.join(
                        detail.xpath(
                            './div[@class="content"]/p[@class="shop-info"]/span[1]/@class'
                        ).extract()).strip().replace('\n', '')
                # Strip the class prefix so only digits remain
                # (e.g. 'item-rank-rst irr-star45' -> '45').
                total_score = total_score.replace('item-rank-rst irr-star',
                                                  '')
                if total_score:
                    # NOTE(review): Python 2 floor division — 45 / 10 -> 4,
                    # dropping the half star; confirm intended.
                    total_score = int(total_score) / 10
                item['total_score'] = total_score
                # Sub-scores: last character of each span text is the
                # numeric value, the rest is its label.
                scores = detail.xpath(
                    './div[@class="content"]/div[@class="user-info"]/div/span/text()'
                ).extract()
                if scores:
                    if len(scores) == 3:
                        score1 = scores[0]
                        score2 = scores[1]
                        score3 = scores[2]
                        score1_name = score1[:-1]
                        score1 = score1[-1:]
                        item['score1_name'] = score1_name
                        item['score1'] = score1
                        score2_name = score2[:-1]
                        score2 = score2[-1:]
                        item['score2_name'] = score2_name
                        item['score2'] = score2
                        score3_name = score3[:-1]
                        score3 = score3[-1:]
                        item['score3_name'] = score3_name
                        item['score3'] = score3
                # NOTE(review): unlike the other `parse` variant, there is
                # no else-branch setting empty/zero defaults when `scores`
                # is missing — those item fields stay unset; confirm the
                # item pipeline tolerates that.
                comment_txt = ''.join(
                    detail.xpath(
                        './div[@class="content"]/div[@class="comment-txt"]/div/text()'
                    ).extract()).strip().replace('\n', '')
                item['comment_text'] = comment_txt
                comment_dt = ''.join(
                    detail.xpath(
                        './div[@class="content"]/div[@class="misc-info"]/span/text()'
                    ).extract()).strip().replace('\n', '')
                if comment_dt:
                    # Keep only the text before the first NBSP, then
                    # normalize short dates: 'MM-DD' gets a hard-coded
                    # '2017-' year (NOTE(review): stale for later crawls —
                    # confirm), 'YY-MM-DD' gets a '20' century prefix.
                    comment_dt = comment_dt.split(u'\xa0')
                    if comment_dt:
                        comment_dt = comment_dt[0]
                        if len(comment_dt) == 5:
                            comment_dt = '2017-' + comment_dt
                        elif len(comment_dt) == 8:
                            comment_dt = '20' + comment_dt
                item['comment_dt'] = comment_dt
                contribution = ''.join(
                    detail.xpath(
                        './div[@class="pic"]/p[@class="contribution"]/span/@title'
                    ).extract()).strip().replace('\n', '')
                contribution = contribution.replace('贡献值', '').strip()
                item['user_contrib_val'] = contribution
                # try:
                #     db_insert.insert('t_hh_dianping_shop_comments', **item)
                # except:
                #     pass
                yield item
            # Sequential pagination: follow the single "NextPage" link,
            # resolved against the current URL, with fresh fabricated
            # cookies.
            next_page = sel.xpath('//a[@class="NextPage"]/@href')
            if next_page:
                next_page = ''.join(next_page.extract())
                next_page = urljoin(response.url, next_page)
                print next_page
                Cookie = {
                    '_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % (int(time.time())),
                    'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
                    's_ViewType': '10',
                    'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
                }
                yield Request(
                    next_page,
                    errback=self.parse_failure,
                    callback=self.parse,
                    headers=header,
                    cookies=Cookie,
                    dont_filter=True, )
        else:
            # No review items found: dump the body for debugging.
            print response.body