def get_shop_comment(self, url, shopid, offset, page):
    """Crawl one page of a shop's review tags and comments, then follow pagination.

    Generator: yields one ``ShopTagItem`` per aggregate tag and one
    ``ShopCommentItem`` per individual review found in the JSON payload at
    *url*, then advances *offset* by the page size (10) and delegates to
    itself with ``yield from`` for the next page.

    :param url: JSON endpoint for the current comment page.
    :param shopid: shop identifier, stamped on every yielded item.
    :param offset: current pagination offset (multiple of 10).
    :param page: 1-based page counter, used only for progress logging.
    """
    pprint('get_shop_comment offset:{} shopid:{}'.format(offset, shopid))
    self.logger.debug('get_shop_comment offset:{} shopid:{}'.format(offset, shopid))
    # Randomized politeness delay between requests.
    sleep(random.randint(1, 5))
    print('开始爬取shop_comment第{}页......,url为{}'.format(page, url))
    self.logger.debug('开始爬取shop_comment第{}页......,url为{}'.format(page, url))
    json_data = html_from_uri(url)
    print(json_data)
    self.logger.debug(json_data)
    dict_data = json.loads(json_data)
    # Guard the key: not every response carries 'tags' (same guard as
    # parse_shop_comment); the original raised KeyError here.
    if 'tags' in dict_data and dict_data['tags']:
        for tag in dict_data['tags']:
            # Fresh item per tag: reusing one mutable item across yields makes
            # every downstream consumer see only the last tag's data.
            shop_tag = ShopTagItem()
            shop_tag['id'] = calc_md5(tag)
            shop_tag['shopId'] = shopid
            shop_tag['tag'] = tag.get('tag')
            shop_tag['count'] = tag.get('count')
            shop_tag['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            yield shop_tag
    if 'comments' in dict_data and dict_data['comments']:
        comment_groups = dict_data['comments']
        for comment in comment_groups:
            # Fresh item per review (same aliasing concern as above).
            shop_comment = ShopCommentItem()
            shop_comment['id'] = calc_md5(comment)
            shop_comment['shopId'] = shopid
            shop_comment['userName'] = comment.get('userName')
            shop_comment['userUrl'] = comment.get('userUrl')
            shop_comment['avgPrice'] = comment.get('avgPrice')
            shop_comment['comment'] = comment.get('comment')
            shop_comment['merchantComment'] = comment.get('merchantComment')
            shop_comment['picUrls'] = comment.get('picUrls')
            shop_comment['commentTime'] = comment.get('commentTime')
            shop_comment['replyCnt'] = comment.get('replyCnt')
            shop_comment['zanCnt'] = comment.get('zanCnt')
            shop_comment['readCnt'] = comment.get('readCnt')
            shop_comment['userLevel'] = comment.get('userLevel')
            shop_comment['userId'] = comment.get('userId')
            shop_comment['uType'] = comment.get('uType')
            shop_comment['star'] = comment.get('star')
            shop_comment['quality'] = comment.get('quality')
            shop_comment['alreadyZzz'] = comment.get('alreadyZzz')
            shop_comment['reviewId'] = comment.get('reviewId')
            shop_comment['menu'] = comment.get('menu')
            shop_comment['did'] = comment.get('did')
            shop_comment['dealEndtime'] = comment.get('dealEndtime')
            shop_comment['anonymous'] = comment.get('anonymous')
            shop_comment['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            yield shop_comment
        offset += 10
        # BUGFIX: original formatted self.goods_comment_baseurl here — the
        # goods template, not the shop one (parse_shop_comment uses
        # shop_comment_baseurl for the identical continuation).
        next_url = self.shop_comment_baseurl.format(shopid=shopid, offset=offset)
        page = page + 1
        # BUGFIX: the bare recursive call only created a generator object and
        # discarded it, so pagination never actually ran; delegate with
        # ``yield from`` so the next page's items are yielded to the caller.
        yield from self.get_shop_comment(next_url, shopid, offset, page)
def get_goods_comment(self, url, dealid, offset, page):
    """Crawl one page of a deal's review tags and comments, then follow pagination.

    Generator: yields one ``GoodsTagItem`` per aggregate tag and one
    ``GoodsCommentItem`` per review in the JSON payload at *url*, then bumps
    *offset* by the page size (10) and delegates to itself with ``yield from``.

    :param url: JSON endpoint for the current comment page.
    :param dealid: deal (goods) identifier, stamped on every yielded item.
    :param offset: current pagination offset (multiple of 10).
    :param page: 1-based page counter, used only for progress logging.
    """
    pprint('get_goods_comment offset:{} dealid:{}'.format(offset, dealid))
    self.logger.debug('get_goods_comment offset:{} dealid:{}'.format(offset, dealid))
    # Randomized politeness delay between requests.
    sleep(random.randint(1, 5))
    print('开始爬取goods_comment第{}页......,url为{}'.format(page, url))
    self.logger.debug('开始爬取goods_comment第{}页......,url为{}'.format(page, url))
    json_data = html_from_uri(url)
    pprint(json_data)
    self.logger.debug(json_data)
    # html_from_uri already returns the JSON body, so no regex extraction is
    # needed (unlike scraping the rendered page in a browser).
    dict_data = json.loads(json_data)
    # Guard the key: not every response carries 'tags' (same guard as
    # parse_goods_comment); the original raised KeyError here.
    if 'tags' in dict_data and dict_data['tags']:
        for tag in dict_data['tags']:
            # Fresh item per tag — reusing one mutable item across yields
            # aliases every yielded reference to the last tag.
            goods_tag = GoodsTagItem()
            goods_tag['id'] = calc_md5(tag)
            goods_tag['goodsId'] = dealid
            goods_tag['content'] = tag.get('content')
            goods_tag['count'] = tag.get('count')
            goods_tag['isPositive'] = tag.get('isPositive')
            goods_tag['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            yield goods_tag
    # BUGFIX: also require a non-empty list (as parse_goods_comment does);
    # the bare key check kept paginating forever once pages came back empty.
    if 'list' in dict_data and dict_data['list']:
        comment_groups = dict_data['list']
        for comment in comment_groups:
            # Fresh item per review (same aliasing concern as above).
            goods_comment = GoodsCommentItem()
            goods_comment['id'] = calc_md5(comment)
            goods_comment['goodsId'] = dealid
            goods_comment['content'] = comment.get('content')
            picUrls = comment.get('picUrls')
            goods_comment['picUrls'] = ', '.join(picUrl for picUrl in picUrls) if picUrls else None
            goods_comment['modTime'] = comment.get('modTime')
            # API reports star on a 0-50 scale; store as 0-5.
            goods_comment['star'] = comment.get('star') / 10
            user = comment.get('user')
            goods_comment['userName'] = user.get('userName')
            goods_comment['isAnonymous'] = user.get('isAnonymous')
            goods_comment['imgUrl'] = user.get('imgUrl')
            goods_comment['shopTitle'] = comment.get('poi').get('title')
            goods_comment['recordCount'] = dict_data.get('recordCount')
            goods_comment['startIndex'] = dict_data.get('startIndex')
            goods_comment['nextStartIndex'] = dict_data.get('nextStartIndex')
            goods_comment['isEnd'] = dict_data.get('isEnd')
            goods_comment['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            yield goods_comment
        offset += 10
        next_url = self.goods_comment_baseurl.format(dealid=dealid, offset=offset)
        page = page + 1
        # BUGFIX: the bare recursive call only created a generator object and
        # discarded it, so pagination never actually ran; delegate with
        # ``yield from`` so the next page's items reach the caller.
        yield from self.get_goods_comment(next_url, dealid, offset, page)
def parse_shop_comment(self, response):
    """Scrapy callback: parse one page of shop tags/comments, schedule the next.

    Reads crawl state (``shopid``, ``offset``, ``page``) from ``response.meta``,
    yields ``ShopTagItem`` / ``ShopCommentItem`` instances built from the JSON
    body, then yields a follow-up ``scrapy.Request`` for the next page
    (offset + 10) carrying the updated state in its meta.

    :param response: scrapy response whose body is the comment-page JSON.
    """
    pprint('parse_shop_comment response.text:{}'.format(response.text))
    self.logger.debug('parse_shop_comment response.text:{}'.format(response.text))
    shop_comment_url = response.url
    shopid = response.meta.get('shopid')
    offset = response.meta.get('offset')
    page = response.meta.get('page')
    # BUGFIX: log label said 'goods_comment_url' inside the shop handler.
    pprint('parse_shop_comment shop_comment_url:{}'.format(shop_comment_url))
    self.logger.debug('parse_shop_comment shop_comment_url:{}'.format(shop_comment_url))
    pprint('get_shop_comment offset:{} shopid:{}'.format(offset, shopid))
    self.logger.debug('get_shop_comment offset:{} shopid:{}'.format(offset, shopid))
    # Randomized politeness delay between requests.
    sleep(random.randint(1, 5))
    print('开始爬取shop_comment第{}页......,url为{}'.format(page, shop_comment_url))
    self.logger.debug('开始爬取shop_comment第{}页......,url为{}'.format(page, shop_comment_url))
    # The response body is already JSON — no secondary fetch needed.
    json_data = response.text
    print(json_data)
    self.logger.debug(json_data)
    dict_data = json.loads(json_data)
    if 'tags' in dict_data and dict_data['tags']:
        for tag in dict_data['tags']:
            # BUGFIX: a single item was created outside the loop and mutated
            # per iteration; every yielded reference then aliased the last
            # tag. Create a fresh item each time.
            shop_tag = ShopTagItem()
            shop_tag['id'] = calc_md5(tag)
            shop_tag['shopId'] = shopid
            shop_tag['tag'] = tag.get('tag')
            shop_tag['count'] = tag.get('count')
            shop_tag['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            yield shop_tag
    if 'comments' in dict_data and dict_data['comments']:
        comment_groups = dict_data['comments']
        for comment in comment_groups:
            # Fresh item per review (same aliasing fix as the tags loop).
            shop_comment = ShopCommentItem()
            shop_comment['id'] = calc_md5(comment)
            shop_comment['shopId'] = shopid
            shop_comment['userName'] = comment.get('userName')
            shop_comment['userUrl'] = comment.get('userUrl')
            shop_comment['avgPrice'] = comment.get('avgPrice')
            shop_comment['comment'] = comment.get('comment')
            shop_comment['merchantComment'] = comment.get('merchantComment')
            shop_comment['picUrls'] = str(comment.get('picUrls'))
            shop_comment['commentTime'] = timestamp_to_mytime(comment.get('commentTime'))
            shop_comment['replyCnt'] = comment.get('replyCnt')
            shop_comment['zanCnt'] = comment.get('zanCnt')
            shop_comment['readCnt'] = comment.get('readCnt')
            shop_comment['userLevel'] = comment.get('userLevel')
            shop_comment['userId'] = comment.get('userId')
            shop_comment['uType'] = comment.get('uType')
            shop_comment['star'] = comment.get('star')
            shop_comment['quality'] = comment.get('quality')
            shop_comment['alreadyZzz'] = comment.get('alreadyZzz')
            shop_comment['reviewId'] = comment.get('reviewId')
            shop_comment['menu'] = comment.get('menu')
            shop_comment['did'] = comment.get('did')
            shop_comment['dealEndtime'] = timestamp_to_mytime(comment.get('dealEndtime'))
            shop_comment['anonymous'] = comment.get('anonymous')
            shop_comment['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            yield shop_comment
        offset += 10
        next_url = self.shop_comment_baseurl.format(shopid=shopid, offset=offset)
        page = page + 1
        # Hand pagination back to the scheduler; state travels in meta.
        yield scrapy.Request(next_url, callback=self.parse_shop_comment,
                             meta={'shopid': shopid, 'offset': offset, 'page': page})
        pprint('33333333333333333333333333333333333get_shop_comment offset:{} shopid:{}'.format(offset, shopid))
        self.logger.debug('33333333333333333333333333333333333333get_shop_comment offset:{} shopid:{}'.format(offset, shopid))
def parse_goods_comment(self, response):
    """Scrapy callback: parse one page of deal tags/comments, schedule the next.

    Reads crawl state (``dealid``, ``offset``, ``page``) from ``response.meta``,
    yields ``GoodsTagItem`` / ``GoodsCommentItem`` instances built from the
    JSON body, then yields a follow-up ``scrapy.Request`` for the next page
    (offset + 10) carrying the updated state in its meta.

    :param response: scrapy response whose body is the comment-page JSON.
    """
    pprint('parse_goods_comment response.text:{}'.format(response.text))
    self.logger.debug('parse_goods_comment response.text:{}'.format(response.text))
    goods_comment_url = response.url
    dealid = response.meta.get('dealid')
    offset = response.meta.get('offset')
    page = response.meta.get('page')
    pprint('parse_goods_comment goods_comment_url:{}'.format(goods_comment_url))
    self.logger.debug('parse_goods_comment goods_comment_url:{}'.format(goods_comment_url))
    pprint('get_goods_comment offset:{} dealid:{}'.format(offset, dealid))
    self.logger.debug('get_goods_comment offset:{} dealid:{}'.format(offset, dealid))
    # Randomized politeness delay between requests.
    sleep(random.randint(1, 5))
    print('开始爬取goods_comment第{}页......,url为{}'.format(page, goods_comment_url))
    self.logger.debug('开始爬取goods_comment第{}页......,url为{}'.format(page, goods_comment_url))
    # The response body is already JSON — no regex extraction or second
    # fetch needed (unlike the rendered browser page).
    json_data = response.text
    pprint(json_data)
    self.logger.debug(json_data)
    dict_data = json.loads(json_data)
    if 'tags' in dict_data and dict_data['tags']:
        for tag in dict_data['tags']:
            # BUGFIX: a single item was created outside the loop and mutated
            # per iteration; every yielded reference then aliased the last
            # tag. Create a fresh item each time.
            goods_tag = GoodsTagItem()
            goods_tag['id'] = calc_md5(tag)
            goods_tag['goodsId'] = dealid
            goods_tag['content'] = tag.get('content')
            goods_tag['count'] = tag.get('count')
            goods_tag['isPositive'] = tag.get('isPositive')
            goods_tag['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            yield goods_tag
    if 'list' in dict_data and dict_data['list']:
        comment_groups = dict_data['list']
        for comment in comment_groups:
            # Fresh item per review (same aliasing fix as the tags loop).
            goods_comment = GoodsCommentItem()
            goods_comment['id'] = calc_md5(comment)
            goods_comment['goodsId'] = dealid
            goods_comment['content'] = comment.get('content')
            picUrls = comment.get('picUrls')
            goods_comment['picUrls'] = ', '.join(picUrl for picUrl in picUrls) if picUrls else None
            # Normalize the non-digit separators of modTime (sans trailing
            # char) into dashes.
            goods_comment['modTime'] = re.sub(r'\D', "-", comment.get('modTime')[:-1])
            # API reports star on a 0-50 scale; store as 0-5.
            goods_comment['star'] = comment.get('star') / 10
            user = comment.get('user')
            goods_comment['userName'] = user.get('userName')
            goods_comment['isAnonymous'] = user.get('isAnonymous')
            goods_comment['imgUrl'] = user.get('imgUrl')
            goods_comment['shopTitle'] = comment.get('poi').get('title')
            goods_comment['recordCount'] = dict_data.get('recordCount')
            goods_comment['startIndex'] = dict_data.get('startIndex')
            goods_comment['nextStartIndex'] = dict_data.get('nextStartIndex')
            goods_comment['isEnd'] = dict_data.get('isEnd')
            goods_comment['crawl_time'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            yield goods_comment
        offset += 10
        next_url = self.goods_comment_baseurl.format(dealid=dealid, offset=offset)
        page += 1
        # Hand pagination back to the scheduler; state travels in meta.
        yield scrapy.Request(next_url, callback=self.parse_goods_comment,
                             meta={'dealid': dealid, 'offset': offset, 'page': page})
        pprint('4444444444444444444444444444444444444444444get_goods_comment offset:{} dealid:{}'.format(offset, dealid))
        self.logger.debug('444444444444444444444444444444444444444get_goods_comment offset:{} dealid:{}'.format(offset, dealid))