def _get_comment_data(self, goods_id):
    """Collect up to three pages of reviews from rate.taobao.com for one item.

    Every failure path resets ``self.result_data`` to ``{}`` and returns ``{}``.

    :param goods_id: item id; an empty string is rejected immediately
    :return: the ``CommentItem`` also stored in ``self.result_data``
        (keys ``goods_id``, ``create_time``, ``modify_time``,
        ``_comment_list``), or ``{}`` on any failure
    """
    if goods_id == '':
        self.result_data = {}
        return {}

    _tmp_comment_list = []
    self.my_lg.info('------>>>| 待抓取的goods_id: %s' % goods_id)

    # The PC-side endpoint is used here.
    tmp_url = 'https://rate.taobao.com/feedRateList.htm'
    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    # Compiled once instead of once per page.
    wrapped_payload = re.compile(r'\((.*)\)')

    for current_page_num in range(1, 4):
        self.my_lg.info('------>>>| 正在抓取第%s页评论...' % str(current_page_num))
        _params = self._set_params(
            current_page_num=current_page_num,
            goods_id=goods_id)
        self.headers.update({
            'referer': 'https://item.taobao.com/item.htm?id=' + str(goods_id),
        })
        body = MyRequests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            params=_params,
            encoding='gbk')

        # Keep only the text between the first '(' and the last ')'
        # (presumably a JSONP-style wrapper -- confirm against the API).
        try:
            body = wrapped_payload.findall(body)[0]
        except IndexError:
            self.my_lg.error('re得到需求body时出错! 出错goods_id: ' + str(goods_id))
            sleep(.5)
            self.result_data = {}
            return {}

        data = json_2_dict(json_str=body, logger=self.my_lg).get('comments')
        if data is None:
            self.my_lg.error('出错goods_id: ' + str(goods_id))
            self.result_data = {}
            return {}

        if data == []:
            # This page has no comments: move on to the next page.
            continue

        _tmp_comment_list += data
        sleep(self.comment_page_switch_sleep_time)

    try:
        _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
    except Exception as e:
        self.my_lg.error('出错goods_id: ' + str(goods_id))
        self.my_lg.exception(e)
        self.result_data = {}
        return {}

    _t = datetime.datetime.now()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r

    return self.result_data
def _get_comment_data(self, goods_id):
    """
    Build the comment record for one goods_id.

    Any failure (empty id, db connection error, sku lookup error, or an
    exception while building the comment list) returns
    ``self._data_error_init()``.

    :param goods_id: id of the goods whose comments are fetched
    :return: the ``CommentItem`` stored in ``self.result_data``, or the
        value of ``self._data_error_init()`` on failure
    """
    if goods_id == '':
        return self._data_error_init()

    self.lg.info('------>>>| 待抓取的goods_id: {}'.format(goods_id))

    # buyer_name / comment_date pairs that the db already holds
    try:
        known_buyers_and_dates = get_top_n_buyer_name_and_comment_date_by_goods_id(
            goods_id=goods_id,
            logger=self.lg,
        )
    except SqlServerConnectionException:
        self.lg.error('db 连接异常! 此处抓取跳过!')
        return self._data_error_init()

    try:
        sku_rows = _get_sku_info_from_db_by_goods_id(
            goods_id=goods_id,
            logger=self.lg,
        )
    except DBGetGoodsSkuInfoErrorException:
        self.lg.error(
            '获取db goods_id: {} 的sku_info失败! 此处跳过!'.format(goods_id))
        return self._data_error_init()

    # The celery-based fetch is used; a synchronous
    # self._get_all_comment_info(goods_id=...) alternative also exists.
    raw_comments = self._get_all_comment_info_by_celery(goods_id=goods_id)

    try:
        _comment_list = self._get_comment_list(
            all_comment_list=raw_comments,
            db_top_n_buyer_name_and_comment_date_list=known_buyers_and_dates,
            db_sku_info_list=sku_rows)
    except Exception as e:
        self.lg.error('出错goods_id: ' + goods_id)
        self.lg.exception(e)
        return self._data_error_init()

    now = get_shanghai_time()
    record = CommentItem()
    record['goods_id'] = str(goods_id)
    record['create_time'] = now
    record['modify_time'] = now
    record['_comment_list'] = _comment_list
    self.result_data = record

    return self.result_data
def _get_comment_data(self, goods_id):
    """Fetch two pages of reviews from the JD mobile endpoint for one item.

    Stores ``goods_id`` on ``self.goods_id`` and updates the ``referer``
    entry of ``self.headers`` as side effects.

    :param goods_id: item id; an empty string is rejected immediately
    :return: the ``CommentItem`` stored in ``self.result_data``, or ``{}``
        (with ``self.result_data`` reset to ``{}``) on failure
    """
    if goods_id == '':
        self.result_data = {}
        return {}

    self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
    self.goods_id = goods_id
    referer = 'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id)
    self.headers.update({'referer': referer})

    # Reviews come from the JD mobile site.
    collected = []
    for page_no in (1, 2):
        endpoint = 'https://item.m.jd.com/newComments/newCommentsDetail.json'
        query = self._set_params(goods_id=goods_id, current_page=page_no)
        body = MyRequests.get_url_body(
            url=endpoint,
            headers=self.headers,
            params=query)

        parsed = json_2_dict(json_str=body, logger=self.my_lg)
        page_comments = parsed.get('wareDetailComment', {}).get('commentInfoList', [])
        if page_comments == []:
            # An empty page is logged, but paging continues.
            self.my_lg.error('出错goods_id:{0}'.format(self.goods_id))

        collected += page_comments
        sleep(self.comment_page_switch_sleep_time)

    try:
        _comment_list = self._get_comment_list(_tmp_comment_list=collected)
    except Exception as e:
        self.my_lg.error('出错goods_id:{0}'.format(goods_id))
        self.my_lg.exception(e)
        self.result_data = {}
        return {}

    now = datetime.datetime.now()
    record = CommentItem()
    record['goods_id'] = str(goods_id)
    record['create_time'] = now
    record['modify_time'] = now
    record['_comment_list'] = _comment_list
    self.result_data = record

    return self.result_data
def _get_comment_data(self, goods_id):
    """Fetch three pages of JD reviews for one item and build its record.

    A page whose fetch raises is logged and skipped (without the
    inter-page sleep); the remaining pages are still requested.

    :param goods_id: item id; an empty string is rejected immediately
    :return: the ``CommentItem`` stored in ``self.result_data``, or the
        value of ``self._data_error()`` on failure
    """
    if goods_id == '':
        return self._data_error()

    self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

    # buyer_name / comment_date pairs that the db already holds
    try:
        known_buyers_and_dates = get_top_n_buyer_name_and_comment_date_by_goods_id(
            goods_id=goods_id,
            logger=self.lg,
        )
    except SqlServerConnectionException:
        self.lg.error('db 连接异常! 此处抓取跳过!')
        return self._data_error()

    # Reviews come from the JD mobile site.
    collected = []
    for page_no in (1, 2, 3):
        try:
            page_comments = self._get_one_page_comment_info(
                goods_id=goods_id,
                page_num=page_no,
            )
        except Exception:
            # AssertionError is a subclass of Exception, so this is the
            # same catch set as (AssertionError, Exception).
            self.lg.error('遇到错误:', exc_info=True)
        else:
            collected += page_comments
            sleep(self.comment_page_switch_sleep_time)

    try:
        _comment_list = self._get_comment_list(
            _tmp_comment_list=collected,
            db_top_n_buyer_name_and_comment_date_list=known_buyers_and_dates,
        )
    except Exception:
        self.lg.error('出错goods_id:{0}'.format(goods_id), exc_info=True)
        return self._data_error()

    now = get_shanghai_time()
    record = CommentItem()
    record['goods_id'] = str(goods_id)
    record['create_time'] = now
    record['modify_time'] = now
    record['_comment_list'] = _comment_list
    self.result_data = record

    return self.result_data
def _get_comment_data(self, goods_id):
    """Build the comment record for one goods_id using the celery fetch.

    :param goods_id: item id; an empty string is rejected immediately
    :return: the ``CommentItem`` stored in ``self.result_data``, or the
        value of ``self._data_error()`` on failure
    """
    if goods_id == '':
        return self._data_error()

    self.lg.info('------>>>| 待抓取的goods_id: {}'.format(goods_id))

    # buyer_name / comment_date pairs that the db already holds
    try:
        known_buyers_and_dates = get_top_n_buyer_name_and_comment_date_by_goods_id(
            goods_id=goods_id,
            logger=self.lg,
        )
    except SqlServerConnectionException:
        self.lg.error('db 连接异常! 此处抓取跳过!')
        return self._data_error()

    # The celery-based fetch is used; a synchronous
    # self._get_all_comment_info(goods_id=...) alternative also exists.
    raw_comments = self._get_all_comment_info_by_celery(goods_id=goods_id)

    try:
        _comment_list = self._get_comment_list(
            all_comment_list=raw_comments,
            db_top_n_buyer_name_and_comment_date_list=known_buyers_and_dates)
    except Exception as e:
        self.lg.error('出错goods_id: ' + goods_id)
        self.lg.exception(e)
        return self._data_error()

    now = get_shanghai_time()
    record = CommentItem()
    record['goods_id'] = str(goods_id)
    record['create_time'] = now
    record['modify_time'] = now
    record['_comment_list'] = _comment_list
    self.result_data = record

    return self.result_data
def _get_comment_data(self, goods_id):
    """Fetch up to three pages of reviews from th5.m.zhe800.com for one item.

    Paging stops early when the response reports no next page. A response
    containing neither ``comments`` nor ``hasNext`` is treated as a failed
    fetch and aborts the whole call.

    :param goods_id: item id; an empty string is rejected immediately
    :return: the ``CommentItem`` stored in ``self.result_data``, or ``{}``
        (with ``self.result_data`` reset to ``{}``) on failure
    """
    if goods_id == '':
        self.result_data = {}
        return {}

    _tmp_comment_list = []
    self.lg.info('------>>>| 待抓取的goods_id: %s' % goods_id)

    # The m.zhe800.com endpoint is used here.
    tmp_url = 'https://th5.m.zhe800.com/app/detail/comment/list'

    for current_page_num in range(1, 4):    # pages start at 1
        self.lg.info('------>>>| 正在抓取第%s页评论...' % str(current_page_num))
        _params = self._set_params(
            current_page_num=current_page_num,
            goods_id=goods_id)
        self.headers.update({
            'referer': 'https://th5.m.zhe800.com/h5/comment/list?zid={0}&dealId=39890410&tagId='.format(str(goods_id))
        })
        body = Requests.get_url_body(
            url=tmp_url,
            headers=self.headers,
            params=_params,
            encoding='utf-8',
            ip_pool_type=self.ip_pool_type)
        data = json_2_dict(json_str=body, logger=self.lg)

        comments = data.get('comments')
        has_next = data.get('hasNext')      # a bool when present

        # This must run BEFORE the hasNext break below: a missing hasNext
        # is falsy, so checking it afterwards could never be reached.
        if comments is None and has_next is None:
            self.lg.error('获取到的data为None, 出错goods_id: ' + str(goods_id))
            self.result_data = {}
            return {}

        if comments is not None:
            _tmp_comment_list += comments

        if not has_next:
            # No further page of comments.
            break

        sleep(self.comment_page_switch_sleep_time)

    try:
        _comment_list = self._get_comment_list(
            _tmp_comment_list=_tmp_comment_list)
    except Exception as e:
        self.lg.error('出错goods_id: ' + str(goods_id))
        self.lg.exception(e)
        self.result_data = {}
        return {}

    _t = get_shanghai_time()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r

    return self.result_data
def _get_comment_data(self, goods_id):
    """Build the comment record for one 1688 item via the PC review API.

    :param goods_id: item id; an empty string resets ``self.result_data``
        to ``{}`` and returns ``{}``
    :return: the ``CommentItem`` stored in ``self.result_data``, or the
        value of ``self._error_init()`` on any later failure (db connection
        error, empty member_id, sku lookup error, empty sku list, or an
        exception while building the comment list)
    """
    if goods_id == '':
        self.result_data = {}
        return {}

    self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

    # Notes on an m-site approach that is NOT used yet (left unchanged
    # for lack of time):
    #   * The `_csrf` GET parameter can be read from the page source of
    #     https://m.1688.com/page/offerRemark.htm?offerId=xxxx and reused
    #     for the following four/five-star review request.
    #   * Cookies are required as well (obtainable through the driver);
    #     `ali-ss` and `ali-ss.sig` are the mandatory fields.
    #   * Open problem: only the first page of reviews could be fetched,
    #     however the request was made.

    # Below: the PC-side positive-review API.
    try:
        # buyer_name / comment_date pairs that the db already holds
        db_top_n_buyer_name_and_comment_date_list = get_top_n_buyer_name_and_comment_date_by_goods_id(
            goods_id=goods_id,
            logger=self.lg,
        )
    except SqlServerConnectionException:
        self.lg.error('db 连接异常! 此处抓取跳过!')
        return self._error_init()

    member_id = self._get_this_goods_member_id(goods_id=goods_id)
    self.lg.info('------>>>| 获取到的member_id: {0}'.format(member_id))
    if member_id == '':
        self.lg.error('获取到的member_id为空值!请检查!')
        return self._error_init()

    # The item's existing sku values are read from the db.
    try:
        db_sku_info = _get_sku_info_from_db_by_goods_id(
            goods_id=goods_id,
            logger=self.lg,
        )
    except DBGetGoodsSkuInfoErrorException:
        self.lg.error('获取db goods_id: {} 的sku_info失败! 此处跳过!'.format(goods_id))
        return self._error_init()

    # Explicit check instead of `assert`: the assert was not covered by the
    # except clause above (so it escaped to the caller) and is stripped
    # entirely when Python runs with -O.
    if db_sku_info == []:
        self.lg.error('db_sku_info为空list! 出错goods_id: {}'.format(goods_id))
        return self._error_init()

    # The celery-based fetch is used; a synchronous
    # self._get_all_comment_info(goods_id=..., member_id=...) also exists.
    all_comment_list = self._get_all_comment_info_by_celery(
        goods_id=goods_id,
        member_id=member_id)

    try:
        _comment_list = self._get_comment_list(
            all_comment_list=all_comment_list,
            db_top_n_buyer_name_and_comment_date_list=db_top_n_buyer_name_and_comment_date_list,
            db_sku_info=db_sku_info,
        )
    except Exception:
        self.lg.error('遇到错误[goods_id:{}]:'.format(goods_id), exc_info=True)
        return self._error_init()

    _t = get_shanghai_time()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r

    return self.result_data
def _get_comment_data(self, goods_id):
    """Fetch up to three pages of 1688 reviews through the PC rates API.

    Every failure path resets ``self.result_data`` to ``{}`` and returns
    ``{}``; that includes ending up with no usable comment at all.

    :param goods_id: item id; an empty string is rejected immediately
    :return: the ``CommentItem`` stored in ``self.result_data``, or ``{}``
    """
    if goods_id == '':
        self.result_data = {}
        return {}

    self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

    # An earlier implementation scraped the mobile page
    # (https://m.1688.com/page/offerRemark.htm?offerId=...) through
    # phantomjs and parsed `div.remark-item` nodes; it was given up as
    # too slow.

    # Below: the PC-side positive-review API.
    member_id = self._get_this_goods_member_id(goods_id=goods_id)
    self.my_lg.info('------>>>| 获取到的member_id: {0}'.format(member_id))
    if member_id == '':
        self.my_lg.error('获取到的member_id为空值!请检查!')
        self.result_data = {}
        return {}

    # The item's existing sku values are read from the db.
    sku_info = self._get_sku_info_from_db(goods_id)
    if sku_info == []:
        self.result_data = {}
        return {}

    _comment_list = []
    for page_num in (1, 2, 3):
        self.my_lg.info('------>>>| 正在抓取第{0}页...'.format(page_num))
        params = self._set_params(
            goods_id=goods_id,
            member_id=member_id,
            page_num=page_num)
        url = 'https://rate.1688.com/remark/offerDetail/rates.json'
        # tmp_headers is the very same dict as self.headers, so the
        # referer update is visible on self.headers as well.
        tmp_headers = self.headers
        referer = 'https://detail.1688.com/offer/{0}.html'.format(str(goods_id))
        tmp_headers.update({'referer': referer})

        # Per the original note: both MyRequests and phantomjs kept
        # running into 404 responses here.
        body = MyRequests.get_url_body(url=url, headers=tmp_headers, params=params)
        if body == '':
            self.result_data = {}
            self.my_lg.error('该地址的body为空值, 出错goods_id: {0}'.format(goods_id))
            return {}

        parsed = self.json_str_2_dict(json_str=body)
        if parsed.get('url') is not None:
            # A `url` key is taken to mean a redirect to the 404 page.
            self.my_lg.info('------>>>| 被重定向到404页面, 休眠{0}s中...'.format(self._page_sleep_time))
            sleep(self._page_sleep_time)
            break

        rates = parsed.get('data', {}).get('rates', [])
        if rates == []:
            break

        try:
            for rate in rates:
                buyer_name = rate.get('member', '')

                comment = []
                for rate_item in rate.get('rateItem', []):
                    content = self._wash_comment(rate_item.get('remarkContent', ''))
                    if filter_invalid_comment_content(content):
                        comment.append({
                            'comment': content,
                            'comment_date': str(rate_item.get('remarkTime', '')),  # review date
                            # PC-side 1688 reviews carry no sku, so one of
                            # the db values is picked at random.
                            'sku_info': choice(sku_info),
                            'star_level': rate_item.get('starLevel', 5),
                            'img_url_list': [],
                            'video': '',
                        })

                quantify = rate.get('quantity', 1)  # purchased quantity
                if not comment:
                    # Nothing usable from this buyer: do not record it.
                    continue

                _comment_list.append({
                    'buyer_name': buyer_name,   # buyer nickname
                    'comment': comment,         # review entries
                    'quantify': quantify,       # purchased quantity
                    'head_img': '',             # buyer avatar
                    'append_comment': {},       # follow-up review
                })
        except Exception:
            self.result_data = {}
            self.my_lg.error('出错商品goods_id: {0}'.format(goods_id), exc_info=True)
            return {}

        sleep(self._page_sleep_time)

    if not _comment_list:
        self.my_lg.error('出错goods_id: {0}'.format(goods_id))
        self.result_data = {}
        return {}

    now = datetime.datetime.now()
    record = CommentItem()
    record['goods_id'] = str(goods_id)
    record['create_time'] = now
    record['modify_time'] = now
    record['_comment_list'] = _comment_list
    self.result_data = record

    return self.result_data
def _get_comment_data(self, goods_id):
    """Scrape the first 25 reviews of a 1688 item from its mobile review page.

    The page is rendered through ``self.my_phantomjs`` and every
    ``div.remark-item`` node becomes one comment entry. Each failure path
    resets ``self.result_data`` to ``{}`` and returns ``{}``; that includes
    a single entry whose purchased quantity cannot be parsed.

    :param goods_id: item id; an empty string is rejected immediately
    :return: the ``CommentItem`` stored in ``self.result_data``, or ``{}``
    """
    if goods_id == '':
        self.result_data = {}
        return {}

    self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

    # Original note: phantomjs was used first and later replaced by the
    # captured PC-side API because this path is slow.
    tmp_url = 'https://m.1688.com/page/offerRemark.htm?offerId=' + str(goods_id)
    body = self.my_phantomjs.use_phantomjs_to_get_url_body(
        url=tmp_url,
        exec_code=self._exec_code)

    if body == '':
        self.result_data = {}
        self.my_lg.error('该地址的body为空值, 出错地址: ' + tmp_url)
        return {}

    _html_comment_list = list(Selector(text=body).css('div.remark-item').extract())
    if _html_comment_list == []:
        self.my_lg.error('该商品的comment为空list! 出错地址: ' + tmp_url)
        self.result_data = {}
        return {}

    # Patterns are compiled once instead of once per review entry.
    digits = re.compile(r'\d+')
    span_tag = re.compile(r'<span.*?</span>')

    _comment_list = []
    # Only the first 25 entries are kept; the previous `index > 25` test
    # let a 26th entry through.
    for item in _html_comment_list[:25]:
        # One Selector per entry, reused for every css query.
        entry = Selector(text=item)

        buyer_name = str(entry.css('span.member::text').extract_first())
        quantify = str(entry.css('span.amount::text').extract_first())
        try:
            quantify = int(digits.findall(quantify)[0])
        except IndexError:
            self.my_lg.error('获取quantify时索引异常! 出错地址: ' + tmp_url)
            self.result_data = {}
            return {}

        comment_date = str(entry.css('div.date span::text').extract_first())
        comment_date = self._get_comment_date(comment_date)  # str like '2017-01-25 17:06:00'
        tmp_sku_info = str(entry.css('div.date::text').extract_first())

        comment = [{
            'comment': self._wash_comment(
                str(entry.css('div.bd::text').extract_first())),
            'comment_date': comment_date,                   # review creation date
            'sku_info': span_tag.sub('', tmp_sku_info),     # purchased sku
            'img_url_list': [],
            'star_level': randint(3, 5),                    # star rating, drawn at random
            'video': '',
        }]

        _comment_list.append({
            'buyer_name': buyer_name,   # buyer nickname
            'comment': comment,         # review entries
            'quantify': quantify,       # purchased quantity
            'head_img': '',             # buyer avatar
            'append_comment': {},       # follow-up review
        })

    _t = datetime.datetime.now()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r

    return self.result_data
def _get_comment_data(self, _type: int, goods_id):
    """
    Build the comment record for the given goods_id.

    :param _type: passed through to ``self._get_seller_id`` and
        ``self._get_all_comment_list_by_celery``; an empty string is
        rejected immediately
    :param goods_id: item id; an empty string is rejected immediately
    :return: the ``CommentItem`` stored in ``self.result_data``, or the
        value of ``self._data_error()`` on any failure
    """
    # NOTE: the body previously referred to the builtin `type` instead of
    # the `_type` parameter, so this guard could never trigger and the
    # builtin was handed to _get_seller_id.
    if goods_id == '' or _type == '':
        return self._data_error()

    self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

    try:
        # buyer_name / comment_date pairs that the db already holds
        db_top_n_buyer_name_and_comment_date_list = get_top_n_buyer_name_and_comment_date_by_goods_id(
            goods_id=goods_id,
            logger=self.lg,
        )
    except SqlServerConnectionException:
        self.lg.error('db 连接异常! 此处抓取跳过!')
        return self._data_error()

    try:
        # The seller id is needed first.
        seller_id = self._get_seller_id(_type=_type, goods_id=goods_id)
        self.lg.info('------>>>| 获取到的seller_id: {}'.format(seller_id))

        # sku_info list from the db
        db_sku_info_list = _get_sku_info_from_db_by_goods_id(
            goods_id=goods_id,
            logger=self.lg,
        )
    except (AssertionError, IndexError):
        self.lg.error('遇到错误[goods_id:{}]:'.format(goods_id), exc_info=True)
        return self._data_error()
    except DBGetGoodsSkuInfoErrorException:
        self.lg.error(
            '获取db goods_id: {} 的sku_info失败! 此处跳过!'.format(goods_id))
        return self._data_error()

    # The celery-based fetch is used; a synchronous
    # self._get_all_comment_list(goods_id=..., seller_id=..., _type=...)
    # alternative also exists.
    all_comment_list = self._get_all_comment_list_by_celery(
        goods_id=goods_id,
        seller_id=seller_id,
        _type=_type)

    try:
        _comment_list = self._get_comment_list(
            all_comment_list=all_comment_list,
            db_top_n_buyer_name_and_comment_date_list=db_top_n_buyer_name_and_comment_date_list,
            db_sku_info_list=db_sku_info_list)
    except Exception as e:
        self.lg.error('出错type:{0}, goods_id:{1}'.format(str(_type), goods_id))
        self.lg.exception(e)
        return self._data_error()

    _t = get_shanghai_time()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r

    return self.result_data
def _get_comment_data(self, type: int, goods_id):
    """Fetch up to three pages of Tmall reviews for one item.

    Every failure path resets ``self.result_data`` to ``{}`` and returns
    ``{}``. A page whose payload cannot be extracted or parsed is logged
    and contributes no comments, but does not abort the call.

    :param type: passed through to ``self._get_seller_id``; an empty
        string is rejected immediately. The name shadows the builtin and
        is kept only so existing keyword callers keep working.
    :param goods_id: item id; an empty string is rejected immediately
    :return: the ``CommentItem`` stored in ``self.result_data``, or ``{}``
    """
    if goods_id == '' or type == '':
        self.result_data = {}
        return {}

    self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

    # The seller id is needed first.
    try:
        seller_id = self._get_seller_id(type=type, goods_id=goods_id)
    except (AssertionError, IndexError) as e:
        # `except AssertionError or IndexError` evaluated to just
        # AssertionError, so IndexError used to escape.
        self.lg.error('出错goods_id: %s' % goods_id)
        # A message-less exception has empty args; fall back to its repr.
        self.lg.error(e.args[0] if e.args else repr(e))
        self.result_data = {}
        self.random_sku_info_list = []
        return {}

    # Then the sku list that is stored on self.random_sku_info_list.
    try:
        self.random_sku_info_list = self._get_random_sku_info_list()
    except Exception as e:
        self.lg.error('出错goods_id: %s' % str(goods_id))
        self.lg.exception(e)
        self.result_data = {}
        self.random_sku_info_list = []
        return {}

    base_url = 'https://rate.tmall.com/list_detail_rate.htm'
    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    # Compiled once instead of once per page.
    wrapped_payload = re.compile(r'\((.*)\)')

    _tmp_comment_list = []
    for current_page in range(1, 4):
        self.lg.info('------>>>| 正在抓取第 {0} 页的评论...'.format(str(current_page)))
        params = self._set_params(
            goods_id=goods_id,
            seller_id=seller_id,
            current_page=current_page)
        self.headers.update({
            'referer': 'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)
        })

        # Original note: proxied plain requests returned no data because
        # they carried no cookies, so the API is fetched through phantomjs
        # with the params folded into the url.
        _url = _get_url_contain_params(url=base_url, params=params)
        body = self.driver.use_phantomjs_to_get_url_body(url=_url)
        if body == '':
            self.lg.error('获取到的body为空str! 出错type:{0}, goods_id:{1}'.format(
                str(type), goods_id))
            self.result_data = {}
            return {}

        # Keep only the text between the first '(' and the last ')'.
        found = wrapped_payload.findall(body)
        if not found:
            self.lg.error('索引异常! 出错type:{0}, goods_id:{1}'.format(
                str(type), goods_id))
            data = []
        else:
            try:
                # `or []` covers an explicit null rateList.
                data = json.loads(found[0]).get('rateDetail', {}).get('rateList', []) or []
            except Exception:
                # Best effort, as before, but without the bare `except:`
                # that also swallowed KeyboardInterrupt/SystemExit.
                data = []
                self.lg.error('json.loads转换_出错! 出错type:{0}, goods_id:{1}'.format(
                    str(type), goods_id))

        _tmp_comment_list += data
        sleep(self.comment_page_switch_sleep_time)

    try:
        _comment_list = self._get_comment_list(
            _tmp_comment_list=_tmp_comment_list)
    except Exception as e:
        self.lg.error('出错type:{0}, goods_id:{1}'.format(str(type), goods_id))
        self.lg.exception(e)
        self.result_data = {}
        return {}

    _t = datetime.datetime.now()
    _r = CommentItem()
    _r['goods_id'] = str(goods_id)
    _r['create_time'] = _t
    _r['modify_time'] = _t
    _r['_comment_list'] = _comment_list
    self.result_data = _r

    return self.result_data