コード例 #1
0
    def _get_comment_data(self, goods_id):
        """
        Collect comments for one item from rate.taobao.com (pages 1 to 3).

        Every page body is unwrapped from its surrounding parentheses, parsed
        with ``json_2_dict`` and its ``comments`` entries are accumulated; the
        accumulated list is then converted by ``self._get_comment_list``.

        :param goods_id: item id; an empty string aborts immediately
        :return: the ``CommentItem`` that is also stored in
            ``self.result_data``, or ``{}`` when any step fails
        """
        if goods_id == '':
            self.result_data = {}
            return {}

        # The id is concatenated into the referer and into log messages, so
        # normalise it once; a non-str id used to raise TypeError there.
        goods_id = str(goods_id)
        self.my_lg.info('------>>>| 待抓取的goods_id: %s' % goods_id)

        # The address below is the PC-side data endpoint.
        tmp_url = 'https://rate.taobao.com/feedRateList.htm'
        # Raw string: '\(' inside a plain literal is an invalid escape sequence.
        wrapped_payload = re.compile(r'\((.*)\)')
        _tmp_comment_list = []
        for current_page_num in range(1, 4):
            self.my_lg.info('------>>>| 正在抓取第%s页评论...' % str(current_page_num))
            _params = self._set_params(current_page_num=current_page_num, goods_id=goods_id)

            self.headers.update({'referer': 'https://item.taobao.com/item.htm?id=' + goods_id})
            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers, params=_params, encoding='gbk')

            try:
                # Greedy match: everything between the first '(' and the last ')'.
                body = wrapped_payload.findall(body)[0]
            except IndexError:
                self.my_lg.error('re得到需求body时出错! 出错goods_id: ' + goods_id)
                sleep(.5)
                self.result_data = {}
                return {}

            data = json_2_dict(json_str=body, logger=self.my_lg).get('comments')
            if data is None:
                self.my_lg.error('出错goods_id: ' + goods_id)
                self.result_data = {}
                return {}
            if data == []:
                # This page has "comments" == []: move on to the next page
                # (without the page-switch sleep, as before).
                continue

            _tmp_comment_list += data
            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错goods_id: ' + goods_id)
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()

        _r = CommentItem()
        _r['goods_id'] = goods_id
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r

        return self.result_data
コード例 #2
0
    def _get_comment_data(self, goods_id):
        """
        Build the comment record for a single goods_id.

        :param goods_id: id of the goods to process; '' is rejected
        :return: the CommentItem saved to self.result_data on success,
            otherwise whatever self._data_error_init() returns
        """
        if goods_id == '':
            return self._data_error_init()

        self.lg.info('------>>>| 待抓取的goods_id: {}'.format(goods_id))

        # buyer_name / comment_date entries that the db already holds
        try:
            known_buyer_dates = get_top_n_buyer_name_and_comment_date_by_goods_id(
                goods_id=goods_id, logger=self.lg)
        except SqlServerConnectionException:
            self.lg.error('db 连接异常! 此处抓取跳过!')
            return self._data_error_init()

        try:
            sku_rows = _get_sku_info_from_db_by_goods_id(
                goods_id=goods_id, logger=self.lg)
        except DBGetGoodsSkuInfoErrorException:
            message = '获取db goods_id: {} 的sku_info失败! 此处跳过!'.format(goods_id)
            self.lg.error(message)
            return self._data_error_init()

        # The celery variant is the active one; the synchronous
        # self._get_all_comment_info(goods_id=goods_id) is the disabled
        # alternative.
        raw_comments = self._get_all_comment_info_by_celery(goods_id=goods_id)

        try:
            _comment_list = self._get_comment_list(
                all_comment_list=raw_comments,
                db_top_n_buyer_name_and_comment_date_list=known_buyer_dates,
                db_sku_info_list=sku_rows)
        except Exception as e:
            self.lg.error('出错goods_id: ' + goods_id)
            self.lg.exception(e)
            return self._data_error_init()

        now = get_shanghai_time()
        record = CommentItem()
        record['goods_id'] = str(goods_id)
        # create and modify timestamps share the same value
        record['create_time'] = record['modify_time'] = now
        record['_comment_list'] = _comment_list
        self.result_data = record

        return self.result_data
コード例 #3
0
    def _get_comment_data(self, goods_id):
        """
        Gather comments for one item from the item.m.jd.com comment endpoint.

        Pages 1 and 2 are requested; their ``commentInfoList`` entries are
        accumulated and converted by ``self._get_comment_list``.

        :param goods_id: item id; an empty string aborts immediately
        :return: the CommentItem stored in self.result_data, or {} on failure
        """
        if goods_id == '':
            self.result_data = {}
            return {}

        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
        self.goods_id = goods_id

        referer = 'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id)
        self.headers.update({'referer': referer})

        # Comments come from the JD mobile-site review endpoint.
        comment_api = 'https://item.m.jd.com/newComments/newCommentsDetail.json'
        _tmp_comment_list = []
        for current_page in (1, 2):
            params = self._set_params(goods_id=goods_id, current_page=current_page)
            body = MyRequests.get_url_body(url=comment_api, headers=self.headers, params=params)

            parsed = json_2_dict(json_str=body, logger=self.my_lg)
            page_comments = parsed.get('wareDetailComment', {}).get('commentInfoList', [])
            if page_comments == []:
                # An empty page is only logged; the loop carries on.
                self.my_lg.error('出错goods_id:{0}'.format(self.goods_id))

            _tmp_comment_list.extend(page_comments)
            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(_tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.my_lg.error('出错goods_id:{0}'.format(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        now = datetime.datetime.now()
        record = CommentItem()
        record['goods_id'] = str(goods_id)
        # create and modify timestamps share the same value
        record['create_time'] = record['modify_time'] = now
        record['_comment_list'] = _comment_list
        self.result_data = record

        return self.result_data
コード例 #4
0
    def _get_comment_data(self, goods_id):
        """
        Build the comment record for one goods_id from three comment pages.

        A page whose fetch raises is logged and skipped; the remaining pages
        are still requested.

        :param goods_id: id of the goods to process; '' is rejected
        :return: the CommentItem saved to self.result_data on success,
            otherwise whatever self._data_error() returns
        """
        if goods_id == '':
            return self._data_error()

        self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        # buyer_name / comment_date entries that the db already holds
        try:
            known_buyer_dates = get_top_n_buyer_name_and_comment_date_by_goods_id(
                goods_id=goods_id, logger=self.lg)
        except SqlServerConnectionException:
            self.lg.error('db 连接异常! 此处抓取跳过!')
            return self._data_error()

        # Pages are pulled from the JD mobile-site reviews.
        _tmp_comment_list = []
        for current_page in (1, 2, 3):
            try:
                page_comments = self._get_one_page_comment_info(
                    goods_id=goods_id, page_num=current_page)
            except Exception:
                # AssertionError is a subclass of Exception, so this covers
                # both of the originally listed types.
                self.lg.error('遇到错误:', exc_info=True)
                continue

            _tmp_comment_list.extend(page_comments)
            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list,
                db_top_n_buyer_name_and_comment_date_list=known_buyer_dates)
        except Exception:
            self.lg.error('出错goods_id:{0}'.format(goods_id), exc_info=True)
            return self._data_error()

        now = get_shanghai_time()
        record = CommentItem()
        record['goods_id'] = str(goods_id)
        # create and modify timestamps share the same value
        record['create_time'] = record['modify_time'] = now
        record['_comment_list'] = _comment_list
        self.result_data = record

        return self.result_data
コード例 #5
0
    def _get_comment_data(self, goods_id):
        """
        Build the comment record for a single goods_id.

        :param goods_id: id of the goods to process; '' is rejected
        :return: the CommentItem saved to self.result_data on success,
            otherwise whatever self._data_error() returns
        """
        if goods_id == '':
            return self._data_error()

        self.lg.info('------>>>| 待抓取的goods_id: {}'.format(goods_id))

        # buyer_name / comment_date entries that the db already holds
        try:
            known_buyer_dates = get_top_n_buyer_name_and_comment_date_by_goods_id(
                goods_id=goods_id, logger=self.lg)
        except SqlServerConnectionException:
            self.lg.error('db 连接异常! 此处抓取跳过!')
            return self._data_error()

        # The celery variant is the active one; the synchronous
        # self._get_all_comment_info(goods_id=goods_id) is the disabled
        # alternative.
        raw_comments = self._get_all_comment_info_by_celery(goods_id=goods_id)

        try:
            _comment_list = self._get_comment_list(
                all_comment_list=raw_comments,
                db_top_n_buyer_name_and_comment_date_list=known_buyer_dates)
        except Exception as e:
            self.lg.error('出错goods_id: ' + goods_id)
            self.lg.exception(e)
            return self._data_error()

        now = get_shanghai_time()
        record = CommentItem()
        record['goods_id'] = str(goods_id)
        # create and modify timestamps share the same value
        record['create_time'] = record['modify_time'] = now
        record['_comment_list'] = _comment_list
        self.result_data = record

        return self.result_data
コード例 #6
0
    def _get_comment_data(self, goods_id):
        """
        Collect comments for one item from the th5.m.zhe800.com comment list.

        Up to three pages are requested. Paging stops as soon as a response's
        ``hasNext`` value is falsy. A response that carries neither
        ``comments`` nor ``hasNext`` is treated as a failure.

        :param goods_id: item id; an empty string aborts immediately
        :return: the CommentItem stored in self.result_data, or {} on failure
        """
        if goods_id == '':
            self.result_data = {}
            return {}
        _tmp_comment_list = []
        self.lg.info('------>>>| 待抓取的goods_id: %s' % goods_id)

        # Data address on m.zhe800.com.
        tmp_url = 'https://th5.m.zhe800.com/app/detail/comment/list'
        for current_page_num in range(1, 4):  # the first page is 1
            self.lg.info('------>>>| 正在抓取第%s页评论...' % str(current_page_num))
            _params = self._set_params(current_page_num=current_page_num,
                                       goods_id=goods_id)

            self.headers.update({
                'referer':
                'https://th5.m.zhe800.com/h5/comment/list?zid={0}&dealId=39890410&tagId='
                .format(str(goods_id))
            })
            body = Requests.get_url_body(url=tmp_url,
                                         headers=self.headers,
                                         params=_params,
                                         encoding='utf-8',
                                         ip_pool_type=self.ip_pool_type)

            data = json_2_dict(json_str=body, logger=self.lg)
            comments = data.get('comments')
            # NOTE(review): the original author observed this to be a bool
            # when present - confirm against the live response.
            has_next = data.get('hasNext')

            # Abnormal exit when the response has neither key. This has to
            # run before the hasNext test below: a missing hasNext is falsy,
            # so the old ordering always hit the `break` first and this
            # branch could never execute.
            if comments is None and has_next is None:
                self.lg.error('获取到的data为None, 出错goods_id: ' + str(goods_id))
                self.result_data = {}
                return {}

            if comments is not None:
                _tmp_comment_list += comments

            # Stop when the server reports no further page.
            if not has_next:
                break

            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.lg.error('出错goods_id: ' + str(goods_id))
            self.lg.exception(e)
            self.result_data = {}
            return {}

        _t = get_shanghai_time()

        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r

        return self.result_data
コード例 #7
0
    def _get_comment_data(self, goods_id):
        """
        Build the comment record for one 1688 goods_id.

        Steps: load the buyer/date entries already in the db, resolve the
        member_id, load the sku info stored in the db, gather the raw
        comments through the celery variant, then convert them with
        ``self._get_comment_list``.

        :param goods_id: id of the goods to process; '' is rejected
        :return: the CommentItem saved to self.result_data on success; {} for
            an empty goods_id; otherwise whatever self._error_init() returns
        """
        if goods_id == '':
            self.result_data = {}
            return {}
        self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        # NOTE: notes kept from an unfinished rework against the m-site API
        # (https://m.1688.com/page/offerRemark.htm?offerId=xxxx):
        #   * the page source carries a "csrf" value that is the _csrf
        #     parameter needed by the next request for 4/5-star reviews;
        #   * cookies are required as well (obtainable through the driver),
        #     with ali-ss and ali-ss.sig being the mandatory fields;
        #   * open problem: whatever was requested, only the first page of
        #     comments came back.
        # The rework was postponed for lack of time, so it is not implemented
        # here.

        # Below: emulate the PC-side positive-review endpoint.
        try:
            # buyer_name / comment_date entries that the db already holds
            db_top_n_buyer_name_and_comment_date_list = get_top_n_buyer_name_and_comment_date_by_goods_id(
                goods_id=goods_id,
                logger=self.lg,)
        except SqlServerConnectionException:
            self.lg.error('db 连接异常! 此处抓取跳过!')
            return self._error_init()

        member_id = self._get_this_goods_member_id(goods_id=goods_id)
        self.lg.info('------>>>| 获取到的member_id: {0}'.format(member_id))
        if member_id == '':
            self.lg.error('获取到的member_id为空值!请检查!')
            return self._error_init()

        # Fetch the spec values previously stored in the db for this item.
        try:
            db_sku_info = _get_sku_info_from_db_by_goods_id(
                goods_id=goods_id,
                logger=self.lg,)
        except DBGetGoodsSkuInfoErrorException:
            self.lg.error('获取db goods_id: {} 的sku_info失败! 此处跳过!'.format(goods_id))
            return self._error_init()

        # This used to be an `assert`, which is stripped under -O and whose
        # AssertionError was not caught by the handler above; handle the
        # empty list the same way as every other failure in this method.
        if db_sku_info == []:
            self.lg.error('db_sku_info为空list!')
            return self._error_init()

        # The celery variant is the active one; the synchronous
        # self._get_all_comment_info(goods_id=..., member_id=...) is the
        # disabled alternative.
        all_comment_list = self._get_all_comment_info_by_celery(goods_id=goods_id, member_id=member_id)

        try:
            _comment_list = self._get_comment_list(
                all_comment_list=all_comment_list,
                db_top_n_buyer_name_and_comment_date_list=db_top_n_buyer_name_and_comment_date_list,
                db_sku_info=db_sku_info,
            )
        except Exception:
            self.lg.error('遇到错误[goods_id:{}]:'.format(goods_id), exc_info=True)
            return self._error_init()

        _t = get_shanghai_time()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r

        return self.result_data
コード例 #8
0
ファイル: ali_1688_comment_parse.py プロジェクト: fzdr/python
    def _get_comment_data(self, goods_id):
        """
        Gather comments for one 1688 item through the rate.1688.com endpoint.

        Up to three pages are requested. Every rate entry is turned into a
        buyer record whose comment list holds the washed, valid remarks; a
        sku value is picked at random from the db values for each remark.

        :param goods_id: item id; an empty string aborts immediately
        :return: the CommentItem stored in self.result_data, or {} when a
            step fails or no comment was collected
        """
        if goods_id == '':
            self.result_data = {}
            return {}

        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        # NOTE: an earlier variant rendered
        # https://m.1688.com/page/offerRemark.htm?offerId=<goods_id> with
        # phantomjs and scraped the 'div.remark-item' nodes of the html; it
        # was given up because it was slow.

        # Below: emulate the PC-side positive-review endpoint.
        member_id = self._get_this_goods_member_id(goods_id=goods_id)
        self.my_lg.info('------>>>| 获取到的member_id: {0}'.format(member_id))
        if member_id == '':
            self.my_lg.error('获取到的member_id为空值!请检查!')
            self.result_data = {}
            return {}

        # Spec values previously stored in the db for this item.
        sku_info = self._get_sku_info_from_db(goods_id)
        if sku_info == []:
            self.result_data = {}
            return {}

        rates_url = 'https://rate.1688.com/remark/offerDetail/rates.json'
        referer = 'https://detail.1688.com/offer/{0}.html'.format(str(goods_id))
        _comment_list = []
        for page_num in (1, 2, 3):
            self.my_lg.info('------>>>| 正在抓取第{0}页...'.format(page_num))
            params = self._set_params(goods_id=goods_id, member_id=member_id, page_num=page_num)

            # Same object as self.headers, so the referer is stored on the
            # instance as well.
            request_headers = self.headers
            request_headers.update({'referer': referer})

            # Author's note: MyRequests kept answering 404 here, and a
            # phantomjs based attempt kept answering 404 too.
            body = MyRequests.get_url_body(url=rates_url, headers=request_headers, params=params)
            if body == '':
                self.result_data = {}
                self.my_lg.error('该地址的body为空值, 出错goods_id: {0}'.format(goods_id))
                return {}

            parsed = self.json_str_2_dict(json_str=body)
            if parsed.get('url') is not None:
                # A 'url' key is how the redirect to the 404 page shows up.
                self.my_lg.info('------>>>| 被重定向到404页面, 休眠{0}s中...'.format(self._page_sleep_time))
                sleep(self._page_sleep_time)
                break

            rates = parsed.get('data', {}).get('rates', [])
            if rates == []:
                break

            try:
                for rate in rates:
                    buyer_name = rate.get('member', '')

                    comment = []
                    for entry in rate.get('rateItem', []):
                        washed = self._wash_comment(entry.get('remarkContent', ''))
                        if filter_invalid_comment_content(washed):
                            comment.append({
                                'comment': washed,
                                # review date
                                'comment_date': str(entry.get('remarkTime', '')),
                                # purchased spec (PC-side 1688 items expose none)
                                'sku_info': choice(sku_info),
                                'star_level': entry.get('starLevel', 5),
                                'img_url_list': [],
                                'video': '',
                            })

                    quantify = rate.get('quantity', 1)
                    if not comment:
                        # nothing valid left for this buyer: do not record it
                        continue

                    _comment_list.append({
                        'buyer_name': buyer_name,   # buyer nickname
                        'comment': comment,         # review content
                        'quantify': quantify,       # purchased quantity
                        'head_img': '',             # user avatar
                        'append_comment': {},       # follow-up review
                    })
            except Exception:
                self.result_data = {}
                self.my_lg.error('出错商品goods_id: {0}'.format(goods_id), exc_info=True)
                return {}

            sleep(self._page_sleep_time)

        if _comment_list == []:
            self.my_lg.error('出错goods_id: {0}'.format(goods_id))
            self.result_data = {}
            return {}

        now = datetime.datetime.now()
        record = CommentItem()
        record['goods_id'] = str(goods_id)
        # create and modify timestamps share the same value
        record['create_time'] = record['modify_time'] = now
        record['_comment_list'] = _comment_list
        self.result_data = record

        return self.result_data
コード例 #9
0
    def _get_comment_data(self, goods_id):
        """
        Scrape comments for one item from the rendered m.1688.com remark page.

        The page is rendered through ``self.my_phantomjs``, every
        'div.remark-item' node becomes one buyer record, and at most the
        first 25 nodes are used.

        :param goods_id: item id; an empty string aborts immediately
        :return: the CommentItem stored in self.result_data, or {} when the
            page is empty, holds no remark node, or a quantity is missing
        """
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        # NOTE: according to the original author's remark this phantomjs
        # route is slow and was given up in favour of a captured PC-side API.
        tmp_url = 'https://m.1688.com/page/offerRemark.htm?offerId=' + str(
            goods_id)
        body = self.my_phantomjs.use_phantomjs_to_get_url_body(
            url=tmp_url, exec_code=self._exec_code)

        if body == '':
            self.result_data = {}
            self.my_lg.error('该地址的body为空值, 出错地址: ' + tmp_url)
            return {}

        _html_comment_list = list(
            Selector(text=body).css('div.remark-item').extract())
        if _html_comment_list == []:
            self.my_lg.error('该商品的comment为空list! 出错地址: ' + tmp_url)
            self.result_data = {}
            return {}

        # Compiled once instead of once per remark.
        digits = re.compile(r'\d+')
        span_tag = re.compile(r'<span.*?</span>')

        _comment_list = []
        # Keep only the first 25 remarks. The previous `index > 25` guard
        # let a 26th one through.
        for item in _html_comment_list[:25]:
            # Parse the fragment once and reuse it for every field.
            node = Selector(text=item)

            # NOTE(review): str() turns a missing node (None) into the text
            # 'None' for the fields below - kept as in the original.
            buyer_name = str(node.css('span.member::text').extract_first())
            quantify = str(node.css('span.amount::text').extract_first())
            try:
                quantify = int(digits.findall(quantify)[0])
            except IndexError:
                self.my_lg.error('获取quantify时索引异常! 出错地址: ' + tmp_url)
                self.result_data = {}
                return {}

            comment_date = str(
                node.css('div.date span::text').extract_first())
            # the original notes the result looks like '2017-01-25 17:06:00'
            comment_date = self._get_comment_date(comment_date)
            tmp_sku_info = str(node.css('div.date::text').extract_first())
            comment = [{
                'comment': self._wash_comment(
                    str(node.css('div.bd::text').extract_first())),
                'comment_date': comment_date,  # review creation date
                'sku_info': span_tag.sub('', tmp_sku_info),  # purchased spec
                'img_url_list': [],
                'star_level': randint(3, 5),  # star rating, picked at random
                'video': '',
            }]

            _comment_list.append({
                'buyer_name': buyer_name,  # buyer nickname
                'comment': comment,  # review content
                'quantify': quantify,  # purchased quantity
                'head_img': '',  # user avatar
                'append_comment': {},  # follow-up review
            })

        _t = datetime.datetime.now()

        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r
        return self.result_data
コード例 #10
0
ファイル: tmall_comment_parse.py プロジェクト: devyru/python
    def _get_comment_data(self, _type: int, goods_id):
        """
        Build the comment record for the given goods_id.

        :param _type: forwarded to ``self._get_seller_id`` and
            ``self._get_all_comment_list_by_celery``; '' is rejected
        :param goods_id: id of the goods to process; '' is rejected
        :return: the CommentItem saved to self.result_data on success,
            otherwise whatever self._data_error() returns
        """
        # The body used to test the builtin `type` here instead of the
        # `_type` parameter, so the second half of the guard never fired.
        if goods_id == '' or _type == '':
            return self._data_error()

        self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))
        try:
            # buyer_name / comment_date entries that the db already holds
            db_top_n_buyer_name_and_comment_date_list = get_top_n_buyer_name_and_comment_date_by_goods_id(
                goods_id=goods_id,
                logger=self.lg,
            )
        except SqlServerConnectionException:
            self.lg.error('db 连接异常! 此处抓取跳过!')
            return self._data_error()

        try:
            # Resolve the sellerId first (pass the caller's value, not the
            # builtin `type`).
            seller_id = self._get_seller_id(_type=_type, goods_id=goods_id)
            self.lg.info('------>>>| 获取到的seller_id: {}'.format(seller_id))

            # Load the sku_info list stored in the db.
            db_sku_info_list = _get_sku_info_from_db_by_goods_id(
                goods_id=goods_id,
                logger=self.lg,
            )
        except (
                AssertionError,
                IndexError,
        ):
            self.lg.error('遇到错误[goods_id:{}]:'.format(goods_id), exc_info=True)
            return self._data_error()

        except DBGetGoodsSkuInfoErrorException:
            self.lg.error(
                '获取db goods_id: {} 的sku_info失败! 此处跳过!'.format(goods_id))
            return self._data_error()

        # The celery variant is the active one; the synchronous
        # self._get_all_comment_list(goods_id=..., seller_id=..., _type=...)
        # is the disabled alternative.
        all_comment_list = self._get_all_comment_list_by_celery(
            goods_id=goods_id, seller_id=seller_id, _type=_type)

        try:
            _comment_list = self._get_comment_list(
                all_comment_list=all_comment_list,
                db_top_n_buyer_name_and_comment_date_list=
                db_top_n_buyer_name_and_comment_date_list,
                db_sku_info_list=db_sku_info_list)
        except Exception as e:
            self.lg.error('出错type:{0}, goods_id:{1}'.format(
                str(_type), goods_id))
            self.lg.exception(e)
            return self._data_error()

        _t = get_shanghai_time()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r

        return self.result_data
コード例 #11
0
    def _get_comment_data(self, type: int, goods_id):
        """
        Gather comments for one item from rate.tmall.com (pages 1 to 3).

        The sellerId is resolved first, then a random sku info list is
        prepared, then each page is rendered through ``self.driver`` and its
        ``rateDetail.rateList`` entries are accumulated and converted by
        ``self._get_comment_list``.

        :param type: forwarded to ``self._get_seller_id``; '' is rejected.
            The name shadows the builtin but is kept because callers may pass
            it by keyword.
        :param goods_id: item id; an empty string aborts immediately
        :return: the CommentItem stored in self.result_data, or {} on failure
        """
        if goods_id == '' or type == '':
            self.result_data = {}
            return {}
        self.lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        # Resolve the sellerId first.
        try:
            seller_id = self._get_seller_id(type=type, goods_id=goods_id)
        except (AssertionError, IndexError) as e:
            # `except AssertionError or IndexError` only ever caught
            # AssertionError; a tuple is needed to catch both.
            self.lg.error('出错goods_id: %s' % goods_id)
            # str(e) instead of e.args[0]: args may be empty.
            self.lg.error(str(e))
            self.result_data = {}
            self.random_sku_info_list = []
            return {}

        # Then prepare the price info list.
        try:
            self.random_sku_info_list = self._get_random_sku_info_list()
        except Exception as e:
            self.lg.error('出错goods_id: %s' % str(goods_id))
            self.lg.exception(e)
            self.result_data = {}
            self.random_sku_info_list = []
            return {}

        base_url = 'https://rate.tmall.com/list_detail_rate.htm'
        # Raw string: '\(' inside a plain literal is an invalid escape sequence.
        wrapped_payload = re.compile(r'\((.*)\)')
        _tmp_comment_list = []
        for current_page in range(1, 4):
            self.lg.info('------>>>| 正在抓取第 {0} 页的评论...'.format(
                str(current_page)))

            params = self._set_params(goods_id=goods_id,
                                      seller_id=seller_id,
                                      current_page=current_page)
            self.headers.update({
                'referer':
                'https://detail.m.tmall.com/item.htm?id=' + str(goods_id)
            })

            # Author's note: proxied plain requests returned no data because
            # they carried no cookies, so the api is fetched with phantomjs.
            # The params are folded into the url for that purpose.
            _url = _get_url_contain_params(url=base_url, params=params)

            body = self.driver.use_phantomjs_to_get_url_body(url=_url)
            if body == '':
                self.lg.error('获取到的body为空str! 出错type:{0}, goods_id:{1}'.format(
                    str(type), goods_id))
                self.result_data = {}
                return {}

            data = []
            try:
                # Greedy match: everything between the first '(' and the last ')'.
                payload = wrapped_payload.findall(body)[0]
            except IndexError:
                # No wrapped payload on this page: contribute nothing. This
                # used to feed a dict to json.loads and rely on the TypeError.
                self.lg.error('索引异常! 出错type:{0}, goods_id:{1}'.format(
                    str(type), goods_id))
            else:
                try:
                    data = json.loads(payload).get('rateDetail', {}).get('rateList', [])
                except Exception:
                    # Best effort per page, but no longer a bare `except:`
                    # that would also swallow KeyboardInterrupt/SystemExit.
                    data = []
                    self.lg.error(
                        'json.loads转换_出错! 出错type:{0}, goods_id:{1}'.format(
                            str(type), goods_id))

            _tmp_comment_list += data
            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            self.lg.error('出错type:{0}, goods_id:{1}'.format(
                str(type), goods_id))
            self.lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r

        return self.result_data