Python filter_invalid_comment_content Examples, fzutils.cp_utils.filter_invalid_comment_content Python Examples

Example #1

0

Show file

    def _get_comment_list(self, all_comment_list, db_top_n_buyer_name_and_comment_date_list, db_sku_info):
        """
        转换成结果集
        :param all_comment_list:
        :return:
        """
        _comment_list = []
        for item in all_comment_list:
            buyer_name = item.get('member', '')
            comment = []
            # TODO item.get('rateItem', [])只取第一条comment
            try:
                first_rate_item = item.get('rateItem', [])[0]
            except IndexError:
                continue

            _comment_content = wash_goods_comment(comment_content=first_rate_item.get('remarkContent', ''))
            if not filter_invalid_comment_content(_comment_content):
                continue

            comment_date = self._get_comment_date2(item=first_rate_item)
            comment.append({
                'comment': _comment_content,
                'comment_date': comment_date,
                'sku_info': choice(db_sku_info),  # 购买的商品规格(pc端1688商品没有规格)
                'star_level': first_rate_item.get('starLevel', 5),
                'img_url_list': [],
                'video': '',
            })
            quantify = item.get('quantity', 1)  # 购买数量
            if comment == []:  # 为空不录入
                continue

            if not filter_crawled_comment_content(
                    new_buyer_name=buyer_name,
                    new_comment_date=comment_date,
                    db_buyer_name_and_comment_date_info=db_top_n_buyer_name_and_comment_date_list,
                    logger=self.lg):
                # 过滤已采集的comment
                continue

            _ = {
                'buyer_name': buyer_name,  # 买家昵称
                'comment': comment,  # 评论内容
                'quantify': quantify,  # 购买数量
                'head_img': '',  # 用户头像
                'append_comment': {},  # 追评
            }
            _comment_list.append(_)

        return _comment_list

Example #2

0

Show file

    def _get_comment_list(self, _tmp_comment_list):
        '''
        转换成需求的结果集
        :param _tmp_comment_list:
        :return:
        '''
        _comment_list = []
        for item in _tmp_comment_list:
            _comment_date = item.get('commentDate', '')
            assert _comment_date != '', '得到的_comment_date为空str!请检查!'

            # sku_info(有些商品评论是没有规格的所以默认为空即可，不加assert检查!)
            ware_attributes = item.get('wareAttributes', [])
            # self.lg.info(str(ware_attributes))
            sku_info = ' '.join([i.get('key', '')+':'+i.get('value', '') for i in ware_attributes])
            # assert sku_info != '', '得到的sku_info为空str!请检查!'

            _comment_content = item.get('commentData', '')
            assert _comment_content != '', '得到的评论内容为空str!请检查!'
            _comment_content = self._wash_comment(comment=_comment_content)

            buyer_name = item.get('userNickName', '')
            assert buyer_name != '', '得到的用户昵称为空值!请检查!'

            # jd设置默认 购买量为1
            quantify = 1

            head_img = item.get('userImgURL', '')
            assert head_img != '', '得到的用户头像为空值!请检查!'
            head_img = 'https://' + head_img

            # 第一次评论图片
            _comment_img_list = item.get('pictureInfoList', [])
            if _comment_img_list != []:
                _comment_img_list = [{'img_url': img.get('largePicURL', '')} for img in _comment_img_list]

            '''追评'''
            append_comment = {}

            # star_level
            star_level = int(item.get('commentScore', '5'))

            if not filter_invalid_comment_content(_comment_content):
                continue

            comment = [{
                'comment': _comment_content,
                'comment_date': _comment_date,
                'sku_info': sku_info,
                'img_url_list': _comment_img_list,
                'star_level': star_level,
                'video': '',
            }]

            _comment_list.append({
                'buyer_name': buyer_name,  # 买家昵称
                'comment': comment,  # 评论内容
                'quantify': quantify,  # 评论数量
                'head_img': head_img,  # 头像
                'append_comment': append_comment,  # 追评
            })

        return _comment_list

Example #3

0

Show file

File: zhe_800_comment_parse.py Project: ttggaa/Python_example

    def _get_comment_list(self, _tmp_comment_list):
        '''
        获取规范化的comment结果集
        :param _tmp_comment_list:
        :return:
        '''
        _comment_list = []
        for item in _tmp_comment_list:
            comment_date = item.get('createTime', '')
            assert comment_date != '', '得到的comment_date为空str!请检查!'
            comment_date = self._get_comment_date(comment_date)

            buyer_name = item.get('nickname', '')
            assert buyer_name != '', '得到的用户昵称为空值!请检查!'

            _comment_content = item.get('content', '')
            assert _comment_content != '', '得到的评论内容为空str!请检查!'
            _comment_content = self._wash_comment(comment=_comment_content)

            sku_info = item.get('skuDesc', '')
            # self.lg.info(sku_info)
            # 存在规格为空的
            # assert sku_info != '', '得到的sku_info为空str!请检查!'
            sku_info = self._wash_sku_info(sku_info)

            # 第一次评论照片
            img_url_list = item.get('firstEvidences', [])
            # self.lg.info(img_url_list)
            if img_url_list is None:
                img_url_list = []
            else:
                img_url_list = [{
                    'img_url': _i.get('big', '')
                } for _i in img_url_list]
            '''追评'''
            append_comment = {}
            if item.get('appendTime', '') == '':  # 追评时间为空即表示无追评
                pass
            else:
                _tmp_append_comment_content = item.get('append', '')
                # 追评的图片
                _append_comment_img_list = [
                    {
                        'img_url': img.get('big', '')
                    } for img in item.get('appendEvidences')
                ] if item.get('appendEvidences') is not None else []
                # self.lg.info(_append_comment_img_list)

                append_comment = {
                    'comment_date': item.get('appendTime', ''),
                    'comment': self._wash_comment(_tmp_append_comment_content),
                    'img_url_list': _append_comment_img_list,
                }

            # 购买数量, 随机
            quantify = randint(1, 2)

            # 用户头像, 默认留空
            head_img = ''

            # 评论星级
            star_level = int(item.get('levelStar', 5))

            if not filter_invalid_comment_content(_comment_content):
                continue

            comment = [{
                'comment': _comment_content,
                'comment_date': comment_date,
                'sku_info': sku_info,
                'img_url_list': img_url_list,
                'star_level': star_level,
                'video': '',
            }]

            _ = {
                'buyer_name': buyer_name,  # 买家昵称
                'comment': comment,  # 评论内容
                'quantify': quantify,  # 购买数量
                'head_img': head_img,  # 头像
                'append_comment': append_comment,  # 追评
            }

            _comment_list.append(_)

        return _comment_list

Example #4

0

Show file

File: ali_1688_comment_parse.py Project: fzdr/python

    def _get_comment_data(self, goods_id):
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        # # 原先采用phantomjs, 改用手机端抓html(speed slow, give up)
        # tmp_url = 'https://m.1688.com/page/offerRemark.htm?offerId=' + str(goods_id)
        # body = self.my_phantomjs.use_phantomjs_to_get_url_body(url=tmp_url, exec_code=self._exec_code)
        # # self.my_lg.info(str(body))
        #
        # if body == '':
        #     self.result_data = {}
        #     self.my_lg.error('该地址的body为空值, 出错地址: ' + tmp_url)
        #     return {}
        #
        # _html_comment_list = list(Selector(text=body).css('div.remark-item').extract())
        # if _html_comment_list != []:
        #     _comment_list = []
        #     for index, item in enumerate(_html_comment_list):
        #         if index > 25:  # 就取前25条评论信息
        #             break
        #
        #         buyer_name = str(Selector(text=item).css('span.member::text').extract_first())
        #         quantify = str(Selector(text=item).css('span.amount::text').extract_first())
        #         try:
        #             quantify = int(re.compile(r'\d+').findall(quantify)[0])
        #         except IndexError:
        #             self.my_lg.error('获取quantify时索引异常! 出错地址: ' + tmp_url)
        #             self.result_data = {}
        #             return {}
        #
        #         comment_date = str(Selector(text=item).css('div.date span::text').extract_first())
        #         comment_date = self._get_comment_date(comment_date)     # str '2017-01-25 17:06:00'
        #         tmp_sku_info = str(Selector(text=item).css('div.date::text').extract_first())
        #
        #         _comment_content = self._wash_comment(str(Selector(text=item).css('div.bd::text').extract_first()))
        #         if not filter_invalid_comment_content(_comment_content):
        #             continue
        #
        #         comment = [{
        #             'comment': _comment_content,
        #             'comment_date': comment_date,                                               # 评论创建日期
        #             'sku_info': re.compile(r'<span.*?</span>').sub('', tmp_sku_info),           # 购买的商品规格
        #             'img_url_list': [],
        #             'star_level': randint(3, 5),                                                # 几星好评
        #             'video': '',
        #         }]
        #
        #         _ = {
        #             'buyer_name': buyer_name,           # 买家昵称
        #             'comment': comment,                 # 评论内容
        #             'quantify': quantify,               # 购买数量
        #             'head_img': '',                     # 用户头像
        #             'append_comment': {},               # 追评
        #         }
        #         _comment_list.append(_)
        #
        #     _t = datetime.datetime.now()
        #
        #     _r = CommentItem()
        #     _r['goods_id'] = str(goods_id)
        #     _r['create_time'] = _t
        #     _r['modify_time'] = _t
        #     _r['_comment_list'] = _comment_list
        #     self.result_data = _r
        #     # pprint(self.result_data)
        #     return self.result_data
        # else:
        #     self.my_lg.error('该商品的comment为空list! 出错地址: ' + tmp_url)
        #     self.result_data = {}
        #     return {}

        '''下面是模拟pc端好评接口'''
        member_id = self._get_this_goods_member_id(goods_id=goods_id)
        self.my_lg.info('------>>>| 获取到的member_id: {0}'.format(member_id))
        if member_id == '':
            self.my_lg.error('获取到的member_id为空值!请检查!')
            self.result_data = {}
            return {}

        # 这里从db获取该商品原先的规格值
        sku_info = self._get_sku_info_from_db(goods_id)
        # self.my_lg.info('sku_info: {0}'.format(sku_info))
        if sku_info == []:
            self.result_data = {}
            return {}

        _comment_list = []
        for page_num in range(1, 4):
            self.my_lg.info('------>>>| 正在抓取第{0}页...'.format(page_num))
            params = self._set_params(goods_id=goods_id, member_id=member_id, page_num=page_num)
            url = 'https://rate.1688.com/remark/offerDetail/rates.json'
            tmp_headers = self.headers
            tmp_headers.update({
                'referer': 'https://detail.1688.com/offer/{0}.html'.format(str(goods_id))
            })
            # 原先用MyRequests老是404，改用phantomjsy也还是老是404
            body = MyRequests.get_url_body(url=url, headers=tmp_headers, params=params)
            # self.my_lg.info(str(body))

            # 用phantomjs
            # url = self._set_url(url=url, params=params)
            # self.my_lg.info(url)
            # body = self.my_phantomjs.use_phantomjs_to_get_url_body(url)
            # try:
            #     body = re.compile('<pre.*?>(.*)</pre>').findall(body)[0]
            # except IndexError:
            #     self.my_lg.error('获取body时索引异常!')
            #     self.result_data = {}
            #     return {}

            if body == '':
                self.result_data = {}
                self.my_lg.error('该地址的body为空值, 出错goods_id: {0}'.format(goods_id))
                return {}

            data = self.json_str_2_dict(json_str=body)
            if data.get('url') is not None:
                self.my_lg.info('------>>>| 被重定向到404页面, 休眠{0}s中...'.format(self._page_sleep_time))
                sleep(self._page_sleep_time)
                break

            # self.my_lg.info(str(body))
            data = data.get('data', {}).get('rates', [])
            # pprint(data)
            if data == []:
                # sleep(self._page_sleep_time)
                break

            try:
                for item in data:
                    buyer_name = item.get('member', '')
                    comment = []
                    for i in item.get('rateItem', []):
                        _comment_content = self._wash_comment(i.get('remarkContent', ''))
                        if not filter_invalid_comment_content(_comment_content):
                            continue

                        comment.append({
                            'comment': _comment_content,
                            'comment_date': str(i.get('remarkTime', '')),    # 评论日期
                            'sku_info': choice(sku_info),  # 购买的商品规格(pc端1688商品没有规格)
                            'star_level': i.get('starLevel', 5),
                            'img_url_list': [],
                            'video': '',
                        })
                    quantify = item.get('quantity', 1)                                  # 购买数量
                    if comment == []:   # 为空不录入
                        continue

                    _ = {
                        'buyer_name': buyer_name,           # 买家昵称
                        'comment': comment,                 # 评论内容
                        'quantify': quantify,               # 购买数量
                        'head_img': '',                     # 用户头像
                        'append_comment': {},               # 追评
                    }
                    _comment_list.append(_)

            except Exception:
                self.result_data = {}
                self.my_lg.error('出错商品goods_id: {0}'.format(goods_id), exc_info=True)
                return {}

            sleep(self._page_sleep_time)

        if _comment_list != []:
            # pprint(_comment_list)
            _t = datetime.datetime.now()

            _r = CommentItem()
            _r['goods_id'] = str(goods_id)
            _r['create_time'] = _t
            _r['modify_time'] = _t
            _r['_comment_list'] = _comment_list
            self.result_data = _r

            return self.result_data
        else:
            self.my_lg.error('出错goods_id: {0}'.format(goods_id))
            self.result_data = {}
            return {}

Example #5

0

Show file

File: taobao_comment_parse.py Project: mylove1/python

    def _get_comment_list(self, _tmp_comment_list):
        '''
        转化成需要的结果集
        :param _tmp_comment_list:
        :return:
        '''
        _comment_list = []
        _sku_info_list = []  # 用于存已有的规格
        for item in _tmp_comment_list:
            comment_date = item.get('date', '')
            assert comment_date != '', '得到的comment_date为空str!请检查!'
            comment_date = self._get_comment_date(comment_date)

            sku_info = item.get('auction', {}).get('sku', '')
            # self.my_lg.info(sku_info)
            if sku_info == '' and _sku_info_list == []:  # 规格为空就跳过, 即只抓取有效评论
                continue
            if sku_info != '':  # 不为空存入
                _sku_info_list.append(sku_info)
                _sku_info_list = list(set(_sku_info_list))
            if sku_info == '':  # 为空的，随机设置一个
                sku_info = _sku_info_list[randint(0, len(_sku_info_list) - 1)]
                # print(sku_info)
            sku_info = self._wash_sku_info(sku_info)

            # 评论照片
            img_url_list = item.get('photos', [])
            img_url_list = [{
                'img_url': 'https:' + _i.get('url', '')
            } for _i in img_url_list if _i.get('url', '') != '']

            _comment_content = item.get('content', '')
            assert _comment_content != '', '得到的评论内容为空str!请检查!'
            _comment_content = self._wash_comment(comment=_comment_content)

            buyer_name = item.get('user', {}).get('nick', '')
            assert buyer_name != '', '得到的用户昵称为空值!请检查!'

            quantify = int(item.get('buyAmount', 0)) if item.get('buyAmount', 0) != 0 else 1

            tmp_head_img = item.get('user', {}).get('avatar', '')
            if re.compile(r'/default/avatar-40.png').findall(tmp_head_img) != []:
                head_img = ''

            # 无法识别是否为同一张图 只能先拿到这种规律的然后请求图片看齐地址
            # elif re.compile(r'vGNuOHcWv88YXF').findall(tmp_head_img) != []:
            #     # self.my_lg.info('https:' + tmp_head_img)
            #     if self._judge_is_taobao_head_img(url='https:' + tmp_head_img):
            #         self.my_lg.info('https:' + tmp_head_img)
            #         head_img = ''
            #     else:
            #         head_img = 'https:' + tmp_head_img

            elif tmp_head_img != '//wwc.alicdn.com/avatar/getAvatar.do?userIdStr=vGNuOHcWv88YXF-HPmvbM07HvG8SvFI0Xm7Hvm80MkZhvkk0XmcSPFPhPHQWOmvG&width=40&height=40&type=sns' \
                    or tmp_head_img != '//gw.alicdn.com/tps/i3/TB1yeWeIFXXXXX5XFXXuAZJYXXX-210-210.png_40x40.jpg':
                head_img = 'https:' + tmp_head_img

            else:
                head_img = ''

            if not filter_invalid_comment_content(_comment_content):
                continue

            comment = [{
                'comment': _comment_content,
                'comment_date': comment_date,
                'sku_info': sku_info,
                'img_url_list': img_url_list,
                'star_level': randint(4, 5),
                'video': item.get('video', ''),
            }]

            _ = {
                'buyer_name': buyer_name,   # 买家昵称
                'comment': comment,         # 评论内容
                'quantify': quantify,       # 购买数量
                'head_img': head_img,       # 头像
                'append_comment': {},       # 追评
            }

            _comment_list.append(_)

        return _comment_list

Example #6

0

Show file

    def _get_comment_list(self, all_comment_list,
                          db_top_n_buyer_name_and_comment_date_list,
                          db_sku_info_list):
        '''
        转化成需要的结果集
        :param all_comment_list:
        :return:
        '''
        _comment_list = []
        _sku_info_list = []  # 用于存已有的规格
        for item in all_comment_list:
            comment_date = self._get_comment_date(item=item)
            sku_info = self._get_sku_info(item=item,
                                          db_sku_info_list=db_sku_info_list)

            # 评论照片
            img_url_list = item.get('photos', [])
            img_url_list = [{
                'img_url': 'https:' + _i.get('url', '')
            } for _i in img_url_list if _i.get('url', '') != '']

            _comment_content = item.get('content', '')
            assert _comment_content != '', '得到的评论内容为空str!请检查!'
            _comment_content = wash_goods_comment(
                comment_content=_comment_content)

            buyer_name = item.get('user', {}).get('nick', '')
            assert buyer_name != '', '得到的用户昵称为空值!请检查!'

            quantify = int(item.get('buyAmount',
                                    0)) if item.get('buyAmount', 0) != 0 else 1
            head_img = self._get_head_img_url(item=item)

            ori_video_info = item.get(
                'video', {}) if item.get('video') is not None else {}
            video_url = ori_video_info.get('cloudVideoUrl', '')
            video_url = 'https:' + video_url if video_url != '' else ''

            if not filter_invalid_comment_content(_comment_content):
                continue

            if not filter_crawled_comment_content(
                    new_buyer_name=buyer_name,
                    new_comment_date=comment_date,
                    db_buyer_name_and_comment_date_info=
                    db_top_n_buyer_name_and_comment_date_list,
                    logger=self.lg):
                # 过滤已采集的comment
                continue

            comment = [{
                'comment': _comment_content,
                'comment_date': comment_date,
                'sku_info': sku_info,
                'img_url_list': img_url_list,
                'star_level': randint(4, 5),
                'video': video_url,
            }]
            _ = {
                'buyer_name': buyer_name,  # 买家昵称
                'comment': comment,  # 评论内容
                'quantify': quantify,  # 购买数量
                'head_img': head_img,  # 头像
                'append_comment': {},  # 追评
            }

            _comment_list.append(_)

        return _comment_list

Example #7

0

Show file

File: tmall_comment_parse.py Project: devyru/python

    def _get_comment_list(self, all_comment_list,
                          db_top_n_buyer_name_and_comment_date_list,
                          db_sku_info_list):
        '''
        转换成需求的结果集
        :param all_comment_list:
        :return:
        '''
        _comment_list = []
        for item in all_comment_list:
            # pprint(item)
            _comment_date = self._get_comment_date(item=item)

            sku_info = item.get('auctionSku', '')
            # self.lg.info(sku_info)
            if sku_info == '':
                # 从所有规格里面随机一个
                sku_info = str(choice(db_sku_info_list))

            _comment_content = item.get('rateContent', '')
            assert _comment_content != '', '得到的评论内容为空str!请检查!'
            _comment_content = wash_goods_comment(
                comment_content=_comment_content)

            buyer_name = item.get('displayUserNick', '')
            assert buyer_name != '', '得到的用户昵称为空值!请检查!'

            # 天猫设置默认 购买量为1
            quantify = 1
            # 天猫没有head_img回传，就设置一个默认地址
            head_img = ''

            # 第一次评论图片
            _comment_img_list = item.get(
                'pics', []) if item.get('pics', '') != '' else []
            if _comment_img_list != []:
                _comment_img_list = [{
                    'img_url': 'https:' + img
                } for img in _comment_img_list]
            '''追评'''
            _tmp_append_comment = item.get(
                'appendComment',
                {}) if item.get('appendComment') is not None else {}
            # 追评的图片
            # pprint(_tmp_append_comment)
            _append_comment_img_list = _tmp_append_comment.get(
                'pics',
                []) if _tmp_append_comment.get('pics', '') != '' else []
            if _append_comment_img_list != []:
                _append_comment_img_list = [{
                    'img_url': 'https:' + img
                } for img in _append_comment_img_list]

            if _tmp_append_comment != {}:
                append_comment = {
                    'comment_date':
                    _tmp_append_comment.get('commentTime', ''),
                    'comment':
                    wash_goods_comment(comment_content=_tmp_append_comment.get(
                        'content', '')),
                    'img_url_list':
                    _append_comment_img_list,
                }
            else:
                append_comment = {}

            if not filter_invalid_comment_content(_comment_content):
                continue

            if not filter_crawled_comment_content(
                    new_buyer_name=buyer_name,
                    new_comment_date=_comment_date,
                    db_buyer_name_and_comment_date_info=
                    db_top_n_buyer_name_and_comment_date_list,
                    logger=self.lg):
                # 过滤已采集的comment
                continue

            comment = [{
                'comment': _comment_content,
                'comment_date': _comment_date,
                'sku_info': sku_info,
                'img_url_list': _comment_img_list,
                'star_level': randint(4, 5),
                'video': '',
            }]
            _ = {
                'buyer_name': buyer_name,  # 买家昵称
                'comment': comment,  # 评论内容
                'quantify': quantify,  # 评论数量
                'head_img': head_img,  # 头像
                'append_comment': append_comment,  # 追评
            }
            _comment_list.append(_)

        return _comment_list

Example #8

0

Show file

    def _get_comment_list(self, _tmp_comment_list):
        '''
        转换成需求的结果集
        :param _tmp_comment_list:
        :return:
        '''
        _comment_list = []
        for item in _tmp_comment_list:
            # pprint(item)
            _comment_date = item.get('rateDate', '')
            assert _comment_date != '', '得到的_comment_date为空str!请检查!'

            # 天猫接口拿到的sku_info默认为空
            # sku_info = ''
            # 从所有规格里面随机一个
            if self.random_sku_info_list == []:
                self.random_sku_info_list = ['']
            sku_info = str(choice(self.random_sku_info_list))

            _comment_content = item.get('rateContent', '')
            assert _comment_content != '', '得到的评论内容为空str!请检查!'
            _comment_content = self._wash_comment(comment=_comment_content)

            buyer_name = item.get('displayUserNick', '')
            assert buyer_name != '', '得到的用户昵称为空值!请检查!'

            # 天猫设置默认 购买量为1
            quantify = 1

            # 天猫没有head_img回传，就设置一个默认地址
            head_img = ''

            # 第一次评论图片
            _comment_img_list = item.get(
                'pics', []) if item.get('pics', '') != '' else []
            if _comment_img_list != []:
                _comment_img_list = [{
                    'img_url': 'https:' + img
                } for img in _comment_img_list]
            '''追评'''
            _tmp_append_comment = item.get(
                'appendComment',
                {}) if item.get('appendComment') is not None else {}
            # 追评的图片
            # pprint(_tmp_append_comment)
            _append_comment_img_list = _tmp_append_comment.get(
                'pics',
                []) if _tmp_append_comment.get('pics', '') != '' else []
            if _append_comment_img_list != []:
                _append_comment_img_list = [{
                    'img_url': 'https:' + img
                } for img in _append_comment_img_list]

            if _tmp_append_comment != {}:
                append_comment = {
                    'comment_date':
                    _tmp_append_comment.get('commentTime', ''),
                    'comment':
                    self._wash_comment(_tmp_append_comment.get('content', '')),
                    'img_url_list':
                    _append_comment_img_list,
                }
            else:
                append_comment = {}

            if not filter_invalid_comment_content(_comment_content):
                continue

            comment = [{
                'comment': _comment_content,
                'comment_date': _comment_date,
                'sku_info': sku_info,
                'img_url_list': _comment_img_list,
                'star_level': randint(4, 5),
                'video': '',
            }]

            _ = {
                'buyer_name': buyer_name,  # 买家昵称
                'comment': comment,  # 评论内容
                'quantify': quantify,  # 评论数量
                'head_img': head_img,  # 头像
                'append_comment': append_comment,  # 追评
            }

            _comment_list.append(_)

        return _comment_list

Example #9

0

Show file

File: jd_comment_parse.py Project: yfeng2018/python-1

    def _get_comment_list(self, _tmp_comment_list,
                          db_top_n_buyer_name_and_comment_date_list):
        '''
        转换成需求的结果集
        :param _tmp_comment_list:
        :return:
        '''
        _comment_list = []
        for item in _tmp_comment_list:
            _comment_date = self._get_comment_date(item=item)

            # sku_info(有些商品评论是没有规格的所以默认为空即可，不加assert检查!)
            # eg: '颜域品牌女装2017冬季新品娃娃领加厚格纹绣花毛呢外套中长款大衣04W7135 黑色 M/38'
            ware_attributes = item.get('referenceName', '')
            # self.lg.info(str(ware_attributes))
            sku_info = ' '.join(ware_attributes.split(' ')[1:])
            # assert sku_info != '', '得到的sku_info为空str!请检查!'

            _comment_content = item.get('content', '')
            assert _comment_content != '', '得到的评论内容为空str!请检查!'
            _comment_content = wash_goods_comment(
                comment_content=_comment_content)

            buyer_name = item.get('nickname', '')
            assert buyer_name != '', '得到的用户昵称为空值!请检查!'

            # jd设置默认 购买量为1
            quantify = 1
            head_img = self._get_head_img_url(item=item)

            # 第一次评论图片
            _comment_img_list = item.get('images', [])
            if _comment_img_list != []:
                _comment_img_list = [
                    {
                        'img_url':
                        img.get('imgUrl', '').replace('s128x96_jfs',
                                                      'jfs'),  # 小图换成大图!
                    } for img in _comment_img_list
                ]
            '''追评'''
            append_comment = {}
            # star_level
            star_level = int(item.get('score', '5'))

            if not filter_invalid_comment_content(_comment_content):
                continue

            comment = [{
                'comment': _comment_content,
                'comment_date': _comment_date,
                'sku_info': sku_info,
                'img_url_list': _comment_img_list,
                'star_level': star_level,
                'video': '',
            }]
            if not filter_crawled_comment_content(
                    new_buyer_name=buyer_name,
                    new_comment_date=_comment_date,
                    db_buyer_name_and_comment_date_info=
                    db_top_n_buyer_name_and_comment_date_list,
                    logger=self.lg):
                # 过滤已采集的comment
                continue

            _comment_list.append({
                'buyer_name': buyer_name,  # 买家昵称
                'comment': comment,  # 评论内容
                'quantify': quantify,  # 评论数量
                'head_img': head_img,  # 头像
                'append_comment': append_comment,  # 追评
            })

        return _comment_list