Exemple #1
0
class JdCommentParse(object):
    '''
    Collect buyer comments of a JD product from the mobile comment endpoint
    (item.m.jd.com) and convert them into the comment structure stored in
    ``self.result_data``.
    '''

    # Compiled once; the patterns themselves are unchanged.
    _JD_WORD_PATTERN = re.compile(r'jd|\n|Jd|JD')
    _JD_NAME_PATTERN = re.compile('京东')

    def __init__(self, logger=None):
        '''
        :param logger: logger to use; when None a file/console logger is created
        '''
        self.result_data = {}
        self.msg = ''
        self._set_logger(logger)
        self._set_headers()
        self.comment_page_switch_sleep_time = 1.2  # pause before requesting the next comment page
        self.my_phantomjs = MyPhantomjs()
        self._add_headers_cookies()

    def _get_comment_data(self, goods_id):
        '''
        Download the comment pages of one product and build its CommentItem.

        :param goods_id: product id; an empty string short-circuits to {}
        :return: the populated CommentItem, or {} when goods_id is empty or the
                 downloaded comments could not be converted
        '''
        if goods_id == '':
            self.result_data = {}
            return {}
        self.my_lg.info('------>>>| 待处理的goods_id为: %s' % str(goods_id))

        self.goods_id = goods_id
        self.headers.update({
            'referer':
            'https://item.m.jd.com/ware/view.action?wareId=' + str(goods_id),
        })

        # Comments are taken from the JD mobile site's comment endpoint.
        _url = 'https://item.m.jd.com/newComments/newCommentsDetail.json'
        _tmp_comment_list = []
        for current_page in range(1, 3):
            params = self._set_params(goods_id=goods_id,
                                      current_page=current_page)
            body = MyRequests.get_url_body(url=_url,
                                           headers=self.headers,
                                           params=params)

            # A failed request or a JSON null at either level must not crash
            # the loop: treat it as a page without comments.
            ware_detail = self._json_2_dict(body).get('wareDetailComment')
            if not isinstance(ware_detail, dict):
                ware_detail = {}
            _tmp_comment_list += ware_detail.get('commentInfoList') or []

            sleep(self.comment_page_switch_sleep_time)

        try:
            _comment_list = self._get_comment_list(
                _tmp_comment_list=_tmp_comment_list)
        except Exception as e:
            # Any malformed comment invalidates the whole result.
            self.my_lg.error('出错goods_id:{0}'.format(goods_id))
            self.my_lg.exception(e)
            self.result_data = {}
            return {}

        _t = datetime.datetime.now()
        _r = CommentItem()
        _r['goods_id'] = str(goods_id)
        _r['create_time'] = _t
        _r['modify_time'] = _t
        _r['_comment_list'] = _comment_list
        self.result_data = _r

        return self.result_data

    @staticmethod
    def _require_non_empty(value, message):
        '''
        Return value unchanged, raising when it is the empty string.

        An explicit raise is used instead of an ``assert`` statement so the
        check still runs under ``python -O``; AssertionError is kept as the
        exception type so existing handlers behave the same.

        :param value: value to check
        :param message: error text used when value == ''
        :return: value
        '''
        if value == '':
            raise AssertionError(message)
        return value

    def _get_comment_list(self, _tmp_comment_list):
        '''
        Convert raw comment dicts into the required result structure.

        :param _tmp_comment_list: list of raw comment dicts from the endpoint
        :return: list of dicts with buyer_name / comment / quantify / head_img /
                 append_comment keys
        :raises AssertionError: when comment date, content, nickname or avatar
                 of an item is the empty string
        '''
        _comment_list = []
        for item in _tmp_comment_list:
            _comment_date = self._require_non_empty(
                item.get('commentDate', ''),
                '得到的_comment_date为空str!请检查!')

            # Some comments carry no sku attributes, so an empty sku_info is
            # accepted without a check.
            ware_attributes = item.get('wareAttributes') or []
            sku_info = ' '.join([
                i.get('key', '') + ':' + i.get('value', '')
                for i in ware_attributes
            ])

            _comment_content = self._require_non_empty(
                item.get('commentData', ''),
                '得到的评论内容为空str!请检查!')
            _comment_content = self._wash_comment(comment=_comment_content)

            buyer_name = self._require_non_empty(
                item.get('userNickName', ''),
                '得到的用户昵称为空值!请检查!')

            # JD comments get a default purchase quantity of 1.
            quantify = 1

            head_img = self._require_non_empty(
                item.get('userImgURL', ''),
                '得到的用户头像为空值!请检查!')
            head_img = 'https://' + head_img

            # Pictures attached to the first comment (may be absent or null).
            _comment_img_list = [{
                'img_url': img.get('largePicURL', '')
            } for img in (item.get('pictureInfoList') or [])]

            # Follow-up comment: always left empty here.
            append_comment = {}

            star_level = int(item.get('commentScore', '5'))

            comment = [{
                'comment': _comment_content,
                'comment_date': _comment_date,
                'sku_info': sku_info,
                'img_url_list': _comment_img_list,
                'star_level': star_level,
                'video': '',
            }]

            _comment_list.append({
                'buyer_name': buyer_name,  # buyer nickname
                'comment': comment,  # comment content
                'quantify': quantify,  # purchase quantity
                'head_img': head_img,  # avatar url
                'append_comment': append_comment,  # follow-up comment
            })

        return _comment_list

    def _add_headers_cookies(self):
        '''
        Fetch cookies through the phantomjs session and add them to the headers.

        The original author notes the endpoint must be called with cookies
        (specifically the ``sid`` value inside them).

        :return: None
        '''
        _cookies = self.my_phantomjs.get_url_cookies_from_phantomjs_session(
            url='https://item.m.jd.com/')
        self.headers.update({
            'cookie': _cookies,
        })

        return None

    def _set_logger(self, logger):
        '''
        Use the given logger, or create a dated log file logger when None.

        :param logger: logger instance or None
        '''
        if logger is None:
            self.my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH +
                                    '/京东/comment/' +
                                    str(get_shanghai_time())[0:10] + '.txt',
                                    console_log_level=INFO,
                                    file_log_level=ERROR)
        else:
            self.my_lg = logger

    def _set_headers(self):
        '''
        Initialise the request headers with a randomly chosen user agent.
        '''
        self.headers = {
            'origin': 'https://item.m.jd.com',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': HEADERS[randint(0,
                                          len(HEADERS) - 1)],
            'content-type': 'application/x-www-form-urlencoded',
            'accept': 'application/json',
            'referer': 'https://item.m.jd.com/ware/view.action?wareId=5025518',
            'x-requested-with': 'XMLHttpRequest',
        }

    def _wash_comment(self, comment):
        '''
        Clean a comment: drop 'jd'/'Jd'/'JD' and newlines, replace '京东'.

        :param comment: raw comment text
        :return: cleaned comment text
        '''
        comment = self._JD_WORD_PATTERN.sub('', comment)
        comment = self._JD_NAME_PATTERN.sub('优秀网', comment)

        return comment

    def _json_2_dict(self, json_str):
        '''
        Parse a JSON string into a dict.

        :param json_str: JSON text (anything unparsable yields {})
        :return: the parsed dict, or {} when parsing fails or the document is
                 not a JSON object
        '''
        try:
            parsed = json.loads(json_str)
        except (TypeError, ValueError):
            # ValueError covers json.JSONDecodeError; TypeError covers a
            # non-string body such as None. goods_id may be a non-str or not
            # yet set, so it is read defensively and converted for the message.
            self.my_lg.error('json.loads转换json_str时出错! 出错goods_id: ' +
                             str(getattr(self, 'goods_id', '')))
            return {}

        if not isinstance(parsed, dict):
            # Callers use .get() on the result; a list/null/scalar document is
            # treated like an empty one.
            return {}

        return parsed

    def _set_params(self, goods_id, current_page):
        '''
        Build the query parameters of the comment endpoint.

        :param goods_id: product id, sent as wareId
        :param current_page: page number, sent as offset
        :return: list of (name, value) tuples
        '''
        _params = [
            ('wareId', goods_id),
            ('offset', str(current_page)),
            ('num', '10'),
            ('checkParam', 'LUIPPTP'),
            ('category', '670_671_1105'),
            ('isUseMobile', 'true'),
            ('evokeType', ''),
            ('type', '3'),  # '0' all comments | '3' positive comments
            ('isCurrentSku', 'false'),
        ]

        return _params

    def __del__(self):
        # Release each attribute independently so one missing attribute (for
        # example after a failed __init__) does not keep the others alive.
        for attr_name in ('my_lg', 'my_phantomjs', 'headers'):
            self.__dict__.pop(attr_name, None)
        gc.collect()
    def run_forever(self):
        '''
        Run one refresh pass over the stored flash-sale goods, then sleep.

        Each row returned by ``select_jumeiyoupin_xianshimiaosha_all_goods_id()``
        is used as: ``item[0]`` goods id, ``item[1]`` JSON text holding
        ``miaosha_end_time``, ``item[2]`` argument of
        ``self.get_one_page_all_goods_list`` and ``item[3]`` argument of
        ``get_goods_id_from_url``. ``self.is_recent_time(end_timestamp)`` decides
        what happens to the row:

        * ``0``: the row is deleted as expired;
        * ``2``: the row is left untouched;
        * any other value: the goods data is fetched again and the row updated,
          or deleted when its listing page comes back empty.

        Afterwards the method sleeps 5.5 hours when the Shanghai hour is 0,
        otherwise 5 seconds.

        :return: False when no session cookies could be obtained, otherwise None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.
                          select_jumeiyoupin_xianshimiaosha_all_goods_id())
        except TypeError:
            # list() raises TypeError on a non-iterable result (presumably None
            # when the database connection failed - confirm against pipeline).
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            jumeiyoupin_spike = JuMeiYouPinSpike()
            # The phantomjs session is only needed to obtain the cookies.
            my_phantomjs = MyPhantomjs()
            cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(
                url='https://h5.jumei.com/')
            del my_phantomjs
            if cookies == '':
                print('!!! 获取cookies失败 !!!')
                return False

            print('获取cookies成功!')
            self.headers.update(Cookie=cookies)

            # enumerate keeps the index advancing for every row, including rows
            # skipped after a network error, so the reconnect check below
            # cannot fire repeatedly on a stalled index.
            for index, item in enumerate(result, start=1):
                end_time_str = json.loads(item[1]).get('miaosha_end_time')
                # Whole-second unix timestamp of the flash-sale end.
                miaosha_end_time = int(
                    time.mktime(
                        time.strptime(end_time_str, '%Y-%m-%d %H:%M:%S')))

                # Created per row on purpose: the original author found that a
                # long-lived parser instance holds on to a lot of memory.
                jumeiyoupin_miaosha = JuMeiYouPinParse()
                if index % 50 == 0:
                    # Reconnect every 50 rows to avoid an unresponsive
                    # long-lived connection.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    # Evaluated once so both branch tests see the same answer.
                    time_state = self.is_recent_time(miaosha_end_time)
                    if time_state == 0:
                        tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(
                            goods_id=item[0])
                        print(
                            '过期的goods_id为(%s)' % item[0],
                            ', 限时秒杀结束时间为(%s), 删除成功!' % end_time_str)

                    elif time_state == 2:
                        # Deliberately not a break: the rows are not returned
                        # in order, so later rows may still need an update.
                        pass

                    else:  # inside the window that has to be refreshed
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (item[0], index))

                        this_page_all_goods_list = self.get_one_page_all_goods_list(
                            item[2])

                        if this_page_all_goods_list == '网络错误!':
                            print('网络错误!先跳过')

                        elif this_page_all_goods_list == []:
                            print(
                                '#### 该page对应得到的this_page_all_goods_list为空[]!')
                            print('** 该商品已被下架限时秒杀活动, 此处将其删除')
                            tmp_sql_server.delete_jumeiyoupin_miaosha_expired_goods_id(
                                goods_id=item[0])
                            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')

                        else:
                            # Goods are not taken off the sale early, so every
                            # row still inside its sale window is refreshed.
                            tmp_r = jumeiyoupin_miaosha.get_goods_id_from_url(
                                item[3])
                            jumeiyoupin_miaosha.get_goods_data(goods_id=tmp_r)
                            goods_data = jumeiyoupin_miaosha.deal_with_data()

                            if goods_data != {}:  # empty data means skip
                                goods_data['goods_id'] = str(item[0])
                                goods_data['miaosha_time'] = {
                                    'miaosha_begin_time':
                                    goods_data['schedule'].get(
                                        'begin_time', ''),
                                    'miaosha_end_time':
                                    goods_data['schedule'].get('end_time', ''),
                                }
                                (goods_data['miaosha_begin_time'],
                                 goods_data['miaosha_end_time']) = \
                                    jumeiyoupin_spike.get_miaosha_begin_time_and_miaosha_end_time(
                                        miaosha_time=goods_data['miaosha_time'])

                                jumeiyoupin_miaosha.update_jumeiyoupin_xianshimiaosha_table(
                                    data=goods_data, pipeline=tmp_sql_server)
                                sleep(JUMEIYOUPIN_SLEEP_TIME)

                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates during hour 0
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Exemple #3
0
class JuMeiYouPinSpike(object):
    '''
    Collect the flash-sale ("miaosha") goods listed on h5.jumei.com and store
    the ones that are not yet in the database.
    '''

    # Listing endpoint; {0} page number, {1} list type, {2} page_key.
    _LIST_URL_TEMPLATE = ('https://h5.jumei.com/index/ajaxDealactList'
                          '?card_id=4057&page={0}&platform=wap'
                          '&type={1}&page_key={2}')

    def __init__(self):
        self._set_headers()

    def _set_headers(self):
        '''
        Initialise the request headers with a randomly chosen user agent.
        '''
        self.headers = {
            'Accept': 'application/json,text/javascript,text/plain,*/*;q=0.01',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'h5.jumei.com',
            'Referer': 'https://h5.jumei.com/',
            'Cache-Control': 'max-age=0',
            'X-Requested-With': 'XMLHttpRequest',
            # Index bound follows the real list length instead of a fixed 34,
            # so a shorter HEADERS list cannot raise IndexError.
            'User-Agent': HEADERS[randint(0, len(HEADERS) - 1)],
        }

    def _collect_goods_items(self, list_type, page_key, all_goods_list,
                             seen_item_ids):
        '''
        Page through one listing and append unseen items to all_goods_list.

        Stops at the first page without items (or after page 49). Every kept
        item gets a 'page' key holding the page it was found on.

        :param list_type: value of the 'type' query parameter
        :param page_key: value of the 'page_key' query parameter
        :param all_goods_list: list extended in place with new raw items
        :param seen_item_ids: set of item ids already kept, updated in place
        :return: None
        '''
        for page in range(1, 50):  # pages start at 1
            tmp_url = self._LIST_URL_TEMPLATE.format(str(page), list_type,
                                                     page_key)
            print('正在抓取的page为:', page, ', 接口地址为: ', tmp_url)
            body = MyRequests.get_url_body(url=tmp_url, headers=self.headers)

            try:
                json_body = json.loads(body)
            except (TypeError, ValueError):
                # ValueError covers json.JSONDecodeError, TypeError a
                # non-string body.
                print('json.loads转换body时出错!请检查')
                json_body = {}

            # A non-object document or a null item_list counts as "no items".
            this_page_item_list = (json_body.get('item_list')
                                   if isinstance(json_body, dict) else None)
            if not this_page_item_list:
                print('@@@@@@ 所有接口数据抓取完毕 !')
                break

            for item in this_page_item_list:
                item_id = item.get('item_id', '')
                if item_id not in seen_item_ids:
                    seen_item_ids.add(item_id)
                    item['page'] = page
                    all_goods_list.append(item)

            sleep(.5)

    def get_spike_hour_goods_info(self):
        '''
        Build the listing urls, collect all current flash-sale goods and hand
        them to deal_with_data.

        :return: False when no session cookies could be obtained, else True
        '''
        # The phantomjs session is only needed to obtain the cookies.
        my_phantomjs = MyPhantomjs()
        cookies = my_phantomjs.get_url_cookies_from_phantomjs_session(url='https://h5.jumei.com/')
        del my_phantomjs
        if cookies == '':
            print('!!! 获取cookies失败 !!!')
            return False

        print('获取cookies成功!')
        self.headers.update(Cookie=cookies)

        all_goods_list = []
        seen_item_ids = set()  # shared by both listings: O(1) duplicate test

        print('开始抓取在售商品...')
        self._collect_goods_items('formal', '1521336720', all_goods_list,
                                  seen_item_ids)

        print('开始抓取预售商品...')
        self._collect_goods_items('pre', '1521858480', all_goods_list,
                                  seen_item_ids)

        all_goods_list = [{
            'goods_id': str(item.get('item_id', '')),
            'type': item.get('type', ''),
            'page': item.get('page')
        } for item in all_goods_list if item.get('item_id') is not None]
        print(all_goods_list)
        print('本次抓取到共有限时商品个数为: ', len(all_goods_list))

        self.deal_with_data(all_goods_list)

        return True

    def deal_with_data(self, *params):
        '''
        Parse and store the flash-sale goods that are not in the database yet.

        :param params: params[0] is the list of dicts with 'goods_id', 'type'
                       and 'page' keys
        :return: None
        '''
        item_list = params[0]
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            sql_str = r'select goods_id, miaosha_time, page, goods_url from dbo.jumeiyoupin_xianshimiaosha where site_id=26'
            # Only the goods_id column is needed; a set gives O(1) lookups.
            db_goods_ids = {row[0] for row in my_pipeline._select_table(sql_str=sql_str)}

            for item in item_list:
                goods_id = item.get('goods_id', '')
                if goods_id in db_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                jumei = JuMeiYouPinParse()
                goods_type = item.get('type', '')
                tmp_url = 'https://h5.jumei.com/product/detail?item_id={0}&type={1}'.format(goods_id, goods_type)
                jumei.get_goods_data(goods_id=[goods_id, goods_type])
                goods_data = jumei.deal_with_data()

                if goods_data == {}:
                    pass

                elif goods_data.get('is_delete', 0) == 1:
                    print('------>>>| 该商品库存为0,已被抢光!')

                else:  # parse and insert
                    goods_data['goods_url'] = tmp_url
                    goods_data['goods_id'] = str(goods_id)
                    goods_data['miaosha_time'] = {
                        'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                        'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                    }
                    goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                    goods_data['page'] = item.get('page')

                    jumei.insert_into_jumeiyoupin_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                    sleep(JUMEIYOUPIN_SLEEP_TIME)  # slow down between inserts

                # Drop the parser before the next iteration creates a new one.
                del jumei

        else:
            print('数据库连接失败,此处跳过!')

        gc.collect()

    def get_miaosha_begin_time_and_miaosha_end_time(self, miaosha_time):
        '''
        Return the flash-sale begin and end time as datetime objects.

        :param miaosha_time: dict with 'miaosha_begin_time' and
                             'miaosha_end_time' as '%Y-%m-%d %H:%M:%S' strings
        :return: tuple  miaosha_begin_time, miaosha_end_time
        '''
        time_format = '%Y-%m-%d %H:%M:%S'
        miaosha_begin_time = datetime.datetime.strptime(
            miaosha_time.get('miaosha_begin_time'), time_format)
        miaosha_end_time = datetime.datetime.strptime(
            miaosha_time.get('miaosha_end_time'), time_format)

        return miaosha_begin_time, miaosha_end_time

    def __del__(self):
        gc.collect()