Beispiel #1
0
    def deal_with_data(self, *params):
        '''
        Fetch, enrich and store flash-sale goods that are not in the database yet.

        For every candidate item the goods detail is fetched through
        ChuChuJie_9_9_Parse, the remaining sale time is scraped from the
        mobile detail page with PhantomJS, and the combined record is
        inserted into the flash-sale table.

        :param params: positional arguments; only params[0] is used. It must
            be an iterable of dicts, from which the keys 'goods_id',
            'sub_title', 'gender' and 'page' are read.
        :return: None
        :raises IndexError: if no positional argument is supplied
        '''
        item_list = params[0]
        chuchujie = ChuChuJie_9_9_Parse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            # A set gives O(1) membership tests; the ids are only used for
            # "already stored?" checks.
            db_goods_ids = {
                row[0] for row in
                my_pipeline.select_chuchujie_xianshimiaosha_all_goods_id()
            }

            for item in item_list:
                goods_id = item.get('goods_id', '')
                if goods_id in db_goods_ids:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                    continue

                tmp_url = 'https://m.chuchujie.com/details/detail.html?id=' + str(
                    goods_id)
                chuchujie.get_goods_data(goods_id=goods_id)
                goods_data = chuchujie.deal_with_data()

                if goods_data == {}:
                    # The parser returned nothing usable; skip this item.
                    continue

                if goods_data.get('is_delete', 0) == 1:
                    # is_delete == 1 means the stock is 0 (sold out).
                    print('------>>>| 该商品库存为0,已被抢光!')
                    continue

                # A fresh PhantomJS instance is used per item and released
                # right after the fetch, even when the fetch raises.
                my_phantomjs = MyPhantomjs()
                try:
                    my_phantomjs.init_phantomjs()
                    # Load the mobile page to read the remaining sale time.
                    tmp_body = my_phantomjs.use_phantomjs_to_get_url_body(
                        url=tmp_url, css_selector='p#activityTime span')
                finally:
                    del my_phantomjs
                    gc.collect()

                if tmp_body == '':
                    # Fetching the full mobile page html failed.
                    sleep(.4)
                    continue

                _t = Selector(text=tmp_body).css(
                    'p#activityTime span::text').extract_first()
                if _t is not None:
                    # Drop the "remaining" label, keeping only the time text.
                    _t = _t.replace('剩余', '')
                if not _t:
                    # Without the remaining time no end time can be derived,
                    # so the item is skipped instead of storing a bogus one.
                    print('获取到的_t为空值, 严重错误! 请检查!')
                    continue

                miaosha_end_time = self.get_miaosha_end_time(_t)

                goods_data['goods_url'] = tmp_url
                goods_data['goods_id'] = str(goods_id)
                goods_data['sub_title'] = item.get('sub_title', '')
                goods_data['miaosha_time'] = {
                    # The moment of crawling is recorded as the begin time.
                    'miaosha_begin_time':
                    self.timestamp_to_regulartime(int(time.time())),
                    'miaosha_end_time':
                    self.timestamp_to_regulartime(int(miaosha_end_time)),
                }
                (goods_data['miaosha_begin_time'],
                 goods_data['miaosha_end_time']) = \
                    self.get_miaosha_begin_time_and_miaosha_end_time(
                        miaosha_time=goods_data['miaosha_time'])
                goods_data['gender'] = str(item.get('gender', '0'))
                goods_data['page'] = item.get('page')

                chuchujie.insert_into_chuchujie_xianshimiaosha_table(
                    data=goods_data, pipeline=my_pipeline)
                # No extra sleep here: initialising PhantomJS already takes
                # long enough to throttle the crawl.

        else:
            print('数据库连接失败,此处跳过!')

        # Drop the parser explicitly and collect to keep memory usage down.
        del chuchujie
        gc.collect()
    def run_forever(self):
        '''
        Run one refresh pass over all stored flash-sale goods, then pause.

        Every row returned by the database is handed to
        _refresh_one_miaosha_goods. The database connection is re-created
        every 50 rows. After the pass the method sleeps 5.5 hours when the
        Shanghai hour is 0, otherwise 5 seconds.

        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server.select_chuchujie_xianshimiaosha_all_goods_id())
        except TypeError:
            # list() fails when the query returned a non-iterable value.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            for index, item in enumerate(result, start=1):
                if index % 50 == 0:
                    # Reconnect every 50 rows so a single long-lived
                    # connection does not go unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                self._refresh_one_miaosha_goods(item, index, tmp_sql_server)
                gc.collect()

            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: sleep for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()

    def _refresh_one_miaosha_goods(self, item, index, pipeline):
        '''
        Delete, skip or update a single stored flash-sale row.

        :param item: database row; item[0] is the goods_id, item[1] is JSON
            text holding 'miaosha_end_time', item[2] and item[3] are passed
            to get_one_page_goods_info (presumably gender and page - confirm
            against the query)
        :param index: 1-based position of the row, used only for logging
        :param pipeline: database pipeline used for deletes and updates
        :return: None
        '''
        end_time_text = json.loads(item[1]).get('miaosha_end_time')
        end_timestamp = int(
            time.mktime(time.strptime(end_time_text, '%Y-%m-%d %H:%M:%S')))

        if not pipeline.is_connect_success:
            print('数据库连接失败,数据库可能关闭或者维护中')
            return

        # Evaluate once so both branch checks see the same answer.
        status = self.is_recent_time(end_timestamp)

        if status == 0:
            # The sale has ended: remove the row.
            pipeline.delete_chuchujie_miaosha_expired_goods_id(
                goods_id=item[0])
            print(
                '过期的goods_id为(%s)' % item[0],
                ', 限时秒杀结束时间为(%s), 删除成功!' % end_time_text)
            return

        if status == 2:
            # Skip rather than stop: rows are not returned in time order.
            return

        # Any other status means the row is inside the update window.
        print(
            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
            % (item[0], index))

        body = self.get_one_page_goods_info(item[2], item[3])
        if body == '{}':
            # Probably a network problem; skip for now.
            return

        try:
            json_body = json.loads(body)
        except (TypeError, ValueError):
            # An unparsable response says nothing about the goods, so the
            # row is skipped instead of being treated as delisted.
            print('json.loads转换body时出错!请检查')
            return

        try:
            first_group = json_body.get('data', {}).get('groupList', [])[0]
        except IndexError:
            print('获取this_page_total_count时出错, 请检查!')
            first_group = {}

        page_is_empty = (first_group.get('totalCount', 0) == 0
                         or not first_group.get('dataList', []))
        if page_is_empty:
            print('#### 该gender, page对应得到的item_list为空[]!')
            print('该商品已被下架限时秒杀活动,此处将其删除')
            pipeline.delete_chuchujie_miaosha_expired_goods_id(
                goods_id=item[0])
            print('下架的goods_id为(%s)' % item[0], ', 删除成功!')
            return

        # Goods are not delisted early, so everything still on sale is
        # refreshed. The sale time and sub_title are left untouched; only
        # the other goods data is updated.
        chuchujie_miaosha = ChuChuJie_9_9_Parse()
        chuchujie_miaosha.get_goods_data(goods_id=item[0])
        goods_data = chuchujie_miaosha.deal_with_data()

        if goods_data == {}:
            # The parser returned nothing usable; skip this row.
            return

        goods_data['goods_id'] = str(item[0])
        chuchujie_miaosha.update_chuchujie_xianshimiaosha_table(
            data=goods_data, pipeline=pipeline)
        sleep(CHUCHUJIE_SLEEP_TIME)