Ejemplo n.º 1
0
    def get_spike_hour_goods_info(self):
        '''
        Crawl the flash-sale (timebuy) goods lists and store goods that are new.

        Every tab_id of the fixed list is paged through (page 0..49). Paging of
        the current tab stops as soon as the response body is empty, the body
        cannot be parsed, or the page reports an empty ``goodslist``. For every
        non-empty page the goods ids returned by ``jp_select_str_5`` are read
        once, and each listed goods item that is not among them is fetched via
        ``JuanPiParse`` and handed to
        ``insert_into_juanpi_xianshimiaosha_table``.
        :return: None
        '''
        tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33]  # notice

        for tab_id in tab_id_list:
            for index in range(0, 50):
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id), str(index))
                print('待抓取的限时秒杀地址为: ', tmp_url)

                data = MyRequests.get_url_body(url=tmp_url,
                                               headers=self.headers)
                if data == '':
                    break

                try:
                    data = json.loads(data)
                    data = data.get('data', {})
                except Exception:
                    # Unparseable body: give up on this tab. Only Exception is
                    # caught so Ctrl-C / SystemExit still propagate.
                    break

                if data.get('goodslist') == []:
                    print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                        tab_id, index))
                    break

                data = data.get('goodslist', [])
                if data == []:
                    # Only reachable when the 'goodslist' key is missing.
                    print('goodslist为[], 此处跳过')
                    continue

                miaosha_goods_list = self.get_miaoshao_goods_info_list(
                    data=data)
                print(miaosha_goods_list)

                juanpi = JuanPiParse()
                my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                if my_pipeline.is_connect_success:
                    # Query the stored goods ids once per page (the result was
                    # previously fetched twice: once for the None check and
                    # once for the iteration).
                    db_rows = my_pipeline._select_table(
                        sql_str=jp_select_str_5)
                    if db_rows is None:
                        db_goods_ids = set()
                    else:
                        db_goods_ids = {row[0] for row in db_rows}

                    for item in miaosha_goods_list:
                        if item.get('goods_id', '') in db_goods_ids:
                            print('该goods_id已经存在于数据库中, 此处跳过')
                            continue

                        goods_id = item.get('goods_id')
                        spider_url = 'http://shop.juanpi.com/deal/' + goods_id
                        juanpi.get_goods_data(goods_id=goods_id)
                        goods_data = juanpi.deal_with_data()
                        if goods_data == {}:
                            # Nothing was parsed for this goods item; skip it.
                            continue

                        # Enrich the parsed data with the list-page fields,
                        # then insert it.
                        goods_data['stock_info'] = item.get('stock_info')
                        goods_data['goods_id'] = goods_id
                        goods_data['spider_url'] = spider_url
                        goods_data['username'] = '******'
                        # Original special price before the flash sale.
                        goods_data['price'] = item.get('price')
                        # Flash-sale price.
                        goods_data['taobao_price'] = item.get('taobao_price')
                        goods_data['sub_title'] = item.get('sub_title', '')
                        goods_data['miaosha_time'] = item.get('miaosha_time')
                        (goods_data['miaosha_begin_time'],
                         goods_data['miaosha_end_time']) = get_miaosha_begin_time_and_miaosha_end_time(
                             miaosha_time=item.get('miaosha_time'))
                        goods_data['tab_id'] = tab_id
                        goods_data['page'] = index

                        juanpi.insert_into_juanpi_xianshimiaosha_table(
                            data=goods_data,
                            pipeline=my_pipeline)
                        sleep(.4)  # short pause between inserts
                    sleep(.65)

                # Drop the parser before the next page and collect garbage.
                del juanpi
                gc.collect()
Ejemplo n.º 2
0
def run_forever():
    """
    Endlessly refresh the group-buy rows stored in dbo.juanpi_pintuan (site_id=18).

    Each pass selects (goods_id, schedule, is_delete) for every row. A row is
    deleted when is_delete == 1 or when the end_time of its first schedule
    entry lies in the past; otherwise the goods item is re-fetched through
    JuanPiParse and updated. A row whose schedule list is empty is reported
    and skipped instead of aborting the loop with IndexError.

    After a pass the function sleeps 5.5 hours when the Shanghai-time hour is
    0, otherwise 5 seconds. It never returns.
    """
    while True:
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, schedule, is_delete from dbo.juanpi_pintuan where site_id=18'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            # list(None): the select returned nothing usable.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # The parser is recreated periodically to keep memory usage down.
            juanpi_pintuan = JuanPiParse()
            for item in result:
                if index % 6 == 0:
                    del juanpi_pintuan
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                if index % 50 == 0:
                    # Reconnect every 50 rows to avoid one long-lived
                    # connection becoming unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    try:
                        pintuan_end_time = json.loads(item[1])[0].get('end_time')
                    except IndexError:
                        # Empty schedule list: nothing to compare against.
                        print('获取pintuan_end_time时索引异常!出错goods_id:{0}'.format(item[0]))
                    else:
                        # Whole-second local timestamp of the end time.
                        pintuan_end_time = int(time.mktime(
                            time.strptime(pintuan_end_time, '%Y-%m-%d %H:%M:%S')))

                        if item[2] == 1 or pintuan_end_time < int(time.time()):
                            sql_str = 'delete from dbo.juanpi_pintuan where goods_id=%s'
                            tmp_sql_server._delete_table(sql_str=sql_str, params=(item[0],))
                            print('该goods_id[{0}]已过期或者售完,删除成功!'.format(item[0]))
                        else:
                            print('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)' % (item[0], index))
                            juanpi_pintuan.get_goods_data(goods_id=item[0])
                            data = juanpi_pintuan.deal_with_data()

                            if data != {}:
                                data['goods_id'] = item[0]
                                juanpi_pintuan.to_right_and_update_pintuan_data(data=data, pipeline=tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                index += 1
                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # Hour 0 (Shanghai time): pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Ejemplo n.º 3
0
def run_forever():
    """
    Endlessly refresh the goods rows of dbo.GoodsInfoAutoGet with SiteID=12.

    Each pass selects (GoodsID, IsDelete, MyShelfAndDownTime, Price,
    TaoBaoPrice) for every row, re-fetches the goods item through JuanPiParse
    and, when data came back, adds the shelf/down-time and price-change fields
    before handing it to ``to_right_and_update_data``.

    After a pass the function sleeps 5.5 hours when the Shanghai-time hour is
    0, otherwise 5 seconds. It never returns.
    """
    query = r'select GoodsID, IsDelete, MyShelfAndDownTime, Price, TaoBaoPrice from dbo.GoodsInfoAutoGet where SiteID=12'

    while True:
        pipeline = SqlServerMyPageInfoSaveItemPipeline()
        try:
            rows = list(pipeline._select_table(sql_str=query))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            rows = None

        if rows is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(rows)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            # The parser is replaced periodically to keep memory usage down.
            parser = JuanPiParse()
            for position, row in enumerate(rows, start=1):
                if position % 5 == 0:
                    parser = JuanPiParse()

                if position % 50 == 0:
                    # Reconnect every 50 rows to avoid one long-lived
                    # connection becoming unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if not pipeline.is_connect_success:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                else:
                    goods_id = row[0]
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id, position))
                    parser.get_goods_data(goods_id=goods_id)
                    data = parser.deal_with_data()
                    if data != {}:
                        data['goods_id'] = goods_id

                        shelf_info = get_my_shelf_and_down_time_and_delete_time(
                            tmp_data=data,
                            is_delete=row[1],
                            MyShelfAndDownTime=row[2])
                        data['my_shelf_and_down_time'], data['delete_time'] = shelf_info

                        price_info = _get_price_change_info(
                            old_price=row[3],
                            old_taobao_price=row[4],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])
                        data['_is_price_change'], data['_price_change_info'] = price_info

                        parser.to_right_and_update_data(
                            data, pipeline=pipeline)

                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))

        # Hour 0 (Shanghai time): pause for 5.5 hours, otherwise 5 seconds.
        sleep(60 * 60 * 5.5 if get_shanghai_time().hour == 0 else 5)
        gc.collect()
Ejemplo n.º 4
0
class JPUpdater(AsyncCrawler):
    """Real-time updater for regular JuanPi goods."""

    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(self,
                              *params,
                              **kwargs,
                              log_print=True,
                              log_save_path=MY_SPIDER_LOGS_PATH + '/卷皮/实时更新/')
        self.sql_cli = None
        self.goods_index = 1
        # Number of goods updated concurrently per batch.
        self.concurrency = 10

    async def _get_db_old_data(self):
        """
        Open a new database connection and load the rows to be refreshed.

        :return: list of rows selected by ``jp_select_str_3``, or None when
            the select result could not be turned into a list
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            result = list(self.sql_cli._select_table(sql_str=jp_select_str_3))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_new_jp_obj(self, index) -> None:
        """
        Replace ``self.juanpi`` with a fresh parser on every 10th index.

        :param index: running index of the goods item being processed
        """
        if index % 10 == 0:
            try:
                del self.juanpi
            except AttributeError:
                # No parser has been attached yet.
                pass
            collect()
            self.juanpi = JuanPiParse(is_real_times_update_call=True)

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        '''
        Update the stored information of a single goods item.

        :param db_goods_info_obj: object wrapping the database row; its
            ``goods_id`` attribute is read here
        :param index: running index of this goods item
        :return: [goods_id, result of ``to_right_and_update_data`` or False
            when no update was attempted]
        '''
        res = False
        await self._get_new_jp_obj(index=index)
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli,
                                              index=index,
                                              logger=self.lg)
        if self.sql_cli.is_connect_success:
            self.lg.info(
                '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                format(db_goods_info_obj.goods_id, index))
            self.juanpi.get_goods_data(goods_id=db_goods_info_obj.goods_id)
            data = self.juanpi.deal_with_data()
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='jp',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                res = self.juanpi.to_right_and_update_data(
                    data, pipeline=self.sql_cli)
            # An empty dict means nothing was parsed; res stays False.
        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        await async_sleep(1.2)

        return [db_goods_info_obj.goods_id, res]

    async def _update_db(self):
        """
        Endlessly refresh all goods selected by ``_get_db_old_data``.

        Rows are processed in slices of ``self.concurrency``; one task per row
        is created on ``self.loop`` and each slice is awaited before the next
        one starts. After a pass the coroutine sleeps 5.5 hours when the
        Shanghai-time hour is 0, otherwise 10 seconds. It never returns.
        """
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result, step=self.concurrency)
                self.juanpi = JuanPiParse(is_real_times_update_call=True)
                index = 1
                while True:
                    try:
                        slice_params_list = next(tasks_params_list)
                    except AssertionError:
                        # Treated as "all slices consumed": leave the loop.
                        break

                    tasks = []
                    for item in slice_params_list:
                        db_goods_info_obj = JPDbGoodsInfoObj(item=item,
                                                             logger=self.lg)
                        self.lg.info('创建 task goods_id: {}'.format(
                            db_goods_info_obj.goods_id))
                        tasks.append(
                            self.loop.create_task(
                                self._update_one_goods_info(
                                    db_goods_info_obj=db_goods_info_obj,
                                    index=index,
                                )))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            if get_shanghai_time().hour == 0:
                # Hour 0 (Shanghai time): pause for 5.5 hours.
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(10.)

            try:
                del self.juanpi
            except AttributeError:
                # No parser was created in this pass (result was None).
                pass
            collect()

    def __del__(self):
        # Best-effort cleanup: either attribute may never have been set.
        try:
            del self.lg
        except AttributeError:
            pass
        try:
            del self.loop
        except AttributeError:
            pass
        collect()
def run_forever():
    """
    Endlessly refresh the group-buy goods rows selected by ``jp_select_str_2``.

    Each pass first runs ``jp_delete_str_1`` and then loads the rows. For every
    row (goods_id, schedule JSON, is_delete):

    * an empty schedule list, ``is_delete == 1`` or an end time in the past
      leads to ``_handle_goods_shelves_in_auto_goods_table`` being called with
      ``jp_update_str_7``;
    * otherwise the goods item is re-fetched through JuanPiParse and updated.

    The running index now advances for every row, including rows skipped via
    ``continue``, so the periodic parser reset and reconnect are not repeated
    for consecutive skipped rows.

    After a pass the function sleeps 5.5 hours when the Shanghai-time hour is
    0, otherwise 5 minutes. It never returns.
    """
    while True:
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            sql_cli._delete_table(sql_str=jp_delete_str_1)
            result = list(sql_cli._select_table(sql_str=jp_select_str_2))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result)
            # The parser is recreated periodically to keep memory usage down.
            juanpi_pintuan = JuanPiParse()
            for index, item in enumerate(result, start=1):
                goods_id = item[0]
                if index % 6 == 0:
                    del juanpi_pintuan
                    gc.collect()
                    juanpi_pintuan = JuanPiParse()

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 remainder=50)
                if sql_cli.is_connect_success:
                    try:
                        pintuan_end_time = json.loads(
                            item[1])[0].get('end_time')
                    except IndexError:
                        # Empty schedule list.
                        print('获取pintuan_end_time时索引异常!出错goods_id:{0}'.format(
                            goods_id))
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        continue

                    # Whole-second local timestamp of the end time.
                    pintuan_end_time = int(
                        time.mktime(
                            time.strptime(pintuan_end_time,
                                          '%Y-%m-%d %H:%M:%S')))

                    if item[2] == 1 or pintuan_end_time < int(
                            datetime_to_timestamp(get_shanghai_time())):
                        _handle_goods_shelves_in_auto_goods_table(
                            goods_id=goods_id,
                            sql_cli=sql_cli,
                            update_sql_str=jp_update_str_7,
                        )
                        print('该goods_id[{0}]已过期或者售完,逻辑删除成功!'.format(goods_id))
                    else:
                        print(
                            '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                            % (goods_id, index))
                        juanpi_pintuan.get_goods_data(goods_id=goods_id)
                        data = juanpi_pintuan.deal_with_data()
                        if data == {}:
                            # Nothing was parsed for this goods item.
                            continue

                        data['goods_id'] = goods_id
                        juanpi_pintuan.to_right_and_update_pintuan_data(
                            data=data, pipeline=sql_cli)

                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # Hour 0 (Shanghai time): pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        gc.collect()
    def run_forever(self):
        '''
        Run one refresh pass over the flash-sale rows of dbo.juanpi_xianshimiaosha (site_id=15).

        The original note states the intent: only goods listed within a recent
        window are refreshed, while goods further in the future (still at
        their original price) are left alone for now. The window itself is
        decided by ``self.is_recent_time`` (not visible here); this method
        acts on its result as follows:

        * 0: the row is deleted with ``self.delete_sql_str``;
        * 2: the row is skipped;
        * anything else: the row's tab/page list is re-fetched. The row is
          deleted when its goods_id is no longer listed, otherwise the goods
          item is re-parsed and updated.

        The pass is aborted (``break``) when a list request returns an empty
        body or one that cannot be parsed. Finally the method sleeps 5.5 hours
        when the Shanghai-time hour is 0.
        :return: None
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, miaosha_time, tab_id, page from dbo.juanpi_xianshimiaosha where site_id=15'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # The parser is replaced periodically to keep memory usage down.
            juanpi_miaosha = JuanPiParse()

            for item in result:
                begin_time_str = json.loads(item[1]).get('miaosha_begin_time')
                # Whole-second local timestamp of the flash-sale start.
                miaosha_begin_time = int(
                    time.mktime(
                        time.strptime(begin_time_str, '%Y-%m-%d %H:%M:%S')))

                if index % 50 == 0:
                    # Reconnect every 50 rows to avoid one long-lived
                    # connection becoming unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if not tmp_sql_server.is_connect_success:
                    # As before, the index is not advanced on this path.
                    print('数据库连接失败,数据库可能关闭或者维护中')
                    continue

                recent_state = self.is_recent_time(miaosha_begin_time)
                if recent_state == 0:
                    # params must be a one-element tuple, not a bare value.
                    tmp_sql_server._delete_table(
                        sql_str=self.delete_sql_str, params=(item[0],))
                    print(
                        '过期的goods_id为(%s)' % item[0],
                        ', 限时秒杀开始时间为(%s), 删除成功!' % begin_time_str)

                elif recent_state == 2:
                    # Skip rather than break: the rows returned by the
                    # database are not guaranteed to be ordered.
                    pass

                else:
                    # Per the original note, state 1: inside the update window.
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))

                    tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                        str(item[2]),
                        str(item[3]),
                    )

                    data = MyRequests.get_url_body(url=tmp_url,
                                                   headers=self.headers)
                    if data == '':
                        break

                    try:
                        data = json.loads(data)
                        data = data.get('data', {})
                    except Exception:
                        # Unparseable body: abort the pass. Only Exception is
                        # caught so Ctrl-C / SystemExit still propagate.
                        break

                    if data.get('goodslist') == []:
                        print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.
                              format(item[2], item[3]))
                    else:
                        data = data.get('goodslist', [])
                        if data == []:
                            # Only reachable when 'goodslist' is missing.
                            print('goodslist为[], 此处跳过')
                        else:
                            miaosha_goods_list = self.get_miaoshao_goods_info_list(
                                data=data)

                            # All goods ids currently listed on this
                            # tab_id/page.
                            miaosha_goods_all_goods_id = [
                                i.get('goods_id')
                                for i in miaosha_goods_list
                            ]

                            if item[0] not in miaosha_goods_all_goods_id:
                                # The goods item is no longer listed on this
                                # tab_id/page: remove its row.
                                tmp_sql_server._delete_table(
                                    sql_str=self.delete_sql_str,
                                    params=(item[0],))
                                print(
                                    '该商品[goods_id为(%s)]已被下架限时秒杀活动,此处将其删除' %
                                    item[0])
                            else:
                                for item_1 in miaosha_goods_list:
                                    if item_1.get('goods_id', '') != item[0]:
                                        continue

                                    juanpi_miaosha.get_goods_data(
                                        goods_id=item[0])
                                    goods_data = juanpi_miaosha.deal_with_data()
                                    if goods_data == {}:
                                        # Nothing was parsed; skip.
                                        continue

                                    goods_data['stock_info'] = item_1.get(
                                        'stock_info')
                                    goods_data['goods_id'] = item_1.get(
                                        'goods_id')
                                    # Prices are only overwritten while
                                    # activity stock remains.
                                    if item_1.get('stock_info').get(
                                            'activity_stock') > 0:
                                        # Original special price before the
                                        # flash sale.
                                        goods_data['price'] = item_1.get(
                                            'price')
                                        # Flash-sale price.
                                        goods_data['taobao_price'] = item_1.get(
                                            'taobao_price')
                                    goods_data['sub_title'] = item_1.get(
                                        'sub_title', '')
                                    goods_data['miaosha_time'] = item_1.get(
                                        'miaosha_time')
                                    (goods_data['miaosha_begin_time'],
                                     goods_data['miaosha_end_time']) = get_miaosha_begin_time_and_miaosha_end_time(
                                         miaosha_time=item_1.get('miaosha_time'))

                                    juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                                        data=goods_data,
                                        pipeline=tmp_sql_server)

                                    sleep(.2)  # avoid going too fast

                if index % 10 == 0:
                    # Replace the parser every 10 processed rows.
                    juanpi_miaosha = JuanPiParse()
                    gc.collect()

                index += 1
                gc.collect()

            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # Hour 0 (Shanghai time): pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        gc.collect()
Ejemplo n.º 7
0
    def get_spike_hour_goods_info(self):
        '''
        Crawl the flash-sale (timebuy) goods lists through a proxy and store goods that are new.

        Every tab_id of the fixed list is paged through (page 0..49). Each
        page is requested through a proxy picked at random from the 'http'
        list of the proxy pool. Paging of the current tab stops as soon as the
        request fails, the body cannot be parsed, or the page reports an empty
        ``goodslist``. For every non-empty page the stored flash-sale goods
        ids are read once, and each listed goods item that is not among them
        is fetched via ``JuanPiParse`` and handed to
        ``insert_into_juanpi_xianshimiaosha_table``.
        :return: None
        '''
        tab_id_list = [11, 12, 13, 21, 22, 23, 31, 32, 33]  # notice

        for tab_id in tab_id_list:
            for index in range(0, 50):
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id), str(index))
                print('待抓取的限时秒杀地址为: ', tmp_url)

                # Pick a proxy ip. The pool looks like
                # {'http': ['xx', 'yy', ...]}, so the random index must range
                # over the 'http' list, not over the dict's keys.
                self.proxies = self.get_proxy_ip_from_ip_pool()
                http_proxies = self.proxies['http']
                self.proxy = http_proxies[randint(0, len(http_proxies) - 1)]

                tmp_proxies = {
                    'http': self.proxy,
                }

                try:
                    response = requests.get(
                        tmp_url,
                        headers=self.headers,
                        proxies=tmp_proxies,
                        timeout=10)
                    data = response.content.decode('utf-8')
                except Exception:
                    print('requests.get()请求超时....')
                    print('data为空!')
                    break

                try:
                    data = json.loads(data)
                    data = data.get('data', {})
                except Exception:
                    # Unparseable body: give up on this tab. Only Exception is
                    # caught so Ctrl-C / SystemExit still propagate.
                    break

                if data.get('goodslist') == []:
                    print('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(
                        tab_id, index))
                    break

                data = data.get('goodslist', [])
                if data == []:
                    # Only reachable when the 'goodslist' key is missing.
                    print('goodslist为[], 此处跳过')
                    continue

                miaosha_goods_list = self.get_miaoshao_goods_info_list(
                    data=data)
                print(miaosha_goods_list)

                juanpi = JuanPiParse()
                my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                if my_pipeline.is_connect_success:
                    # Query the stored goods ids once per page (the result was
                    # previously fetched twice: once for the None check and
                    # once for the iteration).
                    db_rows = my_pipeline.select_juanpi_xianshimiaosha_all_goods_id()
                    if db_rows is None:
                        db_goods_ids = set()
                    else:
                        db_goods_ids = {row[0] for row in db_rows}

                    for item in miaosha_goods_list:
                        if item.get('goods_id', '') in db_goods_ids:
                            print('该goods_id已经存在于数据库中, 此处跳过')
                            continue

                        goods_id = item.get('goods_id')
                        spider_url = 'http://shop.juanpi.com/deal/' + goods_id
                        juanpi.get_goods_data(goods_id=goods_id)
                        goods_data = juanpi.deal_with_data()
                        if goods_data == {}:
                            # Nothing was parsed for this goods item; skip it.
                            continue

                        # Enrich the parsed data with the list-page fields,
                        # then insert it.
                        goods_data['stock_info'] = item.get('stock_info')
                        goods_data['goods_id'] = goods_id
                        goods_data['spider_url'] = spider_url
                        goods_data['username'] = '******'
                        # Original special price before the flash sale.
                        goods_data['price'] = item.get('price')
                        # Flash-sale price.
                        goods_data['taobao_price'] = item.get('taobao_price')
                        goods_data['sub_title'] = item.get('sub_title', '')
                        goods_data['miaosha_time'] = item.get('miaosha_time')
                        goods_data['tab_id'] = tab_id
                        goods_data['page'] = index

                        juanpi.insert_into_juanpi_xianshimiaosha_table(
                            data=goods_data,
                            pipeline=my_pipeline)
                        sleep(.3)  # short pause between inserts
                    sleep(.65)

                # Drop the parser before the next page and collect garbage.
                del juanpi
                gc.collect()
Ejemplo n.º 8
0
def run_forever():
    '''
    Endlessly refresh every JuanPi goods row returned by the database.

    Each pass opens a new pipeline, loads the rows via
    ``select_juanpi_all_goods_id()``, re-crawls every goods with a fresh
    ``JuanPiParse`` and writes the result back with
    ``to_right_and_update_data``. Rows are read positionally:
    ``item[0]`` is the goods_id, ``item[1]`` the stored ``is_delete`` flag and
    ``item[2]`` the stored shelf/down-time JSON string.

    After a pass the loop sleeps 5.5 hours when the Shanghai hour is 0,
    otherwise 5 seconds.
    :return: never returns
    '''
    while True:
        # Refresh the data in real time.
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server.select_juanpi_all_goods_id())
        except TypeError:
            # list() of a non-iterable result is treated as a failed DB connection.
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                # Create the parser per item and let it be collected afterwards;
                # keeping one outside the loop was found to hold a lot of memory.
                juanpi = JuanPiParse()
                if index % 50 == 0:
                    # Reconnect every 50 items so a single long-lived connection
                    # does not stop responding.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    juanpi.get_goods_data(goods_id=item[0])
                    data = juanpi.deal_with_data()
                    if data != {}:  # an empty dict means nothing was parsed: skip
                        data['goods_id'] = item[0]

                        # Record when the goods last went on / off the shelf.
                        # is_delete == 0 means "on shelf" (shelf_time),
                        # anything else means "off shelf" (down_time).
                        if data['is_delete'] != item[1] \
                                or item[2] is None \
                                or item[2] == '{"shelf_time": "", "down_time": ""}' \
                                or len(item[2]) == 35:  # 35 == len of the initial str
                            # Either the state just changed or no timestamp has
                            # been recorded yet: stamp "now" on the field that
                            # matches the NEW state. (The previous code stamped
                            # down_time on a 1 -> 0 transition and shelf_time
                            # on a 0 -> 1 transition, i.e. the wrong way round.)
                            my_shelf_and_down_time = {
                                'shelf_time': '',
                                'down_time': '',
                            }
                            if data['is_delete'] == 0:
                                my_shelf_and_down_time['shelf_time'] = str(
                                    get_shanghai_time())
                            else:
                                my_shelf_and_down_time['down_time'] = str(
                                    get_shanghai_time())
                        else:
                            # State unchanged and a record exists: keep it as is.
                            my_shelf_and_down_time = json.loads(item[2])
                        data['my_shelf_and_down_time'] = my_shelf_and_down_time

                        juanpi.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                index += 1
                gc.collect()
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates right after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Ejemplo n.º 9
0
class JPUpdater(AsyncCrawler):
    '''
    Real-time updater for JuanPi limited-time flash-sale goods.

    ``_update_db`` loops forever: it loads the stored flash-sale rows,
    splits them into slices of ``self.concurrency`` items and runs
    ``_update_one_goods_info`` for every row of a slice as asyncio tasks.
    '''
    def __init__(self, *params, **kwargs):
        AsyncCrawler.__init__(
            self,
            *params,
            **kwargs,
            log_print=True,
            log_save_path=MY_SPIDER_LOGS_PATH + '/卷皮/秒杀实时更新/',
            ip_pool_type=IP_POOL_TYPE,
        )
        self.tmp_sql_server = None      # db pipeline, created in _get_db_old_data
        self.concurrency = 8            # slice size handed to TasksParamsListObj
        self.goods_index = 1            # last index written back by the update tasks
        self.delete_sql_str = jp_delete_str_3

    async def _get_pc_headers(self) -> dict:
        '''
        Build the request headers used for m.juanpi.com.
        :return: headers dict with a random PC user agent
        '''
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Encoding:': 'gzip',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'm.juanpi.com',
            'User-Agent': get_random_pc_ua(),  # a random user agent per call
        }

    async def _get_db_old_data(self) -> (None, list):
        '''
        Open a new pipeline, run ``jp_delete_str_4``, wait 5 seconds and
        load the rows selected by ``jp_select_str_4``.
        :return: list of rows, or None when a TypeError was raised
                 (logged as a failed database connection)
        '''
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
            await async_sleep(5)
            result = list(self.tmp_sql_server._select_table(sql_str=jp_select_str_4))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_miaosha_begin_time(self, miaosha_time) -> int:
        '''
        Extract the flash-sale begin time as a unix timestamp.
        :param miaosha_time: value parsed by ``json_2_dict``; its
                             'miaosha_begin_time' entry must be formatted
                             as '%Y-%m-%d %H:%M:%S'
        :return: whole-second timestamp (local time, via ``time.mktime``)
        '''
        miaosha_begin_time = json_2_dict(miaosha_time).get('miaosha_begin_time')
        # int() on the float replaces the former "first 10 characters of
        # str(float)" trick, which only worked for 10-digit timestamps.
        return int(time.mktime(time.strptime(miaosha_begin_time, '%Y-%m-%d %H:%M:%S')))

    async def _get_new_jp_obj(self, index):
        '''
        Replace ``self.juanpi_miaosha`` with a new ``JuanPiParse`` on every
        10th index.
        :param index: running task index
        :return:
        '''
        if index % 10 == 0:         # one object must not be shared for too long, the driver access breaks otherwise
            try:
                del self.juanpi_miaosha
            except AttributeError:  # attribute was never set
                pass
            collect()
            self.juanpi_miaosha = JuanPiParse()

    async def _update_one_goods_info(self, item, index) -> tuple:
        '''
        Update a single flash-sale goods row.
        :param item: db row, read as (goods_id, miaosha_time, tab_id, page)
        :param index: running task index
        :return: (goods_id, res); res is the result of the db update that
                 was performed, or False when nothing was updated
        '''
        res = False
        goods_id = item[0]
        miaosha_time = item[1]
        tab_id = item[2]
        page = item[3]
        miaosha_begin_time = await self._get_miaosha_begin_time(miaosha_time)
        await self._get_new_jp_obj(index=index)
        self.tmp_sql_server = await _get_new_db_conn(db_obj=self.tmp_sql_server, index=index, logger=self.lg, remainder=30)

        if self.tmp_sql_server.is_connect_success:
            is_recent_time = await self._is_recent_time(miaosha_begin_time)
            if is_recent_time == 0:
                # Expired: logically delete the row.
                res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                self.lg.info('过期的goods_id为({}), 限时秒杀开始时间为({}), 逻辑删除成功!'.format(goods_id, miaosha_begin_time))
                await async_sleep(.3)
                index += 1
                self.goods_index = index

                return goods_id, res

            elif is_recent_time == 2:
                self.lg.info('goods_id: {}, 未来时间跳过更新...'.format(goods_id))
                index += 1
                self.goods_index = index

                return goods_id, res

            else:  # 1: inside the window that has to be updated
                self.lg.info('------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(goods_id, index))
                tmp_url = 'https://m.juanpi.com/act/timebuy-xrgoodslist?tab_id={0}&page={1}'.format(
                    str(tab_id), str(page),
                )
                body = Requests.get_url_body(url=tmp_url, headers=await self._get_pc_headers(), ip_pool_type=self.ip_pool_type)
                try:
                    data = json_2_dict(body, default_res={}).get('data', {})
                    # Raised explicitly instead of via `assert`: assert
                    # statements are removed under `python -O`, and an empty
                    # response would then reach the logical-delete branch below.
                    if data == {}:
                        raise AssertionError('data为空dict!')
                    data = data.get('goodslist', [])
                    if data == []:
                        raise AssertionError('tab_id={0}, page={1}的goodslist为[], 此处跳过'.format(tab_id, page))
                except AssertionError:
                    self.lg.error(msg='遇到错误:', exc_info=True)
                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                miaosha_goods_list = await self._get_miaoshao_goods_info_list(data=data)
                # every goods_id currently listed on this tab_id / page
                miaosha_goods_all_goods_id = [i.get('goods_id') for i in miaosha_goods_list]
                self.lg.info(str(miaosha_goods_all_goods_id))
                if goods_id not in miaosha_goods_all_goods_id:
                    if miaosha_goods_all_goods_id != []:
                        # Testing showed a non-empty page means the goods is
                        # not off the shelf, so it is skipped without update.
                        self.lg.info('该商品[{}]未下架, 此处不进行更新跳过!!'.format(goods_id))
                    else:
                        # The page no longer lists anything: logically delete.
                        res = self.tmp_sql_server._update_table(sql_str=jp_update_str_6, params=(goods_id,))
                        self.lg.info('该商品[goods_id为({})]已被下架限时秒杀活动,此处将其逻辑删除'.format(goods_id))

                    index += 1
                    self.goods_index = index
                    await async_sleep(.3)

                    return goods_id, res

                else:  # still listed: refresh it
                    res = await self._one_update(miaosha_goods_list=miaosha_goods_list, goods_id=goods_id)

        else:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        await async_sleep(1.2)

        return goods_id, res

    async def _update_db(self) -> None:
        '''
        Real-time update loop for the flash-sale data.

        Sleeps 5.5 hours after a pass when the Shanghai hour is 0,
        otherwise 2.5 minutes.
        :return:
        '''
        while True:
            self.lg = await self._get_new_logger(logger_name=get_uuid1())
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(tasks_params_list=result, step=self.concurrency)
                self.juanpi_miaosha = JuanPiParse()
                index = 1
                while True:
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:  # everything was handed out, normal exit
                        break

                    tasks = []
                    for item in slice_params_list:
                        self.lg.info('创建 task goods_id: {}'.format(item[0]))
                        tasks.append(self.loop.create_task(self._update_one_goods_info(item=item, index=index)))
                        index += 1

                    await _get_async_task_result(tasks=tasks, logger=self.lg)

                self.lg.info('全部数据更新完毕'.center(100, '#'))
            if get_shanghai_time().hour == 0:  # no updates right after midnight
                await async_sleep(60 * 60 * 5.5)
            else:
                await async_sleep(2.5 * 60)
            try:
                del self.juanpi_miaosha
            except AttributeError:  # never created when result was None
                pass
            collect()

    async def _one_update(self, **kwargs) -> bool:
        '''
        Update a goods that is still listed in the flash sale.
        :param kwargs: miaosha_goods_list (parsed page items) and goods_id
        :return: result of ``to_update_juanpi_xianshimiaosha_table``, or
                 False when the goods was not found or could not be parsed
        '''
        res = False
        miaosha_goods_list = kwargs.get('miaosha_goods_list')
        goods_id = kwargs.get('goods_id')

        for item_1 in miaosha_goods_list:
            if item_1.get('goods_id', '') == goods_id:
                self.juanpi_miaosha.get_goods_data(goods_id=goods_id)
                goods_data = self.juanpi_miaosha.deal_with_data()
                if goods_data == {}:  # nothing parsed: skip
                    break

                goods_data['stock_info'] = item_1.get('stock_info')
                goods_data['goods_id'] = item_1.get('goods_id')
                # Prices are only overwritten while activity stock remains.
                if item_1.get('stock_info').get('activity_stock') > 0:
                    goods_data['price'] = item_1.get('price')  # special price before the flash sale
                    goods_data['taobao_price'] = item_1.get('taobao_price')  # flash-sale price
                goods_data['sub_title'] = item_1.get('sub_title', '')
                goods_data['miaosha_time'] = item_1.get('miaosha_time')
                goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(
                    miaosha_time=item_1.get('miaosha_time'))

                res = self.juanpi_miaosha.to_update_juanpi_xianshimiaosha_table(
                    data=goods_data,
                    pipeline=self.tmp_sql_server)
                await async_sleep(.3)  # avoid going too fast
                break

        return res

    async def _get_miaoshao_goods_info_list(self, data) -> list:
        '''
        Reduce the raw goodslist items to the fields used by the updater.
        :param data: iterable of raw items (dicts) to parse
        :return: list of dicts with miaosha_time, goods_id, stock_info,
                 price and taobao_price
        '''
        miaosha_goods_list = []
        for item in data:
            tmp = {}
            tmp['miaosha_time'] = {
                'miaosha_begin_time': timestamp_to_regulartime(int(item.get('start_time'))),
                'miaosha_end_time': timestamp_to_regulartime(int(item.get('end_time'))),
            }
            stock = item.get('stock', 0)
            tmp['goods_id'] = item.get('goods_id')
            # flash-sale stock info; activity stock is stock * rate%
            tmp['stock_info'] = {
                'activity_stock': int(stock*(item.get('rate', 0)/100)),
                'stock': stock,
            }
            # 'oprice' -> price, 'cprice' -> taobao_price
            tmp['price'] = round(float(item.get('oprice', '0')), 2)
            tmp['taobao_price'] = round(float(item.get('cprice', '0')), 2)
            miaosha_goods_list.append(tmp)

        return miaosha_goods_list

    async def _is_recent_time(self, timestamp) -> int:
        '''
        Classify a begin timestamp relative to the current time.
        :param timestamp: unix timestamp (seconds)
        :return: 0: begin time is more than 72 hours in the past (expired)
                 1: begin time is less than 48 hours in the past and less
                    than 14 hours in the future (to be updated)
                 2: everything else, i.e. 14 hours or more in the future
                    and also the band between 48 and 72 hours in the past
        '''
        time_1 = int(timestamp)
        time_2 = int(time.time())  # current timestamp

        diff_time = time_1 - time_2
        # 72 hours (instead of the former 48) so the backend can take the
        # goods off the shelf in sync before the row is treated as expired.
        if diff_time < -259200:
            return 0
        elif -172800 < diff_time < 50400:
            return 1
        else:
            return 2

    def __del__(self):
        try:
            del self.lg
        except AttributeError:
            pass
        try:
            del self.loop
        except AttributeError:
            pass
        collect()
Ejemplo n.º 10
0
def run_forever():
    '''
    Endlessly refresh the JuanPi goods rows selected by ``jp_select_str_3``.

    Every row is re-crawled and written back with
    ``to_right_and_update_data`` together with the recomputed shelf/delete
    times, price-change info and sku change record. Rows are read
    positionally (row[0] goods_id, row[1] is_delete, row[2] price,
    row[3] taobao_price, row[4] shelf_time, row[5] delete_time,
    row[6] sku info, row[7] is_price_change).

    After a pass the loop sleeps 5.5 hours when the Shanghai hour is 0,
    otherwise 5 seconds.
    :return: never returns
    '''
    while True:
        # Refresh the data in real time.
        db_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        try:
            rows = list(db_pipeline._select_table(sql_str=jp_select_str_3))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            rows = None

        if rows is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(rows)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            parser = JuanPiParse()
            for position, row in enumerate(rows, start=1):
                if position % 5 == 0:
                    # Swap in a new parser every 5th row.
                    parser = JuanPiParse()

                if position % 50 == 0:
                    # Reconnect every 50 rows so a single long-lived connection
                    # does not stop responding.
                    print('正在重置,并与数据库建立新连接中...')
                    db_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                    print('与数据库的新连接成功建立...')

                if not db_pipeline.is_connect_success:
                    print('数据库连接失败,数据库可能关闭或者维护中')
                else:
                    goods_id = row[0]
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id, position))
                    parser.get_goods_data(goods_id=goods_id)
                    data = parser.deal_with_data()
                    if data != {}:  # an empty dict means nothing was parsed: skip
                        data['goods_id'] = goods_id

                        shelf_time, delete_time = get_shelf_time_and_delete_time(
                            tmp_data=data,
                            is_delete=row[1],
                            shelf_time=row[4],
                            delete_time=row[5])
                        data['shelf_time'] = shelf_time
                        data['delete_time'] = delete_time
                        print('上架时间:', data['shelf_time'], '下架时间:',
                              data['delete_time'])

                        price_changed, price_change_info = _get_price_change_info(
                            old_price=row[2],
                            old_taobao_price=row[3],
                            new_price=data['price'],
                            new_taobao_price=data['taobao_price'])
                        data['_is_price_change'] = price_changed
                        data['_price_change_info'] = price_change_info

                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(row[6]),
                                site_id=12)
                        except AttributeError:  # value was already formatted
                            old_sku_info = row[6]
                        new_sku_info = format_price_info_list(
                            data['price_info_list'], site_id=12)
                        stored_flag = 0 if row[7] is None else row[7]
                        # Overwrites the '_is_price_change' value set above.
                        sku_changed, sku_trans_time = get_sku_info_trans_record(
                            old_sku_info=old_sku_info,
                            new_sku_info=new_sku_info,
                            is_price_change=stored_flag)
                        data['_is_price_change'] = sku_changed
                        data['sku_info_trans_time'] = sku_trans_time

                        parser.to_right_and_update_data(
                            data, pipeline=db_pipeline)

                gc.collect()
                sleep(1.2)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:  # no updates right after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()