async def _deal_with_all_goods_id(self):
        '''
        Fetch the goods info for each detailed category.
        :return: None
        '''
        _data = await self._get_all_goods_list()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        index = 1
        if my_pipeline.is_connect_success:
            self.my_lg.info('正在获取淘抢购db原有goods_id, 请耐心等待...')
            sql_str = r'select goods_id from dbo.tao_qianggou_xianshimiaosha where site_id=28'
            db_ = list(my_pipeline._select_table(sql_str=sql_str))
            db_all_goods_id = [item[0] for item in db_]
            self.my_lg.info('获取完毕!!!')
            # self.my_lg.info(str(db_all_goods_id))

            for item in _data:
                miaosha_goods_list = await self._get_taoqianggou_goods_list(data=item.get('data', []))
                # self.my_lg.info(str(miaosha_goods_list))
                # pprint(miaosha_goods_list)

                for tmp_item in miaosha_goods_list:
                    if tmp_item.get('goods_id', '') in db_all_goods_id:    # skip goods_id values already present in the db
                        self.my_lg.info('该goods_id[%s]已存在db中' % tmp_item.get('goods_id', ''))
                        continue

                    if index % 50 == 0:  # reconnect every 50 items to avoid an unresponsive long-lived connection
                        self.my_lg.info('正在重置,并与数据库建立新连接中...')
                        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                        # my_pipeline = SqlPools()
                        self.my_lg.info('与数据库的新连接成功建立...')

                    if my_pipeline.is_connect_success:
                        tmall = TmallParse(logger=self.my_lg)
                        tmp_url = 'https://detail.tmall.com/item.htm?id={0}'.format(tmp_item.get('goods_id'))
                        goods_id = tmall.get_goods_id_from_url(tmp_url)

                        tmall.get_goods_data(goods_id=goods_id)
                        goods_data = tmall.deal_with_data()

                        if goods_data != {}:
                            # self.my_lg.info(str(tmp_item))
                            goods_data['goods_id'] = tmp_item.get('goods_id')
                            goods_data['spider_url'] = tmp_url
                            goods_data['miaosha_time'] = tmp_item.get('miaosha_time')
                            goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=tmp_item.get('miaosha_time'))
                            goods_data['page'] = tmp_item.get('page')
                            goods_data['spider_time'] = tmp_item.get('spider_time')

                            tmall.insert_into_taoqianggou_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                            await asyncio.sleep(TMALL_REAL_TIMES_SLEEP_TIME)

                        else:
                            await asyncio.sleep(5)

                        try:
                            del tmall
                        except NameError:
                            pass
                        gc.collect()
                    index += 1  # advance the counter so the periodic reconnect above can fire
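
A note on the pattern above: the pipeline is re-created every 50 items so a long-lived connection cannot go stale mid-run. A minimal sketch of that logic factored into a helper, assuming only that SqlServerMyPageInfoSaveItemPipeline can be re-instantiated as shown and that the threshold is tunable (the helper name is illustrative):

def maybe_reconnect(pipeline, index, every=50, logger=None):
    # Re-create the pipeline every `every` items; otherwise hand it back unchanged.
    if index % every == 0:
        if logger is not None:
            logger.info('re-establishing the db connection ...')
        pipeline = SqlServerMyPageInfoSaveItemPipeline()
    return pipeline

In the loop above this would read my_pipeline = maybe_reconnect(my_pipeline, index, every=50, logger=self.my_lg).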
Example #2
    async def deal_with_data(self):
        '''
        Parse and store the data for the related pintuan (group-buy) goods.
        :return:
        '''
        goods_list = await self.get_pintuan_goods_info()

        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            db_goods_id_list = [item[0] for item in list(await my_pipeline.select_jumeiyoupin_pintuan_all_goods_id(logger=self.my_lg))]
            # self.my_lg.info(str(db_goods_id_list))

            index = 1
            for item in goods_list:
                if index % 20 == 0:
                    my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

                if item.get('goods_id', '') in db_goods_id_list:
                    self.my_lg.info('该goods_id已经存在于数据库中, 此处跳过')
                else:
                    goods_id = item.get('goods_id', '')
                    tmp_url = 'https://s.h5.jumei.com/yiqituan/detail?item_id={0}&type={1}'.format(goods_id, item.get('type', ''))

                    s_time = time.time()

                    jumeiyoupin = JuMeiYouPinPinTuanParse(logger=self.my_lg)
                    goods_data = await jumeiyoupin.deal_with_data(jumei_pintuan_url=tmp_url)

                    if goods_data == {} or goods_data.get('is_delete', 0) == 1:
                        pass
                    else:
                        # normalize
                        goods_data['goods_id'] = goods_id
                        goods_data['pintuan_time'] = item.get('pintuan_time', {})
                        goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = await self.get_pintuan_begin_time_and_pintuan_end_time(pintuan_time=item.get('pintuan_time', {}))
                        goods_data['sort'] = item.get('sort')
                        goods_data['page'] = item.get('page')
                        goods_data['tab'] = item.get('tab')

                        # pprint(goods_data)
                        # print(goods_data)
                        await jumeiyoupin.insert_into_jumeiyoupin_pintuan_table(data=goods_data, pipeline=my_pipeline, logger=self.my_lg)

                    e_time = time.time()
                    if e_time - s_time > JUMEIYOUPIN_SLEEP_TIME:    # only sleep off the remainder of the period
                        pass
                    else:
                        await asyncio.sleep(JUMEIYOUPIN_SLEEP_TIME - (e_time-s_time))
                    index += 1

        else:
            self.my_lg.error('数据库连接失败,此处跳过!')

        gc.collect()
        return None
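
The timing block above sleeps only for whatever remains of JUMEIYOUPIN_SLEEP_TIME after the work is done. A minimal sketch of that pacing as a reusable coroutine, assuming a start timestamp from time.time() and a period in seconds (the function name is illustrative):

import asyncio
import time

async def pace(start_time: float, period: float) -> None:
    # Sleep for whatever is left of `period`; skip sleeping if the work already overran it.
    remaining = period - (time.time() - start_time)
    if remaining > 0:
        await asyncio.sleep(remaining)

With it, the loop body becomes s_time = time.time(); ...; await pace(s_time, JUMEIYOUPIN_SLEEP_TIME).

Example #3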
    def _just_run(self):
        while True:
            #### update the data
            self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline()
            #  and GETDATE()-a.modify_time>1
            try:
                result = list(
                    self._comment_pipeline._select_table(
                        sql_str=cm_select_str_1, logger=self.my_lg))
            except TypeError:
                self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
                continue

            self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            self.my_lg.info(str(result))
            self.my_lg.info(
                '--------------------------------------------------------')
            self.my_lg.info('待更新个数: {0}'.format(len(result)))

            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

            # site_id map: 1 Taobao, 2 Ali 1688, 3 Tmall, 4 Tmall supermarket, 5 Juhuasuan, 6 Tmall Global, 7 JD, 8 JD supermarket, 9 JD Worldwide, 10 JD pharmacy, 11 Zhe800, 12 Juanpi, 13 Pinduoduo, 14 Zhe800 miaosha, 15 Juanpi miaosha, 16 Pinduoduo miaosha, 25 VIP.com
            for index, item in enumerate(result):  # item: (goods_id, site_id)
                if not self.debugging_api.get(item[1]):
                    self.my_lg.info('api为False, 跳过! 索引值[%s]' % str(index))
                    continue

                if index % 20 == 0:
                    try:
                        del self._comment_pipeline
                    except:
                        pass
                    self._comment_pipeline = SqlServerMyPageInfoSaveItemPipeline()

                switch = {
                    1: self.func_name_dict.get('taobao'),  # Taobao
                    2: self.func_name_dict.get('ali'),  # Ali 1688
                    3: self.func_name_dict.get('tmall'),  # Tmall
                    4: self.func_name_dict.get('tmall'),  # Tmall supermarket
                    6: self.func_name_dict.get('tmall'),  # Tmall Global
                    7: self.func_name_dict.get('jd'),  # JD
                    8: self.func_name_dict.get('jd'),  # JD supermarket
                    9: self.func_name_dict.get('jd'),  # JD Worldwide
                    10: self.func_name_dict.get('jd'),  # JD pharmacy
                    11: self.func_name_dict.get('zhe_800'),  # Zhe800
                    12: self.func_name_dict.get('juanpi'),  # Juanpi
                    13: self.func_name_dict.get('pinduoduo'),  # Pinduoduo
                    25: self.func_name_dict.get('vip'),  # VIP.com
                }

                # dynamic dispatch: each switch value is a statement template, filled in and exec'd
                exec_code = compile(switch[item[1]].format(index, item[0], item[1]), '', 'exec')
                exec(exec_code)
                sleep(1.1)
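
The switch dict above maps site_id to statement templates that are compiled and exec'd. The same dispatch works with plain callables, which avoids exec and keeps tracebacks readable. A sketch under the assumption that each handler is an ordinary method taking (index, goods_id, site_id); the handler names here are placeholders for whatever self.func_name_dict currently points at:

    def _dispatch_update(self, index, goods_id, site_id):
        # Map site_id to a bound handler; several site ids can alias the same handler.
        handlers = {
            1: self._update_taobao,
            2: self._update_ali,
            3: self._update_tmall,
            4: self._update_tmall,
            6: self._update_tmall,
            7: self._update_jd,
        }
        handler = handlers.get(site_id)
        if handler is None:
            self.my_lg.info('no handler for site_id=%s, skipping' % site_id)
            return
        handler(index, goods_id, site_id)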
Example #4
    def _save_articles(self, data):
        '''
        Store the article data.
        :param data:
        :return:
        '''
        self.my_lg.info('即将开始存储该文章...')
        sql_str = 'insert into dbo.daren_recommend(share_id, nick_name, head_url, profile, gather_url, title, comment_content, share_img_url_list, div_body, create_time, site_id, tags, video_url, likes, collects) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        for item in data:
            if self.index % 20 == 0:
                self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

            if self.my_pipeline.is_connect_success:
                share_id = item.get('share_id', '')
                if share_id == '':
                    continue

                self.my_lg.info('------>>>| 正在存储share_id: {0}...'.format(share_id))
                try:
                    params = self._get_db_insert_into_params(item=item)
                except Exception:
                    continue
                result = self.my_pipeline._insert_into_table_2(sql_str=sql_str, params=params, logger=self.my_lg)
                if result:
                    self.success_insert_db_num += 1

            else:
                self.my_lg.error('db连接失败!存储失败! 出错article地址:{0}'.format(item.get('gather_url', '')))

        self.my_lg.info('@' * 9 + ' 目前成功存储{0}个!'.format(self.success_insert_db_num))

        return True
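
The INSERT above uses 15 positional %s placeholders, so _get_db_insert_into_params must return the values in exactly that column order. A hypothetical sketch of what such a builder could look like (the real implementation may serialize some fields first; the defaults here are illustrative):

    def _get_db_insert_into_params(self, item: dict) -> tuple:
        # Order must match the column list of the INSERT statement exactly.
        return (
            item['share_id'],
            item.get('nick_name', ''),
            item.get('head_url', ''),
            item.get('profile', ''),
            item.get('gather_url', ''),
            item.get('title', ''),
            item.get('comment_content', ''),
            item.get('share_img_url_list', ''),
            item.get('div_body', ''),
            item.get('create_time', ''),
            item.get('site_id', 0),
            item.get('tags', ''),
            item.get('video_url', ''),
            item.get('likes', 0),
            item.get('collects', 0),
        )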
Example #5
    def deal_with_data(self, *params):
        '''
        Parse and store the data for the related pintuan goods.
        :param params: positional args; params[0] is the goods list
        :return:
        '''
        goods_list = params[0]

        mogujie = MoGuJieParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=mg_select_str_1))]
            print(db_goods_id_list)

            for item in goods_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')

                else:
                    goods_id = str(item.get('goods_id', ''))
                    tmp_url = 'https://shop.mogujie.com/detail/' + str(goods_id)

                    mogujie.get_goods_data(goods_id=str(goods_id))
                    goods_data = mogujie.deal_with_data()

                    if goods_data == {}:  # skip when the returned data is empty
                        pass

                    else:  # otherwise normalize and insert
                        # normalize
                        goods_data['price_info_list'] = _get_mogujie_pintuan_price_info_list(goods_data['price_info_list'])
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['pintuan_time'] = item.get('pintuan_time', {})
                        goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=item.get('pintuan_time', {}))
                        goods_data['all_sell_count'] = item.get('all_sell_count', '')
                        goods_data['fcid'] = str(item.get('fcid'))
                        goods_data['page'] = str(item.get('page'))
                        goods_data['sort'] = str(item.get('sort', ''))

                        # pprint(goods_data)
                        # print(goods_data)
                        _r = mogujie.insert_into_mogujie_pintuan_table(data=goods_data, pipeline=my_pipeline)
                        if _r:  # inserted: record the id locally
                            db_goods_id_list.append(goods_id)
                            db_goods_id_list = list(set(db_goods_id_list))

                        sleep(MOGUJIE_SLEEP_TIME)  # throttle

        else:
            print('数据库连接失败,此处跳过!')

        try:
            del mogujie
        except:
            pass
        gc.collect()
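
One more note on the loop above: db_goods_id_list is kept as a list and rebuilt with list(set(...)) after every insert, so each membership check and each dedupe is O(n). Holding the ids in a set gives O(1) membership and free dedupe. A sketch, assuming the same _select_table interface (the helper name is illustrative):

def load_db_goods_ids(pipeline, sql_str) -> set:
    # A set gives O(1) membership tests and deduplicates on add.
    rows = pipeline._select_table(sql_str=sql_str) or []
    return {row[0] for row in rows}

In the loop, db_goods_ids.add(goods_id) after a successful insert then replaces the append-and-dedupe pair.

Example #6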
    def run_forever(self):
        '''
        This real-time update only refreshes goods listed for today, yesterday, or the next two hours; entries further in the future (all still at original price) are not updated yet.
        :return:
        '''
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(tmp_sql_server._select_table(sql_str=z8_select_str_4))
            tmp_sql_server._delete_table(sql_str=z8_delete_str_4, params=None)
        except TypeError:
            self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(str(result))
            print('--------------------------------------------------------')

            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            self._update_old_goods_info(tmp_sql_server=tmp_sql_server, result=result)

        if get_shanghai_time().hour == 0:   # no updates after midnight
            sleep(60*60*5.5)
        else:
            sleep(10*60)

        return
Example #7
    async def _run_forever(self):
        '''
        Refresh all data in real time.
        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(
                tmp_sql_server._select_table(sql_str=tb_select_str_4))
        except TypeError:
            self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is not None:
            self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            self.my_lg.info(str(result))
            self.my_lg.info(
                '--------------------------------------------------------')

            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            await self._update_old_goods_info(tmp_sql_server=tmp_sql_server,
                                              result=result)

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            self.my_lg.info('休眠60s...')
            sleep(60)

        return
Example #8
    def run_forever(self):
        '''
        This real-time update only refreshes goods listed for today, yesterday, or the next two hours; entries further in the future (all still at original price) are not updated yet.
        :return:
        '''
        #### real-time data update
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = r'select goods_id, miaosha_time, session_id from dbo.zhe_800_xianshimiaosha where site_id=14'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            self._update_old_goods_info(tmp_sql_server=tmp_sql_server,
                                        result=result)

        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)

        return
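Example #9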
    async def _run_forever(self):
        '''
        Refresh all data in real time.
        :return:
        '''
        tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        sql_str = 'select goods_id, miaosha_time, goods_url, page, spider_time from dbo.tao_qianggou_xianshimiaosha where site_id=28'
        try:
            result = list(tmp_sql_server._select_table(sql_str=sql_str))
        except TypeError:
            self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is not None:
            self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            self.my_lg.info(str(result))
            self.my_lg.info(
                '--------------------------------------------------------')

            self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            await self._update_old_goods_info(tmp_sql_server=tmp_sql_server,
                                              result=result)


        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)

        return
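
The four run_forever variants above share one skeleton: select the candidate rows, update them if any, then back off for 5.5 hours after midnight or a short interval otherwise. A sketch of that skeleton with the varying pieces injected as callables (select_rows, update_rows and short_sleep are placeholders; get_shanghai_time is the project helper used above):

from time import sleep

def poll_and_update(select_rows, update_rows, logger, short_sleep=60):
    # One polling pass: fetch candidates, update them, then back off.
    try:
        result = list(select_rows())
    except TypeError:
        logger.error('TypeError, db connection failed... (maybe under maintenance)')
        result = None

    if result:
        logger.info('rows to update: {}'.format(len(result)))
        update_rows(result)

    # Long pause after midnight, short interval otherwise.
    sleep(60 * 60 * 5.5 if get_shanghai_time().hour == 0 else short_sleep)

Example #10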
    def _just_run(self):
        while True:
            #### real-time data update
            tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
            try:
                result = list(
                    tmp_sql_server.
                    select_all_goods_info_from_GoodsInfoAutoGet_table())
            except TypeError:
                self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
                result = None
            if result is not None:
                self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
                self.my_lg.info(str(result))
                self.my_lg.info(
                    '--------------------------------------------------------')

                self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))

                for index, item in enumerate(result):  # item: (goods_id, site_id)
                    switch = {
                        1: 'self.taobao_comment({0}, {1})',
                        2: 'self.ali_1688_comment({0}, {1})',
                    }

                    # dynamic dispatch: fill the statement template and exec it
                    exec_code = compile(switch[item[1]].format(index, item[0]), '', 'exec')
                    exec(exec_code)
Example #11
def test():
    """
    查看指定keys的更新时间点
    :return:
    """
    redis_cli = RedisCli()
    base_pattern = 'fzhook:tm0:'
    res = list(redis_cli.keys(pattern=base_pattern + '*'))
    sql_cli = SqlServerMyPageInfoSaveItemPipeline()
    sql_str = 'select GoodsID, ModfiyTime from dbo.GoodsInfoAutoGet where GoodsID=%s'

    tmp_list = []
    for item in res:
        goods_id = item.replace(base_pattern, '')
        print('look goods_id: {} ...'.format(goods_id))
        tmp = sql_cli._select_table(sql_str=sql_str, params=(goods_id, ))
        if tmp:    # skip entries whose select failed or returned no rows
            tmp_list.append({
                'goods_id': goods_id,
                'modify_time': tmp[0][1],
            })

    new_tmp_list = sorted(tmp_list,
                          key=lambda item: item.get('modify_time', ''))
    pprint(new_tmp_list[0:200])

    try:
        del sql_cli
        del redis_cli
    except:
        pass

    return
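
redis_cli.keys() presumably wraps the Redis KEYS command, which blocks the server while it scans the whole keyspace; SCAN iterates in chunks instead. A sketch of the same key walk using redis-py directly, assuming a standard redis.Redis client (connection parameters omitted) rather than the project's RedisCli wrapper:

import redis

def iter_goods_ids(base_pattern: str = 'fzhook:tm0:'):
    # scan_iter walks the keyspace incrementally instead of blocking like KEYS.
    cli = redis.Redis(decode_responses=True)
    for key in cli.scan_iter(match=base_pattern + '*'):
        yield key.replace(base_pattern, '')

Example #12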
    async def get_db_res(self) -> list:
        """
        获取目标goods_id_list
        :return:
        """
        get_current_func_info_by_traceback(self=self, logger=self.lg)
        db_res = []
        try:
            self.lg.info('清除过期优惠券ing ...')
            # purge coupons that expired 3+ days ago
            self.sql_cli._delete_table(
                sql_str=
                'delete from dbo.coupon_info where GETDATE()-end_time >= 3',
                params=None,
            )
            self.lg.info('休眠15s ...')
            await async_sleep(15)
            self.lg.info('获取新待检测的goods数据ing...')
            db_res = list(self.sql_cli._select_table(sql_str=self.sql_tr0, ))
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()

        assert db_res != []    # fail fast when nothing could be fetched
        self.lg.info('db_res_len: {}'.format(len(db_res)))

        return db_res
Example #13
    async def _save_all_business_settlement_records(self, all_res) -> None:
        """
        存储新增的商家提现记录
        :param all_res:
        :return:
        """
        new_add_count = 0
        for item in all_res:
            # handle new records not yet stored
            unique_id = item['unique_id']
            self.lg.info('saving unique_id: {} ...'.format(unique_id))
            params = await self._get_insert_item_params(item=item)
            try:
                res = self.sql_cli._insert_into_table_2(
                    sql_str=zwm_insert_str_1,
                    params=params,
                    logger=self.lg)
                if res:
                    new_add_count += 1
            except Exception:
                self.lg.error('遇到错误:', exc_info=True)
                continue

        if not self.sql_cli.is_connect_success:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()

        self.lg.info('新增个数: {}'.format(new_add_count))

        return None
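Example #14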
    async def _get_db_old_data(self) -> (list, None):
        """
        获取db需求更新的数据
        :return:
        """
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            if self.db_res_from == 0:
                result = list(
                    self.sql_cli._select_table(sql_str=tm_select_str_3))

            elif self.db_res_from == 1:
                result = await get_waited_2_update_db_data_from_server(
                    server_ip=self.server_ip,
                    _type='tm',
                    child_type=0,
                )
            elif self.db_res_from == 2:
                # grab a large slice by default so that, even if the leading items fail often, the rest can still be updated
                result = get_waited_2_update_db_data_from_redis_server(
                    spider_name='tm0',
                    logger=self.lg,
                    slice_num=800,
                )
            else:
                raise ValueError('self.db_res_from value异常!')

        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result
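
db_res_from dispatches on the bare values 0, 1 and 2. An IntEnum documents them without changing any comparison, since IntEnum members compare equal to their integer values; the member names here are illustrative:

from enum import IntEnum

class DbResFrom(IntEnum):
    # Where the rows waiting for an update come from.
    LOCAL_SQLSERVER = 0
    REMOTE_SERVER = 1
    REDIS_SERVER = 2

With this, self.db_res_from == DbResFrom.LOCAL_SQLSERVER reads the same as == 0 but names the source.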
Example #15
    async def _fck_run(self):
        while True:
            try:
                print('now_time: {}'.format(get_shanghai_time()))
                self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
                if not self.sql_cli.is_connect_success:
                    raise SqlServerConnectionException

                await self.db_script0(
                    select_sql_str=self.sql_str0,
                    update_sql_str=self.sql_str1,
                    func_get_params=self.get_params0,
                )
                await self.db_script0(
                    select_sql_str=self.sql_str2,
                    update_sql_str=self.sql_str3,
                    func_get_params=self.get_params1,
                )
                await self.db_script0(
                    select_sql_str=self.sql_str4,
                    update_sql_str=self.sql_str5,
                    func_get_params=self.get_params0,
                )
                # tb daily deals (tiantiantejia)
                await self.db_script0(
                    select_sql_str=self.sql_str6,
                    update_sql_str=tb_update_str_5,
                    func_get_params=self.get_params2,
                )
                # zhe800 miaosha (flash sale)
                await self.db_script0(
                    select_sql_str=self.sql_str7,
                    update_sql_str=z8_update_str_6,
                    func_get_params=self.get_params2,
                )
                # zhe800 pintuan
                await self.db_script0(
                    select_sql_str=self.sql_str8,
                    update_sql_str=z8_update_str_4,
                    func_get_params=self.get_params2,
                )
                # mia pintuan
                await self.db_script0(
                    select_sql_str=self.sql_str9,
                    update_sql_str=mia_update_str_7,
                    func_get_params=self.get_params2,
                )
                # jumeiyoupin pintuan
                await self.db_script0(
                    select_sql_str=self.sql_str10,
                    update_sql_str=jm_update_str_5,
                    func_get_params=self.get_params2,
                )
            except Exception as e:
                print(e)
            finally:
                print('休眠{}s ...'.format(self.sleep_time))
                await async_sleep(self.sleep_time)
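
The eight db_script0 calls above differ only in their (select, update, params-builder) triples, so they collapse into a table-driven loop; a sketch using the same attributes and update strings (the method name is illustrative):

    async def run_all_scripts(self):
        # (select_sql, update_sql, params_builder) triples, in run order.
        jobs = (
            (self.sql_str0, self.sql_str1, self.get_params0),
            (self.sql_str2, self.sql_str3, self.get_params1),
            (self.sql_str4, self.sql_str5, self.get_params0),
            (self.sql_str6, tb_update_str_5, self.get_params2),   # tb daily deals
            (self.sql_str7, z8_update_str_6, self.get_params2),   # zhe800 miaosha
            (self.sql_str8, z8_update_str_4, self.get_params2),   # zhe800 pintuan
            (self.sql_str9, mia_update_str_7, self.get_params2),  # mia pintuan
            (self.sql_str10, jm_update_str_5, self.get_params2),  # jumeiyoupin pintuan
        )
        for select_sql, update_sql, get_params in jobs:
            await self.db_script0(
                select_sql_str=select_sql,
                update_sql_str=update_sql,
                func_get_params=get_params,
            )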
Example #16
    def _handle_goods_is_delete(self, goods_id):
        '''
        Handle goods that can no longer be viewed or have been delisted.
        :return:
        '''
        try:
            sql_cli = SqlServerMyPageInfoSaveItemPipeline()
            is_in_db = sql_cli._select_table(sql_str=al_select_str_1,
                                             params=(str(goods_id), ))
            # self.lg.info(str(is_in_db))
        except Exception:
            self.lg.error('数据库连接失败!' + self.error_base_record, exc_info=True)
            return self._data_error_init()

        self.result_data = {}
        # initialize the attributes of a delisted goods item
        tmp_data_s = self.init_pull_off_shelves_goods()
        if is_in_db != []:
            # the goods_id was inserted into the db before, so only its is_delete flag needs flipping
            _handle_goods_shelves_in_auto_goods_table(goods_id=goods_id,
                                                      logger=self.lg)
            self.lg.info('@@@ 该商品goods_id原先存在于db中, 此处将其is_delete=1')
            # records whether the goods was already in the db
            tmp_data_s['before'] = True

        else:
            # the goods_id is not in the db yet
            self.lg.info('@@@ 该商品已下架[但未存在于db中], ** 此处将其插入到db中...')
            tmp_data_s['before'] = False

        return tmp_data_s
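Example #17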
    def _get_sku_info_from_db(self, goods_id):
        '''
        Fetch the sku_info from the db.
        :param goods_id:
        :return:
        '''
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        # sku_info = self.json_2_dict(sql_cli._select_table(sql_str=sql_str, params=(str(goods_id),)))
        try:
            raw = sql_cli._select_table(sql_str=al_select_str_2,
                                        params=(str(goods_id), ))[0][0]
            # print(raw)
            sku_info = decode(raw)
        except Exception:
            self.my_lg.error('demjson.decode数据时遇到错误!', exc_info=True)
            return []

        if sku_info == []:
            return ['']
        else:
            return list(
                set([
                    item.get('spec_type', '').replace('|', ';')
                    for item in sku_info
                ]))
Example #18
    def deal_with_data(self, goods_list):
        '''
        Parse and store the data for the related pintuan goods.
        :param goods_list:
        :return:
        '''
        mia = MiaPintuanParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            sql_str = r'select goods_id, miaosha_time, pid from dbo.mia_pintuan where site_id=21'
            db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=sql_str))]
            # print(db_goods_id_list)

            for item in goods_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')

                else:
                    goods_id = str(item.get('goods_id', ''))
                    tmp_url = 'https://www.mia.com/item-' + str(goods_id) + '.html'

                    mia.get_goods_data(goods_id=str(goods_id))
                    goods_data = mia.deal_with_data()

                    if goods_data == {}:  # skip when the returned data is empty
                        pass

                    else:  # otherwise normalize and insert
                        goods_url = goods_data['goods_url']
                        if re.compile(r'://m\.miyabaobei\.hk/').findall(goods_url):  # findall returns a list; test its truthiness
                            goods_url = 'https://www.miyabaobei.hk/item-' + str(goods_id) + '.html'
                        else:
                            goods_url = 'https://www.mia.com/item-' + str(goods_id) + '.html'
                        goods_data['goods_url'] = goods_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['sub_title'] = item.get('sub_title', '')
                        goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = self.get_pintuan_begin_time_and_pintuan_end_time(pintuan_time=goods_data['pintuan_time'])
                        goods_data['pid'] = item.get('pid')

                        # pprint(goods_data)
                        # print(goods_data)
                        _r = mia.insert_into_mia_pintuan_table(data=goods_data, pipeline=my_pipeline)
                        if _r:  # inserted: record the id locally
                            db_goods_id_list.append(goods_id)
                            db_goods_id_list = list(set(db_goods_id_list))

                    sleep(MIA_SPIKE_SLEEP_TIME)  # throttle
        else:
            print('数据库连接失败,此处跳过!')

        try:
            del mia
        except:
            pass
        gc.collect()
Example #19
 def __init__(self):
     self._set_logger()
     self.msg = ''
     self.debugging_api = self._init_debugging_api()
     self._set_func_name_dict()
     self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
     # insert into the goods_id_and_keyword_middle_table join table
     self.add_keyword_id_for_goods_id_sql_str = kw_insert_str_1
Example #20
 def __init__(self, logger=None):
     self._set_sort_type_name()
     self._set_logger(logger)
     self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
     self.update_sql = 'update dbo.sina_weibo set head_img_url=%s, modify_time=%s where id=%s'
     self.phantomjs = MyPhantomjs(executable_path=PHANTOMJS_DRIVER_PATH,
                                  logger=self.my_lg)
     self.id_list = []
     self.update_index = 0
Example #21
    def _get_db_goods_id_list(self) -> (list, None):
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        try:
            _ = list(my_pipeline._select_table(sql_str=mia_select_str_4))
            db_goods_id_list = [item[0] for item in _]
        except Exception:
            return None

        return db_goods_id_list
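
This helper shows up in three flavors across these examples: returning None on failure (here), defaulting to [] (Example #24), and asserting non-None (Example #28). A sketch that makes the failure value an explicit argument, assuming the same _select_table interface (the function name is illustrative):

def get_db_goods_id_list(pipeline, sql_str, default=None):
    # First column of every row, or `default` when the select fails or returns None.
    try:
        return [row[0] for row in pipeline._select_table(sql_str=sql_str)]
    except Exception:
        return default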
Example #22
    def _deal_with_data(self):
        '''
        Parse and store the scraped pintuan goods data.
        :return:
        '''
        zid_list = self._get_pintuan_goods_info()

        zhe_800_pintuan = Zhe800PintuanParse()
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        if my_pipeline.is_connect_success:
            sql_str = r'select goods_id, is_delete from dbo.zhe_800_pintuan where site_id=17'
            db_goods_id_list = [
                item[0]
                for item in list(my_pipeline._select_table(sql_str=sql_str))
            ]
            for item in zid_list:
                if item[0] in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                else:
                    tmp_url = 'https://pina.m.zhe800.com/detail/detail.html?zid=' + str(item[0])
                    goods_id = zhe_800_pintuan.get_goods_id_from_url(tmp_url)

                    zhe_800_pintuan.get_goods_data(goods_id=goods_id)
                    goods_data = zhe_800_pintuan.deal_with_data()

                    if goods_data == {}:  # skip when the returned data is empty
                        pass
                    else:  # otherwise normalize and insert
                        goods_data['goods_id'] = str(item[0])
                        goods_data['spider_url'] = tmp_url
                        goods_data['username'] = '******'
                        goods_data['page'] = str(item[1])
                        goods_data['pintuan_begin_time'], goods_data['pintuan_end_time'] = \
                            self.get_pintuan_begin_time_and_pintuan_end_time(schedule=goods_data.get('schedule', [])[0])

                        # print(goods_data)
                        _r = zhe_800_pintuan.insert_into_zhe_800_pintuan_table(
                            data=goods_data, pipeline=my_pipeline)
                        if _r:  # on a successful insert, record the id locally
                            db_goods_id_list.append(item[0])
                            db_goods_id_list = list(set(db_goods_id_list))

                    sleep(ZHE_800_PINTUAN_SLEEP_TIME)
                    gc.collect()

        try:
            del zhe_800_pintuan
        except:
            pass
        gc.collect()

        return None
Example #23
    def deal_with_data(self, *params):
        '''
        Parse and store the related miaosha (flash-sale) goods data.
        :param params: positional args; params[0] is the item list
        :return:
        '''
        item_list = params[0]
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        if my_pipeline.is_connect_success:
            sql_str = r'select goods_id, miaosha_time, page, goods_url from dbo.jumeiyoupin_xianshimiaosha where site_id=26'
            db_goods_id_list = [item[0] for item in list(my_pipeline._select_table(sql_str=sql_str))]
            # print(db_goods_id_list)

            for item in item_list:
                if item.get('goods_id', '') in db_goods_id_list:
                    print('该goods_id已经存在于数据库中, 此处跳过')
                else:
                    jumei = JuMeiYouPinParse()
                    goods_id = item.get('goods_id', '')
                    goods_type = item.get('type', '')
                    tmp_url = 'https://h5.jumei.com/product/detail?item_id={0}&type={1}'.format(goods_id, goods_type)
                    jumei.get_goods_data(goods_id=[goods_id, goods_type])
                    goods_data = jumei.deal_with_data()

                    if goods_data == {}:
                        pass

                    elif goods_data.get('is_delete', 0) == 1:
                        print('------>>>| 该商品库存为0,已被抢光!')

                    else:   # otherwise normalize and insert
                        goods_data['goods_url'] = tmp_url
                        goods_data['goods_id'] = str(goods_id)
                        goods_data['miaosha_time'] = {
                            'miaosha_begin_time': goods_data['schedule'].get('begin_time', ''),
                            'miaosha_end_time': goods_data['schedule'].get('end_time', ''),
                        }
                        goods_data['miaosha_begin_time'], goods_data['miaosha_end_time'] = self.get_miaosha_begin_time_and_miaosha_end_time(miaosha_time=goods_data['miaosha_time'])
                        goods_data['page'] = item.get('page')

                        # pprint(goods_data)
                        # print(goods_data)
                        jumei.insert_into_jumeiyoupin_xianshimiaosha_table(data=goods_data, pipeline=my_pipeline)
                        sleep(JUMEIYOUPIN_SLEEP_TIME)  # throttle (the phantomjs init is itself slow)

                    try: del jumei
                    except: pass

        else:
            print('数据库连接失败,此处跳过!')

        gc.collect()
Example #24
    def _get_db_goods_id_list(self) -> list:
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()

        _ = my_pipeline._select_table(sql_str=jp_select_str_5)
        if _ is None:
            db_goods_id_list = []
        else:
            db_goods_id_list = [item[0] for item in list(_)]

        return db_goods_id_list
Example #25
    async def _get_db_old_data(self):
        self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            result = list(self.sql_cli._select_table(sql_str=jd_select_str_1))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result
Example #26
    def _just_run(self):
        while True:
            # fetch the keywords
            sql_str = 'select id, keyword from dbo.goods_keywords where is_delete=0'
            # fetch all goods_id values already present in the goods db
            sql_str_2 = 'select GoodsID from dbo.GoodsInfoAutoGet'

            try:
                result = list(self.my_pipeline._select_table(sql_str=sql_str))
                self.my_lg.info('正在获取db中已存在的goods_id...')
                result_2 = list(self.my_pipeline._select_table(sql_str=sql_str_2))
                self.my_lg.info('db中已存在的goods_id获取成功!')

            except TypeError:
                self.my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
                result = None
                result_2 = None

            if result is not None and result_2 is not None:
                self.my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
                self.my_lg.info(str(result))
                self.my_lg.info('--------------------------------------------------------')

                self.my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
                self.add_goods_index = 0           # counts how many goods have been added
                self.db_existed_goods_id_list = [item[0] for item in result_2]
                # release the resource promptly
                try: del result_2
                except: pass
                gc.collect()

                for api_type, type_value in self.debugging_api.items():  # iterate over the e-commerce categories to crawl
                    if type_value is False:
                        self.my_lg.info('api为False, 跳过!')
                        continue

                    for item in result:     # 遍历每个关键字
                        self.my_lg.info('正在处理id为{0}, 关键字为 {1} ...'.format(item[0], item[1]))
                        if self.add_goods_index % 20 == 0:
                            self.my_lg.info('my_pipeline客户端重连中...')
                            try: del self.my_pipeline
                            except: pass
                            self.my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
                            self.my_lg.info('my_pipeline客户端重连完毕!')

                        goods_id_list = self._get_keywords_goods_id_list(type=api_type, keyword=item)
                        self.my_lg.info('关键字为{0}, 获取到的goods_id_list 如下: {1}'.format(item[1], str(goods_id_list)))
                        # process the goods_id_list
                        self._deal_with_goods_id_list(
                            type=api_type,
                            goods_id_list=goods_id_list,
                            keyword_id=item[0]
                        )
                        sleep(5)
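Example #27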
 def __init__(self):
     AsyncCrawler.__init__(
         self,
         ip_pool_type=IP_POOL_TYPE,
         log_print=True,
         log_save_path=MY_SPIDER_LOGS_PATH + '/cp/yx_goods_monitor/',
     )
     self.req_num_retries = 5
     self.concurrency = 100
     self.concurrent_type = 1
     self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
     self.init_sql_str()
Example #28
    def _get_db_goods_id_list(self) -> list:
        my_pipeline = SqlServerMyPageInfoSaveItemPipeline()
        _ = my_pipeline._select_table(sql_str=pd_select_str_3)
        assert _ is not None, 'db_goods_id_list为None!'
        db_goods_id_list = [item[0] for item in list(_)]

        try:
            del my_pipeline
        except:
            pass

        return db_goods_id_list
Example #29
 def set_sql_cli(self):
     """
     设置连接类型
     :return:
     """
     if self.db_conn_type == 1:
         # recommended
         self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
     elif self.db_conn_type == 2:
         # use sqlalchemy to manage the db connection pool
         self.sql_cli = SqlPools()
     else:
         raise ValueError('db_conn_type 值异常!')
Example #30
    async def _get_db_old_data(self) -> (None, list):
        self.tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        result = None
        try:
            self.tmp_sql_server._delete_table(sql_str=jp_delete_str_4, params=None)
            await async_sleep(5)
            result = list(self.tmp_sql_server._select_table(sql_str=jp_select_str_4))
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result