Example #1
0
    async def _update_one_goods_info_in_db(self, db_goods_info_obj, index,
                                           before_goods_data,
                                           end_goods_data) -> (list, tuple):
        """
        Write the already-crawled data of a single goods record to the DB.

        :param db_goods_info_obj: DB-side goods object; its ``goods_id`` is
            logged and returned
        :param index: batch position, forwarded to ``_get_new_db_conn``
        :param before_goods_data: dict-like object; only its ``'is_delete'``
            entry is read here
        :param end_goods_data: data handed to ``get_goods_info_change_data``
            unless it equals ``{}``
        :return: ``[goods_id, res]``; ``res`` stays ``False`` unless the update
            helper returns otherwise or the goods is flagged as delisted
        """
        outcome = False

        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=self.db_conn_type,
            remainder=25,
        )
        if not self.sql_cli.is_connect_success:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=8, loop=self.loop)
        else:
            banner = '*' * 20
            self.lg.info(banner + ' updating goods_id: {}, index: {} ...'.format(
                db_goods_info_obj.goods_id,
                index,
            ))
            # Capture the delisted flag first so an empty result for a delisted
            # goods does not trigger the back-off sleep below.
            already_delisted = before_goods_data.get('is_delete', 0) == 1
            if end_goods_data == {}:
                # Empty data came back.
                if already_delisted:
                    # A goods found to be delisted still counts as success.
                    outcome = True
                else:
                    self.lg.info('goods_id: {}, 阻塞休眠7s中...'.format(
                        db_goods_info_obj.goods_id, ))
                    # Kept non-blocking: the original author noted that a
                    # blocking sleep here hangs the machine.
                    await async_sleep(delay=7., loop=self.loop)
            else:
                change_data = get_goods_info_change_data(
                    # e.g. 'tm', 'tb'
                    target_short_name=self.goods_spider_type,
                    logger=self.lg,
                    data=end_goods_data,
                    db_goods_info_obj=db_goods_info_obj,
                    sql_cli=self.sql_cli,
                )
                outcome = to_right_and_update_data_by_goods_type(
                    goods_type=self.goods_spider_type,
                    data=change_data,
                    pipeline=self.sql_cli,
                    logger=self.lg,
                )

        collect()

        return [db_goods_info_obj.goods_id, outcome]
Example #2
0
    async def _update_one_goods_info(self, db_goods_info_obj, index):
        """
        Refresh a single JD goods record and advance ``self.goods_index``.

        :param db_goods_info_obj: DB-side goods object (``site_id`` and
            ``goods_id`` are read)
        :param index: position of this goods in the current batch
        :return: tuple ``(goods_id, index + 1)``
        """
        await self._get_new_jd_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
        )
        if not self.sql_cli.is_connect_success:
            # DB connection could not be established.
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        else:
            self.lg.info(
                '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                format(db_goods_info_obj.goods_id, index))
            tmp_item = await self._get_tmp_item(
                site_id=db_goods_info_obj.site_id,
                goods_id=db_goods_info_obj.goods_id,
            )
            raw_data = self.jd.get_goods_data(goods_id=tmp_item)
            # A missing 'is_delete' key defaults to 1, i.e. is handled like a
            # delisted goods.
            if raw_data.get('is_delete', 1) == 1:
                self.lg.info('该商品已下架...')
                self.sql_cli._update_table_2(
                    sql_str=jd_update_str_2,
                    params=(str(get_shanghai_time()), tmp_item[1]),
                    logger=self.lg,
                )
                await async_sleep(1.2)
                advanced = index + 1
                self.goods_index = advanced

                return db_goods_info_obj.goods_id, advanced

            parsed = self.jd.deal_with_data(goods_id=tmp_item)
            # An empty dict means nothing was parsed; skip the DB write then.
            if parsed != {}:
                change_data = get_goods_info_change_data(
                    target_short_name='jd',
                    logger=self.lg,
                    data=parsed,
                    db_goods_info_obj=db_goods_info_obj,
                )
                self.jd.to_right_and_update_data(change_data,
                                                 pipeline=self.sql_cli)

        next_index = index + 1
        self.goods_index = next_index
        collect()
        # Throttle so the proxy usage is less likely to be detected.
        await async_sleep(1.2)

        return db_goods_info_obj.goods_id, next_index
Example #3
0
    async def _update_one_goods_info_by_celery(self, db_goods_info_obj, index,
                                               before_goods_data,
                                               end_goods_data):
        """
        Write the already-fetched data of a single 'tm' goods record to the DB.

        :param db_goods_info_obj: DB-side goods object; its ``goods_id`` is
            logged and returned
        :param index: batch position, forwarded to ``_get_new_db_conn``
        :param before_goods_data: dict-like object; only its ``'is_delete'``
            entry is read here
        :param end_goods_data: data handed to ``get_goods_info_change_data``
            unless it equals ``{}``
        :return: ``[goods_id, res]``
        """
        outcome = False

        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            remainder=50,
        )
        if not self.sql_cli.is_connect_success:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=5, loop=self.loop)
        else:
            self.lg.info('### updating goods_id: {}, index: {} ...'.format(
                db_goods_info_obj.goods_id,
                index,
            ))
            # Capture the delisted flag first so an empty result for a delisted
            # goods does not trigger the back-off sleep below.
            already_delisted = before_goods_data.get('is_delete', 0) == 1
            if end_goods_data == {}:
                # Empty data came back.
                if already_delisted:
                    # A goods found to be delisted still counts as success.
                    outcome = True
                else:
                    self.lg.info('goods_id: {}, 阻塞休眠7s中...'.format(
                        db_goods_info_obj.goods_id, ))
                    # Kept non-blocking: the original author noted that a
                    # blocking sleep here hangs the machine.
                    await async_sleep(delay=7., loop=self.loop)
            else:
                change_data = get_goods_info_change_data(
                    target_short_name='tm',
                    logger=self.lg,
                    data=end_goods_data,
                    db_goods_info_obj=db_goods_info_obj,
                )
                outcome = to_right_and_update_tm_data(
                    data=change_data,
                    pipeline=self.sql_cli,
                    logger=self.lg,
                )

        await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)
        collect()

        return [db_goods_info_obj.goods_id, outcome]
Example #4
0
    async def _update_one_goods_info(self, db_goods_info_obj, index):
        """
        Re-crawl a single 'tb' goods record, store it and advance
        ``self.goods_index``.

        :param db_goods_info_obj: DB-side goods object; its ``goods_id`` drives
            the crawl and is returned
        :param index: position of this goods in the current batch
        :return: ``[goods_id, res]``
        """
        outcome = False
        await self._get_new_tb_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=2,
            remainder=50,
        )
        if not self.sql_cli.is_connect_success:
            # DB connection could not be established.
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=10, loop=self.loop)
        else:
            self.lg.info(
                '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' %
                (db_goods_info_obj.goods_id, str(index)))
            raw_data = self.taobao.get_goods_data(
                goods_id=db_goods_info_obj.goods_id)
            # Capture the delisted flag first so an empty parse result for a
            # delisted goods does not trigger the back-off sleep below.
            already_delisted = raw_data.get('is_delete', 0) == 1
            parsed = self.taobao.deal_with_data(
                goods_id=db_goods_info_obj.goods_id)
            if parsed == {}:
                if already_delisted:
                    # A goods found to be delisted still counts as success.
                    outcome = True
                else:
                    self.lg.info('------>>>| 休眠8s中...')
                    await async_sleep(delay=8, loop=self.loop)
            else:
                change_data = get_goods_info_change_data(
                    target_short_name='tb',
                    logger=self.lg,
                    data=parsed,
                    db_goods_info_obj=db_goods_info_obj,
                )
                outcome = to_right_and_update_tb_data(
                    data=change_data,
                    pipeline=self.sql_cli,
                    logger=self.lg,
                )

        self.goods_index = index + 1
        collect()
        # Must not run too frequently; stagger away from user requests where
        # possible. Original note: on overseas servers this delay can be
        # shortened, even to 0s.
        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        return [db_goods_info_obj.goods_id, outcome]
Example #5
0
    async def _update_one_goods_info(self, db_goods_info_obj, index):
        """
        Re-crawl a single 'z8' goods record, store it and advance
        ``self.goods_index``.

        :param db_goods_info_obj: DB-side goods object; its ``goods_id`` drives
            the crawl and is returned
        :param index: position of this goods in the current batch
        :return: ``[goods_id, res]`` where ``res`` is whatever
            ``self.zhe_800.to_right_and_update_data`` returned, or ``False``
            when no update was attempted
        """
        outcome = False
        await self._get_new_ali_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
        )
        if not self.sql_cli.is_connect_success:
            # DB connection could not be established.
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
        else:
            self.lg.info(
                '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.format(
                    db_goods_info_obj.goods_id, index))
            self.zhe_800.get_goods_data(goods_id=db_goods_info_obj.goods_id)
            parsed = self.zhe_800.deal_with_data()
            # An empty dict means nothing was parsed; skip the DB write then.
            if parsed != {}:
                change_data = get_goods_info_change_data(
                    target_short_name='z8',
                    logger=self.lg,
                    data=parsed,
                    db_goods_info_obj=db_goods_info_obj,
                )
                outcome = self.zhe_800.to_right_and_update_data(
                    data=change_data,
                    pipeline=self.sql_cli,
                )

        self.goods_index = index + 1
        collect()
        await async_sleep(2.)

        return [db_goods_info_obj.goods_id, outcome]
def run_forever():
    """
    Loop forever, re-crawling every goods row selected by ``mia_select_str_5``
    and writing the refreshed data back to the database.

    Each pass builds a new logger and DB pipeline, walks the selected rows and
    then sleeps (5.5 hours when the Shanghai-time hour is 0, otherwise
    5 minutes) before starting over. Never returns.
    """
    while True:
        # The logger must be created inside the loop rather than as a global;
        # otherwise (per the original note) everything keeps being written to
        # the same file.
        my_lg = set_logger(
            logger_name=get_uuid1(),
            log_file_name=MY_SPIDER_LOGS_PATH + '/蜜芽/实时更新/' +
            str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        # Real-time data update.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=mia_select_str_5))
        except TypeError:
            # list() fails with TypeError when the select yields a non-iterable,
            # which the original author attributes to a failed DB connection.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            mia = MiaParse()
            # enumerate() advances the index on every path, including the
            # `continue` taken for delisted goods; the previous manual counter
            # was skipped there, leaving the index stale for the parser
            # recycle and the `_block_get_new_db_conn` call below.
            for index, item in enumerate(result, start=1):
                goods_id = item[1]
                if index % 5 == 0:
                    # Recycle the parser every 5 items to release memory.
                    del mia
                    mia = MiaParse()
                    collect()

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    mia.get_goods_data(goods_id=goods_id)
                    data = mia.deal_with_data()
                    db_goods_info_obj = MIADbGoodsInfoObj(item=item,
                                                          logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Delisted goods are handled separately.
                            my_lg.info('@@@ 该商品已下架...')
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        data = get_goods_info_change_data(
                            target_short_name='mia',
                            logger=my_lg,
                            data=data,
                            db_goods_info_obj=db_goods_info_obj,
                        )
                        mia._to_right_and_update_data(data, pipeline=sql_cli)
                    else:
                        # Empty data came back; back off before the next item.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:
                    # DB connection could not be established.
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(5 * 60)
        # my_lg is always bound at this point, so a plain del is sufficient.
        del my_lg
        collect()
Example #7
0
def run_forever():
    """
    Loop forever, re-crawling every goods row selected by ``kl_select_str_1``
    and writing the refreshed data back to the database.

    Each pass builds a new logger and DB pipeline, walks the selected rows and
    then sleeps (5.5 hours when the Shanghai-time hour is 0, otherwise
    60 seconds) before starting over. Never returns.
    """
    while True:
        # The logger must be created inside the loop rather than as a global;
        # otherwise (per the original note) everything keeps being written to
        # the same file.
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/网易考拉/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)
        # Real-time data update.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=kl_select_str_1))
        except TypeError:
            # list() fails with TypeError when the select yields a non-iterable,
            # which the original author attributes to a failed DB connection.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            # Memory note from the original author: the parser is created here
            # and periodically deleted/recreated to keep memory usage down.
            kaola = KaoLaParse(logger=my_lg, is_real_times_update_call=True)
            # enumerate() advances the index on every path; previously the
            # second delisted branch hit `continue` without incrementing it.
            for index, item in enumerate(result, start=1):
                goods_id = item[1]
                if index % 5 == 0:
                    # Recycle the parser every 5 items to release memory.
                    del kaola
                    kaola = KaoLaParse(logger=my_lg,
                                       is_real_times_update_call=True)
                    collect()

                sql_cli = _block_get_new_db_conn(
                    db_obj=sql_cli,
                    index=index,
                    logger=my_lg,
                    remainder=10,
                )
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(goods_id), str(index)))
                    db_goods_info_obj = KLDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    data = kaola._get_goods_data(goods_id=goods_id)
                    if data.get('is_delete', 0) == 1:
                        # Delisted goods are handled separately: only the
                        # shelf/delete timestamps are refreshed and stored.
                        data['goods_id'] = goods_id
                        data['shelf_time'], data['delete_time'] = \
                            get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=db_goods_info_obj.is_delete,
                                shelf_time=db_goods_info_obj.shelf_time,
                                delete_time=db_goods_info_obj.delete_time,
                            )

                        try:
                            kaola.to_right_and_update_data(data,
                                                           pipeline=sql_cli)
                        except Exception:
                            # A message is mandatory: error(exc_info=True)
                            # alone raises TypeError on a stdlib-style logger.
                            my_lg.error('遇到错误:', exc_info=True)

                        sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                        collect()
                        continue

                    data = kaola._deal_with_data()
                    if data != {}:
                        if data.get('is_delete', 0) == 1:
                            _handle_goods_shelves_in_auto_goods_table(
                                goods_id=goods_id,
                                logger=my_lg,
                                sql_cli=sql_cli,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        data = get_goods_info_change_data(
                            target_short_name='kl',
                            logger=my_lg,
                            data=data,
                            db_goods_info_obj=db_goods_info_obj,
                        )
                        kaola.to_right_and_update_data(data, pipeline=sql_cli)

                    else:
                        # Empty data came back; back off before the next item.
                        my_lg.info('------>>>| 休眠3s中...')
                        sleep(3.)

                else:
                    # DB connection could not be established.
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
Example #8
0
    async def _update_one_goods_info(self, db_goods_info_obj, index):
        """
        Re-crawl a single 'tm' goods record with a throwaway ``TmallParse``
        instance and store the result.

        :param db_goods_info_obj: DB-side goods object (``site_id`` and
            ``goods_id`` are read)
        :param index: batch position, forwarded to ``_get_new_db_conn``
        :return: ``[goods_id, res]``
        """
        outcome = False

        tmall_parser = TmallParse(logger=self.lg)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            remainder=50,
        )
        if not self.sql_cli.is_connect_success:
            # DB connection could not be established.
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=5, loop=self.loop)
        else:
            self.lg.info(
                '------>>>| 正在更新的goods_id为({}) | --------->>>@ 索引值为({})'.
                format(db_goods_info_obj.goods_id, index))
            tmp_item = self._get_tmp_item(
                site_id=db_goods_info_obj.site_id,
                goods_id=db_goods_info_obj.goods_id,
            )

            # Runs in blocking mode; the original author left a non-blocking
            # variant (via unblock_func) disabled.
            raw_data = tmall_parser.get_goods_data(goods_id=tmp_item)
            # Capture the delisted flag first so an empty parse result for a
            # delisted goods does not trigger the back-off sleep below.
            already_delisted = raw_data.get('is_delete', 0) == 1
            parsed = tmall_parser.deal_with_data()
            if parsed == {}:
                if already_delisted:
                    # A goods found to be delisted still counts as success.
                    outcome = True
                else:
                    self.lg.info('------>>>| 阻塞休眠7s中...')
                    # Kept non-blocking: the original author noted that a
                    # blocking sleep here hangs the machine.
                    await async_sleep(delay=7., loop=self.loop)
            else:
                change_data = get_goods_info_change_data(
                    target_short_name='tm',
                    logger=self.lg,
                    data=parsed,
                    db_goods_info_obj=db_goods_info_obj,
                )
                outcome = to_right_and_update_tm_data(
                    data=change_data,
                    pipeline=self.sql_cli,
                    logger=self.lg,
                )

        # The parser is always bound here, so it can be dropped directly.
        del tmall_parser
        collect()
        await async_sleep(TMALL_REAL_TIMES_SLEEP_TIME)

        return [db_goods_info_obj.goods_id, outcome]
Example #9
0
def run_forever():
    """
    Loop forever, re-crawling every goods row selected by ``yx_select_str_1``
    and writing the refreshed data back to the database.

    Each pass builds a new logger and DB pipeline, walks the selected rows and
    then sleeps (5.5 hours when the Shanghai-time hour is 0, otherwise
    60 seconds) before starting over. Never returns.
    """
    while True:
        # The logger must be created inside the loop rather than as a global;
        # otherwise (per the original note) everything keeps being written to
        # the same file.
        my_lg = set_logger(
            log_file_name=MY_SPIDER_LOGS_PATH + '/网易严选/实时更新/' +
            str(get_shanghai_time())[0:10] + '.txt',
            console_log_level=INFO,
            file_log_level=ERROR,
        )

        # Real-time data update.
        sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = list(sql_cli._select_table(sql_str=yx_select_str_1))
        except TypeError:
            # list() fails with TypeError when the select yields a non-iterable,
            # which the original author attributes to a failed DB connection.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            _block_print_db_old_data(result=result, logger=my_lg)
            # Memory note from the original author: the parser is created here
            # and periodically deleted/recreated to keep memory usage down.
            yanxuan = YanXuanParse(logger=my_lg)
            # enumerate() advances the index on every path, including the
            # `continue` taken for delisted goods; the previous manual counter
            # was skipped there, leaving the index stale.
            for index, item in enumerate(result, start=1):
                if index % 5 == 0:
                    # Recycle the parser every 5 items to release memory.
                    del yanxuan
                    yanxuan = YanXuanParse(logger=my_lg)
                    collect()

                sql_cli = _block_get_new_db_conn(db_obj=sql_cli,
                                                 index=index,
                                                 logger=my_lg,
                                                 remainder=10)
                if sql_cli.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (str(item[1]), str(index)))
                    yanxuan._get_goods_data(goods_id=item[1])

                    data = yanxuan._deal_with_data()
                    db_goods_info_obj = YXDbGoodsInfoObj(item=item,
                                                         logger=my_lg)
                    if data != {}:
                        if data.get('is_delete') == 1:
                            # Delisted goods are handled separately.
                            my_lg.info('@@@ 该商品已下架...')
                            sql_cli._update_table_2(
                                sql_str=yx_update_str_2,
                                params=(db_goods_info_obj.goods_id, ),
                                logger=my_lg,
                            )
                            sleep(TMALL_REAL_TIMES_SLEEP_TIME)
                            continue

                        data = get_goods_info_change_data(
                            target_short_name='yx',
                            logger=my_lg,
                            data=data,
                            db_goods_info_obj=db_goods_info_obj,
                        )
                        yanxuan.to_right_and_update_data(data,
                                                         pipeline=sql_cli)
                    else:
                        # Empty data came back; back off before the next item.
                        my_lg.info('------>>>| 休眠8s中...')
                        sleep(8)

                else:
                    # DB connection could not be established.
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(5)

                collect()
                sleep(TMALL_REAL_TIMES_SLEEP_TIME)

            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # No updates after midnight: pause for 5.5 hours.
            sleep(60 * 60 * 5.5)
        else:
            sleep(60)
        collect()
    async def _update_one_goods_info(self, db_goods_info_obj, index) -> list:
        """
        Re-crawl a single 'al' (1688) goods record and store the result.

        :param db_goods_info_obj: DB-side goods object; ``goods_id``,
            ``is_delete``, ``shelf_time`` and ``delete_time`` are read
        :param index: position of this goods in the current batch
        :return: ``[goods_id, res]`` where ``res`` indicates success
        """
        res = False
        await self._get_new_ali_obj(index=index)
        self.sql_cli = await _get_new_db_conn(db_obj=self.sql_cli,
                                              index=index,
                                              logger=self.lg)
        if self.sql_cli.is_connect_success:
            self.lg.info(
                '------>>>| 正在更新的goods_id为({0}) | --------->>>@ 索引值为({1})'.
                format(db_goods_info_obj.goods_id, index))
            data = self.ali_1688.get_ali_1688_data(
                goods_id=db_goods_info_obj.goods_id)
            if isinstance(data, int):
                # An int return is special-cased (the original note mentions
                # the code 4041): skip this goods without updating.
                self.goods_index += 1
                return [db_goods_info_obj.goods_id, res]

            if data.get('is_delete') == 1:
                # Goods that were already delisted when originally inserted
                # are handled separately: only the shelf/delete timestamps are
                # refreshed and stored.
                data['goods_id'] = db_goods_info_obj.goods_id
                data['shelf_time'], data['delete_time'] = \
                    get_shelf_time_and_delete_time(
                        tmp_data=data,
                        is_delete=db_goods_info_obj.is_delete,
                        shelf_time=db_goods_info_obj.shelf_time,
                        delete_time=db_goods_info_obj.delete_time,
                    )
                try:
                    self.ali_1688.to_right_and_update_data(
                        data, pipeline=self.sql_cli)
                except Exception:
                    # A message is mandatory: error(exc_info=True) alone
                    # raises TypeError on a stdlib-style logger, which would
                    # escape from this handler.
                    self.lg.error('遇到错误:', exc_info=True)

                await async_sleep(1.5)
                self.goods_index += 1
                res = True

                return [db_goods_info_obj.goods_id, res]

            data = self.ali_1688.deal_with_data()
            # An empty dict means nothing was parsed; skip the DB write then.
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='al',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,
                )

                res = self.ali_1688.to_right_and_update_data(
                    data, pipeline=self.sql_cli)
                await async_sleep(.3)

        else:
            # DB connection could not be established.
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')

        index += 1
        self.goods_index = index
        collect()
        # Throttle so the proxy usage is less likely to be detected.
        await async_sleep(2.)

        return [db_goods_info_obj.goods_id, res]