Ejemplo n.º 1
0
 def set_sql_cli(self):
     """
     Bind a freshly created db client to ``self.sql_cli``.

     The client class is selected by ``self.db_conn_type``:
     1 -> SqlServerMyPageInfoSaveItemPipeline (flagged as the recommended one),
     2 -> SqlPools (connection pool managed through sqlalchemy).

     :return: None
     :raises ValueError: when ``self.db_conn_type`` is neither 1 nor 2
     """
     conn_type = self.db_conn_type
     # Reject unknown values up front so the happy path below stays flat.
     if conn_type not in (1, 2):
         raise ValueError('db_conn_type 值异常!')

     if conn_type == 1:
         self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
     else:
         self.sql_cli = SqlPools()
Ejemplo n.º 2
0
    async def _get_db_old_data(self) -> (list, None):
        """
        Fetch the db records that are waiting to be updated.

        A new db client is first bound to ``self.sql_cli`` according to
        ``self.db_conn_type`` (1: SqlServerMyPageInfoSaveItemPipeline,
        2: SqlPools).  The records are then read from the source selected by
        ``self.db_res_from`` (0: direct sql query through ``self.sql_cli``,
        1: the server at ``self.server_ip``, 2: the redis helper).

        Every exception raised while reading -- including the ValueError for
        an unknown ``self.db_res_from`` -- is logged and swallowed, which
        leaves the result as None.

        :return: whatever the selected source returned, or None on failure
        :raises ValueError: when ``self.db_conn_type`` is neither 1 nor 2
        """
        conn_type = self.db_conn_type
        if conn_type not in (1, 2):
            raise ValueError('db_conn_type 值异常!')
        if conn_type == 1:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        else:
            # connection pool managed through sqlalchemy
            self.sql_cli = SqlPools()

        result = None
        try:
            source = self.db_res_from
            if source == 0:
                result = self.sql_cli._select_table(sql_str=tb_select_str_3)
            elif source == 1:
                result = await get_waited_2_update_db_data_from_server(
                    server_ip=self.server_ip,
                    _type='tb',
                    child_type=0)
            elif source == 2:
                # Ask for a batch well above 100 records so that later records
                # still get updated when the leading ones fail frequently.
                result = get_waited_2_update_db_data_from_redis_server(
                    spider_name='tb0',
                    logger=self.lg,
                    slice_num=800)
            else:
                raise ValueError('self.db_res_from value异常!')
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

        await _print_db_old_data(logger=self.lg, result=result)

        return result
Ejemplo n.º 3
0
    async def _get_db_old_data(self) -> (list, None):
        """
        Fetch the db records that are waiting to be updated.

        Binds a new SqlPools client (connection pool managed through
        sqlalchemy) to ``self.sql_cli`` and then asks the server at
        ``self.server_ip`` for the pending 'tb' records.  Any exception raised
        by that request is logged and swallowed.

        :return: the value returned by the server helper, or None on failure
        """
        self.sql_cli = SqlPools()
        try:
            result = await get_waited_2_update_db_data_from_server(
                server_ip=self.server_ip, _type='tb', child_type=0)
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)
            result = None

        await _print_db_old_data(logger=self.lg, result=result)

        return result
Ejemplo n.º 4
0
def run_forever():
    """
    Endlessly refresh the stored Taobao goods, one full pass per loop turn.

    Each pass builds a logger, reads the goods rows through
    ``SqlPools.select_taobao_all_goods_id`` and, for every row, re-crawls the
    goods with ``TaoBaoLoginAndParse`` and writes it back through
    ``to_right_and_update_data``.  Row fields are read positionally:
    ``item[0]`` goods_id, ``item[1]`` is_delete, ``item[2]`` shelf/down time,
    ``item[3]`` old price, ``item[4]`` old taobao price.

    After a pass the function sleeps (5.5 hours when the Shanghai hour is 0,
    otherwise 5 seconds) and then calls ``restart_program()``.
    """
    while True:
        # The logger is rebuilt on every pass: a module-level logger would
        # keep writing into one and the same dated file forever.
        log_file_name = (MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' +
                         str(get_shanghai_time())[0:10] + '.txt')
        my_lg = set_logger(log_file_name=log_file_name,
                           console_log_level=INFO,
                           file_log_level=ERROR)

        # connection pool managed through sqlalchemy
        tmp_sql_server = SqlPools()
        try:
            result = tmp_sql_server.select_taobao_all_goods_id()
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            for index, item in enumerate(result, start=1):
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:
                    # Reconnect every 50 rows so that a single long-lived
                    # connection does not end up unresponsive.
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    my_lg.info('与数据库的新连接成功建立...')

                if not tmp_sql_server.is_connect_success:
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                else:
                    goods_id = item[0]
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (goods_id, str(index)))
                    data = taobao.get_goods_data(goods_id)

                    if data.get('is_delete') == 1:
                        # Goods that were already off the shelf when first
                        # inserted are written back without further parsing.
                        data['goods_id'] = goods_id
                        shelf_time, delete_time = \
                            get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        data['my_shelf_and_down_time'] = shelf_time
                        data['delete_time'] = delete_time
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                        # avoid hitting the server too frequently
                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                        gc.collect()
                        continue

                    data = taobao.deal_with_data(goods_id=goods_id)
                    if data == {}:
                        my_lg.info('------>>>| 休眠5s中...')
                        sleep(5)
                    else:
                        data['goods_id'] = goods_id
                        shelf_time, delete_time = \
                            get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        data['my_shelf_and_down_time'] = shelf_time
                        data['delete_time'] = delete_time

                        is_price_change, price_change_info = \
                            _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])
                        data['_is_price_change'] = is_price_change
                        data['_price_change_info'] = price_change_info

                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                gc.collect()
                # Throttle between rows so the crawl does not collide with
                # user requests; may be shortened (even to 0s) on servers
                # located abroad.
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))

        # Long pause when the Shanghai hour is 0, short pause otherwise.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
Ejemplo n.º 5
0
def run_forever():
    """
    Endlessly copy Taobao goods of the old table into the new table.

    Each pass reads the goods rows already present in the new table
    (``SqlPools.select_taobao_all_goods_id``, ``item[0]`` is the goods_id) and
    the rows of the old table
    (``SqlServerMyPageInfoSaveItemPipeline.select_old_table_all_goods_id``,
    ``item[0]`` is the goods url and ``item[1]`` the main_goods_id).  Every
    old row whose goods_id is not yet known is crawled with
    ``TaoBaoLoginAndParse`` and inserted through
    ``old_taobao_goods_insert_into_new_table``.

    After a pass the function sleeps 5.5 hours when the Shanghai hour is 0,
    otherwise 5 seconds, and starts over.
    """
    while True:
        # connection pool managed through sqlalchemy
        tmp_sql_server = SqlPools()
        tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
        try:
            result = tmp_sql_server.select_taobao_all_goods_id()
            result_2 = list(tmp_sql_server_2.select_old_table_all_goods_id())
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result_2)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1

            # A set gives O(1) membership tests; the original list made the
            # duplicate check O(len(result)) for every old-table row.
            known_goods_ids = {item[0] for item in result}
            for item in result_2:
                taobao = TaoBaoLoginAndParse()
                # NOTE(review): skipped rows (the two `continue` below) do not
                # advance `index`, so while index is a multiple of 50 every
                # skipped row reconnects again -- confirm whether intended.
                if index % 50 == 0:
                    # Reconnect every 50 rows so that a single long-lived
                    # connection does not end up unresponsive.
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server_2 = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()
                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    goods_id = taobao.get_goods_id_from_url(item[0])
                    if goods_id == '':
                        print('@@@ 原商品的地址为: ', item[0])
                        continue
                    if goods_id in known_goods_ids:
                        print('该goods_id已经存在于数据库中, 此处跳过!')
                        continue

                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (goods_id, index))
                    goods_url = 'https://item.taobao.com/item.htm?id=' + str(
                        goods_id)
                    tt = taobao.get_goods_data(goods_id)
                    if tt.get('is_delete') == 1:
                        # Goods that are off the shelf still get inserted.
                        tt['goods_id'] = goods_id
                        tt['goods_url'] = goods_url
                        tt['username'] = '******'
                        tt['main_goods_id'] = item[1]
                        taobao.old_taobao_goods_insert_into_new_table(
                            data=tt, pipeline=tmp_sql_server_2)

                        index += 1
                        gc.collect()
                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
                        continue

                    data = taobao.deal_with_data(goods_id=goods_id)
                    if data != {}:
                        data['goods_id'] = goods_id
                        data['goods_url'] = goods_url
                        data['username'] = '******'
                        data['main_goods_id'] = item[1]
                        taobao.old_taobao_goods_insert_into_new_table(
                            data, pipeline=tmp_sql_server_2)
                else:
                    print('数据库连接失败,数据库可能关闭或者维护中')

                index += 1
                gc.collect()
                # Throttle between rows so the crawl does not collide with
                # user requests; may be shortened (even to 0s) on servers
                # located abroad.
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        # Long pause when the Shanghai hour is 0, short pause otherwise.
        if get_shanghai_time().hour == 0:
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Ejemplo n.º 6
0
class TBUpdater(AsyncCrawler):
    def __init__(self, *params, **kwargs):
        """
        Initialise the crawler base class and the updater settings.

        :param params: positional arguments forwarded to AsyncCrawler.__init__
        :param kwargs: keyword arguments forwarded to AsyncCrawler.__init__
        """
        log_save_path = MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/'
        AsyncCrawler.__init__(
            self,
            *params,
            log_print=True,
            log_save_path=log_save_path,
            **kwargs)

        # db client; created later by _get_db_old_data
        self.sql_cli = None
        # 1 SqlServerMyPageInfoSaveItemPipeline | 2 SqlPools
        self.db_conn_type = 1
        # where the pending records come from:
        # 0 sqlserver | 1 new_my_server | 2 redis
        self.db_res_from = 2

        self.goods_index = 1
        # number of goods handled concurrently per slice
        self.concurrency = 100
        self.concurrent_type = CONCURRENT_TYPE

        # The armv7l/debian box talks to a local server, everything else to
        # the remote one.
        on_arm_debian = 'armv7l-with-debian' in platform.platform()
        self.server_ip = 'http://0.0.0.0:80' if on_arm_debian \
            else 'http://118.31.39.97'

    async def _update_db(self):
        """
        Run the real-time update loop forever.

        Each turn fetches the pending records, walks them slice by slice
        (``self.concurrency`` records per slice) through ``_get_one_res`` and
        ``_except_sleep``, then pauses: half an hour when the Shanghai hour is
        0, otherwise 5 seconds.

        :return: never returns normally
        """
        while True:
            # A per-day logger is deliberately not created here: long runs hit
            # "OSError: [Errno 24] Too many open files" with it.
            result = await self._get_db_old_data()
            if result is not None:
                self.goods_index = 1
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result,
                    step=self.concurrency)
                self.taobao = TaoBaoLoginAndParse(
                    logger=self.lg,
                    is_real_times_update_call=True)
                index = 1
                while True:
                    # The slicer is left by catching AssertionError, which is
                    # how the loop detects that no slice remains.
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        break

                    one_res, index = await self._get_one_res(
                        slice_params_list=slice_params_list,
                        index=index)
                    await self._except_sleep(res=one_res)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            pause = 60 * 60 * .5 if get_shanghai_time().hour == 0 else 5.
            await async_sleep(pause)

            # `result` is always bound at this point, so dropping the
            # reference cannot fail.
            del result
            collect()

    async def _get_db_old_data(self) -> (list, None):
        """
        Fetch the db records that are waiting to be updated.

        A new db client is first bound to ``self.sql_cli`` according to
        ``self.db_conn_type`` (1: SqlServerMyPageInfoSaveItemPipeline,
        2: SqlPools).  The records are then read from the source selected by
        ``self.db_res_from`` (0: direct sql query through ``self.sql_cli``,
        1: the server at ``self.server_ip``, 2: the redis helper).

        Every exception raised while reading -- including the ValueError for
        an unknown ``self.db_res_from`` -- is logged and swallowed, which
        leaves the result as None.

        :return: whatever the selected source returned, or None on failure
        :raises ValueError: when ``self.db_conn_type`` is neither 1 nor 2
        """
        conn_type = self.db_conn_type
        if conn_type not in (1, 2):
            raise ValueError('db_conn_type 值异常!')
        if conn_type == 1:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        else:
            # connection pool managed through sqlalchemy
            self.sql_cli = SqlPools()

        result = None
        try:
            source = self.db_res_from
            if source == 0:
                result = self.sql_cli._select_table(sql_str=tb_select_str_3)
            elif source == 1:
                result = await get_waited_2_update_db_data_from_server(
                    server_ip=self.server_ip,
                    _type='tb',
                    child_type=0)
            elif source == 2:
                # Ask for a batch well above 100 records so that later records
                # still get updated when the leading ones fail frequently.
                result = get_waited_2_update_db_data_from_redis_server(
                    spider_name='tb0',
                    logger=self.lg,
                    slice_num=800)
            else:
                raise ValueError('self.db_res_from value异常!')
        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
        except Exception:
            self.lg.error('遇到错误:', exc_info=True)

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_one_res(self, slice_params_list: list, index) -> tuple:
        """
        Crawl one slice of db rows concurrently, then store the results.

        The rows are first crawled through
        ``get_or_handle_target_data_by_task_params_list``; every crawl result
        is then matched back to its db row by goods_id and handed to
        ``_update_one_goods_info_in_db``.

        :param slice_params_list: db rows of this slice; ``item[1]`` is read
            as the goods_id of a row
        :param index: running index given to the first row of this slice
        :return: (list, int) -- the results of the db update tasks and the
            index to use for the first row of the next slice
        """
        def get_tasks_params_list(slice_params_list: list, index: int) -> list:
            # One task description per row, numbered consecutively from index.
            return [
                {
                    'db_goods_info_obj': TBDbGoodsInfoObj(
                        item=item,
                        logger=self.lg),
                    'index': item_index,
                }
                for item_index, item in enumerate(slice_params_list, start=index)
            ]

        def get_create_task_msg(k) -> str:
            return 'create task[where is goods_id: {}, index: {}] ...'.format(
                k['db_goods_info_obj'].goods_id,
                k['index'],)

        def get_now_args(k) -> list:
            return [
                'tb',
                k['db_goods_info_obj'].goods_id,
                k['index'],
                self.lg,
            ]

        async def handle_one_res(one_res: list):
            """
            Match the crawl results to their db rows and store them.

            :param one_res: crawl results; entry fields are read positionally
                (1: goods_id, 2: index, 3: data before parsing, 4: parsed data)
            :return: the results of the db update tasks
            """
            matched_params_list = []
            for item in slice_params_list:
                goods_id = item[1]
                for crawled in one_res:
                    try:
                        if goods_id != crawled[1]:
                            continue
                        matched_params_list.append({
                            'index': crawled[2],
                            'before_goods_data': crawled[3],
                            'end_goods_data': crawled[4],
                            'item': item,
                        })
                    except IndexError:
                        # too short an entry (e.g. the empty default result)
                        continue
                    # only the first matching entry is used for a row
                    break

            # Original author's note: storing is done this way to avoid the
            # mass of deadlocks caused by highly concurrent db access.
            tasks = []
            for k in matched_params_list:
                db_goods_info_obj = TBDbGoodsInfoObj(
                    item=k['item'],
                    logger=self.lg)
                self.lg.info('create task[where is goods_id: {}, index: {}]...'.format(
                    db_goods_info_obj.goods_id,
                    k['index']))
                tasks.append(self.loop.create_task(self._update_one_goods_info_in_db(
                    db_goods_info_obj=db_goods_info_obj,
                    index=k['index'],
                    before_goods_data=k['before_goods_data'],
                    end_goods_data=k['end_goods_data'],)))

            return await _get_async_task_result(
                tasks=tasks,
                logger=self.lg)

        one_res = await get_or_handle_target_data_by_task_params_list(
            loop=self.loop,
            tasks_params_list=get_tasks_params_list(
                slice_params_list=slice_params_list,
                index=index,),
            func_name_where_get_create_task_msg=get_create_task_msg,
            func_name=block_get_one_goods_info_task_by_external_type,
            func_name_where_get_now_args=get_now_args,
            func_name_where_handle_one_res=None,
            func_name_where_add_one_res_2_all_res=default_add_one_res_2_all_res2,
            one_default_res=(),
            step=self.concurrency,
            logger=self.lg,
            get_all_res=True,
            concurrent_type=self.concurrent_type,
        )
        res = await handle_one_res(one_res=one_res)

        # The helper above numbers the rows with its own local counter, so the
        # running index has to be advanced here; returning it untouched made
        # every slice start over at the same index.
        return (res, index + len(slice_params_list))

    async def _update_one_goods_info_in_db(self,
                                           db_goods_info_obj,
                                           index,
                                           before_goods_data,
                                           end_goods_data):
        """
        Store the already crawled data of a single goods in the db.

        :param db_goods_info_obj: db-side description of the goods; its
            ``goods_id`` is logged and returned
        :param index: running index, passed to ``_get_new_db_conn``
        :param before_goods_data: crawl data before parsing; only its
            'is_delete' entry is read
        :param end_goods_data: parsed crawl data, ``{}`` when parsing failed
        :return: ``[goods_id, res]``; res stays False unless the data was
            stored or the goods turned out to be off the shelf
        """
        updated = False

        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=self.db_conn_type,
            remainder=25,)

        if not self.sql_cli.is_connect_success:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=5, loop=self.loop)
        else:
            self.lg.info('*' * 20 + ' updating goods_id: {}, index: {} ...'.format(
                db_goods_info_obj.goods_id,
                index, ))
            # Read up front so that an off-shelf goods whose parsed data is
            # empty is not mistaken for a failed crawl (and slept on).
            was_deleted = before_goods_data.get('is_delete', 0)
            if end_goods_data != {}:
                changed_data = get_goods_info_change_data(
                    target_short_name='tb',
                    logger=self.lg,
                    data=end_goods_data,
                    db_goods_info_obj=db_goods_info_obj, )
                updated = to_right_and_update_tb_data(
                    data=changed_data,
                    pipeline=self.sql_cli,
                    logger=self.lg,)
            elif was_deleted == 1:
                # found to be off the shelf: counts as a success as well
                updated = True
            else:
                self.lg.info('goods_id: {}, 阻塞休眠7s中...'.format(
                    db_goods_info_obj.goods_id,))
                # Must stay an async sleep; per the original author a blocking
                # sleep here brings the machine down.
                await async_sleep(delay=7., loop=self.loop)

        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
        collect()

        return [db_goods_info_obj.goods_id, updated]

    async def _get_new_tb_obj(self, index) -> None:
        """
        Replace ``self.taobao`` with a new parser on every 10th index.

        :param index: running goods index; nothing happens unless it is a
            multiple of 10
        :return: None
        """
        if index % 10 != 0:
            return

        # Drop the old parser first (best effort), then build a new one.
        try:
            del self.taobao
        except:
            pass
        collect()
        self.taobao = TaoBaoLoginAndParse(
            logger=self.lg,
            is_real_times_update_call=True)

    async def _update_one_goods_info(self, db_goods_info_obj, index):
        """
        Crawl a single goods with ``self.taobao`` and store it in the db.

        :param db_goods_info_obj: db-side description of the goods; its
            ``goods_id`` is crawled, logged and returned
        :param index: running goods index; ``self.goods_index`` is set to
            ``index + 1`` afterwards
        :return: ``[goods_id, res]``; res stays False unless the data was
            stored or the goods turned out to be off the shelf
        """
        updated = False
        await self._get_new_tb_obj(index=index)
        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=self.db_conn_type,
            remainder=25,)

        if not self.sql_cli.is_connect_success:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=10, loop=self.loop)
        else:
            self.lg.info('------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)' % (
                db_goods_info_obj.goods_id,
                str(index)))
            raw_data = self.taobao.get_goods_data(
                goods_id=db_goods_info_obj.goods_id)
            # Read up front so that an off-shelf goods whose parsed data is
            # empty is not mistaken for a failed crawl (and slept on).
            was_deleted = raw_data.get('is_delete', 0)
            data = self.taobao.deal_with_data(
                goods_id=db_goods_info_obj.goods_id)
            if data != {}:
                data = get_goods_info_change_data(
                    target_short_name='tb',
                    logger=self.lg,
                    data=data,
                    db_goods_info_obj=db_goods_info_obj,)
                updated = to_right_and_update_tb_data(
                    data=data,
                    pipeline=self.sql_cli,
                    logger=self.lg)
            elif was_deleted == 1:
                # found to be off the shelf: counts as a success as well
                updated = True
            else:
                self.lg.info('------>>>| 休眠8s中...')
                await async_sleep(delay=8, loop=self.loop)

        self.goods_index = index + 1
        collect()
        # Throttle so the crawl does not collide with user requests; may be
        # shortened (even to 0s) on servers located abroad.
        await async_sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)

        return [db_goods_info_obj.goods_id, updated]

    async def _except_sleep(self, res):
        """
        Sleep after a slice when too many of its goods failed.

        An entry counts as failed when its second field is falsy; entries too
        short to have a second field are ignored.

        :param res: per-goods results of one slice
        :return: None
        """
        all_fail_pause = 100.
        # used to be 40.; currently no pause for partial failures
        partial_fail_pause = 0.

        fail_count = 0
        for one in res:
            try:
                failed = not one[1]
            except IndexError:
                continue
            if failed:
                fail_count += 1

        self.lg.info('Fail count: {}个, 并发量: {}个'.format(fail_count, self.concurrency))
        if fail_count / self.concurrency >= .96:
            # (almost) everything failed
            self.lg.info('抓取异常!! 休眠{}s中...'.format(all_fail_pause))
            await async_sleep(all_fail_pause)
        elif fail_count >= int(self.concurrency / 5):
            self.lg.info('抓取异常!! 休眠{}s中...'.format(partial_fail_pause))
            await async_sleep(partial_fail_pause)

        return None

    def __del__(self):
        """Drop the logger, db client and loop references, then collect."""
        for attr_name in ('lg', 'sql_cli', 'loop'):
            # best effort: an attribute that cannot be removed is skipped
            try:
                delattr(self, attr_name)
            except:
                pass
        collect()
Ejemplo n.º 7
0
class CommonGoodsRealTimeUpdater(AsyncCrawler):
    """常规商品实时更新"""
    def __init__(self):
        """
        Read the module-level settings and initialise the crawler base class.

        :raises AssertionError: when GOODS_SPIDER_NAME is not 'tb'/'tm' or
            DB_RES_FROM is not one of 0, 1, 2
        """
        spider_type = GOODS_SPIDER_NAME
        self.goods_spider_type = spider_type
        assert spider_type is not None
        assert spider_type in ('tb', 'tm'), \
            'self.goods_spider_type value异常!'

        # the log path depends on goods_spider_type, hence set beforehand
        AsyncCrawler.__init__(
            self,
            log_print=True,
            log_save_path=self.get_log_save_path())
        self.set_concurrency()

        self.crawl_type = CRAWL_TYPE_ASYNCIO
        self.concurrent_type = CONCURRENT_TYPE
        self.db_res_from = DB_RES_FROM
        self.db_conn_type = DB_CONN_TYPE

        self.sql_cli = None
        self.set_sql_cli()
        assert self.db_res_from in (0, 1, 2,), \
            'self.db_res_from value异常!'

        # number of records requested from redis per fetch
        self.db_data_slice_num = 800
        self.is_real_times_update_call = True

        # The armv7l/debian box talks to a local server, everything else to
        # the remote one.
        on_arm_debian = 'armv7l-with-debian' in platform.platform()
        self.server_ip = 'http://0.0.0.0:80' if on_arm_debian \
            else 'http://118.31.39.97'

    def get_log_save_path(self) -> str:
        """
        Return the log directory that belongs to ``self.goods_spider_type``.

        :return: MY_SPIDER_LOGS_PATH followed by the Tmall ('tm') or the
            Taobao ('tb') real-time-update sub directory
        :raises NotImplementedError: for any other goods_spider_type
        """
        if self.goods_spider_type == 'tm':
            return MY_SPIDER_LOGS_PATH + '/天猫/实时更新/'

        if self.goods_spider_type == 'tb':
            return MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/'

        # `raise NotImplemented` would fail with a TypeError, because
        # NotImplemented is a constant and not an exception class.
        raise NotImplementedError(
            'unsupported goods_spider_type: {!r}'.format(self.goods_spider_type))

    def set_concurrency(self) -> None:
        """
        Set ``self.concurrency`` according to ``self.goods_spider_type``.

        :return: None
        :raises NotImplementedError: when goods_spider_type is neither 'tm'
            nor 'tb'
        """
        # Kept as separate branches so each spider type can be tuned alone.
        if self.goods_spider_type == 'tm':
            self.concurrency = 100
        elif self.goods_spider_type == 'tb':
            self.concurrency = 100
        else:
            # `raise NotImplemented` would fail with a TypeError, because
            # NotImplemented is a constant and not an exception class.
            raise NotImplementedError(
                'unsupported goods_spider_type: {!r}'.format(
                    self.goods_spider_type))

    def set_sql_cli(self):
        """
        Bind a freshly created db client to ``self.sql_cli``.

        The client class is selected by ``self.db_conn_type``:
        1 -> SqlServerMyPageInfoSaveItemPipeline (flagged as the recommended one),
        2 -> SqlPools (connection pool managed through sqlalchemy).

        :return: None
        :raises ValueError: when ``self.db_conn_type`` is neither 1 nor 2
        """
        conn_type = self.db_conn_type
        # Reject unknown values up front so the happy path below stays flat.
        if conn_type not in (1, 2):
            raise ValueError('db_conn_type 值异常!')

        if conn_type == 1:
            self.sql_cli = SqlServerMyPageInfoSaveItemPipeline()
        else:
            self.sql_cli = SqlPools()

    async def _update_db(self):
        """
        Run the real-time update loop forever.

        Each turn fetches the pending records, walks them slice by slice
        (``self.concurrency`` records per slice) through ``_get_one_res`` and
        ``_except_sleep``, then pauses: half an hour when the Shanghai hour is
        0, otherwise 5 seconds.

        :return: never returns normally
        """
        while True:
            # A per-day logger is deliberately not created here: long runs hit
            # "OSError: [Errno 24] Too many open files" with it.
            result = await self._get_db_old_data()
            if result is not None:
                tasks_params_list = TasksParamsListObj(
                    tasks_params_list=result,
                    step=self.concurrency)
                index = 1
                while True:
                    # The slicer is left by catching AssertionError, which is
                    # how the loop detects that no slice remains.
                    try:
                        slice_params_list = tasks_params_list.__next__()
                    except AssertionError:
                        break

                    one_res, index = await self._get_one_res(
                        slice_params_list=slice_params_list,
                        index=index)
                    await self._except_sleep(res=one_res)

                self.lg.info('全部数据更新完毕'.center(100, '#'))

            pause = 60 * 60 * .5 if get_shanghai_time().hour == 0 else 5.
            await async_sleep(pause)

            # `result` is always bound at this point, so dropping the
            # reference cannot fail.
            del result
            collect()

    async def _get_db_old_data(self) -> (list, None):
        """
        Fetch the db records that are waiting to be updated.

        The source is selected by ``self.db_res_from``: 0 runs the select
        statement of the current spider type through ``self.sql_cli``, 1 asks
        the server at ``self.server_ip``, anything else uses the redis helper.
        A TypeError raised while reading is logged and swallowed, which leaves
        the result as None.

        :return: the fetched records, or None after a TypeError
        :raises NotImplementedError: when ``self.db_res_from`` is 0 and
            ``self.goods_spider_type`` is neither 'tm' nor 'tb'
        """
        result = None
        try:
            if self.db_res_from == 0:
                if self.goods_spider_type == 'tm':
                    sql_str = tm_select_str_3
                elif self.goods_spider_type == 'tb':
                    sql_str = tb_select_str_3
                else:
                    # `raise NotImplemented` raised a TypeError here, which the
                    # handler below then misreported as a db connection failure.
                    raise NotImplementedError(
                        'unsupported goods_spider_type: {!r}'.format(
                            self.goods_spider_type))

                result = list(
                    self.sql_cli._select_table(
                        sql_str=sql_str,
                        logger=self.lg,
                    ))

            elif self.db_res_from == 1:
                result = await get_waited_2_update_db_data_from_server(
                    server_ip=self.server_ip,
                    _type=self.goods_spider_type,
                    child_type=0,
                )
            else:
                # Ask for a batch well above 100 records so that later records
                # still get updated when the leading ones fail frequently.
                result = get_waited_2_update_db_data_from_redis_server(
                    # eg: 'tm0'
                    spider_name=self.goods_spider_type + '0',
                    logger=self.lg,
                    slice_num=self.db_data_slice_num,
                )

        except TypeError:
            self.lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')

        await _print_db_old_data(logger=self.lg, result=result)

        return result

    async def _get_one_res(self, slice_params_list, index) -> tuple:
        """
        Crawl one slice of db rows, then store the results.

        The crawl runs either through asyncio tasks or through celery,
        depending on ``self.crawl_type``; the crawl results are then handed to
        ``handle_real_times_goods_one_res`` together with
        ``_update_one_goods_info_in_db``.

        :param slice_params_list: db rows of this slice
        :param index: running index of the goods
        :return: (list, int) -- the value returned by
            ``handle_real_times_goods_one_res`` and the running index
        :raises NotImplementedError: for an unknown crawl_type, an unknown
            goods_spider_type, or celery crawling of anything but 'tm'
        """
        # `raise NotImplemented` (used before) fails with a TypeError, since
        # NotImplemented is a constant and not an exception class.
        if self.crawl_type == CRAWL_TYPE_ASYNCIO:
            # asyncio
            # NOTE(review): `index` is an int, so the helpers below cannot
            # advance it for the caller; in this branch the returned index
            # equals the one passed in -- confirm whether that is intended.
            if self.goods_spider_type == 'tm':
                tasks_params_list = self.get_tm_tasks_params_list(
                    slice_params_list=slice_params_list,
                    index=index,
                )
                func_name_where_get_create_task_msg = self.get_tm_create_task_msg
                func_name_where_get_now_args = self.get_tm_now_args
            elif self.goods_spider_type == 'tb':
                tasks_params_list = self.get_tb_tasks_params_list(
                    slice_params_list=slice_params_list,
                    index=index,
                )
                func_name_where_get_create_task_msg = self.get_tb_create_task_msg
                func_name_where_get_now_args = self.get_tb_now_args
            else:
                raise NotImplementedError(
                    'unsupported goods_spider_type: {!r}'.format(
                        self.goods_spider_type))

            one_res = await get_or_handle_target_data_by_task_params_list(
                loop=self.loop,
                tasks_params_list=tasks_params_list,
                func_name_where_get_create_task_msg=
                func_name_where_get_create_task_msg,
                func_name=block_get_one_goods_info_task_by_external_type,
                func_name_where_get_now_args=func_name_where_get_now_args,
                func_name_where_handle_one_res=None,
                func_name_where_add_one_res_2_all_res=
                default_add_one_res_2_all_res2,
                one_default_res=(),
                step=self.concurrency,
                logger=self.lg,
                get_all_res=True,
                concurrent_type=self.concurrent_type,
            )

        elif self.crawl_type == CRAWL_TYPE_CELERY:
            # celery
            if self.goods_spider_type != 'tm':
                raise NotImplementedError(
                    'celery crawling is only available for goods_spider_type '
                    "'tm', got {!r}".format(self.goods_spider_type))

            tasks = []
            for item in slice_params_list:
                index += 1
                db_goods_info_obj = TMDbGoodsInfoObj(item=item,
                                                     logger=self.lg)
                self.lg.info('创建 task goods_id: {}'.format(
                    db_goods_info_obj.goods_id))
                tmp_item = self.get_tm_tmp_item(
                    site_id=db_goods_info_obj.site_id,
                    goods_id=db_goods_info_obj.goods_id,
                )
                try:
                    async_obj = await self.create_tm_celery_obj(
                        goods_id=tmp_item,
                        index=index,
                    )
                    tasks.append(async_obj)
                except Exception:
                    # a goods whose celery task cannot be created is skipped
                    continue
            one_res = await _get_celery_async_results(tasks=tasks)

        else:
            raise NotImplementedError(
                'unsupported crawl_type: {!r}'.format(self.crawl_type))

        res = await handle_real_times_goods_one_res(
            # eg: 'tm', 'tb'
            goods_type=self.goods_spider_type,
            loop=self.loop,
            func_name_where_update_one_goods_info_in_db=self.
            _update_one_goods_info_in_db,
            slice_params_list=slice_params_list,
            one_res=one_res,
            logger=self.lg,
        )

        return (res, index)

    def get_tm_tasks_params_list(self, slice_params_list: list,
                                 index: int) -> list:
        """
        Build the task params of every item in the slice for the 'tm' spider.

        Items that raise while being prepared are logged and skipped; they do
        not consume an index.

        :param slice_params_list: items to wrap in ``TMDbGoodsInfoObj``
        :param index: index given to the first successfully prepared item
        :return: list of dicts with the keys 'db_goods_info_obj', 'index'
            and 'tmp_item'
        """
        tasks_params_list = []
        for item in slice_params_list:
            # reset per item so the error log never reports a previous goods
            db_goods_info_obj = None
            try:
                db_goods_info_obj = TMDbGoodsInfoObj(item=item, logger=self.lg)
                tmp_item = self.get_tm_tmp_item(
                    site_id=db_goods_info_obj.site_id,
                    goods_id=db_goods_info_obj.goods_id,
                )
                tasks_params_list.append({
                    'db_goods_info_obj': db_goods_info_obj,
                    'index': index,
                    'tmp_item': tmp_item,
                })
                index += 1
            except Exception:
                # the goods_id is only known once the wrapper object exists;
                # otherwise fall back to the raw item so the placeholder is
                # always filled
                self.lg.error(
                    '遇到错误[goods_id: {}]:'.format(
                        getattr(db_goods_info_obj, 'goods_id', item)),
                    exc_info=True)
                continue

        return tasks_params_list

    def get_tb_tasks_params_list(self, slice_params_list: list,
                                 index: int) -> list:
        """
        Build the task params of every item in the slice for the 'tb' spider.

        Items that raise while being prepared are logged and skipped; they do
        not consume an index.

        :param slice_params_list: items to wrap in ``TBDbGoodsInfoObj``
        :param index: index given to the first successfully prepared item
        :return: list of dicts with the keys 'db_goods_info_obj' and 'index'
        """
        tasks_params_list = []
        for item in slice_params_list:
            # reset per item so the error log never reports a previous goods
            db_goods_info_obj = None
            try:
                db_goods_info_obj = TBDbGoodsInfoObj(item=item, logger=self.lg)
                tasks_params_list.append({
                    'db_goods_info_obj': db_goods_info_obj,
                    'index': index,
                })
                index += 1
            except Exception:
                # the goods_id is only known once the wrapper object exists;
                # otherwise fall back to the raw item so the placeholder is
                # always filled
                self.lg.error(
                    '遇到错误[goods_id: {}]:'.format(
                        getattr(db_goods_info_obj, 'goods_id', item)),
                    exc_info=True)
                continue

        return tasks_params_list

    @staticmethod
    def get_tm_create_task_msg(k) -> str:
        return 'create task[where is goods_id: {}, index: {}] ...'.format(
            k['db_goods_info_obj'].goods_id,
            k['index'],
        )

    @staticmethod
    def get_tb_create_task_msg(k) -> str:
        return 'create task[where is goods_id: {}, index: {}] ...'.format(
            k['db_goods_info_obj'].goods_id,
            k['index'],
        )

    def get_tm_now_args(self, k) -> list:
        """
        Assemble the positional args of one 'tm' task.

        :param k: one task-params dict; 'tmp_item' and 'index' are read
        :return: ['tm', tmp_item, index, logger]
        """
        now_args = ['tm']
        now_args.append(k['tmp_item'])
        now_args.append(k['index'])
        now_args.append(self.lg)

        return now_args

    def get_tb_now_args(self, k) -> list:
        """
        Assemble the positional args of one 'tb' task.

        :param k: one task-params dict; 'db_goods_info_obj' (its ``goods_id``
            is read) and 'index' are used
        :return: ['tb', goods_id, index, logger]
        """
        now_args = ['tb']
        now_args.append(k['db_goods_info_obj'].goods_id)
        now_args.append(k['index'])
        now_args.append(self.lg)

        return now_args

    async def _update_one_goods_info_in_db(self, db_goods_info_obj, index,
                                           before_goods_data,
                                           end_goods_data) -> (list, tuple):
        """
        Write the freshly crawled data of a single goods back to the db.

        :param db_goods_info_obj: object describing the stored goods; its
            ``goods_id`` is read here
        :param index: index of the goods, forwarded to ``_get_new_db_conn``
        :param before_goods_data: dict; only its 'is_delete' key is read here
        :param end_goods_data: parsed goods data; ``{}`` means nothing usable
            was obtained
        :return: [goods_id, res] -- ``res`` is False unless the update call
            returned something else or the goods was already marked deleted
        """
        update_ok = False

        self.sql_cli = await _get_new_db_conn(
            db_obj=self.sql_cli,
            index=index,
            logger=self.lg,
            db_conn_type=self.db_conn_type,
            remainder=25,
        )
        if not self.sql_cli.is_connect_success:
            self.lg.error('数据库连接失败,数据库可能关闭或者维护中')
            await async_sleep(delay=8, loop=self.loop)

        else:
            banner = '*' * 20
            self.lg.info(
                banner + ' updating goods_id: {}, index: {} ...'.format(
                    db_goods_info_obj.goods_id,
                    index,
                ))
            # read up front so an empty parse result of an already deleted
            # goods does not trigger the sleep below
            was_deleted = before_goods_data.get('is_delete', 0)
            if end_goods_data != {}:
                changed_data = get_goods_info_change_data(
                    # eg: 'tm', 'tb'
                    target_short_name=self.goods_spider_type,
                    logger=self.lg,
                    data=end_goods_data,
                    db_goods_info_obj=db_goods_info_obj,
                    sql_cli=self.sql_cli,
                )
                update_ok = to_right_and_update_data_by_goods_type(
                    goods_type=self.goods_spider_type,
                    data=changed_data,
                    pipeline=self.sql_cli,
                    logger=self.lg,
                )
            elif was_deleted == 1:
                # empty data for a goods already flagged as deleted counts as
                # a success
                update_ok = True
            else:
                self.lg.info('goods_id: {}, 阻塞休眠7s中...'.format(
                    db_goods_info_obj.goods_id, ))
                # async sleep on purpose: blocking the process here would
                # hang the machine
                await async_sleep(delay=7., loop=self.loop)

        collect()

        return [db_goods_info_obj.goods_id, update_ok]

    async def create_tm_celery_obj(self, **kwargs):
        """
        Submit one ``_get_tm_one_goods_info_task`` and return its async object.

        :param kwargs: 'goods_id' (optional, defaults to ``[]``) and
            'index' (required)
        :return: whatever ``apply_async`` returns
        """
        task_args = [
            kwargs.get('goods_id', []),
            kwargs['index'],
        ]

        return _get_tm_one_goods_info_task.apply_async(
            args=task_args,
            expires=5 * 60,
            retry=False,
        )

    @staticmethod
    def get_tm_tmp_item(site_id, goods_id):
        tmp_item = []
        # 从数据库中取出时,先转换为对应的类型
        if site_id == 3:
            tmp_item.append(0)
        elif site_id == 4:
            tmp_item.append(1)
        elif site_id == 6:
            tmp_item.append(2)

        tmp_item.append(goods_id)

        return tmp_item

    @staticmethod
    def get_jd_tmp_item(site_id, goods_id):
        tmp_item = []
        # 从数据库中取出时,先转换为对应的类型
        if site_id == 7 \
                or site_id == 8:
            tmp_item.append(0)
        elif site_id == 9:
            tmp_item.append(1)
        elif site_id == 10:
            tmp_item.append(2)

        tmp_item.append(goods_id)

        return tmp_item

    async def _except_sleep(self, res):
        """
        Sleep after a batch in which too many goods failed.

        :param res: iterable of per-goods results; an entry counts as failed
            when its second element is falsy, entries without a second
            element are ignored
        :return: None
        """
        def _is_failed(one_res) -> bool:
            try:
                return not one_res[1]
            except IndexError:
                return False

        # used when (almost) the whole batch failed
        all_count_fail_sleep_time = 100.
        # used to be 40.; partial failures currently do not sleep
        sleep_time = 0.

        fail_count = sum(1 for one_res in res if _is_failed(one_res))
        self.lg.info('Fail count: {}个, 并发量: {}个'.format(
            fail_count, self.concurrency))

        fail_ratio = fail_count / self.concurrency
        if fail_ratio >= .96:
            # (nearly) everything failed
            self.lg.info('抓取异常!! 休眠{}s中...'.format(all_count_fail_sleep_time))
            await async_sleep(all_count_fail_sleep_time)

        elif fail_count >= int(self.concurrency / 5):
            self.lg.info('抓取异常!! 休眠{}s中...'.format(sleep_time))
            await async_sleep(sleep_time)

        return None

    def __del__(self):
        """
        Drop the logger, db client and loop references, then call ``collect()``.

        A missing attribute is not an error here; only ``AttributeError`` is
        suppressed, so unrelated failures are no longer swallowed by a bare
        ``except``.
        """
        for attr_name in ('lg', 'sql_cli', 'loop'):
            try:
                delattr(self, attr_name)
            except AttributeError:
                # never assigned, or already removed
                pass
        collect()
Ejemplo n.º 8
0
def _get_my_shelf_and_down_time(new_is_delete, old_is_delete,
                                old_shelf_and_down_time) -> dict:
    """
    Work out the shelf/down time record to store for one goods.

    :param new_is_delete: is_delete of the freshly crawled data (0 is
        handled as "on shelf")
    :param old_is_delete: is_delete currently stored in the db
    :param old_shelf_and_down_time: stored json str, may be None
    :return: dict with the keys 'shelf_time' and 'down_time'
    """
    my_shelf_and_down_time = {
        'shelf_time': '',
        'down_time': '',
    }
    if new_is_delete != old_is_delete:
        if new_is_delete == 1 and old_is_delete == 0:
            # is_delete 0 -> 1: the goods went from on shelf to off shelf
            my_shelf_and_down_time['down_time'] = str(get_shanghai_time())
        else:
            # is_delete 1 -> 0: the goods went from off shelf to on shelf
            my_shelf_and_down_time['shelf_time'] = str(get_shanghai_time())

        return my_shelf_and_down_time

    # state unchanged
    if old_shelf_and_down_time is None \
            or old_shelf_and_down_time == '{"shelf_time": "", "down_time": ""}' \
            or len(old_shelf_and_down_time) == 35:  # 35 == len of that initial str
        # nothing recorded yet: stamp the time matching the current state
        if new_is_delete == 0:
            my_shelf_and_down_time['shelf_time'] = str(get_shanghai_time())
        else:
            my_shelf_and_down_time['down_time'] = str(get_shanghai_time())

        return my_shelf_and_down_time

    # otherwise keep the stored value unchanged (converted to a dict)
    return json.loads(old_shelf_and_down_time)


def run_forever():
    """
    Endlessly refresh the stored taobao goods, one full pass per loop turn.

    Each pass selects the goods rows from the db, re-crawls every goods and
    writes the result back. Of each row, ``item[0]`` is used as goods_id,
    ``item[1]`` as the stored is_delete and ``item[2]`` as the stored
    shelf/down time json str. After a pass the loop sleeps 5.5 hours when
    the shanghai hour is 0, otherwise 5 seconds.
    """
    while True:
        # sqlalchemy manages the db connection pool
        tmp_sql_server = SqlPools()
        try:
            result = tmp_sql_server.select_taobao_all_goods_id()
        except TypeError:
            print('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            print('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            print(result)
            print('--------------------------------------------------------')

            print('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                taobao = TaoBaoLoginAndParse()
                if index % 50 == 0:
                    # reconnect every 50 goods so that one long-lived
                    # connection does not end up unresponsive
                    print('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()

                    print('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    print(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%d)'
                        % (item[0], index))
                    taobao.get_goods_data(item[0])
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        # record when the goods last changed its shelf state
                        data['my_shelf_and_down_time'] = \
                            _get_my_shelf_and_down_time(
                                new_is_delete=data['is_delete'],
                                old_is_delete=item[1],
                                old_shelf_and_down_time=item[2],
                            )

                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                else:
                    # the db connection is not usable
                    print('数据库连接失败,数据库可能关闭或者维护中')

                index += 1
                gc.collect()
                # must not be too frequent, to stay clear of user requests;
                # can be shortened (even to 0s) on servers abroad
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            print('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # no updates right after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
Ejemplo n.º 9
0
def run_forever():
    """
    Endlessly refresh the stored taobao goods, one full pass per loop turn.

    Each pass creates a logger, selects the rows to update with
    ``tb_select_str_3``, re-crawls every goods and writes the result back.
    Of each row, ``item[0]`` is used as goods_id, ``item[1]`` as is_delete,
    ``item[2]``/``item[3]`` as the old price / taobao_price,
    ``item[4]``/``item[5]`` as shelf_time / delete_time, ``item[6]`` as the
    old sku info and ``item[7]`` as is_price_change. After a pass the loop
    sleeps 5.5 hours when the shanghai hour is 0, otherwise 5 seconds, and
    then calls ``restart_program()``.
    """
    # sleep used when a goods yields no data although it is not flagged as
    # deleted; the log message is derived from it so the two cannot drift
    empty_data_sleep_time = 4

    while True:
        # the logger must be created inside the loop instead of being a
        # global, otherwise everything keeps going to the same file
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        # sqlalchemy manages the db connection pool
        tmp_sql_server = SqlPools()
        try:
            result = tmp_sql_server._select_table(sql_str=tb_select_str_3, )
        except TypeError:
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None

        if result is not None:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')
            my_lg.info('总计待更新个数: {0}'.format(len(result)))

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            for item in result:
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:
                    # reconnect every 50 goods so that one long-lived
                    # connection does not end up unresponsive
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    tmp_sql_server = SqlPools()
                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (item[0], str(index)))
                    oo = taobao.get_goods_data(item[0])
                    # read up front so that a parse failure of a deleted
                    # goods does not trigger the sleep below
                    oo_is_delete = oo.get('is_delete', 0)
                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['shelf_time'], data[
                            'delete_time'] = get_shelf_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                shelf_time=item[4],
                                delete_time=item[5])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[2],
                                old_taobao_price=item[3],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        try:
                            old_sku_info = format_price_info_list(
                                price_info_list=json_2_dict(item[6]),
                                site_id=1)
                        except AttributeError:
                            # the stored value was already formatted
                            old_sku_info = item[6]
                        # NOTE(review): this overwrites the
                        # '_is_price_change' set just above -- confirm that
                        # is intended
                        data['_is_price_change'], data[
                            'sku_info_trans_time'] = get_sku_info_trans_record(
                                old_sku_info=old_sku_info,
                                new_sku_info=format_price_info_list(
                                    data['price_info_list'], site_id=1),
                                is_price_change=item[7]
                                if item[7] is not None else 0)

                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    elif oo_is_delete != 1:
                        # empty data for a goods that is not flagged as
                        # deleted: back off before the next goods
                        my_lg.info('------>>>| 休眠{}s中...'.format(
                            empty_data_sleep_time))
                        sleep(empty_data_sleep_time)

                else:
                    # the db connection is not usable
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)

                index += 1
                gc.collect()
                # must not be too frequent, to stay clear of user requests;
                # can be shortened (even to 0s) on servers abroad
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)
            my_lg.info('全部数据更新完毕'.center(100, '#'))

        if get_shanghai_time().hour == 0:
            # no updates right after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()