Example #1
def get_el_tb_job_stat(batch_id, tb_nm, conn=None):
    """
    :param batch_id:
    :param tb_nm:
    :param conn:
    :return:
    """
    if conn is None:
        conn = Conn()
    sql = """select batch_id,tb_nm,batch_stat stat from {log_tb_nm} 
             where batch_id='{batch_id}' and tb_nm='{tb_nm}' order by start_dt desc limit 1"""
    sql = sql.format(batch_id=batch_id, log_tb_nm=tb_task_log, tb_nm=tb_nm)
    df = conn.select(sql)  # pd.read_sql(sql, conn)
    if df.shape[0] > 0:
        return df.loc[0, 'stat']
    else:
        return -1
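A minimal usage sketch for the function above; Conn and tb_task_log come from the surrounding examples, while the module name el_tool, the import path, and the sample batch/table values are assumptions for illustration.

from el_tool import Conn, get_el_tb_job_stat   # module name is hypothetical

conn = Conn()  # default warehouse connection
stat = get_el_tb_job_stat('20240101', 'ods.pf_cust_info', conn=conn)
if stat == -1:
    print('no task_log entry for this batch/table yet')
else:
    print('latest status:', stat)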
Example #2
def el_tbs(conn, tb_nm, batch_id=None, logs_print=False):
    """
    同步ods数据主程序入口,同步task_infos所有的未删除表
    :param tb_nm: 如果指定则只同步指定表 也可是多个表 例如ods.pf_cust_info,ods.portal_wechat_card
    :param batch_id:批次ID
    :param conn: 数据库连接
    :param logs_print: 是否打印同步的明细日志
    :return:
    """
    if ',' in tb_nm:
        # multiple tables: build a SQL tuple literal like ('a', 'b')
        tbs = tb_nm.replace(' ', '').split(",")
        tb_nm = str(tuple(tbs))
    else:
        # single table: wrap it as ('a') for the IN clause
        tb_nm = "('" + tb_nm + "')"
    if conn is None:
        conn = Conn()
    task_sql = """
        select
            src_tb_nm read_tb,
            src_db_cd read_conn,
            tb_nm write_tb,
            trgt_db_cd write_conn,
            sql_where read_where,
            sql_pre_sql pre_sql,
            el_type,
            parallel_num::int parallel_num
         from {tb_task_info} where is_del='N' and tb_nm in {tb_nm}
        """
    task_sql = task_sql.format(tb_nm=tb_nm, tb_task_info=tb_task_info)
    crt_el_log_tb(conn)
    df = conn.select(task_sql)
    if df.shape[0] > 0:
        rses = df.to_dict(orient='records')
        if batch_id:
            for rs in rses:
                rs['batch_id'] = batch_id
                rs['logs_print'] = logs_print
                run_el_with_batch(**rs)
        else:
            for rs in rses:
                rs['logs_print'] = logs_print
                run_el(**rs)
    else:
        logger.info("没有表需要同步")
Example #3
def get_ignore_cols(trgt_tb_nm, conn=None):
    """获取忽略导入字段"""
    if conn is None:
        conn = Conn()
    sql = """
            select 
            s.tb_nm,
            s.ignore_cols
            from {tb_task_info} s 
            where ignore_cols>'' and tb_nm='{trgt_tb_nm}'
        """
    sql = sql.format(trgt_tb_nm=trgt_tb_nm, tb_task_info=tb_task_info)
    if conn.is_exists(tb_task_info):
        df = conn.select(sql)  # pd.read_sql(sql, conn)
        if df.shape[0] > 0:
            return df.loc[0, 'ignore_cols'].split(',')  # list(df['ignore_cols'])
        else:
            return None
    else:
        return None
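A usage sketch for get_ignore_cols under the same assumed module; the table name is illustrative.

from el_tool import get_ignore_cols   # module name is hypothetical

cols = get_ignore_cols('ods.pf_cust_info')
if cols is None:
    print('no ignore_cols configured, or the task_info table does not exist')
else:
    print('columns skipped on load:', cols)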
Example #4
def multi_proc_el_all_task(batch_id,
                           conn=None,
                           batch_nm='T1',
                           processes=5,
                           logs_print=False,
                           only_tables=None):
    """
    同步ods数据主程序入口,同步task_infos所有的未删除表
    :param only_tables: 如果指定则只同步指定表
    :param batch_id:批次ID 字符串
    :param conn: 数据库连接
    :param batch_nm: 是否只处理增量表,用于准实时数据同步
    :param processes: 处理进程数个数,默认是5个,根据线上配置个数5-20个
    :param logs_print: 是否打印同步的明细日志
    :return:
    """
    batch_id = str(batch_id)
    if conn is None:
        conn = Conn()
    task_sql = """
        select
            src_tb_nm read_tb,
            src_db_cd read_conn,
            tb_nm write_tb,
            trgt_db_cd write_conn,
            sql_where read_where,
            sql_pre_sql pre_sql,
            el_type,
            parallel_num::int parallel_num
         from {tb_task_info} where is_del='N' and batch_nm='{batch_nm}'
        """
    # is_inc_where = " and batch_nm='{batch_nm}'".format(batch_nm=batch_nm, tb_task_info=tb_task_info)
    task_sql = task_sql.format(batch_nm=batch_nm,
                               tb_task_info=tb_task_info)  # + is_inc_where
    crt_el_log_tb(conn)
    df = conn.select(task_sql)  # pd.read_sql(task_sql, conn)
    if only_tables:  # if specific tables were requested, keep only those rows
        # normalize full-width commas and strip spaces before splitting
        only_tables = only_tables.replace("，", ",").replace(" ", "").split(",")
        df = df[df['write_tb'].isin(only_tables)]
    if df.shape[0] > 0:
        rs = df.to_dict(orient='records')
        logger.info("Batch_id:%s starting %s processes, running run_el_with_batch" %
                    (batch_id, processes))
        pool = multiprocessing.Pool(processes=processes)
        for i in rs:
            i['batch_id'] = batch_id
            i['logs_print'] = logs_print
            pool.apply_async(run_el_with_batch, kwds=i)
        pool.close()
        pool.join()
        logger.info("batch_id:%s 完成数据导入" % batch_id)
        if only_tables:
            check_el_task(batch_id, conn, batch_nm, check_error_only=True)
            # check the job results and raise an exception on errors; email/WeChat notifications could be added here
        else:
            check_el_task(batch_id,
                          conn,
                          batch_nm,
                          check_error_only=False,
                          task_list=[i['write_tb'] for i in rs])
            # check the job results and raise an exception on errors
    else:
        logger.info("没有表需要同步")
Example #5
def crt_all_tb_from_src(src_db_cd,
                        src_tb_schm,
                        trgt_db_cd="DPS",
                        trgt_schm_nm='ods',
                        batch_id=None,
                        if_el_data=False):
    """
    一次性移动某个系统的表到数据仓库中,一般只有临时性导入某数据库才这样操作,不能用于生产调度
    :param if_el_data:  是导入源系统数据。默认只建表结构
    :param batch_id:   有batch_id的,run_el_with_batch 导入并记录到task_log里面,没有指定的直接导datax导数据
    :param src_db_cd: 源系统编号
    :param src_tb_schm: 源系统schema 只需要指定shcema 没有的用public
    :param trgt_db_cd: 目标系统编号 DPS HQ_HR
    :param trgt_schm_nm: 目标系统schema
    :return:
    """
    conn = Conn(src_db_cd)
    # conn, db_type, db_nm = get_db_conn(src_db_cd)
    if conn.db_type.upper() == 'ORACLE':
        sql = """select distinct table_name tb_nm from user_tables order by table_name"""
    elif conn.db_type.upper() == 'MYSQL':
        sql = """
           select distinct t2.table_name tb_nm
           from   information_schema.tables t2
           where t2.table_schema='{db_nm}' and table_type='BASE TABLE';""".format(
            db_nm=conn.db_nm)
    elif conn.db_type.upper() in ('POSTGRESQL', 'GREENPLUM'):
        # PostgreSQL and Greenplum share the same catalog query
        sql = """
           SELECT distinct '{tb_schm}.'||B.relname tb_nm
           FROM pg_catalog.pg_class B
           INNER JOIN pg_catalog.pg_namespace C
               ON C.oid=B.relnamespace
               AND C.nspname='{tb_schm}'
           WHERE B.relam=0 and relkind='r'
           """.format(tb_schm=src_tb_schm)
    elif conn.db_type.upper() == 'SQLSERVER':
        sql = """SELECT
                  distinct sysobjects.name AS tb_nm 
               FROM syscolumns                            -- table columns
               INNER JOIN sysobjects                      -- database objects
                 ON sysobjects.id = syscolumns.id
               INNER JOIN systypes                        -- data types
                 ON syscolumns.xtype = systypes.xtype
               LEFT OUTER JOIN sys.extended_properties properties        -- column extended properties
                 ON syscolumns.id = properties.major_id
                AND syscolumns.colid = properties.minor_id
               LEFT OUTER JOIN sys.extended_properties sysproperties     -- table extended properties
                 ON sysobjects.id = sysproperties.major_id
                AND sysproperties.minor_id = 0
               LEFT OUTER JOIN syscomments                -- comment / default text
                 ON syscolumns.cdefault = syscomments.id
               LEFT OUTER JOIN sysindexkeys               -- index key/column info
                 ON sysindexkeys.id = syscolumns.id
                AND sysindexkeys.colid = syscolumns.colid
               LEFT OUTER JOIN sysindexes                 -- index table
                 ON sysindexes.id = sysindexkeys.id
                AND sysindexes.indid = sysindexkeys.indid
               LEFT OUTER JOIN sysforeignkeys
                 ON sysforeignkeys.fkeyid = syscolumns.id
                AND sysforeignkeys.fkey = syscolumns.colid
               left join INFORMATION_SCHEMA.tables ts on sysobjects.name =ts.table_name
               WHERE (sysobjects.xtype = 'U') and ts.Table_catalog='{db_nm}' 
               order by syscolumns.id""".format(db_nm=conn.db_nm)
    else:
        # logger.error("Unsupported database type: " + db_type)
        raise Exception("Unsupported database type: " + conn.db_type)
    # logger.warning(sql)
    meta = conn.select(sql)  # pd.read_sql(sql, conn)
    if meta.shape[0] > 0:
        meta.columns = ['src_tb_nm']  # (map(lambda x: x.lower(), list(meta.columns)))
        meta['el_type'] = 'datax'
        meta['src_db_cd'] = src_db_cd
        meta['trgt_tb_nm'] = None
        meta['trgt_db_cd'] = trgt_db_cd
        meta['read_where'] = ''
        meta['pre_sql'] = 'truncate'
        meta['parallel_num'] = 2
        meta['if_el_data'] = if_el_data
        multi_proc_el(meta, batch_id, processes=5, trgt_schm_nm=trgt_schm_nm)
    logger.info("数据处理完成")