Example #1
def is_runing(tb_nm, conn=None, batch_id=''):
    """
    :param batch_id:
    :param tb_nm:
    :param conn:
    :return:
    """
    if conn is None:
        conn = Conn()
    sql = """select batch_id from {log_tb_nm} where tb_nm='{tb_nm}' 
                and start_dt>= current_timestamp-interval '3 hour' and batch_stat=0"""
    sql = sql.format(log_tb_nm=tb_task_log, tb_nm=tb_nm)
    flag = conn.query_one(sql)  # pd.read_sql(sql, conn)
    times = 0
    while flag:
        logger.warning("%s is still running; this run will wait (attempt %s)" % (tb_nm, times))
        send_error_msg("the job with batch_id %s is still running; batch %s is waiting on %s; "
                       "please check for an execution failure" %
                       (flag, batch_id, tb_nm),
                       tb_nm,
                       if_to_wx=False)
        time.sleep(300)  # wait 5 minutes
        times += 1
        flag = conn.query_one(sql)
        if flag and times > 3:
            logger.error("%s is still running; please check for a failure. Raising for this run." % tb_nm)
            raise Exception("%s is still running; please check for a failure. Raising for this run." % tb_nm)
    return False
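A usage sketch (the table name reuses the ods.pf_cust_info example from these snippets; the batch id is hypothetical):

conn = Conn()
try:
    # blocks while another load of the table is active, polling every 5 minutes;
    # raises after repeated waits, returns False once the table is free
    is_runing('ods.pf_cust_info', conn=conn, batch_id='20230101')
finally:
    conn.close()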
Example #2
 def crt_tb_from_src_sys(self, src_tb_nm, src_db_cd, trgt_schm_nm='ods'):
     """
     Create or verify the target table structure from the source-system table during data sync.
     :param src_tb_nm:
     :param src_db_cd:
     :param trgt_schm_nm:
     :return:
     """
     trgt_tb_nm = self.tb_nm
     trgt_db_cd = self.conn.db_cd
     src_conn = Conn(src_db_cd)
     src_meta = src_conn.get_tb_strct(src_tb_nm)  # fetch the source table structure
     src_conn.close()
     if src_meta:
         if trgt_tb_nm is None:  # auto-generate the target table name when none is set
             trgt_tb_nm = self.get_auto_el_tb_nm(schema=trgt_schm_nm)
         crt_tb_sql = crt_trgt_db_sql(src_meta, trgt_tb_nm, trgt_db_cd)
         trgt_conn = Conn(trgt_db_cd)
         rs = trgt_conn.upd_tb_strct(crt_tb_sql,
                                     schm_tb_nm=trgt_tb_nm,
                                     drop_direct=False)
         trgt_conn.close()
         return rs
     else:
         raise Exception("源数据库目标表表%s不存在" % src_tb_nm)
Example #3
def multi_proc_el(df, batch_id, processes=5, trgt_schm_nm='ods'):
    """
    对df数据多进程处理,这里是导入数据或建表格
    :param batch_id:
    :param df: dataframe 需要字段batch_id, el_type, src_tb_nm, src_db_cd, trgt_tb_nm,
                         trgt_db_cd=default_db_cd, read_where='', pre_sql='truncate', parallel_num=2
    :param processes:  多进程数
    :param trgt_schm_nm:  目标schema 主要用于
    :return:
    """
    rs = df.to_dict(orient='records')
    logger.info("bulk load: starting %s worker processes, beginning batch processing" % (processes, ))
    pool = multiprocessing.Pool(processes=processes)
    # each i must carry the fields batch_id, el_type, src_tb_nm, src_db_cd, trgt_tb_nm,
    # trgt_db_cd, read_where='', pre_sql='truncate', parallel_num
    for i in rs:
        i['batch_id'] = batch_id
        if i['trgt_tb_nm'] is None:
            i['trgt_tb_nm'] = get_targt_tb_nm(i['src_tb_nm'],
                                              i['src_db_cd'],
                                              schema=trgt_schm_nm)
        # logger.info(str(i))
        pool.apply_async(_el_run, kwds=i)
    pool.close()
    pool.join()
    logger.info("完成批处理")
    conn = Conn()
    check_el_task(batch_id, conn, batch_nm='T1', check_error_only=True)
    conn.close()
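A minimal driver sketch, assuming pandas imported as pd and the CRM/DPS db codes used in the other examples; the source table name is hypothetical:

import pandas as pd

tasks = pd.DataFrame([{
    'el_type': 'datax',
    'src_tb_nm': 'cust_info',   # hypothetical source table
    'src_db_cd': 'CRM',
    'trgt_tb_nm': None,         # None -> derived via get_targt_tb_nm
    'trgt_db_cd': 'DPS',
    'read_where': '',
    'pre_sql': 'truncate',
    'parallel_num': 2,
}])
multi_proc_el(tasks, batch_id='20230101', processes=2)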
Example #4
def parse_sys_args(args):
    if isinstance(args, str):
        args = args.replace("，", ",").split()  # normalize full-width commas
    schm_tb_nm = args[1].replace("，", ",")
    job_type = args[2]  # job_type is required: one of ddl/dml/func/el/batch_el
    param_list = args[3:]
    param_list_len = len(param_list)
    if param_list_len > 0:  # extra key/value parameters were passed
        if param_list_len % 2 == 0:
            keys = [
                param_list[i].replace("-", "") for i in range(param_list_len)
                if i % 2 == 0
            ]
            vals = [param_list[i] for i in range(param_list_len) if i % 2 == 1]
            params_dict = dict(zip(keys, vals))
        else:
            err_msgs = " ".join(param_list) + " parameters must come in key/value pairs"
            logger.error(err_msgs)
            raise Exception(err_msgs)
    else:
        params_dict = {}
    if 'conn' in params_dict:
        conn = Conn(params_dict['conn'])
    else:
        conn = Conn()
    if 'p' in params_dict:  # number of pool processes
        params_dict['processes'] = params_dict['p']
    if 'c' in params_dict:  # whether to check the table structure
        params_dict['check_tb_strct'] = params_dict['c']
    if 'nm' in params_dict:
        params_dict['batch_nm'] = params_dict['nm']
    return conn, schm_tb_nm, job_type, params_dict
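For example, with an argv-style list (values are illustrative):

# e.g. python job.py dw.dim_prod dml -p 4 -nm T1
conn, schm_tb_nm, job_type, params = parse_sys_args(
    ['job.py', 'dw.dim_prod', 'dml', '-p', '4', '-nm', 'T1'])
# -> schm_tb_nm='dw.dim_prod', job_type='dml',
#    params == {'p': '4', 'nm': 'T1', 'processes': '4', 'batch_nm': 'T1'}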
Example #5
def deal():
    """
    Processing entry point.
    """
    conn = Conn(default_db_cd)
    crt_el_log_tb(conn)
    try:
        # verify the table structure
        if conn.upd_tb_strct(crt_tb_sql=ddl_sql, schm_tb_nm=tb_nm, drop_direct=True):
            etl_meta_el(conn)
    finally:
        conn.close()
Example #6
def get_cols(db_cd, tb_nm):
    """
    Get the column names of a table.
    :param db_cd: database code
    :param tb_nm:
    :return:
    """
    conn = Conn(db_cd)
    rs = conn.get_tb_strct(tb_nm)  # fetch the table structure
    if rs:
        rs = rs['meta']['col_nm']
        cols = list(rs)
        return cols  # '"' + '","'.join(cols) + '"'
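A usage sketch (db code and table name are illustrative, reusing names from the other examples):

cols = get_cols('DPS', 'dw.dim_prod')  # e.g. ['prod_id', 'prod_nm', ...]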
Example #7
def deal():
    """
    Processing entry point.
    :return:
    """
    conn = Conn('ibm')
    try:
        with conn:
            # table structure check
            if conn.upd_tb_strct(crt_tb_sql=ddl_sql, schm_tb_nm=tb_nm, drop_direct=proc_all_flag):
                # product categories
                dim_prod_cat(conn)
    finally:
        conn.close()
Example #8
 def get_trgt_tb_crt_sql(self, trgt_tb_nm, trgt_db_cd="DPS", if_create_table=False):
     """
     Generate the target-system DDL from the source table structure; only suited to simple
     ods-layer sync tables, and does not capture indexes, primary keys, etc.
     :param if_create_table: whether to create the table immediately
     :param trgt_tb_nm: target table name including schema, e.g. dw.dim_date
     :param trgt_db_cd:
     :return:
     """
     tb_strct = self.get_tb_strct()
     # meta = tb_strct.copy()  # source table structure: a dataframe with 'col_nm', 'col_type', 'col_len', 'col_prcn', 'col_cmnt'
     trgt_db_conn = Conn(trgt_db_cd)
     crt_tb_sql = crt_trgt_db_sql(tb_strct, trgt_tb_nm, trgt_db_cd=trgt_db_cd)
     if if_create_table:
         trgt_db_conn.upd_tb_strct(crt_tb_sql, schm_tb_nm=trgt_tb_nm, drop_direct=False)
     return crt_tb_sql  # {"tb_nm": trgt_tb_nm, 'crt_tb_sql': crt_tb_sql, 'meta': meta}
Example #9
def crt_trgt_db_sql(tb_strct, trgt_tb_nm, trgt_db_cd="DPS"):
    """
    Generate the target-system DDL from the source table structure.
    :param tb_strct:
    :param trgt_tb_nm:
    :param trgt_db_cd:
    :return:
    """
    meta = tb_strct['meta'].copy()  # source table structure: a dataframe with 'col_nm', 'col_type', 'col_len', 'col_prcn', 'col_cmnt'
    from_db_type = tb_strct['db_type']
    trgt_db_conn = Conn(trgt_db_cd)
    # trgt_db_conn, trgt_db_type, trgt_db_nm = get_db_conn(trgt_db_cd)
    ignore_cols_list = get_ignore_cols(trgt_tb_nm, trgt_db_conn)  # columns to exclude
    if ignore_cols_list:
        meta = meta[~meta['col_nm'].isin(ignore_cols_list)]  # drop excluded columns
        logger.warning(trgt_tb_nm + " excluding unsynced columns: " + ','.join(ignore_cols_list))
    from etls.comm.dbmapping import get_db_mapping
    data_type_mapping = get_db_mapping(from_db_type, trgt_db_conn.db_type)
    meta['col_type'] = meta['col_type'].apply(lambda x: data_type_mapping.get(x, data_type_mapping.get('default', x)))
    if trgt_db_conn.db_type.lower() in ["postgresql", "pg", "greenplum", "gp"]:
        logger.debug("converting to a %s database" % trgt_db_conn.db_type)
        from etls.comm.greenplum import generate_simple_ddl
        crt_tb_sql = generate_simple_ddl(trgt_tb_nm, tb_strct, meta)
        return crt_tb_sql  # {"tb_nm": trgt_tb_nm, 'crt_tb_sql': crt_tb_sql, 'meta': meta}
    else:
        raise Exception("unsupported target database type: " + trgt_db_conn.db_type)
Example #10
def get_el_tb_job_stat(batch_id, tb_nm, conn=None):
    """
    :param batch_id:
    :param tb_nm:
    :param conn:
    :return:
    """
    if conn is None:
        conn = Conn()
    sql = """select batch_id,tb_nm,batch_stat stat from {log_tb_nm} 
             where batch_id='{batch_id}' and tb_nm='{tb_nm}' order by start_dt desc limit 1"""
    sql = sql.format(batch_id=batch_id, log_tb_nm=tb_task_log, tb_nm=tb_nm)
    df = conn.select(sql)  # pd.read_sql(sql, conn)
    if df.shape[0] > 0:
        return df.loc[0, 'stat']
    else:
        return -1
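A usage sketch (the batch id is hypothetical; the table name reuses the ods.pf_cust_info example):

stat = get_el_tb_job_stat('20230101', 'ods.pf_cust_info')
# per the other examples: 0 = running, 1 = succeeded, 2 = failed; -1 = no log row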
Example #11
def get_from_dim_date(batch_dt, col_nm):
    """
    Return the settlement time (海银 settlement time only).
    :param batch_dt: date
    :param col_nm: the column to select
    :return:
    """
    sql = """
       SELECT {col_nm} from dw.dim_date where dt_id='{batch_dt}'
        """.format(batch_dt=batch_dt, col_nm=col_nm)
    dps = Conn("DPS")
    try:
        rs = dps.query_one(sql)
        return rs
    except Exception as e:
        logger.warning(str(e))  # log and swallow: missing dates return None
        return None
Example #12
def el_tbs(conn, tb_nm, batch_id=None, logs_print=False):
    """
    同步ods数据主程序入口,同步task_infos所有的未删除表
    :param tb_nm: 如果指定则只同步指定表 也可是多个表 例如ods.pf_cust_info,ods.portal_wechat_card
    :param batch_id:批次ID
    :param conn: 数据库连接
    :param logs_print: 是否打印同步的明细日志
    :return:
    """
    if ',' in tb_nm:
        tbs = tb_nm.replace(' ', '').split(",")
        tb_nm = str(tuple(tbs))
    else:
        tb_nm = "('" + tb_nm + "')"
    if conn is None:
        conn = Conn()
    task_sql = """
        select
            src_tb_nm read_tb,
            src_db_cd read_conn,
            tb_nm write_tb,
            trgt_db_cd write_conn,
            sql_where read_where,
            sql_pre_sql pre_sql,
            el_type,
            parallel_num::int parallel_num
         from {tb_task_info} where is_del='N' and tb_nm in {tb_nm}
        """
    task_sql = task_sql.format(tb_nm=tb_nm, tb_task_info=tb_task_info)
    crt_el_log_tb(conn)
    df = conn.select(task_sql)
    if df.shape[0] > 0:
        rses = df.to_dict(orient='records')
        if batch_id:
            for rs in rses:
                rs['batch_id'] = batch_id
                rs['logs_print'] = logs_print
                run_el_with_batch(**rs)
        else:
            for rs in rses:
                rs['logs_print'] = logs_print
                run_el(**rs)
    else:
        logger.info("没有表需要同步")
Example #13
def get_ignore_cols(trgt_tb_nm, conn=None):
    """获取忽略导入字段"""
    if conn is None:
        conn = Conn()
    sql = """
            select 
            s.tb_nm,
            s.ignore_cols
            from {tb_task_info} s 
            where ignore_cols>'' and tb_nm='{trgt_tb_nm}'
        """
    sql = sql.format(trgt_tb_nm=trgt_tb_nm, tb_task_info=tb_task_info)
    if conn.is_exists(tb_task_info):
        df = conn.select(sql)  # pd.read_sql(sql, conn)
        if df.shape[0] > 0:
            return df.loc[0, 'ignore_cols'].split(',')  # list(df['ignore_cols'])
        else:
            return None
    else:
        return None
Example #14
def src_tb_sync_ods(src_tb_nm,
                    src_db_cd,
                    trgt_tb_nm=None,
                    trgt_db_cd=default_db_cd,
                    trgt_schm_nm='ods',
                    if_el_data=True):
    """
    源系统和目标表结构创建或者校验
    :param if_el_data:  是否导入数据
    :param src_tb_nm:
    :param src_db_cd:
    :param trgt_tb_nm:
    :param trgt_db_cd:
    :param trgt_schm_nm:
    :return:
    """
    src_conn = Conn(src_db_cd)
    src_meta = src_conn.get_tb_strct(src_tb_nm)  # fetch the source table structure
    src_conn.close()
    if src_meta:
        if trgt_tb_nm is None:  # auto-generate the target table name when none is set
            trgt_tb_nm = get_targt_tb_nm(src_meta['tb_nm'],
                                         src_meta['db_cd'],
                                         schema=trgt_schm_nm)
        crt_tb_sql = crt_trgt_db_sql(src_meta, trgt_tb_nm, trgt_db_cd)
        trgt_conn = Conn(trgt_db_cd)
        rs = trgt_conn.upd_tb_strct(crt_tb_sql,
                                    schm_tb_nm=trgt_tb_nm,
                                    drop_direct=False)
        trgt_conn.close()
        if if_el_data:
            datax(src_tb_nm,
                  src_db_cd,
                  trgt_tb_nm,
                  write_conn=trgt_db_cd,
                  check_tb_strct=False,
                  logs_print=False)
        return rs
    else:
        raise Exception("源数据库目标表表%s不存在" % src_tb_nm)
Example #15
def to_check_tb_strct(src_tb_nm=None,
                      src_db_cd=None,
                      trgt_tb_nm=None,
                      trgt_db_cd="DPS",
                      if_ddl_not_exists="raise"):
    """
    Verify the table structure.
    :param if_ddl_not_exists:
    :param src_tb_nm:
    :param src_db_cd:
    :param trgt_tb_nm:
    :param trgt_db_cd:
    :return:
    """
    sql = get_sql(trgt_tb_nm, sql_type='ddl')  # load the table's DDL sql file
    if sql:  # when a sql file exists it takes precedence
        if isinstance(trgt_db_cd, str):
            conn = Conn(trgt_db_cd)  # get_conn(trgt_db_cd)
        else:
            conn = trgt_db_cd
        check_rs = conn.upd_tb_strct(sql, trgt_tb_nm)
        if check_rs is False:
            raise Exception("表结构校验不通过,请查看上面的错误明细")
    elif src_tb_nm and src_db_cd:
        err_msg = "根目录下的sql/ddl/" + trgt_tb_nm.replace(
            ".", "/") + ".sql文件不存在,从源系统拉取表结构校验"
        logger.warning(err_msg)
        src_tb_sync_ods(src_tb_nm,
                        src_db_cd,
                        trgt_tb_nm,
                        trgt_db_cd,
                        if_el_data=False)  # verify and fix the table structure
    else:
        err_msg = "根目录下的sql/ddl/" + trgt_tb_nm.replace(".",
                                                       "/") + ".sql 文件不存在,无法校验"
        logger.warning(err_msg)
        if if_ddl_not_exists == "raise":
            # send_error_msg(err_msg, trgt_tb_nm)
            raise Exception(err_msg)
Example #16
def get_rpt_dtm(batch_dt, dtm_type):
    """
    Return the settlement time (海银 settlement time only).
    :param batch_dt: date
    :param dtm_type:
        -- previous report end time       cvrt_typ: last_rpt_end_dtm
        -- today's report end time        cvrt_typ: tdy_rpt_end_dtm
        -- next working day               cvrt_typ: nxt_workday
        -- first day of the quarter       cvrt_typ: quar_start
        -- first day of the next quarter  cvrt_typ: quar_end
    :return:
    """
    if dtm_type in [
            'last_rpt_end_dtm', 'tdy_rpt_end_dtm', 'nxt_workday', 'quar_start',
            'quar_end'
    ]:
        sql = """
           SELECT
               dw.cvrt_dt('{batch_dt}', '{dtm_type}') AS dt  -- report end time of the previous working day
           """.format(batch_dt=batch_dt, dtm_type=dtm_type)
        dps = Conn("DPS")
        return dps.query_one(sql)
    else:
        return None
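A usage sketch (the date is illustrative):

nxt = get_rpt_dtm('20230101', 'nxt_workday')  # next working day via dw.cvrt_dt
assert get_rpt_dtm('20230101', 'unknown_type') is None  # unsupported types return None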
Example #17
 def exec_flow(self):
     """执行flow"""
     # print(dsk['T1'])
     # keyorder = order(dsk)
     # # print(keyorder)
     # state = start_state_from_dask(dsk, cache=None, sortkey=keyorder.get)
     # print(state)
     try:
         crt_el_log_tb(Conn())  # check for log-table structure changes
         dsk = self.get_flow()
         get(dsk, 'end', num_workers=8)  # run the tasks
     except Exception as e:
         from etls.comm.emailcomm import send_error_msg
         logger.error(str(e))
         # send_error_msg(str(e), "flow执行错误%s" % self.batch_nm, if_to_wx=False)
         raise e
Example #18
 def __init__(self, conn, tb_nm, cols=None, if_check_table_struct=False):
     assert isinstance(tb_nm, str)
     self.tb_nm = tb_nm
     if isinstance(conn, str):
         conn = Conn(conn)
     self.conn = conn
     if if_check_table_struct:
         check_table_struct(conn, tb_nm)
     if self.conn.is_exists(tb_nm) is False:
         rs = check_table_struct(conn, tb_nm)
         if rs is False:
             raise Exception(" %s数据库中不存在表%s" % (self.conn.db_cd, tb_nm))
     if cols:
         self.cols = cols
     else:
         self.cols = self.get_tb_cols()
Example #19
 def __init__(self,
              conn,
              tb_nm,
              job_type,
              job_params=None,
              batch_id=None,
              check_tb_strct=True,
              logs_print=True):
     # table name, including schema (str)
     self.tb_nm = tb_nm.strip()
     self.conn = conn if isinstance(conn, Conn) else Conn(conn)
     self.job_params = job_params if job_params else {}
     self.batch_id = batch_id if batch_id else self.job_params.get(
         'batch_id', None)
     # job type: ddl dml el py sql batch_el; for python jobs, job_type is the name of the function in the python file
     if job_type.lower() in list(jobtype.values()):
         self.job_type = job_type.lower()
         self.action = self.job_params.get('el_type', 'datax').lower()
         if (job_type.lower() == jobtype['el']
                 and self.tb_nm.lower() == jobtype['batch_el']):
             self.job_type = 'batch_el'
             if 'batch_nm' in self.job_params:
                 self.tb_nm = self.job_params.get('batch_nm')
             else:
                 # read the params file for this batch id
                 paramsfile = BatchParams(self.batch_id).read()
                 if 'batch_nm' in paramsfile:
                     self.tb_nm = paramsfile.get('batch_nm')
                 else:
                     raise Exception("batch_el bulk-load jobs must specify batch_nm")
     else:
         self.job_type = 'py'
         self.action = job_type
     self.check_tb_strct = get_bool_val(
         check_tb_strct if check_tb_strct is not None
         else self.job_params.get('check_tb_strct', True))
     self.logs_print = get_bool_val(
         logs_print if logs_print is not None
         else self.job_params.get('logs_print', True))
Example #20
def run_el_with_batch(batch_id,
                      el_type,
                      read_tb,
                      read_conn,
                      write_tb,
                      write_conn='DPS',
                      read_where='',
                      pre_sql='truncate',
                      parallel_num=2,
                      check_tb_strct=True,
                      logs_print=True):
    """
    同步表数据
    :param batch_id: 批次批次编号
    :param el_type:  datax或者pypd
    :param read_tb: 读取表名
    :param read_conn: 读取的数据库标识 例如DPS CRM PFUND
    :param write_tb: 写入的表名 例如 dw.dim_prod
    :param write_conn: 写入库的库名 例如DPS CRM PFUND
    :param read_where: sql where条件
    :param pre_sql:  导入前sql操作,truncate 表示清空表,可以有其他sql
    :param parallel_num: 并发的channel 数
    :param check_tb_strct: 是否检验表结构
    :param logs_print 是否打印日志到终端展示 不管是否设置 日志都会存储到datax/log路径下
    :return:
    """
    dw_conn = Conn(write_conn)
    if write_tb is None:
        write_tb = get_targt_tb_nm(read_tb, read_conn)
    stat = get_el_tb_job_stat(batch_id, write_tb, dw_conn)  # get the job status
    if is_runing(write_tb, dw_conn):  # skip if the job is already being processed
        logger.info("el_type %s table %s is already being processed; skipping" % (el_type, write_tb))
        el_upd_stat(dw_conn,
                    batch_id,
                    write_tb,
                    batch_stat=1,
                    error_msg="正在处理中不再处理")
    if stat != 1:
        # stat == 1 means the job already succeeded, so do not run it again
        el_start_stat(dw_conn, batch_id, write_tb)
        try:
            logger.info("开始同步 batch_id %s el_type %s 处理表: %s" %
                        (batch_id, el_type, write_tb))
            # logger.debug("%s 导入数据前执行:%s" % (write_tb, pre_sql))
            # logger.debug("%s 导入数据条件:%s" % (write_tb, read_where))
            rs = run_el(el_type,
                        read_tb,
                        read_conn,
                        write_tb,
                        write_conn,
                        read_where,
                        pre_sql,
                        parallel_num,
                        check_tb_strct,
                        logs_print,
                        batch_dt=batch_id)
            if rs:
                el_upd_stat(dw_conn, batch_id, write_tb, batch_stat=1)
                logger.info("同步成功 batch_id %s el_type %s 处理表: %s" %
                            (batch_id, el_type, write_tb))
            else:
                raise Exception("不知名错误")
        except Exception as e:
            err_msg = str(e)
            logger.error("同步错误 batch_id %s el_type %s 处理表: %s ERROR: %s" %
                         (batch_id, el_type, write_tb, err_msg))
            el_upd_stat(dw_conn,
                        batch_id,
                        write_tb,
                        batch_stat=2,
                        error_msg=err_msg)
            send_error_msg(err_msg, write_tb, if_to_wx=False)
            raise Exception(err_msg)
        finally:
            dw_conn.close()
    else:
        # data for this batch was already synced; do not sync again
        el_upd_stat(dw_conn,
                    batch_id,
                    write_tb,
                    batch_stat=1,
                    error_msg="多次执行,执行跳过")
        logger.warning("该批次下数据已经同步过,不再同步。 batch_id %s el_type %s 处理表: %s" %
                       (batch_id, el_type, write_tb))
        dw_conn.close()
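A usage sketch (table names and batch id are hypothetical; db codes follow the CRM/DPS examples above):

run_el_with_batch('20230101', 'datax', 'cust_info', 'CRM', 'ods.cust_info',
                  write_conn='DPS', pre_sql='truncate', parallel_num=2)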
Example #21
def crt_all_tb_from_src(src_db_cd,
                        src_tb_schm,
                        trgt_db_cd="DPS",
                        trgt_schm_nm='ods',
                        batch_id=None,
                        if_el_data=False):
    """
    一次性移动某个系统的表到数据仓库中,一般只有临时性导入某数据库才这样操作,不能用于生产调度
    :param if_el_data:  是导入源系统数据。默认只建表结构
    :param batch_id:   有batch_id的,run_el_with_batch 导入并记录到task_log里面,没有指定的直接导datax导数据
    :param src_db_cd: 源系统编号
    :param src_tb_schm: 源系统schema 只需要指定shcema 没有的用public
    :param trgt_db_cd: 目标系统编号 DPS HQ_HR
    :param trgt_schm_nm: 目标系统schema
    :return:
    """
    conn = Conn(src_db_cd)
    # conn, db_type, db_nm = get_db_conn(src_db_cd)
    if conn.db_type.upper() == 'ORACLE':
        sql = """select distinct table_name tb_nm from user_tables  order by  table_name"""
    elif conn.db_type.upper() == 'MYSQL':
        sql = """
           select distinct t2.table_name tb_nm
           from   information_schema.tables t2 
           where t2.table_schema='{db_nm}' and table_type='BASE TABLE';""".format(
            db_nm=conn.db_nm)
    elif conn.db_type.upper() in ('POSTGRESQL', 'GREENPLUM'):
        sql = """
           SELECT   distinct
                          '{tb_schm}.'||B.relname tb_nm
                       FROM  pg_catalog.pg_class B
                       INNER JOIN pg_catalog.pg_namespace C
                           ON C.oid=B.relnamespace
                           AND C.nspname='{tb_schm}' where B.relam=0  and relkind='r'
           """.format(tb_schm=src_tb_schm)
    elif conn.db_type.upper() == 'SQLSERVER':
        sql = """SELECT
                  distinct sysobjects.name AS tb_nm 
               FROM syscolumns                            -- table columns
               INNER JOIN sysobjects                      -- database objects
                 ON sysobjects.id = syscolumns.id
               INNER JOIN systypes                        -- data types
                 ON syscolumns.xtype = systypes.xtype
               LEFT OUTER JOIN sys.extended_properties properties       -- column properties
                 ON syscolumns.id = properties.major_id
                AND syscolumns.colid = properties.minor_id
               LEFT OUTER JOIN sys.extended_properties sysproperties    -- table properties
                 ON sysobjects.id = sysproperties.major_id
                AND sysproperties.minor_id = 0
               LEFT OUTER JOIN syscomments                -- comments
                 ON syscolumns.cdefault = syscomments.id
               LEFT OUTER JOIN sysindexkeys               -- index key/column info
                 ON sysindexkeys.id = syscolumns.id
                AND sysindexkeys.colid = syscolumns.colid
               LEFT OUTER JOIN sysindexes                 -- index tables
                 ON sysindexes.id = sysindexkeys.id
                AND sysindexes.indid = sysindexkeys.indid
               LEFT OUTER JOIN sysforeignkeys
                 ON sysforeignkeys.fkeyid = syscolumns.id
                AND sysforeignkeys.fkey = syscolumns.colid
               left join INFORMATION_SCHEMA.tables ts on sysobjects.name =ts.table_name
               WHERE (sysobjects.xtype = 'U') and ts.Table_catalog='{db_nm}' 
               order by syscolumns.id""".format(db_nm=conn.db_nm, )
    else:
        # logger.error("unsupported database type: " + db_type)
        raise Exception("unsupported database type: " + conn.db_type)
    # logger.warning(sql)
    meta = conn.select(sql)  # pd.read_sql(sql, conn)
    if meta.shape[0] > 0:
        meta.columns = ['src_tb_nm']  # (map(lambda x: x.lower(), list(meta.columns)))
        meta['el_type'] = 'datax'
        meta['src_db_cd'] = src_db_cd
        meta['trgt_tb_nm'] = None
        meta['trgt_db_cd'] = trgt_db_cd
        meta['read_where'] = ''
        meta['pre_sql'] = 'truncate'
        meta['parallel_num'] = 2
        meta['if_el_data'] = if_el_data
        multi_proc_el(meta, batch_id, processes=5, trgt_schm_nm=trgt_schm_nm)
    logger.info("数据处理完成")
Example #22
def json_file(read_tb,
              read_conn,
              write_tb,
              write_conn,
              read_where='',
              pre_sql='truncate',
              parallel_num=1):
    """
    创建datax执行的json文件
    :param read_tb:  源表
    :param read_conn: 源系统连接
    :param write_tb:   目标表
    :param write_conn:  目标系统连接
    :param read_where:  where过滤
    :param pre_sql:   执行前执行sql
    :param parallel_num: 进程数 channel
    :return:
    """
    if os.path.exists(
            os.path.join(datax_job_json_path, (write_tb + ".json").lower())):
        # when the json config file already exists, return its path directly
        return os.path.join(datax_job_json_path, (write_tb + ".json").lower())
    if pre_sql == 'truncate':
        pre_sql = 'truncate table ' + write_tb
    src_conn = Conn(read_conn)
    read_db_type, read_user, read_pwd, read_jdbc = src_conn.get_jdbc()
    if read_db_type == "Greenplum".upper():
        # greeenplum 读 走postgresql线路
        read_db_type = "Postgresql"
    trgt_conn = Conn(write_conn)
    write_db_type, write_user, write_pwd, write_jdbc = trgt_conn.get_jdbc()
    if write_db_type == "Greenplum".upper():
        # greeenplum 写
        write_db_type = "gpdb"
    cols = get_cols(write_conn, write_tb)
    ignore_cols_list = get_ignore_cols(write_tb, trgt_conn)
    if ignore_cols_list:
        cols = list(set(cols) - set(ignore_cols_list))
    src_cols = get_cols(read_conn, read_tb)
    cols = list(filter(lambda x: x in src_cols, cols))  # drop columns missing from the source
    jsons = {
        "job": {
            "content": [{
                "reader": {
                    "name": read_db_type.lower() + "reader",
                    "parameter": {
                        "username": read_user,
                        "password": read_pwd,
                        "column": cols,
                        "where": read_where,
                        "connection": [{
                            "table": [read_tb],
                            "jdbcUrl": [read_jdbc]
                        }]
                    }
                },
                "writer": {
                    "name": write_db_type.lower() + "writer",
                    "parameter": {
                        "username": write_user,
                        "password": write_pwd,
                        "column": cols,
                        "preSql": [pre_sql],
                        "connection": [{
                            "jdbcUrl": write_jdbc,
                            "table": [write_tb]
                        }]
                    }
                }
            }],
            "setting": {
                "speed": {
                    "channel": parallel_num
                }
            }
        }
    }
    try:
        datax_json_path = os.path.join(TEMP_HOME, (write_tb + ".json").lower())
        json_str = json.dumps(jsons,
                              indent=4,
                              sort_keys=False,
                              ensure_ascii=False)
        logger.debug(
            "\n" +
            json_str.replace(read_pwd, '*****').replace(write_pwd, '******'))
        with open(datax_json_path, "w") as datax_json_file_handler:
            datax_json_file_handler.write(json_str)
        logger.info('datax json file generated: ' + datax_json_path)
        return datax_json_path
    except Exception as e:
        logger.error('datax json file generation error: ' + str(e))
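A usage sketch (tables and db codes are hypothetical):

path = json_file('cust_info', 'CRM', 'ods.cust_info', 'DPS', parallel_num=2)
# -> path of the generated DataX job json (under TEMP_HOME on first generation)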
Example #23
def multi_proc_el_all_task(batch_id,
                           conn=None,
                           batch_nm='T1',
                           processes=5,
                           logs_print=False,
                           only_tables=None):
    """
    同步ods数据主程序入口,同步task_infos所有的未删除表
    :param only_tables: 如果指定则只同步指定表
    :param batch_id:批次ID 字符串
    :param conn: 数据库连接
    :param batch_nm: 是否只处理增量表,用于准实时数据同步
    :param processes: 处理进程数个数,默认是5个,根据线上配置个数5-20个
    :param logs_print: 是否打印同步的明细日志
    :return:
    """
    batch_id = str(batch_id)
    if conn is None:
        conn = Conn()
    task_sql = """
        select
            src_tb_nm read_tb,
            src_db_cd read_conn,
            tb_nm write_tb,
            trgt_db_cd write_conn,
            sql_where read_where,
            sql_pre_sql pre_sql,
            el_type,
            parallel_num::int parallel_num
         from {tb_task_info} where is_del='N' and batch_nm='{batch_nm}'
        """
    # is_inc_where = " and batch_nm='{batch_nm}'".format(batch_nm=batch_nm, tb_task_info=tb_task_info)
    task_sql = task_sql.format(batch_nm=batch_nm,
                               tb_task_info=tb_task_info)  # + is_inc_where
    crt_el_log_tb(conn)
    df = conn.select(task_sql)  # pd.read_sql(task_sql, conn)
    if only_tables:  # when specific tables were requested
        only_tables = only_tables.replace("，", ",").replace(" ", "").split(",")  # normalize full-width commas
        df = df[df['write_tb'].isin(only_tables)]
    if df.shape[0] > 0:
        rs = df.to_dict(orient='records')
        logger.info("batch_id: %s starting %s worker processes; beginning run_el_with_batch batch processing" %
                    (batch_id, processes))
        pool = multiprocessing.Pool(processes=processes)
        for i in rs:
            i['batch_id'] = batch_id
            i['logs_print'] = logs_print
            pool.apply_async(run_el_with_batch, kwds=i)
        pool.close()
        pool.join()
        logger.info("batch_id:%s 完成数据导入" % batch_id)
        if only_tables:
            check_el_task(batch_id, conn, batch_nm, check_error_only=True)
            # check job results and raise on errors; email/WeChat notifications could be added here
        else:
            check_el_task(batch_id,
                          conn,
                          batch_nm,
                          check_error_only=False,
                          task_list=[i['write_tb'] for i in rs])
            # check job results and raise on errors
    else:
        logger.info("没有表需要同步")