def is_runing(tb_nm, conn=None, batch_id=''):
    """
    Check whether a sync job for the table is still running; if so, wait for it.

    :param batch_id: batch id of the current run, used in alert messages
    :param tb_nm: target table name
    :param conn: database connection; a default Conn() is created if omitted
    :return: False once no run is in progress; raises after waiting too long
    """
    if conn is None:
        conn = Conn()
    sql = """select batch_id from {log_tb_nm}
             where tb_nm='{tb_nm}'
               and start_dt >= current_timestamp - interval '3 hour'
               and batch_stat=0"""
    sql = sql.format(log_tb_nm=tb_task_log, tb_nm=tb_nm)
    flag = conn.query_one(sql)  # pd.read_sql(sql, conn)
    times = 0
    while flag:
        logger.warning("%s is still running; this run is waiting, attempt %s" % (tb_nm, times))
        send_error_msg("Job with batch_id %s is still running; this run (batch %s, table %s) "
                       "is waiting. Please check for a stuck run" % (flag, batch_id, tb_nm),
                       tb_nm, if_to_wx=False)
        time.sleep(300)  # wait 5 minutes
        times += 1
        flag = conn.query_one(sql)
        if flag and times > 3:
            err_msg = "%s has been running for too long; please check for a failure. This run aborts" % tb_nm
            logger.error(err_msg)
            raise Exception(err_msg)
    return False
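
# A minimal usage sketch of is_runing (hypothetical table name; requires a live
# default connection and the task-log table named by tb_task_log):
def _demo_is_runing():
    conn = Conn()
    try:
        # Returns False once no run of the table is open in the log; after four
        # 5-minute waits it raises instead of returning.
        if is_runing('ods.pf_cust_info', conn, batch_id='20230101') is False:
            pass  # safe to start this run
    finally:
        conn.close()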
def crt_tb_from_src_sys(self, src_tb_nm, src_db_cd, trgt_schm_nm='ods'):
    """
    Create or verify the target table structure from the source system during data sync.

    :param src_tb_nm: source table name
    :param src_db_cd: source database code
    :param trgt_schm_nm: target schema name
    :return: result of the target-table structure update
    """
    trgt_tb_nm = self.tb_nm
    trgt_db_cd = self.conn.db_cd
    src_conn = Conn(src_db_cd)
    src_meta = src_conn.get_tb_strct(src_tb_nm)  # fetch the source table structure
    src_conn.close()
    if src_meta:
        if trgt_tb_nm is None:
            # No target table name given: generate one automatically
            trgt_tb_nm = self.get_auto_el_tb_nm(schema=trgt_schm_nm)
        crt_tb_sql = crt_trgt_db_sql(src_meta, trgt_tb_nm, trgt_db_cd)
        trgt_conn = Conn(trgt_db_cd)
        rs = trgt_conn.upd_tb_strct(crt_tb_sql, schm_tb_nm=trgt_tb_nm, drop_direct=False)
        trgt_conn.close()
        return rs
    else:
        raise Exception("Table %s does not exist in the source database" % src_tb_nm)
def multi_proc_el(df, batch_id, processes=5, trgt_schm_nm='ods'):
    """
    Process the rows of df in multiple processes: import data or create tables.

    :param batch_id: batch id
    :param df: dataframe; requires the columns batch_id, el_type, src_tb_nm, src_db_cd,
        trgt_tb_nm, trgt_db_cd=default_db_cd, read_where='', pre_sql='truncate', parallel_num=2
    :param processes: number of worker processes
    :param trgt_schm_nm: target schema, used when a target table name must be generated
    :return:
    """
    rs = df.to_dict(orient='records')
    logger.info("Batch import: starting %s worker processes" % (processes,))
    pool = multiprocessing.Pool(processes=processes)
    # Each i must carry the fields batch_id, el_type, src_tb_nm, src_db_cd, trgt_tb_nm,
    # trgt_db_cd, read_where='', pre_sql='truncate', parallel_num
    for i in rs:
        i['batch_id'] = batch_id
        if i['trgt_tb_nm'] is None:
            i['trgt_tb_nm'] = get_targt_tb_nm(i['src_tb_nm'], i['src_db_cd'],
                                              schema=trgt_schm_nm)
        # logger.info(str(i))
        pool.apply_async(_el_run, kwds=i)
    pool.close()
    pool.join()
    logger.info("Batch processing finished")
    conn = Conn()
    check_el_task(batch_id, conn, batch_nm='T1', check_error_only=True)
    conn.close()
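
# A usage sketch for multi_proc_el with hypothetical task values; the column
# names follow the docstring above:
def _demo_multi_proc_el():
    import pandas as pd
    df = pd.DataFrame([{
        'el_type': 'datax', 'src_tb_nm': 'cust_info', 'src_db_cd': 'CRM',
        'trgt_tb_nm': None, 'trgt_db_cd': 'DPS', 'read_where': '',
        'pre_sql': 'truncate', 'parallel_num': 2,
    }])
    # trgt_tb_nm is None, so get_targt_tb_nm derives an 'ods' table name.
    multi_proc_el(df, batch_id='20230101', processes=2)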
def parse_sys_args(args):
    if isinstance(args, str):
        args = args.replace("，", ",").split()
    schm_tb_nm = args[1].replace("，", ",")
    job_type = args[2]  # job_type is required; one of ddl/dml/func/el/batch_el
    param_list = args[3:]
    param_list_len = len(param_list)
    if param_list_len > 0:  # extra parameters were passed
        if param_list_len % 2 == 0:
            keys = [param_list[i].replace("-", "")
                    for i in range(param_list_len) if i % 2 == 0]
            vals = [param_list[i] for i in range(param_list_len) if i % 2 == 1]
            params_dict = dict(zip(keys, vals))
        else:
            err_msgs = " ".join(param_list) + " parameters must come in key/value pairs"
            logger.error(err_msgs)
            raise Exception(err_msgs)
    else:
        params_dict = {}
    if 'conn' in params_dict:
        conn = Conn(params_dict['conn'])
    else:
        conn = Conn()
    if 'p' in params_dict:  # size of the process pool
        params_dict['processes'] = params_dict['p']
    if 'c' in params_dict:  # whether to check the table structure
        params_dict['check_tb_strct'] = params_dict['c']
    if 'nm' in params_dict:
        params_dict['batch_nm'] = params_dict['nm']
    return conn, schm_tb_nm, job_type, params_dict
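
# A usage sketch for parse_sys_args with hypothetical command-line values,
# equivalent to: python run.py dw.dim_prod dml -batch_id 20230101 -p 5
def _demo_parse_sys_args():
    argv = ['run.py', 'dw.dim_prod', 'dml', '-batch_id', '20230101', '-p', '5']
    conn, schm_tb_nm, job_type, params_dict = parse_sys_args(argv)
    # schm_tb_nm == 'dw.dim_prod', job_type == 'dml',
    # params_dict == {'batch_id': '20230101', 'p': '5', 'processes': '5'}
    conn.close()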
def deal(): """ 处理入口 """ conn = Conn(default_db_cd) crt_el_log_tb(conn) try: # 校验表结构 if conn.upd_tb_strct(crt_tb_sql=ddl_sql, schm_tb_nm=tb_nm, drop_direct=True): etl_meta_el(conn) finally: conn.close()
def get_cols(db_cd, tb_nm):
    """
    Get the column names of a table.

    :param db_cd: database connection code
    :param tb_nm: table name
    :return: list of column names, or None if the table is not found
    """
    conn = Conn(db_cd)
    rs = conn.get_tb_strct(tb_nm)  # fetch the table structure
    if rs:
        cols = list(rs['meta']['col_nm'])
        return cols  # '"' + '","'.join(cols) + '"'
    return None
def deal(): """ 处理入口 :return: """ conn = Conn('ibm') try: with conn: # 表结构校验 if conn.upd_tb_strct(crt_tb_sql=ddl_sql, schm_tb_nm=tb_nm, drop_direct=proc_all_flag): # 产品分类 dim_prod_cat(conn) finally: conn.close()
def get_trgt_tb_crt_sql(self, trgt_tb_nm, trgt_db_cd="DPS", if_create_table=False):
    """
    Generate the target-system DDL from the source table structure. Only suitable
    for simple ods-layer sync tables; indexes, primary keys etc. are not carried over.

    :param if_create_table: create the table immediately
    :param trgt_tb_nm: target table name including schema, e.g. dw.dim_date
    :param trgt_db_cd: target database code
    :return:
    """
    tb_strct = self.get_tb_strct()
    # meta = tb_strct.copy()  # source structure: dataframe with 'col_nm', 'col_type', 'col_len', 'col_prcn', 'col_cmnt'
    trgt_db_conn = Conn(trgt_db_cd)
    crt_tb_sql = crt_trgt_db_sql(tb_strct, trgt_tb_nm, trgt_db_cd=trgt_db_cd)
    if if_create_table:
        trgt_db_conn.upd_tb_strct(crt_tb_sql, schm_tb_nm=trgt_tb_nm, drop_direct=False)
    return crt_tb_sql  # {"tb_nm": trgt_tb_nm, 'crt_tb_sql': crt_tb_sql, 'meta': meta}
def crt_trgt_db_sql(tb_strct, trgt_tb_nm, trgt_db_cd="DPS"):
    """
    Generate the target-system DDL from the source table structure.

    :param tb_strct: source table structure
    :param trgt_tb_nm: target table name
    :param trgt_db_cd: target database code
    :return:
    """
    # Source structure: dataframe with 'col_nm', 'col_type', 'col_len', 'col_prcn', 'col_cmnt'
    meta = tb_strct['meta'].copy()
    from_db_type = tb_strct['db_type']
    trgt_db_conn = Conn(trgt_db_cd)  # trgt_db_conn, trgt_db_type, trgt_db_nm = get_db_conn(trgt_db_cd)
    ignore_cols_list = get_ignore_cols(trgt_tb_nm, trgt_db_conn)  # columns excluded from the sync
    if ignore_cols_list:
        meta = meta[~meta['col_nm'].isin(ignore_cols_list)]  # drop the excluded columns
        logger.warning(trgt_tb_nm + " excluding columns from sync: " + ','.join(ignore_cols_list))
    from etls.comm.dbmapping import get_db_mapping
    data_type_mapping = get_db_mapping(from_db_type, trgt_db_conn.db_type)
    meta['col_type'] = meta['col_type'].apply(
        lambda x: data_type_mapping.get(x, data_type_mapping.get('default', x)))
    # postgresql and greenplum share the same simple-DDL generator
    if trgt_db_conn.db_type.lower() in ["postgresql", "pg", "greenplum", "gp"]:
        logger.debug("Converting DDL for a %s database" % trgt_db_conn.db_type)
        from etls.comm.greenplum import generate_simple_ddl
        crt_tb_sql = generate_simple_ddl(trgt_tb_nm, tb_strct, meta)
        return crt_tb_sql  # {"tb_nm": trgt_tb_nm, 'crt_tb_sql': crt_tb_sql, 'meta': meta}
    else:
        raise Exception("Unsupported target database type: " + trgt_db_conn.db_type)
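
# An illustration of the type-mapping fallback used above. The mapping dict is
# hypothetical; the real one comes from etls.comm.dbmapping.get_db_mapping:
def _demo_type_mapping():
    data_type_mapping = {'NUMBER': 'numeric', 'VARCHAR2': 'varchar', 'default': 'text'}
    col_types = ['NUMBER', 'VARCHAR2', 'CLOB']
    # Known types map directly; unknown ones fall back to 'default', and if no
    # 'default' entry existed the original type would pass through unchanged.
    mapped = [data_type_mapping.get(x, data_type_mapping.get('default', x)) for x in col_types]
    assert mapped == ['numeric', 'varchar', 'text']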
def get_el_tb_job_stat(batch_id, tb_nm, conn=None):
    """
    Get the latest job status for a table within a batch.

    :param batch_id: batch id
    :param tb_nm: table name
    :param conn: database connection; a default Conn() is created if omitted
    :return: the latest batch_stat, or -1 if no log row exists
    """
    if conn is None:
        conn = Conn()
    sql = """select batch_id, tb_nm, batch_stat stat from {log_tb_nm}
             where batch_id='{batch_id}' and tb_nm='{tb_nm}'
             order by start_dt desc limit 1"""
    sql = sql.format(batch_id=batch_id, log_tb_nm=tb_task_log, tb_nm=tb_nm)
    df = conn.select(sql)  # pd.read_sql(sql, conn)
    if df.shape[0] > 0:
        return df.loc[0, 'stat']
    else:
        return -1
def get_from_dim_date(batch_dt, col_nm):
    """
    Return a settlement time; Haiyin settlement dates only.

    :param batch_dt: date
    :param col_nm: the column to fetch
    :return:
    """
    sql = """
    SELECT {col_nm} from dw.dim_date where dt_id='{batch_dt}'
    """.format(batch_dt=batch_dt, col_nm=col_nm)
    dps = Conn("DPS")
    try:
        rs = dps.query_one(sql)
        return rs
    except Exception as e:
        logger.warning(str(e))
        return None
def el_tbs(conn, tb_nm, batch_id=None, logs_print=False):
    """
    Main entry point for syncing ods data: syncs the given not-deleted tables in task_infos.

    :param tb_nm: only sync the given table(s); accepts a comma-separated list,
        e.g. ods.pf_cust_info,ods.portal_wechat_card
    :param batch_id: batch id
    :param conn: database connection
    :param logs_print: print the detailed sync logs
    :return:
    """
    if ',' in tb_nm:
        tbs = tb_nm.replace(' ', '').split(",")
        tb_nm = str(tuple(tbs))
    else:
        tb_nm = "('" + tb_nm + "')"
    if conn is None:
        conn = Conn()
    task_sql = """
    select src_tb_nm read_tb, src_db_cd read_conn, tb_nm write_tb, trgt_db_cd write_conn,
           sql_where read_where, sql_pre_sql pre_sql, el_type, parallel_num::int parallel_num
    from {tb_task_info} where is_del='N' and tb_nm in {tb_nm}
    """
    task_sql = task_sql.format(tb_nm=tb_nm, tb_task_info=tb_task_info)
    crt_el_log_tb(conn)
    df = conn.select(task_sql)
    if df.shape[0] > 0:
        rses = df.to_dict(orient='records')
        if batch_id:
            for rs in rses:
                rs['batch_id'] = batch_id
                rs['logs_print'] = logs_print
                run_el_with_batch(**rs)
        else:
            for rs in rses:
                rs['logs_print'] = logs_print
                run_el(**rs)
    else:
        logger.info("No tables to sync")
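
# A usage sketch for el_tbs, using the example tables from the docstring
# (requires live connections and rows in the task-info table):
def _demo_el_tbs():
    conn = Conn()
    try:
        el_tbs(conn, 'ods.pf_cust_info,ods.portal_wechat_card',
               batch_id='20230101', logs_print=True)
    finally:
        conn.close()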
def get_ignore_cols(trgt_tb_nm, conn=None):
    """Get the columns to be excluded from the import."""
    if conn is None:
        conn = Conn()
    sql = """
    select s.tb_nm, s.ignore_cols from {tb_task_info} s
    where ignore_cols>'' and tb_nm='{trgt_tb_nm}'
    """
    sql = sql.format(trgt_tb_nm=trgt_tb_nm, tb_task_info=tb_task_info)
    if conn.is_exists(tb_task_info):
        df = conn.select(sql)  # pd.read_sql(sql, conn)
        if df.shape[0] > 0:
            return df.loc[0, 'ignore_cols'].split(',')  # list(df['ignore_cols'])
        else:
            return None
    else:
        return None
def src_tb_sync_ods(src_tb_nm, src_db_cd, trgt_tb_nm=None, trgt_db_cd=default_db_cd,
                    trgt_schm_nm='ods', if_el_data=True):
    """
    Create or verify the target table structure from the source system.

    :param if_el_data: also import the data
    :param src_tb_nm: source table name
    :param src_db_cd: source database code
    :param trgt_tb_nm: target table name
    :param trgt_db_cd: target database code
    :param trgt_schm_nm: target schema name
    :return:
    """
    src_conn = Conn(src_db_cd)
    src_meta = src_conn.get_tb_strct(src_tb_nm)  # fetch the source table structure
    src_conn.close()
    if src_meta:
        if trgt_tb_nm is None:
            # No target table name given: generate one automatically
            trgt_tb_nm = get_targt_tb_nm(src_meta['tb_nm'], src_meta['db_cd'],
                                         schema=trgt_schm_nm)
        crt_tb_sql = crt_trgt_db_sql(src_meta, trgt_tb_nm, trgt_db_cd)
        trgt_conn = Conn(trgt_db_cd)
        rs = trgt_conn.upd_tb_strct(crt_tb_sql, schm_tb_nm=trgt_tb_nm, drop_direct=False)
        trgt_conn.close()
        if if_el_data:
            datax(src_tb_nm, src_db_cd, trgt_tb_nm, write_conn=trgt_db_cd,
                  check_tb_strct=False, logs_print=False)
        return rs
    else:
        raise Exception("Table %s does not exist in the source database" % src_tb_nm)
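
# A usage sketch for src_tb_sync_ods with hypothetical source values: verify or
# create the ods table for CRM's cust_info (name derived by get_targt_tb_nm)
# without importing any data:
def _demo_src_tb_sync_ods():
    src_tb_sync_ods('cust_info', 'CRM', trgt_tb_nm=None, trgt_schm_nm='ods',
                    if_el_data=False)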
def to_check_tb_strct(src_tb_nm=None, src_db_cd=None, trgt_tb_nm=None,
                      trgt_db_cd="DPS", if_ddl_not_exists="raise"):
    """
    Verify the table structure.

    :param if_ddl_not_exists: what to do when no ddl sql file exists ("raise" to abort)
    :param src_tb_nm: source table name
    :param src_db_cd: source database code
    :param trgt_tb_nm: target table name
    :param trgt_db_cd: target database code
    :return:
    """
    sql = get_sql(trgt_tb_nm, sql_type='ddl')  # fetch the table's ddl sql file
    if sql:  # an sql file exists: it takes precedence
        if isinstance(trgt_db_cd, str):
            conn = Conn(trgt_db_cd)  # get_conn(trgt_db_cd)
        else:
            conn = trgt_db_cd
        check_rs = conn.upd_tb_strct(sql, trgt_tb_nm)
        if check_rs is False:
            raise Exception("Table structure check failed; see the error details above")
    elif src_tb_nm and src_db_cd:
        err_msg = ("sql/ddl/" + trgt_tb_nm.replace(".", "/") +
                   ".sql does not exist under the project root; "
                   "pulling the structure from the source system to check against")
        logger.warning(err_msg)
        # Verify the table structure and update it
        src_tb_sync_ods(src_tb_nm, src_db_cd, trgt_tb_nm, trgt_db_cd, if_el_data=False)
    else:
        err_msg = ("sql/ddl/" + trgt_tb_nm.replace(".", "/") +
                   ".sql does not exist under the project root; nothing to check against")
        logger.warning(err_msg)
        if if_ddl_not_exists == "raise":
            # send_error_msg(err_msg, trgt_tb_nm)
            raise Exception(err_msg)
def get_rpt_dtm(batch_dt, dtm_type):
    """
    Return a settlement time; Haiyin settlement dates only.

    :param batch_dt: date
    :param dtm_type: one of
        last_rpt_end_dtm -- end time of the previous report
        tdy_rpt_end_dtm  -- end time of today's report
        nxt_workday      -- next working day
        quar_start       -- first day of the quarter
        quar_end         -- first day of the next quarter
    :return:
    """
    if dtm_type in ['last_rpt_end_dtm', 'tdy_rpt_end_dtm', 'nxt_workday',
                    'quar_start', 'quar_end']:
        sql = """
        SELECT dw.cvrt_dt('{batch_dt}', '{dtm_type}') AS dt
        """.format(batch_dt=batch_dt, dtm_type=dtm_type)
        dps = Conn("DPS")
        return dps.query_one(sql)
    else:
        return None
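
# A usage sketch for get_rpt_dtm (requires a live "DPS" connection and the
# dw.cvrt_dt function; the date value is hypothetical):
def _demo_get_rpt_dtm():
    nxt = get_rpt_dtm('2023-01-01', 'nxt_workday')  # next working day
    bad = get_rpt_dtm('2023-01-01', 'some_other_type')  # unsupported type -> None
    return nxt, bad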
def exec_flow(self):
    """Execute the flow."""
    try:
        crt_el_log_tb(Conn())  # pick up any change to the log table structure
        dsk = self.get_flow()
        get(dsk, 'end', num_workers=8)  # run the task graph
    except Exception as e:
        from etls.comm.emailcomm import send_error_msg
        logger.error(str(e))
        # send_error_msg(str(e), "flow execution error %s" % self.batch_nm, if_to_wx=False)
        raise e
def __init__(self, conn, tb_nm, cols=None, if_check_table_struct=False):
    assert isinstance(tb_nm, str)
    self.tb_nm = tb_nm
    if isinstance(conn, str):
        conn = Conn(conn)
    self.conn = conn
    if if_check_table_struct:
        check_table_struct(conn, tb_nm)
    if self.conn.is_exists(tb_nm) is False:
        rs = check_table_struct(conn, tb_nm)
        if rs is False:
            raise Exception("Database %s has no table %s" % (self.conn.db_cd, tb_nm))
    if cols:
        self.cols = cols
    else:
        self.cols = self.get_tb_cols()
def __init__(self, conn, tb_nm, job_type, job_params=None, batch_id=None,
             check_tb_strct=True, logs_print=True):
    # Table name, including schema (str)
    self.tb_nm = tb_nm.strip()
    self.conn = conn if isinstance(conn, Conn) else Conn(conn)
    self.job_params = job_params if job_params else {}
    self.batch_id = batch_id if batch_id else self.job_params.get('batch_id', None)
    # Job types: ddl dml el py sql batch_el. For a python job, job_type is the
    # name of the corresponding function in the python file.
    if job_type.lower() in list(jobtype.values()):
        self.job_type = job_type.lower()
        self.action = self.job_params.get('el_type', 'datax').lower()
        if job_type.lower() == jobtype['el'] and self.tb_nm.lower() == jobtype['batch_el']:
            self.job_type = 'batch_el'
            if 'batch_nm' in self.job_params:
                self.tb_nm = self.job_params.get('batch_nm')
            else:
                # Read the parameter file for the corresponding batch id
                paramsfile = BatchParams(self.batch_id).read()
                if 'batch_nm' in paramsfile:
                    self.tb_nm = paramsfile.get('batch_nm')
                else:
                    raise Exception("batch_el batch-import jobs must specify batch_nm")
    else:
        self.job_type = 'py'
        self.action = job_type
    self.check_tb_strct = get_bool_val(
        check_tb_strct if check_tb_strct is not None
        else self.job_params.get('check_tb_strct', True))
    self.logs_print = get_bool_val(
        logs_print if logs_print is not None
        else self.job_params.get('logs_print', True))
def run_el_with_batch(batch_id, el_type, read_tb, read_conn, write_tb, write_conn='DPS',
                      read_where='', pre_sql='truncate', parallel_num=2,
                      check_tb_strct=True, logs_print=True):
    """
    Sync one table's data within a batch.

    :param batch_id: batch id
    :param el_type: datax or pypd
    :param read_tb: table to read
    :param read_conn: source database code, e.g. DPS CRM PFUND
    :param write_tb: table to write, e.g. dw.dim_prod
    :param write_conn: target database code, e.g. DPS CRM PFUND
    :param read_where: sql where condition
    :param pre_sql: sql to run before the import; 'truncate' empties the table,
        any other sql is run as given
    :param parallel_num: number of concurrent channels
    :param check_tb_strct: verify the table structure first
    :param logs_print: also print the logs to the terminal; regardless of this
        setting, logs are stored under datax/log
    :return:
    """
    dw_conn = Conn(write_conn)
    if write_tb is None:
        write_tb = get_targt_tb_nm(read_tb, read_conn)
    stat = get_el_tb_job_stat(batch_id, write_tb, dw_conn)  # current job status
    if is_runing(write_tb, dw_conn):  # skip if the job is already being processed
        logger.info("el_type %s table %s is already being processed; skipping" % (el_type, write_tb))
        el_upd_stat(dw_conn, batch_id, write_tb, batch_stat=1,
                    error_msg="already being processed; skipped")
    if stat != 1:  # stat == 1 means an earlier run succeeded; do not run again
        el_start_stat(dw_conn, batch_id, write_tb)
        try:
            logger.info("Starting sync batch_id %s el_type %s table %s" % (batch_id, el_type, write_tb))
            # logger.debug("%s pre-import sql: %s" % (write_tb, pre_sql))
            # logger.debug("%s import condition: %s" % (write_tb, read_where))
            rs = run_el(el_type, read_tb, read_conn, write_tb, write_conn, read_where,
                        pre_sql, parallel_num, check_tb_strct, logs_print, batch_dt=batch_id)
            if rs:
                el_upd_stat(dw_conn, batch_id, write_tb, batch_stat=1)
                logger.info("Sync succeeded batch_id %s el_type %s table %s"
                            % (batch_id, el_type, write_tb))
            else:
                raise Exception("Unknown error")
        except Exception as e:
            err_msg = str(e)
            logger.error("Sync failed batch_id %s el_type %s table %s ERROR: %s"
                         % (batch_id, el_type, write_tb, err_msg))
            el_upd_stat(dw_conn, batch_id, write_tb, batch_stat=2, error_msg=err_msg)
            send_error_msg(err_msg, write_tb, if_to_wx=False)
            raise Exception(err_msg)
        finally:
            dw_conn.close()
    else:
        # This batch's data has already been synced; do not sync it again
        el_upd_stat(dw_conn, batch_id, write_tb, batch_stat=1,
                    error_msg="already executed; skipped")
        logger.warning("Data already synced for this batch; skipping. batch_id %s el_type %s table %s"
                       % (batch_id, el_type, write_tb))
        dw_conn.close()
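
# A usage sketch for run_el_with_batch with hypothetical table values; it skips
# automatically if this batch already synced the table successfully:
def _demo_run_el_with_batch():
    run_el_with_batch(batch_id='20230101', el_type='datax',
                      read_tb='cust_info', read_conn='CRM',
                      write_tb='ods.crm_cust_info', write_conn='DPS',
                      read_where="upd_dt >= '2023-01-01'",
                      pre_sql='truncate', parallel_num=2)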
def crt_all_tb_from_src(src_db_cd, src_tb_schm, trgt_db_cd="DPS", trgt_schm_nm='ods',
                        batch_id=None, if_el_data=False):
    """
    One-shot move of a whole system's tables into the data warehouse. Normally used
    only for ad-hoc imports of a database; not for production scheduling.

    :param if_el_data: also import the source data; by default only the structures are created
    :param batch_id: with a batch_id, run_el_with_batch imports and logs to task_log;
        without one, datax imports the data directly
    :param src_db_cd: source system code
    :param src_tb_schm: source schema; only the schema is needed, use public when there is none
    :param trgt_db_cd: target system code, e.g. DPS HQ_HR
    :param trgt_schm_nm: target schema
    :return:
    """
    conn = Conn(src_db_cd)  # conn, db_type, db_nm = get_db_conn(src_db_cd)
    if conn.db_type.upper() == 'ORACLE':
        sql = """select distinct table_name tb_nm from user_tables order by table_name"""
    elif conn.db_type.upper() == 'MYSQL':
        sql = """
        select distinct t2.table_name tb_nm from information_schema.tables t2
        where t2.table_schema='{db_nm}' and table_type='BASE TABLE';""".format(db_nm=conn.db_nm)
    elif conn.db_type.upper() in ('POSTGRESQL', 'GREENPLUM'):
        # postgresql and greenplum share the same catalog query
        sql = """
        SELECT distinct '{tb_schm}.'||B.relname tb_nm
        FROM pg_catalog.pg_class B
        INNER JOIN pg_catalog.pg_namespace C
            ON C.oid=B.relnamespace AND C.nspname='{tb_schm}'
        where B.relam=0 and relkind='r' """.format(tb_schm=src_tb_schm)
    elif conn.db_type.upper() == 'SQLSERVER':
        sql = """SELECT distinct sysobjects.name AS tb_nm
        FROM syscolumns -- table columns
        INNER JOIN sysobjects -- database objects
            ON sysobjects.id = syscolumns.id
        INNER JOIN systypes -- data types
            ON syscolumns.xtype = systypes.xtype
        LEFT OUTER JOIN sys.extended_properties properties -- column properties
            ON syscolumns.id = properties.major_id AND syscolumns.colid = properties.minor_id
        LEFT OUTER JOIN sys.extended_properties sysproperties -- table properties
            ON sysobjects.id = sysproperties.major_id AND sysproperties.minor_id = 0
        LEFT OUTER JOIN syscomments -- comments
            ON syscolumns.cdefault = syscomments.id
        LEFT OUTER JOIN sysindexkeys -- index key/column info
            ON sysindexkeys.id = syscolumns.id AND sysindexkeys.colid = syscolumns.colid
        LEFT OUTER JOIN sysindexes -- indexes
            ON sysindexes.id = sysindexkeys.id AND sysindexes.indid = sysindexkeys.indid
        LEFT OUTER JOIN sysforeignkeys
            ON sysforeignkeys.fkeyid = syscolumns.id AND sysforeignkeys.fkey = syscolumns.colid
        left join INFORMATION_SCHEMA.tables ts on sysobjects.name = ts.table_name
        WHERE (sysobjects.xtype = 'U') and ts.Table_catalog='{db_nm}'
        order by syscolumns.id""".format(db_nm=conn.db_nm)
    else:
        raise Exception("Unsupported database type: " + conn.db_type)
    # logger.warning(sql)
    meta = conn.select(sql)  # pd.read_sql(sql, conn)
    if meta.shape[0] > 0:
        meta.columns = ['src_tb_nm']
        meta['el_type'] = 'datax'
        meta['src_db_cd'] = src_db_cd
        meta['trgt_tb_nm'] = None
        meta['trgt_db_cd'] = trgt_db_cd
        meta['read_where'] = ''
        meta['pre_sql'] = 'truncate'
        meta['parallel_num'] = 2
        meta['if_el_data'] = if_el_data
        multi_proc_el(meta, batch_id, processes=5, trgt_schm_nm=trgt_schm_nm)
        logger.info("Processing finished")
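
# A usage sketch for crt_all_tb_from_src with hypothetical codes: mirror the
# table structures of CRM's public schema into the warehouse, data not included:
def _demo_crt_all_tb_from_src():
    crt_all_tb_from_src('CRM', 'public', trgt_db_cd='DPS', trgt_schm_nm='ods',
                        batch_id='20230101', if_el_data=False)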
def json_file(read_tb, read_conn, write_tb, write_conn, read_where='',
              pre_sql='truncate', parallel_num=1):
    """
    Create the json job file that datax executes.

    :param read_tb: source table
    :param read_conn: source system connection
    :param write_tb: target table
    :param write_conn: target system connection
    :param read_where: where filter
    :param pre_sql: sql to run before the import
    :param parallel_num: number of channels
    :return:
    """
    json_nm = (write_tb + ".json").lower()
    if os.path.exists(os.path.join(datax_job_json_path, json_nm)):
        # A json config file already exists; return it directly
        return os.path.join(datax_job_json_path, json_nm)
    if pre_sql == 'truncate':
        pre_sql = 'truncate table ' + write_tb
    src_conn = Conn(read_conn)
    read_db_type, read_user, read_pwd, read_jdbc = src_conn.get_jdbc()
    if read_db_type == "Greenplum".upper():
        # greenplum reads go through the postgresql reader
        read_db_type = "Postgresql"
    trgt_conn = Conn(write_conn)
    write_db_type, write_user, write_pwd, write_jdbc = trgt_conn.get_jdbc()
    if write_db_type == "Greenplum".upper():
        # greenplum writes use the gpdb writer
        write_db_type = "gpdb"
    cols = get_cols(write_conn, write_tb)
    ignore_cols_list = get_ignore_cols(write_tb, trgt_conn)
    if ignore_cols_list:
        cols = list(set(cols) - set(ignore_cols_list))
    src_cols = get_cols(read_conn, read_tb)
    cols = list(filter(lambda x: x in src_cols, cols))  # drop columns missing from the source
    jsons = {
        "job": {
            "content": [{
                "reader": {
                    "name": read_db_type.lower() + "reader",
                    "parameter": {
                        "username": read_user,
                        "password": read_pwd,
                        "column": cols,
                        "where": read_where,
                        "connection": [{"table": [read_tb], "jdbcUrl": [read_jdbc]}]
                    }
                },
                "writer": {
                    "name": write_db_type.lower() + "writer",
                    "parameter": {
                        "username": write_user,
                        "password": write_pwd,
                        "column": cols,
                        "preSql": [pre_sql],
                        "connection": [{"jdbcUrl": write_jdbc, "table": [write_tb]}]
                    }
                }
            }],
            "setting": {"speed": {"channel": parallel_num}}
        }
    }
    try:
        datax_json_path = os.path.join(TEMP_HOME, json_nm)
        json_str = json.dumps(jsons, indent=4, sort_keys=False, ensure_ascii=False)
        logger.debug("\n" + json_str.replace(read_pwd, '*****').replace(write_pwd, '******'))
        with open(datax_json_path, "w") as datax_json_file_handler:
            datax_json_file_handler.write(json_str)
        logger.info('datax json file written: ' + datax_json_path)
        return datax_json_path
    except Exception as e:
        logger.error('error writing datax json file: ' + str(e))
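
# A usage sketch for json_file with hypothetical values. The generated file is
# what a datax run consumes, typically launched as
#   python <DATAX_HOME>/bin/datax.py <path>
# (the exact launch command depends on the local datax install):
def _demo_json_file():
    path = json_file('cust_info', 'CRM', 'ods.crm_cust_info', 'DPS',
                     read_where='', pre_sql='truncate', parallel_num=2)
    return path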
def multi_proc_el_all_task(batch_id, conn=None, batch_nm='T1', processes=5,
                           logs_print=False, only_tables=None):
    """
    Main entry point for syncing ods data: syncs all not-deleted tables in task_infos.

    :param only_tables: only sync the given tables
    :param batch_id: batch id, a string
    :param conn: database connection
    :param batch_nm: batch name; selects e.g. incremental-only tables for
        near-real-time sync
    :param processes: number of worker processes, default 5; typically 5-20
        depending on the production configuration
    :param logs_print: print the detailed sync logs
    :return:
    """
    batch_id = str(batch_id)
    if conn is None:
        conn = Conn()
    task_sql = """
    select src_tb_nm read_tb, src_db_cd read_conn, tb_nm write_tb, trgt_db_cd write_conn,
           sql_where read_where, sql_pre_sql pre_sql, el_type, parallel_num::int parallel_num
    from {tb_task_info} where is_del='N' and batch_nm='{batch_nm}'
    """
    task_sql = task_sql.format(batch_nm=batch_nm, tb_task_info=tb_task_info)
    crt_el_log_tb(conn)
    df = conn.select(task_sql)  # pd.read_sql(task_sql, conn)
    if only_tables:  # only run the given tables
        only_tables = only_tables.replace("，", ",").replace(" ", "").split(",")
        df = df[df['write_tb'].isin(only_tables)]
    if df.shape[0] > 0:
        rs = df.to_dict(orient='records')
        logger.info("Batch_id: %s starting %s worker processes, run_el_with_batch"
                    % (batch_id, processes))
        pool = multiprocessing.Pool(processes=processes)
        for i in rs:
            i['batch_id'] = batch_id
            i['logs_print'] = logs_print
            pool.apply_async(run_el_with_batch, kwds=i)
        pool.close()
        pool.join()
        logger.info("batch_id: %s data import finished" % batch_id)
        if only_tables:
            # Check the job results; raise on errors. Email/WeChat alerts could be added.
            check_el_task(batch_id, conn, batch_nm, check_error_only=True)
        else:
            # Check the job results; raise on errors.
            check_el_task(batch_id, conn, batch_nm, check_error_only=False,
                          task_list=[i['write_tb'] for i in rs])
    else:
        logger.info("No tables to sync")
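
# A usage sketch for multi_proc_el_all_task: run every not-deleted 'T1' task
# with 5 workers, or restrict to one table (values are hypothetical):
def _demo_multi_proc_el_all_task():
    conn = Conn()
    try:
        multi_proc_el_all_task('20230101', conn, batch_nm='T1', processes=5,
                               only_tables='ods.pf_cust_info')
    finally:
        conn.close()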