import pandas as pd

import cons  # project-local helper that hands back DB engines by name


def getHiveTb():
    """Pull column- and table-level metadata for the cdi/app databases
    from the Hive metastore."""
    engine = cons.meta('hive')

    # Column-level metadata, ordered by table id and column position.
    sql_txt = """
        select t.TBL_ID tb_id,
               d.`NAME` db,
               t.TBL_NAME tb,
               v.COLUMN_NAME col,
               v.TYPE_NAME ctype,
               v.`COMMENT` col_com
        from columns_v2 v
        inner join sds s on v.CD_ID = s.CD_ID
        inner join tbls t on s.SD_ID = t.SD_ID
        inner join dbs d on d.DB_ID = t.DB_ID
        where d.`NAME` in ('cdi', 'app')
        order by t.TBL_ID, v.INTEGER_IDX;
    """
    cols = pd.read_sql(sql_txt, engine)

    # Table-level parameters, pivoted from key/value rows into columns.
    sql_txt = """
        select s.TBL_ID tb_id,
               max(if(PARAM_KEY='comment', PARAM_VALUE, null)) tb_com,
               max(if(PARAM_KEY='numRows', PARAM_VALUE, '')) row_num,
               max(if(PARAM_KEY='rawDataSize', PARAM_VALUE, '')) raw_data_size,
               max(if(PARAM_KEY='totalSize', PARAM_VALUE, '')) total_size,
               FROM_UNIXTIME(max(if(PARAM_KEY='transient_lastDdlTime', PARAM_VALUE, ''))) last_ddl_time,
               FROM_UNIXTIME(max(if(PARAM_KEY='last_modified_time', PARAM_VALUE, ''))) last_modified_time,
               max(if(PARAM_KEY='last_modified_by', PARAM_VALUE, null)) last_modified_by
        from TABLE_PARAMS s
        group by s.TBL_ID
    """
    tbs = pd.read_sql(sql_txt, engine)

    # Attach db/table names to the pivoted table parameters.
    tp = cols[['tb_id', 'tb', 'db']].drop_duplicates()
    tbs = tbs.merge(tp, how='inner', on='tb_id')
    return cols, tbs
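# For illustration only: `cons` is a project-local module not included in
# this excerpt. Below is a minimal sketch of what its `meta` helper might
# look like, assuming it maps a config name to a SQLAlchemy engine. The
# DSNs are placeholders, not real endpoints or credentials.
from sqlalchemy import create_engine

_CONN_STRINGS = {
    'hive': 'mysql+pymysql://user:pass@metastore-host:3306/hive',     # placeholder
    'oozie': 'mysql+pymysql://user:pass@oozie-host:3306/oozie',       # placeholder
    'etl_data': 'mysql+pymysql://user:pass@etl-host:3306/etl_data',   # placeholder
}


def meta(name):
    """Hypothetical: return a SQLAlchemy engine for the named database."""
    return create_engine(_CONN_STRINGS[name])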
def re_run():
    """Fetch ETL jobs that errored before 08:00 today."""
    engine = cons.meta('hive')
    sql_txt = """
        SELECT *
        FROM `etl错误任务视图`  -- view of failed ETL tasks
        WHERE start_time < CONCAT(CURRENT_DATE(), ' 08:00:00');
    """
    err_df = pd.read_sql(sql_txt, engine)
    if err_df.shape[0] > 0:
        pass  # re-run handling is elided in the source
    else:
        print('No failed jobs')
    return err_df
def getHive():
    """Aggregate per-table size, row-count, and last-DDL-time stats from
    the Hive metastore, combining table-level parameters with
    partition-level parameters rolled up to the table."""
    engine = cons.meta('hive')
    sql_txt = """
        SELECT d.`NAME` db_name,
               t.TBL_NAME tb_name,
               FROM_UNIXTIME(t.CREATE_TIME) create_time,
               ifnull(p.row_num, 0) + ifnull(pt.row_num, 0) row_num,
               ifnull(p.total_size, 0) + ifnull(pt.total_size, 0) total_size,
               p.comments,
               case when pt.last_ddl_time > p.last_ddl_time
                    then pt.last_ddl_time
                    else p.last_ddl_time end last_ddl_time,
               -- case when pt.last_modified_time > p.last_modified_time then pt.last_modified_time else p.last_modified_time end last_modified_time,
               pt.part_name
        FROM tbls t
        INNER JOIN dbs d
            ON t.DB_ID = d.DB_ID AND d.`NAME` in ('sdd', 'cdi', 'app')
        -- Table-level parameters, pivoted from key/value rows.
        LEFT JOIN (
            select TBL_ID,
                   max(if(PARAM_KEY='comment', PARAM_VALUE, null)) comments,
                   max(if(PARAM_KEY='numRows', PARAM_VALUE, '')) row_num,
                   max(if(PARAM_KEY='rawDataSize', PARAM_VALUE, '')) raw_data_size,
                   max(if(PARAM_KEY='totalSize', PARAM_VALUE, '')) total_size,
                   FROM_UNIXTIME(max(if(PARAM_KEY='transient_lastDdlTime', PARAM_VALUE, ''))) last_ddl_time,
                   FROM_UNIXTIME(max(if(PARAM_KEY='last_modified_time', PARAM_VALUE, ''))) last_modified_time,
                   max(if(PARAM_KEY='last_modified_by', PARAM_VALUE, null)) last_modified_by
            from TABLE_PARAMS
            GROUP BY TBL_ID
        ) p ON t.TBL_ID = p.TBL_ID
        -- Partition-level parameters, rolled up to the owning table.
        LEFT JOIN (
            SELECT p.TBL_ID,
                   sum(k.raw_data_size) raw_data_size,
                   sum(k.row_num) row_num,
                   sum(k.total_size) total_size,
                   max(p.PART_NAME) part_name,
                   max(k.last_ddl_time) last_ddl_time,
                   max(k.last_modified_time) last_modified_time
            FROM partitions p
            LEFT JOIN (
                select PART_ID,
                       max(if(PARAM_KEY='numRows', PARAM_VALUE, '')) row_num,
                       max(if(PARAM_KEY='rawDataSize', PARAM_VALUE, '')) raw_data_size,
                       max(if(PARAM_KEY='totalSize', PARAM_VALUE, '')) total_size,
                       FROM_UNIXTIME(max(if(PARAM_KEY='transient_lastDdlTime', PARAM_VALUE, ''))) last_ddl_time,
                       FROM_UNIXTIME(max(if(PARAM_KEY='last_modified_time', PARAM_VALUE, ''))) last_modified_time
                from partition_params
                GROUP BY PART_ID
            ) k ON p.PART_ID = k.PART_ID
            GROUP BY p.TBL_ID
        ) pt ON t.TBL_ID = pt.TBL_ID
    """
    hive_df = pd.read_sql(sql_txt, engine)  # renamed from oz_df: this is Hive data
    return hive_df
def getOozie():
    """Fetch the running Oozie coordinator jobs submitted by the hue
    user, with their last/next materialization times and the duration
    of the most recent workflow run."""
    oozie = cons.meta('oozie')
    sql_txt = """
        SELECT ifnull(w.app_name, c.app_name) job_name,
               c.last_modified_time job_last_time,
               c.next_matd_time job_next_time,
               w.end_time - w.start_time job_used_times,
               c.frequency
        FROM coord_jobs c
        LEFT JOIN coord_actions j
            ON c.id = j.job_id AND c.last_action_number = j.action_number
        LEFT JOIN wf_jobs w
            ON w.id = j.external_id
        WHERE c.user_name = 'hue'
          AND c.`status` = 'RUNNING'
    """
    oz_df = pd.read_sql(sql_txt, oozie)
    return oz_df
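# Illustrative smoke test for the three loaders above; not part of the
# original script. It assumes the metastore and Oozie databases are
# reachable through cons.meta, and only reports the frame shapes.
if __name__ == '__main__':
    cols, tbs = getHiveTb()
    print('columns:', cols.shape, 'tables:', tbs.shape)
    hive_df = getHive()
    print('hive table stats:', hive_df.shape)
    oz_df = getOozie()
    print('running oozie jobs:', oz_df.shape)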
# --- Continuation of the shell-to-Oozie name matching. The enclosing
# nested loops (i over sh_oz_map rows, keys derived from its sh_key
# column, j over oz_df rows) are defined in a preceding, un-excerpted
# section of the script. ---
        if keys in oz_df.loc[j, 'job_name']:
            sh_oz_map.loc[i, 'job_name'] = oz_df.loc[j, 'job_name']

sh_oz_map = sh_oz_map.merge(oz_df, how='left', on='job_name')
# Drop the helper column and the leftover loop variables.
del sh_oz_map['sh_key'], i, j, keys, sh_list, sh_list_key

rs = rs.merge(sh_oz_map, how='left', on='sh_files')
hive_df = getHive()
sch_rs = rs[pd.notnull(rs['job_name'])]  # rows that are on the schedule
last_rs = hive_df.merge(sch_rs, how='left',
                        left_on='tb_name', right_on='cfg_target_tb')
last_rs['sh_files'] = last_rs['sh_files'].fillna('no shell configured')     # 无配置shell
last_rs['job_name'] = last_rs['job_name'].fillna('no schedule configured')  # 暂无定时配置
last_rs['comments'] = last_rs['comments'].fillna(last_rs['sql_tb_cn'])

etl_data = cons.meta('etl_data')
table_to_jobs = last_rs.copy()
last_rs['oper_date'] = today

# Write the result to MySQL: delete today's rows first so the daily load
# is idempotent for a given oper_date, then append the fresh snapshot.
etl_data.execute(
    "delete from etl_job_set where oper_date='{0}'".format(today))
insert_rs = last_rs[insert_cols].copy()
insert_rs = insert_rs.astype('str')
insert_rs.to_sql(name='etl_job_set', con=etl_data,
                 if_exists='append', index=False)

etl_data.execute(
    "delete from etl_log_sum where oper_date='{0}'".format(today))
log_rs['oper_date'] = today
log_rs[[