def update_rowcount(row):
    sql = "select count(1) as cnt from {table_name}".format(table_name=row["table_name"])
    db_conf = {
        "host": row["host"],
        "port": row["port"],
        "user": row["user"],
        "password": row["password"],
        "database": row["db_name"],
        "charset": "utf8"
    }
    engine_str = get_engine_str(row["db_type"]).format(**db_conf)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    # Dlogger.debug(sql)
    try:
        result = pd.read_sql(sql, con)
        table_rows = result.iat[0, 0]
        sql_update = "update meta_import set table_rows={table_rows},rows_updatetime='{now}' " \
                     "where db_name='{db_name}' and table_name='{table_name}'" \
            .format(table_rows=table_rows,
                    now=str(datetime.datetime.now()),
                    db_name=row["db_name"],
                    table_name=row["table_name"])
        mysql_con.execute(sql_update)
    except Exception:
        table_rows = "Null"
    Dlogger.info("{:<40} : {}".format(row["db_name"] + "." + row["table_name"], table_rows))
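# --- Illustrative usage sketch (not part of the original source; values are placeholders) ---
# update_rowcount expects one row of the meta_import/connection join, carrying both the
# source connection info and the table to count.
_example_row = {
    "db_type": "mysql",
    "host": "127.0.0.1",
    "port": 3306,
    "user": "reader",
    "password": "secret",
    "db_name": "pms",
    "table_name": "ppets",
}
# update_rowcount(_example_row)  # needs a reachable source DB and the meta MySQL instance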
def main(row):
    table_names = get_tabs(row["db_type"], row["db_conf"])
    Dlogger.info("Database {} has {} tables in total".format(row["db_name"], len(table_names)))
    if len(table_names) == 0:
        sys.exit(1)
    insert_sql = """
    INSERT INTO meta_import(connection_id, db_name, table_name, hive_database, hive_table, exec_engine, is_overwrite, status)
    VALUE"""
    for table_name in table_names:
        tmp_value = "('{connection_id}','{db_name}','{table_name}','{hive_database}','{hive_table}','{exec_engine}',{is_overwrite},{status})," \
            .format(connection_id=row["connection_id"],
                    db_name=row["db_name"],
                    table_name=table_name,
                    hive_database="pet_medical",
                    hive_table="src_" + row["target_db_name"] + "_" + table_name,
                    exec_engine="sqoop",
                    is_overwrite=1,
                    status=0)
        insert_sql = insert_sql + "\n " + tmp_value
    insert_sql = insert_sql[:-1]
    Dlogger.info(insert_sql)
    con = create_engine(get_engine_str("mysql").format(**config.DB_CONF))
    con.execute(insert_sql)
    Dlogger.info("insert success")
    db_map_sql = "select distinct db_name,substr(t.hive_table,5,locate('_', t.hive_table,5)-1-4) as hive_db_name from meta_import t"
    df = pd.read_sql(sql=db_map_sql, con=con)
    print(df)
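# --- Illustrative sketch (not part of the original source; the values are hypothetical) ---
# One VALUES tuple produced by the loop above for a source table "ppets" under
# target_db_name "pms":
_tmp_value = ("('{connection_id}','{db_name}','{table_name}','{hive_database}','{hive_table}',"
              "'{exec_engine}',{is_overwrite},{status}),").format(
    connection_id=3, db_name="pms", table_name="ppets",
    hive_database="pet_medical", hive_table="src_pms_ppets",
    exec_engine="sqoop", is_overwrite=1, status=0)
print(_tmp_value)  # ('3','pms','ppets','pet_medical','src_pms_ppets','sqoop',1,0),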
def get_cols(db_type, tb_name, db_conf):
    cols = []
    engine_str = get_engine_str(db_type).format(**db_conf)
    if db_type == "sqlserver":
        sql = "sp_columns {tb_name}".format(tb_name=tb_name)
        Dlogger.info("MSSQL Command = " + sql)
        # engine_str = "mssql+pymssql://{username}:{password}@{host}:{port}/{database}?charset=utf8".format(**db_conf)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        df = pd.read_sql(sql=sql, con=con)
        for index, row in df.iterrows():
            key = str(row["COLUMN_NAME"])
            value = str(row["TYPE_NAME"].replace(" identity", ""))
            cols.append({"name": key, "type": value})
    elif db_type == "mysql":
        sql = "select t.column_name,data_type,column_type from information_schema.columns t " \
              "where t.table_schema='{db_name}' and t.table_name='{tb_name}'".format(
                  db_name=db_conf["database"], tb_name=tb_name)
        Dlogger.info("MySQL Command = " + sql)
        # engine_str = "mysql+pymysql://{username}:{password}@{host}:{port}/{database}?charset=utf8".format(**db_conf)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        df = pd.read_sql(sql=sql, con=con)
        for index, row in df.iterrows():
            key = str(row["column_name"])
            value = str(row["column_type"].replace(" unsigned", ""))
            cols.append({"name": key, "type": value})
    elif db_type == "oracle":
        sql = "select column_name,data_type from user_tab_columns t where lower(t.table_name)='{tb_name}'".format(
            tb_name=tb_name)
        Dlogger.info("Oracle command = " + sql)
        # engine_str = "oracle+cx_oracle://{username}:{password}@{host}:{port}/{database}".format(**db_conf)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        df = pd.read_sql(sql=sql, con=con)
        for index, row in df.iterrows():
            key = str(row["column_name"])
            value = str(row["data_type"])
            cols.append({"name": key, "type": value})
    elif db_type == "hive":
        sql = "desc {tb_name}".format(tb_name=tb_name)
        Dlogger.info("Hive Command = " + sql)
        # engine_str = "hive://{username}@{host}:{port}/default".format(**db_conf)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        try:
            df = pd.read_sql(sql=sql, con=con)
            # columns = df.columns.values.tolist()
            for index, row in df.iterrows():
                if row["col_name"].strip() == "":
                    break
                key = row["col_name"]
                value = row["data_type"]
                cols.append({"name": key, "type": value})
            if cols[-1]["name"] in ("date", "p_date"):
                cols.pop(-1)
        except Exception:
            Dlogger.error(traceback.format_exc())
            Dlogger.error("{} not exists. ".format(tb_name))
    else:
        raise Exception("DATABASE TYPE ERROR !")
    Dlogger.info(str(cols))
    return cols
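# --- Illustrative sketch (not part of the original source; the column list is hypothetical) ---
# get_cols returns a list of {"name", "type"} dicts for every db_type; the join below mirrors
# how prepare_ddl later turns such a list into the column section of a CREATE TABLE statement.
_cols = [{"name": "id", "type": "int"}, {"name": "petname", "type": "string"}]
print("".join(" {name} {type},\n".format(**c) for c in _cols)[:-2])
#  id int,
#  petname string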
def is_increase(hive_full_table):
    sql = """select filter,max_value from meta_import
             where lower(hive_database)=lower('{hive_database}')
               and lower(hive_table)=lower('{hive_table}') limit 1;""" \
        .format(hive_database=hive_full_table.split(".")[0],
                hive_table=hive_full_table.split(".")[1])
    engine_str = get_engine_str("mysql").format(**config.DB_CONF)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    Dlogger.info(sql)
    # rows = con.execute(sql)
    df = pd.read_sql(sql=sql, con=con)
    if not df.empty:
        filter = df.iat[0, 0]
        max_value = df.iat[0, 1]
        if is_valid(filter) and is_valid(max_value):
            return True
    return False
def main(row):
    con_t = create_engine(get_engine_str("mysql").format(**row["db_conf"]), poolclass=pool.NullPool)
    dt = datetime.datetime.now().strftime("%Y%m%d")
    last_dt = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y%m%d")
    rename_sql, target_tb_name = pre_ddl(row, con_t, dt, last_dt)
    if row["exec_engine"] == "sqoop":
        sqoop_export(row["db_conf"], row["hive_database"], row["hive_table"], target_tb_name, row["m"])
    else:
        datax_generate_json(row, target_tb_name)
        datax_export(row)
    post_ddl(row["mode"], con_t, rename_sql, row["table_name"], last_dt)
def prepare_increase(row):
    max_sql = "select {max_value} from {hive_full_name} limit 1".format(
        max_value=row["max_value"], hive_full_name=row["hive_full_name"])
    engine_str = get_engine_str("hive").format(**config.HIVE_CONF)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    Dlogger.info(max_sql)
    try:
        df = pd.read_sql(sql=max_sql, con=con)
        max_value = df.iat[0, 0] if not df.empty else 0
    except Exception as e:
        Dlogger.warn("{} Not Exists Or Table Name Error".format(row["hive_full_name"]))
        max_value = 0
    Dlogger.info("Max Value : {}".format(max_value))
    if row["exec_engine"] == "sqoop":
        where_str = '--where "{}" \t'.format(row["filter"])
    else:
        where_str = row["filter"]
    where_str = where_str.format(max_value=max_value)
    return where_str
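# --- Illustrative sketch (not part of the original source; the filter template is hypothetical) ---
# How a meta_import `filter` template becomes the incremental predicate in the sqoop branch above.
_filter = "id > {max_value}"
_where_str = '--where "{}" \t'.format(_filter).format(max_value=10000)
print(_where_str)  # --where "id > 10000"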
def get_tabs(db_type, db_conf):
    tabs = []
    if db_type == "sqlserver":
        sql = "select name as table_name from sys.tables;"
        Dlogger.info("MSSQL Command = " + sql)
    elif db_type == "mysql":
        sql = "select table_name from information_schema.tables t where t.table_schema='{db_name}'".format(
            db_name=db_conf["database"])
        Dlogger.info("MySQL Command = " + sql)
    elif db_type == "oracle":
        sql = "select table_name from user_tables"
        Dlogger.info("Oracle command = " + sql)
    else:
        raise Exception("DATABASE TYPE ERROR !")
    engine_str = get_engine_str(db_type).format(**db_conf)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    # df = pd.read_sql(sql=sql, con=con)
    for row in con.execute(sql):
        tabs.append(row["table_name"])
    Dlogger.info(str(tabs))
    return tabs
#!/usr/bin/env python
# coding=utf-8
import datetime

import pandas as pd
from sqlalchemy import create_engine, pool

from utility import get_engine_str
from conf import config

Dlogger = config.get_logger("DataRowCount")
mysql_engine_str = get_engine_str("mysql").format(**config.DB_CONF)
mysql_con = create_engine(mysql_engine_str, poolclass=pool.NullPool)


def update_rowcount(row):
    sql = "select count(1) as cnt from {table_name}".format(table_name=row["table_name"])
    db_conf = {
        "host": row["host"],
        "port": row["port"],
        "user": row["user"],
        "password": row["password"],
        "database": row["db_name"],
        "charset": "utf8"
    }
    engine_str = get_engine_str(row["db_type"]).format(**db_conf)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    # Dlogger.debug(sql)
    try:
        result = pd.read_sql(sql, con)
        table_rows = result.iat[0, 0]
        sql_update = "update meta_import set table_rows={table_rows},rows_updatetime='{now}' " \
                     "where db_name='{db_name}' and table_name='{table_name}'" \
            .format(table_rows=table_rows,
                    now=str(datetime.datetime.now()),
                    db_name=row["db_name"],
                    table_name=row["table_name"])
        mysql_con.execute(sql_update)
    except Exception:
        table_rows = "Null"
    Dlogger.info("{:<40} : {}".format(row["db_name"] + "." + row["table_name"], table_rows))
import datetime
import os
import sys

from sqlalchemy import create_engine, pool

from utility import get_engine_str
from conf import config

# number of parallel processes
_process_nums = 6
# script executed for a single task
_data_export_py = os.path.join(sys.path[0], "data_export.py")
# log path
_log_path = os.path.join(config.LOG_PATH, "data_export")
if not os.path.exists(_log_path):
    os.makedirs(_log_path)
# MySQL database connection
con = create_engine(get_engine_str("mysql").format(**config.DB_CONF), poolclass=pool.NullPool)
Dlogger = config.get_logger("DataExportAuto")


def parallel_write_log(no, id, connection_name, db_name, table_name, last_exec_date, retry_count):
    try:
        current_date = str(datetime.date.today())
        if last_exec_date == current_date:
            print("{no:<3} {table_name:<40} already executed successfully today, skip.".format(
                no=no, table_name=db_name + "." + table_name))
            return ""
        full_log_path = "{log_path}/sqoop_export_{db_name}_{table_name}.log".format(
            log_path=_log_path, db_name=db_name, table_name=table_name)
def merge_data(pms_table):
    clinic_table = "pet_medical.mir_clinic"
    # Tables that the HIS clinic module does not have: clinic-related data from the old XiaoNuan system
    if re.search("CEMRecord|CPrescriptions|CPrescriptionDetails|CPhysical|CEMRecordPhysical|CLabReport|CLabVirusDetails|PBCheckList|PXRaysList|PPathologyList", pms_table, re.I):
        if "cemrecord" in pms_table.lower():
            file_type = "rcfile"
        else:
            file_type = "orc"
        sql = """
            drop table {data_xiaonuan_final_table};
            create table {data_xiaonuan_final_table} stored as {file_type} as select * from {pms_table};
        """
        exec_sql = sql.format(pms_table=pms_table,
                              file_type=file_type,
                              data_xiaonuan_final_table=pms_table.replace("pet_medical.ods_pms_", "data_xiaonuan_final."))
    else:
        pmsweb_table = pms_table.replace("ods_pms_", "ods_pmsweb_")
        mid_merge_table = pms_table.replace("ods_pms_", "mid_")
        sql1 = "desc {pms_table}".format(pms_table=pms_table)
        sql2 = "desc {pmsweb_table}".format(pmsweb_table=pmsweb_table)
        engine_str = get_engine_str("hive").format(**config.HIVE_CONF)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        Dlogger.info(sql1)
        df1 = pd.read_sql(sql=sql1, con=con)
        Dlogger.info(sql2)
        df2 = pd.read_sql(sql=sql2, con=con)
        # print(df1)
        # print(df2)
        same_df = pd.merge(df1, df2, on="col_name", how="inner")["col_name"]
        same_columns = same_df.tolist()
        columns_str = ",".join(["t." + x for x in same_columns])
        if re.search("SysCategory", pms_table, re.I):
            # For SysCategory, use the pmsweb side; the two are nearly identical anyway
            sql = """
                drop table {data_xiaonuan_final_table};
                create table {data_xiaonuan_final_table} stored as orc as select * from {pmsweb_table};
            """
        elif re.search("PClientManagement", pms_table, re.I):
            sql = """
                drop table {mid_merge_table};
                create table {mid_merge_table} stored as orc as
                select {columns_str} from {pms_table} t left join {clinic_table} s on t.id=s.clinic_id where s.pmsweb_id is null
                union all
                select {columns_str} from {pmsweb_table} t left join {clinic_table} s on t.id=s.clinic_id where s.pmsweb_id is not null;
                drop table {mir_merge_table};
                create table {mir_merge_table} stored as orc as
                select s.brand_code,s.clinic_name,t.* from {mid_merge_table} t join (select brand_code,clinic_id,clinic_name from {clinic_table}) s on t.id=s.clinic_id;
                drop table {data_xiaonuan_final_table};
                create table {data_xiaonuan_final_table} stored as orc as select * from {mir_merge_table};
            """
        else:
            sql = """
                drop table {mid_merge_table};
                create table {mid_merge_table} stored as orc as
                select {columns_str} from {pms_table} t left join {clinic_table} s on t.orgid=s.clinic_id where s.pmsweb_id is null
                union all
                select {columns_str} from {pmsweb_table} t left join {clinic_table} s on t.orgid=s.clinic_id where s.pmsweb_id is not null;
                drop table {mir_merge_table};
                create table {mir_merge_table} stored as orc as
                select s.brand_code,s.clinic_name,t.* from {mid_merge_table} t join (select brand_code,clinic_id,clinic_name from {clinic_table}) s on t.orgid=s.clinic_id;
                drop table {data_xiaonuan_final_table};
                create table {data_xiaonuan_final_table} stored as orc as select * from {mir_merge_table};
            """
        exec_sql = sql.format(mid_merge_table=mid_merge_table,
                              columns_str=columns_str,
                              pms_table=pms_table,
                              pmsweb_table=pmsweb_table,
                              mir_merge_table=mid_merge_table.replace("mid_", "mir_"),
                              clinic_table=clinic_table,
                              data_xiaonuan_final_table=pms_table.replace("pet_medical.ods_pms_", "data_xiaonuan_final."))
    # for sql in exec_sql.split(";"):
    #     Dlogger.info(sql)
    #     con.execute(sql)
    Dlogger.info(exec_sql)
    sh_cmd = 'hive -e "{exec_sql}"'.format(exec_sql=exec_sql)
    subprocess.check_output(sh_cmd, shell=True)
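# --- Illustrative sketch (not part of the original source; the table name is hypothetical) ---
# The table-name mapping used by merge_data for a pms source table:
_pms_table = "pet_medical.ods_pms_pclientmanagement"
print(_pms_table.replace("ods_pms_", "ods_pmsweb_"))  # pet_medical.ods_pmsweb_pclientmanagement
print(_pms_table.replace("ods_pms_", "mid_"))         # pet_medical.mid_pclientmanagement
print(_pms_table.replace("pet_medical.ods_pms_", "data_xiaonuan_final."))  # data_xiaonuan_final.pclientmanagement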
def pre_args():
    parse = argparse.ArgumentParser(prog="InsertMetaData", description="I am help message...")
    parse.add_argument("-w", "--wizard", required=True,
                       help="wizard, the name of a registered database connection. example: -w warmsoft")
    parse.add_argument("--db", required=True, help="database, the name of the source database to sync")
    parse.add_argument("--target_db",
                       help="the database name to use in hive; defaults to the same name as --db")
    args = parse.parse_args()
    print(args)
    args_dict = {
        "connection_id": None,
        "connection_name": args.wizard,
        "db_type": "",
        "host": "",
        "user": "",
        "password": "",
        "port": 0,
        "jdbc_extend": "",
        "default_db": "",
        "db_name": args.db,
        "target_db_name": args.target_db if args.target_db else args.db
    }
    sql = """
    SELECT t.connection_id,
           t.connection_name,
           t.db_type,
           t.host,
           t.user,
           t.password,
           t.port,
           t.jdbc_extend,
           t.default_db
    FROM meta_connections t
    WHERE t.connection_name = '{}'""".format(args.wizard)
    Dlogger.info(sql)
    con = create_engine(get_engine_str("mysql").format(**config.DB_CONF), poolclass=pool.NullPool)
    df = pd.read_sql(sql=sql, con=con)
    if len(df) > 0:
        tmp_row = df.to_dict("records")[0]
    else:
        print("Error Message: -w database connection name does not exist")
        sys.exit(1)
    args_dict["connection_id"] = tmp_row["connection_id"]
    args_dict["connection_name"] = tmp_row["connection_name"]
    args_dict["db_type"] = tmp_row["db_type"]
    args_dict["host"] = tmp_row["host"]
    args_dict["user"] = tmp_row["user"]
    args_dict["password"] = tmp_row["password"]
    args_dict["port"] = tmp_row["port"]
    args_dict["jdbc_extend"] = tmp_row["jdbc_extend"]
    args_dict["default_db"] = tmp_row["default_db"]
    args_dict["db_conf"] = {
        "host": args_dict["host"],
        "port": args_dict["port"],
        "user": args_dict["user"],
        "password": args_dict["password"],
        "database": args_dict["db_name"],
        "charset": "utf8"
    }
    Dlogger.info(args_dict)
    return args_dict
def prepare_ddl(exec_engine, hive_full_name, cols_list_map, hive_cols, hive_partition_key, hive_partition_value, alter_sql, is_overwrite):
    # Decide whether a schema (DDL) statement is needed
    schema_sql = ""
    if len(hive_cols) == 0:
        if exec_engine == "sqoop":
            # In hive-import mode sqoop cannot write data into ORC tables; only hcatalog mode can,
            # but that mode has permission problems with partitions.
            # With hive-import mode no CREATE TABLE statement is needed.
            schema_sql = ""
        else:  # exec_engine == "datax"
            if is_valid(hive_partition_key) and is_valid(hive_partition_value):
                part_sql = " partitioned by ({} string)".format(hive_partition_key)
            else:
                part_sql = ""
            # ORC cannot replace columns, and keeping the schema identical makes it possible to switch
            # freely between sqoop and datax, so use sqoop's default text format instead of ORC.
            # schema_sql = """create table if not exists {table} (\n{cols}\n )\n{part_sql} stored as orc;""" \
            schema_sql = """create table if not exists {table} (\n{cols}\n ){part_sql}""" \
                .format(table=hive_full_name,
                        cols=''.join([" {name} {type},\n".format(name=x["name"], type=x["type"]) for x in cols_list_map])[:-2],
                        part_sql=part_sql) \
                .lower()
    else:
        if len(alter_sql) > 0:
            schema_sql = "".join(alter_sql)
    # Decide whether existing data needs to be deleted
    delete_data_sql = ""
    if exec_engine == "datax" and is_overwrite == "1":
        if hive_partition_key and hive_partition_value:
            delete_data_sql = """
    alter table {hive_full_table} drop if exists partition ({hive_partition_key}='{hive_partition_value}')""" \
                .format(hive_full_table=hive_full_name,
                        hive_partition_key=hive_partition_key,
                        hive_partition_value=hive_partition_value)
        else:
            delete_data_sql = """
    truncate table {hive_full_table}""" \
                .format(hive_full_table=hive_full_name,
                        hive_partition_key=hive_partition_key,
                        hive_partition_value=hive_partition_value)
    # Decide whether an ADD PARTITION statement is needed
    add_partition_sql = ""
    if exec_engine == "datax" and is_valid(hive_partition_key) and is_valid(hive_partition_value):
        add_partition_sql = """
    alter table {hive_full_table} add if not exists partition ({hive_partition_key}='{hive_partition_value}')""" \
            .format(hive_full_table=hive_full_name,
                    hive_partition_key=hive_partition_key,
                    hive_partition_value=hive_partition_value)
    # Execute the statements
    if schema_sql + delete_data_sql + add_partition_sql != "":
        engine_str = get_engine_str("hive").format(**config.HIVE_CONF)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        if schema_sql != "":
            for tmp_schema_sql in [x for x in schema_sql.split(";") if x]:
                Dlogger.info("\n " + tmp_schema_sql + ";")
                con.execute(tmp_schema_sql)
        if delete_data_sql != "":
            for tmp_drop_partition_sql in [x for x in delete_data_sql.split(";") if x]:
                Dlogger.info(tmp_drop_partition_sql + ";")
                con.execute(tmp_drop_partition_sql)
        if add_partition_sql != "":
            for tmp_add_partition_sql in [x for x in add_partition_sql.split(";") if x]:
                Dlogger.info(tmp_add_partition_sql + ";")
                con.execute(tmp_add_partition_sql)
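# --- Illustrative sketch (not part of the original source; table and columns are hypothetical) ---
# The CREATE TABLE statement the datax branch above generates for a partitioned table.
_cols_list_map = [{"name": "id", "type": "int"}, {"name": "petname", "type": "string"}]
_schema_sql = """create table if not exists {table} (\n{cols}\n ){part_sql}""".format(
    table="pet_medical.src_pms_ppets",
    cols="".join(" {name} {type},\n".format(**c) for c in _cols_list_map)[:-2],
    part_sql=" partitioned by (p_date string)").lower()
print(_schema_sql)
# create table if not exists pet_medical.src_pms_ppets (
#  id int,
#  petname string
#  ) partitioned by (p_date string)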
def pre_args():
    parse = argparse.ArgumentParser(prog="DataImport", description="I am help message...")
    parse.add_argument("-w", "--wizard", required=True,
                       help="wizard, the name of a registered database connection. example: -w warmsoft")
    parse.add_argument("--db", default="", help="<database> db_name in meta_import, case-insensitive")
    parse.add_argument("--tb", default="", help="<table_name> table_name in meta_import, case-insensitive")
    parse.add_argument("--exec_engine", choices=["sqoop", "datax"], default="sqoop",
                       help="execution engine, sqoop or datax")
    parse.add_argument("-s", "--source_table", default="", help="source table. example: pms.ppets")
    parse.add_argument("-t", "--target_table", default="",
                       help="target table. example: pet_medical.src_pms_ppets")
    parse.add_argument("-m", "--num_pappers", default="1", help="number of parallel map tasks, default 1")
    parse.add_argument("--hive_overwrite", action="store_true", help="overwrite when importing with sqoop")
    parse.add_argument("--hive_partition_key", help="partition key")
    parse.add_argument("--hive_partition_value", help="partition value")
    parse.add_argument("--use_local_mode", action="store_true",
                       help="run in local mode. Only one machine in the cluster has internet access; "
                            "if a task is scheduled onto a machine without internet access it cannot run, "
                            "so local mode must be specified. For tasks that need internet access the "
                            "datax engine is recommended.")
    args = parse.parse_args()
    # print parse
    print(args)
    args_dict = {
        "connection_id": "",
        "connection_name": "",
        "db_type": "",
        "host": "",
        "user": "",
        "password": "",
        "port": 0,
        "jdbc_extend": "",
        "default_db": "",
        "db_name": "",
        "table_name": "",
        "hive_database": "",
        "hive_table": "",
        "is_overwrite": "",
        "query_sql": "",
        "columns": "",
        "filter": None,
        "max_value": None,
        "map_column_hive": "",
        "hive_partition_key": "",
        "hive_partition_value": "",
        "fields_terminated_by": None,
        "line_terminated_by": None,
        "use_raw_null": "1",
        "use_local_mode": "0",
        "warehouse_dir": "",
        "class_name": None,
        "outdir": None,
        "split_by": None,
        "m": None,
        "is_drop": "0",
        "exec_engine": "sqoop"
    }
    # table_names = []
    # conn_names = []
    wizard_name = args.wizard
    if args.source_table and len(args.source_table.split(".")) != 2:
        print("-s must be <database>.<table>, e.g. pms.ppets")
        sys.exit(1)
    if args.target_table and len(args.target_table.split(".")) != 2:
        print("-t must be <database>.<table>, e.g. pet_medical.ods_pmsweb_ppets")
        sys.exit(1)
    db = args.db.lower()
    tb = args.tb.lower()
    args_dict["exec_engine"] = args.exec_engine
    args_dict["db_name"] = args.source_table.split(".")[0].lower() if args.source_table else ""
    args_dict["table_name"] = args.source_table.split(".")[1].lower() if args.source_table else ""
    args_dict["hive_database"] = args.target_table.split(".")[0].lower() if args.target_table else ""
    args_dict["hive_table"] = args.target_table.split(".")[1].lower() if args.target_table else ""
    args_dict["m"] = args.num_pappers
    args_dict["is_overwrite"] = "1" if args.hive_overwrite else "0"
    args_dict["hive_partition_key"] = args.hive_partition_key
    args_dict["hive_partition_value"] = args.hive_partition_value
    args_dict["use_local_mode"] = "1" if args.use_local_mode else "0"
    sql = """
    SELECT t.connection_id,
           t.connection_name,
           t.db_type,
           t.host,
           t.user,
           t.password,
           t.port,
           t.jdbc_extend,
           t.default_db,
           s.db_name,
           s.table_name,
           s.hive_database,
           s.hive_table,
           s.is_overwrite,
           s.query_sql,
           s.columns,
           s.filter,
           s.max_value,
           s.map_column_hive,
           s.hive_partition_key,
           s.hive_partition_value,
           s.fields_terminated_by,
           s.line_terminated_by,
           s.use_raw_null,
           s.use_local_mode,
           s.warehouse_dir,
           s.class_name,
           s.outdir,
           s.split_by,
           s.m,
           s.is_drop,
           s.exec_engine
    FROM meta_connections t
    LEFT JOIN meta_import s ON t.connection_id = s.connection_id
    ORDER BY t.connection_id
    """
    con = create_engine(get_engine_str("mysql").format(**config.DB_CONF), poolclass=pool.NullPool)
    df = pd.read_sql(sql=sql, con=con)
    df["db_name"] = df["db_name"].map(lambda x: str(x).lower())
    df["table_name"] = df["table_name"].map(lambda x: str(x).lower())
    df["hive_database"] = df["hive_database"].map(lambda x: str(x).lower())
    df["hive_table"] = df["hive_table"].map(lambda x: str(x).lower())
    conn_names = df["connection_name"].drop_duplicates(keep="first").tolist()
    db_names = df[(df["db_name"].notna()) & (df["db_name"] != "")].drop_duplicates(keep="first")["db_name"].tolist()
    table_names = df[(df["table_name"].notna()) & (df["table_name"] != "")].drop_duplicates(keep="first")["table_name"].tolist()
    # print([x.lower() for x in table_names])
    if wizard_name not in conn_names:
        print("Error Message: -w database connection name does not exist")
        sys.exit(1)
    if db != "" and db.lower() not in [x.lower() for x in db_names]:
        print("Error Message: --db database name does not exist", db.lower(), db_names)
        sys.exit(1)
    if tb != "" and tb.lower() not in [x.lower() for x in table_names]:
        print("Error Message: --tb table name does not exist", tb.lower(), table_names)
        sys.exit(1)
    if tb != "" and args_dict["table_name"] != "":
        print("Error Message: --tb and -s cannot both be specified")
        sys.exit(1)
    # print(conn_names, table_names)
    # tmp_row = [row for row in rows if wizard_name == row["connection_name"]][0]
    tmp_row = df[df["connection_name"] == wizard_name].to_dict("records")[0]
    args_dict["connection_id"] = tmp_row["connection_id"]
    args_dict["connection_name"] = tmp_row["connection_name"]
    args_dict["db_type"] = tmp_row["db_type"]
    args_dict["host"] = tmp_row["host"]
    args_dict["user"] = tmp_row["user"]
    args_dict["password"] = tmp_row["password"]
    args_dict["port"] = tmp_row["port"]
    args_dict["jdbc_extend"] = tmp_row["jdbc_extend"]
    args_dict["default_db"] = tmp_row["default_db"]
    if db != "" and tb != "":
        # args_dict = df[(df["connection_name"] == wizard_name) & (df["table_name"] == table_name_meta.lower())].head(1).to_dict("records")[0]
        args_dict = df[(df["connection_name"] == wizard_name) & (df["db_name"] == db) & (df["table_name"] == tb)]
        # Convert pandas NaN values to native None
        args_dict = args_dict.where(args_dict.notna(), None)
        # Convert the DataFrame to a dict
        args_dict = args_dict.to_dict("records")[0]
        # print(df.dtypes)
        # print(args_dict)
    if (db != "" and tb != "") or (args_dict["db_name"] != "" and args_dict["table_name"] != ""
                                   and args_dict["hive_database"] != "" and args_dict["hive_table"] != ""):
        pass
    else:
        print("Error Message: you must specify -w --db --tb, or -w -d -s -t")
        sys.exit(1)
    args_dict["hive_database"] = args_dict["hive_database"]
    args_dict["hive_table"] = args_dict["hive_table"]
    args_dict["hive_full_name"] = args_dict["hive_database"] + "." + args_dict["hive_table"]
    args_dict["db_conf"] = {
        "host": args_dict["host"],
        "port": args_dict["port"],
        "user": args_dict["user"],
        "password": args_dict["password"],
        "database": args_dict["db_name"],
        "charset": "utf8"
    }
    if not is_valid(args_dict["db_name"]):
        args_dict["db_name"] = args_dict["default_db"]
    if args_dict["hive_partition_value"] == "$yesterday":
        args_dict["hive_partition_value"] = get_yesterday()
    if not is_valid(args_dict["m"]):
        args_dict["m"] = 1
    if args.num_pappers != '1':
        args_dict["m"] = args.num_pappers
    args_dict["m"] = int(args_dict["m"])
    print(args_dict)
    return args_dict
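# --- Illustrative usage (not part of the original source; the script file name is assumed) ---
# Two ways to invoke the import, based on the argparse help above:
#   python3 data_import.py -w warmsoft --db pms --tb ppets
#   python3 data_import.py -w warmsoft -s pms.ppets -t pet_medical.src_pms_ppets -m 4 --exec_engine datax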
def pre_args():
    parse = argparse.ArgumentParser(
        prog="DataExport",
        description="I am help message... The default mode loads data into a temporary table and then "
                    "renames it to the final table. "
                    "Example1: python3 data_export.py -w xiaonuan_ddl --db xiaonuan --tb syscategory "
                    "Example2: python3 data_export.py -w xiaonuan_ddl -s data_xiaonuan_final.syscategory -t syscategory --mode=overwrite")
    parse.add_argument("-w", "--wizard", required=True,
                       help="wizard, the name of a registered database connection. example: -w xiaonuan_ddl")
    parse.add_argument("--db", default="", help="<database> db_name in meta_export, case-insensitive")
    parse.add_argument("--tb", default="", help="<table_name> table_name in meta_export, case-insensitive")
    parse.add_argument("--mode", choices=["rename", "overwrite", "append"], default="rename",
                       help="load mode")
    parse.add_argument("--exec_engine", choices=["sqoop", "datax"], default="sqoop",
                       help="execution engine, sqoop or datax")
    parse.add_argument("-s", "--source_table", default="",
                       help="source table. example: pet_medical.ods_pmsweb_ppets")
    parse.add_argument("-t", "--target_table", default="", help="target table. example: xiaonuan.ppets")
    parse.add_argument("-m", "--num_pappers", default="1", help="number of parallel map tasks, default 1")
    parse.add_argument("--use_local_mode", action="store_true",
                       help="run in local mode. Only one machine in the cluster has internet access; "
                            "if a task is scheduled onto a machine without internet access it cannot run, "
                            "so local mode must be specified. For tasks that need internet access the "
                            "datax engine is recommended.")
    args = parse.parse_args()
    print(args)
    args_dict = {
        "connection_id": "",
        "connection_name": "",
        "db_type": "",
        "host": "",
        "user": "",
        "password": "",
        "port": 0,
        "jdbc_extend": "",
        "default_db": "",
        "hive_database": "",
        "hive_table": "",
        "db_name": "",
        "table_name": "",
        "m": 1,
        "is_overwrite": "",
        "is_drop": "0",
        "mode": "rename",
        "exec_engine": "sqoop"
    }
    wizard_name = args.wizard
    if args.source_table and len(args.source_table.split(".")) != 2:
        print("-s must be <database>.<table>, e.g. pet_medical.ods_pmsweb_ppets")
        sys.exit(1)
    if args.target_table and len(args.target_table.split(".")) != 2:
        print("-t must be <database>.<table>, e.g. xiaonuan.ppets")
        sys.exit(1)
    db = args.db.lower()
    tb = args.tb.lower()
    args_dict["exec_engine"] = args.exec_engine
    args_dict["hive_database"] = args.source_table.split(".")[0].lower() if args.source_table else ""
    args_dict["hive_table"] = args.source_table.split(".")[1].lower() if args.source_table else ""
    args_dict["db_name"] = args.target_table.split(".")[0].lower() if args.target_table else ""
    args_dict["table_name"] = args.target_table.split(".")[1].lower() if args.target_table else ""
    args_dict["m"] = args.num_pappers
    args_dict["mode"] = args.mode
    args_dict["use_local_mode"] = "1" if args.use_local_mode else "0"
    # args_dict["is_overwrite"] = "1" if args.hive_overwrite else "0"
    sql = """
    SELECT t.connection_id,
           t.connection_name,
           t.db_type,
           t.host,
           t.user,
           t.password,
           t.port,
           t.jdbc_extend,
           t.default_db,
           s.hive_database,
           s.hive_table,
           s.db_name,
           s.table_name,
           s.exec_engine,
           s.m,
           s.is_overwrite,
           s.is_drop,
           s.mode
    FROM meta_connections t
    LEFT JOIN meta_export s ON t.connection_id = s.connection_id
    ORDER BY t.connection_id
    """
    con = create_engine(get_engine_str("mysql").format(**config.DB_CONF), poolclass=pool.NullPool)
    df = pd.read_sql(sql=sql, con=con)
    df["db_name"] = df["db_name"].map(lambda x: str(x).lower())
    df["table_name"] = df["table_name"].map(lambda x: str(x).lower())
    df["hive_database"] = df["hive_database"].map(lambda x: str(x).lower())
    df["hive_table"] = df["hive_table"].map(lambda x: str(x).lower())
    conn_names = df["connection_name"].drop_duplicates(keep="first").tolist()
    db_names = df[(df["db_name"].notna()) & (df["db_name"] != "")].drop_duplicates(keep="first")["db_name"].tolist()
    table_names = df[(df["table_name"].notna()) & (df["table_name"] != "")].drop_duplicates(keep="first")["table_name"].tolist()
(df["table_name"] != "")].drop_duplicates( keep="first")["table_name"].tolist() # print([x.lower() for x in table_names]) if wizard_name not in conn_names: print("Error Message: -w 数据库链接名称不存在") sys.exit(1) if db != "" and db.lower() not in [x.lower() for x in db_names]: print("Error Message: --db 库名不存在") sys.exit(1) if tb != "" and tb.lower() not in [x.lower() for x in table_names]: print("Error Message: --tb 表名不存在") sys.exit(1) if tb != "" and args_dict["table_name"] != "": print("Error Message: --tb -s 不能同时指定") sys.exit(1) # print(conn_names, table_names) # tmp_row = [row for row in rows if wizard_name == row["connection_name"]][0] tmp_row = df[df["connection_name"] == wizard_name].to_dict("records")[0] args_dict["connection_id"] = tmp_row["connection_id"] args_dict["connection_name"] = tmp_row["connection_name"] args_dict["db_type"] = tmp_row["db_type"] args_dict["host"] = tmp_row["host"] args_dict["user"] = tmp_row["user"] args_dict["password"] = tmp_row["password"] args_dict["port"] = tmp_row["port"] args_dict["jdbc_extend"] = tmp_row["jdbc_extend"] args_dict["default_db"] = tmp_row["default_db"] if db != "" and tb != "": # args_dict = df[(df["connection_name"] == wizard_name) & (df["table_name"] == table_name_meta.lower())].head(1).to_dict("records")[0] args_dict = df[(df["connection_name"] == wizard_name) & (df["db_name"] == db) & (df["table_name"] == tb)] # 将pandas的特有类型nan处理成原生的None args_dict = args_dict.where(args_dict.notna(), None) # DataFrame转换成dict args_dict = args_dict.to_dict("records")[0] # print(df.dtypes) # print(args_dict) if (db != "" and tb != "") or (args_dict["db_name"] != "" and args_dict["table_name"] != "" and args_dict["hive_database"] != "" and args_dict["hive_table"] != ""): pass else: print("Error Message: 必须指定 -w --db --tb 或者 -w -d -s -t 的参数值") sys.exit(1) # args_dict["hive_database"] = args_dict["hive_database"] # args_dict["hive_table"] = args_dict["hive_table"] args_dict["hive_full_name"] = args_dict["hive_database"] + "." + args_dict[ "hive_table"] args_dict["db_conf"] = { "host": args_dict["host"], "port": args_dict["port"], "user": args_dict["user"], "password": args_dict["password"], "database": args_dict["db_name"], "charset": "utf8" } if not is_valid(args_dict["db_name"]): args_dict["db_name"] = args_dict["default_db"] if not is_valid(args_dict["mode"]): args_dict["mode"] = "rename" if not is_valid(args_dict["exec_engine"]): args_dict["exec_engine"] = "sqoop" if not is_valid(args_dict["m"]): args_dict["m"] = 1 if args.num_pappers != '1': args_dict["m"] = args.num_pappers args_dict["m"] = int(args_dict["m"]) print(args_dict) return args_dict