Example #1
def update_rowcount(row):
    sql = "select count(1) as cnt from {table_name}".format(
        table_name=row["table_name"])
    db_conf = {
        "host": row["host"],
        "port": row["port"],
        "user": row["user"],
        "password": row["password"],
        "database": row["db_name"],
        "charset": "utf8"
    }
    engine_str = get_engine_str(row["db_type"]).format(**db_conf)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    # Dlogger.debug(sql)
    try:
        result = pd.read_sql(sql, con)
        table_rows = result.iat[0, 0]
        sql_update = "update meta_import set table_rows={table_rows},rows_updatetime='{now}' where db_name='{db_name}' and table_name='{table_name}'" \
            .format(table_rows=table_rows,
                    now=str(datetime.datetime.now()),
                    db_name=row["db_name"],
                    table_name=row["table_name"])
        mysql_con.execute(sql_update)
    except Exception:
        table_rows = "Null"
    Dlogger.info("{:<40} : {}".format(row["db_name"] + "." + row["table_name"],
                                      table_rows))
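# Usage sketch (an illustration, not part of the original module): update_rowcount
# expects a row exposing host/port/user/password/db_type plus db_name/table_name,
# which in this project appears to come from joining meta_connections with
# meta_import; mysql_con is the module-level engine shown in Example #8.
if __name__ == "__main__":
    meta_sql = """
    select t.host, t.port, t.user, t.password, t.db_type,
           s.db_name, s.table_name
      from meta_connections t
      join meta_import s on t.connection_id = s.connection_id"""
    for _, meta_row in pd.read_sql(meta_sql, mysql_con).iterrows():
        update_rowcount(meta_row)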
Example #2
def main(row):
    table_names = get_tabs(row["db_type"], row["db_conf"])
    Dlogger.info("{}库共有{}个表".format(row["db_name"], len(table_names)))
    if len(table_names) == 0:
        sys.exit(1)
    insert_sql = """
        INSERT INTO meta_import(connection_id, db_name, table_name, hive_database, hive_table, exec_engine, is_overwrite, status) VALUE"""
    for table_name in table_names:
        tmp_value = "('{connection_id}','{db_name}','{table_name}','{hive_database}','{hive_table}','{exec_engine}',{is_overwrite},{status})," \
            .format(connection_id=row["connection_id"],
                    db_name=row["db_name"],
                    table_name=table_name,
                    hive_database="pet_medical",
                    hive_table="src_" + row["target_db_name"] + "_" + table_name,
                    exec_engine="sqoop",
                    is_overwrite=1,
                    status=0)
        insert_sql = insert_sql + "\n        " + tmp_value
    insert_sql = insert_sql[:-1]
    Dlogger.info(insert_sql)
    con = create_engine(get_engine_str("mysql").format(**config.DB_CONF))
    con.execute(insert_sql)
    Dlogger.info("insert success")
    db_map_sql = "select distinct db_name,substr(t.hive_table,5,locate('_', t.hive_table,5)-1-4) as hive_db_name from meta_import t"
    df = pd.read_sql(sql=db_map_sql, con=con)
    print(df)
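# Usage sketch (an assumption): `row` is the dict produced by the pre_args() helper
# shown in Example #11, which supplies the connection_id, db_name, db_type, db_conf
# and target_db_name keys read above.
if __name__ == "__main__":
    main(pre_args())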
Example #3
def get_cols(db_type, tb_name, db_conf):
    cols = []
    engine_str = get_engine_str(db_type).format(**db_conf)
    if db_type == "sqlserver":
        sql = "sp_columns {tb_name}".format(tb_name=tb_name)
        Dlogger.info("MSSQL Command = " + sql)
        # engine_str = "mssql+pymssql://{username}:{password}@{host}:{port}/{database}?charset=utf8".format(**db_conf)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        df = pd.read_sql(sql=sql, con=con)
        for index, row in df.iterrows():
            key = str(row["COLUMN_NAME"])
            value = str(row["TYPE_NAME"].replace(" identity", ""))
            cols.append({"name": key, "type": value})
    elif db_type == "mysql":
        sql = "select t.column_name,data_type,column_type from information_schema.columns t where t.table_schema='{db_name}' and t.table_name='{tb_name}'".format(
            db_name=db_conf["database"], tb_name=tb_name)
        Dlogger.info("MySQL Command = " + sql)
        # engine_str = "mysql+pymysql://{username}:{password}@{host}:{port}/{database}?charset=utf8".format(**db_conf)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        df = pd.read_sql(sql=sql, con=con)
        for index, row in df.iterrows():
            key = str(row["column_name"])
            value = str(row["column_type"].replace(" unsigned", ""))
            cols.append({"name": key, "type": value})
    elif db_type == "oracle":
        sql = "select column_name,data_type from user_tab_columns t where lower(t.table_name)='{tb_name}'".format(
            tb_name=tb_name)
        Dlogger.info("Oracle command = " + sql)
        # engine_str = "oracle+cx_oracle://{username}:{password}@{host}:{port}/{database}".format(**db_conf)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        df = pd.read_sql(sql=sql, con=con)
        for index, row in df.iterrows():
            key = str(row["column_name"])
            value = str(row["data_type"])
            cols.append({"name": key, "type": value})
    elif db_type == "hive":
        sql = "desc {tb_name}".format(tb_name=tb_name)
        Dlogger.info("Hive Command = " + sql)
        # engine_str = "hive://{username}@{host}:{port}/default".format(**db_conf)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        try:
            df = pd.read_sql(sql=sql, con=con)
            # columns = df.columns.values.tolist()
            for index, row in df.iterrows():
                if row["col_name"].rstrip().lstrip() == "":
                    break
                key = row["col_name"]
                value = row["data_type"]
                cols.append({"name": key, "type": value})
            if cols[-1]["name"] in ("date", "p_date"):
                cols.pop(-1)
        except Exception:
            Dlogger.error(traceback.format_exc())
            Dlogger.error("{} not exists. ".format(tb_name))
    else:
        raise Exception("DATABASE TYPE ERROR !")
    Dlogger.info(str(cols))
    return cols
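# Usage sketch (placeholder connection values, an assumption for illustration only):
# get_cols() returns a list of {"name": ..., "type": ...} dicts for the given table.
if __name__ == "__main__":
    mysql_conf = {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "reader",
        "password": "secret",
        "database": "pms",
        "charset": "utf8"
    }
    columns = get_cols("mysql", "ppets", mysql_conf)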
Example #4
def is_increase(hive_full_table):
    sql = """select filter,max_value from meta_import where lower(hive_database)=lower('{hive_database}') and lower(hive_table)=lower('{hive_table}') limit 1;""" \
        .format(hive_database=hive_full_table.split(".")[0],
                hive_table=hive_full_table.split(".")[1])
    engine_str = get_engine_str("mysql").format(**config.DB_CONF)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    Dlogger.info(sql)
    # rows=con.execute(sql)
    df = pd.read_sql(sql=sql, con=con)
    if not df.empty:
        filter_value = df.iat[0, 0]
        max_value = df.iat[0, 1]
        if is_valid(filter_value) and is_valid(max_value):
            return True
    return False
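# Usage sketch: is_increase() reports whether a table is configured for incremental
# import, i.e. both `filter` and `max_value` are filled in meta_import (the table
# name below is illustrative).
if is_increase("pet_medical.src_pms_ppets"):
    print("incremental import")
else:
    print("full import")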
Example #5
def main(row):
    con_t = create_engine(get_engine_str("mysql").format(**row["db_conf"]),
                          poolclass=pool.NullPool)
    dt = datetime.datetime.now().strftime("%Y%m%d")
    last_dt = (datetime.datetime.now() -
               datetime.timedelta(days=1)).strftime("%Y%m%d")
    rename_sql, target_tb_name = pre_ddl(row, con_t, dt, last_dt)

    if row["exec_engine"] == "sqoop":
        sqoop_export(row["db_conf"], row["hive_database"], row["hive_table"],
                     target_tb_name, row["m"])
    else:
        datax_generate_json(row, target_tb_name)
        datax_export(row)

    post_ddl(row["mode"], con_t, rename_sql, row["table_name"], last_dt)
Example #6
def prepare_increase(row):
    max_sql = "select {max_value} from {hive_full_name} limit 1".format(
        max_value=row["max_value"], hive_full_name=row["hive_full_name"])
    engine_str = get_engine_str("hive").format(**config.HIVE_CONF)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    Dlogger.info(max_sql)
    try:
        df = pd.read_sql(sql=max_sql, con=con)
        max_value = df.iat[0, 0] if not df.empty else 0
    except Exception as e:
        Dlogger.warn("{} Not Exists Or Table Name Error".format(
            row["hive_full_name"]))
        max_value = 0
    Dlogger.info("Max Value : {}".format(max_value))
    if row["exec_engine"] == "sqoop":
        where_str = '--where "{}" \t'.format(row["filter"])
    else:
        where_str = row["filter"]
    where_str = where_str.format(max_value=max_value)
    return where_str
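# Usage sketch (illustrative row values, an assumption): the `filter` column keeps a
# literal {max_value} placeholder that prepare_increase() fills with the current
# maximum read from the Hive table.
example_row = {
    "hive_full_name": "pet_medical.src_pms_ppets",
    "max_value": "max(id)",
    "filter": "id > {max_value}",
    "exec_engine": "sqoop"
}
where_str = prepare_increase(example_row)
# sqoop -> '--where "id > <current max>" \t'; datax -> 'id > <current max>'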
Example #7
def get_tabs(db_type, db_conf):
    tabs = []
    if db_type == "sqlserver":
        sql = "select name as table_name from sys.tables;"
        Dlogger.info("MSSQL Command = " + sql)
    elif db_type == "mysql":
        sql = "select table_name from information_schema.tables t where t.table_schema='{db_name}'".format(
            db_name=db_conf["database"])
        Dlogger.info("MySQL Command = " + sql)
    elif db_type == "oracle":
        sql = "select table_name from user_tables"
        Dlogger.info("Oracle command = " + sql)
    else:
        raise Exception("DATABASE TYPE ERROR !")
    engine_str = get_engine_str(db_type).format(**db_conf)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    # df = pd.read_sql(sql=sql, con=con)
    for row in con.execute(sql):
        tabs.append(row["table_name"])
    Dlogger.info(str(tabs))
    return tabs
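# Usage sketch (placeholder credentials, an assumption): list every table of the
# configured SQL Server database.
sqlserver_conf = {
    "host": "127.0.0.1",
    "port": 1433,
    "user": "reader",
    "password": "secret",
    "database": "pms",
    "charset": "utf8"
}
tables = get_tabs("sqlserver", sqlserver_conf)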
Example #8
#!/usr/bin/env python
# coding=utf-8

import datetime
import pandas as pd
from sqlalchemy import create_engine, pool
from utility import get_engine_str
from conf import config

Dlogger = config.get_logger("DataRowCount")
mysql_engine_str = get_engine_str("mysql").format(**config.DB_CONF)
mysql_con = create_engine(mysql_engine_str, poolclass=pool.NullPool)


def update_rowcount(row):
    sql = "select count(1) as cnt from {table_name}".format(
        table_name=row["table_name"])
    db_conf = {
        "host": row["host"],
        "port": row["port"],
        "user": row["user"],
        "password": row["password"],
        "database": row["db_name"],
        "charset": "utf8"
    }
    engine_str = get_engine_str(row["db_type"]).format(**db_conf)
    con = create_engine(engine_str, poolclass=pool.NullPool)
    # Dlogger.debug(sql)
    try:
        result = pd.read_sql(sql, con)
        table_rows = result.iat[0, 0]
        sql_update = "update meta_import set table_rows={table_rows},rows_updatetime='{now}' where db_name='{db_name}' and table_name='{table_name}'" \
            .format(table_rows=table_rows,
                    now=str(datetime.datetime.now()),
                    db_name=row["db_name"],
                    table_name=row["table_name"])
        mysql_con.execute(sql_update)
    except Exception:
        table_rows = "Null"
    Dlogger.info("{:<40} : {}".format(row["db_name"] + "." + row["table_name"],
                                      table_rows))
Example #9
import os
import sys
import datetime

from sqlalchemy import create_engine, pool

from utility import get_engine_str
from conf import config

# Number of parallel processes
_process_nums = 6

# Script executed for a single task
_data_export_py = os.path.join(sys.path[0], "data_export.py")

# Log path
_log_path = os.path.join(config.LOG_PATH, "data_export")

if not os.path.exists(_log_path):
    os.makedirs(_log_path)

# MySQL database connection
con = create_engine(get_engine_str("mysql").format(**config.DB_CONF),
                    poolclass=pool.NullPool)

Dlogger = config.get_logger("DataExportAuto")


def parallel_write_log(no, id, connection_name, db_name, table_name,
                       last_exec_date, retry_count):
    try:
        current_date = str(datetime.date.today())
        if last_exec_date == current_date:
            print("{no:<3} {table_name:<40}      今日已执行成功 skip.".format(
                no=no, table_name=db_name + "." + table_name))
            return ""

        full_log_path = "{log_path}/sqoop_export_{db_name}_{table_name}.log".format(
Example #10
def merge_data(pms_table):
    clinic_table = "pet_medical.mir_clinic"

    # Tables that the HIS clinical module does not have: clinical-related data from the old Xiaonuan system
    if re.search("CEMRecord|CPrescriptions|CPrescriptionDetails|CPhysical|CEMRecordPhysical|CLabReport|CLabVirusDetails|PBCheckList|PXRaysList|PPathologyList", pms_table, re.I):
        if "cemrecord" in pms_table.lower():
            file_type = "rcfile"
        else:
            file_type = "orc"
        sql = """
        drop table {data_xiaonuan_final_table};
        create table {data_xiaonuan_final_table} stored as {file_type} as
        select * from {pms_table};
        """
        exec_sql = sql.format(pms_table=pms_table, file_type=file_type, data_xiaonuan_final_table=pms_table.replace("pet_medical.ods_pms_", "data_xiaonuan_final."))
    else:
        pmsweb_table = pms_table.replace("ods_pms_", "ods_pmsweb_")
        mid_merge_table = pms_table.replace("ods_pms_", "mid_")
        sql1 = "desc {pms_table}".format(pms_table=pms_table)
        sql2 = "desc {pmsweb_table}".format(pmsweb_table=pmsweb_table)
        engine_str = get_engine_str("hive").format(**config.HIVE_CONF)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        Dlogger.info(sql1)
        df1 = pd.read_sql(sql=sql1, con=con)
        Dlogger.info(sql2)
        df2 = pd.read_sql(sql=sql2, con=con)
        # print(df1)
        # print(df2)
        same_df = pd.merge(df1, df2, on="col_name", how="inner")["col_name"]
        same_columns = same_df.tolist()
        columns_str = ",".join(["t." + x for x in same_columns])
        if re.search("SysCategory", pms_table, re.I):  # SysCategory以pmsweb为主吧,其实都差不多
            sql = """
            drop table {data_xiaonuan_final_table};
            create table {data_xiaonuan_final_table} stored as orc as
            select * from {pmsweb_table};
            """
        elif re.search("PClientManagement", pms_table, re.I):
            sql = """
            drop table {mid_merge_table};
            create table {mid_merge_table} stored as orc as 
            select {columns_str} from {pms_table} t left join {clinic_table} s on t.id=s.clinic_id where s.pmsweb_id is null
            union all
            select {columns_str} from {pmsweb_table} t left join {clinic_table} s on t.id=s.clinic_id where s.pmsweb_id is not null;
            
            drop table {mir_merge_table};
            create table {mir_merge_table} stored as orc as
            select s.brand_code,s.clinic_name,t.* from {mid_merge_table} t join (select brand_code,clinic_id,clinic_name from {clinic_table}) s on t.id=s.clinic_id;
            
            drop table {data_xiaonuan_final_table};
            create table {data_xiaonuan_final_table} stored as orc as
            select * from {mir_merge_table};
            """
        else:
            sql = """
            drop table {mid_merge_table};
            create table {mid_merge_table} stored as orc as 
            select {columns_str} from {pms_table} t left join {clinic_table} s on t.orgid=s.clinic_id where s.pmsweb_id is null
            union all
            select {columns_str} from {pmsweb_table} t left join {clinic_table} s on t.orgid=s.clinic_id where s.pmsweb_id is not null;
            
            drop table {mir_merge_table};
            create table {mir_merge_table} stored as orc as
            select s.brand_code,s.clinic_name,t.* from {mid_merge_table} t join (select brand_code,clinic_id,clinic_name from {clinic_table}) s on t.orgid=s.clinic_id;
    
            drop table {data_xiaonuan_final_table};
            create table {data_xiaonuan_final_table} stored as orc as
            select * from {mir_merge_table};
            """
        exec_sql = sql.format(mid_merge_table=mid_merge_table,
                              columns_str=columns_str,
                              pms_table=pms_table,
                              pmsweb_table=pmsweb_table,
                              mir_merge_table=mid_merge_table.replace("mid_", "mir_"),
                              clinic_table=clinic_table,
                              data_xiaonuan_final_table=pms_table.replace("pet_medical.ods_pms_", "data_xiaonuan_final.")
                              )
    # for sql in exec_sql.split(";"):
    #     Dlogger.info(sql)
    #     con.execute(sql)
    Dlogger.info(exec_sql)
    sh_cmd = 'hive -e "{exec_sql}"'.format(exec_sql=exec_sql)
    subprocess.check_output(sh_cmd, shell=True)
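# Usage sketch: merge_data() takes the full ods_pms_* table name and derives the
# matching pmsweb/mid/mir/data_xiaonuan_final table names from it (the table below
# is one of the names this script already handles).
merge_data("pet_medical.ods_pms_pclientmanagement")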
Example #11
def pre_args():
    parse = argparse.ArgumentParser(prog="InsertMetaData", description="I am help message...")
    parse.add_argument("-w", "--wizard", required=True, help="wizard,选择已经添加的数据库配置名称. example: -w warmsoft")
    parse.add_argument("--db", required=True, help="database,指定需要同步的数据库名称")
    parse.add_argument("--target_db", help="指定同步到hive中的库名,不填默认和db相同")
    args = parse.parse_args()
    print(args)

    args_dict = {
        "connection_id": None,
        "connection_name": args.wizard,
        "db_type": "",
        "host": "",
        "user": "",
        "password ": "",
        "port": 0,
        "jdbc_extend": "",
        "default_db": "",
        "db_name": args.db,
        "target_db_name": args.target_db if args.target_db else args.db
    }
    sql = """
    SELECT t.connection_id,
           t.connection_name,
           t.db_type,
           t.host,
           t.user,
           t.password ,
           t.port,
           t.jdbc_extend,
           t.default_db
      FROM meta_connections t
     WHERE t.connection_name = '{}'""".format(args.wizard)
    Dlogger.info(sql)
    con = create_engine(get_engine_str("mysql").format(**config.DB_CONF), poolclass=pool.NullPool)
    df = pd.read_sql(sql=sql, con=con)

    if len(df) > 0:
        tmp_row = df.to_dict("records")[0]
    else:
        print("Error Message: -w 数据库链接名称不存在")
        sys.exit(1)
    args_dict["connection_id"] = tmp_row["connection_id"]
    args_dict["connection_name"] = tmp_row["connection_name"]
    args_dict["db_type"] = tmp_row["db_type"]
    args_dict["host"] = tmp_row["host"]
    args_dict["user"] = tmp_row["user"]
    args_dict["password"] = tmp_row["password"]
    args_dict["port"] = tmp_row["port"]
    args_dict["jdbc_extend"] = tmp_row["jdbc_extend"]
    args_dict["default_db"] = tmp_row["default_db"]
    args_dict["db_conf"] = {
        "host": args_dict["host"],
        "port": args_dict["port"],
        "user": args_dict["user"],
        "password": args_dict["password"],
        "database": args_dict["db_name"],
        "charset": "utf8"
    }
    Dlogger.info(args_dict)
    return args_dict
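# Usage sketch (the script name and example values are assumptions, flags as defined
# above); the returned dict feeds main() from Example #2:
#   python3 insert_meta.py -w warmsoft --db pms
#   python3 insert_meta.py -w warmsoft --db pms --target_db pms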
Example #12
def prepare_ddl(exec_engine, hive_full_name, cols_list_map, hive_cols,
                hive_partition_key, hive_partition_value, alter_sql,
                is_overwrite):
    # Decide which table schema (DDL) statement is needed
    schema_sql = ""
    if len(hive_cols) == 0:
        if exec_engine == "sqoop":
            # In hive-import mode sqoop cannot write data into ORC format; only hcatalog mode can, but that mode has partition permission problems
            # With hive-import mode no create-table statement is needed
            schema_sql = ""
        else:  # exec_engine=="datax"
            if is_valid(hive_partition_key) and is_valid(hive_partition_value):
                part_sql = " partitioned by ({} string)".format(
                    hive_partition_key)
            else:
                part_sql = ""
            # ORC cannot replace columns, and an identical schema makes it easy to switch between sqoop and datax at will, so use sqoop's default text format instead of ORC
            # schema_sql = """create table if not exists {table} (\n{cols}\n        )\n{part_sql}        stored as orc;""" \
            schema_sql = """create table if not exists {table} (\n{cols}\n        ){part_sql}""" \
                .format(table=hive_full_name,
                        cols=''.join(["            {name} {type},\n".format(name=x["name"], type=x["type"]) for x in cols_list_map])[:-2],
                        part_sql=part_sql) \
                .lower()

    else:
        if len(alter_sql) > 0:
            schema_sql = "".join(alter_sql)

    # Decide whether existing data must be deleted
    delete_data_sql = ""
    if exec_engine == "datax" and is_overwrite == "1":
        if hive_partition_key and hive_partition_value:
            delete_data_sql = """
        alter table {hive_full_table} drop if exists partition ({hive_partition_key}='{hive_partition_value}')""". \
                format(hive_full_table=hive_full_name,
                       hive_partition_key=hive_partition_key,
                       hive_partition_value=hive_partition_value)
        else:
            delete_data_sql = """
        truncate table {hive_full_table}""". \
                format(hive_full_table=hive_full_name,
                       hive_partition_key=hive_partition_key,
                       hive_partition_value=hive_partition_value)

    # Decide whether an add-partition statement is needed
    add_partition_sql = ""
    if exec_engine == "datax" and is_valid(hive_partition_key) and is_valid(
            hive_partition_value):
        add_partition_sql = """
        alter table {hive_full_table} add if not exists partition ({hive_partition_key}='{hive_partition_value}')""" \
            .format(hive_full_table=hive_full_name,
                    hive_partition_key=hive_partition_key,
                    hive_partition_value=hive_partition_value)

    # Execute the statements
    if schema_sql + delete_data_sql + add_partition_sql != "":
        engine_str = get_engine_str("hive").format(**config.HIVE_CONF)
        con = create_engine(engine_str, poolclass=pool.NullPool)
        if schema_sql != "":
            for tmp_schema_sql in [x for x in schema_sql.split(";") if x]:
                Dlogger.info("\n        " + tmp_schema_sql + ";")
                con.execute(tmp_schema_sql)
        if delete_data_sql != "":
            for tmp_drop_partition_sql in [
                    x for x in delete_data_sql.split(";") if x
            ]:
                Dlogger.info(tmp_drop_partition_sql + ";")
                con.execute(tmp_drop_partition_sql)
        if add_partition_sql != "":
            for tmp_add_partition_sql in [
                    x for x in add_partition_sql.split(";") if x
            ]:
                Dlogger.info(tmp_add_partition_sql + ";")
                con.execute(tmp_add_partition_sql)
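# Usage sketch (illustrative arguments, an assumption): with no existing Hive columns
# the function creates the table, and for datax with is_overwrite="1" it also drops
# and re-adds the target partition.
prepare_ddl(exec_engine="datax",
            hive_full_name="pet_medical.src_pms_ppets",
            cols_list_map=[{"name": "id", "type": "int"},
                           {"name": "name", "type": "string"}],
            hive_cols=[],
            hive_partition_key="p_date",
            hive_partition_value="20240101",
            alter_sql=[],
            is_overwrite="1")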
Example #13
def pre_args():
    parse = argparse.ArgumentParser(prog="DataImport",
                                    description="I am help message...")
    parse.add_argument("-w",
                       "--wizard",
                       required=True,
                       help="wizard,选择已经添加的数据库配置名称. example: -w warmsoft")
    parse.add_argument("--db",
                       default="",
                       help="<database> meta_import中的db_name库名,不区分大小写")
    parse.add_argument("--tb",
                       default="",
                       help="<table_name> meta_import中的table_name表名,不区分大小写")
    parse.add_argument("--exec_engine",
                       choices=["sqoop", "datax"],
                       default="sqoop",
                       help="执行引擎, sqoop或者datax")
    parse.add_argument("-s",
                       "--source_table",
                       default="",
                       help="source table. example: pms.ppets")
    parse.add_argument("-t",
                       "--target_table",
                       default="",
                       help="target table. example: pet_medical.src_pms_ppets")
    parse.add_argument("-m", "--num_pappers", default="1", help="map并行数,默认1个")
    parse.add_argument("--hive_overwrite", action="store_true", help="sqoop覆盖")
    parse.add_argument("--hive_partition_key", help="分区键")
    parse.add_argument("--hive_partition_value", help="分区值")
    parse.add_argument(
        "--use_local_mode",
        action="store_true",
        help=
        "本地模式执行. 集群只有一台机器有外网, 如果分布的任务到没有外网的机器上就不能执行, 就需要指定本地模式。外网的任务建议使用datax引擎"
    )

    args = parse.parse_args()
    # print parse
    print(args)

    args_dict = {
        "connection_id": "",
        "connection_name": "",
        "db_type": "",
        "host": "",
        "user": "",
        "password ": "",
        "port": 0,
        "jdbc_extend": "",
        "default_db": "",
        "db_name": "",
        "table_name": "",
        "hive_database": "",
        "hive_table": "",
        "is_overwrite": "",
        "query_sql": "",
        "columns": "",
        "filter": None,
        "max_value": None,
        "map_column_hive": "",
        "hive_partition_key": "",
        "hive_partition_value": "",
        "fields_terminated_by": None,
        "line_terminated_by": None,
        "use_raw_null": "1",
        "use_local_mode": "0",
        "warehouse_dir": "",
        "class_name": None,
        "outdir": None,
        "split_by": None,
        "m": None,
        "is_drop": "0",
        "exec_engine": "sqoop"
    }
    # table_names = []
    # conn_names = []

    wizard_name = args.wizard
    if args.source_table and len(args.source_table.split(".")) != 2:
        print("-s的参数必须是库名加表名,例如:pms.ppets")
        sys.exit(1)
    if args.target_table and len(args.target_table.split(".")) != 2:
        print("-t的参数必须是库名加表名,例如:pet_medical.ods_pmsweb_ppets")
        sys.exit(1)

    db = args.db.lower()
    tb = args.tb.lower()
    args_dict["exec_engine"] = args.exec_engine
    args_dict["db_name"] = args.source_table.split(
        ".")[0].lower() if args.source_table else ""
    args_dict["table_name"] = args.source_table.split(
        ".")[1].lower() if args.source_table else ""
    args_dict["hive_database"] = args.target_table.split(
        ".")[0].lower() if args.target_table else ""
    args_dict["hive_table"] = args.target_table.split(
        ".")[1].lower() if args.target_table else ""
    args_dict["m"] = args.num_pappers
    args_dict["is_overwrite"] = "1" if args.hive_overwrite else "0"
    args_dict["hive_partition_key"] = args.hive_partition_key
    args_dict["hive_partition_value"] = args.hive_partition_value
    args_dict["use_local_mode"] = "1" if args.use_local_mode else "0"

    sql = """
    SELECT t.connection_id,
           t.connection_name,
           t.db_type,
           t.host,
           t.user,
           t.password ,
           t.port,
           t.jdbc_extend,
           t.default_db,
           s.db_name,
           s.table_name,
           s.hive_database,
           s.hive_table,
           s.is_overwrite,
           s.query_sql,
           s.columns,
           s.filter,
           s.max_value,
           s.map_column_hive,
           s.hive_partition_key,
           s.hive_partition_value,
           s.fields_terminated_by,
           s.line_terminated_by,
           s.use_raw_null,
           s.use_local_mode,
           s.warehouse_dir,
           s.class_name,
           s.outdir,
           s.split_by,
           s.m,
           s.is_drop,
           s.exec_engine
      FROM meta_connections t
      LEFT JOIN meta_import s ON t.connection_id = s.connection_id
     ORDER BY t.connection_id
    """
    con = create_engine(get_engine_str("mysql").format(**config.DB_CONF),
                        poolclass=pool.NullPool)
    df = pd.read_sql(sql=sql, con=con)
    df["db_name"] = df["db_name"].map(lambda x: str(x).lower())
    df["table_name"] = df["table_name"].map(lambda x: str(x).lower())
    df["hive_database"] = df["hive_database"].map(lambda x: str(x).lower())
    df["hive_table"] = df["hive_table"].map(lambda x: str(x).lower())

    conn_names = df["connection_name"].drop_duplicates(keep="first").tolist()
    db_names = df[(df["db_name"].notna())
                  & (df["db_name"] != "")].drop_duplicates(
                      keep="first")["db_name"].tolist()
    table_names = df[(df["table_name"].notna())
                     & (df["table_name"] != "")].drop_duplicates(
                         keep="first")["table_name"].tolist()
    # print([x.lower() for x in table_names])
    if wizard_name not in conn_names:
        print("Error Message: -w 数据库链接名称不存在")
        sys.exit(1)
    if db != "" and db.lower() not in [x.lower() for x in db_names]:
        print("Error Message: --db 库名不存在", db.lower(), db_names)
        sys.exit(1)
    if tb != "" and tb.lower() not in [x.lower() for x in table_names]:
        print("Error Message: --tb 表名不存在", tb.lower(), table_names)
        sys.exit(1)
    if tb != "" and args_dict["table_name"] != "":
        print("Error Message: --tb -s 不能同时指定")
        sys.exit(1)

    # print(conn_names, table_names)
    # tmp_row = [row for row in rows if wizard_name == row["connection_name"]][0]
    tmp_row = df[df["connection_name"] == wizard_name].to_dict("records")[0]
    args_dict["connection_id"] = tmp_row["connection_id"]
    args_dict["connection_name"] = tmp_row["connection_name"]
    args_dict["db_type"] = tmp_row["db_type"]
    args_dict["host"] = tmp_row["host"]
    args_dict["user"] = tmp_row["user"]
    args_dict["password"] = tmp_row["password"]
    args_dict["port"] = tmp_row["port"]
    args_dict["jdbc_extend"] = tmp_row["jdbc_extend"]
    args_dict["default_db"] = tmp_row["default_db"]
    if db != "" and tb != "":
        # args_dict = df[(df["connection_name"] == wizard_name) & (df["table_name"] == table_name_meta.lower())].head(1).to_dict("records")[0]
        args_dict = df[(df["connection_name"] == wizard_name)
                       & (df["db_name"] == db) & (df["table_name"] == tb)]
        # Convert pandas NaN values to native None
        args_dict = args_dict.where(args_dict.notna(), None)
        # Convert the DataFrame into a dict
        args_dict = args_dict.to_dict("records")[0]
        # print(df.dtypes)
        # print(args_dict)

    if (db != "" and tb != "") or (args_dict["db_name"] != ""
                                   and args_dict["table_name"] != ""
                                   and args_dict["hive_database"] != ""
                                   and args_dict["hive_table"] != ""):
        pass
    else:
        print("Error Message: 必须指定 -w --db --tb 或者 -w -d -s -t 的参数值")
        sys.exit(1)
    args_dict["hive_database"] = args_dict["hive_database"]
    args_dict["hive_table"] = args_dict["hive_table"]
    args_dict["hive_full_name"] = args_dict["hive_database"] + "." + args_dict[
        "hive_table"]
    args_dict["db_conf"] = {
        "host": args_dict["host"],
        "port": args_dict["port"],
        "user": args_dict["user"],
        "password": args_dict["password"],
        "database": args_dict["db_name"],
        "charset": "utf8"
    }
    if not is_valid(args_dict["db_name"]):
        args_dict["db_name"] = args_dict["default_db"]
    if args_dict["hive_partition_value"] == "$yesterday":
        args_dict["hive_partition_value"] = get_yesterday()
    if not is_valid(args_dict["m"]):
        args_dict["m"] = 1
    if args.num_pappers != '1':
        args_dict["m"] = args.num_pappers
    args_dict["m"] = int(args_dict["m"])
    print(args_dict)
    return args_dict
Example #14
def pre_args():
    parse = argparse.ArgumentParser(
        prog="DataExport",
        description=
        "I am help message...默认模式是把数据导入到临时表,然后rename为正式表。Example1: python3 data_export.py -w xiaonuan_ddl --db xiaonuan --tb syscategory   Example2: python3 data_export.py -w xiaonuan_ddl --s data_xiaonuan_final.syscategory --t syscategory --mode=overwrite"
    )
    parse.add_argument("-w",
                       "--wizard",
                       required=True,
                       help="wizard,选择已经添加的数据库配置名称. example: -w xiaonuan_ddl")
    parse.add_argument("--db",
                       default="",
                       help="<database> meta_export中的db_name库名,不区分大小写")
    parse.add_argument("--tb",
                       default="",
                       help="<table_name> meta_export中的table_name表名,不区分大小写")
    parse.add_argument("--mode",
                       choices=["rename", "overwrite", "append"],
                       default="rename",
                       help="导入模式")
    parse.add_argument("--exec_engine",
                       choices=["sqoop", "datax"],
                       default="sqoop",
                       help="执行引擎, sqoop或者datax")
    parse.add_argument(
        "-s",
        "--source_table",
        default="",
        help="source table. example: pet_medical.ods_pmsweb_ppets")
    parse.add_argument("-t",
                       "--target_table",
                       default="",
                       help="target table. example: xiaonuan.ppets")
    parse.add_argument("-m", "--num_pappers", default="1", help="map并行数,默认1个")
    parse.add_argument(
        "--use_local_mode",
        action="store_true",
        help=
        "本地模式执行. 集群只有一台机器有外网, 如果分布的任务到没有外网的机器上就不能执行, 就需要指定本地模式。外网的任务建议使用datax引擎"
    )

    args = parse.parse_args()
    print(args)

    args_dict = {
        "connection_id": "",
        "connection_name": "",
        "db_type": "",
        "host": "",
        "user": "",
        "password ": "",
        "port": 0,
        "jdbc_extend": "",
        "default_db": "",
        "hive_database": "",
        "hive_table": "",
        "db_name": "",
        "table_name": "",
        "m": 1,
        "is_overwrite": "",
        "is_drop": "0",
        "mode": "rename",
        "exec_engine": "sqoop"
    }

    wizard_name = args.wizard
    if args.source_table and len(args.source_table.split(".")) != 2:
        print("-s的参数必须是库名加表名,例如:pet_medical.ods_pmsweb_ppets")
        sys.exit(1)
    if args.target_table and len(args.target_table.split(".")) != 2:
        print("-t的参数必须是库名加表名,例如:xiaonuan.ppets")
        sys.exit(1)

    db = args.db.lower()
    tb = args.tb.lower()
    args_dict["exec_engine"] = args.exec_engine
    args_dict["hive_database"] = args.source_table.split(
        ".")[0].lower() if args.source_table else ""
    args_dict["hive_table"] = args.source_table.split(
        ".")[1].lower() if args.source_table else ""
    args_dict["db_name"] = args.target_table.split(
        ".")[0].lower() if args.target_table else ""
    args_dict["table_name"] = args.target_table.split(
        ".")[1].lower() if args.target_table else ""
    args_dict["m"] = args.num_pappers
    args_dict["mode"] = args.mode
    args_dict["use_local_mode"] = "1" if args.use_local_mode else "0"
    # args_dict["is_overwrite"] = "1" if args.hive_overwrite else "0"

    sql = """
    SELECT t.connection_id,
           t.connection_name,
           t.db_type,
           t.host,
           t.user,
           t.password ,
           t.port,
           t.jdbc_extend,
           t.default_db,
           s.hive_database,
           s.hive_table,
           s.db_name,
           s.table_name,
           s.exec_engine,
           s.m,
           s.is_overwrite,
           s.is_drop,
           s.mode
      FROM meta_connections t
      LEFT JOIN meta_export s ON t.connection_id = s.connection_id
     ORDER BY t.connection_id
    """
    con = create_engine(get_engine_str("mysql").format(**config.DB_CONF),
                        poolclass=pool.NullPool)
    df = pd.read_sql(sql=sql, con=con)
    df["db_name"] = df["db_name"].map(lambda x: str(x).lower())
    df["table_name"] = df["table_name"].map(lambda x: str(x).lower())
    df["hive_database"] = df["hive_database"].map(lambda x: str(x).lower())
    df["hive_table"] = df["hive_table"].map(lambda x: str(x).lower())

    conn_names = df["connection_name"].drop_duplicates(keep="first").tolist()
    db_names = df[(df["db_name"].notna())
                  & (df["db_name"] != "")].drop_duplicates(
                      keep="first")["db_name"].tolist()
    table_names = df[(df["table_name"].notna())
                     & (df["table_name"] != "")].drop_duplicates(
                         keep="first")["table_name"].tolist()
    # print([x.lower() for x in table_names])
    if wizard_name not in conn_names:
        print("Error Message: -w 数据库链接名称不存在")
        sys.exit(1)
    if db != "" and db.lower() not in [x.lower() for x in db_names]:
        print("Error Message: --db 库名不存在")
        sys.exit(1)
    if tb != "" and tb.lower() not in [x.lower() for x in table_names]:
        print("Error Message: --tb 表名不存在")
        sys.exit(1)
    if tb != "" and args_dict["table_name"] != "":
        print("Error Message: --tb -s 不能同时指定")
        sys.exit(1)

    # print(conn_names, table_names)
    # tmp_row = [row for row in rows if wizard_name == row["connection_name"]][0]
    tmp_row = df[df["connection_name"] == wizard_name].to_dict("records")[0]
    args_dict["connection_id"] = tmp_row["connection_id"]
    args_dict["connection_name"] = tmp_row["connection_name"]
    args_dict["db_type"] = tmp_row["db_type"]
    args_dict["host"] = tmp_row["host"]
    args_dict["user"] = tmp_row["user"]
    args_dict["password"] = tmp_row["password"]
    args_dict["port"] = tmp_row["port"]
    args_dict["jdbc_extend"] = tmp_row["jdbc_extend"]
    args_dict["default_db"] = tmp_row["default_db"]
    if db != "" and tb != "":
        # args_dict = df[(df["connection_name"] == wizard_name) & (df["table_name"] == table_name_meta.lower())].head(1).to_dict("records")[0]
        args_dict = df[(df["connection_name"] == wizard_name)
                       & (df["db_name"] == db) & (df["table_name"] == tb)]
        # Convert pandas NaN values to native None
        args_dict = args_dict.where(args_dict.notna(), None)
        # Convert the DataFrame into a dict
        args_dict = args_dict.to_dict("records")[0]
        # print(df.dtypes)
        # print(args_dict)

    if (db != "" and tb != "") or (args_dict["db_name"] != ""
                                   and args_dict["table_name"] != ""
                                   and args_dict["hive_database"] != ""
                                   and args_dict["hive_table"] != ""):
        pass
    else:
        print("Error Message: 必须指定 -w --db --tb 或者 -w -d -s -t 的参数值")
        sys.exit(1)
    # args_dict["hive_database"] = args_dict["hive_database"]
    # args_dict["hive_table"] = args_dict["hive_table"]
    args_dict["hive_full_name"] = args_dict["hive_database"] + "." + args_dict[
        "hive_table"]
    args_dict["db_conf"] = {
        "host": args_dict["host"],
        "port": args_dict["port"],
        "user": args_dict["user"],
        "password": args_dict["password"],
        "database": args_dict["db_name"],
        "charset": "utf8"
    }
    if not is_valid(args_dict["db_name"]):
        args_dict["db_name"] = args_dict["default_db"]
    if not is_valid(args_dict["mode"]):
        args_dict["mode"] = "rename"
    if not is_valid(args_dict["exec_engine"]):
        args_dict["exec_engine"] = "sqoop"
    if not is_valid(args_dict["m"]):
        args_dict["m"] = 1
    if args.num_pappers != '1':
        args_dict["m"] = args.num_pappers
    args_dict["m"] = int(args_dict["m"])
    print(args_dict)
    return args_dict