def generate_sqoop_cmd(src_db, dst_db, dst_table, etl_mode, res_query, mapper_nums):
    """Build a sqoop *import* command that loads a source-DB query result
    into a Hive table (full overwrite, Oracle JDBC driver).

    :param src_db: source connection name looked up via ``get_config``
    :param dst_db: destination Hive database
    :param dst_table: destination Hive table
    :param etl_mode: ETL mode, used both for config lookup and the job name
    :param res_query: SQL passed verbatim to sqoop ``--query``
    :param mapper_nums: value for ``--num-mappers``
    :return: complete sqoop shell command string
    """
    # BUG FIX: the source connection used to be hard-coded to "oyo_dw",
    # silently ignoring the src_db parameter. Use src_db, consistent with
    # the partition-aware sibling variant of this function.
    connect, user_name, password = get_config(src_db, etl_mode)
    mapreduce_job_name = ''.join(
        ["SQOOP_", etl_mode, "_", dst_db, ".", dst_table])
    sqoop_cmd = """sqoop \
import \
--hive-import \
--hive-overwrite \
--null-string '\\\\N' \
--null-non-string '\\\\N' \
--connect %s \
--username %s \
--password "%s" """ % (connect, user_name, password)
    query = "--query '%s' " % res_query
    mappers = '--num-mappers %s' % mapper_nums
    # Unique per-run staging dir so a re-run after an abnormal abort does
    # not fail with "Output directory already exists".
    # NOTE(review): `version` is a module-level value not visible here --
    # presumably a per-run timestamp/id; confirm it is set before this runs.
    target_dir = '--target-dir /user/tmp/sqoop/%s/%s/%s ' % (dst_db,
                                                             dst_table,
                                                             version)
    hive_commands = """ --hive-database %s \
--hive-table %s \
--hive-delims-replacement " " \
--driver oracle.jdbc.OracleDriver \
--connection-manager org.apache.sqoop.manager.GenericJdbcManager \
--mapreduce-job-name %s """ % (dst_db, dst_table, mapreduce_job_name)
    sqoop_cmd = ' '.join(
        [sqoop_cmd, query, hive_commands, target_dir, mappers])
    return sqoop_cmd
def generate_sqoop_cmd(src_db, dst_db, dst_table, etl_mode, res_query, partition_key, partition_value, mapper_nums):
    """Build a sqoop *import* command that loads a query result into one
    static partition of a Hive table (full overwrite of that partition).

    :param src_db: source connection name looked up via ``get_config``
    :param dst_db: destination Hive database
    :param dst_table: destination Hive table
    :param etl_mode: ETL mode used for the config lookup
    :param res_query: SQL passed verbatim to sqoop ``--query``
    :param partition_key: Hive partition column name
    :param partition_value: value of the target partition
    :param mapper_nums: value for ``--num-mappers``
    :return: complete sqoop shell command string
    """
    connect, user_name, password = get_config(src_db, etl_mode)

    base_cmd = f"""sqoop \
import \
--hive-import \
--hive-overwrite \
--null-string '\\\\N' \
--null-non-string '\\\\N' \
--connect {connect} \
--username {user_name} \
--password '{password}' """

    hive_opts = f"""--hive-database {dst_db} \
--hive-table {dst_table} \
--hive-delims-replacement " " \
--hive-partition-key {partition_key} \
--hive-partition-value "{partition_value}" """

    # Unique per-run staging dir so a re-run after an abnormal abort does
    # not fail with "Output directory already exists".
    staging_opt = f'--target-dir /user/tmp/sqoop/{dst_db}/{dst_table}/{version} '
    query_opt = f"--query '{res_query}'"
    mapper_opt = f'--num-mappers {mapper_nums}'

    return ' '.join([base_cmd, hive_opts, staging_opt, query_opt, mapper_opt])
def hive2mysql(src_db, src_tb, dst_db, dst_table, dst_columns, etl_mode, res_query, mapper_nums, pre_sql):
    """Export the result of a Hive query into a MySQL table.

    Steps: dump the query result to an HDFS staging directory, run
    ``pre_sql`` on the target MySQL database (typically a cleanup
    DELETE/TRUNCATE), sqoop-export the dump into ``dst_table``, then
    remove the staging directory.
    """
    staging_dir = "/user/tmp/sqoop/export/mysql/%s/%s/%s" % (src_db,
                                                             src_tb,
                                                             version)

    # Materialize the Hive query result under the staging directory.
    dump_sql = """ INSERT OVERWRITE DIRECTORY '%s' \
%s """ % (staging_dir, res_query)
    hive_exce_command(hive_command(dump_sql))

    # Run the caller-supplied cleanup statement against the target MySQL
    # database before loading the fresh data.
    connect, user_name, password = get_config(dst_db, etl_mode)
    target_conn = get_db(connect, user_name, password)
    db_query_commit(target_conn, pre_sql)

    export_cmd = generate_sqoop_cmd(dst_db, dst_table, dst_columns,
                                    etl_mode, staging_dir, mapper_nums)
    shell_exce_command(export_cmd)

    # Drop the temporary staging files once the export has finished.
    shell_exce_command("hdfs dfs -rm -r %s" % staging_dir)
def hive2mysqlupsert(src_db, src_tb, dst_db, dst_table, etl_mode, res_query, mapper_nums, update_key):
    """Export a Hive query result into a MySQL table in upsert mode.

    Dumps the query result to an HDFS staging directory (\\001-delimited),
    sqoop-exports it keyed on ``update_key``, then removes the staging
    directory.
    """
    export_dir = "/user/tmp/sqoop/export/mysql/%s/%s/%s" % (src_db,
                                                            src_tb,
                                                            version)
    hive_sql = """ INSERT OVERWRITE DIRECTORY '%s' \
row format delimited fields terminated by '\\001' %s """ % (export_dir,
                                                            res_query)
    hivecommand = hive_command(hive_sql)
    # Write the data to the staging path.
    hive_exce_command(hivecommand)

    connect, user_name, password = get_config(dst_db, etl_mode)
    # BUG FIX: this connection was opened and then leaked (never used and
    # never closed). Keep the open as an early connectivity/credentials
    # check, but release it immediately.
    db_mysql = get_db(connect, user_name, password)
    db_mysql.close()

    # NOTE(review): these arguments (dst_db, dst_table, etl_mode,
    # export_dir, mapper_nums, update_key) do not match the export-style
    # generate_sqoop_cmd visible in this file, which expects
    # (dst_db, dst_table, dst_columns, etl_mode, export_dir, mapper_nums)
    # and emits no --update-key/--update-mode. Verify which overload is in
    # scope at runtime -- as written this likely builds a wrong command.
    sqoop_cmd = generate_sqoop_cmd(dst_db, dst_table, etl_mode, export_dir,
                                   mapper_nums, update_key)
    shell_exce_command(sqoop_cmd)

    # Remove the temporary staging files.
    shell_exce_command("hdfs dfs -rm -r %s" % export_dir)
def generate_sqoop_cmd(dst_db, dst_table, dst_columns, etl_mode, export_dir, mapper_nums):
    """Build a sqoop *export* command that pushes a \\001-delimited HDFS
    dump into a MySQL table.

    :param dst_db: destination database (also the config-lookup key)
    :param dst_table: destination MySQL table
    :param dst_columns: comma-separated column list for ``--columns``
    :param etl_mode: ETL mode, used for config lookup and the job name
    :param export_dir: HDFS directory holding the dump to export
    :param mapper_nums: value for ``--num-mappers``
    :return: complete sqoop shell command string
    """
    connect, user_name, password = get_config(dst_db, etl_mode)
    job_name = f"SQOOP_{etl_mode}_{dst_db}.{dst_table}"

    return f"""sqoop \
export \
--table {dst_table} \
--connect {connect} \
--username {user_name} \
--password "{password}" \
--columns {dst_columns} \
--input-fields-terminated-by '\\001' \
--input-lines-terminated-by '\\n' \
--input-null-string '\\\\N' \
--input-null-non-string '\\\\N' \
--num-mappers {mapper_nums} \
--export-dir {export_dir} \
--mapreduce-job-name {job_name} """
port = int(matchObj.group(2)) db_name = matchObj.group(3) db_mysql = pymysql.connect(host=host, user=user_name, passwd=password, port=port, db=db_name, charset='utf8', cursorclass=pymysql.cursors.DictCursor) return db_mysql def db_query_commit(db_mysql, pre_sql): cursor = db_mysql.cursor() try: # 执行sql语句 cursor.execute(pre_sql) # 提交到数据库执行 db_mysql.commit() except Exception as e: # 如果发生错误则回滚 db_mysql.rollback() raise MysqlDatabaseError("执行: mysql语句 %s 时出错:%s" % (pre_sql, e)) db_mysql.close() if __name__ == '__main__': connect, user_name, password = get_config('hera', 'mysql-full') db_mysql = get_db(connect, user_name, password)