def exec_compare(): # 获取部门和表类型参数 try: database = sys.argv[1] tabletype = sys.argv[2] except IndexError as e: print '没有参数:database,tabletype' else: # try: # 开始时间 start_time = time.time() hive_conn = HAConnection(hosts=dbconfig.huhive_hosts, port=dbconfig.huhive_port, authMechanism=dbconfig.huhive_authMechanism, configuration=dbconfig.huhive_conf, timeout=dbconfig.huhive_timeout) # 连接hive conn = hive_conn.getConnection() # 连接peta data conn_peta = connectPool.Connects( dbconfig.peta_database).conn_ali_petadata() cur_peta = conn_peta.cursor() # 连接pgp test conn_pgp_test = connectPool.Connects( dbconfig.pgptest_database).conn_mysql_test() cur_pgp_test = conn_pgp_test.cursor() # 获取日志对象 logger = LogUtil().count_logger() error_logger = LogUtil().counterror_logger() logger.info(logconfig.count_peta_hive_title) error_logger.error(logconfig.count_peta_hive_title) # 查询每个部门下所有表名 count = cur_pgp_test.execute(commonsql.need_clean_sql % database) log_head = logconfig.count_head % (database, count) logger.info(log_head) logger.info(logconfig.count_schema) error_logger.error(logconfig.counterror_schema) # 所有需要清洗的表名和库名数据 need_clean_tables = cur_pgp_test.fetchall() cur_pgp_test.close() conn_pgp_test.close() # 统计结果 success_count = 0 error_count = 0 for need_clean_table in need_clean_tables: db_tb_name = need_clean_table[0] table = db_tb_name.replace(database + '_', '') cur_hive = conn.cursor() # 设置hive计算引擎 cur_hive.execute("set hive.execution.engine = spark") # 查询语句准备 if tabletype == TableType.INVALID: count_hive_sql = countsql.hive_invalid_sql % (database, db_tb_name) count_peta_sql = countsql.peta_invalid_sql % db_tb_name elif tabletype == TableType.INVALID_DETAIL: count_hive_sql = countsql.hive_invalid_detail_sql % ( database, db_tb_name) count_peta_sql = countsql.peta_invalid_detail_sql % db_tb_name else: count_hive_sql = countsql.hive_valid_sql % (database, db_tb_name) count_peta_sql = countsql.peta_valid_sql % db_tb_name # 执行Hive查询 cur_hive.execute(count_hive_sql) count_hive = 
cur_hive.fetchall()[0][0] # 执行peta查询 cur_peta.execute(count_peta_sql) count_peta = cur_peta.fetchall()[0][0] # 对比逻辑 if count_hive == count_peta: success_count += 1 success_result = "%s_%s_%s\t%s\t%s" % ( database, table, tabletype, count_hive, count_peta) logger.info(success_result) else: error_count += 1 error_logger.error( "%s_%s_%s\t%s\t%s" % (database, table, tabletype, count_hive, count_peta)) cur_hive.close() # 结束时间 end_time = time.time() # 总耗时 total_time = end_time - start_time tail_log = logconfig.count_tail % (success_count, error_count, total_time) logger.info(tail_log) # 关闭连接 cur_peta.close()
def exec_compare(): # 获取参数 try: params = sys.argv if len(params) == 3: database = params[1] tabletype = params[2] sample = False elif len(params) == 5: database = params[1] tabletype = params[2] divisor = params[3] mod = params[4] sample = True except IndexError as index_error: print "传入参数列表:database,tabletype, divisor,mod; 错误信息:", index_error # hive连接 hive_conn_pool = HAConnection(hosts=dbconfig.huhive_hosts, port=dbconfig.huhive_port, authMechanism=dbconfig.huhive_authMechanism, configuration=dbconfig.huhive_conf, timeout=dbconfig.huhive_timeout) hive_conn = hive_conn_pool.getConnection() # petadata连接 petadata_conn = connectPool.Connects( dbconfig.peta_database).conn_ali_petadata() petadata_cur = petadata_conn.cursor() # pgptest连接 pgptest_conn = connectPool.Connects( dbconfig.pgptest_database).conn_mysql_test() pgptest_cur = pgptest_conn.cursor() # 所有表 count_table = pgptest_cur.execute(commonsql.need_clean_sql % database) table_names = pgptest_cur.fetchall() # 日志对象 logger = LogUtil().field_logger() error_logger = LogUtil().fielderror_logger() logger.info(logconfig.field_peta_hive_title) error_logger.error(logconfig.field_peta_hive_title) if tabletype == TableType.INVALID: _tabletype = 'invalid' logger.info(logconfig.tbtype_invalid) error_logger.error(logconfig.tbtype_invalid) elif tabletype == TableType.INVALID_DETAIL: _tabletype = 'invalid_detail' logger.info(logconfig.tbtype_invaliddetail) error_logger.error(logconfig.tbtype_invaliddetail) else: _tabletype = 'valid' logger.info(logconfig.tbtype_valid) error_logger.error(logconfig.tbtype_valid) log1 = logconfig.field_head % (database, count_table) logger.info(log1) # 遍历表名 for name_db_tb_row in table_names: name_db_tb = name_db_tb_row[0] table_name = name_db_tb.replace(database + '_', '') # 设置hive计算引擎 hive_cur = hive_conn.cursor() hive_cur.execute("set hive.execution.engine = spark") # 获取表在两边相同的字段 common_fields = get_common_field(hive_cur, petadata_cur, database, name_db_tb, _tabletype) log2 = '表名: %s' % 
table_name logger.info(log2) error_logger.error(log2) # 输出一下两边数据量 count_log = count_invalid(hive_conn, petadata_conn, database, name_db_tb, _tabletype) logger.info(count_log) # 日志结构头 logger.info(logconfig.field_schema) error_logger.error(logconfig.fielderror_schema) # 开始时间 time1 = time.time() # 查询所有tongid hive_cur = hive_conn.cursor() hive_cur.execute("set hive.execution.engine = spark") if tabletype == TableType.INVALID: hive_cur.execute(fieldsql.hive_select_tongid_invalid % (database, name_db_tb)) elif tabletype == TableType.INVALID_DETAIL: hive_cur.execute(fieldsql.hive_select_tongid_invaliddetail % (database, name_db_tb)) else: hive_cur.execute(fieldsql.hive_select_tongid_valid % (database, name_db_tb)) tongid_rows = hive_cur.fetchall() hive_cur.close() error_row = 0 # 遍历tongid for tongid_row in tongid_rows: tongid = tongid_row[0] if sample: # 过滤抽样抽取一些满足条件的tongid if not tongid_filter(tongid, int(divisor), int(mod)): continue log4 = 'tongid\t%s\t%s\t相同且唯一' % (tongid, tongid) logger.info("---------------- 1 行 ------------------\n") logger.info(log4) sql_field = ', '.join(common_fields) # 封装tongid对应的hive数据记录 hive_cur = hive_conn.cursor() petadata_cur = petadata_conn.cursor() # 查询数据 if tabletype == TableType.INVALID: hive_cur.execute(fieldsql.hive_invalid_sql % (sql_field, database, name_db_tb, tongid)) petadata_cur.execute(fieldsql.peta_invalid_sql % (sql_field, name_db_tb, tongid)) elif tabletype == TableType.INVALID_DETAIL: hive_cur.execute(fieldsql.hive_invalid_detail_sql % (sql_field, database, name_db_tb, tongid)) petadata_cur.execute(fieldsql.peta_invalid_detail_sql % (sql_field, name_db_tb, tongid)) else: hive_cur.execute(fieldsql.hive_valid_sql % (sql_field, database, name_db_tb, tongid)) petadata_cur.execute(fieldsql.peta_valid_sql % (sql_field, name_db_tb, tongid)) # 封装每个tongid对应的数据记录 hive_result_row = tuple(hive_cur.fetchall()[0]) hive_row_dict = dict() index = 0 for hive_data in hive_result_row: cur_field_key = common_fields[index] 
hive_row_dict[cur_field_key] = hive_data index += 1 peta_result_row = petadata_cur.fetchall()[0] peta_row_dict = dict() index = 0 for peta_data in peta_result_row: cur_field_key = common_fields[index] peta_row_dict[cur_field_key] = peta_data index += 1 del index # 开始比较字段 success_count = 0 error_count = 0 for key in common_fields: hive_data = hive_row_dict[key] peta_data = peta_row_dict[key] if cmp(hive_data, str(peta_data)) == 0: log5 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data, 'SUCCESS') logger.info(log5) success_count += 1 else: log6 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data, 'ERROR') error_logger.error(log6) error_count += 1 log7 = logconfig.field_result % (success_count, error_count) logger.info(log7) if error_count is not 0: error_row += 1 # 结束时间 time2 = time.time() total = time2 - time1 log8 = logconfig.field_tail % (total, error_row) print log8 logger.info(log8)
# -*- coding: utf-8 -*-
"""
Created on 2017/12/7.

@author: kesong

Dump the distinct department short names from the ``direct_info``
table into ``files/depts.json`` as a JSON array.
"""
import json

from db import connectPool
from config import dbconfig

conn_pgp_test = connectPool.Connects(
    dbconfig.pgptest_database).conn_mysql_test()
cur_pgp_test = conn_pgp_test.cursor()
try:
    query_distinct_depts = "select distinct dept_short_name from direct_info"
    cur_pgp_test.execute(query_distinct_depts)
    dept_rows = cur_pgp_test.fetchall()
finally:
    # BUG FIX: cursor and connection were never closed.
    cur_pgp_test.close()
    conn_pgp_test.close()

# Flatten the row tuples into a plain list of names.
dept_list = [value for row in dept_rows for value in row]

jsonStr = json.dumps(dept_list)
# BUG FIX: use a context manager so the file is always closed; also
# ``write`` instead of ``writelines`` on a single string.
with open("files/depts.json", "w") as writer:
    writer.write(jsonStr)
def exec_compare(database, ods_conn):
    """Compare per-table row counts between EMR Hive and the ODS source.

    :param database: department/database name whose tables are checked
    :param ods_conn: an open ODS connection; a cursor is created and
        closed here, the connection itself stays owned by the caller
    """
    start_time = time.time()
    hive_pool = HAConnection(hosts=dbconfig.ali_hosts,
                             port=dbconfig.ali_port,
                             authMechanism=dbconfig.ali_authMechanism,
                             configuration=dbconfig.ali_conf,
                             timeout=dbconfig.ali_timeout)
    # Hive connection.
    conn = hive_pool.getConnection()
    # ODS cursor (connection supplied by the caller).
    cur_ods = ods_conn.cursor()
    # pgp-test connection: only used to fetch the table list.
    conn_pgp_test = connectPool.Connects(
        dbconfig.pgptest_database).conn_mysql_test()
    cur_pgp_test = conn_pgp_test.cursor()
    # Loggers.
    logger = LogUtil().count_logger()
    error_logger = LogUtil().counterror_logger()
    logger.info(logconfig.count_ods_hive_title)
    error_logger.error(logconfig.count_ods_hive_title)
    # All tables of this department that need cleaning.
    count = cur_pgp_test.execute(commonsql.need_clean_sql % database)
    logger.info(logconfig.count_head % (database, count))
    logger.info(logconfig.count_odshive_schema)
    error_logger.error(logconfig.counterror_odshive_schema)
    need_clean_tables = cur_pgp_test.fetchall()
    cur_pgp_test.close()
    conn_pgp_test.close()
    success_count = 0
    error_count = 0
    try:
        for need_clean_table in need_clean_tables:
            db_tb_name = need_clean_table[0]
            table = db_tb_name.replace(database + '_', '')
            cur_hive = conn.cursor()
            try:
                count_hive_sql = countsql.hive_ods % (database, db_tb_name)
                count_ods_sql = countsql.ods % table
                # Hive count.
                cur_hive.execute(count_hive_sql)
                count_hive = cur_hive.fetchall()[0][0]
                # ODS count.
                cur_ods.execute(count_ods_sql)
                count_ods = cur_ods.fetchall()[0][0]
            finally:
                # BUG FIX: the hive cursor was leaked if a query raised.
                cur_hive.close()
            result_line = "%s_%s_ods\t%s\t%s" % (database, table,
                                                 count_hive, count_ods)
            if count_hive == count_ods:
                success_count += 1
                logger.info(result_line)
            else:
                error_count += 1
                error_logger.error(result_line)
        total_time = time.time() - start_time
        logger.info(logconfig.count_tail %
                    (success_count, error_count, total_time))
    finally:
        cur_ods.close()
        # BUG FIX: the Hive connection was never closed before.
        conn.close()
def exec_compare(): try: # 获取部门和表类型参数 database = sys.argv[1] tabletype = sys.argv[2] except IndexError as err: print '没有参数:database,tabletype;%s' % err else: # 开始时间 start_time = time.time() # huawei hive hive_pool = HAConnection(hosts=dbconfig.huhive_hosts, port=dbconfig.huhive_port, authMechanism=dbconfig.huhive_authMechanism, configuration=dbconfig.huhive_conf, timeout=dbconfig.huhive_timeout) hive_conn = hive_pool.getConnection() # emr hive emr_pool = HAConnection(hosts=dbconfig.ali_hosts, port=dbconfig.ali_port, authMechanism=dbconfig.ali_authMechanism, configuration=dbconfig.ali_conf, timeout=dbconfig.ali_timeout) emr_conn = emr_pool.getConnection() # 连接pgp test conn_pgp_test = connectPool.Connects( dbconfig.pgptest_database).conn_mysql_test() cur_pgp_test = conn_pgp_test.cursor() # 获取日志对象 logger = LogUtil().count_logger() error_logger = LogUtil().counterror_logger() logger.info(logconfig.count_emr_hive_title) error_logger.error(logconfig.count_emr_hive_title) # 查询每个部门下所有表名 count = cur_pgp_test.execute(commonsql.need_clean_sql % database) log_head = logconfig.count_head % (database, count) logger.info(log_head) logger.info(logconfig.count_emrhive_schema) error_logger.error(logconfig.counterror_emrhive_schema) # 所有需要清洗的表名和库名数据 need_clean_tables = cur_pgp_test.fetchall() cur_pgp_test.close() conn_pgp_test.close() # 统计结果 success_count = 0 error_count = 0 for need_clean_table in need_clean_tables: db_tb_name = need_clean_table[0] table = db_tb_name.replace(database + '_', '') cur_hive = hive_conn.cursor() cur_emr = emr_conn.cursor() # 使用spark 不用mapreduce cur_hive.execute("set hive.execution.engine = spark") cur_emr.execute("set hive.execution.engine = spark") # 查询语句准备 if tabletype == TableType.INVALID: count_hive_sql = countsql.hive_invalid_sql % (database, db_tb_name) elif tabletype == TableType.INVALID_DETAIL: count_hive_sql = countsql.hive_invalid_detail_sql % ( database, db_tb_name) else: count_hive_sql = countsql.hive_valid_sql % (database, db_tb_name) # 
执行Hive查询 cur_hive.execute(count_hive_sql) count_hive = cur_hive.fetchall()[0][0] # 执行emr查询 cur_emr.execute(count_hive_sql) count_emr = cur_emr.fetchall()[0][0] # 结果对比 if count_hive == count_emr: success_count += 1 success_result = "%s_%s_%s\t%s\t%s" % ( database, table, tabletype, count_hive, count_emr) logger.info(success_result) else: error_count += 1 error_logger.error( "%s_%s_%s\t%s\t%s" % (database, table, tabletype, count_hive, count_emr)) cur_hive.close() cur_emr.close() # 结束时间 end_time = time.time() # 总耗时 total_time = end_time - start_time tail_log = logconfig.count_tail % (success_count, error_count, total_time) logger.info(tail_log)
def exec_compare(): # 获取部门和表类型参数 try: database = sys.argv[1] tablename = sys.argv[2] tabletype = sys.argv[3] except IndexError as e: print '没有参数:database,tablename' else: try: # 开始时间 start_time = time.time() hive_conn = HAConnection( hosts=dbconfig.huhive_hosts, port=dbconfig.huhive_port, authMechanism=dbconfig.huhive_authMechanism, configuration=dbconfig.huhive_conf, timeout=dbconfig.huhive_timeout) # 连接hive conn = hive_conn.getConnection() # 连接pgp test conn_pgp = connectPool.Connects( dbconfig.pgp_database).conn_mysql_pgp() cur_pgp = conn_pgp.cursor() # 获取日志对象 logger = LogUtil().count_logger() error_logger = LogUtil().counterror_logger() # 开始对比 cur_hive = conn.cursor() # 设置hive计算引擎 cur_hive.execute("set hive.execution.engine = spark") # 查询语句准备 if tabletype == TableType.INVALID: count_hive_sql = countsql.hive_invalid_sql % (database, tablename) count_pgp_sql = countsql.pgp_invalid_sql % tablename elif tabletype == TableType.INVALID_DETAIL: count_hive_sql = countsql.hive_invalid_detail_sql % (database, tablename) count_pgp_sql = countsql.pgp_invalid_detail_sql % tablename else: count_hive_sql = countsql.pgphive_valid_sql % (database, tablename) count_pgp_sql = countsql.pgp_valid_sql % tablename # 执行Hive查询 cur_hive.execute(count_hive_sql) count_hive = cur_hive.fetchall()[0][0] # 执行peta查询 cur_pgp.execute(count_pgp_sql) count_pgp = cur_pgp.fetchall()[0][0] # 对比逻辑 if count_hive == count_pgp: success_result = "%s\t%s\t%s" % (tablename, count_hive, count_pgp) logger.info(success_result) else: error_logger.error("%s\t%s\t%s" % (tablename, count_hive, count_pgp)) cur_hive.close() # 关闭连接 cur_pgp.close() except Pyhs2Exception as err: error_logger.error("表不存在: %s" % err)