import re

from pyhs2.haconnection import HAConnection

# HIVE_HOSTS and HIVE_PORT are expected to be defined at module level.


def select_hive(queue_name, database, sql, logger):
    # Build the session statements and strip a trailing semicolon,
    # which HiveServer2 rejects.
    v_queue_name = "set mapred.job.queue.name=%s" % queue_name
    v_database = "use %s" % database
    sql = sql.encode('UTF-8')
    v_sql = re.sub(r';$', '', sql)
    timeout = 3 * 60 * 60 * 1000  # three hours, in milliseconds
    conf = {"krb_host": "hadoop001", "krb_service": "hive"}
    try:
        with HAConnection(hosts=HIVE_HOSTS,
                          port=HIVE_PORT,
                          timeout=timeout,
                          authMechanism="KERBEROS",
                          user='******',
                          configuration=conf) as haConn:
            with haConn.getConnection() as conn:
                with conn.cursor() as cur:
                    logger.info(v_queue_name)
                    cur.execute(v_queue_name)
                    logger.info(v_database)
                    cur.execute(v_database)
                    logger.info(v_sql)
                    cur.execute(v_sql)
                    tuple_dic = cur.fetchall()
                    if len(tuple_dic) == 0:
                        tuple_dic = None
    except Exception as e:
        logger.error(e)
        raise
    return tuple_dic
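# Usage sketch for select_hive() above -- illustrative only, not from the
# original source. The queue, database, and SQL values are placeholders, and
# a stdlib logger stands in for the project's logger object.
import logging

logging.basicConfig(level=logging.INFO)
demo_logger = logging.getLogger("select_hive_demo")

rows = select_hive("root.etl", "default",
                   "select count(*) from some_table;", demo_logger)
if rows is None:
    demo_logger.info("query returned no rows")
else:
    for row in rows:
        demo_logger.info(row)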
def HConn():
    # hosts and conf are expected to be defined at module level.
    conn = HAConnection(hosts=hosts,
                        port=10000,
                        authMechanism="KERBEROS",
                        user='******',
                        # database='default',
                        configuration=conf,
                        timeout=9999999999).getConnection()
    return conn
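# Illustrative caller for HConn() above (not part of the original source).
# The helper returns a raw connection without keeping a handle to the pool,
# so the caller is responsible for closing it; contextlib.closing provides
# with-statement semantics for that.
from contextlib import closing

with closing(HConn()) as conn:
    with conn.cursor() as cur:
        cur.execute("show databases")
        print(cur.fetchall())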
def get_emr_hive_conn():
    try:
        hosts = ["172.31.72.52", "172.31.72.54"]
        port = 10000
        conf = {"krb_host": "emr-header-1.cluster-12", "krb_service": "hive"}
        haConn = HAConnection(hosts=hosts,
                              port=port,
                              authMechanism="KERBEROS",
                              configuration=conf,
                              timeout=60000000)
        conn = haConn.getConnection()
        print("%s: connected to Hive successfully" % datetime.datetime.now())
        # Smoke-test the connection before handing it back.
        cur = conn.cursor()
        cur.execute('select * from hive_to.iris')
        print(cur.fetchall())
        return conn
    except Exception as e:
        print("%s failed to connect to Hive: %s" % (datetime.datetime.now(), e))
        raise
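# Hypothetical caller for get_emr_hive_conn() above, assuming the version
# that returns the open connection. The table name is a placeholder.
conn = get_emr_hive_conn()
try:
    cur = conn.cursor()
    cur.execute("show tables")
    for row in cur.fetchall():
        print(row)
finally:
    conn.close()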
def exec_compare():
    # Parse arguments: database and tabletype are required;
    # divisor and mod enable sampled comparison.
    params = sys.argv
    if len(params) == 3:
        database = params[1]
        tabletype = params[2]
        sample = False
    elif len(params) == 5:
        database = params[1]
        tabletype = params[2]
        divisor = params[3]
        mod = params[4]
        sample = True
    else:
        print "expected arguments: database, tabletype[, divisor, mod]"
        return

    # Hive connection
    hive_conn_pool = HAConnection(hosts=dbconfig.huhive_hosts,
                                  port=dbconfig.huhive_port,
                                  authMechanism=dbconfig.huhive_authMechanism,
                                  configuration=dbconfig.huhive_conf,
                                  timeout=dbconfig.huhive_timeout)
    hive_conn = hive_conn_pool.getConnection()
    # petadata connection
    petadata_conn = connectPool.Connects(
        dbconfig.peta_database).conn_ali_petadata()
    petadata_cur = petadata_conn.cursor()
    # pgptest connection
    pgptest_conn = connectPool.Connects(
        dbconfig.pgptest_database).conn_mysql_test()
    pgptest_cur = pgptest_conn.cursor()

    count_table = pgptest_cur.execute(commonsql.need_clean_sql % database)
    table_names = pgptest_cur.fetchall()

    logger = LogUtil().field_logger()
    error_logger = LogUtil().fielderror_logger()
    log1 = logconfig.field_head % (database, count_table)
    logger.info(log1)
    error_logger.error(log1)

    # Resolve the table type
    if tabletype == TableType.INVALID:
        _tabletype = 'invalid'
        logger.info(logconfig.tbtype_invalid)
        error_logger.error(logconfig.tbtype_invalid)
    elif tabletype == TableType.INVALID_DETAIL:
        _tabletype = 'invalid_detail'
        logger.info(logconfig.tbtype_invaliddetail)
        error_logger.error(logconfig.tbtype_invaliddetail)
    else:
        _tabletype = 'valid'
        logger.info(logconfig.tbtype_valid)
        error_logger.error(logconfig.tbtype_valid)

    # Iterate over the table names
    for name_db_tb_row in table_names:
        name_db_tb = name_db_tb_row[0]
        table_name = name_db_tb.replace(database + '_', '')
        # Set the Hive execution engine
        hive_cur = hive_conn.cursor()
        hive_cur.execute("set hive.execution.engine = spark")
        # Fields present on both sides
        common_fields = get_common_field(hive_cur, petadata_cur, database,
                                         name_db_tb, _tabletype)
        common_fields.append("count")
        log2 = '-----------------table: %s---------------' % table_name
        logger.info(log2)
        error_logger.error(log2)
        # Log header
        log3 = 'field\thive value\tpetadata value\tresult'
        logger.info(log3)
        error_logger.error(log3)
        # Start time
        time1 = time.time()
        error_row = 0
        # Build the SQL; the literal % must be escaped as %% for formatting
        where_sql = ""
        if sample:
            where_sql = "where tongid %% %s = %s" % (divisor, mod)
        new_common_fields = []
        for field in common_fields:
            if field == "count":
                continue
            new_common_fields.append(
                (fieldsql.hash_hive_template + " as " + field) % field)
        sql_field = ', '.join(new_common_fields)
        # Run the queries
        hive_cur = hive_conn.cursor()
        petadata_cur = petadata_conn.cursor()
        if tabletype == TableType.INVALID:
            hive_cur.execute(fieldsql.hash_hive_invalid_sql %
                             (sql_field, database, name_db_tb, where_sql))
            petadata_cur.execute(fieldsql.hash_peta_invalid_sql %
                                 (sql_field, name_db_tb, where_sql))
        elif tabletype == TableType.INVALID_DETAIL:
            hive_cur.execute(fieldsql.hash_hive_invaliddetail_sql %
                             (sql_field, database, name_db_tb, where_sql))
            petadata_cur.execute(fieldsql.hash_peta_invaliddetail_sql %
                                 (sql_field, name_db_tb, where_sql))
        else:
            hive_cur.execute(fieldsql.hash_hive_valid_sql %
                             (sql_field, database, name_db_tb, where_sql))
            petadata_cur.execute(fieldsql.hash_peta_valid_sql %
                                 (sql_field, name_db_tb, where_sql))
        # Map each result row into a field -> value dict
        hive_result_row = tuple(hive_cur.fetchall()[0])
        hive_row_dict = dict()
        index = 0
        for hive_data in hive_result_row:
            cur_field_key = common_fields[index]
            hive_row_dict[cur_field_key] = hive_data
            index += 1
        peta_result_row = petadata_cur.fetchall()[0]
        peta_row_dict = dict()
        index = 0
        for peta_data in peta_result_row:
            cur_field_key = common_fields[index]
            peta_row_dict[cur_field_key] = peta_data
            index += 1
        del index
        # Compare field by field
        success_count = 0
        error_count = 0
        for key in common_fields:
            hive_data = hive_row_dict[key]
            peta_data = peta_row_dict[key]
            if cmp(hive_data, str(peta_data)) == 0:
                log5 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data, 'SUCCESS')
                logger.info(log5)
                success_count += 1
            else:
                log6 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data, 'ERROR')
                error_logger.error(log6)
                error_count += 1
        log7 = 'matching fields: %s\tmismatched fields: %s\t' % (success_count,
                                                                 error_count)
        logger.info(log7)
        if error_count != 0:
            error_row += 1
        # End time
        time2 = time.time()
        total = time2 - time1
        log8 = 'table comparison finished!\telapsed: %.2fs\terror rows: %s\t' % (
            total, error_row)
        logger.info(log8)
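# TableType is referenced by exec_compare() above but not defined in this
# snippet. A minimal stand-in consistent with how it is used -- compared for
# equality against a string taken from sys.argv, and mapped to the suffixes
# 'invalid', 'invalid_detail', and 'valid' -- might look like this; the
# project's actual constants may differ.
class TableType(object):
    INVALID = 'invalid'
    INVALID_DETAIL = 'invalid_detail'
    VALID = 'valid'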
# Script preamble (fragment): sys plus the project's config, connectPool,
# and fieldsql modules are assumed to be imported above this point.
try:
    from pyhs2.haconnection import HAConnection
except ImportError as e:
    print 'error: cannot import pyhs2.haconnection (%s)' % e.message
    sys.exit(1)

try:
    database = sys.argv[1]  # department argument
    log_title = 'current department: %s' % database
    print log_title
except IndexError as e:
    print 'error: missing department argument (%s)' % e.message
    sys.exit(1)

tongid = sys.argv[2]

# Hive connection
hive_conn_pool = HAConnection(hosts=config.hosts,
                              port=config.port,
                              authMechanism=config.authMechanism,
                              configuration=config.conf,
                              timeout=config.timeout)
hive_cur = hive_conn_pool.getConnection().cursor()
# petadata connection
petadata_conn = connectPool.Connects(config.peta_database).conn_ali_petadata_shamo()
petadata_cur = petadata_conn.cursor()
# pgptest connection
pgptest_conn = connectPool.Connects(config.pgptest_database).conn_mysql_test()
pgptest_cur = pgptest_conn.cursor()
# Look up the table names
table_count = pgptest_cur.execute(fieldsql.need_clean_sql % database)
def exec_compare():
    # Department and table-type arguments
    try:
        database = sys.argv[1]
        tabletype = sys.argv[2]
    except IndexError:
        print 'missing arguments: database, tabletype'
    else:
        # Start time
        start_time = time.time()
        hive_conn = HAConnection(hosts=dbconfig.huhive_hosts,
                                 port=dbconfig.huhive_port,
                                 authMechanism=dbconfig.huhive_authMechanism,
                                 configuration=dbconfig.huhive_conf,
                                 timeout=dbconfig.huhive_timeout)
        # Connect to Hive
        conn = hive_conn.getConnection()
        # Connect to petadata
        conn_peta = connectPool.Connects(
            dbconfig.peta_database).conn_ali_petadata()
        cur_peta = conn_peta.cursor()
        # Connect to pgp test
        conn_pgp_test = connectPool.Connects(
            dbconfig.pgptest_database).conn_mysql_test()
        cur_pgp_test = conn_pgp_test.cursor()
        # Loggers
        logger = LogUtil().count_logger()
        error_logger = LogUtil().counterror_logger()
        logger.info(logconfig.count_peta_hive_title)
        error_logger.error(logconfig.count_peta_hive_title)
        # All table names under the department
        count = cur_pgp_test.execute(commonsql.need_clean_sql % database)
        log_head = logconfig.count_head % (database, count)
        logger.info(log_head)
        logger.info(logconfig.count_schema)
        error_logger.error(logconfig.counterror_schema)
        # Tables and databases that still need cleaning
        need_clean_tables = cur_pgp_test.fetchall()
        cur_pgp_test.close()
        conn_pgp_test.close()
        # Result counters
        success_count = 0
        error_count = 0
        for need_clean_table in need_clean_tables:
            db_tb_name = need_clean_table[0]
            table = db_tb_name.replace(database + '_', '')
            cur_hive = conn.cursor()
            # Set the Hive execution engine
            cur_hive.execute("set hive.execution.engine = spark")
            # Prepare the count queries
            if tabletype == TableType.INVALID:
                count_hive_sql = countsql.hive_invalid_sql % (database, db_tb_name)
                count_peta_sql = countsql.peta_invalid_sql % db_tb_name
            elif tabletype == TableType.INVALID_DETAIL:
                count_hive_sql = countsql.hive_invalid_detail_sql % (
                    database, db_tb_name)
                count_peta_sql = countsql.peta_invalid_detail_sql % db_tb_name
            else:
                count_hive_sql = countsql.hive_valid_sql % (database, db_tb_name)
                count_peta_sql = countsql.peta_valid_sql % db_tb_name
            # Run the Hive query
            cur_hive.execute(count_hive_sql)
            count_hive = cur_hive.fetchall()[0][0]
            # Run the petadata query
            cur_peta.execute(count_peta_sql)
            count_peta = cur_peta.fetchall()[0][0]
            # Compare
            if count_hive == count_peta:
                success_count += 1
                success_result = "%s_%s_%s\t%s\t%s" % (
                    database, table, tabletype, count_hive, count_peta)
                logger.info(success_result)
            else:
                error_count += 1
                error_logger.error(
                    "%s_%s_%s\t%s\t%s" %
                    (database, table, tabletype, count_hive, count_peta))
            cur_hive.close()
        # End time and total elapsed
        end_time = time.time()
        total_time = end_time - start_time
        tail_log = logconfig.count_tail % (success_count, error_count, total_time)
        logger.info(tail_log)
        # Close connections
        cur_peta.close()
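# Invocation sketch (not in the original file; the script name is
# hypothetical). The function reads its arguments from sys.argv:
#
#   python count_compare.py <database> <tabletype>
#
if __name__ == '__main__':
    exec_compare()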
def exec_compare():
    # Parse arguments: database and tabletype are required;
    # divisor and mod enable sampled comparison.
    params = sys.argv
    if len(params) == 3:
        database = params[1]
        tabletype = params[2]
        sample = False
    elif len(params) == 5:
        database = params[1]
        tabletype = params[2]
        divisor = params[3]
        mod = params[4]
        sample = True
    else:
        print "expected arguments: database, tabletype[, divisor, mod]"
        return

    # Hive connection
    hive_conn_pool = HAConnection(hosts=dbconfig.huhive_hosts,
                                  port=dbconfig.huhive_port,
                                  authMechanism=dbconfig.huhive_authMechanism,
                                  configuration=dbconfig.huhive_conf,
                                  timeout=dbconfig.huhive_timeout)
    hive_conn = hive_conn_pool.getConnection()
    # petadata connection
    petadata_conn = connectPool.Connects(
        dbconfig.peta_database).conn_ali_petadata()
    petadata_cur = petadata_conn.cursor()
    # pgptest connection
    pgptest_conn = connectPool.Connects(
        dbconfig.pgptest_database).conn_mysql_test()
    pgptest_cur = pgptest_conn.cursor()

    # All tables
    count_table = pgptest_cur.execute(commonsql.need_clean_sql % database)
    table_names = pgptest_cur.fetchall()

    # Loggers
    logger = LogUtil().field_logger()
    error_logger = LogUtil().fielderror_logger()
    logger.info(logconfig.field_peta_hive_title)
    error_logger.error(logconfig.field_peta_hive_title)

    if tabletype == TableType.INVALID:
        _tabletype = 'invalid'
        logger.info(logconfig.tbtype_invalid)
        error_logger.error(logconfig.tbtype_invalid)
    elif tabletype == TableType.INVALID_DETAIL:
        _tabletype = 'invalid_detail'
        logger.info(logconfig.tbtype_invaliddetail)
        error_logger.error(logconfig.tbtype_invaliddetail)
    else:
        _tabletype = 'valid'
        logger.info(logconfig.tbtype_valid)
        error_logger.error(logconfig.tbtype_valid)

    log1 = logconfig.field_head % (database, count_table)
    logger.info(log1)

    # Iterate over the table names
    for name_db_tb_row in table_names:
        name_db_tb = name_db_tb_row[0]
        table_name = name_db_tb.replace(database + '_', '')
        # Set the Hive execution engine
        hive_cur = hive_conn.cursor()
        hive_cur.execute("set hive.execution.engine = spark")
        # Fields present on both sides
        common_fields = get_common_field(hive_cur, petadata_cur, database,
                                         name_db_tb, _tabletype)
        log2 = 'table: %s' % table_name
        logger.info(log2)
        error_logger.error(log2)
        # Log the row counts on both sides
        count_log = count_invalid(hive_conn, petadata_conn, database,
                                  name_db_tb, _tabletype)
        logger.info(count_log)
        # Log header
        logger.info(logconfig.field_schema)
        error_logger.error(logconfig.fielderror_schema)
        # Start time
        time1 = time.time()
        # Fetch all tongids
        hive_cur = hive_conn.cursor()
        hive_cur.execute("set hive.execution.engine = spark")
        if tabletype == TableType.INVALID:
            hive_cur.execute(fieldsql.hive_select_tongid_invalid %
                             (database, name_db_tb))
        elif tabletype == TableType.INVALID_DETAIL:
            hive_cur.execute(fieldsql.hive_select_tongid_invaliddetail %
                             (database, name_db_tb))
        else:
            hive_cur.execute(fieldsql.hive_select_tongid_valid %
                             (database, name_db_tb))
        tongid_rows = hive_cur.fetchall()
        hive_cur.close()
        error_row = 0
        # Iterate over the tongids
        for tongid_row in tongid_rows:
            tongid = tongid_row[0]
            if sample:
                # When sampling, keep only tongids that satisfy the filter
                if not tongid_filter(tongid, int(divisor), int(mod)):
                    continue
            log4 = 'tongid\t%s\t%s\tidentical and unique' % (tongid, tongid)
            logger.info("---------------- 1 row ------------------\n")
            logger.info(log4)
            sql_field = ', '.join(common_fields)
            # Fetch the Hive and petadata rows for this tongid
            hive_cur = hive_conn.cursor()
            petadata_cur = petadata_conn.cursor()
            if tabletype == TableType.INVALID:
                hive_cur.execute(fieldsql.hive_invalid_sql %
                                 (sql_field, database, name_db_tb, tongid))
                petadata_cur.execute(fieldsql.peta_invalid_sql %
                                     (sql_field, name_db_tb, tongid))
            elif tabletype == TableType.INVALID_DETAIL:
                hive_cur.execute(fieldsql.hive_invalid_detail_sql %
                                 (sql_field, database, name_db_tb, tongid))
                petadata_cur.execute(fieldsql.peta_invalid_detail_sql %
                                     (sql_field, name_db_tb, tongid))
            else:
                hive_cur.execute(fieldsql.hive_valid_sql %
                                 (sql_field, database, name_db_tb, tongid))
                petadata_cur.execute(fieldsql.peta_valid_sql %
                                     (sql_field, name_db_tb, tongid))
            # Map each result row into a field -> value dict
            hive_result_row = tuple(hive_cur.fetchall()[0])
            hive_row_dict = dict()
            index = 0
            for hive_data in hive_result_row:
                cur_field_key = common_fields[index]
                hive_row_dict[cur_field_key] = hive_data
                index += 1
            peta_result_row = petadata_cur.fetchall()[0]
            peta_row_dict = dict()
            index = 0
            for peta_data in peta_result_row:
                cur_field_key = common_fields[index]
                peta_row_dict[cur_field_key] = peta_data
                index += 1
            del index
            # Compare field by field
            success_count = 0
            error_count = 0
            for key in common_fields:
                hive_data = hive_row_dict[key]
                peta_data = peta_row_dict[key]
                if cmp(hive_data, str(peta_data)) == 0:
                    log5 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data,
                                               'SUCCESS')
                    logger.info(log5)
                    success_count += 1
                else:
                    log6 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data,
                                               'ERROR')
                    error_logger.error(log6)
                    error_count += 1
            log7 = logconfig.field_result % (success_count, error_count)
            logger.info(log7)
            if error_count != 0:
                error_row += 1
        # End time
        time2 = time.time()
        total = time2 - time1
        log8 = logconfig.field_tail % (total, error_row)
        print log8
        logger.info(log8)
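# tongid_filter() is called by exec_compare() above but not defined in this
# snippet. Based on the sampled variant in the sibling script, which pushes
# "where tongid %% <divisor> = <mod>" into SQL, a plausible stand-in is a
# simple modulo filter; the project's actual implementation may differ.
def tongid_filter(tongid, divisor, mod):
    # Keep the row only when tongid falls into the sampled residue class.
    return int(tongid) % divisor == mod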
def exec_compare(database, ods_conn):
    # Start time
    start_time = time.time()
    hive_conn = HAConnection(hosts=dbconfig.ali_hosts,
                             port=dbconfig.ali_port,
                             authMechanism=dbconfig.ali_authMechanism,
                             configuration=dbconfig.ali_conf,
                             timeout=dbconfig.ali_timeout)
    # Connect to Hive
    conn = hive_conn.getConnection()
    # ODS cursor
    cur_ods = ods_conn.cursor()
    # Connect to pgp test
    conn_pgp_test = connectPool.Connects(
        dbconfig.pgptest_database).conn_mysql_test()
    cur_pgp_test = conn_pgp_test.cursor()
    # Loggers
    logger = LogUtil().count_logger()
    error_logger = LogUtil().counterror_logger()
    logger.info(logconfig.count_ods_hive_title)
    error_logger.error(logconfig.count_ods_hive_title)
    # All table names under the department
    count = cur_pgp_test.execute(commonsql.need_clean_sql % database)
    log_head = logconfig.count_head % (database, count)
    logger.info(log_head)
    logger.info(logconfig.count_odshive_schema)
    error_logger.error(logconfig.counterror_odshive_schema)
    # Tables and databases that still need cleaning
    need_clean_tables = cur_pgp_test.fetchall()
    cur_pgp_test.close()
    conn_pgp_test.close()
    # Result counters
    success_count = 0
    error_count = 0
    for need_clean_table in need_clean_tables:
        db_tb_name = need_clean_table[0]
        table = db_tb_name.replace(database + '_', '')
        cur_hive = conn.cursor()
        # Prepare the count queries
        count_hive_sql = countsql.hive_ods % (database, db_tb_name)
        count_ods_sql = countsql.ods % table
        # Run the Hive query
        cur_hive.execute(count_hive_sql)
        count_hive = cur_hive.fetchall()[0][0]
        # Run the ODS query
        cur_ods.execute(count_ods_sql)
        count_ods = cur_ods.fetchall()[0][0]
        # Compare
        if count_hive == count_ods:
            success_count += 1
            success_result = "%s_%s_ods\t%s\t%s" % (database, table,
                                                    count_hive, count_ods)
            logger.info(success_result)
        else:
            error_count += 1
            error_logger.error("%s_%s_ods\t%s\t%s" %
                               (database, table, count_hive, count_ods))
        cur_hive.close()
    # End time and total elapsed
    end_time = time.time()
    total_time = end_time - start_time
    tail_log = logconfig.count_tail % (success_count, error_count, total_time)
    logger.info(tail_log)
    # Close connections
    cur_ods.close()
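# Hypothetical driver for exec_compare(database, ods_conn) above: unlike the
# sys.argv-only variants, this one takes the ODS connection from the caller.
# dbconfig.ods_database and conn_ods() are assumed names -- substitute
# whatever connection factory the project's connectPool actually provides.
if __name__ == '__main__':
    ods_conn = connectPool.Connects(dbconfig.ods_database).conn_ods()
    try:
        exec_compare(sys.argv[1], ods_conn)
    finally:
        ods_conn.close()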
def exec_compare():
    try:
        # Department and table-type arguments
        database = sys.argv[1]
        tabletype = sys.argv[2]
    except IndexError as err:
        print 'missing arguments: database, tabletype; %s' % err
    else:
        # Start time
        start_time = time.time()
        # Huawei Hive
        hive_pool = HAConnection(hosts=dbconfig.huhive_hosts,
                                 port=dbconfig.huhive_port,
                                 authMechanism=dbconfig.huhive_authMechanism,
                                 configuration=dbconfig.huhive_conf,
                                 timeout=dbconfig.huhive_timeout)
        hive_conn = hive_pool.getConnection()
        # EMR Hive
        emr_pool = HAConnection(hosts=dbconfig.ali_hosts,
                                port=dbconfig.ali_port,
                                authMechanism=dbconfig.ali_authMechanism,
                                configuration=dbconfig.ali_conf,
                                timeout=dbconfig.ali_timeout)
        emr_conn = emr_pool.getConnection()
        # Connect to pgp test
        conn_pgp_test = connectPool.Connects(
            dbconfig.pgptest_database).conn_mysql_test()
        cur_pgp_test = conn_pgp_test.cursor()
        # Loggers
        logger = LogUtil().count_logger()
        error_logger = LogUtil().counterror_logger()
        logger.info(logconfig.count_emr_hive_title)
        error_logger.error(logconfig.count_emr_hive_title)
        # All table names under the department
        count = cur_pgp_test.execute(commonsql.need_clean_sql % database)
        log_head = logconfig.count_head % (database, count)
        logger.info(log_head)
        logger.info(logconfig.count_emrhive_schema)
        error_logger.error(logconfig.counterror_emrhive_schema)
        # Tables and databases that still need cleaning
        need_clean_tables = cur_pgp_test.fetchall()
        cur_pgp_test.close()
        conn_pgp_test.close()
        # Result counters
        success_count = 0
        error_count = 0
        for need_clean_table in need_clean_tables:
            db_tb_name = need_clean_table[0]
            table = db_tb_name.replace(database + '_', '')
            cur_hive = hive_conn.cursor()
            cur_emr = emr_conn.cursor()
            # Use Spark instead of MapReduce
            cur_hive.execute("set hive.execution.engine = spark")
            cur_emr.execute("set hive.execution.engine = spark")
            # Prepare the count query
            if tabletype == TableType.INVALID:
                count_hive_sql = countsql.hive_invalid_sql % (database, db_tb_name)
            elif tabletype == TableType.INVALID_DETAIL:
                count_hive_sql = countsql.hive_invalid_detail_sql % (
                    database, db_tb_name)
            else:
                count_hive_sql = countsql.hive_valid_sql % (database, db_tb_name)
            # Run on Huawei Hive
            cur_hive.execute(count_hive_sql)
            count_hive = cur_hive.fetchall()[0][0]
            # Run on EMR
            cur_emr.execute(count_hive_sql)
            count_emr = cur_emr.fetchall()[0][0]
            # Compare
            if count_hive == count_emr:
                success_count += 1
                success_result = "%s_%s_%s\t%s\t%s" % (
                    database, table, tabletype, count_hive, count_emr)
                logger.info(success_result)
            else:
                error_count += 1
                error_logger.error(
                    "%s_%s_%s\t%s\t%s" %
                    (database, table, tabletype, count_hive, count_emr))
            cur_hive.close()
            cur_emr.close()
        # End time and total elapsed
        end_time = time.time()
        total_time = end_time - start_time
        tail_log = logconfig.count_tail % (success_count, error_count,
                                           total_time)
        logger.info(tail_log)
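# Invocation sketch (script name hypothetical): compares Huawei Hive counts
# against EMR Hive counts for every table of one department.
#
#   python emr_count_compare.py <database> <tabletype>
#
if __name__ == '__main__':
    exec_compare()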
def exec_compare():
    # Department, table-name, and table-type arguments
    try:
        database = sys.argv[1]
        tablename = sys.argv[2]
        tabletype = sys.argv[3]
    except IndexError:
        print 'missing arguments: database, tablename, tabletype'
    else:
        try:
            # Start time (kept from the original, though no end time is taken)
            start_time = time.time()
            hive_conn = HAConnection(
                hosts=dbconfig.huhive_hosts,
                port=dbconfig.huhive_port,
                authMechanism=dbconfig.huhive_authMechanism,
                configuration=dbconfig.huhive_conf,
                timeout=dbconfig.huhive_timeout)
            # Connect to Hive
            conn = hive_conn.getConnection()
            # Connect to pgp
            conn_pgp = connectPool.Connects(
                dbconfig.pgp_database).conn_mysql_pgp()
            cur_pgp = conn_pgp.cursor()
            # Loggers
            logger = LogUtil().count_logger()
            error_logger = LogUtil().counterror_logger()
            # Start the comparison
            cur_hive = conn.cursor()
            # Set the Hive execution engine
            cur_hive.execute("set hive.execution.engine = spark")
            # Prepare the count queries
            if tabletype == TableType.INVALID:
                count_hive_sql = countsql.hive_invalid_sql % (database, tablename)
                count_pgp_sql = countsql.pgp_invalid_sql % tablename
            elif tabletype == TableType.INVALID_DETAIL:
                count_hive_sql = countsql.hive_invalid_detail_sql % (database,
                                                                     tablename)
                count_pgp_sql = countsql.pgp_invalid_detail_sql % tablename
            else:
                count_hive_sql = countsql.pgphive_valid_sql % (database, tablename)
                count_pgp_sql = countsql.pgp_valid_sql % tablename
            # Run the Hive query
            cur_hive.execute(count_hive_sql)
            count_hive = cur_hive.fetchall()[0][0]
            # Run the pgp query
            cur_pgp.execute(count_pgp_sql)
            count_pgp = cur_pgp.fetchall()[0][0]
            # Compare
            if count_hive == count_pgp:
                success_result = "%s\t%s\t%s" % (tablename, count_hive, count_pgp)
                logger.info(success_result)
            else:
                error_logger.error("%s\t%s\t%s" % (tablename, count_hive,
                                                   count_pgp))
            cur_hive.close()
            # Close connections
            cur_pgp.close()
        except Pyhs2Exception as err:
            error_logger.error("table does not exist: %s" % err)
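# Pyhs2Exception is caught above; in pyhs2 it is defined in pyhs2.error
# (import path assumed from the library's layout). Invocation sketch for the
# single-table variant, with a hypothetical script name:
#
#   python single_table_count.py <database> <tablename> <tabletype>
#
from pyhs2.error import Pyhs2Exception

if __name__ == '__main__':
    exec_compare()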