Example #1
def select_hive(queue_name, database, sql, logger):
    v_queue_name = "set mapred.job.queue.name=%s" % queue_name
    v_database = "use %s" % database
    sql = sql.encode('UTF-8')
    v_sql = re.sub(r';$', '', sql)  # strip a trailing semicolon, if any
    timeout = 3 * 60 * 60 * 1000  # 3 hours, in milliseconds
    conf = {"krb_host": "hadoop001", "krb_service": "hive"}
    try:
        with HAConnection(hosts=HIVE_HOSTS,
                          port=HIVE_PORT,
                          timeout=timeout,
                          authMechanism="KERBEROS",
                          user='******',
                          configuration=conf) as haConn:
            with haConn.getConnection() as conn:
                with conn.cursor() as cur:
                    print(v_queue_name)
                    logger.info(v_queue_name)
                    cur.execute(v_queue_name)
                    print(v_database)
                    logger.info(v_database)
                    cur.execute(v_database)
                    print(v_sql)
                    logger.info(v_sql)
                    cur.execute(v_sql)
                    tuple_dic = cur.fetchall()
                    if len(tuple_dic) == 0:
                        tuple_dic = None
    except Exception as e:
        logger.error(e)
        raise  # re-raise, preserving the original traceback
    return tuple_dic
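
A hedged usage sketch for select_hive, assuming pyhs2 is installed, HIVE_HOSTS and HIVE_PORT are defined at module level, and treating the queue, database, and query values below as placeholders:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("hive_select")

# Placeholder queue/database/query values; adjust for your cluster.
rows = select_hive("root.default", "default",
                   "select * from some_table limit 10;", logger)
if rows is not None:
    for row in rows:
        print(row)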
Example #2
def HConn():
    # Build an HA connection pool and hand back a single connection.
    # The HAConnection pool object itself is discarded here, so callers
    # are responsible for closing the returned connection.
    conn = HAConnection(
        hosts=hosts,
        port=10000,
        authMechanism="KERBEROS",
        user='******',
        configuration=conf,
        timeout=9999999999).getConnection()
    return conn
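
A hedged usage sketch, assuming hosts and conf are defined at module level as in the other examples; the connection is closed explicitly because the pool reference is not kept:

conn = HConn()
try:
    with conn.cursor() as cur:
        cur.execute("show databases")
        print(cur.fetchall())
finally:
    conn.close()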
Example #3
def get_emr_hive_conn():
    try:

        hosts = ["172.31.72.52", "172.31.72.54"]
        port = 10000  # default HiveServer2 Thrift port
        conf = {"krb_host": "emr-header-1.cluster-12", "krb_service": "hive"}

        haConn = HAConnection(hosts=hosts,
                              port=port,
                              authMechanism="KERBEROS",
                              configuration=conf, timeout=60000000)
        conn = haConn.getConnection()
        print ("%s:成功连接hive!!!" % (datetime.datetime.now()))
        cur = conn.cursor()
        a = cur.execute('select * from hive_to.iris')
        b = cur.fetchall()
        print(b)
        print(a)
        conn.close()
    except Exception as e:
        print("%s error connecting to the Hive database: %s" %
              (datetime.datetime.now(), e))
        # conn.close()
        # re-raise the exception
        raise
Example #4
def exec_compare():
    # Parse command-line arguments
    try:
        params = sys.argv
        if len(params) == 3:
            database = params[1]
            tabletype = params[2]
            sample = False
        elif len(params) == 5:
            database = params[1]
            tabletype = params[2]
            divisor = params[3]
            mod = params[4]
            sample = True
    except IndexError as index_error:
        print("expected arguments: database, tabletype[, divisor, mod]; error: %s" % index_error)

    # Hive connection
    hive_conn_pool = HAConnection(hosts=dbconfig.huhive_hosts,
                                  port=dbconfig.huhive_port,
                                  authMechanism=dbconfig.huhive_authMechanism,
                                  configuration=dbconfig.huhive_conf,
                                  timeout=dbconfig.huhive_timeout)
    hive_conn = hive_conn_pool.getConnection()

    # PetaData connection
    # petadata_conn = connectPool.Connects(dbconfig.peta_database).conn_ali_petadata_shamo()
    petadata_conn = connectPool.Connects(
        dbconfig.peta_database).conn_ali_petadata()
    petadata_cur = petadata_conn.cursor()

    # pgptest connection
    pgptest_conn = connectPool.Connects(
        dbconfig.pgptest_database).conn_mysql_test()
    pgptest_cur = pgptest_conn.cursor()

    count_table = pgptest_cur.execute(commonsql.need_clean_sql % database)
    table_names = pgptest_cur.fetchall()

    logger = LogUtil().field_logger()
    error_logger = LogUtil().fielderror_logger()
    log1 = logconfig.field_head % (database, count_table)
    logger.info(log1)
    error_logger.error(log1)

    # Determine the table type
    if tabletype == TableType.INVALID:
        _tabletype = 'invalid'
        logger.info(logconfig.tbtype_invalid)
        error_logger.error(logconfig.tbtype_invalid)
    elif tabletype == TableType.INVALID_DETAIL:
        _tabletype = 'invalid_detail'
        logger.info(logconfig.tbtype_invaliddetail)
        error_logger.error(logconfig.tbtype_invaliddetail)
    else:
        _tabletype = 'valid'
        logger.info(logconfig.tbtype_valid)
        error_logger.error(logconfig.tbtype_valid)

    # Iterate over the table names
    for name_db_tb_row in table_names:
        name_db_tb = name_db_tb_row[0]
        table_name = name_db_tb.replace(database + '_', '')

        # Set the Hive execution engine
        hive_cur = hive_conn.cursor()
        hive_cur.execute("set hive.execution.engine = spark")

        # Get the fields the table shares on both sides
        common_fields = get_common_field(hive_cur, petadata_cur, database,
                                         name_db_tb, _tabletype)
        common_fields.append("count")
        log2 = '----------------- Table: %s ---------------' % table_name
        logger.info(log2)
        error_logger.error(log2)

        # Log header
        log3 = 'field\thive value\tpetadata value\tresult'
        logger.info(log3)
        error_logger.error(log3)

        # Start time
        time1 = time.time()
        error_row = 0

        # Build the SQL statement (the literal % must be escaped as %% here)
        where_sql = ""
        if sample:
            where_sql = "where tongid %% %s = %s" % (divisor, mod)
        new_common_fields = []
        for field in common_fields:
            if field == "count":
                continue
            new_common_fields.append(
                (fieldsql.hash_hive_template + " as " + field) % field)
        sql_field = ', '.join(new_common_fields)

        # Run the queries on both sides
        hive_cur = hive_conn.cursor()
        petadata_cur = petadata_conn.cursor()
        if tabletype == TableType.INVALID:
            hive_cur.execute(fieldsql.hash_hive_invalid_sql %
                             (sql_field, database, name_db_tb, where_sql))
            petadata_cur.execute(fieldsql.hash_peta_invalid_sql %
                                 (sql_field, name_db_tb, where_sql))
        elif tabletype == TableType.INVALID_DETAIL:
            hive_cur.execute(fieldsql.hash_hive_invaliddetail_sql %
                             (sql_field, database, name_db_tb, where_sql))
            petadata_cur.execute(fieldsql.hash_peta_invaliddetail_sql %
                                 (sql_field, name_db_tb, where_sql))
        else:
            hive_cur.execute(fieldsql.hash_hive_valid_sql %
                             (sql_field, database, name_db_tb, where_sql))
            petadata_cur.execute(fieldsql.hash_peta_valid_sql %
                                 (sql_field, name_db_tb, where_sql))

        # Pack each result row into a field-keyed dict
        hive_result_row = tuple(hive_cur.fetchall()[0])
        hive_row_dict = dict(zip(common_fields, hive_result_row))

        peta_result_row = petadata_cur.fetchall()[0]
        peta_row_dict = dict(zip(common_fields, peta_result_row))

        # Compare the fields
        success_count = 0
        error_count = 0
        for key in common_fields:
            hive_data = hive_row_dict[key]
            peta_data = peta_row_dict[key]
            if hive_data == str(peta_data):
                log5 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data,
                                           'SUCCESS')
                logger.info(log5)
                success_count += 1
            else:
                log6 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data, 'ERROR')
                error_logger.error(log6)
                error_count += 1

        log7 = 'matching fields: %s\tmismatched fields: %s\t' % (success_count, error_count)
        logger.info(log7)
        if error_count != 0:
            error_row += 1

        # End time
        time2 = time.time()
        total = time2 - time1
        log8 = 'table comparison finished!\telapsed: %.2fs\terror rows: %s\t' % (total, error_row)
        logger.info(log8)
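
The helper get_common_field is not shown in these examples; a minimal hypothetical reconstruction consistent with its call sites, which intersects the column names reported by each side while preserving Hive's column order (the metadata queries are assumptions, and the tabletype argument is unused in this sketch):

def get_common_field(hive_cur, peta_cur, database, name_db_tb, tabletype):
    # List columns on the Hive side (assumed metadata query).
    hive_cur.execute("describe %s.%s" % (database, name_db_tb))
    hive_fields = [row[0] for row in hive_cur.fetchall()]
    # List columns on the PetaData side via DB-API cursor metadata (assumed).
    peta_cur.execute("select * from %s limit 0" % name_db_tb)
    peta_fields = set(col[0] for col in peta_cur.description)
    # Keep the intersection, in Hive column order.
    return [f for f in hive_fields if f in peta_fields]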
Example #5
import sys

try:
    from pyhs2.haconnection import HAConnection
except ImportError as e:
    print('error: cannot import the pyhs2.haconnection module (%s)' % e)
    sys.exit(0)
try:
    database = sys.argv[1]  # department argument
    log_title = 'current department: %s' % database
    print(log_title)
except IndexError as e:
    print('error: no department argument given (%s)' % e)
    sys.exit(0)

tongid = sys.argv[2]
# Hive connection
hive_conn_pool = HAConnection(hosts=config.hosts, port=config.port,
                              authMechanism=config.authMechanism,
                              configuration=config.conf,
                              timeout=config.timeout)
hive_cur = hive_conn_pool.getConnection().cursor()

# PetaData connection
petadata_conn = connectPool.Connects(config.peta_database).conn_ali_petadata_shamo()
petadata_cur = petadata_conn.cursor()

# pgptest connection
pgptest_conn = connectPool.Connects(config.pgptest_database).conn_mysql_test()
pgptest_cur = pgptest_conn.cursor()

# Query the table names
table_count = pgptest_cur.execute(fieldsql.need_clean_sql % database)

Example #6
def exec_compare():
    # Parse the department and table-type arguments
    try:
        database = sys.argv[1]
        tabletype = sys.argv[2]
    except IndexError as e:
        print('missing arguments: database, tabletype')
    else:
        # Start time
        start_time = time.time()
        hive_conn = HAConnection(hosts=dbconfig.huhive_hosts,
                                 port=dbconfig.huhive_port,
                                 authMechanism=dbconfig.huhive_authMechanism,
                                 configuration=dbconfig.huhive_conf,
                                 timeout=dbconfig.huhive_timeout)
        # Connect to Hive
        conn = hive_conn.getConnection()

        # Connect to PetaData
        conn_peta = connectPool.Connects(
            dbconfig.peta_database).conn_ali_petadata()
        cur_peta = conn_peta.cursor()

        # Connect to pgp test
        conn_pgp_test = connectPool.Connects(
            dbconfig.pgptest_database).conn_mysql_test()
        cur_pgp_test = conn_pgp_test.cursor()

        # Get the logger objects
        logger = LogUtil().count_logger()
        error_logger = LogUtil().counterror_logger()
        logger.info(logconfig.count_peta_hive_title)
        error_logger.error(logconfig.count_peta_hive_title)

        # Query all table names under the department
        count = cur_pgp_test.execute(commonsql.need_clean_sql % database)
        log_head = logconfig.count_head % (database, count)
        logger.info(log_head)
        logger.info(logconfig.count_schema)
        error_logger.error(logconfig.counterror_schema)

        # All table and database names that need cleaning
        need_clean_tables = cur_pgp_test.fetchall()
        cur_pgp_test.close()
        conn_pgp_test.close()

        # Tally the results
        success_count = 0
        error_count = 0
        for need_clean_table in need_clean_tables:
            db_tb_name = need_clean_table[0]
            table = db_tb_name.replace(database + '_', '')

            cur_hive = conn.cursor()
            # Set the Hive execution engine
            cur_hive.execute("set hive.execution.engine = spark")

            # Prepare the count queries
            if tabletype == TableType.INVALID:
                count_hive_sql = countsql.hive_invalid_sql % (database,
                                                              db_tb_name)
                count_peta_sql = countsql.peta_invalid_sql % db_tb_name
            elif tabletype == TableType.INVALID_DETAIL:
                count_hive_sql = countsql.hive_invalid_detail_sql % (
                    database, db_tb_name)
                count_peta_sql = countsql.peta_invalid_detail_sql % db_tb_name
            else:
                count_hive_sql = countsql.hive_valid_sql % (database,
                                                            db_tb_name)
                count_peta_sql = countsql.peta_valid_sql % db_tb_name

            # Run the Hive count
            cur_hive.execute(count_hive_sql)
            count_hive = cur_hive.fetchall()[0][0]

            # Run the PetaData count
            cur_peta.execute(count_peta_sql)
            count_peta = cur_peta.fetchall()[0][0]

            # Compare the counts
            if count_hive == count_peta:
                success_count += 1
                success_result = "%s_%s_%s\t%s\t%s" % (
                    database, table, tabletype, count_hive, count_peta)
                logger.info(success_result)
            else:
                error_count += 1
                error_logger.error(
                    "%s_%s_%s\t%s\t%s" %
                    (database, table, tabletype, count_hive, count_peta))
            cur_hive.close()
        # End time
        end_time = time.time()
        # Total elapsed time
        total_time = end_time - start_time
        tail_log = logconfig.count_tail % (success_count, error_count,
                                           total_time)
        logger.info(tail_log)

        # Close the remaining connections
        cur_peta.close()
        conn_peta.close()
        conn.close()
Example #7
def exec_compare():
    # Parse command-line arguments
    try:
        params = sys.argv
        if len(params) == 3:
            database = params[1]
            tabletype = params[2]
            sample = False
        elif len(params) == 5:
            database = params[1]
            tabletype = params[2]
            divisor = params[3]
            mod = params[4]
            sample = True
    except IndexError as index_error:
        print("expected arguments: database, tabletype[, divisor, mod]; error: %s" % index_error)

    # Hive connection
    hive_conn_pool = HAConnection(hosts=dbconfig.huhive_hosts,
                                  port=dbconfig.huhive_port,
                                  authMechanism=dbconfig.huhive_authMechanism,
                                  configuration=dbconfig.huhive_conf,
                                  timeout=dbconfig.huhive_timeout)
    hive_conn = hive_conn_pool.getConnection()

    # PetaData connection
    petadata_conn = connectPool.Connects(
        dbconfig.peta_database).conn_ali_petadata()
    petadata_cur = petadata_conn.cursor()

    # pgptest connection
    pgptest_conn = connectPool.Connects(
        dbconfig.pgptest_database).conn_mysql_test()
    pgptest_cur = pgptest_conn.cursor()

    # All tables
    count_table = pgptest_cur.execute(commonsql.need_clean_sql % database)
    table_names = pgptest_cur.fetchall()

    # Logger objects
    logger = LogUtil().field_logger()
    error_logger = LogUtil().fielderror_logger()
    logger.info(logconfig.field_peta_hive_title)
    error_logger.error(logconfig.field_peta_hive_title)

    if tabletype == TableType.INVALID:
        _tabletype = 'invalid'
        logger.info(logconfig.tbtype_invalid)
        error_logger.error(logconfig.tbtype_invalid)
    elif tabletype == TableType.INVALID_DETAIL:
        _tabletype = 'invalid_detail'
        logger.info(logconfig.tbtype_invaliddetail)
        error_logger.error(logconfig.tbtype_invaliddetail)
    else:
        _tabletype = 'valid'
        logger.info(logconfig.tbtype_valid)
        error_logger.error(logconfig.tbtype_valid)

    log1 = logconfig.field_head % (database, count_table)
    logger.info(log1)
    # Iterate over the table names
    for name_db_tb_row in table_names:
        name_db_tb = name_db_tb_row[0]
        table_name = name_db_tb.replace(database + '_', '')

        # Set the Hive execution engine
        hive_cur = hive_conn.cursor()
        hive_cur.execute("set hive.execution.engine = spark")

        # Get the fields the table shares on both sides
        common_fields = get_common_field(hive_cur, petadata_cur, database,
                                         name_db_tb, _tabletype)
        log2 = 'Table: %s' % table_name
        logger.info(log2)
        error_logger.error(log2)

        # Log the row counts on both sides
        count_log = count_invalid(hive_conn, petadata_conn, database,
                                  name_db_tb, _tabletype)
        logger.info(count_log)

        # Log header
        logger.info(logconfig.field_schema)
        error_logger.error(logconfig.fielderror_schema)

        # Start time
        time1 = time.time()
        # Query all tongids
        hive_cur = hive_conn.cursor()
        hive_cur.execute("set hive.execution.engine = spark")

        if tabletype == TableType.INVALID:
            hive_cur.execute(fieldsql.hive_select_tongid_invalid %
                             (database, name_db_tb))
        elif tabletype == TableType.INVALID_DETAIL:
            hive_cur.execute(fieldsql.hive_select_tongid_invaliddetail %
                             (database, name_db_tb))
        else:
            hive_cur.execute(fieldsql.hive_select_tongid_valid %
                             (database, name_db_tb))

        tongid_rows = hive_cur.fetchall()
        hive_cur.close()

        error_row = 0
        # Iterate over the tongids
        for tongid_row in tongid_rows:
            tongid = tongid_row[0]
            if sample:
                # Sampling: keep only tongids that pass the filter
                if not tongid_filter(tongid, int(divisor), int(mod)):
                    continue
            log4 = 'tongid\t%s\t%s\tidentical and unique' % (tongid, tongid)
            logger.info("---------------- 1 row ------------------\n")
            logger.info(log4)

            sql_field = ', '.join(common_fields)
            # Build the field-keyed Hive record for this tongid
            hive_cur = hive_conn.cursor()
            petadata_cur = petadata_conn.cursor()

            # Run the queries on both sides
            if tabletype == TableType.INVALID:
                hive_cur.execute(fieldsql.hive_invalid_sql %
                                 (sql_field, database, name_db_tb, tongid))
                petadata_cur.execute(fieldsql.peta_invalid_sql %
                                     (sql_field, name_db_tb, tongid))
            elif tabletype == TableType.INVALID_DETAIL:
                hive_cur.execute(fieldsql.hive_invalid_detail_sql %
                                 (sql_field, database, name_db_tb, tongid))
                petadata_cur.execute(fieldsql.peta_invalid_detail_sql %
                                     (sql_field, name_db_tb, tongid))
            else:
                hive_cur.execute(fieldsql.hive_valid_sql %
                                 (sql_field, database, name_db_tb, tongid))
                petadata_cur.execute(fieldsql.peta_valid_sql %
                                     (sql_field, name_db_tb, tongid))

            # Pack each tongid's result row into a field-keyed dict
            hive_result_row = tuple(hive_cur.fetchall()[0])
            hive_row_dict = dict(zip(common_fields, hive_result_row))

            peta_result_row = petadata_cur.fetchall()[0]
            peta_row_dict = dict(zip(common_fields, peta_result_row))

            # Compare the fields
            success_count = 0
            error_count = 0
            for key in common_fields:
                hive_data = hive_row_dict[key]
                peta_data = peta_row_dict[key]
                if hive_data == str(peta_data):
                    log5 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data,
                                               'SUCCESS')
                    logger.info(log5)
                    success_count += 1
                else:
                    log6 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data,
                                               'ERROR')
                    error_logger.error(log6)
                    error_count += 1

            log7 = logconfig.field_result % (success_count, error_count)
            logger.info(log7)
            if error_count != 0:
                error_row += 1

        # End time
        time2 = time.time()
        total = time2 - time1
        log8 = logconfig.field_tail % (total, error_row)
        print(log8)
        logger.info(log8)
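
The sampling helper tongid_filter is not defined in these snippets; a minimal hypothetical sketch consistent with its call site, which keeps a tongid when its integer value modulo divisor equals mod:

def tongid_filter(tongid, divisor, mod):
    # Hypothetical: sample ids whose value modulo `divisor` equals `mod`.
    try:
        return int(tongid) % divisor == mod
    except (TypeError, ValueError):
        # Skip ids that are not numeric.
        return False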
Example #8
    def exec_compare(database, ods_conn):
        # Start time
        start_time = time.time()
        hive_conn = HAConnection(hosts=dbconfig.ali_hosts,
                                 port=dbconfig.ali_port,
                                 authMechanism=dbconfig.ali_authMechanism,
                                 configuration=dbconfig.ali_conf,
                                 timeout=dbconfig.ali_timeout)
        # Connect to Hive
        conn = hive_conn.getConnection()

        # Connect to ODS
        cur_ods = ods_conn.cursor()

        # Connect to pgp test
        conn_pgp_test = connectPool.Connects(
            dbconfig.pgptest_database).conn_mysql_test()
        cur_pgp_test = conn_pgp_test.cursor()

        # Get the logger objects
        logger = LogUtil().count_logger()
        error_logger = LogUtil().counterror_logger()
        logger.info(logconfig.count_ods_hive_title)
        error_logger.error(logconfig.count_ods_hive_title)

        # Query all table names under the department
        count = cur_pgp_test.execute(commonsql.need_clean_sql % database)
        log_head = logconfig.count_head % (database, count)
        logger.info(log_head)
        logger.info(logconfig.count_odshive_schema)
        error_logger.error(logconfig.counterror_odshive_schema)

        # All table and database names that need cleaning
        need_clean_tables = cur_pgp_test.fetchall()
        cur_pgp_test.close()
        conn_pgp_test.close()

        # Tally the results
        success_count = 0
        error_count = 0
        for need_clean_table in need_clean_tables:
            db_tb_name = need_clean_table[0]
            table = db_tb_name.replace(database + '_', '')

            cur_hive = conn.cursor()

            # Prepare the count queries
            count_hive_sql = countsql.hive_ods % (database, db_tb_name)
            count_ods_sql = countsql.ods % table

            # Run the Hive count
            cur_hive.execute(count_hive_sql)
            count_hive = cur_hive.fetchall()[0][0]

            # Run the ODS count
            cur_ods.execute(count_ods_sql)
            count_ods = cur_ods.fetchall()[0][0]

            # Compare the counts
            if count_hive == count_ods:
                success_count += 1
                success_result = "%s_%s_ods\t%s\t%s" % (database, table,
                                                        count_hive, count_ods)
                logger.info(success_result)
            else:
                error_count += 1
                error_logger.error("%s_%s_ods\t%s\t%s" %
                                   (database, table, count_hive, count_ods))
            cur_hive.close()
        # End time
        end_time = time.time()
        # Total elapsed time
        total_time = end_time - start_time
        tail_log = logconfig.count_tail % (success_count, error_count,
                                           total_time)
        logger.info(tail_log)

        # Close the remaining connections (ods_conn is owned by the caller)
        cur_ods.close()
        conn.close()
Example #9
def exec_compare():
    try:
        # Parse the department and table-type arguments
        database = sys.argv[1]
        tabletype = sys.argv[2]
    except IndexError as err:
        print('missing arguments: database, tabletype; %s' % err)
    else:
        # Start time
        start_time = time.time()

        # huawei hive
        hive_pool = HAConnection(hosts=dbconfig.huhive_hosts,
                                 port=dbconfig.huhive_port,
                                 authMechanism=dbconfig.huhive_authMechanism,
                                 configuration=dbconfig.huhive_conf,
                                 timeout=dbconfig.huhive_timeout)
        hive_conn = hive_pool.getConnection()

        # emr hive
        emr_pool = HAConnection(hosts=dbconfig.ali_hosts,
                                port=dbconfig.ali_port,
                                authMechanism=dbconfig.ali_authMechanism,
                                configuration=dbconfig.ali_conf,
                                timeout=dbconfig.ali_timeout)
        emr_conn = emr_pool.getConnection()

        # Connect to pgp test
        conn_pgp_test = connectPool.Connects(
            dbconfig.pgptest_database).conn_mysql_test()
        cur_pgp_test = conn_pgp_test.cursor()

        # Get the logger objects
        logger = LogUtil().count_logger()
        error_logger = LogUtil().counterror_logger()
        logger.info(logconfig.count_emr_hive_title)
        error_logger.error(logconfig.count_emr_hive_title)

        # Query all table names under the department
        count = cur_pgp_test.execute(commonsql.need_clean_sql % database)
        log_head = logconfig.count_head % (database, count)
        logger.info(log_head)
        logger.info(logconfig.count_emrhive_schema)
        error_logger.error(logconfig.counterror_emrhive_schema)

        # All table and database names that need cleaning
        need_clean_tables = cur_pgp_test.fetchall()
        cur_pgp_test.close()
        conn_pgp_test.close()

        # Tally the results
        success_count = 0
        error_count = 0
        for need_clean_table in need_clean_tables:
            db_tb_name = need_clean_table[0]
            table = db_tb_name.replace(database + '_', '')

            cur_hive = hive_conn.cursor()
            cur_emr = emr_conn.cursor()
            # Use Spark instead of MapReduce
            cur_hive.execute("set hive.execution.engine = spark")
            cur_emr.execute("set hive.execution.engine = spark")

            # Prepare the count query
            if tabletype == TableType.INVALID:
                count_hive_sql = countsql.hive_invalid_sql % (database,
                                                              db_tb_name)
            elif tabletype == TableType.INVALID_DETAIL:
                count_hive_sql = countsql.hive_invalid_detail_sql % (
                    database, db_tb_name)
            else:
                count_hive_sql = countsql.hive_valid_sql % (database,
                                                            db_tb_name)

            # Run the count on Huawei Hive
            cur_hive.execute(count_hive_sql)
            count_hive = cur_hive.fetchall()[0][0]

            # Run the same count on EMR
            cur_emr.execute(count_hive_sql)
            count_emr = cur_emr.fetchall()[0][0]

            # Compare the counts
            if count_hive == count_emr:
                success_count += 1
                success_result = "%s_%s_%s\t%s\t%s" % (
                    database, table, tabletype, count_hive, count_emr)
                logger.info(success_result)
            else:
                error_count += 1
                error_logger.error(
                    "%s_%s_%s\t%s\t%s" %
                    (database, table, tabletype, count_hive, count_emr))
            cur_hive.close()
            cur_emr.close()
        # End time
        end_time = time.time()
        # Total elapsed time
        total_time = end_time - start_time
        tail_log = logconfig.count_tail % (success_count, error_count,
                                           total_time)
        logger.info(tail_log)
Example #10
def exec_compare():
    # Parse the database, table-name and table-type arguments
    try:
        database = sys.argv[1]
        tablename = sys.argv[2]
        tabletype = sys.argv[3]
    except IndexError as e:
        print('missing arguments: database, tablename, tabletype; %s' % e)
    else:
        try:
            # Start time
            start_time = time.time()
            hive_conn = HAConnection(
                hosts=dbconfig.huhive_hosts,
                port=dbconfig.huhive_port,
                authMechanism=dbconfig.huhive_authMechanism,
                configuration=dbconfig.huhive_conf,
                timeout=dbconfig.huhive_timeout)
            # Connect to Hive
            conn = hive_conn.getConnection()

            # Connect to pgp test
            conn_pgp = connectPool.Connects(
                dbconfig.pgp_database).conn_mysql_pgp()
            cur_pgp = conn_pgp.cursor()

            # Get the logger objects
            logger = LogUtil().count_logger()
            error_logger = LogUtil().counterror_logger()

            # Start the comparison
            cur_hive = conn.cursor()
            # Set the Hive execution engine
            cur_hive.execute("set hive.execution.engine = spark")

            # Prepare the count queries
            if tabletype == TableType.INVALID:
                count_hive_sql = countsql.hive_invalid_sql % (database,
                                                              tablename)
                count_pgp_sql = countsql.pgp_invalid_sql % tablename
            elif tabletype == TableType.INVALID_DETAIL:
                count_hive_sql = countsql.hive_invalid_detail_sql % (database,
                                                                     tablename)
                count_pgp_sql = countsql.pgp_invalid_detail_sql % tablename
            else:
                count_hive_sql = countsql.pgphive_valid_sql % (database,
                                                               tablename)
                count_pgp_sql = countsql.pgp_valid_sql % tablename

            # Run the Hive count
            cur_hive.execute(count_hive_sql)
            count_hive = cur_hive.fetchall()[0][0]

            # Run the pgp count
            cur_pgp.execute(count_pgp_sql)
            count_pgp = cur_pgp.fetchall()[0][0]

            # Compare the counts
            if count_hive == count_pgp:
                success_result = "%s\t%s\t%s" % (tablename, count_hive,
                                                 count_pgp)
                logger.info(success_result)
            else:
                error_logger.error("%s\t%s\t%s" %
                                   (tablename, count_hive, count_pgp))
            cur_hive.close()

            # Close the connection
            cur_pgp.close()
        except Pyhs2Exception as err:
            error_logger.error("table does not exist: %s" % err)