# Imports inferred from usage: HAConnection ships with pyhs2, and Example #3
# below shows the import paths for connectPool and dbconfig. The remaining
# repo-local helpers (LogUtil, TableType, commonsql, countsql, logconfig) are
# not shown in these snippets, so their import paths are omitted here.
import sys
import time

from pyhs2.haconnection import HAConnection

from config import dbconfig
from db import connectPool


def exec_compare():
    # Parse the department (database) and table-type arguments
    try:
        database = sys.argv[1]
        tabletype = sys.argv[2]
    except IndexError:
        print 'Missing arguments: database, tabletype'
    else:
        # Start time
        start_time = time.time()
        hive_conn = HAConnection(hosts=dbconfig.huhive_hosts,
                                 port=dbconfig.huhive_port,
                                 authMechanism=dbconfig.huhive_authMechanism,
                                 configuration=dbconfig.huhive_conf,
                                 timeout=dbconfig.huhive_timeout)
        # Connect to Hive
        conn = hive_conn.getConnection()

        # Connect to PetaData
        conn_peta = connectPool.Connects(
            dbconfig.peta_database).conn_ali_petadata()
        cur_peta = conn_peta.cursor()

        # Connect to the pgp test database
        conn_pgp_test = connectPool.Connects(
            dbconfig.pgptest_database).conn_mysql_test()
        cur_pgp_test = conn_pgp_test.cursor()

        # Get logger objects
        logger = LogUtil().count_logger()
        error_logger = LogUtil().counterror_logger()
        logger.info(logconfig.count_peta_hive_title)
        error_logger.error(logconfig.count_peta_hive_title)

        # Query all table names under this department
        count = cur_pgp_test.execute(commonsql.need_clean_sql % database)
        log_head = logconfig.count_head % (database, count)
        logger.info(log_head)
        logger.info(logconfig.count_schema)
        error_logger.error(logconfig.counterror_schema)

        # All database/table names that need cleaning
        need_clean_tables = cur_pgp_test.fetchall()
        cur_pgp_test.close()
        conn_pgp_test.close()

        # Result counters
        success_count = 0
        error_count = 0
        for need_clean_table in need_clean_tables:
            db_tb_name = need_clean_table[0]
            table = db_tb_name.replace(database + '_', '')

            cur_hive = conn.cursor()
            # Set the Hive execution engine
            cur_hive.execute("set hive.execution.engine = spark")

            # Prepare the count queries
            if tabletype == TableType.INVALID:
                count_hive_sql = countsql.hive_invalid_sql % (database,
                                                              db_tb_name)
                count_peta_sql = countsql.peta_invalid_sql % db_tb_name
            elif tabletype == TableType.INVALID_DETAIL:
                count_hive_sql = countsql.hive_invalid_detail_sql % (
                    database, db_tb_name)
                count_peta_sql = countsql.peta_invalid_detail_sql % db_tb_name
            else:
                count_hive_sql = countsql.hive_valid_sql % (database,
                                                            db_tb_name)
                count_peta_sql = countsql.peta_valid_sql % db_tb_name

            # Run the Hive query
            cur_hive.execute(count_hive_sql)
            count_hive = cur_hive.fetchall()[0][0]

            # Run the PetaData query
            cur_peta.execute(count_peta_sql)
            count_peta = cur_peta.fetchall()[0][0]

            # Compare the counts
            if count_hive == count_peta:
                success_count += 1
                success_result = "%s_%s_%s\t%s\t%s" % (
                    database, table, tabletype, count_hive, count_peta)
                logger.info(success_result)
            else:
                error_count += 1
                error_logger.error(
                    "%s_%s_%s\t%s\t%s" %
                    (database, table, tabletype, count_hive, count_peta))
            cur_hive.close()
        # End time
        end_time = time.time()
        # Total elapsed time
        total_time = end_time - start_time
        tail_log = logconfig.count_tail % (success_count, error_count,
                                           total_time)
        logger.info(tail_log)

        # Close connections
        cur_peta.close()
        conn_peta.close()
        conn.close()
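
The countsql templates themselves are not shown here. Judging from how they are substituted above (database plus table name on the Hive side, table name only on the PetaData side), a hypothetical sketch of their shape would be:

# Hypothetical template shapes, inferred from the substitutions above; the
# real definitions live in the repo's countsql module and presumably add
# per-table-type filtering.
hive_valid_sql = "SELECT COUNT(*) FROM %s.%s"
peta_valid_sql = "SELECT COUNT(*) FROM %s"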
Example #2
def exec_compare():
    # Parse arguments
    params = sys.argv
    if len(params) == 3:
        database = params[1]
        tabletype = params[2]
        sample = False
    elif len(params) == 5:
        database = params[1]
        tabletype = params[2]
        divisor = params[3]
        mod = params[4]
        sample = True
    else:
        print 'Expected arguments: database, tabletype[, divisor, mod]'
        return

    # Hive connection
    hive_conn_pool = HAConnection(hosts=dbconfig.huhive_hosts,
                                  port=dbconfig.huhive_port,
                                  authMechanism=dbconfig.huhive_authMechanism,
                                  configuration=dbconfig.huhive_conf,
                                  timeout=dbconfig.huhive_timeout)
    hive_conn = hive_conn_pool.getConnection()

    # PetaData connection
    petadata_conn = connectPool.Connects(
        dbconfig.peta_database).conn_ali_petadata()
    petadata_cur = petadata_conn.cursor()

    # pgp test connection
    pgptest_conn = connectPool.Connects(
        dbconfig.pgptest_database).conn_mysql_test()
    pgptest_cur = pgptest_conn.cursor()

    # All tables for this department
    count_table = pgptest_cur.execute(commonsql.need_clean_sql % database)
    table_names = pgptest_cur.fetchall()

    # Logger objects
    logger = LogUtil().field_logger()
    error_logger = LogUtil().fielderror_logger()
    logger.info(logconfig.field_peta_hive_title)
    error_logger.error(logconfig.field_peta_hive_title)

    if tabletype == TableType.INVALID:
        _tabletype = 'invalid'
        logger.info(logconfig.tbtype_invalid)
        error_logger.error(logconfig.tbtype_invalid)
    elif tabletype == TableType.INVALID_DETAIL:
        _tabletype = 'invalid_detail'
        logger.info(logconfig.tbtype_invaliddetail)
        error_logger.error(logconfig.tbtype_invaliddetail)
    else:
        _tabletype = 'valid'
        logger.info(logconfig.tbtype_valid)
        error_logger.error(logconfig.tbtype_valid)

    log1 = logconfig.field_head % (database, count_table)
    logger.info(log1)
    # Iterate over the table names
    for name_db_tb_row in table_names:
        name_db_tb = name_db_tb_row[0]
        table_name = name_db_tb.replace(database + '_', '')

        # Set the Hive execution engine
        hive_cur = hive_conn.cursor()
        hive_cur.execute("set hive.execution.engine = spark")

        # Fields the table shares on both sides
        common_fields = get_common_field(hive_cur, petadata_cur, database,
                                         name_db_tb, _tabletype)
        log2 = 'Table: %s' % table_name
        logger.info(log2)
        error_logger.error(log2)

        # Log the row counts on both sides
        count_log = count_invalid(hive_conn, petadata_conn, database,
                                  name_db_tb, _tabletype)
        logger.info(count_log)

        # Log the schema header
        logger.info(logconfig.field_schema)
        error_logger.error(logconfig.fielderror_schema)

        # Start time
        time1 = time.time()
        # Query all tongid values
        hive_cur = hive_conn.cursor()
        hive_cur.execute("set hive.execution.engine = spark")

        if tabletype == TableType.INVALID:
            hive_cur.execute(fieldsql.hive_select_tongid_invalid %
                             (database, name_db_tb))
        elif tabletype == TableType.INVALID_DETAIL:
            hive_cur.execute(fieldsql.hive_select_tongid_invaliddetail %
                             (database, name_db_tb))
        else:
            hive_cur.execute(fieldsql.hive_select_tongid_valid %
                             (database, name_db_tb))

        tongid_rows = hive_cur.fetchall()
        hive_cur.close()

        error_row = 0
        # Iterate over the tongids
        for tongid_row in tongid_rows:
            tongid = tongid_row[0]
            if sample:
                # Sampling: keep only tongids that pass the filter
                if not tongid_filter(tongid, int(divisor), int(mod)):
                    continue
            log4 = 'tongid\t%s\t%s\tidentical and unique' % (tongid, tongid)
            logger.info("---------------- 1 row ------------------\n")
            logger.info(log4)

            sql_field = ', '.join(common_fields)
            # Fetch this tongid's record on both sides
            hive_cur = hive_conn.cursor()
            petadata_cur = petadata_conn.cursor()

            # Run the queries
            if tabletype == TableType.INVALID:
                hive_cur.execute(fieldsql.hive_invalid_sql %
                                 (sql_field, database, name_db_tb, tongid))
                petadata_cur.execute(fieldsql.peta_invalid_sql %
                                     (sql_field, name_db_tb, tongid))
            elif tabletype == TableType.INVALID_DETAIL:
                hive_cur.execute(fieldsql.hive_invalid_detail_sql %
                                 (sql_field, database, name_db_tb, tongid))
                petadata_cur.execute(fieldsql.peta_invalid_detail_sql %
                                     (sql_field, name_db_tb, tongid))
            else:
                hive_cur.execute(fieldsql.hive_valid_sql %
                                 (sql_field, database, name_db_tb, tongid))
                petadata_cur.execute(fieldsql.peta_valid_sql %
                                     (sql_field, name_db_tb, tongid))

            # Build field -> value dicts for this tongid's row on each side
            hive_row_dict = dict(zip(common_fields, hive_cur.fetchall()[0]))
            peta_row_dict = dict(zip(common_fields, petadata_cur.fetchall()[0]))

            # Compare field values
            success_count = 0
            error_count = 0
            for key in common_fields:
                hive_data = hive_row_dict[key]
                peta_data = peta_row_dict[key]
                if hive_data == str(peta_data):
                    log5 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data,
                                               'SUCCESS')
                    logger.info(log5)
                    success_count += 1
                else:
                    log6 = '%s\t%s\t%s\t%s' % (key, hive_data, peta_data,
                                               'ERROR')
                    error_logger.error(log6)
                    error_count += 1

            log7 = logconfig.field_result % (success_count, error_count)
            logger.info(log7)
            if error_count != 0:
                error_row += 1

        # End time
        time2 = time.time()
        total = time2 - time1
        log8 = logconfig.field_tail % (total, error_row)
        print log8
        logger.info(log8)
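
tongid_filter is not defined in this snippet. Since it is called as tongid_filter(tongid, int(divisor), int(mod)) to thin the id stream down to a deterministic sample, a plausible sketch (an assumption, not the repo's actual code) is:

# Hypothetical sampling helper: keep a tongid only when its hash falls in
# the requested residue class, i.e. roughly a 1-in-divisor sample.
def tongid_filter(tongid, divisor, mod):
    return hash(tongid) % divisor == mod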
Example #3
# -*- coding: utf-8 -*-
"""
Created on 2017/12/7.

@author: kesong
"""
import json

from db import connectPool
from config import dbconfig

conn_pgp_test = connectPool.Connects(
    dbconfig.pgptest_database).conn_mysql_test()
cur_pgp_test = conn_pgp_test.cursor()
query_distinct_depts = "select distinct dept_short_name from direct_info"
cur_pgp_test.execute(query_distinct_depts)
dept_rows = cur_pgp_test.fetchall()
dept_list = []
for row in dept_rows:
    dept_list.extend(row)
json_str = json.dumps(dept_list)
with open("files/depts.json", "w") as writer:
    writer.write(json_str)
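
A quick way to check the dump is to load it back; a minimal usage sketch (the department values shown are hypothetical):

import json

with open("files/depts.json") as reader:
    depts = json.load(reader)
print depts  # e.g. [u'deptA', u'deptB'] (hypothetical values)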
Example #4
    def exec_compare(database, ods_conn):
        # Start time
        start_time = time.time()
        hive_conn = HAConnection(hosts=dbconfig.ali_hosts,
                                 port=dbconfig.ali_port,
                                 authMechanism=dbconfig.ali_authMechanism,
                                 configuration=dbconfig.ali_conf,
                                 timeout=dbconfig.ali_timeout)
        # Connect to Hive
        conn = hive_conn.getConnection()

        # ODS cursor (the connection is supplied by the caller)
        cur_ods = ods_conn.cursor()

        # Connect to the pgp test database
        conn_pgp_test = connectPool.Connects(
            dbconfig.pgptest_database).conn_mysql_test()
        cur_pgp_test = conn_pgp_test.cursor()

        # Get logger objects
        logger = LogUtil().count_logger()
        error_logger = LogUtil().counterror_logger()
        logger.info(logconfig.count_ods_hive_title)
        error_logger.error(logconfig.count_ods_hive_title)

        # Query all table names under this department
        count = cur_pgp_test.execute(commonsql.need_clean_sql % database)
        log_head = logconfig.count_head % (database, count)
        logger.info(log_head)
        logger.info(logconfig.count_odshive_schema)
        error_logger.error(logconfig.counterror_odshive_schema)

        # All database/table names that need cleaning
        need_clean_tables = cur_pgp_test.fetchall()
        cur_pgp_test.close()
        conn_pgp_test.close()

        # Result counters
        success_count = 0
        error_count = 0
        for need_clean_table in need_clean_tables:
            db_tb_name = need_clean_table[0]
            table = db_tb_name.replace(database + '_', '')

            cur_hive = conn.cursor()

            # Prepare the count queries
            count_hive_sql = countsql.hive_ods % (database, db_tb_name)
            count_ods_sql = countsql.ods % table

            # Run the Hive query
            cur_hive.execute(count_hive_sql)
            count_hive = cur_hive.fetchall()[0][0]

            # Run the ODS query
            cur_ods.execute(count_ods_sql)
            count_ods = cur_ods.fetchall()[0][0]

            # Compare the counts
            if count_hive == count_ods:
                success_count += 1
                success_result = "%s_%s_ods\t%s\t%s" % (database, table,
                                                        count_hive, count_ods)
                logger.info(success_result)
            else:
                error_count += 1
                error_logger.error("%s_%s_ods\t%s\t%s" %
                                   (database, table, count_hive, count_ods))
            cur_hive.close()
        # End time
        end_time = time.time()
        # Total elapsed time
        total_time = end_time - start_time
        tail_log = logconfig.count_tail % (success_count, error_count,
                                           total_time)
        logger.info(tail_log)

        # Close connections
        cur_ods.close()
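
Unlike the other variants, this one receives its ODS connection from the caller. A hedged sketch of a driver loop (the department list and the conn_ods() factory are assumptions, standing in for however the repo actually opens ODS connections):

# Hypothetical driver for the method above.
for database in ['deptA', 'deptB']:  # hypothetical department list
    ods_conn = connectPool.Connects(database).conn_ods()  # assumed factory
    exec_compare(database, ods_conn)
    ods_conn.close()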
Example #5
def exec_compare():
    try:
        # Parse the department (database) and table-type arguments
        database = sys.argv[1]
        tabletype = sys.argv[2]
    except IndexError as err:
        print 'Missing arguments: database, tabletype; %s' % err
    else:
        # Start time
        start_time = time.time()

        # Huawei Hive
        hive_pool = HAConnection(hosts=dbconfig.huhive_hosts,
                                 port=dbconfig.huhive_port,
                                 authMechanism=dbconfig.huhive_authMechanism,
                                 configuration=dbconfig.huhive_conf,
                                 timeout=dbconfig.huhive_timeout)
        hive_conn = hive_pool.getConnection()

        # EMR Hive
        emr_pool = HAConnection(hosts=dbconfig.ali_hosts,
                                port=dbconfig.ali_port,
                                authMechanism=dbconfig.ali_authMechanism,
                                configuration=dbconfig.ali_conf,
                                timeout=dbconfig.ali_timeout)
        emr_conn = emr_pool.getConnection()

        # Connect to the pgp test database
        conn_pgp_test = connectPool.Connects(
            dbconfig.pgptest_database).conn_mysql_test()
        cur_pgp_test = conn_pgp_test.cursor()

        # Get logger objects
        logger = LogUtil().count_logger()
        error_logger = LogUtil().counterror_logger()
        logger.info(logconfig.count_emr_hive_title)
        error_logger.error(logconfig.count_emr_hive_title)

        # Query all table names under this department
        count = cur_pgp_test.execute(commonsql.need_clean_sql % database)
        log_head = logconfig.count_head % (database, count)
        logger.info(log_head)
        logger.info(logconfig.count_emrhive_schema)
        error_logger.error(logconfig.counterror_emrhive_schema)

        # All database/table names that need cleaning
        need_clean_tables = cur_pgp_test.fetchall()
        cur_pgp_test.close()
        conn_pgp_test.close()

        # Result counters
        success_count = 0
        error_count = 0
        for need_clean_table in need_clean_tables:
            db_tb_name = need_clean_table[0]
            table = db_tb_name.replace(database + '_', '')

            cur_hive = hive_conn.cursor()
            cur_emr = emr_conn.cursor()
            # Use Spark rather than MapReduce
            cur_hive.execute("set hive.execution.engine = spark")
            cur_emr.execute("set hive.execution.engine = spark")

            # Prepare the count query
            if tabletype == TableType.INVALID:
                count_hive_sql = countsql.hive_invalid_sql % (database,
                                                              db_tb_name)
            elif tabletype == TableType.INVALID_DETAIL:
                count_hive_sql = countsql.hive_invalid_detail_sql % (
                    database, db_tb_name)
            else:
                count_hive_sql = countsql.hive_valid_sql % (database,
                                                            db_tb_name)

            # Run the query on Huawei Hive
            cur_hive.execute(count_hive_sql)
            count_hive = cur_hive.fetchall()[0][0]

            # Run the same query on EMR Hive
            cur_emr.execute(count_hive_sql)
            count_emr = cur_emr.fetchall()[0][0]

            # Compare the counts
            if count_hive == count_emr:
                success_count += 1
                success_result = "%s_%s_%s\t%s\t%s" % (
                    database, table, tabletype, count_hive, count_emr)
                logger.info(success_result)
            else:
                error_count += 1
                error_logger.error(
                    "%s_%s_%s\t%s\t%s" %
                    (database, table, tabletype, count_hive, count_emr))
            cur_hive.close()
            cur_emr.close()
        # End time
        end_time = time.time()
        # Total elapsed time
        total_time = end_time - start_time
        tail_log = logconfig.count_tail % (success_count, error_count,
                                           total_time)
        logger.info(tail_log)
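
None of these snippets close their pooled Hive connections. pyhs2's HAConnection can also be used as a context manager (as in the library's own HA example), which closes everything on exit; a minimal sketch with the same config values:

from pyhs2.haconnection import HAConnection

# Context-manager form; connections and cursors are released automatically.
with HAConnection(hosts=dbconfig.huhive_hosts,
                  port=dbconfig.huhive_port,
                  authMechanism=dbconfig.huhive_authMechanism,
                  configuration=dbconfig.huhive_conf,
                  timeout=dbconfig.huhive_timeout) as ha_conn:
    with ha_conn.getConnection() as conn:
        with conn.cursor() as cur:
            cur.execute("select 1")
            print cur.fetchall()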
Example #6
def exec_compare():
    # Parse the department, table name, and table-type arguments
    try:
        database = sys.argv[1]
        tablename = sys.argv[2]
        tabletype = sys.argv[3]
    except IndexError:
        print 'Missing arguments: database, tablename, tabletype'
    else:
        try:
            # Start time
            start_time = time.time()
            hive_conn = HAConnection(
                hosts=dbconfig.huhive_hosts,
                port=dbconfig.huhive_port,
                authMechanism=dbconfig.huhive_authMechanism,
                configuration=dbconfig.huhive_conf,
                timeout=dbconfig.huhive_timeout)
            # Connect to Hive
            conn = hive_conn.getConnection()

            # Connect to the pgp database
            conn_pgp = connectPool.Connects(
                dbconfig.pgp_database).conn_mysql_pgp()
            cur_pgp = conn_pgp.cursor()

            # Get logger objects
            logger = LogUtil().count_logger()
            error_logger = LogUtil().counterror_logger()

            # Start the comparison
            cur_hive = conn.cursor()
            # Set the Hive execution engine
            cur_hive.execute("set hive.execution.engine = spark")

            # Prepare the count queries
            if tabletype == TableType.INVALID:
                count_hive_sql = countsql.hive_invalid_sql % (database,
                                                              tablename)
                count_pgp_sql = countsql.pgp_invalid_sql % tablename
            elif tabletype == TableType.INVALID_DETAIL:
                count_hive_sql = countsql.hive_invalid_detail_sql % (database,
                                                                     tablename)
                count_pgp_sql = countsql.pgp_invalid_detail_sql % tablename
            else:
                count_hive_sql = countsql.pgphive_valid_sql % (database,
                                                               tablename)
                count_pgp_sql = countsql.pgp_valid_sql % tablename

            # Run the Hive query
            cur_hive.execute(count_hive_sql)
            count_hive = cur_hive.fetchall()[0][0]

            # Run the pgp query
            cur_pgp.execute(count_pgp_sql)
            count_pgp = cur_pgp.fetchall()[0][0]

            # Compare the counts
            if count_hive == count_pgp:
                success_result = "%s\t%s\t%s" % (tablename, count_hive,
                                                 count_pgp)
                logger.info(success_result)
            else:
                error_logger.error("%s\t%s\t%s" %
                                   (tablename, count_hive, count_pgp))
            cur_hive.close()

            # Close connections
            cur_pgp.close()
            conn_pgp.close()
        except Pyhs2Exception as err:
            error_logger.error("表不存在: %s" % err)