Exemple #1
0
def checkLength(db,tb,fd,inputLength,compType):
    """
    校验数据长度
    :param db: 校验数据库
    :param tb: 校验数据表
    :param fd: 校验列
    :param inputLength: 输入的期望长度
    :param compType: 比较类型(>,<,=,<=,>=)
    :return:
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()
    sql = "SELECT "+fd+" FROM "+db+"."+tb
    columnInfo = pd.read_sql(sql, conn)
    # 返回每一条记录的字符串的长度,效率比for循环更高,可以加条件过滤、统计长度不符合要求的字符串数据
    strInfo = columnInfo[fd].str
    lengthInfo = columnInfo[fd].str.len()
    if compType == "morethan":
        resRatio = lengthInfo[lengthInfo>int(inputLength)].shape[0]/lengthInfo.shape[0]
    elif compType == "lessthan":
        resRatio = lengthInfo[lengthInfo < int(inputLength)].shape[0] / lengthInfo.shape[0]
    elif compType == "equal":
        resRatio = lengthInfo[lengthInfo == int(inputLength)].shape[0] / lengthInfo.shape[0]

    return jsonify({"maxlengeth":max(lengthInfo),"minlengeth":min(lengthInfo),"accordRatio":resRatio})
Exemple #2
0
def checkType(database,table,field):
    """
    列类型校验
    :param database: 校验数据库
    :param table: 校验数据表
    :param field: 校验列
    :return: 校验列在mysql中的存储类型
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()

    db = database
    tb = table
    fd = field
    sql = "DESC "+db+"."+tb
    dfData = pd.read_sql(sql, conn)

    columnType = dfData[['Field','Type']]
    columnType.set_index(['Field'], inplace=True)
    # 获取单个或多个字段的数据类型
    res = columnType.loc[fd.split(',')]
    res = res.to_dict()
    # 返回数据类型,长度,精度
    return jsonify(res)
Exemple #3
0
def checkPrecision(database,table,field):
    """
    精度校验,校验小数点位数(计量单位校验待定)
    :param database: 校验数据库
    :param table: 校验数据表
    :param field: 校验列
    :return: 校验列的小数点位数
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()

    db = database
    tb = table
    fd = field
    sql = "SHOW FULL COLUMNS FROM "+db+"."+tb
    columnInfo = pd.read_sql(sql, conn)

    # 获取校验列的精度信息,计量单位,小数位数
    precisionInfo = columnInfo[['Field','Type','Comment']]
    # data为校验字段
    xsws = precisionInfo['Type'][precisionInfo['Field']== fd]
    typeSplit = re.split(r"[',',')']", xsws.values[0])
    if len(typeSplit) > 2:
        # decimalDigits为小数位数
        _, decimalDigits, _ = typeSplit

        end_exec = time.clock()
        print("精度校验总用时: %s Seconds" % (end_exec - start_read))
        return jsonify({fd:decimalDigits})
    else:
        return "校验列没有小数位"
Exemple #4
0
def checkDoubleCount(srcdb, srctb, srcfd, desdb, destb, desfd):
    """
    双表count校验
    :param srcdb: 源数据库
    :param srctb: 源数据表
    :param srcfd: 源检验列
    :param desdb: 目标数据库
    :param destb: 目标数据表
    :param desfd: 目标检验例
    :return:
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()
    sql_tb1 = "SELECT count(" + srcfd + ") as f1 FROM " + srcdb + "." + srctb
    dfDataTb1 = pd.read_sql(sql_tb1, conn)
    print(dfDataTb1["f1"][0])  # numpy.int64数据类型
    sql_tb2 = "SELECT count(" + desfd + ") as f2 FROM " + desdb + "." + destb
    dfDataTb2 = pd.read_sql(sql_tb2, conn)
    print(dfDataTb2["f2"][0])

    if dfDataTb1["f1"][0] == dfDataTb2["f2"][0]:
        res = 'double count check success'
    else:
        res = 'double count check faild'
    return res
Exemple #5
0
def checkDefaultValue(database, table, field):
    """
    评估列属性和数据在相同数据类型的字段默认值上的一致性
    :param database: 校验数据库
    :param table: 校验表
    :param field: 校验列
    :return: 返回数据库下所有表的相同数据类型的字段默认值是否一致
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()
    db = database
    tb = table
    fd = field
    sql = "select table_name from information_schema.tables WHERE table_schema = " + "'" + db + "'"
    # 返回指定数据库下所有表名
    dfData = pd.read_sql(sql, conn)
    allTableDefaultValue = {}
    for i in list(dfData['table_name']):
        sql = "DESC " + db + "." + i
        tbData = pd.read_sql(sql, conn)
        # 完全重复的类型与默认值组合会被合并
        print(dict(zip(list(tbData['Type']), tbData['Default'])))
        allTableDefaultValue.setdefault(i, []).append(
            dict(zip(list(tbData['Type']), tbData['Default'])))
    return jsonify(allTableDefaultValue)


# 访问示例:127.0.0.1:5000/conformity/checkDefaultValue/datagovernance/mytable/ff
Exemple #6
0
def checkSingleDefaultValue(database, table, field):
    """
    一个字段默认值使用的一致性
    :param database: 校验数据库
    :param table: 校验数据表
    :param field: 校验列
    :return:
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()
    # 读取数据库开始时间
    start_read = time.clock()

    db = database
    tb = table
    fd = field
    sql = "DESC " + db + "." + tb
    dfData = pd.read_sql(sql, conn)

    end_read = time.clock()
    print("一个字段默认值使用的一致性校验读取数据库时间: %s Seconds" % (end_read - start_read))

    defaultDataListDict = dict(
        zip(list(dfData['Field']), list(dfData['Default'])))
    defaultValue = defaultDataListDict[fd]

    end_exec = time.clock()
    print("默认值一致性校验总用时: %s Seconds" % (end_exec - start_read))
    return jsonify({fd: defaultValue})
Exemple #7
0
def checkFormat(typeFormat,database,table,field):
    """
    邮箱、手机号码、身份证格式校验
    :param typeFormat: 校验类型
    :param database: 校验数据库
    :param table: 校验数据表
    :param field: 校验列
    :return: 
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()
    # 读取数据库开始时间
    start_read = time.clock()


    db = database
    tb = table
    fd = field
    sql = "SELECT "+fd+" from "+db+"."+tb
    dfData = pd.read_sql(sql, conn, chunksize=20000)

    end_read = time.clock()
    print("邮箱校验读取数据库时间: %s Seconds" % (end_read - start_read))


    email_num = phone_num = card_num = 0
    # 检验该列所有数据的格式
    if typeFormat == 'email':
        for df in dfData:
            for data in df[fd]:
                if email(data):
                    email_num = email_num+1
        end_exec = time.clock()
        print("邮箱校验总用时: %s Seconds" % (end_exec - start_read))
        return jsonify({fd + " format correct num":int(email_num)})
    elif typeFormat == 'phone':
        for df in dfData:
            for data in df[fd]:
                if checkPhone(str(data)):
                    phone_num = phone_num + 1

        end_exec = time.clock()
        print("电话校验总用时: %s Seconds" % (end_exec - start_read))
        return jsonify({fd + " format correct num": int(phone_num)})
    elif typeFormat == 'idCard':
        for df in dfData:
            for data in df[fd]:
                # checkRes = checkIdcard(data)
                # if checkRes != '':
                #     print(checkRes)
                if data is not None:
                    if checkIdcard(str(data)) == True:
                        card_num = card_num + 1
        end_exec = time.clock()
        print("身份证校验总用时: %s Seconds" % (end_exec - start_read))
        return jsonify({fd + "format correct num": int(card_num)})
# 访问示例:127.0.0.1:5000/normalization/checkFormat/idCard/datagovernance/testdata/idcard
# 127.0.0.1:5000/normalization/checkFormat/email/datagovernance/fakedata/emailinfo
Exemple #8
0
def checkTimeliness(database, table, interval, inputvalue, comptype):
    """
    及时性
    :param database: 校验数据库
    :param table: 校验数据表
    :param interval: 时间差,间隔计量单位,
    SECOND 秒 SECONDS
    MINUTE 分钟 MINUTES
    HOUR 时间 HOURS
    DAY 天 DAYS
    MONTH 月 MONTHS
    YEAR 年 YEARS
    :param inputvalue: 输入参考值
    :param comptype: 比较方式,大于,大于等于,等于,小于,小于等于
    :return: 符合输入数据条件的记录数占比
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()
    # 读取数据库开始时间
    start_read = time.clock()

    db = database
    tb = table
    interval = interval
    inputvalue = int(inputvalue)
    comptype = comptype
    print(inputvalue, comptype)

    # 生产环境需修改sql的条件语句
    sql = "SELECT TIMESTAMPDIFF(" + interval + ",starttime,endtime) as difftime FROM " + db + "." + tb + " WHERE starttime IS NOT NULL"
    dfData = pd.read_sql(sql, conn)
    end_read = time.clock()
    print("及时性校验读取数据库时间: %s Seconds" % (end_read - start_read))

    if comptype == "morethan":
        res = dfData.loc[
            (dfData['difftime'] > inputvalue)].shape[0] / dfData.shape[0]
    elif comptype == "moreOrEqual":
        res = dfData.loc[
            (dfData['difftime'] >= inputvalue)].shape[0] / dfData.shape[0]
    elif comptype == "Equal":
        res = dfData.loc[(dfData['difftime']
                          == inputvalue)].shape[0] / dfData.shape[0]
    elif comptype == "lessthan":
        res = dfData.loc[
            (dfData['difftime'] < inputvalue)].shape[0] / dfData.shape[0]
    elif comptype == "lessOrEqual":
        res = dfData.loc[
            (dfData['difftime'] <= inputvalue)].shape[0] / dfData.shape[0]

    end_exec = time.clock()
    print("及时性校验总用时: %s Seconds" % (end_exec - start_read))
    return jsonify({"accordTimelinessRatio": res})


# 访问示例: 127.0.0.1:5000/timeliness/checkTimeliness/datagovernance/testdata/HOUR/3/morethan
Exemple #9
0
def checkUnique(database, table, field):
    """
    唯一性校验
    :param database: 校验数据库
    :param table: 校验数据表
    :param field: 校验列
    :return: 校验列在mysql中的存储类型
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()
    # 读取数据库开始时间
    start_read = time.clock()

    db = database
    tb = table
    fd = field
    sql = "SELECT " + fd + " FROM " + db + "." + tb
    dfData = pd.read_sql(sql, conn)  # 一次性读取,测试性能时可分批读取
    end_read = time.clock()
    print("唯一值校验读取数据库时间: %s Seconds" % (end_read - start_read))

    checkData = dfData.duplicated().value_counts()
    # 重复数据记录数
    if True in checkData.index.values:
        duplicateNum = checkData[True]
    else:
        duplicateNum = 0
    # 唯一数据记录数
    if False in checkData.index.values:
        uniqueNum = checkData[False]
    else:
        uniqueNum = 0
    # 重复数据占比
    duplicateNumRatio = uniqueNum / (int(duplicateNum) + int(uniqueNum))
    res = {
        "duplicateNum": int(duplicateNum),
        "uniqueNum": int(uniqueNum),
        "uniqueNumRatio": duplicateNumRatio
    }

    end_exec = time.clock()
    print("唯一值校验总用时: %s Seconds" % (end_exec - start_read))
    return jsonify(res)
Exemple #10
0
def checkMasterSlaveTable(masterDatabase, masterTable, masterField,
                          slaveDatabase, slaveTable, slaveField):
    """
    关联数据的一致性
    :param masterDatabase: 父表数据库
    :param masterTable: 父表数据表
    :param masterField: 父表校验列
    :param slaveDatabase: 子表数据库
    :param slaveTable: 子表数据表
    :param slaveField: 子表校验列
    :return:
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()

    mdb = masterDatabase
    mtb = masterTable
    mfd = masterField
    sdb = slaveDatabase
    stb = slaveTable
    sfd = slaveField

    sql_m = " SELECT " + mfd + " FROM " + mdb + "." + mtb
    mdfData = pd.read_sql(sql_m, conn)

    sql_s = " SELECT " + sfd + " FROM " + sdb + "." + stb
    sdfData = pd.read_sql(sql_s, conn)

    #此处采用循环实现,效率较低,待优化
    # 无父记录的子记录的值
    num = 0
    for i in list(sdfData[sfd]):
        if i not in list(mdfData[mfd]):
            num = num + 1

    # 无子记录的父记录的值
    num2 = 0
    for i in list(mdfData[mfd]):
        if i not in list(sdfData[sfd]):
            num2 = num2 + 1

    return jsonify({"无父记录的子记录的值": num, "无子记录的父记录的值": num2})
Exemple #11
0
def checkOutlier(funType,database,table,field):
    """
    离群值检查
    :param database: 校验数据库
    :param table: 校验数据表
    :param field: 校验列
    :return:
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()
    # 读取数据库开始时间
    start_read = time.clock()
    db = database
    tb = table
    fd = field
    sql = "SELECT "+fd+ " FROM "+db+"."+tb
    dfData = pd.read_sql(sql, conn)
    end_read = time.clock()
    print("离群值校验读取数据库时间: %s Seconds" % (end_read - start_read))

    dataMean = dfData.mean()[fd]
    dataStd = dfData.std()[fd]
    if funType == "3S":
        # 3S法则计算:当某一测量数据与其测量结果的算术平均值之差大于3倍标准偏差时,则该检查数据不符合规则。
        res = np.abs(np.array(dfData.get(fd))-dataMean)-3*dataStd
        # 尝试使用numpy优化
        outlierNum = len([d for d in res if d > 0])

        end_exec = time.clock()
        print("离群值校验总用时: %s Seconds" % (end_exec - start_read))
        return jsonify({"outlierNum": str(outlierNum)})
    # elif funType == "glbs":
    #     # 格鲁布斯法:计算公式为T = | X质疑—X平均 | / S,其中,S为这组数据的标准差(?需要对比临界值表中的T(待定))
    #     Tdata = np.abs(np.array(dfData.get(fd)) - dataMean)/dataStd
    #     # 表达式中的T为临界值表中的T
    #     outlierNum = len([d for d in Tdata if d-T > 0])
    #     pass
Exemple #12
0
def checkPri(database,table,field):
    """
    单字段主键校验
    :param database: 校验数据库名称
    :param table: 校验数据表
    :param field: 校验列
    :return: 返回主键校验结果
    """
    # 读取数据库开始时间
    start_read = time.clock()

    # 从连接池获取数据库连接
    print("获取数据库连接")
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()

    db = database
    tb = table
    fd = field
    sql = "DESC "+db+"."+tb
    # print(sql)
    columnInfo = pd.read_sql(sql, conn)

    end_read = time.clock()
    print("单字段主键校验读取数据库时间: %s Seconds" % (end_read - start_read))

    # 获取某个字段的数据主键信息
    isPri, = columnInfo['Key'][columnInfo['Field'] == fd].values
    # print(isPri)
    if isPri == 'PRI':

        end_exec = time.clock()
        print("单字段主键校验总用时: %s Seconds" % (end_exec - start_read))

        return jsonify({fd:isPri})
    else:
        return jsonify({fd:"NOT PRI"})
Exemple #13
0
def checkUnionPri(database,table,field):
    """
    联合主键校验
    :param database: 校验数据库
    :param table: 校验数据表
    :param field: 校验列
    :return: 返回联合主键校验结果
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()
    db = database
    tb = table
    fd = field
    sql = "DESC "+db+"."+tb
    columnInfo = pd.read_sql(sql, conn)
    # 获取检验列联合主键信息
    uPriInfo = columnInfo[['Field','Type']][columnInfo['Key']=='PRI']
    uPriList = list(uPriInfo['Field'])
    # 是否需要返回联合主键列表
    if fd in uPriList and len(uPriList)>1:
        return jsonify({fd:"unionPri"})
    else:
        return jsonify({fd:"NOT unionPri"})
Exemple #14
0
def checkValueRange(rangeType,minValue,maxValue,database,table,field):
    """
    值域校验
    :param rangeType: 校验的值域类型,数值型(取值范围)or字符型(枚举区间)or 时间范围
    :param minValue: 下限
    :param maxValue: 上限
    :param database: 校验数据库
    :param table: 校验数据表
    :param field: 校验列
    :return:
    """
    # 从连接池获取数据库连接
    db_pool = dbConnect.get_db_pool(False)
    conn = db_pool.connection()
    # 读取数据库开始时间
    start_read = time.clock()

    db = database
    tb = table
    fd = field

    sqlCount = "SELECT COUNT(*) as recordCount FROM " + db + "." + tb
    CountNum = pd.read_sql(sqlCount,conn)

    end_read = time.clock()
    print("列类型校验读取数据库时间: %s Seconds" % (end_read - start_read))


    recordCount = CountNum.get('recordCount').values[0]
    accordValueRangeNum = 0
    sql = "SELECT " + fd + " from " + db + "." + tb
    dfData = pd.read_sql(sql, conn, chunksize=20000)
    # print(minValue,maxValue)

    # 检验该列数据的值域
    if rangeType == 'valueRange':
        for df in dfData:
            for d in df[fd]:
                # if between(d, min=np.float(minValue), max=np.float(maxValue)):# 校验的效率比较低
                if np.float(minValue)<d<np.float(maxValue):# 校验的效率比较低
                    accordValueRangeNum = accordValueRangeNum + 1
        # return jsonify({"accordValueRangeRatio": accordValueRangeNum / recordCount})
    # 字符枚举
    elif rangeType == 'charRange':
        for df in dfData:
            for d in df[fd]:
                if d in [minValue, maxValue]:
                    accordValueRangeNum = accordValueRangeNum + 1
        # return jsonify({"accordValueRangeRatio": accordValueRangeNum/recordCount})
    # 时间区间
    elif rangeType == 'dateRange':
        mindate = datetime.strptime(minValue, '%Y-%m-%d %H:%M:%S')
        maxdate = datetime.strptime(maxValue, '%Y-%m-%d %H:%M:%S')
        for df in dfData:
            for d in df[fd]:
                if mindate < d < maxdate:
                    accordValueRangeNum = accordValueRangeNum + 1
        # return jsonify({"accordValueRangeRatio": accordValueRangeNum/recordCount})

    end_exec = time.clock()
    print("列类型校验总用时: %s Seconds" % (end_exec - start_read))
    return jsonify({"accordValueRangeRatio": accordValueRangeNum / recordCount})
Exemple #15
0
及时性:
1.基于时间点的及时性(finished)
    a)基于时间戳的记录数、频率分布或延迟时间符合业务需求的程度
    b)原始记录产生的时间(业务时间),处理完数据的时间(处理时间),做差和阈值比较,阈值需给定或者与历史值比价(环比,持续测量)
    c)计算出的时间差值与输入值进行比较,返回符合的记录占比
"""
import pymysql
import time
import numpy as np
import pandas as pd
from flask import Blueprint, jsonify
from DQApp.dbPool import dbConnect

timeliness = Blueprint('timeliness', __name__)
# 从连接池获取数据库连接
db_pool = dbConnect.get_db_pool(False)
conn = db_pool.connection()


@timeliness.route(
    '/checkTimeliness/<database>/<table>/<interval>/<inputvalue>/<comptype>',
    methods=['GET', 'POST'])
def checkTimeliness(database, table, interval, inputvalue, comptype):
    """
    及时性
    :param database: 校验数据库
    :param table: 校验数据表
    :param interval: 时间差,间隔计量单位,
    SECOND 秒 SECONDS
    MINUTE 分钟 MINUTES
    HOUR 时间 HOURS