def checkLength(db,tb,fd,inputLength,compType): """ 校验数据长度 :param db: 校验数据库 :param tb: 校验数据表 :param fd: 校验列 :param inputLength: 输入的期望长度 :param compType: 比较类型(>,<,=,<=,>=) :return: """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() sql = "SELECT "+fd+" FROM "+db+"."+tb columnInfo = pd.read_sql(sql, conn) # 返回每一条记录的字符串的长度,效率比for循环更高,可以加条件过滤、统计长度不符合要求的字符串数据 strInfo = columnInfo[fd].str lengthInfo = columnInfo[fd].str.len() if compType == "morethan": resRatio = lengthInfo[lengthInfo>int(inputLength)].shape[0]/lengthInfo.shape[0] elif compType == "lessthan": resRatio = lengthInfo[lengthInfo < int(inputLength)].shape[0] / lengthInfo.shape[0] elif compType == "equal": resRatio = lengthInfo[lengthInfo == int(inputLength)].shape[0] / lengthInfo.shape[0] return jsonify({"maxlengeth":max(lengthInfo),"minlengeth":min(lengthInfo),"accordRatio":resRatio})
def checkType(database,table,field): """ 列类型校验 :param database: 校验数据库 :param table: 校验数据表 :param field: 校验列 :return: 校验列在mysql中的存储类型 """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() db = database tb = table fd = field sql = "DESC "+db+"."+tb dfData = pd.read_sql(sql, conn) columnType = dfData[['Field','Type']] columnType.set_index(['Field'], inplace=True) # 获取单个或多个字段的数据类型 res = columnType.loc[fd.split(',')] res = res.to_dict() # 返回数据类型,长度,精度 return jsonify(res)
def checkPrecision(database,table,field): """ 精度校验,校验小数点位数(计量单位校验待定) :param database: 校验数据库 :param table: 校验数据表 :param field: 校验列 :return: 校验列的小数点位数 """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() db = database tb = table fd = field sql = "SHOW FULL COLUMNS FROM "+db+"."+tb columnInfo = pd.read_sql(sql, conn) # 获取校验列的精度信息,计量单位,小数位数 precisionInfo = columnInfo[['Field','Type','Comment']] # data为校验字段 xsws = precisionInfo['Type'][precisionInfo['Field']== fd] typeSplit = re.split(r"[',',')']", xsws.values[0]) if len(typeSplit) > 2: # decimalDigits为小数位数 _, decimalDigits, _ = typeSplit end_exec = time.clock() print("精度校验总用时: %s Seconds" % (end_exec - start_read)) return jsonify({fd:decimalDigits}) else: return "校验列没有小数位"
def checkDoubleCount(srcdb, srctb, srcfd, desdb, destb, desfd): """ 双表count校验 :param srcdb: 源数据库 :param srctb: 源数据表 :param srcfd: 源检验列 :param desdb: 目标数据库 :param destb: 目标数据表 :param desfd: 目标检验例 :return: """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() sql_tb1 = "SELECT count(" + srcfd + ") as f1 FROM " + srcdb + "." + srctb dfDataTb1 = pd.read_sql(sql_tb1, conn) print(dfDataTb1["f1"][0]) # numpy.int64数据类型 sql_tb2 = "SELECT count(" + desfd + ") as f2 FROM " + desdb + "." + destb dfDataTb2 = pd.read_sql(sql_tb2, conn) print(dfDataTb2["f2"][0]) if dfDataTb1["f1"][0] == dfDataTb2["f2"][0]: res = 'double count check success' else: res = 'double count check faild' return res
def checkDefaultValue(database, table, field): """ 评估列属性和数据在相同数据类型的字段默认值上的一致性 :param database: 校验数据库 :param table: 校验表 :param field: 校验列 :return: 返回数据库下所有表的相同数据类型的字段默认值是否一致 """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() db = database tb = table fd = field sql = "select table_name from information_schema.tables WHERE table_schema = " + "'" + db + "'" # 返回指定数据库下所有表名 dfData = pd.read_sql(sql, conn) allTableDefaultValue = {} for i in list(dfData['table_name']): sql = "DESC " + db + "." + i tbData = pd.read_sql(sql, conn) # 完全重复的类型与默认值组合会被合并 print(dict(zip(list(tbData['Type']), tbData['Default']))) allTableDefaultValue.setdefault(i, []).append( dict(zip(list(tbData['Type']), tbData['Default']))) return jsonify(allTableDefaultValue) # 访问示例:127.0.0.1:5000/conformity/checkDefaultValue/datagovernance/mytable/ff
def checkSingleDefaultValue(database, table, field): """ 一个字段默认值使用的一致性 :param database: 校验数据库 :param table: 校验数据表 :param field: 校验列 :return: """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() # 读取数据库开始时间 start_read = time.clock() db = database tb = table fd = field sql = "DESC " + db + "." + tb dfData = pd.read_sql(sql, conn) end_read = time.clock() print("一个字段默认值使用的一致性校验读取数据库时间: %s Seconds" % (end_read - start_read)) defaultDataListDict = dict( zip(list(dfData['Field']), list(dfData['Default']))) defaultValue = defaultDataListDict[fd] end_exec = time.clock() print("默认值一致性校验总用时: %s Seconds" % (end_exec - start_read)) return jsonify({fd: defaultValue})
def checkFormat(typeFormat,database,table,field): """ 邮箱、手机号码、身份证格式校验 :param typeFormat: 校验类型 :param database: 校验数据库 :param table: 校验数据表 :param field: 校验列 :return: """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() # 读取数据库开始时间 start_read = time.clock() db = database tb = table fd = field sql = "SELECT "+fd+" from "+db+"."+tb dfData = pd.read_sql(sql, conn, chunksize=20000) end_read = time.clock() print("邮箱校验读取数据库时间: %s Seconds" % (end_read - start_read)) email_num = phone_num = card_num = 0 # 检验该列所有数据的格式 if typeFormat == 'email': for df in dfData: for data in df[fd]: if email(data): email_num = email_num+1 end_exec = time.clock() print("邮箱校验总用时: %s Seconds" % (end_exec - start_read)) return jsonify({fd + " format correct num":int(email_num)}) elif typeFormat == 'phone': for df in dfData: for data in df[fd]: if checkPhone(str(data)): phone_num = phone_num + 1 end_exec = time.clock() print("电话校验总用时: %s Seconds" % (end_exec - start_read)) return jsonify({fd + " format correct num": int(phone_num)}) elif typeFormat == 'idCard': for df in dfData: for data in df[fd]: # checkRes = checkIdcard(data) # if checkRes != '': # print(checkRes) if data is not None: if checkIdcard(str(data)) == True: card_num = card_num + 1 end_exec = time.clock() print("身份证校验总用时: %s Seconds" % (end_exec - start_read)) return jsonify({fd + "format correct num": int(card_num)}) # 访问示例:127.0.0.1:5000/normalization/checkFormat/idCard/datagovernance/testdata/idcard # 127.0.0.1:5000/normalization/checkFormat/email/datagovernance/fakedata/emailinfo
def checkTimeliness(database, table, interval, inputvalue, comptype): """ 及时性 :param database: 校验数据库 :param table: 校验数据表 :param interval: 时间差,间隔计量单位, SECOND 秒 SECONDS MINUTE 分钟 MINUTES HOUR 时间 HOURS DAY 天 DAYS MONTH 月 MONTHS YEAR 年 YEARS :param inputvalue: 输入参考值 :param comptype: 比较方式,大于,大于等于,等于,小于,小于等于 :return: 符合输入数据条件的记录数占比 """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() # 读取数据库开始时间 start_read = time.clock() db = database tb = table interval = interval inputvalue = int(inputvalue) comptype = comptype print(inputvalue, comptype) # 生产环境需修改sql的条件语句 sql = "SELECT TIMESTAMPDIFF(" + interval + ",starttime,endtime) as difftime FROM " + db + "." + tb + " WHERE starttime IS NOT NULL" dfData = pd.read_sql(sql, conn) end_read = time.clock() print("及时性校验读取数据库时间: %s Seconds" % (end_read - start_read)) if comptype == "morethan": res = dfData.loc[ (dfData['difftime'] > inputvalue)].shape[0] / dfData.shape[0] elif comptype == "moreOrEqual": res = dfData.loc[ (dfData['difftime'] >= inputvalue)].shape[0] / dfData.shape[0] elif comptype == "Equal": res = dfData.loc[(dfData['difftime'] == inputvalue)].shape[0] / dfData.shape[0] elif comptype == "lessthan": res = dfData.loc[ (dfData['difftime'] < inputvalue)].shape[0] / dfData.shape[0] elif comptype == "lessOrEqual": res = dfData.loc[ (dfData['difftime'] <= inputvalue)].shape[0] / dfData.shape[0] end_exec = time.clock() print("及时性校验总用时: %s Seconds" % (end_exec - start_read)) return jsonify({"accordTimelinessRatio": res}) # 访问示例: 127.0.0.1:5000/timeliness/checkTimeliness/datagovernance/testdata/HOUR/3/morethan
def checkUnique(database, table, field): """ 唯一性校验 :param database: 校验数据库 :param table: 校验数据表 :param field: 校验列 :return: 校验列在mysql中的存储类型 """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() # 读取数据库开始时间 start_read = time.clock() db = database tb = table fd = field sql = "SELECT " + fd + " FROM " + db + "." + tb dfData = pd.read_sql(sql, conn) # 一次性读取,测试性能时可分批读取 end_read = time.clock() print("唯一值校验读取数据库时间: %s Seconds" % (end_read - start_read)) checkData = dfData.duplicated().value_counts() # 重复数据记录数 if True in checkData.index.values: duplicateNum = checkData[True] else: duplicateNum = 0 # 唯一数据记录数 if False in checkData.index.values: uniqueNum = checkData[False] else: uniqueNum = 0 # 重复数据占比 duplicateNumRatio = uniqueNum / (int(duplicateNum) + int(uniqueNum)) res = { "duplicateNum": int(duplicateNum), "uniqueNum": int(uniqueNum), "uniqueNumRatio": duplicateNumRatio } end_exec = time.clock() print("唯一值校验总用时: %s Seconds" % (end_exec - start_read)) return jsonify(res)
def checkMasterSlaveTable(masterDatabase, masterTable, masterField, slaveDatabase, slaveTable, slaveField): """ 关联数据的一致性 :param masterDatabase: 父表数据库 :param masterTable: 父表数据表 :param masterField: 父表校验列 :param slaveDatabase: 子表数据库 :param slaveTable: 子表数据表 :param slaveField: 子表校验列 :return: """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() mdb = masterDatabase mtb = masterTable mfd = masterField sdb = slaveDatabase stb = slaveTable sfd = slaveField sql_m = " SELECT " + mfd + " FROM " + mdb + "." + mtb mdfData = pd.read_sql(sql_m, conn) sql_s = " SELECT " + sfd + " FROM " + sdb + "." + stb sdfData = pd.read_sql(sql_s, conn) #此处采用循环实现,效率较低,待优化 # 无父记录的子记录的值 num = 0 for i in list(sdfData[sfd]): if i not in list(mdfData[mfd]): num = num + 1 # 无子记录的父记录的值 num2 = 0 for i in list(mdfData[mfd]): if i not in list(sdfData[sfd]): num2 = num2 + 1 return jsonify({"无父记录的子记录的值": num, "无子记录的父记录的值": num2})
def checkOutlier(funType,database,table,field): """ 离群值检查 :param database: 校验数据库 :param table: 校验数据表 :param field: 校验列 :return: """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() # 读取数据库开始时间 start_read = time.clock() db = database tb = table fd = field sql = "SELECT "+fd+ " FROM "+db+"."+tb dfData = pd.read_sql(sql, conn) end_read = time.clock() print("离群值校验读取数据库时间: %s Seconds" % (end_read - start_read)) dataMean = dfData.mean()[fd] dataStd = dfData.std()[fd] if funType == "3S": # 3S法则计算:当某一测量数据与其测量结果的算术平均值之差大于3倍标准偏差时,则该检查数据不符合规则。 res = np.abs(np.array(dfData.get(fd))-dataMean)-3*dataStd # 尝试使用numpy优化 outlierNum = len([d for d in res if d > 0]) end_exec = time.clock() print("离群值校验总用时: %s Seconds" % (end_exec - start_read)) return jsonify({"outlierNum": str(outlierNum)}) # elif funType == "glbs": # # 格鲁布斯法:计算公式为T = | X质疑—X平均 | / S,其中,S为这组数据的标准差(?需要对比临界值表中的T(待定)) # Tdata = np.abs(np.array(dfData.get(fd)) - dataMean)/dataStd # # 表达式中的T为临界值表中的T # outlierNum = len([d for d in Tdata if d-T > 0]) # pass
def checkPri(database,table,field): """ 单字段主键校验 :param database: 校验数据库名称 :param table: 校验数据表 :param field: 校验列 :return: 返回主键校验结果 """ # 读取数据库开始时间 start_read = time.clock() # 从连接池获取数据库连接 print("获取数据库连接") db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() db = database tb = table fd = field sql = "DESC "+db+"."+tb # print(sql) columnInfo = pd.read_sql(sql, conn) end_read = time.clock() print("单字段主键校验读取数据库时间: %s Seconds" % (end_read - start_read)) # 获取某个字段的数据主键信息 isPri, = columnInfo['Key'][columnInfo['Field'] == fd].values # print(isPri) if isPri == 'PRI': end_exec = time.clock() print("单字段主键校验总用时: %s Seconds" % (end_exec - start_read)) return jsonify({fd:isPri}) else: return jsonify({fd:"NOT PRI"})
def checkUnionPri(database,table,field): """ 联合主键校验 :param database: 校验数据库 :param table: 校验数据表 :param field: 校验列 :return: 返回联合主键校验结果 """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() db = database tb = table fd = field sql = "DESC "+db+"."+tb columnInfo = pd.read_sql(sql, conn) # 获取检验列联合主键信息 uPriInfo = columnInfo[['Field','Type']][columnInfo['Key']=='PRI'] uPriList = list(uPriInfo['Field']) # 是否需要返回联合主键列表 if fd in uPriList and len(uPriList)>1: return jsonify({fd:"unionPri"}) else: return jsonify({fd:"NOT unionPri"})
def checkValueRange(rangeType,minValue,maxValue,database,table,field): """ 值域校验 :param rangeType: 校验的值域类型,数值型(取值范围)or字符型(枚举区间)or 时间范围 :param minValue: 下限 :param maxValue: 上限 :param database: 校验数据库 :param table: 校验数据表 :param field: 校验列 :return: """ # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() # 读取数据库开始时间 start_read = time.clock() db = database tb = table fd = field sqlCount = "SELECT COUNT(*) as recordCount FROM " + db + "." + tb CountNum = pd.read_sql(sqlCount,conn) end_read = time.clock() print("列类型校验读取数据库时间: %s Seconds" % (end_read - start_read)) recordCount = CountNum.get('recordCount').values[0] accordValueRangeNum = 0 sql = "SELECT " + fd + " from " + db + "." + tb dfData = pd.read_sql(sql, conn, chunksize=20000) # print(minValue,maxValue) # 检验该列数据的值域 if rangeType == 'valueRange': for df in dfData: for d in df[fd]: # if between(d, min=np.float(minValue), max=np.float(maxValue)):# 校验的效率比较低 if np.float(minValue)<d<np.float(maxValue):# 校验的效率比较低 accordValueRangeNum = accordValueRangeNum + 1 # return jsonify({"accordValueRangeRatio": accordValueRangeNum / recordCount}) # 字符枚举 elif rangeType == 'charRange': for df in dfData: for d in df[fd]: if d in [minValue, maxValue]: accordValueRangeNum = accordValueRangeNum + 1 # return jsonify({"accordValueRangeRatio": accordValueRangeNum/recordCount}) # 时间区间 elif rangeType == 'dateRange': mindate = datetime.strptime(minValue, '%Y-%m-%d %H:%M:%S') maxdate = datetime.strptime(maxValue, '%Y-%m-%d %H:%M:%S') for df in dfData: for d in df[fd]: if mindate < d < maxdate: accordValueRangeNum = accordValueRangeNum + 1 # return jsonify({"accordValueRangeRatio": accordValueRangeNum/recordCount}) end_exec = time.clock() print("列类型校验总用时: %s Seconds" % (end_exec - start_read)) return jsonify({"accordValueRangeRatio": accordValueRangeNum / recordCount})
及时性: 1.基于时间点的及时性(finished) a)基于时间戳的记录数、频率分布或延迟时间符合业务需求的程度 b)原始记录产生的时间(业务时间),处理完数据的时间(处理时间),做差和阈值比较,阈值需给定或者与历史值比价(环比,持续测量) c)计算出的时间差值与输入值进行比较,返回符合的记录占比 """ import pymysql import time import numpy as np import pandas as pd from flask import Blueprint, jsonify from DQApp.dbPool import dbConnect timeliness = Blueprint('timeliness', __name__) # 从连接池获取数据库连接 db_pool = dbConnect.get_db_pool(False) conn = db_pool.connection() @timeliness.route( '/checkTimeliness/<database>/<table>/<interval>/<inputvalue>/<comptype>', methods=['GET', 'POST']) def checkTimeliness(database, table, interval, inputvalue, comptype): """ 及时性 :param database: 校验数据库 :param table: 校验数据表 :param interval: 时间差,间隔计量单位, SECOND 秒 SECONDS MINUTE 分钟 MINUTES HOUR 时间 HOURS