def do_validation(data):
    """Validate the ``ch1`` and ``ch2`` columns of *data* and report problems.

    Both columns must pass ``check_decimal`` and must not be null.  Every
    validation failure is printed, one per line.

    Args:
        data: the frame handed to ``schema.validate``.

    Returns:
        ``True`` when the schema reports no errors, ``False`` otherwise.
    """
    # Local import so the block does not depend on a module-level ``pd``.
    import pandas as pd

    decimal_validation = [
        CustomElementValidation(check_decimal, 'is not decimal'),
    ]
    # ``pd.notnull`` is used instead of ``d is not np.nan``: an identity test
    # only matches the ``np.nan`` singleton and lets other NaN objects (and
    # ``None``) through as if they were real values.
    null_validation = [
        CustomElementValidation(pd.notnull, 'this field cannot be null'),
    ]

    schema = pandas_schema.Schema([
        Column('ch1', decimal_validation + null_validation),
        Column('ch2', decimal_validation + null_validation),
    ])

    errors = schema.validate(data)
    if not errors:
        return True

    for error in errors:
        print("Error on line ", error.row, " for ", error.column,
              " : ", error.value, " ", error.message)
    return False
def __init__(self, dataframe):
    """Store *dataframe* and build the reusable validation lists.

    Args:
        dataframe: the object kept on ``self.dataframe`` for later use.

    Attributes set here:
        format_checker: maps a format name ("int", "decimal", "email") to
            the callable used to check it.
        decimal_validation, null_validation, email_validation,
        int_validation: one-element lists of ``CustomElementValidation``.
    """
    super(ValidateDataframe, self).__init__()
    self.dataframe = dataframe
    self.format_checker = {
        "int": int,
        "decimal": Decimal,
        "email": validate_email,
    }

    self.decimal_validation = [
        CustomElementValidation(
            lambda d: self.check_format("decimal", d), 'is not decimal'),
    ]
    # ``d not in [np.nan, ...]`` compares by identity/equality, and NaN is
    # never equal to itself, so a NaN that is not the ``np.nan`` singleton
    # was accepted as non-null.  ``pd.isnull`` covers NaN, NaT and None.
    self.null_validation = [
        CustomElementValidation(
            lambda d: not (pd.isnull(d) or d == ""), 'field cannot be null'),
    ]
    self.email_validation = [
        CustomElementValidation(
            lambda e: self.check_format("email", e), 'email not valid'),
    ]
    self.int_validation = [
        CustomElementValidation(lambda i: check_int(i), 'is not integer'),
    ]
def do_validation():
    """Validate ``data.csv`` and split it into clean rows and an error report.

    Reads ``data.csv``, validates it against a fixed schema, drops every row
    that produced at least one validation error, then writes the errors to
    ``errors.csv`` and the remaining rows to ``clean_data.csv``.
    """
    # read the data
    data = pd.read_csv('data.csv')

    # define validation elements
    decimal_validation = [
        CustomElementValidation(check_decimal, 'is not decimal'),
    ]
    int_validation = [
        CustomElementValidation(check_int, 'is not integer'),
    ]
    # ``pd.notnull`` replaces ``d is not np.nan``: the identity test only
    # matches the ``np.nan`` singleton, so NaN values boxed from a float
    # column (and ``None``) slipped through as non-null.
    null_validation = [
        CustomElementValidation(pd.notnull, 'this field cannot be null'),
    ]

    # define validation schema: dec1 is mandatory, dec2..dec7 are optional
    # decimals, and the three id columns are mandatory integers.
    columns = [Column('dec1', decimal_validation + null_validation)]
    columns += [
        Column('dec%d' % n, decimal_validation) for n in range(2, 8)
    ]
    columns += [
        Column(name, int_validation + null_validation)
        for name in ('company_id', 'currency_id', 'country_id')
    ]
    schema = pandas_schema.Schema(columns)

    # apply validation
    errors = schema.validate(data)
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)

    # save data
    pd.DataFrame({'col': errors}).to_csv('errors.csv')
    data_clean.to_csv('clean_data.csv')
def validate(data):
    """Validate the numeric budget columns of *data* and locate the failures.

    A cell in a numeric column is accepted when ``check_decimal`` accepts it
    or when its string form contains ``'nan'``.  The ``RevExp`` and
    ``comments`` columns are declared without any validation.

    Args:
        data: the frame handed to ``schema.validate``.

    Returns:
        A dict ``{"row": [...], "column": [...]}`` with the row and column of
        every validation error, in the order the schema reported them.  The
        same dict is also printed.
    """
    # NOTE: an earlier, commented-out revision of this schema used the long
    # spreadsheet headers ('LHIN Program: Revenue & Expenses', 'Budget',
    # 'Budget Adjustments', ...) in the same column positions.
    numeric_or_nan = [
        CustomElementValidation(lambda d: check_decimal(d), 'invalideted')
        | CustomElementValidation(lambda d: 'nan' in str(d), "")
    ]
    numeric_columns = (
        'budget', 'budgetA', 'total', 'YTDA', 'Q4F', 'Q4FTB', 'Q4FTBP',
    )

    schema = pandas_schema.Schema(
        [Column('RevExp')]
        + [Column(name, numeric_or_nan) for name in numeric_columns]
        + [Column('comments')]
    )

    errors = schema.validate(data)
    errors_index = {
        "row": [e.row for e in errors],
        "column": [e.column for e in errors],
    }
    print(errors_index)
    return errors_index
from pandas_schema import Column, Schema from pandas_schema.validation import ( LeadingWhitespaceValidation, TrailingWhitespaceValidation, CanConvertValidation, InListValidation, CustomElementValidation, ) EmptyStringValidation = CustomElementValidation(lambda d: d != "", "This field cannot be empty") nipt_results_schema = Schema([ Column("SampleID", [TrailingWhitespaceValidation(), EmptyStringValidation]), Column("SampleType", []), Column("Description", []), Column("SampleProject", [TrailingWhitespaceValidation(), EmptyStringValidation]), Column("Index1", []), Column("Index2", []), Column("Library_nM", []), Column("QCFlag", []), Column("Zscore_13", [CanConvertValidation(float)]), Column("Zscore_18", [CanConvertValidation(float)]), Column("Zscore_21", [CanConvertValidation(float)]), Column("Zscore_X", [CanConvertValidation(float)]), Column("Ratio_13", [CanConvertValidation(float)]), Column("Ratio_18", [CanConvertValidation(float)]), Column("Ratio_21", [CanConvertValidation(float)]), Column("Ratio_X", [CanConvertValidation(float)]),
return (len(pincode) == 6) and (pincode.isnumeric()) except: return False def check_dob(date_age): try: date_age = str(date_age) return re.match(r"[A-Za-z0-9!@#$%\\&\*\.\,\+-_\s]+", date_age).group() == date_age except: return False str_validation = [ CustomElementValidation(lambda d: check_str(d), 'invalid String') ] mob_validation = [ CustomElementValidation(lambda d: check_mob_number(d), 'invalid mobile number') ] pincode_validation = [ CustomElementValidation(lambda d: check_pincode(d), 'invalid pincode') ] null_validation = [ CustomElementValidation(lambda d: d is not np.nan, 'this field cannot be null') ] dob_validation = [ CustomElementValidation(lambda d: check_dob(d), 'either date or age is not valid')
if dec >10 or dec <-10: return False except : return False return True def check_int(num): try: int(num) except ValueError: return False return True decimal_validation = [CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')] int_validation = [CustomElementValidation(lambda i: check_int(i), 'is not integer')] null_validation = [CustomElementValidation(lambda d: d is not np.nan, 'this field cannot be null')] bool_validation = [CustomElementValidation(lambda d: d in ['False','True'], 'is not bool')] schema = pandas_schema.Schema([ Column('id', int_validation ), Column('y', ), Column('x1',decimal_validation ), Column('x2',decimal_validation ), Column('x3',decimal_validation ), Column('x4', decimal_validation), Column('x5', bool_validation ), Column('x6', ), Column('x7', decimal_validation ), Column('x8',decimal_validation ),
Decimal(dec) except InvalidOperation: return False return True def check_int(num): try: int(num) except ValueError: return False return True decimal_validation = [ CustomElementValidation(lambda d: check_decimal(d), 'is not decimal') ] int_validation = [ CustomElementValidation(lambda i: check_int(i), 'is not integer') ] null_validation = [ CustomElementValidation(lambda d: d is not np.nan, 'this field cannot be null') ] schema = pandas_schema.Schema([ Column('cantidaddepedido', int_validation + null_validation), Column('year', decimal_validation), Column('index', int_validation + null_validation), Column('pedido', int_validation + null_validation), Column('fechapedido', null_validation),
def is_string(value): return isinstance(value, str) """ @author @Amjad Alshihabi """ def is_float(value): return isinstance(value, float) int_validation = [ CustomElementValidation(lambda d: is_int(d), 'is not integer') ] string_validation = [ CustomElementValidation(lambda d: is_string(d), 'is not string') ] float_validation = [ CustomElementValidation(lambda d: is_float(d), 'is not float') ] null_validation = [ CustomElementValidation(lambda d: pd.notnull(d), 'cannot be null') ] hp_min_validation = [ CustomElementValidation(lambda d: d > HP_MIN_VALUE, 'cannot be less than 40') ] hp_max_validation = [
import json import pandas as pd import pytest from .test_helpers import CASENAMES, CASEDATA, RESULTS from pandas_schema import Column, Schema from pandas_schema.validation import (InListValidation, DateFormatValidation, IsDtypeValidation, CustomElementValidation) core_schema = Schema([ Column('id', [CustomElementValidation( lambda x: isinstance(x, str), "Not a string.")]), Column('authors', [CustomElementValidation( lambda x: isinstance(x, str), "Not a string.")]), Column('title', [CustomElementValidation( lambda x: isinstance(x, str), "Not a string.")]), Column('paper_abstract', [CustomElementValidation( lambda x: isinstance(x, str), "Not a string.")]), Column('year', [CustomElementValidation( lambda x: isinstance(x, str), "Not a string.")]), Column('subject_orig', [CustomElementValidation( lambda x: isinstance(x, str), "Not a string.")]), ]) base_schema = Schema([ Column('link', [CustomElementValidation(
import json
import pandas as pd
import pytest

from .test_helpers import CASENAMES, CASEDATA, RESULTS, TRIPLE, retrieve_results

from pandas_schema import Column, Schema
from pandas_schema.validation import (InListValidation,
                                      DateFormatValidation,
                                      IsDtypeValidation,
                                      CustomElementValidation)


# Every core column carries the same single check: each value must be a
# ``str``.  A fresh validation object is created per column.
core_schema = Schema([
    Column(
        column_name,
        [CustomElementValidation(lambda x: isinstance(x, str),
                                 "Not a string.")],
    )
    for column_name in (
        'id',
        'authors',
        'title',
        'paper_abstract',
        'year',
        'subject_orig',
    )
])
def validate(df):
    """Validate an uploaded sheet and collect every problem found.

    The ``COL`` columns of *df* are checked against a ``pandas_schema``
    schema; each resulting warning is converted to text, passed through the
    ``D_TRANSLATE`` replacements and the ``row_refined`` substitution, and
    stored as a key of the result.  The results of the cross-column
    ``check_inconsist`` checks and of ``check_hplevel_with_dept`` are then
    merged in.

    Args:
        df: the frame to validate.

    Returns:
        A dict whose keys are the schema warning messages (each mapped to
        ``"<br>"``) plus whatever entries the consistency checks return.
    """
    # Local import so the block does not depend on a module-level ``pd``.
    import pandas as pd

    d_error = {}
    list_bu = [x[0] for x in BU_CHOICES]
    list_rd = [x[0] for x in RD_CHOICES]
    list_dept = [x[0] for x in DEPT_CHOICES]
    list_hplevel = [x[0] for x in HPLEVEL_CHOICES]
    list_province = [x[0] for x in PROVINCE_CHOICES]
    list_title = [x[0] for x in TITLE_CHOICES]

    # ``pd.notnull`` replaces ``d is not np.nan``: the identity test only
    # matches the ``np.nan`` singleton, so NaN values coming from a column
    # read as floats (and ``None``) were accepted as non-empty.
    NullValidation = CustomElementValidation(pd.notnull, "该字段不能为空")

    def required_text():
        # No leading/trailing whitespace and not null.
        return [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            NullValidation,
        ]

    schema = Schema([
        Column("南北中国", [InListValidation(list_bu)]),
        Column("区域", [InListValidation(list_rd)]),
        Column("大区", required_text()),
        Column("地区经理", required_text()),
        Column("负责代表", required_text()),
        Column(
            "医院编码",
            required_text() + [MatchesPatternValidation(r"^[H]{1}(\d){9}$")],
        ),
        Column("医院全称", required_text()),
        Column("省/自治区/直辖市", [InListValidation(list_province)]),
        Column("是否双call", [InListValidation(["是", "否"])]),
        Column("医院级别", [InListValidation(list_hplevel)]),
        Column("开户进展", [InListValidation(["已开户", "未开户"])]),
        Column("客户姓名", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            IsDistinctValidation(),
        ]),
        Column("所在科室", [InListValidation(list_dept)]),
        Column("职称", [InListValidation(list_title)]),
        Column("月出诊次数(半天计)",
               [CanConvertValidation(int), InRangeValidation(0, 63)]),
        Column("每半天\n门诊量",
               [CanConvertValidation(int), InRangeValidation(0, )]),
        Column("相关病人\n比例(%)\n建议比例:40%-80%",
               [CanConvertValidation(int), InRangeValidation(0, 101)]),
        Column("备注"),
    ])

    errors = schema.validate(df.loc[:, COL])
    findword = r": [0-9]\d*"
    for error in errors:
        str_warning = str(error)
        for term, replacement in D_TRANSLATE.items():
            str_warning = str_warning.replace(term, replacement)
        str_warning = re.sub(findword, row_refined, str_warning)
        d_error[str_warning] = "<br>"

    # Cross-column consistency checks, applied in this order; later results
    # overwrite earlier entries that share a key.
    consistency_checks = (
        ("医院编码", "医院全称", "both"),
        ("区域", "大区", "right"),
        ("大区", "地区经理", "right"),
        ("地区经理", "负责代表", "right"),
        ("医院编码", "省/自治区/直辖市", "left"),
        ("医院编码", "是否双call", "left"),
        ("医院编码", "医院级别", "left"),
        ("医院编码", "开户进展", "left"),
        ("医院全称", "省/自治区/直辖市", "left"),
        ("医院全称", "是否双call", "left"),
        ("医院全称", "医院级别", "left"),
        ("医院全称", "开户进展", "left"),
    )
    for col_a, col_b, mode in consistency_checks:
        d_error.update(check_inconsist(df, col_a, col_b, mode))

    # Check whether hospital level and department contradict each other.
    d_error.update(check_hplevel_with_dept(df))

    return d_error
except: return False def valid_pattern(value, pattern): if isNaN(value): return True else: try: return bool(re.match(pattern, str(value))) except: False decimal_validation = [ CustomElementValidation(lambda d: check_decimal(d), 'no es decimal') ] int_validation = [ CustomElementValidation(lambda i: check_int(i), 'no es entero') ] null_validation = [ CustomElementValidation(lambda d: d is not np.nan, 'this field cannot be null') ] bool_validation = [ CustomElementValidation(lambda i: valid_bool(i), 'debe ser 0 o 1') ] date_validation = [ CustomElementValidation( lambda i: valid_date(date_format, i), 'no calza con el patrón de fecha "' + date_format + '"')