Beispiel #1
0
def do_validation(data):
    """Validate the 'ch1' and 'ch2' columns of *data* and print any problems.

    Both columns must pass ``check_decimal`` and must not hold NaN.

    Args:
        data: the object handed to ``schema.validate``.

    Returns:
        True when validation produced no warnings, False otherwise (each
        warning is printed to stdout first).
    """
    def is_present(value):
        # ``value is not np.nan`` only recognises the np.nan singleton; NaN
        # values taken from a float column are fresh float objects, so the
        # value itself has to be tested.
        return not (isinstance(value, (float, np.floating)) and np.isnan(value))

    # define validation elements
    decimal_validation = [CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')]
    null_validation = [CustomElementValidation(is_present, 'this field cannot be null')]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('ch1', decimal_validation + null_validation),
        Column('ch2', decimal_validation + null_validation)])

    # apply validation
    errors = schema.validate(data)
    if not errors:
        return True

    for error in errors:
        print("Error on line ", error.row, " for ", error.column, " : ", error.value, " ", error.message)
    return False
Beispiel #2
0
    def __init__(self, dataframe):
        """Keep *dataframe* and build the element validations used later.

        Args:
            dataframe: stored unchanged on ``self.dataframe``.
        """
        super(ValidateDataframe, self).__init__()
        self.dataframe = dataframe
        # Format name -> callable registered for that format.
        self.format_checker = {
            "int": int,
            "decimal": Decimal,
            "email": validate_email
        }

        def is_present(value):
            # Membership in a list containing np.nan only matches the np.nan
            # singleton (NaN never compares equal), so NaN values taken from
            # a float column slipped through. Test for NaN explicitly.
            if isinstance(value, (float, np.floating)) and np.isnan(value):
                return False
            return value not in ["", pd.NaT, None]

        self.decimal_validation = [
            CustomElementValidation(lambda d: self.check_format("decimal", d),
                                    'is not decimal')]
        self.null_validation = [
            CustomElementValidation(is_present, 'field cannot be null')]

        self.email_validation = [
            CustomElementValidation(lambda e: self.check_format("email", e),
                                    'email not valid')]

        # NOTE(review): this one calls a module-level check_int instead of
        # self.check_format("int", ...) like its siblings - confirm intended.
        self.int_validation = [CustomElementValidation(lambda i: check_int(i),
                                                       'is not integer')]
Beispiel #3
0
def do_validation():
    """Validate ``data.csv`` and write the clean rows and the warnings to disk.

    Reads ``data.csv``, checks it against the schema below, saves the
    validation warnings to ``errors.csv`` and the rows that produced no
    warning to ``clean_data.csv``.
    """
    # read the data
    data = pd.read_csv('data.csv')

    # define validation elements
    decimal_validation = [
        CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')
    ]
    int_validation = [
        CustomElementValidation(lambda i: check_int(i), 'is not integer')
    ]
    null_validation = [
        # ``d is not np.nan`` only recognises the np.nan singleton; NaN
        # values taken from a float column are fresh float objects, so the
        # value itself is tested instead.
        CustomElementValidation(lambda d: pd.notnull(d),
                                'this field cannot be null')
    ]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('dec1', decimal_validation + null_validation),
        Column('dec2', decimal_validation),
        Column('dec3', decimal_validation),
        Column('dec4', decimal_validation),
        Column('dec5', decimal_validation),
        Column('dec6', decimal_validation),
        Column('dec7', decimal_validation),
        Column('company_id', int_validation + null_validation),
        Column('currency_id', int_validation + null_validation),
        Column('country_id', int_validation + null_validation)
    ])

    # apply validation
    errors = schema.validate(data)
    # Rows that produced at least one warning are removed from the output.
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)

    # save data
    pd.DataFrame({'col': errors}).to_csv('errors.csv')
    data_clean.to_csv('clean_data.csv')
def validate(data):
    """Validate the numeric columns of *data* and report where they fail.

    Every validated column must either pass ``check_decimal`` or have a
    text form containing 'nan'. 'RevExp' and 'comments' are declared
    without element validations.

    Args:
        data: the object handed to ``schema.validate``.

    Returns:
        dict with the keys "row" and "column", each a list with one entry
        per validation warning (the dict is also printed).
    """
    # A cell is accepted when it is a decimal or renders as 'nan'.
    decimal_or_nan = [
        CustomElementValidation(lambda d: check_decimal(d), 'invalideted')
        | CustomElementValidation(lambda d: 'nan' in str(d), "")
    ]

    schema = pandas_schema.Schema([
        Column('RevExp'),
        Column('budget', decimal_or_nan),
        Column('budgetA', decimal_or_nan),
        Column('total', decimal_or_nan),
        Column('YTDA', decimal_or_nan),
        Column('Q4F', decimal_or_nan),
        Column('Q4FTB', decimal_or_nan),
        Column('Q4FTBP', decimal_or_nan),
        Column('comments'),
    ])

    errors = schema.validate(data)
    errors_index = {
        "row": [e.row for e in errors],
        "column": [e.column for e in errors],
    }
    print(errors_index)
    return errors_index
Beispiel #5
0
from pandas_schema import Column, Schema
from pandas_schema.validation import (
    LeadingWhitespaceValidation,
    TrailingWhitespaceValidation,
    CanConvertValidation,
    InListValidation,
    CustomElementValidation,
)

# Element validation that rejects cells equal to the empty string.
EmptyStringValidation = CustomElementValidation(
    lambda value: value != "",
    "This field cannot be empty",
)

nipt_results_schema = Schema([
    Column("SampleID",
           [TrailingWhitespaceValidation(), EmptyStringValidation]),
    Column("SampleType", []),
    Column("Description", []),
    Column("SampleProject",
           [TrailingWhitespaceValidation(), EmptyStringValidation]),
    Column("Index1", []),
    Column("Index2", []),
    Column("Library_nM", []),
    Column("QCFlag", []),
    Column("Zscore_13", [CanConvertValidation(float)]),
    Column("Zscore_18", [CanConvertValidation(float)]),
    Column("Zscore_21", [CanConvertValidation(float)]),
    Column("Zscore_X", [CanConvertValidation(float)]),
    Column("Ratio_13", [CanConvertValidation(float)]),
    Column("Ratio_18", [CanConvertValidation(float)]),
    Column("Ratio_21", [CanConvertValidation(float)]),
    Column("Ratio_X", [CanConvertValidation(float)]),
Beispiel #6
0
        return (len(pincode) == 6) and (pincode.isnumeric())
    except:
        return False


def check_dob(date_age):
    """Return True when the text form of *date_age* uses only allowed characters.

    The value is converted with ``str`` and must consist entirely of
    characters from the character class below; an empty string is rejected.

    Args:
        date_age: any value; it is converted with ``str`` before matching.

    Returns:
        True when the whole text matches, False otherwise (including when
        the value cannot be converted to text).
    """
    try:
        text = str(date_age)
    except Exception:
        # Best effort, as before: a value that cannot be rendered is invalid.
        return False
    # NOTE(review): inside the class, ``\+-_`` is a range from '+' to '_',
    # which is why characters such as '/' and ':' are accepted. Confirm that
    # is intended before tightening the pattern.
    return re.fullmatch(r"[A-Za-z0-9!@#$%\\&\*\.\,\+-_\s]+", text) is not None


# Element validations built on the check_* helpers.
str_validation = [
    CustomElementValidation(lambda d: check_str(d), 'invalid String')
]
mob_validation = [
    CustomElementValidation(lambda d: check_mob_number(d),
                            'invalid mobile number')
]
pincode_validation = [
    CustomElementValidation(lambda d: check_pincode(d), 'invalid pincode')
]
null_validation = [
    # ``d is not np.nan`` only recognises the np.nan singleton; NaN values
    # taken from a float column are fresh float objects, so test the value.
    CustomElementValidation(
        lambda d: not (isinstance(d, (float, np.floating)) and np.isnan(d)),
        'this field cannot be null')
]
dob_validation = [
    CustomElementValidation(lambda d: check_dob(d),
                            'either date or age is not valid')
Beispiel #7
0
        if dec >10 or dec <-10:
            return False
    except :
        return False
    return True


def check_int(num):
    """Return True when ``int(num)`` succeeds, False otherwise.

    Args:
        num: value to test.

    Returns:
        False when the conversion raises, True otherwise.
    """
    try:
        int(num)
    except (ValueError, TypeError, OverflowError):
        # TypeError covers None and other non-numeric objects, OverflowError
        # covers infinities; a validator should reject those, not crash.
        return False
    return True


# Element validations shared by the schema columns.
decimal_validation = [CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')]
int_validation = [CustomElementValidation(lambda i: check_int(i), 'is not integer')]
# ``d is not np.nan`` only recognises the np.nan singleton; NaN values taken
# from a float column are fresh float objects, so test the value itself.
null_validation = [CustomElementValidation(
    lambda d: not (isinstance(d, (float, np.floating)) and np.isnan(d)),
    'this field cannot be null')]
# NOTE(review): only the strings 'False' and 'True' pass; confirm the column
# is read as text rather than as booleans.
bool_validation = [CustomElementValidation(lambda d: d in ['False','True'], 'is not bool')]

schema = pandas_schema.Schema([
            Column('id', int_validation ),
            Column('y',  ),
            Column('x1',decimal_validation ),
            Column('x2',decimal_validation ),
            Column('x3',decimal_validation ),
            Column('x4', decimal_validation),
            Column('x5', bool_validation ),
            Column('x6', ),
            Column('x7', decimal_validation ),
            Column('x8',decimal_validation ),
        Decimal(dec)
    except InvalidOperation:
        return False
    return True


def check_int(num):
    """Return True when ``int(num)`` succeeds, False otherwise.

    Args:
        num: value to test.

    Returns:
        False when the conversion raises, True otherwise.
    """
    try:
        int(num)
    except (ValueError, TypeError, OverflowError):
        # TypeError covers None and other non-numeric objects, OverflowError
        # covers infinities; a validator should reject those, not crash.
        return False
    return True


# Element validations shared by the schema columns.
decimal_validation = [
    CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')
]
int_validation = [
    CustomElementValidation(lambda i: check_int(i), 'is not integer')
]
null_validation = [
    # ``d is not np.nan`` only recognises the np.nan singleton; NaN values
    # taken from a float column are fresh float objects, so test the value.
    CustomElementValidation(
        lambda d: not (isinstance(d, (float, np.floating)) and np.isnan(d)),
        'this field cannot be null')
]

schema = pandas_schema.Schema([
    Column('cantidaddepedido', int_validation + null_validation),
    Column('year', decimal_validation),
    Column('index', int_validation + null_validation),
    Column('pedido', int_validation + null_validation),
    Column('fechapedido', null_validation),
def is_string(value) -> bool:
    """Tell whether *value* is a ``str`` instance."""
    outcome = isinstance(value, str)
    return outcome


"""
    @author @Amjad Alshihabi
"""


def is_float(value) -> bool:
    """Tell whether *value* is a ``float`` instance."""
    outcome = isinstance(value, float)
    return outcome


# Type checks delegated to the is_* helpers.
int_validation = [
    CustomElementValidation(lambda value: is_int(value), 'is not integer'),
]
string_validation = [
    CustomElementValidation(lambda value: is_string(value), 'is not string'),
]
float_validation = [
    CustomElementValidation(lambda value: is_float(value), 'is not float'),
]
# Rejects whatever pandas treats as a missing value.
null_validation = [
    CustomElementValidation(lambda value: pd.notnull(value), 'cannot be null'),
]
# NOTE(review): the comparison is strict, so a value equal to HP_MIN_VALUE is
# rejected, and the message hard-codes 40 - confirm both match HP_MIN_VALUE.
hp_min_validation = [
    CustomElementValidation(lambda value: value > HP_MIN_VALUE,
                            'cannot be less than 40'),
]
hp_max_validation = [
Beispiel #10
0
import json
import pandas as pd
import pytest

from .test_helpers import CASENAMES, CASEDATA, RESULTS

from pandas_schema import Column, Schema
from pandas_schema.validation import (InListValidation,
                                      DateFormatValidation,
                                      IsDtypeValidation,
                                      CustomElementValidation)


# Every core column gets its own validation requiring ``str`` values.
core_schema = Schema([
    Column(column_name, [CustomElementValidation(
                    lambda x: isinstance(x, str), "Not a string.")])
    for column_name in (
        'id',
        'authors',
        'title',
        'paper_abstract',
        'year',
        'subject_orig',
    )
])


base_schema = Schema([
    Column('link', [CustomElementValidation(
Beispiel #11
0
import json
import pandas as pd
import pytest

from .test_helpers import CASENAMES, CASEDATA, RESULTS, TRIPLE, retrieve_results

from pandas_schema import Column, Schema
from pandas_schema.validation import (InListValidation, DateFormatValidation,
                                      IsDtypeValidation,
                                      CustomElementValidation)

# Every core column gets its own validation requiring ``str`` values.
core_schema = Schema([
    Column(column_name, [
        CustomElementValidation(lambda x: isinstance(x, str), "Not a string.")
    ])
    for column_name in (
        'id',
        'authors',
        'title',
        'paper_abstract',
        'year',
        'subject_orig',
    )
])
Beispiel #12
0
def validate(df):
    """Validate the sheet in *df* and collect every problem found.

    Runs the schema checks on the ``COL`` columns of *df*, rewrites each
    warning with ``D_TRANSLATE`` and ``row_refined``, then merges in the
    results of the cross-column checks.

    Args:
        df: DataFrame containing the columns listed in ``COL``.

    Returns:
        dict keyed by problem description. Schema warnings map to "<br>";
        the remaining entries are whatever ``check_inconsist`` and
        ``check_hplevel_with_dept`` return.
    """
    d_error = {}
    list_bu = [x[0] for x in BU_CHOICES]
    list_rd = [x[0] for x in RD_CHOICES]
    list_dept = [x[0] for x in DEPT_CHOICES]
    list_hplevel = [x[0] for x in HPLEVEL_CHOICES]
    list_province = [x[0] for x in PROVINCE_CHOICES]
    list_title = [x[0] for x in TITLE_CHOICES]

    # ``d is not np.nan`` only recognises the np.nan singleton; NaN values
    # taken from a float column are fresh float objects, so test the value.
    NullValidation = CustomElementValidation(
        lambda d: not (isinstance(d, (float, np.floating)) and np.isnan(d)),
        "该字段不能为空")

    def required_text():
        # Fresh whitespace validators per column, shared null validation.
        return [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            NullValidation,
        ]

    schema = Schema([
        Column("南北中国", [InListValidation(list_bu)]),
        Column("区域", [InListValidation(list_rd)]),
        Column("大区", required_text()),
        Column("地区经理", required_text()),
        Column("负责代表", required_text()),
        Column(
            "医院编码",
            required_text() + [MatchesPatternValidation(r"^[H]{1}(\d){9}$")],
        ),
        Column("医院全称", required_text()),
        Column("省/自治区/直辖市", [InListValidation(list_province)]),
        Column("是否双call", [InListValidation(["是", "否"])]),
        Column("医院级别", [InListValidation(list_hplevel)]),
        Column("开户进展", [InListValidation(["已开户", "未开户"])]),
        Column("客户姓名", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            IsDistinctValidation()
        ]),
        Column("所在科室", [InListValidation(list_dept)]),
        Column("职称", [InListValidation(list_title)]),
        Column("月出诊次数(半天计)",
               [CanConvertValidation(int),
                InRangeValidation(0, 63)]),
        Column("每半天\n门诊量", [CanConvertValidation(int),
                            InRangeValidation(0, )]),
        Column("相关病人\n比例(%)\n建议比例:40%-80%",
               [CanConvertValidation(int),
                InRangeValidation(0, 101)]),
        Column("备注"),
    ])
    errors = schema.validate(df.loc[:, COL])
    findword = r": [0-9]\d*"
    for error in errors:
        str_warning = str(error)
        for term in D_TRANSLATE:
            str_warning = str_warning.replace(term, D_TRANSLATE[term])
            # NOTE(review): this substitution runs once per translation term,
            # not once per warning; kept as is because the output of
            # row_refined is not visible here - confirm before hoisting.
            str_warning = re.sub(findword, row_refined, str_warning)
        d_error[str_warning] = "<br>"

    # (first column, second column, third argument of check_inconsist),
    # applied in this order so later results override earlier keys.
    consistency_checks = (
        ("医院编码", "医院全称", "both"),
        ("区域", "大区", "right"),
        ("大区", "地区经理", "right"),
        ("地区经理", "负责代表", "right"),
        ("医院编码", "省/自治区/直辖市", "left"),
        ("医院编码", "是否双call", "left"),
        ("医院编码", "医院级别", "left"),
        ("医院编码", "开户进展", "left"),
        ("医院全称", "省/自治区/直辖市", "left"),
        ("医院全称", "是否双call", "left"),
        ("医院全称", "医院级别", "left"),
        ("医院全称", "开户进展", "left"),
    )
    for first_col, second_col, kind in consistency_checks:
        d_error.update(check_inconsist(df, first_col, second_col, kind))

    # Check that hospital level and department do not contradict each other.
    d_error.update(check_hplevel_with_dept(df))
    return d_error
Beispiel #13
0
        except:
            return False


def valid_pattern(value, pattern):
    """Return True when *value* is NaN or its text form matches *pattern*.

    Args:
        value: cell value; NaN values (per ``isNaN``) are accepted as valid.
        pattern: regular expression applied with ``re.match``, i.e. anchored
            at the start of the text only.

    Returns:
        True or False; False also when matching raises.
    """
    if isNaN(value):
        return True
    try:
        return bool(re.match(pattern, str(value)))
    except Exception:
        # The original evaluated a bare ``False`` here without returning it,
        # so the function fell through and returned None.
        return False


# Element validations shared by the schema columns.
decimal_validation = [
    CustomElementValidation(lambda d: check_decimal(d), 'no es decimal')
]
int_validation = [
    CustomElementValidation(lambda i: check_int(i), 'no es entero')
]
null_validation = [
    # ``d is not np.nan`` only recognises the np.nan singleton; NaN values
    # taken from a float column are fresh float objects, so test the value.
    CustomElementValidation(
        lambda d: not (isinstance(d, (float, np.floating)) and np.isnan(d)),
        'this field cannot be null')
]
bool_validation = [
    CustomElementValidation(lambda i: valid_bool(i), 'debe ser 0 o 1')
]
date_validation = [
    CustomElementValidation(
        lambda i: valid_date(date_format, i),
        'no calza con el patrón de fecha "' + date_format + '"')