Ejemplo n.º 1
0
def do_validation(input_path='noon.csv', errors_path='errors.csv'):
    """Validate a product CSV file and save the validation errors as CSV.

    Every failing value is also printed to stdout.

    Args:
        input_path: CSV file to validate. Defaults to 'noon.csv'.
        errors_path: Destination of the error report. Defaults to
            'errors.csv'.
    """
    # read the data
    data = pd.read_csv(input_path)

    # define validation elements
    int_validation = [CustomElementValidation(check_int, 'is not integer')]
    # pd.isna() instead of `d is not np.nan`: the identity test only matches
    # the np.nan singleton, and values coming out of float columns are
    # distinct float objects, so their missing values were never reported.
    null_validation = [
        CustomElementValidation(lambda d: not pd.isna(d),
                                'this field cannot be null')
    ]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('Name', null_validation),
        Column('SKU', null_validation),
        Column('Price', int_validation + null_validation),
        Column('Special price', int_validation + null_validation),
        Column('Qty', int_validation + null_validation),
    ])

    # apply validation
    errors = schema.validate(data)

    for error in errors:
        print('"{}" failed!'.format(error.value))

    # save the error report
    pd.DataFrame({'col': errors}).to_csv(errors_path)
Ejemplo n.º 2
0
def validate(data: pd.DataFrame):
    """Validate *data* against the expected column schema.

    Every validation error is printed before the exception is raised.

    Args:
        data: Data frame to check.

    Raises:
        InvalidDataFrame: If at least one validation error was found.
    """
    decimal_validation = [
        CustomElementValidation(check_decimal, 'is not decimal')
    ]
    datetime_validation = [
        CustomElementValidation(check_datetime, 'is not datetime')
    ]
    string_validation = [
        CustomElementValidation(check_is_string_or_nan, 'is not string')
    ]
    # pd.isna() instead of `x is not np.nan`: the identity test misses NaN
    # values that are not the np.nan singleton (e.g. values of float columns).
    nan_validation = [
        CustomElementValidation(lambda x: not pd.isna(x),
                                'this field cannot be NaN')
    ]

    schema = pandas_schema.Schema([
        Column('value', decimal_validation + nan_validation),
        Column('time', datetime_validation + nan_validation),
        Column('target', string_validation),
        Column('message', string_validation),
        Column('event', string_validation),
        Column('account_number', string_validation),
    ])

    errors = schema.validate(data)
    if errors:
        for error in errors:
            print(error)
        raise InvalidDataFrame("Invalid dataframe!")
Ejemplo n.º 3
0
 def run(self):
     """Validate the input CSV, drop the invalid rows and save the result.

     The schema is built from the JSON file at ``self._json_schema_path``,
     which maps each column name to a list of validator keys ('decimal',
     'int', 'null', 'time_stamp'). Rows of ``self._data_file_path`` that
     produce a validation error are removed; the errors go to a timestamped
     ``errors_*.csv`` file and the remaining rows to ``clean_data.csv``,
     both under the pack's ``etc/clean_data_output`` directory.

     Returns:
         True when the clean data file was written, False when validation
         or writing raised an exception (the exception is logged).
     """
     self.logger.info('1. Starting data Clean Action ..')
     system_packs_base_path = cfg.CONF.content.system_packs_base_path
     path_of_pack = system_packs_base_path + '/monitor_mqtt'
     # The clean-data destination is needed whether or not errors were
     # found. It used to be assigned only in the "no errors" branch, so a
     # file with invalid rows hit a NameError and was never cleaned.
     cleanpath = path_of_pack + '/etc/clean_data_output/clean_data.csv'
     cleanmessage = 'Clean Data path: ' + cleanpath
     success = False
     validators = {
         'decimal':
         CustomElementValidation(self.check_decimal, 'is not decimal'),
         'int':
         CustomElementValidation(self.check_int, 'is not integer'),
         # pd.isna() instead of `d is not np.nan`: the identity test misses
         # NaN values that are not the np.nan singleton.
         'null':
         CustomElementValidation(lambda d: not pd.isna(d),
                                 'this field cannot be null'),
         'time_stamp':
         CustomElementValidation(self.check_time_stamp,
                                 'time_stamp format is not valid')
     }
     self.logger.info('2. Loading Schema ..')
     with open(self._json_schema_path, 'r') as my_json:
         json_schema = json.load(my_json)
     column_list = [
         Column(k, [validators[v] for v in vals])
         for k, vals in json_schema.items()
     ]
     schema = pandas_schema.Schema(column_list)
     self.logger.info('3. Loading CSV Data ..')
     data = pd.read_csv(self._data_file_path)
     self.logger.debug(data)
     try:
         self.logger.info('4. Validating input CSV data ..')
         errors = schema.validate(data)
         for e in errors:
             self.logger.debug(e)
         if errors:
             errors_index_rows = [e.row for e in errors]
             self.logger.info('5. Cleaning input CSV data ..')
             data_clean = data.drop(index=errors_index_rows)
             ct = datetime.datetime.now()
             filename = '{:%Y_%m_%d_%H_%M_%S_%f}.csv'.format(ct)
             pathoffile = path_of_pack + '/etc/clean_data_output/errors_' + filename
             message = 'Error Data file: ' + pathoffile
             self.logger.debug(message)
             pd.DataFrame({'col': errors}).to_csv(pathoffile)
         else:
             self.logger.info('5. Couldn`t find issue with input CSV ..')
             data_clean = data
         self.logger.debug(cleanmessage)
         data_clean.to_csv(cleanpath)
         success = True
         self.logger.info('Action Completed Successfully')
     except Exception as msg:
         # Action boundary: report the failure through the return value.
         self.logger.info(f"FAILED STEP: {msg}\n FAILED: Clean Data Action")
     return success
Ejemplo n.º 4
0
def data_validation(filename):
    """Validate the data in a CSV file and return its usable rows.

    Args:
        filename: Name of the CSV file with the data.

    Returns:
        ``[False, errors]`` when the file cannot be read, the column names
        are not the expected ones, or no valid row remains;
        ``[True, data_clean, errors]`` otherwise, where ``data_clean`` is
        the data frame without the rows that failed validation.
    """
    expected_columns = [
        'City', 'Cappuccino', 'Cinema', 'Wine', 'Gasoline', 'Avg Rent',
        'Avg Disposable Income'
    ]

    # read the data
    try:
        data = pd.read_csv(filename)
    except Exception:
        return [False, ['Error reading a file.']]

    # check column names
    if data.columns.to_list() != expected_columns:
        return [False, ['The criteria names are incorrect.']]

    # define validation elements
    decimal_validation = [
        CustomElementValidation(check_float, 'Must be decimal')
    ]
    # pd.isna() instead of `d is not np.nan`: the identity test misses NaN
    # values that are not the np.nan singleton (e.g. values of float columns).
    null_validation = [
        CustomElementValidation(lambda d: not pd.isna(d), 'Must not be nan')
    ]

    # define validation schema: 'City' is free text, the rest are decimals
    schema = pandas_schema.Schema(
        [Column('City', null_validation)] +
        [Column(name, decimal_validation + null_validation)
         for name in expected_columns[1:]])

    # apply validation
    errors = schema.validate(data)
    data_clean = data.drop(index=[e.row for e in errors])

    # Decide on the rows that survived, not on the number of errors: one row
    # can produce several errors, so the two counts are not comparable.
    if data_clean.empty:
        return [False, errors]

    # Columns that contained invalid values were read as text; convert each
    # of them once now that the offending rows are gone.
    for column in {e.column for e in errors if e.column != 'City'}:
        data_clean[column] = pd.to_numeric(data_clean[column])
    return [True, data_clean, errors]
Ejemplo n.º 5
0
def data_validation(data, element):
    """Return True if the CSV data to upload is valid.

    Args:
        data: Data frame with the columns variable_type, key, value,
            protected, masked and environment_scope.
        element: Object whose ``environments.list()`` yields the
            environments; their ``slug`` values are the accepted
            environment scopes.

    Returns:
        True when no validation error was found; otherwise the errors are
        logged and False is returned.
    """
    environment_scopes = [
        e_scope.slug for e_scope in element.environments.list()
    ]

    # define validation elements
    key_validation = [
        CustomElementValidation(
            check_key, 'is not valid key - only letters, digits and _')
    ]
    bool_validation = [
        CustomElementValidation(check_boolean, 'is not boolean')
    ]
    # pd.isna() instead of `d is not np.nan`: the identity test misses NaN
    # values that are not the np.nan singleton.
    null_validation = [
        CustomElementValidation(lambda d: not pd.isna(d),
                                'this field cannot be null')
    ]
    variable_type_validation = [
        CustomElementValidation(check_variable_type,
                                'is not valid variable_type')
    ]
    environment_scope_validation = [
        CustomElementValidation(
            lambda e: check_environment_scope(e, environment_scopes),
            'is not valid environment_scope')
    ]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('variable_type', variable_type_validation + null_validation),
        Column('key', key_validation + null_validation),
        Column('value', null_validation),
        Column('protected', bool_validation + null_validation),
        Column('masked', bool_validation + null_validation),
        Column('environment_scope',
               null_validation + environment_scope_validation),
    ])

    # apply validation
    errors = schema.validate(data)
    if errors:
        logging.error(pd.DataFrame({'errors': errors}))
        return False

    logging.info("CSV validation passed.")
    return True
Ejemplo n.º 6
0
 def return_validate(self):
     """Validate ``self.dataframe`` and return the list of validation errors.

     The errors are also printed to stdout.
     """
     # Expected columns, each with the validators its values must pass.
     columns = [
         Column('Sr. No', self.decimal_validation + self.null_validation),
         Column('Batch', self.null_validation),
         Column('First Name', self.null_validation),
         Column('Last Name', self.null_validation),
         Column('Email', self.email_validation + self.null_validation),
         Column('Contact', self.decimal_validation + self.null_validation),
     ]
     validation_errors = pandas_schema.Schema(columns).validate(
         self.dataframe)
     print(validation_errors, "/////////////////////")
     return validation_errors
Ejemplo n.º 7
0
def do_validation(data):
    """Check that columns 'ch1' and 'ch2' of *data* hold non-null decimals.

    Args:
        data: Data frame to validate.

    Returns:
        True when the data is valid; otherwise every error is printed and
        False is returned.
    """
    # define validation elements
    decimal_validation = [CustomElementValidation(check_decimal, 'is not decimal')]
    # pd.isna() instead of `d is not np.nan`: the identity test misses NaN
    # values that are not the np.nan singleton (e.g. values of float columns).
    null_validation = [CustomElementValidation(lambda d: not pd.isna(d), 'this field cannot be null')]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('ch1', decimal_validation + null_validation),
        Column('ch2', decimal_validation + null_validation),
    ])

    # apply validation
    errors = schema.validate(data)
    if not errors:
        return True

    for error in errors:
        print("Error on line ", error.row, " for ", error.column, " : ", error.value, " ", error.message)
    return False
Ejemplo n.º 8
0
def validate(data):
    """Validate the numeric columns of *data* and locate the failing cells.

    Args:
        data: Data frame with the columns RevExp, budget, budgetA, total,
            YTDA, Q4F, Q4FTB, Q4FTBP and comments.

    Returns:
        A dict with the parallel lists ``"row"`` and ``"column"``, one entry
        per validation error. The dict is also printed to stdout.
    """
    # A cell passes when it is a decimal or when its text contains 'nan'.
    test = [
        CustomElementValidation(lambda d: check_decimal(d), 'invalideted')
        | CustomElementValidation(lambda d: 'nan' in str(d), "")
    ]

    schema = pandas_schema.Schema([
        Column('RevExp'),
        Column('budget', test),
        Column('budgetA', test),
        Column('total', test),
        Column('YTDA', test),
        Column('Q4F', test),
        Column('Q4FTB', test),
        Column('Q4FTBP', test),
        Column('comments'),
    ])

    errors = schema.validate(data)
    errors_index = {
        "row": [e.row for e in errors],
        "column": [e.column for e in errors],
    }
    print(errors_index)
    return errors_index
Ejemplo n.º 9
0
def do_validation(input_path='data.csv', errors_path='errors.csv',
                  clean_path='clean_data.csv'):
    """Validate a CSV file, then save its errors and its valid rows.

    Args:
        input_path: CSV file to validate. Defaults to 'data.csv'.
        errors_path: Destination of the error report. Defaults to
            'errors.csv'.
        clean_path: Destination of the rows without validation errors.
            Defaults to 'clean_data.csv'.
    """
    # read the data
    data = pd.read_csv(input_path)

    # define validation elements
    decimal_validation = [
        CustomElementValidation(check_decimal, 'is not decimal')
    ]
    int_validation = [CustomElementValidation(check_int, 'is not integer')]
    # pd.isna() instead of `d is not np.nan`: the identity test misses NaN
    # values that are not the np.nan singleton (e.g. values of float columns).
    null_validation = [
        CustomElementValidation(lambda d: not pd.isna(d),
                                'this field cannot be null')
    ]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('dec1', decimal_validation + null_validation),
        Column('dec2', decimal_validation),
        Column('dec3', decimal_validation),
        Column('dec4', decimal_validation),
        Column('dec5', decimal_validation),
        Column('dec6', decimal_validation),
        Column('dec7', decimal_validation),
        Column('company_id', int_validation + null_validation),
        Column('currency_id', int_validation + null_validation),
        Column('country_id', int_validation + null_validation),
    ])

    # apply validation and drop every row that produced an error
    errors = schema.validate(data)
    data_clean = data.drop(index=[e.row for e in errors])

    # save data
    pd.DataFrame({'col': errors}).to_csv(errors_path)
    data_clean.to_csv(clean_path)
Ejemplo n.º 10
0
        return False
    return True


# Reusable element validators.
decimal_validation = [CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')]
int_validation = [CustomElementValidation(lambda i: check_int(i), 'is not integer')]
# pd.isna() instead of `d is not np.nan`: the identity test misses NaN values
# that are not the np.nan singleton (e.g. values of float columns).
null_validation = [CustomElementValidation(lambda d: not pd.isna(d), 'this field cannot be null')]
bool_validation = [CustomElementValidation(lambda d: d in ['False', 'True'], 'is not bool')]

# Expected columns of the training data; 'y' and 'x6' are not validated.
schema = pandas_schema.Schema([
    Column('id', int_validation),
    Column('y'),
    Column('x1', decimal_validation),
    Column('x2', decimal_validation),
    Column('x3', decimal_validation),
    Column('x4', decimal_validation),
    Column('x5', bool_validation),
    Column('x6'),
    Column('x7', decimal_validation),
    Column('x8', decimal_validation),
    Column('x9', decimal_validation),
    Column('x10', decimal_validation),
])

# Validate the training data, drop every row that produced an error and save
# both the error report and the cleaned data.
trainData = pd.read_csv('TrainOnMe.csv')
errors = schema.validate(trainData)
errors_index_rows = [e.row for e in errors]
data_clean = trainData.drop(index=errors_index_rows)
pd.DataFrame({'col': errors}).to_csv('errors.csv')
data_clean.to_csv('clean_data.csv')

def preProcess(data) :
# Validation schema: one entry per expected column, in column order. An entry
# holds the column name, optionally followed by the validators of its values.
schema = pandas_schema.Schema([
    Column(column_name, *column_validators)
    for column_name, *column_validators in (
        ('cantidaddepedido', int_validation + null_validation),
        ('year', decimal_validation),
        ('index', int_validation + null_validation),
        ('pedido', int_validation + null_validation),
        ('fechapedido', null_validation),
        ('tienda', int_validation + null_validation),
        ('nombretienda', null_validation),
        ('direcciontienda', null_validation),
        ('fabricante', int_validation + null_validation),
        ('nombrefabricante', null_validation),
        ('material', int_validation + null_validation),
        ('nombrematerial', null_validation),
        ('um', null_validation),
        ('valorunitariopedido', int_validation + null_validation),
        ('valortotalpedido', int_validation + null_validation),
        ('entrega',),
        ('factura',),
        ('zona',),
        ('nombrezonacomercial',),
        ('ruta',),
        ('nombreruta',),
        ('comuna',),
        ('barrio', null_validation),
        ('poblacion', null_validation),
        ('row_number', int_validation + null_validation),
        ('month', int_validation + null_validation),
        ('day', int_validation + null_validation),
    )
])
Ejemplo n.º 12
0
def main():
    """Compare the result CSV files of one or more variants with a baseline.

    The baseline and variant files come from the command line (see
    ``configure_cli_parser``). Each file is checked against the expected
    column schema and the validation errors are logged. For every variant a
    comparison table (output consistency, baseline mean, variant mean and the
    absolute and relative mean difference) is written to
    ``<variant file>__vs-base.csv`` and, when the Excel library can be
    imported, to ``<variant file>__vs-base.xlsx``.
    """
    logging.basicConfig(level=logging.INFO,
                        format="[%(levelname)s] %(message)s")

    excel_library = 'openpyxl'  # Alternative: xlsxwriter
    write_excel = importlib.util.find_spec(excel_library) is not None
    if not write_excel:
        logging.info(
            "Couldn't find package {}, thus won't write Excel files".format(
                excel_library))

    parser = argparse.ArgumentParser()
    configure_cli_parser(parser)
    args = parser.parse_args()

    files: List[io.TextIOWrapper] = [args.baseline] + args.variant
    input_data: List[pd.DataFrame] = [pd.read_csv(file) for file in files]

    # Every column except the file name must hold int-convertible values.
    numeric_headers = [
        input_headers.outputs, input_headers.mean, input_headers.stddev,
        input_headers.relstddev, input_headers.best, input_headers.median,
        input_headers.worst
    ]
    schema = pds.Schema([pds.Column(input_headers.file)] + [
        pds.Column(header, [pds.validation.CanConvertValidation(int)])
        for header in numeric_headers
    ])

    for file, data in zip(files, input_data):
        # Passing expected column headers to validate() has the effect that additional
        # columns in the CSV file are ignored, instead of being reported as an error.
        # See https://github.com/TMiguelT/PandasSchema/issues/12.
        errors = schema.validate(data, columns=input_headers.all)

        message = "Validating structure of {}: {} error(s)".format(
            file.name, len(errors))

        if errors:
            logging.error(message)
            for error in errors:
                logging.error("  {}".format(error))
        else:
            logging.info(message)

    baseline: pd.DataFrame = input_data[0]
    compared_headers = [
        input_headers.file, input_headers.outputs, input_headers.mean
    ]

    for variant_file, variant in zip(files[1:], input_data[1:]):
        # The variant is the left frame and the baseline the right one, so
        # the suffixes must be given in that order. They used to be swapped,
        # which labelled baseline data as variant data and inverted the sign
        # of the mean difference.
        comparison: pd.DataFrame = pd.merge(variant[compared_headers],
                                            baseline[compared_headers],
                                            how='outer',
                                            on=input_headers.file,
                                            suffixes=('_var', '_base'))

        comparison = comparison.set_index(input_headers.file)

        comparison['Consistency'] = \
          comparison[input_headers.outputs + '_var'] == comparison[input_headers.outputs + '_base']

        comparison['ΔMean [ms]'] = \
          comparison[input_headers.mean + '_var'] - comparison[input_headers.mean + '_base']

        comparison['ΔMean [%]'] = \
          (comparison['ΔMean [ms]'] / comparison[input_headers.mean + '_base']) * 100

        comparison = comparison.drop(columns=[
            input_headers.outputs + '_var', input_headers.outputs + '_base'
        ])

        comparison = comparison.rename(
            columns={
                input_headers.mean + '_var': 'Mean (V) [ms]',
                input_headers.mean + '_base': 'Mean (B) [ms]'
            })

        comparison = \
          comparison[[
            'Consistency',
            'Mean (B) [ms]', 'Mean (V) [ms]',
            'ΔMean [ms]', 'ΔMean [%]'
          ]]

        # See https://stackoverflow.com/questions/25788037 for why encoding is UTF-16
        csv_filename = '{}__vs-base.csv'.format(variant_file.name)
        logging.info('Writing comparison to CSV file {}'.format(csv_filename))
        comparison.to_csv(csv_filename, encoding='utf-16')

        if write_excel:
            excel_filename = '{}__vs-base.xlsx'.format(variant_file.name)
            logging.info(
                'Writing comparison to Excel file {}'.format(excel_filename))

            with pd.ExcelWriter(excel_filename,
                                engine=excel_library) as writer:
                baseline.set_index(input_headers.file).to_excel(
                    writer, sheet_name='Baseline')
                variant.set_index(input_headers.file).to_excel(
                    writer, sheet_name='Variant')
                comparison.to_excel(writer, sheet_name='Comparison')
Ejemplo n.º 13
0
# Element validators whose error message quotes the pattern that the value
# was checked against.
date_validation = [
    CustomElementValidation(
        lambda value: valid_date(date_format, value),
        'no calza con el patrón de fecha "' + date_format + '"',
    )
]
dv_validation = [
    CustomElementValidation(
        lambda value: valid_pattern(value, dv_pattern),
        'no calza con el patrón de dv "' + dv_pattern + '"',
    )
]

# Expected columns and the validators applied to each of them.
schema = pandas_schema.Schema([
    Column('RUT', null_validation),
    Column('FECHA', date_validation),
    Column('APORTES', int_validation),
    Column('RETIROS', int_validation),
    Column('PATRIMONIO', decimal_validation),
    Column('rent_diaria', decimal_validation),
    Column('rent_acum', decimal_validation),
])

############################### Function execution ###################################

# Load the data, validate it and drop every row that produced an error.
df = extract_db_data('fuentedatos', 'rentaclientes')
errors = schema.validate(df)
errors_index_rows = [error.row for error in errors]
data_clean = df.drop(index=errors_index_rows)
# All detected errors
allErrors = pd.DataFrame()
for i in range(0, len(errors)):
    df3 = pd.DataFrame({