def do_validation():
    """Validate noon.csv against the product schema and write all
    validation failures to errors.csv.

    Fixes: removed the no-op expression statement ``data.dtypes`` (its
    value was discarded) and the unused ``errors_index_rows`` list.
    """
    # read the data
    data = pd.read_csv('noon.csv')

    # define validation elements
    int_validation = [
        CustomElementValidation(lambda i: check_int(i), 'is not integer')
    ]
    # NOTE(review): `d is not np.nan` is an identity check and only catches the
    # np.nan singleton; it relies on pandas filling missing cells with exactly
    # that object — confirm against the pandas version in use.
    null_validation = [
        CustomElementValidation(lambda d: d is not np.nan,
                                'this field cannot be null')
    ]

    # define validation schema: names/SKUs must be present, numeric columns
    # must also parse as integers
    schema = pandas_schema.Schema([
        Column('Name', null_validation),
        Column('SKU', null_validation),
        Column('Price', int_validation + null_validation),
        Column('Special price', int_validation + null_validation),
        Column('Qty', int_validation + null_validation)
    ])

    # apply validation and report each failing cell
    errors = schema.validate(data)
    for error in errors:
        print('"{}" failed!'.format(error.value))

    # save the error list for later inspection
    pd.DataFrame({'col': errors}).to_csv('errors.csv')
def validate(data: pd.DataFrame):
    """Check *data* against the expected event schema.

    Prints every validation error and raises InvalidDataFrame if any
    column fails; returns None when the frame is clean.
    """
    # Build each element validator once, then compose them per column.
    is_decimal = CustomElementValidation(lambda x: check_decimal(x),
                                         'is not decimal')
    is_datetime = CustomElementValidation(lambda x: check_datetime(x),
                                          'is not datetime')
    is_string = CustomElementValidation(lambda x: check_is_string_or_nan(x),
                                        'is not string')
    not_nan = CustomElementValidation(lambda x: x is not np.nan,
                                      'this field cannot be NaN')

    schema = pandas_schema.Schema([
        Column('value', [is_decimal, not_nan]),
        Column('time', [is_datetime, not_nan]),
        Column('target', [is_string]),
        Column('message', [is_string]),
        Column('event', [is_string]),
        Column('account_number', [is_string]),
    ])

    errors = schema.validate(data)
    if errors:
        for err in errors:
            print(err)
        raise InvalidDataFrame("Invalid dataframe!")
def run(self):
    """Validate the monitor_mqtt input CSV against a JSON-described schema,
    drop invalid rows, and write cleaned data (plus an error report when
    needed) under the pack's etc/clean_data_output directory.

    Returns True on success, False if any step raised.

    Fix: the exception handler logged at INFO with no traceback; it now
    uses ``logger.exception`` so failures are visible with a stack trace.
    """
    self.logger.info('1. Starting data Clean Action ..')
    system_packs_base_path = cfg.CONF.content.system_packs_base_path
    path_of_pack = system_packs_base_path + '/monitor_mqtt'
    success = False

    # Map the validator names used in the JSON schema file to validators.
    VALIDATORS = {
        'decimal': CustomElementValidation(lambda d: self.check_decimal(d),
                                           'is not decimal'),
        'int': CustomElementValidation(lambda i: self.check_int(i),
                                       'is not integer'),
        'null': CustomElementValidation(lambda d: d is not np.nan,
                                        'this field cannot be null'),
        'time_stamp': CustomElementValidation(
            lambda d: self.check_time_stamp(d),
            'time_stamp format is not valid')
    }

    self.logger.info('2. Loading Schema ..')
    # JSON file maps column name -> list of validator names.
    with open(self._json_schema_path, 'r') as my_json:
        json_schema = json.load(my_json)
    column_list = [
        Column(k, [VALIDATORS[v] for v in vals])
        for k, vals in json_schema.items()
    ]
    schema = pandas_schema.Schema(column_list)

    self.logger.info('3. Loading CSV Data ..')
    data = pd.read_csv(self._data_file_path)
    self.logger.debug(data)

    try:
        self.logger.info('4. Validating input CSV data ..')
        errors = schema.validate(data)
        for e in errors:
            self.logger.debug(e)
        if errors:
            errors_index_rows = [e.row for e in errors]
            self.logger.info('5. Cleaning input CSV data ..')
            data_clean = data.drop(index=errors_index_rows)
            # Timestamped error report so repeated runs don't overwrite it.
            ct = datetime.datetime.now()
            filename = '{:%Y_%m_%d_%H_%M_%S_%f}.csv'.format(ct)
            pathoffile = path_of_pack + '/etc/clean_data_output/errors_' + filename
            message = 'Error Data file: ' + pathoffile
            self.logger.debug(message)
            pd.DataFrame({'col': errors}).to_csv(pathoffile)
        else:
            self.logger.info('5. Couldn`t find issue with input CSV ..')
            data_clean = data
        cleanpath = path_of_pack + '/etc/clean_data_output/clean_data.csv'
        cleanmessage = 'Clean Data path: ' + cleanpath
        self.logger.debug(cleanmessage)
        data_clean.to_csv(cleanpath)
        success = True
        self.logger.info('Action Completed Successfully')
    except Exception as msg:
        # Log with traceback at ERROR level (was info(), which hid failures).
        self.logger.exception(f"FAILED STEP: {msg}\n FAILED: Clean Data Action")
    return success
def data_validation(filename):
    """
    :param filename: name of the csv file with data
    :return: [True, clean_dataframe, errors] on success, or
             [False, list_of_errors] on failure
    :does: validates the data in the csv file

    Fixes: removed the dead ``errors is not None`` check (Schema.validate
    always returns a list), deduplicated the per-error ``pd.to_numeric``
    conversions (a column with several errors was converted repeatedly),
    and flattened the nesting with guard clauses.
    """
    # read the data; any read failure is reported to the caller, not raised
    try:
        data = pd.read_csv(filename)
    except Exception:
        return [False, ['Error reading a file.']]

    # check column names before validating cell contents
    expected_columns = [
        'City', 'Cappuccino', 'Cinema', 'Wine', 'Gasoline', 'Avg Rent',
        'Avg Disposable Income'
    ]
    if data.columns.to_list() != expected_columns:
        return [False, ['The criteria names are incorrect.']]

    # define validation elements
    decimal_validation = [
        CustomElementValidation(lambda d: check_float(d), 'Must be decimal')
    ]
    null_validation = [
        CustomElementValidation(lambda d: d is not np.nan, 'Must not be nan')
    ]

    # define validation schema: City must be present, all other columns
    # must be present and numeric
    schema = pandas_schema.Schema([
        Column('City', null_validation),
        Column('Cappuccino', decimal_validation + null_validation),
        Column('Cinema', decimal_validation + null_validation),
        Column('Wine', decimal_validation + null_validation),
        Column('Gasoline', decimal_validation + null_validation),
        Column('Avg Rent', decimal_validation + null_validation),
        Column('Avg Disposable Income', decimal_validation + null_validation)
    ])

    # apply validation and drop the failing rows
    errors = schema.validate(data)
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)

    # if every row failed, the file is unusable
    if len(errors) == len(data['City']):
        return [False, errors]

    # convert each non-City column that had errors to numeric, once per column
    for column in {e.column for e in errors if e.column != 'City'}:
        data_clean[column] = pd.to_numeric(data_clean[column])
    return [True, data_clean, errors]
def data_validation(data, element):
    """Return True if the CSV data to upload is valid, False otherwise.

    Fixes: environment scopes built with a comprehension instead of an
    append loop; redundant ``errors_index_rows`` intermediate removed
    (the error list itself carries the same truthiness).
    """
    # valid environment scopes are those defined on the GitLab element
    environment_scopes = [e_scope.slug for e_scope in element.environments.list()]

    # define validation elements
    key_validation = [
        CustomElementValidation(
            lambda k: check_key(k),
            'is not valid key - only letters, digits and _')
    ]
    bool_validation = [
        CustomElementValidation(lambda b: check_boolean(b), 'is not boolean')
    ]
    null_validation = [
        CustomElementValidation(lambda d: d is not np.nan,
                                'this field cannot be null')
    ]
    variable_type_validation = [
        CustomElementValidation(lambda t: check_variable_type(t),
                                'is not valid variable_type')
    ]
    environment_scope_validation = [
        CustomElementValidation(
            lambda e: check_environment_scope(e, environment_scopes),
            'is not valid environment_scope')
    ]

    # define validation schema
    # expected CSV layout: variable_type,key,value,protected,masked
    schema = pandas_schema.Schema([
        Column('variable_type', variable_type_validation + null_validation),
        Column('key', key_validation + null_validation),
        Column('value', null_validation),
        Column('protected', bool_validation + null_validation),
        Column('masked', bool_validation + null_validation),
        Column('environment_scope',
               null_validation + environment_scope_validation)
    ])

    # apply validation
    errors = schema.validate(data)
    if errors:
        logging.error(pd.DataFrame({'errors': errors}))
        return False
    logging.info("CSV validation passed.")
    return True
def return_validate(self):
    """Validate self.dataframe against the roster schema and return the
    list of validation errors (empty when the frame is clean)."""
    # Column name -> validation list, kept in the sheet's column order.
    column_rules = [
        ('Sr. No', self.decimal_validation + self.null_validation),
        ('Batch', self.null_validation),
        ('First Name', self.null_validation),
        ('Last Name', self.null_validation),
        ('Email', self.email_validation + self.null_validation),
        ('Contact', self.decimal_validation + self.null_validation),
    ]
    schema = pandas_schema.Schema(
        [Column(name, rules) for name, rules in column_rules]
    )
    errors = schema.validate(self.dataframe)
    print(errors, "/////////////////////")
    return errors
def do_validation(data):
    """Validate the ch1/ch2 columns of *data* as non-null decimals.

    Returns True when the frame is clean; otherwise prints one line per
    failing cell and returns False.

    Fix: replaced the ``for e in range(0, len(errors_index_rows))`` index
    loop with direct iteration over the error list.
    """
    # define validation elements
    decimal_validation = [
        CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')
    ]
    null_validation = [
        CustomElementValidation(lambda d: d is not np.nan,
                                'this field cannot be null')
    ]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('ch1', decimal_validation + null_validation),
        Column('ch2', decimal_validation + null_validation)
    ])

    # apply validation
    errors = schema.validate(data)
    if not errors:
        return True
    for err in errors:
        print("Error on line ", err.row, " for ", err.column,
              " : ", err.value, " ", err.message)
    return False
def validate(data):
    """Validate the budget-report columns of *data*.

    Returns a dict ``{'row': [...], 'column': [...]}`` locating every
    failing cell (also printed for debugging).

    Fixes: removed a local named ``list`` that shadowed the builtin and
    accumulated stringified errors that were never used; removed four
    unused validator locals (decimal/null/range_text/range_number) and
    the large commented-out alternative schema and scratch code.
    """
    # A numeric cell passes if it parses as a decimal OR reads as 'nan'
    # (blank cells are tolerated via the OR-combined validation).
    numeric_or_blank = [
        CustomElementValidation(lambda d: check_decimal(d), 'invalideted')
        | CustomElementValidation(lambda d: str(d).__contains__('nan'), "")
    ]

    schema = pandas_schema.Schema([
        Column('RevExp'),
        Column('budget', numeric_or_blank),
        Column('budgetA', numeric_or_blank),
        Column('total', numeric_or_blank),
        Column('YTDA', numeric_or_blank),
        Column('Q4F', numeric_or_blank),
        Column('Q4FTB', numeric_or_blank),
        Column('Q4FTBP', numeric_or_blank),
        Column('comments')
    ])

    errors = schema.validate(data)
    errors_index = {"row": [e.row for e in errors],
                    "column": [e.column for e in errors]}
    print(errors_index)
    return errors_index
def do_validation():
    """Validate data.csv, writing the raw error list to errors.csv and the
    rows that passed to clean_data.csv."""
    # read the data
    data = pd.read_csv('data.csv')

    # element validators
    decimal_check = [
        CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')
    ]
    integer_check = [
        CustomElementValidation(lambda i: check_int(i), 'is not integer')
    ]
    required = [
        CustomElementValidation(lambda d: d is not np.nan,
                                'this field cannot be null')
    ]

    # schema: dec1 is a required decimal, dec2..dec7 are optional decimals,
    # the *_id columns are required integers
    optional_decimals = ['dec2', 'dec3', 'dec4', 'dec5', 'dec6', 'dec7']
    id_columns = ['company_id', 'currency_id', 'country_id']
    columns = [Column('dec1', decimal_check + required)]
    columns += [Column(name, decimal_check) for name in optional_decimals]
    columns += [Column(name, integer_check + required) for name in id_columns]
    schema = pandas_schema.Schema(columns)

    # validate, then drop every row that produced an error
    errors = schema.validate(data)
    bad_rows = [e.row for e in errors]
    data_clean = data.drop(index=bad_rows)

    # persist both the error report and the cleaned data
    pd.DataFrame({'col': errors}).to_csv('errors.csv')
    data_clean.to_csv('clean_data.csv')
return False return True decimal_validation = [CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')] int_validation = [CustomElementValidation(lambda i: check_int(i), 'is not integer')] null_validation = [CustomElementValidation(lambda d: d is not np.nan, 'this field cannot be null')] bool_validation = [CustomElementValidation(lambda d: d in ['False','True'], 'is not bool')] schema = pandas_schema.Schema([ Column('id', int_validation ), Column('y', ), Column('x1',decimal_validation ), Column('x2',decimal_validation ), Column('x3',decimal_validation ), Column('x4', decimal_validation), Column('x5', bool_validation ), Column('x6', ), Column('x7', decimal_validation ), Column('x8',decimal_validation ), Column('x9',decimal_validation), Column('x10', decimal_validation)]) trainData = pd.read_csv('TrainOnMe.csv') errors = schema.validate(trainData) errors_index_rows = [e.row for e in errors] data_clean = trainData.drop(index=errors_index_rows) pd.DataFrame({'col':errors}).to_csv('errors.csv') data_clean.to_csv('clean_data.csv') def preProcess(data) :
# Validation schema for the pedidos (orders) dataset: integer ids/quantities
# and key text columns are required; several descriptive columns carry no
# element validation at all.  The validator lists (int_validation,
# decimal_validation, null_validation) are defined elsewhere in this module.
schema = pandas_schema.Schema([
    Column('cantidaddepedido', int_validation + null_validation),  # order quantity
    Column('year', decimal_validation),
    Column('index', int_validation + null_validation),
    Column('pedido', int_validation + null_validation),            # order id
    Column('fechapedido', null_validation),                        # order date
    Column('tienda', int_validation + null_validation),            # store id
    Column('nombretienda', null_validation),
    Column('direcciontienda', null_validation),
    Column('fabricante', int_validation + null_validation),        # manufacturer id
    Column('nombrefabricante', null_validation),
    Column('material', int_validation + null_validation),          # material id
    Column('nombrematerial', null_validation),
    Column('um', null_validation),                                 # unit of measure
    Column('valorunitariopedido', int_validation + null_validation),
    Column('valortotalpedido', int_validation + null_validation),
    # unvalidated, free-form columns
    Column('entrega'),
    Column('factura'),
    Column('zona'),
    Column('nombrezonacomercial'),
    Column('ruta'),
    Column('nombreruta'),
    Column('comuna'),
    Column('barrio', null_validation),
    Column('poblacion', null_validation),
    Column('row_number', int_validation + null_validation),
    Column('month', int_validation + null_validation),
    Column('day', int_validation + null_validation)
])
def main():
    """Compare benchmark CSVs of one baseline run and N variant runs.

    For each variant: validates the CSV structure, merges it with the
    baseline on the file column, computes output consistency and mean-time
    deltas, and writes the comparison as CSV (and Excel when an engine is
    available).

    BUG FIX: ``pd.merge`` applies the FIRST suffix to the LEFT frame and
    the second to the RIGHT.  The original ``suffixes=('_base', '_var')``
    attached '_base' to the variant (left) and '_var' to the baseline
    (right), swapping the two runs: every ΔMean had an inverted sign and
    the 'Mean (B)'/'Mean (V)' columns were crossed.  Suffix order is now
    ('_var', '_base').
    """
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

    excel_library = 'openpyxl'  # Alternative: xlsxwriter
    write_excel = importlib.util.find_spec(excel_library) is not None
    if not write_excel:
        logging.info(
            "Couldn't find package {}, thus won't write Excel files".format(
                excel_library))

    parser = argparse.ArgumentParser()
    configure_cli_parser(parser)
    args = parser.parse_args()

    # files[0] is the baseline; the rest are variants, in CLI order.
    files: List[io.TextIOWrapper] = \
        [args.baseline] + args.variant
    input_data: List[pd.DataFrame] = \
        [pd.read_csv(file) for file in files]

    schema = pds.Schema([
        pds.Column(input_headers.file),
        pds.Column(input_headers.outputs,
                   [pds.validation.CanConvertValidation(int)]),
        pds.Column(input_headers.mean,
                   [pds.validation.CanConvertValidation(int)]),
        pds.Column(input_headers.stddev,
                   [pds.validation.CanConvertValidation(int)]),
        pds.Column(input_headers.relstddev,
                   [pds.validation.CanConvertValidation(int)]),
        pds.Column(input_headers.best,
                   [pds.validation.CanConvertValidation(int)]),
        pds.Column(input_headers.median,
                   [pds.validation.CanConvertValidation(int)]),
        pds.Column(input_headers.worst,
                   [pds.validation.CanConvertValidation(int)])
    ])

    # Iterate files and frames in lockstep (was range(len(files)) indexing).
    for file, data in zip(files, input_data):
        # Passing expected column headers to validate() has the effect that
        # additional columns in the CSV file are ignored, instead of being
        # reported as an error.
        # See https://github.com/TMiguelT/PandasSchema/issues/12.
        errors = schema.validate(data, columns=input_headers.all)
        message = "Validating structure of {}: {} error(s)".format(
            file.name, len(errors))
        if errors:
            logging.error(message)
            for error in errors:
                logging.error("  {}".format(error))
        else:
            logging.info(message)

    baseline: pd.DataFrame = input_data[0]
    for idx in range(1, len(input_data)):
        variant: pd.DataFrame = input_data[idx]
        comparison: pd.DataFrame = \
            variant[[input_headers.file, input_headers.outputs,
                     input_headers.mean]]
        # Left frame = variant, right frame = baseline; suffixes must follow
        # that order (see docstring).
        comparison = pd.merge(comparison,
                              baseline[[
                                  input_headers.file, input_headers.outputs,
                                  input_headers.mean
                              ]],
                              how='outer',
                              on=input_headers.file,
                              suffixes=('_var', '_base'))
        comparison = comparison.set_index(input_headers.file)

        comparison['Consistency'] = \
            comparison[input_headers.outputs + '_var'] == comparison[input_headers.outputs + '_base']
        comparison['ΔMean [ms]'] = \
            comparison[input_headers.mean + '_var'] - comparison[input_headers.mean + '_base']
        comparison['ΔMean [%]'] = \
            (comparison['ΔMean [ms]'] / comparison[input_headers.mean + '_base']) * 100

        comparison = comparison.drop(columns=[
            input_headers.outputs + '_var', input_headers.outputs + '_base'
        ])
        comparison = comparison.rename(
            columns={
                input_headers.mean + '_var': 'Mean (V) [ms]',
                input_headers.mean + '_base': 'Mean (B) [ms]'
            })
        comparison = \
            comparison[[
                'Consistency', 'Mean (B) [ms]', 'Mean (V) [ms]',
                'ΔMean [ms]', 'ΔMean [%]'
            ]]

        # See https://stackoverflow.com/questions/25788037 for why encoding is UTF-16
        csv_filename = '{}__vs-base.csv'.format(files[idx].name)
        logging.info('Writing comparison to CSV file {}'.format(csv_filename))
        comparison.to_csv(csv_filename, encoding='utf-16')

        if write_excel:
            excel_filename = '{}__vs-base.xlsx'.format(files[idx].name)
            logging.info(
                'Writing comparison to Excel file {}'.format(excel_filename))
            with pd.ExcelWriter(excel_filename, engine=excel_library) as writer:
                baseline.set_index(input_headers.file).to_excel(
                    writer, sheet_name='Baseline')
                variant.set_index(input_headers.file).to_excel(
                    writer, sheet_name='Variant')
                comparison.to_excel(writer, sheet_name='Comparison')
date_validation = [ CustomElementValidation( lambda i: valid_date(date_format, i), 'no calza con el patrón de fecha "' + date_format + '"') ] dv_validation = [ CustomElementValidation( lambda i: valid_pattern(i, dv_pattern), 'no calza con el patrón de dv "' + dv_pattern + '"') ] schema = pandas_schema.Schema([ Column('RUT', null_validation), Column('FECHA', date_validation), Column('APORTES', int_validation), Column('RETIROS', int_validation), Column('PATRIMONIO', decimal_validation), Column('rent_diaria', decimal_validation), Column('rent_acum', decimal_validation) ]) ############################### Ejecución de Funciones ################################### df = extract_db_data('fuentedatos', 'rentaclientes') errors = schema.validate(df) errors_index_rows = [e.row for e in errors] data_clean = df.drop(index=errors_index_rows) #Todos los errores detectados allErrors = pd.DataFrame() for i in range(0, len(errors)): df3 = pd.DataFrame({