def test_schema(self):
    """
    Test this validation inside a schema, to ensure we get helpful error
    messages. In particular, we want to make sure that a ValidationWarning
    without a row number won't break the schema.
    """
    df = pd.DataFrame(data={
        'wrong_dtype1': ['not_an_int'],
        'wrong_dtype2': [123],
        'wrong_dtype3': [12.5]
    })

    schema = Schema([
        Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]),
        Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]),
        Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]),
    ])

    errors = schema.validate(df)

    self.assertEqual(
        sorted([str(x) for x in errors]),
        sorted([
            'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64',
            'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64',
            'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64'
        ]))
def do_validation():
    # read the data
    data = pd.read_csv('noon.csv')

    # define validation elements
    int_validation = [
        CustomElementValidation(lambda i: check_int(i), 'is not integer')
    ]
    null_validation = [
        CustomElementValidation(lambda d: d is not np.nan,
                                'this field cannot be null')
    ]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('Name', null_validation),
        Column('SKU', null_validation),
        Column('Price', int_validation + null_validation),
        Column('Special price', int_validation + null_validation),
        Column('Qty', int_validation + null_validation)
    ])

    # apply validation
    errors = schema.validate(data)
    for error in errors:
        print('"{}" failed!'.format(error.value))

    # drop the rows that failed validation, then save errors and clean data
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)
    pd.DataFrame({'col': errors}).to_csv('errors.csv')
    data_clean.to_csv('clean_data.csv')
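# The snippet above calls a check_int helper that is not shown. A minimal
# sketch of what it might look like (an assumption, not the original code):
# try to parse the value as an int and report success instead of raising.
def check_int(value):
    try:
        int(value)
        return True
    except (ValueError, TypeError):
        return False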
class ClassifierParser(BaseParser):
    """
    Implementation of the classifier DAO parser.

    The classifier output tables contain the output data from geniepy after
    the classifiers have calculated the desired predictions.
    """

    default_type: DataType = None
    scraper = None
    """No online sources for classifier output."""
    schema: Schema = Schema([
        Column("digest"),
        Column(PCPCLSFR_NAME, [IsDtypeValidation(np.float64)]),
        Column(CTCLSFR_NAME, [IsDtypeValidation(np.float64)]),
    ])

    def fetch(self, chunksize: int) -> Generator[DataFrame, None, None]:
        """No online sources to fetch from for classifier outputs."""
        raise NotImplementedError("Classifier Output Parser has no Scrapers")

    @staticmethod
    def parse(data, dtype=DataType.CSV_STR) -> DataFrame:
        """
        Parser function from base class.

        Raises:
            NotImplementedError -- Not implemented, since classifiers return
            dataframes that only need to be validated.
        """
        raise NotImplementedError("Classifier Output Parser has no Scrapers")
def validate_data(df):
    schema = Schema([
        Column('brand', null_validation + string_validation),
        Column('gear', null_validation + string_validation),
        Column('model', null_validation + string_validation),
        # note: price_min_validation must be concatenated into the validation
        # list; passing it as a separate argument would set allow_empty
        Column('price', null_validation + int_validation +
               price_max_validation + price_min_validation),
        Column('fuel', null_validation + string_validation),
        Column('mileage', null_validation + float_validation +
               mileage_max_validation),
        Column('hp', null_validation + float_validation +
               hp_min_validation + hp_max_validation),
        Column('type', null_validation + string_validation),
        Column('geo', null_validation + string_validation),
        Column('model_year', null_validation + float_validation),
    ])
    try:
        errors = schema.validate(df)
        for e in errors:
            print(e)
    except Exception:
        return False
    else:
        return not errors
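# The validation lists used by validate_data are assumed to be defined
# elsewhere. A plausible sketch, with the price/mileage/hp bounds as
# illustrative guesses rather than the project's real limits:
import numpy as np
from pandas_schema.validation import (CanConvertValidation,
                                      CustomElementValidation,
                                      InRangeValidation)

null_validation = [CustomElementValidation(lambda d: d is not np.nan,
                                           'this field cannot be null')]
string_validation = [CanConvertValidation(str)]
int_validation = [CanConvertValidation(int)]
float_validation = [CanConvertValidation(float)]
price_min_validation = [InRangeValidation(min=0)]
price_max_validation = [InRangeValidation(max=1_000_000)]
mileage_max_validation = [InRangeValidation(max=1_000_000)]
hp_min_validation = [InRangeValidation(min=1)]
hp_max_validation = [InRangeValidation(max=2000)]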
def validate_variant(conn, args, filepath):
    """Validates input file for variant data

    This function validates that the contents of a file contain variant data.
    If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    schema = Schema([
        Column('chr', [
            CanConvertValidation(int)
        ]),
        Column('pos', [
            CanConvertValidation(int),
            IsDistinctValidation()
        ])
    ])

    df = pd.read_csv(filepath, sep='\t', header=None)

    if len(df.columns) != 2:
        raise Exception(f"Invalid file format. Expected 2 columns, found {len(df.columns)} columns. Columns should consist of chromosome number and SNP position. Filepath: {filepath}")

    df.columns = ['chr', 'pos']
    err = schema.validate(df)

    if err:
        for e in err:
            logging.error(f"Error encountered while validating {filepath}: {e}")
        raise Exception(e)
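# A minimal smoke test of validate_variant (an illustrative sketch, not from
# the original source). The file has no header: column 0 is the chromosome,
# column 1 the SNP position. conn/args are unused during validation here.
import os
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as fh:
    fh.write('1\t1023\n1\t2045\n')
validate_variant(None, None, fh.name)  # raises if the file is malformed
os.remove(fh.name)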
class UnorderedSchema(unittest.TestCase):
    schema = Schema(
        [Column('a'), Column('b', [LeadingWhitespaceValidation()])],
        ordered=False)

    def test_fields(self):
        self.assertEqual(len(self.schema.columns), 2,
                         'The schema is not storing all of its columns')
        self.assertEqual(
            self.schema.ordered, False,
            'The schema is not storing the correct value of ordered')

    def test_validate_valid(self):
        df = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['1', '2', '3']})
        results = self.schema.validate(df)
        self.assertEqual(len(results), 0,
                         'A correct data frame should have no errors')

    def test_validate_invalid(self):
        df = pd.DataFrame({'a': [' 1', '2', '3'], 'b': [' 1', '2', '3']})
        results = self.schema.validate(df)
        self.assertEqual(len(results), 1,
                         'An incorrect data frame should report errors')

    def test_mixed_columns(self):
        """
        Tests that when ordered=False, the schema columns are associated with
        data frame columns by name, not position. In this case, the schema's
        column order is [a, b], while the data frame's order is [b, a]. There
        is an error in column b in the data frame (leading whitespace), and a
        validation on column b in the schema.

        Schema        a          b (validation)
        Data Frame    b (error)  a

        Thus there will only be an error if column b in the schema is linked
        to column b in the data frame, as is correct behaviour.
        """
        df = pd.read_csv(StringIO('''
b,a
 1,1
2,3
3,3
'''), sep=',', header=0, dtype=str)

        results = self.schema.validate(df)
        self.assertEqual(len(results), 1, 'There should be 1 error')
        self.assertEqual(results[0].row, 0)
        self.assertEqual(
            results[0].column, 'b',
            'The Schema object is not associating columns and column schemas '
            'by name')
class StockPrice:
    """
    Model representing stock prices over time
    """
    fieldSchema = Schema([
        Column('open', [IsDtypeValidation(np.float64)]),
        Column('close', [IsDtypeValidation(np.float64)]),
        Column('high', [IsDtypeValidation(np.float64)]),
        Column('low', [IsDtypeValidation(np.float64)]),
        Column('volume', [IsDtypeValidation(np.int64)])
    ])
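# A quick usage sketch (not part of the original model): validate a frame of
# prices against StockPrice.fieldSchema before persisting it.
import numpy as np
import pandas as pd

prices = pd.DataFrame({
    'open': [1.0], 'close': [1.5], 'high': [2.0], 'low': [0.5],
    'volume': np.array([1000], dtype=np.int64),
})
for warning in StockPrice.fieldSchema.validate(prices):
    print(warning)  # no output means the frame matches the schema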
def defSchema():
    print('Define expected Schema')
    schema = Schema([
        Column(name='id', validations=[IsDtypeValidation(np.object_)],
               allow_empty=False),
        Column(name='comment_text', validations=[IsDtypeValidation(np.object_)],
               allow_empty=False),
        Column(name='toxic', validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='severe_toxic', validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='obscene', validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='threat', validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='insult', validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='identity_hate', validations=[InListValidation([0, 1])],
               allow_empty=False)
    ])
    return schema
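# Hypothetical usage of defSchema(), assuming a frame shaped like the Jigsaw
# toxic-comment data that the column names suggest:
df = pd.DataFrame({
    'id': ['0001'],
    'comment_text': ['example comment'],
    'toxic': [0], 'severe_toxic': [0], 'obscene': [0],
    'threat': [0], 'insult': [0], 'identity_hate': [0],
})
assert not defSchema().validate(df)  # an empty list means no warnings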
def validate_genotype(conn, args, filepath):
    """Validates input file for genotype data

    This function validates that the contents of a file contain genotype
    data. If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    # Allow users to skip this validation step because it is time consuming
    if args.skip_genotype_validation is True:
        return

    schema_columns = [
        Column('row_number', [
            CanConvertValidation(int) & IsDistinctValidation()
        ])
    ]

    # Get the number of lines from the .pos counterpart file
    pos_filepath = '.'.join([filepath, 'pos'])
    if not os.path.exists(pos_filepath):
        raise FileNotFoundError(f"Could not locate the position counterpart file for {filepath}")
    nPositions = len(pd.read_csv(pos_filepath, header=None).index)

    for n in range(nPositions):
        schema_columns.append(
            Column(f'pos_{n}', [
                CanConvertValidation(int) &
                CustomSeriesValidation(lambda x: x.isin([-1, 0, 1, 2]),
                                       'Incorrectly coded value.')
            ])
        )

    schema = Schema(schema_columns)
    df = pd.read_csv(filepath, sep='\t', header=None)
    # Name the columns to match the schema; the genotype file has no header
    df.columns = ['row_number'] + [f'pos_{n}' for n in range(nPositions)]

    err = schema.validate(df)
    if err:
        for e in err:
            logging.error(f"Error encountered while validating {filepath}: {e}")
        raise Exception(e)
def validate_results(conn, args, filepath):
    """Validates input file for GWAS result data

    This function validates that the contents of a file contain GWAS result
    data. If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    df = pd.read_csv(filepath)

    # For each column, add it to the schema, and then for known ones, add the
    # schema validation. Use fuzzy comparisons when possible
    schema_columns = []
    for col in df.columns:
        validators = []
        if re.match("(SNP)|(chr)|(pos)|(nSNPs)", col, re.IGNORECASE):
            validators.append(CanConvertValidation(int))
        # Look for any of the p-values and make sure that they can be cast
        # as a float
        if re.match("((null)?pval(ue)?)", col, re.IGNORECASE):
            validators.append(CanConvertValidation(float))
        schema_columns.append(Column(col, validators))

    schema = Schema(schema_columns)
    err = schema.validate(df)
    if err:
        for e in err:
            logging.error(f"Error encountered while validating {filepath}: {e}")
        raise Exception(e)
def validate_line(conn, args, filepath):
    """Validates input file for line data

    This function validates that the contents of a file contain line data.
    If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    schema = Schema([
        Column('line_name', [
            IsDistinctValidation()
        ])
    ])

    df = pd.read_csv(filepath, header=None)

    if len(df.columns) != 1:
        raise Exception(f"Invalid file format. Expected 1 column, found {len(df.columns)} columns. This file should be a single column of line names. Each entry should be distinct.")

    df.columns = ['line_name']
    err = schema.validate(df)
    if err:
        for e in err:
            logging.error(f"Error encountered while validating {filepath}: {e}")
        raise Exception(e)
def compile_source_data_validation_schema(self, dataset):
    field_names = []
    field_schemas = []

    if dataset is not None:
        sub_map = Mapper.filter_mapper(self.mapper_df, dataset)
    else:
        sub_map = self.mapper_df

    sub_map = sub_map[sub_map['allow_missing'].str.lower() != 'y'].dropna(
        subset=['source_field_name', 'source_field_type'])

    for idx, field in sub_map.iterrows():
        field_validator = self.compile_field_validator(field)
        if field_validator:
            field_names.append(field['source_field_name'])
            field_schemas.append(
                Column(field['source_field_name'], field_validator))

    if not field_schemas:
        return None, []

    schema = Schema(field_schemas)
    return schema, field_names
def __init__(self):
    self.schemas = Schema([
        Column(
            "Given Name",
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
        ),
        Column(
            "Family Name",
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
        ),
        Column("Age", [InRangeValidation(0, 120)]),
        Column("Sex", [InListValidation(["Male", "Female", "Other"])]),
        Column("Customer ID", [MatchesPatternValidation(r"\d{4}[A-Z]{4}")]),
    ])
def __init__(
    self,
    args: Namespace,
    sources: Dict[str, Any],
    schema: List[Tuple[str, np.generic]],
    destinations: Dict[str, Any],
    stage: str,
    task: str,
):
    """Initiate parameters and client libraries for ETL task.

    :param args: args passed from command line, see `get_arg_parser()`
    :param sources: data source to be extracted, specified in task config,
        see `configs/*.py`
    :param schema: the target schema to load to.
    :param destinations: destinations to load data to, specified in task
        config, see `configs/*.py`
    :param stage: the stage of the loaded data, could be staging/production.
    :param task: the name of the task.
    """
    # Clear cached files
    if args.rm:
        for source in sources:
            files = []
            files += glob.glob(
                get_path_format(True).format(
                    prefix=destinations["fs"]["prefix"],
                    stage="raw",
                    task=args.task,
                    source=source,
                ))
            files += glob.glob(
                get_path_format(True).format(
                    prefix=destinations["fs"]["prefix"],
                    stage=stage,
                    task=args.task,
                    source=source,
                ))
            for f in files:
                log.info("Removing cached file: %s" % f)
                os.remove(f)

    self.task = task
    self.stage = stage
    self.args = args
    self.period = args.period
    self.current_date = args.date
    self.last_month = lookback_dates(args.date, args.period)
    self.sources = sources

    # Build a pandas_schema Schema that checks each column's dtype
    coltypes = []
    for coltype in schema:
        coltypes += [Column(coltype[0], [IsDtypeValidation(coltype[1])])]
    self.schema = Schema(coltypes)
    self.raw_schema = schema

    self.destinations = destinations
    self.raw = dict()
    self.extracted_base = dict()
    self.extracted = dict()
    self.transformed = dict()
    self.gcs = storage.Client()
def validate_phenotype(conn, args, filepath):
    """Validates input file for phenotype data

    This function validates that the contents of a file contain phenotype
    data. If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    df = pd.read_csv(filepath)
    nrows, ncols = df.shape
    nrows += 1  # include the header in the row count

    if re.match('(genotype)|(pedigree)|(line)', df.columns[0], re.IGNORECASE) is None:
        raise Exception("Genotype/pedigree/line should be the first column in the phenotype file")

    # Rename the first column of data to be the genotypes/lines
    df.rename(columns={df.columns[0]: 'genotype'}, inplace=True)

    schema_columns = [
        Column('genotype', [
            IsDistinctValidation()
        ])
    ]

    for n in range(1, ncols):
        schema_columns.append(
            Column(df.columns[n], [
                # NOTE(tparker): This may not always be true. If there are any
                # phenotypes that are listed as categories or strings, then
                # this would fail. Find out all the possible phenotype values.
                # It may be difficult to validate input data without a
                # user-provided dtype list.
                CanConvertValidation(float)
            ])
        )

    schema = Schema(schema_columns)
    err = schema.validate(df)
    if err:
        for e in err:
            logging.error(f"Error encountered while validating {filepath}: {e}")
        raise Exception(e)
def run(self):
    # define validation elements
    self.logger.info('1. Starting data Clean Action ..')
    system_packs_base_path = cfg.CONF.content.system_packs_base_path
    path_of_pack = system_packs_base_path + '/monitor_mqtt'
    success = False
    VALIDATORS = {
        'decimal': CustomElementValidation(
            lambda d: self.check_decimal(d), 'is not decimal'),
        'int': CustomElementValidation(
            lambda i: self.check_int(i), 'is not integer'),
        'null': CustomElementValidation(
            lambda d: d is not np.nan, 'this field cannot be null'),
        'time_stamp': CustomElementValidation(
            lambda d: self.check_time_stamp(d),
            'time_stamp format is not valid')
    }

    self.logger.info('2. Loading Schema ..')
    with open(self._json_schema_path, 'r') as my_json:
        json_schema = json.load(my_json)
    column_list = [
        Column(k, [VALIDATORS[v] for v in vals])
        for k, vals in json_schema.items()
    ]
    schema = pandas_schema.Schema(column_list)

    self.logger.info('3. Loading CSV Data ..')
    data = pd.read_csv(self._data_file_path)
    self.logger.debug(data)

    try:
        self.logger.info('4. Validating input CSV data ..')
        errors = schema.validate(data)
        for e in errors:
            self.logger.debug(e)
        if errors:
            errors_index_rows = [e.row for e in errors]
            self.logger.info('5. Cleaning input CSV data ..')
            data_clean = data.drop(index=errors_index_rows)
            ct = datetime.datetime.now()
            filename = '{:%Y_%m_%d_%H_%M_%S_%f}.csv'.format(ct)
            pathoffile = path_of_pack + '/etc/clean_data_output/errors_' + filename
            message = 'Error Data file: ' + pathoffile
            self.logger.debug(message)
            pd.DataFrame({'col': errors}).to_csv(pathoffile)
        else:
            self.logger.info("5. Couldn't find any issues with input CSV ..")
            data_clean = data
        cleanpath = path_of_pack + '/etc/clean_data_output/clean_data.csv'
        cleanmessage = 'Clean Data path: ' + cleanpath
        self.logger.debug(cleanmessage)
        data_clean.to_csv(cleanpath)
        success = True
        self.logger.info('Action Completed Successfully')
    except Exception as msg:
        self.logger.info(f"FAILED STEP: {msg}\n FAILED: Clean Data Action")
    return success
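# The check_* helpers used in VALIDATORS above are methods of the same action
# class but are not shown. Plausible sketches (the timestamp format is a
# guess, not taken from the pack):
import datetime
from decimal import Decimal, InvalidOperation

def check_decimal(self, value):
    try:
        Decimal(str(value))
        return True
    except InvalidOperation:
        return False

def check_int(self, value):
    try:
        int(value)
        return True
    except (ValueError, TypeError):
        return False

def check_time_stamp(self, value):
    try:
        datetime.datetime.strptime(str(value), '%Y-%m-%d %H:%M:%S')
        return True
    except ValueError:
        return False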
def _validate(self, diagnosis_df):
    schema = Schema([
        Column('visit_dt', [
            MatchesPatternValidation(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:00$')
        ]),
        Column('sex', [InListValidation(['M', 'K'])]),
        Column('icd10', [
            MatchesPatternValidation(r'^[CDIJKMNRZ]{1}\d{1,2}.?\d{0,2}$')
        ])
    ])

    errors = schema.validate(diagnosis_df)
    for error in errors:
        self.Logger.error(error)
    if len(errors) > 0:
        exit()
def do_validation(data):
    # define validation elements
    decimal_validation = [
        CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')
    ]
    null_validation = [
        CustomElementValidation(lambda d: d is not np.nan,
                                'this field cannot be null')
    ]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('ch1', decimal_validation + null_validation),
        Column('ch2', decimal_validation + null_validation)])

    # apply validation
    errors = schema.validate(data)
    if len(errors) == 0:
        return True
    for e in errors:
        print(f"Error on line {e.row} for {e.column}: {e.value} {e.message}")
    return False
def test_custom_message(self):
    validator = InRangeValidation(min=4, message=self.message)
    for error in validator.get_errors(pd.Series([1, 2, 3]), Column('')):
        self.assertRegex(
            error.message, self.message,
            'Validator not using the custom warning message!')
def test_default_message(self):
    validator = InRangeValidation(min=4)
    for error in validator.get_errors(pd.Series([1, 2, 3]), Column('')):
        self.assertNotRegex(
            error.message, self.message,
            'Validator not using the default warning message!')
def validate(data: pd.DataFrame):
    decimal_validation = [
        CustomElementValidation(lambda x: check_decimal(x), 'is not decimal')
    ]
    datetime_validation = [
        CustomElementValidation(lambda x: check_datetime(x), 'is not datetime')
    ]
    string_validation = [
        CustomElementValidation(lambda x: check_is_string_or_nan(x),
                                'is not string')
    ]
    nan_validation = [
        CustomElementValidation(lambda x: x is not np.nan,
                                'this field cannot be NaN')
    ]

    schema = pandas_schema.Schema([
        Column('value', decimal_validation + nan_validation),
        Column('time', datetime_validation + nan_validation),
        Column('target', string_validation),
        Column('message', string_validation),
        Column('event', string_validation),
        Column('account_number', string_validation),
    ])

    errors = schema.validate(data)
    if len(errors) > 0:
        for error in errors:
            print(error)
        raise InvalidDataFrame("Invalid dataframe!")
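# check_decimal, check_datetime and check_is_string_or_nan are assumed to be
# defined elsewhere. Plausible sketches for the two less obvious ones:
import numpy as np
import pandas as pd

def check_datetime(value):
    # True if pandas can parse the value as a timestamp
    try:
        pd.Timestamp(value)
        return True
    except (ValueError, TypeError):
        return False

def check_is_string_or_nan(value):
    # strings pass; so does NaN, since these columns may be empty
    return isinstance(value, str) or value is np.nan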
def data_validation(filename):
    """
    :param filename: name of the csv file with data
    :return: dataframe if the data is correct, list of errors
    :does: validates the data in the csv file
    """
    # read the data
    try:
        data = pd.read_csv(filename)
    except Exception:
        return [False, ['Error reading a file.']]

    # check column names
    if (data.columns.to_list() == [
            'City', 'Cappuccino', 'Cinema', 'Wine', 'Gasoline', 'Avg Rent',
            'Avg Disposable Income'
    ]):
        # define validation elements
        decimal_validation = [
            CustomElementValidation(lambda d: check_float(d),
                                    'Must be decimal')
        ]
        null_validation = [
            CustomElementValidation(lambda d: d is not np.nan,
                                    'Must not be nan')
        ]
        # define validation schema
        schema = pandas_schema.Schema([
            Column('City', null_validation),
            Column('Cappuccino', decimal_validation + null_validation),
            Column('Cinema', decimal_validation + null_validation),
            Column('Wine', decimal_validation + null_validation),
            Column('Gasoline', decimal_validation + null_validation),
            Column('Avg Rent', decimal_validation + null_validation),
            Column('Avg Disposable Income',
                   decimal_validation + null_validation)
        ])
        # apply validation
        errors = schema.validate(data)
        errors_index_rows = [e.row for e in errors]
        data_clean = data.drop(index=errors_index_rows)
        if len(errors) == len(data['City']):
            # every row failed validation
            return [False, errors]
        else:
            for e in errors:
                if e.column != 'City':
                    data_clean[e.column] = pd.to_numeric(data_clean[e.column])
            return [True, data_clean, errors]
    else:
        return [False, ['The criteria names are incorrect.']]
def do_validation():
    # read the data
    data = pd.read_csv('data.csv')

    # define validation elements
    decimal_validation = [
        CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')
    ]
    int_validation = [
        CustomElementValidation(lambda i: check_int(i), 'is not integer')
    ]
    null_validation = [
        CustomElementValidation(lambda d: d is not np.nan,
                                'this field cannot be null')
    ]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('dec1', decimal_validation + null_validation),
        Column('dec2', decimal_validation),
        Column('dec3', decimal_validation),
        Column('dec4', decimal_validation),
        Column('dec5', decimal_validation),
        Column('dec6', decimal_validation),
        Column('dec7', decimal_validation),
        Column('company_id', int_validation + null_validation),
        Column('currency_id', int_validation + null_validation),
        Column('country_id', int_validation + null_validation)
    ])

    # apply validation
    errors = schema.validate(data)
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)

    # save data
    pd.DataFrame({'col': errors}).to_csv('errors.csv')
    data_clean.to_csv('clean_data.csv')
def validate(data):
    decimal_validation = [CustomElementValidation(lambda d: check_decimal(d),
                                                  'is not decimal')]
    null_validation = [CustomElementValidation(lambda d: d,
                                               'this field cannot be null')]
    # a cell passes if it is a decimal, or if it is missing (NaN)
    test = [CustomElementValidation(lambda d: check_decimal(d), 'invalidated') |
            CustomElementValidation(lambda d: 'nan' in str(d), '')]
    range_text = [CustomElementValidation(lambda d: (d >= 0) & (d < 100),
                                          'not range')]
    range_number = [
        CustomElementValidation(lambda d: (d >= 0) & (d < 10000000),
                                'not range') |
        CustomElementValidation(lambda d: 'nan' in str(d), '')
    ]

    schema = pandas_schema.Schema([
        Column('RevExp'),
        Column('budget', test),
        Column('budgetA', test),
        Column('total', test),
        Column('YTDA', test),
        Column('Q4F', test),
        Column('Q4FTB', test),
        Column('Q4FTBP', test),
        Column('comments')
    ])

    errors = schema.validate(data)
    errors_index = {"row": [e.row for e in errors],
                    "column": [e.column for e in errors]}
    print(errors_index)
    return errors_index
def validate_kinship(conn, args, filepath):
    """Validates input file for kinship data

    This function validates that the contents of a file contain kinship data.
    If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    df = pd.read_csv(filepath)
    nrows, ncols = df.shape
    # since the column name is blank by default, rename it for later reference
    df.rename(columns={"Unnamed: 0": "line_name"}, inplace=True)
    nrows += 1  # include the header row in the count
    logging.debug(f"Dimensions of kinship matrix: <{nrows}, {ncols}>")

    schema_columns = [
        Column('line_name', [
            IsDistinctValidation()
        ])
    ]

    for n in range(1, ncols):
        schema_columns.append(Column(df.columns[n], [
            CanConvertValidation(float)
        ]))

    schema = Schema(schema_columns)
    err = schema.validate(df)
    if err:
        for e in err:
            logging.error(f"Error encountered while validating {filepath}: {e}")
        raise Exception(e)
def validate_population_structure(conn, args, filepath):
    """Validates input file for population structure data

    This function validates that the contents of a file contain population
    structure data. If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    df = pd.read_csv(filepath)
    nrows, ncols = df.shape
    nrows += 1  # include the header row in the count
    logging.debug(f'Population structure columns: {df.columns}')
    logging.debug(f"Population structure dimensions: <{nrows}, {ncols}>")

    schema_columns = [
        Column('Pedigree', [
            IsDistinctValidation()
        ])
    ]

    for n in range(1, ncols):
        schema_columns.append(Column(df.columns[n], [
            CanConvertValidation(float)
        ]))

    schema = Schema(schema_columns)
    err = schema.validate(df)
    if err:
        for e in err:
            logging.error(f"Error encountered while validating {filepath}: {e}")
        raise Exception(e)
def create_schema(self) -> Schema:
    """
    Create Pandas schema with all the necessary validation rules read in
    from config
    """
    col_list = []
    for column in self.__spreadsheet_def.keys():
        validators = [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()
        ]

        mandatory_field_flag = self.__spreadsheet_def.is_mandatory(column)

        # Special cases for checking institutions/countries...
        if column == 'submitting_institution':
            validators.append(
                InListValidation([i.name for i in self.__institutions]))
        elif column == 'country':
            validators.append(
                InListValidation([i.country for i in self.__institutions]))
        else:
            # Regex validation
            if self.__spreadsheet_def.get_regex(column):
                validators.append(
                    MatchesPatternValidation(
                        self.__spreadsheet_def.get_regex(column),
                        message=self.__spreadsheet_def.
                        get_regex_validation_message(column)))
            # Validate allowed values
            elif self.__spreadsheet_def.get_allowed_values(column):
                validators.append(
                    InListValidation(
                        self.__spreadsheet_def.get_allowed_values(column),
                        case_sensitive=False))

        # Field length validation
        max_len = self.__spreadsheet_def.get_max_length(column)
        if max_len and max_len > 0:
            validators.append(
                _StringLengthValidation(
                    'field length is greater than {} characters'.format(
                        str(max_len)), max_len))

        # Mandatory field validation
        col_list.append(
            Column(self.__spreadsheet_def.get_column_name(column),
                   validators,
                   allow_empty=not mandatory_field_flag))

    return Schema(col_list)
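# _StringLengthValidation is a project-specific class not shown above. A
# minimal sketch of how it could be built on pandas_schema's
# CustomSeriesValidation (an assumption, not the project's implementation):
from pandas_schema.validation import CustomSeriesValidation

class _StringLengthValidation(CustomSeriesValidation):
    """Flags any cell whose string representation exceeds max_length."""

    def __init__(self, message: str, max_length: int):
        self.max_length = max_length
        super().__init__(lambda s: s.astype(str).str.len() <= max_length,
                         message)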
class OrderedSchema(unittest.TestCase):
    schema = Schema(
        [Column('a', [LeadingWhitespaceValidation()]), Column('b')],
        ordered=True)

    def test_mixed_columns(self):
        """
        Tests that when ordered=True, the schema columns are associated with
        data frame columns by position, not name. In this case, the schema's
        column order is [a, b], while the data frame's order is [b, a]. There
        is an error in column b in the data frame (leading whitespace), and a
        validation on column a in the schema.

        Schema        a (validation)  b
        Data Frame    b (error)       a

        Thus there will only be an error if column b in the schema is linked
        to column a in the data frame, as is correct behaviour when
        ordered=True.
        """
        df = pd.read_csv(StringIO('''
b,a
 1,1
2,3
3,3
'''), sep=',', header=0, dtype=str)

        results = self.schema.validate(df)
        self.assertEqual(len(results), 1, 'There should be 1 error')
        self.assertEqual(results[0].row, 0)
        self.assertEqual(
            results[0].column, 'b',
            'The Schema object is not associating columns and column schemas '
            'by position')
class AllowEmptyColumn(unittest.TestCase):
    """
    Test a column with one single validation that allows empty columns
    """
    NAME = 'col1'
    col = Column(NAME, [CanConvertValidation(int)], allow_empty=True)
    ser = pd.Series([
        '',
    ])

    def test_outputs(self):
        results = self.col.validate(self.ser)
        self.assertEqual(len(results), 0,
                         'allow_empty is not allowing empty columns')
def check_join_cols(df1, df2, on):
    schema = Schema([
        Column(
            col,
            [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation(),
                IsDistinctValidation()
            ],
        ) for col in on
    ])
    # flatten the warnings from both frames so an empty result really means
    # both sets of join keys are clean
    results = list(itertools.chain(*[schema.validate(df)
                                     for df in [df1[on], df2[on]]]))
    if results:
        print("The following issues exist in the index:")
        for error in results:
            print(error)
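# Hypothetical usage: both frames join on 'id'; the duplicated key in df2
# should be flagged by IsDistinctValidation.
df1 = pd.DataFrame({'id': ['a', 'b'], 'x': [1, 2]})
df2 = pd.DataFrame({'id': ['a', 'a'], 'y': [3, 4]})
check_join_cols(df1, df2, ['id'])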