def _fw_schema(custom_function):
    """Return the fixed-width test schema as ``(column_name, parser)`` pairs.

    ``custom_function`` is accepted but never referenced by this schema.
    The two positional integers handed to each parser are presumably the
    start/end positions of the column in the row -- confirm against the
    parser constructors.

    Note that ``NOT_NULLABLE_VALUE`` and ``NULLABLE_VALUE`` are built over
    the same ``7, 11`` positions; only the ``not_null`` validator differs.
    """
    id_col = StringParser(1, 2)
    name_col = StringParser(3, 5).change_case('U')
    name_col = name_col.not_null('nan', allow_white_space=True)
    gender_col = StringParser(6, 6).value_set(['M', 'F'])
    required_col = StringParser(7, 11).not_null('dummy')
    optional_col = StringParser(7, 11)
    birth_year_col = IntegerParser(12, 13).max_value(20)
    balance_col = FloatParser(14, 17).min_value(10.0)

    return [
        ('ID', id_col),
        ('NAME', name_col),
        ('GENDER', gender_col),
        ('NOT_NULLABLE_VALUE', required_col),
        ('NULLABLE_VALUE', optional_col),
        ('BIRTH_YEAR', birth_year_col),
        ('BALANCE', balance_col),
    ]
def test_not_null_validator():
    """Exercise ``StringParser.not_null`` in its three configurations.

    Covers the plain validator (null-like input is rejected), the
    ``allow_white_space=True`` variant (an empty string is accepted) and the
    ``default_value`` variant (null-like input is replaced by the default).
    """
    strict = StringParser(quoted=0).not_null().build()
    assert strict("SOME_VALUE") == "SOME_VALUE"

    # One ``raises`` block per input: execution leaves the block at the first
    # exception, so a second call placed in the same block would never run.
    for null_like in (None, ""):
        with pytest.raises(NullValueInNotNullFieldException):
            strict(null_like)

    p = StringParser().not_null(allow_white_space=True)
    func = p.build()
    assert func("") == ""

    # Default value assignment check
    func = StringParser(quoted=0).not_null(default_value="NA").build()
    assert func(None) == "NA"
    assert func("") == "NA"
def _schema(custom_function):
    """Return the test schema as an ordered list of ``(column_name, parser)``.

    ``custom_function`` is attached to the ``BLOCK_NUMBER`` parser through
    ``add_func`` before the ``range(0, 40)`` validator is chained on.
    The parsers are created in the same order as the columns they describe.
    """
    id_col = StringParser(quoted=1)

    run_id_col = StringParser().regex_match(r'\w+_\d{4}-\d{2}-\d{2}')
    run_id_col = run_id_col.change_case('u')

    class_col = StringParser(start=1, end=1).value_set(['a', 'b', 'A'])

    # Built step by step so every argument is evaluated in chain order.
    initiated_on_col = DatetimeParser(formats=['%Y%m%d', '%Y-%m-%d %H:%M:%S'])
    initiated_on_col = initiated_on_col.convert('%Y/%m/%d')
    initiated_on_col = initiated_on_col.max_value(datetime.datetime.now())
    initiated_on_col = initiated_on_col.min_value(value='20000101',
                                                  format='%Y%m%d')
    initiated_on_col = initiated_on_col.not_null(
        datetime.datetime.strptime('19001231', '%Y%m%d'))

    asked_amount_col = IntegerParser().max_value(2000).not_null(default_value=0)
    adjusted_amount_col = FloatParser().min_value(10.0).not_null(0.0)
    role_model_col = ConstantParser('Leo Messi')
    block_number_col = IntegerParser().add_func(custom_function).range(0, 40)

    return [
        ('ID', id_col),
        ('RUN_ID', run_id_col),
        ('CLASS', class_col),
        ('INITIATED_ON', initiated_on_col),
        ('ASKED_AMOUNT', asked_amount_col),
        ('ADJUSTED_AMOUNT', adjusted_amount_col),
        ('ROLE_MODEL', role_model_col),
        ('BLOCK_NUMBER', block_number_col),
    ]
from parseval.parser import (StringParser, IntegerParser, FloatParser) # In this example we will see how fixed width dataset can be processed using `Parser` API # For all types of dataset structure, fixed width data is supported # For demo purposes we will consider input in list of rows format, and the parsed rows will be returned in JSON format # To process any kind of dataset, first one schema has to be defined # Schema is nothing but a set of parsers, stored in a specific structure in the sequence of columns in dataset # The cursors has to provided in a list, where each element of the list is a tuple. # First element of each element is the column name, this is just for reference, no internal usage # Second element of each tuple is the actual parser (parser objects, not built parser function) logging.basicConfig(format='%(levelname)s:%(asctime)s:: %(message)s', level=logging.DEBUG) fw_schema = [ ('ID', StringParser(1, 2)), ('NAME', StringParser(3, 5).change_case('U').not_null('nan', allow_white_space=True)), ('GENDER', StringParser(6, 6).value_set(['M', 'F'])), ('NOT_NULLABLE_VALUE', StringParser(7, 11).not_null('dummy')), ('NULLABLE_VALUE', StringParser(7, 11)), ('BIRTH_YEAR', IntegerParser(12, 13).max_value(20)), ('BALANCE', FloatParser(14, 17).min_value(10.0)) ] p = Parser(schema=fw_schema, stop_on_error=1, input_row_format='fixed-width', parsed_row_format='dict') logging.info(('#' * 50) + " FIXED WIDTH DATASET PARSING " + ('#' * 50)) parsed_data = p.parse(
def test_add_func_validator():
    """Check that a custom function added via ``add_func`` is applied.

    A value with an accepted prefix must parse to ``"2344"``; a value with
    any other prefix must make the built parser raise.
    """
    func = StringParser(quoted=0).add_func(_first_three_char_check).build()
    assert func("ABC2344") == "2344"

    # The call is made bare, not wrapped in an ``assert``: an AssertionError
    # is itself an ``Exception`` and would satisfy ``pytest.raises(Exception)``
    # even when the custom validator never rejected the value.
    with pytest.raises(Exception):
        func("PQR2344")
def test_change_case_validator():
    """``change_case(case_type='u')`` is expected to upper-case the value."""
    upper_parser = StringParser(quoted=0).change_case(case_type='u')
    to_upper = upper_parser.build()
    parsed = to_upper('Manual_2020-23-12')
    assert parsed == "MANUAL_2020-23-12"
def test_regex_match_validator():
    """``regex_match`` passes matching text through and rejects the rest."""
    pattern = r'\w+_\d{4}-\d{2}-\d{2}'
    regex_parser = StringParser(quoted=0).regex_match(pattern=pattern)
    parse = regex_parser.build()

    # A value matching the pattern comes back unchanged.
    matching_value = 'Manual_2020-23-12'
    assert parse(matching_value) == "Manual_2020-23-12"

    # A value without the underscore separator must be rejected.
    with pytest.raises(RegexMatchException):
        assert parse('Trig2020-23-12')
def test_value_set_validator():
    """``value_set`` accepts listed values and raises for anything else."""
    allowed_values = ['MAEVE', 'OTIS', 'ERIC']
    set_parser = StringParser(quoted=0).value_set(allowed_values)
    parse = set_parser.build()

    # A member of the allowed set is returned as-is.
    assert parse('OTIS') == "OTIS"

    # A value outside the allowed set triggers the validation error.
    with pytest.raises(ValidValueCheckException):
        assert parse('GRY')
def test_single_quoted_data():
    """A ``quoted=2`` parser is expected to drop surrounding single quotes."""
    unquote = StringParser(quoted=2).build()
    single_quoted = "'ABC'"
    parsed = unquote(single_quoted)
    assert parsed == "ABC"
def test_double_quoted_data():
    """A ``quoted=1`` parser is expected to drop surrounding double quotes."""
    unquote = StringParser(quoted=1).build()
    double_quoted = '"ABC"'
    parsed = unquote(double_quoted)
    assert parsed == 'ABC'
def test_non_quoted_data():
    """A default ``StringParser`` is expected to return plain text as-is."""
    plain_parse = StringParser().build()
    raw_value = "ABC"
    parsed = plain_parse(raw_value)
    assert parsed == "ABC"
# Bring in logging plus the parser class demonstrated below.
import logging
from parseval.parser import StringParser

logging.basicConfig(format='%(levelname)s:%(asctime)s:: %(message)s',
                    level=logging.DEBUG)

# --- Simple parsing: create the parser, build it, then call it on the data.
basic_parser = StringParser()
basic_parse_func = basic_parser.build()
input_data = 'Any String'
basic_parsed_output = basic_parse_func(input_data)

# Emit the banner and the input/output pair, one log record per message.
for message in (
        ('#' * 50) + " STRING PARSING " + ('#' * 50),
        "====> Simple Data Parsing example:",
        "Input 1: {}".format(input_data),
        "Output 1: {}".format(basic_parsed_output),
        '\n',
):
    logging.info(message)

# --- Validators: only a couple are shown here, since every validator is
# chained onto the parser in the same way. The syntax and description of all
# validators are available in the documentation.
validation_parser = (
    StringParser()
    .not_null(default_value="NA")            # null check with a fallback value
    .value_set(["Apple", "Google", "NA"])    # restrict to the allowed values
)

# Calling the parser object directly also builds the parser function.
validation_parse_func = validation_parser()

valid_input_data = 'Apple'
output_for_valid_input_data = validation_parse_func(valid_input_data)
logging.info("====> Data Validation example:")
# Adding custom validation that the first 3 digit is from a set of values def _first_three_char_check(data: str): list_of_allowed_value: list = ['ABC', 'DEF'] if not data: return data if str(data)[:3].upper() in [ word.upper() for word in list_of_allowed_value ]: return data else: raise Exception("Bad prefix in the value.") # Create and build the parser parser = StringParser().add_func(_first_three_char_check) parser_func = parser.build() input_data = "ABC2344" logging.info(('#' * 50) + " ADDING CUSTOM VALIDATOR " + ('#' * 50)) logging.info("====> Valid Input:") logging.info("Input: {}".format(input_data)) logging.info("Output: {}".format(parser_func(input_data))) logging.info('\n') input_data = "PQR12344" logging.info("====> Invalid Input:") try: logging.info("Invalid Input: {}".format(input_data)) parser_func(input_data) except Exception as e:
def _parity_check(data):
    """Custom validator: reject odd integers, pass everything else through.

    Falsy input (``None``, ``""``, ``0``) is returned untouched. Input that
    cannot be converted with ``int()`` is also returned untouched, so the
    parity rule is simply skipped for it instead of failing on an unbound
    local variable.

    :param data: value handed over by the parser
    :return: ``data`` unchanged
    :raises Exception: when ``int(data)`` is odd
    """
    if not data:
        return data

    try:
        i_data = int(data)
    except (TypeError, ValueError, OverflowError):
        # Not an integer-like value: nothing to check for parity here.
        return data

    if i_data % 2 != 0:
        raise Exception("The data has to be even!")
    return data


# The schema has to be provided as a list, where each element is a tuple.
# The first element of each tuple is the column name; it is only there for
# reference and has no internal usage.
# The second element of each tuple is the actual parser (the parser object,
# not the built parser function).
schema = [
    ('ID', StringParser(quoted=1)),
    ('RUN_ID', StringParser().regex_match(r'\w+_\d{4}-\d{2}-\d{2}').change_case('u')),
    ('CLASS', StringParser(start=1, end=1).value_set(['a', 'b', 'A'])),
    ('INITIATED_ON', DatetimeParser(formats=['%Y%m%d', '%Y-%m-%d %H:%M:%S'])
     .convert('%Y/%m/%d')
     .max_value(datetime.datetime.now())
     .min_value(value='20000101', format='%Y%m%d')
     .not_null(datetime.datetime.strptime('19001231', '%Y%m%d'))),
    ('ASKED_AMOUNT', IntegerParser().max_value(2000).not_null(default_value=0)),
    ('ADJUSTED_AMOUNT', FloatParser().min_value(10.0).not_null(0.0)),
    ('ROLE_MODEL', ConstantParser('Iron-Man')),
    ('BLOCK_NUMBER', IntegerParser().add_func(_parity_check).range(0, 40))
]