def test_can_handle_checks_from_excel(self):
    # Read the customers CID from its Excel file and verify the class of
    # the check registered under the first entry of ``check_names``.
    excel_cid_path = dev_test.CID_CUSTOMERS_XLS_PATH
    cid = interface.Cid()
    cid.read(excel_cid_path, rowio.excel_rows(excel_cid_path))
    first_check_name = cid.check_names[0]
    first_check = cid.check_for(first_check_name)
    self.assertTrue(isinstance(first_check, checks.IsUniqueCheck))
def test_can_read_fields_from_excel(self):
    # Read the customers CID from an Excel file and verify the name, the
    # format class and selected properties of its first five fields.
    excel_cid_path = dev_test.path_to_test_cid('cid_customers.xls')
    cid = interface.Cid()
    cid.read(excel_cid_path, rowio.excel_rows(excel_cid_path))

    # Field 0: customer_id
    self.assertEqual(cid.field_names[0], 'customer_id')
    customer_id_format = cid.field_formats[0]
    self.assertTrue(
        isinstance(customer_id_format, fields.IntegerFieldFormat))

    # Field 1: surname
    self.assertEqual(cid.field_names[1], 'surname')
    surname_format = cid.field_formats[1]
    self.assertTrue(isinstance(surname_format, fields.TextFieldFormat))
    actual_surname_items = cid.field_formats[1].length.items
    expected_surname_items = ranges.Range('...60').items
    self.assertEqual(actual_surname_items, expected_surname_items)

    # Field 2: first_name
    self.assertEqual(cid.field_names[2], 'first_name')
    first_name_format = cid.field_formats[2]
    self.assertTrue(isinstance(first_name_format, fields.TextFieldFormat))
    actual_first_name_items = cid.field_formats[2].length.items
    expected_first_name_items = ranges.Range('...60').items
    self.assertEqual(actual_first_name_items, expected_first_name_items)
    self.assertTrue(cid.field_formats[2].is_allowed_to_be_empty)

    # Field 3: date_of_birth
    self.assertEqual(cid.field_names[3], 'date_of_birth')
    date_of_birth_format = cid.field_formats[3]
    self.assertTrue(
        isinstance(date_of_birth_format, fields.DateTimeFieldFormat))

    # Field 4: gender
    self.assertEqual(cid.field_names[4], 'gender')
    gender_format = cid.field_formats[4]
    self.assertTrue(isinstance(gender_format, fields.ChoiceFieldFormat))
    self.assertTrue(cid.field_formats[4].is_allowed_to_be_empty)
def test_fails_on_csv_source_file_with_more_elements_than_expected(self):
    # Validating the broken data file must raise a DataError. Going by
    # the file name, its rows hold more elements than the CID declares.
    cid_path = dev_test.path_to_test_cid("icd_customers.xls")
    customers_cid = interface.Cid(cid_path)
    broken_data_path = dev_test.path_to_test_data(
        "broken_customers_more_elements.csv")
    with validio.Reader(customers_cid, broken_data_path) as reader:
        self.assertRaises(errors.DataError, reader.validate_rows)
def test_fails_on_invalid_csv_source_file_with_not_observed_count_expression(
        self):
    # The rows themselves validate without error; the CheckError is
    # expected only once the reader gets closed.
    cid_path = dev_test.path_to_test_cid("icd_customers.xls")
    customers_cid = interface.Cid(cid_path)
    broken_data_path = dev_test.path_to_test_data(
        "broken_customers_with_too_many_branches.csv")
    reader = validio.Reader(customers_cid, broken_data_path)
    reader.validate_rows()
    self.assertRaises(errors.CheckError, reader.close)
def test_fails_on_delimited_with_unterminated_quote(self):
    # Read a delimited file with an unterminated quote. If this raises a
    # DataFormatError, its message must mention the parse failure.
    # NOTE(review): the test passes silently when no error is raised at
    # all - confirm whether this leniency is intended.
    cid = interface.Cid(dev_test.CID_CUSTOMERS_ODS_PATH)
    broken_path = dev_test.path_to_test_data('broken_customers_with_unterminated_quote.csv')
    try:
        # Exhaust the rows so the whole file gets read.
        for _ in rowio.delimited_rows(broken_path, cid.data_format):
            pass
    except errors.DataFormatError as error:
        message = '%s' % error
        has_expected_text = 'cannot parse delimited file' in message
        self.assertTrue(has_expected_text, 'error_message=%r' % message)
def test_can_skip_empty_rows(self):
    # An empty row and a row holding only an empty string precede the
    # data format row; the format must still be picked up.
    rows_with_gaps = [
        [],
        [''],
        ['d', 'format', 'delimited'],
        ['f', 'some'],
    ]
    cid = interface.Cid()
    cid.read('inline', rows_with_gaps)
    self.assertEqual(cid._data_format.format, "delimited")
def validate(self):
    """
    Validate the CID stored at ``self.cid_path`` and, provided that
    ``self.data_path`` is not empty, the data stored there against it.

    Progress and results are appended to the validation result text and
    shown in the validation status line. Errors while reading the CID or
    validating the data are not raised but logged as lines starting with
    ``ERROR:``.
    """
    def add_log_line(line):
        # Enable the result text only while appending to it and restore
        # the disabled state even if inserting the line fails.
        self._validation_result_text.config(state=NORMAL)
        try:
            self._validation_result_text.insert(END, line + '\n')
            self._validation_result_text.see(END)
        finally:
            self._validation_result_text.config(state=DISABLED)

    def add_log_error_line(line):
        # ``line`` can be a text or an exception; both are rendered
        # using ``%s``.
        add_log_line('ERROR: %s' % line)

    def show_status_line(line):
        self._validation_status_text.set(line)
        # Presumably refreshes the window so the status becomes visible
        # while validation is still running - TODO confirm.
        self.master.update()

    assert self.cid_path != ''

    cid_name = os.path.basename(self.cid_path)
    self.clear_validation_result_text()
    add_log_line('%s: validating' % cid_name)
    self.enable_usable_widgets()
    cid = None
    try:
        cid = interface.Cid(self.cid_path)
        add_log_line('%s: ok' % cid_name)
    except errors.InterfaceError as error:
        add_log_error_line(error)
    except Exception as error:
        add_log_error_line('cannot read CID: %s' % error)

    if (cid is not None) and (self.data_path != ''):
        try:
            data_name = os.path.basename(self.data_path)
            add_log_line('%s: validating' % data_name)
            # Use the reader as context manager so it gets closed no
            # matter whether validation finishes or stops with an error.
            # An error raised while closing is logged by the handler
            # below.
            with validio.Reader(cid, self.data_path, on_error='yield') as validator:
                show_status_line('Validation started')
                last_update_time = time.time()
                for row_or_error in validator.rows():
                    now = time.time()
                    # Update the status line at most every 3 seconds.
                    if (now - last_update_time) > 3:
                        last_update_time = now
                        show_status_line(
                            '%d rows validated'
                            % (validator.accepted_rows_count + validator.rejected_rows_count))
                    # With on_error='yield', rejected rows show up as
                    # errors in between the accepted rows.
                    if isinstance(row_or_error, errors.CutplaceError):
                        add_log_error_line(row_or_error)
                show_status_line(
                    '%d rows validated - finished'
                    % (validator.accepted_rows_count + validator.rejected_rows_count))
                add_log_line(
                    '%s: %d rows accepted, %d rows rejected'
                    % (data_name, validator.accepted_rows_count, validator.rejected_rows_count))
        except Exception as error:
            add_log_error_line('cannot validate data: %s' % error)
def test_can_handle_checks_from_excel(self):
    # Read the customers CID from an Excel file and verify the classes
    # of the checks registered under the first two check names.
    excel_cid_path = dev_test.path_to_test_cid("customers.xls")
    cid = interface.Cid()
    cid.read(excel_cid_path, rowio.excel_rows(excel_cid_path))
    expected_check_classes = (
        checks.IsUniqueCheck,
        checks.DistinctCountCheck,
    )
    for index, expected_class in enumerate(expected_check_classes):
        actual_check = cid.check_for(cid.check_names[index])
        self.assertTrue(isinstance(actual_check, expected_class))
def test_can_read_delimited_rows(self):
    # TODO: either get rid of the CID and move it to test_iotools or use validate.Reader and move it to test_validate.
    # Only the first row of the data file is read and compared.
    cid_path = dev_test.path_to_test_cid("icd_customers.xls")
    customers_cid = interface.Cid(cid_path)
    data_path = dev_test.path_to_test_data("valid_customers.csv")
    rows = rowio.delimited_rows(data_path, customers_cid._data_format)
    expected_first_row = ['38000', '23', 'John', 'Doe', 'male', '08.03.1957']
    actual_first_row = next(rows)
    self.assertEqual(actual_first_row, expected_first_row)
def __init__(self, cid_or_path):
    """
    Set up validation based on ``cid_or_path``.

    If ``cid_or_path`` is a string, it is passed to ``interface.Cid`` to
    obtain the CID; anything else is used as CID as it is. Either way,
    the data format of the CID has to be valid already.
    """
    assert cid_or_path is not None
    is_path = isinstance(cid_or_path, six.string_types)
    self._cid = interface.Cid(cid_or_path) if is_path else cid_or_path
    assert self._cid.data_format.is_valid, \
        'DataFormat.validate() must be called before using a CID for validation'
    # Number of field formats declared by the CID.
    self._expected_item_count = len(self._cid.field_formats)
    self._location = None
    self._is_closed = False
def set_cid_from_path(self, cid_path):
    """
    Load the :py:class:`cutplace.interface.Cid` stored in ``cid_path``
    and make it the one used by this application. Both the CID and
    ``cid_path`` are remembered, but only after reading succeeded.
    """
    assert cid_path is not None
    loaded_cid = interface.Cid()
    _log.info('read CID from "%s"', cid_path)
    loaded_cid.read(cid_path, rowio.auto_rows(cid_path))
    self.cid = loaded_cid
    self.cid_path = cid_path
def test_can_read_delimited_rows(self):
    # TODO: either get rid of the CID and move it to test_iotools or use validate.Reader and move it to test_validate.
    # Compare the first two rows of the data file: titles, then data.
    customers_cid = interface.Cid(dev_test.CID_CUSTOMERS_ODS_PATH)
    rows = rowio.delimited_rows(
        dev_test.CUSTOMERS_CSV_PATH, customers_cid.data_format)

    expected_titles = [
        'customer_id', 'surname', 'first_name', 'born', 'gender']
    actual_titles = next(rows)
    self.assertEqual(actual_titles, expected_titles)

    expected_data = ['1', 'Beck', 'Tyler', '1995-11-15', 'male']
    actual_data = next(rows)
    self.assertEqual(actual_data, expected_data)
def test_can_read_fixed_rows(self):
    # Read all rows of a fixed format file and verify that there are
    # rows at all and that adjacent rows have the same number of items.
    customer_cid = interface.Cid(
        dev_test.path_to_test_cid('customers_fixed.ods'))
    data_path = dev_test.path_to_test_data('valid_customers_fixed.txt')
    names_and_lengths = interface.field_names_and_lengths(customer_cid)
    encoding = customer_cid.data_format.encoding
    rows = list(rowio.fixed_rows(data_path, encoding, names_and_lengths))
    self.assertNotEqual(0, len(rows))
    # Pair each row with its successor; the last row has none.
    for row, next_row in zip(rows, rows[1:]):
        self.assertNotEqual(0, len(row))
        self.assertEqual(len(row), len(next_row))
def test_can_handle_all_field_formats_from_excel(self):
    # Read a CID from an Excel file and verify the class of each of the
    # first five field formats by position.
    excel_cid_path = dev_test.path_to_test_cid("alltypes.xls")
    cid = interface.Cid()
    cid.read(excel_cid_path, rowio.excel_rows(excel_cid_path))
    expected_format_classes = (
        fields.IntegerFieldFormat,
        fields.TextFieldFormat,
        fields.ChoiceFieldFormat,
        fields.DateTimeFieldFormat,
        fields.DecimalFieldFormat,
    )
    for index, expected_class in enumerate(expected_format_classes):
        actual_format = cid.field_formats[index]
        self.assertTrue(isinstance(actual_format, expected_class))
def test_can_create_decimal_field(self):
    # A CID with a single Decimal field must result in SQL fields of
    # type 'decimal' that are not flagged as "not null".
    cid_rows = [
        ['D', 'Format', 'delimited'],
        ['D', 'Line delimiter', 'any'],
        ['D', 'Item delimiter', ','],
        ['D', 'Quote character', '"'],
        ['D', 'Escape character', '\\'],
        ['D', 'Encoding', 'ISO-8859-1'],
        ['D', 'Allowed characters', '32:'],
        ['F', 'latitude', '1.5853', '', '', 'Decimal'],
    ]
    cid = interface.Cid()
    cid.read('customers', cid_rows)
    factory = sql.SqlFactory(cid, 'customers')
    for sql_field in factory.sql_fields():
        # Only the second and fifth of the six items are of interest.
        _, field_type, _, _, is_not_null, _ = sql_field
        self.assertEqual(field_type, 'decimal')
        self.assertEqual(is_not_null, False)
def test_can_create_int_field(self):
    # A CID with a single Integer field limited to 0...99999 must result
    # in SQL fields of type 'int' that are not flagged as "not null".
    cid_rows = [
        ['D', 'Format', 'delimited'],
        ['D', 'Line delimiter', 'any'],
        ['D', 'Item delimiter', ','],
        ['D', 'Quote character', '"'],
        ['D', 'Escape character', '\\'],
        ['D', 'Encoding', 'ISO-8859-1'],
        ['D', 'Allowed characters', '32:'],
        ['F', 'customer_id', '12345', '', '', 'Integer', '0...99999'],
    ]
    cid = interface.Cid()
    cid.read('customers', cid_rows)
    factory = sql.SqlFactory(cid, 'customers')
    for sql_field in factory.sql_fields():
        field_type = sql_field[1]
        self.assertEqual(field_type, 'int')
        is_not_null = sql_field[4]
        self.assertEqual(is_not_null, False)
def test_can_handle_oracle_sql_dialect(self):
    # Verify the SQL types chosen by the PL/SQL dialect for a decimal
    # field, integer fields with increasing upper limits and a text field.
    def integer_rule_up_to(upper_limit):
        # Rule for an integer between 0 and ``upper_limit``.
        return '0...%s' % six.text_type(upper_limit)

    cid_rows = [
        ['D', 'Format', 'delimited'],
        ['D', 'Line delimiter', 'any'],
        ['D', 'Item delimiter', ','],
        ['D', 'Quote character', '"'],
        ['D', 'Escape character', '\\'],
        ['D', 'Encoding', 'ISO-8859-1'],
        ['D', 'Allowed characters', '32:'],
        ['F', 'latitude', '1.5853', '', '', 'Decimal'],
        ['F', 'small', '1', '', '', 'Integer',
         integer_rule_up_to(sql.MAX_SMALLINT)],
        ['F', 'int', '1', '', '', 'Integer',
         integer_rule_up_to(sql.MAX_SMALLINT + 1)],
        ['F', 'big', '1', '', '', 'Integer',
         integer_rule_up_to(sql.MAX_INTEGER + 1)],
        ['F', 'decimal', '1', '', '', 'Integer',
         integer_rule_up_to(sql.MAX_BIGINT + 1)],
        ['F', 'surname', 'Doe', '', '1...60', 'Text'],
    ]
    cid = interface.Cid()
    cid.read('customers', cid_rows)
    oracle_factory = sql.SqlFactory(cid, 'customers', sql.PL_SQL_DIALECT)
    actual_fields = list(oracle_factory.sql_fields())

    # Expected types of the first five fields, in order of declaration.
    expected_types = ('number', 'int', 'int', 'number', 'number')
    for index, expected_type in enumerate(expected_types):
        self.assertEqual(actual_fields[index][1], expected_type)
        self.assertEqual(actual_fields[index][4], False)
    # For the text field only the type is verified.
    self.assertEqual(actual_fields[5][1], 'varchar2')
def test_can_create_char_field(self):
    # A CID with a single Text field of length 1...60 must result in SQL
    # fields whose items 1, 2 and 4 are 'varchar', 60 and True.
    cid_rows = [
        ['D', 'Format', 'delimited'],
        ['D', 'Line delimiter', 'any'],
        ['D', 'Item delimiter', ','],
        ['D', 'Quote character', '"'],
        ['D', 'Escape character', '\\'],
        ['D', 'Encoding', 'ISO-8859-1'],
        ['D', 'Allowed characters', '32:'],
        ['F', 'surname', 'Doe', 'x', '1...60', 'Text'],
    ]
    cid = interface.Cid()
    cid.read('customers', cid_rows)
    factory = sql.SqlFactory(cid, 'customers')
    for sql_field in factory.sql_fields():
        field_type = sql_field[1]
        self.assertEqual(field_type, 'varchar')
        field_length = sql_field[2]
        self.assertEqual(field_length, 60)
        is_not_null = sql_field[4]
        self.assertEqual(is_not_null, True)
def test_can_handle_all_field_formats_from_array(self):
    # Read a CID from inline rows and verify the class of each resulting
    # field format. The last field declares no type at all and is
    # expected to end up as a text field.
    cid_rows = [
        ['d', 'format', 'delimited'],
        ['f', 'int', '', '', '', 'Integer'],
        ['f', 'choice', '', '', '', 'Choice', 'x,y'],
        ['f', 'date', '', '', '', 'DateTime'],
        ['f', 'dec', '', '', '', 'Decimal', ''],
        ['f', 'text'],
    ]
    cid = interface.Cid()
    cid.read('inline', cid_rows)
    expected_format_classes = (
        fields.IntegerFieldFormat,
        fields.ChoiceFieldFormat,
        fields.DateTimeFieldFormat,
        fields.DecimalFieldFormat,
        fields.TextFieldFormat,
    )
    for index, expected_class in enumerate(expected_format_classes):
        actual_format = cid.field_formats[index]
        self.assertTrue(isinstance(actual_format, expected_class))
def _build_and_validate_many_customers():
    """
    Build a customers CSV file and validate it twice: first using the
    API, then using the command line application.

    Raise :py:exc:`ValueError` if the command line application ends with
    an exit code other than 0.
    """
    cid_path = dev_test.path_to_test_cid("customers.ods")
    # TODO: Write to 'build/many_customers.csv'
    data_path = dev_test.path_to_test_data("lots_of_customers.csv")
    _build_lots_of_customers_csv(data_path, 50)

    # Validate the data using the API, so in case of errors we get
    # specific information.
    with validio.Reader(interface.Cid(cid_path), data_path) as reader:
        reader.validate_rows()

    # Validate the data using the command line application in order to
    # use the whole tool chain from an end user's point of view.
    command_line = ["test_performance.py", cid_path, data_path]
    exit_code = applications.main(command_line)
    if exit_code != 0:
        raise ValueError(
            "exit code of performance test must be 0 but is %d" % exit_code)
def test_can_create_date_field(self):
    # A CID with a single DateTime field must result in SQL fields of
    # type 'date' that are not flagged as "not null".
    cid_rows = [
        ['D', 'Format', 'delimited'],
        ['D', 'Line delimiter', 'any'],
        ['D', 'Item delimiter', ','],
        ['D', 'Quote character', '"'],
        ['D', 'Escape character', '\\'],
        ['D', 'Encoding', 'ISO-8859-1'],
        ['D', 'Allowed characters', '32:'],
        ['F', 'date_of_birth', '03.11.1969', '', '', 'DateTime',
         'DD.MM.YYYY'],
    ]
    cid = interface.Cid()
    cid.read('customers', cid_rows)
    factory = sql.SqlFactory(cid, 'customers')
    for sql_field in factory.sql_fields():
        # Only the second and fifth of the six items are of interest.
        _, field_type, _, _, is_not_null, _ = sql_field
        self.assertEqual(field_type, 'date')
        self.assertEqual(is_not_null, False)
def test_can_create_sql_factory(self):
    # A factory created from a CID with fields of various types must
    # refer to a CID with the same field names.
    cid_rows = [
        ['D', 'Format', 'delimited'],
        ['D', 'Line delimiter', 'any'],
        ['D', 'Item delimiter', ','],
        ['D', 'Quote character', '"'],
        ['D', 'Escape character', '\\'],
        ['D', 'Encoding', 'ISO-8859-1'],
        ['D', 'Allowed characters', '32:'],
        ['F', 'branch_id', '38123', '', '', 'RegEx'],
        ['F', 'customer_id', '12345', '', '', 'Integer', '0...99999'],
        ['F', 'first_name', 'John', 'X', '', 'Text'],
        ['F', 'surname', 'Doe', '', '1...60', 'Text'],
        ['F', 'gender', 'male', '', '', 'Choice', 'male, female, unknown'],
        ['F', 'date_of_birth', '03.11.1969', '', '', 'DateTime',
         'DD.MM.YYYY'],
    ]
    cid = interface.Cid()
    cid.read('customers', cid_rows)
    factory = sql.SqlFactory(cid, 'customers')
    expected_field_names = cid.field_names
    actual_field_names = factory.cid._field_names
    self.assertEqual(expected_field_names, actual_field_names)
def process(argv=None):
    """
    Perform the actions requested by the command line options ``argv``,
    which default to :py:data:`sys.argv`. In case of error, raise an
    appropriate :py:exc:`Exception`.

    Module :py:mod:`logging` has to be set up properly before calling
    this, for example by calling :py:func:`logging.basicConfig`.

    :return: 1 if ``argv`` requested to validate one or more files and \
      not all validations were ok, otherwise 0.
    """
    if argv is None:  # pragma: no cover
        argv = sys.argv
    assert argv

    app = CutplaceApp()
    app.set_options(argv)

    if app.is_gui:
        # Pass on the first data path, if there is any.
        if len(app.data_paths) >= 1:
            initial_data_path = app.data_paths[0]
        else:
            initial_data_path = None
        gui.open_gui(app.cid_path, initial_data_path)
        return 0

    if app.is_create_sql:
        empty_cid = interface.Cid()
        sql.write_create(app.cid_path, empty_cid)
        return 0

    if not app.data_paths:
        # Nothing to validate.
        return 0

    for data_path in app.data_paths:
        try:
            app.validate(data_path)
        except (EnvironmentError, OSError) as error:
            # Add the path of the data file to the error message.
            raise EnvironmentError(
                _("cannot read data file %r: %s") % (data_path, error))
    return 0 if app.all_validations_were_ok else 1
def test_fails_on_invalid_row_typ(self):
    # Reading a single row that consists of only 'x' must raise an
    # InterfaceError.
    rows_with_invalid_type = [['x']]
    cid = interface.Cid()
    self.assertRaises(
        errors.InterfaceError, cid.read, 'inline', rows_with_invalid_type)
def test_fails_on_python_keyword_as_field_name(self):
    # A field named 'class', which is a Python keyword, must be rejected
    # with an InterfaceError.
    rows_with_keyword_field = [
        ['d', 'format', 'delimited'],
        ['f', 'class', '38000', '', '5'],
    ]
    cid = interface.Cid()
    self.assertRaises(
        errors.InterfaceError, cid.read, 'inline', rows_with_keyword_field)
def test_can_create_empty_cid(self):
    # The location of a CID created without arguments must refer to a
    # file whose name without suffix is 'test_interface'.
    empty_cid = interface.Cid()
    location_file_name = os.path.basename(empty_cid._location.file_path)
    location_base_name = os.path.splitext(location_file_name)[0]
    self.assertEqual('test_interface', location_base_name)
def test_fails_on_missing_data_format_property_name(self):
    # The second data format row stops after the row type and thus
    # lacks a property name, which must raise an InterfaceError.
    rows_with_incomplete_format = [
        ['d', 'format', 'delimited'],
        ['d'],
    ]
    cid = interface.Cid()
    self.assertRaises(
        errors.InterfaceError, cid.read, 'inline',
        rows_with_incomplete_format)
def test_fails_on_invalid_csv_source_file_with_duplicates(self):
    # Validating the rows of the broken data file must raise a
    # CheckError. Going by the file name, it contains duplicates.
    cid_path = dev_test.path_to_test_cid("icd_customers.xls")
    customers_cid = interface.Cid(cid_path)
    broken_data_path = dev_test.path_to_test_data(
        "broken_customers_with_duplicates.csv")
    with validio.Reader(customers_cid, broken_data_path) as reader:
        self.assertRaises(errors.CheckError, reader.validate_rows)
def test_can_open_and_validate_fixed_source_file(self):
    # Validating the rows of the fixed format data file must work
    # without raising any error.
    cid_path = dev_test.path_to_test_cid("customers_fixed.xls")
    fixed_cid = interface.Cid(cid_path)
    data_path = dev_test.path_to_test_data("valid_customers_fixed.txt")
    with validio.Reader(fixed_cid, data_path) as reader:
        reader.validate_rows()
def setUp(self):
    # Provide each test with the path of the customers CID, the CID read
    # from it and the path of a data file to validate.
    cid_path = dev_test.path_to_test_cid("icd_customers.xls")
    self._cid_path = cid_path
    self._cid = interface.Cid(cid_path)
    self._data_path = dev_test.path_to_test_data("valid_customers.csv")