def test_can_compare_two_locations(self):
    location = errors.Location("eggs.ods", has_cell=True, has_sheet=True)
    location_other = errors.Location("eggs.ods", has_cell=True, has_sheet=True)
    self.assertEqual(location.__eq__(location_other), True)
    self.assertEqual(location.__lt__(location_other), False)
def test_fails_on_too_many_distinct_values(self):
    field_names = _TEST_FIELD_NAMES
    check = checks.DistinctCountCheck("test check", "branch_id < 3", field_names)
    location = errors.Location(self.test_fails_on_too_many_distinct_values, has_cell=True)
    check.check_row(
        _create_field_map(field_names, [38000, 23, "John", "Doe", "male", "08.03.1957"]),
        location)
    location.advance_line()
    check.check_row(
        _create_field_map(field_names, [38001, 59, "Jane", "Miller", "female", "04.10.1946"]),
        location)
    check.check_at_end(location)
    location.advance_line()
    check.check_row(
        _create_field_map(field_names, [38003, 59, "Jane", "Miller", "female", "04.10.1946"]),
        location)
    self.assertRaises(errors.CheckError, check.check_at_end, location)
def test_can_check_empty_row(self):
    # HACK: This is just here to make coverage happy because "# pragma: no cover" does not work
    # on methods that consist of nothing but a single "pass".
    field_names = _TEST_FIELD_NAMES
    check = checks.AbstractCheck("test check", "", field_names)
    location = errors.Location(self.test_can_check_empty_row, has_cell=True)
    check.check_row([], location)
def ods_content_root():
    """
    `ElementTree` for content.xml in `source_ods_path`.
    """
    assert source_ods_path is not None

    location = errors.Location(source_ods_path)
    try:
        # HACK: Use ``closing()`` because of Python 2.6.
        with closing(zipfile.ZipFile(source_ods_path, "r")) as zip_archive:
            try:
                xml_data = zip_archive.read("content.xml")
            except Exception as error:
                raise errors.DataFormatError(
                    'cannot extract content.xml for ODS spreadsheet: %s' % error, location)
    except errors.DataFormatError:
        raise
    except Exception as error:
        raise errors.DataFormatError('cannot uncompress ODS spreadsheet: %s' % error, location)

    with io.BytesIO(xml_data) as xml_stream:
        try:
            tree = ElementTree.parse(xml_stream)
        except Exception as error:
            raise errors.DataFormatError('cannot parse content.xml: %s' % error, location)

    return tree.getroot()
def _raise_delimited_data_format_error(delimited_path, reader, error):
    location = errors.Location(delimited_path)
    line_number = reader.line_num
    if line_number > 0:
        location.advance_line(line_number)
    raise errors.DataFormatError('cannot parse delimited file: %s' % error, location)
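# Hedged usage sketch, not part of the original module: shows how the helper above
# could translate a ``csv.Error`` into a ``DataFormatError`` that points at the
# failing line. The helper name ``_example_read_delimited`` is hypothetical and the
# sketch assumes ``csv`` and ``io`` are imported at module level.
def _example_read_delimited(delimited_path, encoding='utf-8'):
    with io.open(delimited_path, 'r', encoding=encoding, newline='') as delimited_file:
        delimited_reader = csv.reader(delimited_file)
        try:
            # ``csv.reader`` tracks the physical line in ``line_num``, which the
            # helper uses to advance the error location.
            return [row for row in delimited_reader]
        except csv.Error as error:
            _raise_delimited_data_format_error(delimited_path, delimited_reader, error)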
def test_fails_on_duplicate_with_multiple_fields(self):
    field_names = _TEST_FIELD_NAMES
    check = checks.IsUniqueCheck("test check", "branch_id, customer_id", field_names)
    location = errors.Location(self.test_fails_on_duplicate_with_multiple_fields, has_cell=True)
    check.check_row(
        _create_field_map(field_names, [38000, 23, "John", "Doe", "male", "08.03.1957"]),
        location)
    location.advance_line()
    check.check_row(
        _create_field_map(field_names, [38000, 59, "Jane", "Miller", "female", "04.10.1946"]),
        location)
    location.advance_line()
    try:
        check.check_row(
            _create_field_map(field_names, [38000, 59, "Jane", "Miller", "female", "04.10.1946"]),
            location)
        self.fail("duplicate row must cause CheckError")
    except errors.CheckError as error:
        self.assertTrue(error.see_also_location)
        self.assertNotEqual(location, error.see_also_location)
        self.assertEqual(error.location.cell, 0)
    # These methods should not do anything, but call them anyway for the test's sake.
    check.check_at_end(location)
    check.cleanup()
def excel_rows(source_path, sheet=1):
    """
    Rows read from an Excel document (both :file:`*.xls` and :file:`*.xlsx`
    thanks to :py:mod:`xlrd`).

    :param str source_path: path to the Excel file to be read
    :param int sheet: the sheet in the file to be read
    :return: sequence of lists with each list representing a row in the \
      Excel file
    :raises cutplace.errors.DataFormatError: in case the file cannot be read
    """
    assert source_path is not None
    assert sheet >= 1, 'sheet=%r' % sheet

    location = errors.Location(source_path, has_cell=True)
    try:
        with xlrd.open_workbook(source_path) as book:
            sheet = book.sheet_by_index(sheet - 1)
            datemode = book.datemode
            for y in range(sheet.nrows):
                row = []
                for x in range(sheet.ncols):
                    row.append(_excel_cell_value(sheet.cell(y, x), datemode))
                    location.advance_cell()
                yield row
                location.advance_line()
    except xlrd.XLRDError as error:
        raise errors.DataFormatError('cannot read Excel file: %s' % error, location)
    except UnicodeError as error:
        raise errors.DataFormatError('cannot decode Excel data: %s' % error, location)
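# Hedged usage sketch, not part of the original module: ``excel_rows()`` above is a
# generator, so rows can be processed lazily. The helper name and the choice to
# simply print each row are illustrative only.
def _example_print_excel_rows(excel_path):
    try:
        for row in excel_rows(excel_path, sheet=1):
            print(row)
    except errors.DataFormatError as error:
        print('cannot read Excel data: %s' % error)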
def __init__(self, cid_or_path, source_data_stream_or_path, on_error='raise', validate_until=None):
    """
    An iterator that produces possibly validated rows from
    ``source_data_stream_or_path`` conforming to ``cid_or_path``.

    If a row cannot be read, ``on_error`` specifies what to do about it:

    * ``'continue'``: quietly continue with the next row.
    * ``'raise'`` (the default): raise an exception and stop reading.
    * ``'yield'``: instead of a row, the result contains a \
      :py:exc:`cutplace.errors.DataError`.

    :param validate_until: number of rows after which validation should \
      stop; further rows are still produced but not validated anymore; \
      ``None`` means all rows should be validated (the default); 0 means no \
      rows should be validated
    :type: int or None
    """
    assert cid_or_path is not None
    assert source_data_stream_or_path is not None
    assert on_error in _VALID_ON_ERROR_CHOICES, 'on_error=%r' % on_error
    assert (validate_until is None) or (validate_until >= 0)

    super(Reader, self).__init__(cid_or_path)
    # TODO: Consolidate obtaining source path with other code segments that do similar things.
    if isinstance(source_data_stream_or_path, six.string_types):
        source_path = source_data_stream_or_path
    else:
        try:
            source_path = source_data_stream_or_path.name
        except AttributeError:
            source_path = '<io>'
    self._location = errors.Location(source_path, has_cell=True)
    self._source_data_stream_or_path = source_data_stream_or_path
    self._on_error = on_error
    self._validate_until = validate_until
    self.accepted_rows_count = None
    self.rejected_rows_count = None
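# Hedged construction sketch, not part of the original class: only the constructor
# arguments shown here are taken from the docstring above; the helper name, the
# direct instantiation of ``Reader`` and the example argument values are assumptions.
def _example_create_lenient_reader(cid_path, data_path):
    # Yield DataError results instead of raising, and stop validating after 100 rows.
    return Reader(cid_path, data_path, on_error='yield', validate_until=100)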
def test_can_create_simple_cutplace_error(self):
    location = errors.Location('eggs.ods', has_cell=True, has_sheet=True)
    error = errors.CutplaceError('something must be something else', location)
    self.assertEqual(error.location, location)
    self.assertEqual(
        error.__str__(),
        'eggs.ods (Sheet1!R1C1): something must be something else')
def test_can_create_cutplace_error_with_see_also_details(self):
    location = errors.Location('eggs.ods', has_cell=True, has_sheet=True)
    location.advance_line(3)
    location.advance_cell(2)
    location_of_cause = errors.Location('spam.ods', has_cell=True, has_sheet=True)
    cause = errors.CutplaceError('something must be something else', location_of_cause)
    error = errors.CutplaceError(
        'cannot do something', location, cause.message, cause.location, cause)
    self.assertEqual(error.location, location)
    self.assertEqual(error.see_also_location, cause.location)
    self.assertEqual(error.cause, cause)
    self.assertEqual(
        error.__str__(),
        'eggs.ods (Sheet1!R4C3): cannot do something ' +
        '(see also: spam.ods (Sheet1!R1C1): something must be something else)')
def test_can_work_with_location(self):
    # TODO: Cleanup: split up in several tests with meaningful names.
    location = errors.Location("eggs.txt", has_column=True)
    self.assertEqual(location.line, 0)
    self.assertEqual(location.column, 0)
    self.assertEqual(str(location), "eggs.txt (1;1)")
    location.advance_column(3)
    self.assertEqual(location.column, 3)
    location.advance_column()
    self.assertEqual(location.column, 4)
    location.advance_line()
    self.assertEqual(location.line, 1)
    self.assertEqual(location.column, 0)
    self.assertEqual(str(location), "eggs.txt (2;1)")

    # Test input with cells.
    location = errors.Location("eggs.csv", has_cell=True)
    self.assertEqual(location.line, 0)
    self.assertEqual(location.cell, 0)
    self.assertEqual(str(location), "eggs.csv (R1C1)")
    location.advance_line()
    location.advance_cell(17)
    self.assertEqual(location.__repr__(), "eggs.csv (R2C18)")

    # Test input with sheet.
    location = errors.Location("eggs.ods", has_cell=True, has_sheet=True)
    self.assertEqual(str(location), "eggs.ods (Sheet1!R1C1)")
    location.advance_sheet()
    location.advance_line()
    location.advance_cell(17)
    location._set_sheet(4)
    self.assertEqual(str(location), "eggs.ods (Sheet5!R2C18)")

    # Test StringIO input.
    input_stream = io.StringIO("hugo was here")
    location = errors.Location(input_stream)
    self.assertEqual(str(location), "<io> (1)")
def read(self, cid_path, rows):
    """
    Provided no ``cid_path`` has already been specified for
    :py:class:`~cutplace.interface.Cid.__init__()`, process ``rows`` using
    :py:meth:`~cutplace.interface.Cid.add_data_format_row()`,
    :py:meth:`~cutplace.interface.Cid.add_field_format()` and
    :py:meth:`~cutplace.interface.Cid.add_check()`. Report any errors by
    referring to ``cid_path``.

    :param str cid_path: the path from which ``rows`` were obtained
    :param sequence rows: sequence of lists where each list either \
      describes a data format, field format, check or comment for a CID.
    :raises cutplace.errors.InterfaceError: in case any row in ``rows`` \
      cannot be processed
    """
    assert cid_path is not None
    assert self.data_format is None, 'CID must be read only once'

    # TODO: Detect format and use proper reader.
    self._location = errors.Location(cid_path, has_cell=True)
    if self._cid_path is None:
        self._cid_path = cid_path
    for row in rows:
        if row:
            row_type = row[0].lower().strip()
            row_data = (row[1:] + [''] * 6)[:6]
            if row_type == 'd':
                self.add_data_format_row(row_data)
            elif row_type == 'f':
                self.add_field_format_row(row_data)
            elif row_type == 'c':
                self.add_check_row(row_data)
            elif row_type != '':
                # Raise error when value is not supported.
                raise errors.InterfaceError(
                    'CID row type is "%s" but must be empty or one of: C, D, or F' % row_type,
                    self._location)
        self._location.advance_line()
    if self.data_format is None:
        raise errors.InterfaceError('data format must be specified', self._location)
    self.data_format.validate()
    if len(self.field_names) == 0:
        raise errors.InterfaceError('fields must be specified', self._location)
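# Hedged sketch, not part of the original class, of the ``rows`` layout that
# ``read()`` above dispatches on: the first entry of each list selects data format
# ('d'), field format ('f') or check ('c') handling; the concrete property, field
# and check values below are hypothetical.
def _example_read_cid(cid):
    example_rows = [
        ['d', 'format', 'delimited'],
        ['f', 'customer_id'],
        ['c', 'customer_id must be unique', 'IsUnique', 'customer_id'],
    ]
    cid.read('example_cid.csv', example_rows)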
def __init__(self, target_path):
    """
    Set up a writer that stores the data in ``target_path``, which has to be
    a string. Unlike with some other writers, this cannot be a stream.
    Internally the data are written to a worksheet first and only stored in a
    file during :py:meth:`cutplace.rowio.XlsxRowWriter.close`.
    """
    assert target_path is not None
    assert isinstance(target_path, six.string_types), \
        'target_path must be a string but is: %s' % type(target_path)

    self._target_path = target_path
    self._target_stream = None
    self._has_opened_target_stream = False
    self._location = errors.Location(self.target_path, has_cell=True)
    self._workbook = xlsxwriter.Workbook(self.target_path)
    self._worksheet = self._workbook.add_worksheet()
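# Hedged sketch, not part of the original class: per the docstring above, the writer
# needs a string path (not a stream) and only materializes the workbook once
# ``close()`` is called. The helper name, the use of ``os.path.join`` (assuming
# ``os`` is imported) and the file name are illustrative assumptions, as is the
# class name ``XlsxRowWriter`` taken from the docstring.
def _example_create_xlsx_writer(target_dir):
    return XlsxRowWriter(os.path.join(target_dir, 'example.xlsx'))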
def test_fails_on_duplicate_with_single_field(self):
    field_names = ['customer_id']
    check = checks.IsUniqueCheck('test check', 'customer_id', field_names)
    location = errors.Location(self.test_fails_on_duplicate_with_single_field, has_cell=True)
    check.check_row(_create_field_map(field_names, [1]), location)
    location.advance_line()
    check.check_row(_create_field_map(field_names, [2]), location)
    location.advance_line()
    try:
        check.check_row(_create_field_map(field_names, [1]), location)
        self.fail('duplicate row must cause CheckError')
    except errors.CheckError as error:
        self.assertTrue(error.see_also_location)
        self.assertNotEqual(location, error.see_also_location)
        self.assertEqual(error.location.cell, 0)
    # These methods should not do anything, but call them anyway for the test's sake.
    check.check_at_end(location)
    check.cleanup()
def __init__(self, target, data_format):
    assert target is not None
    assert data_format is not None
    assert data_format.is_valid

    self._data_format = data_format
    self._has_opened_target_stream = False
    if isinstance(target, six.string_types):
        self._target_path = target
        self._target_stream = io.open(
            self._target_path, 'w', encoding=data_format.encoding, newline='')
        self._has_opened_target_stream = True
    else:
        try:
            self._target_path = target.name
        except AttributeError:
            self._target_path = '<io>'
        self._target_stream = target
    self._location = errors.Location(self.target_path, has_cell=True)
def fixed_rows(fixed_source, encoding, field_name_and_lengths, line_delimiter='any'):
    r"""
    Rows found in file ``fixed_source`` using ``encoding``. The name and
    (fixed) length of the fields for each row are specified as a list of
    tuples ``(name, length)``. Each row can end with a line feed unless
    ``line_delimiter`` equals ``None``. Valid values are ``'\n'``, ``'\r'``
    and ``'\r\n'``; other values result in an `errors.DataFormatError`.
    Additionally ``'any'`` accepts any of the previous values.
    """
    assert fixed_source is not None
    assert encoding is not None
    for name, length in field_name_and_lengths:
        assert name is not None
        assert length >= 1, 'length for %s must be at least 1 but is %s' % (name, length)
    assert line_delimiter in _VALID_FIXED_LINE_DELIMITERS, \
        'line_delimiter=%s but must be one of: %s' % (
            _compat.text_repr(line_delimiter), _VALID_FIXED_LINE_DELIMITERS)

    # Predefine variable for access in local function.
    location = errors.Location(fixed_source, has_column=True)
    fixed_file = None
    # HACK: list with at most 1 character to be unread after a line feed. We
    # need to use a list so `_has_data_after_skipped_line_delimiter` can
    # modify its contents.
    unread_character_after_line_delimiter = [None]

    def _has_data_after_skipped_line_delimiter():
        """
        If `fixed_file` has data, assume they are a line delimiter as
        specified by `line_delimiter` and read and validate them. In case
        `line_delimiter` is `None`, the result is always ``True`` even if
        the input has already reached its end.
        """
        assert location is not None
        assert line_delimiter in _VALID_FIXED_LINE_DELIMITERS
        assert unread_character_after_line_delimiter[0] is None

        result = True
        if line_delimiter is not None:
            if line_delimiter == '\r\n':
                actual_line_delimiter = fixed_file.read(2)
            else:
                assert line_delimiter in ('\n', '\r', 'any')
                actual_line_delimiter = fixed_file.read(1)
            if actual_line_delimiter == '':
                result = False
            elif line_delimiter == 'any':
                if actual_line_delimiter == '\r':
                    # Process the optional '\n' for 'any'.
                    anticipated_linefeed = fixed_file.read(1)
                    if anticipated_linefeed == '\n':
                        actual_line_delimiter += anticipated_linefeed
                    elif anticipated_linefeed == '':
                        result = False
                    else:
                        # Unread the previous character because it is unrelated to line delimiters.
                        unread_character_after_line_delimiter[0] = anticipated_linefeed
                if actual_line_delimiter not in _VALID_FIXED_ANY_LINE_DELIMITERS:
                    valid_line_delimiters = _tools.human_readable_list(_VALID_FIXED_ANY_LINE_DELIMITERS)
                    raise errors.DataFormatError(
                        'line delimiter is %s but must be one of: %s'
                        % (_compat.text_repr(actual_line_delimiter), valid_line_delimiters),
                        location)
            elif actual_line_delimiter != line_delimiter:
                raise errors.DataFormatError(
                    'line delimiter is %s but must be %s'
                    % (_compat.text_repr(actual_line_delimiter), _compat.text_repr(line_delimiter)),
                    location)
        return result

    if isinstance(fixed_source, six.string_types):
        fixed_file = io.open(fixed_source, 'r', encoding=encoding)
        is_opened = True
    else:
        fixed_file = fixed_source
        is_opened = False

    has_data = True
    try:
        while has_data:
            field_index = 0
            row = []
            for field_name, field_length in field_name_and_lengths:
                if unread_character_after_line_delimiter[0] is None:
                    item = fixed_file.read(field_length)
                else:
                    assert len(unread_character_after_line_delimiter) == 1
                    item = unread_character_after_line_delimiter[0]
                    if field_length >= 2:
                        item += fixed_file.read(field_length - 1)
                    unread_character_after_line_delimiter[0] = None
                assert unread_character_after_line_delimiter[0] is None
                if not is_opened:
                    # Ensure that the input is a text file, `io.StringIO` or something similar.
                    # Binary files, `io.BytesIO` and the like cannot be used because they return
                    # bytes instead of strings.
                    # NOTE: We do not need to use _compat.text_repr(item) because type `unicode`
                    # does not fail here.
                    assert isinstance(item, six.text_type), \
                        '%s: fixed_source must yield strings but got type %s, value %r' \
                        % (location, type(item), item)
                item_length = len(item)
                if item_length == 0:
                    if field_index > 0:
                        names = [name for name, _ in field_name_and_lengths]
                        lengths = [length for _, length in field_name_and_lengths]
                        previous_field_index = field_index - 1
                        characters_needed_count = sum(lengths[field_index:])
                        list_of_missing_field_names = _tools.human_readable_list(
                            names[field_index:], 'and')
                        raise errors.DataFormatError(
                            "after field '%s' %d characters must follow for: %s"
                            % (names[previous_field_index], characters_needed_count,
                               list_of_missing_field_names), location)
                    # End of input reached.
                    has_data = False
                elif item_length == field_length:
                    row.append(item)
                    location.advance_column(field_length)
                    field_index += 1
                else:
                    raise errors.DataFormatError(
                        "cannot read field '%s': need %d characters but found only %d: %s"
                        % (field_name, field_length, item_length, _compat.text_repr(item)),
                        location)
            if has_data and not _has_data_after_skipped_line_delimiter():
                has_data = False
            if len(row) > 0:
                yield row
                location.advance_line()
    finally:
        if is_opened:
            fixed_file.close()
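# Hedged usage sketch, not part of the original module: ``fixed_rows()`` above yields
# one list of strings per fixed-width record. The field names and widths below are
# hypothetical, as is the helper name.
def _example_print_fixed_rows(fixed_path):
    name_and_lengths = [('branch_id', 5), ('customer_id', 8), ('first_name', 15)]
    for row in fixed_rows(fixed_path, 'utf-8', name_and_lengths, line_delimiter='any'):
        print(row)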
def ods_rows(source_ods_path, sheet=1):
    """
    Rows stored in ODS document ``source_ods_path`` in ``sheet``.

    :raises cutplace.errors.DataFormatError: if ``source_ods_path`` is not \
      a valid ODS file.
    """
    assert sheet >= 1

    def ods_content_root():
        """
        `ElementTree` for content.xml in `source_ods_path`.
        """
        assert source_ods_path is not None

        location = errors.Location(source_ods_path)
        try:
            # HACK: Use ``closing()`` because of Python 2.6.
            with closing(zipfile.ZipFile(source_ods_path, "r")) as zip_archive:
                try:
                    xml_data = zip_archive.read("content.xml")
                except Exception as error:
                    raise errors.DataFormatError(
                        'cannot extract content.xml for ODS spreadsheet: %s' % error, location)
        except errors.DataFormatError:
            raise
        except Exception as error:
            raise errors.DataFormatError('cannot uncompress ODS spreadsheet: %s' % error, location)

        with io.BytesIO(xml_data) as xml_stream:
            try:
                tree = ElementTree.parse(xml_stream)
            except Exception as error:
                raise errors.DataFormatError('cannot parse content.xml: %s' % error, location)

        return tree.getroot()

    content_root = ods_content_root()
    table_elements = list(
        _findall(content_root, 'office:body/office:spreadsheet/table:table',
                 namespaces=_OOO_NAMESPACES))
    table_count = len(table_elements)
    if table_count < sheet:
        error_message = 'ODS must contain at least %d sheet(s) instead of just %d' % (
            sheet, table_count)
        raise errors.DataFormatError(error_message, errors.Location(source_ods_path))
    table_element = table_elements[sheet - 1]
    location = errors.Location(source_ods_path, has_cell=True, has_sheet=True)
    for _ in range(sheet - 1):
        location.advance_sheet()
    for table_row in _findall(table_element, 'table:table-row', namespaces=_OOO_NAMESPACES):
        row = []
        for table_cell in _findall(table_row, 'table:table-cell', namespaces=_OOO_NAMESPACES):
            repeated_text = table_cell.attrib.get(_NUMBER_COLUMNS_REPEATED, '1')
            try:
                repeated_count = int(repeated_text)
                if repeated_count < 1:
                    raise errors.DataFormatError(
                        'table:number-columns-repeated is %s but must be at least 1'
                        % _compat.text_repr(repeated_text), location)
            except ValueError:
                raise errors.DataFormatError(
                    'table:number-columns-repeated is %s but must be an integer'
                    % _compat.text_repr(repeated_text), location)
            if six.PY2:
                text_p = table_cell.find('{%s}p' % _OOO_NAMESPACES['text'])
            else:
                text_p = table_cell.find('text:p', namespaces=_OOO_NAMESPACES)
            if text_p is None:
                cell_value = ''
            else:
                cell_value = text_p.text
                if six.PY2:
                    # HACK: It seems that under Python 2 ElementTree.find() returns a unicode
                    # string only if the value actually contains non ASCII characters, and
                    # otherwise a binary string. To work around this we check the result for
                    # binary strings and possibly convert them to unicode strings assuming UTF-8
                    # to be the internal encoding for the XML file. Ideally we would parse the
                    # XML header for the encoding. Considering that Python 2 is on the way out,
                    # this just doesn't seem to be worth the trouble right now.
                    if isinstance(cell_value, six.binary_type):
                        cell_value = six.text_type(cell_value, 'utf-8')
                    else:
                        assert isinstance(cell_value, six.text_type), 'cell_value=%r' % cell_value
            row.extend([cell_value] * repeated_count)
            location.advance_cell(repeated_count)
        yield row
        location.advance_line()
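# Hedged usage sketch, not part of the original module: reads the second sheet of an
# ODS document with ``ods_rows()`` above; the helper name and the choice to print
# each row are illustrative only.
def _example_print_ods_rows(ods_path):
    try:
        for row in ods_rows(ods_path, sheet=2):
            print(row)
    except errors.DataFormatError as error:
        print('cannot read ODS data: %s' % error)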