コード例 #1
0
ファイル: test_errors.py プロジェクト: butterhirsch/cutplace2
 def test_can_compare_two_locations(self):
     """Two locations built from identical arguments must compare equal
     and neither must be less than the other."""
     first = errors.Location("eggs.ods", has_cell=True, has_sheet=True)
     second = errors.Location("eggs.ods", has_cell=True, has_sheet=True)
     self.assertEqual(first.__eq__(second), True)
     self.assertEqual(first.__lt__(second), False)
コード例 #2
0
 def test_fails_on_too_many_distinct_values(self):
     """check_at_end() must raise CheckError once branch_id holds more
     distinct values than "branch_id < 3" allows."""
     field_names = _TEST_FIELD_NAMES
     # Constructing a check whose rule has no surrounding spaces must work too.
     checks.DistinctCountCheck("test check", "branch_id<3", field_names)
     check = checks.DistinctCountCheck("test check", "branch_id < 3",
                                       field_names)
     location = errors.Location(self.test_fails_on_too_many_distinct_values,
                                has_cell=True)
     rows_within_limit = [
         [38000, 23, "John", "Doe", "male", "08.03.1957"],
         [38001, 59, "Jane", "Miller", "female", "04.10.1946"],
     ]
     for row_index, row_values in enumerate(rows_within_limit):
         if row_index:
             location.advance_line()
         check.check_row(_create_field_map(field_names, row_values),
                         location)
     # Two distinct branch_id values are still acceptable.
     check.check_at_end(location)
     location.advance_line()
     check.check_row(
         _create_field_map(
             field_names,
             [38003, 59, "Jane", "Miller", "female", "04.10.1946"]),
         location)
     # A third distinct branch_id value must make the final check fail.
     self.assertRaises(errors.CheckError, check.check_at_end, location)
コード例 #3
0
 def test_can_check_empty_row(self):
     # HACK: This is just here to make coverage happy because "# pragma: no cover" does not work
     # on methods that consist of nothing but a single "pass".
     names = _TEST_FIELD_NAMES
     no_op_check = checks.AbstractCheck("test check", "", names)
     row_location = errors.Location(self.test_can_check_empty_row,
                                    has_cell=True)
     no_op_check.check_row([], row_location)
コード例 #4
0
ファイル: rowio.py プロジェクト: butterhirsch/cutplace2
    def ods_content_root():
        """
        `ElementTree` for content.xml in `source_ods_path`.

        Raises `errors.DataFormatError` if the ODS file cannot be
        uncompressed, content.xml cannot be extracted from it, or the
        extracted XML cannot be parsed.
        """
        assert source_ods_path is not None

        location = errors.Location(source_ods_path)
        try:
            # HACK: Use ``closing()`` because of Python 2.6.
            with closing(zipfile.ZipFile(source_ods_path, "r")) as zip_archive:
                try:
                    xml_data = zip_archive.read("content.xml")
                except Exception as error:
                    # Extraction failed even though the archive itself opened.
                    raise errors.DataFormatError(
                        'cannot extract content.xml for ODS spreadsheet: %s' %
                        error, location)
        except errors.DataFormatError:
            # Let the specific "cannot extract" error from above pass through
            # instead of re-wrapping it as an uncompress error.
            raise
        except Exception as error:
            raise errors.DataFormatError(
                'cannot uncompress ODS spreadsheet: %s' % error, location)

        with io.BytesIO(xml_data) as xml_stream:
            try:
                tree = ElementTree.parse(xml_stream)
            except Exception as error:
                raise errors.DataFormatError(
                    'cannot parse content.xml: %s' % error, location)

        return tree.getroot()
コード例 #5
0
ファイル: rowio.py プロジェクト: butterhirsch/cutplace2
def _raise_delimited_data_format_error(delimited_path, reader, error):
    """
    Raise a `errors.DataFormatError` for `error`, located at the line the
    csv `reader` currently is at in `delimited_path`.
    """
    location = errors.Location(delimited_path)
    current_line = reader.line_num
    if current_line > 0:
        location.advance_line(current_line)
    message = 'cannot parse delimited file: %s' % error
    raise errors.DataFormatError(message, location)
コード例 #6
0
    def test_fails_on_duplicate_with_multiple_fields(self):
        """A row repeating an already seen (branch_id, customer_id) pair
        must raise a CheckError that refers back to the first occurrence."""
        field_names = _TEST_FIELD_NAMES
        check = checks.IsUniqueCheck("test check", "branch_id, customer_id",
                                     field_names)
        location = errors.Location(
            self.test_fails_on_duplicate_with_multiple_fields, has_cell=True)
        unique_rows = [
            [38000, 23, "John", "Doe", "male", "08.03.1957"],
            [38000, 59, "Jane", "Miller", "female", "04.10.1946"],
        ]
        for row_values in unique_rows:
            check.check_row(_create_field_map(field_names, row_values),
                            location)
            location.advance_line()
        try:
            check.check_row(
                _create_field_map(
                    field_names,
                    [38000, 59, "Jane", "Miller", "female", "04.10.1946"]),
                location)
            self.fail("duplicate row must cause CheckError")
        except errors.CheckError as error:
            self.assertTrue(error.see_also_location)
            self.assertNotEqual(location, error.see_also_location)
            self.assertEqual(error.location.cell, 0)

        # These methods should not do anything, but call them anyway for tests sake.
        check.check_at_end(location)
        check.cleanup()
コード例 #7
0
ファイル: rowio.py プロジェクト: butterhirsch/cutplace2
def excel_rows(source_path, sheet=1):
    """
    Rows read from an Excel document (both :file:`*.xls` and :file:`*.xlsx`
    thanks to :py:mod:`xlrd`).

    :param str source_path: path to the Excel file to be read
    :param int sheet: the 1-based number of the sheet in the file to be read
    :return: sequence of lists with each list representing a row in the \
      Excel file
    :raises cutplace.errors.DataFormatError: in case the file cannot be read
    """
    assert source_path is not None
    assert sheet >= 1, 'sheet=%r' % sheet

    location = errors.Location(source_path, has_cell=True)
    try:
        with xlrd.open_workbook(source_path) as book:
            # FIX: honor the requested ``sheet`` instead of always reading the
            # first one (the old code called ``sheet_by_index(0)`` and also
            # shadowed the ``sheet`` parameter).
            current_sheet = book.sheet_by_index(sheet - 1)
            datemode = book.datemode
            for y in range(current_sheet.nrows):
                row = []
                for x in range(current_sheet.ncols):
                    row.append(
                        _excel_cell_value(current_sheet.cell(y, x), datemode))
                    location.advance_cell()
                yield row
                location.advance_line()
    except xlrd.XLRDError as error:
        raise errors.DataFormatError('cannot read Excel file: %s' % error,
                                     location)
    except UnicodeError as error:
        raise errors.DataFormatError('cannot decode Excel data: %s' % error,
                                     location)
コード例 #8
0
    def __init__(self, cid_or_path, source_data_stream_or_path, on_error='raise', validate_until=None):
        """
        An iterator that produces possibly validated rows from
        ``source_data_stream_or_path`` conforming to ``cid_or_path``.

        If a row cannot be read, ``on_error`` specifies what to do about it:

        * ``'continue'``: quietly continue with the next row.
        * ``'raise'`` (the default): raise an exception and stop reading.
        * ``'yield'``: instead of a row, the result contains a \
          :py:exc:`cutplace.errors.DataError`.

        :param validate_until: number of rows after which validation should \
          stop; further rows are still produced but not validated anymore; \
          ``None`` means all rows should be validated (the default); 0 means \
          no rows should be validated
        :type: int or None
        """
        assert cid_or_path is not None
        assert source_data_stream_or_path is not None
        assert on_error in _VALID_ON_ERROR_CHOICES, 'on_error=%r' % on_error
        assert (validate_until is None) or (validate_until >= 0)

        super(Reader, self).__init__(cid_or_path)
        # TODO: Consolidate obtaining source path with other code segments that do similar things.
        if isinstance(source_data_stream_or_path, six.string_types):
            source_path = source_data_stream_or_path
        else:
            # For streams, fall back to the stream's name attribute and use a
            # placeholder for nameless streams such as io.StringIO.
            try:
                source_path = source_data_stream_or_path.name
            except AttributeError:
                source_path = '<io>'
        self._location = errors.Location(source_path, has_cell=True)
        self._source_data_stream_or_path = source_data_stream_or_path
        self._on_error = on_error
        self._validate_until = validate_until
        # Counters remain None until reading actually starts.
        self.accepted_rows_count = None
        self.rejected_rows_count = None
コード例 #9
0
ファイル: test_errors.py プロジェクト: butterhirsch/cutplace2
 def test_can_create_simple_cutplace_error(self):
     """A CutplaceError renders as '<location>: <message>'."""
     error_location = errors.Location('eggs.ods', has_cell=True,
                                      has_sheet=True)
     error = errors.CutplaceError('something must be something else',
                                  error_location)
     self.assertEqual(error.location, error_location)
     self.assertEqual(
         str(error),
         'eggs.ods (Sheet1!R1C1): something must be something else')
コード例 #10
0
ファイル: test_errors.py プロジェクト: butterhirsch/cutplace2
 def test_can_create_cutplace_error_with_see_also_details(self):
     """An error built on top of a cause keeps the cause, its location and
     its message, and mentions them in the rendered text."""
     outer_location = errors.Location('eggs.ods', has_cell=True,
                                      has_sheet=True)
     outer_location.advance_line(3)
     outer_location.advance_cell(2)
     cause_location = errors.Location('spam.ods', has_cell=True,
                                      has_sheet=True)
     cause = errors.CutplaceError('something must be something else',
                                  cause_location)
     error = errors.CutplaceError('cannot do something', outer_location,
                                  cause.message, cause.location, cause)
     self.assertEqual(error.location, outer_location)
     self.assertEqual(error.see_also_location, cause.location)
     self.assertEqual(error.cause, cause)
     self.assertEqual(
         str(error),
         'eggs.ods (Sheet1!R4C3): cannot do something '
         '(see also: spam.ods (Sheet1!R1C1): something must be something else)'
     )
コード例 #11
0
ファイル: test_errors.py プロジェクト: butterhirsch/cutplace2
    def test_can_work_with_location(self):
        # TODO: Cleanup: split up in several tests with meaningful names.

        # Plain text input with columns.
        loc = errors.Location("eggs.txt", has_column=True)
        self.assertEqual(loc.line, 0)
        self.assertEqual(loc.column, 0)
        self.assertEqual(str(loc), "eggs.txt (1;1)")
        loc.advance_column(3)
        self.assertEqual(loc.column, 3)
        loc.advance_column()
        self.assertEqual(loc.column, 4)
        loc.advance_line()
        self.assertEqual(loc.line, 1)
        self.assertEqual(loc.column, 0)
        self.assertEqual(str(loc), "eggs.txt (2;1)")

        # Test input with cells.
        loc = errors.Location("eggs.csv", has_cell=True)
        self.assertEqual(loc.line, 0)
        self.assertEqual(loc.cell, 0)
        self.assertEqual(str(loc), "eggs.csv (R1C1)")
        loc.advance_line()
        loc.advance_cell(17)
        self.assertEqual(repr(loc), "eggs.csv (R2C18)")

        # Test input with sheet.
        loc = errors.Location("eggs.ods", has_cell=True, has_sheet=True)
        self.assertEqual(str(loc), "eggs.ods (Sheet1!R1C1)")
        loc.advance_sheet()
        loc.advance_line()
        loc.advance_cell(17)
        loc._set_sheet(4)
        self.assertEqual(str(loc), "eggs.ods (Sheet5!R2C18)")

        # Test StringIO input.
        nameless_stream = io.StringIO("hugo was here")
        loc = errors.Location(nameless_stream)
        self.assertEqual(str(loc), "<io> (1)")
コード例 #12
0
    def read(self, cid_path, rows):
        """
        Process ``rows`` using
        :py:meth:`~cutplace.interface.Cid.add_data_format_row()`,
        :py:meth:`~cutplace.interface.Cid.add_field_format()` and
        :py:meth:`~cutplace.interface.Cid.add_check()`, provided no
        ``cid_path`` has already been specified for
        :py:class:`~cutplace.interface.Cid.__init__()`. Any errors are
        reported referring to ``cid_path``.

        :param str cid_path: the path from which ``rows`` where obtained
        :param sequence rows: sequence of lists where each list either \
          describes a data format, field format, check or comment for a CID.

        :raises cutplace.errors.InterfaceError: in case any row in ``rows`` \
          cannot be processed
        """
        assert cid_path is not None
        assert self.data_format is None, 'CID must be read only once'

        # TODO: Detect format and use proper reader.
        self._location = errors.Location(cid_path, has_cell=True)
        if self._cid_path is None:
            self._cid_path = cid_path
        for row in rows:
            if row:
                row_kind = row[0].lower().strip()
                # Pad (or trim) the remaining cells to exactly 6 entries.
                row_payload = (row[1:] + [''] * 6)[:6]
                if row_kind == 'd':
                    self.add_data_format_row(row_payload)
                elif row_kind == 'f':
                    self.add_field_format_row(row_payload)
                elif row_kind == 'c':
                    self.add_check_row(row_payload)
                elif row_kind != '':
                    # Raise error when value is not supported.
                    raise errors.InterfaceError(
                        'CID row type is "%s" but must be empty or one of: C, D, or F'
                        % row_kind, self._location)
            self._location.advance_line()
        if self.data_format is None:
            raise errors.InterfaceError('data format must be specified',
                                        self._location)
        self.data_format.validate()
        if not self.field_names:
            raise errors.InterfaceError('fields must be specified',
                                        self._location)
コード例 #13
0
    def __init__(self, target_path):
        """
        Set up a writer that stores its output in ``target_path``, which
        must be a string; unlike some other writers, a stream cannot be
        used here.

        Internally the data are collected in a worksheet first and only
        written to a file during
        :py:meth:`cutplace.rowio.XlsxRowWriter.close`.
        """
        assert target_path is not None
        assert isinstance(
            target_path, six.string_types
        ), 'target_path must be a string but is: %s' % type(target_path)

        self._has_opened_target_stream = False
        self._target_stream = None
        self._target_path = target_path
        self._location = errors.Location(self.target_path, has_cell=True)
        self._workbook = xlsxwriter.Workbook(self.target_path)
        self._worksheet = self._workbook.add_worksheet()
コード例 #14
0
    def test_fails_on_duplicate_with_single_field(self):
        """A repeated customer_id must raise a CheckError that points back
        to the location of the first occurrence."""
        field_names = ['customer_id']
        check = checks.IsUniqueCheck('test check', 'customer_id', field_names)
        location = errors.Location(
            self.test_fails_on_duplicate_with_single_field, has_cell=True)
        for customer_id in (1, 2):
            check.check_row(_create_field_map(field_names, [customer_id]),
                            location)
            location.advance_line()
        try:
            check.check_row(_create_field_map(field_names, [1]), location)
            self.fail('duplicate row must cause CheckError')
        except errors.CheckError as error:
            self.assertTrue(error.see_also_location)
            self.assertNotEqual(location, error.see_also_location)
            self.assertEqual(error.location.cell, 0)

        # These methods should not do anything, but call them anyway for tests sake.
        check.check_at_end(location)
        check.cleanup()
コード例 #15
0
ファイル: rowio.py プロジェクト: butterhirsch/cutplace2
    def __init__(self, target, data_format):
        """
        Set up a writer for ``target``, which can be a path or a stream,
        producing output that conforms to the valid ``data_format``.
        """
        assert target is not None
        assert data_format is not None
        assert data_format.is_valid

        self._data_format = data_format
        self._has_opened_target_stream = False
        if not isinstance(target, six.string_types):
            # Use the stream as is and derive a path name from it, falling
            # back to a placeholder for nameless streams.
            try:
                self._target_path = target.name
            except AttributeError:
                self._target_path = '<io>'
            self._target_stream = target
        else:
            # A path was given, so open (and later close) the stream ourselves.
            self._target_path = target
            self._target_stream = io.open(self._target_path,
                                          'w',
                                          encoding=data_format.encoding,
                                          newline='')
            self._has_opened_target_stream = True
        self._location = errors.Location(self.target_path, has_cell=True)
コード例 #16
0
ファイル: rowio.py プロジェクト: butterhirsch/cutplace2
def fixed_rows(fixed_source,
               encoding,
               field_name_and_lengths,
               line_delimiter='any'):
    r"""
    Rows found in file ``fixed_source`` using ``encoding``. The name and
    (fixed) length of the fields for each row are specified as a list of
    tuples ``(name, length)``. Each row can end with a line feed unless
    ``line_delimiter`` equals ``None``. Valid values are: ``'\n'``, ``'\r'``
    and ``'\r\n'``, in which case other values result in a
    `errors.DataFormatError`. Additionally ``'any'`` accepts any of the
    previous values.
    """
    assert fixed_source is not None
    assert encoding is not None
    for name, length in field_name_and_lengths:
        assert name is not None
        assert length >= 1, 'length for %s must be at least 1 but is %s' % (
            name, length)
    assert line_delimiter in _VALID_FIXED_LINE_DELIMITERS, \
        'line_delimiter=%s but must be one of: %s' % (_compat.text_repr(line_delimiter), _VALID_FIXED_LINE_DELIMITERS)

    # Predefine variable for access in local function.
    location = errors.Location(fixed_source, has_column=True)
    fixed_file = None
    # HACK: list with at most 1 character to be unread after a line feed. We
    # need to use a list so `_has_data_after_skipped_line_delimiter` can
    # modify its contents.
    unread_character_after_line_delimiter = [None]

    def _has_data_after_skipped_line_delimiter():
        """
        If `fixed_file` has data, assume they are a line delimiter as specified
        by `line_delimiter` and read and validate them.

        In case `line_delimiter` is `None`, the result is always ``True`` even
        if the input has already reached its end.
        """
        assert location is not None
        assert line_delimiter in _VALID_FIXED_LINE_DELIMITERS
        assert unread_character_after_line_delimiter[0] is None

        result = True
        if line_delimiter is not None:
            if line_delimiter == '\r\n':
                # A two character delimiter needs a two character read.
                actual_line_delimiter = fixed_file.read(2)
            else:
                assert line_delimiter in ('\n', '\r', 'any')
                actual_line_delimiter = fixed_file.read(1)
            if actual_line_delimiter == '':
                # Empty read: end of input reached.
                result = False
            elif line_delimiter == 'any':
                if actual_line_delimiter == '\r':
                    # Process the optional '\n' for 'any'.
                    anticipated_linefeed = fixed_file.read(1)
                    if anticipated_linefeed == '\n':
                        actual_line_delimiter += anticipated_linefeed
                    elif anticipated_linefeed == '':
                        result = False
                    else:
                        # Unread the previous character because it is unrelated to line delimiters.
                        unread_character_after_line_delimiter[
                            0] = anticipated_linefeed
                if actual_line_delimiter not in _VALID_FIXED_ANY_LINE_DELIMITERS:
                    valid_line_delimiters = _tools.human_readable_list(
                        _VALID_FIXED_ANY_LINE_DELIMITERS)
                    raise errors.DataFormatError(
                        'line delimiter is %s but must be one of: %s' %
                        (_compat.text_repr(actual_line_delimiter),
                         valid_line_delimiters), location)
            elif actual_line_delimiter != line_delimiter:
                raise errors.DataFormatError(
                    'line delimiter is %s but must be %s' %
                    (_compat.text_repr(actual_line_delimiter),
                     _compat.text_repr(line_delimiter)), location)
        return result

    # Open the source ourselves only if a path was given; an already open
    # stream is used as is and left open at the end.
    if isinstance(fixed_source, six.string_types):
        fixed_file = io.open(fixed_source, 'r', encoding=encoding)
        is_opened = True
    else:
        fixed_file = fixed_source
        is_opened = False

    has_data = True
    try:
        while has_data:
            field_index = 0
            row = []
            for field_name, field_length in field_name_and_lengths:
                # A character unread by the 'any' delimiter handling above is
                # consumed before reading more data from the file.
                if unread_character_after_line_delimiter[0] is None:
                    item = fixed_file.read(field_length)
                else:
                    assert len(unread_character_after_line_delimiter) == 1
                    item = unread_character_after_line_delimiter[0]
                    if field_length >= 2:
                        item += fixed_file.read(field_length - 1)
                    unread_character_after_line_delimiter[0] = None
                assert unread_character_after_line_delimiter[0] is None
                if not is_opened:
                    # Ensure that the input is a text file, `io.StringIO` or something similar. Binary files,
                    # `io.BytesIO` and the like cannot be used because the return bytes instead of strings.
                    # NOTE: We do not need to use _compat.text_repr(item) because type `unicode` does not fail here.
                    assert isinstance(item, six.text_type), \
                        '%s: fixed_source must yield strings but got type %s, value %r' % (location, type(item), item)
                item_length = len(item)
                if item_length == 0:
                    # Empty read: either a clean end of input (before the
                    # first field of a row) or a truncated last row.
                    if field_index > 0:
                        names = [name for name, _ in field_name_and_lengths]
                        lengths = [
                            length for _, length in field_name_and_lengths
                        ]
                        previous_field_index = field_index - 1
                        characters_needed_count = sum(lengths[field_index:])
                        list_of_missing_field_names = _tools.human_readable_list(
                            names[field_index:], 'and')
                        raise errors.DataFormatError(
                            "after field '%s' %d characters must follow for: %s"
                            % (names[previous_field_index],
                               characters_needed_count,
                               list_of_missing_field_names), location)
                    # End of input reached.
                    has_data = False
                elif item_length == field_length:
                    row.append(item)
                    location.advance_column(field_length)
                    field_index += 1
                else:
                    # A partial read means the input ended mid-field.
                    raise errors.DataFormatError(
                        "cannot read field '%s': need %d characters but found only %d: %s"
                        % (field_name, field_length, item_length,
                           _compat.text_repr(item)), location)
            if has_data and not _has_data_after_skipped_line_delimiter():
                has_data = False
            if len(row) > 0:
                yield row
                location.advance_line()
    finally:
        if is_opened:
            fixed_file.close()
コード例 #17
0
ファイル: rowio.py プロジェクト: butterhirsch/cutplace2
def ods_rows(source_ods_path, sheet=1):
    """
    Rows stored in ODS document ``source_ods_path`` in ``sheet``.

    :param int sheet: the 1-based number of the sheet to read

    :raises cutplace.errors.DataFormatError: if ``source_ods_path`` is not \
      a valid ODS file.
    """
    assert sheet >= 1

    def ods_content_root():
        """
        `ElementTree` for content.xml in `source_ods_path`.
        """
        assert source_ods_path is not None

        location = errors.Location(source_ods_path)
        try:
            # HACK: Use ``closing()`` because of Python 2.6.
            with closing(zipfile.ZipFile(source_ods_path, "r")) as zip_archive:
                try:
                    xml_data = zip_archive.read("content.xml")
                except Exception as error:
                    # Extraction failed even though the archive itself opened.
                    raise errors.DataFormatError(
                        'cannot extract content.xml for ODS spreadsheet: %s' %
                        error, location)
        except errors.DataFormatError:
            # Let the specific "cannot extract" error from above pass through
            # instead of re-wrapping it as an uncompress error.
            raise
        except Exception as error:
            raise errors.DataFormatError(
                'cannot uncompress ODS spreadsheet: %s' % error, location)

        with io.BytesIO(xml_data) as xml_stream:
            try:
                tree = ElementTree.parse(xml_stream)
            except Exception as error:
                raise errors.DataFormatError(
                    'cannot parse content.xml: %s' % error, location)

        return tree.getroot()

    content_root = ods_content_root()
    table_elements = list(
        _findall(content_root,
                 'office:body/office:spreadsheet/table:table',
                 namespaces=_OOO_NAMESPACES))
    table_count = len(table_elements)
    if table_count < sheet:
        error_message = 'ODS must contain at least %d sheet(s) instead of just %d' % (
            sheet, table_count)
        raise errors.DataFormatError(error_message,
                                     errors.Location(source_ods_path))
    table_element = table_elements[sheet - 1]
    location = errors.Location(source_ods_path, has_cell=True, has_sheet=True)
    # Move the location to the requested sheet so error messages match.
    for _ in range(sheet - 1):
        location.advance_sheet()
    for table_row in _findall(table_element,
                              'table:table-row',
                              namespaces=_OOO_NAMESPACES):
        row = []
        for table_cell in _findall(table_row,
                                   'table:table-cell',
                                   namespaces=_OOO_NAMESPACES):
            # ODS collapses runs of identical cells into a single element
            # with a repeat count, which defaults to 1.
            repeated_text = table_cell.attrib.get(_NUMBER_COLUMNS_REPEATED,
                                                  '1')
            try:
                repeated_count = int(repeated_text)
                if repeated_count < 1:
                    # NOTE(review): this is raised inside a try that catches
                    # ValueError; assumes DataFormatError is not a ValueError
                    # subclass -- confirm against cutplace.errors.
                    raise errors.DataFormatError(
                        'table:number-columns-repeated is %s but must be at least 1'
                        % _compat.text_repr(repeated_text), location)
            except ValueError:
                raise errors.DataFormatError(
                    'table:number-columns-repeated is %s but must be an integer'
                    % _compat.text_repr(repeated_text), location)
            if six.PY2:
                text_p = table_cell.find('{%s}p' % _OOO_NAMESPACES['text'])
            else:
                text_p = table_cell.find('text:p', namespaces=_OOO_NAMESPACES)
            if text_p is None:
                # A cell without a text:p child counts as an empty cell.
                cell_value = ''
            else:
                cell_value = text_p.text
                if six.PY2:
                    # HACK: It seems that under Python 2 ElementTree.find() returns a unicode string only of the value
                    # actually contains non ASCII characters, and otherwise a binary string. To work around this we
                    # check the result for binary strings and possibly convert them to uncicode strings assuming UTF-8
                    # to be the internal encoding for the XML file. Ideally we would parse the XML header for the
                    # encoding. Considering that Python 2 is on the way out, this just doesn't seem to be worth the
                    # trouble right now.
                    if isinstance(cell_value, six.binary_type):
                        cell_value = six.text_type(cell_value, 'utf-8')
                    else:
                        assert isinstance(
                            cell_value,
                            six.text_type), 'cell_value=%r' % cell_value
            # Expand the repeat count back into individual cells.
            row.extend([cell_value] * repeated_count)
            location.advance_cell(repeated_count)
        yield row
        location.advance_line()