Example No. 1
0
    def ods_content_root():
        """
        Root XML element of the content.xml embedded in `source_ods_path`.
        """
        assert source_ods_path is not None

        error_location = errors.Location(source_ods_path)
        try:
            # HACK: Use ``closing()`` because of Python 2.6.
            with closing(zipfile.ZipFile(source_ods_path, "r")) as ods_archive:
                try:
                    content_xml_data = ods_archive.read("content.xml")
                except Exception as error:
                    raise errors.DataFormatError(
                        'cannot extract content.xml for ODS spreadsheet: %s' %
                        error, error_location)
        except errors.DataFormatError:
            # Errors already wrapped by the inner handler pass through as is.
            raise
        except Exception as error:
            raise errors.DataFormatError(
                'cannot uncompress ODS spreadsheet: %s' % error, error_location)

        with io.BytesIO(content_xml_data) as content_stream:
            try:
                content_tree = ElementTree.parse(content_stream)
            except Exception as error:
                raise errors.DataFormatError(
                    'cannot parse content.xml: %s' % error, error_location)

        return content_tree.getroot()
Example No. 2
0
def excel_rows(source_path, sheet=1):
    """
    Rows read from an Excel document (both :file:`*.xls` and :file:`*.xlsx`
    thanks to :py:mod:`xlrd`).

    :param str source_path: path to the Excel file to be read
    :param int sheet: the 1 based index of the sheet in the file to be read
    :return: sequence of lists with each list representing a row in the \
      Excel file
    :raises cutplace.errors.DataFormatError: in case the file cannot be read
    """
    assert source_path is not None
    assert sheet >= 1, 'sheet=%r' % sheet

    location = errors.Location(source_path, has_cell=True)
    try:
        with xlrd.open_workbook(source_path) as book:
            # Report a missing sheet as a data format error instead of
            # letting ``sheet_by_index()`` raise an ``IndexError``.
            if book.nsheets < sheet:
                raise errors.DataFormatError(
                    'Excel file must contain at least %d sheet(s) instead of just %d'
                    % (sheet, book.nsheets), location)
            # FIX: honor the ``sheet`` parameter (1 based) instead of always
            # reading the first sheet; use a separate name to avoid shadowing
            # the parameter.
            xlrd_sheet = book.sheet_by_index(sheet - 1)
            datemode = book.datemode
            for y in range(xlrd_sheet.nrows):
                row = []
                for x in range(xlrd_sheet.ncols):
                    row.append(_excel_cell_value(xlrd_sheet.cell(y, x),
                                                 datemode))
                    location.advance_cell()
                yield row
                location.advance_line()
    except xlrd.XLRDError as error:
        raise errors.DataFormatError('cannot read Excel file: %s' % error,
                                     location)
    except UnicodeError as error:
        raise errors.DataFormatError('cannot decode Excel data: %s' % error,
                                     location)
Example No. 3
0
def _raise_delimited_data_format_error(delimited_path, reader, error):
    """
    Raise a `errors.DataFormatError` for ``error`` that occurred while
    ``reader`` was parsing the delimited file at ``delimited_path``,
    pointing at the reader's current line.
    """
    location = errors.Location(delimited_path)
    if reader.line_num > 0:
        # Move the location to the line the csv reader last processed.
        location.advance_line(reader.line_num)
    raise errors.DataFormatError('cannot parse delimited file: %s' % error,
                                 location)
Example No. 4
0
 def write_row(self, row_to_write):
     """
     Write ``row_to_write`` through the wrapped delimited writer and advance
     the location to the next line.

     :raises cutplace.errors.DataFormatError: if a value in \
       ``row_to_write`` cannot be encoded
     """
     try:
         self._delimited_writer.writerow(row_to_write)
     except UnicodeEncodeError as error:
         raise errors.DataFormatError(
             'cannot write data row: %s; row=%s' % (error, row_to_write),
             self.location)
     # NOTE(review): the except clause above uses ``self.location`` while the
     # line below uses ``self._location`` — presumably ``location`` is a
     # property exposing ``_location``; confirm against the class, otherwise
     # one of the two is a bug.
     self._location.advance_line()
Example No. 5
0
    def _has_data_after_skipped_line_delimiter():
        """
        If `fixed_file` has data, assume they are a line delimiter as specified
        by `line_delimiter` and read and validate them.

        In case `line_delimiter` is `None`, the result is always ``True`` even
        if the input has already reached its end.

        Returns ``False`` exactly when the end of input was reached while
        reading the delimiter. May store one character in
        `unread_character_after_line_delimiter` for the caller to consume
        first (see the 'any' handling below).
        """
        assert location is not None
        assert line_delimiter in _VALID_FIXED_LINE_DELIMITERS
        assert unread_character_after_line_delimiter[0] is None

        result = True
        if line_delimiter is not None:
            if line_delimiter == '\r\n':
                # '\r\n' is the only delimiter consisting of 2 characters.
                actual_line_delimiter = fixed_file.read(2)
            else:
                assert line_delimiter in ('\n', '\r', 'any')
                actual_line_delimiter = fixed_file.read(1)
            if actual_line_delimiter == '':
                # Nothing left to read: end of input.
                result = False
            elif line_delimiter == 'any':
                if actual_line_delimiter == '\r':
                    # Process the optional '\n' for 'any'.
                    anticipated_linefeed = fixed_file.read(1)
                    if anticipated_linefeed == '\n':
                        actual_line_delimiter += anticipated_linefeed
                    elif anticipated_linefeed == '':
                        result = False
                    else:
                        # Unread the previous character because it is unrelated to line delimiters.
                        unread_character_after_line_delimiter[
                            0] = anticipated_linefeed
                if actual_line_delimiter not in _VALID_FIXED_ANY_LINE_DELIMITERS:
                    valid_line_delimiters = _tools.human_readable_list(
                        _VALID_FIXED_ANY_LINE_DELIMITERS)
                    raise errors.DataFormatError(
                        'line delimiter is %s but must be one of: %s' %
                        (_compat.text_repr(actual_line_delimiter),
                         valid_line_delimiters), location)
            elif actual_line_delimiter != line_delimiter:
                # A fixed delimiter was requested but something else was read.
                raise errors.DataFormatError(
                    'line delimiter is %s but must be %s' %
                    (_compat.text_repr(actual_line_delimiter),
                     _compat.text_repr(line_delimiter)), location)
        return result
Example No. 6
0
    def write_row(self, row_to_write):
        """
        Write a row of fixed length strings.

        :param list row_to_write: a list of str where each item must have \
          exactly the same length as the corresponding entry in \
          :py:attr:`~.field_lengths`
        :raises AssertionError: if ``row_to_write`` is not a list of \
          strings with each matching the corresponding ``field_lengths`` \
          as specified to :py:meth:`~.__init__`.
        """
        assert row_to_write is not None
        actual_item_count = len(row_to_write)
        assert actual_item_count == self._expected_row_item_count, \
            '%s: row must have %d items instead of %d: %s' \
            % (self.location, self._expected_row_item_count, actual_item_count, row_to_write)
        if __debug__:
            # Debug builds only: validate type and exact length of every cell.
            for cell_index, cell_value in enumerate(row_to_write):
                self.location.set_cell(cell_index)
                cell_name, expected_length = \
                    self._field_names_and_lengths[cell_index]
                assert isinstance(cell_value, six.text_type), \
                    '%s: field %s must be of type %s instead of %s: %r' \
                    % (self.location, _compat.text_repr(cell_name), six.text_type.__name__,
                       type(cell_value).__name__, cell_value)
                actual_length = len(cell_value)
                assert actual_length == expected_length, \
                    '%s: field %s must have exactly %d characters instead of %d: %r' \
                    % (self.location, _compat.text_repr(cell_name), expected_length, actual_length,
                       cell_value)
            self.location.set_cell(0)

        # Write the whole row as a single string so partial rows cannot end
        # up in the target stream on success.
        try:
            self._target_stream.write(''.join(row_to_write))
        except UnicodeEncodeError as error:
            raise errors.DataFormatError(
                'cannot write data row: %s; row=%s' % (error, row_to_write),
                self.location)
        if self._line_separator is not None:
            self._target_stream.write(self._line_separator)
        self.location.advance_line()
Example No. 7
0
def fixed_rows(fixed_source,
               encoding,
               field_name_and_lengths,
               line_delimiter='any'):
    r"""
    Rows found in file ``fixed_source`` using ``encoding``. The name and
    (fixed) length of the fields for each row are specified as a list of
    tuples ``(name, length)``. Each row can end with a line feed unless
    ``line_delimiter`` equals ``None``. Valid values are: ``'\n'``, ``'\r'``
    and ``'\r\n'``, in which case other values result in a
    `errors.DataFormatError`. Additionally ``'any'`` accepts any of the
    previous values.

    :param fixed_source: path to a file, or a file-like object yielding \
      text (not bytes)
    :param str encoding: text encoding used to open ``fixed_source``; only \
      applies when it is a path
    :param list field_name_and_lengths: ``(name, length)`` tuples describing \
      each field; every ``length`` must be at least 1
    :param line_delimiter: ``'\n'``, ``'\r'``, ``'\r\n'``, ``'any'`` or \
      ``None``
    :raises cutplace.errors.DataFormatError: if a row is truncated or \
      followed by an unexpected line delimiter
    """
    assert fixed_source is not None
    assert encoding is not None
    for name, length in field_name_and_lengths:
        assert name is not None
        assert length >= 1, 'length for %s must be at least 1 but is %s' % (
            name, length)
    assert line_delimiter in _VALID_FIXED_LINE_DELIMITERS, \
        'line_delimiter=%s but must be one of: %s' % (_compat.text_repr(line_delimiter), _VALID_FIXED_LINE_DELIMITERS)

    # Predefine variable for access in local function.
    location = errors.Location(fixed_source, has_column=True)
    fixed_file = None
    # HACK: list with at most 1 character to be unread after a line feed. We
    # need to use a list so `_has_data_after_skipped_line_delimiter` can
    # modify its contents.
    unread_character_after_line_delimiter = [None]

    def _has_data_after_skipped_line_delimiter():
        """
        If `fixed_file` has data, assume they are a line delimiter as specified
        by `line_delimiter` and read and validate them.

        In case `line_delimiter` is `None`, the result is always ``True`` even
        if the input has already reached its end.
        """
        assert location is not None
        assert line_delimiter in _VALID_FIXED_LINE_DELIMITERS
        assert unread_character_after_line_delimiter[0] is None

        result = True
        if line_delimiter is not None:
            if line_delimiter == '\r\n':
                # '\r\n' is the only delimiter consisting of 2 characters.
                actual_line_delimiter = fixed_file.read(2)
            else:
                assert line_delimiter in ('\n', '\r', 'any')
                actual_line_delimiter = fixed_file.read(1)
            if actual_line_delimiter == '':
                # Nothing left to read: end of input.
                result = False
            elif line_delimiter == 'any':
                if actual_line_delimiter == '\r':
                    # Process the optional '\n' for 'any'.
                    anticipated_linefeed = fixed_file.read(1)
                    if anticipated_linefeed == '\n':
                        actual_line_delimiter += anticipated_linefeed
                    elif anticipated_linefeed == '':
                        result = False
                    else:
                        # Unread the previous character because it is unrelated to line delimiters.
                        unread_character_after_line_delimiter[
                            0] = anticipated_linefeed
                if actual_line_delimiter not in _VALID_FIXED_ANY_LINE_DELIMITERS:
                    valid_line_delimiters = _tools.human_readable_list(
                        _VALID_FIXED_ANY_LINE_DELIMITERS)
                    raise errors.DataFormatError(
                        'line delimiter is %s but must be one of: %s' %
                        (_compat.text_repr(actual_line_delimiter),
                         valid_line_delimiters), location)
            elif actual_line_delimiter != line_delimiter:
                raise errors.DataFormatError(
                    'line delimiter is %s but must be %s' %
                    (_compat.text_repr(actual_line_delimiter),
                     _compat.text_repr(line_delimiter)), location)
        return result

    # A string is treated as a path and opened (and later closed) here;
    # anything else is assumed to be a file-like object owned by the caller.
    if isinstance(fixed_source, six.string_types):
        fixed_file = io.open(fixed_source, 'r', encoding=encoding)
        is_opened = True
    else:
        fixed_file = fixed_source
        is_opened = False

    has_data = True
    try:
        # Read one row per loop iteration until the input is exhausted.
        while has_data:
            field_index = 0
            row = []
            for field_name, field_length in field_name_and_lengths:
                # Consume the character unread by the line delimiter check
                # (if any) before reading the remainder of the field.
                if unread_character_after_line_delimiter[0] is None:
                    item = fixed_file.read(field_length)
                else:
                    assert len(unread_character_after_line_delimiter) == 1
                    item = unread_character_after_line_delimiter[0]
                    if field_length >= 2:
                        item += fixed_file.read(field_length - 1)
                    unread_character_after_line_delimiter[0] = None
                assert unread_character_after_line_delimiter[0] is None
                if not is_opened:
                    # Ensure that the input is a text file, `io.StringIO` or something similar. Binary files,
                    # `io.BytesIO` and the like cannot be used because the return bytes instead of strings.
                    # NOTE: We do not need to use _compat.text_repr(item) because type `unicode` does not fail here.
                    assert isinstance(item, six.text_type), \
                        '%s: fixed_source must yield strings but got type %s, value %r' % (location, type(item), item)
                item_length = len(item)
                if item_length == 0:
                    # Nothing read: either a clean end of input (before the
                    # first field) or a truncated row (in the middle of one).
                    if field_index > 0:
                        names = [name for name, _ in field_name_and_lengths]
                        lengths = [
                            length for _, length in field_name_and_lengths
                        ]
                        previous_field_index = field_index - 1
                        characters_needed_count = sum(lengths[field_index:])
                        list_of_missing_field_names = _tools.human_readable_list(
                            names[field_index:], 'and')
                        raise errors.DataFormatError(
                            "after field '%s' %d characters must follow for: %s"
                            % (names[previous_field_index],
                               characters_needed_count,
                               list_of_missing_field_names), location)
                    # End of input reached.
                    has_data = False
                elif item_length == field_length:
                    row.append(item)
                    location.advance_column(field_length)
                    field_index += 1
                else:
                    # Fewer characters than the field requires: truncated row.
                    raise errors.DataFormatError(
                        "cannot read field '%s': need %d characters but found only %d: %s"
                        % (field_name, field_length, item_length,
                           _compat.text_repr(item)), location)
            if has_data and not _has_data_after_skipped_line_delimiter():
                has_data = False
            if len(row) > 0:
                yield row
                location.advance_line()
    finally:
        # Close the file only if this function opened it.
        if is_opened:
            fixed_file.close()
Example No. 8
0
def ods_rows(source_ods_path, sheet=1):
    """
    Rows stored in ODS document ``source_ods_path`` in ``sheet``.

    :param str source_ods_path: path to the ODS file to be read
    :param int sheet: the 1 based index of the sheet in the file to be read
    :raises cutplace.errors.DataFormatError: if ``source_ods_path`` is not \
      a valid ODS file.
    """
    assert sheet >= 1

    def ods_content_root():
        """
        `ElementTree` for content.xml in `source_ods_path`.
        """
        assert source_ods_path is not None

        location = errors.Location(source_ods_path)
        try:
            # HACK: Use ``closing()`` because of Python 2.6.
            with closing(zipfile.ZipFile(source_ods_path, "r")) as zip_archive:
                try:
                    xml_data = zip_archive.read("content.xml")
                except Exception as error:
                    raise errors.DataFormatError(
                        'cannot extract content.xml for ODS spreadsheet: %s' %
                        error, location)
        except errors.DataFormatError:
            # Errors already wrapped by the inner handler pass through as is.
            raise
        except Exception as error:
            raise errors.DataFormatError(
                'cannot uncompress ODS spreadsheet: %s' % error, location)

        with io.BytesIO(xml_data) as xml_stream:
            try:
                tree = ElementTree.parse(xml_stream)
            except Exception as error:
                raise errors.DataFormatError(
                    'cannot parse content.xml: %s' % error, location)

        return tree.getroot()

    content_root = ods_content_root()
    table_elements = list(
        _findall(content_root,
                 'office:body/office:spreadsheet/table:table',
                 namespaces=_OOO_NAMESPACES))
    table_count = len(table_elements)
    if table_count < sheet:
        error_message = 'ODS must contain at least %d sheet(s) instead of just %d' % (
            sheet, table_count)
        raise errors.DataFormatError(error_message,
                                     errors.Location(source_ods_path))
    # ``sheet`` is 1 based while the element list is 0 based.
    table_element = table_elements[sheet - 1]
    location = errors.Location(source_ods_path, has_cell=True, has_sheet=True)
    for _ in range(sheet - 1):
        location.advance_sheet()
    for table_row in _findall(table_element,
                              'table:table-row',
                              namespaces=_OOO_NAMESPACES):
        row = []
        for table_cell in _findall(table_row,
                                   'table:table-cell',
                                   namespaces=_OOO_NAMESPACES):
            # ODS collapses consecutive identical cells into one element with
            # a repeat count attribute; a missing attribute means 1.
            repeated_text = table_cell.attrib.get(_NUMBER_COLUMNS_REPEATED,
                                                  '1')
            try:
                repeated_count = int(repeated_text)
                if repeated_count < 1:
                    raise errors.DataFormatError(
                        'table:number-columns-repeated is %s but must be at least 1'
                        % _compat.text_repr(repeated_text), location)
            except ValueError:
                raise errors.DataFormatError(
                    'table:number-columns-repeated is %s but must be an integer'
                    % _compat.text_repr(repeated_text), location)
            # Python 2's ElementTree.find() lacks the ``namespaces``
            # parameter, so the namespace has to be expanded manually.
            if six.PY2:
                text_p = table_cell.find('{%s}p' % _OOO_NAMESPACES['text'])
            else:
                text_p = table_cell.find('text:p', namespaces=_OOO_NAMESPACES)
            if text_p is None:
                cell_value = ''
            else:
                cell_value = text_p.text
                if six.PY2:
                    # HACK: It seems that under Python 2 ElementTree.find() returns a unicode string only if the value
                    # actually contains non ASCII characters, and otherwise a binary string. To work around this we
                    # check the result for binary strings and possibly convert them to unicode strings assuming UTF-8
                    # to be the internal encoding for the XML file. Ideally we would parse the XML header for the
                    # encoding. Considering that Python 2 is on the way out, this just doesn't seem to be worth the
                    # trouble right now.
                    if isinstance(cell_value, six.binary_type):
                        cell_value = six.text_type(cell_value, 'utf-8')
                    else:
                        assert isinstance(
                            cell_value,
                            six.text_type), 'cell_value=%r' % cell_value
            row.extend([cell_value] * repeated_count)
            location.advance_cell(repeated_count)
        yield row
        location.advance_line()