Beispiel #1
0
 def test_can_read_delimited_rows(self):
     """Check the heading row and the first data row read from ``dev_test.CUSTOMERS_CSV_PATH``."""
     # TODO: either get rid of the CID and move it to test_iotools or use validate.Reader and move it to test_validate.
     customers_format = interface.Cid(dev_test.CID_CUSTOMERS_ODS_PATH).data_format
     row_iterator = rowio.delimited_rows(dev_test.CUSTOMERS_CSV_PATH, customers_format)

     # The first row delivered is the heading row.
     expected_heading = ['customer_id', 'surname', 'first_name', 'born', 'gender']
     self.assertEqual(next(row_iterator), expected_heading)

     # The second row delivered is the first actual customer.
     expected_first_customer = ['1', 'Beck', 'Tyler', '1995-11-15', 'male']
     self.assertEqual(next(row_iterator), expected_first_customer)
Beispiel #2
0
 def test_fails_on_delimited_with_unterminated_quote(self):
     """
     Ensure that reading a delimited file containing an unterminated quote
     raises ``errors.DataFormatError`` whose message contains
     'cannot parse delimited file'.
     """
     customer_cid = interface.Cid(dev_test.CID_CUSTOMERS_ODS_PATH)
     broken_delimited_path = dev_test.path_to_test_data('broken_customers_with_unterminated_quote.csv')
     # Use assertRaises() so the test fails when no error is raised at all;
     # a plain try/except without an else branch would pass silently.
     with self.assertRaises(errors.DataFormatError) as raised:
         list(rowio.delimited_rows(broken_delimited_path, customer_cid.data_format))
     error_message = '%s' % raised.exception
     self.assertTrue(
         'cannot parse delimited file' in error_message, 'error_message=%r' % error_message)
 def test_can_read_delimited_rows(self):
     """Check the first row read from the test data file ``valid_customers.csv``."""
     # TODO: either get rid of the CID and move it to test_iotools or use validate.Reader and move it to test_validate.
     cid_path = dev_test.path_to_test_cid("icd_customers.xls")
     customers_cid = interface.Cid(cid_path)
     data_path = dev_test.path_to_test_data("valid_customers.csv")
     row_iterator = rowio.delimited_rows(data_path, customers_cid._data_format)
     expected_first_row = ['38000', '23', 'John', 'Doe', 'male', '08.03.1957']
     self.assertEqual(next(row_iterator), expected_first_row)
 def test_can_read_delimited_rows(self):
     """
     Read the first two rows from ``dev_test.CUSTOMERS_CSV_PATH`` and compare
     them with the expected heading row and first data row.
     """
     # TODO: either get rid of the CID and move it to test_iotools or use validate.Reader and move it to test_validate.
     cid = interface.Cid(dev_test.CID_CUSTOMERS_ODS_PATH)
     rows = rowio.delimited_rows(dev_test.CUSTOMERS_CSV_PATH, cid.data_format)

     heading = next(rows)
     self.assertEqual(heading, ['customer_id', 'surname', 'first_name', 'born', 'gender'])

     first_customer = next(rows)
     self.assertEqual(first_customer, ['1', 'Beck', 'Tyler', '1995-11-15', 'male'])
Beispiel #5
0
def _convert_to_rst(cid_path,
                    data_path,
                    target_rst_path,
                    target_encoding='utf-8'):
    """
    Read the delimited data in ``data_path`` using the data format of the
    CID in ``cid_path`` and write the rows as RST to ``target_rst_path``.

    :param cid_path: path of the CID to read; must not be ``None``
    :param data_path: path of the delimited data file; must not be ``None``
    :param target_rst_path: path of the file to write
    :param target_encoding: encoding used to write ``target_rst_path``
    :raises NotImplementedError: if the data format is not delimited or \
        declares 2 or more header rows
    :raises ValueError: if the data contain no columns or a column is \
        empty in every row
    """
    assert cid_path is not None
    assert data_path is not None

    _log.info('read CID from "%s"', cid_path)
    cid = cutplace.Cid(cid_path)
    data_format = cid.data_format
    if data_format.format != data.FORMAT_DELIMITED:
        raise NotImplementedError('format=%s' % data_format.format)
    if cid.data_format.header >= 2:
        raise NotImplementedError('cid.data_format.header=%s' %
                                  cid.data_format.header)
    first_row_is_heading = cid.data_format.header == 1

    _log.info('read data from "%s"', data_path)
    rows = list(rowio.delimited_rows(data_path, data_format))

    # Find out the length of each column: the longest item seen in it.
    lengths = []
    for row in rows:
        for column_index, item in enumerate(row):
            item_length = len(item)
            if column_index == len(lengths):
                # First time this column shows up (rows may differ in size).
                lengths.append(item_length)
            elif lengths[column_index] < item_length:
                lengths[column_index] = item_length
    if not lengths:
        raise ValueError('file must contain columns: "%s"' % data_path)
    for column_index, length in enumerate(lengths):
        if length == 0:
            raise ValueError(
                'column %d in file "%s" must not always be empty' %
                (column_index + 1, data_path))

    _log.info('write RST to "%s"', target_rst_path)
    with io.open(target_rst_path, mode='w',
                 encoding=target_encoding) as rst_target_file:
        _write_rst_separator_line(rst_target_file, lengths, "-")
        for row_number, row in enumerate(rows):
            _write_rst_row(rst_target_file, lengths, row)
            # Only a heading row is followed by a "=" separator line.
            is_heading_row = first_row_is_heading and row_number == 0
            line_separator = "=" if is_heading_row else "-"
            _write_rst_separator_line(rst_target_file, lengths, line_separator)
Beispiel #6
0
 def _raw_rows(self):
     """
     Return the result of the ``rowio`` reader function matching the format
     declared in the data format of ``self.cid``, applied to
     ``self._source_data_stream_or_path``.

     :raises AssertionError: if the format is none of the formats handled here
     """
     data_format = self.cid.data_format
     # Named ``format_name`` to avoid shadowing the builtin ``format()``.
     format_name = data_format.format
     if format_name == data.FORMAT_EXCEL:
         return rowio.excel_rows(self._source_data_stream_or_path, data_format.sheet)
     if format_name == data.FORMAT_DELIMITED:
         return rowio.delimited_rows(self._source_data_stream_or_path, data_format)
     if format_name == data.FORMAT_FIXED:
         return rowio.fixed_rows(
             self._source_data_stream_or_path, data_format.encoding, interface.field_names_and_lengths(self.cid),
             data_format.line_delimiter)
     if format_name == data.FORMAT_ODS:
         return rowio.ods_rows(self._source_data_stream_or_path, data_format.sheet)
     # Raise explicitly instead of ``assert False``: assert statements are
     # removed when Python runs with -O, which would silently return None.
     raise AssertionError('format=%r' % format_name)
Beispiel #7
0
 def _raw_rows(self):
     """
     Dispatch to the ``rowio`` reader function for the format declared in
     ``self.cid.data_format`` and return its result for
     ``self._source_data_stream_or_path``.

     :raises AssertionError: if the declared format is not handled here
     """
     data_format = self.cid.data_format
     # Do not call this local ``format``; that would shadow the builtin.
     declared_format = data_format.format
     if declared_format == data.FORMAT_EXCEL:
         return rowio.excel_rows(self._source_data_stream_or_path, data_format.sheet)
     elif declared_format == data.FORMAT_DELIMITED:
         return rowio.delimited_rows(self._source_data_stream_or_path, data_format)
     elif declared_format == data.FORMAT_FIXED:
         return rowio.fixed_rows(
             self._source_data_stream_or_path, data_format.encoding, interface.field_names_and_lengths(self.cid),
             data_format.line_delimiter)
     elif declared_format == data.FORMAT_ODS:
         return rowio.ods_rows(self._source_data_stream_or_path, data_format.sheet)
     else:
         # An explicit raise survives ``python -O``, whereas ``assert False``
         # would be stripped and the method would silently return None.
         raise AssertionError('format=%r' % declared_format)
Beispiel #8
0
def _convert_to_rst(cid_path, data_path, target_rst_path, target_encoding='utf-8'):
    """
    Convert the delimited data in ``data_path`` to RST written to
    ``target_rst_path``, reading the data with the data format of the CID
    stored in ``cid_path``.

    :param cid_path: path of the CID to read; must not be ``None``
    :param data_path: path of the delimited data file; must not be ``None``
    :param target_rst_path: path of the file to write
    :param target_encoding: encoding used to write ``target_rst_path``
    :raises NotImplementedError: if the data format is not delimited or declares 2 or more header rows
    :raises ValueError: if the data contain no columns or a column is empty in every row
    """
    assert cid_path is not None
    assert data_path is not None

    _log.info('read CID from "%s"', cid_path)
    cid = cutplace.Cid(cid_path)
    data_format = cid.data_format
    if data_format.format != data.FORMAT_DELIMITED:
        raise NotImplementedError('format=%s' % data_format.format)
    if cid.data_format.header >= 2:
        raise NotImplementedError('cid.data_format.header=%s' % cid.data_format.header)
    first_row_is_heading = cid.data_format.header == 1

    _log.info('read data from "%s"', data_path)
    rows = list(rowio.delimited_rows(data_path, data_format))

    # Find out the length of each column, i.e. its longest item.
    lengths = []
    for row in rows:
        for column_index, item in enumerate(row):
            item_length = len(item)
            if column_index == len(lengths):
                # Column not seen before; rows may have differing item counts.
                lengths.append(item_length)
            elif lengths[column_index] < item_length:
                lengths[column_index] = item_length
    if not lengths:
        raise ValueError('file must contain columns: "%s"' % data_path)
    for column_index, length in enumerate(lengths):
        if length == 0:
            raise ValueError('column %d in file "%s" must not always be empty' % (column_index + 1, data_path))

    _log.info('write RST to "%s"', target_rst_path)
    with io.open(target_rst_path, mode='w', encoding=target_encoding) as rst_target_file:
        _write_rst_separator_line(rst_target_file, lengths, "-")
        for row_number, row in enumerate(rows):
            _write_rst_row(rst_target_file, lengths, row)
            # A heading row is closed with "=", every other row with "-".
            is_heading_row = first_row_is_heading and row_number == 0
            line_separator = "=" if is_heading_row else "-"
            _write_rst_separator_line(rst_target_file, lengths, line_separator)
Beispiel #9
0
 def test_can_read_delimited_non_ascii(self):
     """Check that delimited rows read from a text stream keep a non ASCII character."""
     delimited_format = data.DataFormat(data.FORMAT_DELIMITED)
     delimited_format.validate()
     source_stream = io.StringIO('eggs\nsp\u00c4m')
     with source_stream:
         rows_read = list(rowio.delimited_rows(source_stream, delimited_format))
     expected_rows = [['eggs'], ['sp\u00c4m']]
     self.assertEqual(expected_rows, rows_read)