コード例 #1
0
ファイル: html.py プロジェクト: EdwardBetts/rows
def import_from_html(filename_or_fobj, encoding='utf-8', index=0,
                     ignore_colspan=True, preserve_html=False, row_tag='tr',
                     column_tag='td|th', *args, **kwargs):
    # TODO: unescape before returning: html_parser.unescape(html)
    # TODO: lxml -> unicode?

    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath('//table')
    table = tables[index]

    strip_tags(table, 'thead')
    strip_tags(table, 'tbody')
    row_elements = table.xpath(row_tag)
    if not preserve_html:
        table_rows = [[value_element.text_content().strip()
                       for value_element in row.xpath(column_tag)]
                      for row in row_elements]
    else:
        table_rows = [[_get_content(value_element)
                       for value_element in row.xpath(column_tag)]
                      for row in row_elements]

    max_columns = max(len(row) for row in table_rows)
    if ignore_colspan:
        table_rows = filter(lambda row: len(row) == max_columns, table_rows)

    meta = {'imported_from': 'html', 'filename': filename,}
    return create_table(table_rows, meta=meta, *args, **kwargs)
コード例 #2
0
ファイル: tests_utils.py プロジェクト: EdwardBetts/rows
    def test_create_table_skip_header(self):
        field_types = OrderedDict([('integer', fields.IntegerField),
                                   ('string', fields.UnicodeField),])
        data = [['1', 'Álvaro'], ['2', 'turicas'], ['3', 'Justen']]
        table_1 = create_table(data, fields=field_types, skip_header=True)
        table_2 = create_table(data, fields=field_types, skip_header=False)

        self.assertEqual(field_types, table_1.fields)
        self.assertEqual(table_1.fields, table_2.fields)
        self.assertEqual(len(table_1), 2)
        self.assertEqual(len(table_2), 3)

        first_row = {'integer': 1, 'string': 'Álvaro'}
        second_row = {'integer': 2, 'string': 'turicas'}
        third_row = {'integer': 3, 'string': 'Justen'}
        self.assertEqual(dict(table_1[0]._asdict()), second_row)
        self.assertEqual(dict(table_2[0]._asdict()), first_row)
        self.assertEqual(dict(table_1[1]._asdict()), third_row)
        self.assertEqual(dict(table_2[1]._asdict()), second_row)
        self.assertEqual(dict(table_2[2]._asdict()), third_row)
コード例 #3
0
ファイル: csv.py プロジェクト: EdwardBetts/rows
def import_from_csv(filename_or_fobj, encoding='utf-8', delimiter=',',
                    quotechar='"', *args, **kwargs):
    'Import data from a CSV file'

    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    csv_reader = unicodecsv.reader(fobj, encoding=encoding,
                                   delimiter=str(delimiter),
                                   quotechar=str(quotechar))

    meta = {'imported_from': 'csv', 'filename': filename,}
    return create_table(csv_reader, meta=meta, *args, **kwargs)
コード例 #4
0
    def test_create_table_skip_header(self):
        field_types = OrderedDict([
            ('integer', fields.IntegerField),
            ('string', fields.UnicodeField),
        ])
        data = [['1', 'Álvaro'], ['2', 'turicas'], ['3', 'Justen']]
        table_1 = create_table(data, fields=field_types, skip_header=True)
        table_2 = create_table(data, fields=field_types, skip_header=False)

        self.assertEqual(field_types, table_1.fields)
        self.assertEqual(table_1.fields, table_2.fields)
        self.assertEqual(len(table_1), 2)
        self.assertEqual(len(table_2), 3)

        first_row = {'integer': 1, 'string': 'Álvaro'}
        second_row = {'integer': 2, 'string': 'turicas'}
        third_row = {'integer': 3, 'string': 'Justen'}
        self.assertEqual(dict(table_1[0]._asdict()), second_row)
        self.assertEqual(dict(table_2[0]._asdict()), first_row)
        self.assertEqual(dict(table_1[1]._asdict()), third_row)
        self.assertEqual(dict(table_2[1]._asdict()), second_row)
        self.assertEqual(dict(table_2[2]._asdict()), third_row)
コード例 #5
0
ファイル: xls.py プロジェクト: tilacog/rows
def import_from_xls(filename_or_fobj,
                    sheet_name=None,
                    sheet_index=0,
                    start_row=0,
                    start_column=0,
                    *args,
                    **kwargs):

    filename, _ = get_filename_and_fobj(filename_or_fobj)
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Get field names
    # TODO: may use sheet.col_values or even sheet.ncols
    column_count = 0
    header = []
    column_value = cell_value(sheet, start_row, start_column + column_count)
    while column_value:
        header.append(column_value)
        column_count += 1
        column_value = cell_value(sheet, start_row,
                                  start_column + column_count)

    # Get sheet rows
    # TODO: may use sheel.col_slice or even sheet.nrows
    table_rows = []
    row_count = 0
    start_row += 1
    cell_is_empty = False
    while not cell_is_empty:
        row = [
            cell_value(sheet, start_row + row_count,
                       start_column + column_index)
            for column_index in range(column_count)
        ]
        cell_is_empty = not any(row)
        if not cell_is_empty:
            table_rows.append(row)
            row_count += 1

    meta = {
        'imported_from': 'xls',
        'filename': filename,
    }
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
コード例 #6
0
def import_from_html(filename_or_fobj,
                     encoding='utf-8',
                     index=0,
                     ignore_colspan=True,
                     preserve_html=False,
                     row_tag='tr',
                     column_tag='td|th',
                     *args,
                     **kwargs):
    # TODO: unescape before returning: html_parser.unescape(html)
    # TODO: lxml -> unicode?

    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath('//table')
    table = tables[index]

    strip_tags(table, 'thead')
    strip_tags(table, 'tbody')
    row_elements = table.xpath(row_tag)
    if not preserve_html:
        table_rows = [[
            value_element.text_content().strip()
            for value_element in row.xpath(column_tag)
        ] for row in row_elements]
    else:
        table_rows = [[
            _get_content(value_element)
            for value_element in row.xpath(column_tag)
        ] for row in row_elements]

    max_columns = max(len(row) for row in table_rows)
    if ignore_colspan:
        table_rows = filter(lambda row: len(row) == max_columns, table_rows)

    meta = {
        'imported_from': 'html',
        'filename': filename,
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
コード例 #7
0
ファイル: xls.py プロジェクト: EdwardBetts/rows
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
                    start_row=0, start_column=0, *args, **kwargs):

    filename, _ = get_filename_and_fobj(filename_or_fobj)
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Get field names
    # TODO: may use sheet.col_values or even sheet.ncols
    column_count = 0
    header = []
    column_value = cell_value(sheet, start_row, start_column + column_count)
    while column_value:
        header.append(column_value)
        column_count += 1
        column_value = cell_value(sheet, start_row,
                                  start_column + column_count)

    # Get sheet rows
    # TODO: may use sheel.col_slice or even sheet.nrows
    table_rows = []
    row_count = 0
    start_row += 1
    cell_is_empty = False
    while not cell_is_empty:
        row = [cell_value(sheet, start_row + row_count,
                          start_column + column_index)
               for column_index in range(column_count)]
        cell_is_empty = not any(row)
        if not cell_is_empty:
            table_rows.append(row)
            row_count += 1

    meta = {'imported_from': 'xls', 'filename': filename,}
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)