Example #1
0
    def test_create_table_import_fields_ordering(self):
        """`create_table` must respect `import_fields` ordering (issue #239)."""
        # Regression test for: https://github.com/turicas/rows/issues/239
        data = [
            ["intfield", "textfield", "floatfield"],
            [1, "str1", 1.2],
            [2, "str2", 2.3],
            [3, "str3", 3.4],
        ]
        # The `fields` mapping given to `create_table` must follow the same
        # order as the columns in `data`.
        fields = OrderedDict()
        fields["intfield"] = rows.fields.IntegerField
        fields["textfield"] = rows.fields.TextField
        fields["floatfield"] = rows.fields.FloatField

        # Regular case: no `import_fields`, every column kept as-is.
        table = plugins_utils.create_table(data, fields=fields, skip_header=True)
        self.assertEqual(table.fields, fields)
        for row, expected in zip(table, data[1:]):
            actual = [row.intfield, row.textfield, row.floatfield]
            self.assertEqual(expected, actual)

        # Special case: `import_fields` reorders (and filters) the columns.
        import_fields = ["textfield", "intfield"]
        table = plugins_utils.create_table(
            data, fields=fields, import_fields=import_fields, skip_header=True
        )
        self.assertEqual(list(table.fields.keys()), import_fields)
        for row, expected in zip(table, data[1:]):
            self.assertEqual(expected[1], row.textfield)
            self.assertEqual(expected[0], row.intfield)
    def test_create_table_skip_header(self):
        """`skip_header=True` drops the first data row; fields stay intact."""
        field_types = OrderedDict()
        field_types['integer'] = fields.IntegerField
        field_types['string'] = fields.TextField
        data = [['1', 'Álvaro'], ['2', 'turicas'], ['3', 'Justen']]

        table_1 = plugins_utils.create_table(
            data, fields=field_types, skip_header=True)
        table_2 = plugins_utils.create_table(
            data, fields=field_types, skip_header=False)

        # Field definitions are the same regardless of `skip_header` ...
        self.assertEqual(field_types, table_1.fields)
        self.assertEqual(table_1.fields, table_2.fields)
        # ... but the skipping table lost its first row.
        self.assertEqual(len(table_1), 2)
        self.assertEqual(len(table_2), 3)

        expected_rows = [
            {'integer': 1, 'string': 'Álvaro'},
            {'integer': 2, 'string': 'turicas'},
            {'integer': 3, 'string': 'Justen'},
        ]
        # table_1 starts at the second logical row; table_2 keeps all three.
        self.assertEqual(dict(table_1[0]._asdict()), expected_rows[1])
        self.assertEqual(dict(table_1[1]._asdict()), expected_rows[2])
        self.assertEqual(dict(table_2[0]._asdict()), expected_rows[0])
        self.assertEqual(dict(table_2[1]._asdict()), expected_rows[1])
        self.assertEqual(dict(table_2[2]._asdict()), expected_rows[2])
    def test_create_table_import_fields_dont_exist(self):
        """Unknown names in `import_fields` must raise ValueError."""
        header = ['field1', 'field2', 'field3']
        table_rows = [['1', 3.14, 'Álvaro'],
                      ['2', 2.71, 'turicas'],
                      ['3', 1.23, 'Justen']]

        error_fields = ['doesnt_exist', 'ruby']
        import_fields = header[:-1] + error_fields
        with self.assertRaises(ValueError) as exception_context:
            plugins_utils.create_table([header] + table_rows,
                                       import_fields=import_fields)

        # The message must mention the unknown field names (order may vary).
        message = exception_context.exception.args[0]
        self.assertIn(message, possible_field_names_errors(error_fields))
Example #4
0
    def test_create_table_import_fields_dont_exist(self):
        """Unknown names in `import_fields` must raise ValueError."""
        header = ['field1', 'field2', 'field3']
        table_rows = [['1', 3.14, 'Álvaro'],
                      ['2', 2.71, 'turicas'],
                      ['3', 1.23, 'Justen']]

        error_fields = ['doesnt_exist', 'ruby']
        import_fields = list(header)[:-1] + error_fields
        with self.assertRaises(ValueError) as exception_context:
            plugins_utils.create_table([header] + table_rows,
                                       import_fields=import_fields)

        # `Exception.message` was removed in Python 3; `args[0]` holds the
        # same value on both Python 2 and 3.
        self.assertIn(exception_context.exception.args[0],
                      possible_field_names_errors(error_fields))
Example #5
0
File: xlsx.py Project: wnlima/rows
def import_from_xlsx(filename_or_fobj,
                     sheet_name=None,
                     sheet_index=0,
                     start_row=0,
                     start_column=0,
                     *args,
                     **kwargs):
    """Return a rows.Table with data imported from an XLSX file/file-object.

    `sheet_name` takes precedence over `sheet_index`. `start_row` and
    `start_column` crop the top-left corner of the sheet (clamped to
    openpyxl's 1-based sheet minimums).
    """
    workbook = load_workbook(filename_or_fobj)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    # `Workbook.get_sheet_by_name` is deprecated in openpyxl; subscript
    # access is the supported replacement with identical behavior.
    sheet = workbook[sheet_name]

    # openpyxl rows/columns are 1-based; clamp the requested start position.
    start_row, end_row = max(start_row, sheet.min_row), sheet.max_row
    start_col, end_col = max(start_column, sheet.min_column), sheet.max_column
    table_rows = [[
        _cell_to_python(sheet.cell(row=row_index, column=col_index))
        for col_index in range(start_col, end_col + 1)
    ] for row_index in range(start_row, end_row + 1)]

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {
        'imported_from': 'xlsx',
        'filename': filename,
        'sheet_name': sheet_name,
    }
    return create_table(table_rows, meta=metadata, *args, **kwargs)
Example #6
0
File: xlsx.py Project: abelthf/rows
def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0,
                     start_row=0, start_column=0, *args, **kwargs):
    """Return a rows.Table with data imported from an XLSX file/file-object.

    The header row is scanned rightwards from `start_column` until the
    first empty cell; data rows are read downwards until an entirely
    empty row is found.
    """
    workbook = load_workbook(filename_or_fobj)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    # `Workbook.get_sheet_by_name` is deprecated in openpyxl; subscript
    # access is the supported replacement with identical behavior.
    sheet = workbook[sheet_name]

    # Get sheet header: walk right from `start_column` until an empty cell.
    header = []
    last_column = start_column
    header_value = _get_cell_value(sheet, start_row, last_column)
    while header_value:
        header.append(header_value)
        last_column += 1
        header_value = _get_cell_value(sheet, start_row, last_column)
    last_column -= 1  # step back to the last non-empty header column

    # Get sheet rows based on `last_column` defined in 'get sheet header';
    # stop at the first row whose cells are all empty/falsy.
    row_pos = start_row + 1
    all_rows = []
    row = _read_row(sheet, row_pos, last_column)
    while any(row):
        all_rows.append(row)
        row_pos += 1
        row = _read_row(sheet, row_pos, last_column)

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {'imported_from': 'xlsx', 'filename': filename, }
    return create_table([header] + all_rows, meta=metadata, *args, **kwargs)
Example #7
0
def extract_ibama_pdf(filename):
    """Extract all pages from a "Autuação Ambiental" PDF, generated by IBAMA

    This function will extract each page at once so we can use rows'
    `starts_after/ends_before`.
    It's using `pdf_table_lines` instead of `import_from_pdf` because it's
    faster and we can fix the table lines before importing data as a
    `rows.Table`.
    """

    final = []
    total_pages = rows.plugins.pdf.number_of_pages(filename)
    for page_number in range(1, total_pages + 1):
        print('Processing page {}...'.format(page_number))
        # Re-open the file per page so each `pdf_table_lines` call starts
        # from a fresh file position.
        with open(filename, mode='rb') as fobj:
            table_rows = rows.plugins.pdf.pdf_table_lines(
                fobj,
                page_numbers=(page_number, ),
                starts_after='DIRETORIA DE PROTEÇÃO AMBIENTAL',
                ends_before=re.compile('Pag [0-9]+/[0-9]+'),
                algorithm='rects-boundaries',
            )
            # Only the first page carries the table header row.
            final.extend(fix_rows(table_rows, header=page_number == 1))

    # Force Brazilian date/money parsing on the two locale-sensitive columns.
    table = create_table(final,
                         meta={
                             'imported_from': 'pdf',
                             'filename': filename
                         },
                         force_types={
                             'data_infracao': BRDateField,
                             'valor_multa': BRMoneyField
                         })
    return table
Example #8
0
def import_from_postgresql(
    connection_or_uri,
    table_name="table1",
    query=None,
    query_args=None,
    close_connection=False,
    *args,
    **kwargs
):
    """Return a rows.Table with data fetched from a PostgreSQL database.

    Either pass a ready-made `query` (with optional `query_args`) or a
    `table_name`, in which case a SELECT-all query is built. Set
    `close_connection=True` to close the connection after fetching.
    """
    if query is None:
        # Validate before interpolating into SQL: an unchecked name here
        # would be an injection vector.
        if not _valid_table_name(table_name):
            raise ValueError("Invalid table name: {}".format(table_name))

        query = SQL_SELECT_ALL.format(table_name=table_name)

    if query_args is None:
        query_args = tuple()

    connection = _get_connection(connection_or_uri)
    cursor = connection.cursor()
    cursor.execute(query, query_args)
    table_rows = list(cursor.fetchall())  # TODO: make it lazy
    # Column names come from the cursor description (first item per column).
    header = [six.text_type(info[0]) for info in cursor.description]
    cursor.close()
    connection.commit()  # WHY?

    meta = {"imported_from": "postgresql", "source": connection_or_uri}
    if close_connection:
        connection.close()
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
Example #9
0
def import_from_xpath(filename_or_fobj,
                      rows_xpath,
                      fields_xpath,
                      encoding='utf-8',
                      *args,
                      **kwargs):
    """Return a rows.Table from an XML/HTML document using XPath.

    `rows_xpath` selects the row elements; `fields_xpath` maps field names
    to XPath expressions evaluated relative to each row element. All XPath
    values must be text (unicode) strings.
    """
    all_xpaths = [rows_xpath] + list(fields_xpath.values())
    if any(type(xpath) is not six.text_type for xpath in all_xpaths):
        raise TypeError('XPath must be {}'.format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    tree = tree_from_string(fobj.read().decode(encoding))
    row_elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    extract_row = _get_row_data(fields_xpath)
    result_rows = [extract_row(element) for element in row_elements]

    meta = {
        'imported_from': 'xpath',
        'filename': filename,
        'encoding': encoding,
    }
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
Example #10
0
def import_from_dicts(data, samples=None, *args, **kwargs):
    """Import data from a iterable of dicts

    The algorithm will use the `samples` first `dict`s to determine the field
    names (if `samples` is `None` all `dict`s will be used).
    """
    data = iter(data)

    # Consume up to `samples` rows, collecting header names in first-seen
    # order; consumed rows are cached so they are not lost.
    cached_rows = []
    headers = []
    for index, row in enumerate(data, start=1):
        cached_rows.append(row)
        headers.extend(key for key in row.keys() if key not in headers)
        if samples and index == samples:
            break

    # Lazily normalize every row (cached + remaining) to the header order,
    # filling missing keys with None.
    data_rows = (
        [row.get(header, None) for header in headers]
        for row in chain(cached_rows, data)
    )

    kwargs["samples"] = samples
    meta = {"imported_from": "dicts"}
    return create_table(chain([headers], data_rows), meta=meta, *args, **kwargs)
Example #11
0
def import_from_sqlite(filename_or_connection,
                       table_name="table1",
                       query=None,
                       query_args=None,
                       *args,
                       **kwargs):
    """Return a rows.Table with data from SQLite database."""
    connection = _get_connection(filename_or_connection)
    cursor = connection.cursor()

    if query is None:
        # Validate before interpolating: an unchecked name would allow SQL
        # injection through `table_name`.
        if not _valid_table_name(table_name):
            raise ValueError("Invalid table name: {}".format(table_name))
        query = SQL_SELECT_ALL.format(table_name=table_name)

    query_args = query_args if query_args is not None else tuple()

    table_rows = list(cursor.execute(query, query_args))  # TODO: may be lazy
    header = [six.text_type(column[0]) for column in cursor.description]
    cursor.close()
    # TODO: should close connection also?

    meta = {"imported_from": "sqlite", "filename": filename_or_connection}
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
Example #12
0
def import_from_sqlite(
    filename_or_connection,
    table_name="table1",
    query=None,
    query_args=None,
    *args,
    **kwargs
):
    """Return a rows.Table with data from SQLite database."""
    connection = _get_connection(filename_or_connection)
    cursor = connection.cursor()

    if query is None:
        if not _valid_table_name(table_name):
            # Reject names that could smuggle SQL into the statement.
            raise ValueError("Invalid table name: {}".format(table_name))
        query = SQL_SELECT_ALL.format(table_name=table_name)
    if query_args is None:
        query_args = tuple()

    # TODO: may be lazy
    table_rows = list(cursor.execute(query, query_args))
    header = [six.text_type(description[0])
              for description in cursor.description]
    cursor.close()
    # TODO: should close connection also?

    meta = {"imported_from": "sqlite", "filename": filename_or_connection}
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
Example #13
0
 def test_create_table_empty_data(self):
     """Repeated header names are de-duplicated even with zero data rows."""
     header = ['first', 'first', 'first']
     # No data rows at all: only the header is passed in.
     table = plugins_utils.create_table([header])
     self.assertEqual(list(table.fields.keys()),
                      ['first', 'first_2', 'first_3'])
     self.assertEqual(len(table), 0)
Example #14
0
def import_from_sqlite(filename_or_connection,
                       table_name='table1',
                       query=None,
                       query_args=None,
                       *args,
                       **kwargs):
    """Return a rows.Table with data read from a SQLite database."""
    connection = _get_connection(filename_or_connection)
    cursor = connection.cursor()

    if query is None:
        # Guard against SQL injection via an arbitrary table name.
        if not _valid_table_name(table_name):
            raise ValueError('Invalid table name: {}'.format(table_name))
        query = SQL_SELECT_ALL.format(table_name=table_name)

    args_for_query = query_args if query_args is not None else tuple()

    rows_data = list(cursor.execute(query, args_for_query))  # TODO: may be lazy
    column_names = [six.text_type(col[0]) for col in cursor.description]
    cursor.close()
    # TODO: should close connection also?

    meta = {
        'imported_from': 'sqlite',
        'filename': filename_or_connection,
    }
    return create_table([column_names] + rows_data, meta=meta, *args, **kwargs)
Example #15
0
File: xls.py Project: wnlima/rows
def import_from_xls(filename_or_fobj,
                    sheet_name=None,
                    sheet_index=0,
                    start_row=0,
                    start_column=0,
                    *args,
                    **kwargs):
    """Return a rows.Table with data imported from an XLS file.

    `sheet_name` takes precedence over `sheet_index`; `start_row` and
    `start_column` crop the sheet's top-left corner.
    """
    filename, _ = get_filename_and_fobj(filename_or_fobj, mode='rb')
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Read every cell from the requested corner to the sheet's end.
    table_rows = [
        [cell_value(sheet, row_index, column_index)
         for column_index in range(start_column, sheet.ncols)]
        for row_index in range(start_row, sheet.nrows)
    ]

    meta = {
        'imported_from': 'xls',
        'filename': filename,
        'sheet_name': sheet.name,
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
Example #16
0
def import_from_dicts(data, samples=None, *args, **kwargs):
    """Import data from a iterable of dicts

    The algorithm will use the `samples` first `dict`s to determine the field
    names (if `samples` is `None` all `dict`s will be used).
    """
    data = iter(data)

    cached_rows = []
    headers = []
    seen = 0
    for row in data:
        seen += 1
        cached_rows.append(row)
        # Record header names in first-seen order.
        for key in row.keys():
            if key not in headers:
                headers.append(key)
        if samples and seen == samples:
            break

    def normalize(row):
        # Missing keys become None so every row has the same width.
        return [row.get(header, None) for header in headers]

    # Lazily normalize the cached rows plus whatever remains in the iterator.
    data_rows = (normalize(row) for row in chain(cached_rows, data))

    kwargs["samples"] = samples
    meta = {"imported_from": "dicts"}
    return create_table(chain([headers], data_rows), meta=meta, *args, **kwargs)
 def test_create_table_empty_data(self):
     """A header with repeated names and zero rows yields an empty table."""
     header = ['first', 'first', 'first']
     table_rows = []
     table = plugins_utils.create_table([header] + table_rows)
     expected_names = ['first', 'first_2', 'first_3']
     self.assertEqual(list(table.fields.keys()), expected_names)
     self.assertEqual(len(table), 0)
Example #18
0
def import_from_csv(filename_or_fobj,
                    encoding='utf-8',
                    dialect=None,
                    sample_size=8192,
                    *args,
                    **kwargs):
    '''Import data from a CSV file

    If a file-like object is provided it MUST be in binary mode, like in
    `open(filename, mode='rb')`.
    '''
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    if dialect is None:
        # Sniff the dialect from a sample, then rewind to where we started
        # so the reader sees the complete stream.
        position = fobj.tell()
        dialect = discover_dialect(fobj.read(sample_size), encoding)
        fobj.seek(position)

    reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)

    meta = {
        'imported_from': 'csv',
        'filename': filename,
        'encoding': encoding,
    }
    return create_table(reader, meta=meta, *args, **kwargs)
Example #19
0
def import_from_xpath(filename_or_fobj,
                      rows_xpath,
                      fields_xpath,
                      encoding="utf-8",
                      *args,
                      **kwargs):
    """Return a rows.Table built from an XML/HTML document via XPath.

    `rows_xpath` selects row elements; `fields_xpath` maps field names to
    XPath expressions evaluated on each row element. Every expression must
    be a unicode string.
    """
    xpath_types = set(type(xpath) for xpath in fields_xpath.values())
    xpath_types.add(type(rows_xpath))
    if xpath_types != set([six.text_type]):
        raise TypeError("XPath must be {}".format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    tree = tree_from_string(fobj.read().decode(encoding))
    row_elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    extract = _get_row_data(fields_xpath)
    result_rows = [extract(element) for element in row_elements]

    meta = {
        "imported_from": "xpath",
        "filename": filename,
        "encoding": encoding
    }
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
Example #20
0
def import_from_pdf(
    filename_or_fobj,
    page_numbers=None,
    starts_after=None,
    ends_before=None,
    backend=None,
    algorithm="y-groups",
    x_threshold=0.5,
    y_threshold=0.5,
    *args,
    **kwargs
):
    """Return a rows.Table with table data extracted from a PDF document."""
    # Fall back to the library default when no backend is supplied.
    backend = backend or default_backend()
    extracted_lines = pdf_table_lines(
        filename_or_fobj,
        page_numbers,
        starts_after=starts_after,
        ends_before=ends_before,
        algorithm=algorithm,
        x_threshold=x_threshold,
        y_threshold=y_threshold,
        backend=backend,
    )
    meta = {"imported_from": "pdf"}
    return create_table(extracted_lines, meta=meta, *args, **kwargs)
Example #21
0
def import_from_postgresql(connection_or_uri,
                           table_name="table1",
                           query=None,
                           query_args=None,
                           close_connection=False,
                           *args,
                           **kwargs):
    """Return a rows.Table with data fetched from a PostgreSQL database.

    Either supply a ready `query` (with optional `query_args`) or a
    `table_name` for a SELECT-all. `close_connection=True` closes the
    connection after the fetch.
    """
    if query is None:
        # Guard against SQL injection via `table_name` before interpolating.
        if not _valid_table_name(table_name):
            raise ValueError("Invalid table name: {}".format(table_name))
        query = SQL_SELECT_ALL.format(table_name=table_name)

    query_args = tuple() if query_args is None else query_args

    connection = _get_connection(connection_or_uri)
    cursor = connection.cursor()
    cursor.execute(query, query_args)
    table_rows = list(cursor.fetchall())  # TODO: make it lazy
    # Column names come from the cursor description (first item per column).
    header = [six.text_type(column[0]) for column in cursor.description]
    cursor.close()
    connection.commit()  # WHY?

    meta = {"imported_from": "postgresql", "source": connection_or_uri}
    if close_connection:
        connection.close()
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
    def test_create_table_import_fields(self):
        """`import_fields=None` keeps all fields; a list filters/reorders."""
        header = ['field1', 'field2', 'field3']
        table_rows = [['1', 3.14, 'Álvaro'],
                      ['2', 2.71, 'turicas'],
                      ['3', 1.23, 'Justen']]

        # Default: every column is imported, in header order.
        table = plugins_utils.create_table([header] + table_rows,
                                           import_fields=None)
        self.assertEqual(list(table.fields.keys()), header)
        first = table[0]
        self.assertEqual(first.field1, 1)
        self.assertEqual(first.field2, 3.14)
        self.assertEqual(first.field3, 'Álvaro')

        # A subset in a custom order must be honored as-is.
        import_fields = ['field3', 'field2']
        table = plugins_utils.create_table([header] + table_rows,
                                           import_fields=import_fields)
        self.assertEqual(list(table.fields.keys()), import_fields)
        expected = OrderedDict([('field3', 'Álvaro'), ('field2', 3.14)])
        self.assertEqual(table[0]._asdict(), expected)
Example #23
0
    def test_create_table_import_fields(self):
        """`import_fields=None` keeps all fields; a list filters/reorders."""
        header = ['field1', 'field2', 'field3']
        table_rows = [['1', 3.14, 'Álvaro'],
                      ['2', 2.71, 'turicas'],
                      ['3', 1.23, 'Justen']]
        table = plugins_utils.create_table([header] + table_rows,
                                           import_fields=None)
        # `dict.keys()` returns a view on Python 3, which never compares
        # equal to a list — materialize it before comparing.
        self.assertEqual(list(table.fields.keys()), header)
        self.assertEqual(table[0].field1, 1)
        self.assertEqual(table[0].field2, 3.14)
        self.assertEqual(table[0].field3, 'Álvaro')

        import_fields = ['field3', 'field2']
        table = plugins_utils.create_table([header] + table_rows,
                                           import_fields=import_fields)
        self.assertEqual(list(table.fields.keys()), import_fields)
        self.assertEqual(table[0]._asdict(),
                         OrderedDict([('field3', 'Álvaro'), ('field2', 3.14)]))
    def test_create_table_force_types(self):
        """`force_types` overrides the detected type for the given fields."""
        header = ['field1', 'field2', 'field3']
        table_rows = [['1', '3.14', 'Álvaro'],
                      ['2', '2.71', 'turicas'],
                      ['3', '1.23', 'Justen']]
        force_types = {'field2': rows.fields.DecimalField}

        table = plugins_utils.create_table([header] + table_rows,
                                           force_types=force_types)
        # Every forced field must end up with exactly the requested type.
        for field_name, field_type in force_types.items():
            self.assertEqual(table.fields[field_name], field_type)
Example #25
0
    def test_create_table_import_fields_dont_exist(self):
        """Unknown names in `import_fields` must raise ValueError."""
        header = ["field1", "field2", "field3"]
        table_rows = [
            ["1", 3.14, "Álvaro"],
            ["2", 2.71, "turicas"],
            ["3", 1.23, "Justen"],
        ]

        error_fields = ["doesnt_exist", "ruby"]
        import_fields = header[:-1] + error_fields
        with self.assertRaises(ValueError) as exception_context:
            plugins_utils.create_table(
                [header] + table_rows, import_fields=import_fields
            )

        # The error message must mention the unknown fields, in any order.
        message = exception_context.exception.args[0]
        self.assertIn(message, possible_field_names_errors(error_fields))
Example #26
0
File: txt.py Project: turicas/rows
def import_from_txt(
    filename_or_fobj, encoding="utf-8", frame_style=FRAME_SENTINEL, *args, **kwargs
):
    """Return a rows.Table created from imported TXT file."""

    # TODO: (maybe)
    # enable parsing of non-fixed-width-columns
    # with old algorithm - that would just split columns
    # at the vertical separator character for the frame.
    # (if doing so, include an optional parameter)
    # Also, this fixes an outstanding unreported issue:
    # trying to parse tables which fields values
    # included a Pipe char - "|" - would silently
    # yield bad results.

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    # Trailing newlines would otherwise produce spurious empty rows.
    raw_contents = fobj.read().decode(encoding).rstrip("\n")

    # FRAME_SENTINEL means "not specified": detect the frame style from the
    # content itself; otherwise normalize the caller-supplied value.
    if frame_style is FRAME_SENTINEL:
        frame_style = _guess_frame_style(raw_contents)
    else:
        frame_style = _parse_frame_style(frame_style)

    contents = raw_contents.splitlines()
    del raw_contents  # free the (possibly large) original string early

    if frame_style != "None":
        # Framed table: drop top/bottom border lines, then the header/body
        # separator (line index 1 after the trim).
        contents = contents[1:-1]
        del contents[1]
    else:
        # the table is possibly generated from other source.
        # check if the line we reserve as a separator is realy empty.
        if not contents[1].strip():
            del contents[1]
    # Column boundaries are derived from the first remaining (header) line.
    col_positions = _parse_col_positions(frame_style, contents[0])

    # Slice each line at the column boundaries; `start + 1` skips the
    # vertical separator character itself.
    table_rows = [
        [
            row[start + 1 : end].strip()
            for start, end in zip(col_positions, col_positions[1:])
        ]
        for row in contents
    ]
    #
    # Variable columns - old behavior:
    # table_rows = [[value.strip() for value in row.split(vertical_char)[1:-1]]
    #              for row in contents]

    meta = {
        "imported_from": "txt",
        "filename": filename,
        "encoding": encoding,
        "frame_style": frame_style,
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
Example #27
0
def import_from_dicts(data, *args, **kwargs):
    """Import data from a list of dicts."""
    # Collect every key used by any row.
    keys = set()
    for row in data:
        keys.update(row.keys())
    # Sort for a deterministic column order.
    headers = sorted(keys)

    # Normalize each row to the header order, filling gaps with None.
    table_rows = [[row.get(header, None) for header in headers]
                  for row in data]

    meta = {'imported_from': 'dicts', }
    return create_table([headers] + table_rows, meta=meta, *args, **kwargs)
Example #28
0
def import_from_txt(filename_or_fobj,
                    encoding='utf-8',
                    frame_style=FRAME_SENTINEL,
                    *args,
                    **kwargs):
    """Return a rows.Table created from imported TXT file."""

    # TODO: (maybe)
    # enable parsing of non-fixed-width-columns
    # with old algorithm - that would just split columns
    # at the vertical separator character for the frame.
    # (if doing so, include an optional parameter)
    # Also, this fixes an outstanding unreported issue:
    # trying to parse tables which fields values
    # included a Pipe char - "|" - would silently
    # yield bad results.

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    # Trailing newlines would otherwise produce spurious empty rows.
    raw_contents = fobj.read().decode(encoding).rstrip('\n')

    # FRAME_SENTINEL means "not specified": detect the frame style from the
    # content itself; otherwise normalize the caller-supplied value.
    if frame_style is FRAME_SENTINEL:
        frame_style = _guess_frame_style(raw_contents)
    else:
        frame_style = _parse_frame_style(frame_style)

    contents = raw_contents.splitlines()
    del raw_contents  # free the (possibly large) original string early

    if frame_style != 'None':
        # Framed table: drop top/bottom border lines, then the header/body
        # separator (line index 1 after the trim).
        contents = contents[1:-1]
        del contents[1]
    else:
        # the table is possibly generated from other source.
        # check if the line we reserve as a separator is realy empty.
        if not contents[1].strip():
            del contents[1]
    # Column boundaries are derived from the first remaining (header) line.
    col_positions = _parse_col_positions(frame_style, contents[0])

    # Slice each line at the column boundaries; `start + 1` skips the
    # vertical separator character itself.
    table_rows = [[
        row[start + 1:end].strip()
        for start, end in zip(col_positions, col_positions[1:])
    ] for row in contents]
    #
    # Variable columns - old behavior:
    # table_rows = [[value.strip() for value in row.split(vertical_char)[1:-1]]
    #              for row in contents]

    meta = {
        'imported_from': 'txt',
        'filename': filename,
        'encoding': encoding,
        'frame_style': frame_style
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
Example #29
0
    def test_create_table_repeated_field_names(self):
        """Repeated/empty header names must be de-duplicated predictably."""
        header = ['first', 'first', 'first']
        table_rows = [['1', 3.14, 'Álvaro'],
                      ['2', 2.71, 'turicas'],
                      ['3', 1.23, 'Justen']]
        table = plugins_utils.create_table([header] + table_rows)
        # `dict.keys()` is a view on Python 3 and never compares equal to a
        # list — materialize it before comparing.
        self.assertEqual(list(table.fields.keys()),
                         ['first', 'first_2', 'first_3'])
        self.assertEqual(table[0].first, 1)
        self.assertEqual(table[0].first_2, 3.14)
        self.assertEqual(table[0].first_3, 'Álvaro')

        header = ['field', '', 'field']
        table_rows = [['1', 3.14, 'Álvaro'],
                      ['2', 2.71, 'turicas'],
                      ['3', 1.23, 'Justen']]
        table = plugins_utils.create_table([header] + table_rows)
        self.assertEqual(list(table.fields.keys()),
                         ['field', 'field_1', 'field_2'])
        self.assertEqual(table[0].field, 1)
        self.assertEqual(table[0].field_1, 3.14)
        self.assertEqual(table[0].field_2, 'Álvaro')
Example #30
0
    def test_create_table_force_types(self):
        """Fields listed in `force_types` keep exactly the requested type."""
        header = ['field1', 'field2', 'field3']
        table_rows = [['1', '3.14', 'Álvaro'], ['2', '2.71', 'turicas'],
                      ['3', '1.23', 'Justen']]
        force_types = {'field2': rows.fields.DecimalField}

        table = plugins_utils.create_table([header] + table_rows,
                                           force_types=force_types)
        for field_name, forced_type in force_types.items():
            self.assertEqual(table.fields[field_name], forced_type)
    def test_create_table_repeated_field_names(self):
        """Repeated or empty header names are renamed to unique fields."""
        table_rows = [['1', 3.14, 'Álvaro'], ['2', 2.71, 'turicas'],
                      ['3', 1.23, 'Justen']]

        # All-identical names get numeric suffixes starting at _2.
        table = plugins_utils.create_table([['first'] * 3] + table_rows)
        self.assertEqual(list(table.fields.keys()),
                         ['first', 'first_2', 'first_3'])
        self.assertEqual(table[0].first, 1)
        self.assertEqual(table[0].first_2, 3.14)
        self.assertEqual(table[0].first_3, 'Álvaro')

        # An empty name is filled in and the repeated one is suffixed.
        table = plugins_utils.create_table([['field', '', 'field']] + table_rows)
        self.assertEqual(list(table.fields.keys()),
                         ['field', 'field_1', 'field_2'])
        self.assertEqual(table[0].field, 1)
        self.assertEqual(table[0].field_1, 3.14)
        self.assertEqual(table[0].field_2, 'Álvaro')
Example #32
0
def import_from_parquet(filename, encoding='utf-8', *args, **kwargs):
    """Import data from a Parquet file."""
    # TODO: should be able to used fobj also

    data, field_names = parquet.dump(filename, OPTIONS, _callback)
    # `data` maps field name -> column values; pivot the columns into rows.
    length = len(data[field_names[0]])
    table_rows = [
        [data[field_name][index] for field_name in field_names]
        for index in range(length)
    ]

    meta = {'imported_from': 'parquet', 'filename': filename,}
    return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)
Example #33
0
    def test_create_table_import_fields(self):
        """`import_fields=None` keeps everything; a list filters/reorders."""
        header = ["field1", "field2", "field3"]
        table_rows = [
            ["1", 3.14, "Álvaro"],
            ["2", 2.71, "turicas"],
            ["3", 1.23, "Justen"],
        ]

        # Default behavior: all columns, original order.
        table = plugins_utils.create_table([header] + table_rows, import_fields=None)
        self.assertEqual(list(table.fields.keys()), header)
        row = table[0]
        self.assertEqual(row.field1, 1)
        self.assertEqual(row.field2, 3.14)
        self.assertEqual(row.field3, "Álvaro")

        # A custom subset/order must be preserved exactly.
        import_fields = ["field3", "field2"]
        table = plugins_utils.create_table(
            [header] + table_rows, import_fields=import_fields
        )
        self.assertEqual(list(table.fields.keys()), import_fields)
        expected = OrderedDict([("field3", "Álvaro"), ("field2", 3.14)])
        self.assertEqual(table[0]._asdict(), expected)
Example #34
0
def import_from_dicts(data, *args, **kwargs):
    """Import data from a list of dicts."""
    # Union of all keys across rows, sorted for a deterministic header.
    headers = sorted({key for row in data for key in row.keys()})

    # Normalize each row to header order; missing keys become None.
    data = [[row.get(header, None) for header in headers] for row in data]

    meta = {'imported_from': 'dicts', }
    return create_table([headers] + data, meta=meta, *args, **kwargs)
Example #35
0
    def test_create_table_skip_header(self):
        """`skip_header=True` drops the first row; fields are unaffected."""
        field_types = OrderedDict([('integer', fields.IntegerField),
                                   ('string', fields.TextField)])
        data = [['1', 'Álvaro'], ['2', 'turicas'], ['3', 'Justen']]

        with_skip = plugins_utils.create_table(data, fields=field_types,
                                               skip_header=True)
        without_skip = plugins_utils.create_table(data, fields=field_types,
                                                  skip_header=False)

        self.assertEqual(field_types, with_skip.fields)
        self.assertEqual(with_skip.fields, without_skip.fields)
        self.assertEqual(len(with_skip), 2)
        self.assertEqual(len(without_skip), 3)

        rows_as_dicts = [
            {'integer': 1, 'string': 'Álvaro'},
            {'integer': 2, 'string': 'turicas'},
            {'integer': 3, 'string': 'Justen'},
        ]
        # The skipping table starts from the second logical row.
        self.assertEqual(dict(with_skip[0]._asdict()), rows_as_dicts[1])
        self.assertEqual(dict(with_skip[1]._asdict()), rows_as_dicts[2])
        self.assertEqual(dict(without_skip[0]._asdict()), rows_as_dicts[0])
        self.assertEqual(dict(without_skip[1]._asdict()), rows_as_dicts[1])
        self.assertEqual(dict(without_skip[2]._asdict()), rows_as_dicts[2])
Example #36
0
def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    '''Import data from a JSON file (an array of objects).

    Field names come from the keys of the first object; every object is
    expected to contain all of those keys.
    '''

    kwargs['encoding'] = encoding
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # NOTE(review): `json.load`'s `encoding` parameter was removed in
    # Python 3.9 — this call will raise TypeError there; confirm the
    # supported Python versions before dropping it.
    json_obj = json.load(fobj, encoding=encoding)
    # Materialize the keys: on Python 3 `.keys()` is a view, and the
    # header row passed to `create_table` must be a concrete sequence
    # (the sibling implementation of this function already does this).
    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]

    data = [field_names] + table_rows
    meta = {'imported_from': 'json', 'filename': filename, }
    return create_table(data, meta=meta, *args, **kwargs)
Example #37
0
def transpose(table, fields_column, *args, **kwargs):
    '''Return a new table with the rows and columns of `table` swapped.

    The values found in `fields_column` become the field names of the
    resulting table.
    '''
    new_field_names = []
    transposed = [dict() for _ in range(len(table.fields) - 1)]
    for original_row in table:
        row_dict = original_row._asdict()
        name = row_dict.pop(fields_column)
        new_field_names.append(name)
        for position, cell in enumerate(row_dict.values()):
            transposed[position][name] = cell

    rows_data = [
        [entry[name] for name in new_field_names] for entry in transposed
    ]
    return create_table([new_field_names] + rows_data, *args, **kwargs)
Example #38
0
def import_from_sqlite(filename_or_connection, table_name='rows', query=None,
                       *args, **kwargs):
    '''Import a SQLite table (or the result of `query`) into a `rows.Table`.'''
    connection = _get_connection(filename_or_connection)
    cursor = connection.cursor()
    if query is None:
        sql = SQL_SELECT_ALL.format(table_name=table_name)
    else:
        sql = query

    cursor.execute(sql)
    # Column names come from the cursor's result description.
    header = [column_info[0] for column_info in cursor.description]
    table_rows = list(cursor)  # TODO: may not put everything in memory
    cursor.close()

    meta = {'imported_from': 'sqlite', 'filename': filename_or_connection, }
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
Example #39
0
def transpose(table, fields_column, *args, **kwargs):
    '''Swap the rows and columns of `table`.

    Values taken from `fields_column` turn into the new header; every
    remaining column becomes a row of the resulting table.
    '''
    header = []
    buckets = [{} for _ in range(len(table.fields) - 1)]
    for source_row in table:
        as_dict = source_row._asdict()
        column_name = as_dict[fields_column]
        header.append(column_name)
        del as_dict[fields_column]
        for position, value in enumerate(as_dict.values()):
            buckets[position][column_name] = value

    data = [[bucket[name] for name in header] for bucket in buckets]
    return create_table([header] + data, *args, **kwargs)
Example #40
0
def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath,
                      encoding='utf-8', *args, **kwargs):
    '''Import data from an XML/HTML document using XPath expressions.

    `rows_xpath` selects the row elements; `fields_xpath` maps field names
    to the XPath used to extract each value inside a row element.
    '''
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    document = fobj.read().decode(encoding)
    tree = tree_from_string(document)
    row_elements = tree.xpath(rows_xpath)

    header = fields_xpath.keys()
    result_rows = [_get_row_data(element, fields_xpath)
                   for element in row_elements]

    meta = {'imported_from': 'xpath', 'filename': filename,}
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
Example #41
0
    def test_create_table_skip_header(self):
        """With `skip_header=True` the first data row is used as the header."""
        field_types = OrderedDict(
            [("integer", fields.IntegerField), ("string", fields.TextField)]
        )
        data = [["1", "Álvaro"], ["2", "turicas"], ["3", "Justen"]]
        skipped = plugins_utils.create_table(data, fields=field_types, skip_header=True)
        unskipped = plugins_utils.create_table(
            data, fields=field_types, skip_header=False
        )

        self.assertEqual(field_types, skipped.fields)
        self.assertEqual(skipped.fields, unskipped.fields)
        self.assertEqual(len(skipped), 2)
        self.assertEqual(len(unskipped), 3)

        expected_rows = [
            {"integer": 1, "string": "Álvaro"},
            {"integer": 2, "string": "turicas"},
            {"integer": 3, "string": "Justen"},
        ]
        # Skipping the header shifts every row of the first table up by one.
        self.assertEqual(dict(skipped[0]._asdict()), expected_rows[1])
        self.assertEqual(dict(unskipped[0]._asdict()), expected_rows[0])
        self.assertEqual(dict(skipped[1]._asdict()), expected_rows[2])
        self.assertEqual(dict(unskipped[1]._asdict()), expected_rows[1])
        self.assertEqual(dict(unskipped[2]._asdict()), expected_rows[2])
Example #42
0
def import_from_json(filename_or_fobj, encoding="utf-8", *args, **kwargs):
    """Import a JSON file or file-like object into a `rows.Table`.

    If a file-like object is provided it MUST be open in text (non-binary) mode
    on Python 3 and could be open in both binary or text mode on Python 2.

    Field names come from the keys of the first object in the JSON array.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # `json.load`'s `encoding` parameter was removed in Python 3.9, so
    # passing it raises TypeError there. For the text-mode file objects
    # this function's docstring requires on Python 3, the parameter had
    # no effect anyway, so dropping it preserves behavior.
    json_obj = json.load(fobj)
    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]

    meta = {"imported_from": "json", "filename": filename, "encoding": encoding}
    return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)
Example #43
0
    def test_create_table_force_types(self):
        """`force_types` must override the detected type for the given fields."""
        header = ["field1", "field2", "field3"]
        table_rows = [
            ["1", "3.14", "Álvaro"],
            ["2", "2.71", "turicas"],
            ["3", "1.23", "Justen"],
        ]
        force_types = {"field2": rows.fields.DecimalField}

        table = plugins_utils.create_table(
            [header] + table_rows, force_types=force_types
        )
        for forced_name, forced_type in force_types.items():
            self.assertEqual(table.fields[forced_name], forced_type)
Example #44
0
File: txt.py Project: abelthf/rows
def import_from_txt(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    '''Import data from a text table in the rows `export_to_txt` layout.'''
    # TODO: should be able to change DASH, PLUS and PIPE
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    lines = fobj.read().decode(encoding).strip().splitlines()

    # Drop the top and bottom '+----+----+' border lines...
    lines = lines[1:-1]
    # ...and the separator line right below the header row.
    del lines[1]

    table_rows = []
    for line in lines:
        cells = line.split(PIPE)[1:-1]
        table_rows.append([cell.strip() for cell in cells])

    meta = {'imported_from': 'txt', 'filename': filename,}
    return create_table(table_rows, meta=meta, *args, **kwargs)
Example #45
0
def import_from_html(
    filename_or_fobj,
    encoding="utf-8",
    index=0,
    ignore_colspan=True,
    preserve_html=False,
    properties=False,
    table_tag="table",
    row_tag="tr",
    column_tag="td|th",
    *args,
    **kwargs
):
    """Return rows.Table from HTML file.

    `index` selects which `table_tag` element of the document to import.
    `preserve_html` keeps cells as HTML instead of extracted text (rows
    only — the header is always plain text). `properties` makes `_get_row`
    append an extra column, named "properties" in the header row.
    `ignore_colspan` drops rows whose cell count differs from the widest
    row (e.g. rows merged via colspan).
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath("//{}".format(table_tag))
    table = tables[index]

    # Remove thead/tbody wrappers so the `row_tag` XPath matches all rows
    # directly under the table element.
    strip_tags(table, "thead")
    strip_tags(table, "tbody")
    row_elements = table.xpath(row_tag)

    table_rows = [
        _get_row(
            row,
            column_tag=column_tag,
            preserve_html=preserve_html,
            properties=properties,
        )
        for row in row_elements
    ]

    if properties:
        # Name the extra attributes column added by `_get_row`.
        table_rows[0][-1] = "properties"

    if preserve_html and kwargs.get("fields", None) is None:
        # The field names will be the first table row, so we need to strip HTML
        # from it even if `preserve_html` is `True` (it's `True` only for rows,
        # not for the header).
        table_rows[0] = list(map(_extract_node_text, row_elements[0]))

    if ignore_colspan:
        # Keep only rows that have the maximum number of columns.
        max_columns = max(map(len, table_rows))
        table_rows = [row for row in table_rows if len(row) == max_columns]

    meta = {"imported_from": "html", "filename": filename, "encoding": encoding}
    return create_table(table_rows, meta=meta, *args, **kwargs)
Example #46
0
File: xls.py Project: abelthf/rows
def import_from_xls(filename_or_fobj,
                    sheet_name=None,
                    sheet_index=0,
                    start_row=0,
                    start_column=0,
                    *args,
                    **kwargs):
    """Import data from an XLS sheet starting at (start_row, start_column).

    Header cells are read to the right until the first falsy cell; data
    rows are read downwards until the first row whose cells are all falsy.
    """
    filename, _ = get_filename_and_fobj(filename_or_fobj)
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Get field names: walk right from `start_column` until a falsy cell.
    # TODO: may use sheet.col_values or even sheet.ncols
    column_count = 0
    header = []
    column_value = cell_value(sheet, start_row, start_column + column_count)
    while column_value:
        header.append(column_value)
        column_count += 1
        column_value = cell_value(sheet, start_row,
                                  start_column + column_count)

    # Get sheet rows: walk down until an all-falsy row is found.
    # NOTE(review): assumes `cell_value` returns a falsy value (not an
    # exception) when reading past the sheet bounds — confirm.
    # TODO: may use sheel.col_slice or even sheet.nrows
    table_rows = []
    row_count = 0
    start_row += 1
    cell_is_empty = False
    while not cell_is_empty:
        row = [
            cell_value(sheet, start_row + row_count,
                       start_column + column_index)
            for column_index in range(column_count)
        ]
        cell_is_empty = not any(row)
        if not cell_is_empty:
            table_rows.append(row)
            row_count += 1

    meta = {
        'imported_from': 'xls',
        'filename': filename,
    }
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
Example #47
0
    def test_create_table_repeated_field_names(self):
        """Duplicate or empty header names must be de-duplicated automatically."""
        header = ["first", "first", "first"]
        table_rows = [
            ["1", 3.14, "Álvaro"],
            ["2", 2.71, "turicas"],
            ["3", 1.23, "Justen"],
        ]
        table = plugins_utils.create_table([header] + table_rows)
        self.assertEqual(list(table.fields.keys()), ["first", "first_2", "first_3"])
        first_result = table[0]
        self.assertEqual(first_result.first, 1)
        self.assertEqual(first_result.first_2, 3.14)
        self.assertEqual(first_result.first_3, "Álvaro")

        # An empty header name gets an index-based suffix as well.
        header = ["field", "", "field"]
        table_rows = [
            ["1", 3.14, "Álvaro"],
            ["2", 2.71, "turicas"],
            ["3", 1.23, "Justen"],
        ]
        table = plugins_utils.create_table([header] + table_rows)
        self.assertEqual(list(table.fields.keys()), ["field", "field_1", "field_2"])
        first_result = table[0]
        self.assertEqual(first_result.field, 1)
        self.assertEqual(first_result.field_1, 3.14)
        self.assertEqual(first_result.field_2, "Álvaro")
Example #48
0
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    '''Import data from a Parquet file into a `rows.Table`.'''
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    # Map each typed column of the Parquet schema to a rows field type.
    # TODO: should look into `schema.converted_type` also
    types = OrderedDict()
    for schema in parquet._read_footer(fobj).schema:
        if schema.type is not None:
            types[schema.name] = PARQUET_TO_ROWS[schema.type]
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {'imported_from': 'parquet', 'filename': filename,}
    return create_table([header] + table_rows, meta=meta, force_types=types,
                        *args, **kwargs)
Example #49
0
File: csv.py Project: abelthf/rows
def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None, *args,
                    **kwargs):
    '''Import data from a CSV file, sniffing the dialect when not given.'''
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    if dialect is None:
        # Sniff the dialect from the first line only, then rewind the file.
        first_line = fobj.readline().decode(encoding)
        dialect = unicodecsv.Sniffer().sniff(first_line)
        fobj.seek(0)

    kwargs['encoding'] = encoding
    reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)

    meta = {'imported_from': 'csv', 'filename': filename,}
    return create_table(reader, meta=meta, *args, **kwargs)
Example #50
0
File: xls.py Project: turicas/rows
def import_from_xls(
    filename_or_fobj,
    sheet_name=None,
    sheet_index=0,
    start_row=None,
    start_column=None,
    end_row=None,
    end_column=None,
    *args,
    **kwargs
):
    """Return a rows.Table created from imported XLS file.

    Row/column limits default to the detected table start and the sheet's
    last row/column; all indexes are 0-based, matching xlrd.
    """
    filename, _ = get_filename_and_fobj(filename_or_fobj, mode="rb")
    book = xlrd.open_workbook(filename, formatting_info=True)
    sheet = (
        book.sheet_by_name(sheet_name)
        if sheet_name is not None
        else book.sheet_by_index(sheet_index)
    )
    # TODO: may re-use Excel data types

    # xlrd reads rows and columns starting from 0 and ending on
    # sheet.nrows/ncols - 1; rows uses the same 0-based convention, so the
    # limits need no translation.
    min_row, min_column = get_table_start(sheet)
    max_row, max_column = sheet.nrows - 1, sheet.ncols - 1
    # TODO: consider adding a parameter `ignore_padding=True` and when it's
    # True, consider `start_row` starting from `min_row` and `start_column`
    # starting from `min_col`.
    if start_row is None:
        start_row = min_row
    if end_row is None:
        end_row = max_row
    if start_column is None:
        start_column = min_column
    if end_column is None:
        end_column = max_column

    table_rows = []
    for row_index in range(start_row, end_row + 1):
        table_rows.append(
            [
                cell_value(sheet, row_index, column_index)
                for column_index in range(start_column, end_column + 1)
            ]
        )

    meta = {"imported_from": "xls", "filename": filename, "sheet_name": sheet.name}
    return create_table(table_rows, meta=meta, *args, **kwargs)
Example #51
0
def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    '''Import a JSON file or file-like object into a `rows.Table`

    If a file-like object is provided it MUST be open in text (non-binary) mode
    on Python 3 and could be open in both binary or text mode on Python 2.

    Field names come from the keys of the first object of the JSON array.
    '''

    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    json_obj = json.load(fobj, encoding=encoding)
    keys = list(json_obj[0].keys())
    table_rows = [[record[key] for key in keys] for record in json_obj]

    meta = {'imported_from': 'json',
            'filename': filename,
            'encoding': encoding,}
    return create_table([keys] + table_rows, meta=meta, *args, **kwargs)