Esempio n. 1
0
def import_from_xpath(filename_or_fobj,
                      rows_xpath,
                      fields_xpath,
                      encoding="utf-8",
                      *args,
                      **kwargs):
    """Build a table by applying XPath expressions to an XML document.

    `rows_xpath` selects the row elements; `fields_xpath` maps field names to
    XPaths evaluated against each row.  Every XPath must be a text string.
    """
    xpath_types = {type(xpath) for xpath in fields_xpath.values()}
    xpath_types.add(type(rows_xpath))
    if xpath_types != {six.text_type}:
        raise TypeError("XPath must be {}".format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    tree = tree_from_string(fobj.read().decode(encoding))
    elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    extract = _get_row_data(fields_xpath)
    result_rows = [extract(element) for element in elements]

    meta = {
        "imported_from": "xpath",
        "filename": filename,
        "encoding": encoding
    }
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
Esempio n. 2
0
def export_to_csv(table,
                  filename_or_fobj=None,
                  encoding='utf-8',
                  dialect=unicodecsv.excel,
                  *args,
                  **kwargs):
    '''Write a `rows.Table` as CSV.

    A provided file-like object MUST be opened in binary mode, as in
    `open(filename, mode='wb')`.  When no filename/fobj is given, the CSV
    contents are returned as a string instead.
    '''
    # TODO: will work only if table.fields is OrderedDict
    # TODO: should use fobj? What about creating a method like json.dumps?

    return_contents = filename_or_fobj is None
    if return_contents:
        fobj = BytesIO()
    else:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')

    writer = unicodecsv.writer(fobj, encoding=encoding, dialect=dialect)
    for serialized_row in serialize(table, *args, **kwargs):
        writer.writerow(serialized_row)

    if return_contents:
        fobj.seek(0)
        contents = fobj.read()
        fobj.close()
        return contents

    fobj.flush()
    return fobj
Esempio n. 3
0
def export_to_xls(table, filename_or_fobj=None, sheet_name='Sheet1', *args,
                  **kwargs):
    """Save `table` as an XLS workbook to a file/fobj, or return the bytes."""
    book = xlwt.Workbook()
    sheet = book.add_sheet(sheet_name)

    serialized = prepare_to_export(table, *args, **kwargs)

    # First yielded item is the header row.
    header = next(serialized)
    for column, name in enumerate(header):
        sheet.write(0, column, name)

    # Values are converted according to their field types before writing.
    convert_row = _python_to_xls([table.fields.get(name) for name in header])
    for row_number, row in enumerate(serialized, start=1):
        for column, (value, style_data) in enumerate(convert_row(row)):
            sheet.write(row_number, column, value, **style_data)

    if filename_or_fobj is None:
        buffer_ = BytesIO()
        book.save(buffer_)
        buffer_.seek(0)
        contents = buffer_.read()
        buffer_.close()
        return contents
    _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
    book.save(fobj)
    fobj.flush()
    return fobj
Esempio n. 4
0
def export_to_csv(table, filename_or_fobj=None, encoding='utf-8',
                  dialect=unicodecsv.excel, *args, **kwargs):
    '''Serialize a `rows.Table` to CSV.

    A provided file-like object MUST be opened in binary mode, as in
    `open(filename, mode='wb')`; without a filename/fobj the CSV contents
    are returned as a string.
    '''
    # TODO: will work only if table.fields is OrderedDict
    # TODO: should use fobj? What about creating a method like json.dumps?

    in_memory = filename_or_fobj is None
    fobj = BytesIO() if in_memory else \
        get_filename_and_fobj(filename_or_fobj, mode='wb')[1]

    writer = unicodecsv.writer(fobj, encoding=encoding, dialect=dialect)
    for output_row in serialize(table, *args, **kwargs):
        writer.writerow(output_row)

    if in_memory:
        fobj.seek(0)
        contents = fobj.read()
        fobj.close()
        return contents
    fobj.flush()
    return fobj
Esempio n. 5
0
File: xls.py Progetto: abelthf/rows
def export_to_xls(table, filename_or_fobj=None, sheet_name='Sheet1', *args,
                  **kwargs):
    """Export a `rows.Table` to a XLS workbook.

    If `filename_or_fobj` is given, the workbook is saved to it (a file-like
    object MUST be in binary mode) and the fobj is returned; otherwise the
    XLS contents are returned as bytes.
    """
    work_book = xlwt.Workbook()
    sheet = work_book.add_sheet(sheet_name)

    prepared_table = prepare_to_export(table, *args, **kwargs)

    # Fix: `.next()` exists only on Python 2 iterators; the builtin `next()`
    # works on both Python 2 and 3 (and matches the other exporters here).
    field_names = next(prepared_table)
    for column_index, field_name in enumerate(field_names):
        sheet.write(0, column_index, field_name)

    for row_index, row in enumerate(prepared_table, start=1):
        for column_index, (field_name, value) in \
                enumerate(zip(field_names, row)):
            field_type = table.fields[field_name]
            data = {}
            # Apply a predefined cell style for types that have one.
            if field_type in FORMATTING_STYLES:
                data['style'] = FORMATTING_STYLES[field_type]
            sheet.write(row_index, column_index, value, **data)

    if filename_or_fobj is not None:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
        work_book.save(fobj)
        fobj.flush()
        return fobj
    else:
        fobj = BytesIO()
        work_book.save(fobj)
        fobj.seek(0)
        result = fobj.read()
        fobj.close()
        return result
Esempio n. 6
0
def export_to_xls(table,
                  filename_or_fobj=None,
                  sheet_name='Sheet1',
                  *args,
                  **kwargs):
    """Export the rows.Table to XLS file and return the saved file."""
    book = xlwt.Workbook()
    sheet = book.add_sheet(sheet_name)

    serialized = prepare_to_export(table, *args, **kwargs)

    # Header goes on sheet row 0.
    header = next(serialized)
    for column, name in enumerate(header):
        sheet.write(0, column, name)

    # Convert each value according to its field type before writing.
    converter = _python_to_xls([table.fields.get(name) for name in header])
    row_number = 1
    for row in serialized:
        for column, (value, extra) in enumerate(converter(row)):
            sheet.write(row_number, column, value, **extra)
        row_number += 1

    if filename_or_fobj is None:
        buffer_ = BytesIO()
        book.save(buffer_)
        buffer_.seek(0)
        contents = buffer_.read()
        buffer_.close()
        return contents
    _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
    book.save(fobj)
    fobj.flush()
    return fobj
Esempio n. 7
0
def import_from_csv(filename_or_fobj,
                    encoding='utf-8',
                    dialect=None,
                    sample_size=8192,
                    *args,
                    **kwargs):
    '''Create a table from a CSV file.

    A provided file-like object MUST be opened in binary mode, as in
    `open(filename, mode='rb')`.
    '''
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    if dialect is None:
        # Sniff the dialect from a sample, then rewind to where we started.
        position = fobj.tell()
        dialect = discover_dialect(fobj.read(sample_size), encoding)
        fobj.seek(position)

    reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)
    meta = {
        'imported_from': 'csv',
        'filename': filename,
        'encoding': encoding,
    }
    return create_table(reader, meta=meta, *args, **kwargs)
Esempio n. 8
0
File: xlsx.py Progetto: abelthf/rows
def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0,
                     start_row=0, start_column=0, *args, **kwargs):
    """Create a table from an XLSX sheet (one header row plus data rows)."""
    workbook = load_workbook(filename_or_fobj)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    sheet = workbook.get_sheet_by_name(sheet_name)

    # Walk right from `start_column` until an empty cell ends the header.
    header = []
    column = start_column
    while True:
        value = _get_cell_value(sheet, start_row, column)
        if not value:
            break
        header.append(value)
        column += 1
    last_column = column - 1

    # Collect rows below the header until a fully empty row shows up.
    all_rows = []
    current_row = start_row + 1
    while True:
        row = _read_row(sheet, current_row, last_column)
        if not any(row):
            break
        all_rows.append(row)
        current_row += 1

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {'imported_from': 'xlsx', 'filename': filename, }
    return create_table([header] + all_rows, meta=metadata, *args, **kwargs)
Esempio n. 9
0
def import_from_xpath(filename_or_fobj,
                      rows_xpath,
                      fields_xpath,
                      encoding='utf-8',
                      *args,
                      **kwargs):
    """Import data from an XML file using XPath expressions.

    `rows_xpath` selects the row elements and `fields_xpath` maps field names
    to XPaths evaluated against each row; every XPath must be a text string
    (`six.text_type`), otherwise a TypeError is raised.
    """
    # Build the set of XPath types with set literals/`update` instead of
    # `set([...])` with a redundant backslash continuation (the original
    # expression was already inside parentheses).
    types = {type(rows_xpath)}
    types.update(type(xpath) for xpath in fields_xpath.values())
    if types != {six.text_type}:
        raise TypeError('XPath must be {}'.format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    xml = fobj.read().decode(encoding)
    tree = tree_from_string(xml)
    row_elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    row_data = _get_row_data(fields_xpath)
    result_rows = list(map(row_data, row_elements))

    meta = {
        'imported_from': 'xpath',
        'filename': filename,
        'encoding': encoding,
    }
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
Esempio n. 10
0
File: xlsx.py Progetto: wnlima/rows
def import_from_xlsx(filename_or_fobj,
                     sheet_name=None,
                     sheet_index=0,
                     start_row=0,
                     start_column=0,
                     *args,
                     **kwargs):
    """Create a table from every used cell of an XLSX sheet."""
    workbook = load_workbook(filename_or_fobj)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    sheet = workbook.get_sheet_by_name(sheet_name)

    # Clamp the requested start to the sheet's used area.
    first_row = max(start_row, sheet.min_row)
    first_col = max(start_column, sheet.min_column)
    last_row, last_col = sheet.max_row, sheet.max_column

    table_rows = []
    for row_number in range(first_row, last_row + 1):
        table_rows.append([
            _cell_to_python(sheet.cell(row=row_number, column=col_number))
            for col_number in range(first_col, last_col + 1)
        ])

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {
        'imported_from': 'xlsx',
        'filename': filename,
        'sheet_name': sheet_name,
    }
    return create_table(table_rows, meta=metadata, *args, **kwargs)
Esempio n. 11
0
File: xlsx.py Progetto: abelthf/rows
def export_to_xlsx(table, filename_or_fobj=None, sheet_name='Sheet1', *args,
                   **kwargs):
    """Export a `rows.Table` to a XLSX workbook.

    If `filename_or_fobj` is given, the workbook is saved to it (a file-like
    object MUST be in binary mode) and the fobj is returned; otherwise the
    XLSX contents are returned as bytes.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    field_names = list(enumerate(table.fields))
    prepared_table = prepare_to_export(table, *args, **kwargs)

    # Write header
    # Fix: `.next()` exists only on Python 2 iterators; the builtin `next()`
    # works on both Python 2 and 3.
    header = next(prepared_table)
    for col_index, field_name in enumerate(header):
        _write_cell(sheet, 0, col_index, field_name, fields.TextField)

    # Write sheet rows
    table_fields = table.fields
    for row_index, row in enumerate(prepared_table, start=1):
        for col_index, field_name in field_names:
            _write_cell(sheet, row_index, col_index,
                        value=row[col_index],
                        field_type=table_fields[field_name])

    if filename_or_fobj is not None:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
        workbook.save(fobj)
        fobj.flush()
        return fobj
    else:
        fobj = BytesIO()
        workbook.save(fobj)
        fobj.seek(0)
        result = fobj.read()
        fobj.close()
        return result
Esempio n. 12
0
def count_tables(filename_or_fobj, encoding="utf-8", table_tag="table"):
    """Read a file passed by arg and return your table HTML tag count."""
    _, fobj = get_filename_and_fobj(filename_or_fobj)
    document = document_fromstring(fobj.read().decode(encoding))
    return len(document.xpath("//{}".format(table_tag)))
Esempio n. 13
0
File: xlsx.py Progetto: turicas/rows
def export_to_xlsx(table, filename_or_fobj=None, sheet_name="Sheet1", *args, **kwargs):
    """Save the rows.Table as an XLSX workbook; return the file or the bytes."""
    book = Workbook()
    sheet = book.active
    sheet.title = sheet_name
    serialized = prepare_to_export(table, *args, **kwargs)

    # Header goes on spreadsheet row 1 (openpyxl is 1-based).
    header = next(serialized)
    for column, name in enumerate(header):
        sheet.cell(row=1, column=column + 1).value = name

    # Data rows, converted cell by cell according to each field type.
    convert_row = _python_to_cell(list(map(table.fields.get, header)))
    for row_number, row in enumerate(serialized, start=1):
        for column, (value, number_format) in enumerate(convert_row(row)):
            cell = sheet.cell(row=row_number + 1, column=column + 1)
            cell.value = value
            if number_format is not None:
                cell.number_format = number_format

    if filename_or_fobj is None:
        buffer_ = BytesIO()
        book.save(buffer_)
        buffer_.seek(0)
        contents = buffer_.read()
        buffer_.close()
        return contents
    _, fobj = get_filename_and_fobj(filename_or_fobj, mode="wb")
    book.save(fobj)
    fobj.flush()
    return fobj
Esempio n. 14
0
 def document(self):
     """Build a pdfminer document object from this instance's file or fobj."""
     _, fobj = get_filename_and_fobj(self.filename_or_fobj, mode="rb")
     pdf_parser = PDFParser(fobj)
     pdf_document = PDFDocument(pdf_parser)
     pdf_parser.set_document(pdf_document)
     return pdf_document
Esempio n. 15
0
File: xls.py Progetto: wnlima/rows
def import_from_xls(filename_or_fobj,
                    sheet_name=None,
                    sheet_index=0,
                    start_row=0,
                    start_column=0,
                    *args,
                    **kwargs):
    """Create a table from an XLS sheet selected by name or index."""
    filename, _ = get_filename_and_fobj(filename_or_fobj, mode='rb')
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is None:
        sheet = book.sheet_by_index(sheet_index)
    else:
        sheet = book.sheet_by_name(sheet_name)
    # TODO: may re-use Excel data types

    # Header and data rows, from the requested start to the sheet's end.
    table_rows = []
    for row_index in range(start_row, sheet.nrows):
        table_rows.append([
            cell_value(sheet, row_index, column_index)
            for column_index in range(start_column, sheet.ncols)
        ])

    meta = {
        'imported_from': 'xls',
        'filename': filename,
        'sheet_name': sheet.name,
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
Esempio n. 16
0
 def document(self):
     """Open the PDF with pymupdf, from a path or from in-memory bytes."""
     filename, fobj = get_filename_and_fobj(self.filename_or_fobj, mode="rb")
     if filename:
         return pymupdf.open(filename=filename, filetype="pdf")
     # TODO: may use a lot of memory
     raw = fobj.read()
     return pymupdf.open(stream=raw, filetype="pdf")
Esempio n. 17
0
 def document(self):
     """Return a pymupdf document for this instance's filename or fobj."""
     filename, fobj = get_filename_and_fobj(self.filename_or_fobj,
                                            mode="rb")
     if filename:
         doc = pymupdf.open(filename=filename, filetype="pdf")
     else:
         contents = fobj.read()  # TODO: may use a lot of memory
         doc = pymupdf.open(stream=contents, filetype="pdf")
     return doc
Esempio n. 18
0
File: txt.py Progetto: turicas/rows
def import_from_txt(
    filename_or_fobj, encoding="utf-8", frame_style=FRAME_SENTINEL, *args, **kwargs
):
    """Return a rows.Table created from imported TXT file."""

    # TODO: (maybe) allow parsing non-fixed-width columns with the old
    # algorithm, splitting at the frame's vertical separator character
    # (behind an optional parameter).  Note that the old approach silently
    # yielded bad results when a field value itself contained a pipe ("|").

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    raw_contents = fobj.read().decode(encoding).rstrip("\n")

    if frame_style is FRAME_SENTINEL:
        frame_style = _guess_frame_style(raw_contents)
    else:
        frame_style = _parse_frame_style(frame_style)

    lines = raw_contents.splitlines()
    del raw_contents

    if frame_style == "None":
        # The table is possibly generated from another source: only drop the
        # line we reserve as a separator if it is really empty.
        if not lines[1].strip():
            del lines[1]
    else:
        # Drop the top/bottom frame lines and the header/body separator.
        lines = lines[1:-1]
        del lines[1]
    col_positions = _parse_col_positions(frame_style, lines[0])

    boundaries = list(zip(col_positions, col_positions[1:]))
    table_rows = [
        [line[start + 1 : end].strip() for start, end in boundaries]
        for line in lines
    ]
    #
    # Variable columns - old behavior:
    # table_rows = [[value.strip() for value in row.split(vertical_char)[1:-1]]
    #              for row in contents]

    meta = {
        "imported_from": "txt",
        "filename": filename,
        "encoding": encoding,
        "frame_style": frame_style,
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
Esempio n. 19
0
def import_from_txt(filename_or_fobj,
                    encoding='utf-8',
                    frame_style=FRAME_SENTINEL,
                    *args,
                    **kwargs):
    """Return a rows.Table created from imported TXT file."""

    # TODO: (maybe) allow parsing non-fixed-width columns with the old
    # algorithm, splitting at the frame's vertical separator character
    # (behind an optional parameter).  The old approach also silently
    # yielded bad results when a field value contained a pipe ("|").

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    raw_contents = fobj.read().decode(encoding).rstrip('\n')

    frame_style = (_guess_frame_style(raw_contents)
                   if frame_style is FRAME_SENTINEL
                   else _parse_frame_style(frame_style))

    contents = raw_contents.splitlines()
    del raw_contents

    if frame_style == 'None':
        # The table possibly comes from another source; only remove the line
        # we reserve as a separator when it is really empty.
        if not contents[1].strip():
            del contents[1]
    else:
        # Remove the outer frame lines and the header/body separator.
        contents = contents[1:-1]
        del contents[1]
    col_positions = _parse_col_positions(frame_style, contents[0])
    column_spans = list(zip(col_positions, col_positions[1:]))

    table_rows = []
    for row in contents:
        table_rows.append(
            [row[start + 1:end].strip() for start, end in column_spans])
    #
    # Variable columns - old behavior:
    # table_rows = [[value.strip() for value in row.split(vertical_char)[1:-1]]
    #              for row in contents]

    meta = {
        'imported_from': 'txt',
        'filename': filename,
        'encoding': encoding,
        'frame_style': frame_style
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
Esempio n. 20
0
def export_to_csv(
    table,
    filename_or_fobj=None,
    encoding="utf-8",
    dialect=unicodecsv.excel,
    batch_size=100,
    callback=None,
    *args,
    **kwargs
):
    """Serialize a `rows.Table` to CSV, optionally reporting progress.

    A provided file-like object MUST be opened in binary mode, as in
    `open(filename, mode='wb')`.  Without a filename/fobj, the CSV contents
    are returned as a string.  `callback`, if given, is invoked after each
    batch with the running count of data rows written.
    """
    # TODO: will work only if table.fields is OrderedDict
    # TODO: should use fobj? What about creating a method like json.dumps?

    return_contents = filename_or_fobj is None
    if return_contents:
        fobj = BytesIO()
    else:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode="wb")

    # TODO: may use `io.BufferedWriter` instead of `ipartition` so user can
    # choose the real size (in Bytes) when to flush to the file system, instead
    # number of rows
    writer = unicodecsv.writer(fobj, encoding=encoding, dialect=dialect)

    serialized = serialize(table, *args, **kwargs)
    if callback is None:
        for batch in ipartition(serialized, batch_size):
            writer.writerows(batch)
    else:
        writer.writerow(next(serialized))  # First, write the header
        written = 0
        for batch in ipartition(serialized, batch_size):
            writer.writerows(batch)
            written += len(batch)
            callback(written)

    if return_contents:
        fobj.seek(0)
        contents = fobj.read()
        fobj.close()
        return contents
    fobj.flush()
    return fobj
Esempio n. 21
0
def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    'Import data from a JSON file'

    kwargs['encoding'] = encoding
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # Fix: the `encoding` parameter of `json.load` was deprecated and then
    # removed in Python 3.9, where passing it raises TypeError; retry
    # without it (the json module autodetects the encoding of bytes input).
    try:
        json_obj = json.load(fobj, encoding=encoding)
    except TypeError:
        json_obj = json.load(fobj)
    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]

    data = [field_names] + table_rows
    meta = {'imported_from': 'json', 'filename': filename, }
    return create_table(data, meta=meta, *args, **kwargs)
Esempio n. 22
0
def import_from_json(filename_or_fobj, encoding="utf-8", *args, **kwargs):
    """Import a JSON file or file-like object into a `rows.Table`.

    If a file-like object is provided it MUST be open in text (non-binary) mode
    on Python 3 and could be open in both binary or text mode on Python 2.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # Fix: `json.load(..., encoding=...)` was removed in Python 3.9 and
    # raises TypeError there; fall back to calling without the parameter.
    try:
        json_obj = json.load(fobj, encoding=encoding)
    except TypeError:
        json_obj = json.load(fobj)
    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]

    meta = {"imported_from": "json", "filename": filename, "encoding": encoding}
    return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)
Esempio n. 23
0
def export_to_csv(table,
                  filename_or_fobj=None,
                  encoding="utf-8",
                  dialect=unicodecsv.excel,
                  batch_size=100,
                  callback=None,
                  *args,
                  **kwargs):
    """Write a `rows.Table` as CSV in batches.

    A provided file-like object MUST be opened in binary mode, as in
    `open(filename, mode='wb')`; without a filename/fobj the CSV contents are
    returned as a string.  When `callback` is given it receives the running
    number of data rows written after every batch.
    """
    # TODO: will work only if table.fields is OrderedDict
    # TODO: should use fobj? What about creating a method like json.dumps?

    in_memory = filename_or_fobj is None
    fobj = BytesIO() if in_memory else \
        get_filename_and_fobj(filename_or_fobj, mode="wb")[1]

    # TODO: may use `io.BufferedWriter` instead of `ipartition` so user can
    # choose the real size (in Bytes) when to flush to the file system, instead
    # number of rows
    writer = unicodecsv.writer(fobj, encoding=encoding, dialect=dialect)

    rows_iter = serialize(table, *args, **kwargs)
    if callback is None:
        for chunk in ipartition(rows_iter, batch_size):
            writer.writerows(chunk)
    else:
        writer.writerow(next(rows_iter))  # First, write the header
        row_count = 0
        for chunk in ipartition(rows_iter, batch_size):
            writer.writerows(chunk)
            row_count += len(chunk)
            callback(row_count)

    if in_memory:
        fobj.seek(0)
        contents = fobj.read()
        fobj.close()
        return contents
    fobj.flush()
    return fobj
Esempio n. 24
0
File: txt.py Progetto: abelthf/rows
def import_from_txt(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    """Parse a PIPE-framed plain-text table into a rows.Table."""
    # TODO: should be able to change DASH, PLUS and PIPE
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    lines = fobj.read().decode(encoding).strip().splitlines()

    # Drop the '+----+----+' frame lines (top, bottom and under the header).
    lines = lines[1:-1]
    del lines[1]

    table_rows = [
        [cell.strip() for cell in line.split(PIPE)[1:-1]] for line in lines
    ]
    meta = {'imported_from': 'txt', 'filename': filename,}
    return create_table(table_rows, meta=meta, *args, **kwargs)
Esempio n. 25
0
def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath,
                      encoding='utf-8', *args, **kwargs):
    """Build a table by extracting rows and fields from XML via XPath."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    tree = tree_from_string(fobj.read().decode(encoding))
    elements = tree.xpath(rows_xpath)

    header = fields_xpath.keys()
    result_rows = [_get_row_data(element, fields_xpath)
                   for element in elements]

    meta = {'imported_from': 'xpath', 'filename': filename,}
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
Esempio n. 26
0
def import_from_html(
    filename_or_fobj,
    encoding="utf-8",
    index=0,
    ignore_colspan=True,
    preserve_html=False,
    properties=False,
    table_tag="table",
    row_tag="tr",
    column_tag="td|th",
    *args,
    **kwargs
):
    """Return rows.Table from HTML file."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    html_tree = document_fromstring(fobj.read().decode(encoding))
    table = html_tree.xpath("//{}".format(table_tag))[index]

    # thead/tbody wrappers would hide rows from the row_tag XPath below.
    strip_tags(table, "thead")
    strip_tags(table, "tbody")
    row_elements = table.xpath(row_tag)

    table_rows = []
    for row_element in row_elements:
        table_rows.append(
            _get_row(
                row_element,
                column_tag=column_tag,
                preserve_html=preserve_html,
                properties=properties,
            )
        )

    if properties:
        table_rows[0][-1] = "properties"

    if preserve_html and kwargs.get("fields", None) is None:
        # The field names will be the first table row, so we need to strip HTML
        # from it even if `preserve_html` is `True` (it's `True` only for rows,
        # not for the header).
        table_rows[0] = [_extract_node_text(cell) for cell in row_elements[0]]

    if ignore_colspan:
        # Keep only rows whose length matches the widest row.
        width = max(len(row) for row in table_rows)
        table_rows = [row for row in table_rows if len(row) == width]

    meta = {"imported_from": "html", "filename": filename, "encoding": encoding}
    return create_table(table_rows, meta=meta, *args, **kwargs)
Esempio n. 27
0
File: xls.py Progetto: abelthf/rows
def import_from_xls(filename_or_fobj,
                    sheet_name=None,
                    sheet_index=0,
                    start_row=0,
                    start_column=0,
                    *args,
                    **kwargs):
    """Read header and rows from an XLS sheet until an empty column/row."""
    filename, _ = get_filename_and_fobj(filename_or_fobj)
    book = xlrd.open_workbook(filename, formatting_info=True)
    sheet = (book.sheet_by_name(sheet_name) if sheet_name is not None
             else book.sheet_by_index(sheet_index))
    # TODO: may re-use Excel data types

    # Field names: walk right from `start_column` until a falsy cell.
    # TODO: may use sheet.col_values or even sheet.ncols
    header = []
    while True:
        value = cell_value(sheet, start_row, start_column + len(header))
        if not value:
            break
        header.append(value)
    column_count = len(header)

    # Data rows: walk down from the row below the header until an all-empty
    # row is found.
    # TODO: may use sheel.col_slice or even sheet.nrows
    table_rows = []
    current_row = start_row + 1
    while True:
        row = [
            cell_value(sheet, current_row, start_column + column_index)
            for column_index in range(column_count)
        ]
        if not any(row):
            break
        table_rows.append(row)
        current_row += 1

    meta = {
        'imported_from': 'xls',
        'filename': filename,
    }
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
Esempio n. 28
0
def import_from_txt(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    """Parse a framed plain-text table into a rows.Table."""
    # TODO: should be able to change DASH, PLUS and PIPE
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    lines = fobj.read().decode(encoding).strip().splitlines()

    # remove '+----+----+' lines (top, bottom and under the header)
    lines = lines[1:-1]
    del lines[1]

    table_rows = []
    for line in lines:
        table_rows.append([cell.strip() for cell in line.split(PIPE)[1:-1]])
    meta = {'imported_from': 'txt',
            'filename': filename,
            'encoding': encoding,}
    return create_table(table_rows, meta=meta, *args, **kwargs)
Esempio n. 29
0
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    'Import data from a Parquet file'

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    # Map each named, typed column in the footer schema to a rows field type.
    # TODO: should look into `schema.converted_type` also
    footer_schema = parquet._read_footer(fobj).schema
    types = OrderedDict(
        (schema.name, PARQUET_TO_ROWS[schema.type])
        for schema in footer_schema
        if schema.type is not None
    )
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {'imported_from': 'parquet', 'filename': filename,}
    return create_table([header] + table_rows, meta=meta, force_types=types,
                        *args, **kwargs)
Esempio n. 30
0
def import_from_html(filename_or_fobj,
                     encoding='utf-8',
                     index=0,
                     ignore_colspan=True,
                     preserve_html=False,
                     properties=False,
                     table_tag='table',
                     row_tag='tr',
                     column_tag='td|th',
                     *args,
                     **kwargs):
    """Return rows.Table from HTML file."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath('//{}'.format(table_tag))
    table = tables[index]

    # thead/tbody wrappers would hide rows from the row_tag XPath below.
    strip_tags(table, 'thead')
    strip_tags(table, 'tbody')
    row_elements = table.xpath(row_tag)

    table_rows = [
        _get_row(row,
                 column_tag=column_tag,
                 preserve_html=preserve_html,
                 properties=properties) for row in row_elements
    ]

    if properties:
        table_rows[0][-1] = 'properties'

    if preserve_html and kwargs.get('fields', None) is None:
        # The field names will be the first table row, so we need to strip HTML
        # from it even if `preserve_html` is `True` (it's `True` only for rows,
        # not for the header).
        table_rows[0] = list(map(_extract_node_text, row_elements[0]))

    if ignore_colspan:
        # Fix: compute the width only when it is actually used (it was
        # previously computed unconditionally before the `if`).
        max_columns = max(map(len, table_rows))
        table_rows = [row for row in table_rows if len(row) == max_columns]

    meta = {
        'imported_from': 'html',
        'filename': filename,
        'encoding': encoding,
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
Esempio n. 31
0
File: csv.py Progetto: abelthf/rows
def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None, *args,
                    **kwargs):
    'Import data from a CSV file'

    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    if dialect is None:
        # Sniff the dialect from the first line, then rewind the file.
        first_line = fobj.readline().decode(encoding)
        dialect = unicodecsv.Sniffer().sniff(first_line)
        fobj.seek(0)

    kwargs['encoding'] = encoding
    reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)

    meta = {'imported_from': 'csv', 'filename': filename,}
    return create_table(reader, meta=meta, *args, **kwargs)
Esempio n. 32
0
def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    '''Import data from a JSON file

    The file must contain a JSON array of objects; the keys of the first
    object define the field names.
    '''

    kwargs['encoding'] = encoding
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # `json.load`'s `encoding` parameter was ignored on Python 3 and
    # removed in Python 3.9 (it raises TypeError there); the file object
    # must already be decoded.
    json_obj = json.load(fobj)
    # Materialize the keys: on Python 3 `.keys()` is a lazy view
    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]

    data = [field_names] + table_rows
    meta = {
        'imported_from': 'json',
        'filename': filename,
    }
    return create_table(data, meta=meta, *args, **kwargs)
Esempio n. 33
0
File: xls.py Progetto: turicas/rows
def import_from_xls(
    filename_or_fobj,
    sheet_name=None,
    sheet_index=0,
    start_row=None,
    start_column=None,
    end_row=None,
    end_column=None,
    *args,
    **kwargs
):
    """Return a rows.Table created from imported XLS file."""

    filename, _ = get_filename_and_fobj(filename_or_fobj, mode="rb")
    book = xlrd.open_workbook(filename, formatting_info=True)
    sheet = (
        book.sheet_by_name(sheet_name)
        if sheet_name is not None
        else book.sheet_by_index(sheet_index)
    )
    # TODO: may re-use Excel data types

    # Both xlrd and rows index cells from 0, so the detected table start
    # and the sheet dimensions can be used as defaults unchanged.
    # TODO: consider adding a parameter `ignore_padding=True` and when it's
    # True, consider `start_row` starting from `min_row` and `start_column`
    # starting from `min_col`.
    detected_row, detected_column = get_table_start(sheet)
    if start_row is None:
        start_row = detected_row
    if end_row is None:
        end_row = sheet.nrows - 1
    if start_column is None:
        start_column = detected_column
    if end_column is None:
        end_column = sheet.ncols - 1

    table_rows = []
    for row_index in range(start_row, end_row + 1):
        table_rows.append(
            [
                cell_value(sheet, row_index, column_index)
                for column_index in range(start_column, end_column + 1)
            ]
        )

    meta = {"imported_from": "xls", "filename": filename, "sheet_name": sheet.name}
    return create_table(table_rows, meta=meta, *args, **kwargs)
Esempio n. 34
0
def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    '''Import a JSON file or file-like object into a `rows.Table`

    If a file-like object is provided it MUST be open in text (non-binary) mode
    on Python 3 and could be open in both binary or text mode on Python 2.
    '''

    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # `json.load`'s `encoding` parameter was ignored on Python 3 and
    # removed in Python 3.9 (it raises TypeError there); the file object
    # must already be decoded.
    json_obj = json.load(fobj)
    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]

    meta = {'imported_from': 'json',
            'filename': filename,
            'encoding': encoding,}
    return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)
Esempio n. 35
0
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    """Import data from a Parquet file and return with rows.Table."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")

    # Map each Parquet schema entry to the corresponding rows field type
    # TODO: should look into `schema.converted_type` also
    types = OrderedDict()
    for schema in parquet._read_footer(fobj).schema:
        if schema.type is not None:
            types[schema.name] = PARQUET_TO_ROWS[schema.type]
    header = list(types)
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {"imported_from": "parquet", "filename": filename}
    return create_table(
        [header] + table_rows, meta=meta, force_types=types, *args, **kwargs
    )
Esempio n. 36
0
def import_from_xlsx(filename_or_fobj,
                     sheet_name=None,
                     sheet_index=0,
                     start_row=None,
                     start_column=None,
                     end_row=None,
                     end_column=None,
                     *args,
                     **kwargs):
    """Return a rows.Table created from imported XLSX file."""

    workbook = load_workbook(filename_or_fobj, read_only=True)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    sheet = workbook[sheet_name]

    # openpyxl is 1-based while rows is 0-based; keep the public
    # parameters 0-based and convert only when touching the sheet.
    # TODO: consider adding a parameter `ignore_padding=True` and when it's
    # True, consider `start_row` starting from `sheet.min_row` and
    # `start_column` starting from `sheet.min_col`.
    if start_row is None:
        start_row = sheet.min_row - 1
    if end_row is None:
        end_row = sheet.max_row - 1
    if start_column is None:
        start_column = sheet.min_column - 1
    if end_column is None:
        end_column = sheet.max_column - 1

    table_rows = []
    for row_index in range(start_row + 1, end_row + 2):
        row = [
            _cell_to_python(sheet_cell(sheet, row_index, col_index))
            for col_index in range(start_column + 1, end_column + 2)
        ]
        # Skip rows where every cell is empty
        if any(cell is not None for cell in row):
            table_rows.append(row)

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {
        "imported_from": "xlsx",
        "filename": filename,
        "sheet_name": sheet_name
    }
    return create_table(table_rows, meta=metadata, *args, **kwargs)
Esempio n. 37
0
def import_from_xls(filename_or_fobj,
                    sheet_name=None,
                    sheet_index=0,
                    start_row=None,
                    start_column=None,
                    end_row=None,
                    end_column=None,
                    *args,
                    **kwargs):
    """Return a rows.Table created from imported XLS file."""

    filename, _ = get_filename_and_fobj(filename_or_fobj, mode="rb")
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is None:
        sheet = book.sheet_by_index(sheet_index)
    else:
        sheet = book.sheet_by_name(sheet_name)
    # TODO: may re-use Excel data types

    # xlrd and rows both use 0-based cell indexes, so the detected table
    # start and the sheet dimensions are usable defaults as-is.
    # TODO: consider adding a parameter `ignore_padding=True` and when it's
    # True, consider `start_row` starting from `min_row` and `start_column`
    # starting from `min_col`.
    default_start_row, default_start_column = get_table_start(sheet)
    if start_row is None:
        start_row = default_start_row
    if start_column is None:
        start_column = default_start_column
    if end_row is None:
        end_row = sheet.nrows - 1
    if end_column is None:
        end_column = sheet.ncols - 1

    rows_range = range(start_row, end_row + 1)
    columns_range = range(start_column, end_column + 1)
    table_rows = [
        [cell_value(sheet, row_index, column_index)
         for column_index in columns_range]
        for row_index in rows_range
    ]

    meta = {
        "imported_from": "xls",
        "filename": filename,
        "sheet_name": sheet.name,
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
Esempio n. 38
0
def import_from_json(filename_or_fobj, encoding="utf-8", *args, **kwargs):
    """Import a JSON file or file-like object into a `rows.Table`.

    If a file-like object is provided it MUST be open in text (non-binary) mode
    on Python 3 and could be open in both binary or text mode on Python 2.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # `json.load`'s `encoding` parameter was ignored on Python 3 and
    # removed in Python 3.9 (it raises TypeError there); the file object
    # must already be decoded.
    json_obj = json.load(fobj)
    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]

    meta = {
        "imported_from": "json",
        "filename": filename,
        "encoding": encoding
    }
    return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)
Esempio n. 39
0
def import_from_yaml(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    '''Import a YAML file or file-like object into a `rows.Table`

    If a file-like object is provided it MUST be open in text (non-binary) mode
    on Python 3 and could be open in both binary or text mode on Python 2.
    '''

    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # `yaml.load` without an explicit Loader can construct arbitrary
    # Python objects from untrusted input and is deprecated since
    # PyYAML 5.1; `safe_load` handles the plain list-of-dicts documents
    # this importer expects.
    yaml_obj = yaml.safe_load(fobj)
    field_names = list(yaml_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in yaml_obj]

    meta = {
        'imported_from': 'yaml',
        'filename': filename,
        'encoding': encoding
    }
    return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)
Esempio n. 40
0
def import_from_xpath(
    filename_or_fobj, rows_xpath, fields_xpath, encoding="utf-8", *args, **kwargs
):
    """Import data from an XML document using XPath expressions."""

    # Every XPath (the row selector and each field selector) must be a
    # unicode string
    xpath_types = {type(xpath) for xpath in fields_xpath.values()}
    xpath_types.add(type(rows_xpath))
    if xpath_types != {six.text_type}:
        raise TypeError("XPath must be {}".format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    xml_contents = fobj.read().decode(encoding)
    tree = tree_from_string(xml_contents)
    row_elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    row_data = _get_row_data(fields_xpath)
    result_rows = [row_data(element) for element in row_elements]

    meta = {"imported_from": "xpath", "filename": filename, "encoding": encoding}
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
Esempio n. 41
0
File: xlsx.py Progetto: turicas/rows
def import_from_xlsx(
    filename_or_fobj,
    sheet_name=None,
    sheet_index=0,
    start_row=None,
    start_column=None,
    end_row=None,
    end_column=None,
    *args,
    **kwargs
):
    """Return a rows.Table created from imported XLSX file."""

    workbook = load_workbook(filename_or_fobj, read_only=True)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    sheet = workbook[sheet_name]

    # openpyxl uses 1-based indexes while rows uses 0-based ones: the
    # public parameters stay 0-based and are shifted by one only when
    # actually reading cells below.
    # TODO: consider adding a parameter `ignore_padding=True` and when it's
    # True, consider `start_row` starting from `sheet.min_row` and
    # `start_column` starting from `sheet.min_col`.
    start_row = sheet.min_row - 1 if start_row is None else start_row
    end_row = sheet.max_row - 1 if end_row is None else end_row
    start_column = sheet.min_column - 1 if start_column is None else start_column
    end_column = sheet.max_column - 1 if end_column is None else end_column

    def row_is_empty(row):
        return all(cell is None for cell in row)

    table_rows = []
    for row_number in range(start_row + 1, end_row + 2):
        current_row = [
            _cell_to_python(sheet_cell(sheet, row_number, column_number))
            for column_number in range(start_column + 1, end_column + 2)
        ]
        if not row_is_empty(current_row):
            table_rows.append(current_row)

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {"imported_from": "xlsx", "filename": filename, "sheet_name": sheet_name}
    return create_table(table_rows, meta=metadata, *args, **kwargs)
Esempio n. 42
0
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    """Import data from a Parquet file and return with rows.Table."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")

    # Build the field-name -> rows-type mapping from the file footer
    # TODO: should look into `schema.converted_type` also
    footer_schemas = parquet._read_footer(fobj).schema
    types = OrderedDict(
        (schema.name, PARQUET_TO_ROWS[schema.type])
        for schema in footer_schemas
        if schema.type is not None
    )
    header = [field_name for field_name in types]
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {"imported_from": "parquet", "filename": filename}
    return create_table(
        [header] + table_rows, meta=meta, force_types=types, *args, **kwargs
    )
Esempio n. 43
0
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
                    start_row=0, start_column=0, *args, **kwargs):
    """Import data from an XLS file into a rows.Table."""

    filename, _ = get_filename_and_fobj(filename_or_fobj, mode='rb')
    book = xlrd.open_workbook(filename, formatting_info=True)
    sheet = (book.sheet_by_name(sheet_name) if sheet_name is not None
             else book.sheet_by_index(sheet_index))
    # TODO: may re-use Excel data types

    # Read every cell from the requested start position to the sheet end
    table_rows = []
    for row_index in range(start_row, sheet.nrows):
        row = [cell_value(sheet, row_index, column_index)
               for column_index in range(start_column, sheet.ncols)]
        table_rows.append(row)

    meta = {'imported_from': 'xls',
            'filename': filename,
            'sheet_name': sheet.name}
    return create_table(table_rows, meta=meta, *args, **kwargs)
Esempio n. 44
0
def import_from_csv(filename_or_fobj,
                    encoding="utf-8",
                    dialect=None,
                    sample_size=262144,
                    *args,
                    **kwargs):
    """Import data from a CSV file (automatically detects dialect).

    If a file-like object is provided it MUST be in binary mode, like in
    `open(filename, mode='rb')`.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")

    # Detect the dialect from a sample only when none was given
    if dialect is None:
        sample = read_sample(fobj, sample_size)
        dialect = discover_dialect(sample=sample, encoding=encoding)

    csv_reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)

    meta = {"imported_from": "csv", "filename": filename, "encoding": encoding}
    return create_table(csv_reader, meta=meta, *args, **kwargs)
Esempio n. 45
0
def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None,
                    sample_size=8192, *args, **kwargs):
    '''Import data from a CSV file

    If a file-like object is provided it MUST be in binary mode, like in
    `open(filename, mode='rb')`.
    '''

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    if dialect is None:
        # Remember the current position so sniffing does not consume data
        original_position = fobj.tell()
        dialect = discover_dialect(fobj.read(sample_size), encoding)
        fobj.seek(original_position)

    csv_reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)

    meta = {'imported_from': 'csv',
            'filename': filename,
            'encoding': encoding}
    return create_table(csv_reader, meta=meta, *args, **kwargs)
Esempio n. 46
0
File: csv.py Progetto: abelthf/rows
def export_to_csv(table, filename_or_fobj=None, encoding='utf-8', *args, **kwargs):
    '''Export a `rows.Table` to a CSV file

    If `filename_or_fobj` is given the CSV is written to it and the file
    object is returned (it must be binary-writable, since `unicodecsv`
    writes encoded bytes); otherwise the CSV contents are returned.
    '''
    # TODO: will work only if table.fields is OrderedDict
    # TODO: should use fobj? What about creating a method like json.dumps?

    kwargs['encoding'] = encoding
    if filename_or_fobj is not None:
        # unicodecsv writes encoded bytes, so open in binary mode ('wb',
        # not 'w') -- matches the other CSV exporter in this project.
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
    else:
        fobj = BytesIO()

    csv_writer = unicodecsv.writer(fobj, encoding=encoding)
    # Use a plain `for` loop: on Python 3 `map()` is lazy, so the
    # previous `map(csv_writer.writerow, ...)` never wrote any row.
    for row in serialize(table, *args, **kwargs):
        csv_writer.writerow(row)

    if filename_or_fobj is not None:
        fobj.flush()
        return fobj
    else:
        fobj.seek(0)
        result = fobj.read()
        fobj.close()
        return result
Esempio n. 47
0
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    '''Import data from a Parquet file'''

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    # Build the field-name -> rows-type mapping from the file footer
    # TODO: should look into `schema.converted_type` also
    typed_schemas = [
        schema for schema in parquet._read_footer(fobj).schema
        if schema.type is not None
    ]
    types = OrderedDict(
        (schema.name, PARQUET_TO_ROWS[schema.type]) for schema in typed_schemas
    )
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {'imported_from': 'parquet', 'filename': filename}
    return create_table([header] + table_rows, meta=meta, force_types=types,
                        *args, **kwargs)
Esempio n. 48
0
def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath,
                      encoding='utf-8', *args, **kwargs):
    """Import data from an XML document using XPath expressions."""

    # The row selector and every field selector must be unicode strings
    all_xpaths = [rows_xpath] + list(fields_xpath.values())
    if set(type(xpath) for xpath in all_xpaths) != set([six.text_type]):
        raise TypeError('XPath must be {}'.format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    tree = tree_from_string(fobj.read().decode(encoding))
    row_elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    extract_row = _get_row_data(fields_xpath)
    result_rows = [extract_row(element) for element in row_elements]

    meta = {'imported_from': 'xpath',
            'filename': filename,
            'encoding': encoding}
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
Esempio n. 49
0
File: xls.py Progetto: abelthf/rows
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
                    start_row=0, start_column=0, *args, **kwargs):
    """Import data from an XLS file into a `rows.Table`.

    The header is read from `start_row` rightwards until the first empty
    cell; data rows are then read below it until the first fully-empty
    row. `start_row`/`start_column` are 0-based (xlrd convention).
    """

    filename, _ = get_filename_and_fobj(filename_or_fobj)
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Get field names: scan the header row until an empty cell is found;
    # `column_count` ends up as the table width used for all data rows.
    # TODO: may use sheet.col_values or even sheet.ncols
    column_count = 0
    header = []
    column_value = cell_value(sheet, start_row, start_column + column_count)
    while column_value:
        header.append(column_value)
        column_count += 1
        column_value = cell_value(sheet, start_row,
                                  start_column + column_count)

    # Get sheet rows: read rows below the header until a row whose cells
    # are all falsy, which marks the end of the table.
    # TODO: may use sheet.col_slice or even sheet.nrows
    table_rows = []
    row_count = 0
    start_row += 1  # first data row is just below the header
    cell_is_empty = False
    while not cell_is_empty:
        row = [cell_value(sheet, start_row + row_count,
                          start_column + column_index)
               for column_index in range(column_count)]
        cell_is_empty = not any(row)
        if not cell_is_empty:
            table_rows.append(row)
            row_count += 1

    meta = {'imported_from': 'xls', 'filename': filename,}
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
Esempio n. 50
0
def export_to_xlsx(table,
                   filename_or_fobj=None,
                   sheet_name='Sheet1',
                   *args,
                   **kwargs):
    """Export the rows.Table to XLSX file and return the saved file."""

    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    prepared_table = prepare_to_export(table, *args, **kwargs)

    # The first serialized row carries the field names (sheet header)
    field_names = next(prepared_table)
    for col_index, field_name in enumerate(field_names, start=1):
        sheet.cell(row=1, column=col_index).value = field_name

    # Remaining rows: convert each Python value to a cell value plus an
    # optional openpyxl number format
    _convert_row = _python_to_cell(list(map(table.fields.get, field_names)))
    for row_index, row in enumerate(prepared_table, start=2):
        for col_index, (value, number_format) in enumerate(_convert_row(row),
                                                           start=1):
            cell = sheet.cell(row=row_index, column=col_index)
            cell.value = value
            if number_format is not None:
                cell.number_format = number_format

    if filename_or_fobj is None:
        buffer = BytesIO()
        workbook.save(buffer)
        buffer.seek(0)
        result = buffer.read()
        buffer.close()
        return result
    _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
    workbook.save(fobj)
    fobj.flush()
    return fobj
Esempio n. 51
0
def export_to_xlsx(table,
                   filename_or_fobj=None,
                   sheet_name='Sheet1',
                   *args,
                   **kwargs):
    """Export a `rows.Table` to an XLSX file.

    If `filename_or_fobj` is given the workbook is saved to it and the
    file object is returned; otherwise the XLSX contents are returned as
    bytes.
    """

    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    field_names = list(enumerate(table.fields))
    prepared_table = prepare_to_export(table, *args, **kwargs)

    # Write header. Use the `next()` builtin: generators lost the
    # `.next()` method on Python 3 (Python 2 only), so the previous
    # `prepared_table.next()` raised AttributeError.
    header = next(prepared_table)
    for col_index, field_name in enumerate(header):
        _write_cell(sheet, 0, col_index, field_name, fields.TextField)

    # Write sheet rows
    table_fields = table.fields
    for row_index, row in enumerate(prepared_table, start=1):
        for col_index, field_name in field_names:
            _write_cell(sheet,
                        row_index,
                        col_index,
                        value=row[col_index],
                        field_type=table_fields[field_name])

    if filename_or_fobj is not None:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
        workbook.save(fobj)
        fobj.flush()
        return fobj
    else:
        fobj = BytesIO()
        workbook.save(fobj)
        fobj.seek(0)
        result = fobj.read()
        fobj.close()
        return result
Esempio n. 52
0
def import_from_html(filename_or_fobj, encoding='utf-8', index=0,
                     ignore_colspan=True, preserve_html=False,
                     properties=False, table_tag='table', row_tag='tr',
                     column_tag='td|th', *args, **kwargs):
    """Import the `index`-th HTML table of a document into a rows.Table."""

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    html_tree = document_fromstring(fobj.read().decode(encoding))
    table = html_tree.xpath('//{}'.format(table_tag))[index]

    # Flatten <thead>/<tbody> so row elements are direct children
    strip_tags(table, 'thead')
    strip_tags(table, 'tbody')
    row_elements = table.xpath(row_tag)

    table_rows = []
    for row in row_elements:
        table_rows.append(_get_row(row,
                                   column_tag=column_tag,
                                   preserve_html=preserve_html,
                                   properties=properties))

    if properties:
        table_rows[0][-1] = 'properties'

    if preserve_html and kwargs.get('fields', None) is None:
        # The field names will be the first table row, so we need to strip HTML
        # from it even if `preserve_html` is `True` (it's `True` only for rows,
        # not for the header).
        table_rows[0] = [_extract_node_text(cell) for cell in row_elements[0]]

    max_columns = max(len(row) for row in table_rows)
    if ignore_colspan:
        table_rows = [row for row in table_rows if len(row) == max_columns]

    meta = {'imported_from': 'html',
            'filename': filename,
            'encoding': encoding}
    return create_table(table_rows, meta=meta, *args, **kwargs)
Esempio n. 53
0
def import_from_xlsx(filename_or_fobj,
                     sheet_name=None,
                     sheet_index=0,
                     start_row=0,
                     start_column=0,
                     end_row=None,
                     end_column=None,
                     *args,
                     **kwargs):
    """Return a rows.Table created from imported XLSX file."""

    workbook = load_workbook(filename_or_fobj, data_only=True)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    sheet = workbook[sheet_name]

    # openpyxl library reads rows and columns starting from 1 and ending on
    # sheet.max_row/max_col. rows uses another pattern: 0 to N - 1, so we need
    # to adjust the ranges accordingly
    min_row, min_col = sheet.min_row - 1, sheet.min_column - 1
    max_row, max_col = sheet.max_row - 1, sheet.max_column - 1
    start_row = max(start_row, min_row)
    # Test against `None`, not truthiness: `end_row or max_row` silently
    # ignored an explicit `end_row=0` (0 is falsy). Same for `end_column`.
    end_row = min(end_row if end_row is not None else max_row, max_row)
    start_col = max(start_column, min_col)
    end_col = min(end_column if end_column is not None else max_col, max_col)
    table_rows = [[
        _cell_to_python(sheet.cell(row=row_index, column=col_index))
        for col_index in range(start_col + 1, end_col + 2)
    ] for row_index in range(start_row + 1, end_row + 2)]

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {
        'imported_from': 'xlsx',
        'filename': filename,
        'sheet_name': sheet_name,
    }
    return create_table(table_rows, meta=metadata, *args, **kwargs)
Esempio n. 54
0
def import_from_xlsx(filename_or_fobj,
                     sheet_name=None,
                     sheet_index=0,
                     start_row=0,
                     start_column=0,
                     *args,
                     **kwargs):
    """Return a rows.Table imported from an XLSX file.

    Header cells are read from `start_row` rightwards until the first
    empty one; data rows are read below it until the first fully-empty
    row.
    """
    workbook = load_workbook(filename_or_fobj)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    # `workbook[sheet_name]` instead of `get_sheet_by_name`, which is
    # long-deprecated (removed in openpyxl 3.x) and inconsistent with
    # the other XLSX importers in this module.
    sheet = workbook[sheet_name]

    # Get sheet header: scan rightwards until an empty cell
    header = []
    last_column = start_column
    header_value = _get_cell_value(sheet, start_row, last_column)
    while header_value:
        header.append(header_value)
        last_column += 1
        header_value = _get_cell_value(sheet, start_row, last_column)
    last_column -= 1

    # Get sheet rows based on `last_column` defined in 'get sheet header'
    row_pos = start_row + 1
    all_rows = []
    row = _read_row(sheet, row_pos, last_column)
    while any(row):
        all_rows.append(row)
        row_pos += 1
        row = _read_row(sheet, row_pos, last_column)

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {
        'imported_from': 'xlsx',
        'filename': filename,
    }
    return create_table([header] + all_rows, meta=metadata, *args, **kwargs)
Esempio n. 55
0
File: xls.py Progetto: abelthf/rows
def export_to_xls(table,
                  filename_or_fobj=None,
                  sheet_name='Sheet1',
                  *args,
                  **kwargs):
    """Export a `rows.Table` to an XLS file.

    If `filename_or_fobj` is given the workbook is saved to it and the
    file object is returned; otherwise the XLS contents are returned as
    bytes.
    """

    work_book = xlwt.Workbook()
    sheet = work_book.add_sheet(sheet_name)

    prepared_table = prepare_to_export(table, *args, **kwargs)

    # Use the `next()` builtin: generators lost the `.next()` method on
    # Python 3 (Python 2 only), so `prepared_table.next()` raised
    # AttributeError.
    field_names = next(prepared_table)
    for column_index, field_name in enumerate(field_names):
        sheet.write(0, column_index, field_name)

    for row_index, row in enumerate(prepared_table, start=1):
        for column_index, (field_name, value) in \
                enumerate(zip(field_names, row)):
            field_type = table.fields[field_name]
            data = {}
            # Apply a display style (dates, times, ...) when one exists
            if field_type in FORMATTING_STYLES:
                data['style'] = FORMATTING_STYLES[field_type]
            sheet.write(row_index, column_index, value, **data)

    if filename_or_fobj is not None:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
        work_book.save(fobj)
        fobj.flush()
        return fobj
    else:
        fobj = BytesIO()
        work_book.save(fobj)
        fobj.seek(0)
        result = fobj.read()
        fobj.close()
        return result
Esempio n. 56
0
def import_from_csv(
    filename_or_fobj,
    encoding="utf-8",
    dialect=None,
    sample_size=262144,
    *args,
    **kwargs
):
    """Import data from a CSV file (automatically detects dialect).

    If a file-like object is provided it MUST be in binary mode, like in
    `open(filename, mode='rb')`.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")

    if dialect is None:
        # No explicit dialect: sniff one from the beginning of the file
        sample = read_sample(fobj, sample_size)
        dialect = discover_dialect(sample=sample, encoding=encoding)

    csv_reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)

    meta = {"imported_from": "csv", "filename": filename, "encoding": encoding}
    return create_table(csv_reader, meta=meta, *args, **kwargs)
Esempio n. 57
0
File: ods.py Progetto: turicas/rows
def import_from_ods(filename_or_fobj, index=0, *args, **kwargs):
    """Import the `index`-th spreadsheet of an ODS file into a rows.Table.

    The ODS container is a ZIP archive; cell values are read from its
    `content.xml` using the document's declared XML namespaces.
    """
    # TODO: import spreadsheet by name
    # TODO: unescape values

    filename, _ = get_filename_and_fobj(filename_or_fobj)

    ods_file = zipfile.ZipFile(filename)
    content_fobj = ods_file.open("content.xml")
    xml = content_fobj.read()  # will return bytes
    content_fobj.close()

    document = xml_from_string(xml)
    namespaces = document.nsmap
    spreadsheet = document.xpath("//office:spreadsheet", namespaces=namespaces)[0]
    tables = xpath(spreadsheet, "//table:table", namespaces)
    table = tables[index]

    table_rows_obj = xpath(table, "//table:table-row", namespaces)
    table_rows = []
    for row_obj in table_rows_obj:
        row = []
        for cell in xpath(row_obj, "//table:table-cell", namespaces):
            children = cell.getchildren()
            # Cells without children carry no value (e.g. padding cells)
            if not children:
                continue

            # Pick the value according to the declared office:value-type
            # TODO: evalute 'boolean' and 'time' types
            value_type = attrib(cell, namespaces["office"], "value-type")
            if value_type == "date":
                cell_value = attrib(cell, namespaces["office"], "date-value")
            elif value_type == "float":
                cell_value = attrib(cell, namespaces["office"], "value")
            elif value_type == "percentage":
                cell_value = attrib(cell, namespaces["office"], "value")
                cell_value = Decimal(cell_value)
                cell_value = "{:%}".format(cell_value)
            elif value_type == "string":
                try:
                    # get computed string (from formula, for example)
                    cell_value = attrib(cell, namespaces["office"], "string-value")
                except KeyError:
                    # computed string not present => get from <p>...</p>
                    cell_value = children[0].text
            else:  # value_type == some type we don't know
                cell_value = children[0].text

            # A repeated cell spans several columns: duplicate its value
            try:
                repeat = attrib(cell, namespaces["table"], "number-columns-repeated")
            except KeyError:
                row.append(cell_value)
            else:
                for _ in range(int(repeat)):
                    row.append(cell_value)

        if row:
            table_rows.append(row)

    # Pad shorter rows with None so every row has the same width
    max_length = max(len(row) for row in table_rows)
    full_rows = complete_with_None(table_rows, max_length)
    meta = {"imported_from": "ods", "filename": filename}
    return create_table(full_rows, meta=meta, *args, **kwargs)