def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath, encoding="utf-8", *args, **kwargs):
    """Import data from an XML document using XPath expressions."""
    # Every XPath (the row selector and each field selector) must be a
    # text (unicode) string.
    xpath_types = {type(rows_xpath)}
    xpath_types.update(type(xpath) for xpath in fields_xpath.values())
    if xpath_types != {six.text_type}:
        raise TypeError("XPath must be {}".format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    tree = tree_from_string(fobj.read().decode(encoding))
    row_elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    extract_row = _get_row_data(fields_xpath)
    result_rows = [extract_row(element) for element in row_elements]

    meta = {
        "imported_from": "xpath",
        "filename": filename,
        "encoding": encoding,
    }
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
def export_to_csv(table, filename_or_fobj=None, encoding='utf-8', dialect=unicodecsv.excel, *args, **kwargs):
    """Write a `rows.Table` out as CSV.

    A provided file-like object MUST be opened in binary mode (e.g.
    `open(filename, mode='wb')`). When no filename/fobj is given, the CSV
    contents are returned instead.
    """
    # TODO: will work only if table.fields is OrderedDict
    # TODO: should use fobj? What about creating a method like json.dumps?
    return_contents = filename_or_fobj is None
    if return_contents:
        fobj = BytesIO()
    else:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')

    writer = unicodecsv.writer(fobj, encoding=encoding, dialect=dialect)
    for serialized_row in serialize(table, *args, **kwargs):
        writer.writerow(serialized_row)

    if return_contents:
        fobj.seek(0)
        contents = fobj.read()
        fobj.close()
        return contents

    fobj.flush()
    return fobj
def export_to_xls(table, filename_or_fobj=None, sheet_name='Sheet1', *args, **kwargs):
    """Write a `rows.Table` into a single-sheet XLS workbook.

    With a filename/file object (binary mode), the workbook is saved there
    and the file object is returned; otherwise the XLS bytes are returned.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(sheet_name)

    serialized = prepare_to_export(table, *args, **kwargs)
    header = next(serialized)
    for col, name in enumerate(header):
        sheet.write(0, col, name)

    # Build a converter that maps each Python value to an XLS cell value
    # plus per-cell write options (e.g. style), based on the field types.
    convert_row = _python_to_xls([table.fields.get(name) for name in header])
    for row_number, row in enumerate(serialized, start=1):
        for col, (value, cell_options) in enumerate(convert_row(row)):
            sheet.write(row_number, col, value, **cell_options)

    if filename_or_fobj is not None:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
        workbook.save(fobj)
        fobj.flush()
        return fobj

    buffer = BytesIO()
    workbook.save(buffer)
    buffer.seek(0)
    contents = buffer.read()
    buffer.close()
    return contents
def export_to_xls(table, filename_or_fobj=None, sheet_name='Sheet1', *args, **kwargs):
    """Export a `rows.Table` to an XLS workbook with a single sheet.

    If `filename_or_fobj` is given (file-like objects must be in binary
    mode) the workbook is saved there and the file object is returned;
    otherwise the XLS contents are returned as bytes.
    """
    work_book = xlwt.Workbook()
    sheet = work_book.add_sheet(sheet_name)

    prepared_table = prepare_to_export(table, *args, **kwargs)
    # BUGFIX: `prepared_table.next()` is Python-2-only; the builtin `next()`
    # works on both Python 2 and 3 (and matches the rest of the codebase).
    field_names = next(prepared_table)
    for column_index, field_name in enumerate(field_names):
        sheet.write(0, column_index, field_name)

    for row_index, row in enumerate(prepared_table, start=1):
        for column_index, (field_name, value) in \
                enumerate(zip(field_names, row)):
            field_type = table.fields[field_name]
            data = {}
            # Apply a cell style only for field types that define one.
            if field_type in FORMATTING_STYLES:
                data['style'] = FORMATTING_STYLES[field_type]
            sheet.write(row_index, column_index, value, **data)

    if filename_or_fobj is not None:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
        work_book.save(fobj)
        fobj.flush()
        return fobj
    else:
        fobj = BytesIO()
        work_book.save(fobj)
        fobj.seek(0)
        result = fobj.read()
        fobj.close()
        return result
def export_to_xls(table, filename_or_fobj=None, sheet_name='Sheet1', *args, **kwargs):
    """Export the rows.Table to XLS file and return the saved file."""
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(sheet_name)

    serialized = prepare_to_export(table, *args, **kwargs)

    # First yielded item is the header row.
    header = next(serialized)
    for column, name in enumerate(header):
        sheet.write(0, column, name)

    # Converter mapping each Python value to an XLS value + write options.
    convert_row = _python_to_xls(
        [table.fields.get(name) for name in header])
    for row_number, row in enumerate(serialized, start=1):
        for column, (value, cell_options) in enumerate(convert_row(row)):
            sheet.write(row_number, column, value, **cell_options)

    if filename_or_fobj is None:
        buffer = BytesIO()
        workbook.save(buffer)
        buffer.seek(0)
        contents = buffer.read()
        buffer.close()
        return contents

    _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
    workbook.save(fobj)
    fobj.flush()
    return fobj
def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None, sample_size=8192, *args, **kwargs):
    """Import data from a CSV file.

    A provided file-like object MUST be opened in binary mode (e.g.
    `open(filename, mode='rb')`).
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    if dialect is None:
        # Sniff the dialect from a sample, then rewind to where we started.
        position = fobj.tell()
        dialect = discover_dialect(fobj.read(sample_size), encoding)
        fobj.seek(position)

    csv_rows = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)
    return create_table(
        csv_rows,
        meta={
            'imported_from': 'csv',
            'filename': filename,
            'encoding': encoding,
        },
        *args,
        **kwargs
    )
def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0, start_row=0, start_column=0, *args, **kwargs):
    """Import one sheet of an XLSX file into a `rows.Table`."""
    workbook = load_workbook(filename_or_fobj)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    sheet = workbook.get_sheet_by_name(sheet_name)

    # Walk right from `start_column` collecting header cells until an empty
    # (falsy) one is found; `last_column` ends at the final header column.
    header = []
    last_column = start_column
    while True:
        cell = _get_cell_value(sheet, start_row, last_column)
        if not cell:
            break
        header.append(cell)
        last_column += 1
    last_column -= 1

    # Read data rows below the header until a completely empty row shows up.
    all_rows = []
    row_pos = start_row + 1
    while True:
        row = _read_row(sheet, row_pos, last_column)
        if not any(row):
            break
        all_rows.append(row)
        row_pos += 1

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {'imported_from': 'xlsx', 'filename': filename, }
    return create_table([header] + all_rows, meta=metadata, *args, **kwargs)
def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath, encoding='utf-8', *args, **kwargs):
    """Import data from an XML document using XPath expressions."""
    # The row selector and every field selector must be unicode strings.
    all_xpaths = [rows_xpath] + list(fields_xpath.values())
    if set(map(type, all_xpaths)) != set([six.text_type]):
        raise TypeError('XPath must be {}'.format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    tree = tree_from_string(fobj.read().decode(encoding))
    row_elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    extract = _get_row_data(fields_xpath)
    result_rows = [extract(element) for element in row_elements]

    meta = {
        'imported_from': 'xpath',
        'filename': filename,
        'encoding': encoding,
    }
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0, start_row=0, start_column=0, *args, **kwargs):
    """Import one sheet of an XLSX file into a `rows.Table`."""
    workbook = load_workbook(filename_or_fobj)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    sheet = workbook.get_sheet_by_name(sheet_name)

    # Clamp the requested start to the sheet's populated area and read up to
    # the last populated row/column (openpyxl indexes are inclusive).
    first_row = max(start_row, sheet.min_row)
    last_row = sheet.max_row
    first_col = max(start_column, sheet.min_column)
    last_col = sheet.max_column

    table_rows = []
    for row_index in range(first_row, last_row + 1):
        table_rows.append([
            _cell_to_python(sheet.cell(row=row_index, column=col_index))
            for col_index in range(first_col, last_col + 1)
        ])

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {
        'imported_from': 'xlsx',
        'filename': filename,
        'sheet_name': sheet_name,
    }
    return create_table(table_rows, meta=metadata, *args, **kwargs)
def export_to_xlsx(table, filename_or_fobj=None, sheet_name='Sheet1', *args, **kwargs):
    """Export a `rows.Table` to an XLSX workbook with a single sheet.

    If `filename_or_fobj` is given (file-like objects must be in binary
    mode) the workbook is saved there and the file object is returned;
    otherwise the XLSX contents are returned as bytes.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    field_names = list(enumerate(table.fields))
    prepared_table = prepare_to_export(table, *args, **kwargs)

    # Write header
    # BUGFIX: `prepared_table.next()` is Python-2-only; the builtin `next()`
    # works on both Python 2 and 3 (and matches the rest of the codebase).
    header = next(prepared_table)
    for col_index, field_name in enumerate(header):
        _write_cell(sheet, 0, col_index, field_name, fields.TextField)

    # Write sheet rows
    table_fields = table.fields
    for row_index, row in enumerate(prepared_table, start=1):
        for col_index, field_name in field_names:
            _write_cell(sheet, row_index, col_index,
                        value=row[col_index],
                        field_type=table_fields[field_name])

    if filename_or_fobj is not None:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
        workbook.save(fobj)
        fobj.flush()
        return fobj
    else:
        fobj = BytesIO()
        workbook.save(fobj)
        fobj.seek(0)
        result = fobj.read()
        fobj.close()
        return result
def count_tables(filename_or_fobj, encoding="utf-8", table_tag="table"):
    """Return how many `table_tag` elements the given HTML document has."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    html = fobj.read().decode(encoding)
    document = document_fromstring(html)
    return len(document.xpath("//{}".format(table_tag)))
def export_to_xlsx(table, filename_or_fobj=None, sheet_name="Sheet1", *args, **kwargs):
    """Export the rows.Table to XLSX file and return the saved file."""
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name

    serialized = prepare_to_export(table, *args, **kwargs)

    # Header goes into row 1 (openpyxl cell indexes are 1-based).
    header = next(serialized)
    for col_index, field_name in enumerate(header):
        sheet.cell(row=1, column=col_index + 1).value = field_name

    # Converter mapping each Python value to a cell value plus an optional
    # number format.
    convert_row = _python_to_cell(list(map(table.fields.get, header)))
    for row_index, row in enumerate(serialized, start=1):
        for col_index, (value, number_format) in enumerate(convert_row(row)):
            cell = sheet.cell(row=row_index + 1, column=col_index + 1)
            cell.value = value
            if number_format is not None:
                cell.number_format = number_format

    if filename_or_fobj is None:
        buffer = BytesIO()
        workbook.save(buffer)
        buffer.seek(0)
        contents = buffer.read()
        buffer.close()
        return contents

    _, fobj = get_filename_and_fobj(filename_or_fobj, mode="wb")
    workbook.save(fobj)
    fobj.flush()
    return fobj
def document(self):
    """Build a pdfminer document object for this instance's file."""
    _, fobj = get_filename_and_fobj(self.filename_or_fobj, mode="rb")
    parser = PDFParser(fobj)
    doc = PDFDocument(parser)
    # Parser and document must reference each other before use.
    parser.set_document(doc)
    return doc
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0, start_row=0, start_column=0, *args, **kwargs):
    """Import one sheet of an XLS file into a `rows.Table`."""
    # xlrd is given the filename; the opened file object is not used here.
    filename, _ = get_filename_and_fobj(filename_or_fobj, mode='rb')
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Header and data rows come straight from the requested window.
    table_rows = []
    for row_index in range(start_row, sheet.nrows):
        table_rows.append([
            cell_value(sheet, row_index, column_index)
            for column_index in range(start_column, sheet.ncols)
        ])

    meta = {
        'imported_from': 'xls',
        'filename': filename,
        'sheet_name': sheet.name,
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
def document(self):
    """Open this instance's PDF with PyMuPDF and return the document."""
    filename, fobj = get_filename_and_fobj(self.filename_or_fobj, mode="rb")
    if filename:
        return pymupdf.open(filename=filename, filetype="pdf")
    # No filename available: feed the raw bytes instead.
    data = fobj.read()  # TODO: may use a lot of memory
    return pymupdf.open(stream=data, filetype="pdf")
def import_from_txt( filename_or_fobj, encoding="utf-8", frame_style=FRAME_SENTINEL, *args, **kwargs ): """Return a rows.Table created from imported TXT file.""" # TODO: (maybe) # enable parsing of non-fixed-width-columns # with old algorithm - that would just split columns # at the vertical separator character for the frame. # (if doing so, include an optional parameter) # Also, this fixes an outstanding unreported issue: # trying to parse tables which fields values # included a Pipe char - "|" - would silently # yield bad results. filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb") raw_contents = fobj.read().decode(encoding).rstrip("\n") if frame_style is FRAME_SENTINEL: frame_style = _guess_frame_style(raw_contents) else: frame_style = _parse_frame_style(frame_style) contents = raw_contents.splitlines() del raw_contents if frame_style != "None": contents = contents[1:-1] del contents[1] else: # the table is possibly generated from other source. # check if the line we reserve as a separator is realy empty. if not contents[1].strip(): del contents[1] col_positions = _parse_col_positions(frame_style, contents[0]) table_rows = [ [ row[start + 1 : end].strip() for start, end in zip(col_positions, col_positions[1:]) ] for row in contents ] # # Variable columns - old behavior: # table_rows = [[value.strip() for value in row.split(vertical_char)[1:-1]] # for row in contents] meta = { "imported_from": "txt", "filename": filename, "encoding": encoding, "frame_style": frame_style, } return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_txt(filename_or_fobj, encoding='utf-8', frame_style=FRAME_SENTINEL, *args, **kwargs): """Return a rows.Table created from imported TXT file.""" # TODO: (maybe) # enable parsing of non-fixed-width-columns # with old algorithm - that would just split columns # at the vertical separator character for the frame. # (if doing so, include an optional parameter) # Also, this fixes an outstanding unreported issue: # trying to parse tables which fields values # included a Pipe char - "|" - would silently # yield bad results. filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb') raw_contents = fobj.read().decode(encoding).rstrip('\n') if frame_style is FRAME_SENTINEL: frame_style = _guess_frame_style(raw_contents) else: frame_style = _parse_frame_style(frame_style) contents = raw_contents.splitlines() del raw_contents if frame_style != 'None': contents = contents[1:-1] del contents[1] else: # the table is possibly generated from other source. # check if the line we reserve as a separator is realy empty. if not contents[1].strip(): del contents[1] col_positions = _parse_col_positions(frame_style, contents[0]) table_rows = [[ row[start + 1:end].strip() for start, end in zip(col_positions, col_positions[1:]) ] for row in contents] # # Variable columns - old behavior: # table_rows = [[value.strip() for value in row.split(vertical_char)[1:-1]] # for row in contents] meta = { 'imported_from': 'txt', 'filename': filename, 'encoding': encoding, 'frame_style': frame_style } return create_table(table_rows, meta=meta, *args, **kwargs)
def export_to_csv(
    table,
    filename_or_fobj=None,
    encoding="utf-8",
    dialect=unicodecsv.excel,
    batch_size=100,
    callback=None,
    *args,
    **kwargs
):
    """Write a `rows.Table` out as CSV, optionally reporting progress.

    A provided file-like object MUST be opened in binary mode (e.g.
    `open(filename, mode='wb')`). When no filename/fobj is given, the CSV
    contents are returned instead. `callback`, when given, is called with
    the running number of data rows written after each batch.
    """
    # TODO: will work only if table.fields is OrderedDict
    # TODO: should use fobj? What about creating a method like json.dumps?
    return_contents = filename_or_fobj is None
    if return_contents:
        fobj = BytesIO()
    else:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode="wb")

    # TODO: may use `io.BufferedWriter` instead of `ipartition` so user can
    # choose the real size (in Bytes) when to flush to the file system,
    # instead number of rows
    writer = unicodecsv.writer(fobj, encoding=encoding, dialect=dialect)

    if callback is None:
        for batch in ipartition(serialize(table, *args, **kwargs), batch_size):
            writer.writerows(batch)
    else:
        serialized = serialize(table, *args, **kwargs)
        writer.writerow(next(serialized))  # First, write the header
        written = 0
        for batch in ipartition(serialized, batch_size):
            writer.writerows(batch)
            written += len(batch)
            callback(written)

    if return_contents:
        fobj.seek(0)
        contents = fobj.read()
        fobj.close()
        return contents

    fobj.flush()
    return fobj
def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    'Import data from a JSON file'
    kwargs['encoding'] = encoding
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # NOTE(review): the `encoding` argument to `json.load` is Python-2-only
    # and was removed in Python 3.9 -- confirm the supported Python range.
    json_obj = json.load(fobj, encoding=encoding)
    # BUGFIX: materialize the keys -- on Python 3 `dict.keys()` returns a
    # view, not a list, so the header row would not be a proper sequence
    # (the other import_from_json variants already do this).
    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]
    data = [field_names] + table_rows

    meta = {'imported_from': 'json', 'filename': filename, }
    return create_table(data, meta=meta, *args, **kwargs)
def import_from_json(filename_or_fobj, encoding="utf-8", *args, **kwargs):
    """Import a JSON file or file-like object into a `rows.Table`.

    If a file-like object is provided it MUST be open in text (non-binary)
    mode on Python 3 and could be open in both binary or text mode on
    Python 2.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # NOTE(review): the `encoding` argument to `json.load` is Python-2-only
    # and was removed in Python 3.9 -- confirm the supported Python range.
    json_obj = json.load(fobj, encoding=encoding)
    header = list(json_obj[0].keys())
    data_rows = [[record[field] for field in header] for record in json_obj]

    meta = {"imported_from": "json", "filename": filename, "encoding": encoding}
    return create_table([header] + data_rows, meta=meta, *args, **kwargs)
def export_to_csv(table, filename_or_fobj=None, encoding="utf-8",
                  dialect=unicodecsv.excel, batch_size=100, callback=None,
                  *args, **kwargs):
    """Export a `rows.Table` to a CSV file.

    A provided file-like object MUST be opened in binary mode (e.g.
    `open(filename, mode='wb')`). When no filename/fobj is given, the CSV
    contents are returned instead. `callback`, when given, receives the
    running number of data rows written after each batch.
    """
    # TODO: will work only if table.fields is OrderedDict
    # TODO: should use fobj? What about creating a method like json.dumps?
    return_contents = filename_or_fobj is None
    if return_contents:
        fobj = BytesIO()
    else:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode="wb")

    # TODO: may use `io.BufferedWriter` instead of `ipartition` so user can
    # choose the real size (in Bytes) when to flush to the file system,
    # instead number of rows
    writer = unicodecsv.writer(fobj, encoding=encoding, dialect=dialect)

    if callback is None:
        for batch in ipartition(serialize(table, *args, **kwargs), batch_size):
            writer.writerows(batch)
    else:
        serialized = serialize(table, *args, **kwargs)
        writer.writerow(next(serialized))  # First, write the header
        written = 0
        for batch in ipartition(serialized, batch_size):
            writer.writerows(batch)
            written += len(batch)
            callback(written)

    if return_contents:
        fobj.seek(0)
        contents = fobj.read()
        fobj.close()
        return contents

    fobj.flush()
    return fobj
def import_from_txt(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    """Import an ASCII-framed text table into a `rows.Table`."""
    # TODO: should be able to change DASH, PLUS and PIPE
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding
    lines = fobj.read().decode(encoding).strip().splitlines()

    # Drop the top and bottom '+----+----+' frame lines, then the
    # header/data separator line.
    lines = lines[1:-1]
    del lines[1]

    table_rows = [
        [cell.strip() for cell in line.split(PIPE)[1:-1]]
        for line in lines
    ]
    meta = {'imported_from': 'txt', 'filename': filename, }
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath, encoding='utf-8', *args, **kwargs):
    """Import data from an XML file using XPath expressions."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj)
    kwargs['encoding'] = encoding

    xml = fobj.read().decode(encoding)
    tree = tree_from_string(xml)
    row_elements = tree.xpath(rows_xpath)

    # BUGFIX: materialize the keys -- on Python 3 `dict.keys()` returns a
    # view, not a list, so the header row would not be a proper sequence
    # (the other import_from_xpath variants already do this).
    header = list(fields_xpath.keys())
    result_rows = [_get_row_data(row, fields_xpath) for row in row_elements]

    meta = {'imported_from': 'xpath', 'filename': filename, }
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
def import_from_html(
    filename_or_fobj,
    encoding="utf-8",
    index=0,
    ignore_colspan=True,
    preserve_html=False,
    properties=False,
    table_tag="table",
    row_tag="tr",
    column_tag="td|th",
    *args,
    **kwargs
):
    """Return rows.Table from HTML file."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath("//{}".format(table_tag))
    table = tables[index]

    # Remove <thead>/<tbody> wrappers so `row_tag` matches every row
    # directly under the table element.
    strip_tags(table, "thead")
    strip_tags(table, "tbody")

    row_elements = table.xpath(row_tag)
    table_rows = []
    for row in row_elements:
        table_rows.append(
            _get_row(
                row,
                column_tag=column_tag,
                preserve_html=preserve_html,
                properties=properties,
            )
        )

    if properties:
        table_rows[0][-1] = "properties"

    if preserve_html and kwargs.get("fields", None) is None:
        # The field names will be the first table row, so we need to strip HTML
        # from it even if `preserve_html` is `True` (it's `True` only for rows,
        # not for the header).
        table_rows[0] = list(map(_extract_node_text, row_elements[0]))

    if ignore_colspan:
        # Keep only rows with the maximum column count, dropping rows
        # affected by colspan.
        max_columns = max(map(len, table_rows))
        table_rows = [row for row in table_rows if len(row) == max_columns]

    meta = {"imported_from": "html", "filename": filename, "encoding": encoding}
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0, start_row=0, start_column=0, *args, **kwargs):
    """Import one sheet of an XLS file into a `rows.Table`."""
    filename, _ = get_filename_and_fobj(filename_or_fobj)
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Get field names: walk right from `start_column` until an empty (falsy)
    # cell is found.
    # TODO: may use sheet.col_values or even sheet.ncols
    header = []
    column_count = 0
    while True:
        value = cell_value(sheet, start_row, start_column + column_count)
        if not value:
            break
        header.append(value)
        column_count += 1

    # Get sheet rows: read below the header until a completely empty row.
    # TODO: may use sheet.col_slice or even sheet.nrows
    table_rows = []
    row_count = 0
    start_row += 1
    while True:
        row = [
            cell_value(sheet, start_row + row_count, start_column + column_index)
            for column_index in range(column_count)
        ]
        if not any(row):
            break
        table_rows.append(row)
        row_count += 1

    meta = {
        'imported_from': 'xls',
        'filename': filename,
    }
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
def import_from_txt(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    """Import an ASCII-framed text table into a `rows.Table`."""
    # TODO: should be able to change DASH, PLUS and PIPE
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    lines = fobj.read().decode(encoding).strip().splitlines()

    # Drop the top and bottom '+----+----+' frame lines, then the
    # header/data separator line.
    lines = lines[1:-1]
    del lines[1]

    table_rows = [
        [cell.strip() for cell in line.split(PIPE)[1:-1]]
        for line in lines
    ]
    meta = {'imported_from': 'txt', 'filename': filename, 'encoding': encoding, }
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    'Import data from a Parquet file'
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    # Map each schema element to the corresponding rows field type.
    # TODO: should look into `schema.converted_type` also
    types = OrderedDict(
        (schema.name, PARQUET_TO_ROWS[schema.type])
        for schema in parquet._read_footer(fobj).schema
        if schema.type is not None
    )
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {'imported_from': 'parquet', 'filename': filename, }
    return create_table([header] + table_rows, meta=meta,
                        force_types=types, *args, **kwargs)
def import_from_html(filename_or_fobj, encoding='utf-8', index=0,
                     ignore_colspan=True, preserve_html=False,
                     properties=False, table_tag='table', row_tag='tr',
                     column_tag='td|th', *args, **kwargs):
    """Return rows.Table from HTML file."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath('//{}'.format(table_tag))
    table = tables[index]

    # Remove <thead>/<tbody> wrappers so `row_tag` matches every row
    # directly under the table element.
    strip_tags(table, 'thead')
    strip_tags(table, 'tbody')

    row_elements = table.xpath(row_tag)
    table_rows = []
    for row in row_elements:
        table_rows.append(
            _get_row(row,
                     column_tag=column_tag,
                     preserve_html=preserve_html,
                     properties=properties)
        )

    if properties:
        table_rows[0][-1] = 'properties'

    if preserve_html and kwargs.get('fields', None) is None:
        # The field names will be the first table row, so we need to strip HTML
        # from it even if `preserve_html` is `True` (it's `True` only for rows,
        # not for the header).
        table_rows[0] = list(map(_extract_node_text, row_elements[0]))

    # Computed unconditionally (as in the original flow) even though it is
    # only used when `ignore_colspan` is set.
    max_columns = max(map(len, table_rows))
    if ignore_colspan:
        table_rows = [row for row in table_rows if len(row) == max_columns]

    meta = {
        'imported_from': 'html',
        'filename': filename,
        'encoding': encoding,
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None, *args, **kwargs):
    'Import data from a CSV file'
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    if dialect is None:
        # Sniff the dialect from the first line, then rewind the file.
        sample = fobj.readline().decode(encoding)
        dialect = unicodecsv.Sniffer().sniff(sample)
        fobj.seek(0)

    kwargs['encoding'] = encoding
    reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)
    meta = {'imported_from': 'csv', 'filename': filename, }
    return create_table(reader, meta=meta, *args, **kwargs)
def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    'Import data from a JSON file'
    kwargs['encoding'] = encoding
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # NOTE(review): the `encoding` argument to `json.load` is Python-2-only
    # and was removed in Python 3.9 -- confirm the supported Python range.
    json_obj = json.load(fobj, encoding=encoding)
    # BUGFIX: materialize the keys -- on Python 3 `dict.keys()` returns a
    # view, not a list, so the header row would not be a proper sequence
    # (the other import_from_json variants already do this).
    field_names = list(json_obj[0].keys())
    table_rows = [[item[key] for key in field_names] for item in json_obj]
    data = [field_names] + table_rows

    meta = {
        'imported_from': 'json',
        'filename': filename,
    }
    return create_table(data, meta=meta, *args, **kwargs)
def import_from_xls(
    filename_or_fobj,
    sheet_name=None,
    sheet_index=0,
    start_row=None,
    start_column=None,
    end_row=None,
    end_column=None,
    *args,
    **kwargs
):
    """Return a rows.Table created from imported XLS file."""
    filename, _ = get_filename_and_fobj(filename_or_fobj, mode="rb")
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # The xlrd library reads rows and columns starting from 0 and ending on
    # sheet.nrows/ncols - 1. rows also uses 0-based indexes, so no
    # transformation is needed.
    min_row, min_column = get_table_start(sheet)
    max_row, max_column = sheet.nrows - 1, sheet.ncols - 1

    # TODO: consider adding a parameter `ignore_padding=True` and when it's
    # True, consider `start_row` starting from `min_row` and `start_column`
    # starting from `min_col`.
    if start_row is None:
        start_row = min_row
    if end_row is None:
        end_row = max_row
    if start_column is None:
        start_column = min_column
    if end_column is None:
        end_column = max_column

    table_rows = []
    for row_index in range(start_row, end_row + 1):
        table_rows.append([
            cell_value(sheet, row_index, column_index)
            for column_index in range(start_column, end_column + 1)
        ])

    meta = {"imported_from": "xls", "filename": filename, "sheet_name": sheet.name}
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_json(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    """Import a JSON file or file-like object into a `rows.Table`.

    If a file-like object is provided it MUST be open in text (non-binary)
    mode on Python 3 and could be open in both binary or text mode on
    Python 2.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # NOTE(review): the `encoding` argument to `json.load` is Python-2-only
    # and was removed in Python 3.9 -- confirm the supported Python range.
    json_obj = json.load(fobj, encoding=encoding)
    header = list(json_obj[0].keys())
    data_rows = [[record[field] for field in header] for record in json_obj]

    meta = {'imported_from': 'json', 'filename': filename, 'encoding': encoding, }
    return create_table([header] + data_rows, meta=meta, *args, **kwargs)
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    """Import data from a Parquet file and return with rows.Table."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")

    # Map each schema element to the corresponding rows field type.
    # TODO: should look into `schema.converted_type` also
    types = OrderedDict(
        (schema.name, PARQUET_TO_ROWS[schema.type])
        for schema in parquet._read_footer(fobj).schema
        if schema.type is not None
    )
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {"imported_from": "parquet", "filename": filename}
    return create_table([header] + table_rows, meta=meta,
                        force_types=types, *args, **kwargs)
def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0,
                     start_row=None, start_column=None, end_row=None,
                     end_column=None, *args, **kwargs):
    """Return a rows.Table created from imported XLSX file."""
    workbook = load_workbook(filename_or_fobj, read_only=True)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    sheet = workbook[sheet_name]

    # The openpyxl library reads rows and columns starting from 1 and ending
    # on sheet.max_row/max_col. rows uses 0-based indexes (from 0 to N - 1),
    # so we need to adjust the ranges accordingly.
    min_row, min_column = sheet.min_row - 1, sheet.min_column - 1
    max_row, max_column = sheet.max_row - 1, sheet.max_column - 1

    # TODO: consider adding a parameter `ignore_padding=True` and when it's
    # True, consider `start_row` starting from `sheet.min_row` and
    # `start_column` starting from `sheet.min_col`.
    if start_row is None:
        start_row = min_row
    if end_row is None:
        end_row = max_row
    if start_column is None:
        start_column = min_column
    if end_column is None:
        end_column = max_column

    # Read each row (switching back to openpyxl's 1-based indexes) and skip
    # rows whose cells are all None.
    table_rows = []
    for row_index in range(start_row + 1, end_row + 2):
        row = [
            _cell_to_python(sheet_cell(sheet, row_index, col_index))
            for col_index in range(start_column + 1, end_column + 2)
        ]
        if any(cell is not None for cell in row):
            table_rows.append(row)

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {
        "imported_from": "xlsx",
        "filename": filename,
        "sheet_name": sheet_name
    }
    return create_table(table_rows, meta=metadata, *args, **kwargs)
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
                    start_row=None, start_column=None, end_row=None,
                    end_column=None, *args, **kwargs):
    """Return a rows.Table created from imported XLS file."""
    filename, _ = get_filename_and_fobj(filename_or_fobj, mode="rb")
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # The xlrd library reads rows and columns starting from 0 and ending on
    # sheet.nrows/ncols - 1. rows also uses 0-based indexes, so no
    # transformation is needed.
    min_row, min_column = get_table_start(sheet)
    max_row, max_column = sheet.nrows - 1, sheet.ncols - 1

    # TODO: consider adding a parameter `ignore_padding=True` and when it's
    # True, consider `start_row` starting from `min_row` and `start_column`
    # starting from `min_col`.
    if start_row is None:
        start_row = min_row
    if end_row is None:
        end_row = max_row
    if start_column is None:
        start_column = min_column
    if end_column is None:
        end_column = max_column

    table_rows = []
    for row_index in range(start_row, end_row + 1):
        table_rows.append([
            cell_value(sheet, row_index, column_index)
            for column_index in range(start_column, end_column + 1)
        ])

    meta = {
        "imported_from": "xls",
        "filename": filename,
        "sheet_name": sheet.name
    }
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_json(filename_or_fobj, encoding="utf-8", *args, **kwargs):
    """Import a JSON file or file-like object into a `rows.Table`.

    If a file-like object is provided it MUST be open in text (non-binary)
    mode on Python 3 and could be open in both binary or text mode on
    Python 2.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # NOTE(review): the `encoding` argument to `json.load` is Python-2-only
    # and was removed in Python 3.9 -- confirm the supported Python range.
    json_obj = json.load(fobj, encoding=encoding)
    header = list(json_obj[0].keys())
    data_rows = [[record[field] for field in header] for record in json_obj]

    meta = {
        "imported_from": "json",
        "filename": filename,
        "encoding": encoding
    }
    return create_table([header] + data_rows, meta=meta, *args, **kwargs)
def import_from_yaml(filename_or_fobj, encoding='utf-8', *args, **kwargs):
    """Import a YAML file or file-like object into a `rows.Table`.

    If a file-like object is provided it MUST be open in text (non-binary)
    mode on Python 3 and could be open in both binary or text mode on
    Python 2.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj)

    # NOTE(security): `yaml.load` without an explicit Loader can construct
    # arbitrary Python objects; consider `yaml.safe_load` if the input may
    # be untrusted.
    yaml_obj = yaml.load(fobj)
    header = list(yaml_obj[0].keys())
    data_rows = [[record[field] for field in header] for record in yaml_obj]

    meta = {
        'imported_from': 'yaml',
        'filename': filename,
        'encoding': encoding
    }
    return create_table([header] + data_rows, meta=meta, *args, **kwargs)
def import_from_xpath(
    filename_or_fobj, rows_xpath, fields_xpath, encoding="utf-8", *args, **kwargs
):
    """Import data from an XML document using XPath expressions.

    `rows_xpath` selects the row elements and each value of `fields_xpath`
    selects one field inside a row; every expression must be a
    `six.text_type` (unicode) string.
    """
    xpath_types = {type(rows_xpath)}
    xpath_types.update(type(expression) for expression in fields_xpath.values())
    if xpath_types != {six.text_type}:
        raise TypeError("XPath must be {}".format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")
    xml = fobj.read().decode(encoding)
    tree = tree_from_string(xml)
    row_elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    row_data = _get_row_data(fields_xpath)
    result_rows = [row_data(element) for element in row_elements]

    meta = {"imported_from": "xpath", "filename": filename, "encoding": encoding}
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
def import_from_xlsx(
    filename_or_fobj,
    sheet_name=None,
    sheet_index=0,
    start_row=None,
    start_column=None,
    end_row=None,
    end_column=None,
    *args,
    **kwargs
):
    """Return a rows.Table created from imported XLSX file."""
    workbook = load_workbook(filename_or_fobj, read_only=True)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    sheet = workbook[sheet_name]

    # openpyxl is 1-based (1..max_row/max_column) while rows is 0-based
    # (0..N-1), so shift the sheet bounds down by one.
    min_row, min_column = sheet.min_row - 1, sheet.min_column - 1
    max_row, max_column = sheet.max_row - 1, sheet.max_column - 1
    # TODO: consider adding a parameter `ignore_padding=True` and when it's
    # True, consider `start_row` starting from `sheet.min_row` and
    # `start_column` starting from `sheet.min_col`.
    if start_row is None:
        start_row = min_row
    if end_row is None:
        end_row = max_row
    if start_column is None:
        start_column = min_column
    if end_column is None:
        end_column = max_column

    def row_is_empty(row):
        return all(cell is None for cell in row)

    # Shift back to openpyxl's 1-based coordinates when reading cells and
    # skip rows that are completely empty.
    table_rows = []
    for row_index in range(start_row + 1, end_row + 2):
        row = [
            _cell_to_python(sheet_cell(sheet, row_index, col_index))
            for col_index in range(start_column + 1, end_column + 2)
        ]
        if not row_is_empty(row):
            table_rows.append(row)

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {"imported_from": "xlsx", "filename": filename, "sheet_name": sheet_name}
    return create_table(table_rows, meta=metadata, *args, **kwargs)
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    """Import data from a Parquet file and return with rows.Table."""
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")

    # TODO: should look into `schema.converted_type` also
    # Map each named column in the footer schema to a rows field type.
    footer_schema = parquet._read_footer(fobj).schema
    types = OrderedDict(
        (schema.name, PARQUET_TO_ROWS[schema.type])
        for schema in footer_schema
        if schema.type is not None
    )
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {"imported_from": "parquet", "filename": filename}
    return create_table(
        [header] + table_rows, meta=meta, force_types=types, *args, **kwargs
    )
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
                    start_row=0, start_column=0, *args, **kwargs):
    '''Import data from an XLS file into a `rows.Table`.'''
    filename, _ = get_filename_and_fobj(filename_or_fobj, mode='rb')
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Read every cell from `start_row`/`start_column` up to the sheet bounds
    # (xlrd exposes them as `nrows`/`ncols`).
    table_rows = []
    for row_index in range(start_row, sheet.nrows):
        table_rows.append([cell_value(sheet, row_index, column_index)
                           for column_index in range(start_column,
                                                     sheet.ncols)])

    meta = {'imported_from': 'xls',
            'filename': filename,
            'sheet_name': sheet.name, }
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_csv(filename_or_fobj, encoding="utf-8", dialect=None,
                    sample_size=262144, *args, **kwargs):
    """Import data from a CSV file (automatically detects dialect).

    If a file-like object is provided it MUST be in binary mode, like in
    `open(filename, mode='rb')`.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")

    if dialect is None:
        # Sniff delimiter/quoting from a leading sample of the stream.
        sample = read_sample(fobj, sample_size)
        dialect = discover_dialect(sample=sample, encoding=encoding)

    reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)
    meta = {"imported_from": "csv", "filename": filename, "encoding": encoding}
    return create_table(reader, meta=meta, *args, **kwargs)
def import_from_csv(filename_or_fobj, encoding='utf-8', dialect=None,
                    sample_size=8192, *args, **kwargs):
    '''Import data from a CSV file

    If a file-like object is provided it MUST be in binary mode, like in
    `open(filename, mode='rb')`.
    '''
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    if dialect is None:
        # Sniff the dialect from a sample, then rewind so the reader sees
        # the stream from where it started.
        position = fobj.tell()
        dialect = discover_dialect(fobj.read(sample_size), encoding)
        fobj.seek(position)

    reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)

    meta = {'imported_from': 'csv',
            'filename': filename,
            'encoding': encoding, }
    return create_table(reader, meta=meta, *args, **kwargs)
def export_to_csv(table, filename_or_fobj=None, encoding='utf-8',
                  *args, **kwargs):
    '''Export a `rows.Table` to a CSV file.

    If `filename_or_fobj` is provided the data is written to it (a file-like
    object MUST be in binary mode, like `open(filename, mode='wb')`) and the
    file object is returned; otherwise a bytes string with the CSV contents
    is returned.
    '''
    # TODO: will work only if table.fields is OrderedDict
    # TODO: should use fobj? What about creating a method like json.dumps?
    kwargs['encoding'] = encoding
    if filename_or_fobj is not None:
        # unicodecsv writes encoded bytes, so the destination must be opened
        # in binary mode — text mode ('w') breaks on Python 3.
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
    else:
        fobj = BytesIO()

    csv_writer = unicodecsv.writer(fobj, encoding=encoding)
    # A plain loop is required here: `map` is lazy on Python 3, so wrapping
    # `writerow` in `map` would never actually write any rows.
    for row in serialize(table, *args, **kwargs):
        csv_writer.writerow(row)

    if filename_or_fobj is not None:
        fobj.flush()
        return fobj
    else:
        fobj.seek(0)
        result = fobj.read()
        fobj.close()
        return result
def import_from_parquet(filename_or_fobj, *args, **kwargs):
    'Import data from a Parquet file'
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')

    # TODO: should look into `schema.converted_type` also
    # Build an ordered name -> rows-type mapping from the footer schema,
    # skipping schema entries without a type.
    types = OrderedDict()
    for schema in parquet._read_footer(fobj).schema:
        if schema.type is not None:
            types[schema.name] = PARQUET_TO_ROWS[schema.type]
    header = list(types.keys())
    table_rows = list(parquet.reader(fobj))  # TODO: be lazy

    meta = {'imported_from': 'parquet',
            'filename': filename, }
    return create_table([header] + table_rows, meta=meta, force_types=types,
                        *args, **kwargs)
def import_from_xpath(filename_or_fobj, rows_xpath, fields_xpath,
                      encoding='utf-8', *args, **kwargs):
    '''Import data from XML using XPath expressions.

    All XPath expressions (row selector and field selectors) must be
    unicode strings.
    '''
    found_types = {type(rows_xpath)}
    found_types.update(type(value) for value in fields_xpath.values())
    if found_types != {six.text_type}:
        raise TypeError('XPath must be {}'.format(six.text_type.__name__))

    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    xml = fobj.read().decode(encoding)
    tree = tree_from_string(xml)
    row_elements = tree.xpath(rows_xpath)

    header = list(fields_xpath.keys())
    row_data = _get_row_data(fields_xpath)
    result_rows = [row_data(element) for element in row_elements]

    meta = {'imported_from': 'xpath',
            'filename': filename,
            'encoding': encoding, }
    return create_table([header] + result_rows, meta=meta, *args, **kwargs)
def import_from_xls(filename_or_fobj, sheet_name=None, sheet_index=0,
                    start_row=0, start_column=0, *args, **kwargs):
    """Import an XLS sheet into a `rows.Table`.

    Reads header cells rightwards from (`start_row`, `start_column`) until
    the first falsy cell, then reads data rows downwards (same width as the
    header) until the first all-falsy row.
    """
    filename, _ = get_filename_and_fobj(filename_or_fobj)
    book = xlrd.open_workbook(filename, formatting_info=True)
    if sheet_name is not None:
        sheet = book.sheet_by_name(sheet_name)
    else:
        sheet = book.sheet_by_index(sheet_index)
    # TODO: may re-use Excel data types

    # Get field names: scan right until a falsy cell ends the header.
    # NOTE(review): assumes the `cell_value` helper returns a falsy value for
    # empty/out-of-range cells — otherwise these sentinel loops would not
    # terminate cleanly; confirm helper behavior.
    # TODO: may use sheet.col_values or even sheet.ncols
    column_count = 0
    header = []
    column_value = cell_value(sheet, start_row, start_column + column_count)
    while column_value:
        header.append(column_value)
        column_count += 1
        column_value = cell_value(sheet, start_row,
                                  start_column + column_count)

    # Get sheet rows: scan down until a row with no truthy cell.
    # TODO: may use sheet.col_slice or even sheet.nrows
    table_rows = []
    row_count = 0
    start_row += 1  # data starts on the row right below the header
    cell_is_empty = False
    while not cell_is_empty:
        row = [cell_value(sheet, start_row + row_count,
                          start_column + column_index)
               for column_index in range(column_count)]
        cell_is_empty = not any(row)
        if not cell_is_empty:
            table_rows.append(row)
            row_count += 1

    meta = {'imported_from': 'xls',
            'filename': filename, }
    return create_table([header] + table_rows, meta=meta, *args, **kwargs)
def export_to_xlsx(table, filename_or_fobj=None, sheet_name='Sheet1',
                   *args, **kwargs):
    """Export the rows.Table to XLSX file and return the saved file."""
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    prepared_table = prepare_to_export(table, *args, **kwargs)

    # The first serialized row is the header.
    field_names = next(prepared_table)
    for col_index, field_name in enumerate(field_names):
        sheet.cell(row=1, column=col_index + 1).value = field_name

    # Data rows: each converted value comes with an optional Excel number
    # format to apply to its cell.
    _convert_row = _python_to_cell(list(map(table.fields.get, field_names)))
    for row_index, row in enumerate(prepared_table, start=1):
        for col_index, (value, number_format) in enumerate(_convert_row(row)):
            cell = sheet.cell(row=row_index + 1, column=col_index + 1)
            cell.value = value
            if number_format is not None:
                cell.number_format = number_format

    if filename_or_fobj is not None:
        _, fobj = get_filename_and_fobj(filename_or_fobj, mode='wb')
        workbook.save(fobj)
        fobj.flush()
        return fobj
    else:
        output = BytesIO()
        workbook.save(output)
        output.seek(0)
        contents = output.read()
        output.close()
        return contents
def import_from_html(filename_or_fobj, encoding='utf-8', index=0,
                     ignore_colspan=True, preserve_html=False,
                     properties=False, table_tag='table', row_tag='tr',
                     column_tag='td|th', *args, **kwargs):
    '''Import the `index`-th table of an HTML document into a `rows.Table`.'''
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode='rb')
    html = fobj.read().decode(encoding)
    html_tree = document_fromstring(html)
    tables = html_tree.xpath('//{}'.format(table_tag))
    table = tables[index]

    # Flatten <thead>/<tbody> wrappers so the relative `row_tag` XPath below
    # sees all rows as direct children of the table element.
    strip_tags(table, 'thead')
    strip_tags(table, 'tbody')
    row_elements = table.xpath(row_tag)

    table_rows = [_get_row(row,
                           column_tag=column_tag,
                           preserve_html=preserve_html,
                           properties=properties)
                  for row in row_elements]

    if properties:
        table_rows[0][-1] = 'properties'

    if preserve_html and kwargs.get('fields', None) is None:
        # The field names will be the first table row, so we need to strip
        # HTML from it even if `preserve_html` is `True` (it's `True` only
        # for rows, not for the header).
        table_rows[0] = list(map(_extract_node_text, row_elements[0]))

    max_columns = max(map(len, table_rows))
    if ignore_colspan:
        # Drop rows whose cell count differs from the widest row
        # (e.g. colspan'ed title rows).
        table_rows = [row for row in table_rows if len(row) == max_columns]

    meta = {'imported_from': 'html',
            'filename': filename,
            'encoding': encoding, }
    return create_table(table_rows, meta=meta, *args, **kwargs)
def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0,
                     start_row=0, start_column=0, end_row=None,
                     end_column=None, *args, **kwargs):
    """Return a rows.Table created from imported XLSX file.

    Row/column bounds are 0-based; `end_row`/`end_column` default to the
    sheet's last used row/column when `None`.
    """
    workbook = load_workbook(filename_or_fobj, data_only=True)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    sheet = workbook[sheet_name]

    # openpyxl library reads rows and columns starting from 1 and ending on
    # sheet.max_row/max_col. rows uses another pattern: 0 to N - 1, so we need
    # to adjust the ranges accordingly
    min_row, min_col = sheet.min_row - 1, sheet.min_column - 1
    max_row, max_col = sheet.max_row - 1, sheet.max_column - 1
    start_row = max(start_row, min_row)
    # Compare against None explicitly: `end_row or max_row` would silently
    # discard a requested end of 0 (row/column indexes are 0-based here).
    end_row = min(end_row if end_row is not None else max_row, max_row)
    start_col = max(start_column, min_col)
    end_col = min(end_column if end_column is not None else max_col, max_col)

    # Shift back to openpyxl's 1-based coordinates when reading cells.
    table_rows = [[_cell_to_python(sheet.cell(row=row_index, column=col_index))
                   for col_index in range(start_col + 1, end_col + 2)]
                  for row_index in range(start_row + 1, end_row + 2)]

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {'imported_from': 'xlsx',
                'filename': filename,
                'sheet_name': sheet_name, }
    return create_table(table_rows, meta=metadata, *args, **kwargs)
def import_from_xlsx(filename_or_fobj, sheet_name=None, sheet_index=0,
                     start_row=0, start_column=0, *args, **kwargs):
    '''Import an XLSX spreadsheet into a `rows.Table`.'''
    workbook = load_workbook(filename_or_fobj)
    if sheet_name is None:
        sheet_name = workbook.sheetnames[sheet_index]
    sheet = workbook.get_sheet_by_name(sheet_name)

    # Read header cells rightwards until the first falsy cell.
    header = []
    last_column = start_column
    header_value = _get_cell_value(sheet, start_row, last_column)
    while header_value:
        header.append(header_value)
        last_column += 1
        header_value = _get_cell_value(sheet, start_row, last_column)
    last_column -= 1

    # Read data rows downwards until the first fully-empty row, using the
    # header width (`last_column`) discovered above.
    all_rows = []
    row_pos = start_row + 1
    row = _read_row(sheet, row_pos, last_column)
    while any(row):
        all_rows.append(row)
        row_pos += 1
        row = _read_row(sheet, row_pos, last_column)

    filename, _ = get_filename_and_fobj(filename_or_fobj, dont_open=True)
    metadata = {'imported_from': 'xlsx',
                'filename': filename, }
    return create_table([header] + all_rows, meta=metadata, *args, **kwargs)
def import_from_csv(
    filename_or_fobj,
    encoding="utf-8",
    dialect=None,
    sample_size=262144,
    *args,
    **kwargs
):
    """Import data from a CSV file (automatically detects dialect).

    If a file-like object is provided it MUST be in binary mode, like in
    `open(filename, mode='rb')`.
    """
    filename, fobj = get_filename_and_fobj(filename_or_fobj, mode="rb")

    if dialect is None:
        # Detect delimiter/quoting from a leading sample of the stream.
        sample = read_sample(fobj, sample_size)
        dialect = discover_dialect(sample=sample, encoding=encoding)

    reader = unicodecsv.reader(fobj, encoding=encoding, dialect=dialect)
    meta = {"imported_from": "csv", "filename": filename, "encoding": encoding}
    return create_table(reader, meta=meta, *args, **kwargs)
def import_from_ods(filename_or_fobj, index=0, *args, **kwargs):
    """Import the `index`-th sheet of an ODS file into a `rows.Table`.

    Reads `content.xml` from the ODS zip archive and extracts cell values
    per ODF `office:value-type`; rows are padded with `None` to the length
    of the longest row.
    """
    # TODO: import spreadsheet by name
    # TODO: unescape values
    filename, _ = get_filename_and_fobj(filename_or_fobj)

    # An ODS document is a zip archive; the sheet data lives in content.xml.
    ods_file = zipfile.ZipFile(filename)
    content_fobj = ods_file.open("content.xml")
    xml = content_fobj.read()  # will return bytes
    content_fobj.close()

    document = xml_from_string(xml)
    namespaces = document.nsmap
    spreadsheet = document.xpath("//office:spreadsheet",
                                 namespaces=namespaces)[0]
    tables = xpath(spreadsheet, "//table:table", namespaces)
    table = tables[index]

    # NOTE(review): `xpath`/`attrib` are project helpers; `attrib` appears to
    # raise KeyError for a missing attribute (see the try/except uses below)
    # — confirm against their definitions.
    table_rows_obj = xpath(table, "//table:table-row", namespaces)
    table_rows = []
    for row_obj in table_rows_obj:
        row = []
        for cell in xpath(row_obj, "//table:table-cell", namespaces):
            children = cell.getchildren()
            if not children:
                # Cells with no child elements carry no value; skip them.
                continue

            # TODO: evalute 'boolean' and 'time' types
            value_type = attrib(cell, namespaces["office"], "value-type")
            if value_type == "date":
                cell_value = attrib(cell, namespaces["office"], "date-value")
            elif value_type == "float":
                cell_value = attrib(cell, namespaces["office"], "value")
            elif value_type == "percentage":
                # Stored as a fraction; render as a percentage string.
                cell_value = attrib(cell, namespaces["office"], "value")
                cell_value = Decimal(cell_value)
                cell_value = "{:%}".format(cell_value)
            elif value_type == "string":
                try:
                    # get computed string (from formula, for example)
                    cell_value = attrib(cell, namespaces["office"],
                                        "string-value")
                except KeyError:
                    # computed string not present => get from <p>...</p>
                    cell_value = children[0].text
            else:  # value_type == some type we don't know
                cell_value = children[0].text

            # A repeated cell stands for `repeat` identical columns.
            try:
                repeat = attrib(cell, namespaces["table"],
                                "number-columns-repeated")
            except KeyError:
                row.append(cell_value)
            else:
                for _ in range(int(repeat)):
                    row.append(cell_value)

        if row:
            table_rows.append(row)

    # Pad shorter rows with None so every row has the same width.
    max_length = max(len(row) for row in table_rows)
    full_rows = complete_with_None(table_rows, max_length)
    meta = {"imported_from": "ods", "filename": filename}
    return create_table(full_rows, meta=meta, *args, **kwargs)