def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override=None, **kwargs): """ Parse an XLS file. :param path: Path to an XLS file to load or a file-like object for one. :param sheet: The names or integer indices of the worksheets to load. If not specified then the first sheet will be used. :param skip_lines: The number of rows to skip from the top of the sheet. :param header: If :code:`True`, the first row is assumed to contain column names. """ if not isinstance(skip_lines, int): raise ValueError('skip_lines argument must be an int') if hasattr(path, 'read'): book = xlrd.open_workbook(file_contents=path.read(), encoding_override=encoding_override) else: with open(path, 'rb') as f: book = xlrd.open_workbook(file_contents=f.read(), encoding_override=encoding_override) multiple = agate.utils.issequence(sheet) if multiple: sheets = sheet else: sheets = [sheet] tables = OrderedDict() for i, sheet in enumerate(sheets): if isinstance(sheet, six.string_types): sheet = book.sheet_by_name(sheet) elif isinstance(sheet, int): sheet = book.sheet_by_index(sheet) else: sheet = book.sheet_by_index(0) if header: offset = 1 column_names = [] else: offset = 0 column_names = None columns = [] for i in range(sheet.ncols): data = sheet.col_values(i) values = data[skip_lines + offset:] types = sheet.col_types(i)[skip_lines + offset:] excel_type = determine_excel_type(types) if excel_type == xlrd.biffh.XL_CELL_BOOLEAN: values = normalize_booleans(values) elif excel_type == xlrd.biffh.XL_CELL_DATE: values = normalize_dates(values, book.datemode) if header: name = six.text_type(data[skip_lines]) or None column_names.append(name) columns.append(values) rows = [] if columns: for i in range(len(columns[0])): rows.append([c[i] for c in columns]) tables[sheet.name] = agate.Table(rows, column_names, **kwargs) if multiple: return agate.MappedSequence(tables.values(), tables.keys()) else: return tables.popitem()[1]
def from_html(cls, path, table_identifier=0, header=True, encoding='utf-8', mso_number_formats_override=None, row_limit=None, **kwargs): """ Parse an HTML file. :param path: Path to an HTML file to load or a file-like object for one. :param table_identifier: The names or integer indices of the tables to load. If not specified then the first table will be used. :param header: If :code:`True`, the first row is assumed to contain column names. """ if 'column_names' in kwargs: if not header: column_names = kwargs['column_names'] del kwargs['column_names'] column_types = None if 'column_types' in kwargs: column_types = kwargs['column_types'] del kwargs['column_types'] if 'parser' in kwargs: # TODO ignored for now del kwargs['parser'] parser = 'lxml' if hasattr(path, 'read'): html_soup = BeautifulSoup(path, parser, parse_only=SoupStrainer('table'), from_encoding=encoding) else: with open(path, 'rt') as f: html_soup = BeautifulSoup(f.read(), parser, parse_only=SoupStrainer('table'), from_encoding=encoding) multiple = agate.utils.issequence(table_identifier) if multiple: table_identifiers = table_identifier else: table_identifiers = [table_identifier] tables = OrderedDict() for i, table_identifier in enumerate(table_identifiers): if isinstance(table_identifier, six.string_types): # sheet = book.sheet_by_name(sheet) raise Exception("Not implemented yet.") # FIXME elif isinstance(table_identifier, int): table_html = html_soup.find_all('table')[table_identifier] else: raise Exception( f"Could not interpret table identifier {table_identifier}") head_rows = parse_thead_tr(table_html) body_rows = parse_tbody_tr(table_html) if row_limit is not None: body_rows = body_rows[0:row_limit] foot_rows = parse_tfoot_tr(table_html) if not head_rows: # The table has no <thead>. Move the top all-<th> rows from # body_rows to header_rows. (This is a common case because many # tables in the wild have no <thead> or <tfoot>.) while body_rows and row_is_all_th(body_rows[0]): head_rows.append(body_rows.pop(0)) head = expand_colspan_rowspan(head_rows) body = expand_colspan_rowspan(body_rows) foot = expand_colspan_rowspan(foot_rows) if header: column_names = head[0] tables[table_identifier] = agate.Table(rows=body, column_names=column_names, column_types=column_types, **kwargs) if multiple: return agate.MappedSequence(tables.values(), tables.keys()) else: return tables.popitem()[1]
def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override=None, row_limit=None, column_names=None, column_types=None, **kwargs): """ Parse an XLS file. :param path: Path to an XLS file to load or a file-like object for one. :param sheet: The names or integer indices of the worksheets to load. If not specified then the first sheet will be used. :param skip_lines: The number of rows to skip from the top of the sheet. :param header: If :code:`True`, the first row is assumed to contain column names. :param row_limit: Limit how many rows of data will be read :param column_names: See :meth:`.Table.__init__`. :param column_types: See :meth:`.Table.__init__`. """ if not isinstance(skip_lines, int): raise ValueError('skip_lines argument must be an int') def open_workbook(f): try: book = xlrd.open_workbook(file_contents=f.read(), encoding_override=encoding_override, on_demand=True) except xlrd.compdoc.CompDocError: # This is not a pure XLS file; we'll try to read it though. # Let's try the Compound File Binary Format: ole = olefile.OleFileIO(f) if ole.exists('Workbook'): d = ole.openstream('Workbook') book = xlrd.open_workbook(file_contents=d.read(), on_demand=True) else: raise IOError('No Workbook stream found in OLE file') return book if hasattr(path, 'read'): book = open_workbook(path) else: with open(path, 'rb') as f: book = open_workbook(f) try: multiple = agate.utils.issequence(sheet) if multiple: sheets = sheet else: sheets = [sheet] tables = OrderedDict() for i, sheet in enumerate(sheets): if isinstance(sheet, six.string_types): sheet = book.sheet_by_name(sheet) elif isinstance(sheet, int): sheet = book.sheet_by_index(sheet) else: sheet = book.sheet_by_index(0) if header: offset = 1 column_names_detected = [] else: offset = 0 column_names_detected = None columns = [] column_types_detected = [] for i in range(sheet.ncols): if row_limit is None: values = sheet.col_values(i, skip_lines + offset) types = sheet.col_types(i, skip_lines + offset) else: values = sheet.col_values(i, skip_lines + offset, skip_lines + offset + row_limit) types = sheet.col_types(i, skip_lines + offset, skip_lines + offset + row_limit) excel_type = determine_excel_type(types) agate_type = determine_agate_type(excel_type) if excel_type == xlrd.biffh.XL_CELL_BOOLEAN: values = normalize_booleans(values) elif excel_type == xlrd.biffh.XL_CELL_DATE: values, with_date, with_time = normalize_dates( values, book.datemode) if not with_date: agate_type = agate.TimeDelta() if not with_time: agate_type = agate.Date() if header: name = six.text_type(sheet.cell_value(skip_lines, i)) or None column_names_detected.append(name) columns.append(values) column_types_detected.append(agate_type) rows = [] if columns: for i in range(len(columns[0])): rows.append([c[i] for c in columns]) if column_names is None: sheet_column_names = column_names_detected else: sheet_column_names = column_names sheet_column_types = column_types if isinstance(column_types, dict) and sheet_column_names is not None: sheet_column_types = dict( zip(sheet_column_names, column_types_detected)) sheet_column_types.update(column_types) tables[sheet.name] = agate.Table(rows, sheet_column_names, sheet_column_types, **kwargs) finally: book.release_resources() if multiple: return agate.MappedSequence(tables.values(), tables.keys()) else: return tables.popitem()[1]
def from_xlsx(cls, path, sheet=None, skip_lines=0, header=True, read_only=True, **kwargs): """ Parse an XLSX file. :param path: Path to an XLSX file to load or a file-like object for one. :param sheet: The names or integer indices of the worksheets to load. If not specified then the "active" sheet will be used. :param skip_lines: The number of rows to skip from the top of the sheet. :param header: If :code:`True`, the first row is assumed to contain column names. """ if not isinstance(skip_lines, int): raise ValueError('skip_lines argument must be an int') if hasattr(path, 'read'): f = path else: f = open(path, 'rb') book = openpyxl.load_workbook(f, read_only=read_only, data_only=True) multiple = agate.utils.issequence(sheet) if multiple: sheets = sheet else: sheets = [sheet] tables = OrderedDict() for i, sheet in enumerate(sheets): if isinstance(sheet, six.string_types): sheet = book[sheet] elif isinstance(sheet, int): sheet = book.worksheets[sheet] else: sheet = book.active column_names = None rows = [] for i, row in enumerate(sheet.iter_rows(row_offset=skip_lines)): if i == 0 and header: column_names = [ None if c.value is None else six.text_type(c.value) for c in row ] continue values = [] for c in row: value = c.value if value.__class__ is datetime.datetime: # Handle default XLSX date as 00:00 time if value.date() == datetime.date( 1904, 1, 1) and not has_date_elements(c): value = value.time() value = normalize_datetime(value) elif value.time() == NULL_TIME: value = value.date() else: value = normalize_datetime(value) values.append(value) rows.append(values) tables[sheet.title] = agate.Table(rows, column_names, **kwargs) f.close() if multiple: return agate.MappedSequence(tables.values(), tables.keys()) else: return tables.popitem()[1]
def from_xlsx(cls, path, sheet=None, skip_lines=0, header=True, read_only=True, reset_dimensions=False, row_limit=None, column_names=None, column_types=None, **kwargs): """ Parse an XLSX file. :param path: Path to an XLSX file to load or a file-like object for one. :param sheet: The names or integer indices of the worksheets to load. If not specified then the "active" sheet will be used. :param skip_lines: The number of rows to skip from the top of the sheet. :param header: If :code:`True`, the first row is assumed to contain column names. :param reset_dimensions: If :code:`True`, do not trust the dimensions in the file's properties, and recalculate them based on the data in the file. :param row_limit: Limit how many rows of data will be read """ if not isinstance(skip_lines, int): raise ValueError('skip_lines argument must be an int') if hasattr(path, 'read'): f = path else: f = open(path, 'rb') book = openpyxl.load_workbook(f, read_only=read_only, data_only=True) multiple = agate.utils.issequence(sheet) if multiple: sheets = sheet else: sheets = [sheet] tables = OrderedDict() for i, sheet in enumerate(sheets): if isinstance(sheet, six.string_types): try: sheet = book[sheet] except KeyError: f.close() raise elif isinstance(sheet, int): try: sheet = book.worksheets[sheet] except IndexError: f.close() raise else: sheet = book.active column_names_detected = None offset = 0 rows = [] if reset_dimensions: sheet.reset_dimensions() if header: sheet_header = sheet.iter_rows(min_row=1 + skip_lines, max_row=1 + skip_lines) column_names_detected = [None if c.value is None else six.text_type(c.value) for row in sheet_header for c in row] offset = 1 if row_limit is None: sheet_rows = sheet.iter_rows(min_row=1 + skip_lines + offset) else: sheet_rows = sheet.iter_rows(min_row=1 + skip_lines + offset, max_row=1 + skip_lines + offset + row_limit) for i, row in enumerate(sheet_rows): values = [] for c in row: value = c.value if value.__class__ is datetime.datetime: # Handle default XLSX date as 00:00 time if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c): value = value.time() value = normalize_datetime(value) elif value.time() == NULL_TIME: value = value.date() else: value = normalize_datetime(value) values.append(value) rows.append(values) if column_names is None: sheet_column_names = column_names_detected else: sheet_column_names = column_names tables[sheet.title] = agate.Table(rows, sheet_column_names, column_types, **kwargs) f.close() if multiple: return agate.MappedSequence(tables.values(), tables.keys()) else: return tables.popitem()[1]