Example #1
0
def from_xls(cls,
             path,
             sheet=None,
             skip_lines=0,
             header=True,
             encoding_override=None,
             **kwargs):
    """
    Parse an XLS file.

    :param path:
        Path to an XLS file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the first sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    if hasattr(path, 'read'):
        book = xlrd.open_workbook(file_contents=path.read(),
                                  encoding_override=encoding_override)
    else:
        with open(path, 'rb') as f:
            book = xlrd.open_workbook(file_contents=f.read(),
                                      encoding_override=encoding_override)

    multiple = agate.utils.issequence(sheet)
    if multiple:
        sheets = sheet
    else:
        sheets = [sheet]

    tables = OrderedDict()

    for i, sheet in enumerate(sheets):
        if isinstance(sheet, six.string_types):
            sheet = book.sheet_by_name(sheet)
        elif isinstance(sheet, int):
            sheet = book.sheet_by_index(sheet)
        else:
            sheet = book.sheet_by_index(0)

        if header:
            offset = 1
            column_names = []
        else:
            offset = 0
            column_names = None

        columns = []

        for i in range(sheet.ncols):
            data = sheet.col_values(i)
            values = data[skip_lines + offset:]
            types = sheet.col_types(i)[skip_lines + offset:]
            excel_type = determine_excel_type(types)

            if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
                values = normalize_booleans(values)
            elif excel_type == xlrd.biffh.XL_CELL_DATE:
                values = normalize_dates(values, book.datemode)

            if header:
                name = six.text_type(data[skip_lines]) or None
                column_names.append(name)

            columns.append(values)

        rows = []

        if columns:
            for i in range(len(columns[0])):
                rows.append([c[i] for c in columns])

        tables[sheet.name] = agate.Table(rows, column_names, **kwargs)

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]
Example #2
0
def from_html(cls,
              path,
              table_identifier=0,
              header=True,
              encoding='utf-8',
              mso_number_formats_override=None,
              row_limit=None,
              **kwargs):
    """
    Parse an HTML file.

    :param path:
        Path to an HTML file to load or a file-like object for one.
    :param table_identifier:
        The names or integer indices of the tables to load. If not specified
        then the first table will be used.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    """

    if 'column_names' in kwargs:
        if not header:
            column_names = kwargs['column_names']
        del kwargs['column_names']

    column_types = None
    if 'column_types' in kwargs:
        column_types = kwargs['column_types']
        del kwargs['column_types']

    if 'parser' in kwargs:  # TODO ignored for now
        del kwargs['parser']
    parser = 'lxml'

    if hasattr(path, 'read'):
        html_soup = BeautifulSoup(path,
                                  parser,
                                  parse_only=SoupStrainer('table'),
                                  from_encoding=encoding)
    else:
        with open(path, 'rt') as f:
            html_soup = BeautifulSoup(f.read(),
                                      parser,
                                      parse_only=SoupStrainer('table'),
                                      from_encoding=encoding)

    multiple = agate.utils.issequence(table_identifier)
    if multiple:
        table_identifiers = table_identifier
    else:
        table_identifiers = [table_identifier]

    tables = OrderedDict()

    for i, table_identifier in enumerate(table_identifiers):
        if isinstance(table_identifier, six.string_types):
            #            sheet = book.sheet_by_name(sheet)
            raise Exception("Not implemented yet.")  # FIXME
        elif isinstance(table_identifier, int):
            table_html = html_soup.find_all('table')[table_identifier]
        else:
            raise Exception(
                f"Could not interpret table identifier {table_identifier}")

        head_rows = parse_thead_tr(table_html)
        body_rows = parse_tbody_tr(table_html)
        if row_limit is not None:
            body_rows = body_rows[0:row_limit]
        foot_rows = parse_tfoot_tr(table_html)

        if not head_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                head_rows.append(body_rows.pop(0))

        head = expand_colspan_rowspan(head_rows)
        body = expand_colspan_rowspan(body_rows)
        foot = expand_colspan_rowspan(foot_rows)

        if header:
            column_names = head[0]

        tables[table_identifier] = agate.Table(rows=body,
                                               column_names=column_names,
                                               column_types=column_types,
                                               **kwargs)

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]
Example #3
0
def from_xls(cls,
             path,
             sheet=None,
             skip_lines=0,
             header=True,
             encoding_override=None,
             row_limit=None,
             column_names=None,
             column_types=None,
             **kwargs):
    """
    Parse an XLS file.

    :param path:
        Path to an XLS file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the first sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    :param row_limit:
        Limit how many rows of data will be read
    :param column_names:
        See :meth:`.Table.__init__`.
    :param column_types:
        See :meth:`.Table.__init__`.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    def open_workbook(f):
        try:
            book = xlrd.open_workbook(file_contents=f.read(),
                                      encoding_override=encoding_override,
                                      on_demand=True)
        except xlrd.compdoc.CompDocError:
            # This is not a pure XLS file; we'll try to read it though.
            # Let's try the Compound File Binary Format:
            ole = olefile.OleFileIO(f)
            if ole.exists('Workbook'):
                d = ole.openstream('Workbook')
                book = xlrd.open_workbook(file_contents=d.read(),
                                          on_demand=True)
            else:
                raise IOError('No Workbook stream found in OLE file')
        return book

    if hasattr(path, 'read'):
        book = open_workbook(path)
    else:
        with open(path, 'rb') as f:
            book = open_workbook(f)

    try:
        multiple = agate.utils.issequence(sheet)
        if multiple:
            sheets = sheet
        else:
            sheets = [sheet]

        tables = OrderedDict()

        for i, sheet in enumerate(sheets):
            if isinstance(sheet, six.string_types):
                sheet = book.sheet_by_name(sheet)
            elif isinstance(sheet, int):
                sheet = book.sheet_by_index(sheet)
            else:
                sheet = book.sheet_by_index(0)

            if header:
                offset = 1
                column_names_detected = []
            else:
                offset = 0
                column_names_detected = None

            columns = []
            column_types_detected = []

            for i in range(sheet.ncols):
                if row_limit is None:
                    values = sheet.col_values(i, skip_lines + offset)
                    types = sheet.col_types(i, skip_lines + offset)
                else:
                    values = sheet.col_values(i, skip_lines + offset,
                                              skip_lines + offset + row_limit)
                    types = sheet.col_types(i, skip_lines + offset,
                                            skip_lines + offset + row_limit)
                excel_type = determine_excel_type(types)
                agate_type = determine_agate_type(excel_type)

                if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
                    values = normalize_booleans(values)
                elif excel_type == xlrd.biffh.XL_CELL_DATE:
                    values, with_date, with_time = normalize_dates(
                        values, book.datemode)
                    if not with_date:
                        agate_type = agate.TimeDelta()
                    if not with_time:
                        agate_type = agate.Date()

                if header:
                    name = six.text_type(sheet.cell_value(skip_lines,
                                                          i)) or None
                    column_names_detected.append(name)

                columns.append(values)
                column_types_detected.append(agate_type)

            rows = []

            if columns:
                for i in range(len(columns[0])):
                    rows.append([c[i] for c in columns])

            if column_names is None:
                sheet_column_names = column_names_detected
            else:
                sheet_column_names = column_names

            sheet_column_types = column_types
            if isinstance(column_types,
                          dict) and sheet_column_names is not None:
                sheet_column_types = dict(
                    zip(sheet_column_names, column_types_detected))
                sheet_column_types.update(column_types)

            tables[sheet.name] = agate.Table(rows, sheet_column_names,
                                             sheet_column_types, **kwargs)

    finally:
        book.release_resources()

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]
Example #4
0
def from_xlsx(cls,
              path,
              sheet=None,
              skip_lines=0,
              header=True,
              read_only=True,
              **kwargs):
    """
    Parse an XLSX file.

    :param path:
        Path to an XLSX file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the "active" sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    if hasattr(path, 'read'):
        f = path
    else:
        f = open(path, 'rb')

    book = openpyxl.load_workbook(f, read_only=read_only, data_only=True)

    multiple = agate.utils.issequence(sheet)
    if multiple:
        sheets = sheet
    else:
        sheets = [sheet]

    tables = OrderedDict()

    for i, sheet in enumerate(sheets):
        if isinstance(sheet, six.string_types):
            sheet = book[sheet]
        elif isinstance(sheet, int):
            sheet = book.worksheets[sheet]
        else:
            sheet = book.active

        column_names = None
        rows = []

        for i, row in enumerate(sheet.iter_rows(row_offset=skip_lines)):
            if i == 0 and header:
                column_names = [
                    None if c.value is None else six.text_type(c.value)
                    for c in row
                ]
                continue

            values = []

            for c in row:
                value = c.value

                if value.__class__ is datetime.datetime:
                    # Handle default XLSX date as 00:00 time
                    if value.date() == datetime.date(
                            1904, 1, 1) and not has_date_elements(c):
                        value = value.time()

                        value = normalize_datetime(value)
                    elif value.time() == NULL_TIME:
                        value = value.date()
                    else:
                        value = normalize_datetime(value)

                values.append(value)

            rows.append(values)

        tables[sheet.title] = agate.Table(rows, column_names, **kwargs)

    f.close()

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]
Example #5
0
def from_xlsx(cls, path, sheet=None, skip_lines=0, header=True, read_only=True, 
              reset_dimensions=False, row_limit=None, column_names=None, column_types=None, **kwargs):
    """
    Parse an XLSX file.

    :param path:
        Path to an XLSX file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the "active" sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    :param reset_dimensions:
        If :code:`True`, do not trust the dimensions in the file's properties, 
        and recalculate them based on the data in the file.
    :param row_limit:
        Limit how many rows of data will be read
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    if hasattr(path, 'read'):
        f = path
    else:
        f = open(path, 'rb')

    book = openpyxl.load_workbook(f, read_only=read_only, data_only=True)

    multiple = agate.utils.issequence(sheet)
    if multiple:
        sheets = sheet
    else:
        sheets = [sheet]

    tables = OrderedDict()

    for i, sheet in enumerate(sheets):
        if isinstance(sheet, six.string_types):
            try:
                sheet = book[sheet]
            except KeyError:
                f.close()
                raise
        elif isinstance(sheet, int):
            try:
                sheet = book.worksheets[sheet]
            except IndexError:
                f.close()
                raise
        else:
            sheet = book.active

        column_names_detected = None
        offset = 0
        rows = []

        if reset_dimensions:
            sheet.reset_dimensions()

        if header:
            sheet_header = sheet.iter_rows(min_row=1 + skip_lines, max_row=1 + skip_lines)
            column_names_detected = [None if c.value is None else six.text_type(c.value) for row in sheet_header for c in row]
            offset = 1

        if row_limit is None:
            sheet_rows = sheet.iter_rows(min_row=1 + skip_lines + offset)
        else:
            sheet_rows = sheet.iter_rows(min_row=1 + skip_lines + offset, max_row=1 + skip_lines + offset + row_limit)

        for i, row in enumerate(sheet_rows):
            values = []

            for c in row:
                value = c.value

                if value.__class__ is datetime.datetime:
                    # Handle default XLSX date as 00:00 time
                    if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                        value = value.time()

                        value = normalize_datetime(value)
                    elif value.time() == NULL_TIME:
                        value = value.date()
                    else:
                        value = normalize_datetime(value)

                values.append(value)

            rows.append(values)

        if column_names is None:
            sheet_column_names = column_names_detected
        else:
            sheet_column_names = column_names

        tables[sheet.title] = agate.Table(rows, sheet_column_names, column_types, **kwargs)

    f.close()

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]