Python MappedSequence Examples

Programming Language: Python

Namespace/Package Name: agate

Method/Function: MappedSequence

Examples at hotexamples.com: 5

Python MappedSequence - 5 examples found. These are the top-rated real-world Python examples of agate.MappedSequence, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def from_xls(cls,
             path,
             sheet=None,
             skip_lines=0,
             header=True,
             encoding_override=None,
             **kwargs):
    """
    Build one or more tables from the worksheets of an XLS workbook.

    :param path:
        Path to an XLS file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the first sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    :param encoding_override:
        Passed straight through to :code:`xlrd.open_workbook`.
    :returns:
        A single table when :code:`sheet` is not a sequence; otherwise a
        :code:`MappedSequence` of tables keyed by worksheet name.
    :raises ValueError:
        If :code:`skip_lines` is not an int.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    # Obtain the raw bytes first so the workbook is opened in one place.
    if hasattr(path, 'read'):
        contents = path.read()
    else:
        with open(path, 'rb') as handle:
            contents = handle.read()

    book = xlrd.open_workbook(file_contents=contents,
                              encoding_override=encoding_override)

    multiple = agate.utils.issequence(sheet)
    requested = sheet if multiple else [sheet]

    # Index of the first data row: skipped lines plus the header row, if any.
    first_data_row = skip_lines + (1 if header else 0)

    tables = OrderedDict()

    for selector in requested:
        if isinstance(selector, six.string_types):
            worksheet = book.sheet_by_name(selector)
        elif isinstance(selector, int):
            worksheet = book.sheet_by_index(selector)
        else:
            # Anything else (e.g. None) falls back to the first sheet.
            worksheet = book.sheet_by_index(0)

        column_names = [] if header else None
        columns = []

        for col_index in range(worksheet.ncols):
            raw = worksheet.col_values(col_index)
            values = raw[first_data_row:]
            cell_types = worksheet.col_types(col_index)[first_data_row:]
            excel_type = determine_excel_type(cell_types)

            if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
                values = normalize_booleans(values)
            elif excel_type == xlrd.biffh.XL_CELL_DATE:
                values = normalize_dates(values, book.datemode)

            if header:
                # An empty header cell becomes None rather than ''.
                column_names.append(six.text_type(raw[skip_lines]) or None)

            columns.append(values)

        # Transpose the column-major data into rows, sized by the first column.
        if columns:
            rows = [[column[row_index] for column in columns]
                    for row_index in range(len(columns[0]))]
        else:
            rows = []

        tables[worksheet.name] = agate.Table(rows, column_names, **kwargs)

    if not multiple:
        return tables.popitem()[1]

    return agate.MappedSequence(tables.values(), tables.keys())

Example #2

Show file

def from_html(cls,
              path,
              table_identifier=0,
              header=True,
              encoding='utf-8',
              mso_number_formats_override=None,
              row_limit=None,
              **kwargs):
    """
    Parse an HTML file.

    :param path:
        Path to an HTML file to load or a file-like object for one.
    :param table_identifier:
        The names or integer indices of the tables to load. If not specified
        then the first table will be used. Only integer indices are
        implemented; a string identifier raises an exception.
    :param header:
        If :code:`True`, the first header row is used as the column names and
        any :code:`column_names` keyword argument is ignored. If no header
        row is found, the :code:`column_names` keyword argument (or
        :code:`None`) is used instead.
    :param encoding:
        Passed to BeautifulSoup as :code:`from_encoding`.
    :param mso_number_formats_override:
        Accepted but not used by this function.
    :param row_limit:
        Maximum number of body rows to keep, counted after header rows have
        been separated from the body.
    :returns:
        A single table when :code:`table_identifier` is not a sequence;
        otherwise a :code:`MappedSequence` of tables keyed by identifier.
    """
    # Pull the keyword arguments this function handles itself out of kwargs so
    # they are not passed to agate.Table twice.
    user_column_names = kwargs.pop('column_names', None)
    column_types = kwargs.pop('column_types', None)
    kwargs.pop('parser', None)  # TODO ignored for now
    parser = 'lxml'

    if hasattr(path, 'read'):
        html_soup = BeautifulSoup(path,
                                  parser,
                                  parse_only=SoupStrainer('table'),
                                  from_encoding=encoding)
    else:
        # Read bytes, not text: BeautifulSoup only applies from_encoding to
        # undecoded input and ignores it for str markup.
        with open(path, 'rb') as f:
            html_soup = BeautifulSoup(f.read(),
                                      parser,
                                      parse_only=SoupStrainer('table'),
                                      from_encoding=encoding)

    multiple = agate.utils.issequence(table_identifier)
    if multiple:
        table_identifiers = table_identifier
    else:
        table_identifiers = [table_identifier]

    tables = OrderedDict()

    for identifier in table_identifiers:
        if isinstance(identifier, six.string_types):
            raise Exception("Not implemented yet.")  # FIXME
        elif isinstance(identifier, int):
            table_html = html_soup.find_all('table')[identifier]
        else:
            raise Exception(
                f"Could not interpret table identifier {identifier}")

        head_rows = parse_thead_tr(table_html)
        body_rows = parse_tbody_tr(table_html)
        foot_rows = parse_tfoot_tr(table_html)

        if not head_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                head_rows.append(body_rows.pop(0))

        # Apply the limit only after header rows have left the body, so that
        # header rows do not count against the number of data rows.
        if row_limit is not None:
            body_rows = body_rows[0:row_limit]

        head = expand_colspan_rowspan(head_rows)
        body = expand_colspan_rowspan(body_rows)
        # Footer rows are expanded but not currently added to the table.
        foot = expand_colspan_rowspan(foot_rows)

        # Always bind column_names: previously it was undefined when
        # header=False and no column_names keyword was given, and head[0]
        # raised IndexError when the table had no header row at all.
        if header and head:
            column_names = head[0]
        else:
            column_names = user_column_names

        tables[identifier] = agate.Table(rows=body,
                                         column_names=column_names,
                                         column_types=column_types,
                                         **kwargs)

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]

Example #3

Show file

File: table_xls.py Project: lcorbasson/agate-excel

def from_xls(cls,
             path,
             sheet=None,
             skip_lines=0,
             header=True,
             encoding_override=None,
             row_limit=None,
             column_names=None,
             column_types=None,
             **kwargs):
    """
    Build one or more tables from the worksheets of an XLS workbook.

    :param path:
        Path to an XLS file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the first sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    :param encoding_override:
        Passed through to :code:`xlrd.open_workbook` on the first attempt to
        open the workbook.
    :param row_limit:
        Limit how many rows of data will be read
    :param column_names:
        See :meth:`.Table.__init__`.
    :param column_types:
        See :meth:`.Table.__init__`.
    :returns:
        A single table when :code:`sheet` is not a sequence; otherwise a
        :code:`MappedSequence` of tables keyed by worksheet name.
    :raises ValueError:
        If :code:`skip_lines` is not an int.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    def load_book(source):
        # First attempt: treat the stream as a plain XLS workbook.
        try:
            return xlrd.open_workbook(file_contents=source.read(),
                                      encoding_override=encoding_override,
                                      on_demand=True)
        except xlrd.compdoc.CompDocError:
            # This is not a pure XLS file; we'll try to read it though.
            # Let's try the Compound File Binary Format:
            ole = olefile.OleFileIO(source)
            if not ole.exists('Workbook'):
                raise IOError('No Workbook stream found in OLE file')
            stream = ole.openstream('Workbook')
            return xlrd.open_workbook(file_contents=stream.read(),
                                      on_demand=True)

    if hasattr(path, 'read'):
        book = load_book(path)
    else:
        with open(path, 'rb') as handle:
            book = load_book(handle)

    try:
        multiple = agate.utils.issequence(sheet)
        requested = sheet if multiple else [sheet]

        # Index of the first data row: skipped lines plus the header row.
        first_data_row = skip_lines + (1 if header else 0)

        tables = OrderedDict()

        for selector in requested:
            if isinstance(selector, six.string_types):
                worksheet = book.sheet_by_name(selector)
            elif isinstance(selector, int):
                worksheet = book.sheet_by_index(selector)
            else:
                # Anything else (e.g. None) falls back to the first sheet.
                worksheet = book.sheet_by_index(0)

            names_found = [] if header else None
            columns = []
            types_found = []

            for col_index in range(worksheet.ncols):
                # Positional (start,) or (start, end) bounds for the column.
                if row_limit is None:
                    bounds = (first_data_row,)
                else:
                    bounds = (first_data_row, first_data_row + row_limit)

                values = worksheet.col_values(col_index, *bounds)
                cell_types = worksheet.col_types(col_index, *bounds)

                excel_type = determine_excel_type(cell_types)
                agate_type = determine_agate_type(excel_type)

                if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
                    values = normalize_booleans(values)
                elif excel_type == xlrd.biffh.XL_CELL_DATE:
                    values, with_date, with_time = normalize_dates(
                        values, book.datemode)
                    # The second check wins when neither part is present.
                    if not with_date:
                        agate_type = agate.TimeDelta()
                    if not with_time:
                        agate_type = agate.Date()

                if header:
                    header_cell = worksheet.cell_value(skip_lines, col_index)
                    # An empty header cell becomes None rather than ''.
                    names_found.append(six.text_type(header_cell) or None)

                columns.append(values)
                types_found.append(agate_type)

            # Transpose the column-major data into rows.
            if columns:
                rows = [[column[row_index] for column in columns]
                        for row_index in range(len(columns[0]))]
            else:
                rows = []

            # Explicit column names take precedence over the detected ones.
            names = names_found if column_names is None else column_names

            if isinstance(column_types, dict) and names is not None:
                # Start from the detected types, then apply the overrides.
                types_for_sheet = dict(zip(names, types_found))
                types_for_sheet.update(column_types)
            else:
                types_for_sheet = column_types

            tables[worksheet.name] = agate.Table(rows, names,
                                                 types_for_sheet, **kwargs)

    finally:
        book.release_resources()

    if not multiple:
        return tables.popitem()[1]

    return agate.MappedSequence(tables.values(), tables.keys())

Example #4

Show file

def from_xlsx(cls,
              path,
              sheet=None,
              skip_lines=0,
              header=True,
              read_only=True,
              **kwargs):
    """
    Parse an XLSX file.

    The file object (including a caller-supplied file-like object) is closed
    before this function returns or raises.

    :param path:
        Path to an XLSX file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the "active" sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    :param read_only:
        Passed through to :code:`openpyxl.load_workbook`.
    :returns:
        A single table when :code:`sheet` is not a sequence; otherwise a
        :code:`MappedSequence` of tables keyed by worksheet title.
    :raises ValueError:
        If :code:`skip_lines` is not an int.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    if hasattr(path, 'read'):
        f = path
    else:
        f = open(path, 'rb')

    # Rows are consumed while the file is still open; the finally clause
    # guarantees the handle is released even when loading or parsing fails.
    try:
        book = openpyxl.load_workbook(f, read_only=read_only, data_only=True)

        multiple = agate.utils.issequence(sheet)
        if multiple:
            sheets = sheet
        else:
            sheets = [sheet]

        tables = OrderedDict()

        for sheet_ref in sheets:
            if isinstance(sheet_ref, six.string_types):
                worksheet = book[sheet_ref]
            elif isinstance(sheet_ref, int):
                worksheet = book.worksheets[sheet_ref]
            else:
                worksheet = book.active

            column_names = None
            rows = []

            # min_row is 1-based; it replaces the row_offset keyword, which
            # newer openpyxl releases no longer accept.
            sheet_rows = worksheet.iter_rows(min_row=skip_lines + 1)

            for row_index, row in enumerate(sheet_rows):
                if row_index == 0 and header:
                    column_names = [
                        None if c.value is None else six.text_type(c.value)
                        for c in row
                    ]
                    continue

                values = []

                for c in row:
                    value = c.value

                    if value.__class__ is datetime.datetime:
                        # Handle default XLSX date as 00:00 time
                        if value.date() == datetime.date(
                                1904, 1, 1) and not has_date_elements(c):
                            value = value.time()

                            value = normalize_datetime(value)
                        elif value.time() == NULL_TIME:
                            # Midnight timestamps are reduced to plain dates.
                            value = value.date()
                        else:
                            value = normalize_datetime(value)

                    values.append(value)

                rows.append(values)

            tables[worksheet.title] = agate.Table(rows, column_names,
                                                  **kwargs)
    finally:
        f.close()

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]

Example #5

Show file

def from_xlsx(cls, path, sheet=None, skip_lines=0, header=True, read_only=True,
              reset_dimensions=False, row_limit=None, column_names=None, column_types=None, **kwargs):
    """
    Parse an XLSX file.

    The file object (including a caller-supplied file-like object) is closed
    before this function returns or raises.

    :param path:
        Path to an XLSX file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the "active" sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    :param read_only:
        Passed through to :code:`openpyxl.load_workbook`.
    :param reset_dimensions:
        If :code:`True`, do not trust the dimensions in the file's properties,
        and recalculate them based on the data in the file.
    :param row_limit:
        Limit how many rows of data will be read. At most this many data
        rows are returned; zero or a negative value yields no rows.
    :param column_names:
        Column names passed to the table. When given, they take precedence
        over the names detected from the header row.
    :param column_types:
        Column types passed to the table unchanged.
    :returns:
        A single table when :code:`sheet` is not a sequence; otherwise a
        :code:`MappedSequence` of tables keyed by worksheet title.
    :raises ValueError:
        If :code:`skip_lines` is not an int.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    if hasattr(path, 'read'):
        f = path
    else:
        f = open(path, 'rb')

    # A single try/finally closes the file on every failure path (unknown
    # sheet name or index, unreadable workbook, table construction errors),
    # not only on KeyError/IndexError.
    try:
        book = openpyxl.load_workbook(f, read_only=read_only, data_only=True)

        multiple = agate.utils.issequence(sheet)
        if multiple:
            sheets = sheet
        else:
            sheets = [sheet]

        tables = OrderedDict()

        for sheet_ref in sheets:
            if isinstance(sheet_ref, six.string_types):
                worksheet = book[sheet_ref]
            elif isinstance(sheet_ref, int):
                worksheet = book.worksheets[sheet_ref]
            else:
                worksheet = book.active

            column_names_detected = None
            rows = []

            if reset_dimensions:
                worksheet.reset_dimensions()

            # 1-based index of the next row to read.
            next_row = 1 + skip_lines

            if header:
                # min_row == max_row selects exactly one row: the header.
                sheet_header = worksheet.iter_rows(min_row=next_row, max_row=next_row)
                column_names_detected = [
                    None if c.value is None else six.text_type(c.value)
                    for row in sheet_header for c in row
                ]
                next_row += 1

            if row_limit is None:
                sheet_rows = worksheet.iter_rows(min_row=next_row)
            elif row_limit > 0:
                # max_row is inclusive, so the last row to read is
                # next_row + row_limit - 1; the previous bound read one row
                # more than requested.
                sheet_rows = worksheet.iter_rows(min_row=next_row,
                                                 max_row=next_row + row_limit - 1)
            else:
                # Nothing requested; avoid handing openpyxl an empty or
                # zero upper bound.
                sheet_rows = ()

            for row in sheet_rows:
                values = []

                for c in row:
                    value = c.value

                    if value.__class__ is datetime.datetime:
                        # Handle default XLSX date as 00:00 time
                        if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                            value = value.time()

                            value = normalize_datetime(value)
                        elif value.time() == NULL_TIME:
                            # Midnight timestamps are reduced to plain dates.
                            value = value.date()
                        else:
                            value = normalize_datetime(value)

                    values.append(value)

                rows.append(values)

            if column_names is None:
                sheet_column_names = column_names_detected
            else:
                sheet_column_names = column_names

            tables[worksheet.title] = agate.Table(rows, sheet_column_names, column_types, **kwargs)
    finally:
        f.close()

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]