Beispiel #1
0
    def parse(
        self,
        sheet_name=0,
        header=0,
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        dtype=None,
        true_values=None,
        false_values=None,
        skiprows=None,
        nrows=None,
        na_values=None,
        verbose=False,
        parse_dates=False,
        date_parser=None,
        thousands=None,
        comment=None,
        skipfooter=0,
        convert_float=True,
        mangle_dupe_cols=True,
        **kwds,
    ):

        validate_header_arg(header)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = list(dict.fromkeys(sheets).keys())

        output = {}

        for asheetname in sheets:
            if verbose:
                print(f"Reading sheet {asheetname}")

            if isinstance(asheetname, str):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            usecols = _maybe_convert_usecols(usecols)

            if not data:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = _fill_mi_header(
                        data[row], control_row)

                    if index_col is not None:
                        header_name, _ = _pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == "" or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            has_index_names = is_list_like(header) and len(header) > 1

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(
                    data,
                    names=names,
                    header=header,
                    index_col=index_col,
                    has_index_names=has_index_names,
                    squeeze=squeeze,
                    dtype=dtype,
                    true_values=true_values,
                    false_values=false_values,
                    skiprows=skiprows,
                    nrows=nrows,
                    na_values=na_values,
                    parse_dates=parse_dates,
                    date_parser=date_parser,
                    thousands=thousands,
                    comment=comment,
                    skipfooter=skipfooter,
                    usecols=usecols,
                    mangle_dupe_cols=mangle_dupe_cols,
                    **kwds,
                )

                output[asheetname] = parser.read(nrows=nrows)

                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            return output[asheetname]
Beispiel #2
0
    def parse(self,
              sheet_name=0,
              header=0,
              names=None,
              index_col=None,
              usecols=None,
              squeeze=False,
              dtype=None,
              true_values=None,
              false_values=None,
              skiprows=None,
              nrows=None,
              na_values=None,
              verbose=False,
              parse_dates=False,
              date_parser=None,
              thousands=None,
              comment=None,
              skipfooter=0,
              convert_float=True,
              mangle_dupe_cols=True,
              **kwds):

        _validate_header_arg(header)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = list(OrderedDict.fromkeys(sheets).keys())

        output = OrderedDict()

        for asheetname in sheets:
            if verbose:
                print("Reading sheet {sheet}".format(sheet=asheetname))

            if isinstance(asheetname, compat.string_types):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            usecols = _maybe_convert_usecols(usecols)

            if sheet.nrows == 0:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = _fill_mi_header(data[row],
                                                             control_row)

                    if index_col is not None:
                        header_name, _ = _pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == '' or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            has_index_names = is_list_like(header) and len(header) > 1

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(data,
                                    names=names,
                                    header=header,
                                    index_col=index_col,
                                    has_index_names=has_index_names,
                                    squeeze=squeeze,
                                    dtype=dtype,
                                    true_values=true_values,
                                    false_values=false_values,
                                    skiprows=skiprows,
                                    nrows=nrows,
                                    na_values=na_values,
                                    parse_dates=parse_dates,
                                    date_parser=date_parser,
                                    thousands=thousands,
                                    comment=comment,
                                    skipfooter=skipfooter,
                                    usecols=usecols,
                                    mangle_dupe_cols=mangle_dupe_cols,
                                    **kwds)

                output[asheetname] = parser.read(nrows=nrows)

                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)
                    elif compat.PY2:
                        output[asheetname].columns = _maybe_convert_to_string(
                            output[asheetname].columns)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            return output[asheetname]
Beispiel #3
0
    def parse(fname, **kwargs):
        """
        Parse an Excel file, or one byte range of a worksheet's XML.

        Parameters
        ----------
        fname
            Passed to ``pandas.read_excel`` in the fallback case, otherwise
            to ``openpyxl.load_workbook``, ``ExcelReader`` and ``ZipFile``.
        **kwargs
            The following keys are consumed here; whatever is left is
            forwarded to ``pandas.read_excel`` (fallback) or ``TextParser``:

            * ``num_splits`` - handed to ``_split_result_for_readers``.
            * ``start``, ``end`` - byte range read from the worksheet XML.
              If either is ``None`` the whole call is delegated to
              ``pandas.read_excel``.
            * ``skiprows`` - required (``KeyError`` if absent). Subtracted
              from every row number in the XML chunk and used as the start
              of the resulting ``RangeIndex``.
            * ``_header`` - bytes placed in front of the XML chunk
              (read with ``get``, so it stays in ``kwargs``).
            * ``sheet_name`` - default 0 (read with ``get``).
            * ``convert_float``, ``usecols``, ``header``, ``index_col``.

        Returns
        -------
        object
            The result of ``pandas.read_excel`` in the fallback case;
            otherwise the list produced by ``_split_result_for_readers``
            extended with ``[index, dtypes]``, where ``index`` is the frame's
            index when ``index_col`` is given and its length otherwise.
        """
        num_splits = kwargs.pop("num_splits", None)
        start = kwargs.pop("start", None)
        end = kwargs.pop("end", None)
        # NOTE(review): "skiprows" is removed from kwargs before the
        # pandas fallback below, so the fallback never sees it - confirm
        # against the caller that this is intended.
        _skiprows = kwargs.pop("skiprows")
        excel_header = kwargs.get("_header")
        sheet_name = kwargs.get("sheet_name", 0)
        footer = b"</sheetData></worksheet>"

        # Default to pandas case, where we are not splitting or partitioning
        if start is None or end is None:
            return pandas.read_excel(fname, **kwargs)

        from zipfile import ZipFile
        from openpyxl import load_workbook
        from openpyxl.worksheet._reader import WorksheetReader
        from openpyxl.reader.excel import ExcelReader
        from openpyxl.worksheet.worksheet import Worksheet
        from pandas.core.dtypes.common import is_list_like
        from pandas.io.excel._util import (
            _fill_mi_header,
            _maybe_convert_usecols,
        )
        from pandas.io.parsers import TextParser
        import re

        # NOTE(review): neither this read-only workbook nor the
        # ExcelReader below is explicitly closed in this function -
        # confirm whether that is intentional.
        wb = load_workbook(filename=fname, read_only=True)
        # Get shared strings
        ex = ExcelReader(fname, read_only=True)
        ex.read_manifest()
        ex.read_strings()
        # Replace the default sheet index 0 with that sheet's name
        if sheet_name == 0:
            sheet_name = wb.sheetnames[sheet_name]
        # get the worksheet to use with the worksheet reader
        ws = Worksheet(wb)
        # Read the raw data
        with ZipFile(fname) as z:
            with z.open("xl/worksheets/{}.xml".format(
                    sheet_name.lower())) as file:
                file.seek(start)
                bytes_data = file.read(end - start)

        def update_row_nums(match):
            """Shift the row number inside one matched cell reference.

            Note: This is needed because the parser we are using does not scale well if
            the row numbers remain because empty rows are inserted for all "missing"
            rows.

            Parameters
            ----------
            match
                The match from the outer `re.sub` looking for row number tags.

            Returns
            -------
            bytes
                The matched text with `_skiprows` subtracted from each run
                of digits.
            """
            b = match.group(0)
            return re.sub(
                rb"\d+",
                lambda c: str(int(c.group(0).decode("utf-8")) - _skiprows).
                encode("utf-8"),
                b,
            )

        # Raw bytes literals keep the same pattern bytes while avoiding
        # the invalid "\d" escape in a normal literal.
        bytes_data = re.sub(rb'r="[A-Z]*\d+"', update_row_nums, bytes_data)
        bytesio = BytesIO(excel_header + bytes_data + footer)
        # Use openpyxl to read/parse sheet data
        reader = WorksheetReader(ws, bytesio, ex.shared_strings, False)
        # Attach cells to worksheet object
        reader.bind_cells()
        data = PandasExcelParser.get_sheet_data(
            ws, kwargs.pop("convert_float", True))
        usecols = _maybe_convert_usecols(kwargs.pop("usecols", None))
        header = kwargs.pop("header", 0)
        index_col = kwargs.pop("index_col", None)
        # skiprows is handled externally
        skiprows = None

        # Handle header and create MultiIndex for columns if necessary
        if is_list_like(header) and len(header) == 1:
            header = header[0]
        if header is not None and is_list_like(header):
            control_row = [True] * len(data[0])

            for row in header:
                data[row], control_row = _fill_mi_header(
                    data[row], control_row)
        # Handle MultiIndex for row Index if necessary
        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if header is None:
                # No header rows at all: the data starts on the first row.
                # (``1 + None`` used to raise TypeError here.)
                offset = 0
            elif not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # Check if dataset is empty
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]
                    for row in range(offset + 1, len(data)):
                        if data[row][col] == "" or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

        parser = TextParser(data,
                            header=header,
                            index_col=index_col,
                            has_index_names=is_list_like(header)
                            and len(header) > 1,
                            skiprows=skiprows,
                            usecols=usecols,
                            **kwargs)
        # In excel if you create a row with only a border (no values), this parser will
        # interpret that as a row of NaN values. Pandas discards these values, so we
        # also must discard these values.
        pandas_df = parser.read().dropna(how="all")
        # Since we know the number of rows that occur before this partition, we can
        # correctly assign the index in cases of RangeIndex. If it is not a RangeIndex,
        # the index is already correct because it came from the data.
        if isinstance(pandas_df.index, pandas.RangeIndex):
            pandas_df.index = pandas.RangeIndex(start=_skiprows,
                                                stop=len(pandas_df.index) +
                                                _skiprows)
        # We return the length if it is a RangeIndex (common case) to reduce
        # serialization cost.
        if index_col is not None:
            index = pandas_df.index
        else:
            # The lengths will become the RangeIndex
            index = len(pandas_df)
        return _split_result_for_readers(1, num_splits, pandas_df) + [
            index,
            pandas_df.dtypes,
        ]
Beispiel #4
0
def _df_filter(ranger,
               lasso,
               header=0,
               names=None,
               index_col=None,
               parse_cols=None,
               usecols=None,
               squeeze=False,
               dtype=None,
               engine=None,
               true_values=None,
               false_values=None,
               skiprows=None,
               nrows=None,
               na_values=None,
               keep_default_na=True,
               verbose=False,
               parse_dates=False,
               thousands=None,
               comment=None,
               skipfooter=0,
               convert_float=True,
               mangle_dupe_cols=True,
               **kwds):
    """
    Converts captured values table as pandas DataFrame

    Most args copied from :func:`pandas.io.read_excel()` except:

        sheet_name, skip_footer, converters, date_parser

    Note that ``skip_footer`` has been deprecated by ``skipfooter``.

    :param ranger:
        not used by this function
    :param lasso:
        object whose ``values`` attribute holds the table rows; the result
        is stored back with ``lasso._replace(values=...)``
    :param header:
        row position(s) of the column labels; a one-element list is
        unwrapped, a longer list is handled as a MultiIndex header
    :param index_col:
        column(s) for the row index; when list-like, blank cells
        (``""`` or ``None``) in those columns are forward filled
    :param skiprows:
        forwarded to the parser; when an integer it is also added to every
        header row position before the header is filled
    :param nrows:
        forwarded to the parser and to ``parser.read()``
    :param parse_cols, engine, verbose, convert_float:
        accepted but not used by this function
    :param kwds:
        forwarded to the parser; ``skip_footer``, ``chunksize``,
        ``date_parser`` and ``converted`` are rejected
    :return:
        `lasso` with its ``values`` replaced by the parsed result
        (an empty DataFrame when the parser raises ``EmptyDataError``),
        or a bare empty DataFrame when ``lasso.values`` is empty
    :raises NotImplementedError:
        if `kwds` contains one of the rejected argument names

    All remaining named arguments are forwarded unchanged to
    ``TextParser`` (``usecols`` after ``_maybe_convert_usecols``).
    """
    data = lasso.values

    # Copied & adapted from `pandas.io.excel.py` v0.24.2+ (Jun 2019)
    #    https://github.com/pandas-dev/pandas/blob/d47fc0c/pandas/io/excel/_base.py#L368

    _validate_header_arg(header)

    # NOTE(review): "converted" looks like a typo for "converters" (which the
    # docstring lists as unsupported) - left as-is so existing calls keep
    # working; confirm the intent.
    invalid_args = (set("skip_footer chunksize date_parser converted".split())
                    & kwds.keys())
    if invalid_args:
        raise NotImplementedError("Cannot implement args: %s" % invalid_args)

    if not data:
        return pd.DataFrame()

    usecols = _maybe_convert_usecols(usecols)

    if is_list_like(header) and len(header) == 1:
        header = header[0]

    # forward fill and pull out names for MultiIndex column
    header_names = None
    if header is not None and is_list_like(header):
        header_names = []
        control_row = [True for _ in data[0]]
        for row in header:
            if is_integer(skiprows):
                row += skiprows
            try:
                data[row], control_row = _fill_mi_header(
                    data[row], control_row)
            except TypeError:
                ## Arg `control_row` introduced in pandas-v0.19.0 to fix
                #  https://github.com/pandas-dev/pandas/issues/12453
                #  https://github.com/pandas-dev/pandas/commit/67b72e3cbbaeb89a5b9c780b2fe1c8d5eaa9c505
                data[row] = _fill_mi_header(data[row])

            if index_col is not None:
                header_name, data[row] = _pop_header_name(data[row], index_col)
                header_names.append(header_name)

    if is_list_like(index_col):
        # forward fill values for MultiIndex index
        if header is None:
            # No header rows at all: the data starts on the first row.
            # (``1 + None`` used to raise TypeError here.)
            offset = 0
        elif not is_list_like(header):
            offset = 1 + header
        else:
            offset = 1 + max(header)

        # Check if we have an empty dataset
        # before trying to collect data.
        if offset < len(data):
            for col in index_col:
                last = data[offset][col]

                for row in range(offset + 1, len(data)):
                    if data[row][col] == "" or data[row][col] is None:
                        data[row][col] = last
                    else:
                        last = data[row][col]

    has_index_names = is_list_like(header) and len(header) > 1

    # pandas expects '' instead of `None`!
    data = [["" if c is None else c for c in r] for r in data]

    # GH 12292 : error when read one empty column from excel file
    try:
        parser = pdparsers.TextParser(data,
                                      names=names,
                                      header=header,
                                      index_col=index_col,
                                      has_index_names=has_index_names,
                                      squeeze=squeeze,
                                      dtype=dtype,
                                      true_values=true_values,
                                      false_values=false_values,
                                      skiprows=skiprows,
                                      nrows=nrows,
                                      na_values=na_values,
                                      # Previously accepted but never
                                      # forwarded, so it had no effect.
                                      keep_default_na=keep_default_na,
                                      parse_dates=parse_dates,
                                      thousands=thousands,
                                      comment=comment,
                                      skipfooter=skipfooter,
                                      usecols=usecols,
                                      mangle_dupe_cols=mangle_dupe_cols,
                                      **kwds)

        # The row limit has to be given to ``read`` as well, as in the
        # pandas code this was adapted from; otherwise it is not applied.
        output = parser.read(nrows=nrows)

        # Only a DataFrame has ``columns``; with ``squeeze`` the parser may
        # hand back something else.
        if not squeeze or isinstance(output, pd.DataFrame):
            if header_names:
                output.columns = output.columns.set_names(header_names)
    except EmptyDataError:
        # No Data, return an empty DataFrame
        output = pd.DataFrame()

    lasso = lasso._replace(values=output)

    return lasso