def parse(
    self,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype: DtypeArg | None = None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=None,
    mangle_dupe_cols=True,
    **kwds,
):
    """
    Parse one or more worksheets of the opened workbook into DataFrames.

    Sheets are resolved through the reader subclass hooks
    (``get_sheet_by_name`` / ``get_sheet_by_index`` / ``get_sheet_data``)
    and the raw cell rows are handed to ``TextParser`` for the actual
    CSV-style parsing.

    Returns
    -------
    DataFrame or dict of DataFrame
        A single parsed result for a scalar ``sheet_name``; a dict keyed by
        sheet identifier when ``sheet_name`` is a list or ``None`` (all
        sheets).
    """
    # convert_float is deprecated: None means "not passed" and maps to the
    # historical default of True; any explicitly supplied value (True or
    # False) triggers the FutureWarning.
    if convert_float is None:
        convert_float = True
    else:
        stacklevel = find_stack_level()
        warnings.warn(
            "convert_float is deprecated and will be removed in a future version.",
            FutureWarning,
            stacklevel=stacklevel,
        )

    validate_header_arg(header)

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheet_name, list):
        sheets = sheet_name
        ret_dict = True
    elif sheet_name is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheet_name]

    # handle same-type duplicates (dict.fromkeys preserves first-seen order).
    sheets = list(dict.fromkeys(sheets).keys())

    output = {}

    for asheetname in sheets:
        if verbose:
            print(f"Reading sheet {asheetname}")

        if isinstance(asheetname, str):
            sheet = self.get_sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.get_sheet_by_index(asheetname)

        data = self.get_sheet_data(sheet, convert_float)
        if hasattr(sheet, "close"):
            # pyxlsb opens two TemporaryFiles
            sheet.close()
        usecols = maybe_convert_usecols(usecols)

        if not data:
            # Empty sheet: emit an empty frame rather than parsing nothing.
            output[asheetname] = DataFrame()
            continue

        # A one-element list header is equivalent to a scalar header row.
        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None and is_list_like(header):
            header_names = []
            control_row = [True] * len(data[0])

            for row in header:
                if is_integer(skiprows):
                    # header positions are relative to the post-skiprows data
                    row += skiprows

                data[row], control_row = fill_mi_header(data[row], control_row)

                if index_col is not None:
                    header_name, _ = pop_header_name(data[row], index_col)
                    header_names.append(header_name)

        # If there is a MultiIndex header and an index then there is also
        # a row containing just the index name(s)
        has_index_names = (
            is_list_like(header) and len(header) > 1 and index_col is not None
        )

        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if header is None:
                offset = 0
            elif not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # GH34673: if MultiIndex names present and not defined in the header,
            # offset needs to be incremented so that forward filling starts
            # from the first MI value instead of the name
            if has_index_names:
                offset += 1

            # Check if we have an empty dataset
            # before trying to collect data.
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]
                    for row in range(offset + 1, len(data)):
                        if data[row][col] == "" or data[row][col] is None:
                            # Blank cell under a merged index label: carry
                            # the previous value down.
                            data[row][col] = last
                        else:
                            last = data[row][col]

        # GH 12292 : error when read one empty column from excel file
        try:
            parser = TextParser(
                data,
                names=names,
                header=header,
                index_col=index_col,
                has_index_names=has_index_names,
                squeeze=squeeze,
                dtype=dtype,
                true_values=true_values,
                false_values=false_values,
                skiprows=skiprows,
                nrows=nrows,
                na_values=na_values,
                skip_blank_lines=False,  # GH 39808
                parse_dates=parse_dates,
                date_parser=date_parser,
                thousands=thousands,
                comment=comment,
                skipfooter=skipfooter,
                usecols=usecols,
                mangle_dupe_cols=mangle_dupe_cols,
                **kwds,
            )

            output[asheetname] = parser.read(nrows=nrows)

            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    # Attach the names captured from the MultiIndex header.
                    output[asheetname].columns = output[
                        asheetname
                    ].columns.set_names(header_names)

        except EmptyDataError:
            # No Data, return an empty DataFrame
            output[asheetname] = DataFrame()

    if ret_dict:
        return output
    else:
        return output[asheetname]
def parse(
    self,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=True,
    mangle_dupe_cols=True,
    **kwds,
):
    """
    Parse one or more worksheets of the opened workbook into DataFrames.

    Sheets are resolved through the reader subclass hooks
    (``get_sheet_by_name`` / ``get_sheet_by_index`` / ``get_sheet_data``)
    and the raw cell rows are handed to ``TextParser`` for the actual
    CSV-style parsing.

    Returns
    -------
    DataFrame or dict of DataFrame
        A single parsed result for a scalar ``sheet_name``; a dict keyed by
        sheet identifier when ``sheet_name`` is a list or ``None`` (all
        sheets).
    """
    validate_header_arg(header)

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheet_name, list):
        sheets = sheet_name
        ret_dict = True
    elif sheet_name is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheet_name]

    # handle same-type duplicates (dict.fromkeys preserves first-seen order).
    sheets = list(dict.fromkeys(sheets).keys())

    output = {}

    for asheetname in sheets:
        if verbose:
            print(f"Reading sheet {asheetname}")

        if isinstance(asheetname, str):
            sheet = self.get_sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.get_sheet_by_index(asheetname)

        data = self.get_sheet_data(sheet, convert_float)
        usecols = maybe_convert_usecols(usecols)

        if not data:
            # Empty sheet: emit an empty frame rather than parsing nothing.
            output[asheetname] = DataFrame()
            continue

        # A one-element list header is equivalent to a scalar header row.
        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None and is_list_like(header):
            header_names = []
            control_row = [True] * len(data[0])

            for row in header:
                if is_integer(skiprows):
                    # header positions are relative to the post-skiprows data
                    row += skiprows

                data[row], control_row = fill_mi_header(data[row], control_row)

                if index_col is not None:
                    header_name, _ = pop_header_name(data[row], index_col)
                    header_names.append(header_name)

        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if header is None:
                offset = 0
            elif not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # Check if we have an empty dataset
            # before trying to collect data.
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]
                    for row in range(offset + 1, len(data)):
                        if data[row][col] == "" or data[row][col] is None:
                            # Blank cell under a merged index label: carry
                            # the previous value down.
                            data[row][col] = last
                        else:
                            last = data[row][col]

        # A multi-row header implies an extra row holding index name(s).
        has_index_names = is_list_like(header) and len(header) > 1

        # GH 12292 : error when read one empty column from excel file
        try:
            parser = TextParser(
                data,
                names=names,
                header=header,
                index_col=index_col,
                has_index_names=has_index_names,
                squeeze=squeeze,
                dtype=dtype,
                true_values=true_values,
                false_values=false_values,
                skiprows=skiprows,
                nrows=nrows,
                na_values=na_values,
                parse_dates=parse_dates,
                date_parser=date_parser,
                thousands=thousands,
                comment=comment,
                skipfooter=skipfooter,
                usecols=usecols,
                mangle_dupe_cols=mangle_dupe_cols,
                **kwds,
            )

            output[asheetname] = parser.read(nrows=nrows)

            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    # Attach the names captured from the MultiIndex header.
                    output[asheetname].columns = output[
                        asheetname
                    ].columns.set_names(header_names)

        except EmptyDataError:
            # No Data, return an empty DataFrame
            output[asheetname] = DataFrame()

    if ret_dict:
        return output
    else:
        return output[asheetname]
def parse(fname, **kwargs):
    """
    Parse a byte range of one worksheet's XML from an ``.xlsx`` file.

    Worker-side parser for a partitioned Excel read: ``start``/``end``
    delimit a slice of the sheet's ``xl/worksheets/<name>.xml`` zip member,
    which is rewrapped with the supplied ``_header`` bytes and a closing
    footer to form a standalone XML document, parsed via openpyxl's
    ``WorksheetReader``, and finally converted with ``TextParser``.

    Parameters
    ----------
    fname : str
        Path to the Excel file.
    **kwargs : dict
        Must contain ``skiprows`` (rows preceding this partition) and
        ``_header`` (sheet XML prolog bytes); may contain ``num_splits``,
        ``start``, ``end``, ``sheet_name``, plus pandas parsing options.

    Returns
    -------
    list
        ``num_splits`` result partitions followed by the index (or its
        length, when a RangeIndex can be reconstructed) and the dtypes.

    Notes
    -----
    If ``start`` or ``end`` is missing, this falls back to a plain
    ``pandas.read_excel`` of the whole file.
    """
    num_splits = kwargs.pop("num_splits", None)
    start = kwargs.pop("start", None)
    end = kwargs.pop("end", None)
    _skiprows = kwargs.pop("skiprows")
    excel_header = kwargs.get("_header")
    sheet_name = kwargs.get("sheet_name", 0)
    # Closing tags appended so the extracted slice is well-formed XML.
    footer = b"</sheetData></worksheet>"

    # Default to pandas case, where we are not splitting or partitioning
    if start is None or end is None:
        return pandas.read_excel(fname, **kwargs)

    from zipfile import ZipFile

    from openpyxl import load_workbook
    from openpyxl.worksheet._reader import WorksheetReader
    from openpyxl.reader.excel import ExcelReader
    from openpyxl.worksheet.worksheet import Worksheet
    from pandas.core.dtypes.common import is_list_like
    from pandas.io.excel._util import (
        fill_mi_header,
        maybe_convert_usecols,
    )
    from pandas.io.parsers import TextParser
    import re

    wb = load_workbook(filename=fname, read_only=True)
    # Get shared strings (cell text is stored indirectly in the workbook).
    ex = ExcelReader(fname, read_only=True)
    ex.read_manifest()
    ex.read_strings()
    # Convert string name 0 to string
    if sheet_name == 0:
        sheet_name = wb.sheetnames[sheet_name]
    # get the worksheet to use with the worksheet reader
    ws = Worksheet(wb)
    # Read the raw data
    with ZipFile(fname) as z:
        with z.open("xl/worksheets/{}.xml".format(sheet_name)) as file:
            file.seek(start)
            bytes_data = file.read(end - start)

    def update_row_nums(match):
        """
        Update the row numbers to start at 1.

        Parameters
        ----------
        match : re.Match object
            The match from the origin `re.sub` looking for row number tags.

        Returns
        -------
        str
            The updated string with new row numbers.

        Notes
        -----
        This is needed because the parser we are using does not scale well if
        the row numbers remain because empty rows are inserted for all
        "missing" rows.
        """
        b = match.group(0)
        return re.sub(
            br"\d+",
            lambda c: str(int(c.group(0).decode("utf-8")) - _skiprows).encode(
                "utf-8"
            ),
            b,
        )

    # Rewrite every r="A123"-style cell/row reference so numbering restarts
    # at this partition's first row.
    bytes_data = re.sub(br'r="[A-Z]*\d+"', update_row_nums, bytes_data)
    bytesio = BytesIO(excel_header + bytes_data + footer)
    # Use openpyxl to read/parse sheet data
    reader = WorksheetReader(ws, bytesio, ex.shared_strings, False)
    # Attach cells to worksheet object
    reader.bind_cells()
    data = PandasExcelParser.get_sheet_data(ws, kwargs.pop("convert_float", True))
    usecols = maybe_convert_usecols(kwargs.pop("usecols", None))
    header = kwargs.pop("header", 0)
    index_col = kwargs.pop("index_col", None)
    # skiprows is handled externally
    skiprows = None

    # Handle header and create MultiIndex for columns if necessary
    if is_list_like(header) and len(header) == 1:
        header = header[0]
    if header is not None and is_list_like(header):
        control_row = [True] * len(data[0])

        for row in header:
            data[row], control_row = fill_mi_header(data[row], control_row)
    # Handle MultiIndex for row Index if necessary
    if is_list_like(index_col):
        # Forward fill values for MultiIndex index.
        if not is_list_like(header):
            offset = 1 + header
        else:
            offset = 1 + max(header)

        # Check if dataset is empty
        if offset < len(data):
            for col in index_col:
                last = data[offset][col]
                for row in range(offset + 1, len(data)):
                    if data[row][col] == "" or data[row][col] is None:
                        # Blank cell under a merged index label: carry the
                        # previous value down.
                        data[row][col] = last
                    else:
                        last = data[row][col]

    parser = TextParser(
        data,
        header=header,
        index_col=index_col,
        has_index_names=is_list_like(header) and len(header) > 1,
        skiprows=skiprows,
        usecols=usecols,
        **kwargs
    )
    # In excel if you create a row with only a border (no values), this parser will
    # interpret that as a row of NaN values. pandas discards these values, so we
    # also must discard these values.
    pandas_df = parser.read().dropna(how="all")
    # Since we know the number of rows that occur before this partition, we can
    # correctly assign the index in cases of RangeIndex. If it is not a
    # RangeIndex, the index is already correct because it came from the data.
    if isinstance(pandas_df.index, pandas.RangeIndex):
        pandas_df.index = pandas.RangeIndex(
            start=_skiprows, stop=len(pandas_df.index) + _skiprows
        )
    # We return the length if it is a RangeIndex (common case) to reduce
    # serialization cost.
    if index_col is not None:
        index = pandas_df.index
    else:
        # The lengths will become the RangeIndex
        index = len(pandas_df)
    return _split_result_for_readers(1, num_splits, pandas_df) + [
        index,
        pandas_df.dtypes,
    ]