def read_worksheets(self):
    for sheet, rel in self.parser.find_sheets():
        if rel.target not in self.valid_files:
            continue

        if "chartsheet" in rel.Type:
            self.read_chartsheet(sheet, rel)
            continue

        rels_path = get_rels_path(rel.target)
        rels = RelationshipList()
        if rels_path in self.valid_files:
            rels = get_dependents(self.archive, rels_path)

        if self.read_only:
            ws = ReadOnlyWorksheet(self.wb, sheet.name, rel.target,
                                   self.shared_strings)
            self.wb._sheets.append(ws)
            continue
        else:
            fh = self.archive.open(rel.target)
            ws = self.wb.create_sheet(sheet.name)
            ws._rels = rels
            ws_parser = WorksheetReader(ws, fh, self.shared_strings,
                                        self.data_only)
            ws_parser.bind_all()

        # assign any comments to cells
        for r in rels.find(COMMENTS_NS):
            src = self.archive.read(r.target)
            comment_sheet = CommentSheet.from_tree(fromstring(src))
            for ref, comment in comment_sheet.comments:
                ws[ref].comment = comment

        # preserve link to VML file if VBA
        if self.wb.vba_archive and ws.legacy_drawing:
            ws.legacy_drawing = rels[ws.legacy_drawing].target

        for t in ws_parser.tables:
            src = self.archive.read(t)
            xml = fromstring(src)
            table = Table.from_tree(xml)
            ws.add_table(table)

        drawings = rels.find(SpreadsheetDrawing._rel_type)
        for rel in drawings:
            charts, images = find_images(self.archive, rel.target)
            for c in charts:
                ws.add_chart(c, c.anchor)
            for im in images:
                ws.add_image(im, im.anchor)

        pivot_rel = rels.find(TableDefinition.rel_type)
        for r in pivot_rel:
            pivot_path = r.Target
            src = self.archive.read(pivot_path)
            tree = fromstring(src)
            pivot = TableDefinition.from_tree(tree)
            pivot.cache = self.parser.pivot_caches[pivot.cacheId]
            ws.add_pivot(pivot)

        ws.sheet_state = sheet.state
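
# A minimal usage sketch, not part of the reader itself; "book.xlsx" is a
# hypothetical file name. The public entry point load_workbook() builds an
# ExcelReader whose read() drives read_worksheets() above, so the returned
# workbook already has comments, tables, charts, and images attached.
from openpyxl import load_workbook

wb = load_workbook("book.xlsx")
ws = wb[wb.sheetnames[0]]
print(ws["A1"].value)
if ws["A1"].comment is not None:
    print(ws["A1"].comment.text)
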
def parse(fname, **kwargs):
    num_splits = kwargs.pop("num_splits", None)
    start = kwargs.pop("start", None)
    end = kwargs.pop("end", None)
    _skiprows = kwargs.pop("skiprows")
    excel_header = kwargs.get("_header")
    sheet_name = kwargs.get("sheet_name", 0)
    footer = b"</sheetData></worksheet>"

    # Default to pandas case, where we are not splitting or partitioning
    if start is None or end is None:
        return pandas.read_excel(fname, **kwargs)

    from io import BytesIO
    from zipfile import ZipFile
    from openpyxl import load_workbook
    from openpyxl.worksheet._reader import WorksheetReader
    from openpyxl.reader.excel import ExcelReader
    from openpyxl.worksheet.worksheet import Worksheet
    from pandas.core.dtypes.common import is_list_like
    from pandas.io.excel._util import (
        _fill_mi_header,
        _maybe_convert_usecols,
    )
    from pandas.io.parsers import TextParser
    import re

    wb = load_workbook(filename=fname, read_only=True)
    # Get shared strings
    ex = ExcelReader(fname, read_only=True)
    ex.read_manifest()
    ex.read_strings()
    # Convert sheet index 0 to the sheet's name
    if sheet_name == 0:
        sheet_name = wb.sheetnames[sheet_name]
    # get the worksheet to use with the worksheet reader
    ws = Worksheet(wb)
    # Read the raw data
    with ZipFile(fname) as z:
        with z.open("xl/worksheets/{}.xml".format(sheet_name.lower())) as file:
            file.seek(start)
            bytes_data = file.read(end - start)

    def update_row_nums(match):
        """Update the row numbers to start at 1.

        Note: This is needed because the parser we are using does not scale
        well if the row numbers remain because empty rows are inserted for
        all "missing" rows.

        Parameters
        ----------
        match
            The match from the original `re.sub` looking for row number tags.

        Returns
        -------
        string
            The updated string with new row numbers.
        """
        b = match.group(0)
        return re.sub(
            b"\d+",  # noqa: W605
            lambda c: str(int(c.group(0).decode("utf-8")) - _skiprows).encode(
                "utf-8"),
            b,
        )

    bytes_data = re.sub(b'r="[A-Z]*\d+"', update_row_nums, bytes_data)  # noqa: W605
    bytesio = BytesIO(excel_header + bytes_data + footer)
    # Use openpyxl to read/parse sheet data
    reader = WorksheetReader(ws, bytesio, ex.shared_strings, False)
    # Attach cells to worksheet object
    reader.bind_cells()
    data = PandasExcelParser.get_sheet_data(ws, kwargs.pop("convert_float", True))
    usecols = _maybe_convert_usecols(kwargs.pop("usecols", None))
    header = kwargs.pop("header", 0)
    index_col = kwargs.pop("index_col", None)
    # skiprows is handled externally
    skiprows = None

    # Handle header and create MultiIndex for columns if necessary
    if is_list_like(header) and len(header) == 1:
        header = header[0]
    if header is not None and is_list_like(header):
        control_row = [True] * len(data[0])
        for row in header:
            data[row], control_row = _fill_mi_header(data[row], control_row)
    # Handle MultiIndex for row Index if necessary
    if is_list_like(index_col):
        # Forward fill values for MultiIndex index.
        if not is_list_like(header):
            offset = 1 + header
        else:
            offset = 1 + max(header)
        # Check if dataset is empty
        if offset < len(data):
            for col in index_col:
                last = data[offset][col]
                for row in range(offset + 1, len(data)):
                    if data[row][col] == "" or data[row][col] is None:
                        data[row][col] = last
                    else:
                        last = data[row][col]

    parser = TextParser(
        data,
        header=header,
        index_col=index_col,
        has_index_names=is_list_like(header) and len(header) > 1,
        skiprows=skiprows,
        usecols=usecols,
        **kwargs
    )
    # In Excel, if you create a row with only a border (no values), this
    # parser will interpret that as a row of NaN values. Pandas discards these
    # values, so we also must discard them.
    pandas_df = parser.read().dropna(how="all")
    # Since we know the number of rows that occur before this partition, we
    # can correctly assign the index in cases of RangeIndex. If it is not a
    # RangeIndex, the index is already correct because it came from the data.
    if isinstance(pandas_df.index, pandas.RangeIndex):
        pandas_df.index = pandas.RangeIndex(
            start=_skiprows, stop=len(pandas_df.index) + _skiprows)
    # We return the length if it is a RangeIndex (common case) to reduce
    # serialization cost.
    if index_col is not None:
        index = pandas_df.index
    else:
        # The lengths will become the RangeIndex
        index = len(pandas_df)
    return _split_result_for_readers(1, num_splits, pandas_df) + [
        index,
        pandas_df.dtypes,
    ]
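
# A standalone sketch of the row-renumbering step in parse() above: the same
# regex rewrites the r="..." references in a raw <row> fragment so that a
# partition cut from the middle of the sheet appears to start at row 1. The
# fragment and skip count here are hypothetical.
import re

_skiprows = 4
fragment = b'<row r="5"><c r="A5" t="s"><v>0</v></c></row>'


def _update_row_nums(match):
    b = match.group(0)
    return re.sub(
        rb"\d+",
        lambda c: str(int(c.group(0).decode("utf-8")) - _skiprows).encode("utf-8"),
        b,
    )


print(re.sub(rb'r="[A-Z]*\d+"', _update_row_nums, fragment))
# b'<row r="1"><c r="A1" t="s"><v>0</v></c></row>'
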
def _read(cls, io, **kwargs):
    if (kwargs.get("engine", None) is not None
            and kwargs.get("engine") != "openpyxl"):
        warnings.warn(
            "Modin only implements parallel `read_excel` with `openpyxl` engine, "
            'please specify `engine=None` or `engine="openpyxl"` to '
            "use Modin's parallel implementation.")
        return cls.single_worker_read(io, **kwargs)
    if sys.version_info < (3, 7):
        warnings.warn(
            "Python 3.7 or higher required for parallel `read_excel`.")
        return cls.single_worker_read(io, **kwargs)

    from zipfile import ZipFile
    from openpyxl.worksheet.worksheet import Worksheet
    from openpyxl.worksheet._reader import WorksheetReader
    from openpyxl.reader.excel import ExcelReader
    from modin.backends.pandas.parsers import PandasExcelParser

    sheet_name = kwargs.get("sheet_name", 0)
    if sheet_name is None or isinstance(sheet_name, list):
        warnings.warn(
            "`read_excel` functionality is only implemented for a single sheet at a "
            "time. Multiple sheet reading coming soon!")
        return cls.single_worker_read(io, **kwargs)

    warnings.warn("Parallel `read_excel` is a new feature! Please email "
                  "[email protected] if you run into any problems.")

    # NOTE: ExcelReader() in read-only mode does not close the file handle by
    # itself; work around that by passing a file object if we received a path.
    io_file = open(io, "rb") if isinstance(io, str) else io
    try:
        ex = ExcelReader(io_file, read_only=True)
        ex.read()
        wb = ex.wb

        # Get shared strings
        ex.read_manifest()
        ex.read_strings()
        ws = Worksheet(wb)
    finally:
        if isinstance(io, str):
            # close only if it was us who opened the object
            io_file.close()

    pandas_kw = dict(kwargs)  # preserve original kwargs
    with ZipFile(io) as z:
        from io import BytesIO

        # Convert index to sheet name in file
        if isinstance(sheet_name, int):
            sheet_name = "sheet{}".format(sheet_name + 1)
        else:
            sheet_name = "sheet{}".format(wb.sheetnames.index(sheet_name) + 1)
        if any(sheet_name.lower() in name for name in z.namelist()):
            sheet_name = sheet_name.lower()
        elif any(sheet_name.title() in name for name in z.namelist()):
            sheet_name = sheet_name.title()
        else:
            raise ValueError("Sheet {} not found".format(sheet_name.lower()))
        # Pass this value to the workers
        kwargs["sheet_name"] = sheet_name

        f = z.open("xl/worksheets/{}.xml".format(sheet_name))
        f = BytesIO(f.read())
        total_bytes = cls.file_size(f)
        num_partitions = NPartitions.get()

        # Read some bytes from the sheet so we can extract the XML header and
        # first line. We need to make sure we get the first line of the data
        # as well because that is where the column names are. The header
        # information will be extracted and sent to all of the nodes.
        sheet_block = f.read(EXCEL_READ_BLOCK_SIZE)
        end_of_row_tag = b"</row>"
        while end_of_row_tag not in sheet_block:
            sheet_block += f.read(EXCEL_READ_BLOCK_SIZE)
        idx_of_header_end = sheet_block.index(end_of_row_tag) + len(end_of_row_tag)
        sheet_header = sheet_block[:idx_of_header_end]
        # Reset the file pointer to begin at the end of the header information.
        f.seek(idx_of_header_end)
        kwargs["_header"] = sheet_header
        footer = b"</sheetData></worksheet>"

        # Use openpyxl to parse the data
        reader = WorksheetReader(ws, BytesIO(sheet_header + footer),
                                 ex.shared_strings, False)
        # Attach cells to the worksheet
        reader.bind_cells()
        data = PandasExcelParser.get_sheet_data(
            ws, kwargs.get("convert_float", True))

        # Extract column names from parsed data.
        column_names = pandas.Index(data[0])
        index_col = kwargs.get("index_col", None)
        # Remove column names that are specified as `index_col`
        if index_col is not None:
            column_names = column_names.drop(column_names[index_col])

        if not all(column_names):
            # Some column names are empty; use the pandas reader to take the
            # names from it.
            pandas_kw["nrows"] = 1
            df = pandas.read_excel(io, **pandas_kw)
            column_names = df.columns

        # Compute partition metadata upfront so it is uniform for all partitions
        chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)
        num_splits = min(len(column_names), num_partitions)
        kwargs["fname"] = io
        # Skiprows will be used to inform a partition how many rows come
        # before it.
        kwargs["skiprows"] = 0
        rows_to_skip = 0
        data_ids = []
        index_ids = []
        dtypes_ids = []

        # Compute column metadata
        column_chunksize = compute_chunksize(
            pandas.DataFrame(columns=column_names), num_splits, axis=1)
        if column_chunksize > len(column_names):
            column_widths = [len(column_names)]
            # This prevents us from unnecessarily serializing a bunch of empty
            # objects.
            num_splits = 1
        else:
            column_widths = [
                column_chunksize
                if len(column_names) > (column_chunksize * (i + 1))
                else 0
                if len(column_names) < (column_chunksize * i)
                else len(column_names) - (column_chunksize * i)
                for i in range(num_splits)
            ]
        kwargs["num_splits"] = num_splits

        while f.tell() < total_bytes:
            args = kwargs
            args["skiprows"] = rows_to_skip
            args["start"] = f.tell()
            chunk = f.read(chunk_size)
            # This edge case can happen when we have reached the end of the
            # data but not the end of the file.
            if b"<row" not in chunk:
                break
            row_close_tag = b"</row>"
            row_count = re.subn(row_close_tag, b"", chunk)[1]
            # Make sure we are reading at least one row.
            while row_count == 0:
                chunk += f.read(chunk_size)
                row_count += re.subn(row_close_tag, b"", chunk)[1]
            last_index = chunk.rindex(row_close_tag)
            f.seek(-(len(chunk) - last_index) + len(row_close_tag), 1)
            args["end"] = f.tell()

            # If there is no data, exit before triggering computation.
            if b"</row>" not in chunk and b"</sheetData>" in chunk:
                break
            # We need to make sure we include all rows, even those that have
            # no data. Getting the number of the last row will turn into the
            # number of skipped rows, so if there are any rows missing between
            # the last row seen here and the first row the next partition
            # reads, the parser will have to include those rows in that
            # specific partition to match the expected behavior. We subtract 1
            # here because the header is included in the skip values, and we
            # do not want to skip the header.
            rows_to_skip = (
                int(chunk[:last_index + len(row_close_tag)]
                    .split(b'<row r="')[-1].split(b'"')[0]) - 1)

            remote_results_list = cls.deploy(cls.parse, num_splits + 2, args)
            data_ids.append(remote_results_list[:-2])
            index_ids.append(remote_results_list[-2])
            dtypes_ids.append(remote_results_list[-1])

            # The end of the spreadsheet
            if b"</sheetData>" in chunk:
                break

        # Compute the index based on a sum of the lengths of each partition
        # (by default) or based on the column(s) that were requested.
        if index_col is None:
            row_lengths = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))
        else:
            index_objs = cls.materialize(index_ids)
            row_lengths = [len(o) for o in index_objs]
            new_index = index_objs[0].append(index_objs[1:])

        # Compute dtypes by collecting and combining all of the partitions.
        # The reported dtypes from differing rows can be different based on
        # the inference in the limited data seen by each worker.
        # We use pandas to compute the exact dtype over the whole column for
        # each column. The index is set below.
        dtypes = cls.get_dtypes(dtypes_ids)
        data_ids = cls.build_partition(data_ids, row_lengths, column_widths)
        # Set the index for the dtypes to the column names
        if isinstance(dtypes, pandas.Series):
            dtypes.index = column_names
        else:
            dtypes = pandas.Series(dtypes, index=column_names)
        new_frame = cls.frame_cls(
            data_ids,
            new_index,
            column_names,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        new_query_compiler = cls.query_compiler_cls(new_frame)
        if index_col is None:
            new_query_compiler._modin_frame._apply_index_objs(axis=0)
        return new_query_compiler
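
# A minimal usage sketch, assuming Modin is installed and "data.xlsx" is a
# local file (hypothetical name). With the default engine (or
# engine="openpyxl") and a single sheet_name, _read() above takes the
# parallel path; otherwise it falls back to single_worker_read().
import modin.pandas as pd

df = pd.read_excel("data.xlsx", sheet_name=0)
print(len(df), df.dtypes)
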
def _read(cls, io, **kwargs):
    if (kwargs.get("engine", None) is not None
            and kwargs.get("engine") != "openpyxl"):
        warnings.warn(
            "Modin only implements parallel `read_excel` with `openpyxl` engine, "
            'please specify `engine=None` or `engine="openpyxl"` to '
            "use Modin's parallel implementation.")
        return cls.single_worker_read(io, **kwargs)
    if sys.version_info < (3, 7):
        warnings.warn(
            "Python 3.7 or higher required for parallel `read_excel`.")
        return cls.single_worker_read(io, **kwargs)

    from zipfile import ZipFile
    from openpyxl import load_workbook
    from openpyxl.worksheet.worksheet import Worksheet
    from openpyxl.worksheet._reader import WorksheetReader
    from openpyxl.reader.excel import ExcelReader
    from modin.backends.pandas.parsers import PandasExcelParser

    sheet_name = kwargs.get("sheet_name", 0)
    if sheet_name is None or isinstance(sheet_name, list):
        warnings.warn(
            "`read_excel` functionality is only implemented for a single sheet at a "
            "time. Multiple sheet reading coming soon!")
        return cls.single_worker_read(io, **kwargs)

    warnings.warn("Parallel `read_excel` is a new feature! Please email "
                  "[email protected] if you run into any problems.")

    wb = load_workbook(filename=io, read_only=True)
    # Get shared strings
    ex = ExcelReader(io, read_only=True)
    ex.read_manifest()
    ex.read_strings()
    ws = Worksheet(wb)
    # Convert sheet index 0 to the sheet's name
    if sheet_name == 0:
        sheet_name = wb.sheetnames[sheet_name]

    with ZipFile(io) as z:
        from io import BytesIO

        f = z.open("xl/worksheets/{}.xml".format(sheet_name.lower()))
        f = BytesIO(f.read())
        total_bytes = cls.file_size(f)

        from modin.pandas import DEFAULT_NPARTITIONS
        num_partitions = DEFAULT_NPARTITIONS
        # Read some bytes from the sheet so we can extract the XML header and
        # first line. We need to make sure we get the first line of the data
        # as well because that is where the column names are. The header
        # information will be extracted and sent to all of the nodes.
        sheet_block = f.read(EXCEL_READ_BLOCK_SIZE)
        end_of_row_tag = b"</row>"
        while end_of_row_tag not in sheet_block:
            sheet_block += f.read(EXCEL_READ_BLOCK_SIZE)
        idx_of_header_end = sheet_block.index(end_of_row_tag) + len(end_of_row_tag)
        sheet_header = sheet_block[:idx_of_header_end]
        # Reset the file pointer to begin at the end of the header information.
        f.seek(idx_of_header_end)
        kwargs["_header"] = sheet_header
        footer = b"</sheetData></worksheet>"

        # Use openpyxl to parse the data
        reader = WorksheetReader(ws, BytesIO(sheet_header + footer),
                                 ex.shared_strings, False)
        # Attach cells to the worksheet
        reader.bind_cells()
        data = PandasExcelParser.get_sheet_data(
            ws, kwargs.get("convert_float", True))

        # Extract column names from parsed data.
        column_names = pandas.Index(data[0])
        index_col = kwargs.get("index_col", None)
        # Remove column names that are specified as `index_col`
        if index_col is not None:
            column_names = column_names.drop(column_names[index_col])

        # Compute partition metadata upfront so it is uniform for all partitions
        chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)
        num_splits = min(len(column_names), num_partitions)
        kwargs["fname"] = io
        # Skiprows will be used to inform a partition how many rows come
        # before it.
        kwargs["skiprows"] = 0
        row_count = 0
        data_ids = []
        index_ids = []
        dtypes_ids = []

        # Compute column metadata
        column_chunksize = compute_chunksize(
            pandas.DataFrame(columns=column_names), num_splits, axis=1)
        if column_chunksize > len(column_names):
            column_widths = [len(column_names)]
            # This prevents us from unnecessarily serializing a bunch of empty
            # objects.
            num_splits = 1
        else:
            column_widths = [
                column_chunksize
                if len(column_names) > (column_chunksize * (i + 1))
                else 0
                if len(column_names) < (column_chunksize * i)
                else len(column_names) - (column_chunksize * i)
                for i in range(num_splits)
            ]
        kwargs["num_splits"] = num_splits

        while f.tell() < total_bytes:
            args = kwargs
            args["skiprows"] = row_count + args["skiprows"]
            args["start"] = f.tell()
            chunk = f.read(chunk_size)
            # This edge case can happen when we have reached the end of the
            # data but not the end of the file.
            if b"<row" not in chunk:
                break
            row_close_tag = b"</row>"
            row_count = re.subn(row_close_tag, b"", chunk)[1]
            # Make sure we are reading at least one row.
            while row_count == 0:
                chunk += f.read(chunk_size)
                row_count += re.subn(row_close_tag, b"", chunk)[1]
            last_index = chunk.rindex(row_close_tag)
            f.seek(-(len(chunk) - last_index) + len(row_close_tag), 1)
            args["end"] = f.tell()

            # If there is no data, exit before triggering computation.
            if b"</row>" not in chunk and b"</sheetData>" in chunk:
                break
            remote_results_list = cls.deploy(cls.parse, num_splits + 2, args)
            data_ids.append(remote_results_list[:-2])
            index_ids.append(remote_results_list[-2])
            dtypes_ids.append(remote_results_list[-1])

            # The end of the spreadsheet
            if b"</sheetData>" in chunk:
                break

        # Compute the index based on a sum of the lengths of each partition
        # (by default) or based on the column(s) that were requested.
        if index_col is None:
            row_lengths = cls.materialize(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))
        else:
            index_objs = cls.materialize(index_ids)
            row_lengths = [len(o) for o in index_objs]
            new_index = index_objs[0].append(index_objs[1:])

        # Compute dtypes by collecting and combining all of the partitions.
        # The reported dtypes from differing rows can be different based on
        # the inference in the limited data seen by each worker. We use pandas
        # to compute the exact dtype over the whole column for each column.
        # The index is set below.
        dtypes = cls.get_dtypes(dtypes_ids)
        data_ids = cls.build_partition(data_ids, row_lengths, column_widths)
        # Set the index for the dtypes to the column names
        if isinstance(dtypes, pandas.Series):
            dtypes.index = column_names
        else:
            dtypes = pandas.Series(dtypes, index=column_names)
        new_frame = cls.frame_cls(
            data_ids,
            new_index,
            column_names,
            row_lengths,
            column_widths,
            dtypes=dtypes,
        )
        return cls.query_compiler_cls(new_frame)
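
# A standalone sketch of the chunk-boundary logic used in both _read()
# variants above: count complete rows in a chunk with re.subn, then trim back
# to the last closing </row> tag so no partition is handed a partial row.
# The chunk bytes here are hypothetical.
import re

chunk = (b'<row r="1"><c r="A1"/></row>'
         b'<row r="2"><c r="A2"/></row>'
         b'<row r="3"><c ')  # the third row is cut off mid-tag
row_close_tag = b"</row>"

row_count = re.subn(row_close_tag, b"", chunk)[1]
last_index = chunk.rindex(row_close_tag)
complete_rows = chunk[:last_index + len(row_close_tag)]
print(row_count)      # 2 -- only complete rows are counted
print(complete_rows)  # ends exactly at the second </row>
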
def read_worksheets(self):
    comment_warning = """Cell '{0}':{1} is part of a merged range but has a comment which will be removed because merged cells cannot contain any data."""
    for sheet, rel in self.parser.find_sheets():
        if rel.target not in self.valid_files:
            continue

        if "chartsheet" in rel.Type:
            self.read_chartsheet(sheet, rel)
            continue

        rels_path = get_rels_path(rel.target)
        rels = RelationshipList()
        if rels_path in self.valid_files:
            rels = get_dependents(self.archive, rels_path)

        if self.read_only:
            ws = ReadOnlyWorksheet(self.wb, sheet.name, rel.target,
                                   self.shared_strings)
            self.wb._sheets.append(ws)
            continue
        else:
            fh = self.archive.open(rel.target)
            ws = self.wb.create_sheet(sheet.name)
            ws._rels = rels
            ws_parser = WorksheetReader(ws, fh, self.shared_strings,
                                        self.data_only)
            ws_parser.bind_all()

        # assign any comments to cells
        for r in rels.find(COMMENTS_NS):
            src = self.archive.read(r.target)
            comment_sheet = CommentSheet.from_tree(fromstring(src))
            for ref, comment in comment_sheet.comments:
                try:
                    ws[ref].comment = comment
                except AttributeError:
                    c = ws[ref]
                    if isinstance(c, MergedCell):
                        warnings.warn(
                            comment_warning.format(ws.title, c.coordinate))
                        continue

        # preserve link to VML file if VBA
        if self.wb.vba_archive and ws.legacy_drawing:
            ws.legacy_drawing = rels[ws.legacy_drawing].target

        for t in ws_parser.tables:
            src = self.archive.read(t)
            xml = fromstring(src)
            table = Table.from_tree(xml)
            ws.add_table(table)

        drawings = rels.find(SpreadsheetDrawing._rel_type)
        for rel in drawings:
            charts, images = find_images(self.archive, rel.target)
            for c in charts:
                ws.add_chart(c, c.anchor)
            for im in images:
                ws.add_image(im, im.anchor)

        pivot_rel = rels.find(TableDefinition.rel_type)
        for r in pivot_rel:
            pivot_path = r.Target
            src = self.archive.read(pivot_path)
            tree = fromstring(src)
            pivot = TableDefinition.from_tree(tree)
            pivot.cache = self.parser.pivot_caches[pivot.cacheId]
            ws.add_pivot(pivot)

        ws.sheet_state = sheet.state
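
# A minimal sketch of why the MergedCell guard above is needed; the workbook
# here is built in memory, nothing is read from disk. After a merge, only the
# top-left anchor remains a regular Cell that can carry data or a comment;
# the other cells in the range become MergedCell instances.
from openpyxl import Workbook
from openpyxl.cell.cell import MergedCell

wb = Workbook()
ws = wb.active
ws.merge_cells("A1:B2")

print(type(ws["A1"]).__name__)           # Cell
print(isinstance(ws["B2"], MergedCell))  # True
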