def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
    """
    Convert an openpyxl worksheet into a rectangular list of rows.

    Parameters
    ----------
    sheet : openpyxl worksheet
        Sheet whose ``rows`` are iterated. May be a regular or a read-only
        worksheet.
    convert_float : bool
        Forwarded unchanged to ``self._convert_cell`` for every cell.

    Returns
    -------
    list of list of Scalar
        One inner list per sheet row. When the sheet dimensions were reset,
        shorter rows are right-padded with ``""`` so that all rows have the
        same length.
    """
    # GH 39001
    # Reading of excel file depends on dimension data being correct but
    # writers sometimes omit or get it wrong
    import openpyxl

    version = LooseVersion(get_version(openpyxl))

    # Not every sheet object provides reset_dimensions(); calling it
    # unconditionally raises AttributeError on sheets that lack it, so the
    # presence of the method is used to decide whether a reset is possible.
    is_readonly = hasattr(sheet, "reset_dimensions")
    dimensions_reset = version >= "3.0.0" and is_readonly
    if dimensions_reset:
        sheet.reset_dimensions()

    data: List[List[Scalar]] = [
        [self._convert_cell(cell, convert_float) for cell in row]
        for row in sheet.rows
    ]

    if dimensions_reset and data:
        # With dimension reset, openpyxl no longer pads rows
        max_width = max(len(data_row) for data_row in data)
        if min(len(data_row) for data_row in data) < max_width:
            empty_cell: List[Scalar] = [""]
            data = [
                data_row + (max_width - len(data_row)) * empty_cell
                for data_row in data
            ]

    return data
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
    """
    Read all cells of an openpyxl worksheet into equally long rows.

    Parameters
    ----------
    sheet : openpyxl worksheet
        Sheet whose ``rows`` are iterated.
    convert_float : bool
        Passed to ``self._convert_cell`` together with each cell.

    Returns
    -------
    list of list of Scalar
        Converted cell values, one inner list per row. If the dimensions
        were reset, short rows are padded on the right with ``""``.
    """
    # GH 39001
    # Reading of excel file depends on dimension data being correct but
    # writers sometimes omit or get it wrong
    import openpyxl

    openpyxl_version = LooseVersion(get_version(openpyxl))

    # There is no good way of determining if a sheet is read-only
    # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605
    has_reset = hasattr(sheet, "reset_dimensions")
    reset_dimensions = openpyxl_version >= "3.0.0" and has_reset
    if reset_dimensions:
        sheet.reset_dimensions()

    rows: List[List[Scalar]] = [
        [self._convert_cell(cell, convert_float) for cell in sheet_row]
        for sheet_row in sheet.rows
    ]

    # Nothing to pad when the dimensions were left alone or the sheet is empty
    if not (reset_dimensions and rows):
        return rows

    # With dimension reset, openpyxl no longer pads rows
    widths = [len(row) for row in rows]
    widest = max(widths)
    if min(widths) < widest:
        filler: List[Scalar] = [""]
        rows = [row + filler * (widest - len(row)) for row in rows]
    return rows
def test_read_with_bad_dimension(
    datapath, ext, header, expected_data, filename, read_only, request
):
    """
    Files with missing or incorrect dimension information are read in full.

    With ``read_only=None`` the file is read directly from its path;
    otherwise a workbook is opened via ``openpyxl.load_workbook`` with the
    given ``read_only`` flag and passed to ``pd.read_excel``.
    """
    # GH 38956, 39001 - no/incorrect dimension information
    version = LooseVersion(get_version(openpyxl))
    if (read_only or read_only is None) and version < "3.0.0":
        msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
        request.node.add_marker(pytest.mark.xfail(reason=msg))

    path = datapath("io", "data", "excel", f"{filename}{ext}")
    if read_only is None:
        result = pd.read_excel(path, header=header)
    else:
        wb = openpyxl.load_workbook(path, read_only=read_only)
        try:
            result = pd.read_excel(wb, engine="openpyxl", header=header)
        finally:
            # Close the workbook even if reading raises, so a failing test
            # does not leave it open.
            wb.close()

    expected = DataFrame(expected_data)
    tm.assert_frame_equal(result, expected)
def test_read_with_empty_trailing_rows(datapath, ext, read_only, request):
    """
    Read the ``empty_trailing_rows`` fixture and compare with the expected frame.

    With ``read_only=None`` the file is read directly from its path;
    otherwise a workbook is opened via ``openpyxl.load_workbook`` with the
    given ``read_only`` flag and passed to ``pd.read_excel``.
    """
    # GH 39181
    version = LooseVersion(get_version(openpyxl))
    if (read_only or read_only is None) and version < "3.0.0":
        msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
        request.node.add_marker(pytest.mark.xfail(reason=msg))

    path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}")
    if read_only is None:
        result = pd.read_excel(path)
    else:
        wb = openpyxl.load_workbook(path, read_only=read_only)
        try:
            result = pd.read_excel(wb, engine="openpyxl")
        finally:
            # Close the workbook even if reading raises, so a failing test
            # does not leave it open.
            wb.close()

    expected = DataFrame(
        {
            "Title": [np.nan, "A", 1, 2, 3],
            "Unnamed: 1": [np.nan, "B", 4, 5, 6],
            "Unnamed: 2": [np.nan, "C", 7, 8, 9],
        }
    )
    tm.assert_frame_equal(result, expected)
def _get_dependency_info() -> Dict[str, JSONSerializable]:
    """
    Collect dependency versions as a JSON serializable dictionary.

    Returns
    -------
    dict
        Maps every module name to ``get_version(module)``, or to ``None``
        when ``import_optional_dependency`` returns a falsy value for it.
    """
    module_names = [
        "pandas",
        # required
        "numpy",
        "pytz",
        "dateutil",
        # install / build,
        "pip",
        "setuptools",
        "Cython",
        # test
        "pytest",
        "hypothesis",
        # docs
        "sphinx",
        # Other, need a min version
        "blosc",
        "feather",
        "xlsxwriter",
        "lxml.etree",
        "html5lib",
        "pymysql",
        "psycopg2",
        "jinja2",
        # Other, not imported.
        "IPython",
        "pandas_datareader",
    ]
    # Append every name listed in VERSIONS after the fixed entries
    module_names += list(VERSIONS)

    versions: Dict[str, JSONSerializable] = {}
    for name in module_names:
        module = import_optional_dependency(
            name, raise_on_missing=False, on_version="ignore"
        )
        if module:
            versions[name] = get_version(module)
        else:
            versions[name] = None
    return versions
"header, expected_data", [ ( 0, { "Title": [np.nan, "A", 1, 2, 3], "Unnamed: 1": [np.nan, "B", 4, 5, 6], "Unnamed: 2": [np.nan, "C", 7, 8, 9], }, ), (2, { "A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9] }), ], ) @pytest.mark.parametrize( "filename", ["dimension_missing", "dimension_small", "dimension_large"]) @pytest.mark.xfail( LooseVersion(get_version(openpyxl)) < "3.0.0", reason="openpyxl read-only sheet is incorrect when dimension data is wrong", ) def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename): # GH 38956, 39001 - no/incorrect dimension information path = datapath("io", "data", "excel", f"{filename}{ext}") result = pd.read_excel(path, header=header) expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected)
def __init__(
    self, path_or_buffer, engine=None, storage_options: StorageOptions = None
):
    """
    Select a reader engine for ``path_or_buffer`` and instantiate it.

    Parameters
    ----------
    path_or_buffer
        Stored as ``self.io``; its ``stringify_path`` result is stored as
        ``self._io`` and handed to the reader.
    engine : optional
        Key of ``self._engines``. When None, it is looked up from the
        ``io.excel.<ext>.reader`` option, falling back to
        ``get_default_engine`` if that option is ``"auto"``.
    storage_options : StorageOptions, optional
        Forwarded to ``inspect_excel_format`` and to the reader.

    Raises
    ------
    ValueError
        If ``engine`` is not in ``self._engines``, if the file format cannot
        be determined while no engine was given, or if the xlrd engine is
        selected for a non-xls file with xlrd >= 2.
    """
    if engine is not None and engine not in self._engines:
        raise ValueError(f"Unknown engine: {engine}")

    # Could be a str, ExcelFile, Book, etc.
    self.io = path_or_buffer
    # Always a string
    self._io = stringify_path(path_or_buffer)

    # Determine xlrd version if installed
    xlrd_version = None
    if import_optional_dependency("xlrd", errors="ignore") is not None:
        import xlrd

        xlrd_version = Version(get_version(xlrd))
    have_xlrd = xlrd_version is not None

    ext = None
    if engine is None:
        # Only determine ext if it is needed
        if have_xlrd and isinstance(path_or_buffer, xlrd.Book):
            ext = "xls"
        else:
            ext = inspect_excel_format(
                content_or_path=path_or_buffer, storage_options=storage_options
            )
            if ext is None:
                raise ValueError(
                    "Excel file format cannot be determined, you must specify "
                    "an engine manually."
                )

        engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
        if engine == "auto":
            engine = get_default_engine(ext, mode="reader")

    if engine == "xlrd" and have_xlrd:
        if ext is None:
            # Need ext to determine ext in order to raise/warn
            if isinstance(path_or_buffer, xlrd.Book):
                ext = "xls"
            else:
                ext = inspect_excel_format(
                    path_or_buffer, storage_options=storage_options
                )

        # Pass through if ext is None, otherwise check if ext valid for xlrd
        if ext and ext != "xls":
            if xlrd_version >= Version("2"):
                raise ValueError(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install openpyxl instead."
                )
            level = find_stack_level()
            warnings.warn(
                f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                f"only the xls format is supported. Install "
                f"openpyxl instead.",
                FutureWarning,
                stacklevel=level,
            )

    self.engine = engine
    self.storage_options = storage_options
    reader_cls = self._engines[engine]
    self._reader = reader_cls(self._io, storage_options=storage_options)
from distutils.version import LooseVersion

import pytest

from pandas.compat._optional import get_version, import_optional_dependency

# Warning filters applied to every test in this module
pytestmark = [
    pytest.mark.filterwarnings(warning_filter)
    for warning_filter in (
        # Looks like tree.getiterator is deprecated in favor of tree.iter
        "ignore:This method will be removed in future versions:"
        "PendingDeprecationWarning",
        "ignore:This method will be removed in future versions:DeprecationWarning",
        # GH 26552
        "ignore:As the xlwt package is no longer maintained:FutureWarning",
        # GH 38571
        "ignore:.*In xlrd >= 2.0, only the xls format is supported:FutureWarning",
    )
]

# Version of xlrd, or None when import_optional_dependency returns None for it
xlrd_version = None
if import_optional_dependency("xlrd", errors="ignore") is not None:
    import xlrd

    xlrd_version = LooseVersion(get_version(xlrd))
def __init__(
    self, path_or_buffer, engine=None, storage_options: StorageOptions = None
):
    """
    Select a reader engine for ``path_or_buffer`` and instantiate it.

    Parameters
    ----------
    path_or_buffer
        Stored as ``self.io``; its ``stringify_path`` result is stored as
        ``self._io`` and handed to the reader.
    engine : optional
        Key of ``self._engines``. When None, ``"odf"`` is used for ods
        files, ``"xlrd"`` for xls files, and otherwise ``"openpyxl"`` if it
        can be imported, else ``"xlrd"``.
    storage_options : StorageOptions, optional
        Forwarded to ``inspect_excel_format`` and to the reader.

    Raises
    ------
    ValueError
        If ``engine`` is not in ``self._engines``, or if the xlrd engine is
        selected for a non-xls file with xlrd >= 2.
    """
    if engine is not None and engine not in self._engines:
        raise ValueError(f"Unknown engine: {engine}")

    # Could be a str, ExcelFile, Book, etc.
    self.io = path_or_buffer
    # Always a string
    self._io = stringify_path(path_or_buffer)

    # Determine xlrd version if installed
    xlrd_version = None
    xlrd_module = import_optional_dependency(
        "xlrd", raise_on_missing=False, on_version="ignore"
    )
    if xlrd_module is not None:
        import xlrd

        xlrd_version = LooseVersion(get_version(xlrd))

    ext = None
    if engine is None:
        # Only determine ext if it is needed
        if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
            ext = "xls"
        else:
            ext = inspect_excel_format(
                content=path_or_buffer, storage_options=storage_options
            )

        if ext == "ods":
            engine = "odf"
        elif ext == "xls":
            engine = "xlrd"
        elif (
            import_optional_dependency(
                "openpyxl", raise_on_missing=False, on_version="ignore"
            )
            is not None
        ):
            # GH 35029 - Prefer openpyxl except for xls files
            engine = "openpyxl"
        else:
            engine = "xlrd"

    if engine == "xlrd" and xlrd_version is not None:
        if ext is None:
            # Need ext to determine ext in order to raise/warn
            if isinstance(path_or_buffer, xlrd.Book):
                ext = "xls"
            else:
                ext = inspect_excel_format(
                    content=path_or_buffer, storage_options=storage_options
                )

        if ext != "xls":
            if xlrd_version >= "2":
                raise ValueError(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install openpyxl instead."
                )

            # The frame lookup must stay in this method: index 1 is the
            # direct caller of __init__.
            caller = inspect.stack()[1]
            called_from_read_excel = (
                caller.filename.endswith(
                    os.path.join("pandas", "io", "excel", "_base.py")
                )
                and caller.function == "read_excel"
            )
            warnings.warn(
                f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                f"only the xls format is supported. As a result, the "
                f"openpyxl engine will be used if it is installed and the "
                f"engine argument is not specified. Install "
                f"openpyxl instead.",
                FutureWarning,
                stacklevel=4 if called_from_read_excel else 2,
            )

    assert engine in self._engines, f"Engine {engine} not recognized"

    self.engine = engine
    self.storage_options = storage_options
    reader_cls = self._engines[engine]
    self._reader = reader_cls(self._io, storage_options=storage_options)
def __init__(
    self, path_or_buffer, engine=None, storage_options: StorageOptions = None
):
    """
    Select a reader engine for ``path_or_buffer`` and instantiate it.

    Parameters
    ----------
    path_or_buffer
        Stored as ``self.io``; its ``stringify_path`` result is stored as
        ``self._io`` and handed to the reader.
    engine : optional
        Key of ``self._engines``. When None, it is looked up from the
        ``io.excel.<ext>.reader`` option, falling back to
        ``get_default_engine`` if that option is ``"auto"``.
    storage_options : StorageOptions, optional
        Forwarded to ``inspect_excel_format`` and to the reader.

    Raises
    ------
    ValueError
        If ``engine`` is not in ``self._engines``, or if the xlrd engine is
        selected for a non-xls file with xlrd >= 2.
    """
    if engine is not None and engine not in self._engines:
        raise ValueError(f"Unknown engine: {engine}")

    # Could be a str, ExcelFile, Book, etc.
    self.io = path_or_buffer
    # Always a string
    self._io = stringify_path(path_or_buffer)

    # Determine xlrd version if installed
    xlrd_version = None
    if import_optional_dependency("xlrd", errors="ignore") is not None:
        import xlrd

        xlrd_version = LooseVersion(get_version(xlrd))

    ext = None
    if engine is None:
        # Only determine ext if it is needed
        if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
            ext = "xls"
        else:
            ext = inspect_excel_format(
                content_or_path=path_or_buffer, storage_options=storage_options
            )

        # ext will always be valid, otherwise inspect_excel_format would raise
        engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
        if engine == "auto":
            engine = get_default_engine(ext, mode="reader")

    if engine == "xlrd" and xlrd_version is not None:
        if ext is None:
            # Need ext to determine ext in order to raise/warn
            if isinstance(path_or_buffer, xlrd.Book):
                ext = "xls"
            else:
                ext = inspect_excel_format(
                    path_or_buffer, storage_options=storage_options
                )

        if ext != "xls":
            if xlrd_version >= "2":
                raise ValueError(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install openpyxl instead."
                )

            # The frame lookup must stay in this method: index 1 is the
            # direct caller of __init__.
            caller = inspect.stack()[1]
            called_from_read_excel = (
                caller.filename.endswith(
                    os.path.join("pandas", "io", "excel", "_base.py")
                )
                and caller.function == "read_excel"
            )
            warnings.warn(
                f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                f"only the xls format is supported. Install "
                f"openpyxl instead.",
                FutureWarning,
                stacklevel=4 if called_from_read_excel else 2,
            )

    self.engine = engine
    self.storage_options = storage_options
    reader_cls = self._engines[engine]
    self._reader = reader_cls(self._io, storage_options=storage_options)
from pandas.compat._optional import (
    get_version,
    import_optional_dependency,
)
from pandas.util.version import Version

# Warning filters applied to every test in this module
pytestmark = [
    pytest.mark.filterwarnings(warning_filter)
    for warning_filter in (
        # Looks like tree.getiterator is deprecated in favor of tree.iter
        "ignore:This method will be removed in future versions:"
        "PendingDeprecationWarning",
        "ignore:This method will be removed in future versions:DeprecationWarning",
        # GH 26552
        "ignore:As the xlwt package is no longer maintained:FutureWarning",
        # GH 38571
        "ignore:.*In xlrd >= 2.0, only the xls format is supported:FutureWarning",
    )
]

# Version of xlrd, or None when import_optional_dependency returns None for it
xlrd_version = None
if import_optional_dependency("xlrd", errors="ignore") is not None:
    import xlrd

    xlrd_version = Version(get_version(xlrd))