Ejemplo n.º 1
0
class TestExcelWriterEngineTests:
    """Tests for how ``ExcelWriter`` selects and registers writer engines."""

    @pytest.mark.parametrize('klass,ext', [
        pytest.param(
            _XlsxWriter, '.xlsx',
            marks=pytest.mark.skipif(not td.safe_import('xlsxwriter'),
                                     reason='No xlsxwriter')),
        pytest.param(
            _OpenpyxlWriter, '.xlsx',
            marks=pytest.mark.skipif(not td.safe_import('openpyxl'),
                                     reason='No openpyxl')),
        pytest.param(
            _XlwtWriter, '.xls',
            marks=pytest.mark.skipif(not td.safe_import('xlwt'),
                                     reason='No xlwt')),
    ])
    def test_ExcelWriter_dispatch(self, klass, ext):
        """``ExcelWriter(path)`` is an instance of the expected writer class."""
        with ensure_clean(ext) as path:
            writer = ExcelWriter(path)
            # xlsxwriter has preference over openpyxl if both installed
            xlsxwriter_wins = ext == '.xlsx' and td.safe_import('xlsxwriter')
            expected_cls = _XlsxWriter if xlsxwriter_wins else klass
            assert isinstance(writer, expected_cls)

    def test_ExcelWriter_dispatch_raises(self):
        """Building a writer for the path 'nothing' raises a 'No engine' error."""
        with pytest.raises(ValueError, match='No engine'):
            ExcelWriter('nothing')

    def test_register_writer(self):
        """A registered writer class is picked up by ``ExcelWriter`` and
        ``DataFrame.to_excel``, and its ``save``/``write_cells`` are called."""
        # some awkward mocking to test out dispatch and such actually works
        save_calls = []
        write_cells_calls = []

        class DummyClass(ExcelWriter):
            called_save = False
            called_write_cells = False
            supported_extensions = ['xlsx', 'xls']
            engine = 'dummy'

            def save(self):
                save_calls.append(True)

            def write_cells(self, *args, **kwargs):
                write_cells_calls.append(True)

        def run_and_verify(action):
            # Run ``action``, require that both hooks fired at least once,
            # then reset the call logs for the next check.
            action()
            assert len(save_calls) > 0
            assert len(write_cells_calls) > 0
            save_calls.clear()
            write_cells_calls.clear()

        with pd.option_context('io.excel.xlsx.writer', 'dummy'):
            register_writer(DummyClass)
            writer = ExcelWriter('something.xlsx')
            assert isinstance(writer, DummyClass)
            frame = tm.makeCustomDataframe(1, 1)
            run_and_verify(lambda: frame.to_excel('something.xlsx'))
            run_and_verify(
                lambda: frame.to_excel('something.xls', engine='dummy'))
            check_called(lambda: df.to_excel('something.xls', engine='dummy'))
Ejemplo n.º 2
0
def test_safe_import(monkeypatch):
    """``td.safe_import`` is falsy for a missing or too-old module and truthy
    once a new-enough module is present in ``sys.modules``."""
    import sys
    import types

    # A module that does not exist, and one that exists but is too old.
    assert not td.safe_import("foo")
    assert not td.safe_import("pandas", min_version="99.99.99")

    # Create dummy module to be imported
    dummy_name = "hello123"
    dummy = types.ModuleType(dummy_name)
    dummy.__version__ = "1.5"

    # Not importable until it has been placed in sys.modules.
    assert not td.safe_import(dummy_name)
    monkeypatch.setitem(sys.modules, dummy_name, dummy)

    # The dummy reports version 1.5: below 2.0, at least 1.0.
    assert not td.safe_import(dummy_name, min_version="2.0")
    assert td.safe_import(dummy_name, min_version="1.0")
Ejemplo n.º 3
0
def test_safe_import(monkeypatch):
    """``td.safe_import`` is falsy for a missing or too-old module and truthy
    once a new-enough module is present in ``sys.modules``."""
    import sys
    import types

    # A module that does not exist, and one that exists but is too old.
    assert not td.safe_import("foo")
    assert not td.safe_import("pandas", min_version="99.99.99")

    # Create dummy module to be imported
    fake_name = "hello123"
    fake_module = types.ModuleType(fake_name)
    fake_module.__version__ = "1.5"

    # Not importable until it has been placed in sys.modules.
    assert not td.safe_import(fake_name)
    monkeypatch.setitem(sys.modules, fake_name, fake_module)

    # The dummy reports version 1.5: below 2.0, at least 1.0.
    assert not td.safe_import(fake_name, min_version="2.0")
    assert td.safe_import(fake_name, min_version="1.0")
Ejemplo n.º 4
0
 def test_ExcelWriter_dispatch(self, klass, ext):
     """``ExcelWriter(path)`` is an instance of the expected writer class."""
     with ensure_clean(ext) as path:
         writer = ExcelWriter(path)
         # xlsxwriter has preference over openpyxl if both installed
         xlsxwriter_wins = ext == '.xlsx' and td.safe_import('xlsxwriter')
         expected_cls = _XlsxWriter if xlsxwriter_wins else klass
         assert isinstance(writer, expected_cls)
Ejemplo n.º 5
0
 def test_ExcelWriter_dispatch(self, klass, ext):
     """``ExcelWriter(path)`` is an instance of the expected writer class."""
     with tm.ensure_clean(ext) as path:
         writer = ExcelWriter(path)
         # xlsxwriter has preference over openpyxl if both installed
         xlsxwriter_wins = ext == ".xlsx" and td.safe_import("xlsxwriter")
         expected_cls = _XlsxWriter if xlsxwriter_wins else klass
         assert isinstance(writer, expected_cls)
Ejemplo n.º 6
0
def test_safe_import_dummy(monkeypatch, min_version, valid):
    """Check ``td.safe_import`` against a fake module reporting version "1.5".

    The fake module is only placed in ``sys.modules`` when ``min_version`` is
    not None; ``valid`` states whether the import is expected to be truthy.
    """
    dummy_name = "hello123"
    dummy = types.ModuleType(dummy_name)
    dummy.__version__ = "1.5"

    if min_version is not None:
        monkeypatch.setitem(sys.modules, dummy_name, dummy)

    imported = td.safe_import(dummy_name, min_version=min_version)
    if valid:
        assert imported
    else:
        assert not imported
Ejemplo n.º 7
0
def test_safe_import_dummy(monkeypatch, min_version, valid):
    """Check ``td.safe_import`` against a fake module reporting version "1.5".

    The fake module is only placed in ``sys.modules`` when ``min_version`` is
    not None; ``valid`` states whether the import is expected to be truthy.
    """
    fake_name = "hello123"
    fake_module = types.ModuleType(fake_name)
    fake_module.__version__ = "1.5"

    if min_version is not None:
        monkeypatch.setitem(sys.modules, fake_name, fake_module)

    outcome = td.safe_import(fake_name, min_version=min_version)
    if valid:
        assert outcome
    else:
        assert not outcome
Ejemplo n.º 8
0
    def test_thousands_macau_index_col(self, datapath, request):
        """The second-to-last table parsed from macau.html has no NaN values."""
        # https://github.com/pandas-dev/pandas/issues/29622
        # This tests fails for bs4 >= 4.8.0 - so handle xfail accordingly
        uses_bs4 = self.read_html.keywords.get("flavor") == "bs4"
        if uses_bs4 and td.safe_import("bs4", "4.8.0"):
            xfail_mark = pytest.mark.xfail(
                reason="fails for bs4 version >= 4.8.0")
            request.node.add_marker(xfail_mark)

        macau_path = datapath("io", "data", "html", "macau.html")
        tables = self.read_html(macau_path, index_col=0, header=0)
        table_position = -2
        table = tables[table_position]

        assert not any(column.isna().any() for _, column in table.items())
Ejemplo n.º 9
0
def check_for_file_leaks():
    """
    Yield once and, when psutil is importable, assert that the current
    process has the same open files after the yield as before it.

    Without psutil the check is skipped and the function only yields.

    See also
    --------
    _test_decorators.check_file_leaks
    """
    # GH#30162
    psutil = td.safe_import("psutil")
    if not psutil:
        yield
        return

    process = psutil.Process()
    open_before = process.open_files()
    yield
    open_after = process.open_files()
    assert open_before == open_after
Ejemplo n.º 10
0
def test_safe_import_exists():
    """``td.safe_import`` returns something truthy for "pandas"."""
    imported = td.safe_import("pandas")
    assert imported
Ejemplo n.º 11
0
def test_safe_import_versions(min_version, valid):
    """``td.safe_import("qq_pandas", min_version=...)`` is truthy when
    ``valid`` is truthy and falsy otherwise."""
    imported = td.safe_import("qq_pandas", min_version=min_version)
    if valid:
        assert imported
    else:
        assert not imported
Ejemplo n.º 12
0
def test_safe_import_exists():
    """``td.safe_import`` returns something truthy for "qq_pandas"."""
    imported = td.safe_import("qq_pandas")
    assert imported
Ejemplo n.º 13
0
        This fixture will run as part of each test method defined in the
        class and any subclasses, on account of the `autouse=True`
        argument
        """
        option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.'))
        prev_engine = get_option(option_name)
        set_option(option_name, engine)
        with ensure_clean(ext) as path:
            self.path = path
            yield
        set_option(option_name, prev_engine)  # Roll back option change


@pytest.mark.parametrize("engine,ext", [
    pytest.param('openpyxl', '.xlsx', marks=pytest.mark.skipif(
        not td.safe_import('openpyxl'), reason='No openpyxl')),
    pytest.param('openpyxl', '.xlsm', marks=pytest.mark.skipif(
        not td.safe_import('openpyxl'), reason='No openpyxl')),
    pytest.param('xlwt', '.xls', marks=pytest.mark.skipif(
        not td.safe_import('xlwt'), reason='No xlwt')),
    pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif(
        not td.safe_import('xlsxwriter'), reason='No xlsxwriter'))
])
class TestExcelWriter(_WriterBase):
    # Base class for test cases to run with different Excel writers.

    def test_excel_sheet_size(self, engine, ext):

        # GH 26080
        breaking_row_count = 2**20 + 1
        breaking_col_count = 2**14 + 1
Ejemplo n.º 14
0
        class and any subclasses, on account of the `autouse=True`
        argument
        """
        option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.'))
        prev_engine = get_option(option_name)
        set_option(option_name, engine)
        with ensure_clean(ext) as path:
            self.path = path
            yield
        set_option(option_name, prev_engine)  # Roll back option change


@pytest.mark.parametrize("engine,ext", [
    pytest.param('openpyxl',
                 '.xlsx',
                 marks=pytest.mark.skipif(not td.safe_import('openpyxl'),
                                          reason='No openpyxl')),
    pytest.param('openpyxl',
                 '.xlsm',
                 marks=pytest.mark.skipif(not td.safe_import('openpyxl'),
                                          reason='No openpyxl')),
    pytest.param('xlwt',
                 '.xls',
                 marks=pytest.mark.skipif(not td.safe_import('xlwt'),
                                          reason='No xlwt')),
    pytest.param('xlsxwriter',
                 '.xlsx',
                 marks=pytest.mark.skipif(not td.safe_import('xlsxwriter'),
                                          reason='No xlsxwriter'))
])
class TestExcelWriter(_WriterBase):
Ejemplo n.º 15
0
    with mock.patch('gcsfs.GCSFileSystem') as MockFileSystem:
        s = StringIO()
        instance = MockFileSystem.return_value
        instance.open.return_value = s

        df1.to_csv('gs://test/test.csv', index=True)
        df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)

    assert_frame_equal(df1, df2)


@td.skip_if_no('gcsfs')
def test_gcs_get_filepath_or_buffer(mock):
    """``read_csv`` on a gs:// URL goes through the patched
    ``pandas.io.gcs.get_filepath_or_buffer`` and round-trips the frame."""
    expected = DataFrame({
        'int': [1, 3],
        'float': [2.0, np.nan],
        'str': ['t', 's'],
        'dt': date_range('2018-06-18', periods=2),
    })
    target = 'pandas.io.gcs.get_filepath_or_buffer'
    with mock.patch(target) as patched_get_filepath:
        csv_buffer = StringIO(expected.to_csv(index=False))
        patched_get_filepath.return_value = (csv_buffer, None, None, False)
        result = read_csv('gs://test/test.csv', parse_dates=['dt'])

    assert_frame_equal(expected, result)
    assert patched_get_filepath.called
    assert MockGetFilepath.called


@pytest.mark.skipif(td.safe_import('gcsfs'),
                    reason='Only check when gcsfs not installed')
def test_gcs_not_present_exception():
    """Reading a gs:// URL without gcsfs raises an ImportError whose message
    says that the gcsfs library is required."""
    with pytest.raises(ImportError) as e:
        read_csv('gs://test/test.csv')
    # The message check must come after the ``with`` block: a statement placed
    # inside it after the raising call is never executed.
    assert 'gcsfs library is required' in str(e.value)
Ejemplo n.º 16
0
    df2 = read_csv(StringIO(s.getvalue()), parse_dates=["dt"], index_col=0)

    assert_frame_equal(df1, df2)


@td.skip_if_no("gcsfs")
def test_gcs_get_filepath_or_buffer(monkeypatch):
    """``read_csv`` on a gs:// URL uses the replaced
    ``pandas.io.gcs.get_filepath_or_buffer`` and round-trips the frame."""
    expected = DataFrame({
        "int": [1, 3],
        "float": [2.0, np.nan],
        "str": ["t", "s"],
        "dt": date_range("2018-06-18", periods=2),
    })

    def fake_get_filepath_or_buffer(*args, **kwargs):
        # Serve the expected frame as CSV text regardless of the arguments.
        csv_text = expected.to_csv(index=False)
        return StringIO(csv_text), None, None, False

    monkeypatch.setattr("pandas.io.gcs.get_filepath_or_buffer",
                        fake_get_filepath_or_buffer)
    result = read_csv("gs://test/test.csv", parse_dates=["dt"])

    assert_frame_equal(expected, result)


@pytest.mark.skipif(td.safe_import("gcsfs"),
                    reason="Only check when gcsfs not installed")
def test_gcs_not_present_exception():
    """Reading a gs:// URL without gcsfs raises an ImportError whose message
    says that the gcsfs library is required."""
    with pytest.raises(ImportError) as e:
        read_csv("gs://test/test.csv")
    # The message check must come after the ``with`` block: a statement placed
    # inside it after the raising call is never executed.
    assert "gcsfs library is required" in str(e.value)
Ejemplo n.º 17
0
@td.skip_if_no("gcsfs")
def test_gcs_get_filepath_or_buffer(monkeypatch):
    """``read_csv`` on a gs:// URL uses the replaced
    ``pandas.io.gcs.get_filepath_or_buffer`` and round-trips the frame."""
    columns = {
        "int": [1, 3],
        "float": [2.0, np.nan],
        "str": ["t", "s"],
        "dt": date_range("2018-06-18", periods=2),
    }
    expected = DataFrame(columns)

    def fake_get_filepath_or_buffer(*args, **kwargs):
        # Serve the expected frame as CSV text regardless of the arguments.
        csv_text = expected.to_csv(index=False)
        return StringIO(csv_text), None, None, False

    monkeypatch.setattr(
        "pandas.io.gcs.get_filepath_or_buffer", fake_get_filepath_or_buffer
    )
    result = read_csv("gs://test/test.csv", parse_dates=["dt"])

    assert_frame_equal(expected, result)


@pytest.mark.skipif(
    td.safe_import("gcsfs"), reason="Only check when gcsfs not installed"
)
def test_gcs_not_present_exception():
    """Reading a gs:// URL without gcsfs raises an ImportError whose message
    says that the gcsfs library is required."""
    with pytest.raises(ImportError) as e:
        read_csv("gs://test/test.csv")
    # The message check must come after the ``with`` block: a statement placed
    # inside it after the raising call is never executed.
    assert "gcsfs library is required" in str(e.value)
Ejemplo n.º 18
0
class TestReaders:
    @pytest.fixture(
        autouse=True,
        params=[
            # Add any engines to test here
            pytest.param('xlrd',
                         marks=pytest.mark.skipif(not td.safe_import("xlrd"),
                                                  reason="no xlrd")),
            # engine=None is also skipped when xlrd cannot be imported
            pytest.param(None,
                         marks=pytest.mark.skipif(not td.safe_import("xlrd"),
                                                  reason="no xlrd")),
        ])
    def cd_and_set_engine(self, request, datapath, monkeypatch):
        """
        Change directory and set engine for read_excel calls.

        Runs for every test in the class (``autouse=True``), once per engine
        parameter. The working directory is switched to the ``io/data``
        directory resolved by ``datapath`` and ``pd.read_excel`` is replaced
        by a partial that passes ``engine=request.param``. Both changes are
        made through ``monkeypatch``.
        """
        # Pin the engine so tests can call pd.read_excel without passing it.
        func = partial(pd.read_excel, engine=request.param)
        # Tests refer to workbooks by bare file name, relative to this dir.
        monkeypatch.chdir(datapath("io", "data"))
        monkeypatch.setattr(pd, 'read_excel', func)

    def test_usecols_int(self, read_ext, df_ref):
        """An integer ``usecols`` emits a FutureWarning and yields columns
        A, B and C for both sheets."""
        expected = df_ref.reindex(columns=["A", "B", "C"])
        path = "test1" + read_ext

        # usecols as int, first sheet
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            with ignore_xlrd_time_clock_warning():
                from_sheet1 = pd.read_excel(
                    path, "Sheet1", index_col=0, usecols=3)

        # usecols as int, second sheet (row 1 skipped)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            with ignore_xlrd_time_clock_warning():
                from_sheet2 = pd.read_excel(
                    path, "Sheet2", skiprows=[1], index_col=0, usecols=3)

        # TODO add index to xls file)
        tm.assert_frame_equal(from_sheet1, expected, check_names=False)
        tm.assert_frame_equal(from_sheet2, expected, check_names=False)

    def test_usecols_list(self, read_ext, df_ref):
        """A list of column positions for ``usecols`` yields columns B and C
        for both sheets."""
        expected = df_ref.reindex(columns=['B', 'C'])
        path = 'test1' + read_ext

        from_sheet1 = pd.read_excel(
            path, 'Sheet1', index_col=0, usecols=[0, 2, 3])
        from_sheet2 = pd.read_excel(
            path, 'Sheet2', skiprows=[1], index_col=0, usecols=[0, 2, 3])

        # TODO add index to xls file)
        tm.assert_frame_equal(from_sheet1, expected, check_names=False)
        tm.assert_frame_equal(from_sheet2, expected, check_names=False)

    def test_usecols_str(self, read_ext, df_ref):
        """Excel-style column strings for ``usecols`` (ranges, lists and a mix
        of both) select the expected columns from both sheets."""
        path = 'test1' + read_ext

        # (usecols string, columns of df_ref expected in the result)
        cases = [
            ('A:D', ['A', 'B', 'C']),
            ('A,C,D', ['B', 'C']),
            ('A,C:D', ['B', 'C']),
        ]

        for usecols, columns in cases:
            expected = df_ref.reindex(columns=columns)
            from_sheet1 = pd.read_excel(
                path, 'Sheet1', index_col=0, usecols=usecols)
            from_sheet2 = pd.read_excel(
                path, 'Sheet2', skiprows=[1], index_col=0, usecols=usecols)

            # TODO add index to xls, read xls ignores index name ?
            tm.assert_frame_equal(from_sheet1, expected, check_names=False)
            tm.assert_frame_equal(from_sheet2, expected, check_names=False)

    @pytest.mark.parametrize("usecols", [
        [0, 1, 3],
        [0, 3, 1],
        [1, 0, 3],
        [1, 3, 0],
        [3, 0, 1],
        [3, 1, 0],
    ])
    def test_usecols_diff_positional_int_columns_order(self, read_ext, usecols,
                                                       df_ref):
        """Every ordering of the positions 0, 1, 3 in ``usecols`` produces
        columns A and C."""
        path = "test1" + read_ext
        result = pd.read_excel(path, "Sheet1", index_col=0, usecols=usecols)
        expected = df_ref[["A", "C"]]
        tm.assert_frame_equal(result, expected, check_names=False)

    @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]])
    def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols,
                                                       df_ref):
        """Column labels in ``usecols`` produce columns B and D in that order,
        whichever order the labels are given in."""
        expected = df_ref[["B", "D"]]
        # No index_col is passed below, so compare against a 0..n-1 index.
        expected.index = range(len(expected))

        path = "test1" + read_ext
        result = pd.read_excel(path, "Sheet1", usecols=usecols)
        tm.assert_frame_equal(result, expected, check_names=False)

    def test_read_excel_without_slicing(self, read_ext, df_ref):
        """Reading Sheet1 without ``usecols`` gives the full reference frame."""
        path = "test1" + read_ext
        result = pd.read_excel(path, "Sheet1", index_col=0)
        tm.assert_frame_equal(result, df_ref, check_names=False)

    def test_usecols_excel_range_str(self, read_ext, df_ref):
        """``usecols="A,D:E"`` with ``index_col=0`` yields columns C and D."""
        path = "test1" + read_ext
        result = pd.read_excel(path, "Sheet1", index_col=0, usecols="A,D:E")
        expected = df_ref[["C", "D"]]
        tm.assert_frame_equal(result, expected, check_names=False)

    def test_usecols_excel_range_str_invalid(self, read_ext):
        """A range ending in the invalid column name 'E1' raises ValueError."""
        path = "test1" + read_ext
        with pytest.raises(ValueError, match="Invalid column name: E1"):
            pd.read_excel(path, "Sheet1", usecols="D:E1")

    def test_index_col_label_error(self, read_ext):
        """Passing a string label in ``index_col`` together with string
        ``usecols`` raises TypeError."""
        path = "test1" + read_ext
        pattern = "list indices must be integers.*, not str"

        with pytest.raises(TypeError, match=pattern):
            pd.read_excel(
                path, "Sheet1", index_col=["A"], usecols=["A", "C"])

    def test_index_col_empty(self, read_ext):
        """Sheet3 read with a three-level ``index_col`` yields an empty frame
        with an empty three-level MultiIndex."""
        # see gh-9208
        index_names = ["A", "B", "C"]
        result = pd.read_excel("test1" + read_ext, "Sheet3",
                               index_col=index_names)

        empty_index = MultiIndex(levels=[[]] * 3,
                                 codes=[[]] * 3,
                                 names=["A", "B", "C"])
        expected = DataFrame(columns=["D", "E", "F"], index=empty_index)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("index_col", [None, 2])
    def test_index_col_with_unnamed(self, read_ext, index_col):
        """Sheet4, whose first column comes back as "Unnamed: 0", is read
        correctly both without an index column and with a positional one."""
        # see gh-18792
        result = pd.read_excel("test1" + read_ext,
                               "Sheet4",
                               index_col=index_col)
        expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]],
                             columns=["Unnamed: 0", "col1", "col2"])
        # Compare with None explicitly: a plain truthiness test would treat
        # position 0 as "no index column" and build the wrong expectation.
        if index_col is not None:
            expected = expected.set_index(expected.columns[index_col])

        tm.assert_frame_equal(result, expected)

    def test_usecols_pass_non_existent_column(self, read_ext):
        """Asking for column 'E' in ``usecols`` raises ValueError with a
        message listing that column."""
        path = "test1" + read_ext
        pattern = (r"Usecols do not match columns, "
                   r"columns expected but not found: \['E'\]")

        with pytest.raises(ValueError, match=pattern):
            pd.read_excel(path, usecols=["E"])

    def test_usecols_wrong_type(self, read_ext):
        """A ``usecols`` list mixing a string and an integer raises
        ValueError."""
        path = "test1" + read_ext
        pattern = ("'usecols' must either be list-like of "
                   "all strings, all unicode, all integers or a callable.")

        with pytest.raises(ValueError, match=pattern):
            pd.read_excel(path, usecols=["E1", 0])

    def test_excel_stop_iterator(self, read_ext):
        """The test2 workbook parses to a single row under columns
        'Test' and 'Test1'."""
        expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])
        result = pd.read_excel('test2' + read_ext, 'Sheet1')
        tm.assert_frame_equal(result, expected)

    def test_excel_cell_error_na(self, read_ext):
        """The test3 workbook parses to a single NaN under column 'Test'."""
        expected = DataFrame([[np.nan]], columns=['Test'])
        result = pd.read_excel('test3' + read_ext, 'Sheet1')
        tm.assert_frame_equal(result, expected)

    def test_excel_table(self, read_ext, df_ref):
        """Both sheets of test1 match the reference frame, and
        ``skipfooter=1`` drops the last row."""
        path = 'test1' + read_ext

        from_sheet1 = pd.read_excel(path, 'Sheet1', index_col=0)
        from_sheet2 = pd.read_excel(
            path, 'Sheet2', skiprows=[1], index_col=0)
        # TODO add index to file
        tm.assert_frame_equal(from_sheet1, df_ref, check_names=False)
        tm.assert_frame_equal(from_sheet2, df_ref, check_names=False)

        without_footer = pd.read_excel(
            path, 'Sheet1', index_col=0, skipfooter=1)
        tm.assert_frame_equal(without_footer, from_sheet1.iloc[:-1])

    def test_reader_special_dtypes(self, read_ext):
        """Read the test_types workbook under several option combinations.

        Covers default type inference, ``convert_float=False``, using each
        column in turn as the index, and ``converters`` with and without
        ``convert_float=False``.
        """

        expected = DataFrame.from_dict(
            OrderedDict([
                ("IntCol", [1, 2, -3, 4, 0]),
                ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
                ("BoolCol", [True, False, True, True, False]),
                ("StrCol", [1, 2, 3, 4, 5]),
                # GH5394 - this is why convert_float isn't vectorized
                ("Str2Col", ["a", 3, "c", "d", "e"]),
                ("DateCol", [
                    datetime(2013, 10, 30),
                    datetime(2013, 10, 31),
                    datetime(1905, 1, 1),
                    datetime(2013, 12, 14),
                    datetime(2015, 3, 14)
                ])
            ]))
        basename = 'test_types'

        # should read in correctly and infer types
        actual = pd.read_excel(basename + read_ext, 'Sheet1')
        tm.assert_frame_equal(actual, expected)

        # if not coercing number, then int comes in as float
        float_expected = expected.copy()
        float_expected["IntCol"] = float_expected["IntCol"].astype(float)
        # the lone numeric cell in the mixed column is expected as a float too
        float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
        actual = pd.read_excel(basename + read_ext,
                               'Sheet1',
                               convert_float=False)
        tm.assert_frame_equal(actual, float_expected)

        # check setting Index (assuming xls and xlsx are the same here)
        for icol, name in enumerate(expected.columns):
            actual = pd.read_excel(basename + read_ext,
                                   'Sheet1',
                                   index_col=icol)
            exp = expected.set_index(name)
            tm.assert_frame_equal(actual, exp)

        # convert_float and converters should be different but both accepted
        expected["StrCol"] = expected["StrCol"].apply(str)
        actual = pd.read_excel(basename + read_ext,
                               'Sheet1',
                               converters={"StrCol": str})
        tm.assert_frame_equal(actual, expected)

        # same converter, combined with convert_float=False
        no_convert_float = float_expected.copy()
        no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
        actual = pd.read_excel(basename + read_ext,
                               'Sheet1',
                               convert_float=False,
                               converters={"StrCol": str})
        tm.assert_frame_equal(actual, no_convert_float)

    # GH8212 - support for converters and missing values
    def test_reader_converters(self, read_ext):
        """Read test_converters with a ``converters`` mapping keyed by both
        column name and column position, and compare with the expected
        per-cell results."""

        basename = 'test_converters'

        expected = DataFrame.from_dict(
            OrderedDict([
                ("IntCol", [1, 2, -3, -1000, 0]),
                ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]),
                ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']),
                ("StrCol", ['1', np.nan, '3', '4', '5']),
            ]))

        # Keys are column names for the first two columns and positions (2, 3)
        # for the last two; each callable maps one cell value.
        converters = {
            'IntCol': lambda x: int(x) if x != '' else -1000,
            'FloatCol': lambda x: 10 * x if x else np.nan,
            2: lambda x: 'Found' if x != '' else 'Not found',
            3: lambda x: str(x) if x else '',
        }

        # should read in correctly and set types of single cells (not array
        # dtypes)
        actual = pd.read_excel(basename + read_ext,
                               'Sheet1',
                               converters=converters)
        tm.assert_frame_equal(actual, expected)

    def test_reader_dtype(self, read_ext):
        """Read testdtype with default inference and with an explicit
        ``dtype`` mapping, then check that casting column 'd' to int64
        raises ValueError."""
        # GH 8212
        basename = 'testdtype'
        actual = pd.read_excel(basename + read_ext)

        expected = DataFrame({
            'a': [1, 2, 3, 4],
            'b': [2.5, 3.5, 4.5, 5.5],
            'c': [1, 2, 3, 4],
            'd': [1.0, 2.0, np.nan, 4.0]
        }).reindex(columns=['a', 'b', 'c', 'd'])

        tm.assert_frame_equal(actual, expected)

        actual = pd.read_excel(basename + read_ext,
                               dtype={
                                   'a': 'float64',
                                   'b': 'float32',
                                   'c': str
                               })

        expected['a'] = expected['a'].astype('float64')
        expected['b'] = expected['b'].astype('float32')
        # read as str, column 'c' is expected as zero-padded text
        expected['c'] = ['001', '002', '003', '004']
        tm.assert_frame_equal(actual, expected)

        # NOTE(review): presumably fails because 'd' holds a NaN (see the
        # expected frame above) - confirm against the reader implementation.
        with pytest.raises(ValueError):
            pd.read_excel(basename + read_ext, dtype={'d': 'int64'})

    @pytest.mark.parametrize("dtype,expected", [
        # no dtype given: types are inferred
        (None,
         DataFrame({
             "a": [1, 2, 3, 4],
             "b": [2.5, 3.5, 4.5, 5.5],
             "c": [1, 2, 3, 4],
             "d": [1.0, 2.0, np.nan, 4.0]
         })),
        # explicit dtypes, including str for columns "c" and "d"
        ({
            "a": "float64",
            "b": "float32",
            "c": str,
            "d": str
        },
         DataFrame({
             "a": Series([1, 2, 3, 4], dtype="float64"),
             "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
             "c": ["001", "002", "003", "004"],
             "d": ["1", "2", np.nan, "4"]
         })),
    ])
    def test_reader_dtype_str(self, read_ext, dtype, expected):
        """Read testdtype with the parametrized ``dtype`` and compare with
        the matching expected frame."""
        # see gh-20377
        basename = "testdtype"

        actual = pd.read_excel(basename + read_ext, dtype=dtype)
        tm.assert_frame_equal(actual, expected)

    def test_reading_all_sheets(self, read_ext):
        """``sheet_name=None`` returns every sheet, keyed by name, in
        workbook order."""
        # Test reading all sheetnames by setting sheetname to None,
        # Ensure a dict is returned.
        # See PR #9450
        path = 'test_multisheet' + read_ext
        sheets = pd.read_excel(path, sheet_name=None)

        # ensure this is not alphabetical to test order preservation
        sheet_order = ['Charlie', 'Alpha', 'Beta']
        tm.assert_contains_all(sheet_order, sheets.keys())

        # Issue 9930
        # Ensure sheet order is preserved
        assert sheet_order == list(sheets.keys())

    def test_reading_multiple_specific_sheets(self, read_ext):
        """A mixed list of sheet positions and names, with duplicates,
        returns one entry per distinct key."""
        # Test reading specific sheetnames by specifying a mixed list
        # of integers and strings, and confirm that duplicated sheet
        # references (positions/names) are removed properly.
        # Ensure a dict is returned
        # See PR #9450
        path = 'test_multisheet' + read_ext

        # Explicitly request duplicates. Only the set should be returned.
        requested = [2, 'Charlie', 'Charlie']
        sheets = pd.read_excel(path, sheet_name=requested)

        distinct_keys = list(set(requested))
        tm.assert_contains_all(distinct_keys, sheets.keys())
        assert len(distinct_keys) == len(sheets.keys())

    def test_reading_all_sheets_with_blank(self, read_ext):
        """``sheet_name=None`` on blank_with_header returns Sheet1, Sheet2
        and Sheet3."""
        # Test reading all sheetnames by setting sheetname to None,
        # In the case where some sheets are blank.
        # Issue #11711
        path = 'blank_with_header' + read_ext
        sheets = pd.read_excel(path, sheet_name=None)
        sheet_names = ['Sheet1', 'Sheet2', 'Sheet3']
        tm.assert_contains_all(sheet_names, sheets.keys())

    # GH6403
    def test_read_excel_blank(self, read_ext):
        """Sheet1 of the blank workbook reads as an empty DataFrame."""
        result = pd.read_excel('blank' + read_ext, 'Sheet1')
        expected = DataFrame()
        tm.assert_frame_equal(result, expected)

    def test_read_excel_blank_with_header(self, read_ext):
        """Sheet1 of blank_with_header reads as a frame with columns
        'col_1' and 'col_2' and no rows."""
        path = 'blank_with_header' + read_ext
        result = pd.read_excel(path, 'Sheet1')
        tm.assert_frame_equal(result, DataFrame(columns=['col_1', 'col_2']))

    def test_date_conversion_overflow(self, read_ext):
        """In testdateoverflow the value 1e+20 in the date column stays a
        number while the other cells are Timestamps."""
        # GH 10001 : pandas.ExcelFile ignore parse_dates=False
        rows = [
            [pd.Timestamp('2016-03-12'), 'Marc Johnson'],
            [pd.Timestamp('2016-03-16'), 'Jack Black'],
            [1e+20, 'Timothy Brown'],
        ]
        expected = pd.DataFrame(rows,
                                columns=['DateColWithBigInt', 'StringCol'])

        result = pd.read_excel('testdateoverflow' + read_ext)
        tm.assert_frame_equal(result, expected)

    def test_sheet_name(self, read_ext, df_ref):
        """Passing ``sheet_name`` by keyword returns the reference frame,
        whatever the keyword order."""
        path = "test1" + read_ext
        sheet_name = "Sheet1"

        first = pd.read_excel(path, sheet_name=sheet_name,
                              index_col=0)  # doc
        with ignore_xlrd_time_clock_warning():
            second = pd.read_excel(path, index_col=0,
                                   sheet_name=sheet_name)

        tm.assert_frame_equal(first, df_ref, check_names=False)
        tm.assert_frame_equal(second, df_ref, check_names=False)

    def test_excel_read_buffer(self, read_ext):
        """Reading from an open binary file object gives the same frame as
        reading from the path."""
        path = 'test1' + read_ext
        from_path = pd.read_excel(path, 'Sheet1', index_col=0)
        with open(path, 'rb') as handle:
            from_buffer = pd.read_excel(handle, 'Sheet1', index_col=0)
            tm.assert_frame_equal(from_path, from_buffer)

    def test_bad_engine_raises(self, read_ext):
        """An unrecognised ``engine`` name raises ValueError."""
        with pytest.raises(ValueError, match="Unknown engine: foo"):
            pd.read_excel('', engine='foo')

    @tm.network
    def test_read_from_http_url(self, read_ext):
        """The workbook fetched over HTTPS equals the local copy."""
        filename = 'test1' + read_ext
        url = ('https://raw.github.com/pandas-dev/pandas/master/'
               'pandas/tests/io/data/test1' + read_ext)
        remote = pd.read_excel(url)
        local = pd.read_excel(filename)
        tm.assert_frame_equal(remote, local)

    @td.skip_if_not_us_locale
    def test_read_from_s3_url(self, read_ext, s3_resource):
        """A workbook uploaded to the test bucket and read through its s3://
        URL equals the local copy."""
        filename = 'test1' + read_ext

        # Bucket "pandas-test" created in tests/io/conftest.py
        bucket = s3_resource.Bucket("pandas-test")
        with open(filename, "rb") as f:
            bucket.put_object(Key="test1" + read_ext, Body=f)

        remote = pd.read_excel('s3://pandas-test/test1' + read_ext)
        local = pd.read_excel(filename)
        tm.assert_frame_equal(remote, local)

    @pytest.mark.slow
    # ignore warning from old xlrd
    @pytest.mark.filterwarnings("ignore:This metho:PendingDeprecationWarning")
    def test_read_from_file_url(self, read_ext, datapath):
        """Reading through a file://localhost/ URL matches reading the plain
        path; the test is skipped if the URL cannot be opened."""
        # FILE
        local_path = os.path.join(datapath("io", "data"), 'test1' + read_ext)
        local_table = pd.read_excel(local_path)

        try:
            url_table = pd.read_excel('file://localhost/' + local_path)
        except URLError:
            # fails on some systems
            import platform
            system_info = ' '.join(platform.uname()).strip()
            pytest.skip("failing on %s" % system_info)

        tm.assert_frame_equal(url_table, local_table)

    def test_read_from_pathlib_path(self, read_ext):
        """A ``pathlib.Path`` reads the same as the equivalent str path."""
        # GH12655
        from pathlib import Path

        file_name = 'test1' + read_ext
        from_str = pd.read_excel(file_name, 'Sheet1', index_col=0)
        from_path = pd.read_excel(Path(file_name), 'Sheet1', index_col=0)

        tm.assert_frame_equal(from_str, from_path)

    @td.skip_if_no('py.path')
    def test_read_from_py_localpath(self, read_ext):
        """A ``py.path.local`` object reads the same as a str path."""
        # GH12655
        from py.path import local as LocalPath

        file_name = 'test1' + read_ext
        from_str = pd.read_excel(file_name, 'Sheet1', index_col=0)

        local_obj = LocalPath().join(file_name)
        from_local = pd.read_excel(local_obj, 'Sheet1', index_col=0)

        tm.assert_frame_equal(from_str, from_local)

    def test_reader_seconds(self, read_ext):
        """Times with and without fractional seconds are read as ``time``.

        The same expected frame is checked against both the ``times_1900``
        and the ``times_1904`` workbook. GH5945.
        """
        times = [
            time(1, 2, 3),
            time(2, 45, 56, 100000),
            time(4, 29, 49, 200000),
            time(6, 13, 42, 300000),
            time(7, 57, 35, 400000),
            time(9, 41, 28, 500000),
            time(11, 25, 21, 600000),
            time(13, 9, 14, 700000),
            time(14, 53, 7, 800000),
            time(16, 37, 0, 900000),
            time(18, 20, 54),
        ]
        expected = DataFrame.from_dict({"Time": times})

        for stem in ('times_1900', 'times_1904'):
            result = pd.read_excel(stem + read_ext, 'Sheet1')
            tm.assert_frame_equal(result, expected)

    def test_read_excel_multiindex(self, read_ext):
        """Read each MultiIndex layout stored in the testmultiindex workbook.

        ``expected`` is a single frame whose index/columns are reassigned
        before each sheet is compared, so the order of the sections matters.
        """
        # see gh-4679
        product = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
        workbook = "testmultiindex" + read_ext

        def read_sheet(sheet, **kwargs):
            # Every section reads a different sheet of the same workbook.
            return pd.read_excel(workbook, sheet, **kwargs)

        # "mi_column" sheet
        rows = [
            [1, 2.5, pd.Timestamp("2015-01-01"), True],
            [2, 3.5, pd.Timestamp("2015-01-02"), False],
            [3, 4.5, pd.Timestamp("2015-01-03"), False],
            [4, 5.5, pd.Timestamp("2015-01-04"), True],
        ]
        expected = DataFrame(rows, columns=product)

        result = read_sheet("mi_column", header=[0, 1], index_col=0)
        tm.assert_frame_equal(result, expected)

        # "mi_index" sheet
        expected.index = product
        expected.columns = ["a", "b", "c", "d"]

        result = read_sheet("mi_index", index_col=[0, 1])
        tm.assert_frame_equal(result, expected, check_names=False)

        # "both" sheet
        expected.columns = product

        result = read_sheet("both", index_col=[0, 1], header=[0, 1])
        tm.assert_frame_equal(result, expected, check_names=False)

        # "mi_index_name" sheet
        expected.columns = ["a", "b", "c", "d"]
        expected.index = product.set_names(["ilvl1", "ilvl2"])

        result = read_sheet("mi_index_name", index_col=[0, 1])
        tm.assert_frame_equal(result, expected)

        # "mi_column_name" sheet
        expected.index = list(range(4))
        expected.columns = product.set_names(["c1", "c2"])

        result = read_sheet("mi_column_name", header=[0, 1], index_col=0)
        tm.assert_frame_equal(result, expected)

        # see gh-11317
        # "name_with_int" sheet
        int_level = product.set_levels([1, 2], level=1)
        expected.columns = int_level.set_names(["c1", "c2"])

        result = read_sheet("name_with_int", index_col=0, header=[0, 1])
        tm.assert_frame_equal(result, expected)

        # "both_name" sheet
        expected.columns = product.set_names(["c1", "c2"])
        expected.index = product.set_names(["ilvl1", "ilvl2"])

        result = read_sheet("both_name", index_col=[0, 1], header=[0, 1])
        tm.assert_frame_equal(result, expected)

        # "both_skiprows" sheet
        result = read_sheet("both_name_skiprows",
                            index_col=[0, 1],
                            header=[0, 1],
                            skiprows=2)
        tm.assert_frame_equal(result, expected)

    def test_read_excel_multiindex_header_only(self, read_ext):
        """A two-row header read without ``index_col`` gives MultiIndex columns."""
        # see gh-11733.
        #
        # Don't try to parse a header name if there isn't one.
        workbook = "testmultiindex" + read_ext
        result = pd.read_excel(workbook, "index_col_none", header=[0, 1])

        columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
        expected = DataFrame([[1, 2, 3, 4], [1, 2, 3, 4]], columns=columns)
        tm.assert_frame_equal(result, expected)

    def test_excel_old_index_format(self, read_ext):
        """Read the index layouts stored in the ``pre17`` workbook.

        Four sheets are checked: single and two-level indexes, each in a
        "names" and a "no_names" variant.
        """
        # see gh-4679
        workbook = "test_index_name_pre17" + read_ext

        # Shared building blocks for the expected frames below.
        cells = [["R{}C{}".format(row, col) for col in range(5)]
                 for row in range(5)]
        columns = ["C_l0_g{}".format(col) for col in range(5)]
        outer = ["R_l0_g{}".format(row) for row in range(5)]
        inner = ["R_l1_g{}".format(row) for row in range(5)]

        # We detect headers to determine if index names exist, so
        # that "index" name in the "names" version of the data will
        # now be interpreted as rows that include null data.
        data = np.array([[None] * 5] + cells)
        multi = MultiIndex(levels=[["R0"] + outer, ["R1"] + inner],
                           codes=[list(range(6)), list(range(6))],
                           names=[None, None])
        single = Index(["R0"] + outer, name=None)

        expected = pd.DataFrame(data, index=single, columns=columns)
        result = pd.read_excel(workbook, "single_names", index_col=0)
        tm.assert_frame_equal(result, expected)

        expected.index = multi
        result = pd.read_excel(workbook, "multi_names", index_col=[0, 1])
        tm.assert_frame_equal(result, expected)

        # The analogous versions of the "names" version data
        # where there are explicitly no names for the indices.
        data = np.array(cells)
        multi = MultiIndex(levels=[outer, inner],
                           codes=[list(range(5)), list(range(5))],
                           names=[None, None])
        single = Index(outer, name=None)

        expected = pd.DataFrame(data, index=single, columns=columns)
        result = pd.read_excel(workbook, "single_no_names", index_col=0)
        tm.assert_frame_equal(result, expected)

        expected.index = multi
        result = pd.read_excel(workbook, "multi_no_names", index_col=[0, 1])
        tm.assert_frame_equal(result, expected, check_names=False)

    def test_read_excel_bool_header_arg(self, read_ext):
        """Passing either bool as ``header`` raises TypeError."""
        # GH 6114
        workbook = 'test1' + read_ext
        for flag in (True, False):
            with pytest.raises(TypeError):
                pd.read_excel(workbook, header=flag)

    def test_read_excel_chunksize(self, read_ext):
        """Passing ``chunksize`` raises NotImplementedError."""
        # GH 8011
        workbook = 'test1' + read_ext
        with pytest.raises(NotImplementedError):
            pd.read_excel(workbook, chunksize=100)

    def test_read_excel_skiprows_list(self, read_ext):
        """``skiprows`` given as a list or as a NumPy array reads the same."""
        # GH 4903
        workbook = 'testskiprows' + read_ext
        rows = [
            [1, 2.5, pd.Timestamp('2015-01-01'), True],
            [2, 3.5, pd.Timestamp('2015-01-02'), False],
            [3, 4.5, pd.Timestamp('2015-01-03'), False],
            [4, 5.5, pd.Timestamp('2015-01-04'), True],
        ]
        expected = DataFrame(rows, columns=['a', 'b', 'c', 'd'])

        for rows_to_skip in ([0, 2], np.array([0, 2])):
            result = pd.read_excel(workbook,
                                   'skiprows_list',
                                   skiprows=rows_to_skip)
            tm.assert_frame_equal(result, expected)

    def test_read_excel_nrows(self, read_ext):
        """``nrows`` returns the leading rows of the full read."""
        # GH 16645
        workbook = 'test1' + read_ext
        limit = 5

        result = pd.read_excel(workbook, nrows=limit)
        expected = pd.read_excel(workbook)[:limit]
        tm.assert_frame_equal(result, expected)

    def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext):
        """An ``nrows`` larger than the sheet returns the whole sheet."""
        # GH 16645
        workbook = 'test1' + read_ext
        expected = pd.read_excel(workbook)

        oversized = len(expected) + 10
        result = pd.read_excel(workbook, nrows=oversized)
        tm.assert_frame_equal(result, expected)

    def test_read_excel_nrows_non_integer_parameter(self, read_ext):
        """A string ``nrows`` is rejected with ValueError."""
        # GH 16645
        workbook = 'test1' + read_ext
        expected_msg = "'nrows' must be an integer >=0"
        with pytest.raises(ValueError, match=expected_msg):
            pd.read_excel(workbook, nrows='5')

    def test_read_excel_squeeze(self, read_ext):
        """Check ``squeeze=True`` on one- and two-column sheets."""
        # GH 12157
        workbook = 'test_squeeze' + read_ext

        # Two columns with one used as the index: a named Series comes back.
        result = pd.read_excel(workbook,
                               'two_columns',
                               index_col=0,
                               squeeze=True)
        expected = pd.Series([2, 3, 4], index=[4, 5, 6], name='b')
        expected.index.name = 'a'
        tm.assert_series_equal(result, expected)

        # Two data columns: the result stays a DataFrame.
        result = pd.read_excel(workbook, 'two_columns', squeeze=True)
        expected = pd.DataFrame({'a': [4, 5, 6], 'b': [2, 3, 4]})
        tm.assert_frame_equal(result, expected)

        # A single data column: a Series comes back.
        result = pd.read_excel(workbook, 'one_column', squeeze=True)
        expected = pd.Series([1, 2, 3], name='a')
        tm.assert_series_equal(result, expected)
Ejemplo n.º 19
0
def test_safe_import_non_existent(name):
    """``td.safe_import`` must give a falsy result for ``name``."""
    imported = td.safe_import(name)
    assert not imported
Ejemplo n.º 20
0
def test_safe_import_non_existent(name):
    """``td.safe_import`` must give a falsy result for ``name``."""
    imported = td.safe_import(name)
    assert not imported
Ejemplo n.º 21
0
    with pytest.raises(ValueError):
        read_html(url, 'google', flavor='not a* valid**++ flaver')


@td.skip_if_no('bs4')
@td.skip_if_no('lxml')
def test_same_ordering(datapath):
    """The 'lxml' and 'bs4' flavors return equal lists of frames."""
    markup_path = datapath('io', 'data', 'valid_markup.html')
    from_lxml, from_bs4 = (
        read_html(markup_path, index_col=0, flavor=[name])
        for name in ('lxml', 'bs4'))
    assert_framelist_equal(from_lxml, from_bs4)


@pytest.mark.parametrize("flavor", [
    # Each flavor is skipped when its own parser package is missing.
    pytest.param('bs4',
                 marks=pytest.mark.skipif(not td.safe_import('bs4'),
                                          reason='No bs4')),
    pytest.param('lxml',
                 marks=pytest.mark.skipif(not td.safe_import('lxml'),
                                          reason='No lxml'))
],
                         scope="class")
class TestReadHtml(object):
    """``read_html`` tests, parametrized over the 'bs4' and 'lxml' flavors."""

    @pytest.fixture(autouse=True)
    def set_files(self, datapath):
        """Store the paths of the HTML data files on the test instance."""
        self.spam_data = datapath('io', 'data', 'spam.html')
        self.spam_data_kwargs = {}
        if PY3:
            self.spam_data_kwargs['encoding'] = 'UTF-8'
        self.banklist_data = datapath("io", "data", "banklist.html")
Ejemplo n.º 22
0
    monkeypatch.setattr('gcsfs.GCSFileSystem', MockGCSFileSystem)
    df1.to_csv('gs://test/test.csv', index=True)
    df2 = read_csv(StringIO(s.getvalue()), parse_dates=['dt'], index_col=0)

    assert_frame_equal(df1, df2)


@td.skip_if_no('gcsfs')
def test_gcs_get_filepath_or_buffer(monkeypatch):
    """``read_csv`` on a ``gs://`` path round-trips via the patched lookup."""
    columns = {
        'int': [1, 3],
        'float': [2.0, np.nan],
        'str': ['t', 's'],
        'dt': date_range('2018-06-18', periods=2),
    }
    original = DataFrame(columns)

    def fake_get_filepath_or_buffer(*args, **kwargs):
        # Serve the frame's CSV text in place of the real GCS lookup.
        csv_buffer = StringIO(original.to_csv(index=False))
        return (csv_buffer, None, None, False)

    monkeypatch.setattr('pandas.io.gcs.get_filepath_or_buffer',
                        fake_get_filepath_or_buffer)
    round_tripped = read_csv('gs://test/test.csv', parse_dates=['dt'])

    assert_frame_equal(original, round_tripped)


@pytest.mark.skipif(td.safe_import('gcsfs'),
                    reason='Only check when gcsfs not installed')
def test_gcs_not_present_exception():
    """Reading a ``gs://`` path without gcsfs raises a helpful ImportError."""
    with pytest.raises(ImportError) as e:
        read_csv('gs://test/test.csv')
    # Check the message after the block: statements placed after the raising
    # call inside ``pytest.raises`` are never executed.
    assert 'gcsfs library is required' in str(e.value)
Ejemplo n.º 23
0
    with pytest.raises(ValueError, match=msg):
        read_html(url, "google", flavor=flavor)


@td.skip_if_no('bs4')
@td.skip_if_no('lxml')
def test_same_ordering(datapath):
    """The 'lxml' and 'bs4' flavors return equal lists of frames."""
    markup_path = datapath('io', 'data', 'valid_markup.html')
    from_lxml, from_bs4 = (
        read_html(markup_path, index_col=0, flavor=[name])
        for name in ('lxml', 'bs4'))
    assert_framelist_equal(from_lxml, from_bs4)


@pytest.mark.parametrize("flavor", [
    # Each flavor is skipped when its own parser package is missing.
    pytest.param('bs4', marks=pytest.mark.skipif(
        not td.safe_import('bs4'), reason='No bs4')),
    pytest.param('lxml', marks=pytest.mark.skipif(
        not td.safe_import('lxml'), reason='No lxml'))], scope="class")
class TestReadHtml(object):
    """``read_html`` tests, parametrized over the 'bs4' and 'lxml' flavors."""

    @pytest.fixture(autouse=True)
    def set_files(self, datapath):
        """Store the paths of the HTML data files on the test instance."""
        self.spam_data = datapath('io', 'data', 'spam.html')
        self.spam_data_kwargs = {}
        if PY3:
            self.spam_data_kwargs['encoding'] = 'UTF-8'
        self.banklist_data = datapath("io", "data", "banklist.html")

    @pytest.fixture(autouse=True, scope="function")
    def set_defaults(self, flavor, request):
        """Bind ``self.read_html`` to ``read_html`` with the current flavor."""
        self.read_html = partial(read_html, flavor=flavor)
Ejemplo n.º 24
0
def test_safe_import_versions(min_version, valid):
    """``safe_import`` is truthy exactly when ``valid`` says it should be."""
    imported = td.safe_import("pandas", min_version=min_version)
    if valid:
        assert imported
    else:
        assert not imported
Ejemplo n.º 25
0
class TestExcelFileRead:
    """Tests that read workbooks through ``ExcelFile`` objects.

    Workbooks are opened with ``pd.ExcelFile`` rather than a directly
    imported ``ExcelFile`` name, because the autouse fixture below patches
    the attribute on ``pd``; only lookups through ``pd`` pick up the
    parametrized engine. Each workbook is opened in a ``with`` block so it
    is closed when the test finishes.
    """

    @pytest.fixture(
        autouse=True,
        params=[
            # Add any engines to test here
            pytest.param('xlrd',
                         marks=pytest.mark.skipif(not td.safe_import("xlrd"),
                                                  reason="no xlrd")),
            pytest.param(None,
                         marks=pytest.mark.skipif(not td.safe_import("xlrd"),
                                                  reason="no xlrd")),
        ])
    def cd_and_set_engine(self, request, datapath, monkeypatch):
        """
        Change directory and set engine for ExcelFile objects.
        """
        func = partial(pd.ExcelFile, engine=request.param)
        monkeypatch.chdir(datapath("io", "data"))
        monkeypatch.setattr(pd, 'ExcelFile', func)

    def test_excel_passes_na(self, read_ext):
        """``keep_default_na`` / ``na_values`` are honoured for ExcelFile."""
        with pd.ExcelFile('test4' + read_ext) as excel:
            parsed = pd.read_excel(excel,
                                   'Sheet1',
                                   keep_default_na=False,
                                   na_values=['apple'])
            expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']],
                                 columns=['Test'])
            tm.assert_frame_equal(parsed, expected)

            parsed = pd.read_excel(excel,
                                   'Sheet1',
                                   keep_default_na=True,
                                   na_values=['apple'])
            expected = DataFrame(
                [[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
                columns=['Test'])
            tm.assert_frame_equal(parsed, expected)

        # 13967
        with pd.ExcelFile('test5' + read_ext) as excel:
            parsed = pd.read_excel(excel,
                                   'Sheet1',
                                   keep_default_na=False,
                                   na_values=['apple'])
            expected = DataFrame(
                [['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']],
                columns=['Test'])
            tm.assert_frame_equal(parsed, expected)

            parsed = pd.read_excel(excel,
                                   'Sheet1',
                                   keep_default_na=True,
                                   na_values=['apple'])
            expected = DataFrame(
                [[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
                columns=['Test'])
            tm.assert_frame_equal(parsed, expected)

    @pytest.mark.parametrize('arg', ['sheet', 'sheetname', 'parse_cols'])
    def test_unexpected_kwargs_raises(self, read_ext, arg):
        """Unsupported keyword arguments raise TypeError."""
        # gh-17964
        kwarg = {arg: 'Sheet1'}
        msg = "unexpected keyword argument `{}`".format(arg)

        with pd.ExcelFile('test1' + read_ext) as excel:
            with pytest.raises(TypeError, match=msg):
                pd.read_excel(excel, **kwarg)

    def test_excel_table_sheet_by_index(self, read_ext, df_ref):
        """Sheets can be selected by position via read_excel and parse."""
        with pd.ExcelFile('test1' + read_ext) as excel:
            df1 = pd.read_excel(excel, 0, index_col=0)
            df2 = pd.read_excel(excel, 1, skiprows=[1], index_col=0)
            tm.assert_frame_equal(df1, df_ref, check_names=False)
            tm.assert_frame_equal(df2, df_ref, check_names=False)

            df1 = excel.parse(0, index_col=0)
            df2 = excel.parse(1, skiprows=[1], index_col=0)
            tm.assert_frame_equal(df1, df_ref, check_names=False)
            tm.assert_frame_equal(df2, df_ref, check_names=False)

            df3 = pd.read_excel(excel, 0, index_col=0, skipfooter=1)
            tm.assert_frame_equal(df3, df1.iloc[:-1])

            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                df4 = pd.read_excel(excel, 0, index_col=0, skip_footer=1)
                tm.assert_frame_equal(df3, df4)

            df3 = excel.parse(0, index_col=0, skipfooter=1)
            tm.assert_frame_equal(df3, df1.iloc[:-1])

            # will move to engine-specific tests as new ones are added
            import xlrd
            with pytest.raises(xlrd.XLRDError):
                pd.read_excel(excel, 'asdf')

    def test_sheet_name(self, read_ext, df_ref):
        """``sheet_name`` works as a keyword in either argument position."""
        filename = "test1"
        sheet_name = "Sheet1"

        with pd.ExcelFile(filename + read_ext) as excel:
            df1_parse = excel.parse(sheet_name=sheet_name,
                                    index_col=0)  # doc
            df2_parse = excel.parse(index_col=0, sheet_name=sheet_name)

        tm.assert_frame_equal(df1_parse, df_ref, check_names=False)
        tm.assert_frame_equal(df2_parse, df_ref, check_names=False)

    def test_excel_read_buffer(self, read_ext):
        """An ExcelFile built on an open handle reads like a path read."""
        pth = 'test1' + read_ext
        expected = pd.read_excel(pth, 'Sheet1', index_col=0)

        with open(pth, 'rb') as f:
            with pd.ExcelFile(f) as xls:
                actual = pd.read_excel(xls, 'Sheet1', index_col=0)

        tm.assert_frame_equal(expected, actual)

    def test_reader_closes_file(self, read_ext):
        """Leaving the ExcelFile context closes the handle it was given."""
        f = open('test1' + read_ext, 'rb')
        with pd.ExcelFile(f) as xlsx:
            # parses okay
            pd.read_excel(xlsx, 'Sheet1', index_col=0)

        assert f.closed

    @pytest.mark.parametrize('excel_engine', ['xlrd', None])
    def test_read_excel_engine_value(self, read_ext, excel_engine):
        """Passing ``engine`` together with an ExcelFile raises ValueError."""
        # GH 26566
        msg = "Engine should not be specified when passing an ExcelFile"
        with pd.ExcelFile("test1" + read_ext, engine=excel_engine) as xl:
            with pytest.raises(ValueError, match=msg):
                pd.read_excel(xl, engine='openpyxl')
Ejemplo n.º 26
0
        This fixture will run as part of each test method defined in the
        class and any subclasses, on account of the `autouse=True`
        argument
        """
        option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.'))
        prev_engine = get_option(option_name)
        set_option(option_name, engine)
        with ensure_clean(ext) as path:
            self.path = path
            yield
        set_option(option_name, prev_engine)  # Roll back option change


@pytest.mark.parametrize("engine,ext", [
    pytest.param('openpyxl', '.xlsx', marks=pytest.mark.skipif(
        not td.safe_import('openpyxl'), reason='No openpyxl')),
    pytest.param('openpyxl', '.xlsm', marks=pytest.mark.skipif(
        not td.safe_import('openpyxl'), reason='No openpyxl')),
    pytest.param('xlwt', '.xls', marks=pytest.mark.skipif(
        not td.safe_import('xlwt'), reason='No xlwt')),
    pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif(
        not td.safe_import('xlsxwriter'), reason='No xlsxwriter'))
])
class TestExcelWriter(_WriterBase):
    # Base class for test cases to run with different Excel writers.

    def test_excel_sheet_size(self, engine, ext):

        # GH 26080
        breaking_row_count = 2**20 + 1
        breaking_col_count = 2**14 + 1