def test_categorical_conversion(setup_path):
    """Check that a non-matching ``where`` yields no rows (GH13322).

    The same frame is written in table format and read back with a
    ``where`` criterion that matches nothing, first with plain string
    columns and then with those columns converted to categoricals.
    """
    df = DataFrame(
        {
            "obsids": ["ESP_012345_6789", "ESP_987654_3210"],
            "imgids": ["APF00006np", "APF0001imm"],
            "data": [4.3, 9.8],
        }
    )

    for use_categories in (False, True):
        if use_categories:
            # Second pass: same data, string columns stored as categoricals.
            df.obsids = df.obsids.astype("category")
            df.imgids = df.imgids.astype("category")

        # An empty selection keeps the column dtypes of ``df``.
        expected = df.iloc[[], :]
        with ensure_clean_path(setup_path) as path:
            df.to_hdf(path, "df", format="table", data_columns=True)
            result = read_hdf(path, "df", where="obsids=B")
            tm.assert_frame_equal(result, expected)
def test_to_hdf_with_object_column_names(setup_path):
    """Check which column-label index types ``to_hdf`` accepts (GH9057).

    With ``format="table"`` and ``data_columns=True``, the index makers in
    the first group are expected to raise ``ValueError``; the makers in the
    second group are expected to write and read back at least one row.
    """
    rejected_makers = [
        tm.makeIntIndex,
        tm.makeFloatIndex,
        tm.makeDateIndex,
        tm.makeTimedeltaIndex,
        tm.makePeriodIndex,
    ]
    accepted_makers = [
        tm.makeStringIndex,
        tm.makeCategoricalIndex,
        tm.makeUnicodeIndex,
    ]

    for make_index in rejected_makers:
        df = DataFrame(np.random.randn(10, 2), columns=make_index(2))
        with ensure_clean_path(setup_path) as path, catch_warnings(record=True):
            msg = "cannot have non-object label DataIndexableCol"
            with pytest.raises(ValueError, match=msg):
                df.to_hdf(path, "df", format="table", data_columns=True)

    for make_index in accepted_makers:
        df = DataFrame(np.random.randn(10, 2), columns=make_index(2))
        with ensure_clean_path(setup_path) as path, catch_warnings(record=True):
            df.to_hdf(path, "df", format="table", data_columns=True)
            first_label = df.index[0]
            result = read_hdf(path, "df", where=f"index = [{first_label}]")
            # The selection must return something.
            assert len(result) > 0
def test_store_dropna(setup_path):
    """Check the ``dropna`` option of ``to_hdf`` in table format.

    Corresponds to Issue 9382: rows are kept when ``dropna`` is left at its
    default or set to ``False``; with ``dropna=True`` the reloaded frame is
    expected to lack the all-NaN row.
    """
    df_with_missing = DataFrame(
        {"col1": [0.0, np.nan, 2.0], "col2": [1.0, np.nan, np.nan]},
        index=list("abc"),
    )
    df_without_missing = DataFrame(
        {"col1": [0.0, 2.0], "col2": [1.0, np.nan]},
        index=list("ac"),
    )

    # (extra ``to_hdf`` keyword arguments, frame expected after reload)
    scenarios = [
        ({}, df_with_missing),
        ({"dropna": False}, df_with_missing),
        ({"dropna": True}, df_without_missing),
    ]
    for extra_kwargs, expected in scenarios:
        with ensure_clean_path(setup_path) as path:
            df_with_missing.to_hdf(path, "df", format="table", **extra_kwargs)
            reloaded = read_hdf(path, "df")
            tm.assert_frame_equal(expected, reloaded)
def test_invalid_terms(setup_path):
    """Check the errors raised for malformed or unresolvable ``where`` terms."""
    with ensure_clean_store(setup_path) as store, catch_warnings(record=True):
        # NOTE: the local name ``df`` is referenced by the where-string
        # "df.index[3]" below, so it must keep this name.
        df = tm.makeTimeDataFrame()
        df["string"] = "foo"
        df.loc[df.index[0:4], "string"] = "bar"
        store.put("df", df, format="table")

        # A Term needs a where argument.
        missing_where = re.escape(
            "__init__() missing 1 required positional argument: 'where'"
        )
        with pytest.raises(TypeError, match=missing_where):
            Term()

        # An expression that is not a condition.
        not_a_condition = re.escape(
            "cannot process expression [df.index[3]], "
            "[2000-01-06 00:00:00] is not a valid condition"
        )
        with pytest.raises(ValueError, match=not_a_condition):
            store.select("df", "df.index[3]")

        # A syntactically incomplete expression.
        with pytest.raises(SyntaxError, match="invalid syntax"):
            store.select("df", "index>")

    # from the docs: with data columns the queries below are accepted
    with ensure_clean_path(setup_path) as path:
        frame = DataFrame(
            np.random.randn(10, 4),
            columns=list("ABCD"),
            index=date_range("20130101", periods=10),
        )
        frame.to_hdf(path, "dfq", format="table", data_columns=True)

        # check ok
        read_hdf(path, "dfq", where="index>Timestamp('20130104') & columns=['A', 'B']")
        read_hdf(path, "dfq", where="A>0 or C>0")

    # catch the invalid reference: without data columns, A and C are unknown
    with ensure_clean_path(setup_path) as path:
        frame = DataFrame(
            np.random.randn(10, 4),
            columns=list("ABCD"),
            index=date_range("20130101", periods=10),
        )
        frame.to_hdf(path, "dfq", format="table")

        invalid_reference = (
            r"The passed where expression: A>0 or C>0\n\s*"
            r"contains an invalid variable reference\n\s*"
            r"all of the variable references must be a reference to\n\s*"
            r"an axis \(e.g. 'index' or 'columns'\), or a data_column\n\s*"
            r"The currently defined references are: index,columns\n"
        )
        with pytest.raises(ValueError, match=invalid_reference):
            read_hdf(path, "dfq", where="A>0 or C>0")
def test_complex_series_error(setup_path):
    """Check that a complex Series cannot be indexed in table format.

    Writing with ``format="t"`` and the default index handling must raise
    ``TypeError`` with the expected explanation; writing with
    ``index=False`` must succeed and round-trip unchanged.
    """
    complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
    s = Series(complex128, index=list("abcd"))

    # Pin the message so an unrelated TypeError cannot satisfy the test.
    msg = (
        "Columns containing complex values can be stored "
        "but cannot be indexed when using table format. "
        "Either use fixed format, set index=False, "
        "or do not include the columns containing complex "
        "values to data_columns when initializing the table."
    )

    with ensure_clean_path(setup_path) as path:
        with pytest.raises(TypeError, match=msg):
            s.to_hdf(path, "obj", format="t")

    with ensure_clean_path(setup_path) as path:
        s.to_hdf(path, "obj", format="t", index=False)
        reread = read_hdf(path, "obj")
        tm.assert_series_equal(s, reread)
def test_complibs_default_settings(setup_path):
    """Check the compression filters written when settings are partly given.

    GH15943. Each scenario writes a frame, reads it back, and then inspects
    the filters on every leaf node of the written group with PyTables.
    """
    df = tm.makeDataFrame()

    def _assert_leaf_filters(h5file, group, complevel, complib):
        # Every leaf below ``group`` must carry the expected filter settings.
        for node in h5file.walk_nodes(where=group, classname="Leaf"):
            assert node.filters.complevel == complevel
            if complib is None:
                assert node.filters.complib is None
            else:
                assert node.filters.complib == complib

    # (to_hdf keyword arguments, expected complevel, expected complib)
    roundtrip_cases = [
        # Set complevel and check if complib is automatically set to
        # default value
        ({"complevel": 9}, 9, "zlib"),
        # Set complib and check to see if compression is disabled
        ({"complib": "zlib"}, 0, None),
        # Check if not setting complib or complevel results in no compression
        ({}, 0, None),
    ]
    for to_hdf_kwargs, complevel, complib in roundtrip_cases:
        with ensure_clean_path(setup_path) as tmpfile:
            df.to_hdf(tmpfile, "df", **to_hdf_kwargs)
            result = read_hdf(tmpfile, "df")
            tm.assert_frame_equal(result, df)

            with tables.open_file(tmpfile, mode="r") as h5file:
                _assert_leaf_filters(h5file, "/df", complevel, complib)

    # Check if file-defaults can be overridden on a per table basis
    with ensure_clean_path(setup_path) as tmpfile:
        # Context manager so the store is closed even if an append fails.
        with HDFStore(tmpfile) as store:
            store.append("dfc", df, complevel=9, complib="blosc")
            store.append("df", df)

        with tables.open_file(tmpfile, mode="r") as h5file:
            _assert_leaf_filters(h5file, "/df", 0, None)
            _assert_leaf_filters(h5file, "/dfc", 9, "blosc")
def test_mode(setup_path, mode):
    """Check how ``mode`` is handled by HDFStore, ``to_hdf`` and ``read_hdf``.

    ``mode`` is supplied from outside this function. For "r" and "r+" the
    target file does not exist yet, so opening or writing is expected to
    raise ``OSError``; for "w", reading is expected to raise ``ValueError``.
    """
    df = tm.makeTimeDataFrame()
    missing_file_msg = r"[\S]* does not exist"
    needs_existing_file = mode in ("r", "r+")

    # constructor
    with ensure_clean_path(setup_path) as path:
        if needs_existing_file:
            with pytest.raises(OSError, match=missing_file_msg):
                HDFStore(path, mode=mode)
        else:
            store = HDFStore(path, mode=mode)
            assert store._handle.mode == mode
            store.close()

    # context
    with ensure_clean_path(setup_path) as path:
        if needs_existing_file:
            with pytest.raises(OSError, match=missing_file_msg):
                with HDFStore(path, mode=mode) as store:
                    pass
        else:
            with HDFStore(path, mode=mode) as store:
                assert store._handle.mode == mode

    with ensure_clean_path(setup_path) as path:
        # conv write
        if needs_existing_file:
            with pytest.raises(OSError, match=missing_file_msg):
                df.to_hdf(path, "df", mode=mode)
            # Create the file so the read check below has something to open.
            df.to_hdf(path, "df", mode="w")
        else:
            df.to_hdf(path, "df", mode=mode)

        # conv read
        if mode == "w":
            read_mode_msg = (
                "mode w is not allowed while performing a read. "
                r"Allowed modes are r, r\+ and a."
            )
            with pytest.raises(ValueError, match=read_mode_msg):
                read_hdf(path, "df", mode=mode)
        else:
            result = read_hdf(path, "df", mode=mode)
            tm.assert_frame_equal(result, df)
def test_supported_for_subclass_dataframe(self):
    """Check that a subclassed DataFrame can be written to HDF.

    The frame is stored once through ``to_hdf`` and once through
    ``HDFStore.put``; both times the value read back is compared against an
    equivalent plain ``DataFrame``.
    """
    raw = {"a": [1, 2], "b": [3, 4]}
    subclassed = tm.SubclassedDataFrame(raw, dtype=np.intp)
    plain = DataFrame(raw, dtype=np.intp)

    # Written via the ``to_hdf`` method.
    with ensure_clean_path("temp.h5") as path:
        subclassed.to_hdf(path, "df")
        tm.assert_frame_equal(read_hdf(path, "df"), plain)

    # Written via an explicit store.
    with ensure_clean_path("temp.h5") as path:
        with HDFStore(path) as store:
            store.put("df", subclassed)
        tm.assert_frame_equal(read_hdf(path, "df"), plain)
def test_default_mode(setup_path):
    """Check that ``read_hdf`` without an explicit ``mode`` round-trips a frame."""
    frame = tm.makeTimeDataFrame()
    with ensure_clean_path(setup_path) as path:
        frame.to_hdf(path, "df", mode="w")
        # No ``mode`` argument: read_hdf uses its default mode.
        reloaded = read_hdf(path, "df")
        tm.assert_frame_equal(reloaded, frame)
def test_supported_for_subclass_series(self):
    """Check that a subclassed Series can be written to HDF.

    The series is stored once through ``to_hdf`` and once through
    ``HDFStore.put``; both times the value read back is compared against an
    equivalent plain ``Series``.
    """
    values = [1, 2, 3]
    subclassed = tm.SubclassedSeries(values, dtype=np.intp)
    plain = Series(values, dtype=np.intp)

    # Written via the ``to_hdf`` method.
    with ensure_clean_path("temp.h5") as path:
        subclassed.to_hdf(path, "ser")
        tm.assert_series_equal(read_hdf(path, "ser"), plain)

    # Written via an explicit store.
    with ensure_clean_path("temp.h5") as path:
        with HDFStore(path) as store:
            store.put("ser", subclassed)
        tm.assert_series_equal(read_hdf(path, "ser"), plain)
def test_append_hierarchical(setup_path):
    """Check appending and column-selecting a frame with a MultiIndex."""
    outer_labels = ["foo", "bar", "baz", "qux"]
    inner_labels = ["one", "two", "three"]
    index = MultiIndex(
        levels=[outer_labels, inner_labels],
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["foo", "bar"],
    )
    df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
    # Both column-subset reads below should match this reindexed frame.
    expected_subset = df.reindex(columns=["A", "B"])

    with ensure_clean_store(setup_path) as store:
        store.append("mi", df)
        tm.assert_frame_equal(store.select("mi"), df)

        # GH 3748
        selected = store.select("mi", columns=["A", "B"])
        tm.assert_frame_equal(selected, expected_subset)

    with ensure_clean_path("test.hdf") as path:
        df.to_hdf(path, "df", format="table")
        selected = read_hdf(path, "df", columns=["A", "B"])
        tm.assert_frame_equal(selected, expected_subset)
def test_to_hdf_multiindex_extension_dtype(idx, setup_path):
    """Check that ``to_hdf`` refuses a MultiIndex built from ``idx`` (GH 7775).

    ``idx`` is supplied from outside this function (presumably an index
    with an extension dtype, per the test name); it is used for both levels.
    """
    two_level_index = MultiIndex.from_arrays([idx, idx])
    frame = DataFrame(0, index=two_level_index, columns=["a"])
    with ensure_clean_path(setup_path) as path:
        expected_error = pytest.raises(
            NotImplementedError, match="Saving a MultiIndex"
        )
        with expected_error:
            frame.to_hdf(path, "df")
def test_retain_index_attributes2(setup_path):
    """Check which index name survives appends of frames with differing indexes.

    After writing a frame whose index is named "foo", the name is read
    back; after appending a frame whose index is named "bar", the stored
    index name is expected to be ``None``.
    """

    def _frame_on(index):
        # One column "A" holding 0..2 on the given index.
        return DataFrame({"A": Series(range(3), index=index)})

    with ensure_clean_path(setup_path) as path:
        with catch_warnings(record=True):
            hourly = _frame_on(date_range("2000-1-1", periods=3, freq="H"))
            hourly.to_hdf(path, "data", mode="w", append=True)
            daily = _frame_on(date_range("2002-1-1", periods=3, freq="D"))
            daily.to_hdf(path, "data", append=True)

            # Start over (mode="w") with a named index.
            named_idx = date_range("2000-1-1", periods=3, freq="H")
            named_idx.name = "foo"
            _frame_on(named_idx).to_hdf(path, "data", mode="w", append=True)

        assert read_hdf(path, "data").index.name == "foo"

        with catch_warnings(record=True):
            other_idx = date_range("2001-1-1", periods=3, freq="H")
            other_idx.name = "bar"
            _frame_on(other_idx).to_hdf(path, "data", append=True)

        assert read_hdf(path, "data").index.name is None
def test_hdfstore_iteritems_deprecated(setup_path):
    """Check that ``HDFStore.iteritems`` emits a ``FutureWarning``."""
    frame = DataFrame({"a": [1]})
    with ensure_clean_path(setup_path) as path:
        with HDFStore(path, mode="w") as store:
            store.put("table", frame)
            with tm.assert_produces_warning(FutureWarning):
                # Advance once so the iterator body actually runs.
                next(store.iteritems())
def test_complex_mixed_table(setup_path):
    """Check table-format storage of a frame mixing complex and other columns.

    The complex columns ("C", "D") are not listed as data columns; the
    frame is queried on data column "A" and also round-tripped in full.
    """
    ones_plus_i = [1.0 + 1.0j] * 4
    df = DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "d"],
            "C": np.array(ones_plus_i, dtype=np.complex64),
            "D": np.array(ones_plus_i, dtype=np.complex128),
            "E": [1.0, 2.0, 3.0, 4.0],
        },
        index=list("abcd"),
    )

    with ensure_clean_store(setup_path) as store:
        store.append("df", df, data_columns=["A", "B"])
        result = store.select("df", where="A>2")
        expected = df.loc[df["A"] > 2]
        tm.assert_frame_equal(expected, result)

    with ensure_clean_path(setup_path) as path:
        df.to_hdf(path, "df", format="table")
        tm.assert_frame_equal(df, read_hdf(path, "df"))
def check_default_mode():
    """Round-trip ``df`` through HDF, reading without an explicit ``mode``.

    ``setup_path`` and ``df`` are not parameters; they are taken from the
    surrounding scope.
    """
    with ensure_clean_path(setup_path) as path:
        df.to_hdf(path, "df", mode="w")
        # read_hdf uses default mode
        roundtripped = read_hdf(path, "df")
        tm.assert_frame_equal(roundtripped, df)
def test_complibs(setup_path):
    """Check every compression library/level pair round-trips (GH14478).

    For each available library and each level 0-9 the frame is written,
    read back and compared, and then the filters recorded on every leaf
    node are checked with PyTables. Level 0 is expected to record no
    compression library.
    """
    df = tm.makeDataFrame()

    # Build a filtered copy rather than calling ``.remove`` on
    # ``tables.filters.all_complibs``: that list belongs to the PyTables
    # module and mutating it would leak into everything else in the process.
    # lzo and bzip2 are skipped when not available on this platform.
    optional_libs = ("lzo", "bzip2")
    all_complibs = [
        lib
        for lib in tables.filters.all_complibs
        if lib not in optional_libs or tables.which_lib_version(lib)
    ]
    all_levels = range(0, 10)
    gname = "foo"

    for lib in all_complibs:
        for lvl in all_levels:
            with ensure_clean_path(setup_path) as tmpfile:
                # Write and read file to see if data is consistent
                df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl)
                result = read_hdf(tmpfile, gname)
                tm.assert_frame_equal(result, df)

                # Open file and check metadata for correct amount of
                # compression; the context manager closes the file even
                # when an assertion fails.
                with tables.open_file(tmpfile, mode="r") as h5table:
                    leaves = h5table.walk_nodes(
                        where="/" + gname, classname="Leaf"
                    )
                    for node in leaves:
                        assert node.filters.complevel == lvl
                        if lvl == 0:
                            assert node.filters.complib is None
                        else:
                            assert node.filters.complib == lib
def test_complex_series_error(setup_path):
    """Check the error for indexing a complex Series in table format.

    With the default index handling the write must raise ``TypeError``
    carrying the explanatory message; with ``index=False`` the series is
    written and read back unchanged.
    """
    series = Series(np.array([1.0 + 1.0j] * 4), index=list("abcd"))

    expected_message = (
        "Columns containing complex values can be stored "
        "but cannot be indexed when using table format. "
        "Either use fixed format, set index=False, "
        "or do not include the columns containing complex "
        "values to data_columns when initializing the table."
    )

    # Indexed write: rejected.
    with ensure_clean_path(setup_path) as path:
        with pytest.raises(TypeError, match=expected_message):
            series.to_hdf(path, "obj", format="t")

    # Unindexed write: accepted and round-trips.
    with ensure_clean_path(setup_path) as path:
        series.to_hdf(path, "obj", format="t", index=False)
        tm.assert_series_equal(series, read_hdf(path, "obj"))
def test_read_nokey_empty(setup_path):
    """Check that ``read_hdf`` without a key fails on a store with no data."""
    expected_message = re.escape(
        "Dataset(s) incompatible with Pandas data types, not table, or no "
        "datasets found in HDF5 file."
    )
    with ensure_clean_path(setup_path) as path:
        # Open and immediately close a store so the file exists but is empty.
        HDFStore(path).close()
        with pytest.raises(ValueError, match=expected_message):
            read_hdf(path)
def test_read_hdf_series_mode_r(format, setup_path):
    """Check reading a stored Series with an explicit ``mode="r"`` (GH 16583).

    ``format`` is supplied from outside this function and is passed straight
    through to ``to_hdf``.
    """
    original = tm.makeFloatSeries()
    with ensure_clean_path(setup_path) as path:
        original.to_hdf(path, key="data", format=format)
        reloaded = pd.read_hdf(path, key="data", mode="r")
    tm.assert_series_equal(reloaded, original)
def test_format_type(setup_path):
    """Check that each storer reports the format it was written with."""
    frame = DataFrame({"A": [1, 2]})
    # key -> format used when putting the frame under that key
    format_by_key = {"a": "fixed", "b": "table"}

    with ensure_clean_path(setup_path) as path, HDFStore(path) as store:
        # Write everything first, then inspect the storers.
        for key, fmt in format_by_key.items():
            store.put(key, frame, format=fmt)
        for key, fmt in format_by_key.items():
            assert store.get_storer(key).format_type == fmt
def test_select_empty_where(where):
    """Check that the given ``where`` value selects the whole frame (GH26610).

    ``where`` is supplied from outside this function; the read goes through
    the still-open store object rather than the file path.
    """
    frame = DataFrame([1, 2, 3])
    with ensure_clean_path("empty_where.h5") as path, HDFStore(path) as store:
        store.put("df", frame, "t")
        selected = read_hdf(store, "df", where=where)
        tm.assert_frame_equal(selected, frame)
def test_to_hdf_errors(format, setup_path):
    """Check the ``errors`` option with a lone surrogate in the data (GH 20835).

    The same string is used for both the values and the index; it is
    written and read back with ``errors="surrogatepass"``.
    """
    values = ["\ud800foo"]
    original = Series(values, index=Index(values))
    with ensure_clean_path(setup_path) as path:
        original.to_hdf(path, "table", format=format, errors="surrogatepass")
        reloaded = read_hdf(path, "table", errors="surrogatepass")
        tm.assert_series_equal(reloaded, original)
def test_round_trip_equals(setup_path):
    """Check that a table round-trip compares equal in both directions (GH 9330)."""
    original = DataFrame({"B": [1, 2], "A": ["x", "y"]})
    with ensure_clean_path(setup_path) as path:
        original.to_hdf(path, "df", format="table")
        reloaded = read_hdf(path, "df")
        tm.assert_frame_equal(original, reloaded)
        # ``equals`` must hold whichever side it is called on.
        for left, right in ((original, reloaded), (reloaded, original)):
            assert left.equals(right)
def test_read_missing_key_close_store(setup_path):
    """Check that a failed read of a missing key does not block later writes.

    GH 25766.
    """
    frame = DataFrame({"a": range(2), "b": range(2)})
    with ensure_clean_path(setup_path) as path:
        frame.to_hdf(path, "k1")

        missing_key = pytest.raises(
            KeyError, match="'No object named k2 in the file'"
        )
        with missing_key:
            pd.read_hdf(path, "k2")

        # smoke test to test that file is properly closed after
        # read with KeyError before another write
        frame.to_hdf(path, "k2")
def test_complex_fixed(setup_path):
    """Check that complex64 and complex128 frames round-trip via ``to_hdf``.

    No ``format`` is passed to ``to_hdf``, so its default is used.
    """
    for complex_dtype in (np.complex64, np.complex128):
        values = np.random.rand(4, 5).astype(complex_dtype)
        frame = DataFrame(values, index=list("abcd"), columns=list("ABCDE"))
        with ensure_clean_path(setup_path) as path:
            frame.to_hdf(path, "df")
            tm.assert_frame_equal(frame, read_hdf(path, "df"))
def test_convert_value(setup_path, where: str, df: DataFrame, expected: DataFrame):
    """Check ``where`` filtering on a categorical column (GH39420).

    ``where``, ``df`` and ``expected`` are supplied from outside this
    function. Both frames have their ``col`` column converted to
    categorical, and ``expected`` is given the sorted unique values of
    ``df.col`` as its categories before comparison.
    """
    df.col = df.col.astype("category")
    categories = sorted(df.col.unique())

    # Give ``expected`` the same categories as the written frame.
    expected.col = expected.col.astype("category")
    expected.col = expected.col.cat.set_categories(categories)

    with ensure_clean_path(setup_path) as path:
        df.to_hdf(path, "df", format="table", min_itemsize={"col": 1})
        # No key is passed to read_hdf here.
        filtered = read_hdf(path, where=where)
    tm.assert_frame_equal(filtered, expected)
def test_complex_across_dimensions_fixed(setup_path):
    """Check fixed-format round-trips of a complex Series and DataFrame."""
    with catch_warnings(record=True):
        series = Series(np.array([1.0 + 1.0j] * 4), index=list("abcd"))
        frame = DataFrame({"A": series, "B": series})

        # Each object is paired with the assertion helper for its type.
        cases = [
            (series, tm.assert_series_equal),
            (frame, tm.assert_frame_equal),
        ]
        for obj, assert_equal in cases:
            with ensure_clean_path(setup_path) as path:
                obj.to_hdf(path, "obj", format="fixed")
                assert_equal(obj, read_hdf(path, "obj"))
def test_read_with_where_tz_aware_index(setup_path):
    """Check a ``where`` filter on a tz-aware MultiIndex level (GH 11926).

    The filter date precedes every stored date, so the whole frame is
    expected back.
    """
    n_rows = 10
    dates = pd.date_range("20151201", periods=n_rows, freq="D", tz="UTC")
    index = pd.MultiIndex.from_arrays([dates, range(n_rows)], names=["DATE", "NO"])
    expected = pd.DataFrame({"MYCOL": 0}, index=index)

    with ensure_clean_path(setup_path) as path:
        with pd.HDFStore(path) as store:
            store.append("mykey", expected, format="table", append=True)
        result = pd.read_hdf(path, "mykey", where="DATE > 20151130")
        tm.assert_frame_equal(result, expected)
def test_read_from_pathlib_path(setup_path):
    """Check that ``to_hdf`` and ``read_hdf`` accept a ``pathlib.Path`` (GH11773)."""
    values = np.random.rand(4, 5)
    expected = DataFrame(values, index=list("abcd"), columns=list("ABCDE"))
    with ensure_clean_path(setup_path) as filename:
        # Wrap the file name so both calls receive a Path object.
        target = Path(filename)
        expected.to_hdf(target, "df", mode="a")
        actual = read_hdf(target, "df")
    tm.assert_frame_equal(expected, actual)