def check(format, index): df = DataFrame(np.random.randn(10, 2), columns=list("AB")) df.index = index(len(df)) _maybe_remove(store, "df") store.put("df", df, format=format) tm.assert_frame_equal(df, store["df"])
def test_create_table_index(setup_path): with ensure_clean_store(setup_path) as store: with catch_warnings(record=True): def col(t, column): return getattr(store.get_storer(t).table.cols, column) # data columns df = tm.makeTimeDataFrame() df["string"] = "foo" df["string2"] = "bar" store.append("f", df, data_columns=["string", "string2"]) assert col("f", "index").is_indexed is True assert col("f", "string").is_indexed is True assert col("f", "string2").is_indexed is True # specify index=columns store.append("f2", df, index=["string"], data_columns=["string", "string2"]) assert col("f2", "index").is_indexed is False assert col("f2", "string").is_indexed is True assert col("f2", "string2").is_indexed is False # try to index a non-table _maybe_remove(store, "f2") store.put("f2", df) msg = "cannot create table index on a Fixed format store" with pytest.raises(TypeError, match=msg): store.create_table_index("f2")
def test_put(setup_path): with ensure_clean_store(setup_path) as store: ts = tm.makeTimeSeries() df = tm.makeTimeDataFrame() store["a"] = ts store["b"] = df[:10] store["foo/bar/bah"] = df[:10] store["foo"] = df[:10] store["/foo"] = df[:10] store.put("c", df[:10], format="table") # not OK, not a table msg = "Can only append to Tables" with pytest.raises(ValueError, match=msg): store.put("b", df[10:], append=True) # node does not currently exist, test _is_table_type returns False # in this case _maybe_remove(store, "f") with pytest.raises(ValueError, match=msg): store.put("f", df[10:], append=True) # can't put to a table (use append instead) with pytest.raises(ValueError, match=msg): store.put("c", df[10:], append=True) # overwrite table store.put("c", df[:10], format="table", append=False) tm.assert_frame_equal(df[:10], store["c"])
def test_put_mixed_type(setup_path): df = tm.makeTimeDataFrame() df["obj1"] = "foo" df["obj2"] = "bar" df["bool1"] = df["A"] > 0 df["bool2"] = df["B"] > 0 df["bool3"] = True df["int1"] = 1 df["int2"] = 2 df["timestamp1"] = Timestamp("20010102") df["timestamp2"] = Timestamp("20010103") df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0) df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0) df.loc[df.index[3:6], ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") # PerformanceWarning with catch_warnings(record=True): simplefilter("ignore", pd.errors.PerformanceWarning) store.put("df", df) expected = store.get("df") tm.assert_frame_equal(expected, df)
def test_timezones_fixed_format_frame_non_empty(setup_path): with ensure_clean_store(setup_path) as store: # index rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern") rng = rng._with_freq(None) # freq doesnt round-trip df = DataFrame(np.random.randn(len(rng), 4), index=rng) store["df"] = df result = store["df"] tm.assert_frame_equal(result, df) # as data # GH11411 _maybe_remove(store, "df") df = DataFrame( { "A": rng, "B": rng.tz_convert("UTC").tz_localize(None), "C": rng.tz_convert("CET"), "D": range(len(rng)), }, index=rng, ) store["df"] = df result = store["df"] tm.assert_frame_equal(result, df)
def test_append_frame_column_oriented(setup_path): with ensure_clean_store(setup_path) as store: # column oriented df = tm.makeTimeDataFrame() df.index = df.index._with_freq(None) # freq doesnt round-trip _maybe_remove(store, "df1") store.append("df1", df.iloc[:, :2], axes=["columns"]) store.append("df1", df.iloc[:, 2:]) tm.assert_frame_equal(store["df1"], df) result = store.select("df1", "columns=A") expected = df.reindex(columns=["A"]) tm.assert_frame_equal(expected, result) # selection on the non-indexable result = store.select("df1", ("columns=A", "index=df.index[0:4]")) expected = df.reindex(columns=["A"], index=df.index[0:4]) tm.assert_frame_equal(expected, result) # this isn't supported msg = re.escape( "passing a filterable condition to a non-table indexer " "[Filter: Not Initialized]") with pytest.raises(TypeError, match=msg): store.select("df1", "columns=A and index>df.index[4]")
def test_versioning(setup_path): with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeSeries() store["b"] = tm.makeDataFrame() df = tm.makeTimeDataFrame() _maybe_remove(store, "df1") store.append("df1", df[:10]) store.append("df1", df[10:]) assert store.root.a._v_attrs.pandas_version == "0.15.2" assert store.root.b._v_attrs.pandas_version == "0.15.2" assert store.root.df1._v_attrs.pandas_version == "0.15.2" # write a file and wipe its versioning _maybe_remove(store, "df2") store.append("df2", df) # this is an error because its table_type is appendable, but no # version info store.get_node("df2")._v_attrs.pandas_version = None msg = "'NoneType' object has no attribute 'startswith'" with pytest.raises(Exception, match=msg): store.select("df2")
def test_select_iterator_many_empty_frames(setup_path): # GH 8014 # using iterator and where clause can return many empty # frames. chunksize = 10_000 # with iterator, range limited to the first chunk with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100000, "S") _maybe_remove(store, "df") store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[chunksize - 1] # select w/iterator and where clause, single term, begin of range where = f"index >= '{beg_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = f"index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) assert len(results) == 1 result = concat(results) rexpected = expected[expected.index <= end_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, inclusive range where = f"index >= '{beg_dt}' & index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) # should be 1, is 10 assert len(results) == 1 result = concat(results) rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause which selects # *nothing*. # # To be consistent with Python idiom I suggest this should # return [] e.g. `for e in []: print True` never prints # True. where = f"index <= '{beg_dt}' & index >= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) # should be [] assert len(results) == 0
def test_store_index_types(setup_path, format, index): # GH5386 # test storing various index types with ensure_clean_store(setup_path) as store: df = DataFrame(np.random.randn(10, 2), columns=list("AB")) df.index = index(len(df)) _maybe_remove(store, "df") store.put("df", df, format=format) tm.assert_frame_equal(df, store["df"])
def test_retain_index_attributes(setup_path): # GH 3499, losing frequency info on index recreation df = DataFrame( {"A": Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))} ) with ensure_clean_store(setup_path) as store: _maybe_remove(store, "data") store.put("data", df, format="table") result = store.get("data") tm.assert_frame_equal(df, result) for attr in ["freq", "tz", "name"]: for idx in ["index", "columns"]: assert getattr(getattr(df, idx), attr, None) == getattr( getattr(result, idx), attr, None ) # try to append a table with a different frequency with catch_warnings(record=True): df2 = DataFrame( { "A": Series( range(3), index=date_range("2002-1-1", periods=3, freq="D") ) } ) store.append("data", df2) assert store.get_storer("data").info["index"]["freq"] is None # this is ok _maybe_remove(store, "df2") df2 = DataFrame( { "A": Series( range(3), index=[ Timestamp("20010101"), Timestamp("20010102"), Timestamp("20020101"), ], ) } ) store.append("df2", df2) df3 = DataFrame( {"A": Series(range(3), index=date_range("2002-1-1", periods=3, freq="D"))} ) store.append("df2", df3)
def test_encoding(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame({"A": "foo", "B": "bar"}, index=range(5)) df.loc[2, "A"] = np.nan df.loc[3, "B"] = np.nan _maybe_remove(store, "df") store.append("df", df, encoding="ascii") tm.assert_frame_equal(store["df"], df) expected = df.reindex(columns=["A"]) result = store.select("df", Term("columns=A", encoding="ascii")) tm.assert_frame_equal(result, expected)
def test_select_iterator_non_complete_8014(setup_path): # GH 8014 # using iterator and where clause chunksize = 1e4 # with iterator, non complete range with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") store.append("df", expected) beg_dt = expected.index[1] end_dt = expected.index[-2] # select w/iterator and where clause, single term, begin of range where = f"index >= '{beg_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = f"index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[expected.index <= end_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, inclusive range where = f"index >= '{beg_dt}' & index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)] tm.assert_frame_equal(rexpected, result) # with iterator, empty where with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") store.append("df", expected) end_dt = expected.index[-1] # select w/iterator and where clause, single term, begin of range where = f"index > '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) assert 0 == len(results)
def test_select_with_many_inputs(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame({ "ts": bdate_range("2012-01-01", periods=300), "A": np.random.randn(300), "B": range(300), "users": ["a"] * 50 + ["b"] * 50 + ["c"] * 100 + [f"a{i:03d}" for i in range(100)], }) _maybe_remove(store, "df") store.append("df", df, data_columns=["ts", "A", "B", "users"]) # regular select result = store.select("df", "ts>=Timestamp('2012-02-01')") expected = df[df.ts >= Timestamp("2012-02-01")] tm.assert_frame_equal(expected, result) # small selector result = store.select( "df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']") expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"])] tm.assert_frame_equal(expected, result) # big selector along the columns selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)] result = store.select( "df", "ts>=Timestamp('2012-02-01') and users=selector") expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)] tm.assert_frame_equal(expected, result) selector = range(100, 200) result = store.select("df", "B=selector") expected = df[df.B.isin(selector)] tm.assert_frame_equal(expected, result) assert len(result) == 100 # big selector along the index selector = Index(df.ts[0:100].values) result = store.select("df", "ts=selector") expected = df[df.ts.isin(selector.values)] tm.assert_frame_equal(expected, result) assert len(result) == 100
def test_append_with_timedelta(setup_path): # GH 3577 # append timedelta df = DataFrame( { "A": Timestamp("20130101"), "B": [ Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10) ], } ) df["C"] = df["A"] - df["B"] df.loc[3:5, "C"] = np.nan with ensure_clean_store(setup_path) as store: # table _maybe_remove(store, "df") store.append("df", df, data_columns=True) result = store.select("df") tm.assert_frame_equal(result, df) result = store.select("df", where="C<100000") tm.assert_frame_equal(result, df) result = store.select("df", where="C<pd.Timedelta('-3D')") tm.assert_frame_equal(result, df.iloc[3:]) result = store.select("df", "C<'-3D'") tm.assert_frame_equal(result, df.iloc[3:]) # a bit hacky here as we don't really deal with the NaT properly result = store.select("df", "C<'-500000s'") result = result.dropna(subset=["C"]) tm.assert_frame_equal(result, df.iloc[6:]) result = store.select("df", "C<'-3.5D'") result = result.iloc[1:] tm.assert_frame_equal(result, df.iloc[4:]) # fixed _maybe_remove(store, "df2") store.put("df2", df) result = store.select("df2") tm.assert_frame_equal(result, df)
def test_dst_transitions(setup_path): # make sure we are not failing on transitions with ensure_clean_store(setup_path) as store: times = pd.date_range( "2013-10-26 23:00", "2013-10-27 01:00", tz="Europe/London", freq="H", ambiguous="infer", ) for i in [times, times + pd.Timedelta("10min")]: _maybe_remove(store, "df") df = DataFrame({"A": range(len(i)), "B": i}, index=i) store.append("df", df) result = store.select("df") tm.assert_frame_equal(result, df)
def test_append_with_timezones_as_index(setup_path, gettz): # GH#4098 example dti = date_range("2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern")) dti = dti._with_freq(None) # freq doesn't round-trip df = DataFrame({"A": Series(range(3), index=dti)}) with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") store.put("df", df) result = store.select("df") tm.assert_frame_equal(result, df) _maybe_remove(store, "df") store.append("df", df) result = store.select("df") tm.assert_frame_equal(result, df)
def test_remove(setup_path): with ensure_clean_store(setup_path) as store: ts = tm.makeTimeSeries() df = tm.makeDataFrame() store["a"] = ts store["b"] = df _maybe_remove(store, "a") assert len(store) == 1 tm.assert_frame_equal(df, store["b"]) _maybe_remove(store, "b") assert len(store) == 0 # nonexistence with pytest.raises( KeyError, match="'No object named a_nonexistent_store in the file'"): store.remove("a_nonexistent_store") # pathing store["a"] = ts store["b/foo"] = df _maybe_remove(store, "foo") _maybe_remove(store, "b/foo") assert len(store) == 1 store["a"] = ts store["b/foo"] = df _maybe_remove(store, "b") assert len(store) == 1 # __delitem__ store["a"] = ts store["b"] = df del store["a"] del store["b"] assert len(store) == 0
def test_append_some_nans(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( { "A": Series(np.random.randn(20)).astype("int32"), "A1": np.random.randn(20), "A2": np.random.randn(20), "B": "foo", "C": "bar", "D": Timestamp("20010101"), "E": datetime.datetime(2001, 1, 2, 0, 0), }, index=np.arange(20), ) # some nans _maybe_remove(store, "df1") df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan store.append("df1", df[:10]) store.append("df1", df[10:]) tm.assert_frame_equal(store["df1"], df) # first column df1 = df.copy() df1.loc[:, "A1"] = np.nan _maybe_remove(store, "df1") store.append("df1", df1[:10]) store.append("df1", df1[10:]) tm.assert_frame_equal(store["df1"], df1) # 2nd column df2 = df.copy() df2.loc[:, "A2"] = np.nan _maybe_remove(store, "df2") store.append("df2", df2[:10]) store.append("df2", df2[10:]) tm.assert_frame_equal(store["df2"], df2) # datetimes df3 = df.copy() df3.loc[:, "E"] = np.nan _maybe_remove(store, "df3") store.append("df3", df3[:10]) store.append("df3", df3[10:]) tm.assert_frame_equal(store["df3"], df3)
def test_select(setup_path): with ensure_clean_store(setup_path) as store: with catch_warnings(record=True): # select with columns= df = tm.makeTimeDataFrame() _maybe_remove(store, "df") store.append("df", df) result = store.select("df", columns=["A", "B"]) expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # equivalently result = store.select("df", [("columns=['A', 'B']")]) expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # with a data column _maybe_remove(store, "df") store.append("df", df, data_columns=["A"]) result = store.select("df", ["A > 0"], columns=["A", "B"]) expected = df[df.A > 0].reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # all a data columns _maybe_remove(store, "df") store.append("df", df, data_columns=True) result = store.select("df", ["A > 0"], columns=["A", "B"]) expected = df[df.A > 0].reindex(columns=["A", "B"]) tm.assert_frame_equal(expected, result) # with a data column, but different columns _maybe_remove(store, "df") store.append("df", df, data_columns=["A"]) result = store.select("df", ["A > 0"], columns=["C", "D"]) expected = df[df.A > 0].reindex(columns=["C", "D"]) tm.assert_frame_equal(expected, result)
def test_api_default_format(setup_path): # default_format option with ensure_clean_store(setup_path) as store: df = tm.makeDataFrame() pd.set_option("io.hdf.default_format", "fixed") _maybe_remove(store, "df") store.put("df", df) assert not store.get_storer("df").is_table msg = "Can only append to Tables" with pytest.raises(ValueError, match=msg): store.append("df2", df) pd.set_option("io.hdf.default_format", "table") _maybe_remove(store, "df") store.put("df", df) assert store.get_storer("df").is_table _maybe_remove(store, "df2") store.append("df2", df) assert store.get_storer("df").is_table pd.set_option("io.hdf.default_format", None) with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() pd.set_option("io.hdf.default_format", "fixed") df.to_hdf(path, "df") with HDFStore(path) as store: assert not store.get_storer("df").is_table with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df2", append=True) pd.set_option("io.hdf.default_format", "table") df.to_hdf(path, "df3") with HDFStore(path) as store: assert store.get_storer("df3").is_table df.to_hdf(path, "df4", append=True) with HDFStore(path) as store: assert store.get_storer("df4").is_table pd.set_option("io.hdf.default_format", None)
def test_select_iterator(setup_path): # single table with ensure_clean_store(setup_path) as store: df = tm.makeTimeDataFrame(500) _maybe_remove(store, "df") store.append("df", df) expected = store.select("df") results = list(store.select("df", iterator=True)) result = concat(results) tm.assert_frame_equal(expected, result) results = list(store.select("df", chunksize=100)) assert len(results) == 5 result = concat(results) tm.assert_frame_equal(expected, result) results = list(store.select("df", chunksize=150)) result = concat(results) tm.assert_frame_equal(result, expected) with ensure_clean_path(setup_path) as path: df = tm.makeTimeDataFrame(500) df.to_hdf(path, "df_non_table") msg = "can only use an iterator or chunksize on a table" with pytest.raises(TypeError, match=msg): read_hdf(path, "df_non_table", chunksize=100) with pytest.raises(TypeError, match=msg): read_hdf(path, "df_non_table", iterator=True) with ensure_clean_path(setup_path) as path: df = tm.makeTimeDataFrame(500) df.to_hdf(path, "df", format="table") results = list(read_hdf(path, "df", chunksize=100)) result = concat(results) assert len(results) == 5 tm.assert_frame_equal(result, df) tm.assert_frame_equal(result, read_hdf(path, "df")) # multiple with ensure_clean_store(setup_path) as store: df1 = tm.makeTimeDataFrame(500) store.append("df1", df1, data_columns=True) df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format) df2["foo"] = "bar" store.append("df2", df2) df = concat([df1, df2], axis=1) # full selection expected = store.select_as_multiple(["df1", "df2"], selector="df1") results = list( store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150)) result = concat(results) tm.assert_frame_equal(expected, result)
def test_append_with_data_columns(setup_path): with ensure_clean_store(setup_path) as store: df = tm.makeTimeDataFrame() df.iloc[0, df.columns.get_loc("B")] = 1.0 _maybe_remove(store, "df") store.append("df", df[:2], data_columns=["B"]) store.append("df", df[2:]) tm.assert_frame_equal(store["df"], df) # check that we have indices created assert store._handle.root.df.table.cols.index.is_indexed is True assert store._handle.root.df.table.cols.B.is_indexed is True # data column searching result = store.select("df", "B>0") expected = df[df.B > 0] tm.assert_frame_equal(result, expected) # data column searching (with an indexable and a data_columns) result = store.select("df", "B>0 and index>df.index[3]") df_new = df.reindex(index=df.index[4:]) expected = df_new[df_new.B > 0] tm.assert_frame_equal(result, expected) # data column selection with a string data_column df_new = df.copy() df_new["string"] = "foo" df_new.loc[df_new.index[1:4], "string"] = np.nan df_new.loc[df_new.index[5:6], "string"] = "bar" _maybe_remove(store, "df") store.append("df", df_new, data_columns=["string"]) result = store.select("df", "string='foo'") expected = df_new[df_new.string == "foo"] tm.assert_frame_equal(result, expected) # using min_itemsize and a data column def check_col(key, name, size): assert ( getattr(store.get_storer(key).table.description, name).itemsize == size ) with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") store.append("df", df_new, data_columns=["string"], min_itemsize={"string": 30}) check_col("df", "string", 30) _maybe_remove(store, "df") store.append("df", df_new, data_columns=["string"], min_itemsize=30) check_col("df", "string", 30) _maybe_remove(store, "df") store.append("df", df_new, data_columns=["string"], min_itemsize={"values": 30}) check_col("df", "string", 30) with ensure_clean_store(setup_path) as store: df_new["string2"] = "foobarbah" df_new["string_block1"] = "foobarbah1" df_new["string_block2"] = "foobarbah2" _maybe_remove(store, "df") store.append( "df", df_new, data_columns=["string", "string2"], min_itemsize={"string": 30, "string2": 40, "values": 50}, ) check_col("df", "string", 30) check_col("df", "string2", 40) check_col("df", "values_block_1", 50) with ensure_clean_store(setup_path) as store: # multiple data columns df_new = df.copy() df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0 df_new["string"] = "foo" sl = df_new.columns.get_loc("string") df_new.iloc[1:4, sl] = np.nan df_new.iloc[5:6, sl] = "bar" df_new["string2"] = "foo" sl = df_new.columns.get_loc("string2") df_new.iloc[2:5, sl] = np.nan df_new.iloc[7:8, sl] = "bar" _maybe_remove(store, "df") store.append("df", df_new, data_columns=["A", "B", "string", "string2"]) result = store.select("df", "string='foo' and string2='foo' and A>0 and B<0") expected = df_new[ (df_new.string == "foo") & (df_new.string2 == "foo") & (df_new.A > 0) & (df_new.B < 0) ] tm.assert_frame_equal(result, expected, check_freq=False) # FIXME: 2020-05-07 freq check randomly fails in the CI # yield an empty frame result = store.select("df", "string='foo' and string2='cool'") expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] tm.assert_frame_equal(result, expected) with ensure_clean_store(setup_path) as store: # doc example df_dc = df.copy() df_dc["string"] = "foo" df_dc.loc[df_dc.index[4:6], "string"] = np.nan df_dc.loc[df_dc.index[7:9], "string"] = "bar" df_dc["string2"] = "cool" df_dc["datetime"] = Timestamp("20010102") df_dc = df_dc._convert(datetime=True) df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan _maybe_remove(store, "df_dc") store.append( "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"] ) result = store.select("df_dc", "B>0") expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected) result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"]) expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] tm.assert_frame_equal(result, expected, check_freq=False) # FIXME: 2020-12-07 intermittent build failures here with freq of # None instead of BDay(4) with ensure_clean_store(setup_path) as store: # doc example part 2 np.random.seed(1234) index = date_range("1/1/2000", periods=8) df_dc = DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"]) df_dc["string"] = "foo" df_dc.loc[df_dc.index[4:6], "string"] = np.nan df_dc.loc[df_dc.index[7:9], "string"] = "bar" df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs() df_dc["string2"] = "cool" # on-disk operations store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) result = store.select("df_dc", "B>0") expected = df_dc[df_dc.B > 0] tm.assert_frame_equal(result, expected) result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"']) expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] tm.assert_frame_equal(result, expected)
def test_append_with_strings(setup_path): with ensure_clean_store(setup_path) as store: with catch_warnings(record=True): def check_col(key, name, size): assert ( getattr(store.get_storer(key).table.description, name).itemsize == size ) # avoid truncation on elements df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) store.append("df_big", df) tm.assert_frame_equal(store.select("df_big"), df) check_col("df_big", "values_block_1", 15) # appending smaller string ok df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]]) store.append("df_big", df2) expected = concat([df, df2]) tm.assert_frame_equal(store.select("df_big"), expected) check_col("df_big", "values_block_1", 15) # avoid truncation on elements df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]]) store.append("df_big2", df, min_itemsize={"values": 50}) tm.assert_frame_equal(store.select("df_big2"), df) check_col("df_big2", "values_block_1", 50) # bigger string on next append store.append("df_new", df) df_new = DataFrame( [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]] ) msg = ( r"Trying to store a string with len \[26\] in " r"\[values_block_1\] column but\n" r"this column has a limit of \[15\]!\n" "Consider using min_itemsize to preset the sizes on these " "columns" ) with pytest.raises(ValueError, match=msg): store.append("df_new", df_new) # min_itemsize on Series index (GH 11412) df = tm.makeMixedDataFrame().set_index("C") store.append("ss", df["B"], min_itemsize={"index": 4}) tm.assert_series_equal(store.select("ss"), df["B"]) # same as above, with data_columns=True store.append("ss2", df["B"], data_columns=True, min_itemsize={"index": 4}) tm.assert_series_equal(store.select("ss2"), df["B"]) # min_itemsize in index without appending (GH 10381) store.put("ss3", df, format="table", min_itemsize={"index": 6}) # just make sure there is a longer string: df2 = df.copy().reset_index().assign(C="longer").set_index("C") store.append("ss3", df2) tm.assert_frame_equal(store.select("ss3"), concat([df, df2])) # same as above, with a Series store.put("ss4", df["B"], format="table", min_itemsize={"index": 6}) store.append("ss4", df2["B"]) tm.assert_series_equal(store.select("ss4"), concat([df["B"], df2["B"]])) # with nans _maybe_remove(store, "df") df = tm.makeTimeDataFrame() df["string"] = "foo" df.loc[df.index[1:4], "string"] = np.nan df["string2"] = "bar" df.loc[df.index[4:8], "string2"] = np.nan df["string3"] = "bah" df.loc[df.index[1:], "string3"] = np.nan store.append("df", df) result = store.select("df") tm.assert_frame_equal(result, df) with ensure_clean_store(setup_path) as store: def check_col(key, name, size): assert getattr(store.get_storer(key).table.description, name).itemsize, size df = DataFrame({"A": "foo", "B": "bar"}, index=range(10)) # a min_itemsize that creates a data_column _maybe_remove(store, "df") store.append("df", df, min_itemsize={"A": 200}) check_col("df", "A", 200) assert store.get_storer("df").data_columns == ["A"] # a min_itemsize that creates a data_column2 _maybe_remove(store, "df") store.append("df", df, data_columns=["B"], min_itemsize={"A": 200}) check_col("df", "A", 200) assert store.get_storer("df").data_columns == ["B", "A"] # a min_itemsize that creates a data_column2 _maybe_remove(store, "df") store.append("df", df, data_columns=["B"], min_itemsize={"values": 200}) check_col("df", "B", 200) check_col("df", "values_block_0", 200) assert store.get_storer("df").data_columns == ["B"] # infer the .typ on subsequent appends _maybe_remove(store, "df") store.append("df", df[:5], min_itemsize=200) store.append("df", df[5:], min_itemsize=200) tm.assert_frame_equal(store["df"], df) # invalid min_itemsize keys df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"]) _maybe_remove(store, "df") msg = re.escape( "min_itemsize has the key [foo] which is not an axis or data_column" ) with pytest.raises(ValueError, match=msg): store.append("df", df, min_itemsize={"foo": 20, "foobar": 20})
def test_append(setup_path): with ensure_clean_store(setup_path) as store: # this is allowed by almost always don't want to do it # tables.NaturalNameWarning): with catch_warnings(record=True): df = tm.makeTimeDataFrame() _maybe_remove(store, "df1") store.append("df1", df[:10]) store.append("df1", df[10:]) tm.assert_frame_equal(store["df1"], df) _maybe_remove(store, "df2") store.put("df2", df[:10], format="table") store.append("df2", df[10:]) tm.assert_frame_equal(store["df2"], df) _maybe_remove(store, "df3") store.append("/df3", df[:10]) store.append("/df3", df[10:]) tm.assert_frame_equal(store["df3"], df) # this is allowed by almost always don't want to do it # tables.NaturalNameWarning _maybe_remove(store, "/df3 foo") store.append("/df3 foo", df[:10]) store.append("/df3 foo", df[10:]) tm.assert_frame_equal(store["df3 foo"], df) # dtype issues - mizxed type in a single object column df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]]) df["mixed_column"] = "testing" df.loc[2, "mixed_column"] = np.nan _maybe_remove(store, "df") store.append("df", df) tm.assert_frame_equal(store["df"], df) # uints - test storage of uints uint_data = DataFrame( { "u08": Series( np.random.randint(0, high=255, size=5), dtype=np.uint8 ), "u16": Series( np.random.randint(0, high=65535, size=5), dtype=np.uint16 ), "u32": Series( np.random.randint(0, high=2 ** 30, size=5), dtype=np.uint32 ), "u64": Series( [2 ** 58, 2 ** 59, 2 ** 60, 2 ** 61, 2 ** 62], dtype=np.uint64, ), }, index=np.arange(5), ) _maybe_remove(store, "uints") store.append("uints", uint_data) tm.assert_frame_equal(store["uints"], uint_data) # uints - test storage of uints in indexable columns _maybe_remove(store, "uints") # 64-bit indices not yet supported store.append("uints", uint_data, data_columns=["u08", "u16", "u32"]) tm.assert_frame_equal(store["uints"], uint_data)
def test_append_all_nans(setup_path): with ensure_clean_store(setup_path) as store: df = DataFrame( {"A1": np.random.randn(20), "A2": np.random.randn(20)}, index=np.arange(20), ) df.loc[0:15, :] = np.nan # nan some entire rows (dropna=True) _maybe_remove(store, "df") store.append("df", df[:10], dropna=True) store.append("df", df[10:], dropna=True) tm.assert_frame_equal(store["df"], df[-4:]) # nan some entire rows (dropna=False) _maybe_remove(store, "df2") store.append("df2", df[:10], dropna=False) store.append("df2", df[10:], dropna=False) tm.assert_frame_equal(store["df2"], df) # tests the option io.hdf.dropna_table pd.set_option("io.hdf.dropna_table", False) _maybe_remove(store, "df3") store.append("df3", df[:10]) store.append("df3", df[10:]) tm.assert_frame_equal(store["df3"], df) pd.set_option("io.hdf.dropna_table", True) _maybe_remove(store, "df4") store.append("df4", df[:10]) store.append("df4", df[10:]) tm.assert_frame_equal(store["df4"], df[-4:]) # nan some entire rows (string are still written!) df = DataFrame( { "A1": np.random.randn(20), "A2": np.random.randn(20), "B": "foo", "C": "bar", }, index=np.arange(20), ) df.loc[0:15, :] = np.nan _maybe_remove(store, "df") store.append("df", df[:10], dropna=True) store.append("df", df[10:], dropna=True) tm.assert_frame_equal(store["df"], df) _maybe_remove(store, "df2") store.append("df2", df[:10], dropna=False) store.append("df2", df[10:], dropna=False) tm.assert_frame_equal(store["df2"], df) # nan some entire rows (but since we have dates they are still # written!) df = DataFrame( { "A1": np.random.randn(20), "A2": np.random.randn(20), "B": "foo", "C": "bar", "D": Timestamp("20010101"), "E": datetime.datetime(2001, 1, 2, 0, 0), }, index=np.arange(20), ) df.loc[0:15, :] = np.nan _maybe_remove(store, "df") store.append("df", df[:10], dropna=True) store.append("df", df[10:], dropna=True) tm.assert_frame_equal(store["df"], df) _maybe_remove(store, "df2") store.append("df2", df[:10], dropna=False) store.append("df2", df[10:], dropna=False) tm.assert_frame_equal(store["df2"], df)
def test_append_with_timezones_dateutil(setup_path): from datetime import timedelta # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows # filename issues. from pandas._libs.tslibs.timezones import maybe_get_tz gettz = lambda x: maybe_get_tz("dateutil/" + x) # as columns with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df_tz") df = DataFrame( dict(A=[ Timestamp("20130102 2:00:00", tz=gettz("US/Eastern")) + timedelta(hours=1) * i for i in range(5) ])) store.append("df_tz", df, data_columns=["A"]) result = store["df_tz"] _compare_with_tz(result, df) tm.assert_frame_equal(result, df) # select with tz aware expected = df[df.A >= df.A[3]] result = store.select("df_tz", where="A>=df.A[3]") _compare_with_tz(result, expected) # ensure we include dates in DST and STD time here. _maybe_remove(store, "df_tz") df = DataFrame( dict( A=Timestamp("20130102", tz=gettz("US/Eastern")), B=Timestamp("20130603", tz=gettz("US/Eastern")), ), index=range(5), ) store.append("df_tz", df) result = store["df_tz"] _compare_with_tz(result, df) tm.assert_frame_equal(result, df) df = DataFrame( dict( A=Timestamp("20130102", tz=gettz("US/Eastern")), B=Timestamp("20130102", tz=gettz("EET")), ), index=range(5), ) with pytest.raises(ValueError): store.append("df_tz", df) # this is ok _maybe_remove(store, "df_tz") store.append("df_tz", df, data_columns=["A", "B"]) result = store["df_tz"] _compare_with_tz(result, df) tm.assert_frame_equal(result, df) # can't append with diff timezone df = DataFrame( dict( A=Timestamp("20130102", tz=gettz("US/Eastern")), B=Timestamp("20130102", tz=gettz("CET")), ), index=range(5), ) with pytest.raises(ValueError): store.append("df_tz", df) # as index with ensure_clean_store(setup_path) as store: # GH 4098 example df = DataFrame( dict(A=Series( range(3), index=date_range( "2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern")), ))) _maybe_remove(store, "df") store.put("df", df) result = store.select("df") tm.assert_frame_equal(result, df) _maybe_remove(store, "df") store.append("df", df) result = store.select("df") tm.assert_frame_equal(result, df)
def test_append_with_timezones_pytz(setup_path): from datetime import timedelta # as columns with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df_tz") df = DataFrame( dict(A=[ Timestamp("20130102 2:00:00", tz="US/Eastern") + timedelta(hours=1) * i for i in range(5) ])) store.append("df_tz", df, data_columns=["A"]) result = store["df_tz"] _compare_with_tz(result, df) tm.assert_frame_equal(result, df) # select with tz aware _compare_with_tz(store.select("df_tz", where="A>=df.A[3]"), df[df.A >= df.A[3]]) _maybe_remove(store, "df_tz") # ensure we include dates in DST and STD time here. df = DataFrame( dict( A=Timestamp("20130102", tz="US/Eastern"), B=Timestamp("20130603", tz="US/Eastern"), ), index=range(5), ) store.append("df_tz", df) result = store["df_tz"] _compare_with_tz(result, df) tm.assert_frame_equal(result, df) df = DataFrame( dict( A=Timestamp("20130102", tz="US/Eastern"), B=Timestamp("20130102", tz="EET"), ), index=range(5), ) with pytest.raises(ValueError): store.append("df_tz", df) # this is ok _maybe_remove(store, "df_tz") store.append("df_tz", df, data_columns=["A", "B"]) result = store["df_tz"] _compare_with_tz(result, df) tm.assert_frame_equal(result, df) # can't append with diff timezone df = DataFrame( dict( A=Timestamp("20130102", tz="US/Eastern"), B=Timestamp("20130102", tz="CET"), ), index=range(5), ) with pytest.raises(ValueError): store.append("df_tz", df) # as index with ensure_clean_store(setup_path) as store: # GH 4098 example df = DataFrame( dict(A=Series( range(3), index=date_range( "2000-1-1", periods=3, freq="H", tz="US/Eastern"), ))) _maybe_remove(store, "df") store.put("df", df) result = store.select("df") tm.assert_frame_equal(result, df) _maybe_remove(store, "df") store.append("df", df) result = store.select("df") tm.assert_frame_equal(result, df)
def test_select_dtypes(setup_path): with ensure_clean_store(setup_path) as store: # with a Timestamp data column (GH #2637) df = DataFrame({ "ts": bdate_range("2012-01-01", periods=300), "A": np.random.randn(300), }) _maybe_remove(store, "df") store.append("df", df, data_columns=["ts", "A"]) result = store.select("df", "ts>=Timestamp('2012-02-01')") expected = df[df.ts >= Timestamp("2012-02-01")] tm.assert_frame_equal(expected, result) # bool columns (GH #2849) df = DataFrame(np.random.randn(5, 2), columns=["A", "B"]) df["object"] = "foo" df.loc[4:5, "object"] = "bar" df["boolv"] = df["A"] > 0 _maybe_remove(store, "df") store.append("df", df, data_columns=True) expected = df[df.boolv == True].reindex(columns=["A", "boolv"]) # noqa for v in [True, "true", 1]: result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) tm.assert_frame_equal(expected, result) expected = df[df.boolv == False].reindex(columns=["A", "boolv"]) # noqa for v in [False, "false", 0]: result = store.select("df", f"boolv == {v}", columns=["A", "boolv"]) tm.assert_frame_equal(expected, result) # integer index df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)}) _maybe_remove(store, "df_int") store.append("df_int", df) result = store.select("df_int", "index<10 and columns=['A']") expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) tm.assert_frame_equal(expected, result) # float index df = DataFrame({ "A": np.random.rand(20), "B": np.random.rand(20), "index": np.arange(20, dtype="f8"), }) _maybe_remove(store, "df_float") store.append("df_float", df) result = store.select("df_float", "index<10.0 and columns=['A']") expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) tm.assert_frame_equal(expected, result) with ensure_clean_store(setup_path) as store: # floats w/o NaN df = DataFrame({ "cols": range(11), "values": range(11) }, dtype="float64") df["cols"] = (df["cols"] + 10).apply(str) store.append("df1", df, data_columns=True) result = store.select("df1", where="values>2.0") expected = df[df["values"] > 2.0] tm.assert_frame_equal(expected, result) # floats with NaN df.iloc[0] = np.nan expected = df[df["values"] > 2.0] store.append("df2", df, data_columns=True, index=False) result = store.select("df2", where="values>2.0") tm.assert_frame_equal(expected, result) # https://github.com/PyTables/PyTables/issues/282 # bug in selection when 0th row has a np.nan and an index # store.append('df3',df,data_columns=True) # result = store.select( # 'df3', where='values>2.0') # tm.assert_frame_equal(expected, result) # not in first position float with NaN ok too df = DataFrame({ "cols": range(11), "values": range(11) }, dtype="float64") df["cols"] = (df["cols"] + 10).apply(str) df.iloc[1] = np.nan expected = df[df["values"] > 2.0] store.append("df4", df, data_columns=True) result = store.select("df4", where="values>2.0") tm.assert_frame_equal(expected, result) # test selection with comparison against numpy scalar # GH 11283 with ensure_clean_store(setup_path) as store: df = tm.makeDataFrame() expected = df[df["A"] > 0] store.append("df", df, data_columns=True) np_zero = np.float64(0) # noqa:F841 result = store.select("df", where=["A>np_zero"]) tm.assert_frame_equal(expected, result)
def test_coordinates(setup_path): df = tm.makeTimeDataFrame() with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") store.append("df", df) # all c = store.select_as_coordinates("df") assert (c.values == np.arange(len(df.index))).all() # get coordinates back & test vs frame _maybe_remove(store, "df") df = DataFrame({"A": range(5), "B": range(5)}) store.append("df", df) c = store.select_as_coordinates("df", ["index<3"]) assert (c.values == np.arange(3)).all() result = store.select("df", where=c) expected = df.loc[0:2, :] tm.assert_frame_equal(result, expected) c = store.select_as_coordinates("df", ["index>=3", "index<=4"]) assert (c.values == np.arange(2) + 3).all() result = store.select("df", where=c) expected = df.loc[3:4, :] tm.assert_frame_equal(result, expected) assert isinstance(c, Index) # multiple tables _maybe_remove(store, "df1") _maybe_remove(store, "df2") df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format) store.append("df1", df1, data_columns=["A", "B"]) store.append("df2", df2) c = store.select_as_coordinates("df1", ["A>0", "B>0"]) df1_result = store.select("df1", c) df2_result = store.select("df2", c) result = concat([df1_result, df2_result], axis=1) expected = concat([df1, df2], axis=1) expected = expected[(expected.A > 0) & (expected.B > 0)] tm.assert_frame_equal(result, expected, check_freq=False) # FIXME: 2021-01-18 on some (mostly windows) builds we get freq=None # but expect freq="18B" # pass array/mask as the coordinates with ensure_clean_store(setup_path) as store: df = DataFrame(np.random.randn(1000, 2), index=date_range("20000101", periods=1000)) store.append("df", df) c = store.select_column("df", "index") where = c[DatetimeIndex(c).month == 5].index expected = df.iloc[where] # locations result = store.select("df", where=where) tm.assert_frame_equal(result, expected) # boolean result = store.select("df", where=where) tm.assert_frame_equal(result, expected) # invalid msg = ("where must be passed as a string, PyTablesExpr, " "or list-like of PyTablesExpr") with pytest.raises(TypeError, match=msg): store.select("df", where=np.arange(len(df), dtype="float64")) with pytest.raises(TypeError, match=msg): store.select("df", where=np.arange(len(df) + 1)) with pytest.raises(TypeError, match=msg): store.select("df", where=np.arange(len(df)), start=5) with pytest.raises(TypeError, match=msg): store.select("df", where=np.arange(len(df)), start=5, stop=10) # selection with filter selection = date_range("20000101", periods=500) result = store.select("df", where="index in selection") expected = df[df.index.isin(selection)] tm.assert_frame_equal(result, expected) # list df = DataFrame(np.random.randn(10, 2)) store.append("df2", df) result = store.select("df2", where=[0, 3, 5]) expected = df.iloc[[0, 3, 5]] tm.assert_frame_equal(result, expected) # boolean where = [True] * 10 where[-2] = False result = store.select("df2", where=where) expected = df.loc[where] tm.assert_frame_equal(result, expected) # start/stop result = store.select("df2", start=5, stop=10) expected = df[5:10] tm.assert_frame_equal(result, expected)
def test_select_iterator_complete_8014(setup_path): # GH 8014 # using iterator and where clause chunksize = 1e4 # no iterator with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[-1] # select w/o iteration and no where clause works result = store.select("df") tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, single term, begin # of range, works where = f"index >= '{beg_dt}'" result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, single term, end # of range, works where = f"index <= '{end_dt}'" result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, inclusive range, # works where = f"index >= '{beg_dt}' & index <= '{end_dt}'" result = store.select("df", where=where) tm.assert_frame_equal(expected, result) # with iterator, full range with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") store.append("df", expected) beg_dt = expected.index[0] end_dt = expected.index[-1] # select w/iterator and no where clause works results = list(store.select("df", chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, begin of range where = f"index >= '{beg_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, end of range where = f"index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, inclusive range where = f"index >= '{beg_dt}' & index <= '{end_dt}'" results = list(store.select("df", where=where, chunksize=chunksize)) result = concat(results) tm.assert_frame_equal(expected, result)