Exemple #1
0
        def check(format, index):
            """Round-trip a random frame whose index is built by ``index(n)``.

            ``format`` is forwarded to ``store.put``; ``index`` is called with
            the number of rows to produce the index under test.
            """
            frame = DataFrame(np.random.randn(10, 2), columns=list("AB"))
            frame.index = index(len(frame))

            _maybe_remove(store, "df")
            store.put("df", frame, format=format)
            stored = store["df"]
            tm.assert_frame_equal(frame, stored)
def test_create_table_index(setup_path):
    """Check which columns ``append`` indexes and that fixed stores reject indexing."""
    with ensure_clean_store(setup_path) as store, catch_warnings(record=True):

        def indexed(key, column):
            # resolve the named column on the table behind ``key``
            table_columns = store.get_storer(key).table.cols
            return getattr(table_columns, column).is_indexed

        frame = tm.makeTimeDataFrame()
        frame["string"] = "foo"
        frame["string2"] = "bar"

        # data columns only: the index and both data columns end up indexed
        store.append("f", frame, data_columns=["string", "string2"])
        for column in ("index", "string", "string2"):
            assert indexed("f", column) is True

        # explicit index=[...]: only the listed column is indexed
        store.append(
            "f2", frame, index=["string"], data_columns=["string", "string2"]
        )
        outcome = {"index": False, "string": True, "string2": False}
        for column, flag in outcome.items():
            assert indexed("f2", column) is flag

        # indexing a node written with put() (non-table) must fail
        _maybe_remove(store, "f2")
        store.put("f2", frame)
        with pytest.raises(
            TypeError, match="cannot create table index on a Fixed format store"
        ):
            store.create_table_index("f2")
Exemple #3
0
def test_put(setup_path):
    """Exercise ``put`` on several keys, rejected ``append=True`` calls and overwrite."""
    with ensure_clean_store(setup_path) as store:
        series = tm.makeTimeSeries()
        frame = tm.makeTimeDataFrame()
        head, tail = frame[:10], frame[10:]

        store["a"] = series
        for key in ("b", "foo/bar/bah", "foo", "/foo"):
            store[key] = head
        store.put("c", head, format="table")

        append_error = "Can only append to Tables"

        # "b" was not written as a table, so appending is refused
        with pytest.raises(ValueError, match=append_error):
            store.put("b", tail, append=True)

        # the node is absent at this point; covers the case where
        # _is_table_type reports False for a missing node
        _maybe_remove(store, "f")
        with pytest.raises(ValueError, match=append_error):
            store.put("f", tail, append=True)

        # put() with append=True is refused for a table too (use append)
        with pytest.raises(ValueError, match=append_error):
            store.put("c", tail, append=True)

        # append=False overwrites the existing table
        store.put("c", head, format="table", append=False)
        tm.assert_frame_equal(head, store["c"])
Exemple #4
0
def test_put_mixed_type(setup_path):
    """Round-trip a frame with object, bool, int and datetime-like columns via ``put``."""
    frame = tm.makeTimeDataFrame()

    # (column name, value) pairs, inserted in this exact order
    additions = [
        ("obj1", "foo"),
        ("obj2", "bar"),
        ("bool1", frame["A"] > 0),
        ("bool2", frame["B"] > 0),
        ("bool3", True),
        ("int1", 1),
        ("int2", 2),
        ("timestamp1", Timestamp("20010102")),
        ("timestamp2", Timestamp("20010103")),
        ("datetime1", datetime.datetime(2001, 1, 2, 0, 0)),
        ("datetime2", datetime.datetime(2001, 1, 3, 0, 0)),
    ]
    for name, value in additions:
        frame[name] = value

    frame.loc[frame.index[3:6], ["obj1"]] = np.nan
    frame = frame._consolidate()._convert(datetime=True)

    with ensure_clean_store(setup_path) as store:
        _maybe_remove(store, "df")

        # writing the mixed frame may emit a PerformanceWarning; silence it
        with catch_warnings(record=True):
            simplefilter("ignore", pd.errors.PerformanceWarning)
            store.put("df", frame)

        stored = store.get("df")
        tm.assert_frame_equal(stored, frame)
Exemple #5
0
def test_timezones_fixed_format_frame_non_empty(setup_path):
    """Round-trip tz-aware data through item assignment, as index and as columns."""
    with ensure_clean_store(setup_path) as store:
        eastern = date_range("1/1/2000", "1/30/2000", tz="US/Eastern")
        # freq does not survive the round trip, so drop it up front
        eastern = eastern._with_freq(None)

        # tz-aware values used as the index
        indexed = DataFrame(np.random.randn(len(eastern), 4), index=eastern)
        store["df"] = indexed
        tm.assert_frame_equal(store["df"], indexed)

        # tz-aware values used as column data (GH11411)
        _maybe_remove(store, "df")
        columns_data = {
            "A": eastern,
            "B": eastern.tz_convert("UTC").tz_localize(None),
            "C": eastern.tz_convert("CET"),
            "D": range(len(eastern)),
        }
        as_data = DataFrame(columns_data, index=eastern)
        store["df"] = as_data
        tm.assert_frame_equal(store["df"], as_data)
Exemple #6
0
def test_append_frame_column_oriented(setup_path):
    """Append a frame column-wise (``axes=["columns"]``) and query it back."""
    with ensure_clean_store(setup_path) as store:
        # NOTE: the local must stay named ``df`` -- query strings below refer to it
        df = tm.makeTimeDataFrame()
        # freq does not survive the round trip
        df.index = df.index._with_freq(None)

        # write the first two columns, then append the remaining ones
        _maybe_remove(store, "df1")
        store.append("df1", df.iloc[:, :2], axes=["columns"])
        store.append("df1", df.iloc[:, 2:])
        tm.assert_frame_equal(store["df1"], df)

        # select a single column
        only_a = store.select("df1", "columns=A")
        tm.assert_frame_equal(df.reindex(columns=["A"]), only_a)

        # select on the non-indexable axis as well
        a_head = store.select("df1", ("columns=A", "index=df.index[0:4]"))
        tm.assert_frame_equal(
            df.reindex(columns=["A"], index=df.index[0:4]), a_head
        )

        # a comparison filter on the index is not supported here
        unsupported = re.escape(
            "passing a filterable condition to a non-table indexer "
            "[Filter: Not Initialized]"
        )
        with pytest.raises(TypeError, match=unsupported):
            store.select("df1", "columns=A and index>df.index[4]")
def test_versioning(setup_path):
    """Check the ``pandas_version`` node attribute and the failure when it is wiped."""
    with ensure_clean_store(setup_path) as store:
        store["a"] = tm.makeTimeSeries()
        store["b"] = tm.makeDataFrame()

        frame = tm.makeTimeDataFrame()
        _maybe_remove(store, "df1")
        store.append("df1", frame[:10])
        store.append("df1", frame[10:])

        # every node written above carries the same version stamp
        for key in ("a", "b", "df1"):
            node = getattr(store.root, key)
            assert node._v_attrs.pandas_version == "0.15.2"

        # write another table, then blank out its version attribute
        _maybe_remove(store, "df2")
        store.append("df2", frame)
        store.get_node("df2")._v_attrs.pandas_version = None

        # selecting from an appendable table without version info errors out
        expected_message = "'NoneType' object has no attribute 'startswith'"
        with pytest.raises(Exception, match=expected_message):
            store.select("df2")
Exemple #8
0
def test_select_iterator_many_empty_frames(setup_path):
    """GH 8014: chunked selects with a where clause must not yield empty frames."""
    chunksize = 10_000

    with ensure_clean_store(setup_path) as store:
        frame = tm.makeTimeDataFrame(100000, "S")
        _maybe_remove(store, "df")
        store.append("df", frame)

        # the requested range is limited to the first chunk
        beg_dt = frame.index[0]
        end_dt = frame.index[chunksize - 1]

        # lower bound only
        where = f"index >= '{beg_dt}'"
        chunks = list(store.select("df", where=where, chunksize=chunksize))
        tm.assert_frame_equal(frame[frame.index >= beg_dt], concat(chunks))

        # upper bound only: everything fits in a single chunk
        where = f"index <= '{end_dt}'"
        chunks = list(store.select("df", where=where, chunksize=chunksize))
        assert len(chunks) == 1
        tm.assert_frame_equal(frame[frame.index <= end_dt], concat(chunks))

        # both bounds, inclusive: again exactly one chunk is expected
        where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
        chunks = list(store.select("df", where=where, chunksize=chunksize))
        assert len(chunks) == 1
        in_range = (frame.index >= beg_dt) & (frame.index <= end_dt)
        tm.assert_frame_equal(frame[in_range], concat(chunks))

        # a where clause that matches nothing should produce no chunks at
        # all, in line with iterating over an empty list
        where = f"index <= '{beg_dt}' & index >= '{end_dt}'"
        chunks = list(store.select("df", where=where, chunksize=chunksize))
        assert len(chunks) == 0
Exemple #9
0
def test_store_index_types(setup_path, format, index):
    """GH5386: a frame with the given index type survives a ``put`` round trip."""
    with ensure_clean_store(setup_path) as store:
        frame = DataFrame(np.random.randn(10, 2), columns=list("AB"))
        # ``index`` is a factory called with the number of rows
        frame.index = index(len(frame))

        _maybe_remove(store, "df")
        store.put("df", frame, format=format)
        stored = store["df"]
        tm.assert_frame_equal(frame, stored)
def test_retain_index_attributes(setup_path):
    """GH 3499: index attributes (freq/tz/name) are kept when a table is re-read."""
    hourly_index = date_range("2000-1-1", periods=3, freq="H")
    hourly = DataFrame({"A": Series(range(3), index=hourly_index)})

    with ensure_clean_store(setup_path) as store:
        _maybe_remove(store, "data")
        store.put("data", hourly, format="table")

        stored = store.get("data")
        tm.assert_frame_equal(hourly, stored)

        # compare each attribute on both axes of the original and stored frame
        for attr in ("freq", "tz", "name"):
            for axis in ("index", "columns"):
                original = getattr(getattr(hourly, axis), attr, None)
                restored = getattr(getattr(stored, axis), attr, None)
                assert original == restored

        # appending data with a different frequency
        with catch_warnings(record=True):
            daily_index = date_range("2002-1-1", periods=3, freq="D")
            daily = DataFrame({"A": Series(range(3), index=daily_index)})
            store.append("data", daily)

        # the stored freq info is cleared after the mismatching append
        assert store.get_storer("data").info["index"]["freq"] is None

        # starting from an index without a regular frequency is fine
        _maybe_remove(store, "df2")
        irregular_stamps = [
            Timestamp("20010101"),
            Timestamp("20010102"),
            Timestamp("20020101"),
        ]
        irregular = DataFrame({"A": Series(range(3), index=irregular_stamps)})
        store.append("df2", irregular)

        regular_index = date_range("2002-1-1", periods=3, freq="D")
        regular = DataFrame({"A": Series(range(3), index=regular_index)})
        store.append("df2", regular)
Exemple #11
0
def test_encoding(setup_path):
    """Append string data with ``encoding="ascii"`` and select it back."""
    with ensure_clean_store(setup_path) as store:
        frame = DataFrame({"A": "foo", "B": "bar"}, index=range(5))
        # punch one missing value into each string column
        frame.loc[2, "A"] = np.nan
        frame.loc[3, "B"] = np.nan

        _maybe_remove(store, "df")
        store.append("df", frame, encoding="ascii")
        tm.assert_frame_equal(store["df"], frame)

        # column selection through an explicitly encoded Term
        selected = store.select("df", Term("columns=A", encoding="ascii"))
        tm.assert_frame_equal(selected, frame.reindex(columns=["A"]))
Exemple #12
0
def test_select_iterator_non_complete_8014(setup_path):
    """GH 8014: chunked selects with a where clause over a non-complete range.

    The table holds 100064 rows, which is not a multiple of the chunk size,
    and the where clauses cut off the first and/or last row.  A final
    scenario checks that a where clause matching nothing yields no chunks.
    """
    # an integer, matching the ``chunksize`` parameter of ``HDFStore.select``
    # and the sibling GH 8014 test (previously the float ``1e4``)
    chunksize = 10_000

    # with iterator, non complete range
    with ensure_clean_store(setup_path) as store:

        expected = tm.makeTimeDataFrame(100064, "S")
        _maybe_remove(store, "df")
        store.append("df", expected)

        beg_dt = expected.index[1]
        end_dt = expected.index[-2]

        # select w/iterator and where clause, single term, begin of range
        where = f"index >= '{beg_dt}'"
        results = list(store.select("df", where=where, chunksize=chunksize))
        result = concat(results)
        rexpected = expected[expected.index >= beg_dt]
        tm.assert_frame_equal(rexpected, result)

        # select w/iterator and where clause, single term, end of range
        where = f"index <= '{end_dt}'"
        results = list(store.select("df", where=where, chunksize=chunksize))
        result = concat(results)
        rexpected = expected[expected.index <= end_dt]
        tm.assert_frame_equal(rexpected, result)

        # select w/iterator and where clause, inclusive range
        where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
        results = list(store.select("df", where=where, chunksize=chunksize))
        result = concat(results)
        rexpected = expected[(expected.index >= beg_dt)
                             & (expected.index <= end_dt)]
        tm.assert_frame_equal(rexpected, result)

    # with iterator, empty where
    with ensure_clean_store(setup_path) as store:

        expected = tm.makeTimeDataFrame(100064, "S")
        _maybe_remove(store, "df")
        store.append("df", expected)

        end_dt = expected.index[-1]

        # select w/iterator and a where clause lying strictly after the last
        # row: nothing matches, so no chunks are produced
        where = f"index > '{end_dt}'"
        results = list(store.select("df", where=where, chunksize=chunksize))
        assert 0 == len(results)
Exemple #13
0
def test_select_with_many_inputs(setup_path):
    """Select with small and large ``isin``-style selectors on data columns."""
    with ensure_clean_store(setup_path) as store:
        frame = DataFrame(
            {
                "ts": bdate_range("2012-01-01", periods=300),
                "A": np.random.randn(300),
                "B": range(300),
                "users": ["a"] * 50
                + ["b"] * 50
                + ["c"] * 100
                + [f"a{i:03d}" for i in range(100)],
            }
        )
        _maybe_remove(store, "df")
        store.append("df", frame, data_columns=["ts", "A", "B", "users"])

        cutoff = Timestamp("2012-02-01")

        # plain comparison on a data column
        result = store.select("df", "ts>=Timestamp('2012-02-01')")
        tm.assert_frame_equal(frame[frame.ts >= cutoff], result)

        # short inline list of values
        result = store.select(
            "df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']"
        )
        mask = (frame.ts >= cutoff) & frame.users.isin(["a", "b", "c"])
        tm.assert_frame_equal(frame[mask], result)

        # long list of values; the query refers to the local name ``selector``
        selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)]
        result = store.select(
            "df", "ts>=Timestamp('2012-02-01') and users=selector"
        )
        mask = (frame.ts >= cutoff) & frame.users.isin(selector)
        tm.assert_frame_equal(frame[mask], result)

        # a range object as selector
        selector = range(100, 200)
        result = store.select("df", "B=selector")
        tm.assert_frame_equal(frame[frame.B.isin(selector)], result)
        assert len(result) == 100

        # an Index of timestamps as selector
        selector = Index(frame.ts[0:100].values)
        result = store.select("df", "ts=selector")
        tm.assert_frame_equal(frame[frame.ts.isin(selector.values)], result)
        assert len(result) == 100
Exemple #14
0
def test_append_with_timedelta(setup_path):
    """GH 3577: timedelta columns can be appended, queried and stored fixed."""
    start = Timestamp("20130101")
    frame = DataFrame(
        {
            "A": Timestamp("20130101"),
            "B": [start + timedelta(days=i, seconds=10) for i in range(10)],
        }
    )
    frame["C"] = frame["A"] - frame["B"]
    frame.loc[3:5, "C"] = np.nan

    with ensure_clean_store(setup_path) as store:
        # table format
        _maybe_remove(store, "df")
        store.append("df", frame, data_columns=True)
        tm.assert_frame_equal(store.select("df"), frame)

        # numeric bound on the timedelta column
        tm.assert_frame_equal(store.select("df", where="C<100000"), frame)

        # Timedelta object and its string spelling give the same rows
        selected = store.select("df", where="C<pd.Timedelta('-3D')")
        tm.assert_frame_equal(selected, frame.iloc[3:])

        selected = store.select("df", "C<'-3D'")
        tm.assert_frame_equal(selected, frame.iloc[3:])

        # NaT rows are not really handled by the query, so they are
        # stripped from the result by hand in the next two checks
        selected = store.select("df", "C<'-500000s'")
        selected = selected.dropna(subset=["C"])
        tm.assert_frame_equal(selected, frame.iloc[6:])

        selected = store.select("df", "C<'-3.5D'")
        selected = selected.iloc[1:]
        tm.assert_frame_equal(selected, frame.iloc[4:])

        # fixed format
        _maybe_remove(store, "df2")
        store.put("df2", frame)
        tm.assert_frame_equal(store.select("df2"), frame)
def test_dst_transitions(setup_path):
    """Appending data that spans a DST transition must round-trip."""
    with ensure_clean_store(setup_path) as store:
        times = pd.date_range(
            "2013-10-26 23:00",
            "2013-10-27 01:00",
            tz="Europe/London",
            freq="H",
            ambiguous="infer",
        )
        shifted = times + pd.Timedelta("10min")

        # run once on the hour marks and once offset by ten minutes
        for stamps in (times, shifted):
            _maybe_remove(store, "df")
            frame = DataFrame({"A": range(len(stamps)), "B": stamps}, index=stamps)
            store.append("df", frame)
            tm.assert_frame_equal(store.select("df"), frame)
def test_append_with_timezones_as_index(setup_path, gettz):
    """GH#4098: a tz-aware index round-trips through both ``put`` and ``append``."""
    tz_index = date_range("2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern"))
    # freq does not survive the round trip
    tz_index = tz_index._with_freq(None)

    frame = DataFrame({"A": Series(range(3), index=tz_index)})

    with ensure_clean_store(setup_path) as store:
        # same scenario for both writers: put first, then append
        for write in (store.put, store.append):
            _maybe_remove(store, "df")
            write("df", frame)
            tm.assert_frame_equal(store.select("df"), frame)
Exemple #17
0
def test_remove(setup_path):
    """Remove nodes by key, by path and via ``del``; a missing key raises KeyError."""
    with ensure_clean_store(setup_path) as store:
        series = tm.makeTimeSeries()
        frame = tm.makeDataFrame()

        store["a"] = series
        store["b"] = frame

        # dropping one key leaves the other intact
        _maybe_remove(store, "a")
        assert len(store) == 1
        tm.assert_frame_equal(frame, store["b"])

        _maybe_remove(store, "b")
        assert len(store) == 0

        # removing an unknown key is an error
        missing_key_error = "'No object named a_nonexistent_store in the file'"
        with pytest.raises(KeyError, match=missing_key_error):
            store.remove("a_nonexistent_store")

        # nested paths: "foo" alone matches nothing, "b/foo" removes the leaf
        store["a"] = series
        store["b/foo"] = frame
        _maybe_remove(store, "foo")
        _maybe_remove(store, "b/foo")
        assert len(store) == 1

        # removing the parent group takes the nested node with it
        store["a"] = series
        store["b/foo"] = frame
        _maybe_remove(store, "b")
        assert len(store) == 1

        # item deletion
        store["a"] = series
        store["b"] = frame
        del store["a"]
        del store["b"]
        assert len(store) == 0
Exemple #18
0
def test_append_some_nans(setup_path):
    """Appending in two halves works when some or all values of a column are NaN."""
    with ensure_clean_store(setup_path) as store:
        frame = DataFrame(
            {
                "A": Series(np.random.randn(20)).astype("int32"),
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
                "D": Timestamp("20010101"),
                "E": datetime.datetime(2001, 1, 2, 0, 0),
            },
            index=np.arange(20),
        )

        # partially missing values across several dtypes
        _maybe_remove(store, "df1")
        frame.loc[0:15, ["A1", "B", "D", "E"]] = np.nan
        store.append("df1", frame[:10])
        store.append("df1", frame[10:])
        tm.assert_frame_equal(store["df1"], frame)

        # one fully-NaN column at a time: first float column, second float
        # column, then the datetime column
        for key, column in (("df1", "A1"), ("df2", "A2"), ("df3", "E")):
            variant = frame.copy()
            variant.loc[:, column] = np.nan
            _maybe_remove(store, key)
            store.append(key, variant[:10])
            store.append(key, variant[10:])
            tm.assert_frame_equal(store[key], variant)
Exemple #19
0
def test_select(setup_path):
    """Select a column subset, with and without a filter on a data column."""
    with ensure_clean_store(setup_path) as store, catch_warnings(record=True):
        frame = tm.makeTimeDataFrame()

        # column subset through the columns= keyword
        _maybe_remove(store, "df")
        store.append("df", frame)
        result = store.select("df", columns=["A", "B"])
        tm.assert_frame_equal(frame.reindex(columns=["A", "B"]), result)

        # the same subset spelled as a where expression
        result = store.select("df", ["columns=['A', 'B']"])
        tm.assert_frame_equal(frame.reindex(columns=["A", "B"]), result)

        # filter on "A" combined with a column subset, for:
        #   - a single data column
        #   - every column as a data column
        #   - a single data column, selecting different columns
        cases = (
            (["A"], ["A", "B"]),
            (True, ["A", "B"]),
            (["A"], ["C", "D"]),
        )
        for data_columns, wanted in cases:
            _maybe_remove(store, "df")
            store.append("df", frame, data_columns=data_columns)
            result = store.select("df", ["A > 0"], columns=list(wanted))
            expected = frame[frame.A > 0].reindex(columns=list(wanted))
            tm.assert_frame_equal(expected, result)
Exemple #20
0
def test_api_default_format(setup_path):
    """Check that the ``io.hdf.default_format`` option drives ``put``/``append``/``to_hdf``.

    With the option set to ``"fixed"`` a plain ``put``/``to_hdf`` must not
    create a table and appending must fail; with ``"table"`` both writing
    and appending must create tables.

    The option is set through ``pd.option_context`` so the previous value is
    restored even when an assertion fails, instead of leaking the modified
    global option into later tests.
    """
    msg = "Can only append to Tables"

    # default_format option, HDFStore API
    with ensure_clean_store(setup_path) as store:
        df = tm.makeDataFrame()

        with pd.option_context("io.hdf.default_format", "fixed"):
            _maybe_remove(store, "df")
            store.put("df", df)
            assert not store.get_storer("df").is_table

            with pytest.raises(ValueError, match=msg):
                store.append("df2", df)

        with pd.option_context("io.hdf.default_format", "table"):
            _maybe_remove(store, "df")
            store.put("df", df)
            assert store.get_storer("df").is_table
            _maybe_remove(store, "df2")
            store.append("df2", df)
            # check the node that was just appended (the original re-checked
            # "df" here, which left the append itself unverified)
            assert store.get_storer("df2").is_table

    # default_format option, DataFrame.to_hdf API
    with ensure_clean_path(setup_path) as path:

        df = tm.makeDataFrame()

        with pd.option_context("io.hdf.default_format", "fixed"):
            df.to_hdf(path, "df")
            with HDFStore(path) as store:
                assert not store.get_storer("df").is_table
            with pytest.raises(ValueError, match=msg):
                df.to_hdf(path, "df2", append=True)

        with pd.option_context("io.hdf.default_format", "table"):
            df.to_hdf(path, "df3")
            with HDFStore(path) as store:
                assert store.get_storer("df3").is_table
            df.to_hdf(path, "df4", append=True)
            with HDFStore(path) as store:
                assert store.get_storer("df4").is_table
Exemple #21
0
def test_select_iterator(setup_path):
    """Iterate selects via ``iterator=True``/``chunksize`` on stores and ``read_hdf``."""
    # single table through the store API
    with ensure_clean_store(setup_path) as store:
        frame = tm.makeTimeDataFrame(500)
        _maybe_remove(store, "df")
        store.append("df", frame)

        whole = store.select("df")

        chunks = list(store.select("df", iterator=True))
        tm.assert_frame_equal(whole, concat(chunks))

        # 500 rows in chunks of 100 -> exactly five pieces
        chunks = list(store.select("df", chunksize=100))
        assert len(chunks) == 5
        tm.assert_frame_equal(whole, concat(chunks))

        # a chunk size that does not divide the row count evenly
        chunks = list(store.select("df", chunksize=150))
        tm.assert_frame_equal(concat(chunks), whole)

    # a file written without format="table" cannot be iterated
    with ensure_clean_path(setup_path) as path:
        frame = tm.makeTimeDataFrame(500)
        frame.to_hdf(path, "df_non_table")

        not_a_table = "can only use an iterator or chunksize on a table"
        with pytest.raises(TypeError, match=not_a_table):
            read_hdf(path, "df_non_table", chunksize=100)

        with pytest.raises(TypeError, match=not_a_table):
            read_hdf(path, "df_non_table", iterator=True)

    # table format read back through read_hdf
    with ensure_clean_path(setup_path) as path:
        frame = tm.makeTimeDataFrame(500)
        frame.to_hdf(path, "df", format="table")

        chunks = list(read_hdf(path, "df", chunksize=100))
        combined = concat(chunks)

        assert len(chunks) == 5
        tm.assert_frame_equal(combined, frame)
        tm.assert_frame_equal(combined, read_hdf(path, "df"))

    # multiple tables at once
    with ensure_clean_store(setup_path) as store:
        left = tm.makeTimeDataFrame(500)
        store.append("df1", left, data_columns=True)
        right = tm.makeTimeDataFrame(500).rename(columns="{}_2".format)
        right["foo"] = "bar"
        store.append("df2", right)

        # built as in the original scenario; not referenced by the checks below
        df = concat([left, right], axis=1)

        # full selection, eager versus chunked
        keys = ["df1", "df2"]
        whole = store.select_as_multiple(keys, selector="df1")
        chunks = list(
            store.select_as_multiple(keys, selector="df1", chunksize=150)
        )
        tm.assert_frame_equal(whole, concat(chunks))
Exemple #22
0
def test_append_with_data_columns(setup_path):
    """Exercise ``HDFStore.append`` with ``data_columns``.

    Scenarios, each in its own clean store: index creation and querying on
    a data column, ``min_itemsize`` combined with data columns, queries over
    several data columns (including one that matches nothing), and two
    documentation-style examples.

    Note that ``df``, ``df_new`` and the ``check_col`` helper are created in
    the first ``with`` block and reused by the later ones.
    """

    with ensure_clean_store(setup_path) as store:
        df = tm.makeTimeDataFrame()
        df.iloc[0, df.columns.get_loc("B")] = 1.0
        _maybe_remove(store, "df")
        # only the first append names the data column; the second relies on
        # the table that already exists
        store.append("df", df[:2], data_columns=["B"])
        store.append("df", df[2:])
        tm.assert_frame_equal(store["df"], df)

        # check that we have indices created
        assert store._handle.root.df.table.cols.index.is_indexed is True
        assert store._handle.root.df.table.cols.B.is_indexed is True

        # data column searching
        result = store.select("df", "B>0")
        expected = df[df.B > 0]
        tm.assert_frame_equal(result, expected)

        # data column searching (with an indexable and a data_columns)
        # the query string refers to the local variable ``df`` by name
        result = store.select("df", "B>0 and index>df.index[3]")
        df_new = df.reindex(index=df.index[4:])
        expected = df_new[df_new.B > 0]
        tm.assert_frame_equal(result, expected)

        # data column selection with a string data_column
        df_new = df.copy()
        df_new["string"] = "foo"
        df_new.loc[df_new.index[1:4], "string"] = np.nan
        df_new.loc[df_new.index[5:6], "string"] = "bar"
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"])
        result = store.select("df", "string='foo'")
        expected = df_new[df_new.string == "foo"]
        tm.assert_frame_equal(result, expected)

        # using min_itemsize and a data column
        # ``check_col`` reads ``store`` when it is called, so in the later
        # ``with`` blocks it inspects whichever store is bound at that point
        def check_col(key, name, size):
            assert (
                getattr(store.get_storer(key).table.description, name).itemsize == size
            )

    # min_itemsize given per column, as a bare int, and via the "values" key
    with ensure_clean_store(setup_path) as store:
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"], min_itemsize={"string": 30})
        check_col("df", "string", 30)
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"], min_itemsize=30)
        check_col("df", "string", 30)
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"], min_itemsize={"values": 30})
        check_col("df", "string", 30)

    # separate sizes for two data columns and for the remaining string block
    with ensure_clean_store(setup_path) as store:
        df_new["string2"] = "foobarbah"
        df_new["string_block1"] = "foobarbah1"
        df_new["string_block2"] = "foobarbah2"
        _maybe_remove(store, "df")
        store.append(
            "df",
            df_new,
            data_columns=["string", "string2"],
            min_itemsize={"string": 30, "string2": 40, "values": 50},
        )
        check_col("df", "string", 30)
        check_col("df", "string2", 40)
        check_col("df", "values_block_1", 50)

    with ensure_clean_store(setup_path) as store:
        # multiple data columns
        df_new = df.copy()
        df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0
        df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0
        df_new["string"] = "foo"

        sl = df_new.columns.get_loc("string")
        df_new.iloc[1:4, sl] = np.nan
        df_new.iloc[5:6, sl] = "bar"

        df_new["string2"] = "foo"
        sl = df_new.columns.get_loc("string2")
        df_new.iloc[2:5, sl] = np.nan
        df_new.iloc[7:8, sl] = "bar"
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["A", "B", "string", "string2"])
        result = store.select("df", "string='foo' and string2='foo' and A>0 and B<0")
        expected = df_new[
            (df_new.string == "foo")
            & (df_new.string2 == "foo")
            & (df_new.A > 0)
            & (df_new.B < 0)
        ]
        tm.assert_frame_equal(result, expected, check_freq=False)
        # FIXME: 2020-05-07 freq check randomly fails in the CI

        # yield an empty frame
        result = store.select("df", "string='foo' and string2='cool'")
        expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")]
        tm.assert_frame_equal(result, expected)

    with ensure_clean_store(setup_path) as store:
        # doc example
        df_dc = df.copy()
        df_dc["string"] = "foo"
        df_dc.loc[df_dc.index[4:6], "string"] = np.nan
        df_dc.loc[df_dc.index[7:9], "string"] = "bar"
        df_dc["string2"] = "cool"
        df_dc["datetime"] = Timestamp("20010102")
        df_dc = df_dc._convert(datetime=True)
        df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan

        _maybe_remove(store, "df_dc")
        store.append(
            "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"]
        )
        result = store.select("df_dc", "B>0")

        expected = df_dc[df_dc.B > 0]
        tm.assert_frame_equal(result, expected)

        # conditions passed as a list of separate expressions
        result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"])
        expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
        tm.assert_frame_equal(result, expected, check_freq=False)
        # FIXME: 2020-12-07 intermittent build failures here with freq of
        #  None instead of BDay(4)

    with ensure_clean_store(setup_path) as store:
        # doc example part 2
        # NOTE: this reseeds NumPy's global random state, which stays
        # seeded after this test returns
        np.random.seed(1234)
        index = date_range("1/1/2000", periods=8)
        df_dc = DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])
        df_dc["string"] = "foo"
        df_dc.loc[df_dc.index[4:6], "string"] = np.nan
        df_dc.loc[df_dc.index[7:9], "string"] = "bar"
        df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs()
        df_dc["string2"] = "cool"

        # on-disk operations
        store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"])

        result = store.select("df_dc", "B>0")
        expected = df_dc[df_dc.B > 0]
        tm.assert_frame_equal(result, expected)

        result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"'])
        expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
        tm.assert_frame_equal(result, expected)
Exemple #23
0
def test_append_with_strings(setup_path):
    """Check string column sizing when appending to an HDFStore table.

    The first store covers the itemsize chosen for string blocks (with and
    without ``min_itemsize``), the error raised when a later append needs a
    wider column, ``min_itemsize`` applied to an index, and string columns
    containing NaN.  The second store covers ``min_itemsize`` keys that create
    or resize data columns, and rejection of unknown ``min_itemsize`` keys.

    Parameters
    ----------
    setup_path : fixture
        Passed straight through to ``ensure_clean_store``.
    """

    with ensure_clean_store(setup_path) as store:
        with catch_warnings(record=True):

            def check_col(key, name, size):
                # the stored column description must have exactly this itemsize
                assert (
                    getattr(store.get_storer(key).table.description, name).itemsize
                    == size
                )

            # avoid truncation on elements
            df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
            store.append("df_big", df)
            tm.assert_frame_equal(store.select("df_big"), df)
            check_col("df_big", "values_block_1", 15)

            # appending smaller string ok
            df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
            store.append("df_big", df2)
            expected = concat([df, df2])
            tm.assert_frame_equal(store.select("df_big"), expected)
            check_col("df_big", "values_block_1", 15)

            # avoid truncation on elements
            df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
            store.append("df_big2", df, min_itemsize={"values": 50})
            tm.assert_frame_equal(store.select("df_big2"), df)
            check_col("df_big2", "values_block_1", 50)

            # bigger string on next append
            store.append("df_new", df)
            df_new = DataFrame(
                [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]]
            )
            msg = (
                r"Trying to store a string with len \[26\] in "
                r"\[values_block_1\] column but\n"
                r"this column has a limit of \[15\]!\n"
                "Consider using min_itemsize to preset the sizes on these "
                "columns"
            )
            with pytest.raises(ValueError, match=msg):
                store.append("df_new", df_new)

            # min_itemsize on Series index (GH 11412)
            df = tm.makeMixedDataFrame().set_index("C")
            store.append("ss", df["B"], min_itemsize={"index": 4})
            tm.assert_series_equal(store.select("ss"), df["B"])

            # same as above, with data_columns=True
            store.append("ss2", df["B"], data_columns=True, min_itemsize={"index": 4})
            tm.assert_series_equal(store.select("ss2"), df["B"])

            # min_itemsize in index without appending (GH 10381)
            store.put("ss3", df, format="table", min_itemsize={"index": 6})
            # just make sure there is a longer string:
            df2 = df.copy().reset_index().assign(C="longer").set_index("C")
            store.append("ss3", df2)
            tm.assert_frame_equal(store.select("ss3"), concat([df, df2]))

            # same as above, with a Series
            store.put("ss4", df["B"], format="table", min_itemsize={"index": 6})
            store.append("ss4", df2["B"])
            tm.assert_series_equal(store.select("ss4"), concat([df["B"], df2["B"]]))

            # with nans
            _maybe_remove(store, "df")
            df = tm.makeTimeDataFrame()
            df["string"] = "foo"
            df.loc[df.index[1:4], "string"] = np.nan
            df["string2"] = "bar"
            df.loc[df.index[4:8], "string2"] = np.nan
            df["string3"] = "bah"
            df.loc[df.index[1:], "string3"] = np.nan
            store.append("df", df)
            result = store.select("df")
            tm.assert_frame_equal(result, df)

    with ensure_clean_store(setup_path) as store:

        def check_col(key, name, size):
            # Compare against ``size``.  The earlier form
            # ``assert itemsize, size`` treated ``size`` as the assertion
            # message, so the check passed for any non-zero itemsize.
            assert (
                getattr(store.get_storer(key).table.description, name).itemsize
                == size
            )

        df = DataFrame({"A": "foo", "B": "bar"}, index=range(10))

        # a min_itemsize that creates a data_column
        _maybe_remove(store, "df")
        store.append("df", df, min_itemsize={"A": 200})
        check_col("df", "A", 200)
        assert store.get_storer("df").data_columns == ["A"]

        # a min_itemsize that creates a data_column2
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=["B"], min_itemsize={"A": 200})
        check_col("df", "A", 200)
        assert store.get_storer("df").data_columns == ["B", "A"]

        # a min_itemsize keyed on "values" sizes both the data column and
        # the remaining values block
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=["B"], min_itemsize={"values": 200})
        check_col("df", "B", 200)
        check_col("df", "values_block_0", 200)
        assert store.get_storer("df").data_columns == ["B"]

        # infer the .typ on subsequent appends
        _maybe_remove(store, "df")
        store.append("df", df[:5], min_itemsize=200)
        store.append("df", df[5:], min_itemsize=200)
        tm.assert_frame_equal(store["df"], df)

        # invalid min_itemsize keys
        df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
        _maybe_remove(store, "df")
        msg = re.escape(
            "min_itemsize has the key [foo] which is not an axis or data_column"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df, min_itemsize={"foo": 20, "foobar": 20})
Exemple #24
0
def test_append(setup_path):
    """Append frames to table keys and read them back unchanged.

    Covers plain keys, a key first written with ``put(format="table")``, keys
    with a leading slash, a key containing a space, an object column holding
    NaN, and unsigned-integer columns (with and without data columns).
    """
    # Keys such as "/df3 foo" are allowed but almost never what you want
    # (tables.NaturalNameWarning); warnings are recorded rather than shown.
    with ensure_clean_store(setup_path) as store, catch_warnings(record=True):
        frame = tm.makeTimeDataFrame()
        head, tail = frame[:10], frame[10:]

        def append_in_two_steps(remove_key, write_key, read_key):
            # write the frame in two pieces and compare the stored result
            _maybe_remove(store, remove_key)
            store.append(write_key, head)
            store.append(write_key, tail)
            tm.assert_frame_equal(store[read_key], frame)

        append_in_two_steps("df1", "df1", "df1")

        # table created through put, then extended with append
        _maybe_remove(store, "df2")
        store.put("df2", head, format="table")
        store.append("df2", tail)
        tm.assert_frame_equal(store["df2"], frame)

        # leading slash on write, bare name on read
        append_in_two_steps("df3", "/df3", "df3")

        # key with a space in it
        append_in_two_steps("/df3 foo", "/df3 foo", "df3 foo")

        # dtype issues - mixed type in a single object column
        mixed = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
        mixed["mixed_column"] = "testing"
        mixed.loc[2, "mixed_column"] = np.nan
        _maybe_remove(store, "df")
        store.append("df", mixed)
        tm.assert_frame_equal(store["df"], mixed)

        # unsigned integer columns of every width
        u08 = Series(np.random.randint(0, high=255, size=5), dtype=np.uint8)
        u16 = Series(np.random.randint(0, high=65535, size=5), dtype=np.uint16)
        u32 = Series(np.random.randint(0, high=2 ** 30, size=5), dtype=np.uint32)
        u64 = Series([2 ** exp for exp in range(58, 63)], dtype=np.uint64)
        unsigned = DataFrame(
            {"u08": u08, "u16": u16, "u32": u32, "u64": u64},
            index=np.arange(5),
        )

        # first as plain values, then with the narrower columns indexable
        # (64-bit indices not yet supported)
        for extra in ({}, {"data_columns": ["u08", "u16", "u32"]}):
            _maybe_remove(store, "uints")
            store.append("uints", unsigned, **extra)
            tm.assert_frame_equal(store["uints"], unsigned)
Exemple #25
0
def test_append_all_nans(setup_path):
    """Check how all-NaN rows are handled by ``append`` with ``dropna``.

    For a purely float frame, all-NaN rows are dropped when ``dropna=True``
    (or when the ``io.hdf.dropna_table`` option is True) and kept otherwise.
    For frames that also carry string or datetime columns, the rows are
    expected to be written regardless of ``dropna``.

    Parameters
    ----------
    setup_path : fixture
        Passed straight through to ``ensure_clean_store``.
    """

    with ensure_clean_store(setup_path) as store:

        df = DataFrame(
            {"A1": np.random.randn(20), "A2": np.random.randn(20)},
            index=np.arange(20),
        )
        # label slice is inclusive: rows 0..15 become NaN, 16..19 keep data
        df.loc[0:15, :] = np.nan

        # nan some entire rows (dropna=True)
        _maybe_remove(store, "df")
        store.append("df", df[:10], dropna=True)
        store.append("df", df[10:], dropna=True)
        tm.assert_frame_equal(store["df"], df[-4:])

        # nan some entire rows (dropna=False)
        _maybe_remove(store, "df2")
        store.append("df2", df[:10], dropna=False)
        store.append("df2", df[10:], dropna=False)
        tm.assert_frame_equal(store["df2"], df)

        # tests the option io.hdf.dropna_table
        # option_context restores the previous value on exit (even when an
        # assertion fails), so the setting cannot leak into other tests.
        with pd.option_context("io.hdf.dropna_table", False):
            _maybe_remove(store, "df3")
            store.append("df3", df[:10])
            store.append("df3", df[10:])
            tm.assert_frame_equal(store["df3"], df)

        with pd.option_context("io.hdf.dropna_table", True):
            _maybe_remove(store, "df4")
            store.append("df4", df[:10])
            store.append("df4", df[10:])
            tm.assert_frame_equal(store["df4"], df[-4:])

        # nan some entire rows (string are still written!)
        df = DataFrame(
            {
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
            },
            index=np.arange(20),
        )

        df.loc[0:15, :] = np.nan

        _maybe_remove(store, "df")
        store.append("df", df[:10], dropna=True)
        store.append("df", df[10:], dropna=True)
        tm.assert_frame_equal(store["df"], df)

        _maybe_remove(store, "df2")
        store.append("df2", df[:10], dropna=False)
        store.append("df2", df[10:], dropna=False)
        tm.assert_frame_equal(store["df2"], df)

        # nan some entire rows (but since we have dates they are still
        # written!)
        df = DataFrame(
            {
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
                "D": Timestamp("20010101"),
                "E": datetime.datetime(2001, 1, 2, 0, 0),
            },
            index=np.arange(20),
        )

        df.loc[0:15, :] = np.nan

        _maybe_remove(store, "df")
        store.append("df", df[:10], dropna=True)
        store.append("df", df[10:], dropna=True)
        tm.assert_frame_equal(store["df"], df)

        _maybe_remove(store, "df2")
        store.append("df2", df[:10], dropna=False)
        store.append("df2", df[10:], dropna=False)
        tm.assert_frame_equal(store["df2"], df)
def test_append_with_timezones_dateutil(setup_path):
    """Round-trip tz-aware data built with dateutil timezones through a store.

    The first store holds tz-aware values as columns (plain and as data
    columns) and checks that appending columns with mismatched timezones
    raises ``ValueError``.  The second store holds a tz-aware index.
    """

    from datetime import timedelta

    # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows
    # filename issues.
    from pandas._libs.tslibs.timezones import maybe_get_tz

    # build a dateutil timezone from a plain zone name such as "US/Eastern"
    gettz = lambda x: maybe_get_tz("dateutil/" + x)

    # as columns
    with ensure_clean_store(setup_path) as store:

        _maybe_remove(store, "df_tz")
        # five hourly tz-aware timestamps in a single column
        df = DataFrame(
            dict(A=[
                Timestamp("20130102 2:00:00", tz=gettz("US/Eastern")) +
                timedelta(hours=1) * i for i in range(5)
            ]))

        store.append("df_tz", df, data_columns=["A"])
        result = store["df_tz"]
        _compare_with_tz(result, df)
        tm.assert_frame_equal(result, df)

        # select with tz aware
        # NOTE(review): the where string names ``df``; presumably it is looked
        # up in this function's scope, so keep the local called ``df``.
        expected = df[df.A >= df.A[3]]
        result = store.select("df_tz", where="A>=df.A[3]")
        _compare_with_tz(result, expected)

        # ensure we include dates in DST and STD time here.
        _maybe_remove(store, "df_tz")
        df = DataFrame(
            dict(
                A=Timestamp("20130102", tz=gettz("US/Eastern")),
                B=Timestamp("20130603", tz=gettz("US/Eastern")),
            ),
            index=range(5),
        )
        store.append("df_tz", df)
        result = store["df_tz"]
        _compare_with_tz(result, df)
        tm.assert_frame_equal(result, df)

        # same frame shape, but column B now uses a different timezone (EET)
        df = DataFrame(
            dict(
                A=Timestamp("20130102", tz=gettz("US/Eastern")),
                B=Timestamp("20130102", tz=gettz("EET")),
            ),
            index=range(5),
        )
        # appending it to the existing table is expected to fail
        with pytest.raises(ValueError):
            store.append("df_tz", df)

        # this is ok
        # (fresh table, with A and B written as separate data columns)
        _maybe_remove(store, "df_tz")
        store.append("df_tz", df, data_columns=["A", "B"])
        result = store["df_tz"]
        _compare_with_tz(result, df)
        tm.assert_frame_equal(result, df)

        # can't append with diff timezone
        df = DataFrame(
            dict(
                A=Timestamp("20130102", tz=gettz("US/Eastern")),
                B=Timestamp("20130102", tz=gettz("CET")),
            ),
            index=range(5),
        )
        with pytest.raises(ValueError):
            store.append("df_tz", df)

    # as index
    with ensure_clean_store(setup_path) as store:

        # GH 4098 example
        df = DataFrame(
            dict(A=Series(
                range(3),
                index=date_range(
                    "2000-1-1", periods=3, freq="H", tz=gettz("US/Eastern")),
            )))

        # round-trip through put ...
        _maybe_remove(store, "df")
        store.put("df", df)
        result = store.select("df")
        tm.assert_frame_equal(result, df)

        # ... and through append
        _maybe_remove(store, "df")
        store.append("df", df)
        result = store.select("df")
        tm.assert_frame_equal(result, df)
def test_append_with_timezones_pytz(setup_path):
    """Round-trip tz-aware data given as timezone name strings through a store.

    The first store holds tz-aware values as columns (plain and as data
    columns) and checks that appending columns with mismatched timezones
    raises ``ValueError``.  The second store holds a tz-aware index.
    """

    from datetime import timedelta

    # as columns
    with ensure_clean_store(setup_path) as store:

        _maybe_remove(store, "df_tz")
        # five hourly tz-aware timestamps in a single column
        df = DataFrame(
            dict(A=[
                Timestamp("20130102 2:00:00", tz="US/Eastern") +
                timedelta(hours=1) * i for i in range(5)
            ]))
        store.append("df_tz", df, data_columns=["A"])
        result = store["df_tz"]
        _compare_with_tz(result, df)
        tm.assert_frame_equal(result, df)

        # select with tz aware
        # NOTE(review): the where string names ``df``; presumably it is looked
        # up in this function's scope, so keep the local called ``df``.
        _compare_with_tz(store.select("df_tz", where="A>=df.A[3]"),
                         df[df.A >= df.A[3]])

        _maybe_remove(store, "df_tz")
        # ensure we include dates in DST and STD time here.
        df = DataFrame(
            dict(
                A=Timestamp("20130102", tz="US/Eastern"),
                B=Timestamp("20130603", tz="US/Eastern"),
            ),
            index=range(5),
        )
        store.append("df_tz", df)
        result = store["df_tz"]
        _compare_with_tz(result, df)
        tm.assert_frame_equal(result, df)

        # same frame shape, but column B now uses a different timezone (EET)
        df = DataFrame(
            dict(
                A=Timestamp("20130102", tz="US/Eastern"),
                B=Timestamp("20130102", tz="EET"),
            ),
            index=range(5),
        )
        # appending it to the existing table is expected to fail
        with pytest.raises(ValueError):
            store.append("df_tz", df)

        # this is ok
        # (fresh table, with A and B written as separate data columns)
        _maybe_remove(store, "df_tz")
        store.append("df_tz", df, data_columns=["A", "B"])
        result = store["df_tz"]
        _compare_with_tz(result, df)
        tm.assert_frame_equal(result, df)

        # can't append with diff timezone
        df = DataFrame(
            dict(
                A=Timestamp("20130102", tz="US/Eastern"),
                B=Timestamp("20130102", tz="CET"),
            ),
            index=range(5),
        )
        with pytest.raises(ValueError):
            store.append("df_tz", df)

    # as index
    with ensure_clean_store(setup_path) as store:

        # GH 4098 example
        df = DataFrame(
            dict(A=Series(
                range(3),
                index=date_range(
                    "2000-1-1", periods=3, freq="H", tz="US/Eastern"),
            )))

        # round-trip through put ...
        _maybe_remove(store, "df")
        store.put("df", df)
        result = store.select("df")
        tm.assert_frame_equal(result, df)

        # ... and through append
        _maybe_remove(store, "df")
        store.append("df", df)
        result = store.select("df")
        tm.assert_frame_equal(result, df)
Exemple #28
0
def test_select_dtypes(setup_path):
    """Check ``store.select`` where-clauses against columns of several dtypes.

    Covers a Timestamp data column, a boolean data column compared with
    several spellings of true/false, filtering on ``index`` for integer and
    float data, float columns with and without NaN, and a comparison against
    a numpy scalar named in the where string.
    """

    with ensure_clean_store(setup_path) as store:
        # with a Timestamp data column (GH #2637)
        df = DataFrame({
            "ts": bdate_range("2012-01-01", periods=300),
            "A": np.random.randn(300),
        })
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=["ts", "A"])

        result = store.select("df", "ts>=Timestamp('2012-02-01')")
        expected = df[df.ts >= Timestamp("2012-02-01")]
        tm.assert_frame_equal(expected, result)

        # bool columns (GH #2849)
        df = DataFrame(np.random.randn(5, 2), columns=["A", "B"])
        df["object"] = "foo"
        df.loc[4:5, "object"] = "bar"
        df["boolv"] = df["A"] > 0
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=True)

        # each spelling of "true" must select the same rows
        expected = df[df.boolv == True].reindex(columns=["A", "boolv"])  # noqa
        for v in [True, "true", 1]:
            result = store.select("df",
                                  f"boolv == {v}",
                                  columns=["A", "boolv"])
            tm.assert_frame_equal(expected, result)

        # and likewise for each spelling of "false"
        expected = df[df.boolv == False].reindex(columns=["A",
                                                          "boolv"])  # noqa
        for v in [False, "false", 0]:
            result = store.select("df",
                                  f"boolv == {v}",
                                  columns=["A", "boolv"])
            tm.assert_frame_equal(expected, result)

        # integer index
        df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)})
        _maybe_remove(store, "df_int")
        store.append("df_int", df)
        result = store.select("df_int", "index<10 and columns=['A']")
        expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
        tm.assert_frame_equal(expected, result)

        # float index
        # NOTE(review): the float values are stored in a *column* named
        # "index"; the frame itself keeps its default index - confirm which
        # of the two the where clause is meant to filter on.
        df = DataFrame({
            "A": np.random.rand(20),
            "B": np.random.rand(20),
            "index": np.arange(20, dtype="f8"),
        })
        _maybe_remove(store, "df_float")
        store.append("df_float", df)
        result = store.select("df_float", "index<10.0 and columns=['A']")
        expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
        tm.assert_frame_equal(expected, result)

    with ensure_clean_store(setup_path) as store:

        # floats w/o NaN
        df = DataFrame({
            "cols": range(11),
            "values": range(11)
        },
                       dtype="float64")
        # turn "cols" into strings so the frame mixes string and float data
        df["cols"] = (df["cols"] + 10).apply(str)

        store.append("df1", df, data_columns=True)
        result = store.select("df1", where="values>2.0")
        expected = df[df["values"] > 2.0]
        tm.assert_frame_equal(expected, result)

        # floats with NaN
        df.iloc[0] = np.nan
        expected = df[df["values"] > 2.0]

        # index=False here; the disabled block below documents a PyTables
        # issue seen when row 0 is NaN and an index is created
        store.append("df2", df, data_columns=True, index=False)
        result = store.select("df2", where="values>2.0")
        tm.assert_frame_equal(expected, result)

        # https://github.com/PyTables/PyTables/issues/282
        # bug in selection when 0th row has a np.nan and an index
        # store.append('df3',df,data_columns=True)
        # result = store.select(
        #    'df3', where='values>2.0')
        # tm.assert_frame_equal(expected, result)

        # not in first position float with NaN ok too
        df = DataFrame({
            "cols": range(11),
            "values": range(11)
        },
                       dtype="float64")
        df["cols"] = (df["cols"] + 10).apply(str)

        df.iloc[1] = np.nan
        expected = df[df["values"] > 2.0]

        store.append("df4", df, data_columns=True)
        result = store.select("df4", where="values>2.0")
        tm.assert_frame_equal(expected, result)

    # test selection with comparison against numpy scalar
    # GH 11283
    with ensure_clean_store(setup_path) as store:
        df = tm.makeDataFrame()

        expected = df[df["A"] > 0]

        store.append("df", df, data_columns=True)
        # only referenced by name inside the where string below, hence the
        # unused-variable suppression; do not rename or remove it
        np_zero = np.float64(0)  # noqa:F841
        result = store.select("df", where=["A>np_zero"])
        tm.assert_frame_equal(expected, result)
Exemple #29
0
def test_coordinates(setup_path):
    """Check coordinate-based selection from a store.

    Covers ``select_as_coordinates`` with and without a where clause, feeding
    the returned coordinates back into ``select`` (including across two
    tables), passing integer locations or boolean lists as ``where``, the
    ``TypeError`` raised for invalid coordinate arrays, and ``start``/``stop``.
    """
    df = tm.makeTimeDataFrame()

    with ensure_clean_store(setup_path) as store:

        _maybe_remove(store, "df")
        store.append("df", df)

        # all
        c = store.select_as_coordinates("df")
        assert (c.values == np.arange(len(df.index))).all()

        # get coordinates back & test vs frame
        _maybe_remove(store, "df")

        df = DataFrame({"A": range(5), "B": range(5)})
        store.append("df", df)
        c = store.select_as_coordinates("df", ["index<3"])
        assert (c.values == np.arange(3)).all()
        result = store.select("df", where=c)
        expected = df.loc[0:2, :]
        tm.assert_frame_equal(result, expected)

        # two terms combined: rows 3 and 4
        c = store.select_as_coordinates("df", ["index>=3", "index<=4"])
        assert (c.values == np.arange(2) + 3).all()
        result = store.select("df", where=c)
        expected = df.loc[3:4, :]
        tm.assert_frame_equal(result, expected)
        assert isinstance(c, Index)

        # multiple tables
        _maybe_remove(store, "df1")
        _maybe_remove(store, "df2")
        df1 = tm.makeTimeDataFrame()
        df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
        store.append("df1", df1, data_columns=["A", "B"])
        store.append("df2", df2)

        # coordinates computed on df1 are reused to pull rows from df2
        c = store.select_as_coordinates("df1", ["A>0", "B>0"])
        df1_result = store.select("df1", c)
        df2_result = store.select("df2", c)
        result = concat([df1_result, df2_result], axis=1)

        expected = concat([df1, df2], axis=1)
        expected = expected[(expected.A > 0) & (expected.B > 0)]
        tm.assert_frame_equal(result, expected, check_freq=False)
        # FIXME: 2021-01-18 on some (mostly windows) builds we get freq=None
        #  but expect freq="18B"

    # pass array/mask as the coordinates
    with ensure_clean_store(setup_path) as store:

        df = DataFrame(np.random.randn(1000, 2),
                       index=date_range("20000101", periods=1000))
        store.append("df", df)
        c = store.select_column("df", "index")
        # positions of the rows whose stored index falls in May
        where = c[DatetimeIndex(c).month == 5].index
        expected = df.iloc[where]

        # locations
        result = store.select("df", where=where)
        tm.assert_frame_equal(result, expected)

        # boolean
        # NOTE(review): this repeats the "locations" select with the same
        # ``where``; no boolean mask is built here - confirm intent.
        result = store.select("df", where=where)
        tm.assert_frame_equal(result, expected)

        # invalid
        msg = ("where must be passed as a string, PyTablesExpr, "
               "or list-like of PyTablesExpr")
        # float-typed coordinates
        with pytest.raises(TypeError, match=msg):
            store.select("df", where=np.arange(len(df), dtype="float64"))

        # one more coordinate than there are rows
        with pytest.raises(TypeError, match=msg):
            store.select("df", where=np.arange(len(df) + 1))

        # coordinates combined with start / start+stop
        with pytest.raises(TypeError, match=msg):
            store.select("df", where=np.arange(len(df)), start=5)

        with pytest.raises(TypeError, match=msg):
            store.select("df", where=np.arange(len(df)), start=5, stop=10)

        # selection with filter
        # ``selection`` is named inside the where string, so keep this local
        # name as is
        selection = date_range("20000101", periods=500)
        result = store.select("df", where="index in selection")
        expected = df[df.index.isin(selection)]
        tm.assert_frame_equal(result, expected)

        # list
        df = DataFrame(np.random.randn(10, 2))
        store.append("df2", df)
        result = store.select("df2", where=[0, 3, 5])
        expected = df.iloc[[0, 3, 5]]
        tm.assert_frame_equal(result, expected)

        # boolean
        where = [True] * 10
        where[-2] = False
        result = store.select("df2", where=where)
        expected = df.loc[where]
        tm.assert_frame_equal(result, expected)

        # start/stop
        result = store.select("df2", start=5, stop=10)
        expected = df[5:10]
        tm.assert_frame_equal(result, expected)
Exemple #30
0
def test_select_iterator_complete_8014(setup_path):
    """Check that where-clauses spanning the full index return every row.

    The same four selections (no where clause, lower bound, upper bound, and
    both bounds) are run first without ``chunksize`` and then with it, where
    the chunks are concatenated before comparing with the stored frame.
    """

    # GH 8014
    # using iterator and where clause
    chunksize = 1e4

    # no iterator
    with ensure_clean_store(setup_path) as store:

        # presumably 100064 rows: not a multiple of chunksize, so chunked
        # reads end on a partial chunk - TODO confirm against the helper
        expected = tm.makeTimeDataFrame(100064, "S")
        _maybe_remove(store, "df")
        store.append("df", expected)

        # first and last index values bound the where clauses below
        beg_dt = expected.index[0]
        end_dt = expected.index[-1]

        # select w/o iteration and no where clause works
        result = store.select("df")
        tm.assert_frame_equal(expected, result)

        # select w/o iterator and where clause, single term, begin
        # of range, works
        where = f"index >= '{beg_dt}'"
        result = store.select("df", where=where)
        tm.assert_frame_equal(expected, result)

        # select w/o iterator and where clause, single term, end
        # of range, works
        where = f"index <= '{end_dt}'"
        result = store.select("df", where=where)
        tm.assert_frame_equal(expected, result)

        # select w/o iterator and where clause, inclusive range,
        # works
        where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
        result = store.select("df", where=where)
        tm.assert_frame_equal(expected, result)

    # with iterator, full range
    with ensure_clean_store(setup_path) as store:

        expected = tm.makeTimeDataFrame(100064, "S")
        _maybe_remove(store, "df")
        store.append("df", expected)

        beg_dt = expected.index[0]
        end_dt = expected.index[-1]

        # select w/iterator and no where clause works
        # (chunks are collected and concatenated back into one frame)
        results = list(store.select("df", chunksize=chunksize))
        result = concat(results)
        tm.assert_frame_equal(expected, result)

        # select w/iterator and where clause, single term, begin of range
        where = f"index >= '{beg_dt}'"
        results = list(store.select("df", where=where, chunksize=chunksize))
        result = concat(results)
        tm.assert_frame_equal(expected, result)

        # select w/iterator and where clause, single term, end of range
        where = f"index <= '{end_dt}'"
        results = list(store.select("df", where=where, chunksize=chunksize))
        result = concat(results)
        tm.assert_frame_equal(expected, result)

        # select w/iterator and where clause, inclusive range
        where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
        results = list(store.select("df", where=where, chunksize=chunksize))
        result = concat(results)
        tm.assert_frame_equal(expected, result)