Example #1
    def test_subclass_unstack_multi(self):
        # GH 15564
        df = tm.SubclassedDataFrame(
            [[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33],
             [40, 41, 42, 43]],
            index=MultiIndex.from_tuples(list(zip(list("AABB"), list("cdcd"))),
                                         names=["aaa", "ccc"]),
            columns=MultiIndex.from_tuples(list(zip(list("WWXX"),
                                                    list("yzyz"))),
                                           names=["www", "yyy"]),
        )

        exp = tm.SubclassedDataFrame(
            [[10, 20, 11, 21, 12, 22, 13, 23],
             [30, 40, 31, 41, 32, 42, 33, 43]],
            index=Index(["A", "B"], name="aaa"),
            columns=MultiIndex.from_tuples(
                list(zip(list("WWWWXXXX"), list("yyzzyyzz"),
                         list("cdcdcdcd"))),
                names=["www", "yyy", "ccc"],
            ),
        )

        res = df.unstack()
        tm.assert_frame_equal(res, exp)

        res = df.unstack("ccc")
        tm.assert_frame_equal(res, exp)

        exp = tm.SubclassedDataFrame(
            [[10, 30, 11, 31, 12, 32, 13, 33],
             [20, 40, 21, 41, 22, 42, 23, 43]],
            index=Index(["c", "d"], name="ccc"),
            columns=MultiIndex.from_tuples(
                list(zip(list("WWWWXXXX"), list("yyzzyyzz"),
                         list("ABABABAB"))),
                names=["www", "yyy", "aaa"],
            ),
        )

        res = df.unstack("aaa")
        tm.assert_frame_equal(res, exp)
Example #2
def test_start_stop_fixed(setup_path):

    with ensure_clean_store(setup_path) as store:

        # fixed, GH 8287
        df = DataFrame(
            {
                "A": np.random.rand(20),
                "B": np.random.rand(20)
            },
            index=date_range("20130101", periods=20),
        )
        store.put("df", df)

        result = store.select("df", start=0, stop=5)
        expected = df.iloc[0:5, :]
        tm.assert_frame_equal(result, expected)

        result = store.select("df", start=5, stop=10)
        expected = df.iloc[5:10, :]
        tm.assert_frame_equal(result, expected)

        # out of range
        result = store.select("df", start=30, stop=40)
        expected = df.iloc[30:40, :]
        tm.assert_frame_equal(result, expected)

        # series
        s = df.A
        store.put("s", s)
        result = store.select("s", start=0, stop=5)
        expected = s.iloc[0:5]
        tm.assert_series_equal(result, expected)

        result = store.select("s", start=5, stop=10)
        expected = s.iloc[5:10]
        tm.assert_series_equal(result, expected)

        # sparse; not implemented
        df = tm.makeDataFrame()
        df.iloc[3:5, 1:3] = np.nan
        df.iloc[8:10, -2] = np.nan
Example #3
    def test_sort_index_multilevel_repr_8017(self, gen, extra):
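        # gen and extra are parametrized inputs (a sequence of second-level column
        # labels plus one additional label); they come from the test's
        # parametrization, which is not shown in this snippet.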

        np.random.seed(0)
        data = np.random.randn(3, 4)

        columns = MultiIndex.from_tuples([("red", i) for i in gen])
        df = DataFrame(data, index=list("def"), columns=columns)
        df2 = pd.concat(
            [
                df,
                DataFrame(
                    "world",
                    index=list("def"),
                    columns=MultiIndex.from_tuples([("red", extra)]),
                ),
            ],
            axis=1,
        )

        # check that the repr is good
        # make sure that we have a correct sparsified repr
        # e.g. only 1 header of "red"
        assert str(df2).splitlines()[0].split() == ["red"]

        # GH 8017
        # sorting fails after columns added

        # construct single-dtype then sort
        result = df.copy().sort_index(axis=1)
        expected = df.iloc[:, [0, 2, 1, 3]]
        tm.assert_frame_equal(result, expected)

        result = df2.sort_index(axis=1)
        expected = df2.iloc[:, [0, 2, 1, 4, 3]]
        tm.assert_frame_equal(result, expected)

        # setitem then sort
        result = df.copy()
        result[("red", extra)] = "world"

        result = result.sort_index(axis=1)
        tm.assert_frame_equal(result, expected)
Example #4
    def test_excel_table(self, read_ext, df_ref):
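        # pd.read_excel appears to be wrapped in a functools.partial by an autouse
        # fixture in this test module (not shown here), which is why
        # ``.keywords["engine"]`` can be inspected below.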
        if pd.read_excel.keywords["engine"] == "pyxlsb":
            pytest.xfail("Sheets containing datetimes not supported by pyxlsb")

        df1 = pd.read_excel("test1" + read_ext,
                            sheet_name="Sheet1",
                            index_col=0)
        df2 = pd.read_excel("test1" + read_ext,
                            sheet_name="Sheet2",
                            skiprows=[1],
                            index_col=0)
        # TODO add index to file
        tm.assert_frame_equal(df1, df_ref, check_names=False)
        tm.assert_frame_equal(df2, df_ref, check_names=False)

        df3 = pd.read_excel("test1" + read_ext,
                            sheet_name="Sheet1",
                            index_col=0,
                            skipfooter=1)
        tm.assert_frame_equal(df3, df1.iloc[:-1])
Example #5
def test_same_name_scoping(setup_path):

    with ensure_clean_store(setup_path) as store:

        df = DataFrame(np.random.randn(20, 2),
                       index=date_range("20130101", periods=20))
        store.put("df", df, format="table")
        expected = df[df.index > Timestamp("20130105")]

        result = store.select("df", "index>datetime.datetime(2013,1,5)")
        tm.assert_frame_equal(result, expected)

        from datetime import datetime  # noqa

        # technically an error, but allow it
        result = store.select("df", "index>datetime.datetime(2013,1,5)")
        tm.assert_frame_equal(result, expected)

        result = store.select("df", "index>datetime(2013,1,5)")
        tm.assert_frame_equal(result, expected)
Example #6
    def test_get_dummies_basic(self, sparse, dtype):
        s_list = list("abc")
        s_series = Series(s_list)
        s_series_index = Series(s_list, list("ABC"))

        expected = DataFrame(
            {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
            dtype=self.effective_dtype(dtype),
        )
        if sparse:
            expected = expected.apply(SparseArray, fill_value=0.0)
        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)

        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)

        expected.index = list("ABC")
        result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
        tm.assert_frame_equal(result, expected)
Example #7
def test_make_cf_table():
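    # make_cf_table, varied_ratings, and training are presumably defined elsewhere
    # in this test module; the expected book-by-user matrices below are built by
    # hand for 2, 3, and 6 rating classes.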
    result2 = make_cf_table(varied_ratings, training, classes=2)
    result3 = make_cf_table(varied_ratings, training, classes=3)
    result6 = make_cf_table(varied_ratings, training, classes=6)

    books = [10, 20, 30]
    users = [100, 200, 300]
    matrix2 = pd.DataFrame(
        [[0, 0, 0], [1, 1, 0], [0, 0, 0]], index=books, columns=users
    ).astype(float)
    matrix3 = pd.DataFrame(
        [[0, 1, 1], [2, 2, 0], [0, 0, 0]], index=books, columns=users
    ).astype(float)
    matrix6 = pd.DataFrame(
        [[0, 1, 3], [5, 4, 0], [0, 0, 0]], index=books, columns=users
    ).astype(float)

    assert_frame_equal(result2, matrix2)
    assert_frame_equal(result3, matrix3)
    assert_frame_equal(result6, matrix6)
Example #8
    def test_get_dummies_basic_drop_first(self, sparse):
        # GH12402 Add a new parameter `drop_first` to avoid collinearity
        # Basic case
        s_list = list("abc")
        s_series = Series(s_list)
        s_series_index = Series(s_list, list("ABC"))

        expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=np.uint8)

        result = get_dummies(s_list, drop_first=True, sparse=sparse)
        if sparse:
            expected = expected.apply(SparseArray, fill_value=0)
        tm.assert_frame_equal(result, expected)

        result = get_dummies(s_series, drop_first=True, sparse=sparse)
        tm.assert_frame_equal(result, expected)

        expected.index = list("ABC")
        result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
        tm.assert_frame_equal(result, expected)
Example #9
    def test_loc_index(self):
        # gh-17131
        # a boolean index should index like a boolean numpy array

        df = DataFrame(
            np.random.random(size=(5, 10)),
            index=["alpha_0", "alpha_1", "alpha_2", "beta_0", "beta_1"],
        )

        mask = df.index.map(lambda x: "alpha" in x)
        expected = df.loc[np.array(mask)]

        result = df.loc[mask]
        tm.assert_frame_equal(result, expected)

        result = df.loc[mask.values]
        tm.assert_frame_equal(result, expected)

        result = df.loc[pd.array(mask, dtype="boolean")]
        tm.assert_frame_equal(result, expected)
Example #10
    def test_loc_non_unique(self):
        # GH3659
        # non-unique indexer with loc slice
        # https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs

        # these are going to raise because we are non-monotonic
        df = DataFrame(
            {"A": [1, 2, 3, 4, 5, 6], "B": [3, 4, 5, 6, 7, 8]},
            index=[0, 1, 0, 1, 2, 3],
        )
        msg = "'Cannot get left slice bound for non-unique label: 1'"
        with pytest.raises(KeyError, match=msg):
            df.loc[1:]
        msg = "'Cannot get left slice bound for non-unique label: 0'"
        with pytest.raises(KeyError, match=msg):
            df.loc[0:]
        msg = "'Cannot get left slice bound for non-unique label: 1'"
        with pytest.raises(KeyError, match=msg):
            df.loc[1:2]

        # monotonic are ok
        df = DataFrame(
            {"A": [1, 2, 3, 4, 5, 6], "B": [3, 4, 5, 6, 7, 8]},
            index=[0, 1, 0, 1, 2, 3],
        ).sort_index(axis=0)
        result = df.loc[1:]
        expected = DataFrame(
            {"A": [2, 4, 5, 6], "B": [4, 6, 7, 8]},
            index=[1, 1, 2, 3],
        )
        tm.assert_frame_equal(result, expected)

        result = df.loc[0:]
        tm.assert_frame_equal(result, df)

        result = df.loc[1:2]
        expected = DataFrame({"A": [2, 4, 5], "B": [4, 6, 7]}, index=[1, 1, 2])
        tm.assert_frame_equal(result, expected)
Example #11
    def test_empty_field_eof(self):
        data = "a,b,c\n1,2,3\n4,,"

        result = TextReader(StringIO(data), delimiter=",").read()

        expected = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array(["2", ""], dtype=object),
            2: np.array(["3", ""], dtype=object),
        }
        assert_array_dicts_equal(result, expected)

        # GH5664
        a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"])
        b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
        c = DataFrame(
            [
                [1, 2, 3, 4],
                [6, np.nan, np.nan, np.nan],
                [8, 9, 10, 11],
                [13, 14, np.nan, np.nan],
            ],
            columns=list("abcd"),
            index=[0, 5, 7, 12],
        )

        for _ in range(100):
            df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
            tm.assert_frame_equal(df, a)

            df = read_csv(
                StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
            )
            tm.assert_frame_equal(df, b)

            df = read_csv(
                StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
                names=list("abcd"),
                engine="c",
            )
            tm.assert_frame_equal(df, c)
Example #12
    def test_iloc_empty_list_indexer_is_ok(self):

        df = tm.makeCustomDataframe(5, 2)
        # vertical empty
        tm.assert_frame_equal(
            df.iloc[:, []],
            df.iloc[:, :0],
            check_index_type=True,
            check_column_type=True,
        )
        # horizontal empty
        tm.assert_frame_equal(
            df.iloc[[], :],
            df.iloc[:0, :],
            check_index_type=True,
            check_column_type=True,
        )
        # horizontal empty
        tm.assert_frame_equal(
            df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True
        )
Example #13
def test_agg_apply_corner(ts, tsframe):
    # nothing to group, all NA
    grouped = ts.groupby(ts * np.nan)
    assert ts.dtype == np.float64

    # groupby float64 values results in Float64Index
    exp = Series([], dtype=np.float64, index=Index([], dtype=np.float64))
    tm.assert_series_equal(grouped.sum(), exp)
    tm.assert_series_equal(grouped.agg(np.sum), exp)
    tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False)

    # DataFrame
    grouped = tsframe.groupby(tsframe["A"] * np.nan)
    exp_df = DataFrame(
        columns=tsframe.columns,
        dtype=float,
        index=Index([], name="A", dtype=np.float64),
    )
    tm.assert_frame_equal(grouped.sum(), exp_df)
    tm.assert_frame_equal(grouped.agg(np.sum), exp_df)
    tm.assert_frame_equal(grouped.apply(np.sum), exp_df)
Example #14
    def test_partial_set(self,
                         multiindex_year_month_day_dataframe_random_data):
        # GH #397
        ymd = multiindex_year_month_day_dataframe_random_data
        df = ymd.copy()
        exp = ymd.copy()
        df.loc[2000, 4] = 0
        exp.loc[2000, 4].values[:] = 0
        tm.assert_frame_equal(df, exp)

        df["A"].loc[2000, 4] = 1
        exp["A"].loc[2000, 4].values[:] = 1
        tm.assert_frame_equal(df, exp)

        df.loc[2000] = 5
        exp.loc[2000].values[:] = 5
        tm.assert_frame_equal(df, exp)

        # this works...for now
        df["A"].iloc[14] = 5
        assert df["A"].iloc[14] == 5
Example #15
def test_dtype_and_names_error(c_parser_only):
    # see gh-8833: passing both dtype and names
    # resulting in an error reporting issue
    parser = c_parser_only
    data = """
1.0 1
2.0 2
3.0 3
"""
    # base cases
    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"])
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)

    # fallback casting
    result = parser.read_csv(
        StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32}
    )
    expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"])
    expected["a"] = expected["a"].astype(np.int32)
    tm.assert_frame_equal(result, expected)

    data = """
1.0 1
nan 2
3.0 3
"""
    # fallback casting, but not castable
    with pytest.raises(ValueError, match="cannot safely convert"):
        parser.read_csv(
            StringIO(data),
            sep=r"\s+",
            header=None,
            names=["a", "b"],
            dtype={"a": np.int32},
        )
Example #16
    def test_describe_bool_frame(self):
        # GH#13891
        df = pd.DataFrame({
            "bool_data_1": [False, False, True, True],
            "bool_data_2": [False, True, True, True],
        })
        result = df.describe()
        expected = DataFrame(
            {
                "bool_data_1": [4, 2, True, 2],
                "bool_data_2": [4, 2, True, 3]
            },
            index=["count", "unique", "top", "freq"],
        )
        tm.assert_frame_equal(result, expected)

        df = pd.DataFrame({
            "bool_data": [False, False, True, True, False],
            "int_data": [0, 1, 2, 3, 4],
        })
        result = df.describe()
        expected = DataFrame(
            {"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]},
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        )
        tm.assert_frame_equal(result, expected)

        df = pd.DataFrame({
            "bool_data": [False, False, True, True],
            "str_data": ["a", "b", "c", "a"]
        })
        result = df.describe()
        expected = DataFrame(
            {
                "bool_data": [4, 2, True, 2],
                "str_data": [4, 3, "a", 2]
            },
            index=["count", "unique", "top", "freq"],
        )
        tm.assert_frame_equal(result, expected)
Example #17
    def test_interp_rowwise(self):
        df = DataFrame({
            0: [1, 2, np.nan, 4],
            1: [2, 3, 4, np.nan],
            2: [np.nan, 4, 5, 6],
            3: [4, np.nan, 6, 7],
            4: [1, 2, 3, 4],
        })
        result = df.interpolate(axis=1)
        expected = df.copy()
        expected.loc[3, 1] = 5
        expected.loc[0, 2] = 3
        expected.loc[1, 3] = 3
        expected[4] = expected[4].astype(np.float64)
        tm.assert_frame_equal(result, expected)

        result = df.interpolate(axis=1, method="values")
        tm.assert_frame_equal(result, expected)

        result = df.interpolate(axis=0)
        expected = df.interpolate()
        tm.assert_frame_equal(result, expected)
Example #18
    def test_iloc_getitem_slice(self):
        df = DataFrame(
            [
                {"A": 1, "B": 2, "C": 3},
                {"A": 100, "B": 200, "C": 300},
                {"A": 1000, "B": 2000, "C": 3000},
            ]
        )

        expected = DataFrame([{"A": 1, "B": 2, "C": 3}, {"A": 100, "B": 200, "C": 300}])
        result = df.iloc[:2]
        tm.assert_frame_equal(result, expected)

        expected = DataFrame([{"A": 100, "B": 200}], index=[1])
        result = df.iloc[1:2, 0:2]
        tm.assert_frame_equal(result, expected)

        expected = DataFrame(
            [{"A": 1, "C": 3}, {"A": 100, "C": 300}, {"A": 1000, "C": 3000}]
        )
        result = df.iloc[:, lambda df: [0, 2]]
        tm.assert_frame_equal(result, expected)
Example #19
    def test_crosstab_ndarray(self, box):
        # GH 44076
        a = box(np.random.randint(0, 5, size=100))
        b = box(np.random.randint(0, 3, size=100))
        c = box(np.random.randint(0, 10, size=100))

        df = DataFrame({"a": a, "b": b, "c": c})

        result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"))
        expected = crosstab(df["a"], [df["b"], df["c"]])
        tm.assert_frame_equal(result, expected)

        result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c"))
        expected = crosstab([df["b"], df["c"]], df["a"])
        tm.assert_frame_equal(result, expected)

        # assign arbitrary names
        result = crosstab(a, c)
        expected = crosstab(df["a"], df["c"])
        expected.index.names = ["row_0"]
        expected.columns.names = ["col_0"]
        tm.assert_frame_equal(result, expected)
Example #20
    def test_subset(self, date_range_frame):
        N = 10
        df = date_range_frame.iloc[:N].copy()
        df.loc[df.index[4:8], "A"] = np.nan
        dates = date_range("1/1/1990", periods=N * 3, freq="25s")

        # with a subset of A should be the same
        result = df.asof(dates, subset="A")
        expected = df.asof(dates)
        tm.assert_frame_equal(result, expected)

        # same with A/B
        result = df.asof(dates, subset=["A", "B"])
        expected = df.asof(dates)
        tm.assert_frame_equal(result, expected)

        # B gives df.asof
        result = df.asof(dates, subset="B")
        expected = df.resample("25s", closed="right").ffill().reindex(dates)
        expected.iloc[20:] = 9

        tm.assert_frame_equal(result, expected)
Example #21
    def test_int_types(self, np_type, path):
        # Test np.int values read come back as int
        # (rather than float which is Excel's format).
        df = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np_type)
        df.to_excel(path, "test1")

        reader = ExcelFile(path)
        recons = pd.read_excel(reader, "test1", index_col=0)

        int_frame = df.astype(np.int64)
        tm.assert_frame_equal(int_frame, recons)

        recons2 = pd.read_excel(path, "test1", index_col=0)
        tm.assert_frame_equal(int_frame, recons2)

        # Test with convert_float=False comes back as float.
        float_frame = df.astype(float)
        recons = pd.read_excel(path, "test1", convert_float=False, index_col=0)
        tm.assert_frame_equal(recons,
                              float_frame,
                              check_index_type=False,
                              check_column_type=False)
Example #22
    def test_frame_getitem_setitem_multislice(self):
        levels = [["t1", "t2"], ["a", "b", "c"]]
        codes = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]]
        midx = MultiIndex(codes=codes, levels=levels, names=[None, "id"])
        df = DataFrame({"value": [1, 2, 3, 7, 8]}, index=midx)

        result = df.loc[:, "value"]
        tm.assert_series_equal(df["value"], result)

        result = df.loc[df.index[1:3], "value"]
        tm.assert_series_equal(df["value"][1:3], result)

        result = df.loc[:, :]
        tm.assert_frame_equal(df, result)

        result = df
        df.loc[:, "value"] = 10
        result["value"] = 10
        tm.assert_frame_equal(df, result)

        df.loc[:, :] = 10
        tm.assert_frame_equal(df, result)
Example #23
    def test_read_excel_parse_dates(self, ext):
        # see gh-11544, gh-12051
        df = DataFrame(
            {"col": [1, 2, 3], "date_strings": pd.date_range("2012-01-01", periods=3)}
        )
        df2 = df.copy()
        df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y")

        with tm.ensure_clean(ext) as pth:
            df2.to_excel(pth)

            res = pd.read_excel(pth, index_col=0)
            tm.assert_frame_equal(df2, res)

            res = pd.read_excel(pth, parse_dates=["date_strings"], index_col=0)
            tm.assert_frame_equal(df, res)

            date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y")
            res = pd.read_excel(
                pth, parse_dates=["date_strings"], date_parser=date_parser, index_col=0
            )
            tm.assert_frame_equal(df, res)
Example #24
    def test_sort_values_multicolumn(self):
        A = np.arange(5).repeat(20)
        B = np.tile(np.arange(5), 20)
        random.shuffle(A)
        random.shuffle(B)
        frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})

        result = frame.sort_values(by=["A", "B"])
        indexer = np.lexsort((frame["B"], frame["A"]))
        expected = frame.take(indexer)
        tm.assert_frame_equal(result, expected)

        result = frame.sort_values(by=["A", "B"], ascending=False)
        indexer = np.lexsort((frame["B"].rank(ascending=False),
                              frame["A"].rank(ascending=False)))
        expected = frame.take(indexer)
        tm.assert_frame_equal(result, expected)

        result = frame.sort_values(by=["B", "A"])
        indexer = np.lexsort((frame["A"], frame["B"]))
        expected = frame.take(indexer)
        tm.assert_frame_equal(result, expected)
Example #25
    def test_concat_series_partial_columns_names(self):
        # GH10698
        foo = Series([1, 2], name="foo")
        bar = Series([1, 2])
        baz = Series([4, 5])

        result = concat([foo, bar, baz], axis=1)
        expected = DataFrame(
            {"foo": [1, 2], 0: [1, 2], 1: [4, 5]}, columns=["foo", 0, 1]
        )
        tm.assert_frame_equal(result, expected)

        result = concat([foo, bar, baz], axis=1, keys=["red", "blue", "yellow"])
        expected = DataFrame(
            {"red": [1, 2], "blue": [1, 2], "yellow": [4, 5]},
            columns=["red", "blue", "yellow"],
        )
        tm.assert_frame_equal(result, expected)

        result = concat([foo, bar, baz], axis=1, ignore_index=True)
        expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]})
        tm.assert_frame_equal(result, expected)
Example #26
    def test_concat_multiple_tzs(self):
        # GH#12467
        # combining datetime tz-aware and naive DataFrames
        ts1 = Timestamp("2015-01-01", tz=None)
        ts2 = Timestamp("2015-01-01", tz="UTC")
        ts3 = Timestamp("2015-01-01", tz="EST")

        df1 = DataFrame(dict(time=[ts1]))
        df2 = DataFrame(dict(time=[ts2]))
        df3 = DataFrame(dict(time=[ts3]))

        results = pd.concat([df1, df2]).reset_index(drop=True)
        expected = DataFrame(dict(time=[ts1, ts2]), dtype=object)
        tm.assert_frame_equal(results, expected)

        results = pd.concat([df1, df3]).reset_index(drop=True)
        expected = DataFrame(dict(time=[ts1, ts3]), dtype=object)
        tm.assert_frame_equal(results, expected)

        results = pd.concat([df2, df3]).reset_index(drop=True)
        expected = DataFrame(dict(time=[ts2, ts3]))
        tm.assert_frame_equal(results, expected)
Example #27
    def test1_index(self):
        # Tests with DEMO_G.xpt using index (all numeric file)
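        # self.file01 and the numeric_as_float helper are provided by the
        # surrounding test setup, which is not part of this snippet.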

        # Compare to this
        data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv"))
        data_csv = data_csv.set_index("SEQN")
        numeric_as_float(data_csv)

        # Read full file
        data = read_sas(self.file01, index="SEQN", format="xport")
        tm.assert_frame_equal(data, data_csv, check_index_type=False)

        # Test incremental read with `read` method.
        reader = read_sas(self.file01, index="SEQN", format="xport", iterator=True)
        data = reader.read(10)
        reader.close()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)

        # Test incremental read with `get_chunk` method.
        reader = read_sas(self.file01, index="SEQN", format="xport", chunksize=10)
        data = reader.get_chunk()
        reader.close()
        tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)
Example #28
    def test_get_dummies_basic_drop_first_NA(self, sparse):
        # Test NA handling together with drop_first
        s_NA = ["a", "b", np.nan]
        res = get_dummies(s_NA, drop_first=True, sparse=sparse)
        exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8)
        if sparse:
            exp = exp.apply(SparseArray, fill_value=0)

        tm.assert_frame_equal(res, exp)

        res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
        exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=np.uint8).reindex(
            ["b", np.nan], axis=1
        )
        if sparse:
            exp_na = exp_na.apply(SparseArray, fill_value=0)
        tm.assert_frame_equal(res_na, exp_na)

        res_just_na = get_dummies(
            [np.nan], dummy_na=True, drop_first=True, sparse=sparse
        )
        exp_just_na = DataFrame(index=np.arange(1))
        tm.assert_frame_equal(res_just_na, exp_just_na)
Example #29
def test_apply_mutating(using_array_manager):
    # GH#35462 case where applied func pins a new BlockManager to a row
    df = DataFrame({"a": range(100), "b": range(100, 200)})
    df_orig = df.copy()

    def func(row):
        mgr = row._mgr
        row.loc["a"] += 1
        assert row._mgr is not mgr
        return row

    expected = df.copy()
    expected["a"] += 1

    result = df.apply(func, axis=1)

    tm.assert_frame_equal(result, expected)
    if not using_array_manager:
        # INFO(ArrayManager) With BlockManager, the row is a view and mutated in place,
        # with ArrayManager the row is not a view, and thus not mutated in place
        tm.assert_frame_equal(df, result)
    else:
        tm.assert_frame_equal(df, df_orig)
Example #30
    def test_shift_dt64values_int_fill_deprecated(self):
        # GH#31971
        ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")])

        with tm.assert_produces_warning(FutureWarning):
            result = ser.shift(1, fill_value=0)
        expected = Series([pd.Timestamp(0), ser[0]])
        tm.assert_series_equal(result, expected)

        df = ser.to_frame()
        with tm.assert_produces_warning(FutureWarning):
            result = df.shift(1, fill_value=0)
        expected = expected.to_frame()
        tm.assert_frame_equal(result, expected)

        # axis = 1
        df2 = DataFrame({"A": ser, "B": ser})
        df2._consolidate_inplace()

        with tm.assert_produces_warning(FutureWarning):
            result = df2.shift(1, axis=1, fill_value=0)

        expected = DataFrame({
            "A": [pd.Timestamp(0), pd.Timestamp(0)],
            "B": df2["A"]
        })
        tm.assert_frame_equal(result, expected)

        # same thing but not consolidated
        # This isn't great that we get different behavior, but
        #  that will go away when the deprecation is enforced
        df3 = DataFrame({"A": ser})
        df3["B"] = ser
        assert len(df3._mgr.arrays) == 2
        result = df3.shift(1, axis=1, fill_value=0)
        expected = DataFrame({"A": [0, 0], "B": df2["A"]})
        tm.assert_frame_equal(result, expected)