Exemple #1
0
    def test_astype_unicode(self):
        # see GH#7758: A bit of magic is required to set
        # default encoding to utf-8
        digits = string.digits
        test_series = [
            Series([digits * 10,
                    tm.rands(63),
                    tm.rands(64),
                    tm.rands(1000)]),
            Series(["データーサイエンス、お前はもう死んでいる"]),
        ]

        former_encoding = None

        if sys.getdefaultencoding() == "utf-8":
            test_series.append(Series(["野菜食べないとやばい".encode()]))

        for s in test_series:
            res = s.astype("unicode")
            expec = s.map(str)
            tm.assert_series_equal(res, expec)

        # Restore the former encoding
        if former_encoding is not None and former_encoding != "utf-8":
            reload(sys)
            sys.setdefaultencoding(former_encoding)
    def test_getitem_series_integer_with_missing_raises(self, data, idx):
        msg = "Cannot index with an integer indexer containing NA values"
        # TODO: this raises KeyError about labels not found (it tries label-based)

        ser = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))])
        with pytest.raises(ValueError, match=msg):
            ser[idx]
Exemple #3
0
def test_utf16_bom_skiprows(all_parsers, sep, encoding):
    # see gh-2298
    parser = all_parsers
    data = """skip this
skip this too
A,B,C
1,2,3
4,5,6""".replace(",", sep)
    path = "__{}__.csv".format(tm.rands(10))
    kwargs = dict(sep=sep, skiprows=2)
    utf8 = "utf-8"

    with tm.ensure_clean(path) as path:
        from io import TextIOWrapper

        bytes_data = data.encode(encoding)

        with open(path, "wb") as f:
            f.write(bytes_data)

        bytes_buffer = BytesIO(data.encode(utf8))
        bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8)

        result = parser.read_csv(path, encoding=encoding, **kwargs)
        expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs)

        bytes_buffer.close()
        tm.assert_frame_equal(result, expected)
Exemple #4
0
    def test_timestamp_compare(self):
        # make sure we can compare Timestamps on the right AND left hand side
        # GH#4982
        df = pd.DataFrame(
            {
                "dates1": pd.date_range("20010101", periods=10),
                "dates2": pd.date_range("20010102", periods=10),
                "intcol": np.random.randint(1000000000, size=10),
                "floatcol": np.random.randn(10),
                "stringcol": list(tm.rands(10)),
            }
        )
        df.loc[np.random.rand(len(df)) > 0.5, "dates2"] = pd.NaT
        ops = {"gt": "lt", "lt": "gt", "ge": "le", "le": "ge", "eq": "eq", "ne": "ne"}

        for left, right in ops.items():
            left_f = getattr(operator, left)
            right_f = getattr(operator, right)

            # no nats
            if left in ["eq", "ne"]:
                expected = left_f(df, pd.Timestamp("20010109"))
                result = right_f(pd.Timestamp("20010109"), df)
                tm.assert_frame_equal(result, expected)
            else:
                with pytest.raises(TypeError):
                    left_f(df, pd.Timestamp("20010109"))
                with pytest.raises(TypeError):
                    right_f(pd.Timestamp("20010109"), df)
            # nats
            expected = left_f(df, pd.Timestamp("nat"))
            result = right_f(pd.Timestamp("nat"), df)
            tm.assert_frame_equal(result, expected)
    def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
        arr = data.copy()

        # TODO(xfail) this raises KeyError about labels not found (it tries label-based)
        # for list of labels with Series
        if box_in_series:
            arr = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))])

        msg = "Cannot index with an integer indexer containing NA values"
        with pytest.raises(ValueError, match=msg):
            arr[idx] = arr[0]
Exemple #6
0
class TestAstype:
    @pytest.mark.parametrize("dtype", np.typecodes["All"])
    def test_astype_empty_constructor_equality(self, dtype):
        # see GH#15524

        if dtype not in (
            "S",
            "V",  # poor support (if any) currently
            "M",
            "m",  # Generic timestamps raise a ValueError. Already tested.
        ):
            init_empty = Series([], dtype=dtype)
            with tm.assert_produces_warning(FutureWarning):
                as_type_empty = Series([]).astype(dtype)
            tm.assert_series_equal(init_empty, as_type_empty)

    @pytest.mark.parametrize("dtype", [str, np.str_])
    @pytest.mark.parametrize(
        "series",
        [
            Series([string.digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
            Series([string.digits * 10, tm.rands(63), tm.rands(64), np.nan, 1.0]),
        ],
    )
    def test_astype_str_map(self, dtype, series):
        # see GH#4405
        result = series.astype(dtype)
        expected = series.map(str)
        tm.assert_series_equal(result, expected)

    def test_astype_float_to_period(self):
        result = Series([np.nan]).astype("period[D]")
        expected = Series([NaT], dtype="period[D]")
        tm.assert_series_equal(result, expected)

    def test_astype_no_pandas_dtype(self):
        # https://github.com/pandas-dev/pandas/pull/24866
        ser = Series([1, 2], dtype="int64")
        # Don't have PandasDtype in the public API, so we use `.array.dtype`,
        # which is a PandasDtype.
        result = ser.astype(ser.array.dtype)
        tm.assert_series_equal(result, ser)

    @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64])
    def test_astype_generic_timestamp_no_frequency(self, dtype, request):
        # see GH#15524, GH#15987
        data = [1]
        ser = Series(data)

        if np.dtype(dtype).name not in ["timedelta64", "datetime64"]:
            mark = pytest.mark.xfail(reason="GH#33890 Is assigned ns unit")
            request.node.add_marker(mark)

        msg = (
            fr"The '{dtype.__name__}' dtype has no unit\. "
            fr"Please pass in '{dtype.__name__}\[ns\]' instead."
        )
        with pytest.raises(ValueError, match=msg):
            ser.astype(dtype)

    def test_astype_dt64_to_str(self):
        # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex
        dti = date_range("2012-01-01", periods=3)
        result = Series(dti).astype(str)
        expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object)
        tm.assert_series_equal(result, expected)

    def test_astype_dt64tz_to_str(self):
        # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex
        dti_tz = date_range("2012-01-01", periods=3, tz="US/Eastern")
        result = Series(dti_tz).astype(str)
        expected = Series(
            [
                "2012-01-01 00:00:00-05:00",
                "2012-01-02 00:00:00-05:00",
                "2012-01-03 00:00:00-05:00",
            ],
            dtype=object,
        )
        tm.assert_series_equal(result, expected)

    def test_astype_datetime(self):
        s = Series(iNaT, dtype="M8[ns]", index=range(5))

        s = s.astype("O")
        assert s.dtype == np.object_

        s = Series([datetime(2001, 1, 2, 0, 0)])

        s = s.astype("O")
        assert s.dtype == np.object_

        s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)])

        s[1] = np.nan
        assert s.dtype == "M8[ns]"

        s = s.astype("O")
        assert s.dtype == np.object_

    def test_astype_datetime64tz(self):
        s = Series(date_range("20130101", periods=3, tz="US/Eastern"))

        # astype
        result = s.astype(object)
        expected = Series(s.astype(object), dtype=object)
        tm.assert_series_equal(result, expected)

        result = Series(s.values).dt.tz_localize("UTC").dt.tz_convert(s.dt.tz)
        tm.assert_series_equal(result, s)

        # astype - object, preserves on construction
        result = Series(s.astype(object))
        expected = s.astype(object)
        tm.assert_series_equal(result, expected)

        # astype - datetime64[ns, tz]
        with tm.assert_produces_warning(FutureWarning):
            # dt64->dt64tz astype deprecated
            result = Series(s.values).astype("datetime64[ns, US/Eastern]")
        tm.assert_series_equal(result, s)

        with tm.assert_produces_warning(FutureWarning):
            # dt64->dt64tz astype deprecated
            result = Series(s.values).astype(s.dtype)
        tm.assert_series_equal(result, s)

        result = s.astype("datetime64[ns, CET]")
        expected = Series(date_range("20130101 06:00:00", periods=3, tz="CET"))
        tm.assert_series_equal(result, expected)

    def test_astype_str_cast_dt64(self):
        # see GH#9757
        ts = Series([Timestamp("2010-01-04 00:00:00")])
        s = ts.astype(str)

        expected = Series(["2010-01-04"])
        tm.assert_series_equal(s, expected)

        ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")])
        s = ts.astype(str)

        expected = Series(["2010-01-04 00:00:00-05:00"])
        tm.assert_series_equal(s, expected)

    def test_astype_str_cast_td64(self):
        # see GH#9757

        td = Series([Timedelta(1, unit="d")])
        ser = td.astype(str)

        expected = Series(["1 days"])
        tm.assert_series_equal(ser, expected)

    def test_dt64_series_astype_object(self):
        dt64ser = Series(date_range("20130101", periods=3))
        result = dt64ser.astype(object)
        assert isinstance(result.iloc[0], datetime)
        assert result.dtype == np.object_

    def test_td64_series_astype_object(self):
        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="timedelta64[ns]")
        result = tdser.astype(object)
        assert isinstance(result.iloc[0], timedelta)
        assert result.dtype == np.object_

    @pytest.mark.parametrize(
        "data, dtype",
        [
            (["x", "y", "z"], "string[python]"),
            pytest.param(
                ["x", "y", "z"],
                "string[pyarrow]",
                marks=td.skip_if_no("pyarrow", min_version="1.0.0"),
            ),
            (["x", "y", "z"], "category"),
            (3 * [Timestamp("2020-01-01", tz="UTC")], None),
            (3 * [Interval(0, 1)], None),
        ],
    )
    @pytest.mark.parametrize("errors", ["raise", "ignore"])
    def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
        # https://github.com/pandas-dev/pandas/issues/35471
        ser = Series(data, dtype=dtype)
        if errors == "ignore":
            expected = ser
            result = ser.astype(float, errors="ignore")
            tm.assert_series_equal(result, expected)
        else:
            msg = "(Cannot cast)|(could not convert)"
            with pytest.raises((ValueError, TypeError), match=msg):
                ser.astype(float, errors=errors)

    @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
    def test_astype_from_float_to_str(self, dtype):
        # https://github.com/pandas-dev/pandas/issues/36451
        s = Series([0.1], dtype=dtype)
        result = s.astype(str)
        expected = Series(["0.1"])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "value, string_value",
        [
            (None, "None"),
            (np.nan, "nan"),
            (NA, "<NA>"),
        ],
    )
    def test_astype_to_str_preserves_na(self, value, string_value):
        # https://github.com/pandas-dev/pandas/issues/36904
        s = Series(["a", "b", value], dtype=object)
        result = s.astype(str)
        expected = Series(["a", "b", string_value], dtype=object)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"])
    def test_astype(self, dtype):
        s = Series(np.random.randn(5), name="foo")
        as_typed = s.astype(dtype)

        assert as_typed.dtype == dtype
        assert as_typed.name == s.name

    @pytest.mark.parametrize("value", [np.nan, np.inf])
    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    def test_astype_cast_nan_inf_int(self, dtype, value):
        # gh-14265: check NaN and inf raise error when converting to int
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        s = Series([value])

        with pytest.raises(ValueError, match=msg):
            s.astype(dtype)

    @pytest.mark.parametrize("dtype", [int, np.int8, np.int64])
    def test_astype_cast_object_int_fail(self, dtype):
        arr = Series(["car", "house", "tree", "1"])
        msg = r"invalid literal for int\(\) with base 10: 'car'"
        with pytest.raises(ValueError, match=msg):
            arr.astype(dtype)

    def test_astype_cast_object_int(self):
        arr = Series(["1", "2", "3", "4"], dtype=object)
        result = arr.astype(int)

        tm.assert_series_equal(result, Series(np.arange(1, 5)))

    def test_astype_unicode(self):
        # see GH#7758: A bit of magic is required to set
        # default encoding to utf-8
        digits = string.digits
        test_series = [
            Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
            Series(["データーサイエンス、お前はもう死んでいる"]),
        ]

        former_encoding = None

        if sys.getdefaultencoding() == "utf-8":
            test_series.append(Series(["野菜食べないとやばい".encode()]))

        for s in test_series:
            res = s.astype("unicode")
            expec = s.map(str)
            tm.assert_series_equal(res, expec)

        # Restore the former encoding
        if former_encoding is not None and former_encoding != "utf-8":
            reload(sys)
            sys.setdefaultencoding(former_encoding)

    def test_astype_bytes(self):
        # GH#39474
        result = Series(["foo", "bar", "baz"]).astype(bytes)
        assert result.dtypes == np.dtype("S3")

    def test_astype_nan_to_bool(self):
        # GH#43018
        ser = Series(np.nan, dtype="object")
        result = ser.astype("bool")
        expected = Series(True, dtype="bool")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype",
        tm.ALL_INT_EA_DTYPES + tm.FLOAT_EA_DTYPES,
    )
    def test_astype_ea_to_datetimetzdtype(self, dtype):
        # GH37553
        result = Series([4, 0, 9], dtype=dtype).astype(DatetimeTZDtype(tz="US/Pacific"))
        expected = Series(
            {
                0: Timestamp("1969-12-31 16:00:00.000000004-08:00", tz="US/Pacific"),
                1: Timestamp("1969-12-31 16:00:00.000000000-08:00", tz="US/Pacific"),
                2: Timestamp("1969-12-31 16:00:00.000000009-08:00", tz="US/Pacific"),
            }
        )

        if dtype in tm.FLOAT_EA_DTYPES:
            expected = Series(
                {
                    0: Timestamp(
                        "1970-01-01 00:00:00.000000004-08:00", tz="US/Pacific"
                    ),
                    1: Timestamp(
                        "1970-01-01 00:00:00.000000000-08:00", tz="US/Pacific"
                    ),
                    2: Timestamp(
                        "1970-01-01 00:00:00.000000009-08:00", tz="US/Pacific"
                    ),
                }
            )

        tm.assert_series_equal(result, expected)
Exemple #7
0
class TestSeriesDtypes:
    def test_dt64_series_astype_object(self):
        dt64ser = Series(date_range("20130101", periods=3))
        result = dt64ser.astype(object)
        assert isinstance(result.iloc[0], datetime)
        assert result.dtype == np.object_

    def test_td64_series_astype_object(self):
        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="timedelta64[ns]")
        result = tdser.astype(object)
        assert isinstance(result.iloc[0], timedelta)
        assert result.dtype == np.object_

    @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"])
    def test_astype(self, dtype):
        s = Series(np.random.randn(5), name="foo")
        as_typed = s.astype(dtype)

        assert as_typed.dtype == dtype
        assert as_typed.name == s.name

    def test_dtype(self, datetime_series):

        assert datetime_series.dtype == np.dtype("float64")
        assert datetime_series.dtypes == np.dtype("float64")

    @pytest.mark.parametrize("value", [np.nan, np.inf])
    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    def test_astype_cast_nan_inf_int(self, dtype, value):
        # gh-14265: check NaN and inf raise error when converting to int
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        s = Series([value])

        with pytest.raises(ValueError, match=msg):
            s.astype(dtype)

    @pytest.mark.parametrize("dtype", [int, np.int8, np.int64])
    def test_astype_cast_object_int_fail(self, dtype):
        arr = Series(["car", "house", "tree", "1"])
        msg = r"invalid literal for int\(\) with base 10: 'car'"
        with pytest.raises(ValueError, match=msg):
            arr.astype(dtype)

    def test_astype_cast_object_int(self):
        arr = Series(["1", "2", "3", "4"], dtype=object)
        result = arr.astype(int)

        tm.assert_series_equal(result, Series(np.arange(1, 5)))

    def test_astype_datetime(self):
        s = Series(iNaT, dtype="M8[ns]", index=range(5))

        s = s.astype("O")
        assert s.dtype == np.object_

        s = Series([datetime(2001, 1, 2, 0, 0)])

        s = s.astype("O")
        assert s.dtype == np.object_

        s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)])

        s[1] = np.nan
        assert s.dtype == "M8[ns]"

        s = s.astype("O")
        assert s.dtype == np.object_

    def test_astype_datetime64tz(self):
        s = Series(date_range("20130101", periods=3, tz="US/Eastern"))

        # astype
        result = s.astype(object)
        expected = Series(s.astype(object), dtype=object)
        tm.assert_series_equal(result, expected)

        result = Series(s.values).dt.tz_localize("UTC").dt.tz_convert(s.dt.tz)
        tm.assert_series_equal(result, s)

        # astype - object, preserves on construction
        result = Series(s.astype(object))
        expected = s.astype(object)
        tm.assert_series_equal(result, expected)

        # astype - datetime64[ns, tz]
        result = Series(s.values).astype("datetime64[ns, US/Eastern]")
        tm.assert_series_equal(result, s)

        result = Series(s.values).astype(s.dtype)
        tm.assert_series_equal(result, s)

        result = s.astype("datetime64[ns, CET]")
        expected = Series(date_range("20130101 06:00:00", periods=3, tz="CET"))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", [str, np.str_])
    @pytest.mark.parametrize(
        "series",
        [
            Series([
                string.digits * 10,
                tm.rands(63),
                tm.rands(64),
                tm.rands(1000)
            ]),
            Series(
                [string.digits * 10,
                 tm.rands(63),
                 tm.rands(64), np.nan, 1.0]),
        ],
    )
    def test_astype_str_map(self, dtype, series):
        # see gh-4405
        result = series.astype(dtype)
        expected = series.map(str)
        tm.assert_series_equal(result, expected)

    def test_astype_str_cast(self):
        # see gh-9757
        ts = Series([Timestamp("2010-01-04 00:00:00")])
        s = ts.astype(str)

        expected = Series([str("2010-01-04")])
        tm.assert_series_equal(s, expected)

        ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")])
        s = ts.astype(str)

        expected = Series([str("2010-01-04 00:00:00-05:00")])
        tm.assert_series_equal(s, expected)

        td = Series([Timedelta(1, unit="d")])
        s = td.astype(str)

        expected = Series([str("1 days 00:00:00.000000000")])
        tm.assert_series_equal(s, expected)

    def test_astype_unicode(self):
        # see gh-7758: A bit of magic is required to set
        # default encoding to utf-8
        digits = string.digits
        test_series = [
            Series([digits * 10,
                    tm.rands(63),
                    tm.rands(64),
                    tm.rands(1000)]),
            Series(["データーサイエンス、お前はもう死んでいる"]),
        ]

        former_encoding = None

        if sys.getdefaultencoding() == "utf-8":
            test_series.append(Series(["野菜食べないとやばい".encode("utf-8")]))

        for s in test_series:
            res = s.astype("unicode")
            expec = s.map(str)
            tm.assert_series_equal(res, expec)

        # Restore the former encoding
        if former_encoding is not None and former_encoding != "utf-8":
            reload(sys)
            sys.setdefaultencoding(former_encoding)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        # see gh-7271
        s = Series(range(0, 10, 2), name="abc")

        dt1 = dtype_class({"abc": str})
        result = s.astype(dt1)
        expected = Series(["0", "2", "4", "6", "8"], name="abc")
        tm.assert_series_equal(result, expected)

        dt2 = dtype_class({"abc": "float64"})
        result = s.astype(dt2)
        expected = Series([0.0, 2.0, 4.0, 6.0, 8.0],
                          dtype="float64",
                          name="abc")
        tm.assert_series_equal(result, expected)

        dt3 = dtype_class({"abc": str, "def": str})
        msg = ("Only the Series name can be used for the key in Series dtype "
               r"mappings\.")
        with pytest.raises(KeyError, match=msg):
            s.astype(dt3)

        dt4 = dtype_class({0: str})
        with pytest.raises(KeyError, match=msg):
            s.astype(dt4)

        # GH16717
        # if dtypes provided is empty, it should error
        if dtype_class is Series:
            dt5 = dtype_class({}, dtype=object)
        else:
            dt5 = dtype_class({})

        with pytest.raises(KeyError, match=msg):
            s.astype(dt5)

    def test_astype_categories_raises(self):
        # deprecated 17636, removed in GH-27141
        s = Series(["a", "b", "a"])
        with pytest.raises(TypeError, match="got an unexpected"):
            s.astype("category", categories=["a", "b"], ordered=True)

    def test_astype_from_categorical(self):
        items = ["a", "b", "c", "a"]
        s = Series(items)
        exp = Series(Categorical(items))
        res = s.astype("category")
        tm.assert_series_equal(res, exp)

        items = [1, 2, 3, 1]
        s = Series(items)
        exp = Series(Categorical(items))
        res = s.astype("category")
        tm.assert_series_equal(res, exp)

        df = DataFrame({
            "cats": [1, 2, 3, 4, 5, 6],
            "vals": [1, 2, 3, 4, 5, 6]
        })
        cats = Categorical([1, 2, 3, 4, 5, 6])
        exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
        df["cats"] = df["cats"].astype("category")
        tm.assert_frame_equal(exp_df, df)

        df = DataFrame({
            "cats": ["a", "b", "b", "a", "a", "d"],
            "vals": [1, 2, 3, 4, 5, 6]
        })
        cats = Categorical(["a", "b", "b", "a", "a", "d"])
        exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
        df["cats"] = df["cats"].astype("category")
        tm.assert_frame_equal(exp_df, df)

        # with keywords
        lst = ["a", "b", "c", "a"]
        s = Series(lst)
        exp = Series(Categorical(lst, ordered=True))
        res = s.astype(CategoricalDtype(None, ordered=True))
        tm.assert_series_equal(res, exp)

        exp = Series(Categorical(lst, categories=list("abcdef"), ordered=True))
        res = s.astype(CategoricalDtype(list("abcdef"), ordered=True))
        tm.assert_series_equal(res, exp)

    def test_astype_categorical_to_other(self):

        value = np.random.RandomState(0).randint(0, 10000, 100)
        df = DataFrame({"value": value})
        labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=["value"], ascending=True)
        df["value_group"] = pd.cut(df.value,
                                   range(0, 10500, 500),
                                   right=False,
                                   labels=cat_labels)

        s = df["value_group"]
        expected = s
        tm.assert_series_equal(s.astype("category"), expected)
        tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
        msg = r"could not convert string to float|invalid literal for float\(\)"
        with pytest.raises(ValueError, match=msg):
            s.astype("float64")

        cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
        exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
        tm.assert_series_equal(cat.astype("str"), exp)
        s2 = Series(Categorical(["1", "2", "3", "4"]))
        exp2 = Series([1, 2, 3, 4]).astype(int)
        tm.assert_series_equal(s2.astype("int"), exp2)

        # object don't sort correctly, so just compare that we have the same
        # values
        def cmp(a, b):
            tm.assert_almost_equal(np.sort(np.unique(a)),
                                   np.sort(np.unique(b)))

        expected = Series(np.array(s.values), name="value_group")
        cmp(s.astype("object"), expected)
        cmp(s.astype(np.object_), expected)

        # array conversion
        tm.assert_almost_equal(np.array(s), np.array(s.values))

        # valid conversion
        for valid in [
                lambda x: x.astype("category"),
                lambda x: x.astype(CategoricalDtype()),
                lambda x: x.astype("object").astype("category"),
                lambda x: x.astype("object").astype(CategoricalDtype()),
        ]:

            result = valid(s)
            # compare series values
            # internal .categories can't be compared because it is sorted
            tm.assert_series_equal(result, s, check_categorical=False)

        # invalid conversion (these are NOT a dtype)
        msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\."
               "Categorical'> for astype")
        for invalid in [
                lambda x: x.astype(Categorical),
                lambda x: x.astype("object").astype(Categorical),
        ]:
            with pytest.raises(TypeError, match=msg):
                invalid(s)

    @pytest.mark.parametrize("name", [None, "foo"])
    @pytest.mark.parametrize("dtype_ordered", [True, False])
    @pytest.mark.parametrize("series_ordered", [True, False])
    def test_astype_categorical_to_categorical(self, name, dtype_ordered,
                                               series_ordered):
        # GH 10696/18593
        s_data = list("abcaacbab")
        s_dtype = CategoricalDtype(list("bac"), ordered=series_ordered)
        s = Series(s_data, dtype=s_dtype, name=name)

        # unspecified categories
        dtype = CategoricalDtype(ordered=dtype_ordered)
        result = s.astype(dtype)
        exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
        expected = Series(s_data, name=name, dtype=exp_dtype)
        tm.assert_series_equal(result, expected)

        # different categories
        dtype = CategoricalDtype(list("adc"), dtype_ordered)
        result = s.astype(dtype)
        expected = Series(s_data, name=name, dtype=dtype)
        tm.assert_series_equal(result, expected)

        if dtype_ordered is False:
            # not specifying ordered, so only test once
            expected = s
            result = s.astype("category")
            tm.assert_series_equal(result, expected)

    def test_astype_bool_missing_to_categorical(self):
        # GH-19182
        s = Series([True, False, np.nan])
        assert s.dtypes == np.object_

        result = s.astype(CategoricalDtype(categories=[True, False]))
        expected = Series(
            Categorical([True, False, np.nan], categories=[True, False]))
        tm.assert_series_equal(result, expected)

    def test_astype_categoricaldtype(self):
        s = Series(["a", "b", "a"])
        result = s.astype(CategoricalDtype(["a", "b"], ordered=True))
        expected = Series(Categorical(["a", "b", "a"], ordered=True))
        tm.assert_series_equal(result, expected)

        result = s.astype(CategoricalDtype(["a", "b"], ordered=False))
        expected = Series(Categorical(["a", "b", "a"], ordered=False))
        tm.assert_series_equal(result, expected)

        result = s.astype(CategoricalDtype(["a", "b", "c"], ordered=False))
        expected = Series(
            Categorical(["a", "b", "a"],
                        categories=["a", "b", "c"],
                        ordered=False))
        tm.assert_series_equal(result, expected)
        tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"]))

    @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64])
    def test_astype_generic_timestamp_no_frequency(self, dtype):
        # see gh-15524, gh-15987
        data = [1]
        s = Series(data)

        msg = (r"The '{dtype}' dtype has no unit\. "
               r"Please pass in '{dtype}\[ns\]' instead.").format(
                   dtype=dtype.__name__)
        with pytest.raises(ValueError, match=msg):
            s.astype(dtype)

    @pytest.mark.parametrize("dtype", np.typecodes["All"])
    def test_astype_empty_constructor_equality(self, dtype):
        # see gh-15524

        if dtype not in (
                "S",
                "V",  # poor support (if any) currently
                "M",
                "m",  # Generic timestamps raise a ValueError. Already tested.
        ):
            init_empty = Series([], dtype=dtype)
            with tm.assert_produces_warning(DeprecationWarning,
                                            check_stacklevel=False):
                as_type_empty = Series([]).astype(dtype)
            tm.assert_series_equal(init_empty, as_type_empty)

    def test_arg_for_errors_in_astype(self):
        # see gh-14878
        s = Series([1, 2, 3])

        msg = (r"Expected value of kwarg 'errors' to be one of \['raise', "
               r"'ignore'\]\. Supplied value is 'False'")
        with pytest.raises(ValueError, match=msg):
            s.astype(np.float64, errors=False)

        s.astype(np.int8, errors="raise")

    def test_intercept_astype_object(self):
        series = Series(date_range("1/1/2000", periods=10))

        # This test no longer makes sense, as
        # Series is by default already M8[ns].
        expected = series.astype("object")

        df = DataFrame({"a": series, "b": np.random.randn(len(series))})
        exp_dtypes = Series([np.dtype("datetime64[ns]"),
                             np.dtype("float64")],
                            index=["a", "b"])
        tm.assert_series_equal(df.dtypes, exp_dtypes)

        result = df.values.squeeze()
        assert (result[:, 0] == expected.values).all()

        df = DataFrame({"a": series, "b": ["foo"] * len(series)})

        result = df.values.squeeze()
        assert (result[:, 0] == expected.values).all()

    def test_series_to_categorical(self):
        # see gh-16524: test conversion of Series to Categorical
        series = Series(["a", "b", "c"])

        result = Series(series, dtype="category")
        expected = Series(["a", "b", "c"], dtype="category")

        tm.assert_series_equal(result, expected)

    def test_infer_objects_series(self):
        # GH 11221
        actual = Series(np.array([1, 2, 3], dtype="O")).infer_objects()
        expected = Series([1, 2, 3])
        tm.assert_series_equal(actual, expected)

        actual = Series(np.array([1, 2, 3, None], dtype="O")).infer_objects()
        expected = Series([1.0, 2.0, 3.0, np.nan])
        tm.assert_series_equal(actual, expected)

        # only soft conversions, unconvertable pass thru unchanged
        actual = Series(np.array([1, 2, 3, None, "a"],
                                 dtype="O")).infer_objects()
        expected = Series([1, 2, 3, None, "a"])

        assert actual.dtype == "object"
        tm.assert_series_equal(actual, expected)

    @pytest.mark.parametrize(
        "data",
        [
            pd.period_range("2000", periods=4),
            pd.IntervalIndex.from_breaks([1, 2, 3, 4]),
        ],
    )
    def test_values_compatibility(self, data):
        # https://github.com/pandas-dev/pandas/issues/23995
        result = pd.Series(data).values
        expected = np.array(data.astype(object))
        tm.assert_numpy_array_equal(result, expected)

    def test_reindex_astype_order_consistency(self):
        # GH 17444
        s = Series([1, 2, 3], index=[2, 0, 1])
        new_index = [0, 1, 2]
        temp_dtype = "category"
        new_dtype = str
        s1 = s.reindex(new_index).astype(temp_dtype).astype(new_dtype)
        s2 = s.astype(temp_dtype).reindex(new_index).astype(new_dtype)
        tm.assert_series_equal(s1, s2)
Exemple #8
0
def get_random_path():
    return "__{}__.pickle".format(tm.rands(10))
Exemple #9
0
def setup_path():
    """Fixture for setup path"""
    return "tmp.__{}__.h5".format(tm.rands(10))
Exemple #10
0
def test_rands():
    r = tm.rands(10)
    assert len(r) == 10
Exemple #11
0
class TestSeriesDtypes:
    def test_dtype(self, datetime_series):

        assert datetime_series.dtype == np.dtype("float64")
        assert datetime_series.dtypes == np.dtype("float64")

    @pytest.mark.parametrize("dtype", [str, np.str_])
    @pytest.mark.parametrize(
        "series",
        [
            Series([string.digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
            Series([string.digits * 10, tm.rands(63), tm.rands(64), np.nan, 1.0]),
        ],
    )
    def test_astype_str_map(self, dtype, series):
        # see gh-4405
        result = series.astype(dtype)
        expected = series.map(str)
        tm.assert_series_equal(result, expected)

    def test_astype_from_categorical(self):
        items = ["a", "b", "c", "a"]
        s = Series(items)
        exp = Series(Categorical(items))
        res = s.astype("category")
        tm.assert_series_equal(res, exp)

        items = [1, 2, 3, 1]
        s = Series(items)
        exp = Series(Categorical(items))
        res = s.astype("category")
        tm.assert_series_equal(res, exp)

        df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], "vals": [1, 2, 3, 4, 5, 6]})
        cats = Categorical([1, 2, 3, 4, 5, 6])
        exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
        df["cats"] = df["cats"].astype("category")
        tm.assert_frame_equal(exp_df, df)

        df = DataFrame(
            {"cats": ["a", "b", "b", "a", "a", "d"], "vals": [1, 2, 3, 4, 5, 6]}
        )
        cats = Categorical(["a", "b", "b", "a", "a", "d"])
        exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
        df["cats"] = df["cats"].astype("category")
        tm.assert_frame_equal(exp_df, df)

        # with keywords
        lst = ["a", "b", "c", "a"]
        s = Series(lst)
        exp = Series(Categorical(lst, ordered=True))
        res = s.astype(CategoricalDtype(None, ordered=True))
        tm.assert_series_equal(res, exp)

        exp = Series(Categorical(lst, categories=list("abcdef"), ordered=True))
        res = s.astype(CategoricalDtype(list("abcdef"), ordered=True))
        tm.assert_series_equal(res, exp)

    def test_astype_categorical_to_other(self):

        value = np.random.RandomState(0).randint(0, 10000, 100)
        df = DataFrame({"value": value})
        labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
        cat_labels = Categorical(labels, labels)

        df = df.sort_values(by=["value"], ascending=True)
        df["value_group"] = pd.cut(
            df.value, range(0, 10500, 500), right=False, labels=cat_labels
        )

        s = df["value_group"]
        expected = s
        tm.assert_series_equal(s.astype("category"), expected)
        tm.assert_series_equal(s.astype(CategoricalDtype()), expected)
        msg = r"could not convert string to float|invalid literal for float\(\)"
        with pytest.raises(ValueError, match=msg):
            s.astype("float64")

        cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]))
        exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"])
        tm.assert_series_equal(cat.astype("str"), exp)
        s2 = Series(Categorical(["1", "2", "3", "4"]))
        exp2 = Series([1, 2, 3, 4]).astype(int)
        tm.assert_series_equal(s2.astype("int"), exp2)

        # object don't sort correctly, so just compare that we have the same
        # values
        def cmp(a, b):
            tm.assert_almost_equal(np.sort(np.unique(a)), np.sort(np.unique(b)))

        expected = Series(np.array(s.values), name="value_group")
        cmp(s.astype("object"), expected)
        cmp(s.astype(np.object_), expected)

        # array conversion
        tm.assert_almost_equal(np.array(s), np.array(s.values))

        tm.assert_series_equal(s.astype("category"), s)
        tm.assert_series_equal(s.astype(CategoricalDtype()), s)

        roundtrip_expected = s.cat.set_categories(
            s.cat.categories.sort_values()
        ).cat.remove_unused_categories()
        tm.assert_series_equal(
            s.astype("object").astype("category"), roundtrip_expected
        )
        tm.assert_series_equal(
            s.astype("object").astype(CategoricalDtype()), roundtrip_expected
        )

    def test_invalid_conversions(self):
        # invalid conversion (these are NOT a dtype)
        cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)])
        ser = Series(np.random.randint(0, 10000, 100)).sort_values()
        ser = pd.cut(ser, range(0, 10500, 500), right=False, labels=cat)

        msg = (
            "dtype '<class 'pandas.core.arrays.categorical.Categorical'>' "
            "not understood"
        )
        with pytest.raises(TypeError, match=msg):
            ser.astype(Categorical)
        with pytest.raises(TypeError, match=msg):
            ser.astype("object").astype(Categorical)

    @pytest.mark.parametrize("dtype", np.typecodes["All"])
    def test_astype_empty_constructor_equality(self, dtype):
        # see gh-15524

        if dtype not in (
            "S",
            "V",  # poor support (if any) currently
            "M",
            "m",  # Generic timestamps raise a ValueError. Already tested.
        ):
            init_empty = Series([], dtype=dtype)
            with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False):
                as_type_empty = Series([]).astype(dtype)
            tm.assert_series_equal(init_empty, as_type_empty)

    def test_series_to_categorical(self):
        # see gh-16524: test conversion of Series to Categorical
        series = Series(["a", "b", "c"])

        result = Series(series, dtype="category")
        expected = Series(["a", "b", "c"], dtype="category")

        tm.assert_series_equal(result, expected)

    def test_reindex_astype_order_consistency(self):
        # GH 17444
        s = Series([1, 2, 3], index=[2, 0, 1])
        new_index = [0, 1, 2]
        temp_dtype = "category"
        new_dtype = str
        s1 = s.reindex(new_index).astype(temp_dtype).astype(new_dtype)
        s2 = s.astype(temp_dtype).reindex(new_index).astype(new_dtype)
        tm.assert_series_equal(s1, s2)