Ejemplo n.º 1
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Scratch buffer with one row per input.
# NOTE(review): nothing below reads `ftemp`, and its width (5_000_000) does
# not match the 50_000_000-label index used for plotting -- confirm which
# chain length is intended.
ftemp = np.zeros((len(inputs), 5000000))


# NOTE(review): the loop variable `i` is unused and the whole `inputs`
# collection is passed to MCalgo on every iteration -- confirm whether
# MCalgo(i) was meant.
for i in inputs:
    MCalgo(inputs)


# RangeIndex gives the same 0..N-1 integer labels as the former
# pd.Int64Index(range(N)) without materialising N integers, and it still
# exists in pandas >= 2.0 (where Int64Index was removed).
idx = pd.RangeIndex(50000000)
chain1 = pd.read_csv('chain1.csv', header=None)
# NOTE(review): `a`, `b`, `c`, `chain2` and `chain3` are not defined anywhere
# in this snippet; they must come from code outside this view.
traceplot_df = pd.DataFrame(index=idx)
traceplot_df['a'] = a
traceplot_df['b'] = b
traceplot_df['c'] = c

# Three stacked trace plots (one per chain), saved to a single PNG.
plt.figure(figsize=[6.4, 10])
plt.subplot(3, 1, 1)
plt.plot(idx, chain1, color='tab:blue', linewidth=0.3)
plt.title('a')
plt.subplot(3, 1, 2)
plt.plot(idx, chain2, color='tab:blue', linewidth=0.3)
plt.title('b')
plt.subplot(3, 1, 3)
plt.plot(idx, chain3, color='tab:blue', linewidth=0.3)
plt.title('c')
plt.savefig("traceplots.png")

Ejemplo n.º 2
0
    def test_insert_index_int64(self, insert, coerced_val, coerced_dtype):
        obj = pd.Int64Index([1, 2, 3, 4])
        assert obj.dtype == np.int64

        exp = pd.Index([1, coerced_val, 2, 3, 4])
        self._assert_insert_conversion(obj, insert, exp, coerced_dtype)
Ejemplo n.º 3
0
class TestSeriesConstructors:
    @pytest.mark.parametrize(
        "constructor,check_index_type",
        [
            # NOTE: partially overlaps test_constructor_empty, but that test
            # covers neither None nor an empty generator;
            # test_constructor_pass_none covers None only together with an
            # explicit index.
            (lambda: Series(), True),
            (lambda: Series(None), True),
            (lambda: Series({}), True),
            (lambda: Series(()), False),  # creates a RangeIndex
            (lambda: Series([]), False),  # creates a RangeIndex
            (lambda: Series((_ for _ in [])), False),  # creates a RangeIndex
            (lambda: Series(data=None), True),
            (lambda: Series(data={}), True),
            (lambda: Series(data=()), False),  # creates a RangeIndex
            (lambda: Series(data=[]), False),  # creates a RangeIndex
            (
                lambda: Series(data=(_ for _ in [])),
                False,
            ),  # creates a RangeIndex
        ],
    )
    def test_empty_constructor(self, constructor, check_index_type):
        """Each empty-data construction matches a bare ``Series()``.

        ``check_index_type`` is False for the cases noted as producing a
        RangeIndex, so only index contents are compared there.
        """
        reference = Series()
        built = constructor()
        assert len(built.index) == 0
        tm.assert_series_equal(
            built, reference, check_index_type=check_index_type
        )

    def test_invalid_dtype(self):
        """GH15520: unsupported dtype arguments raise ``TypeError``."""
        expected_msg = "not understood"
        for bad_dtype in (pd.Timestamp, "pd.Timestamp", list):
            with pytest.raises(TypeError, match=expected_msg):
                Series([], name="time", dtype=bad_dtype)

    def test_scalar_conversion(self):
        """A scalar-built Series is not a float, yet float()/int() coerce it."""
        # Constructing from a scalar does not hand back a plain float
        wrapped = Series(0.5)
        assert not isinstance(wrapped, float)

        # Coercion of a one-element Series to Python numbers
        as_float = float(Series([1.0]))
        assert as_float == 1.0
        as_int = int(Series([1.0]))
        assert as_int == 1

    def test_constructor(self, datetime_series):
        """Exercise the basic Series construction paths.

        Covers building from an existing Series (the index object is reused),
        mixed-type data, rejection of 2-D input, name propagation and the
        MultiIndex restriction (GH4187).

        ``datetime_series`` is a fixture defined outside this view; the
        assertions below require its index to report ``is_all_dates``.
        """
        empty_series = Series()

        assert datetime_series.index.is_all_dates

        # Pass in Series
        derived = Series(datetime_series)
        assert derived.index.is_all_dates

        assert tm.equalContents(derived.index, datetime_series.index)
        # Ensure new index is not created
        assert id(datetime_series.index) == id(derived.index)

        # Mixed type Series
        # np.nan is used instead of the np.NaN alias: the alias was removed
        # in NumPy 2.0, while np.nan is the same object on every version.
        mixed = Series(["hello", np.nan], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.nan

        assert not empty_series.index.is_all_dates
        assert not Series().index.is_all_dates

        # exception raised is of type Exception
        with pytest.raises(Exception, match="Data must be 1-dimensional"):
            Series(np.random.randn(3, 3), index=np.arange(3))

        # The name survives wrapping the Series in another Series
        mixed.name = "Series"
        rs = Series(mixed).name
        xp = "Series"
        assert rs == xp

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        msg = "initializing a Series from a MultiIndex is not supported"
        with pytest.raises(NotImplementedError, match=msg):
            Series(m)

    @pytest.mark.parametrize("input_class", [list, dict, OrderedDict])
    def test_constructor_empty(self, input_class):
        """An empty container builds the same Series as passing no data."""
        # Index() and RangeIndex() do not compare type-equal, only .equals,
        # hence check_index_type=False on the index-less comparisons.
        no_data = Series()
        from_container = Series(input_class())
        tm.assert_series_equal(no_data, from_container, check_index_type=False)

        # With explicit dtype; "category" is the GH 18515 case.
        for dtype_name in ("float64", "category"):
            no_data = Series(dtype=dtype_name)
            from_container = Series(input_class(), dtype=dtype_name)
            tm.assert_series_equal(
                no_data, from_container, check_index_type=False
            )

        # The index-based comparisons are not run for list input.
        if input_class is list:
            return

        # With index:
        no_data = Series(index=range(10))
        from_container = Series(input_class(), index=range(10))
        tm.assert_series_equal(no_data, from_container)

        # With index and dtype float64:
        no_data = Series(np.nan, index=range(10))
        from_container = Series(input_class(), index=range(10), dtype="float64")
        tm.assert_series_equal(no_data, from_container)

        # GH 19853 : with empty string, index and dtype str
        typed_blank = Series("", dtype=str, index=range(3))
        untyped_blank = Series("", index=range(3))
        tm.assert_series_equal(typed_blank, untyped_blank)

    @pytest.mark.parametrize("input_arg", [np.nan, float("nan")])
    def test_constructor_nan(self, input_arg):
        """A NaN scalar over an index equals a data-less float64 Series."""
        from_dtype = Series(dtype="float64", index=range(10))
        from_scalar = Series(input_arg, index=range(10))

        tm.assert_series_equal(from_dtype, from_scalar, check_index_type=False)

    @pytest.mark.parametrize(
        "dtype",
        [
            "f8",
            "i8",
            "M8[ns]",
            "m8[ns]",
            "category",
            "object",
            "datetime64[ns, UTC]",
        ],
    )
    @pytest.mark.parametrize("index", [None, pd.Index([])])
    def test_constructor_dtype_only(self, dtype, index):
        """GH-20865: a dtype-only construction is empty and keeps the dtype."""
        ser = pd.Series(dtype=dtype, index=index)
        assert ser.dtype == dtype
        assert len(ser) == 0

    def test_constructor_no_data_index_order(self):
        """An index passed without data keeps its given (unsorted) order."""
        ser = pd.Series(index=["b", "a", "c"])
        labels = ser.index.tolist()
        assert labels == ["b", "a", "c"]

    def test_constructor_no_data_string_type(self):
        """GH 22477: with dtype=str and no data, the entry is NaN."""
        ser = pd.Series(index=[1], dtype=str)
        first = ser.iloc[0]
        assert np.isnan(first)

    @pytest.mark.parametrize("item", ["entry", "ѐ", 13])
    def test_constructor_string_element_string_type(self, item):
        """GH 22477: a scalar with dtype=str is stored as ``str(item)``."""
        ser = pd.Series(item, index=[1], dtype=str)
        stored = ser.iloc[0]
        assert stored == str(item)

    def test_constructor_dtype_str_na_values(self, string_dtype):
        """Missing values survive construction with a string dtype.

        See https://github.com/pandas-dev/pandas/issues/21083.
        ``string_dtype`` is a fixture defined outside this view.
        """
        with_none = Series(["x", None], dtype=string_dtype)
        na_mask = with_none.isna()
        tm.assert_series_equal(na_mask, Series([False, True]))
        # None is kept as None rather than being converted
        assert with_none.iloc[1] is None

        with_nan = Series(["x", np.nan], dtype=string_dtype)
        assert np.isnan(with_nan.iloc[1])

    def test_constructor_series(self):
        """Passing a Series plus a new index reorders it to that index."""
        unsorted_labels = ["d", "b", "a", "c"]
        sorted_labels = sorted(unsorted_labels)
        source = Series([4, 7, -5, 3], index=unsorted_labels)
        reordered = Series(source, index=sorted_labels)

        tm.assert_series_equal(reordered, source.sort_index())

    def test_constructor_iterable(self):
        """GH 21987: an object that only defines ``__iter__`` works as data."""

        class TenInts:
            def __iter__(self):
                yield from range(10)

        expected = Series(list(range(10)), dtype="int64")
        built = Series(TenInts(), dtype="int64")
        tm.assert_series_equal(built, expected)

    def test_constructor_sequence(self):
        """GH 21987: a ``range`` is accepted and matches the equivalent list."""
        from_list = Series(list(range(10)), dtype="int64")
        from_range = Series(range(10), dtype="int64")
        tm.assert_series_equal(from_range, from_list)

    def test_constructor_single_str(self):
        """GH 21987: a bare string is one element, not a char sequence."""
        from_list = Series(["abc"])
        from_scalar = Series("abc")
        tm.assert_series_equal(from_scalar, from_list)

    def test_constructor_list_like(self):
        """List, tuple and int64 ndarray inputs all give an int64 Series."""
        # Different list-likes must coerce to the standard dtype rather than
        # a platform-specific one.
        expected = Series([1, 2, 3], dtype="int64")
        candidates = ([1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype="int64"))
        for values in candidates:
            built = Series(values, index=[0, 1, 2])
            tm.assert_series_equal(built, expected)

    @pytest.mark.parametrize("dtype", ["bool", "int32", "int64", "float64"])
    def test_constructor_index_dtype(self, dtype):
        """GH 17088: an explicit dtype is honoured when the data is an Index."""
        source_index = Index([0, 2, 4])
        ser = Series(source_index, dtype=dtype)
        assert ser.dtype == dtype

    @pytest.mark.parametrize(
        "input_vals",
        [
            [1, 2],
            ["1", "2"],
            list(pd.date_range("1/1/2011", periods=2, freq="H")),
            list(
                pd.date_range(
                    "1/1/2011", periods=2, freq="H", tz="US/Eastern"
                )
            ),
            [pd.Interval(left=0, right=5)],
        ],
    )
    def test_constructor_list_str(self, input_vals, string_dtype):
        """GH 16605: list elements are converted when a string dtype is given.

        Construction with the dtype must match constructing first and then
        calling ``astype`` with the same dtype. ``string_dtype`` is a fixture
        defined outside this view.
        """
        direct = Series(input_vals, dtype=string_dtype)
        via_astype = Series(input_vals).astype(string_dtype)
        tm.assert_series_equal(direct, via_astype)

    def test_constructor_list_str_na(self, string_dtype):
        """Floats become strings under a string dtype, but NaN stays NaN."""
        built = Series([1.0, 2.0, np.nan], dtype=string_dtype)
        wanted = Series(["1.0", "2.0", np.nan], dtype=object)
        tm.assert_series_equal(built, wanted)
        last = built[2]
        assert np.isnan(last)

    def test_constructor_generator(self):
        """A generator is consumed as data, with and without an index."""
        expected = Series(range(10))

        built = Series(i for i in range(10))
        tm.assert_series_equal(built, expected)

        # Same data again, now under an explicit index
        fresh_gen = (i for i in range(10))
        built = Series(fresh_gen, index=range(10, 20))
        expected.index = range(10, 20)
        tm.assert_series_equal(built, expected)

    def test_constructor_map(self):
        """GH8909: a ``map`` object is consumed as data, with/without index."""
        expected = Series(range(10))

        mapped = map(lambda x: x, range(10))
        tm.assert_series_equal(Series(mapped), expected)

        # Same data again, now under an explicit index
        mapped = map(lambda x: x, range(10))
        built = Series(mapped, index=range(10, 20))
        expected.index = range(10, 20)
        tm.assert_series_equal(built, expected)

    def test_constructor_categorical(self):
        """Series accepts a Categorical and can keep or cast its dtype."""
        codes_cat = pd.Categorical(
            [0, 1, 2, 0, 1, 2], ["a", "b", "c"], fastpath=True
        )
        wrapped = Series(codes_cat)
        tm.assert_categorical_equal(wrapped.values, codes_cat)

        # can cast to a new dtype
        as_int = Series(pd.Categorical([1, 2, 3]), dtype="int64")
        tm.assert_series_equal(as_int, pd.Series([1, 2, 3], dtype="int64"))

        # GH12574
        from_cat = Series(pd.Categorical([1, 2, 3]), dtype="category")
        assert is_categorical_dtype(from_cat)
        assert is_categorical_dtype(from_cat.dtype)
        from_list = Series([1, 2, 3], dtype="category")
        assert is_categorical_dtype(from_list)
        assert is_categorical_dtype(from_list.dtype)

    def test_constructor_categorical_with_coercion(self):
        """Categorical data round-trips through Series and DataFrame.

        The bare ``str(...)`` calls only check that building the string
        representation does not raise.
        """
        cat = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
        n_items = len(cat)

        # basic creation / coercion of categoricals
        ser = Series(cat, name="A")
        assert ser.dtype == "category"
        assert len(ser) == n_items
        str(ser.values)
        str(ser)

        # in a frame
        frame = DataFrame({"A": cat})
        by_label = frame["A"]
        tm.assert_series_equal(by_label, ser)
        by_position = frame.iloc[:, 0]
        tm.assert_series_equal(by_position, ser)
        assert len(frame) == n_items
        str(frame.values)
        str(frame)

        frame = DataFrame({"A": ser})
        by_label = frame["A"]
        tm.assert_series_equal(by_label, ser)
        assert len(frame) == n_items
        str(frame.values)
        str(frame)

        # multiples
        frame = DataFrame({"A": ser, "B": ser, "C": 1})
        col_a = frame["A"]
        col_b = frame["B"]
        tm.assert_series_equal(col_a, ser)
        tm.assert_series_equal(col_b, ser, check_names=False)
        assert col_b.name == "B"
        assert len(frame) == n_items
        str(frame.values)
        str(frame)

        # GH8623
        people = DataFrame(
            [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
            columns=["person_id", "person_name"],
        )
        # doing this breaks transform
        people["person_name"] = Categorical(people.person_name)

        # Every way of reaching the first name must agree
        wanted = people.iloc[0].person_name
        via_iloc = people.person_name.iloc[0]
        assert via_iloc == wanted

        via_getitem = people.person_name[0]
        assert via_getitem == wanted

        via_loc = people.person_name.loc[0]
        assert via_loc == wanted

    def test_constructor_categorical_dtype(self):
        """A ``CategoricalDtype`` passed as dtype fixes categories and order."""
        ordered_dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
        ser = pd.Series(["a", "b"], dtype=ordered_dtype)
        assert is_categorical_dtype(ser) is True
        tm.assert_index_equal(ser.cat.categories, pd.Index(["a", "b", "c"]))
        assert ser.cat.ordered

        ser = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"]))
        assert is_categorical_dtype(ser)
        tm.assert_index_equal(ser.cat.categories, pd.Index(["b", "a"]))
        assert ser.cat.ordered is False

        # GH 19565 - Check broadcasting of scalar with Categorical dtype
        broadcast = Series(
            "a",
            index=[0, 1],
            dtype=CategoricalDtype(["a", "b"], ordered=True),
        )
        explicit = Series(
            ["a", "a"],
            index=[0, 1],
            dtype=CategoricalDtype(["a", "b"], ordered=True),
        )
        tm.assert_series_equal(broadcast, explicit, check_categorical=True)

    def test_constructor_categorical_string(self):
        """GH 26336: dtype='category' keeps an existing ``CategoricalDtype``."""
        existing_dtype = CategoricalDtype(categories=list("dabc"), ordered=True)
        wanted = Series(list("abcabc"), dtype=existing_dtype)

        # Series(Categorical, dtype='category') keeps existing dtype
        source_cat = Categorical(list("abcabc"), dtype=existing_dtype)
        from_cat = Series(source_cat, dtype="category")
        tm.assert_series_equal(from_cat, wanted)

        # Series(Series[Categorical], dtype='category') keeps existing dtype
        from_series = Series(from_cat, dtype="category")
        tm.assert_series_equal(from_series, wanted)

    @pytest.mark.parametrize(
        "none, warning", [(None, None), (ordered_sentinel, FutureWarning)]
    )
    def test_categorical_ordered_none_deprecated(self, none, warning):
        """GH 26336: only warn if None is not explicitly passed as ``ordered``.

        The same expectation is checked for a Categorical input and for a
        Series wrapping that Categorical.
        """
        source_dtype = CategoricalDtype(categories=list("cdab"), ordered=True)
        target_dtype = CategoricalDtype(categories=list("cedafb"), ordered=none)

        source_cat = Categorical(list("abcdaba"), dtype=source_dtype)
        with tm.assert_produces_warning(warning, check_stacklevel=False):
            Series(source_cat, dtype=target_dtype)

        source_ser = Series(source_cat)
        with tm.assert_produces_warning(warning, check_stacklevel=False):
            Series(source_ser, dtype=target_dtype)

    def test_categorical_sideeffects_free(self):
        """Series and source Categorical are independent only with copy=True.

        With ``copy=True`` changes to the Series must not reach the
        Categorical; with the default they share data, so they do.
        """
        original_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
        renamed_values = np.array([1, 2, 3, 1], dtype=np.int64)
        after_setitem = np.array([2, 2, 3, 1], dtype=np.int64)

        # --- explicit copy: the categorical stays untouched ---
        source = Categorical(["a", "b", "c", "a"])
        ser = Series(source, copy=True)
        # NOTE(review): this compares the .cat accessor with the Categorical,
        # which looks always-true; perhaps ser.values was meant -- confirm.
        assert ser.cat is not source
        ser.cat.categories = [1, 2, 3]
        tm.assert_numpy_array_equal(ser.__array__(), renamed_values)
        tm.assert_numpy_array_equal(source.__array__(), original_values)

        # setting
        ser[0] = 2
        tm.assert_numpy_array_equal(ser.__array__(), after_setitem)
        tm.assert_numpy_array_equal(source.__array__(), original_values)

        # --- default (copy=False): changes WILL reach the categorical ---
        source = Categorical(["a", "b", "c", "a"])
        ser = Series(source)
        assert ser.values is source
        ser.cat.categories = [1, 2, 3]
        tm.assert_numpy_array_equal(ser.__array__(), renamed_values)
        tm.assert_numpy_array_equal(source.__array__(), renamed_values)

        ser[0] = 2
        tm.assert_numpy_array_equal(ser.__array__(), after_setitem)
        tm.assert_numpy_array_equal(source.__array__(), after_setitem)

    def test_unordered_compare_equal(self):
        """A value outside the categories compares equal to an explicit NaN."""
        via_dtype = pd.Series(
            ["a", "b", "c"], dtype=CategoricalDtype(["a", "b"])
        )
        via_categorical = pd.Series(
            pd.Categorical(["a", "b", np.nan], categories=["a", "b"])
        )
        tm.assert_series_equal(via_dtype, via_categorical)

    def test_constructor_maskedarray(self):
        """Masked arrays become Series with masked slots as missing values.

        The same three stages (fully masked, slots 0 and 2 filled, all slots
        filled) are checked for float, int, bool and M8[ns] masked arrays.
        """
        # ---- float ----
        masked = ma.masked_all((3, ), dtype=float)
        got = Series(masked)
        want = Series([np.nan, np.nan, np.nan])
        tm.assert_series_equal(got, want)

        masked[0] = 0.0
        masked[2] = 2.0
        labels = ["a", "b", "c"]
        got = Series(masked, index=labels)
        want = Series([0.0, np.nan, 2.0], index=labels)
        tm.assert_series_equal(got, want)

        masked[1] = 1.0
        got = Series(masked, index=labels)
        want = Series([0.0, 1.0, 2.0], index=labels)
        tm.assert_series_equal(got, want)

        # ---- int: float result while anything is masked, int once full ----
        masked = ma.masked_all((3, ), dtype=int)
        got = Series(masked)
        want = Series([np.nan, np.nan, np.nan], dtype=float)
        tm.assert_series_equal(got, want)

        masked[0] = 0
        masked[2] = 2
        labels = ["a", "b", "c"]
        got = Series(masked, index=labels)
        want = Series([0, np.nan, 2], index=labels, dtype=float)
        tm.assert_series_equal(got, want)

        masked[1] = 1
        got = Series(masked, index=labels)
        want = Series([0, 1, 2], index=labels, dtype=int)
        tm.assert_series_equal(got, want)

        # ---- bool: object result while anything is masked, bool once full ----
        masked = ma.masked_all((3, ), dtype=bool)
        got = Series(masked)
        want = Series([np.nan, np.nan, np.nan], dtype=object)
        tm.assert_series_equal(got, want)

        masked[0] = True
        masked[2] = False
        labels = ["a", "b", "c"]
        got = Series(masked, index=labels)
        want = Series([True, np.nan, False], index=labels, dtype=object)
        tm.assert_series_equal(got, want)

        masked[1] = True
        got = Series(masked, index=labels)
        want = Series([True, True, False], index=labels, dtype=bool)
        tm.assert_series_equal(got, want)

        # ---- datetime64[ns]: masked slots are expected as iNaT ----
        masked = ma.masked_all((3, ), dtype="M8[ns]")
        got = Series(masked)
        want = Series([iNaT, iNaT, iNaT], dtype="M8[ns]")
        tm.assert_series_equal(got, want)

        masked[0] = datetime(2001, 1, 1)
        masked[2] = datetime(2001, 1, 3)
        labels = ["a", "b", "c"]
        got = Series(masked, index=labels)
        want = Series(
            [datetime(2001, 1, 1), iNaT, datetime(2001, 1, 3)],
            index=labels,
            dtype="M8[ns]",
        )
        tm.assert_series_equal(got, want)

        masked[1] = datetime(2001, 1, 2)
        got = Series(masked, index=labels)
        want = Series(
            [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 3)],
            index=labels,
            dtype="M8[ns]",
        )
        tm.assert_series_equal(got, want)

    def test_constructor_maskedarray_hardened(self):
        """GH24574: a fully masked array with a hard mask gives all NaN."""
        hard_masked = ma.masked_all((3, ), dtype=float).harden_mask()
        built = pd.Series(hard_masked)
        all_nan = pd.Series([np.nan, np.nan, np.nan])
        tm.assert_series_equal(built, all_nan)

    def test_series_ctor_plus_datetimeindex(self):
        """A dict keyed by a DatetimeIndex reuses that index object as-is."""
        business_days = date_range("20090415", "20090519", freq="B")
        ones_by_day = dict.fromkeys(business_days, 1)

        ser = Series(ones_by_day, index=business_days)
        assert ser.index is business_days

    def test_constructor_default_index(self):
        """Without an index argument, labels equal ``np.arange(len(data))``."""
        ser = Series([0, 1, 2])
        positional = pd.Index(np.arange(3))
        tm.assert_index_equal(ser.index, positional)

    @pytest.mark.parametrize(
        "input",
        [
            [1, 2, 3],
            (1, 2, 3),
            list(range(3)),
            pd.Categorical(["a", "b", "a"]),
            (i for i in range(3)),
            map(lambda x: x, range(3)),
        ],
    )
    def test_constructor_index_mismatch(self, input):
        """GH 19342: data and index of different lengths raise ValueError."""
        expected_msg = "Length of passed values is 3, index implies 4"
        four_labels = np.arange(4)
        with pytest.raises(ValueError, match=expected_msg):
            Series(input, index=four_labels)

    def test_constructor_numpy_scalar(self):
        """GH 19342: a 0-d numpy array broadcasts like a scalar, no error."""
        zero_dim = np.array(100)
        from_array = Series(zero_dim, index=np.arange(4), dtype="int64")
        from_scalar = Series(100, index=np.arange(4), dtype="int64")
        tm.assert_series_equal(from_array, from_scalar)

    def test_constructor_broadcast_list(self):
        """GH 19342: a one-element list is not broadcast over a longer index."""
        expected_msg = "Length of passed values is 1, index implies 3"
        with pytest.raises(ValueError, match=expected_msg):
            Series(["foo"], index=["a", "b", "c"])

    def test_constructor_corner(self):
        """A list of DataFrames is accepted as Series data."""
        frame = tm.makeTimeDataFrame()
        ser = Series([frame, frame], index=[0, 1])
        assert isinstance(ser, Series)

    def test_constructor_sanitize(self):
        """Float data requested as i8 is cast only if it contains no NaN."""
        whole_floats = np.array([1.0, 1.0, 8.0])
        ser = Series(whole_floats, dtype="i8")
        assert ser.dtype == np.dtype("i8")

        # With a NaN present the result stays float64
        floats_with_nan = np.array([1.0, 1.0, np.nan])
        ser = Series(floats_with_nan, copy=True, dtype="i8")
        assert ser.dtype == np.dtype("f8")

    def test_constructor_copy(self):
        """GH15125: the dtype parameter has no side effects with copy=True."""
        for raw in ([1.0], np.array([1.0])):
            source = Series(raw)
            duplicate = pd.Series(source, copy=True, dtype=float)

            # copy=True maintains original data in Series
            tm.assert_series_equal(source, duplicate)

            # changing the source afterwards must not touch the copy
            source[0] = 2.0
            assert not source.equals(duplicate)
            assert source[0] == 2.0
            assert duplicate[0] == 1.0

    @pytest.mark.parametrize(
        "index",
        [
            pd.date_range("20170101", periods=3, tz="US/Eastern"),
            pd.date_range("20170101", periods=3),
            pd.timedelta_range("1 day", periods=3),
            pd.period_range("2012Q1", periods=3, freq="Q"),
            pd.Index(list("abc")),
            pd.Int64Index([1, 2, 3]),
            pd.RangeIndex(0, 3),
        ],
        ids=lambda x: type(x).__name__,
    )
    def test_constructor_limit_copies(self, index):
        """GH 17449: limit copies of input when building from an Index."""
        ser = pd.Series(index)

        # we make 1 copy; this is just a smoke test here
        stored_values = ser._data.blocks[0].values
        assert stored_values is not index

    def test_constructor_pass_none(self):
        """None as data gives float64 by default, or the requested dtype."""
        default_typed = Series(None, index=range(5))
        assert default_typed.dtype == np.float64

        object_typed = Series(None, index=range(5), dtype=object)
        assert object_typed.dtype == np.object_

        # GH 7431
        # inference on the index
        from_ndarray = Series(index=np.array([None]))
        from_index = Series(index=Index([None]))
        tm.assert_series_equal(from_ndarray, from_index)

    def test_constructor_pass_nan_nat(self):
        """GH 13467: dtype inference for all-NaN and NaT/NaN inputs.

        All-NaN data is float64; any combination containing NaT is
        datetime64[ns]. Each case is checked as a list and as an ndarray.
        """
        all_nan = Series([np.nan, np.nan], dtype=np.float64)
        assert all_nan.dtype == np.float64
        tm.assert_series_equal(Series([np.nan, np.nan]), all_nan)
        tm.assert_series_equal(Series(np.array([np.nan, np.nan])), all_nan)

        all_nat = Series([pd.NaT, pd.NaT])
        assert all_nat.dtype == "datetime64[ns]"
        nat_mixes = ([pd.NaT, pd.NaT], [pd.NaT, np.nan], [np.nan, pd.NaT])
        for pair in nat_mixes:
            tm.assert_series_equal(Series(pair), all_nat)
            tm.assert_series_equal(Series(np.array(pair)), all_nat)

    def test_constructor_cast(self):
        """Non-numeric strings cannot be constructed with dtype=float."""
        expected_msg = "could not convert string to float"
        letters = ["a", "b", "c"]
        with pytest.raises(ValueError, match=expected_msg):
            Series(letters, dtype=float)

    def test_constructor_unsigned_dtype_overflow(self, uint_dtype):
        """gh-15832: a negative value with an unsigned dtype overflows.

        ``uint_dtype`` is a fixture defined outside this view.
        """
        expected_msg = "Trying to coerce negative values to unsigned integers"
        with pytest.raises(OverflowError, match=expected_msg):
            Series([-1], dtype=uint_dtype)

    def test_constructor_coerce_float_fail(self, any_int_dtype):
        """gh-15832: a fractional float cannot be coerced to an integer dtype.

        ``any_int_dtype`` is a fixture defined outside this view.
        """
        expected_msg = "Trying to coerce float values to integers"
        with pytest.raises(ValueError, match=expected_msg):
            Series([1, 2, 3.5], dtype=any_int_dtype)

    def test_constructor_coerce_float_valid(self, float_dtype):
        """Constructing with a float dtype equals constructing then astype.

        ``float_dtype`` is a fixture defined outside this view.
        """
        direct = Series([1, 2, 3.5], dtype=float_dtype)
        via_astype = Series([1, 2, 3.5]).astype(float_dtype)
        tm.assert_series_equal(direct, via_astype)

    def test_constructor_dtype_no_cast(self):
        """gh-1572: writes through a same-dtype rewrap show up in the source."""
        source = Series([1, 2, 3])
        rewrapped = Series(source, dtype=np.int64)

        rewrapped[1] = 5
        assert source[1] == 5

    def test_constructor_datelike_coercion(self):
        """GH 9477: object dtype is kept for mixed datetime-like data."""
        # An explicit object dtype must not be re-inferred as datetime
        mixed = Series([Timestamp("20130101"), "NOV"], dtype=object)
        assert mixed.iloc[0] == Timestamp("20130101")
        assert mixed.iloc[1] == "NOV"
        assert mixed.dtype == object

        # the dtype was being reset on the slicing and re-inferred to datetime
        # even though the blocks are mixed
        row_labels = "216 3T19".split()
        columns = {
            "wing1": "2T15 4H19".split(),
            "wing2": "416 4T20".split(),
            "mat": pd.to_datetime("2016-01-22 2019-09-07".split()),
        }
        frame = pd.DataFrame(columns, index=row_labels)

        row = frame.loc["3T19"]
        assert row.dtype == object
        row = frame.loc["216"]
        assert row.dtype == object

    def test_constructor_datetimes_with_nulls(self):
        """gh-15869: arrays mixing None and datetimes infer as M8[ns]."""
        mostly_none = np.array([None, None, None, None, datetime.now(), None])
        shorter = np.array([None, None, datetime.now(), None])
        for values in (mostly_none, shorter):
            ser = Series(values)
            assert ser.dtype == "M8[ns]"

    def test_constructor_dtype_datetime64(self):
        """Check datetime64 handling across many construction inputs.

        Sections, in order: scalar iNaT/NaN broadcasting, lists with missing
        entries, np.datetime64 input (GH3416/GH3414), astype restrictions,
        out-of-range datetimes, mixed types, non-nanosecond units
        (GH6529/GH 13876), datetime.date objects, inference from strings with
        nulls, tz-aware input (GH 8411) and non-convertible mixes.

        Later sections reuse ``dates``, ``values2`` and ``expected`` from
        earlier ones, so statement order matters.
        """

        s = Series(iNaT, dtype="M8[ns]", index=range(5))
        assert isna(s).all()

        # in theory this should be all nulls, but since
        # we are not specifying a dtype is ambiguous
        s = Series(iNaT, index=range(5))
        assert not isna(s).all()

        s = Series(np.nan, dtype="M8[ns]", index=range(5))
        assert isna(s).all()

        s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype="M8[ns]")
        assert isna(s[1])
        assert s.dtype == "M8[ns]"

        s = Series([datetime(2001, 1, 2, 0, 0), np.nan], dtype="M8[ns]")
        assert isna(s[1])
        assert s.dtype == "M8[ns]"

        # GH3416
        dates = [
            np.datetime64(datetime(2013, 1, 1)),
            np.datetime64(datetime(2013, 1, 2)),
            np.datetime64(datetime(2013, 1, 3)),
        ]

        s = Series(dates)
        assert s.dtype == "M8[ns]"

        # assigning NaN must not change the dtype
        s.iloc[0] = np.nan
        assert s.dtype == "M8[ns]"

        # GH3414 related
        expected = Series(
            [datetime(2013, 1, 1),
             datetime(2013, 1, 2),
             datetime(2013, 1, 3)],
            dtype="datetime64[ns]",
        )

        # the int64 values divided by 1e6, given as M8[ms], must match
        result = Series(Series(dates).astype(np.int64) / 1000000,
                        dtype="M8[ms]")
        tm.assert_series_equal(result, expected)

        result = Series(dates, dtype="datetime64[ns]")
        tm.assert_series_equal(result, expected)

        expected = Series(
            [pd.NaT, datetime(2013, 1, 2),
             datetime(2013, 1, 3)],
            dtype="datetime64[ns]")
        result = Series([np.nan] + dates[1:], dtype="datetime64[ns]")
        tm.assert_series_equal(result, expected)

        dts = Series(dates, dtype="datetime64[ns]")

        # valid astype
        dts.astype("int64")

        # invalid casting
        msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" r" \[int32\]"
        with pytest.raises(TypeError, match=msg):
            dts.astype("int32")

        # ints are ok
        # we test with np.int64 to get similar results on
        # windows / 32-bit platforms
        result = Series(dts, dtype=np.int64)
        expected = Series(dts.astype(np.int64))
        tm.assert_series_equal(result, expected)

        # invalid dates can be held as object
        result = Series([datetime(2, 1, 1)])
        assert result[0] == datetime(2, 1, 1, 0, 0)

        result = Series([datetime(3000, 1, 1)])
        assert result[0] == datetime(3000, 1, 1, 0, 0)

        # don't mix types
        result = Series([Timestamp("20130101"), 1], index=["a", "b"])
        assert result["a"] == Timestamp("20130101")
        assert result["b"] == 1

        # GH6529
        # coerce datetime64 non-ns properly
        dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M")
        values2 = dates.view(np.ndarray).astype("datetime64[ns]")
        expected = Series(values2, index=dates)

        for dtype in ["s", "D", "ms", "us", "ns"]:
            values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype))
            result = Series(values1, dates)
            tm.assert_series_equal(result, expected)

        # GH 13876
        # coerce to non-ns to object properly
        expected = Series(values2, index=dates, dtype=object)
        for dtype in ["s", "D", "ms", "us", "ns"]:
            values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype))
            result = Series(values1, index=dates, dtype=object)
            tm.assert_series_equal(result, expected)

        # leave datetime.date alone
        dates2 = np.array([d.date() for d in dates.to_pydatetime()],
                          dtype=object)
        series1 = Series(dates2, dates)
        tm.assert_numpy_array_equal(series1.values, dates2)
        assert series1.dtype == object

        # these will correctly infer a datetime
        s = Series([None, pd.NaT, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"
        s = Series([np.nan, pd.NaT, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"
        s = Series([pd.NaT, None, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"
        s = Series([pd.NaT, np.nan, "2013-08-05 15:30:00.000001"])
        assert s.dtype == "datetime64[ns]"

        # tz-aware (UTC and other tz's)
        # GH 8411
        dr = date_range("20130101", periods=3)
        assert Series(dr).iloc[0].tz is None
        dr = date_range("20130101", periods=3, tz="UTC")
        assert str(Series(dr).iloc[0].tz) == "UTC"
        dr = date_range("20130101", periods=3, tz="US/Eastern")
        assert str(Series(dr).iloc[0].tz) == "US/Eastern"

        # non-convertible
        s = Series([1479596223000, -1479590, pd.NaT])
        assert s.dtype == "object"
        assert s[2] is pd.NaT
        assert "NaT" in str(s)

        # if we passed a NaT it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT])
        assert s.dtype == "object"
        assert s[2] is pd.NaT
        assert "NaT" in str(s)

        # if we passed a nan it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan])
        assert s.dtype == "object"
        assert s[2] is np.nan
        assert "NaN" in str(s)

    def test_constructor_with_datetime_tz(self):
        """Check Series construction from tz-aware datetimes (GH 8260).

        Walks through dtype reporting, export via ``.values``, scalar and
        slice indexing, concatenation, string rendering, tz inference from
        lists of Timestamps, and an all-NaT tz-aware Series.
        """
        eastern = date_range("20130101", periods=3, tz="US/Eastern")
        ser = Series(eastern)
        assert ser.dtype.name == "datetime64[ns, US/Eastern]"
        assert ser.dtype == "datetime64[ns, US/Eastern]"
        assert is_datetime64tz_dtype(ser.dtype)
        assert "datetime64[ns, US/Eastern]" in str(ser)

        # exporting through .values yields a plain datetime64[ns] ndarray
        raw = ser.values
        assert isinstance(raw, np.ndarray)
        assert raw.dtype == "datetime64[ns]"

        # localizing the exported values to UTC and converting them to the
        # Series' timezone must reproduce the original index
        as_utc = pd.DatetimeIndex(raw).tz_localize("UTC")
        tm.assert_index_equal(eastern, as_utc.tz_convert(tz=ser.dt.tz))

        # scalar access, positionally and through []
        first = ser.iloc[0]
        assert first == Timestamp("2013-01-01 00:00:00-0500",
                                  tz="US/Eastern",
                                  freq="D")
        first = ser[0]
        assert first == Timestamp("2013-01-01 00:00:00-0500",
                                  tz="US/Eastern",
                                  freq="D")

        # boolean-mask selection and positional slicing
        mask = Series([True, True, False], index=ser.index)
        tm.assert_series_equal(ser[mask], ser[0:2])
        tm.assert_series_equal(ser.iloc[0:1], Series(eastern[0:1]))

        # splitting and concatenating gives the Series back
        pieces = [ser.iloc[0:1], ser.iloc[1:]]
        tm.assert_series_equal(pd.concat(pieces), ser)

        # short string form
        assert "datetime64[ns, US/Eastern]" in str(ser)

        # string form once a NaT has been shifted in
        shifted = ser.shift()
        assert "datetime64[ns, US/Eastern]" in str(shifted)
        assert "NaT" in str(shifted)

        # long string form
        long_ser = Series(
            date_range("20130101", periods=1000, tz="US/Eastern"))
        assert "datetime64[ns, US/Eastern]" in str(long_ser)

        rebuilt = pd.DatetimeIndex(ser, freq="infer")
        tm.assert_index_equal(rebuilt, eastern)

        # inference: a single shared timezone gives a tz-aware dtype
        same_tz = Series([
            pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
            pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"),
        ])
        assert same_tz.dtype == "datetime64[ns, US/Pacific]"
        assert lib.infer_dtype(same_tz, skipna=True) == "datetime64"

        # inference: differing timezones stay object
        mixed_tz = Series([
            pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
            pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Eastern"),
        ])
        assert mixed_tz.dtype == "object"
        assert lib.infer_dtype(mixed_tz, skipna=True) == "datetime"

        # with all NaT
        all_nat = Series(pd.NaT,
                         index=[0, 1],
                         dtype="datetime64[ns, US/Eastern]")
        expected = Series(pd.DatetimeIndex(["NaT", "NaT"], tz="US/Eastern"))
        tm.assert_series_equal(all_nat, expected)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
    def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """Series.astype to M8/m8 with any unit matches ndarray.astype.

        gh-19223
        """
        target = "{}[{}]".format(dtype, unit)
        values = np.array([1, 2, 3], dtype=arr_dtype)

        converted = Series(values).astype(target)
        reference = Series(values.astype(target))
        tm.assert_series_equal(converted, reference)

    @pytest.mark.parametrize("arg",
                             ["2013-01-01 00:00:00", pd.NaT, np.nan, None])
    def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg):
        """A naive value plus a tz-aware dtype equals localizing afterwards.

        GH 17415
        """
        localized = Series(pd.Timestamp(arg)).dt.tz_localize("CET")
        constructed = Series([arg], dtype="datetime64[ns, CET]")
        tm.assert_series_equal(constructed, localized)

    def test_construction_interval(self):
        """A Series can be built from an IntervalIndex or from its values."""
        intervals = IntervalIndex.from_breaks(np.arange(3), closed="right")

        from_index = Series(intervals)
        # rendering is only called here; its output is not inspected
        repr(from_index)
        str(from_index)
        tm.assert_index_equal(Index(from_index.values), intervals)

        from_values = Series(intervals.values)
        tm.assert_index_equal(Index(from_values.values), intervals)

    def test_construction_consistency(self):
        """Constructing with a tz-aware dtype must not re-localize.

        GH 14928: the Series itself, its UTC-converted form and its raw
        values all have to come back equal to the original.
        """
        ser = Series(pd.date_range("20130101", periods=3, tz="US/Eastern"))

        sources = [ser, ser.dt.tz_convert("UTC"), ser.values]
        for source in sources:
            rebuilt = Series(source, dtype=ser.dtype)
            tm.assert_series_equal(rebuilt, ser)

    def test_constructor_infer_period(self):
        """Period scalars (plus a missing value) are inferred as Period[D].

        The inference is checked for a plain list and for the same data
        wrapped in an object-dtype ndarray.
        """
        data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None]
        result = pd.Series(data)
        expected = pd.Series(period_array(data))
        tm.assert_series_equal(result, expected)
        assert result.dtype == "Period[D]"

        # Rebuild the Series from the object ndarray; without this step the
        # assertions below would only re-check the list-based result.
        data = np.asarray(data, dtype=object)
        result = pd.Series(data)
        tm.assert_series_equal(result, expected)
        assert result.dtype == "Period[D]"

    def test_constructor_period_incompatible_frequency(self):
        """Periods with differing frequencies are kept as object dtype."""
        periods = [pd.Period("2000", "D"), pd.Period("2001", "A")]
        ser = pd.Series(periods)
        assert ser.dtype == object
        assert ser.tolist() == periods

    def test_constructor_periodindex(self):
        """A PeriodIndex placed in a Series has Period[D] dtype (GH7932)."""
        periods = period_range("20130101", periods=5, freq="D")
        ser = Series(periods)
        assert ser.dtype == "Period[D]"

        from_objects = Series(periods.astype(object))
        tm.assert_series_equal(ser, from_objects)

    def test_constructor_dict(self):
        """A dict is aligned to an explicitly passed index."""
        # string keys: the label absent from the dict becomes NaN
        mapping = {"a": 0.0, "b": 1.0, "c": 2.0}
        aligned = Series(mapping, index=["b", "c", "d", "a"])
        tm.assert_series_equal(
            aligned, Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"]))

        # Period keys against a longer PeriodIndex
        periods = tm.makePeriodIndex(100)
        by_period = {periods[0]: 0, periods[1]: 1}
        aligned = Series(by_period, index=periods)

        expected = Series(np.nan, periods)
        expected.iloc[0] = 0
        expected.iloc[1] = 1
        tm.assert_series_equal(aligned, expected)

    def test_constructor_dict_order(self):
        """Check the ordering of a Series built from a dict (GH19018).

        When ``PY36`` is true the dict's insertion order is expected;
        otherwise the expected result is ordered a, b, c.
        """
        mapping = {"b": 1, "a": 0, "c": 2}
        if not PY36:
            expected = Series([0, 1, 2], index=list("abc"))
        else:
            expected = Series([1, 0, 2], index=list("bac"))
        tm.assert_series_equal(Series(mapping), expected)

    @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")])
    def test_constructor_dict_nan_key(self, value):
        """Missing-value dict keys end up as index labels (GH 18480)."""
        flat = {1: "a", value: "b", float("nan"): "c", 4: "d"}
        sorted_flat = Series(flat).sort_values()
        expected = Series(["a", "b", "c", "d"], index=[1, value, np.nan, 4])
        tm.assert_series_equal(sorted_flat, expected)

        # MultiIndex:
        nested = {(1, 1): "a", (2, np.nan): "b", (3, value): "c"}
        sorted_nested = Series(nested).sort_values()
        tuple_index = Index([(1, 1), (2, np.nan), (3, value)])
        expected = Series(["a", "b", "c"], index=tuple_index)
        tm.assert_series_equal(sorted_nested, expected)

    def test_constructor_dict_datetime64_index(self):
        """Dicts keyed by datetime-like scalars give the same Series.

        GH 9456: keys built with numpy.datetime64, datetime.strptime and
        Timestamp are each compared against a Timestamp-indexed Series.
        """
        dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"]
        values = [42544017.198965244, 1234565, 40512335.181958228, -1]

        key_makers = [
            np.datetime64,
            lambda x: datetime.strptime(x, "%Y-%m-%d"),
            Timestamp,
        ]
        expected = Series(values, (Timestamp(x) for x in dates_as_str))

        for make_key in key_makers:
            keyed = dict(zip((make_key(x) for x in dates_as_str), values))
            tm.assert_series_equal(Series(keyed), expected)

    def test_constructor_list_of_tuples(self):
        """A list of tuples round-trips through Series unchanged."""
        pairs = [(1, 1), (2, 2), (2, 3)]
        round_tripped = list(Series(pairs))
        assert round_tripped == pairs

    def test_constructor_tuple_of_tuples(self):
        """A tuple of tuples round-trips through Series unchanged."""
        pairs = ((1, 1), (2, 2), (2, 3))
        round_tripped = tuple(Series(pairs))
        assert round_tripped == pairs

    def test_constructor_dict_of_tuples(self):
        """Tuple dict keys compare equal to a MultiIndex built from them."""
        mapping = {(1, 2): 3, (None, 5): 6}
        sorted_result = Series(mapping).sort_values()

        tuple_index = MultiIndex.from_tuples([(1, 2), (None, 5)])
        tm.assert_series_equal(sorted_result, Series([3, 6], index=tuple_index))

    def test_constructor_set(self):
        """Sets and frozensets are rejected with a TypeError."""
        members = {1, 2, 3, 4, 5}
        cases = [
            (members, "'set' type is unordered"),
            (frozenset(members), "'frozenset' type is unordered"),
        ]
        for unordered, msg in cases:
            with pytest.raises(TypeError, match=msg):
                Series(unordered)

    # https://github.com/pandas-dev/pandas/issues/22698
    @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
    def test_fromDict(self):
        """Check index order and resulting dtype for Series built from dicts."""
        ordered = Series({"a": 0, "b": 1, "c": 2, "d": 3})
        tm.assert_is_sorted(ordered.index)

        # an int, strings and a datetime together give object dtype
        with_datetime = Series(
            {"a": 0, "b": "1", "c": "2", "d": datetime.now()})
        assert with_datetime.dtype == np.object_

        # an int next to strings gives object dtype
        with_strings = Series({"a": 0, "b": "1", "c": "2", "d": "3"})
        assert with_strings.dtype == np.object_

        # numeric strings are converted when dtype=float is requested
        as_float = Series({"a": "0", "b": "1"}, dtype=float)
        assert as_float.dtype == np.float64

    def test_fromValue(self, datetime_series):
        """A scalar is broadcast over the given index with an inferred dtype.

        Parameters
        ----------
        datetime_series : fixture
            Only its ``index`` (and length) is used as the target index.
        """
        # np.nan / np.float64 are used rather than the np.NaN / np.float_
        # aliases, which no longer exist in NumPy 2.x.
        nans = Series(np.nan, index=datetime_series.index)
        assert nans.dtype == np.float64
        assert len(nans) == len(datetime_series)

        strings = Series("foo", index=datetime_series.index)
        assert strings.dtype == np.object_
        assert len(strings) == len(datetime_series)

        d = datetime.now()
        dates = Series(d, index=datetime_series.index)
        assert dates.dtype == "M8[ns]"
        assert len(dates) == len(datetime_series)

        # GH12336
        # Test construction of categorical series from value
        categorical = Series(0, index=datetime_series.index, dtype="category")
        expected = Series(0, index=datetime_series.index).astype("category")
        assert categorical.dtype == "category"
        assert len(categorical) == len(datetime_series)
        tm.assert_series_equal(categorical, expected)

    def test_constructor_dtype_timedelta64(self):
        """Exercise timedelta64[ns] inference and casting in the constructor.

        Covers plain timedelta inputs, inputs mixed with NaT/NaN, int64 and
        int32 casts, invalid string members, and strings combined with
        missing values.
        """
        ns_dtype = "timedelta64[ns]"

        # basic
        assert Series([timedelta(days=i) for i in range(3)]).dtype == ns_dtype
        assert Series([timedelta(days=1)]).dtype == ns_dtype
        mixed_units = [
            timedelta(days=1),
            timedelta(days=2),
            np.timedelta64(1, "s"),
        ]
        assert Series(mixed_units).dtype == ns_dtype

        # mixed with NaT / NaN, dtype passed explicitly
        for values in (
                [timedelta(days=1), NaT],
                [timedelta(days=1), np.nan],
                [np.timedelta64(300000000), pd.NaT],
        ):
            assert Series(values, dtype="m8[ns]").dtype == ns_dtype

        # improved inference
        # GH5689
        assert Series([np.timedelta64(300000000), NaT]).dtype == ns_dtype

        # iNaT is an int, so it is not coerced to timedelta
        assert Series([np.timedelta64(300000000), iNaT]).dtype == "object"

        assert Series([np.timedelta64(300000000), np.nan]).dtype == ns_dtype
        assert Series([pd.NaT, np.timedelta64(300000000)]).dtype == ns_dtype

        one_second = Series([np.timedelta64(1, "s")])
        assert one_second.dtype == ns_dtype

        # (frequency-conversion astypes such as m8[s] are not checked here)

        # valid astype
        one_second.astype("int64")

        # invalid casting
        int32_msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
                     r" \[int32\]")
        with pytest.raises(TypeError, match=int32_msg):
            one_second.astype("int32")

        # a non-timedelta string cannot be cast with an explicit dtype
        convert_msg = "Could not convert object to NumPy timedelta"
        with pytest.raises(ValueError, match=convert_msg):
            Series([timedelta(days=1), "foo"], dtype="m8[ns]")

        # without a dtype, timedeltas plus a string stay object
        with_text = Series([timedelta(days=i) for i in range(3)] + ["foo"])
        assert with_text.dtype == "object"

        # a timedelta string next to missing values is inferred as timedelta
        for values in (
                [None, pd.NaT, "1 Day"],
                [np.nan, pd.NaT, "1 Day"],
                [pd.NaT, None, "1 Day"],
                [pd.NaT, np.nan, "1 Day"],
        ):
            assert Series(values).dtype == ns_dtype

    # GH 16406
    def test_constructor_mixed_tz(self):
        """A naive and a tz-aware Timestamp together give an object Series."""
        naive = Timestamp("20130101")
        aware = Timestamp("20130101", tz="US/Eastern")

        inferred = Series([naive, aware])
        explicit = Series([naive, aware], dtype="object")
        tm.assert_series_equal(inferred, explicit)

    def test_NaT_scalar(self):
        """iNaT in M8[ns] data reads back as missing and can be assigned."""
        ser = Series([0, 1000, 2000, iNaT], dtype="M8[ns]")

        missing = ser[3]
        assert isna(missing)

        # writing the missing scalar into another slot keeps it missing
        ser[2] = missing
        assert isna(ser[2])

    def test_NaT_cast(self):
        """Casting a NaN Series to M8[ns] produces NaT (GH10747)."""
        casted = Series([np.nan]).astype("M8[ns]")
        tm.assert_series_equal(casted, Series([NaT]))

    def test_constructor_name_hashable(self):
        """Hashable objects of several types are accepted as Series.name."""
        names = [777, 777.0, "name", datetime(2001, 11, 11), (1, ), "\u05D0"]
        for name in names:
            payloads = [[1, 2, 3], np.ones(3), {"a": 0, "b": 1}]
            for payload in payloads:
                named = Series(payload, name=name)
                assert named.name == name

    def test_constructor_name_unhashable(self):
        """Unhashable names (list, ndarray, dict) raise a TypeError."""
        expected_msg = r"Series\.name must be a hashable type"
        for bad_name in [["name_list"], np.ones(2), {1: 2}]:
            payloads = [["name_list"], np.ones(2), {1: 2}]
            for payload in payloads:
                with pytest.raises(TypeError, match=expected_msg):
                    Series(payload, name=bad_name)

    def test_auto_conversion(self):
        """A list of Timestamps is converted to M8[ns] automatically."""
        stamps = list(date_range("1/1/2000", periods=10))
        assert Series(stamps).dtype == "M8[ns]"

    def test_convert_non_ns(self):
        """Non-nanosecond numpy datetimelike arrays convert on construction."""
        # timedelta64[s] input
        seconds = np.array([1, 2, 3], dtype="timedelta64[s]")
        from_seconds = Series(seconds)
        expected = Series(pd.timedelta_range("00:00:01", periods=3, freq="s"))
        tm.assert_series_equal(from_seconds, expected)

        # datetime64[D] input
        # NOTE(review): the original author remarked that numpy creates
        # datetime64 values in local time and that an M8[s] variant of this
        # check did not work; that variant is not exercised here.
        days = np.array(["2013-01-01", "2013-01-02", "2013-01-03"],
                        dtype="datetime64[D]")
        from_days = Series(days)
        tm.assert_series_equal(
            from_days, Series(date_range("20130101", periods=3, freq="D")))

    @pytest.mark.parametrize(
        "index",
        [
            date_range("1/1/2000", periods=10),
            timedelta_range("1 day", periods=10),
            period_range("2000-Q1", periods=10, freq="Q"),
        ],
        ids=lambda x: type(x).__name__,
    )
    def test_constructor_cant_cast_datetimelike(self, index):
        """Datetime-like indexes cast to int64 but not to float.

        Parameters
        ----------
        index : DatetimeIndex, TimedeltaIndex or PeriodIndex
            The datetime-like index used as Series data.
        """
        # floats are not ok
        # Drop the literal "Index" suffix (PeriodIndex -> Period): we don't
        # care whether the error message names the Index or the Array class.
        # str.rstrip("Index") must not be used for this, because it removes
        # a *set of characters* and would also eat trailing letters of the
        # stem ("Datetime" -> "Datetim", "Period" -> "Perio").
        kind = type(index).__name__
        if kind.endswith("Index"):
            kind = kind[:-len("Index")]
        msg = "Cannot cast {}.*? to ".format(kind)
        with pytest.raises(TypeError, match=msg):
            Series(index, dtype=float)

        # ints are ok
        # we test with np.int64 to get similar results on
        # windows / 32-bit platforms
        result = Series(index, dtype=np.int64)
        expected = Series(index.astype(np.int64))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "index",
        [
            date_range("1/1/2000", periods=10),
            timedelta_range("1 day", periods=10),
            period_range("2000-Q1", periods=10, freq="Q"),
        ],
        ids=lambda x: type(x).__name__,
    )
    def test_constructor_cast_object(self, index):
        """dtype=object construction matches ``Series(index).astype(object)``.

        Checked for the index itself, for an object-dtype Index built from
        it, and for ``index.astype(object)``.
        """
        sources = [
            index,
            pd.Index(index, dtype=object),
            index.astype(object),
        ]
        for source in sources:
            as_object = Series(source, dtype=object)
            reference = Series(index).astype(object)
            tm.assert_series_equal(as_object, reference)

    @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64])
    def test_constructor_generic_timestamp_no_frequency(self, dtype):
        """A unit-less datetime64/timedelta64 dtype raises a ValueError.

        see gh-15524, gh-15987
        """
        expected_msg = "dtype has no unit. Please pass in"
        with pytest.raises(ValueError, match=expected_msg):
            Series([], dtype=dtype)

    @pytest.mark.parametrize(
        "dtype,msg",
        [
            ("m8[ps]", "cannot convert timedeltalike"),
            ("M8[ps]", "cannot convert datetimelike"),
        ],
    )
    def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg):
        """The picosecond-unit dtypes listed above raise a TypeError.

        see gh-15524, gh-15987
        """
        empty = []
        with pytest.raises(TypeError, match=msg):
            Series(empty, dtype=dtype)

    @pytest.mark.parametrize("dtype", [None, "uint8", "category"])
    def test_constructor_range_dtype(self, dtype):
        """A range converts like the equivalent list (GH 16804).

        With no dtype requested the list-based reference uses int64.
        """
        reference_dtype = dtype if dtype else "int64"
        from_range = Series(range(5), dtype=dtype)
        from_list = Series([0, 1, 2, 3, 4], dtype=reference_dtype)
        tm.assert_series_equal(from_range, from_list)

    def test_constructor_tz_mixed_data(self):
        """Naive plus tz-aware Timestamps match an object Series (GH 13051)."""
        naive = Timestamp("2016-05-01 02:03:37")
        aware = Timestamp("2016-04-30 19:03:37-0700", tz="US/Pacific")
        stamps = [naive, aware]

        inferred = Series(stamps)
        tm.assert_series_equal(inferred, Series(stamps, dtype=object))
Ejemplo n.º 4
0
class TestFancy(Base):
    """ pure get/set item & fancy indexing """
    def test_setitem_ndarray_1d(self):
        """Assigning a 1-d ndarray through indexers needs matching lengths.

        GH5508
        """
        # len of indexer vs length of the 1d ndarray
        # np.complex128 replaces the ``np.complex`` alias: the alias was
        # just the builtin ``complex`` (i.e. complex128) and is no longer
        # available in current NumPy releases.
        df = DataFrame(index=Index(np.arange(1, 11)))
        df["foo"] = np.zeros(10, dtype=np.float64)
        df["bar"] = np.zeros(10, dtype=np.complex128)

        # invalid: 3 target rows, 4 values
        with pytest.raises(ValueError):
            df.loc[df.index[2:5],
                   "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])

        # valid: 4 target rows, 4 values
        df.loc[df.index[2:6], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])

        result = df.loc[df.index[2:6], "bar"]
        expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0],
                          index=[3, 4, 5, 6],
                          name="bar")
        tm.assert_series_equal(result, expected)

        # dtype getting changed?
        df = DataFrame(index=Index(np.arange(1, 11)))
        df["foo"] = np.zeros(10, dtype=np.float64)
        df["bar"] = np.zeros(10, dtype=np.complex128)

        with pytest.raises(ValueError):
            df[2:5] = np.arange(1, 4) * 1j

    @pytest.mark.parametrize("index",
                             tm.all_index_generator(5),
                             ids=lambda x: type(x).__name__)
    @pytest.mark.parametrize(
        "obj",
        [
            lambda i: Series(np.arange(len(i)), index=i),
            lambda i: DataFrame(
                np.random.randn(len(i), len(i)), index=i, columns=i),
        ],
        ids=["Series", "DataFrame"],
    )
    @pytest.mark.parametrize(
        "idxr, idxr_id",
        [
            (lambda x: x, "getitem"),
            (lambda x: x.loc, "loc"),
            (lambda x: x.iloc, "iloc"),
            pytest.param(lambda x: x.ix, "ix", marks=ignore_ix),
        ],
    )
    def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id):
        """Index a Series/DataFrame with a 3-d ndarray key.

        Parameters
        ----------
        index : Index
            Index used to build the object under test.
        obj : callable
            Factory taking ``index`` and returning a Series or DataFrame.
        idxr : callable
            Maps the object to the indexer under test (the object itself,
            ``.loc``, ``.iloc`` or ``.ix``).
        idxr_id : str
            Name of that indexer, used to select the expected outcome.
        """
        # GH 25567
        obj = obj(index)
        idxr = idxr(obj)
        # 2x2x2 integer key with values drawn from [0, 5)
        nd3 = np.random.randint(5, size=(2, 2, 2))

        # regex alternation of every error message accepted below
        msg = (
            r"Buffer has wrong number of dimensions \(expected 1,"
            r" got 3\)|"
            "The truth value of an array with more than one element is"
            " ambiguous|"
            "Cannot index with multidimensional key|"
            r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]|"
            "No matching signature found|"  # TypeError
            "unhashable type: 'numpy.ndarray'"  # TypeError
        )

        # Series.__getitem__ on these index kinds is expected NOT to raise
        if (isinstance(obj, Series) and idxr_id == "getitem"
                and index.inferred_type in [
                    "string",
                    "datetime64",
                    "period",
                    "timedelta64",
                    "boolean",
                    "categorical",
                ]):
            idxr[nd3]
        else:
            # every other combination must raise; only the exception type
            # depends on the object / index kind
            if (isinstance(obj, DataFrame) and idxr_id == "getitem"
                    and index.inferred_type == "boolean"):
                error = TypeError
            elif idxr_id == "getitem" and index.inferred_type == "interval":
                error = TypeError
            else:
                error = ValueError

            with pytest.raises(error, match=msg):
                idxr[nd3]

    @pytest.mark.parametrize("index",
                             tm.all_index_generator(5),
                             ids=lambda x: type(x).__name__)
    @pytest.mark.parametrize(
        "obj",
        [
            lambda i: Series(np.arange(len(i)), index=i),
            lambda i: DataFrame(
                np.random.randn(len(i), len(i)), index=i, columns=i),
        ],
        ids=["Series", "DataFrame"],
    )
    @pytest.mark.parametrize(
        "idxr, idxr_id",
        [
            (lambda x: x, "setitem"),
            (lambda x: x.loc, "loc"),
            (lambda x: x.iloc, "iloc"),
            pytest.param(lambda x: x.ix, "ix", marks=ignore_ix),
        ],
    )
    def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id):
        """Assign through a Series/DataFrame indexer using a 3-d ndarray key.

        Parameters
        ----------
        index : Index
            Index used to build the object under test.
        obj : callable
            Factory taking ``index`` and returning a Series or DataFrame.
        idxr : callable
            Maps the object to the indexer under test (the object itself,
            ``.loc``, ``.iloc`` or ``.ix``).
        idxr_id : str
            Name of that indexer, used to select the expected outcome.
        """
        # GH 25567
        obj = obj(index)
        idxr = idxr(obj)
        # 2x2x2 integer key with values drawn from [0, 5)
        nd3 = np.random.randint(5, size=(2, 2, 2))

        # regex alternation of every error message accepted below
        msg = (
            r"Buffer has wrong number of dimensions \(expected 1,"
            r" got 3\)|"
            "The truth value of an array with more than one element is"
            " ambiguous|"
            "Only 1-dimensional input arrays are supported|"
            "'pandas._libs.interval.IntervalTree' object has no attribute"
            " 'set_value'|"  # AttributeError
            "unhashable type: 'numpy.ndarray'|"  # TypeError
            "No matching signature found|"  # TypeError
            r"^\[\[\["  # pandas.core.indexing.IndexingError
        )

        # combinations for which the assignment is expected NOT to raise:
        # any .iloc, Series.__setitem__ on the listed index kinds, and .ix
        # on the listed index kinds
        if ((idxr_id == "iloc")
                or ((isinstance(obj, Series) and idxr_id == "setitem"
                     and index.inferred_type in [
                         "floating",
                         "string",
                         "datetime64",
                         "period",
                         "timedelta64",
                         "boolean",
                         "categorical",
                     ]))
                or (idxr_id == "ix" and index.inferred_type
                    in ["string", "datetime64", "period", "boolean"])):
            idxr[nd3] = 0
        else:
            # everything else must raise one of these exception types with
            # a message matching ``msg``
            with pytest.raises(
                (ValueError, AttributeError, TypeError,
                 pd.core.indexing.IndexingError),
                    match=msg,
            ):
                idxr[nd3] = 0

    def test_inf_upcast(self):
        """np.inf works as a label and yields a Float64Index.

        GH 16957
        """
        # np.inf among the row labels
        by_row = DataFrame(columns=[0])
        for label, value in [(1, 1), (2, 2), (np.inf, 3)]:
            by_row.loc[label] = value

        # the value stored under np.inf can be looked up again
        assert by_row.loc[np.inf, 0] == 3
        tm.assert_index_equal(by_row.index, pd.Float64Index([1, 2, np.inf]))

        # np.inf among the column labels
        by_col = DataFrame()
        for row, col, value in [(0, 0, 1), (1, 1, 2), (0, np.inf, 3)]:
            by_col.loc[row, col] = value
        tm.assert_index_equal(by_col.columns, pd.Float64Index([0, 1, np.inf]))

    def test_setitem_dtype_upcast(self):
        """Setting an incompatible scalar via .loc upcasts the target column.

        The other columns are checked to keep their integer / float dtype.
        """
        # GH3216: a string written into a float (NaN) column
        frame = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
        frame["c"] = np.nan
        assert frame["c"].dtype == np.float64

        frame.loc[0, "c"] = "foo"
        first_row = {"a": 1, "c": "foo"}
        second_row = {"a": 3, "b": 2, "c": np.nan}
        tm.assert_frame_equal(frame, DataFrame([first_row, second_row]))

        # GH10280: float / string written into an int64 frame
        int_frame = DataFrame(
            np.arange(6, dtype="int64").reshape(2, 3),
            index=list("ab"),
            columns=["foo", "bar", "baz"],
        )

        for val in [3.14, "wxyz"]:
            modified = int_frame.copy()
            modified.loc["a", "bar"] = val
            expected = DataFrame(
                [[0, val, 2], [3, 4, 5]],
                index=list("ab"),
                columns=["foo", "bar", "baz"],
            )

            tm.assert_frame_equal(modified, expected)
            assert is_integer_dtype(modified["foo"])
            assert is_integer_dtype(modified["baz"])

        # string written into a float frame
        float_frame = DataFrame(
            np.arange(6, dtype="int64").reshape(2, 3) / 10.0,
            index=list("ab"),
            columns=["foo", "bar", "baz"],
        )
        float_frame.loc["a", "bar"] = "wxyz"

        expected = DataFrame(
            [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]],
            index=list("ab"),
            columns=["foo", "bar", "baz"],
        )

        tm.assert_frame_equal(float_frame, expected)
        assert is_float_dtype(float_frame["foo"])
        assert is_float_dtype(float_frame["baz"])

    def test_dups_fancy_indexing(self):
        """Label-based selection on frames with duplicated labels.

        Covers duplicated column names, duplicated row labels, selectors
        containing labels that are missing from the axis, and duplicated
        selectors.
        """

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf

        # selecting ["b", "a"] returns both "a" columns after "b"
        df = mkdf(10, 3)
        df.columns = ["a", "a", "b"]
        result = df[["b", "a"]].columns
        expected = Index(["b", "a", "a"])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]],
                       columns=list("aaaaaaa"))
        # head() / str() are only called; their output is not checked
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
        result.columns = list("aaaaaaa")

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {
                "test": [5, 7, 9, 11],
                "test1": [4.0, 5, 6, 7],
                "other": list("abcd")
            },
            index=["A", "A", "B", "C"],
        )
        rows = ["C", "B"]
        expected = DataFrame(
            {
                "test": [11, 9],
                "test1": [7.0, 6],
                "other": ["d", "c"]
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # same selection with an Index instead of a list
        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        # a selector containing a missing label ("E") is expected to emit a
        # FutureWarning and yield a NaN row
        rows = ["C", "B", "E"]
        expected = DataFrame(
            {
                "test": [11, 9, np.nan],
                "test1": [7.0, 6, np.nan],
                "other": ["d", "c", np.nan],
            },
            index=rows,
        )

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ["F", "G", "H", "C", "B", "E"]
        expected = DataFrame(
            {
                "test": [np.nan, np.nan, np.nan, 11, 9, np.nan],
                "test1": [np.nan, np.nan, np.nan, 7.0, 6, np.nan],
                "other": [np.nan, np.nan, np.nan, "d", "c", np.nan],
            },
            index=rows,
        )
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # List containing only missing label
        dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD"))
        with pytest.raises(KeyError):
            dfnu.loc[["E"]]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # same check with string data in the column
        df = DataFrame({"A": list("abc")})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ["a", np.nan, "a"]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"])
        expected = DataFrame({"test": [5, 7, 5, 7, np.nan]},
                             index=["A", "A", "A", "A", "E"])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[["A", "A", "E"]]
        tm.assert_frame_equal(result, expected)

    def test_dups_fancy_indexing2(self):
        """.loc selection with duplicated labels (GH 5835, GH 6504)."""
        # GH 5835: duplicated columns plus a label that is missing
        dup_cols = DataFrame(np.random.randn(5, 5),
                             columns=["A", "B", "B", "B", "A"])

        present = dup_cols.loc[:, ["A", "B"]]
        absent = DataFrame(np.nan, columns=["C"], index=dup_cols.index)
        expected = pd.concat([present, absent], axis=1)
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            selected = dup_cols.loc[:, ["A", "B", "C"]]
        tm.assert_frame_equal(selected, expected)

        # GH 6504, multi-axis indexing
        dup_rows = DataFrame(np.random.randn(9, 2),
                             index=[1, 1, 1, 2, 2, 2, 3, 3, 3],
                             columns=["a", "b"])

        # rows only
        tm.assert_frame_equal(dup_rows.loc[[1, 2]], dup_rows.iloc[0:6])

        # columns only
        tm.assert_frame_equal(dup_rows.loc[:, ["a", "b"]], dup_rows)

        # rows and columns together
        tm.assert_frame_equal(dup_rows.loc[[1, 2], ["a", "b"]],
                              dup_rows.iloc[0:6, :])

    @pytest.mark.parametrize("case", [lambda s: s, lambda s: s.loc])
    def test_duplicate_int_indexing(self, case):
        """List selection of a duplicated int label equals scalar selection.

        GH 17347
        """
        ser = pd.Series(range(3), index=[1, 1, 3])
        via_scalar = ser[1]
        via_list = case(ser)[[1]]
        tm.assert_series_equal(via_list, via_scalar)

    def test_indexing_mixed_frame_bug(self):
        """A boolean-mask .loc assignment must be visible through .iloc.

        Regression test for GH3492.
        """
        frame = DataFrame({
            "a": {1: "aaa", 2: "bbb", 3: "ccc"},
            "b": {1: 111, 2: 222, 3: 333},
        })

        # derive a new column from "a"
        frame["test"] = frame["a"].apply(lambda x: "_" if x == "aaa" else x)

        # overwrite the flagged rows of "test" through a boolean mask
        flagged = frame["test"] == "_"
        replacement = frame.loc[flagged, "a"].apply(
            lambda x: "-----" if x == "aaa" else x)
        frame.loc[flagged, "test"] = replacement

        # NOTE(review): per the original author's remarks, the reported bug
        # was that element [0, 2] still read '_' after this assignment.
        assert frame.iloc[0, 2] == "-----"

    def test_multitype_list_index_access(self):
        """List lookup with absent labels raises on mixed-type columns.

        The columns mix a string label with integer labels; asking for a
        list that contains labels not present must raise ``KeyError``, while
        a single existing integer label still returns a full-length column.
        """
        # GH 10610
        labels = ["a", 20, 21, 22, 23]
        frame = DataFrame(np.random.random((10, len(labels))), columns=labels)

        # 22 exists, 26 and -8 do not.
        requested = [22, 26, -8]
        with pytest.raises(KeyError):
            frame[requested]

        column_length = frame[21].shape[0]
        assert column_length == frame.shape[0]

    def test_set_index_nan(self):

        # GH 3586
        df = DataFrame({
            "PRuid": {
                17: "nonQC",
                18: "nonQC",
                19: "nonQC",
                20: "10",
                21: "11",
                22: "12",
                23: "13",
                24: "24",
                25: "35",
                26: "46",
                27: "47",
                28: "48",
                29: "59",
                30: "10",
            },
            "QC": {
                17: 0.0,
                18: 0.0,
                19: 0.0,
                20: np.nan,
                21: np.nan,
                22: np.nan,
                23: np.nan,
                24: 1.0,
                25: np.nan,
                26: np.nan,
                27: np.nan,
                28: np.nan,
                29: np.nan,
                30: np.nan,
            },
            "data": {
                17: 7.9544899999999998,
                18: 8.0142609999999994,
                19: 7.8591520000000008,
                20: 0.86140349999999999,
                21: 0.87853110000000001,
                22: 0.8427041999999999,
                23: 0.78587700000000005,
                24: 0.73062459999999996,
                25: 0.81668560000000001,
                26: 0.81927080000000008,
                27: 0.80705009999999999,
                28: 0.81440240000000008,
                29: 0.80140849999999997,
                30: 0.81307740000000006,
            },
            "year": {
                17: 2006,
                18: 2007,
                19: 2008,
                20: 1985,
                21: 1985,
                22: 1985,
                23: 1985,
                24: 1985,
                25: 1985,
                26: 1985,
                27: 1985,
                28: 1985,
                29: 1985,
                30: 1986,
            },
        }).reset_index()

        result = (df.set_index(["year", "PRuid", "QC"
                                ]).reset_index().reindex(columns=df.columns))
        tm.assert_frame_equal(result, df)

    def test_multi_assign(self):
        """Assign to a masked row/column block of a frame via ``.loc``.

        Three right-hand sides are exercised: a DataFrame slice, the
        ndarray taken from that slice (GH 14001), and a Series that has to
        be applied to several target columns.
        """

        # GH 3626, an assignment of a sub-df to a df
        df = DataFrame({
            "FC": ["a", "b", "a", "b", "a", "b"],
            "PF": [0, 0, 0, 0, 1, 1],
            "col1": list(range(6)),
            "col2": list(range(6, 12)),
        })
        # Blank out "FC" in row 1 so the mask below excludes that row.
        df.iloc[1, 0] = np.nan
        df2 = df.copy()

        mask = ~df2.FC.isna()
        cols = ["col1", "col2"]

        # Source of the assigned values: the frame doubled, with one NaN
        # placed at row 3 of "col2".
        dft = df2 * 2
        dft.iloc[3, 3] = np.nan

        # Row 1 keeps its original col1/col2 values; all other rows take the
        # doubled values (including the NaN).
        expected = DataFrame({
            "FC": ["a", np.nan, "a", "b", "a", "b"],
            "PF": [0, 0, 0, 0, 1, 1],
            "col1": Series([0, 1, 4, 6, 8, 10]),
            "col2": [12, 7, 16, np.nan, 20, 22],
        })

        # frame on rhs
        df2.loc[mask, cols] = dft.loc[mask, cols]
        tm.assert_frame_equal(df2, expected)

        # Repeating the identical assignment must leave the result unchanged.
        df2.loc[mask, cols] = dft.loc[mask, cols]
        tm.assert_frame_equal(df2, expected)

        # with an ndarray on rhs
        # coerces to float64 because values has float64 dtype
        # GH 14001
        expected = DataFrame({
            "FC": ["a", np.nan, "a", "b", "a", "b"],
            "PF": [0, 0, 0, 0, 1, 1],
            "col1": [0.0, 1.0, 4.0, 6.0, 8.0, 10.0],
            "col2": [12, 7, 16, np.nan, 20, 22],
        })
        df2 = df.copy()
        df2.loc[mask, cols] = dft.loc[mask, cols].values
        tm.assert_frame_equal(df2, expected)
        # Same idempotence check as for the frame right-hand side.
        df2.loc[mask, cols] = dft.loc[mask, cols].values
        tm.assert_frame_equal(df2, expected)

        # broadcasting on the rhs is required
        df = DataFrame(
            dict(
                A=[1, 2, 0, 0, 0],
                B=[0, 0, 0, 10, 11],
                C=[0, 0, 0, 10, 11],
                D=[3, 4, 5, 6, 7],
            ))

        # Build the expectation one target column at a time ...
        expected = df.copy()
        mask = expected["A"] == 0
        for col in ["A", "B"]:
            expected.loc[mask, col] = df["D"]

        # ... and compare with a single assignment of the Series to both
        # columns at once.
        df.loc[df["A"] == 0, ["A", "B"]] = df["D"]
        tm.assert_frame_equal(df, expected)

    def test_setitem_list(self):
        """Store list and arbitrary-object values in a single cell via ``.ix``.

        Overwriting a cell twice must give the same frame as writing the
        final value once. All ``.ix`` accesses run with warnings silenced.
        """

        # GH 6043
        # ix with a list
        df = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            simplefilter("ignore")
            # Overwrite a 3-element list with a 2-element list.
            df.ix[1, 0] = [1, 2, 3]
            df.ix[1, 0] = [1, 2]

        result = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            simplefilter("ignore")
            result.ix[1, 0] = [1, 2]

        tm.assert_frame_equal(result, df)

        # ix with an object
        class TO:
            # Minimal value wrapper used as an opaque cell value.
            def __init__(self, value):
                self.value = value

            def __str__(self):
                return "[{0}]".format(self.value)

            __repr__ = __str__

            def __eq__(self, other):
                # Assumes ``other`` also exposes ``.value``.
                return self.value == other.value

            def view(self):
                # NOTE(review): presumably present so pandas internals that
                # call ``.view()`` on stored values work - confirm.
                return self

        df = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            simplefilter("ignore")
            df.ix[1, 0] = TO(1)
            df.ix[1, 0] = TO(2)

        result = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            simplefilter("ignore")
            result.ix[1, 0] = TO(2)

        tm.assert_frame_equal(result, df)

        # remains object dtype even after setting it back
        df = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            simplefilter("ignore")
            df.ix[1, 0] = TO(1)
            df.ix[1, 0] = np.nan
        # A freshly built empty frame is the expected end state.
        result = DataFrame(index=[0, 1], columns=[0])

        tm.assert_frame_equal(result, df)

    def test_string_slice(self):
        """String lookups must raise ``KeyError`` on non-datetime indexes.

        Checked for an object-dtype index that holds a Timestamp and for an
        empty frame, using both ``[]`` and ``.loc``.
        """
        # GH 14424
        # string indexing against a datetimelike index with object
        # dtype should properly raise KeyError
        df = DataFrame([1], Index([pd.Timestamp("2011-01-01")], dtype=object))
        assert df.index.is_all_dates
        with pytest.raises(KeyError):
            df["2011"]

        with pytest.raises(KeyError):
            df.loc["2011", 0]

        # Same lookups against an empty frame, whose index is not all dates.
        df = DataFrame()
        assert not df.index.is_all_dates
        with pytest.raises(KeyError):
            df["2011"]

        with pytest.raises(KeyError):
            df.loc["2011", 0]

    def test_astype_assignment(self):
        """Assign converted columns back into a frame via ``iloc``/``loc``.

        Each section converts part of a frame (strings to int64, or floats
        to int64) and writes the result back into the same location; the
        expected frames hold the converted values only in the targeted
        columns.
        """

        # GH4312 (iloc)
        df_orig = DataFrame([["1", "2", "3", ".4", 5, 6.0, "foo"]],
                            columns=list("ABCDEFG"))

        # Convert only the first two columns, selected by position.
        df = df_orig.copy()
        df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
        expected = DataFrame([[1, 2, "3", ".4", 5, 6.0, "foo"]],
                             columns=list("ABCDEFG"))
        tm.assert_frame_equal(df, expected)

        # Same target columns, converted through the private ``_convert``
        # helper instead of ``astype``.
        df = df_orig.copy()
        df.iloc[:, 0:2] = df.iloc[:, 0:2]._convert(datetime=True, numeric=True)
        expected = DataFrame([[1, 2, "3", ".4", 5, 6.0, "foo"]],
                             columns=list("ABCDEFG"))
        tm.assert_frame_equal(df, expected)

        # GH5702 (loc)
        # Single column selected by label.
        df = df_orig.copy()
        df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
        expected = DataFrame([[1, "2", "3", ".4", 5, 6.0, "foo"]],
                             columns=list("ABCDEFG"))
        tm.assert_frame_equal(df, expected)

        # Two columns selected by a list of labels.
        df = df_orig.copy()
        df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
        expected = DataFrame([["1", 2, 3, ".4", 5, 6.0, "foo"]],
                             columns=list("ABCDEFG"))
        tm.assert_frame_equal(df, expected)

        # full replacements / no nans
        df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
        df.iloc[:, 0] = df["A"].astype(np.int64)
        expected = DataFrame({"A": [1, 2, 3, 4]})
        tm.assert_frame_equal(df, expected)

        df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
        df.loc[:, "A"] = df["A"].astype(np.int64)
        expected = DataFrame({"A": [1, 2, 3, 4]})
        tm.assert_frame_equal(df, expected)

    @pytest.mark.parametrize(
        "index,val",
        [
            (Index([0, 1, 2]), 2),
            (Index([0, 1, "2"]), "2"),
            (Index([0, 1, 2, np.inf, 4]), 4),
            (Index([0, 1, 2, np.nan, 4]), 4),
            (Index([0, 1, 2, np.inf]), np.inf),
            (Index([0, 1, 2, np.nan]), np.nan),
        ],
    )
    def test_index_contains(self, index, val):
        """``val in index`` is true for values that are present.

        The cases include plain integers, a string in a mixed index, and
        indexes that hold ``np.inf`` or ``np.nan``.
        """
        assert val in index

    @pytest.mark.parametrize(
        "index,val",
        [
            (Index([0, 1, 2]), "2"),
            (Index([0, 1, "2"]), 2),
            (Index([0, 1, 2, np.inf]), 4),
            (Index([0, 1, 2, np.nan]), 4),
            (Index([0, 1, 2, np.inf]), np.nan),
            (Index([0, 1, 2, np.nan]), np.inf),
            # Checking if np.inf in Int64Index should not cause an OverflowError
            # Related to GH 16957
            (pd.Int64Index([0, 1, 2]), np.inf),
            (pd.Int64Index([0, 1, 2]), np.nan),
            (pd.UInt64Index([0, 1, 2]), np.inf),
            (pd.UInt64Index([0, 1, 2]), np.nan),
        ],
    )
    def test_index_not_contains(self, index, val):
        """``val in index`` is false for values that are absent.

        Covers string/integer look-alikes ("2" vs 2), ``np.inf`` vs
        ``np.nan``, and non-finite floats tested against integer indexes.
        """
        assert val not in index

    @pytest.mark.parametrize("index,val", [(Index([0, 1, "2"]), 0),
                                           (Index([0, 1, "2"]), "2")])
    def test_mixed_index_contains(self, index, val):
        """Membership succeeds for exact members of a mixed int/str index."""
        # GH 19860
        assert val in index

    @pytest.mark.parametrize("index,val", [(Index([0, 1, "2"]), "1"),
                                           (Index([0, 1, "2"]), 2)])
    def test_mixed_index_not_contains(self, index, val):
        """Membership fails when only the other type's look-alike is present.

        The index holds the integer 1 and the string "2"; the string "1"
        and the integer 2 must not be reported as members.
        """
        # GH 19860
        assert val not in index

    def test_contains_with_float_index(self):
        """Membership of floats and ints across integer and float indexes.

        In the integer-typed indexes, 1 and 1.0 are members while 1.1 is
        not. In the float index, only a value that is stored (1.1) is a
        member; 1.0 and 1 are not.
        """
        # GH#22085
        integer_index = pd.Int64Index([0, 1, 2, 3])
        uinteger_index = pd.UInt64Index([0, 1, 2, 3])
        float_index = pd.Float64Index([0.1, 1.1, 2.2, 3.3])

        # Signed and unsigned integer indexes must behave the same way.
        for index in (integer_index, uinteger_index):
            assert 1.1 not in index
            assert 1.0 in index
            assert 1 in index

        assert 1.1 in float_index
        assert 1.0 not in float_index
        assert 1 not in float_index

    def test_index_type_coercion(self):
        """Setting with a new key coerces the Series index type as needed.

        For integer- and float-indexed Series, a value is set under the
        keys ``0.1``, ``0.0`` and ``"0"`` through ``.ix``, ``.loc`` and plain
        ``[]``; afterwards the resulting index type is checked. Warnings are
        silenced for the whole test.
        """

        with catch_warnings(record=True):
            simplefilter("ignore")

            # GH 11836
            # if we have an index type and set it with something that looks
            # to numpy like the same, but is actually, not
            # (e.g. setting with a float or string '0')
            # then we need to coerce to object

            # integer indexes
            for s in [Series(range(5)), Series(range(5), index=range(1, 6))]:

                assert s.index.is_integer()

                for indexer in [lambda x: x.ix, lambda x: x.loc, lambda x: x]:
                    # A non-integral float key turns the index into floats.
                    s2 = s.copy()
                    indexer(s2)[0.1] = 0
                    assert s2.index.is_floating()
                    assert indexer(s2)[0.1] == 0

                    # 0.0 is treated as the label 0: the index is unchanged
                    # when 0 already exists, otherwise 0 is appended.
                    s2 = s.copy()
                    indexer(s2)[0.0] = 0
                    exp = s.index
                    if 0 not in s:
                        exp = Index(s.index.tolist() + [0])
                    tm.assert_index_equal(s2.index, exp)

                    # A string key forces an object index.
                    s2 = s.copy()
                    indexer(s2)["0"] = 0
                    assert s2.index.is_object()

            # float indexes
            for s in [Series(range(5), index=np.arange(5.0))]:

                assert s.index.is_floating()

                for idxr in [lambda x: x.ix, lambda x: x.loc, lambda x: x]:

                    # A new float key keeps the index floating.
                    s2 = s.copy()
                    idxr(s2)[0.1] = 0
                    assert s2.index.is_floating()
                    assert idxr(s2)[0.1] == 0

                    # 0.0 already exists, so the index must not change.
                    s2 = s.copy()
                    idxr(s2)[0.0] = 0
                    tm.assert_index_equal(s2.index, s.index)

                    # A string key forces an object index.
                    s2 = s.copy()
                    idxr(s2)["0"] = 0
                    assert s2.index.is_object()
Ejemplo n.º 5
0
import pytest
import numpy as np

import pandas as pd
import pandas.util.testing as tm

from pandas.compat import PY3
from pandas.core import ops
from pandas import Timedelta, Series, Index, TimedeltaIndex


@pytest.fixture(params=[
    pd.Float64Index(np.arange(5, dtype='float64')),
    pd.UInt64Index(np.arange(5, dtype='uint64')),
    pd.Int64Index(np.arange(5, dtype='int64')),
    pd.RangeIndex(5)
],
                ids=lambda x: type(x).__name__)
def idx(request):
    """Yield each numeric index flavour in turn.

    The fixture is parametrized over a Float64Index, a UInt64Index, an
    Int64Index and a RangeIndex, each holding the values 0 through 4; the
    test id is the class name of the index.
    """
    return request.param


# ------------------------------------------------------------------
# Comparisons


class TestNumericComparisons(object):
    """Comparison tests between numeric scalars and pandas objects."""

    def test_operator_series_comparison_zerorank(self):
        """Compare a zero-rank NumPy float against a Series."""
        # GH#13006
        # NOTE(review): ``result`` is computed but not asserted on in the
        # lines visible here - the rest of the test appears to be cut off.
        result = np.float64(0) > pd.Series([1, 2, 3])
Ejemplo n.º 6
0
 def setup(self):
     """Build the integer index used by the benchmark and warm its cache.

     The index holds ``5 * 10**5`` consecutive integers produced by
     ``np.arange``.
     """
     N = 10**5
     # NOTE(review): despite the ``_dup`` suffix, ``np.arange`` yields no
     # repeated values here - confirm whether duplicates were intended.
     self.idx_int_dup = pd.Int64Index(np.arange(N * 5))
     # cache is_unique
     self.idx_int_dup.is_unique
Ejemplo n.º 7
0
class StatisticalBuiltInsTestCase(zf.WithAssetFinder, zf.WithTradingCalendars,
                                  zf.ZiplineTestCase):
    sids = ASSET_FINDER_EQUITY_SIDS = pd.Int64Index([1, 2, 3])
    START_DATE = pd.Timestamp("2015-01-31", tz="UTC")
    END_DATE = pd.Timestamp("2015-03-01", tz="UTC")
    ASSET_FINDER_EQUITY_SYMBOLS = ("A", "B", "C")
    ASSET_FINDER_COUNTRY_CODE = "US"

    @classmethod
    def init_class_fixtures(cls):
        """Create the class-level data shared by every test in this case.

        After the parent fixtures are initialised, this stores on ``cls``:
        the February 2015 date index, the start/end positions and dates of
        the pipeline run, the assets for ``cls.sids``, a frame of mock
        close prices, the ``run_pipeline`` callable of an engine fed from
        that frame, two mask filters, and the boolean arrays each mask
        configuration is expected to produce.
        """
        super(StatisticalBuiltInsTestCase, cls).init_class_fixtures()

        day = cls.trading_calendar.day
        cls.dates = dates = pd.date_range(
            "2015-02-01",
            "2015-02-28",
            freq=day,
            tz="UTC",
        )

        # Using these start and end dates because they are a contiguous span of
        # 5 days (Monday - Friday) and they allow for plenty of days to look
        # back on when computing correlations and regressions.
        cls.start_date_index = start_date_index = 14
        cls.end_date_index = end_date_index = 18
        cls.pipeline_start_date = dates[start_date_index]
        cls.pipeline_end_date = dates[end_date_index]
        # Both end points are included in the run.
        cls.num_days = num_days = end_date_index - start_date_index + 1

        sids = cls.sids
        cls.assets = assets = cls.asset_finder.retrieve_all(sids)
        # The first asset is the target that the factors are computed against.
        cls.my_asset_column = my_asset_column = 0
        cls.my_asset = assets[my_asset_column]
        cls.num_assets = num_assets = len(assets)

        # Mock prices: consecutive floats starting at 0, laid out as one row
        # per date and one column per asset.
        cls.raw_data = raw_data = pd.DataFrame(
            data=np.arange(len(dates) * len(sids),
                           dtype=float64_dtype).reshape(
                               len(dates),
                               len(sids),
                           ),
            index=dates,
            columns=assets,
        )

        # Using mock 'close' data here because the correlation and regression
        # built-ins use USEquityPricing.close as the input to their `Returns`
        # factors. Since there is no way to change that when constructing an
        # instance of these built-ins, we need to test with mock 'close' data
        # to most accurately reflect their true behavior and results.
        close_loader = DataFrameLoader(USEquityPricing.close, raw_data)

        # The engine's loader lookup is the ``__getitem__`` of a one-entry
        # dict, so only USEquityPricing.close can be resolved.
        cls.run_pipeline = SimplePipelineEngine(
            {
                USEquityPricing.close: close_loader
            }.__getitem__,
            cls.asset_finder,
            default_domain=US_EQUITIES,
        ).run_pipeline

        # Masks applied to the factors under test, each paired with the
        # boolean array it is expected to produce over the run.
        cls.cascading_mask = AssetIDPlusDay() < (sids[-1] +
                                                 dates[start_date_index].day)
        cls.expected_cascading_mask_result = make_cascading_boolean_array(
            shape=(num_days, num_assets), )
        cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0)
        cls.expected_alternating_mask_result = make_alternating_boolean_array(
            shape=(num_days, num_assets), )
        # With no mask every (day, asset) cell is expected to be computed.
        cls.expected_no_mask_result = np.full(
            shape=(num_days, num_assets),
            fill_value=True,
            dtype=bool_dtype,
        )

    @parameter_space(returns_length=[2, 3], correlation_length=[3, 4])
    def test_correlation_factors(self, returns_length, correlation_length):
        """
        Tests for the built-in factors `RollingPearsonOfReturns` and
        `RollingSpearmanOfReturns`.

        For each mask configuration (cascading, alternating, none) both
        factors are run through the pipeline and compared with coefficients
        computed directly by `pearsonr` / `spearmanr` on the same rolling
        windows of returns. Cells excluded by the mask are expected to be
        NaN.

        Parameters
        ----------
        returns_length : int
            Window length of the `Returns` factor.
        correlation_length : int
            Number of days of returns each correlation is computed over.
        """
        assets = self.assets
        my_asset = self.my_asset
        my_asset_column = self.my_asset_column
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        returns = Returns(window_length=returns_length)
        masks = (self.cascading_mask, self.alternating_mask, NotSpecified)
        expected_mask_results = (
            self.expected_cascading_mask_result,
            self.expected_alternating_mask_result,
            self.expected_no_mask_result,
        )

        for mask, expected_mask in zip(masks, expected_mask_results):
            pearson_factor = RollingPearsonOfReturns(
                target=my_asset,
                returns_length=returns_length,
                correlation_length=correlation_length,
                mask=mask,
            )
            spearman_factor = RollingSpearmanOfReturns(
                target=my_asset,
                returns_length=returns_length,
                correlation_length=correlation_length,
                mask=mask,
            )

            columns = {
                "pearson_factor": pearson_factor,
                "spearman_factor": spearman_factor,
            }
            pipeline = Pipeline(columns=columns)
            # Also output the mask itself so it can be checked directly.
            if mask is not NotSpecified:
                pipeline.add(mask, "mask")

            results = run_pipeline(pipeline, start_date, end_date)
            pearson_results = results["pearson_factor"].unstack()
            spearman_results = results["spearman_factor"].unstack()
            if mask is not NotSpecified:
                mask_results = results["mask"].unstack()
                check_arrays(mask_results.values, expected_mask)

            # Run a separate pipeline that calculates returns starting
            # (correlation_length - 1) days prior to our start date. This is
            # because we need (correlation_length - 1) extra days of returns to
            # compute our expected correlations.
            results = run_pipeline(
                Pipeline(columns={"returns": returns}),
                dates[start_date_index - (correlation_length - 1)],
                dates[end_date_index],
            )
            returns_results = results["returns"].unstack()

            # On each day, calculate the expected correlation coefficients
            # between the asset we are interested in and each other asset. Each
            # correlation is calculated over `correlation_length` days.
            expected_pearson_results = np.full_like(pearson_results, nan)
            expected_spearman_results = np.full_like(spearman_results, nan)
            for day in range(num_days):
                todays_returns = returns_results.iloc[day:day +
                                                      correlation_length]
                my_asset_returns = todays_returns.iloc[:, my_asset_column]
                # `items()` replaces `iteritems()`, which pandas deprecated in
                # 1.5 and removed in 2.0; both yield (label, column) pairs.
                for asset, other_asset_returns in todays_returns.items():
                    # Sids start at 1, so sid - 1 is the column position.
                    asset_column = int(asset) - 1
                    expected_pearson_results[day, asset_column] = pearsonr(
                        my_asset_returns,
                        other_asset_returns,
                    )[0]
                    expected_spearman_results[day, asset_column] = spearmanr(
                        my_asset_returns,
                        other_asset_returns,
                    )[0]

            # Cells that the mask excludes are expected to be NaN.
            expected_pearson_results = pd.DataFrame(
                data=np.where(expected_mask, expected_pearson_results, nan),
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(pearson_results, expected_pearson_results)

            expected_spearman_results = pd.DataFrame(
                data=np.where(expected_mask, expected_spearman_results, nan),
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(spearman_results, expected_spearman_results)

    @parameter_space(returns_length=[2, 3], regression_length=[3, 4])
    def test_regression_of_returns_factor(self, returns_length,
                                          regression_length):
        """
        Tests for the built-in factor `RollingLinearRegressionOfReturns`.

        For each mask configuration (cascading, alternating, none) every
        output of the factor is compared with the matching field of
        `linregress` computed directly on the same rolling windows of
        returns. Cells excluded by the mask are expected to be NaN.

        Parameters
        ----------
        returns_length : int
            Window length of the `Returns` factor.
        regression_length : int
            Number of days of returns each regression is computed over.
        """
        assets = self.assets
        my_asset = self.my_asset
        my_asset_column = self.my_asset_column
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # The order of these is meant to align with the output of `linregress`.
        outputs = ["beta", "alpha", "r_value", "p_value", "stderr"]

        returns = Returns(window_length=returns_length)
        masks = self.cascading_mask, self.alternating_mask, NotSpecified
        expected_mask_results = (
            self.expected_cascading_mask_result,
            self.expected_alternating_mask_result,
            self.expected_no_mask_result,
        )

        for mask, expected_mask in zip(masks, expected_mask_results):
            regression_factor = RollingLinearRegressionOfReturns(
                target=my_asset,
                returns_length=returns_length,
                regression_length=regression_length,
                mask=mask,
            )

            # One pipeline column per regression output.
            columns = {
                output: getattr(regression_factor, output)
                for output in outputs
            }
            pipeline = Pipeline(columns=columns)
            # Also output the mask itself so it can be checked directly.
            if mask is not NotSpecified:
                pipeline.add(mask, "mask")

            results = run_pipeline(pipeline, start_date, end_date)
            if mask is not NotSpecified:
                mask_results = results["mask"].unstack()
                check_arrays(mask_results.values, expected_mask)

            output_results = {}
            expected_output_results = {}
            for output in outputs:
                output_results[output] = results[output].unstack()
                expected_output_results[output] = np.full_like(
                    output_results[output],
                    nan,
                )

            # Run a separate pipeline that calculates returns starting
            # (regression_length - 1) days prior to our start date. This is
            # because we need (regression_length - 1) extra days of returns to
            # compute our expected regressions.
            results = run_pipeline(
                Pipeline(columns={"returns": returns}),
                dates[start_date_index - (regression_length - 1)],
                dates[end_date_index],
            )
            returns_results = results["returns"].unstack()

            # On each day, calculate the expected regression results for Y ~ X
            # where Y is the asset we are interested in and X is each other
            # asset. Each regression is calculated over `regression_length`
            # days of data.
            for day in range(num_days):
                todays_returns = returns_results.iloc[day:day +
                                                      regression_length]
                my_asset_returns = todays_returns.iloc[:, my_asset_column]
                # `items()` replaces `iteritems()`, which pandas deprecated in
                # 1.5 and removed in 2.0; both yield (label, column) pairs.
                for asset, other_asset_returns in todays_returns.items():
                    # Sids start at 1, so sid - 1 is the column position.
                    asset_column = int(asset) - 1
                    expected_regression_results = linregress(
                        y=other_asset_returns,
                        x=my_asset_returns,
                    )
                    # `outputs` is ordered like the `linregress` result, so
                    # position i of the result belongs to outputs[i].
                    for i, output in enumerate(outputs):
                        expected_output_results[output][
                            day, asset_column] = expected_regression_results[i]

            for output in outputs:
                output_result = output_results[output]
                # Cells that the mask excludes are expected to be NaN.
                expected_output_result = pd.DataFrame(
                    np.where(expected_mask, expected_output_results[output],
                             nan),
                    index=dates[start_date_index:end_date_index + 1],
                    columns=assets,
                )
                assert_frame_equal(output_result, expected_output_result)

    def test_simple_beta_matches_regression(self):
        """`SimpleBeta` must agree with the regression factor's ``beta``.

        Both factors target the same asset over a 10-day regression window
        and are evaluated in one pipeline; the two output columns are
        compared without regard to their names.
        """
        target = self.my_asset
        window = 10

        simple = SimpleBeta(target=target, regression_length=window)
        regression = RollingLinearRegressionOfReturns(
            target=target,
            returns_length=2,
            regression_length=window,
        )
        columns = {"simple": simple, "complex": regression.beta}

        output = self.run_pipeline(
            Pipeline(columns),
            self.pipeline_start_date,
            self.pipeline_end_date,
        )
        assert_equal(output["simple"], output["complex"], check_names=False)

    def test_simple_beta_allowed_missing_calculation(self):
        """Check the missing-count parameter derived by `SimpleBeta`.

        With a regression length of 100, each allowed-missing percentage
        must yield the paired ``allowed_missing_count`` parameter.
        """
        window = 100
        # (allowed_missing_percentage, expected allowed_missing_count)
        cases = (
            (0.651, 65),
            (0.659, 65),
            (0.66, 66),
            (0.0, 0),
            (1.0, 100),
        )
        for fraction, count in cases:
            factor = SimpleBeta(
                target=self.my_asset,
                regression_length=window,
                allowed_missing_percentage=fraction,
            )
            observed = factor.params["allowed_missing_count"]
            assert observed == count

    def test_correlation_and_regression_with_bad_asset(self):
        """
        Running `RollingPearsonOfReturns`, `RollingSpearmanOfReturns` or
        `RollingLinearRegressionOfReturns` against a target asset that does
        not exist must raise `NonExistentAssetInTimeFrame`, whether or not a
        mask is supplied.
        """
        my_asset = Equity(
            0,
            exchange_info=ExchangeInfo("TEST", "TEST FULL", "US"),
        )
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        run_pipeline = self.run_pipeline

        # This filter is arbitrary; the important thing is that we test each
        # factor both with and without a specified mask.
        my_asset_filter = AssetID().eq(1)

        for mask in (NotSpecified, my_asset_filter):
            # Arguments common to all three factors.
            shared = dict(target=my_asset, returns_length=3, mask=mask)
            factors = {
                "pearson_factor": RollingPearsonOfReturns(
                    correlation_length=3, **shared),
                "spearman_factor": RollingSpearmanOfReturns(
                    correlation_length=3, **shared),
                "regression_factor": RollingLinearRegressionOfReturns(
                    regression_length=3, **shared),
            }

            # Each factor is run in a pipeline of its own and must fail.
            for column_name, factor in factors.items():
                single_column = Pipeline(columns={column_name: factor})
                with pytest.raises(NonExistentAssetInTimeFrame):
                    run_pipeline(single_column, start_date, end_date)

    def test_require_length_greater_than_one(self):
        """A correlation or regression length of 1 must be rejected.

        Each rolling factor is constructed with its length argument set to
        1 and is expected to raise ``ValueError``.
        """
        my_asset = Equity(
            0,
            exchange_info=ExchangeInfo("TEST", "TEST FULL", "US"),
        )

        # (factor class, name of its length argument)
        cases = (
            (RollingPearsonOfReturns, "correlation_length"),
            (RollingSpearmanOfReturns, "correlation_length"),
            (RollingLinearRegressionOfReturns, "regression_length"),
        )
        for factor_type, length_argument in cases:
            arguments = {
                "target": my_asset,
                "returns_length": 3,
                length_argument: 1,
            }
            with pytest.raises(ValueError):
                factor_type(**arguments)

    def test_simple_beta_input_validation(self):
        """`SimpleBeta` rejects invalid arguments with precise messages.

        Covers a non-Asset target, a regression length below the minimum,
        and an allowed-missing percentage outside the 0.0-1.0 range. The
        messages are matched literally (regex-escaped).
        """
        # (expected exception, expected message, constructor arguments)
        cases = (
            (
                TypeError,
                ("SimpleBeta() expected a value of type"
                 " Asset for argument 'target',"
                 " but got str instead."),
                dict(
                    target="SPY",
                    regression_length=100,
                    allowed_missing_percentage=0.5,
                ),
            ),
            (
                ValueError,
                ("SimpleBeta() expected a value greater than or equal to 3"
                 " for argument 'regression_length', but got 1 instead."),
                dict(
                    target=self.my_asset,
                    regression_length=1,
                    allowed_missing_percentage=0.5,
                ),
            ),
            (
                ValueError,
                ("SimpleBeta() expected a value inclusively between 0.0 and 1.0 "
                 "for argument 'allowed_missing_percentage', but got 50 instead."),
                dict(
                    target=self.my_asset,
                    regression_length=100,
                    allowed_missing_percentage=50,
                ),
            ),
        )

        for error_type, expected, arguments in cases:
            with pytest.raises(error_type, match=re.escape(expected)):
                SimpleBeta(**arguments)

    def test_simple_beta_target(self):
        """`SimpleBeta.target` is the very asset passed to the constructor."""
        settings = dict(
            target=self.my_asset,
            regression_length=50,
            allowed_missing_percentage=0.5,
        )
        factor = SimpleBeta(**settings)
        assert factor.target is self.my_asset

    def test_simple_beta_repr(self):
        """``repr`` of `SimpleBeta` shows target, length and missing count."""
        settings = dict(
            target=self.my_asset,
            regression_length=50,
            allowed_missing_percentage=0.5,
        )
        shown = repr(SimpleBeta(**settings))
        template = "SimpleBeta({}, length=50, allowed_missing=25)"
        assert shown == template.format(self.my_asset)

    def test_simple_beta_graph_repr(self):
        """``graph_repr`` of `SimpleBeta` uses the compact positional form."""
        settings = dict(
            target=self.my_asset,
            regression_length=50,
            allowed_missing_percentage=0.5,
        )
        shown = SimpleBeta(**settings).graph_repr()
        assert shown == "SimpleBeta('A', 50, 25)"
def test_make_meta():
    """Check what ``make_meta`` returns for each kind of input listed below."""
    frame = pd.DataFrame(
        {'a': [1, 2, 3], 'b': list('abc'), 'c': [1., 2., 3.]},
        index=[10, 20, 30],
    )
    index_type = type(frame.index)

    # pandas DataFrame: empty result, same dtypes, same index type.
    from_frame = make_meta(frame)
    assert len(from_frame) == 0
    assert (from_frame.dtypes == frame.dtypes).all()
    assert isinstance(from_frame.index, index_type)

    # pandas Series: empty result, same dtype, same index type.
    from_series = make_meta(frame.a)
    assert len(from_series) == 0
    assert from_series.dtype == frame.a.dtype
    assert isinstance(from_series.index, index_type)

    # pandas Index: empty index of the same type.
    from_index = make_meta(frame.index)
    assert isinstance(from_index, index_type)
    assert len(from_index) == 0

    # Dask collection: its own ``_meta`` object is handed back.
    dask_frame = dd.from_pandas(frame, npartitions=2)
    assert make_meta(dask_frame) is dask_frame._meta

    # Mapping of column name -> dtype.
    from_dict = make_meta({'a': 'i8', 'b': 'O', 'c': 'f8'})
    assert isinstance(from_dict, pd.DataFrame)
    assert len(from_dict) == 0
    assert (from_dict.dtypes == frame.dtypes).all()
    assert isinstance(from_dict.index, pd.RangeIndex)

    # Iterable of (name, dtype) pairs: column order follows the input.
    from_pairs = make_meta([('a', 'i8'), ('c', 'f8'), ('b', 'O')])
    assert (from_pairs.columns == ['a', 'c', 'b']).all()
    assert len(from_pairs) == 0
    assert (from_pairs.dtypes == frame.dtypes[from_pairs.dtypes.index]).all()
    assert isinstance(from_pairs.index, pd.RangeIndex)

    # A single (name, dtype) tuple describes a Series.
    from_tuple = make_meta(('a', 'i8'))
    assert isinstance(from_tuple, pd.Series)
    assert len(from_tuple) == 0
    assert from_tuple.dtype == 'i8'
    assert from_tuple.name == 'a'

    # Explicit index: the index type is kept, but the result stays empty.
    for spec in ({'a': 'i8', 'b': 'i4'}, ('a', 'i8')):
        with_index = make_meta(spec, index=pd.Int64Index([1, 2], name='foo'))
        assert isinstance(with_index.index, pd.Int64Index)
        assert len(with_index.index) == 0

    # Categoricals get the single placeholder category.
    cat_frame = make_meta({'a': 'category'})
    assert len(cat_frame.a.cat.categories) == 1
    assert cat_frame.a.cat.categories[0] == UNKNOWN_CATEGORIES
    cat_series = make_meta(('a', 'category'))
    assert len(cat_series.cat.categories) == 1
    assert cat_series.cat.categories[0] == UNKNOWN_CATEGORIES

    # NumPy scalar and Python scalar both come back as np.float64.
    for scalar in (np.float64(1.0), 1.0):
        assert isinstance(make_meta(scalar), np.float64)

    # A Timestamp is returned as the same object.
    stamp = pd.Timestamp(2000, 1, 1)
    assert make_meta(stamp) is stamp

    # Dtype expressions produce a scalar of the matching NumPy type.
    assert isinstance(make_meta('i8'), np.int64)
    assert isinstance(make_meta(float), np.dtype(float).type)
    assert isinstance(make_meta(np.dtype('bool')), np.bool_)

    # ``None`` is rejected.
    with pytest.raises(TypeError):
        make_meta(None)
Ejemplo n.º 9
0
# Stack train and test rows, order them by ID, and renumber the rows; the
# 'index' column that reset_index() adds is dropped again.
combined_rows = pd.concat([train_data, test_data], axis=0)
all_data = combined_rows.sort_values(by='ID').reset_index().drop(['index'],
                                                                 axis=1)

# Columns screened for outliers (power, average power, site temperature,
# voltage, current and conversion-efficiency readings, plus ID).
bad_feature = [
    'ID', '功率A', '功率B', '功率C', '平均功率', '现场温度', '电压A', '电压B', '电压C', '电流B',
    '电流C', '转换效率', '转换效率A', '转换效率B', '转换效率C'
]

# Row labels where at least one screened column is more than 2 standard
# deviations away from that column's mean.
feature_values = all_data[bad_feature]
feature_mean = feature_values.mean()
feature_std = feature_values.std()
above_band = feature_values > feature_mean + 2 * feature_std
below_band = feature_values < feature_mean - 2 * feature_std
bad_index1 = feature_values[above_band | below_band].dropna(how='all').index

# Row labels where any of the three voltage columns is non-zero yet below 500.
low_voltage_a = (all_data['电压A'] < 500) & (all_data['电压A'] != 0)
low_voltage_b = (all_data['电压B'] < 500) & (all_data['电压B'] != 0)
low_voltage_c = (all_data['电压C'] < 500) & (all_data['电压C'] != 0)
bad_index2 = all_data[low_voltage_a | low_voltage_b | low_voltage_c].index

bad_index = pd.Int64Index(list(bad_index1) + list(bad_index2))

# Flagged rows together with the rows labelled one below and one above them.
neighbour_labels = np.concatenate([bad_index - 1, bad_index, bad_index + 1])
nn_bad_data = all_data.loc[neighbour_labels].sort_values(
    by='ID', ascending=True).drop_duplicates()
bad_data = all_data.loc[bad_index].sort_values(by='ID', ascending=True)

# Original note: replace outliers with the mean of the records above and below.
# The visible part of the loop only finds, for each flagged row, the columns
# lying more than 3 standard deviations from the column mean. The statistics
# are recomputed from all_data on every pass, exactly as before.
for idx, line in bad_data.iterrows():
    ID = line['ID']
    row_values = line[bad_feature]
    upper_band = (all_data[bad_feature].mean() +
                  3 * all_data[bad_feature].std())
    lower_band = (all_data[bad_feature].mean() -
                  3 * all_data[bad_feature].std())
    col_index = row_values[(row_values > upper_band) |
                           (row_values < lower_band)].index
Ejemplo n.º 10
0
    def restore_dataframe(
        store,
        key,
        filter_query=None,
        columns=None,
        predicate_pushdown_to_io=True,
        categories=None,
        predicates=None,
        date_as_object=False,
    ):
        """Read the Parquet object stored under ``key`` and return a DataFrame.

        Two read strategies are used:

        * Full read: when ``predicate_pushdown_to_io`` is false, or when
          neither ``columns`` nor ``predicates`` is given, the whole object is
          fetched with ``store.get(key)`` and parsed in one go.
        * Partial read: otherwise a reader is opened and, if predicates are
          present and the file has rows, only the row groups selected by
          ``_read_row_groups_into_tables`` are loaded.

        Parameters
        ----------
        store
            Object providing ``get(key)`` and ``open(key)``.
        key
            Key of the stored Parquet object.
        filter_query
            Passed to ``filter_df`` when no ``predicates`` are given.
        columns
            Columns to return; ``None`` returns all columns.
        predicate_pushdown_to_io
            When false, always use the full-read strategy.
        categories
            Forwarded to ``table.to_pandas``.
        predicates
            Validated with ``check_predicates``; used for row-group selection
            and for the final ``filter_df_from_predicates`` call.
        date_as_object
            Forwarded to ``to_pandas`` and, as ``strict_date_types``, to
            ``filter_df_from_predicates``.

        Returns
        -------
        The filtered DataFrame, restricted to ``columns`` when given.

        Raises
        ------
        ValueError
            If any requested column is missing from the loaded table.
        """
        check_predicates(predicates)
        # If we want to do columnar access we can benefit from partial reads
        # otherwise full read en bloc is the better option.
        if (not predicate_pushdown_to_io) or (columns is None and predicates is None):
            with pa.BufferReader(store.get(key)) as reader:
                table = pq.read_pandas(reader, columns=columns)
        else:
            if HAVE_BOTO and isinstance(store, BotoStore):
                # Parquet and seeks on S3 currently leak connections thus
                # we omit column projection to the store.
                reader = pa.BufferReader(store.get(key))
            else:
                reader = store.open(key)
                # Buffer at least 4 MB in requests. This is chosen because the default block size of the Azure
                # storage client is 4MB.
                reader = BlockBuffer(reader, 4 * 1024 * 1024)
            # The reader is closed in the ``finally`` clause whichever branch runs.
            try:
                parquet_file = ParquetFile(reader)
                if predicates and parquet_file.metadata.num_rows > 0:
                    # We need to calculate different predicates for predicate
                    # pushdown and the later DataFrame filtering. This is required
                    # e.g. in the case where we have an `in` predicate as this has
                    # different normalized values.
                    columns_to_io = _columns_for_pushdown(columns, predicates)
                    predicates_for_pushdown = _normalize_predicates(
                        parquet_file, predicates, True
                    )
                    # NOTE: ``predicates`` is rebound here, so the DataFrame
                    # filtering at the end sees the normalized form.
                    predicates = _normalize_predicates(parquet_file, predicates, False)
                    tables = _read_row_groups_into_tables(
                        parquet_file, columns_to_io, predicates_for_pushdown
                    )

                    if len(tables) == 0:
                        # No row group was returned: build an empty table that
                        # still carries the file's schema.
                        if ARROW_LARGER_EQ_0130:
                            table = parquet_file.schema.to_arrow_schema().empty_table()
                        else:
                            table = _empty_table_from_schema(parquet_file)
                    else:
                        table = pa.concat_tables(tables)
                else:
                    # ARROW-5139 Column projection with empty columns returns a table w/out index
                    if ARROW_LARGER_EQ_0130 and columns == []:
                        # Create an arrow table with expected index length.
                        df = (
                            parquet_file.schema.to_arrow_schema()
                            .empty_table()
                            .to_pandas(date_as_object=date_as_object)
                        )
                        index = pd.Int64Index(
                            pd.RangeIndex(start=0, stop=parquet_file.metadata.num_rows)
                        )
                        df = pd.DataFrame(df, index=index)
                        # convert back to table to keep downstream code untouched by this patch
                        table = pa.Table.from_pandas(df)
                    else:
                        table = pq.read_pandas(reader, columns=columns)
            finally:
                reader.close()

        # Compatibility fix-ups applied to the loaded table, in this order.
        table = _fix_pyarrow_07992_table(table)

        table = _fix_pyarrow_0130_table(table)

        if columns is not None:
            # Fail loudly, naming every requested column that is absent.
            missing_columns = set(columns) - set(table.schema.names)
            if missing_columns:
                raise ValueError(
                    u"Columns cannot be found in stored dataframe: {missing}".format(
                        missing=u", ".join(sorted(missing_columns))
                    )
                )

        df = table.to_pandas(categories=categories, date_as_object=date_as_object)
        df.columns = df.columns.map(ensure_unicode_string_type)
        # Predicates take precedence: ``filter_query`` is applied only when no
        # predicates were given.
        if predicates:
            df = filter_df_from_predicates(
                df, predicates, strict_date_types=date_as_object
            )
        else:
            df = filter_df(df, filter_query)
        if columns is not None:
            return df.loc[:, columns]
        else:
            return df
def test_meta_nonempty_index():
    """Check that ``meta_nonempty`` keeps an index's type and metadata.

    For every index flavour built below, the result must have the same
    concrete type and name as the input, plus the flavour-specific attributes
    asserted in each section (tz, freq, categories, ordered, level types and
    level names).
    """
    idx = pd.RangeIndex(1, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    idx = pd.Int64Index([1], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Int64Index
    assert res.name == idx.name

    idx = pd.Index(['a'], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    idx = pd.DatetimeIndex(['1970-01-01'], freq='d',
                           tz='America/New_York', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(['1970-01-01'], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, 'D')], freq='d', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(['xyx'], ['xyx', 'zzz'], ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    # An empty categorical index whose only category is the placeholder.
    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES], ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    # ``codes`` is the current name of the MultiIndex argument that used to be
    # called ``labels``; the old keyword was removed in pandas 1.0.
    levels = [pd.Int64Index([1], name='a'),
              pd.Float64Index([1.0], name='b')]
    idx = pd.MultiIndex(levels=levels, codes=[[0], [0]], names=['a', 'b'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [pd.Int64Index([1], name='a'),
              pd.CategoricalIndex(data=['xyx'], categories=['xyx'], name='b'),
              pd.TimedeltaIndex([np.timedelta64(1, 'D')], name='timedelta')]
    idx = pd.MultiIndex(levels=levels, codes=[[0], [0], [0]],
                        names=['a', 'b', 'timedelta'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
Ejemplo n.º 12
0
    def from_stimulus_file(
            cls,
            stimulus_file: StimulusFile,
            stimulus_timestamps: StimulusTimestamps,
            limit_to_images: Optional[List] = None) -> "Presentations":
        """Get stimulus presentation data.

        :param stimulus_file
            Source of the raw stimulus data (its ``data`` attribute is read)
        :param stimulus_timestamps
            Timestamps wrapper; its ``value`` attribute is used
        :param limit_to_images
            Only return images given by these image names. When given, the
            remaining rows are renumbered from 0.

        :returns: Presentations --
            Wrapper around a table whose rows are stimulus presentations
            (i.e. a given image, for a given duration, typically 250 ms)
            and whose columns are presentation characteristics.
        :raises ValueError
            If both `image_name` and `orientation` are entirely null, or if
            merging in the stimulus metadata changes the number of rows.
        """
        stimulus_timestamps = stimulus_timestamps.value
        data = stimulus_file.data
        raw_stim_pres_df = get_stimulus_presentations(data,
                                                      stimulus_timestamps)

        # Fill in nulls for image_name
        # This makes two assumptions:
        #   1. Nulls in `image_name` should be "gratings_<orientation>"
        #   2. Gratings are only present (or need to be fixed) when all
        #      values for `image_name` are null.
        if pd.isnull(raw_stim_pres_df["image_name"]).all():
            # `not` rather than `~`: bitwise inversion only acts as logical
            # negation on numpy booleans; on a plain Python bool it yields
            # -1 / -2, both truthy, and the error branch would be unreachable.
            if not pd.isnull(raw_stim_pres_df["orientation"]).all():
                raw_stim_pres_df["image_name"] = (
                    raw_stim_pres_df["orientation"].apply(
                        lambda x: f"gratings_{x}"))
            else:
                raise ValueError("All values for 'orientation' and "
                                 "'image_name' are null.")

        stimulus_metadata_df = get_stimulus_metadata(data)

        # Attach per-image metadata, then re-key it by presentation start time
        # so it can be joined back onto the raw presentations below.
        idx_name = raw_stim_pres_df.index.name
        stimulus_index_df = (raw_stim_pres_df.reset_index().merge(
            stimulus_metadata_df.reset_index(),
            on=["image_name"]).set_index(idx_name))
        stimulus_index_df = (stimulus_index_df[[
            "image_set", "image_index", "start_time", "phase",
            "spatial_frequency"
        ]].rename(columns={
            "start_time": "timestamps"
        }).sort_index().set_index("timestamps", drop=True))
        stim_pres_df = raw_stim_pres_df.merge(stimulus_index_df,
                                              left_on="start_time",
                                              right_index=True,
                                              how="left")
        # A left merge must not add or drop presentations.
        if len(raw_stim_pres_df) != len(stim_pres_df):
            raise ValueError("Length of `stim_pres_df` should not change after"
                             f" merge; was {len(raw_stim_pres_df)}, now "
                             f" {len(stim_pres_df)}.")

        stim_pres_df['is_change'] = is_change_event(
            stimulus_presentations=stim_pres_df)

        # Sort columns then drop columns which contain only all NaN values
        stim_pres_df = \
            stim_pres_df[sorted(stim_pres_df)].dropna(axis=1, how='all')
        if limit_to_images is not None:
            stim_pres_df = \
                stim_pres_df[stim_pres_df['image_name'].isin(limit_to_images)]
            # Renumber the surviving rows 0..n-1, keeping the index name.
            stim_pres_df.index = pd.Int64Index(range(stim_pres_df.shape[0]),
                                               name=stim_pres_df.index.name)
        stim_pres_df = cls._postprocess(presentations=stim_pres_df)
        return Presentations(presentations=stim_pres_df)
Ejemplo n.º 13
0
def test_make_meta():
    """Check what ``make_meta`` returns for each kind of input listed below."""
    frame = pd.DataFrame(
        {"a": [1, 2, 3], "b": list("abc"), "c": [1.0, 2.0, 3.0]},
        index=[10, 20, 30],
    )
    index_type = type(frame.index)

    # pandas DataFrame: empty result, same dtypes, same index type.
    from_frame = make_meta(frame)
    assert len(from_frame) == 0
    assert (from_frame.dtypes == frame.dtypes).all()
    assert isinstance(from_frame.index, index_type)

    # pandas Series: empty result, same dtype, same index type.
    from_series = make_meta(frame.a)
    assert len(from_series) == 0
    assert from_series.dtype == frame.a.dtype
    assert isinstance(from_series.index, index_type)

    # pandas Index: empty index of the same type.
    from_index = make_meta(frame.index)
    assert isinstance(from_index, index_type)
    assert len(from_index) == 0

    # Dask collection: its own ``_meta`` object is handed back.
    dask_frame = dd.from_pandas(frame, npartitions=2)
    assert make_meta(dask_frame) is dask_frame._meta

    # Mapping of column name -> dtype.
    from_dict = make_meta({"a": "i8", "b": "O", "c": "f8"})
    assert isinstance(from_dict, pd.DataFrame)
    assert len(from_dict) == 0
    assert (from_dict.dtypes == frame.dtypes).all()
    assert isinstance(from_dict.index, pd.RangeIndex)

    # Iterable of (name, dtype) pairs: column order follows the input.
    from_pairs = make_meta([("a", "i8"), ("c", "f8"), ("b", "O")])
    assert (from_pairs.columns == ["a", "c", "b"]).all()
    assert len(from_pairs) == 0
    assert (from_pairs.dtypes == frame.dtypes[from_pairs.dtypes.index]).all()
    assert isinstance(from_pairs.index, pd.RangeIndex)

    # A single (name, dtype) tuple describes a Series.
    from_tuple = make_meta(("a", "i8"))
    assert isinstance(from_tuple, pd.Series)
    assert len(from_tuple) == 0
    assert from_tuple.dtype == "i8"
    assert from_tuple.name == "a"

    # Explicit index: the index type is kept, but the result stays empty.
    for spec in ({"a": "i8", "b": "i4"}, ("a", "i8")):
        with_index = make_meta(spec, index=pd.Int64Index([1, 2], name="foo"))
        assert isinstance(with_index.index, pd.Int64Index)
        assert len(with_index.index) == 0

    # Categoricals get the single placeholder category.
    cat_frame = make_meta({"a": "category"}, parent_meta=frame)
    assert len(cat_frame.a.cat.categories) == 1
    assert cat_frame.a.cat.categories[0] == UNKNOWN_CATEGORIES
    cat_series = make_meta(("a", "category"), parent_meta=frame)
    assert len(cat_series.cat.categories) == 1
    assert cat_series.cat.categories[0] == UNKNOWN_CATEGORIES

    # NumPy scalar and Python scalar both come back as np.float64.
    for scalar in (np.float64(1.0), 1.0):
        assert isinstance(make_meta(scalar, parent_meta=frame), np.float64)

    # A Timestamp is returned as the same object.
    stamp = pd.Timestamp(2000, 1, 1)
    assert make_meta(stamp, parent_meta=frame) is stamp

    # DatetimeTZDtype maps to a Timestamp with the dtype's tz and unit.
    tz_dtype = pd.DatetimeTZDtype(tz="UTC")
    assert make_meta(tz_dtype) == pd.Timestamp(
        1, tz=tz_dtype.tz, unit=tz_dtype.unit)

    # Dtype expressions produce a scalar of the matching NumPy type.
    assert isinstance(make_meta("i8", parent_meta=frame), np.int64)
    assert isinstance(make_meta(float, parent_meta=frame),
                      np.dtype(float).type)
    assert isinstance(make_meta(np.dtype("bool"), parent_meta=frame), np.bool_)

    # ``None`` is rejected.
    with pytest.raises(TypeError):
        make_meta(None)
Ejemplo n.º 14
0
def test_meta_nonempty_index():
    """Check that ``meta_nonempty`` keeps an index's type and metadata."""

    def check_multi(levels, names, codes):
        # Build a MultiIndex and verify the result level by level.
        original = pd.MultiIndex(levels=levels, names=names, codes=codes)
        rebuilt = meta_nonempty(original)
        assert type(rebuilt) is pd.MultiIndex
        for before, after in zip(original.levels, rebuilt.levels):
            assert type(before) is type(after)
            assert before.name == after.name
        assert rebuilt.names == original.names

    # RangeIndex
    source = pd.RangeIndex(1, name="foo")
    result = meta_nonempty(source)
    assert type(result) is pd.RangeIndex
    assert result.name == source.name

    # Int64Index
    source = pd.Int64Index([1], name="foo")
    result = meta_nonempty(source)
    assert type(result) is pd.Int64Index
    assert result.name == source.name

    # Plain Index holding strings
    source = pd.Index(["a"], name="foo")
    result = meta_nonempty(source)
    assert type(result) is pd.Index
    assert result.name == source.name

    # Timezone-aware DatetimeIndex: tz and freq must survive as well.
    source = pd.DatetimeIndex(
        ["1970-01-01"], freq="d", tz="America/New_York", name="foo")
    result = meta_nonempty(source)
    assert type(result) is pd.DatetimeIndex
    assert result.tz == source.tz
    assert result.freq == source.freq
    assert result.name == source.name

    # PeriodIndex
    source = pd.PeriodIndex(["1970-01-01"], freq="d", name="foo")
    result = meta_nonempty(source)
    assert type(result) is pd.PeriodIndex
    assert result.freq == source.freq
    assert result.name == source.name

    # TimedeltaIndex
    source = pd.TimedeltaIndex([np.timedelta64(1, "D")], freq="d", name="foo")
    result = meta_nonempty(source)
    assert type(result) is pd.TimedeltaIndex
    assert result.freq == source.freq
    assert result.name == source.name

    # Ordered CategoricalIndex with known categories.
    source = pd.CategoricalIndex(
        ["xyx"], ["xyx", "zzz"], ordered=True, name="foo")
    result = meta_nonempty(source)
    assert type(result) is pd.CategoricalIndex
    assert (result.categories == source.categories).all()
    assert result.ordered == source.ordered
    assert result.name == source.name

    # Empty CategoricalIndex whose only category is the placeholder.
    source = pd.CategoricalIndex(
        [], [UNKNOWN_CATEGORIES], ordered=True, name="foo")
    result = meta_nonempty(source)
    assert type(result) is pd.CategoricalIndex
    assert result.ordered == source.ordered
    assert result.name == source.name

    # Two-level MultiIndex (integer and float levels).
    check_multi(
        levels=[pd.Int64Index([1], name="a"),
                pd.Float64Index([1.0], name="b")],
        names=["a", "b"],
        codes=[[0], [0]],
    )

    # Three-level MultiIndex (integer, categorical and timedelta levels).
    check_multi(
        levels=[
            pd.Int64Index([1], name="a"),
            pd.CategoricalIndex(data=["xyx"], categories=["xyx"], name="b"),
            pd.TimedeltaIndex([np.timedelta64(1, "D")], name="timedelta"),
        ],
        names=["a", "b", "timedelta"],
        codes=[[0], [0], [0]],
    )
Ejemplo n.º 15
0
 def fix_df_index_impl(index):
     # Pass the index through fix_df_array and wrap the result in an Int64Index.
     return pd.Int64Index(fix_df_array(index))
Ejemplo n.º 16
0
@pytest.fixture(params=zeros)
def zero(request):
    """
    Return one entry of ``zeros`` per parametrized run.

    For testing division by (or of) zero for Index with length 5, this
    gives several scalar-zeros and length-5 vector-zeros.
    """
    return request.param


# ------------------------------------------------------------------
# Vector Fixtures


@pytest.fixture(
    params=[
        index_cls(np.arange(5, dtype=dtype_name))
        for index_cls, dtype_name in (
            (pd.Float64Index, "float64"),
            (pd.Int64Index, "int64"),
            (pd.UInt64Index, "uint64"),
        )
    ] + [pd.RangeIndex(5)],
    ids=lambda index: type(index).__name__,
)
def numeric_idx(request):
    """
    One length-5 numeric Index per run: Float64Index, Int64Index,
    UInt64Index and RangeIndex, each identified by its class name.
    """
    return request.param


# ------------------------------------------------------------------
# Scalar Fixtures
Ejemplo n.º 17
0
 def setup(self, keep):
     # Build the three indexes used by this class; ``keep`` is not read here.
     size = 10**5
     repeated_ints = np.arange(size).repeat(5)
     repeated_floats = np.random.randn(size).repeat(5)
     self.int_idx = pd.Int64Index(repeated_ints)
     self.float_idx = pd.Float64Index(repeated_floats)
     self.string_idx = tm.makeStringIndex(size)
Ejemplo n.º 18
0
def test_filter_index_value():
    """``filter_index_value`` must agree with boolean masking in pandas.

    Each bounds tuple is compared against the mask it corresponds to in the
    assertions: ``(low, low_inclusive, high, high_inclusive)``.
    """

    def filtered_list(value, bounds, **kwargs):
        # Filter, convert back to pandas and flatten to a plain list.
        return filter_index_value(value, bounds, **kwargs).to_pandas().tolist()

    # Ascending range with step 1.
    pd_index = pd.RangeIndex(10)
    index_value = parse_index(pd_index)

    expected_closed = pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist()
    assert filtered_list(index_value, (0, True, 9, True)) == expected_closed

    expected_open = pd_index[(pd_index > 0) & (pd_index < 9)].tolist()
    assert filtered_list(index_value, (0, False, 9, False)) == expected_open

    # Ascending range with step 3.
    pd_index = pd.RangeIndex(1, 11, 3)
    index_value = parse_index(pd_index)

    expected_closed = pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist()
    assert filtered_list(index_value, (2, True, 10, True)) == expected_closed

    expected_open = pd_index[(pd_index > 2) & (pd_index < 10)].tolist()
    assert filtered_list(index_value, (2, False, 10, False)) == expected_open

    # Descending range with step -1.
    pd_index = pd.RangeIndex(9, -1, -1)
    index_value = parse_index(pd_index)

    expected_closed = pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist()
    assert filtered_list(index_value, (0, True, 9, True)) == expected_closed

    expected_open = pd_index[(pd_index > 0) & (pd_index < 9)].tolist()
    assert filtered_list(index_value, (0, False, 9, False)) == expected_open

    # Descending range with step -3, parsed with store_data=False.
    pd_index = pd.RangeIndex(10, 0, -3)
    index_value = parse_index(pd_index, store_data=False)

    expected_closed = pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist()
    assert filtered_list(index_value, (2, True, 10, True)) == expected_closed

    expected_open = pd_index[(pd_index > 2) & (pd_index < 10)].tolist()
    assert filtered_list(index_value, (2, False, 10, False)) == expected_open

    # Int64Index with stored data: filtering returns the matching values.
    pd_index = pd.Int64Index([0, 3, 8])
    index_value = parse_index(pd_index, store_data=True)

    expected_half_open = pd_index[(pd_index >= 2) & (pd_index < 8)].tolist()
    assert filtered_list(
        index_value, (2, True, 8, False),
        store_data=True) == expected_half_open

    # Same index parsed without store_data: the result is empty but is
    # still typed as an Int64Index value.
    index_value = parse_index(pd_index)

    filtered = filter_index_value(index_value, (2, True, 8, False))
    assert len(filtered.to_pandas().tolist()) == 0
    assert isinstance(filtered.value, IndexValue.Int64Index)
Ejemplo n.º 19
0
class StatisticalMethodsTestCase(zf.WithSeededRandomPipelineEngine,
                                 zf.ZiplineTestCase):
    sids = ASSET_FINDER_EQUITY_SIDS = pd.Int64Index([1, 2, 3])
    START_DATE = pd.Timestamp("2015-01-31", tz="UTC")
    END_DATE = pd.Timestamp("2015-03-01", tz="UTC")
    ASSET_FINDER_COUNTRY_CODE = "US"
    SEEDED_RANDOM_PIPELINE_DEFAULT_DOMAIN = US_EQUITIES

    @classmethod
    def init_class_fixtures(cls):
        """Store the dates, assets, masks and expected mask results on the class."""
        super(StatisticalMethodsTestCase, cls).init_class_fixtures()

        # Using these start and end dates because they are a contiguous span of
        # 5 days (Monday - Friday) and they allow for plenty of days to look
        # back on when computing correlations and regressions.
        dates = cls.trading_days
        start_date_index = 14
        end_date_index = 18
        cls.dates = dates
        cls.start_date_index = start_date_index
        cls.end_date_index = end_date_index
        cls.pipeline_start_date = cls.trading_days[start_date_index]
        cls.pipeline_end_date = cls.trading_days[end_date_index]

        sids = cls.sids
        assets = cls.asset_finder.retrieve_all(sids)
        my_asset_column = 0
        cls.assets = assets
        cls.my_asset_column = my_asset_column
        cls.my_asset = assets[my_asset_column]

        num_days = end_date_index - start_date_index + 1
        num_assets = len(assets)
        cls.num_days = num_days
        cls.num_assets = num_assets

        # All expected mask results share the (days, assets) shape.
        mask_shape = (num_days, num_assets)

        cascade_threshold = sids[-1] + dates[start_date_index].day
        cls.cascading_mask = AssetIDPlusDay() < cascade_threshold
        cls.expected_cascading_mask_result = make_cascading_boolean_array(
            shape=mask_shape)
        cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0)
        cls.expected_alternating_mask_result = make_alternating_boolean_array(
            shape=mask_shape)
        cls.expected_no_mask_result = np.full(
            shape=mask_shape,
            fill_value=True,
            dtype=bool_dtype,
        )

        # Random input for factors.
        cls.col = TestingDataSet.float_col

    @parameter_space(returns_length=[2, 3], correlation_length=[3, 4])
    def test_factor_correlation_methods(self, returns_length,
                                        correlation_length):
        """
        `Factor.pearsonr` and `Factor.spearmanr` must produce the same frames
        as the built-in `RollingPearsonOfReturns` and
        `RollingSpearmanOfReturns` factors.
        """
        target_asset = self.my_asset

        returns = Returns(window_length=returns_length, inputs=[self.col])
        target_slice = returns[target_asset]

        # Factors under test, built through the Factor methods.
        pearson = returns.pearsonr(
            target=target_slice,
            correlation_length=correlation_length,
        )
        spearman = returns.spearmanr(
            target=target_slice,
            correlation_length=correlation_length,
        )

        # Reference factors, built through the dedicated classes.
        builtin_kwargs = dict(
            target=target_asset,
            returns_length=returns_length,
            correlation_length=correlation_length,
        )
        expected_pearson = RollingPearsonOfReturns(**builtin_kwargs)
        expected_spearman = RollingSpearmanOfReturns(**builtin_kwargs)

        # These built-ins construct their own Returns factor to use as inputs,
        # so the only way to set our own inputs is to do so after the fact.
        # This should not be done in practice. It is necessary here because we
        # want Returns to use our random data as an input, but by default it is
        # using USEquityPricing.close.
        for reference in (expected_pearson, expected_spearman):
            reference.inputs = [returns, target_slice]

        pipeline = Pipeline(columns={
            "pearson": pearson,
            "spearman": spearman,
            "expected_pearson": expected_pearson,
            "expected_spearman": expected_spearman,
        })
        results = self.run_pipeline(
            pipeline,
            self.pipeline_start_date,
            self.pipeline_end_date,
        )

        for name in ("pearson", "spearman"):
            assert_frame_equal(
                results[name].unstack(),
                results["expected_" + name].unstack(),
            )

    def test_correlation_methods_bad_type(self):
        """
        The Factor correlation methods must raise TypeError when either the
        factor or the target slice has a datetime64 dtype.
        """
        # These are arbitrary for the purpose of this test.
        returns_length = 2
        correlation_length = 10

        returns = Returns(window_length=returns_length, inputs=[self.col])
        returns_slice = returns[self.my_asset]

        class BadTypeFactor(CustomFactor):
            inputs = []
            window_length = 1
            dtype = datetime64ns_dtype
            window_safe = True

            def compute(self, today, assets, out):
                pass

        bad_type_factor = BadTypeFactor()
        bad_type_factor_slice = bad_type_factor[self.my_asset]

        # First the bad factor correlated against a valid target, then a
        # valid factor correlated against the bad target.
        invalid_pairings = (
            (bad_type_factor, returns_slice),
            (returns, bad_type_factor_slice),
        )
        for factor, target in invalid_pairings:
            for method_name in ("pearsonr", "spearmanr"):
                with pytest.raises(TypeError):
                    getattr(factor, method_name)(
                        target=target,
                        correlation_length=correlation_length,
                    )

    @parameter_space(returns_length=[2, 3], regression_length=[3, 4])
    def test_factor_regression_method(self, returns_length, regression_length):
        """
        `Factor.linear_regression` must produce the same frame as the
        built-in `RollingLinearRegressionOfReturns` factor.
        """
        target_asset = self.my_asset

        returns = Returns(window_length=returns_length, inputs=[self.col])
        target_slice = returns[target_asset]

        # Factor under test, built through the Factor method.
        regression = returns.linear_regression(
            target=target_slice,
            regression_length=regression_length,
        )
        # Reference factor, built through the dedicated class.
        expected_regression = RollingLinearRegressionOfReturns(
            target=target_asset,
            returns_length=returns_length,
            regression_length=regression_length,
        )

        # This built-in constructs its own Returns factor to use as an input,
        # so the only way to set our own input is to do so after the fact. This
        # should not be done in practice. It is necessary here because we want
        # Returns to use our random data as an input, but by default it is
        # using USEquityPricing.close.
        expected_regression.inputs = [returns, target_slice]

        pipeline = Pipeline(columns={
            "regression": regression,
            "expected_regression": expected_regression,
        })
        results = self.run_pipeline(
            pipeline,
            self.pipeline_start_date,
            self.pipeline_end_date,
        )

        assert_frame_equal(
            results["regression"].unstack(),
            results["expected_regression"].unstack(),
        )

    def test_regression_method_bad_type(self):
        """
        `Factor.linear_regression` must raise TypeError when either the
        factor or the target slice has a datetime64 dtype.
        """
        # These are arbitrary for the purpose of this test.
        returns_length = 2
        regression_length = 10

        returns = Returns(window_length=returns_length, inputs=[self.col])
        returns_slice = returns[self.my_asset]

        class BadTypeFactor(CustomFactor):
            window_length = 1
            inputs = []
            dtype = datetime64ns_dtype
            window_safe = True

            def compute(self, today, assets, out):
                pass

        bad_type_factor = BadTypeFactor()
        bad_type_factor_slice = bad_type_factor[self.my_asset]

        # First the bad factor regressed against a valid target, then a
        # valid factor regressed against the bad target.
        invalid_pairings = (
            (bad_type_factor, returns_slice),
            (returns, bad_type_factor_slice),
        )
        for factor, target in invalid_pairings:
            with pytest.raises(TypeError):
                factor.linear_regression(
                    target=target,
                    regression_length=regression_length,
                )

    @parameter_space(correlation_length=[2, 3, 4])
    def test_factor_correlation_methods_two_factors(self, correlation_length):
        """
        Tests for `Factor.pearsonr` and `Factor.spearmanr` when passed another
        2D factor instead of a Slice.

        The test first checks that both methods raise `IncompatibleTerms` when
        the two factors carry different masks. It then runs both methods in a
        pipeline and compares each output frame against coefficients computed
        column by column with `pearsonr` and `spearmanr`.
        """
        assets = self.assets
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # Ensure that the correlation methods cannot be called with two 2D
        # factors which have different masks.
        returns_masked_1 = Returns(
            window_length=5,
            inputs=[self.col],
            mask=AssetID().eq(1),
        )
        returns_masked_2 = Returns(
            window_length=5,
            inputs=[self.col],
            mask=AssetID().eq(2),
        )
        with pytest.raises(IncompatibleTerms):
            returns_masked_1.pearsonr(
                target=returns_masked_2,
                correlation_length=correlation_length,
            )
        with pytest.raises(IncompatibleTerms):
            returns_masked_1.spearmanr(
                target=returns_masked_2,
                correlation_length=correlation_length,
            )

        returns_5 = Returns(window_length=5, inputs=[self.col])
        returns_10 = Returns(window_length=10, inputs=[self.col])

        pearson_factor = returns_5.pearsonr(
            target=returns_10,
            correlation_length=correlation_length,
        )
        spearman_factor = returns_5.spearmanr(
            target=returns_10,
            correlation_length=correlation_length,
        )

        columns = {
            "pearson_factor": pearson_factor,
            "spearman_factor": spearman_factor,
        }
        pipeline = Pipeline(columns=columns)

        results = run_pipeline(pipeline, start_date, end_date)
        pearson_results = results["pearson_factor"].unstack()
        spearman_results = results["spearman_factor"].unstack()

        # Run a separate pipeline that calculates returns starting
        # (correlation_length - 1) days prior to our start date. This is
        # because we need (correlation_length - 1) extra days of returns to
        # compute our expected correlations.
        columns = {"returns_5": returns_5, "returns_10": returns_10}
        results = run_pipeline(
            Pipeline(columns=columns),
            dates[start_date_index - (correlation_length - 1)],
            dates[end_date_index],
        )
        returns_5_results = results["returns_5"].unstack()
        returns_10_results = results["returns_10"].unstack()

        # On each day, calculate the expected correlation coefficients
        # between each asset's 5 and 10 day rolling returns. Each correlation
        # is calculated over `correlation_length` days.
        expected_pearson_results = np.full_like(pearson_results, nan)
        expected_spearman_results = np.full_like(spearman_results, nan)
        for day in range(num_days):
            window = slice(day, day + correlation_length)
            todays_returns_5 = returns_5_results.iloc[window]
            todays_returns_10 = returns_10_results.iloc[window]
            # `DataFrame.items()` yields (column label, Series) pairs. It
            # replaces `iteritems()`, which was removed in pandas 2.0.
            for asset, asset_returns_5 in todays_returns_5.items():
                # NOTE(review): this maps an asset to its output column as
                # `int(asset) - 1`, i.e. it assumes asset ids run 1..N in
                # column order -- confirm against the fixture's sids.
                asset_column = int(asset) - 1
                asset_returns_10 = todays_returns_10[asset]
                # Both functions return a tuple whose first element is the
                # coefficient; only that element is compared.
                expected_pearson_results[day, asset_column] = pearsonr(
                    asset_returns_5,
                    asset_returns_10,
                )[0]
                expected_spearman_results[day, asset_column] = spearmanr(
                    asset_returns_5,
                    asset_returns_10,
                )[0]

        expected_pearson_results = pd.DataFrame(
            data=expected_pearson_results,
            index=dates[start_date_index:end_date_index + 1],
            columns=assets,
        )
        assert_frame_equal(pearson_results, expected_pearson_results)

        expected_spearman_results = pd.DataFrame(
            data=expected_spearman_results,
            index=dates[start_date_index:end_date_index + 1],
            columns=assets,
        )
        assert_frame_equal(spearman_results, expected_spearman_results)

    @parameter_space(regression_length=[2, 3, 4])
    def test_factor_regression_method_two_factors(self, regression_length):
        """
        Tests for `Factor.linear_regression` when passed another 2D factor
        instead of a Slice.

        The test first checks that the method raises `IncompatibleTerms` when
        the two factors carry different masks. It then runs every regression
        output in a pipeline and compares each one against values computed
        column by column with `linregress`.
        """
        assets = self.assets
        dates = self.dates
        start_date = self.pipeline_start_date
        end_date = self.pipeline_end_date
        start_date_index = self.start_date_index
        end_date_index = self.end_date_index
        num_days = self.num_days
        run_pipeline = self.run_pipeline

        # The order of these is meant to align with the output of `linregress`.
        outputs = ["beta", "alpha", "r_value", "p_value", "stderr"]

        # Ensure that the `linear_regression` method cannot be called with two
        # 2D factors which have different masks.
        returns_masked_1 = Returns(
            window_length=5,
            inputs=[self.col],
            mask=AssetID().eq(1),
        )
        returns_masked_2 = Returns(
            window_length=5,
            inputs=[self.col],
            mask=AssetID().eq(2),
        )
        with pytest.raises(IncompatibleTerms):
            returns_masked_1.linear_regression(
                target=returns_masked_2,
                regression_length=regression_length,
            )

        returns_5 = Returns(window_length=5, inputs=[self.col])
        returns_10 = Returns(window_length=10, inputs=[self.col])

        regression_factor = returns_5.linear_regression(
            target=returns_10,
            regression_length=regression_length,
        )

        # One pipeline column per regression output attribute.
        columns = {
            output: getattr(regression_factor, output)
            for output in outputs
        }
        pipeline = Pipeline(columns=columns)

        results = run_pipeline(pipeline, start_date, end_date)

        # For every output keep the actual frame and a same-shaped array of
        # NaNs that the loop below fills with the expected values.
        output_results = {}
        expected_output_results = {}
        for output in outputs:
            output_results[output] = results[output].unstack()
            expected_output_results[output] = np.full_like(
                output_results[output],
                nan,
            )

        # Run a separate pipeline that calculates returns starting
        # (regression_length - 1) days prior to our start date. This is because
        # we need (regression_length - 1) extra days of returns to compute our
        # expected regressions.
        columns = {"returns_5": returns_5, "returns_10": returns_10}
        results = run_pipeline(
            Pipeline(columns=columns),
            dates[start_date_index - (regression_length - 1)],
            dates[end_date_index],
        )
        returns_5_results = results["returns_5"].unstack()
        returns_10_results = results["returns_10"].unstack()

        # On each day, for each asset, calculate the expected regression
        # results of Y ~ X where Y is the asset's rolling 5 day returns and X
        # is the asset's rolling 10 day returns. Each regression is calculated
        # over `regression_length` days of data.
        for day in range(num_days):
            window = slice(day, day + regression_length)
            todays_returns_5 = returns_5_results.iloc[window]
            todays_returns_10 = returns_10_results.iloc[window]
            # `DataFrame.items()` yields (column label, Series) pairs. It
            # replaces `iteritems()`, which was removed in pandas 2.0.
            for asset, asset_returns_5 in todays_returns_5.items():
                # NOTE(review): this maps an asset to its output column as
                # `int(asset) - 1`, i.e. it assumes asset ids run 1..N in
                # column order -- confirm against the fixture's sids.
                asset_column = int(asset) - 1
                asset_returns_10 = todays_returns_10[asset]
                expected_regression_results = linregress(
                    y=asset_returns_5,
                    x=asset_returns_10,
                )
                # `outputs` is ordered like the `linregress` result, so the
                # i-th element belongs to the i-th output name.
                for i, output in enumerate(outputs):
                    expected_output_results[output][
                        day, asset_column] = expected_regression_results[i]

        for output in outputs:
            output_result = output_results[output]
            expected_output_result = pd.DataFrame(
                expected_output_results[output],
                index=dates[start_date_index:end_date_index + 1],
                columns=assets,
            )
            assert_frame_equal(output_result, expected_output_result)
Ejemplo n.º 20
0
def test_infer_index_value():
    """
    Exercise `infer_index_value` on pairs of parsed pandas indexes and check
    the type and key of the inferred index value for each combination.
    """

    def infer(left, right):
        # Parse both pandas indexes, then infer the combined index value.
        left_value = parse_index(left)
        right_value = parse_index(right)
        inferred = infer_index_value(left_value, right_value)
        return left_value, right_value, inferred

    # same range index: the inferred key matches both inputs
    ival1, ival2, oival = infer(pd.RangeIndex(1, 3), pd.RangeIndex(1, 3))
    assert oival.key == ival1.key
    assert oival.key == ival2.key

    # different range index
    ival1, ival2, oival = infer(pd.RangeIndex(1, 3), pd.RangeIndex(2, 4))
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # same int64 index, all unique
    ival1, ival2, oival = infer(pd.Int64Index([1, 2]), pd.Int64Index([1, 2]))
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key == ival1.key
    assert oival.key == ival2.key

    # same int64 index, not all unique
    ival1, ival2, oival = infer(
        pd.Int64Index([1, 2, 2]),
        pd.Int64Index([1, 2, 2]),
    )
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # different int64 index
    ival1, ival2, oival = infer(pd.Int64Index([1, 2]), pd.Int64Index([2, 3]))
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # different index type
    ival1, ival2, oival = infer(
        pd.Int64Index([1, 2]),
        pd.Float64Index([2.0, 3.0]),
    )
    assert isinstance(oival.value, IndexValue.Float64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # range index and other index
    ival1, ival2, oival = infer(
        pd.RangeIndex(1, 4),
        pd.Float64Index([2, 3, 4]),
    )
    assert isinstance(oival.value, IndexValue.Float64Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key

    # empty datetime index and range index
    ival1, ival2, oival = infer(pd.DatetimeIndex([]), pd.RangeIndex(2))
    assert isinstance(oival.value, IndexValue.Index)
    assert oival.key != ival1.key
    assert oival.key != ival2.key
Ejemplo n.º 21
0
def _pandas_basic_index(pandas, entry_start, entry_stop):
    """
    Build the default integer index covering ``entry_start`` up to
    ``entry_stop`` using the given ``pandas`` module.

    ``pandas.RangeIndex`` is used when the module provides it; otherwise an
    ``Int64Index`` is built from an explicit range.
    """
    if not hasattr(pandas, "RangeIndex"):
        # Fallback for pandas modules that have no RangeIndex.
        return pandas.Int64Index(uproot4._util.range(entry_start, entry_stop))
    return pandas.RangeIndex(entry_start, entry_stop)
Ejemplo n.º 22
0
def _check_values(values: Union[VALID_FORECASTING_HORIZON_TYPES]) -> pd.Index:
    """Validate forecasting horizon values and coerce them to a pandas index.

    Parameters
    ----------
    values : int, list, array, certain pd.Index types
        Forecasting horizon with steps ahead to predict.

    Returns
    -------
    values : pd.Index
        Validated forecasting horizon values. Array-like and index inputs are
        returned sorted; a single scalar is returned wrapped in a one-element
        index.

    Raises
    ------
    TypeError
        If the type of `values` is not supported.
    ValueError
        If an array-like or index input contains duplicates.
    """
    # Supported pandas index types are used as they are. The check uses exact
    # type equality rather than isinstance() because the index types inherit
    # from each other.
    is_supported_index = type(values) in VALID_INDEX_TYPES

    if not is_supported_index:
        # A single scalar becomes a one-element index; no further checks are
        # needed for it.
        if is_int(values):
            return pd.Int64Index([values], dtype=int)
        if is_timedelta_or_date_offset(values):
            return pd.Index([values])

    if is_supported_index:
        index = values
    # convert np.array or list to pandas index
    elif is_array(values) and array_is_int(values):
        index = pd.Int64Index(values, dtype=int)
    elif is_array(values) and array_is_timedelta_or_date_offset(values):
        index = pd.Index(values)
    else:
        valid_types = (
            "int",
            "np.array",
            "list",
            *[f"pd.{index_type.__name__}" for index_type in VALID_INDEX_TYPES],
        )
        raise TypeError(
            f"Invalid `fh`. The type of the passed `fh` values is not supported. "
            f"Please use one of {valid_types}, but found: {type(values)}")

    # every horizon value may occur only once
    if index.nunique() != len(index):
        raise ValueError(
            "Invalid `fh`. The `fh` values must not contain any duplicates.")

    return index.sort_values()
Ejemplo n.º 23
0
 def pd_range_index_getitem_impl(self, idx):
     # Take the elements selected by `idx` and wrap them in an Int64Index
     # that keeps this index's name.
     return pd.Int64Index(_sdc_take(self, idx), name=self._name)
Ejemplo n.º 24
0
 def test_constructor_unwraps_index(self):
     """An Int64Index built from an Index stores a plain int64 ndarray."""
     source_index = pd.Index([1, 2])
     wrapped = pd.Int64Index(source_index)
     tm.assert_numpy_array_equal(
         wrapped._data,
         np.array([1, 2], dtype="int64"),
     )
Ejemplo n.º 25
0
class TestNumericArraylikeArithmeticWithTimedeltaScalar(object):
    """Arithmetic between numeric array-likes and scalar timedeltas."""

    # TODO: de-duplicate with test_numeric_arr_mul_tdscalar
    def test_ops_series(self):
        """Timedelta * integer Series gives a timedelta Series, both ways."""
        # regression test for GH#8813
        td = Timedelta('1 day')
        other = pd.Series([1, 2])
        expected = pd.Series(pd.to_timedelta(['1 day', '2 days']))
        tm.assert_series_equal(expected, td * other)
        tm.assert_series_equal(expected, other * td)

    # The DataFrame container is marked as a strict expected failure.
    @pytest.mark.parametrize('box', [
        pd.Index, Series,
        pytest.param(pd.DataFrame,
                     marks=pytest.mark.xfail(reason="block.eval incorrect",
                                             strict=True))
    ])
    # Each numeric index holds the values 1..10; test ids are the class names.
    @pytest.mark.parametrize('index', [
        pd.Int64Index(range(1, 11)),
        pd.UInt64Index(range(1, 11)),
        pd.Float64Index(range(1, 11)),
        pd.RangeIndex(1, 11)
    ],
                             ids=lambda x: type(x).__name__)
    # The same one-day duration as Timedelta, numpy timedelta64 and
    # datetime.timedelta.
    @pytest.mark.parametrize('scalar_td', [
        Timedelta(days=1),
        Timedelta(days=1).to_timedelta64(),
        Timedelta(days=1).to_pytimedelta()
    ],
                             ids=lambda x: type(x).__name__)
    def test_numeric_arr_mul_tdscalar(self, scalar_td, index, box):
        """
        Multiplying the values 1..10 (boxed in `box`) by a one-day timedelta
        scalar gives 1..10 days, regardless of operand order.
        """
        # GH#19333

        # Known failure: a float-dtype Series times a datetime.timedelta.
        if (box is Series and type(scalar_td) is timedelta
                and index.dtype == 'f8'):
            raise pytest.xfail(reason="Cannot multiply timedelta by float")

        expected = pd.timedelta_range('1 days', '10 days')

        # Wrap both operands in the container type under test.
        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = index * scalar_td
        tm.assert_equal(result, expected)

        commute = scalar_td * index
        tm.assert_equal(commute, expected)

    # Each numeric index holds the values 1 and 2; test ids are the class
    # names.
    @pytest.mark.parametrize('index', [
        pd.Int64Index(range(1, 3)),
        pd.UInt64Index(range(1, 3)),
        pd.Float64Index(range(1, 3)),
        pd.RangeIndex(1, 3)
    ],
                             ids=lambda x: type(x).__name__)
    @pytest.mark.parametrize('scalar_td', [
        Timedelta(days=1),
        Timedelta(days=1).to_timedelta64(),
        Timedelta(days=1).to_pytimedelta()
    ],
                             ids=lambda x: type(x).__name__)
    def test_numeric_arr_rdiv_tdscalar(self, scalar_td, index, box):
        """
        Dividing a one-day timedelta scalar by the values 1 and 2 gives one
        day and twelve hours; dividing the numbers by the scalar raises
        TypeError.
        """
        # NOTE(review): `box` is not parametrized on this test; presumably it
        # is supplied by a fixture defined elsewhere -- confirm.

        if box is Series and type(scalar_td) is timedelta:
            raise pytest.xfail(reason="TODO: Figure out why this case fails")
        if box is pd.DataFrame and isinstance(scalar_td, timedelta):
            raise pytest.xfail(reason="TODO: Figure out why this case fails")

        expected = TimedeltaIndex(['1 Day', '12 Hours'])

        # Wrap both operands in the container type under test.
        index = tm.box_expected(index, box)
        expected = tm.box_expected(expected, box)

        result = scalar_td / index
        tm.assert_equal(result, expected)

        # The reverse operation (number / timedelta) is not defined.
        with pytest.raises(TypeError):
            index / scalar_td
Ejemplo n.º 26
0
    def test_marshall_index(self):
        """Check what `_marshall_index` writes for each kind of pandas index."""

        def marshall(pandas_index):
            # Serialize `pandas_index` into a fresh Index proto and return it.
            message = Index()
            data_frame._marshall_index(pandas_index, message)
            return message

        frame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

        # Plain Index (the column labels)
        plain = marshall(frame.columns)
        self.assertEqual(["col1", "col2"], plain.plain_index.data.strings.data)

        # Range Index (the default row index)
        ranged = marshall(frame.index)
        self.assertEqual(0, ranged.range_index.start)
        self.assertEqual(2, ranged.range_index.stop)

        # Row index of a frame built from empty columns
        empty_frame = pd.DataFrame(data={"col1": [], "col2": []})
        empty_ranged = marshall(empty_frame.index)
        self.assertEqual(0, empty_ranged.range_index.start)
        self.assertEqual(0, empty_ranged.range_index.stop)

        # multi index
        multi = marshall(
            pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=["one", "two"])
        )
        self.assertEqual([1, 2], multi.multi_index.levels[0].int_64_index.data.data)
        self.assertEqual([0, 1], multi.multi_index.labels[0].data)

        # datetimeindex: the expected values are the epoch seconds scaled by
        # 1e9, and the local-zone lookup is patched out.
        epoch_seconds = (1554138000, 1554141600, 1554145200)
        expected_stamps = [int(x * 1e9) for x in epoch_seconds]
        hourly = pd.date_range(
            start="2019/04/01 10:00", end="2019/04/01 12:00", freq="H"
        )
        with patch("streamlit.elements.data_frame.tzlocal.get_localzone") as p:
            p.return_value = "America/Los_Angeles"
            stamped = marshall(hourly)
            self.assertEqual(expected_stamps, stamped.datetime_index.data.data)

        # timedeltaindex
        deltas = marshall(pd.to_timedelta(np.arange(1, 5), unit="ns"))
        self.assertEqual([1, 2, 3, 4], deltas.timedelta_index.data.data)

        # int64index
        ints = marshall(pd.Int64Index(np.arange(1, 5)))
        self.assertEqual([1, 2, 3, 4], ints.int_64_index.data.data)

        # float64index
        floats = marshall(pd.Float64Index(np.arange(1, 5)))
        self.assertEqual([1, 2, 3, 4], floats.float_64_index.data.data)

        # Period index is not supported and must be rejected explicitly.
        periods = pd.period_range(
            start="2005-12-21 08:45 ", end="2005-12-21 11:55", freq="H"
        )
        period_proto = Index()
        with pytest.raises(NotImplementedError) as raised:
            data_frame._marshall_index(periods, period_proto)
        self.assertEqual(
            "Can't handle <class 'pandas.core.indexes.period.PeriodIndex'>" " yet.",
            str(raised.value),
        )
Ejemplo n.º 27
0
    def testFilterIndexValue(self):
        """
        Compare `filter_index_value` with boolean masking of the equivalent
        pandas index. Each bounds tuple is paired below with the mask it is
        expected to match.
        """

        def kept_labels(index_value, bounds, **filter_kwargs):
            # Labels remaining after filtering, as a plain list.
            filtered = filter_index_value(index_value, bounds, **filter_kwargs)
            return filtered.to_pandas().tolist()

        # ascending range with unit step
        pd_index = pd.RangeIndex(10)
        index_value = parse_index(pd_index)

        self.assertEqual(
            kept_labels(index_value, (0, True, 9, True)),
            pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist())
        self.assertEqual(
            kept_labels(index_value, (0, False, 9, False)),
            pd_index[(pd_index > 0) & (pd_index < 9)].tolist())

        # ascending range with step 3
        pd_index = pd.RangeIndex(1, 11, 3)
        index_value = parse_index(pd_index)

        self.assertEqual(
            kept_labels(index_value, (2, True, 10, True)),
            pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist())
        self.assertEqual(
            kept_labels(index_value, (2, False, 10, False)),
            pd_index[(pd_index > 2) & (pd_index < 10)].tolist())

        # descending range with unit step
        pd_index = pd.RangeIndex(9, -1, -1)
        index_value = parse_index(pd_index)

        self.assertEqual(
            kept_labels(index_value, (0, True, 9, True)),
            pd_index[(pd_index >= 0) & (pd_index <= 9)].tolist())
        self.assertEqual(
            kept_labels(index_value, (0, False, 9, False)),
            pd_index[(pd_index > 0) & (pd_index < 9)].tolist())

        # descending range with step -3, parsed without stored data
        pd_index = pd.RangeIndex(10, 0, -3)
        index_value = parse_index(pd_index, store_data=False)

        self.assertEqual(
            kept_labels(index_value, (2, True, 10, True)),
            pd_index[(pd_index >= 2) & (pd_index <= 10)].tolist())
        self.assertEqual(
            kept_labels(index_value, (2, False, 10, False)),
            pd_index[(pd_index > 2) & (pd_index < 10)].tolist())

        # int64 index parsed and filtered with stored data
        pd_index = pd.Int64Index([0, 3, 8])
        index_value = parse_index(pd_index, store_data=True)

        self.assertEqual(
            kept_labels(index_value, (2, True, 8, False), store_data=True),
            pd_index[(pd_index >= 2) & (pd_index < 8)].tolist())

        # same int64 index parsed with default arguments: the filtered value
        # converts to an empty pandas index but keeps its Int64Index type
        index_value = parse_index(pd_index)

        filtered = filter_index_value(index_value, (2, True, 8, False))
        self.assertEqual(len(filtered.to_pandas().tolist()), 0)
        self.assertIsInstance(filtered.value, IndexValue.Int64Index)
Ejemplo n.º 28
0
class TestFancy(Base):
    """ pure get/set item & fancy indexing """
    def test_setitem_ndarray_1d(self):
        """
        Setting a 1d complex ndarray through `.loc` must raise ValueError when
        the array is longer than the selected labels and must store the values
        when the lengths agree. Assigning a 1d complex array to a row slice of
        the whole frame is also expected to raise ValueError.
        """
        # GH5508

        # len of indexer vs length of the 1d ndarray
        df = DataFrame(index=Index(lrange(1, 11)))
        df['foo'] = np.zeros(10, dtype=np.float64)
        # `np.complex` was an alias of the builtin `complex` and was removed
        # in NumPy 1.24; `np.complex128` names the same dtype.
        df['bar'] = np.zeros(10, dtype=np.complex128)

        # invalid: three labels selected, four values supplied
        with pytest.raises(ValueError):
            df.loc[df.index[2:5],
                   'bar'] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])

        # valid: four labels selected, four values supplied
        df.loc[df.index[2:6], 'bar'] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])

        result = df.loc[df.index[2:6], 'bar']
        expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0],
                          index=[3, 4, 5, 6],
                          name='bar')
        tm.assert_series_equal(result, expected)

        # dtype getting changed?
        df = DataFrame(index=Index(lrange(1, 11)))
        df['foo'] = np.zeros(10, dtype=np.float64)
        df['bar'] = np.zeros(10, dtype=np.complex128)

        with pytest.raises(ValueError):
            df[2:5] = np.arange(1, 4) * 1j

    def test_inf_upcast(self):
        """
        np.inf must be usable as a row or column label, and inserting it must
        turn the affected axis into a Float64Index.
        """
        # GH 16957

        # np.inf as a row label
        frame = pd.DataFrame(columns=[0])
        for label, value in ((1, 1), (2, 2), (np.inf, 3)):
            frame.loc[label] = value

        # the value stored under np.inf must be retrievable
        assert frame.loc[np.inf, 0] == 3

        tm.assert_index_equal(frame.index, pd.Float64Index([1, 2, np.inf]))

        # np.inf as a column label
        frame = pd.DataFrame()
        for (row, col), value in (((0, 0), 1), ((1, 1), 2), ((0, np.inf), 3)):
            frame.loc[row, col] = value

        tm.assert_index_equal(frame.columns, pd.Float64Index([0, 1, np.inf]))

    def test_setitem_dtype_upcast(self):
        """
        Writing a value of another type into one cell must upcast only the
        column that receives it and leave the other columns' dtypes alone.
        """

        def labelled(data):
            # Frame with rows 'a', 'b' and columns 'foo', 'bar', 'baz'.
            return DataFrame(data,
                             index=list('ab'),
                             columns=['foo', 'bar', 'baz'])

        # GH3216
        frame = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
        frame['c'] = np.nan
        assert frame['c'].dtype == np.float64

        frame.loc[0, 'c'] = 'foo'
        wanted = DataFrame([
            {"a": 1, "c": 'foo'},
            {"a": 3, "b": 2, "c": np.nan},
        ])
        tm.assert_frame_equal(frame, wanted)

        # GH10280: integer frame, float or string written into 'bar'
        int_frame = labelled(np.arange(6, dtype='int64').reshape(2, 3))

        for new_value in (3.14, 'wxyz'):
            updated = int_frame.copy()
            updated.loc['a', 'bar'] = new_value
            wanted = labelled([[0, new_value, 2], [3, 4, 5]])

            tm.assert_frame_equal(updated, wanted)
            assert is_integer_dtype(updated['foo'])
            assert is_integer_dtype(updated['baz'])

        # float frame, string written into 'bar'
        updated = labelled(np.arange(6, dtype='int64').reshape(2, 3) / 10.0)
        updated.loc['a', 'bar'] = 'wxyz'

        wanted = labelled([[0, 'wxyz', .2], [.3, .4, .5]])

        tm.assert_frame_equal(updated, wanted)
        assert is_float_dtype(updated['foo'])
        assert is_float_dtype(updated['baz'])

    def test_dups_fancy_indexing(self):
        """
        Label-based selection on frames whose row or column labels contain
        duplicates, including selectors that name labels missing from the
        frame. Each section is tagged with the issue number it covers.
        """

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        # selecting ['b', 'a'] returns 'b' followed by both 'a' columns
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        # the results of head() and str() are discarded; only that the calls
        # complete matters here
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {
                'test': [5, 7, 9, 11],
                'test1': [4., 5, 6, 7],
                'other': list('abcd')
            },
            index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {
                'test': [11, 9],
                'test1': [7., 6],
                'other': ['d', 'c']
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # the same selection with an Index instead of a list
        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        # 'E' is not a label of df; the expected frame has an all-NaN row
        # for it
        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [11, 9, np.nan],
                'test1': [7., 6, np.nan],
                'other': ['d', 'c', np.nan]
            },
            index=rows)

        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # inconsistent returns for unique/duplicate indices when values are
        # missing
        df = DataFrame(np.random.randn(4, 3), index=list('ABCD'))
        expected = df.reindex(['E'])

        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        # NOTE(review): presumably catch_warnings is here to swallow warnings
        # emitted by `.ix` -- confirm.
        with catch_warnings(record=True):
            result = dfnu.ix[['E']]
        tm.assert_frame_equal(result, expected)

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list('abc')})
        result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                             index=['A', 'A', 'A', 'A', 'E'])
        result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)

        # GH 5835
        # dups on index and missing values
        df = DataFrame(np.random.randn(5, 5),
                       columns=['A', 'B', 'B', 'B', 'A'])

        # expected: the existing 'A' and 'B' columns plus an all-NaN 'C'
        expected = pd.concat([
            df.loc[:, ['A', 'B']],
            DataFrame(np.nan, columns=['C'], index=df.index)
        ],
                             axis=1)
        result = df.loc[:, ['A', 'B', 'C']]
        tm.assert_frame_equal(result, expected)

        # GH 6504, multi-axis indexing
        df = DataFrame(np.random.randn(9, 2),
                       index=[1, 1, 1, 2, 2, 2, 3, 3, 3],
                       columns=['a', 'b'])

        # labels 1 and 2 occupy the first six rows
        expected = df.iloc[0:6]
        result = df.loc[[1, 2]]
        tm.assert_frame_equal(result, expected)

        expected = df
        result = df.loc[:, ['a', 'b']]
        tm.assert_frame_equal(result, expected)

        expected = df.iloc[0:6, :]
        result = df.loc[[1, 2], ['a', 'b']]
        tm.assert_frame_equal(result, expected)

    def test_indexing_mixed_frame_bug(self):
        """
        Assigning through `.loc` with a boolean mask must update a column of
        a frame that mixes string and integer columns.
        """
        # GH3492
        frame = DataFrame({
            'a': {1: 'aaa', 2: 'bbb', 3: 'ccc'},
            'b': {1: 111, 2: 222, 3: 333},
        })

        # Creating a new column from 'a' works as expected.
        frame['test'] = frame['a'].apply(lambda s: '_' if s == 'aaa' else s)

        # Overwriting the flagged rows is the step the issue reported as
        # broken: the 'test' column was left unchanged. According to the
        # report, the frame and iloc[0, 2] still showed '_' while
        # ix[idx, 'test'] showed '-----'.
        flagged = frame['test'] == '_'
        frame.loc[flagged, 'test'] = frame.loc[flagged, 'a'].apply(
            lambda s: '-----' if s == 'aaa' else s)
        assert frame.iloc[0, 2] == '-----'

    def test_multitype_list_index_access(self):
        """List lookup on mixed str/int column labels (GH 10610)."""
        labels = ["a"] + [20, 21, 22, 23]
        frame = pd.DataFrame(np.random.random((10, 5)), columns=labels)

        # 26 and -8 are not column labels, so the list lookup must fail.
        with pytest.raises(KeyError):
            frame[[22, 26, -8]]

        # A single existing integer label still selects a full column.
        n_rows = frame.shape[0]
        assert frame[21].shape[0] == n_rows

    def test_set_index_nan(self):

        # GH 3586
        df = DataFrame({
            'PRuid': {
                17: 'nonQC',
                18: 'nonQC',
                19: 'nonQC',
                20: '10',
                21: '11',
                22: '12',
                23: '13',
                24: '24',
                25: '35',
                26: '46',
                27: '47',
                28: '48',
                29: '59',
                30: '10'
            },
            'QC': {
                17: 0.0,
                18: 0.0,
                19: 0.0,
                20: np.nan,
                21: np.nan,
                22: np.nan,
                23: np.nan,
                24: 1.0,
                25: np.nan,
                26: np.nan,
                27: np.nan,
                28: np.nan,
                29: np.nan,
                30: np.nan
            },
            'data': {
                17: 7.9544899999999998,
                18: 8.0142609999999994,
                19: 7.8591520000000008,
                20: 0.86140349999999999,
                21: 0.87853110000000001,
                22: 0.8427041999999999,
                23: 0.78587700000000005,
                24: 0.73062459999999996,
                25: 0.81668560000000001,
                26: 0.81927080000000008,
                27: 0.80705009999999999,
                28: 0.81440240000000008,
                29: 0.80140849999999997,
                30: 0.81307740000000006
            },
            'year': {
                17: 2006,
                18: 2007,
                19: 2008,
                20: 1985,
                21: 1985,
                22: 1985,
                23: 1985,
                24: 1985,
                25: 1985,
                26: 1985,
                27: 1985,
                28: 1985,
                29: 1985,
                30: 1986
            }
        }).reset_index()

        result = df.set_index(['year', 'PRuid',
                               'QC']).reset_index().reindex(columns=df.columns)
        tm.assert_frame_equal(result, df)

    def test_multi_nan_indexing(self):

        # GH 3588
        df = DataFrame({
            "a": ['R1', 'R2', np.nan, 'R4'],
            'b': ["C1", "C2", "C3", "C4"],
            "c": [10, 15, np.nan, 20]
        })
        result = df.set_index(['a', 'b'], drop=False)
        expected = DataFrame(
            {
                "a": ['R1', 'R2', np.nan, 'R4'],
                'b': ["C1", "C2", "C3", "C4"],
                "c": [10, 15, np.nan, 20]
            },
            index=[
                Index(['R1', 'R2', np.nan, 'R4'], name='a'),
                Index(['C1', 'C2', 'C3', 'C4'], name='b')
            ])
        tm.assert_frame_equal(result, expected)

    def test_multi_assign(self):
        """Assign a sub-frame, an ndarray and a Series through ``.loc``.

        GH 3626: an assignment of a sub-frame to a frame.  Each assignment
        is performed twice and checked against the same expected frame.
        """
        source = DataFrame({
            'FC': ['a', 'b', 'a', 'b', 'a', 'b'],
            'PF': [0, 0, 0, 0, 1, 1],
            'col1': lrange(6),
            'col2': lrange(6, 12)
        })
        source.iloc[1, 0] = np.nan
        target = source.copy()

        has_fc = ~target.FC.isnull()
        value_cols = ['col1', 'col2']

        doubled = target * 2
        doubled.iloc[3, 3] = np.nan

        # frame on rhs
        expected = DataFrame({
            'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
            'PF': [0, 0, 0, 0, 1, 1],
            'col1': Series([0, 1, 4, 6, 8, 10]),
            'col2': [12, 7, 16, np.nan, 20, 22]
        })
        for _ in range(2):
            target.loc[has_fc, value_cols] = doubled.loc[has_fc, value_cols]
            tm.assert_frame_equal(target, expected)

        # with an ndarray on rhs
        # coerces to float64 because values has float64 dtype
        # GH 14001
        expected = DataFrame({
            'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
            'PF': [0, 0, 0, 0, 1, 1],
            'col1': [0., 1., 4., 6., 8., 10.],
            'col2': [12, 7, 16, np.nan, 20, 22]
        })
        target = source.copy()
        for _ in range(2):
            raw = doubled.loc[has_fc, value_cols].values
            target.loc[has_fc, value_cols] = raw
            tm.assert_frame_equal(target, expected)

        # broadcasting on the rhs is required
        frame = DataFrame({
            'A': [1, 2, 0, 0, 0],
            'B': [0, 0, 0, 10, 11],
            'C': [0, 0, 0, 10, 11],
            'D': [3, 4, 5, 6, 7],
        })

        # Build the expectation one column at a time ...
        expected = frame.copy()
        zero_a = expected['A'] == 0
        for name in ['A', 'B']:
            expected.loc[zero_a, name] = frame['D']

        # ... and compare with assigning the Series to both columns at once.
        frame.loc[frame['A'] == 0, ['A', 'B']] = frame['D']
        tm.assert_frame_equal(frame, expected)

    def test_setitem_list(self):
        """Store lists and arbitrary objects in a single cell (GH 6043).

        In each scenario a cell is written twice in one frame and the result
        is compared with a frame that only received the final value (or, in
        the last scenario, with a frame that was never written to).
        """
        def blank():
            return DataFrame(index=[0, 1], columns=[0])

        # ix with a list
        overwritten = blank()
        with catch_warnings(record=True):
            overwritten.ix[1, 0] = [1, 2, 3]
            overwritten.ix[1, 0] = [1, 2]

        written_once = blank()
        with catch_warnings(record=True):
            written_once.ix[1, 0] = [1, 2]

        tm.assert_frame_equal(written_once, overwritten)

        # ix with an object
        class TO(object):
            def __init__(self, value):
                self.value = value

            def __str__(self):
                return "[{0}]".format(self.value)

            __repr__ = __str__

            def __eq__(self, other):
                return self.value == other.value

            def view(self):
                return self

        overwritten = blank()
        with catch_warnings(record=True):
            overwritten.ix[1, 0] = TO(1)
            overwritten.ix[1, 0] = TO(2)

        written_once = blank()
        with catch_warnings(record=True):
            written_once.ix[1, 0] = TO(2)

        tm.assert_frame_equal(written_once, overwritten)

        # remains object dtype even after setting it back
        overwritten = blank()
        with catch_warnings(record=True):
            overwritten.ix[1, 0] = TO(1)
            overwritten.ix[1, 0] = np.nan
        untouched = blank()

        tm.assert_frame_equal(untouched, overwritten)

    def test_string_slice(self):
        """String keys raise ``KeyError`` on object-dtype indexes (GH 14424).

        String indexing against a datetimelike index with object dtype
        should raise KeyError; the same is checked for an empty frame.
        """
        def assert_string_lookups_fail(frame):
            with pytest.raises(KeyError):
                frame['2011']

            with pytest.raises(KeyError):
                frame.loc['2011', 0]

        stamps = pd.Index([pd.Timestamp('2011-01-01')], dtype=object)
        dated = pd.DataFrame([1], stamps)
        assert dated.index.is_all_dates
        assert_string_lookups_fail(dated)

        empty = pd.DataFrame()
        assert not empty.index.is_all_dates
        assert_string_lookups_fail(empty)

    def test_mi_access(self):

        # GH 4145
        data = """h1 main  h3 sub  h5
0  a    A   1  A1   1
1  b    B   2  B1   2
2  c    B   3  A1   3
3  d    A   4  B2   4
4  e    A   5  B2   5
5  f    B   6  A2   6
"""

        df = pd.read_csv(StringIO(data), sep=r'\s+', index_col=0)
        df2 = df.set_index(['main', 'sub']).T.sort_index(1)
        index = Index(['h1', 'h3', 'h5'])
        columns = MultiIndex.from_tuples([('A', 'A1')], names=['main', 'sub'])
        expected = DataFrame([['a', 1, 1]], index=columns, columns=index).T

        result = df2.loc[:, ('A', 'A1')]
        tm.assert_frame_equal(result, expected)

        result = df2[('A', 'A1')]
        tm.assert_frame_equal(result, expected)

        # GH 4146, not returning a block manager when selecting a unique index
        # from a duplicate index
        # as of 4879, this returns a Series (which is similar to what happens
        # with a non-unique)
        expected = Series(['a', 1, 1], index=['h1', 'h3', 'h5'], name='A1')
        result = df2['A']['A1']
        tm.assert_series_equal(result, expected)

        # selecting a non_unique from the 2nd level
        expected = DataFrame(
            [['d', 4, 4], ['e', 5, 5]],
            index=Index(['B2', 'B2'], name='sub'),
            columns=['h1', 'h3', 'h5'],
        ).T
        result = df2['A']['B2']
        tm.assert_frame_equal(result, expected)

    def test_astype_assignment(self):
        """Assigning converted columns back through ``iloc``/``loc``.

        GH4312 covers ``iloc`` and GH5702 covers ``loc``; the last two
        checks replace a whole float column that contains no NaNs.
        """
        def one_row(values):
            # Single-row frame with the columns labelled A..G.
            return DataFrame([values], columns=list('ABCDEFG'))

        df_orig = one_row(['1', '2', '3', '.4', 5, 6., 'foo'])

        # GH4312 (iloc)
        work = df_orig.copy()
        work.iloc[:, 0:2] = work.iloc[:, 0:2].astype(np.int64)
        tm.assert_frame_equal(work, one_row([1, 2, '3', '.4', 5, 6., 'foo']))

        work = df_orig.copy()
        converted = work.iloc[:, 0:2]._convert(datetime=True, numeric=True)
        work.iloc[:, 0:2] = converted
        tm.assert_frame_equal(work, one_row([1, 2, '3', '.4', 5, 6., 'foo']))

        # GH5702 (loc)
        work = df_orig.copy()
        work.loc[:, 'A'] = work.loc[:, 'A'].astype(np.int64)
        tm.assert_frame_equal(work,
                              one_row([1, '2', '3', '.4', 5, 6., 'foo']))

        work = df_orig.copy()
        work.loc[:, ['B', 'C']] = work.loc[:, ['B', 'C']].astype(np.int64)
        tm.assert_frame_equal(work, one_row(['1', 2, 3, '.4', 5, 6., 'foo']))

        # full replacements / no nans
        as_ints = DataFrame({'A': [1, 2, 3, 4]})

        floats = DataFrame({'A': [1., 2., 3., 4.]})
        floats.iloc[:, 0] = floats['A'].astype(np.int64)
        tm.assert_frame_equal(floats, as_ints)

        floats = DataFrame({'A': [1., 2., 3., 4.]})
        floats.loc[:, 'A'] = floats['A'].astype(np.int64)
        tm.assert_frame_equal(floats, DataFrame({'A': [1, 2, 3, 4]}))

    def test_astype_assignment_with_dups(self):

        # GH 4686
        # assignment with dups that has a dtype change
        cols = pd.MultiIndex.from_tuples([('A', '1'), ('B', '1'), ('A', '2')])
        df = DataFrame(np.arange(3).reshape((1, 3)),
                       columns=cols,
                       dtype=object)
        index = df.index.copy()

        df['A'] = df['A'].astype(np.float64)
        tm.assert_index_equal(df.index, index)

        # TODO(wesm): unused variables
        # result = df.get_dtype_counts().sort_index()
        # expected = Series({'float64': 2, 'object': 1}).sort_index()

    @pytest.mark.parametrize(
        "index,val",
        [
            (pd.Index([0, 1, 2]), 2),
            (pd.Index([0, 1, '2']), '2'),
            (pd.Index([0, 1, 2, np.inf, 4]), 4),
            (pd.Index([0, 1, 2, np.nan, 4]), 4),
            (pd.Index([0, 1, 2, np.inf]), np.inf),
            (pd.Index([0, 1, 2, np.nan]), np.nan),
        ])
    def test_index_contains(self, index, val):
        """``val in index`` is true for each (index, val) pair above."""
        found = val in index
        assert found

    @pytest.mark.parametrize("index,val", [
        (pd.Index([0, 1, 2]), '2'),
        (pd.Index([0, 1, '2']), 2),
        (pd.Index([0, 1, 2, np.inf]), 4),
        (pd.Index([0, 1, 2, np.nan]), 4),
        (pd.Index([0, 1, 2, np.inf]), np.nan),
        (pd.Index([0, 1, 2, np.nan]), np.inf),
        # Checking if np.inf in Int64Index should not cause an OverflowError
        # Related to GH 16957
        (pd.Int64Index([0, 1, 2]), np.inf),
        (pd.Int64Index([0, 1, 2]), np.nan),
        (pd.UInt64Index([0, 1, 2]), np.inf),
        (pd.UInt64Index([0, 1, 2]), np.nan),
    ])
    def test_index_not_contains(self, index, val):
        """``val in index`` is false for each (index, val) pair above."""
        found = val in index
        assert not found

    def test_index_type_coercion(self):
        """Setting with a key of another type coerces the index (GH 11836).

        If we have an index type and set it with something that looks to
        numpy like the same, but actually is not (e.g. setting with a float
        or the string '0'), the index has to be coerced.
        """
        with catch_warnings(record=True):

            # The same assignment is tried through .ix, .loc and plain [].
            accessors = [lambda x: x.ix, lambda x: x.loc, lambda x: x]

            # integer indexes
            int_indexed = [Series(range(5)),
                           Series(range(5), index=range(1, 6))]
            for ser in int_indexed:

                assert ser.index.is_integer()

                for access in accessors:
                    # a non-integral float key turns the index floating
                    changed = ser.copy()
                    access(changed)[0.1] = 0
                    assert changed.index.is_floating()
                    assert access(changed)[0.1] == 0

                    # 0.0 is treated as the label 0: appended only when 0
                    # is not already present
                    changed = ser.copy()
                    access(changed)[0.0] = 0
                    if 0 in ser:
                        wanted = ser.index
                    else:
                        wanted = Index(ser.index.tolist() + [0])
                    tm.assert_index_equal(changed.index, wanted)

                    # a string key turns the index into object
                    changed = ser.copy()
                    access(changed)['0'] = 0
                    assert changed.index.is_object()

            # float index
            ser = Series(range(5), index=np.arange(5.))
            assert ser.index.is_floating()

            for access in accessors:

                changed = ser.copy()
                access(changed)[0.1] = 0
                assert changed.index.is_floating()
                assert access(changed)[0.1] == 0

                # 0.0 is already a label here, so the index is unchanged
                changed = ser.copy()
                access(changed)[0.0] = 0
                tm.assert_index_equal(changed.index, ser.index)

                changed = ser.copy()
                access(changed)['0'] = 0
                assert changed.index.is_object()
Ejemplo n.º 29
0
class TestSeriesConstructors(object):

    def test_invalid_dtype(self):
        """Unusable ``dtype`` arguments raise ``TypeError`` (GH15520)."""
        expected_msg = 'not understood'
        for bad_dtype in (pd.Timestamp, 'pd.Timestamp', list):
            with pytest.raises(TypeError, match=expected_msg):
                Series([], name='time', dtype=bad_dtype)

    def test_scalar_conversion(self):
        """A scalar input still builds a Series; one-element Series cast."""
        # Pass in scalar is disabled
        wrapped = Series(0.5)
        assert not isinstance(wrapped, float)

        # Coercion
        for caster, wanted in ((float, 1.0), (int, 1)):
            assert caster(Series([1.])) == wanted

    def test_constructor(self, datetime_series):
        """Basic ``Series`` construction checks.

        Covers wrapping an existing Series (the index object is reused),
        mixed-type data, empty input, 2-D data (rejected), name propagation
        and the MultiIndex input that is explicitly unsupported (GH4187).

        Parameters
        ----------
        datetime_series : Series
            Fixture supplied by the test suite; the assertions below only
            require that its index reports ``is_all_dates``.
        """
        empty_series = Series()

        assert datetime_series.index.is_all_dates

        # Pass in Series
        derived = Series(datetime_series)
        assert derived.index.is_all_dates

        assert tm.equalContents(derived.index, datetime_series.index)
        # Ensure new index is not created.  Compare identity with ``is``:
        # comparing ``id()`` of two attribute lookups can match by accident
        # if the first result is released before the second is created.
        assert derived.index is datetime_series.index

        # Mixed type Series.  ``np.nan`` is the canonical spelling; the
        # ``np.NaN`` alias is the same object but was removed in NumPy 2.0.
        mixed = Series(['hello', np.nan], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.nan

        assert not empty_series.index.is_all_dates
        assert not Series({}).index.is_all_dates

        # exception raised is of type Exception
        with pytest.raises(Exception, match="Data must be 1-dimensional"):
            Series(np.random.randn(3, 3), index=np.arange(3))

        # the name survives wrapping in another Series
        mixed.name = 'Series'
        rs = Series(mixed).name
        xp = 'Series'
        assert rs == xp

        # raise on MultiIndex GH4187
        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
        msg = "initializing a Series from a MultiIndex is not supported"
        with pytest.raises(NotImplementedError, match=msg):
            Series(m)

    @pytest.mark.parametrize('input_class', [list, dict, OrderedDict])
    def test_constructor_empty(self, input_class):
        """``Series()`` matches ``Series(<empty container>)``."""
        # These are Index() and RangeIndex() which don't compare type equal
        # but are just .equals, hence check_index_type=False.  Tried with no
        # dtype, an explicit dtype, and dtype=category (GH 18515).
        for kwargs in ({}, {'dtype': 'float64'}, {'dtype': 'category'}):
            no_data = Series(**kwargs)
            from_container = Series(input_class(), **kwargs)
            assert_series_equal(no_data, from_container,
                                check_index_type=False)

        # The index-based checks below are skipped for ``list``.
        if input_class is list:
            return

        # With index:
        no_data = Series(index=lrange(10))
        from_container = Series(input_class(), index=lrange(10))
        assert_series_equal(no_data, from_container)

        # With index and dtype float64:
        no_data = Series(np.nan, index=lrange(10))
        from_container = Series(input_class(), index=lrange(10),
                                dtype='float64')
        assert_series_equal(no_data, from_container)

        # GH 19853 : with empty string, index and dtype str
        typed = Series('', dtype=str, index=range(3))
        untyped = Series('', index=range(3))
        assert_series_equal(typed, untyped)

    @pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
    def test_constructor_nan(self, input_arg):
        """A NaN scalar broadcast over an index equals a data-less Series."""
        without_data = Series(dtype='float64', index=lrange(10))
        broadcast = Series(input_arg, index=lrange(10))

        assert_series_equal(without_data, broadcast, check_index_type=False)

    @pytest.mark.parametrize('dtype', [
        'f8', 'i8', 'M8[ns]', 'm8[ns]', 'category', 'object',
        'datetime64[ns, UTC]',
    ])
    @pytest.mark.parametrize('index', [None, pd.Index([])])
    def test_constructor_dtype_only(self, dtype, index):
        """A Series built from only a dtype is empty and keeps that dtype.

        GH-20865
        """
        ser = pd.Series(dtype=dtype, index=index)
        assert ser.dtype == dtype
        assert len(ser) == 0

    def test_constructor_no_data_index_order(self):
        """An index passed without data keeps the order it was given in."""
        labels = ['b', 'a', 'c']
        ser = pd.Series(index=['b', 'a', 'c'])
        assert ser.index.tolist() == labels

    def test_constructor_no_data_string_type(self):
        """With ``dtype=str`` and no data the single value is NaN (GH 22477)."""
        ser = pd.Series(index=[1], dtype=str)
        only_value = ser.iloc[0]
        assert np.isnan(only_value)

    @pytest.mark.parametrize('item', ['entry', 'ѐ', 13])
    def test_constructor_string_element_string_type(self, item):
        """A scalar with ``dtype=str`` is stored as ``str(item)`` (GH 22477)."""
        ser = pd.Series(item, index=[1], dtype=str)
        stored = ser.iloc[0]
        assert stored == str(item)

    def test_constructor_dtype_str_na_values(self, string_dtype):
        """Missing values survive construction with a string dtype.

        https://github.com/pandas-dev/pandas/issues/21083
        """
        with_none = Series(['x', None], dtype=string_dtype)
        tm.assert_series_equal(with_none.isna(), Series([False, True]))
        # None is kept as None, not converted
        assert with_none.iloc[1] is None

        with_nan = Series(['x', np.nan], dtype=string_dtype)
        assert np.isnan(with_nan.iloc[1])

    def test_constructor_series(self):
        """Wrapping a Series with a sorted index equals ``sort_index()``."""
        labels = ['d', 'b', 'a', 'c']
        original = Series([4, 7, -5, 3], index=labels)
        reordered = Series(original, index=sorted(labels))

        assert_series_equal(reordered, original.sort_index())

    def test_constructor_iterable(self):
        """An object that only defines ``__iter__`` is accepted (GH 21987)."""
        class Iter():
            # Iterable, but neither a sequence nor an iterator itself.
            def __iter__(self):
                return iter(range(10))

        result = Series(Iter(), dtype='int64')
        expected = Series(list(range(10)), dtype='int64')
        assert_series_equal(result, expected)

    def test_constructor_sequence(self):
        """A ``range`` gives the same Series as the equivalent list.

        GH 21987
        """
        from_range = Series(range(10), dtype='int64')
        from_list = Series(list(range(10)), dtype='int64')
        assert_series_equal(from_range, from_list)

    def test_constructor_single_str(self):
        """A bare string is one element, not a sequence of characters.

        GH 21987
        """
        from_scalar = Series('abc')
        from_list = Series(['abc'])
        assert_series_equal(from_scalar, from_list)

    def test_constructor_list_like(self):
        """Different list-likes coerce to the same standard dtype."""
        # make sure that we are coercing different
        # list-likes to standard dtypes and not
        # platform specific
        candidates = [
            [1, 2, 3],
            (1, 2, 3),
            np.array([1, 2, 3], dtype='int64'),
        ]
        expected = Series([1, 2, 3], dtype='int64')
        for values in candidates:
            built = Series(values, index=[0, 1, 2])
            assert_series_equal(built, expected)

    @pytest.mark.parametrize('input_vals', [
        [1, 2],
        ['1', '2'],
        list(pd.date_range('1/1/2011', periods=2, freq='H')),
        list(pd.date_range('1/1/2011', periods=2, freq='H',
                           tz='US/Eastern')),
        [pd.Interval(left=0, right=5)],
    ])
    def test_constructor_list_str(self, input_vals, string_dtype):
        """Constructing with a string dtype matches ``astype`` afterwards.

        GH 16605: ensure that data elements from a list are converted to
        strings when dtype is str, 'str', or 'U'.
        """
        direct = Series(input_vals, dtype=string_dtype)
        via_astype = Series(input_vals).astype(string_dtype)
        assert_series_equal(direct, via_astype)

    def test_constructor_list_str_na(self, string_dtype):
        """Floats become strings under a string dtype; NaN stays NaN."""
        converted = Series([1.0, 2.0, np.nan], dtype=string_dtype)
        wanted = Series(['1.0', '2.0', np.nan], dtype=object)
        assert_series_equal(converted, wanted)
        assert np.isnan(converted[2])

    def test_constructor_generator(self):
        """A generator is consumed like a list, with or without an index."""
        expected = Series(lrange(10))

        built = Series(i for i in range(10))
        assert_series_equal(built, expected)

        # A generator is single-use, so a fresh one is needed here.
        built = Series((i for i in range(10)), index=lrange(10, 20))
        expected.index = lrange(10, 20)
        assert_series_equal(built, expected)

    def test_constructor_map(self):
        """A ``map`` object is accepted as data (GH8909)."""
        expected = Series(lrange(10))

        mapped = map(lambda x: x, range(10))
        assert_series_equal(Series(mapped), expected)

        # Same again with an explicit index, from a fresh ``map`` object.
        mapped = map(lambda x: x, range(10))
        built = Series(mapped, index=lrange(10, 20))
        expected.index = lrange(10, 20)
        assert_series_equal(built, expected)

    def test_constructor_categorical(self):
        """Series built from a Categorical, or with ``dtype='category'``."""
        def assert_categorical(obj):
            # Both the object and its dtype must be recognised.
            assert is_categorical_dtype(obj)
            assert is_categorical_dtype(obj.dtype)

        source = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                                fastpath=True)
        wrapped = Series(source)
        tm.assert_categorical_equal(wrapped.values, source)

        # can cast to a new dtype
        as_int = Series(pd.Categorical([1, 2, 3]), dtype='int64')
        tm.assert_series_equal(as_int, pd.Series([1, 2, 3], dtype='int64'))

        # GH12574
        assert_categorical(Series(pd.Categorical([1, 2, 3]),
                                  dtype='category'))
        assert_categorical(Series([1, 2, 3], dtype='category'))

    def test_constructor_categorical_with_coercion(self):
        """Categorical data in a Series and in DataFrame columns."""
        factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])

        def smoke(obj):
            # Same length as the source, and its values and the object
            # itself can both be rendered.
            assert len(obj) == len(factor)
            str(obj.values)
            str(obj)

        # test basic creation / coercion of categoricals
        ser = Series(factor, name='A')
        assert ser.dtype == 'category'
        smoke(ser)

        # in a frame
        frame = DataFrame({'A': factor})
        tm.assert_series_equal(frame['A'], ser)
        tm.assert_series_equal(frame.iloc[:, 0], ser)
        smoke(frame)

        frame = DataFrame({'A': ser})
        tm.assert_series_equal(frame['A'], ser)
        smoke(frame)

        # multiples
        frame = DataFrame({'A': ser, 'B': ser, 'C': 1})
        col_a = frame['A']
        col_b = frame['B']
        tm.assert_series_equal(col_a, ser)
        tm.assert_series_equal(col_b, ser, check_names=False)
        assert col_b.name == 'B'
        smoke(frame)

        # GH8623
        people = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                            [1, 'John P. Doe']],
                           columns=['person_id', 'person_name'])
        # doing this breaks transform
        people['person_name'] = Categorical(people.person_name)

        # The first name must read the same through every access path.
        expected = people.iloc[0].person_name
        assert people.person_name.iloc[0] == expected
        assert people.person_name[0] == expected
        assert people.person_name.loc[0] == expected

    def test_constructor_categorical_dtype(self):
        """``CategoricalDtype`` fixes the categories and ordered flag."""
        ordered_abc = CategoricalDtype(['a', 'b', 'c'], ordered=True)
        ser = pd.Series(['a', 'b'], dtype=ordered_abc)
        assert is_categorical_dtype(ser) is True
        tm.assert_index_equal(ser.cat.categories, pd.Index(['a', 'b', 'c']))
        assert ser.cat.ordered

        # category order is taken from the dtype, not from the data
        ser = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a']))
        assert is_categorical_dtype(ser)
        tm.assert_index_equal(ser.cat.categories, pd.Index(['b', 'a']))
        assert ser.cat.ordered is False

        # GH 19565 - Check broadcasting of scalar with Categorical dtype
        broadcast = Series('a', index=[0, 1],
                           dtype=CategoricalDtype(['a', 'b'], ordered=True))
        explicit = Series(['a', 'a'], index=[0, 1],
                          dtype=CategoricalDtype(['a', 'b'], ordered=True))
        tm.assert_series_equal(broadcast, explicit, check_categorical=True)

    def test_categorical_sideeffects_free(self):
        """Check when a Series shares data with its source Categorical.

        First half: built with ``copy=True``, changes made through the
        Series must not show up in the Categorical.  Second half: built
        without ``copy``, the Series holds the very same Categorical, so
        the same changes are visible through both.
        """
        # Passing a categorical to a Series and then changing values in either
        # the series or the categorical should not change the values in the
        # other one, IF you specify copy!
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat, copy=True)
        assert s.cat is not cat
        # renaming the categories through the Series ...
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
        # ... changes the Series but leaves the source untouched
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # setting
        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        # however, copy is False by default
        # so this WILL change values
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat)
        # same object, not merely equal
        assert s.values is cat
        s.cat.categories = [1, 2, 3]
        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s)

        # an element assignment is likewise seen through both
        s[0] = 2
        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
        tm.assert_numpy_array_equal(cat.__array__(), exp_s2)

    def test_unordered_compare_equal(self):
        """A value outside the dtype's categories is stored as missing."""
        # 'c' is not one of the declared categories
        via_dtype = pd.Series(['a', 'b', 'c'],
                              dtype=CategoricalDtype(['a', 'b']))
        explicit_nan = pd.Categorical(['a', 'b', np.nan],
                                      categories=['a', 'b'])
        tm.assert_series_equal(via_dtype, pd.Series(explicit_nan))

    def test_constructor_maskedarray(self):
        """Build a Series from numpy masked arrays of several dtypes.

        For each of float, int, bool and ``M8[ns]`` the same masked array is
        checked three times: fully masked, with the middle element still
        masked, and with no element masked.  The array is filled in place
        between the checks, so the order of the statements matters.
        """
        # float: masked entries are expected as nan
        data = ma.masked_all((3, ), dtype=float)
        result = Series(data)
        expected = Series([nan, nan, nan])
        assert_series_equal(result, expected)

        data[0] = 0.0
        data[2] = 2.0
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([0.0, nan, 2.0], index=index)
        assert_series_equal(result, expected)

        data[1] = 1.0
        result = Series(data, index=index)
        expected = Series([0.0, 1.0, 2.0], index=index)
        assert_series_equal(result, expected)

        # int: expected as float while anything is masked, int once full
        data = ma.masked_all((3, ), dtype=int)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=float)
        assert_series_equal(result, expected)

        data[0] = 0
        data[2] = 2
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([0, nan, 2], index=index, dtype=float)
        assert_series_equal(result, expected)

        data[1] = 1
        result = Series(data, index=index)
        expected = Series([0, 1, 2], index=index, dtype=int)
        assert_series_equal(result, expected)

        # bool: expected as object while anything is masked, bool once full
        data = ma.masked_all((3, ), dtype=bool)
        result = Series(data)
        expected = Series([nan, nan, nan], dtype=object)
        assert_series_equal(result, expected)

        data[0] = True
        data[2] = False
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([True, nan, False], index=index, dtype=object)
        assert_series_equal(result, expected)

        data[1] = True
        result = Series(data, index=index)
        expected = Series([True, True, False], index=index, dtype=bool)
        assert_series_equal(result, expected)

        # datetime64: masked entries are expected as iNaT, dtype unchanged
        data = ma.masked_all((3, ), dtype='M8[ns]')
        result = Series(data)
        expected = Series([iNaT, iNaT, iNaT], dtype='M8[ns]')
        assert_series_equal(result, expected)

        data[0] = datetime(2001, 1, 1)
        data[2] = datetime(2001, 1, 3)
        index = ['a', 'b', 'c']
        result = Series(data, index=index)
        expected = Series([datetime(2001, 1, 1), iNaT,
                           datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
        assert_series_equal(result, expected)

        data[1] = datetime(2001, 1, 2)
        result = Series(data, index=index)
        expected = Series([datetime(2001, 1, 1), datetime(2001, 1, 2),
                           datetime(2001, 1, 3)], index=index, dtype='M8[ns]')
        assert_series_equal(result, expected)

    def test_constructor_maskedarray_hardened(self):
        """A fully masked array with a hard mask becomes all-NaN.

        Check numpy masked arrays with hard masks -- from GH24574
        """
        hardened = ma.masked_all((3, ), dtype=float).harden_mask()
        built = pd.Series(hardened)
        tm.assert_series_equal(built, pd.Series([nan, nan, nan]))

    def test_series_ctor_plus_datetimeindex(self):
        """A dict keyed by the index's own labels reuses the index object."""
        rng = date_range('20090415', '20090519', freq='B')
        data = dict.fromkeys(rng, 1)

        built = Series(data, index=rng)
        # identity, not just equality
        assert built.index is rng

    def test_constructor_default_index(self):
        """Without an explicit index the labels are 0..n-1."""
        ser = Series([0, 1, 2])
        wanted = pd.Index(np.arange(3))
        tm.assert_index_equal(ser.index, wanted)

    @pytest.mark.parametrize('input', [
        [1, 2, 3],
        (1, 2, 3),
        list(range(3)),
        pd.Categorical(['a', 'b', 'a']),
        (i for i in range(3)),
        map(lambda x: x, range(3)),
    ])
    def test_constructor_index_mismatch(self, input):
        """Data and index of different lengths raise ``ValueError``.

        GH 19342: every input above has three elements while the index
        passed below has four.
        """
        too_long = np.arange(4)
        msg = 'Length of passed values is 3, index implies 4'
        with pytest.raises(ValueError, match=msg):
            Series(input, index=too_long)

    def test_constructor_numpy_scalar(self):
        """A 0-d numpy array is broadcast like a Python scalar.

        GH 19342: construction with a numpy scalar should not raise.
        """
        from_numpy = Series(np.array(100), index=np.arange(4), dtype='int64')
        from_python = Series(100, index=np.arange(4), dtype='int64')
        tm.assert_series_equal(from_numpy, from_python)

    def test_constructor_broadcast_list(self):
        """A one-element list is not broadcast over a longer index.

        GH 19342: construction with single-element container and index
        should raise.
        """
        labels = ['a', 'b', 'c']
        msg = "Length of passed values is 1, index implies 3"
        with pytest.raises(ValueError, match=msg):
            Series(['foo'], index=labels)

    def test_constructor_corner(self):
        """A list of DataFrames can be used as Series data."""
        frame = tm.makeTimeDataFrame()
        # the same frame twice, one element per index label
        ser = Series([frame, frame], index=[0, 1])
        assert isinstance(ser, Series)

    def test_constructor_sanitize(self):
        """Requested ``i8`` is honoured unless the float data has NaN."""
        whole = Series(np.array([1., 1., 8.]), dtype='i8')
        assert whole.dtype == np.dtype('i8')

        # with a NaN present the result is float64 instead
        with_nan = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8')
        assert with_nan.dtype == np.dtype('f8')

    def test_constructor_copy(self):
        # GH15125
        # passing dtype alongside copy=True must still produce a real copy
        for raw in ([1.], np.array([1.])):
            original = Series(raw)
            duplicate = pd.Series(original, copy=True, dtype=float)

            # right after construction both hold equal data
            tm.assert_series_equal(original, duplicate)

            # writing to the source afterwards must not leak into the copy
            original[0] = 2.
            assert not original.equals(duplicate)
            assert original[0] == 2.
            assert duplicate[0] == 1.

    @pytest.mark.parametrize(
        "index",
        [
            pd.date_range('20170101', periods=3, tz='US/Eastern'),
            pd.date_range('20170101', periods=3),
            pd.timedelta_range('1 day', periods=3),
            pd.period_range('2012Q1', periods=3, freq='Q'),
            pd.Index(list('abc')),
            pd.Int64Index([1, 2, 3]),
            pd.RangeIndex(0, 3),
        ],
        ids=lambda x: type(x).__name__)
    def test_constructor_limit_copies(self, index):
        # GH 17449
        # limit copies of input
        ser = pd.Series(index)

        # we make 1 copy; this is just a smoke test here
        first_block = ser._data.blocks[0]
        assert first_block.values is not index

    def test_constructor_pass_none(self):
        # data=None plus an index: float64 unless a dtype is given
        default_ser = Series(None, index=lrange(5))
        assert default_ser.dtype == np.float64

        object_ser = Series(None, index=lrange(5), dtype=object)
        assert object_ser.dtype == np.object_

        # GH 7431
        # inference on the index
        expected = Series(index=Index([None]))
        result = Series(index=np.array([None]))
        assert_series_equal(result, expected)

    def test_constructor_pass_nan_nat(self):
        # GH 13467
        # all-NaN input, as a list or as an ndarray, is float64
        all_nan = [np.nan, np.nan]
        exp = Series(all_nan, dtype=np.float64)
        assert exp.dtype == np.float64
        tm.assert_series_equal(Series(all_nan), exp)
        tm.assert_series_equal(Series(np.array(all_nan)), exp)

        # as soon as a NaT is present (in any position, with or without a
        # NaN next to it) the result is datetime64[ns]
        exp = Series([pd.NaT, pd.NaT])
        assert exp.dtype == 'datetime64[ns]'
        nat_mixes = (
            [pd.NaT, pd.NaT],
            [pd.NaT, np.nan],
            [np.nan, pd.NaT],
        )
        for pair in nat_mixes:
            tm.assert_series_equal(Series(pair), exp)
            tm.assert_series_equal(Series(np.array(pair)), exp)

    def test_constructor_cast(self):
        # non-numeric strings cannot be constructed with dtype=float
        letters = ["a", "b", "c"]
        expected_msg = "could not convert string to float"
        with pytest.raises(ValueError, match=expected_msg):
            Series(letters, dtype=float)

    def test_constructor_unsigned_dtype_overflow(self, uint_dtype):
        # see gh-15832
        # a negative value must not be coerced into an unsigned dtype
        expected_msg = 'Trying to coerce negative values to unsigned integers'
        with pytest.raises(OverflowError, match=expected_msg):
            Series([-1], dtype=uint_dtype)

    def test_constructor_coerce_float_fail(self, any_int_dtype):
        # see gh-15832
        # a fractional value (3.5) must not be coerced into an integer dtype
        values = [1, 2, 3.5]
        expected_msg = "Trying to coerce float values to integers"
        with pytest.raises(ValueError, match=expected_msg):
            Series(values, dtype=any_int_dtype)

    def test_constructor_coerce_float_valid(self, float_dtype):
        # constructing with a float dtype must equal constructing first and
        # casting with astype afterwards
        values = [1, 2, 3.5]
        result = Series(values, dtype=float_dtype)
        expected = Series(values).astype(float_dtype)
        assert_series_equal(result, expected)

    def test_constructor_dtype_no_cast(self):
        # see gh-1572
        source = Series([1, 2, 3])
        rewrapped = Series(source, dtype=np.int64)

        # a write through the second Series is expected to be visible in
        # the first one
        rewrapped[1] = 5
        assert source[1] == 5

    def test_constructor_datelike_coercion(self):
        # GH 9477
        # datetime-like looking values were incorrectly inferred even when
        # object dtype was specified
        stamp = Timestamp('20130101')
        mixed = Series([Timestamp('20130101'), 'NOV'], dtype=object)
        assert mixed.iloc[0] == stamp
        assert mixed.iloc[1] == 'NOV'
        assert mixed.dtype == object

        # the dtype was being reset on the slicing and re-inferred to datetime
        # even though the blocks are mixed
        frame = pd.DataFrame(
            {'wing1': '2T15 4H19'.split(),
             'wing2': '416 4T20'.split(),
             'mat': pd.to_datetime('2016-01-22 2019-09-07'.split())},
            index='216 3T19'.split())

        for label in ('3T19', '216'):
            row = frame.loc[label]
            assert row.dtype == object

    def test_constructor_datetimes_with_nulls(self):
        # gh-15869
        # object arrays holding None plus one datetime are expected to be
        # inferred as M8[ns]
        mostly_null = np.array([None, None, None, None,
                                datetime.now(), None])
        partly_null = np.array([None, None, datetime.now(), None])
        for values in (mostly_null, partly_null):
            inferred = Series(values)
            assert inferred.dtype == 'M8[ns]'

    def test_constructor_dtype_datetime64(self):
        # Walks through Series construction with datetime64 data: missing
        # value sentinels (iNaT / nan / NaT), inference from np.datetime64
        # and strings, non-ns units, casting to integers, dates stored as
        # object, and tz-aware input.

        # with an explicit M8[ns] dtype, iNaT is treated as missing
        s = Series(iNaT, dtype='M8[ns]', index=lrange(5))
        assert isna(s).all()

        # in theory this should be all nulls, but since
        # we are not specifying a dtype is ambiguous
        s = Series(iNaT, index=lrange(5))
        assert not isna(s).all()

        s = Series(nan, dtype='M8[ns]', index=lrange(5))
        assert isna(s).all()

        # iNaT / nan mixed with a real datetime become missing entries
        s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]')
        assert isna(s[1])
        assert s.dtype == 'M8[ns]'

        s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]')
        assert isna(s[1])
        assert s.dtype == 'M8[ns]'

        # GH3416
        dates = [
            np.datetime64(datetime(2013, 1, 1)),
            np.datetime64(datetime(2013, 1, 2)),
            np.datetime64(datetime(2013, 1, 3)),
        ]

        s = Series(dates)
        assert s.dtype == 'M8[ns]'

        # assigning nan must not change the dtype
        s.iloc[0] = np.nan
        assert s.dtype == 'M8[ns]'

        # GH3414 related
        expected = Series([
            datetime(2013, 1, 1),
            datetime(2013, 1, 2),
            datetime(2013, 1, 3),
        ], dtype='datetime64[ns]')

        # the int64 values divided by 1e6 and passed with dtype M8[ms] are
        # expected to equal the datetime64[ns] Series above
        result = Series(
            Series(dates).astype(np.int64) / 1000000, dtype='M8[ms]')
        tm.assert_series_equal(result, expected)

        result = Series(dates, dtype='datetime64[ns]')
        tm.assert_series_equal(result, expected)

        # a leading nan becomes NaT under an explicit datetime64[ns] dtype
        expected = Series([
            pd.NaT,
            datetime(2013, 1, 2),
            datetime(2013, 1, 3),
        ], dtype='datetime64[ns]')
        result = Series([np.nan] + dates[1:], dtype='datetime64[ns]')
        tm.assert_series_equal(result, expected)

        dts = Series(dates, dtype='datetime64[ns]')

        # valid astype
        dts.astype('int64')

        # invalid casting
        msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to"
               r" \[int32\]")
        with pytest.raises(TypeError, match=msg):
            dts.astype('int32')

        # ints are ok
        # we test with np.int64 to get similar results on
        # windows / 32-bit platforms
        result = Series(dts, dtype=np.int64)
        expected = Series(dts.astype(np.int64))
        tm.assert_series_equal(result, expected)

        # invalid dates can be held as object
        result = Series([datetime(2, 1, 1)])
        assert result[0] == datetime(2, 1, 1, 0, 0)

        result = Series([datetime(3000, 1, 1)])
        assert result[0] == datetime(3000, 1, 1, 0, 0)

        # don't mix types
        result = Series([Timestamp('20130101'), 1], index=['a', 'b'])
        assert result['a'] == Timestamp('20130101')
        assert result['b'] == 1

        # GH6529
        # coerce datetime64 non-ns properly
        dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M')
        values2 = dates.view(np.ndarray).astype('datetime64[ns]')
        expected = Series(values2, index=dates)

        for dtype in ['s', 'D', 'ms', 'us', 'ns']:
            values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
            result = Series(values1, dates)
            assert_series_equal(result, expected)

        # GH 13876
        # coerce to non-ns to object properly
        expected = Series(values2, index=dates, dtype=object)
        for dtype in ['s', 'D', 'ms', 'us', 'ns']:
            values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
            result = Series(values1, index=dates, dtype=object)
            assert_series_equal(result, expected)

        # leave datetime.date alone
        dates2 = np.array([d.date() for d in dates.to_pydatetime()],
                          dtype=object)
        series1 = Series(dates2, dates)
        tm.assert_numpy_array_equal(series1.values, dates2)
        assert series1.dtype == object

        # these will correctly infer a datetime
        s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'
        s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001'])
        assert s.dtype == 'datetime64[ns]'

        # tz-aware (UTC and other tz's)
        # GH 8411
        dr = date_range('20130101', periods=3)
        assert Series(dr).iloc[0].tz is None
        dr = date_range('20130101', periods=3, tz='UTC')
        assert str(Series(dr).iloc[0].tz) == 'UTC'
        dr = date_range('20130101', periods=3, tz='US/Eastern')
        assert str(Series(dr).iloc[0].tz) == 'US/Eastern'

        # non-convertible
        s = Series([1479596223000, -1479590, pd.NaT])
        assert s.dtype == 'object'
        assert s[2] is pd.NaT
        assert 'NaT' in str(s)

        # if we passed a NaT it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT])
        assert s.dtype == 'object'
        assert s[2] is pd.NaT
        assert 'NaT' in str(s)

        # if we passed a nan it remains
        s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan])
        assert s.dtype == 'object'
        assert s[2] is np.nan
        assert 'NaN' in str(s)

    def test_constructor_with_datetime_tz(self):
        # Builds a Series from a tz-aware date_range and checks its dtype,
        # .values export, indexing, concat, string rendering and inference
        # from Timestamp lists.

        # 8260
        # support datetime64 with tz

        dr = date_range('20130101', periods=3, tz='US/Eastern')
        s = Series(dr)
        assert s.dtype.name == 'datetime64[ns, US/Eastern]'
        assert s.dtype == 'datetime64[ns, US/Eastern]'
        assert is_datetime64tz_dtype(s.dtype)
        assert 'datetime64[ns, US/Eastern]' in str(s)

        # export
        result = s.values
        assert isinstance(result, np.ndarray)
        assert result.dtype == 'datetime64[ns]'

        # localizing the exported values as UTC and converting back to the
        # Series' tz must reproduce the original range
        exp = pd.DatetimeIndex(result)
        exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz)
        tm.assert_index_equal(dr, exp)

        # indexing
        result = s.iloc[0]
        assert result == Timestamp('2013-01-01 00:00:00-0500',
                                   tz='US/Eastern', freq='D')
        result = s[0]
        assert result == Timestamp('2013-01-01 00:00:00-0500',
                                   tz='US/Eastern', freq='D')

        # boolean-mask selection must match the equivalent slice
        result = s[Series([True, True, False], index=s.index)]
        assert_series_equal(result, s[0:2])

        result = s.iloc[0:1]
        assert_series_equal(result, Series(dr[0:1]))

        # concat
        result = pd.concat([s.iloc[0:1], s.iloc[1:]])
        assert_series_equal(result, s)

        # short str
        assert 'datetime64[ns, US/Eastern]' in str(s)

        # formatting with NaT
        result = s.shift()
        assert 'datetime64[ns, US/Eastern]' in str(result)
        assert 'NaT' in str(result)

        # long str
        t = Series(date_range('20130101', periods=1000, tz='US/Eastern'))
        assert 'datetime64[ns, US/Eastern]' in str(t)

        # round trip back to a DatetimeIndex with an inferred freq
        result = pd.DatetimeIndex(s, freq='infer')
        tm.assert_index_equal(result, dr)

        # inference
        # same tz on every element -> tz-aware dtype
        s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                    pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')])
        assert s.dtype == 'datetime64[ns, US/Pacific]'
        assert lib.infer_dtype(s, skipna=True) == 'datetime64'

        # different tz's -> object dtype
        s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                    pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')])
        assert s.dtype == 'object'
        assert lib.infer_dtype(s, skipna=True) == 'datetime'

        # with all NaT
        s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
        expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern'))
        assert_series_equal(s, expected)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        # tests all units
        # gh-19223
        # Series.astype to "<kind>[<unit>]" is compared against casting the
        # raw ndarray first and wrapping it afterwards
        target = "{}[{}]".format(dtype, unit)
        raw = np.array([1, 2, 3], dtype=arr_dtype)

        result = Series(raw).astype(target)
        expected = Series(raw.astype(target))

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        'arg', ['2013-01-01 00:00:00', pd.NaT, np.nan, None])
    def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg):
        # GH 17415: With naive string
        # constructing with a tz-aware dtype is compared against localizing
        # the naive Timestamp to CET
        tz_dtype = 'datetime64[ns, CET]'
        result = Series([arg], dtype=tz_dtype)
        localized = Series(pd.Timestamp(arg)).dt.tz_localize('CET')
        assert_series_equal(result, localized)

    def test_construction_interval(self):
        # construction from interval & array of intervals
        index = IntervalIndex.from_breaks(np.arange(3), closed='right')

        from_index = Series(index)
        # rendering is only exercised here; its output is not inspected
        repr(from_index)
        str(from_index)
        tm.assert_index_equal(Index(from_index.values), index)

        from_values = Series(index.values)
        tm.assert_index_equal(Index(from_values.values), index)

    def test_construction_consistency(self):
        # make sure that we are not re-localizing upon construction
        # GH 14928
        s = Series(pd.date_range('20130101', periods=3, tz='US/Eastern'))

        # the Series itself, its UTC-converted form and its raw values must
        # all rebuild to the same Series when its dtype is passed
        sources = (s, s.dt.tz_convert('UTC'), s.values)
        for source in sources:
            rebuilt = Series(source, dtype=s.dtype)
            tm.assert_series_equal(rebuilt, s)

    def test_constructor_infer_period(self):
        # Period scalars plus a missing value must be inferred as Period[D],
        # both when passed as a list and when passed as an object ndarray.
        data = [pd.Period('2000', 'D'), pd.Period('2001', 'D'), None]
        expected = pd.Series(period_array(data))

        # The Series is rebuilt for every container. Previously the ndarray
        # was created but never passed to the constructor, so the second
        # round of assertions only re-checked the list-based result.
        for values in (data, np.asarray(data, dtype=object)):
            result = pd.Series(values)
            tm.assert_series_equal(result, expected)
            assert result.dtype == 'Period[D]'

    def test_constructor_period_incompatible_frequency(self):
        # Periods with different frequencies stay as plain objects
        periods = [pd.Period('2000', 'D'), pd.Period('2001', 'A')]
        ser = pd.Series(periods)
        assert ser.dtype == object
        assert ser.tolist() == periods

    def test_constructor_periodindex(self):
        # GH7932
        # converting a PeriodIndex when put in a Series
        daily = period_range('20130101', periods=5, freq='D')

        result = Series(daily)
        assert result.dtype == 'Period[D]'

        from_objects = Series(daily.astype(object))
        assert_series_equal(result, from_objects)

    def test_constructor_dict(self):
        # dict data is aligned to the passed index; the label missing from
        # the dict ('d') is expected to be NaN
        labels = ['b', 'c', 'd', 'a']
        result = Series({'a': 0., 'b': 1., 'c': 2.}, index=labels)
        expected = Series([1, 2, nan, 0], index=labels)
        assert_series_equal(result, expected)

        # same alignment with a PeriodIndex: only the first two periods
        # have entries in the dict
        pidx = tm.makePeriodIndex(100)
        result = Series({pidx[0]: 0, pidx[1]: 1}, index=pidx)
        expected = Series(np.nan, pidx)
        for pos in range(2):
            expected.iloc[pos] = pos
        assert_series_equal(result, expected)

    def test_constructor_dict_order(self):
        # GH19018
        # initialization ordering: by insertion order if python>= 3.6, else
        # order by value
        result = Series({'b': 1, 'a': 0, 'c': 2})

        if PY36:
            values, labels = [1, 0, 2], 'bac'
        else:
            values, labels = [0, 1, 2], 'abc'
        expected = Series(values, index=list(labels))

        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
    def test_constructor_dict_nan_key(self, value):
        # GH 18480
        # scalar keys, including NaN-like ones, are kept as index labels
        scalar_keyed = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'}
        result = Series(scalar_keyed).sort_values()
        expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4])
        assert_series_equal(result, expected)

        # MultiIndex:
        tuple_keyed = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'}
        result = Series(tuple_keyed).sort_values()
        expected_index = Index([(1, 1), (2, np.nan), (3, value)])
        expected = Series(['a', 'b', 'c'], index=expected_index)
        assert_series_equal(result, expected)

    def test_constructor_dict_datetime64_index(self):
        # GH 9456
        # dict keys given as np.datetime64, datetime or Timestamp must all
        # produce the same Series

        dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15']
        values = [42544017.198965244, 1234565, 40512335.181958228, -1]

        key_makers = (
            np.datetime64,
            lambda x: datetime.strptime(x, '%Y-%m-%d'),
            Timestamp,
        )
        keyed_inputs = [
            dict(zip((make_key(x) for x in dates_as_str), values))
            for make_key in key_makers
        ]

        expected = Series(values, (Timestamp(x) for x in dates_as_str))

        results = [Series(data) for data in keyed_inputs]
        for result in results:
            assert_series_equal(result, expected)

    def test_constructor_list_of_tuples(self):
        # each tuple is kept as a single element
        pairs = [(1, 1), (2, 2), (2, 3)]
        ser = Series(pairs)
        assert list(ser) == pairs

    def test_constructor_tuple_of_tuples(self):
        # the inner tuples are kept as single elements
        pairs = ((1, 1), (2, 2), (2, 3))
        ser = Series(pairs)
        assert tuple(ser) == pairs

    def test_constructor_dict_of_tuples(self):
        # tuple keys are expected to produce a MultiIndex
        keys = [(1, 2), (None, 5)]
        tuple_keyed = {(1, 2): 3,
                       (None, 5): 6}

        result = Series(tuple_keyed).sort_values()
        expected = Series([3, 6], index=MultiIndex.from_tuples(keys))
        tm.assert_series_equal(result, expected)

    def test_constructor_set(self):
        # neither set nor frozenset is accepted as data
        numbers = {1, 2, 3, 4, 5}
        cases = (
            (numbers, "'set' type is unordered"),
            (frozenset(numbers), "'frozenset' type is unordered"),
        )
        for values, msg in cases:
            with pytest.raises(TypeError, match=msg):
                Series(values)

    # https://github.com/pandas-dev/pandas/issues/22698
    @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
    def test_fromDict(self):
        # the index built from the dict keys must be sorted
        from_ints = Series({'a': 0, 'b': 1, 'c': 2, 'd': 3})
        assert tm.is_sorted(from_ints.index)

        # values of mixed types end up as object dtype
        mixed = Series({'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()})
        assert mixed.dtype == np.object_

        # an int next to numeric strings, with no dtype given, is object too
        int_and_strings = Series({'a': 0, 'b': '1', 'c': '2', 'd': '3'})
        assert int_and_strings.dtype == np.object_

        # numeric strings are converted when dtype=float is requested
        as_float = Series({'a': '0', 'b': '1'}, dtype=float)
        assert as_float.dtype == np.float64

    def test_fromValue(self, datetime_series):
        # a scalar is broadcast over the fixture's index; the resulting
        # dtype depends on the scalar
        idx = datetime_series.index
        n_rows = len(datetime_series)

        nans = Series(np.NaN, index=idx)
        assert nans.dtype == np.float_
        assert len(nans) == n_rows

        strings = Series('foo', index=idx)
        assert strings.dtype == np.object_
        assert len(strings) == n_rows

        now = datetime.now()
        dates = Series(now, index=idx)
        assert dates.dtype == 'M8[ns]'
        assert len(dates) == n_rows

        # GH12336
        # Test construction of categorical series from value
        categorical = Series(0, index=idx, dtype="category")
        expected = Series(0, index=idx).astype("category")
        assert categorical.dtype == 'category'
        assert len(categorical) == n_rows
        tm.assert_series_equal(categorical, expected)

    def test_constructor_dtype_timedelta64(self):
        # Walks through Series construction with timedelta data: inference
        # from timedelta / np.timedelta64 / strings, missing-value handling,
        # integer casts, and inputs that must stay object or raise.

        # basic
        td = Series([timedelta(days=i) for i in range(3)])
        assert td.dtype == 'timedelta64[ns]'

        td = Series([timedelta(days=1)])
        assert td.dtype == 'timedelta64[ns]'

        td = Series([timedelta(days=1), timedelta(days=2), np.timedelta64(
            1, 's')])

        assert td.dtype == 'timedelta64[ns]'

        # mixed with NaT
        td = Series([timedelta(days=1), NaT], dtype='m8[ns]')
        assert td.dtype == 'timedelta64[ns]'

        td = Series([timedelta(days=1), np.nan], dtype='m8[ns]')
        assert td.dtype == 'timedelta64[ns]'

        td = Series([np.timedelta64(300000000), pd.NaT], dtype='m8[ns]')
        assert td.dtype == 'timedelta64[ns]'

        # improved inference
        # GH5689
        td = Series([np.timedelta64(300000000), NaT])
        assert td.dtype == 'timedelta64[ns]'

        # because iNaT is int, not coerced to timedelta
        td = Series([np.timedelta64(300000000), iNaT])
        assert td.dtype == 'object'

        td = Series([np.timedelta64(300000000), np.nan])
        assert td.dtype == 'timedelta64[ns]'

        td = Series([pd.NaT, np.timedelta64(300000000)])
        assert td.dtype == 'timedelta64[ns]'

        # a non-ns unit is still reported as timedelta64[ns]
        td = Series([np.timedelta64(1, 's')])
        assert td.dtype == 'timedelta64[ns]'

        # these are frequency conversion astypes
        # for t in ['s', 'D', 'us', 'ms']:
        #    with pytest.raises(TypeError):
        #        td.astype('m8[%s]' % t)

        # valid astype
        td.astype('int64')

        # invalid casting
        msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
               r" \[int32\]")
        with pytest.raises(TypeError, match=msg):
            td.astype('int32')

        # this is an invalid casting
        msg = "Could not convert object to NumPy timedelta"
        with pytest.raises(ValueError, match=msg):
            Series([timedelta(days=1), 'foo'], dtype='m8[ns]')

        # leave as object here
        td = Series([timedelta(days=i) for i in range(3)] + ['foo'])
        assert td.dtype == 'object'

        # these will correctly infer a timedelta
        s = Series([None, pd.NaT, '1 Day'])
        assert s.dtype == 'timedelta64[ns]'
        s = Series([np.nan, pd.NaT, '1 Day'])
        assert s.dtype == 'timedelta64[ns]'
        s = Series([pd.NaT, None, '1 Day'])
        assert s.dtype == 'timedelta64[ns]'
        s = Series([pd.NaT, np.nan, '1 Day'])
        assert s.dtype == 'timedelta64[ns]'

    # GH 16406
    def test_constructor_mixed_tz(self):
        # one naive and one tz-aware Timestamp: the inferred result must
        # equal an explicitly object-typed Series
        inferred = Series([Timestamp('20130101'),
                           Timestamp('20130101', tz='US/Eastern')])
        as_object = Series([Timestamp('20130101'),
                            Timestamp('20130101', tz='US/Eastern')],
                           dtype='object')
        assert_series_equal(inferred, as_object)

    def test_NaT_scalar(self):
        # iNaT under an M8[ns] dtype reads back as a missing scalar, and
        # that scalar can be assigned into another position
        ser = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')

        missing = ser[3]
        assert isna(missing)

        ser[2] = missing
        assert isna(ser[2])

    def test_NaT_cast(self):
        # GH10747
        # casting NaN to M8[ns] must give NaT
        expected = Series([NaT])
        result = Series([np.nan]).astype('M8[ns]')
        assert_series_equal(result, expected)

    def test_constructor_name_hashable(self):
        # every hashable name is stored unchanged, whatever kind of data is
        # passed
        names = [777, 777., 'name', datetime(2001, 11, 11), (1, ), "\u05D0"]
        for n in names:
            for data in ([1, 2, 3], np.ones(3), {'a': 0, 'b': 1}):
                named = Series(data, name=n)
                assert named.name == n

    def test_constructor_name_unhashable(self):
        # a list, an ndarray or a dict as name must be rejected for every
        # kind of data
        expected_msg = r"Series\.name must be a hashable type"
        for n in (['name_list'], np.ones(2), {1: 2}):
            for data in (['name_list'], np.ones(2), {1: 2}):
                with pytest.raises(TypeError, match=expected_msg):
                    Series(data, name=n)

    def test_auto_conversion(self):
        # a plain list of the range's elements is inferred as M8[ns]
        stamps = list(date_range('1/1/2000', periods=10))
        ser = Series(stamps)
        assert ser.dtype == 'M8[ns]'

    def test_convert_non_ns(self):
        # convert from a numpy array of non-ns timedelta64
        seconds = np.array([1, 2, 3], dtype='timedelta64[s]')
        result = Series(seconds)
        expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s'))
        assert_series_equal(result, expected)

        # convert from a numpy array of non-ns datetime64
        # note that creating a numpy datetime64 is in LOCAL time!!!!
        # seems to work for M8[D], but not for M8[s]

        days = np.array(['2013-01-01', '2013-01-02', '2013-01-03'],
                        dtype='datetime64[D]')
        result = Series(days)
        expected = Series(date_range('20130101', periods=3, freq='D'))
        assert_series_equal(result, expected)

        # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01
        # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]'))

        # assert_series_equal(s,date_range('20130101
        # 00:00:01',period=3,freq='s'))

    @pytest.mark.parametrize(
        "index",
        [
            date_range('1/1/2000', periods=10),
            timedelta_range('1 day', periods=10),
            period_range('2000-Q1', periods=10, freq='Q')],
        ids=lambda x: type(x).__name__)
    def test_constructor_cant_cast_datetimelike(self, index):
        # Datetime-like indexes must refuse a cast to float but accept a
        # cast to int64.

        # floats are not ok
        # Drop the "Index" suffix to convert e.g. PeriodIndex -> Period.
        # We don't care whether the error message says PeriodIndex or
        # PeriodArray.  str.rstrip is deliberately not used: it strips a
        # *set of characters* rather than a suffix, so
        # "PeriodIndex".rstrip("Index") yields "Perio".
        kind = type(index).__name__
        suffix = "Index"
        if kind.endswith(suffix):
            kind = kind[:-len(suffix)]
        msg = "Cannot cast {}.*? to ".format(kind)
        with pytest.raises(TypeError, match=msg):
            Series(index, dtype=float)

        # ints are ok
        # we test with np.int64 to get similar results on
        # windows / 32-bit platforms
        result = Series(index, dtype=np.int64)
        expected = Series(index.astype(np.int64))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "index",
        [
            date_range('1/1/2000', periods=10),
            timedelta_range('1 day', periods=10),
            period_range('2000-Q1', periods=10, freq='Q'),
        ],
        ids=lambda x: type(x).__name__)
    def test_constructor_cast_object(self, index):
        # the index itself, an object Index built from it and its
        # astype(object) form must all give the same object Series
        sources = (
            index,
            pd.Index(index, dtype=object),
            index.astype(object),
        )
        for source in sources:
            s = Series(source, dtype=object)
            exp = Series(index).astype(object)
            tm.assert_series_equal(s, exp)

    @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64])
    def test_constructor_generic_timestamp_no_frequency(self, dtype):
        # see gh-15524, gh-15987
        # a datetime64 / timedelta64 dtype without a unit is rejected
        expected_msg = "dtype has no unit. Please pass in"

        with pytest.raises(ValueError, match=expected_msg):
            Series([], dtype=dtype)

    @pytest.mark.parametrize(
        "dtype,msg",
        [
            ("m8[ps]", "cannot convert timedeltalike"),
            ("M8[ps]", "cannot convert datetimelike"),
        ])
    def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg):
        # see gh-15524, gh-15987
        # the [ps] unit is rejected for both timedelta and datetime dtypes
        empty = []
        with pytest.raises(TypeError, match=msg):
            Series(empty, dtype=dtype)

    @pytest.mark.parametrize('dtype', [None, 'uint8', 'category'])
    def test_constructor_range_dtype(self, dtype):
        # GH 16804
        # a range object is compared against the equivalent explicit list;
        # with no dtype the expected dtype is int64
        expected_dtype = dtype or 'int64'
        expected = Series([0, 1, 2, 3, 4], dtype=expected_dtype)
        result = Series(range(5), dtype=dtype)
        tm.assert_series_equal(result, expected)

    def test_constructor_tz_mixed_data(self):
        # GH 13051
        # a naive and a tz-aware Timestamp together: inference must equal
        # the explicitly object-typed construction
        naive = Timestamp('2016-05-01 02:03:37')
        aware = Timestamp('2016-04-30 19:03:37-0700', tz='US/Pacific')
        dt_list = [naive, aware]

        expected = Series(dt_list, dtype=object)
        result = Series(dt_list)
        tm.assert_series_equal(result, expected)
Ejemplo n.º 30
0
 def sdc_indexes_rename_impl(index, name):
     # Return a pd.Int64Index built from `index` with its name set to `name`.
     # NOTE(review): the enclosing scope of this fragment is not visible
     # here - confirm how it is registered/used before changing it.
     return pd.Int64Index(index, name=name)