# Trace-plot script: calls ``MCalgo`` once per element of ``inputs``, then
# draws three stacked line plots titled 'a', 'b' and 'c' and saves the figure
# to ``traceplots.png``.
#
# NOTE(review): ``inputs``, ``MCalgo``, ``a``, ``b``, ``c``, ``chain2`` and
# ``chain3`` are not defined in this snippet; they have to be provided
# elsewhere before this code can run. Only ``chain1`` is loaded here.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Zero-filled buffer with one row per input and 5,000,000 columns.
# NOTE(review): 5,000,000 columns here versus 50,000,000 index entries below;
# confirm which length is intended.
ftemp = np.zeros((len(inputs), 5000000))

# NOTE(review): the loop variable ``i`` is unused, so ``MCalgo`` receives the
# whole ``inputs`` collection on every iteration; confirm whether
# ``MCalgo(i)`` was intended.
for i in inputs:
    MCalgo(inputs)

# ``pd.Int64Index`` was deprecated in pandas 1.4 and removed in pandas 2.0.
# ``pd.RangeIndex`` holds the same values 0..49,999,999 without materialising
# them, and exists in old and new pandas alike.
idx = pd.RangeIndex(50000000)
chain1 = pd.read_csv('chain1.csv', header=None)

# Frame holding the three series on the shared integer index.
traceplot_df = pd.DataFrame(index=idx)
traceplot_df['a'] = a
traceplot_df['b'] = b
traceplot_df['c'] = c

# Default width (6.4 in), taller than default so three panels stack vertically.
plt.figure(figsize=[6.4, 10])

plt.subplot(3, 1, 1)
plt.plot(idx, chain1, color='tab:blue', linewidth=0.3)
plt.title('a')

plt.subplot(3, 1, 2)
plt.plot(idx, chain2, color='tab:blue', linewidth=0.3)
plt.title('b')

plt.subplot(3, 1, 3)
plt.plot(idx, chain3, color='tab:blue', linewidth=0.3)
plt.title('c')

plt.savefig("traceplots.png")
def test_insert_index_int64(self, insert, coerced_val, coerced_dtype): obj = pd.Int64Index([1, 2, 3, 4]) assert obj.dtype == np.int64 exp = pd.Index([1, coerced_val, 2, 3, 4]) self._assert_insert_conversion(obj, insert, exp, coerced_dtype)
class TestSeriesConstructors: @pytest.mark.parametrize( "constructor,check_index_type", [ # NOTE: some overlap with test_constructor_empty but that test does not # test for None or an empty generator. # test_constructor_pass_none tests None but only with the index also # passed. (lambda: Series(), True), (lambda: Series(None), True), (lambda: Series({}), True), (lambda: Series(()), False), # creates a RangeIndex (lambda: Series([]), False), # creates a RangeIndex (lambda: Series((_ for _ in [])), False), # creates a RangeIndex (lambda: Series(data=None), True), (lambda: Series(data={}), True), (lambda: Series(data=()), False), # creates a RangeIndex (lambda: Series(data=[]), False), # creates a RangeIndex (lambda: Series(data=(_ for _ in [])), False), # creates a RangeIndex ], ) def test_empty_constructor(self, constructor, check_index_type): expected = Series() result = constructor() assert len(result.index) == 0 tm.assert_series_equal(result, expected, check_index_type=check_index_type) def test_invalid_dtype(self): # GH15520 msg = "not understood" invalid_list = [pd.Timestamp, "pd.Timestamp", list] for dtype in invalid_list: with pytest.raises(TypeError, match=msg): Series([], name="time", dtype=dtype) def test_scalar_conversion(self): # Pass in scalar is disabled scalar = Series(0.5) assert not isinstance(scalar, float) # Coercion assert float(Series([1.0])) == 1.0 assert int(Series([1.0])) == 1 def test_constructor(self, datetime_series): empty_series = Series() assert datetime_series.index.is_all_dates # Pass in Series derived = Series(datetime_series) assert derived.index.is_all_dates assert tm.equalContents(derived.index, datetime_series.index) # Ensure new index is not created assert id(datetime_series.index) == id(derived.index) # Mixed type Series mixed = Series(["hello", np.NaN], index=[0, 1]) assert mixed.dtype == np.object_ assert mixed[1] is np.NaN assert not empty_series.index.is_all_dates assert not Series().index.is_all_dates # exception raised is 
of type Exception with pytest.raises(Exception, match="Data must be 1-dimensional"): Series(np.random.randn(3, 3), index=np.arange(3)) mixed.name = "Series" rs = Series(mixed).name xp = "Series" assert rs == xp # raise on MultiIndex GH4187 m = MultiIndex.from_arrays([[1, 2], [3, 4]]) msg = "initializing a Series from a MultiIndex is not supported" with pytest.raises(NotImplementedError, match=msg): Series(m) @pytest.mark.parametrize("input_class", [list, dict, OrderedDict]) def test_constructor_empty(self, input_class): empty = Series() empty2 = Series(input_class()) # these are Index() and RangeIndex() which don't compare type equal # but are just .equals tm.assert_series_equal(empty, empty2, check_index_type=False) # With explicit dtype: empty = Series(dtype="float64") empty2 = Series(input_class(), dtype="float64") tm.assert_series_equal(empty, empty2, check_index_type=False) # GH 18515 : with dtype=category: empty = Series(dtype="category") empty2 = Series(input_class(), dtype="category") tm.assert_series_equal(empty, empty2, check_index_type=False) if input_class is not list: # With index: empty = Series(index=range(10)) empty2 = Series(input_class(), index=range(10)) tm.assert_series_equal(empty, empty2) # With index and dtype float64: empty = Series(np.nan, index=range(10)) empty2 = Series(input_class(), index=range(10), dtype="float64") tm.assert_series_equal(empty, empty2) # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) @pytest.mark.parametrize("input_arg", [np.nan, float("nan")]) def test_constructor_nan(self, input_arg): empty = Series(dtype="float64", index=range(10)) empty2 = Series(input_arg, index=range(10)) tm.assert_series_equal(empty, empty2, check_index_type=False) @pytest.mark.parametrize( "dtype", [ "f8", "i8", "M8[ns]", "m8[ns]", "category", "object", "datetime64[ns, UTC]" ], ) @pytest.mark.parametrize("index", [None, 
pd.Index([])]) def test_constructor_dtype_only(self, dtype, index): # GH-20865 result = pd.Series(dtype=dtype, index=index) assert result.dtype == dtype assert len(result) == 0 def test_constructor_no_data_index_order(self): result = pd.Series(index=["b", "a", "c"]) assert result.index.tolist() == ["b", "a", "c"] def test_constructor_no_data_string_type(self): # GH 22477 result = pd.Series(index=[1], dtype=str) assert np.isnan(result.iloc[0]) @pytest.mark.parametrize("item", ["entry", "ѐ", 13]) def test_constructor_string_element_string_type(self, item): # GH 22477 result = pd.Series(item, index=[1], dtype=str) assert result.iloc[0] == str(item) def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 ser = Series(["x", None], dtype=string_dtype) result = ser.isna() expected = Series([False, True]) tm.assert_series_equal(result, expected) assert ser.iloc[1] is None ser = Series(["x", np.nan], dtype=string_dtype) assert np.isnan(ser.iloc[1]) def test_constructor_series(self): index1 = ["d", "b", "a", "c"] index2 = sorted(index1) s1 = Series([4, 7, -5, 3], index=index1) s2 = Series(s1, index=index2) tm.assert_series_equal(s2, s1.sort_index()) def test_constructor_iterable(self): # GH 21987 class Iter: def __iter__(self): for i in range(10): yield i expected = Series(list(range(10)), dtype="int64") result = Series(Iter(), dtype="int64") tm.assert_series_equal(result, expected) def test_constructor_sequence(self): # GH 21987 expected = Series(list(range(10)), dtype="int64") result = Series(range(10), dtype="int64") tm.assert_series_equal(result, expected) def test_constructor_single_str(self): # GH 21987 expected = Series(["abc"]) result = Series("abc") tm.assert_series_equal(result, expected) def test_constructor_list_like(self): # make sure that we are coercing different # list-likes to standard dtypes and not # platform specific expected = Series([1, 2, 3], dtype="int64") for obj in [[1, 2, 3], (1, 2, 3), 
np.array([1, 2, 3], dtype="int64")]: result = Series(obj, index=[0, 1, 2]) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", ["bool", "int32", "int64", "float64"]) def test_constructor_index_dtype(self, dtype): # GH 17088 s = Series(Index([0, 2, 4]), dtype=dtype) assert s.dtype == dtype @pytest.mark.parametrize( "input_vals", [ ([1, 2]), (["1", "2"]), (list(pd.date_range("1/1/2011", periods=2, freq="H"))), (list( pd.date_range("1/1/2011", periods=2, freq="H", tz="US/Eastern"))), ([pd.Interval(left=0, right=5)]), ], ) def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements from a list are converted to strings # when dtype is str, 'str', or 'U' result = Series(input_vals, dtype=string_dtype) expected = Series(input_vals).astype(string_dtype) tm.assert_series_equal(result, expected) def test_constructor_list_str_na(self, string_dtype): result = Series([1.0, 2.0, np.nan], dtype=string_dtype) expected = Series(["1.0", "2.0", np.nan], dtype=object) tm.assert_series_equal(result, expected) assert np.isnan(result[2]) def test_constructor_generator(self): gen = (i for i in range(10)) result = Series(gen) exp = Series(range(10)) tm.assert_series_equal(result, exp) gen = (i for i in range(10)) result = Series(gen, index=range(10, 20)) exp.index = range(10, 20) tm.assert_series_equal(result, exp) def test_constructor_map(self): # GH8909 m = map(lambda x: x, range(10)) result = Series(m) exp = Series(range(10)) tm.assert_series_equal(result, exp) m = map(lambda x: x, range(10)) result = Series(m, index=range(10, 20)) exp.index = range(10, 20) tm.assert_series_equal(result, exp) def test_constructor_categorical(self): cat = pd.Categorical([0, 1, 2, 0, 1, 2], ["a", "b", "c"], fastpath=True) res = Series(cat) tm.assert_categorical_equal(res.values, cat) # can cast to a new dtype result = Series(pd.Categorical([1, 2, 3]), dtype="int64") expected = pd.Series([1, 2, 3], dtype="int64") 
tm.assert_series_equal(result, expected) # GH12574 cat = Series(pd.Categorical([1, 2, 3]), dtype="category") assert is_categorical_dtype(cat) assert is_categorical_dtype(cat.dtype) s = Series([1, 2, 3], dtype="category") assert is_categorical_dtype(s) assert is_categorical_dtype(s.dtype) def test_constructor_categorical_with_coercion(self): factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]) # test basic creation / coercion of categoricals s = Series(factor, name="A") assert s.dtype == "category" assert len(s) == len(factor) str(s.values) str(s) # in a frame df = DataFrame({"A": factor}) result = df["A"] tm.assert_series_equal(result, s) result = df.iloc[:, 0] tm.assert_series_equal(result, s) assert len(df) == len(factor) str(df.values) str(df) df = DataFrame({"A": s}) result = df["A"] tm.assert_series_equal(result, s) assert len(df) == len(factor) str(df.values) str(df) # multiples df = DataFrame({"A": s, "B": s, "C": 1}) result1 = df["A"] result2 = df["B"] tm.assert_series_equal(result1, s) tm.assert_series_equal(result2, s, check_names=False) assert result2.name == "B" assert len(df) == len(factor) str(df.values) str(df) # GH8623 x = DataFrame( [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. 
Doe"]], columns=["person_id", "person_name"], ) x["person_name"] = Categorical( x.person_name) # doing this breaks transform expected = x.iloc[0].person_name result = x.person_name.iloc[0] assert result == expected result = x.person_name[0] assert result == expected result = x.person_name.loc[0] assert result == expected def test_constructor_categorical_dtype(self): result = pd.Series(["a", "b"], dtype=CategoricalDtype(["a", "b", "c"], ordered=True)) assert is_categorical_dtype(result) is True tm.assert_index_equal(result.cat.categories, pd.Index(["a", "b", "c"])) assert result.cat.ordered result = pd.Series(["a", "b"], dtype=CategoricalDtype(["b", "a"])) assert is_categorical_dtype(result) tm.assert_index_equal(result.cat.categories, pd.Index(["b", "a"])) assert result.cat.ordered is False # GH 19565 - Check broadcasting of scalar with Categorical dtype result = Series("a", index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True)) expected = Series(["a", "a"], index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True)) tm.assert_series_equal(result, expected, check_categorical=True) def test_constructor_categorical_string(self): # GH 26336: the string 'category' maintains existing CategoricalDtype cdt = CategoricalDtype(categories=list("dabc"), ordered=True) expected = Series(list("abcabc"), dtype=cdt) # Series(Categorical, dtype='category') keeps existing dtype cat = Categorical(list("abcabc"), dtype=cdt) result = Series(cat, dtype="category") tm.assert_series_equal(result, expected) # Series(Series[Categorical], dtype='category') keeps existing dtype result = Series(result, dtype="category") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("none, warning", [(None, None), (ordered_sentinel, FutureWarning)]) def test_categorical_ordered_none_deprecated(self, none, warning): # GH 26336: only warn if None is not explicitly passed cdt1 = CategoricalDtype(categories=list("cdab"), ordered=True) cdt2 = 
CategoricalDtype(categories=list("cedafb"), ordered=none) cat = Categorical(list("abcdaba"), dtype=cdt1) with tm.assert_produces_warning(warning, check_stacklevel=False): Series(cat, dtype=cdt2) s = Series(cat) with tm.assert_produces_warning(warning, check_stacklevel=False): Series(s, dtype=cdt2) def test_categorical_sideeffects_free(self): # Passing a categorical to a Series and then changing values in either # the series or the categorical should not change the values in the # other one, IF you specify copy! cat = Categorical(["a", "b", "c", "a"]) s = Series(cat, copy=True) assert s.cat is not cat s.cat.categories = [1, 2, 3] exp_s = np.array([1, 2, 3, 1], dtype=np.int64) exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_) tm.assert_numpy_array_equal(s.__array__(), exp_s) tm.assert_numpy_array_equal(cat.__array__(), exp_cat) # setting s[0] = 2 exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64) tm.assert_numpy_array_equal(s.__array__(), exp_s2) tm.assert_numpy_array_equal(cat.__array__(), exp_cat) # however, copy is False by default # so this WILL change values cat = Categorical(["a", "b", "c", "a"]) s = Series(cat) assert s.values is cat s.cat.categories = [1, 2, 3] exp_s = np.array([1, 2, 3, 1], dtype=np.int64) tm.assert_numpy_array_equal(s.__array__(), exp_s) tm.assert_numpy_array_equal(cat.__array__(), exp_s) s[0] = 2 exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64) tm.assert_numpy_array_equal(s.__array__(), exp_s2) tm.assert_numpy_array_equal(cat.__array__(), exp_s2) def test_unordered_compare_equal(self): left = pd.Series(["a", "b", "c"], dtype=CategoricalDtype(["a", "b"])) right = pd.Series( pd.Categorical(["a", "b", np.nan], categories=["a", "b"])) tm.assert_series_equal(left, right) def test_constructor_maskedarray(self): data = ma.masked_all((3, ), dtype=float) result = Series(data) expected = Series([np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) data[0] = 0.0 data[2] = 2.0 index = ["a", "b", "c"] result = Series(data, 
index=index) expected = Series([0.0, np.nan, 2.0], index=index) tm.assert_series_equal(result, expected) data[1] = 1.0 result = Series(data, index=index) expected = Series([0.0, 1.0, 2.0], index=index) tm.assert_series_equal(result, expected) data = ma.masked_all((3, ), dtype=int) result = Series(data) expected = Series([np.nan, np.nan, np.nan], dtype=float) tm.assert_series_equal(result, expected) data[0] = 0 data[2] = 2 index = ["a", "b", "c"] result = Series(data, index=index) expected = Series([0, np.nan, 2], index=index, dtype=float) tm.assert_series_equal(result, expected) data[1] = 1 result = Series(data, index=index) expected = Series([0, 1, 2], index=index, dtype=int) tm.assert_series_equal(result, expected) data = ma.masked_all((3, ), dtype=bool) result = Series(data) expected = Series([np.nan, np.nan, np.nan], dtype=object) tm.assert_series_equal(result, expected) data[0] = True data[2] = False index = ["a", "b", "c"] result = Series(data, index=index) expected = Series([True, np.nan, False], index=index, dtype=object) tm.assert_series_equal(result, expected) data[1] = True result = Series(data, index=index) expected = Series([True, True, False], index=index, dtype=bool) tm.assert_series_equal(result, expected) data = ma.masked_all((3, ), dtype="M8[ns]") result = Series(data) expected = Series([iNaT, iNaT, iNaT], dtype="M8[ns]") tm.assert_series_equal(result, expected) data[0] = datetime(2001, 1, 1) data[2] = datetime(2001, 1, 3) index = ["a", "b", "c"] result = Series(data, index=index) expected = Series( [datetime(2001, 1, 1), iNaT, datetime(2001, 1, 3)], index=index, dtype="M8[ns]", ) tm.assert_series_equal(result, expected) data[1] = datetime(2001, 1, 2) result = Series(data, index=index) expected = Series( [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 3)], index=index, dtype="M8[ns]", ) tm.assert_series_equal(result, expected) def test_constructor_maskedarray_hardened(self): # Check numpy masked arrays with hard masks -- from 
GH24574 data = ma.masked_all((3, ), dtype=float).harden_mask() result = pd.Series(data) expected = pd.Series([np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) def test_series_ctor_plus_datetimeindex(self): rng = date_range("20090415", "20090519", freq="B") data = {k: 1 for k in rng} result = Series(data, index=rng) assert result.index is rng def test_constructor_default_index(self): s = Series([0, 1, 2]) tm.assert_index_equal(s.index, pd.Index(np.arange(3))) @pytest.mark.parametrize( "input", [ [1, 2, 3], (1, 2, 3), list(range(3)), pd.Categorical(["a", "b", "a"]), (i for i in range(3)), map(lambda x: x, range(3)), ], ) def test_constructor_index_mismatch(self, input): # GH 19342 # test that construction of a Series with an index of different length # raises an error msg = "Length of passed values is 3, index implies 4" with pytest.raises(ValueError, match=msg): Series(input, index=np.arange(4)) def test_constructor_numpy_scalar(self): # GH 19342 # construction with a numpy scalar # should not raise result = Series(np.array(100), index=np.arange(4), dtype="int64") expected = Series(100, index=np.arange(4), dtype="int64") tm.assert_series_equal(result, expected) def test_constructor_broadcast_list(self): # GH 19342 # construction with single-element container and index # should raise msg = "Length of passed values is 1, index implies 3" with pytest.raises(ValueError, match=msg): Series(["foo"], index=["a", "b", "c"]) def test_constructor_corner(self): df = tm.makeTimeDataFrame() objs = [df, df] s = Series(objs, index=[0, 1]) assert isinstance(s, Series) def test_constructor_sanitize(self): s = Series(np.array([1.0, 1.0, 8.0]), dtype="i8") assert s.dtype == np.dtype("i8") s = Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8") assert s.dtype == np.dtype("f8") def test_constructor_copy(self): # GH15125 # test dtype parameter has no side effects on copy=True for data in [[1.0], np.array([1.0])]: x = Series(data) y = pd.Series(x, copy=True, 
dtype=float) # copy=True maintains original data in Series tm.assert_series_equal(x, y) # changes to origin of copy does not affect the copy x[0] = 2.0 assert not x.equals(y) assert x[0] == 2.0 assert y[0] == 1.0 @pytest.mark.parametrize( "index", [ pd.date_range("20170101", periods=3, tz="US/Eastern"), pd.date_range("20170101", periods=3), pd.timedelta_range("1 day", periods=3), pd.period_range("2012Q1", periods=3, freq="Q"), pd.Index(list("abc")), pd.Int64Index([1, 2, 3]), pd.RangeIndex(0, 3), ], ids=lambda x: type(x).__name__, ) def test_constructor_limit_copies(self, index): # GH 17449 # limit copies of input s = pd.Series(index) # we make 1 copy; this is just a smoke test here assert s._data.blocks[0].values is not index def test_constructor_pass_none(self): s = Series(None, index=range(5)) assert s.dtype == np.float64 s = Series(None, index=range(5), dtype=object) assert s.dtype == np.object_ # GH 7431 # inference on the index s = Series(index=np.array([None])) expected = Series(index=Index([None])) tm.assert_series_equal(s, expected) def test_constructor_pass_nan_nat(self): # GH 13467 exp = Series([np.nan, np.nan], dtype=np.float64) assert exp.dtype == np.float64 tm.assert_series_equal(Series([np.nan, np.nan]), exp) tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([pd.NaT, pd.NaT]) assert exp.dtype == "datetime64[ns]" tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp) tm.assert_series_equal(Series([pd.NaT, np.nan]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp) tm.assert_series_equal(Series([np.nan, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp) def test_constructor_cast(self): msg = "could not convert string to float" with pytest.raises(ValueError, match=msg): Series(["a", "b", "c"], dtype=float) def test_constructor_unsigned_dtype_overflow(self, uint_dtype): # see gh-15832 msg = "Trying to coerce negative 
values to unsigned integers" with pytest.raises(OverflowError, match=msg): Series([-1], dtype=uint_dtype) def test_constructor_coerce_float_fail(self, any_int_dtype): # see gh-15832 msg = "Trying to coerce float values to integers" with pytest.raises(ValueError, match=msg): Series([1, 2, 3.5], dtype=any_int_dtype) def test_constructor_coerce_float_valid(self, float_dtype): s = Series([1, 2, 3.5], dtype=float_dtype) expected = Series([1, 2, 3.5]).astype(float_dtype) tm.assert_series_equal(s, expected) def test_constructor_dtype_no_cast(self): # see gh-1572 s = Series([1, 2, 3]) s2 = Series(s, dtype=np.int64) s2[1] = 5 assert s[1] == 5 def test_constructor_datelike_coercion(self): # GH 9477 # incorrectly inferring on dateimelike looking when object dtype is # specified s = Series([Timestamp("20130101"), "NOV"], dtype=object) assert s.iloc[0] == Timestamp("20130101") assert s.iloc[1] == "NOV" assert s.dtype == object # the dtype was being reset on the slicing and re-inferred to datetime # even thought the blocks are mixed belly = "216 3T19".split() wing1 = "2T15 4H19".split() wing2 = "416 4T20".split() mat = pd.to_datetime("2016-01-22 2019-09-07".split()) df = pd.DataFrame({ "wing1": wing1, "wing2": wing2, "mat": mat }, index=belly) result = df.loc["3T19"] assert result.dtype == object result = df.loc["216"] assert result.dtype == object def test_constructor_datetimes_with_nulls(self): # gh-15869 for arr in [ np.array([None, None, None, None, datetime.now(), None]), np.array([None, None, datetime.now(), None]), ]: result = Series(arr) assert result.dtype == "M8[ns]" def test_constructor_dtype_datetime64(self): s = Series(iNaT, dtype="M8[ns]", index=range(5)) assert isna(s).all() # in theory this should be all nulls, but since # we are not specifying a dtype is ambiguous s = Series(iNaT, index=range(5)) assert not isna(s).all() s = Series(np.nan, dtype="M8[ns]", index=range(5)) assert isna(s).all() s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype="M8[ns]") assert 
isna(s[1]) assert s.dtype == "M8[ns]" s = Series([datetime(2001, 1, 2, 0, 0), np.nan], dtype="M8[ns]") assert isna(s[1]) assert s.dtype == "M8[ns]" # GH3416 dates = [ np.datetime64(datetime(2013, 1, 1)), np.datetime64(datetime(2013, 1, 2)), np.datetime64(datetime(2013, 1, 3)), ] s = Series(dates) assert s.dtype == "M8[ns]" s.iloc[0] = np.nan assert s.dtype == "M8[ns]" # GH3414 related expected = Series( [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3)], dtype="datetime64[ns]", ) result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") tm.assert_series_equal(result, expected) result = Series(dates, dtype="datetime64[ns]") tm.assert_series_equal(result, expected) expected = Series( [pd.NaT, datetime(2013, 1, 2), datetime(2013, 1, 3)], dtype="datetime64[ns]") result = Series([np.nan] + dates[1:], dtype="datetime64[ns]") tm.assert_series_equal(result, expected) dts = Series(dates, dtype="datetime64[ns]") # valid astype dts.astype("int64") # invalid casting msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" r" \[int32\]" with pytest.raises(TypeError, match=msg): dts.astype("int32") # ints are ok # we test with np.int64 to get similar results on # windows / 32-bit platforms result = Series(dts, dtype=np.int64) expected = Series(dts.astype(np.int64)) tm.assert_series_equal(result, expected) # invalid dates can be help as object result = Series([datetime(2, 1, 1)]) assert result[0] == datetime(2, 1, 1, 0, 0) result = Series([datetime(3000, 1, 1)]) assert result[0] == datetime(3000, 1, 1, 0, 0) # don't mix types result = Series([Timestamp("20130101"), 1], index=["a", "b"]) assert result["a"] == Timestamp("20130101") assert result["b"] == 1 # GH6529 # coerce datetime64 non-ns properly dates = date_range("01-Jan-2015", "01-Dec-2015", freq="M") values2 = dates.view(np.ndarray).astype("datetime64[ns]") expected = Series(values2, index=dates) for dtype in ["s", "D", "ms", "us", "ns"]: values1 = 
dates.view(np.ndarray).astype("M8[{0}]".format(dtype)) result = Series(values1, dates) tm.assert_series_equal(result, expected) # GH 13876 # coerce to non-ns to object properly expected = Series(values2, index=dates, dtype=object) for dtype in ["s", "D", "ms", "us", "ns"]: values1 = dates.view(np.ndarray).astype("M8[{0}]".format(dtype)) result = Series(values1, index=dates, dtype=object) tm.assert_series_equal(result, expected) # leave datetime.date alone dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object) series1 = Series(dates2, dates) tm.assert_numpy_array_equal(series1.values, dates2) assert series1.dtype == object # these will correctly infer a datetime s = Series([None, pd.NaT, "2013-08-05 15:30:00.000001"]) assert s.dtype == "datetime64[ns]" s = Series([np.nan, pd.NaT, "2013-08-05 15:30:00.000001"]) assert s.dtype == "datetime64[ns]" s = Series([pd.NaT, None, "2013-08-05 15:30:00.000001"]) assert s.dtype == "datetime64[ns]" s = Series([pd.NaT, np.nan, "2013-08-05 15:30:00.000001"]) assert s.dtype == "datetime64[ns]" # tz-aware (UTC and other tz's) # GH 8411 dr = date_range("20130101", periods=3) assert Series(dr).iloc[0].tz is None dr = date_range("20130101", periods=3, tz="UTC") assert str(Series(dr).iloc[0].tz) == "UTC" dr = date_range("20130101", periods=3, tz="US/Eastern") assert str(Series(dr).iloc[0].tz) == "US/Eastern" # non-convertible s = Series([1479596223000, -1479590, pd.NaT]) assert s.dtype == "object" assert s[2] is pd.NaT assert "NaT" in str(s) # if we passed a NaT it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT]) assert s.dtype == "object" assert s[2] is pd.NaT assert "NaT" in str(s) # if we passed a nan it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) assert s.dtype == "object" assert s[2] is np.nan assert "NaN" in str(s) def test_constructor_with_datetime_tz(self): # 8260 # support datetime64 with tz dr = date_range("20130101", periods=3, tz="US/Eastern") s = Series(dr) 
assert s.dtype.name == "datetime64[ns, US/Eastern]" assert s.dtype == "datetime64[ns, US/Eastern]" assert is_datetime64tz_dtype(s.dtype) assert "datetime64[ns, US/Eastern]" in str(s) # export result = s.values assert isinstance(result, np.ndarray) assert result.dtype == "datetime64[ns]" exp = pd.DatetimeIndex(result) exp = exp.tz_localize("UTC").tz_convert(tz=s.dt.tz) tm.assert_index_equal(dr, exp) # indexing result = s.iloc[0] assert result == Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern", freq="D") result = s[0] assert result == Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern", freq="D") result = s[Series([True, True, False], index=s.index)] tm.assert_series_equal(result, s[0:2]) result = s.iloc[0:1] tm.assert_series_equal(result, Series(dr[0:1])) # concat result = pd.concat([s.iloc[0:1], s.iloc[1:]]) tm.assert_series_equal(result, s) # short str assert "datetime64[ns, US/Eastern]" in str(s) # formatting with NaT result = s.shift() assert "datetime64[ns, US/Eastern]" in str(result) assert "NaT" in str(result) # long str t = Series(date_range("20130101", periods=1000, tz="US/Eastern")) assert "datetime64[ns, US/Eastern]" in str(t) result = pd.DatetimeIndex(s, freq="infer") tm.assert_index_equal(result, dr) # inference s = Series([ pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), ]) assert s.dtype == "datetime64[ns, US/Pacific]" assert lib.infer_dtype(s, skipna=True) == "datetime64" s = Series([ pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Eastern"), ]) assert s.dtype == "object" assert lib.infer_dtype(s, skipna=True) == "datetime" # with all NaT s = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") expected = Series(pd.DatetimeIndex(["NaT", "NaT"], tz="US/Eastern")) tm.assert_series_equal(s, expected) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @pytest.mark.parametrize("dtype", ["M8", 
"m8"]) @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units # gh-19223 dtype = "{}[{}]".format(dtype, unit) arr = np.array([1, 2, 3], dtype=arr_dtype) s = Series(arr) result = s.astype(dtype) expected = Series(arr.astype(dtype)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("arg", ["2013-01-01 00:00:00", pd.NaT, np.nan, None]) def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string result = Series([arg], dtype="datetime64[ns, CET]") expected = Series(pd.Timestamp(arg)).dt.tz_localize("CET") tm.assert_series_equal(result, expected) def test_construction_interval(self): # construction from interval & array of intervals index = IntervalIndex.from_breaks(np.arange(3), closed="right") result = Series(index) repr(result) str(result) tm.assert_index_equal(Index(result.values), index) result = Series(index.values) tm.assert_index_equal(Index(result.values), index) def test_construction_consistency(self): # make sure that we are not re-localizing upon construction # GH 14928 s = Series(pd.date_range("20130101", periods=3, tz="US/Eastern")) result = Series(s, dtype=s.dtype) tm.assert_series_equal(result, s) result = Series(s.dt.tz_convert("UTC"), dtype=s.dtype) tm.assert_series_equal(result, s) result = Series(s.values, dtype=s.dtype) tm.assert_series_equal(result, s) def test_constructor_infer_period(self): data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None] result = pd.Series(data) expected = pd.Series(period_array(data)) tm.assert_series_equal(result, expected) assert result.dtype == "Period[D]" data = np.asarray(data, dtype=object) tm.assert_series_equal(result, expected) assert result.dtype == "Period[D]" def test_constructor_period_incompatible_frequency(self): data = [pd.Period("2000", "D"), pd.Period("2001", "A")] result = pd.Series(data) assert result.dtype == object assert 
result.tolist() == data def test_constructor_periodindex(self): # GH7932 # converting a PeriodIndex when put in a Series pi = period_range("20130101", periods=5, freq="D") s = Series(pi) assert s.dtype == "Period[D]" expected = Series(pi.astype(object)) tm.assert_series_equal(s, expected) def test_constructor_dict(self): d = {"a": 0.0, "b": 1.0, "c": 2.0} result = Series(d, index=["b", "c", "d", "a"]) expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) tm.assert_series_equal(result, expected) pidx = tm.makePeriodIndex(100) d = {pidx[0]: 0, pidx[1]: 1} result = Series(d, index=pidx) expected = Series(np.nan, pidx) expected.iloc[0] = 0 expected.iloc[1] = 1 tm.assert_series_equal(result, expected) def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value d = {"b": 1, "a": 0, "c": 2} result = Series(d) if PY36: expected = Series([1, 0, 2], index=list("bac")) else: expected = Series([0, 1, 2], index=list("abc")) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) def test_constructor_dict_nan_key(self, value): # GH 18480 d = {1: "a", value: "b", float("nan"): "c", 4: "d"} result = Series(d).sort_values() expected = Series(["a", "b", "c", "d"], index=[1, value, np.nan, 4]) tm.assert_series_equal(result, expected) # MultiIndex: d = {(1, 1): "a", (2, np.nan): "b", (3, value): "c"} result = Series(d).sort_values() expected = Series(["a", "b", "c"], index=Index([(1, 1), (2, np.nan), (3, value)])) tm.assert_series_equal(result, expected) def test_constructor_dict_datetime64_index(self): # GH 9456 dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"] values = [42544017.198965244, 1234565, 40512335.181958228, -1] def create_data(constructor): return dict(zip((constructor(x) for x in dates_as_str), values)) data_datetime64 = create_data(np.datetime64) data_datetime = create_data(lambda x: datetime.strptime(x, 
"%Y-%m-%d")) data_Timestamp = create_data(Timestamp) expected = Series(values, (Timestamp(x) for x in dates_as_str)) result_datetime64 = Series(data_datetime64) result_datetime = Series(data_datetime) result_Timestamp = Series(data_Timestamp) tm.assert_series_equal(result_datetime64, expected) tm.assert_series_equal(result_datetime, expected) tm.assert_series_equal(result_Timestamp, expected) def test_constructor_list_of_tuples(self): data = [(1, 1), (2, 2), (2, 3)] s = Series(data) assert list(s) == data def test_constructor_tuple_of_tuples(self): data = ((1, 1), (2, 2), (2, 3)) s = Series(data) assert tuple(s) == data def test_constructor_dict_of_tuples(self): data = {(1, 2): 3, (None, 5): 6} result = Series(data).sort_values() expected = Series([3, 6], index=MultiIndex.from_tuples([(1, 2), (None, 5)])) tm.assert_series_equal(result, expected) def test_constructor_set(self): values = {1, 2, 3, 4, 5} with pytest.raises(TypeError, match="'set' type is unordered"): Series(values) values = frozenset(values) with pytest.raises(TypeError, match="'frozenset' type is unordered"): Series(values) # https://github.com/pandas-dev/pandas/issues/22698 @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning") def test_fromDict(self): data = {"a": 0, "b": 1, "c": 2, "d": 3} series = Series(data) tm.assert_is_sorted(series.index) data = {"a": 0, "b": "1", "c": "2", "d": datetime.now()} series = Series(data) assert series.dtype == np.object_ data = {"a": 0, "b": "1", "c": "2", "d": "3"} series = Series(data) assert series.dtype == np.object_ data = {"a": "0", "b": "1"} series = Series(data, dtype=float) assert series.dtype == np.float64 def test_fromValue(self, datetime_series): nans = Series(np.NaN, index=datetime_series.index) assert nans.dtype == np.float_ assert len(nans) == len(datetime_series) strings = Series("foo", index=datetime_series.index) assert strings.dtype == np.object_ assert len(strings) == len(datetime_series) d = datetime.now() dates = 
Series(d, index=datetime_series.index) assert dates.dtype == "M8[ns]" assert len(dates) == len(datetime_series) # GH12336 # Test construction of categorical series from value categorical = Series(0, index=datetime_series.index, dtype="category") expected = Series(0, index=datetime_series.index).astype("category") assert categorical.dtype == "category" assert len(categorical) == len(datetime_series) tm.assert_series_equal(categorical, expected) def test_constructor_dtype_timedelta64(self): # basic td = Series([timedelta(days=i) for i in range(3)]) assert td.dtype == "timedelta64[ns]" td = Series([timedelta(days=1)]) assert td.dtype == "timedelta64[ns]" td = Series( [timedelta(days=1), timedelta(days=2), np.timedelta64(1, "s")]) assert td.dtype == "timedelta64[ns]" # mixed with NaT td = Series([timedelta(days=1), NaT], dtype="m8[ns]") assert td.dtype == "timedelta64[ns]" td = Series([timedelta(days=1), np.nan], dtype="m8[ns]") assert td.dtype == "timedelta64[ns]" td = Series([np.timedelta64(300000000), pd.NaT], dtype="m8[ns]") assert td.dtype == "timedelta64[ns]" # improved inference # GH5689 td = Series([np.timedelta64(300000000), NaT]) assert td.dtype == "timedelta64[ns]" # because iNaT is int, not coerced to timedelta td = Series([np.timedelta64(300000000), iNaT]) assert td.dtype == "object" td = Series([np.timedelta64(300000000), np.nan]) assert td.dtype == "timedelta64[ns]" td = Series([pd.NaT, np.timedelta64(300000000)]) assert td.dtype == "timedelta64[ns]" td = Series([np.timedelta64(1, "s")]) assert td.dtype == "timedelta64[ns]" # these are frequency conversion astypes # for t in ['s', 'D', 'us', 'ms']: # with pytest.raises(TypeError): # td.astype('m8[%s]' % t) # valid astype td.astype("int64") # invalid casting msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" r" \[int32\]" with pytest.raises(TypeError, match=msg): td.astype("int32") # this is an invalid casting msg = "Could not convert object to NumPy timedelta" with pytest.raises(ValueError, 
match=msg): Series([timedelta(days=1), "foo"], dtype="m8[ns]") # leave as object here td = Series([timedelta(days=i) for i in range(3)] + ["foo"]) assert td.dtype == "object" # these will correctly infer a timedelta s = Series([None, pd.NaT, "1 Day"]) assert s.dtype == "timedelta64[ns]" s = Series([np.nan, pd.NaT, "1 Day"]) assert s.dtype == "timedelta64[ns]" s = Series([pd.NaT, None, "1 Day"]) assert s.dtype == "timedelta64[ns]" s = Series([pd.NaT, np.nan, "1 Day"]) assert s.dtype == "timedelta64[ns]" # GH 16406 def test_constructor_mixed_tz(self): s = Series( [Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")]) expected = Series( [Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")], dtype="object", ) tm.assert_series_equal(s, expected) def test_NaT_scalar(self): series = Series([0, 1000, 2000, iNaT], dtype="M8[ns]") val = series[3] assert isna(val) series[2] = val assert isna(series[2]) def test_NaT_cast(self): # GH10747 result = Series([np.nan]).astype("M8[ns]") expected = Series([NaT]) tm.assert_series_equal(result, expected) def test_constructor_name_hashable(self): for n in [777, 777.0, "name", datetime(2001, 11, 11), (1, ), "\u05D0"]: for data in [[1, 2, 3], np.ones(3), {"a": 0, "b": 1}]: s = Series(data, name=n) assert s.name == n def test_constructor_name_unhashable(self): msg = r"Series\.name must be a hashable type" for n in [["name_list"], np.ones(2), {1: 2}]: for data in [["name_list"], np.ones(2), {1: 2}]: with pytest.raises(TypeError, match=msg): Series(data, name=n) def test_auto_conversion(self): series = Series(list(date_range("1/1/2000", periods=10))) assert series.dtype == "M8[ns]" def test_convert_non_ns(self): # convert from a numpy array of non-ns timedelta64 arr = np.array([1, 2, 3], dtype="timedelta64[s]") s = Series(arr) expected = Series(pd.timedelta_range("00:00:01", periods=3, freq="s")) tm.assert_series_equal(s, expected) # convert from a numpy array of non-ns datetime64 # note that creating a numpy datetime64 is 
in LOCAL time!!!! # seems to work for M8[D], but not for M8[s] s = Series( np.array(["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]")) tm.assert_series_equal( s, Series(date_range("20130101", periods=3, freq="D"))) # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01 # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')) # tm.assert_series_equal(s,date_range('20130101 # 00:00:01',period=3,freq='s')) @pytest.mark.parametrize( "index", [ date_range("1/1/2000", periods=10), timedelta_range("1 day", periods=10), period_range("2000-Q1", periods=10, freq="Q"), ], ids=lambda x: type(x).__name__, ) def test_constructor_cant_cast_datetimelike(self, index): # floats are not ok msg = "Cannot cast {}.*? to ".format( # strip Index to convert PeriodIndex -> Period # We don't care whether the error message says # PeriodIndex or PeriodArray type(index).__name__.rstrip("Index")) with pytest.raises(TypeError, match=msg): Series(index, dtype=float) # ints are ok # we test with np.int64 to get similar results on # windows / 32-bit platforms result = Series(index, dtype=np.int64) expected = Series(index.astype(np.int64)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( "index", [ date_range("1/1/2000", periods=10), timedelta_range("1 day", periods=10), period_range("2000-Q1", periods=10, freq="Q"), ], ids=lambda x: type(x).__name__, ) def test_constructor_cast_object(self, index): s = Series(index, dtype=object) exp = Series(index).astype(object) tm.assert_series_equal(s, exp) s = Series(pd.Index(index, dtype=object), dtype=object) exp = Series(index).astype(object) tm.assert_series_equal(s, exp) s = Series(index.astype(object), dtype=object) exp = Series(index).astype(object) tm.assert_series_equal(s, exp) @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64]) def test_constructor_generic_timestamp_no_frequency(self, dtype): # see gh-15524, gh-15987 msg = "dtype has no unit. 
Please pass in" with pytest.raises(ValueError, match=msg): Series([], dtype=dtype) @pytest.mark.parametrize( "dtype,msg", [ ("m8[ps]", "cannot convert timedeltalike"), ("M8[ps]", "cannot convert datetimelike"), ], ) def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg): # see gh-15524, gh-15987 with pytest.raises(TypeError, match=msg): Series([], dtype=dtype) @pytest.mark.parametrize("dtype", [None, "uint8", "category"]) def test_constructor_range_dtype(self, dtype): # GH 16804 expected = Series([0, 1, 2, 3, 4], dtype=dtype or "int64") result = Series(range(5), dtype=dtype) tm.assert_series_equal(result, expected) def test_constructor_tz_mixed_data(self): # GH 13051 dt_list = [ Timestamp("2016-05-01 02:03:37"), Timestamp("2016-04-30 19:03:37-0700", tz="US/Pacific"), ] result = Series(dt_list) expected = Series(dt_list, dtype=object) tm.assert_series_equal(result, expected)
class TestFancy(Base):
    """Tests for plain get/set item and fancy (list/array/label) indexing."""

    def test_setitem_ndarray_1d(self):
        """Setting a 1d ndarray via ``.loc`` requires a matching length."""
        # GH5508

        # len of indexer vs length of the 1d ndarray
        df = DataFrame(index=Index(np.arange(1, 11)))
        df["foo"] = np.zeros(10, dtype=np.float64)
        # np.complex128 rather than the ``np.complex`` alias, which was
        # removed from NumPy; both name the same dtype.
        df["bar"] = np.zeros(10, dtype=np.complex128)

        # invalid
        with pytest.raises(ValueError):
            df.loc[df.index[2:5], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])

        # valid
        df.loc[df.index[2:6], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])

        result = df.loc[df.index[2:6], "bar"]
        expected = Series(
            [2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], name="bar"
        )
        tm.assert_series_equal(result, expected)

        # dtype getting changed?
        df = DataFrame(index=Index(np.arange(1, 11)))
        df["foo"] = np.zeros(10, dtype=np.float64)
        df["bar"] = np.zeros(10, dtype=np.complex128)

        with pytest.raises(ValueError):
            df[2:5] = np.arange(1, 4) * 1j

    @pytest.mark.parametrize(
        "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__
    )
    @pytest.mark.parametrize(
        "obj",
        [
            lambda i: Series(np.arange(len(i)), index=i),
            lambda i: DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i),
        ],
        ids=["Series", "DataFrame"],
    )
    @pytest.mark.parametrize(
        "idxr, idxr_id",
        [
            (lambda x: x, "getitem"),
            (lambda x: x.loc, "loc"),
            (lambda x: x.iloc, "iloc"),
            pytest.param(lambda x: x.ix, "ix", marks=ignore_ix),
        ],
    )
    def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id):
        """Index with a 3d ndarray key through each indexer.

        Some Series/``getitem`` index types are expected not to raise; every
        other combination must raise with one of the messages in ``msg``.
        """
        # GH 25567
        obj = obj(index)
        idxr = idxr(obj)
        nd3 = np.random.randint(5, size=(2, 2, 2))

        msg = (
            r"Buffer has wrong number of dimensions \(expected 1,"
            r" got 3\)|"
            "The truth value of an array with more than one element is"
            " ambiguous|"
            "Cannot index with multidimensional key|"
            r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]|"
            "No matching signature found|"  # TypeError
            "unhashable type: 'numpy.ndarray'"  # TypeError
        )

        if (
            isinstance(obj, Series)
            and idxr_id == "getitem"
            and index.inferred_type
            in [
                "string",
                "datetime64",
                "period",
                "timedelta64",
                "boolean",
                "categorical",
            ]
        ):
            idxr[nd3]
        else:
            if (
                isinstance(obj, DataFrame)
                and idxr_id == "getitem"
                and index.inferred_type == "boolean"
            ):
                error = TypeError
            elif idxr_id == "getitem" and index.inferred_type == "interval":
                error = TypeError
            else:
                error = ValueError

            with pytest.raises(error, match=msg):
                idxr[nd3]

    @pytest.mark.parametrize(
        "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__
    )
    @pytest.mark.parametrize(
        "obj",
        [
            lambda i: Series(np.arange(len(i)), index=i),
            lambda i: DataFrame(np.random.randn(len(i), len(i)), index=i, columns=i),
        ],
        ids=["Series", "DataFrame"],
    )
    @pytest.mark.parametrize(
        "idxr, idxr_id",
        [
            (lambda x: x, "setitem"),
            (lambda x: x.loc, "loc"),
            (lambda x: x.iloc, "iloc"),
            pytest.param(lambda x: x.ix, "ix", marks=ignore_ix),
        ],
    )
    def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id):
        """Assign through a 3d ndarray key with each indexer.

        ``iloc`` and a listed set of index types are expected not to raise;
        every other combination must raise with one of the messages in
        ``msg``.
        """
        # GH 25567
        obj = obj(index)
        idxr = idxr(obj)
        nd3 = np.random.randint(5, size=(2, 2, 2))

        msg = (
            r"Buffer has wrong number of dimensions \(expected 1,"
            r" got 3\)|"
            "The truth value of an array with more than one element is"
            " ambiguous|"
            "Only 1-dimensional input arrays are supported|"
            "'pandas._libs.interval.IntervalTree' object has no attribute"
            " 'set_value'|"  # AttributeError
            "unhashable type: 'numpy.ndarray'|"  # TypeError
            "No matching signature found|"  # TypeError
            r"^\[\[\["  # pandas.core.indexing.IndexingError
        )

        if (
            (idxr_id == "iloc")
            or (
                (
                    isinstance(obj, Series)
                    and idxr_id == "setitem"
                    and index.inferred_type
                    in [
                        "floating",
                        "string",
                        "datetime64",
                        "period",
                        "timedelta64",
                        "boolean",
                        "categorical",
                    ]
                )
            )
            or (
                idxr_id == "ix"
                and index.inferred_type
                in ["string", "datetime64", "period", "boolean"]
            )
        ):
            idxr[nd3] = 0
        else:
            with pytest.raises(
                (
                    ValueError,
                    AttributeError,
                    TypeError,
                    pd.core.indexing.IndexingError,
                ),
                match=msg,
            ):
                idxr[nd3] = 0

    def test_inf_upcast(self):
        """Using ``np.inf`` as a row/column label yields a float index."""
        # GH 16957
        # We should be able to use np.inf as a key
        # np.inf should cause an index to convert to float

        # Test with np.inf in rows
        df = DataFrame(columns=[0])
        df.loc[1] = 1
        df.loc[2] = 2
        df.loc[np.inf] = 3

        # make sure we can look up the value
        assert df.loc[np.inf, 0] == 3

        result = df.index
        expected = pd.Float64Index([1, 2, np.inf])
        tm.assert_index_equal(result, expected)

        # Test with np.inf in columns
        df = DataFrame()
        df.loc[0, 0] = 1
        df.loc[1, 1] = 2
        df.loc[0, np.inf] = 3

        result = df.columns
        expected = pd.Float64Index([0, 1, np.inf])
        tm.assert_index_equal(result, expected)

    def test_setitem_dtype_upcast(self):
        """Setting an incompatible scalar upcasts only the touched column."""
        # GH3216
        df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
        df["c"] = np.nan
        assert df["c"].dtype == np.float64

        df.loc[0, "c"] = "foo"
        expected = DataFrame(
            [{"a": 1, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}]
        )
        tm.assert_frame_equal(df, expected)

        # GH10280
        df = DataFrame(
            np.arange(6, dtype="int64").reshape(2, 3),
            index=list("ab"),
            columns=["foo", "bar", "baz"],
        )

        for val in [3.14, "wxyz"]:
            left = df.copy()
            left.loc["a", "bar"] = val
            right = DataFrame(
                [[0, val, 2], [3, 4, 5]],
                index=list("ab"),
                columns=["foo", "bar", "baz"],
            )

            tm.assert_frame_equal(left, right)
            assert is_integer_dtype(left["foo"])
            assert is_integer_dtype(left["baz"])

        left = DataFrame(
            np.arange(6, dtype="int64").reshape(2, 3) / 10.0,
            index=list("ab"),
            columns=["foo", "bar", "baz"],
        )
        left.loc["a", "bar"] = "wxyz"

        right = DataFrame(
            [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]],
            index=list("ab"),
            columns=["foo", "bar", "baz"],
        )

        tm.assert_frame_equal(left, right)
        assert is_float_dtype(left["foo"])
        assert is_float_dtype(left["baz"])

    def test_dups_fancy_indexing(self):
        """List-based selection with duplicate and/or missing labels."""
        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf

        df = mkdf(10, 3)
        df.columns = ["a", "a", "b"]
        result = df[["b", "a"]].columns
        expected = Index(["b", "a", "a"])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame(
            [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa")
        )
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
        result.columns = list("aaaaaaa")

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")},
            index=["A", "A", "B", "C"],
        )
        rows = ["C", "B"]
        expected = DataFrame(
            {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=rows
        )
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ["C", "B", "E"]
        expected = DataFrame(
            {
                "test": [11, 9, np.nan],
                "test1": [7.0, 6, np.nan],
                "other": ["d", "c", np.nan],
            },
            index=rows,
        )

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ["F", "G", "H", "C", "B", "E"]
        expected = DataFrame(
            {
                "test": [np.nan, np.nan, np.nan, 11, 9, np.nan],
                "test1": [np.nan, np.nan, np.nan, 7.0, 6, np.nan],
                "other": [np.nan, np.nan, np.nan, "d", "c", np.nan],
            },
            index=rows,
        )
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # List containing only missing label
        dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD"))
        with pytest.raises(KeyError):
            dfnu.loc[["E"]]

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list("abc")})
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ["a", np.nan, "a"]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"])
        expected = DataFrame(
            {"test": [5, 7, 5, 7, np.nan]}, index=["A", "A", "A", "A", "E"]
        )
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[["A", "A", "E"]]
        tm.assert_frame_equal(result, expected)

    def test_dups_fancy_indexing2(self):
        """Duplicate column/row labels combined with missing labels in ``.loc``."""
        # GH 5835
        # dups on index and missing values
        df = DataFrame(np.random.randn(5, 5), columns=["A", "B", "B", "B", "A"])

        expected = pd.concat(
            [
                df.loc[:, ["A", "B"]],
                DataFrame(np.nan, columns=["C"], index=df.index),
            ],
            axis=1,
        )
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = df.loc[:, ["A", "B", "C"]]
        tm.assert_frame_equal(result, expected)

        # GH 6504, multi-axis indexing
        df = DataFrame(
            np.random.randn(9, 2),
            index=[1, 1, 1, 2, 2, 2, 3, 3, 3],
            columns=["a", "b"],
        )

        expected = df.iloc[0:6]
        result = df.loc[[1, 2]]
        tm.assert_frame_equal(result, expected)

        expected = df
        result = df.loc[:, ["a", "b"]]
        tm.assert_frame_equal(result, expected)

        expected = df.iloc[0:6, :]
        result = df.loc[[1, 2], ["a", "b"]]
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("case", [lambda s: s, lambda s: s.loc])
    def test_duplicate_int_indexing(self, case):
        """``s[[1]]`` and ``s.loc[[1]]`` agree with ``s[1]`` on a duplicated label."""
        # GH 17347
        s = pd.Series(range(3), index=[1, 1, 3])
        expected = s[1]
        result = case(s)[[1]]
        tm.assert_series_equal(result, expected)

    def test_indexing_mixed_frame_bug(self):
        """Boolean-masked ``.loc`` assignment updates the frame in place."""
        # GH3492
        df = DataFrame(
            {"a": {1: "aaa", 2: "bbb", 3: "ccc"}, "b": {1: 111, 2: 222, 3: 333}}
        )

        # this works, new column is created correctly
        df["test"] = df["a"].apply(lambda x: "_" if x == "aaa" else x)

        # this does not work, ie column test is not changed
        idx = df["test"] == "_"
        temp = df.loc[idx, "a"].apply(lambda x: "-----" if x == "aaa" else x)
        df.loc[idx, "test"] = temp
        assert df.iloc[0, 2] == "-----"

        # if I look at df, then element [0,2] equals '_'. If instead I type
        # df.ix[idx,'test'], I get '-----', finally by typing df.iloc[0,2] I
        # get '_'.

    def test_multitype_list_index_access(self):
        """A list key containing absent labels raises ``KeyError``."""
        # GH 10610
        df = DataFrame(np.random.random((10, 5)), columns=["a"] + [20, 21, 22, 23])

        with pytest.raises(KeyError):
            df[[22, 26, -8]]
        assert df[21].shape[0] == df.shape[0]

    def test_set_index_nan(self):
        """``set_index``/``reset_index`` round-trips a key column holding NaN."""
        # GH 3586
        df = DataFrame(
            {
                "PRuid": {
                    17: "nonQC",
                    18: "nonQC",
                    19: "nonQC",
                    20: "10",
                    21: "11",
                    22: "12",
                    23: "13",
                    24: "24",
                    25: "35",
                    26: "46",
                    27: "47",
                    28: "48",
                    29: "59",
                    30: "10",
                },
                "QC": {
                    17: 0.0,
                    18: 0.0,
                    19: 0.0,
                    20: np.nan,
                    21: np.nan,
                    22: np.nan,
                    23: np.nan,
                    24: 1.0,
                    25: np.nan,
                    26: np.nan,
                    27: np.nan,
                    28: np.nan,
                    29: np.nan,
                    30: np.nan,
                },
                "data": {
                    17: 7.9544899999999998,
                    18: 8.0142609999999994,
                    19: 7.8591520000000008,
                    20: 0.86140349999999999,
                    21: 0.87853110000000001,
                    22: 0.8427041999999999,
                    23: 0.78587700000000005,
                    24: 0.73062459999999996,
                    25: 0.81668560000000001,
                    26: 0.81927080000000008,
                    27: 0.80705009999999999,
                    28: 0.81440240000000008,
                    29: 0.80140849999999997,
                    30: 0.81307740000000006,
                },
                "year": {
                    17: 2006,
                    18: 2007,
                    19: 2008,
                    20: 1985,
                    21: 1985,
                    22: 1985,
                    23: 1985,
                    24: 1985,
                    25: 1985,
                    26: 1985,
                    27: 1985,
                    28: 1985,
                    29: 1985,
                    30: 1986,
                },
            }
        ).reset_index()

        result = (
            df.set_index(["year", "PRuid", "QC"])
            .reset_index()
            .reindex(columns=df.columns)
        )
        tm.assert_frame_equal(result, df)

    def test_multi_assign(self):
        """Masked multi-column ``.loc`` assignment from a frame, ndarray or Series."""
        # GH 3626, an assignment of a sub-df to a df
        df = DataFrame(
            {
                "FC": ["a", "b", "a", "b", "a", "b"],
                "PF": [0, 0, 0, 0, 1, 1],
                "col1": list(range(6)),
                "col2": list(range(6, 12)),
            }
        )
        df.iloc[1, 0] = np.nan
        df2 = df.copy()

        mask = ~df2.FC.isna()
        cols = ["col1", "col2"]

        dft = df2 * 2
        dft.iloc[3, 3] = np.nan

        expected = DataFrame(
            {
                "FC": ["a", np.nan, "a", "b", "a", "b"],
                "PF": [0, 0, 0, 0, 1, 1],
                "col1": Series([0, 1, 4, 6, 8, 10]),
                "col2": [12, 7, 16, np.nan, 20, 22],
            }
        )

        # frame on rhs
        df2.loc[mask, cols] = dft.loc[mask, cols]
        tm.assert_frame_equal(df2, expected)

        # repeating the same assignment must leave the frame unchanged
        df2.loc[mask, cols] = dft.loc[mask, cols]
        tm.assert_frame_equal(df2, expected)

        # with an ndarray on rhs
        # coerces to float64 because values has float64 dtype
        # GH 14001
        expected = DataFrame(
            {
                "FC": ["a", np.nan, "a", "b", "a", "b"],
                "PF": [0, 0, 0, 0, 1, 1],
                "col1": [0.0, 1.0, 4.0, 6.0, 8.0, 10.0],
                "col2": [12, 7, 16, np.nan, 20, 22],
            }
        )
        df2 = df.copy()
        df2.loc[mask, cols] = dft.loc[mask, cols].values
        tm.assert_frame_equal(df2, expected)
        df2.loc[mask, cols] = dft.loc[mask, cols].values
        tm.assert_frame_equal(df2, expected)

        # broadcasting on the rhs is required
        df = DataFrame(
            dict(
                A=[1, 2, 0, 0, 0],
                B=[0, 0, 0, 10, 11],
                C=[0, 0, 0, 10, 11],
                D=[3, 4, 5, 6, 7],
            )
        )

        expected = df.copy()
        mask = expected["A"] == 0
        for col in ["A", "B"]:
            expected.loc[mask, col] = df["D"]

        df.loc[df["A"] == 0, ["A", "B"]] = df["D"]
        tm.assert_frame_equal(df, expected)

    def test_setitem_list(self):
        """Store a list or an arbitrary object in a single cell via ``.ix``."""
        # GH 6043
        # ix with a list
        df = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            simplefilter("ignore")
            df.ix[1, 0] = [1, 2, 3]
            df.ix[1, 0] = [1, 2]

        result = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            simplefilter("ignore")
            result.ix[1, 0] = [1, 2]

        tm.assert_frame_equal(result, df)

        # ix with an object
        class TO:
            def __init__(self, value):
                self.value = value

            def __str__(self):
                return "[{0}]".format(self.value)

            __repr__ = __str__

            def __eq__(self, other):
                return self.value == other.value

            def view(self):
                return self

        df = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            simplefilter("ignore")
            df.ix[1, 0] = TO(1)
            df.ix[1, 0] = TO(2)

        result = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            simplefilter("ignore")
            result.ix[1, 0] = TO(2)

        tm.assert_frame_equal(result, df)

        # remains object dtype even after setting it back
        df = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            simplefilter("ignore")
            df.ix[1, 0] = TO(1)
            df.ix[1, 0] = np.nan
        result = DataFrame(index=[0, 1], columns=[0])

        tm.assert_frame_equal(result, df)

    def test_string_slice(self):
        """String keys on an object-dtype (or empty) index raise ``KeyError``."""
        # GH 14424
        # string indexing against datetimelike with object
        # dtype should properly raises KeyError
        df = DataFrame([1], Index([pd.Timestamp("2011-01-01")], dtype=object))
        assert df.index.is_all_dates
        with pytest.raises(KeyError):
            df["2011"]

        with pytest.raises(KeyError):
            df.loc["2011", 0]

        df = DataFrame()
        assert not df.index.is_all_dates
        with pytest.raises(KeyError):
            df["2011"]

        with pytest.raises(KeyError):
            df.loc["2011", 0]

    def test_astype_assignment(self):
        """Assigning ``astype`` results back through ``iloc``/``loc``."""
        # GH4312 (iloc)
        df_orig = DataFrame(
            [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
        )

        df = df_orig.copy()
        df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
        expected = DataFrame(
            [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
        )
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.iloc[:, 0:2] = df.iloc[:, 0:2]._convert(datetime=True, numeric=True)
        expected = DataFrame(
            [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
        )
        tm.assert_frame_equal(df, expected)

        # GH5702 (loc)
        df = df_orig.copy()
        df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
        expected = DataFrame(
            [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
        )
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
        expected = DataFrame(
            [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
        )
        tm.assert_frame_equal(df, expected)

        # full replacements / no nans
        df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
        df.iloc[:, 0] = df["A"].astype(np.int64)
        expected = DataFrame({"A": [1, 2, 3, 4]})
        tm.assert_frame_equal(df, expected)

        df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
        df.loc[:, "A"] = df["A"].astype(np.int64)
        expected = DataFrame({"A": [1, 2, 3, 4]})
        tm.assert_frame_equal(df, expected)

    @pytest.mark.parametrize(
        "index,val",
        [
            (Index([0, 1, 2]), 2),
            (Index([0, 1, "2"]), "2"),
            (Index([0, 1, 2, np.inf, 4]), 4),
            (Index([0, 1, 2, np.nan, 4]), 4),
            (Index([0, 1, 2, np.inf]), np.inf),
            (Index([0, 1, 2, np.nan]), np.nan),
        ],
    )
    def test_index_contains(self, index, val):
        """``val in index`` is true for each parametrized pair."""
        assert val in index

    @pytest.mark.parametrize(
        "index,val",
        [
            (Index([0, 1, 2]), "2"),
            (Index([0, 1, "2"]), 2),
            (Index([0, 1, 2, np.inf]), 4),
            (Index([0, 1, 2, np.nan]), 4),
            (Index([0, 1, 2, np.inf]), np.nan),
            (Index([0, 1, 2, np.nan]), np.inf),
            # Checking if np.inf in Int64Index should not cause an OverflowError
            # Related to GH 16957
            (pd.Int64Index([0, 1, 2]), np.inf),
            (pd.Int64Index([0, 1, 2]), np.nan),
            (pd.UInt64Index([0, 1, 2]), np.inf),
            (pd.UInt64Index([0, 1, 2]), np.nan),
        ],
    )
    def test_index_not_contains(self, index, val):
        """``val in index`` is false for each parametrized pair."""
        assert val not in index

    @pytest.mark.parametrize(
        "index,val", [(Index([0, 1, "2"]), 0), (Index([0, 1, "2"]), "2")]
    )
    def test_mixed_index_contains(self, index, val):
        """Membership on a mixed int/str index matches on exact value."""
        # GH 19860
        assert val in index

    @pytest.mark.parametrize(
        "index,val", [(Index([0, 1, "2"]), "1"), (Index([0, 1, "2"]), 2)]
    )
    def test_mixed_index_not_contains(self, index, val):
        """Membership on a mixed int/str index does not coerce int <-> str."""
        # GH 19860
        assert val not in index

    def test_contains_with_float_index(self):
        """Float vs. integer membership checks across numeric index types."""
        # GH#22085
        integer_index = pd.Int64Index([0, 1, 2, 3])
        uinteger_index = pd.UInt64Index([0, 1, 2, 3])
        float_index = pd.Float64Index([0.1, 1.1, 2.2, 3.3])

        for index in (integer_index, uinteger_index):
            assert 1.1 not in index
            assert 1.0 in index
            assert 1 in index

        assert 1.1 in float_index
        assert 1.0 not in float_index
        assert 1 not in float_index

    def test_index_type_coercion(self):
        """Setting with a float or string key changes the index type accordingly."""
        with catch_warnings(record=True):
            simplefilter("ignore")

            # GH 11836
            # if we have an index type and set it with something that looks
            # to numpy like the same, but is actually, not
            # (e.g. setting with a float or string '0')
            # then we need to coerce to object

            # integer indexes
            for s in [Series(range(5)), Series(range(5), index=range(1, 6))]:

                assert s.index.is_integer()

                for indexer in [lambda x: x.ix, lambda x: x.loc, lambda x: x]:
                    s2 = s.copy()
                    indexer(s2)[0.1] = 0
                    assert s2.index.is_floating()
                    assert indexer(s2)[0.1] == 0

                    s2 = s.copy()
                    indexer(s2)[0.0] = 0
                    exp = s.index
                    if 0 not in s:
                        exp = Index(s.index.tolist() + [0])
                    tm.assert_index_equal(s2.index, exp)

                    s2 = s.copy()
                    indexer(s2)["0"] = 0
                    assert s2.index.is_object()

            for s in [Series(range(5), index=np.arange(5.0))]:

                assert s.index.is_floating()

                for idxr in [lambda x: x.ix, lambda x: x.loc, lambda x: x]:

                    s2 = s.copy()
                    idxr(s2)[0.1] = 0
                    assert s2.index.is_floating()
                    assert idxr(s2)[0.1] == 0

                    s2 = s.copy()
                    idxr(s2)[0.0] = 0
                    tm.assert_index_equal(s2.index, s.index)

                    s2 = s.copy()
                    idxr(s2)["0"] = 0
                    assert s2.index.is_object()
import pytest import numpy as np import pandas as pd import pandas.util.testing as tm from pandas.compat import PY3 from pandas.core import ops from pandas import Timedelta, Series, Index, TimedeltaIndex @pytest.fixture(params=[ pd.Float64Index(np.arange(5, dtype='float64')), pd.UInt64Index(np.arange(5, dtype='uint64')), pd.Int64Index(np.arange(5, dtype='int64')), pd.RangeIndex(5) ], ids=lambda x: type(x).__name__) def idx(request): return request.param # ------------------------------------------------------------------ # Comparisons class TestNumericComparisons(object): def test_operator_series_comparison_zerorank(self): # GH#13006 result = np.float64(0) > pd.Series([1, 2, 3])
def setup(self): N = 10**5 self.idx_int_dup = pd.Int64Index(np.arange(N * 5)) # cache is_unique self.idx_int_dup.is_unique
class StatisticalBuiltInsTestCase(zf.WithAssetFinder, zf.WithTradingCalendars, zf.ZiplineTestCase): sids = ASSET_FINDER_EQUITY_SIDS = pd.Int64Index([1, 2, 3]) START_DATE = pd.Timestamp("2015-01-31", tz="UTC") END_DATE = pd.Timestamp("2015-03-01", tz="UTC") ASSET_FINDER_EQUITY_SYMBOLS = ("A", "B", "C") ASSET_FINDER_COUNTRY_CODE = "US" @classmethod def init_class_fixtures(cls): super(StatisticalBuiltInsTestCase, cls).init_class_fixtures() day = cls.trading_calendar.day cls.dates = dates = pd.date_range( "2015-02-01", "2015-02-28", freq=day, tz="UTC", ) # Using these start and end dates because they are a contigous span of # 5 days (Monday - Friday) and they allow for plenty of days to look # back on when computing correlations and regressions. cls.start_date_index = start_date_index = 14 cls.end_date_index = end_date_index = 18 cls.pipeline_start_date = dates[start_date_index] cls.pipeline_end_date = dates[end_date_index] cls.num_days = num_days = end_date_index - start_date_index + 1 sids = cls.sids cls.assets = assets = cls.asset_finder.retrieve_all(sids) cls.my_asset_column = my_asset_column = 0 cls.my_asset = assets[my_asset_column] cls.num_assets = num_assets = len(assets) cls.raw_data = raw_data = pd.DataFrame( data=np.arange(len(dates) * len(sids), dtype=float64_dtype).reshape( len(dates), len(sids), ), index=dates, columns=assets, ) # Using mock 'close' data here because the correlation and regression # built-ins use USEquityPricing.close as the input to their `Returns` # factors. Since there is no way to change that when constructing an # instance of these built-ins, we need to test with mock 'close' data # to most accurately reflect their true behavior and results. 
close_loader = DataFrameLoader(USEquityPricing.close, raw_data) cls.run_pipeline = SimplePipelineEngine( { USEquityPricing.close: close_loader }.__getitem__, cls.asset_finder, default_domain=US_EQUITIES, ).run_pipeline cls.cascading_mask = AssetIDPlusDay() < (sids[-1] + dates[start_date_index].day) cls.expected_cascading_mask_result = make_cascading_boolean_array( shape=(num_days, num_assets), ) cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0) cls.expected_alternating_mask_result = make_alternating_boolean_array( shape=(num_days, num_assets), ) cls.expected_no_mask_result = np.full( shape=(num_days, num_assets), fill_value=True, dtype=bool_dtype, ) @parameter_space(returns_length=[2, 3], correlation_length=[3, 4]) def test_correlation_factors(self, returns_length, correlation_length): """ Tests for the built-in factors `RollingPearsonOfReturns` and `RollingSpearmanOfReturns`. """ assets = self.assets my_asset = self.my_asset my_asset_column = self.my_asset_column dates = self.dates start_date = self.pipeline_start_date end_date = self.pipeline_end_date start_date_index = self.start_date_index end_date_index = self.end_date_index num_days = self.num_days run_pipeline = self.run_pipeline returns = Returns(window_length=returns_length) masks = (self.cascading_mask, self.alternating_mask, NotSpecified) expected_mask_results = ( self.expected_cascading_mask_result, self.expected_alternating_mask_result, self.expected_no_mask_result, ) for mask, expected_mask in zip(masks, expected_mask_results): pearson_factor = RollingPearsonOfReturns( target=my_asset, returns_length=returns_length, correlation_length=correlation_length, mask=mask, ) spearman_factor = RollingSpearmanOfReturns( target=my_asset, returns_length=returns_length, correlation_length=correlation_length, mask=mask, ) columns = { "pearson_factor": pearson_factor, "spearman_factor": spearman_factor, } pipeline = Pipeline(columns=columns) if mask is not NotSpecified: pipeline.add(mask, "mask") results = 
run_pipeline(pipeline, start_date, end_date) pearson_results = results["pearson_factor"].unstack() spearman_results = results["spearman_factor"].unstack() if mask is not NotSpecified: mask_results = results["mask"].unstack() check_arrays(mask_results.values, expected_mask) # Run a separate pipeline that calculates returns starting # (correlation_length - 1) days prior to our start date. This is # because we need (correlation_length - 1) extra days of returns to # compute our expected correlations. results = run_pipeline( Pipeline(columns={"returns": returns}), dates[start_date_index - (correlation_length - 1)], dates[end_date_index], ) returns_results = results["returns"].unstack() # On each day, calculate the expected correlation coefficients # between the asset we are interested in and each other asset. Each # correlation is calculated over `correlation_length` days. expected_pearson_results = np.full_like(pearson_results, nan) expected_spearman_results = np.full_like(spearman_results, nan) for day in range(num_days): todays_returns = returns_results.iloc[day:day + correlation_length] my_asset_returns = todays_returns.iloc[:, my_asset_column] for asset, other_asset_returns in todays_returns.iteritems(): asset_column = int(asset) - 1 expected_pearson_results[day, asset_column] = pearsonr( my_asset_returns, other_asset_returns, )[0] expected_spearman_results[day, asset_column] = spearmanr( my_asset_returns, other_asset_returns, )[0] expected_pearson_results = pd.DataFrame( data=np.where(expected_mask, expected_pearson_results, nan), index=dates[start_date_index:end_date_index + 1], columns=assets, ) assert_frame_equal(pearson_results, expected_pearson_results) expected_spearman_results = pd.DataFrame( data=np.where(expected_mask, expected_spearman_results, nan), index=dates[start_date_index:end_date_index + 1], columns=assets, ) assert_frame_equal(spearman_results, expected_spearman_results) @parameter_space(returns_length=[2, 3], regression_length=[3, 4]) def 
test_regression_of_returns_factor(self, returns_length, regression_length): """ Tests for the built-in factor `RollingLinearRegressionOfReturns`. """ assets = self.assets my_asset = self.my_asset my_asset_column = self.my_asset_column dates = self.dates start_date = self.pipeline_start_date end_date = self.pipeline_end_date start_date_index = self.start_date_index end_date_index = self.end_date_index num_days = self.num_days run_pipeline = self.run_pipeline # The order of these is meant to align with the output of `linregress`. outputs = ["beta", "alpha", "r_value", "p_value", "stderr"] returns = Returns(window_length=returns_length) masks = self.cascading_mask, self.alternating_mask, NotSpecified expected_mask_results = ( self.expected_cascading_mask_result, self.expected_alternating_mask_result, self.expected_no_mask_result, ) for mask, expected_mask in zip(masks, expected_mask_results): regression_factor = RollingLinearRegressionOfReturns( target=my_asset, returns_length=returns_length, regression_length=regression_length, mask=mask, ) columns = { output: getattr(regression_factor, output) for output in outputs } pipeline = Pipeline(columns=columns) if mask is not NotSpecified: pipeline.add(mask, "mask") results = run_pipeline(pipeline, start_date, end_date) if mask is not NotSpecified: mask_results = results["mask"].unstack() check_arrays(mask_results.values, expected_mask) output_results = {} expected_output_results = {} for output in outputs: output_results[output] = results[output].unstack() expected_output_results[output] = np.full_like( output_results[output], nan, ) # Run a separate pipeline that calculates returns starting # (regression_length - 1) days prior to our start date. This is # because we need (regression_length - 1) extra days of returns to # compute our expected regressions. 
results = run_pipeline( Pipeline(columns={"returns": returns}), dates[start_date_index - (regression_length - 1)], dates[end_date_index], ) returns_results = results["returns"].unstack() # On each day, calculate the expected regression results for Y ~ X # where Y is the asset we are interested in and X is each other # asset. Each regression is calculated over `regression_length` # days of data. for day in range(num_days): todays_returns = returns_results.iloc[day:day + regression_length] my_asset_returns = todays_returns.iloc[:, my_asset_column] for asset, other_asset_returns in todays_returns.iteritems(): asset_column = int(asset) - 1 expected_regression_results = linregress( y=other_asset_returns, x=my_asset_returns, ) for i, output in enumerate(outputs): expected_output_results[output][ day, asset_column] = expected_regression_results[i] for output in outputs: output_result = output_results[output] expected_output_result = pd.DataFrame( np.where(expected_mask, expected_output_results[output], nan), index=dates[start_date_index:end_date_index + 1], columns=assets, ) assert_frame_equal(output_result, expected_output_result) def test_simple_beta_matches_regression(self): run_pipeline = self.run_pipeline simple_beta = SimpleBeta(target=self.my_asset, regression_length=10) complex_beta = RollingLinearRegressionOfReturns( target=self.my_asset, returns_length=2, regression_length=10, ).beta pipe = Pipeline({"simple": simple_beta, "complex": complex_beta}) results = run_pipeline( pipe, self.pipeline_start_date, self.pipeline_end_date, ) assert_equal(results["simple"], results["complex"], check_names=False) def test_simple_beta_allowed_missing_calculation(self): for percentage, expected in [ (0.651, 65), (0.659, 65), (0.66, 66), (0.0, 0), (1.0, 100), ]: beta = SimpleBeta( target=self.my_asset, regression_length=100, allowed_missing_percentage=percentage, ) assert beta.params["allowed_missing_count"] == expected def test_correlation_and_regression_with_bad_asset(self): 
""" Test that `RollingPearsonOfReturns`, `RollingSpearmanOfReturns` and `RollingLinearRegressionOfReturns` raise the proper exception when given a nonexistent target asset. """ my_asset = Equity( 0, exchange_info=ExchangeInfo("TEST", "TEST FULL", "US"), ) start_date = self.pipeline_start_date end_date = self.pipeline_end_date run_pipeline = self.run_pipeline # This filter is arbitrary; the important thing is that we test each # factor both with and without a specified mask. my_asset_filter = AssetID().eq(1) for mask in (NotSpecified, my_asset_filter): pearson_factor = RollingPearsonOfReturns( target=my_asset, returns_length=3, correlation_length=3, mask=mask, ) spearman_factor = RollingSpearmanOfReturns( target=my_asset, returns_length=3, correlation_length=3, mask=mask, ) regression_factor = RollingLinearRegressionOfReturns( target=my_asset, returns_length=3, regression_length=3, mask=mask, ) with pytest.raises(NonExistentAssetInTimeFrame): run_pipeline( Pipeline(columns={"pearson_factor": pearson_factor}), start_date, end_date, ) with pytest.raises(NonExistentAssetInTimeFrame): run_pipeline( Pipeline(columns={"spearman_factor": spearman_factor}), start_date, end_date, ) with pytest.raises(NonExistentAssetInTimeFrame): run_pipeline( Pipeline(columns={"regression_factor": regression_factor}), start_date, end_date, ) def test_require_length_greater_than_one(self): my_asset = Equity( 0, exchange_info=ExchangeInfo("TEST", "TEST FULL", "US"), ) with pytest.raises(ValueError): RollingPearsonOfReturns( target=my_asset, returns_length=3, correlation_length=1, ) with pytest.raises(ValueError): RollingSpearmanOfReturns( target=my_asset, returns_length=3, correlation_length=1, ) with pytest.raises(ValueError): RollingLinearRegressionOfReturns( target=my_asset, returns_length=3, regression_length=1, ) def test_simple_beta_input_validation(self): expected = ("SimpleBeta() expected a value of type" " Asset for argument 'target'," " but got str instead.") with 
pytest.raises(TypeError, match=re.escape(expected)): SimpleBeta( target="SPY", regression_length=100, allowed_missing_percentage=0.5, ) expected = ("SimpleBeta() expected a value greater than or equal to 3" " for argument 'regression_length', but got 1 instead.") with pytest.raises(ValueError, match=re.escape(expected)): SimpleBeta( target=self.my_asset, regression_length=1, allowed_missing_percentage=0.5, ) expected = ( "SimpleBeta() expected a value inclusively between 0.0 and 1.0 " "for argument 'allowed_missing_percentage', but got 50 instead.") with pytest.raises(ValueError, match=re.escape(expected)): SimpleBeta( target=self.my_asset, regression_length=100, allowed_missing_percentage=50, ) def test_simple_beta_target(self): beta = SimpleBeta( target=self.my_asset, regression_length=50, allowed_missing_percentage=0.5, ) assert beta.target is self.my_asset def test_simple_beta_repr(self): beta = SimpleBeta( target=self.my_asset, regression_length=50, allowed_missing_percentage=0.5, ) result = repr(beta) expected = "SimpleBeta({}, length=50, allowed_missing=25)".format( self.my_asset, ) assert result == expected def test_simple_beta_graph_repr(self): beta = SimpleBeta( target=self.my_asset, regression_length=50, allowed_missing_percentage=0.5, ) result = beta.graph_repr() expected = "SimpleBeta('A', 50, 25)" assert result == expected
def test_make_meta():
    """Check ``make_meta`` for each kind of input exercised below.

    Covers pandas DataFrame/Series/Index objects, a dask DataFrame, dict /
    iterable / tuple dtype specifications (with and without an explicit
    index), categoricals, scalars, timestamps, dtype expressions and the
    ``TypeError`` raised for ``None``.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': list('abc'), 'c': [1., 2., 3.]},
                      index=[10, 20, 30])

    # Pandas dataframe
    meta = make_meta(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta(ddf) is ddf._meta

    # Dict
    meta = make_meta({'a': 'i8', 'b': 'O', 'c': 'f8'})
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Iterable
    meta = make_meta([('a', 'i8'), ('c', 'f8'), ('b', 'O')])
    assert (meta.columns == ['a', 'c', 'b']).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple
    meta = make_meta(('a', 'i8'))
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == 'i8'
    assert meta.name == 'a'

    # With index
    # ``pd.Int64Index`` was removed in pandas 2.0.  ``pd.Index`` with an
    # explicit int64 dtype yields an Int64Index on old pandas and a plain
    # int64 Index on new pandas, so compare against the type of the input.
    idx = pd.Index([1, 2], dtype='int64', name='foo')
    meta = make_meta({'a': 'i8', 'b': 'i4'}, index=idx)
    assert type(meta.index) is type(idx)
    assert meta.index.dtype == 'int64'
    assert len(meta.index) == 0
    meta = make_meta(('a', 'i8'), index=idx)
    assert type(meta.index) is type(idx)
    assert meta.index.dtype == 'int64'
    assert len(meta.index) == 0

    # Categoricals
    meta = make_meta({'a': 'category'})
    assert len(meta.a.cat.categories) == 1
    assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES
    meta = make_meta(('a', 'category'))
    assert len(meta.cat.categories) == 1
    assert meta.cat.categories[0] == UNKNOWN_CATEGORIES

    # Numpy scalar
    meta = make_meta(np.float64(1.0))
    assert isinstance(meta, np.float64)

    # Python scalar
    meta = make_meta(1.0)
    assert isinstance(meta, np.float64)

    # Timestamp
    x = pd.Timestamp(2000, 1, 1)
    meta = make_meta(x)
    assert meta is x

    # Dtype expressions
    meta = make_meta('i8')
    assert isinstance(meta, np.int64)
    meta = make_meta(float)
    assert isinstance(meta, np.dtype(float).type)
    meta = make_meta(np.dtype('bool'))
    assert isinstance(meta, np.bool_)

    # Context-manager form: the failure, if any, is reported by pytest
    # itself instead of relying on the truthiness of the ExceptionInfo.
    with pytest.raises(TypeError):
        make_meta(None)
# Stack the train and test sets, order them by ID and give the result a fresh
# 0..n-1 index; the "previous/next record" arithmetic below relies on it.
all_data = (
    pd.concat([train_data, test_data], axis=0)
    .sort_values(by='ID')
    .reset_index(drop=True)
)

# Columns that are screened for outliers.
bad_feature = [
    'ID', '功率A', '功率B', '功率C', '平均功率', '现场温度', '电压A', '电压B',
    '电压C', '电流B', '电流C', '转换效率', '转换效率A', '转换效率B', '转换效率C'
]

# Rows where at least one screened column lies more than two standard
# deviations away from that column's mean.
feature_values = all_data[bad_feature]
feature_mean = feature_values.mean()
feature_std = feature_values.std()
bad_index1 = feature_values[
    (feature_values > feature_mean + 2 * feature_std)
    | (feature_values < feature_mean - 2 * feature_std)
].dropna(how='all').index

# Rows where any of the three voltage columns is non-zero but below 500.
bad_index2 = all_data[
    ((all_data['电压A'] < 500) & (all_data['电压A'] != 0))
    | ((all_data['电压B'] < 500) & (all_data['电压B'] != 0))
    | ((all_data['电压C'] < 500) & (all_data['电压C'] != 0))
].index

# ``pd.Int64Index`` was removed in pandas 2.0; an int64 ``pd.Index`` is the
# equivalent on every pandas version.
bad_index = pd.Index(list(bad_index1) + list(bad_index2), dtype='int64')

# Each bad row together with the row before and after it.  The first and last
# rows of the frame have no previous/next record, so labels that fall outside
# the frame are dropped before ``.loc`` (which raises KeyError on missing
# labels in pandas >= 1.0).
neighbour_labels = np.concatenate([bad_index - 1, bad_index, bad_index + 1])
neighbour_labels = neighbour_labels[np.isin(neighbour_labels, all_data.index)]
nn_bad_data = all_data.loc[neighbour_labels].sort_values(
    by='ID', ascending=True).drop_duplicates()
bad_data = all_data.loc[bad_index].sort_values(by='ID', ascending=True)

# Replace outliers with the mean of the previous and next records.
for idx, line in bad_data.iterrows():
    ID = line['ID']
    # Columns of this row lying more than three standard deviations from the
    # column mean.  NOTE(review): the statistics are evaluated on every
    # iteration; they are left inside the loop because code after this point
    # may modify ``all_data`` -- confirm before hoisting.
    col_index = line[bad_feature][
        (line[bad_feature] > all_data[bad_feature].mean() + 3 * all_data[bad_feature].std())
        | (line[bad_feature] < all_data[bad_feature].mean() - 3 * all_data[bad_feature].std())
    ].index
def restore_dataframe(
    store,
    key,
    filter_query=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categories=None,
    predicates=None,
    date_as_object=False,
):
    """Read the Parquet blob stored under ``key`` and return it as a DataFrame.

    Parameters
    ----------
    store
        Object providing ``get(key)`` and ``open(key)`` for the stored blob.
    key
        Key of the blob inside ``store``.
    filter_query
        Passed to ``filter_df`` when no ``predicates`` are given.
    columns
        Columns to return; ``None`` returns every column.  An empty list
        yields a frame without columns but with one index entry per row.
    predicate_pushdown_to_io
        When false, the whole blob is always read in one go.
    categories
        Forwarded to ``Table.to_pandas``.
    predicates
        Validated with ``check_predicates``; used for row-group selection
        while reading and for filtering the resulting DataFrame.
    date_as_object
        Forwarded to ``Table.to_pandas`` and, as ``strict_date_types``, to
        ``filter_df_from_predicates``.

    Raises
    ------
    ValueError
        If any of the requested ``columns`` is missing from the stored table.
    """
    check_predicates(predicates)
    # If we want to do columnar access we can benefit from partial reads
    # otherwise full read en block is the better option.
    if (not predicate_pushdown_to_io) or (columns is None and predicates is None):
        with pa.BufferReader(store.get(key)) as reader:
            table = pq.read_pandas(reader, columns=columns)
    else:
        if HAVE_BOTO and isinstance(store, BotoStore):
            # Parquet and seeks on S3 currently leak connections thus
            # we omit column projection to the store.
            reader = pa.BufferReader(store.get(key))
        else:
            reader = store.open(key)
            # Buffer at least 4 MB in requests. This is chosen because the
            # default block size of the Azure storage client is 4MB.
            reader = BlockBuffer(reader, 4 * 1024 * 1024)
        try:
            parquet_file = ParquetFile(reader)
            if predicates and parquet_file.metadata.num_rows > 0:
                # We need to calculate different predicates for predicate
                # pushdown and the later DataFrame filtering. This is required
                # e.g. in the case where we have an `in` predicate as this has
                # different normalized values.
                columns_to_io = _columns_for_pushdown(columns, predicates)
                predicates_for_pushdown = _normalize_predicates(
                    parquet_file, predicates, True
                )
                predicates = _normalize_predicates(parquet_file, predicates, False)
                tables = _read_row_groups_into_tables(
                    parquet_file, columns_to_io, predicates_for_pushdown
                )

                if len(tables) == 0:
                    if ARROW_LARGER_EQ_0130:
                        table = parquet_file.schema.to_arrow_schema().empty_table()
                    else:
                        table = _empty_table_from_schema(parquet_file)
                else:
                    table = pa.concat_tables(tables)
            else:
                # ARROW-5139 Column projection with empty columns returns a
                # table w/out index
                if ARROW_LARGER_EQ_0130 and columns == []:
                    # Create an arrow table with expected index length.
                    df = (
                        parquet_file.schema.to_arrow_schema()
                        .empty_table()
                        .to_pandas(date_as_object=date_as_object)
                    )
                    # ``pd.Int64Index`` was removed in pandas 2.0.  Going
                    # through ``.values`` keeps the index materialised as
                    # int64 on every pandas version (``pd.Index`` would
                    # otherwise hand the RangeIndex back unchanged).
                    index = pd.Index(
                        pd.RangeIndex(
                            start=0, stop=parquet_file.metadata.num_rows
                        ).values
                    )
                    df = pd.DataFrame(df, index=index)
                    # convert back to table to keep downstream code untouched
                    # by this patch
                    table = pa.Table.from_pandas(df)
                else:
                    table = pq.read_pandas(reader, columns=columns)
        finally:
            reader.close()

    table = _fix_pyarrow_07992_table(table)

    table = _fix_pyarrow_0130_table(table)

    if columns is not None:
        missing_columns = set(columns) - set(table.schema.names)
        if missing_columns:
            raise ValueError(
                u"Columns cannot be found in stored dataframe: {missing}".format(
                    missing=u", ".join(sorted(missing_columns))
                )
            )

    df = table.to_pandas(categories=categories, date_as_object=date_as_object)
    df.columns = df.columns.map(ensure_unicode_string_type)
    if predicates:
        df = filter_df_from_predicates(
            df, predicates, strict_date_types=date_as_object
        )
    else:
        df = filter_df(df, filter_query)
    if columns is not None:
        return df.loc[:, columns]
    else:
        return df
def test_meta_nonempty_index():
    """Check that ``meta_nonempty`` keeps index type and metadata.

    For every index flavour below the result must have the same type as the
    input and preserve its name (plus tz / freq / categories / ordered /
    level types where applicable).
    """
    idx = pd.RangeIndex(1, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    # ``pd.Int64Index`` was removed in pandas 2.0; an int64 ``pd.Index`` is
    # an Int64Index on old pandas and a plain Index on new pandas, so compare
    # against the type of the input and pin the dtype separately.
    idx = pd.Index([1], dtype='int64', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is type(idx)
    assert res.dtype == 'int64'
    assert res.name == idx.name

    idx = pd.Index(['a'], name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    idx = pd.DatetimeIndex(['1970-01-01'], freq='D',
                           tz='America/New_York', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(['1970-01-01'], freq='D', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, 'D')], freq='D', name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(['xyx'], ['xyx', 'zzz'],
                              ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    idx = pd.CategoricalIndex([], [UNKNOWN_CATEGORIES],
                              ordered=True, name='foo')
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    # The second MultiIndex argument was called ``labels`` before pandas 0.24
    # and ``codes`` afterwards (``labels`` was removed in 1.0).  Passing it
    # positionally works on every version.
    levels = [pd.Index([1], dtype='int64', name='a'),
              pd.Index([1.0], dtype='float64', name='b')]
    idx = pd.MultiIndex(levels, [[0], [0]], names=['a', 'b'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [pd.Index([1], dtype='int64', name='a'),
              pd.CategoricalIndex(data=['xyx'], categories=['xyx'], name='b'),
              pd.TimedeltaIndex([np.timedelta64(1, 'D')], name='timedelta')]
    idx = pd.MultiIndex(levels, [[0], [0], [0]],
                        names=['a', 'b', 'timedelta'])
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
def from_stimulus_file(
        cls,
        stimulus_file: StimulusFile,
        stimulus_timestamps: StimulusTimestamps,
        limit_to_images: Optional[List] = None) -> "Presentations":
    """Get stimulus presentation data.

    :param stimulus_file
        Source of the stimulus data (its ``data`` attribute is read).
    :param stimulus_timestamps
        Timestamps of the stimulus (its ``value`` attribute is read).
    :param limit_to_images
        Only return images given by these image names

    :returns: Presentations -- wraps a table whose rows are stimulus
        presentations and whose columns are presentation characteristics.
    :raises ValueError: if both ``image_name`` and ``orientation`` are
        entirely null, or if merging in the stimulus metadata changes the
        number of rows.
    """
    stimulus_timestamps = stimulus_timestamps.value
    data = stimulus_file.data
    raw_stim_pres_df = get_stimulus_presentations(data, stimulus_timestamps)

    # Fill in nulls for image_name
    # This makes two assumptions:
    #   1. Nulls in `image_name` should be "gratings_<orientation>"
    #   2. Gratings are only present (or need to be fixed) when all
    #      values for `image_name` are null.
    if pd.isnull(raw_stim_pres_df["image_name"]).all():
        # ``not`` rather than ``~``: the latter is a bitwise inversion and is
        # only a logical negation when ``.all()`` returns a numpy bool
        # (``~True`` on a Python bool is -2, which is truthy).
        if not pd.isnull(raw_stim_pres_df["orientation"]).all():
            raw_stim_pres_df["image_name"] = (
                raw_stim_pres_df["orientation"].apply(
                    lambda x: f"gratings_{x}"))
        else:
            raise ValueError("All values for 'orientation' and 'image_name'"
                             " are null.")

    stimulus_metadata_df = get_stimulus_metadata(data)

    idx_name = raw_stim_pres_df.index.name
    stimulus_index_df = (raw_stim_pres_df.reset_index().merge(
        stimulus_metadata_df.reset_index(),
        on=["image_name"]).set_index(idx_name))
    stimulus_index_df = (stimulus_index_df[[
        "image_set", "image_index", "start_time", "phase",
        "spatial_frequency"
    ]].rename(columns={
        "start_time": "timestamps"
    }).sort_index().set_index("timestamps", drop=True))
    stim_pres_df = raw_stim_pres_df.merge(stimulus_index_df,
                                          left_on="start_time",
                                          right_index=True,
                                          how="left")
    if len(raw_stim_pres_df) != len(stim_pres_df):
        raise ValueError("Length of `stim_pres_df` should not change after"
                         f" merge; was {len(raw_stim_pres_df)}, now"
                         f" {len(stim_pres_df)}.")

    stim_pres_df['is_change'] = is_change_event(
        stimulus_presentations=stim_pres_df)

    # Sort columns then drop columns which contain only all NaN values
    stim_pres_df = \
        stim_pres_df[sorted(stim_pres_df)].dropna(axis=1, how='all')
    if limit_to_images is not None:
        stim_pres_df = \
            stim_pres_df[stim_pres_df['image_name'].isin(limit_to_images)]
        # Renumber the remaining rows 0..n-1.  ``pd.Int64Index`` was removed
        # in pandas 2.0; building from a list keeps a materialised int64
        # index on every pandas version.
        stim_pres_df.index = pd.Index(
            list(range(stim_pres_df.shape[0])),
            dtype="int64",
            name=stim_pres_df.index.name)
    stim_pres_df = cls._postprocess(presentations=stim_pres_df)
    return Presentations(presentations=stim_pres_df)
def test_make_meta():
    """Check ``make_meta`` for each kind of input exercised below.

    Covers pandas DataFrame/Series/Index objects, a dask DataFrame, dict /
    iterable / tuple dtype specifications (with and without an explicit
    index), categoricals, scalars, timestamps, ``DatetimeTZDtype``, dtype
    expressions and the ``TypeError`` raised for ``None``.
    """
    df = pd.DataFrame(
        {"a": [1, 2, 3], "b": list("abc"), "c": [1.0, 2.0, 3.0]},
        index=[10, 20, 30],
    )

    # Pandas dataframe
    meta = make_meta(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta(ddf) is ddf._meta

    # Dict
    meta = make_meta({"a": "i8", "b": "O", "c": "f8"})
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Iterable
    meta = make_meta([("a", "i8"), ("c", "f8"), ("b", "O")])
    assert (meta.columns == ["a", "c", "b"]).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple
    meta = make_meta(("a", "i8"))
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == "i8"
    assert meta.name == "a"

    # With index
    # ``pd.Int64Index`` was removed in pandas 2.0.  ``pd.Index`` with an
    # explicit int64 dtype yields an Int64Index on old pandas and a plain
    # int64 Index on new pandas, so compare against the type of the input.
    idx = pd.Index([1, 2], dtype="int64", name="foo")
    meta = make_meta({"a": "i8", "b": "i4"}, index=idx)
    assert type(meta.index) is type(idx)
    assert meta.index.dtype == "int64"
    assert len(meta.index) == 0
    meta = make_meta(("a", "i8"), index=idx)
    assert type(meta.index) is type(idx)
    assert meta.index.dtype == "int64"
    assert len(meta.index) == 0

    # Categoricals
    meta = make_meta({"a": "category"}, parent_meta=df)
    assert len(meta.a.cat.categories) == 1
    assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES
    meta = make_meta(("a", "category"), parent_meta=df)
    assert len(meta.cat.categories) == 1
    assert meta.cat.categories[0] == UNKNOWN_CATEGORIES

    # Numpy scalar
    meta = make_meta(np.float64(1.0), parent_meta=df)
    assert isinstance(meta, np.float64)

    # Python scalar
    meta = make_meta(1.0, parent_meta=df)
    assert isinstance(meta, np.float64)

    # Timestamp
    x = pd.Timestamp(2000, 1, 1)
    meta = make_meta(x, parent_meta=df)
    assert meta is x

    # DatetimeTZDtype
    x = pd.DatetimeTZDtype(tz="UTC")
    meta = make_meta(x)
    assert meta == pd.Timestamp(1, tz=x.tz, unit=x.unit)

    # Dtype expressions
    meta = make_meta("i8", parent_meta=df)
    assert isinstance(meta, np.int64)
    meta = make_meta(float, parent_meta=df)
    assert isinstance(meta, np.dtype(float).type)
    meta = make_meta(np.dtype("bool"), parent_meta=df)
    assert isinstance(meta, np.bool_)

    # Context-manager form: the failure, if any, is reported by pytest
    # itself instead of relying on the truthiness of the ExceptionInfo.
    with pytest.raises(TypeError):
        make_meta(None)
def test_meta_nonempty_index():
    """Check that ``meta_nonempty`` keeps index type and metadata.

    For every index flavour below the result must have the same type as the
    input and preserve its name (plus tz / freq / categories / ordered /
    level types where applicable).
    """
    idx = pd.RangeIndex(1, name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.RangeIndex
    assert res.name == idx.name

    # ``pd.Int64Index`` was removed in pandas 2.0; an int64 ``pd.Index`` is
    # an Int64Index on old pandas and a plain Index on new pandas, so compare
    # against the type of the input and pin the dtype separately.
    idx = pd.Index([1], dtype="int64", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is type(idx)
    assert res.dtype == "int64"
    assert res.name == idx.name

    idx = pd.Index(["a"], name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.Index
    assert res.name == idx.name

    idx = pd.DatetimeIndex(
        ["1970-01-01"], freq="D", tz="America/New_York", name="foo"
    )
    res = meta_nonempty(idx)
    assert type(res) is pd.DatetimeIndex
    assert res.tz == idx.tz
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.PeriodIndex(["1970-01-01"], freq="D", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.PeriodIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.TimedeltaIndex([np.timedelta64(1, "D")], freq="D", name="foo")
    res = meta_nonempty(idx)
    assert type(res) is pd.TimedeltaIndex
    assert res.freq == idx.freq
    assert res.name == idx.name

    idx = pd.CategoricalIndex(
        ["xyx"], ["xyx", "zzz"], ordered=True, name="foo"
    )
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert (res.categories == idx.categories).all()
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    idx = pd.CategoricalIndex(
        [], [UNKNOWN_CATEGORIES], ordered=True, name="foo"
    )
    res = meta_nonempty(idx)
    assert type(res) is pd.CategoricalIndex
    assert res.ordered == idx.ordered
    assert res.name == idx.name

    # Explicit dtypes replace the removed Int64Index / Float64Index classes.
    levels = [
        pd.Index([1], dtype="int64", name="a"),
        pd.Index([1.0], dtype="float64", name="b"),
    ]
    codes = [[0], [0]]
    idx = pd.MultiIndex(levels=levels, names=["a", "b"], codes=codes)
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names

    levels = [
        pd.Index([1], dtype="int64", name="a"),
        pd.CategoricalIndex(data=["xyx"], categories=["xyx"], name="b"),
        pd.TimedeltaIndex([np.timedelta64(1, "D")], name="timedelta"),
    ]
    codes = [[0], [0], [0]]
    idx = pd.MultiIndex(
        levels=levels, names=["a", "b", "timedelta"], codes=codes
    )
    res = meta_nonempty(idx)
    assert type(res) is pd.MultiIndex
    for idx1, idx2 in zip(idx.levels, res.levels):
        assert type(idx1) is type(idx2)
        assert idx1.name == idx2.name
    assert res.names == idx.names
def fix_df_index_impl(index):
    """Return a ``pd.Int64Index`` built from ``fix_df_array(index)``."""
    return pd.Int64Index(fix_df_array(index))
@pytest.fixture(params=zeros)
def zero(request):
    """
    Several scalar-zeros and length-5 vector-zeros, for testing division
    by (or of) zero for an Index with length 5.
    """
    return request.param


# ------------------------------------------------------------------
# Vector Fixtures


@pytest.fixture(
    params=[
        pd.Float64Index(np.arange(5, dtype=np.float64)),
        pd.Int64Index(np.arange(5, dtype=np.int64)),
        pd.UInt64Index(np.arange(5, dtype=np.uint64)),
        pd.RangeIndex(start=0, stop=5),
    ],
    # Label each parametrization with the class name of its index.
    ids=lambda index: type(index).__name__,
)
def numeric_idx(request):
    """
    Several types of numeric-dtypes Index objects
    """
    index = request.param
    return index


# ------------------------------------------------------------------
# Scalar Fixtures
def setup(self, keep):
    """Create the integer, float and string indexes stored on ``self``.

    The integer and float indexes hold ``10**5`` base values, each repeated
    five times in a row; the string index has ``10**5`` entries.  ``keep``
    is not used here.
    """
    n_values = 10**5
    n_repeats = 5
    int_values = np.repeat(np.arange(n_values), n_repeats)
    self.int_idx = pd.Int64Index(int_values)
    float_values = np.repeat(np.random.randn(n_values), n_repeats)
    self.float_idx = pd.Float64Index(float_values)
    self.string_idx = tm.makeStringIndex(n_values)
def test_filter_index_value():
    """Check ``filter_index_value`` against plain pandas boolean masking.

    Range indexes (ascending, descending, stepped) are filtered with fully
    closed and fully open bounds; an ``Int64Index`` is filtered with a
    half-open bound, both with and without stored data.
    """

    def expected_labels(index, bounds):
        # ``bounds`` is (low, low_closed, high, high_closed).
        low, low_closed, high, high_closed = bounds
        above = index >= low if low_closed else index > low
        below = index <= high if high_closed else index < high
        return index[above & below].tolist()

    # (index, keyword arguments for parse_index, low bound, high bound)
    range_cases = [
        (pd.RangeIndex(10), {}, 0, 9),
        (pd.RangeIndex(1, 11, 3), {}, 2, 10),
        (pd.RangeIndex(9, -1, -1), {}, 0, 9),
        (pd.RangeIndex(10, 0, -3), {"store_data": False}, 2, 10),
    ]
    for pd_index, parse_kwargs, low, high in range_cases:
        index_value = parse_index(pd_index, **parse_kwargs)
        # First with both ends closed, then with both ends open.
        for closed in (True, False):
            min_max = (low, closed, high, closed)
            result = filter_index_value(index_value, min_max)
            assert result.to_pandas().tolist() == expected_labels(
                pd_index, min_max)

    pd_index = pd.Int64Index([0, 3, 8])
    min_max = (2, True, 8, False)

    # With the data stored, the surviving labels can be compared directly.
    index_value = parse_index(pd_index, store_data=True)
    result = filter_index_value(index_value, min_max, store_data=True)
    assert result.to_pandas().tolist() == expected_labels(pd_index, min_max)

    # Without stored data the result is empty but keeps the index kind.
    index_value = parse_index(pd_index)
    filtered = filter_index_value(index_value, min_max)
    assert len(filtered.to_pandas().tolist()) == 0
    assert isinstance(filtered.value, IndexValue.Int64Index)
class StatisticalMethodsTestCase(zf.WithSeededRandomPipelineEngine, zf.ZiplineTestCase): sids = ASSET_FINDER_EQUITY_SIDS = pd.Int64Index([1, 2, 3]) START_DATE = pd.Timestamp("2015-01-31", tz="UTC") END_DATE = pd.Timestamp("2015-03-01", tz="UTC") ASSET_FINDER_COUNTRY_CODE = "US" SEEDED_RANDOM_PIPELINE_DEFAULT_DOMAIN = US_EQUITIES @classmethod def init_class_fixtures(cls): super(StatisticalMethodsTestCase, cls).init_class_fixtures() # Using these start and end dates because they are a contigous span of # 5 days (Monday - Friday) and they allow for plenty of days to look # back on when computing correlations and regressions. cls.dates = dates = cls.trading_days cls.start_date_index = start_date_index = 14 cls.end_date_index = end_date_index = 18 cls.pipeline_start_date = cls.trading_days[start_date_index] cls.pipeline_end_date = cls.trading_days[end_date_index] sids = cls.sids cls.assets = assets = cls.asset_finder.retrieve_all(sids) cls.my_asset_column = my_asset_column = 0 cls.my_asset = assets[my_asset_column] cls.num_days = num_days = end_date_index - start_date_index + 1 cls.num_assets = num_assets = len(assets) cls.cascading_mask = AssetIDPlusDay() < (sids[-1] + dates[start_date_index].day) cls.expected_cascading_mask_result = make_cascading_boolean_array( shape=(num_days, num_assets), ) cls.alternating_mask = (AssetIDPlusDay() % 2).eq(0) cls.expected_alternating_mask_result = make_alternating_boolean_array( shape=(num_days, num_assets), ) cls.expected_no_mask_result = np.full( shape=(num_days, num_assets), fill_value=True, dtype=bool_dtype, ) # Random input for factors. cls.col = TestingDataSet.float_col @parameter_space(returns_length=[2, 3], correlation_length=[3, 4]) def test_factor_correlation_methods(self, returns_length, correlation_length): """ Ensure that `Factor.pearsonr` and `Factor.spearmanr` are consistent with the built-in factors `RollingPearsonOfReturns` and `RollingSpearmanOfReturns`. 
""" my_asset = self.my_asset start_date = self.pipeline_start_date end_date = self.pipeline_end_date run_pipeline = self.run_pipeline returns = Returns(window_length=returns_length, inputs=[self.col]) returns_slice = returns[my_asset] pearson = returns.pearsonr( target=returns_slice, correlation_length=correlation_length, ) spearman = returns.spearmanr( target=returns_slice, correlation_length=correlation_length, ) expected_pearson = RollingPearsonOfReturns( target=my_asset, returns_length=returns_length, correlation_length=correlation_length, ) expected_spearman = RollingSpearmanOfReturns( target=my_asset, returns_length=returns_length, correlation_length=correlation_length, ) # These built-ins construct their own Returns factor to use as inputs, # so the only way to set our own inputs is to do so after the fact. # This should not be done in practice. It is necessary here because we # want Returns to use our random data as an input, but by default it is # using USEquityPricing.close. expected_pearson.inputs = [returns, returns_slice] expected_spearman.inputs = [returns, returns_slice] columns = { "pearson": pearson, "spearman": spearman, "expected_pearson": expected_pearson, "expected_spearman": expected_spearman, } results = run_pipeline(Pipeline(columns=columns), start_date, end_date) pearson_results = results["pearson"].unstack() spearman_results = results["spearman"].unstack() expected_pearson_results = results["expected_pearson"].unstack() expected_spearman_results = results["expected_spearman"].unstack() assert_frame_equal(pearson_results, expected_pearson_results) assert_frame_equal(spearman_results, expected_spearman_results) def test_correlation_methods_bad_type(self): """ Make sure we cannot call the Factor correlation methods on factors or slices that are not of float or int dtype. """ # These are arbitrary for the purpose of this test. 
returns_length = 2 correlation_length = 10 returns = Returns(window_length=returns_length, inputs=[self.col]) returns_slice = returns[self.my_asset] class BadTypeFactor(CustomFactor): inputs = [] window_length = 1 dtype = datetime64ns_dtype window_safe = True def compute(self, today, assets, out): pass bad_type_factor = BadTypeFactor() bad_type_factor_slice = bad_type_factor[self.my_asset] with pytest.raises(TypeError): bad_type_factor.pearsonr( target=returns_slice, correlation_length=correlation_length, ) with pytest.raises(TypeError): bad_type_factor.spearmanr( target=returns_slice, correlation_length=correlation_length, ) with pytest.raises(TypeError): returns.pearsonr( target=bad_type_factor_slice, correlation_length=correlation_length, ) with pytest.raises(TypeError): returns.spearmanr( target=bad_type_factor_slice, correlation_length=correlation_length, ) @parameter_space(returns_length=[2, 3], regression_length=[3, 4]) def test_factor_regression_method(self, returns_length, regression_length): """ Ensure that `Factor.linear_regression` is consistent with the built-in factor `RollingLinearRegressionOfReturns`. """ my_asset = self.my_asset start_date = self.pipeline_start_date end_date = self.pipeline_end_date run_pipeline = self.run_pipeline returns = Returns(window_length=returns_length, inputs=[self.col]) returns_slice = returns[my_asset] regression = returns.linear_regression( target=returns_slice, regression_length=regression_length, ) expected_regression = RollingLinearRegressionOfReturns( target=my_asset, returns_length=returns_length, regression_length=regression_length, ) # This built-in constructs its own Returns factor to use as an input, # so the only way to set our own input is to do so after the fact. This # should not be done in practice. It is necessary here because we want # Returns to use our random data as an input, but by default it is # using USEquityPricing.close. 
expected_regression.inputs = [returns, returns_slice] columns = { "regression": regression, "expected_regression": expected_regression, } results = run_pipeline(Pipeline(columns=columns), start_date, end_date) regression_results = results["regression"].unstack() expected_regression_results = results["expected_regression"].unstack() assert_frame_equal(regression_results, expected_regression_results) def test_regression_method_bad_type(self): """ Make sure we cannot call the Factor linear regression method on factors or slices that are not of float or int dtype. """ # These are arbitrary for the purpose of this test. returns_length = 2 regression_length = 10 returns = Returns(window_length=returns_length, inputs=[self.col]) returns_slice = returns[self.my_asset] class BadTypeFactor(CustomFactor): window_length = 1 inputs = [] dtype = datetime64ns_dtype window_safe = True def compute(self, today, assets, out): pass bad_type_factor = BadTypeFactor() bad_type_factor_slice = bad_type_factor[self.my_asset] with pytest.raises(TypeError): bad_type_factor.linear_regression( target=returns_slice, regression_length=regression_length, ) with pytest.raises(TypeError): returns.linear_regression( target=bad_type_factor_slice, regression_length=regression_length, ) @parameter_space(correlation_length=[2, 3, 4]) def test_factor_correlation_methods_two_factors(self, correlation_length): """ Tests for `Factor.pearsonr` and `Factor.spearmanr` when passed another 2D factor instead of a Slice. """ assets = self.assets dates = self.dates start_date = self.pipeline_start_date end_date = self.pipeline_end_date start_date_index = self.start_date_index end_date_index = self.end_date_index num_days = self.num_days run_pipeline = self.run_pipeline # Ensure that the correlation methods cannot be called with two 2D # factors which have different masks. 
returns_masked_1 = Returns( window_length=5, inputs=[self.col], mask=AssetID().eq(1), ) returns_masked_2 = Returns( window_length=5, inputs=[self.col], mask=AssetID().eq(2), ) with pytest.raises(IncompatibleTerms): returns_masked_1.pearsonr( target=returns_masked_2, correlation_length=correlation_length, ) with pytest.raises(IncompatibleTerms): returns_masked_1.spearmanr( target=returns_masked_2, correlation_length=correlation_length, ) returns_5 = Returns(window_length=5, inputs=[self.col]) returns_10 = Returns(window_length=10, inputs=[self.col]) pearson_factor = returns_5.pearsonr( target=returns_10, correlation_length=correlation_length, ) spearman_factor = returns_5.spearmanr( target=returns_10, correlation_length=correlation_length, ) columns = { "pearson_factor": pearson_factor, "spearman_factor": spearman_factor, } pipeline = Pipeline(columns=columns) results = run_pipeline(pipeline, start_date, end_date) pearson_results = results["pearson_factor"].unstack() spearman_results = results["spearman_factor"].unstack() # Run a separate pipeline that calculates returns starting # (correlation_length - 1) days prior to our start date. This is # because we need (correlation_length - 1) extra days of returns to # compute our expected correlations. columns = {"returns_5": returns_5, "returns_10": returns_10} results = run_pipeline( Pipeline(columns=columns), dates[start_date_index - (correlation_length - 1)], dates[end_date_index], ) returns_5_results = results["returns_5"].unstack() returns_10_results = results["returns_10"].unstack() # On each day, calculate the expected correlation coefficients # between each asset's 5 and 10 day rolling returns. Each correlation # is calculated over `correlation_length` days. 
expected_pearson_results = np.full_like(pearson_results, nan) expected_spearman_results = np.full_like(spearman_results, nan) for day in range(num_days): todays_returns_5 = returns_5_results.iloc[day:day + correlation_length] todays_returns_10 = returns_10_results.iloc[day:day + correlation_length] for asset, asset_returns_5 in todays_returns_5.iteritems(): asset_column = int(asset) - 1 asset_returns_10 = todays_returns_10[asset] expected_pearson_results[day, asset_column] = pearsonr( asset_returns_5, asset_returns_10, )[0] expected_spearman_results[day, asset_column] = spearmanr( asset_returns_5, asset_returns_10, )[0] expected_pearson_results = pd.DataFrame( data=expected_pearson_results, index=dates[start_date_index:end_date_index + 1], columns=assets, ) assert_frame_equal(pearson_results, expected_pearson_results) expected_spearman_results = pd.DataFrame( data=expected_spearman_results, index=dates[start_date_index:end_date_index + 1], columns=assets, ) assert_frame_equal(spearman_results, expected_spearman_results) @parameter_space(regression_length=[2, 3, 4]) def test_factor_regression_method_two_factors(self, regression_length): """ Tests for `Factor.linear_regression` when passed another 2D factor instead of a Slice. """ assets = self.assets dates = self.dates start_date = self.pipeline_start_date end_date = self.pipeline_end_date start_date_index = self.start_date_index end_date_index = self.end_date_index num_days = self.num_days run_pipeline = self.run_pipeline # The order of these is meant to align with the output of `linregress`. outputs = ["beta", "alpha", "r_value", "p_value", "stderr"] # Ensure that the `linear_regression` method cannot be called with two # 2D factors which have different masks. 
returns_masked_1 = Returns( window_length=5, inputs=[self.col], mask=AssetID().eq(1), ) returns_masked_2 = Returns( window_length=5, inputs=[self.col], mask=AssetID().eq(2), ) with pytest.raises(IncompatibleTerms): returns_masked_1.linear_regression( target=returns_masked_2, regression_length=regression_length, ) returns_5 = Returns(window_length=5, inputs=[self.col]) returns_10 = Returns(window_length=10, inputs=[self.col]) regression_factor = returns_5.linear_regression( target=returns_10, regression_length=regression_length, ) columns = { output: getattr(regression_factor, output) for output in outputs } pipeline = Pipeline(columns=columns) results = run_pipeline(pipeline, start_date, end_date) output_results = {} expected_output_results = {} for output in outputs: output_results[output] = results[output].unstack() expected_output_results[output] = np.full_like( output_results[output], nan, ) # Run a separate pipeline that calculates returns starting # (regression_length - 1) days prior to our start date. This is because # we need (regression_length - 1) extra days of returns to compute our # expected regressions. columns = {"returns_5": returns_5, "returns_10": returns_10} results = run_pipeline( Pipeline(columns=columns), dates[start_date_index - (regression_length - 1)], dates[end_date_index], ) returns_5_results = results["returns_5"].unstack() returns_10_results = results["returns_10"].unstack() # On each day, for each asset, calculate the expected regression # results of Y ~ X where Y is the asset's rolling 5 day returns and X # is the asset's rolling 10 day returns. Each regression is calculated # over `regression_length` days of data. 
for day in range(num_days): todays_returns_5 = returns_5_results.iloc[day:day + regression_length] todays_returns_10 = returns_10_results.iloc[day:day + regression_length] for asset, asset_returns_5 in todays_returns_5.iteritems(): asset_column = int(asset) - 1 asset_returns_10 = todays_returns_10[asset] expected_regression_results = linregress( y=asset_returns_5, x=asset_returns_10, ) for i, output in enumerate(outputs): expected_output_results[output][ day, asset_column] = expected_regression_results[i] for output in outputs: output_result = output_results[output] expected_output_result = pd.DataFrame( expected_output_results[output], index=dates[start_date_index:end_date_index + 1], columns=assets, ) assert_frame_equal(output_result, expected_output_result)
def test_infer_index_value():
    """Check the index value inferred from pairs of parsed pandas indexes.

    Each case parses two pandas indexes with ``parse_index``, combines them
    with ``infer_index_value`` and asserts on the resulting value type
    and/or on whether the resulting key equals the keys of the inputs.
    """

    def infer(left_index, right_index):
        # Parse both operands first, then infer the combined index value.
        left_value = parse_index(left_index)
        right_value = parse_index(right_index)
        return left_value, right_value, infer_index_value(left_value,
                                                          right_value)

    def assert_keys_match(inferred, left_value, right_value):
        assert inferred.key == left_value.key
        assert inferred.key == right_value.key

    def assert_keys_differ(inferred, left_value, right_value):
        assert inferred.key != left_value.key
        assert inferred.key != right_value.key

    # same range index
    ival1, ival2, oival = infer(pd.RangeIndex(1, 3), pd.RangeIndex(1, 3))
    assert_keys_match(oival, ival1, ival2)

    # different range index
    ival1, ival2, oival = infer(pd.RangeIndex(1, 3), pd.RangeIndex(2, 4))
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert_keys_differ(oival, ival1, ival2)

    # same int64 index, all unique
    ival1, ival2, oival = infer(pd.Int64Index([1, 2]), pd.Int64Index([1, 2]))
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert_keys_match(oival, ival1, ival2)

    # same int64 index, not all unique
    ival1, ival2, oival = infer(pd.Int64Index([1, 2, 2]),
                                pd.Int64Index([1, 2, 2]))
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert_keys_differ(oival, ival1, ival2)

    # different int64 index
    ival1, ival2, oival = infer(pd.Int64Index([1, 2]), pd.Int64Index([2, 3]))
    assert isinstance(oival.value, IndexValue.Int64Index)
    assert_keys_differ(oival, ival1, ival2)

    # different index type
    ival1, ival2, oival = infer(pd.Int64Index([1, 2]),
                                pd.Float64Index([2.0, 3.0]))
    assert isinstance(oival.value, IndexValue.Float64Index)
    assert_keys_differ(oival, ival1, ival2)

    # range index and other index
    ival1, ival2, oival = infer(pd.RangeIndex(1, 4),
                                pd.Float64Index([2, 3, 4]))
    assert isinstance(oival.value, IndexValue.Float64Index)
    assert_keys_differ(oival, ival1, ival2)

    # empty datetime index combined with a range index
    ival1, ival2, oival = infer(pd.DatetimeIndex([]), pd.RangeIndex(2))
    assert isinstance(oival.value, IndexValue.Index)
    assert_keys_differ(oival, ival1, ival2)
def _pandas_basic_index(pandas, entry_start, entry_stop): if hasattr(pandas, "RangeIndex"): return pandas.RangeIndex(entry_start, entry_stop) else: return pandas.Int64Index(uproot4._util.range(entry_start, entry_stop))
def _check_values(values: Union[VALID_FORECASTING_HORIZON_TYPES]) -> pd.Index:
    """Validate forecasting horizon values.

    Validation checks validity and also converts forecasting horizon values
    to supported pandas.Index types if possible.

    Parameters
    ----------
    values : int, list, array, certain pd.Index types
        Forecasting horizon with steps ahead to predict.

    Raises
    ------
    TypeError
        Raised if `values` type is not supported.
    ValueError
        Raised if `values` contains duplicates.

    Returns
    -------
    values : pd.Index
        Sorted and validated forecasting horizon values. A single integer,
        timedelta or date offset is wrapped in a one-element index and
        returned directly.
    """
    # if values are one of the supported pandas index types, we don't have
    # to do anything as the forecasting horizon directly wraps the index.
    # Note that isinstance() does not work here, because index types inherit
    # from each other, hence we check for type equality.
    if type(values) in VALID_INDEX_TYPES:
        pass

    # convert single integer to pandas index, no further checks needed.
    # `pd.Index(..., dtype="int64")` is used instead of the `pd.Int64Index`
    # constructor: it yields an Int64Index on pandas versions that still have
    # that class and keeps working on versions where it has been removed.
    elif is_int(values):
        return pd.Index([values], dtype="int64")

    # a single timedelta / date offset needs no further checks either
    elif is_timedelta_or_date_offset(values):
        return pd.Index([values])

    # convert np.array or list to pandas index
    elif is_array(values) and array_is_int(values):
        values = pd.Index(values, dtype="int64")

    elif is_array(values) and array_is_timedelta_or_date_offset(values):
        values = pd.Index(values)

    # otherwise, raise type error
    else:
        valid_types = (
            "int",
            "np.array",
            "list",
            *[f"pd.{index_type.__name__}" for index_type in VALID_INDEX_TYPES],
        )
        raise TypeError(
            f"Invalid `fh`. The type of the passed `fh` values is not supported. "
            f"Please use one of {valid_types}, but found: {type(values)}")

    # check values does not contain duplicates
    if len(values) != values.nunique():
        raise ValueError(
            "Invalid `fh`. The `fh` values must not contain any duplicates.")

    # return sorted values
    return values.sort_values()
def pd_range_index_getitem_impl(self, idx):
    """Return the elements of ``self`` selected by ``idx`` as an Int64Index.

    The selection is delegated to ``_sdc_take``; the result keeps the name
    stored in ``self._name``.
    """
    return pd.Int64Index(_sdc_take(self, idx), name=self._name)
def test_constructor_unwraps_index(self):
    """Int64Index built from an Index must hold a plain int64 ndarray."""
    wrapped = pd.Index([1, 2])
    rebuilt = pd.Int64Index(wrapped)

    # ``_data`` is compared as a raw numpy array, not as an Index.
    tm.assert_numpy_array_equal(rebuilt._data,
                                np.array([1, 2], dtype="int64"))
class TestNumericArraylikeArithmeticWithTimedeltaScalar(object):
    """Multiplication/division between numeric array-likes and timedeltas."""

    # TODO: de-duplicate with test_numeric_arr_mul_tdscalar
    def test_ops_series(self):
        # regression test for GH#8813
        one_day = Timedelta('1 day')
        multipliers = pd.Series([1, 2])
        expected = pd.Series(pd.to_timedelta(['1 day', '2 days']))

        # multiplication must commute
        tm.assert_series_equal(expected, one_day * multipliers)
        tm.assert_series_equal(expected, multipliers * one_day)

    @pytest.mark.parametrize('box', [
        pd.Index,
        Series,
        pytest.param(pd.DataFrame,
                     marks=pytest.mark.xfail(reason="block.eval incorrect",
                                             strict=True))
    ])
    @pytest.mark.parametrize('index', [
        pd.Int64Index(range(1, 11)),
        pd.UInt64Index(range(1, 11)),
        pd.Float64Index(range(1, 11)),
        pd.RangeIndex(1, 11)],
        ids=lambda x: type(x).__name__)
    @pytest.mark.parametrize('scalar_td', [
        Timedelta(days=1),
        Timedelta(days=1).to_timedelta64(),
        Timedelta(days=1).to_pytimedelta()],
        ids=lambda x: type(x).__name__)
    def test_numeric_arr_mul_tdscalar(self, scalar_td, index, box):
        # GH#19333
        is_pytimedelta = type(scalar_td) is timedelta
        if box is Series and is_pytimedelta and index.dtype == 'f8':
            # pytest.xfail raises on its own; no explicit ``raise`` needed.
            pytest.xfail(reason="Cannot multiply timedelta by float")

        boxed_index = tm.box_expected(index, box)
        boxed_expected = tm.box_expected(
            pd.timedelta_range('1 days', '10 days'), box)

        tm.assert_equal(boxed_index * scalar_td, boxed_expected)
        tm.assert_equal(scalar_td * boxed_index, boxed_expected)

    @pytest.mark.parametrize('index', [
        pd.Int64Index(range(1, 3)),
        pd.UInt64Index(range(1, 3)),
        pd.Float64Index(range(1, 3)),
        pd.RangeIndex(1, 3)],
        ids=lambda x: type(x).__name__)
    @pytest.mark.parametrize('scalar_td', [
        Timedelta(days=1),
        Timedelta(days=1).to_timedelta64(),
        Timedelta(days=1).to_pytimedelta()],
        ids=lambda x: type(x).__name__)
    def test_numeric_arr_rdiv_tdscalar(self, scalar_td, index, box):
        series_with_pytimedelta = (box is Series and
                                   type(scalar_td) is timedelta)
        frame_with_timedelta = (box is pd.DataFrame and
                                isinstance(scalar_td, timedelta))
        if series_with_pytimedelta or frame_with_timedelta:
            pytest.xfail(reason="TODO: Figure out why this case fails")

        boxed_index = tm.box_expected(index, box)
        boxed_expected = tm.box_expected(
            TimedeltaIndex(['1 Day', '12 Hours']), box)

        tm.assert_equal(scalar_td / boxed_index, boxed_expected)

        # dividing a numeric array-like by a timedelta is not defined
        with pytest.raises(TypeError):
            boxed_index / scalar_td
def test_marshall_index(self):
    """Exercise data_frame._marshall_index for each supported index kind."""
    frame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # Plain index: the column labels of the frame
    proto = Index()
    data_frame._marshall_index(frame.columns, proto)
    self.assertEqual(["col1", "col2"], proto.plain_index.data.strings.data)

    # Range index: the default row index of the frame
    proto = Index()
    data_frame._marshall_index(frame.index, proto)
    self.assertEqual(0, proto.range_index.start)
    self.assertEqual(2, proto.range_index.stop)

    # Range index of a frame built from empty columns
    empty_frame = pd.DataFrame(data={"col1": [], "col2": []})
    proto = Index()
    data_frame._marshall_index(empty_frame.index, proto)
    self.assertEqual(0, proto.range_index.start)
    self.assertEqual(0, proto.range_index.stop)

    # Multi index
    multi_idx = pd.MultiIndex.from_arrays(
        [[1, 2], [3, 4]], names=["one", "two"]
    )
    proto = Index()
    data_frame._marshall_index(multi_idx, proto)
    self.assertEqual([1, 2], proto.multi_index.levels[0].int_64_index.data.data)
    self.assertEqual([0, 1], proto.multi_index.labels[0].data)

    # Datetime index, with the local-zone lookup patched
    expected_values = [
        int(seconds * 1e9)
        for seconds in (1554138000, 1554141600, 1554145200)
    ]
    datetime_idx = pd.date_range(
        start="2019/04/01 10:00", end="2019/04/01 12:00", freq="H"
    )
    proto = Index()
    localzone_target = "streamlit.elements.data_frame.tzlocal.get_localzone"
    with patch(localzone_target) as get_localzone:
        get_localzone.return_value = "America/Los_Angeles"
        data_frame._marshall_index(datetime_idx, proto)
        self.assertEqual(expected_values, proto.datetime_index.data.data)

    # Timedelta index
    timedelta_idx = pd.to_timedelta(np.arange(1, 5), unit="ns")
    proto = Index()
    data_frame._marshall_index(timedelta_idx, proto)
    self.assertEqual([1, 2, 3, 4], proto.timedelta_index.data.data)

    # Int64 index
    int64_idx = pd.Int64Index(np.arange(1, 5))
    proto = Index()
    data_frame._marshall_index(int64_idx, proto)
    self.assertEqual([1, 2, 3, 4], proto.int_64_index.data.data)

    # Float64 index
    float64_idx = pd.Float64Index(np.arange(1, 5))
    proto = Index()
    data_frame._marshall_index(float64_idx, proto)
    self.assertEqual([1, 2, 3, 4], proto.float_64_index.data.data)

    # Period index: expected to be rejected with NotImplementedError
    period_idx = pd.period_range(
        start="2005-12-21 08:45 ", end="2005-12-21 11:55", freq="H"
    )
    proto = Index()
    with pytest.raises(NotImplementedError) as raised:
        data_frame._marshall_index(period_idx, proto)

    expected_message = (
        "Can't handle <class 'pandas.core.indexes.period.PeriodIndex'>"
        " yet."
    )
    self.assertEqual(expected_message, str(raised.value))
def testFilterIndexValue(self):
    """Compare filter_index_value results with direct pandas filtering."""

    def filtered_list(index_value, bounds, **filter_kwargs):
        # Materialise the filtered index value as a plain Python list.
        filtered_value = filter_index_value(index_value, bounds,
                                            **filter_kwargs)
        return filtered_value.to_pandas().tolist()

    # (pandas index, extra parse_index kwargs, lower bound, upper bound)
    range_cases = [
        (pd.RangeIndex(10), {}, 0, 9),
        (pd.RangeIndex(1, 11, 3), {}, 2, 10),
        (pd.RangeIndex(9, -1, -1), {}, 0, 9),
        (pd.RangeIndex(10, 0, -3), {'store_data': False}, 2, 10),
    ]
    for pd_index, parse_kwargs, lower, upper in range_cases:
        index_value = parse_index(pd_index, **parse_kwargs)

        # both bounds inclusive
        self.assertEqual(
            filtered_list(index_value, (lower, True, upper, True)),
            pd_index[(pd_index >= lower) & (pd_index <= upper)].tolist())

        # both bounds exclusive
        self.assertEqual(
            filtered_list(index_value, (lower, False, upper, False)),
            pd_index[(pd_index > lower) & (pd_index < upper)].tolist())

    # Int64Index parsed and filtered with its data stored
    pd_index = pd.Int64Index([0, 3, 8])
    half_open = (2, True, 8, False)
    index_value = parse_index(pd_index, store_data=True)
    self.assertEqual(
        filtered_list(index_value, half_open, store_data=True),
        pd_index[(pd_index >= 2) & (pd_index < 8)].tolist())

    # Int64Index parsed with default arguments: the filtered value is
    # expected to be empty but still of the Int64Index kind
    index_value = parse_index(pd_index)
    filtered = filter_index_value(index_value, half_open)
    self.assertEqual(len(filtered.to_pandas().tolist()), 0)
    self.assertIsInstance(filtered.value, IndexValue.Int64Index)
class TestFancy(Base):
    """Pure get/set item & fancy indexing tests."""

    def test_setitem_ndarray_1d(self):
        """Setting a 1-D ndarray must raise when its length does not fit."""
        # GH5508

        # len of indexer vs length of the 1d ndarray
        df = DataFrame(index=Index(lrange(1, 11)))
        df['foo'] = np.zeros(10, dtype=np.float64)
        df['bar'] = np.zeros(10, dtype=np.complex)

        # invalid: three labels selected, four values assigned
        def f():
            df.loc[df.index[2:5], 'bar'] = np.array([2.33j, 1.23 + 0.1j,
                                                     2.2, 1.0])

        pytest.raises(ValueError, f)

        # valid: four labels selected, four values assigned
        df.loc[df.index[2:6], 'bar'] = np.array([2.33j, 1.23 + 0.1j,
                                                 2.2, 1.0])

        result = df.loc[df.index[2:6], 'bar']
        expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0],
                          index=[3, 4, 5, 6],
                          name='bar')
        tm.assert_series_equal(result, expected)

        # dtype getting changed?
        df = DataFrame(index=Index(lrange(1, 11)))
        df['foo'] = np.zeros(10, dtype=np.float64)
        df['bar'] = np.zeros(10, dtype=np.complex)

        def f():
            df[2:5] = np.arange(1, 4) * 1j

        pytest.raises(ValueError, f)

    def test_inf_upcast(self):
        """Using np.inf as a label must give a Float64Index on that axis."""
        # GH 16957
        # We should be able to use np.inf as a key
        # np.inf should cause an index to convert to float

        # Test with np.inf in rows
        df = pd.DataFrame(columns=[0])
        df.loc[1] = 1
        df.loc[2] = 2
        df.loc[np.inf] = 3

        # make sure we can look up the value
        assert df.loc[np.inf, 0] == 3

        result = df.index
        expected = pd.Float64Index([1, 2, np.inf])
        tm.assert_index_equal(result, expected)

        # Test with np.inf in columns
        df = pd.DataFrame()
        df.loc[0, 0] = 1
        df.loc[1, 1] = 2
        df.loc[0, np.inf] = 3

        result = df.columns
        expected = pd.Float64Index([0, 1, np.inf])
        tm.assert_index_equal(result, expected)

    def test_setitem_dtype_upcast(self):
        """Setting a value of another type must only upcast that column."""

        # GH3216
        df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
        df['c'] = np.nan
        assert df['c'].dtype == np.float64

        df.loc[0, 'c'] = 'foo'
        expected = DataFrame([{
            "a": 1,
            "c": 'foo'
        }, {
            "a": 3,
            "b": 2,
            "c": np.nan
        }])
        tm.assert_frame_equal(df, expected)

        # GH10280
        df = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
                       index=list('ab'),
                       columns=['foo', 'bar', 'baz'])

        for val in [3.14, 'wxyz']:
            left = df.copy()
            left.loc['a', 'bar'] = val
            right = DataFrame([[0, val, 2], [3, 4, 5]],
                              index=list('ab'),
                              columns=['foo', 'bar', 'baz'])

            tm.assert_frame_equal(left, right)
            # the untouched columns must keep their integer dtype
            assert is_integer_dtype(left['foo'])
            assert is_integer_dtype(left['baz'])

        left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0,
                         index=list('ab'),
                         columns=['foo', 'bar', 'baz'])
        left.loc['a', 'bar'] = 'wxyz'

        right = DataFrame([[0, 'wxyz', .2], [.3, .4, .5]],
                          index=list('ab'),
                          columns=['foo', 'bar', 'baz'])

        tm.assert_frame_equal(left, right)
        # the untouched columns must keep their float dtype
        assert is_float_dtype(left['foo'])
        assert is_float_dtype(left['baz'])

    def test_dups_fancy_indexing(self):
        """Fancy indexing on frames with duplicate and/or missing labels."""

        # GH 3455
        from pandas.util.testing import makeCustomDataframe as mkdf
        df = mkdf(10, 3)
        df.columns = ['a', 'a', 'b']
        result = df[['b', 'a']].columns
        expected = Index(['b', 'a', 'a'])
        tm.assert_index_equal(result, expected)

        # across dtypes
        df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                       columns=list('aaaaaaa'))
        df.head()
        str(df)
        result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
        result.columns = list('aaaaaaa')

        # TODO(wesm): unused?
        df_v = df.iloc[:, 4]  # noqa
        res_v = result.iloc[:, 4]  # noqa

        tm.assert_frame_equal(df, result)

        # GH 3561, dups not in selected order
        df = DataFrame(
            {
                'test': [5, 7, 9, 11],
                'test1': [4., 5, 6, 7],
                'other': list('abcd')
            },
            index=['A', 'A', 'B', 'C'])
        rows = ['C', 'B']
        expected = DataFrame(
            {
                'test': [11, 9],
                'test1': [7., 6],
                'other': ['d', 'c']
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        result = df.loc[Index(rows)]
        tm.assert_frame_equal(result, expected)

        rows = ['C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [11, 9, np.nan],
                'test1': [7., 6, np.nan],
                'other': ['d', 'c', np.nan]
            },
            index=rows)

        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # see GH5553, make sure we use the right indexer
        rows = ['F', 'G', 'H', 'C', 'B', 'E']
        expected = DataFrame(
            {
                'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
                'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
                'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]
            },
            index=rows)
        result = df.loc[rows]
        tm.assert_frame_equal(result, expected)

        # inconsistent returns for unique/duplicate indices when values are
        # missing
        df = DataFrame(np.random.randn(4, 3), index=list('ABCD'))
        expected = df.reindex(['E'])

        dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
        with catch_warnings(record=True):
            result = dfnu.ix[['E']]
        tm.assert_frame_equal(result, expected)

        # ToDo: check_index_type can be True after GH 11497

        # GH 4619; duplicate indexer with missing label
        df = DataFrame({"A": [0, 1, 2]})
        result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        df = DataFrame({"A": list('abc')})
        result = df.loc[[0, 8, 0]]
        expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
        tm.assert_frame_equal(result, expected, check_index_type=False)

        # non unique with non unique selector
        df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
        expected = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                             index=['A', 'A', 'A', 'A', 'E'])
        result = df.loc[['A', 'A', 'E']]
        tm.assert_frame_equal(result, expected)

        # GH 5835
        # dups on index and missing values
        df = DataFrame(np.random.randn(5, 5),
                       columns=['A', 'B', 'B', 'B', 'A'])

        expected = pd.concat([
            df.loc[:, ['A', 'B']],
            DataFrame(np.nan, columns=['C'], index=df.index)
        ], axis=1)
        result = df.loc[:, ['A', 'B', 'C']]
        tm.assert_frame_equal(result, expected)

        # GH 6504, multi-axis indexing
        df = DataFrame(np.random.randn(9, 2),
                       index=[1, 1, 1, 2, 2, 2, 3, 3, 3],
                       columns=['a', 'b'])

        expected = df.iloc[0:6]
        result = df.loc[[1, 2]]
        tm.assert_frame_equal(result, expected)

        expected = df
        result = df.loc[:, ['a', 'b']]
        tm.assert_frame_equal(result, expected)

        expected = df.iloc[0:6, :]
        result = df.loc[[1, 2], ['a', 'b']]
        tm.assert_frame_equal(result, expected)

    def test_indexing_mixed_frame_bug(self):
        """A boolean-mask .loc assignment must be visible through .iloc."""

        # GH3492
        df = DataFrame({
            'a': {
                1: 'aaa',
                2: 'bbb',
                3: 'ccc'
            },
            'b': {
                1: 111,
                2: 222,
                3: 333
            }
        })

        # this works, new column is created correctly
        df['test'] = df['a'].apply(lambda x: '_' if x == 'aaa' else x)

        # this does not work, ie column test is not changed
        idx = df['test'] == '_'
        temp = df.loc[idx, 'a'].apply(lambda x: '-----' if x == 'aaa' else x)
        df.loc[idx, 'test'] = temp
        assert df.iloc[0, 2] == '-----'

        # if I look at df, then element [0,2] equals '_'. If instead I type
        # df.ix[idx,'test'], I get '-----', finally by typing df.iloc[0,2] I
        # get '_'.

    def test_multitype_list_index_access(self):
        """List access with absent labels raises KeyError on mixed columns."""
        # GH 10610
        df = pd.DataFrame(np.random.random((10, 5)),
                          columns=["a"] + [20, 21, 22, 23])

        with pytest.raises(KeyError):
            df[[22, 26, -8]]
        assert df[21].shape[0] == df.shape[0]

    def test_set_index_nan(self):
        """set_index followed by reset_index must round-trip NaN levels."""

        # GH 3586
        df = DataFrame({
            'PRuid': {
                17: 'nonQC',
                18: 'nonQC',
                19: 'nonQC',
                20: '10',
                21: '11',
                22: '12',
                23: '13',
                24: '24',
                25: '35',
                26: '46',
                27: '47',
                28: '48',
                29: '59',
                30: '10'
            },
            'QC': {
                17: 0.0,
                18: 0.0,
                19: 0.0,
                20: np.nan,
                21: np.nan,
                22: np.nan,
                23: np.nan,
                24: 1.0,
                25: np.nan,
                26: np.nan,
                27: np.nan,
                28: np.nan,
                29: np.nan,
                30: np.nan
            },
            'data': {
                17: 7.9544899999999998,
                18: 8.0142609999999994,
                19: 7.8591520000000008,
                20: 0.86140349999999999,
                21: 0.87853110000000001,
                22: 0.8427041999999999,
                23: 0.78587700000000005,
                24: 0.73062459999999996,
                25: 0.81668560000000001,
                26: 0.81927080000000008,
                27: 0.80705009999999999,
                28: 0.81440240000000008,
                29: 0.80140849999999997,
                30: 0.81307740000000006
            },
            'year': {
                17: 2006,
                18: 2007,
                19: 2008,
                20: 1985,
                21: 1985,
                22: 1985,
                23: 1985,
                24: 1985,
                25: 1985,
                26: 1985,
                27: 1985,
                28: 1985,
                29: 1985,
                30: 1986
            }
        }).reset_index()

        # reindex restores the original column order before comparing
        result = df.set_index(['year', 'PRuid',
                               'QC']).reset_index().reindex(columns=df.columns)
        tm.assert_frame_equal(result, df)

    def test_multi_nan_indexing(self):
        """set_index on two columns keeps NaN labels in the resulting index."""

        # GH 3588
        df = DataFrame({
            "a": ['R1', 'R2', np.nan, 'R4'],
            'b': ["C1", "C2", "C3", "C4"],
            "c": [10, 15, np.nan, 20]
        })
        result = df.set_index(['a', 'b'], drop=False)
        expected = DataFrame(
            {
                "a": ['R1', 'R2', np.nan, 'R4'],
                'b': ["C1", "C2", "C3", "C4"],
                "c": [10, 15, np.nan, 20]
            },
            index=[
                Index(['R1', 'R2', np.nan, 'R4'], name='a'),
                Index(['C1', 'C2', 'C3', 'C4'], name='b')
            ])
        tm.assert_frame_equal(result, expected)

    def test_multi_assign(self):
        """Assigning a sub-frame, ndarray or Series through a boolean mask."""

        # GH 3626, an assignment of a sub-df to a df
        df = DataFrame({
            'FC': ['a', 'b', 'a', 'b', 'a', 'b'],
            'PF': [0, 0, 0, 0, 1, 1],
            'col1': lrange(6),
            'col2': lrange(6, 12)
        })
        df.iloc[1, 0] = np.nan
        df2 = df.copy()

        mask = ~df2.FC.isnull()
        cols = ['col1', 'col2']

        dft = df2 * 2
        dft.iloc[3, 3] = np.nan

        expected = DataFrame({
            'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
            'PF': [0, 0, 0, 0, 1, 1],
            'col1': Series([0, 1, 4, 6, 8, 10]),
            'col2': [12, 7, 16, np.nan, 20, 22]
        })

        # frame on rhs
        df2.loc[mask, cols] = dft.loc[mask, cols]
        tm.assert_frame_equal(df2, expected)

        # repeating the same assignment must leave the frame unchanged
        df2.loc[mask, cols] = dft.loc[mask, cols]
        tm.assert_frame_equal(df2, expected)

        # with an ndarray on rhs
        # coerces to float64 because values has float64 dtype
        # GH 14001
        expected = DataFrame({
            'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
            'PF': [0, 0, 0, 0, 1, 1],
            'col1': [0., 1., 4., 6., 8., 10.],
            'col2': [12, 7, 16, np.nan, 20, 22]
        })
        df2 = df.copy()
        df2.loc[mask, cols] = dft.loc[mask, cols].values
        tm.assert_frame_equal(df2, expected)
        df2.loc[mask, cols] = dft.loc[mask, cols].values
        tm.assert_frame_equal(df2, expected)

        # broadcasting on the rhs is required
        df = DataFrame(
            dict(A=[1, 2, 0, 0, 0],
                 B=[0, 0, 0, 10, 11],
                 C=[0, 0, 0, 10, 11],
                 D=[3, 4, 5, 6, 7]))

        expected = df.copy()
        mask = expected['A'] == 0
        for col in ['A', 'B']:
            expected.loc[mask, col] = df['D']

        df.loc[df['A'] == 0, ['A', 'B']] = df['D']
        tm.assert_frame_equal(df, expected)

    def test_setitem_list(self):
        """Setting a list or arbitrary object into a single cell via .ix."""

        # GH 6043
        # ix with a list
        df = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            df.ix[1, 0] = [1, 2, 3]
            df.ix[1, 0] = [1, 2]

        result = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            result.ix[1, 0] = [1, 2]

        tm.assert_frame_equal(result, df)

        # ix with an object
        class TO(object):
            # Minimal value holder: equality compares ``value`` and
            # ``view`` returns the object itself.

            def __init__(self, value):
                self.value = value

            def __str__(self):
                return "[{0}]".format(self.value)

            __repr__ = __str__

            def __eq__(self, other):
                return self.value == other.value

            def view(self):
                return self

        df = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            df.ix[1, 0] = TO(1)
            df.ix[1, 0] = TO(2)

        result = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            result.ix[1, 0] = TO(2)

        tm.assert_frame_equal(result, df)

        # remains object dtype even after setting it back
        df = DataFrame(index=[0, 1], columns=[0])
        with catch_warnings(record=True):
            df.ix[1, 0] = TO(1)
            df.ix[1, 0] = np.nan
        result = DataFrame(index=[0, 1], columns=[0])

        tm.assert_frame_equal(result, df)

    def test_string_slice(self):
        """String keys on object-dtype or empty indexes raise KeyError."""
        # GH 14424
        # string indexing against datetimelike with object
        # dtype should properly raises KeyError
        df = pd.DataFrame([1],
                          pd.Index([pd.Timestamp('2011-01-01')], dtype=object))
        assert df.index.is_all_dates
        with pytest.raises(KeyError):
            df['2011']

        with pytest.raises(KeyError):
            df.loc['2011', 0]

        df = pd.DataFrame()
        assert not df.index.is_all_dates
        with pytest.raises(KeyError):
            df['2011']

        with pytest.raises(KeyError):
            df.loc['2011', 0]

    def test_mi_access(self):
        """Selecting from MultiIndex columns by tuple and by chained keys."""

        # GH 4145
        data = """h1 main h3 sub h5
0 a A 1 A1 1
1 b B 2 B1 2
2 c B 3 A1 3
3 d A 4 B2 4
4 e A 5 B2 5
5 f B 6 A2 6
"""

        df = pd.read_csv(StringIO(data), sep=r'\s+', index_col=0)
        df2 = df.set_index(['main', 'sub']).T.sort_index(1)
        index = Index(['h1', 'h3', 'h5'])
        columns = MultiIndex.from_tuples([('A', 'A1')], names=['main', 'sub'])
        expected = DataFrame([['a', 1, 1]], index=columns, columns=index).T

        result = df2.loc[:, ('A', 'A1')]
        tm.assert_frame_equal(result, expected)

        result = df2[('A', 'A1')]
        tm.assert_frame_equal(result, expected)

        # GH 4146, not returning a block manager when selecting a unique index
        # from a duplicate index
        # as of 4879, this returns a Series (which is similar to what happens
        # with a non-unique)
        expected = Series(['a', 1, 1], index=['h1', 'h3', 'h5'], name='A1')
        result = df2['A']['A1']
        tm.assert_series_equal(result, expected)

        # selecting a non_unique from the 2nd level
        expected = DataFrame(
            [['d', 4, 4], ['e', 5, 5]],
            index=Index(['B2', 'B2'], name='sub'),
            columns=['h1', 'h3', 'h5'],
        ).T
        result = df2['A']['B2']
        tm.assert_frame_equal(result, expected)

    def test_astype_assignment(self):
        """Assigning astype/_convert results back through iloc and loc."""

        # GH4312 (iloc)
        df_orig = DataFrame([['1', '2', '3', '.4', 5, 6., 'foo']],
                            columns=list('ABCDEFG'))

        df = df_orig.copy()
        df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
        expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']],
                             columns=list('ABCDEFG'))
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.iloc[:, 0:2] = df.iloc[:, 0:2]._convert(datetime=True, numeric=True)
        expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']],
                             columns=list('ABCDEFG'))
        tm.assert_frame_equal(df, expected)

        # GH5702 (loc)
        df = df_orig.copy()
        df.loc[:, 'A'] = df.loc[:, 'A'].astype(np.int64)
        expected = DataFrame([[1, '2', '3', '.4', 5, 6., 'foo']],
                             columns=list('ABCDEFG'))
        tm.assert_frame_equal(df, expected)

        df = df_orig.copy()
        df.loc[:, ['B', 'C']] = df.loc[:, ['B', 'C']].astype(np.int64)
        expected = DataFrame([['1', 2, 3, '.4', 5, 6., 'foo']],
                             columns=list('ABCDEFG'))
        tm.assert_frame_equal(df, expected)

        # full replacements / no nans
        df = DataFrame({'A': [1., 2., 3., 4.]})
        df.iloc[:, 0] = df['A'].astype(np.int64)
        expected = DataFrame({'A': [1, 2, 3, 4]})
        tm.assert_frame_equal(df, expected)

        df = DataFrame({'A': [1., 2., 3., 4.]})
        df.loc[:, 'A'] = df['A'].astype(np.int64)
        expected = DataFrame({'A': [1, 2, 3, 4]})
        tm.assert_frame_equal(df, expected)

    def test_astype_assignment_with_dups(self):
        """A dtype-changing assignment must leave the row index unchanged."""

        # GH 4686
        # assignment with dups that has a dtype change
        cols = pd.MultiIndex.from_tuples([('A', '1'), ('B', '1'), ('A', '2')])
        df = DataFrame(np.arange(3).reshape((1, 3)),
                       columns=cols,
                       dtype=object)
        index = df.index.copy()

        df['A'] = df['A'].astype(np.float64)
        tm.assert_index_equal(df.index, index)

        # TODO(wesm): unused variables
        # result = df.get_dtype_counts().sort_index()
        # expected = Series({'float64': 2, 'object': 1}).sort_index()

    @pytest.mark.parametrize("index,val", [
        (pd.Index([0, 1, 2]), 2),
        (pd.Index([0, 1, '2']), '2'),
        (pd.Index([0, 1, 2, np.inf, 4]), 4),
        (pd.Index([0, 1, 2, np.nan, 4]), 4),
        (pd.Index([0, 1, 2, np.inf]), np.inf),
        (pd.Index([0, 1, 2, np.nan]), np.nan),
    ])
    def test_index_contains(self, index, val):
        """``val in index`` is True for every parametrized pair."""
        assert val in index

    @pytest.mark.parametrize(
        "index,val",
        [
            (pd.Index([0, 1, 2]), '2'),
            (pd.Index([0, 1, '2']), 2),
            (pd.Index([0, 1, 2, np.inf]), 4),
            (pd.Index([0, 1, 2, np.nan]), 4),
            (pd.Index([0, 1, 2, np.inf]), np.nan),
            (pd.Index([0, 1, 2, np.nan]), np.inf),
            # Checking if np.inf in Int64Index should not cause an
            # OverflowError
            # Related to GH 16957
            (pd.Int64Index([0, 1, 2]), np.inf),
            (pd.Int64Index([0, 1, 2]), np.nan),
            (pd.UInt64Index([0, 1, 2]), np.inf),
            (pd.UInt64Index([0, 1, 2]), np.nan),
        ])
    def test_index_not_contains(self, index, val):
        """``val in index`` is False for every parametrized pair."""
        assert val not in index

    def test_index_type_coercion(self):
        """Setting with float or string keys coerces the index type."""

        with catch_warnings(record=True):

            # GH 11836
            # if we have an index type and set it with something that looks
            # to numpy like the same, but is actually, not
            # (e.g. setting with a float or string '0')
            # then we need to coerce to object

            # integer indexes
            for s in [Series(range(5)), Series(range(5), index=range(1, 6))]:

                assert s.index.is_integer()

                for indexer in [lambda x: x.ix, lambda x: x.loc, lambda x: x]:
                    s2 = s.copy()
                    indexer(s2)[0.1] = 0
                    assert s2.index.is_floating()
                    assert indexer(s2)[0.1] == 0

                    s2 = s.copy()
                    indexer(s2)[0.0] = 0
                    exp = s.index
                    # when 0 is not already a label, it is appended
                    if 0 not in s:
                        exp = Index(s.index.tolist() + [0])
                    tm.assert_index_equal(s2.index, exp)

                    s2 = s.copy()
                    indexer(s2)['0'] = 0
                    assert s2.index.is_object()

            # float indexes
            for s in [Series(range(5), index=np.arange(5.))]:

                assert s.index.is_floating()

                for idxr in [lambda x: x.ix, lambda x: x.loc, lambda x: x]:

                    s2 = s.copy()
                    idxr(s2)[0.1] = 0
                    assert s2.index.is_floating()
                    assert idxr(s2)[0.1] == 0

                    s2 = s.copy()
                    idxr(s2)[0.0] = 0
                    tm.assert_index_equal(s2.index, s.index)

                    s2 = s.copy()
                    idxr(s2)['0'] = 0
                    assert s2.index.is_object()
class TestSeriesConstructors(object):
    """Tests for constructing ``Series`` from many kinds of input."""

    def test_invalid_dtype(self):
        """Unrecognised dtype arguments raise ``TypeError``."""
        # GH15520
        msg = 'not understood'
        for bad_dtype in (pd.Timestamp, 'pd.Timestamp', list):
            with pytest.raises(TypeError, match=msg):
                Series([], name='time', dtype=bad_dtype)

    def test_scalar_conversion(self):
        """A scalar input still yields a Series; length-1 Series coerce."""
        # Pass in scalar is disabled
        from_scalar = Series(0.5)
        assert not isinstance(from_scalar, float)

        # Coercion
        assert float(Series([1.])) == 1.0
        assert int(Series([1.])) == 1

    def test_constructor(self, datetime_series):
        """Basic construction: pass-through, mixed data, error cases."""
        empty_series = Series()
        assert datetime_series.index.is_all_dates

        # Pass in Series
        derived = Series(datetime_series)
        assert derived.index.is_all_dates
        assert tm.equalContents(derived.index, datetime_series.index)

        # Ensure new index is not created
        assert datetime_series.index is derived.index

        # Mixed type Series
        mixed = Series(['hello', np.NaN], index=[0, 1])
        assert mixed.dtype == np.object_
        assert mixed[1] is np.NaN

        assert not empty_series.index.is_all_dates
        assert not Series({}).index.is_all_dates

        # exception raised is of type Exception
        with pytest.raises(Exception, match="Data must be 1-dimensional"):
            Series(np.random.randn(3, 3), index=np.arange(3))

        mixed.name = 'Series'
        assert Series(mixed).name == 'Series'

        # raise on MultiIndex GH4187
        mi = MultiIndex.from_arrays([[1, 2], [3, 4]])
        msg = "initializing a Series from a MultiIndex is not supported"
        with pytest.raises(NotImplementedError, match=msg):
            Series(mi)

    @pytest.mark.parametrize('input_class', [list, dict, OrderedDict])
    def test_constructor_empty(self, input_class):
        """Empty containers construct the same Series as no data at all."""
        no_data = Series()
        from_container = Series(input_class())

        # these are Index() and RangeIndex() which don't compare type equal
        # but are just .equals
        assert_series_equal(no_data, from_container, check_index_type=False)

        # With explicit dtype:
        no_data = Series(dtype='float64')
        from_container = Series(input_class(), dtype='float64')
        assert_series_equal(no_data, from_container, check_index_type=False)

        # GH 18515 : with dtype=category:
        no_data = Series(dtype='category')
        from_container = Series(input_class(), dtype='category')
        assert_series_equal(no_data, from_container, check_index_type=False)

        if input_class is not list:
            # With index:
            no_data = Series(index=lrange(10))
            from_container = Series(input_class(), index=lrange(10))
            assert_series_equal(no_data, from_container)

            # With index and dtype float64:
            no_data = Series(np.nan, index=lrange(10))
            from_container = Series(input_class(), index=lrange(10),
                                    dtype='float64')
            assert_series_equal(no_data, from_container)

            # GH 19853 : with empty string, index and dtype str
            no_data = Series('', dtype=str, index=range(3))
            from_container = Series('', index=range(3))
            assert_series_equal(no_data, from_container)

    @pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
    def test_constructor_nan(self, input_arg):
        """A NaN scalar with an index matches an empty float64 Series."""
        expected = Series(dtype='float64', index=lrange(10))
        result = Series(input_arg, index=lrange(10))

        assert_series_equal(expected, result, check_index_type=False)

    @pytest.mark.parametrize('dtype', [
        'f8', 'i8', 'M8[ns]', 'm8[ns]', 'category', 'object',
        'datetime64[ns, UTC]',
    ])
    @pytest.mark.parametrize('index', [None, pd.Index([])])
    def test_constructor_dtype_only(self, dtype, index):
        """Only a dtype (and optional empty index) gives an empty Series."""
        # GH-20865
        result = pd.Series(dtype=dtype, index=index)
        assert result.dtype == dtype
        assert len(result) == 0

    def test_constructor_no_data_index_order(self):
        """The passed index order is kept when no data is given."""
        labels = ['b', 'a', 'c']
        result = pd.Series(index=labels)
        assert result.index.tolist() == ['b', 'a', 'c']

    def test_constructor_no_data_string_type(self):
        """No data with ``dtype=str`` leaves NaN in the value slot."""
        # GH 22477
        result = pd.Series(index=[1], dtype=str)
        assert np.isnan(result.iloc[0])

    @pytest.mark.parametrize('item', ['entry', 'ѐ', 13])
    def test_constructor_string_element_string_type(self, item):
        """A scalar with ``dtype=str`` is stored as ``str(item)``."""
        # GH 22477
        result = pd.Series(item, index=[1], dtype=str)
        assert result.iloc[0] == str(item)

    def test_constructor_dtype_str_na_values(self, string_dtype):
        """Missing values are kept as-is under a string dtype."""
        # https://github.com/pandas-dev/pandas/issues/21083
        with_none = Series(['x', None], dtype=string_dtype)
        tm.assert_series_equal(with_none.isna(), Series([False, True]))
        assert with_none.iloc[1] is None

        with_nan = Series(['x', np.nan], dtype=string_dtype)
        assert np.isnan(with_nan.iloc[1])

    def test_constructor_series(self):
        """Series-from-Series with a new index reindexes the data."""
        unsorted_labels = ['d', 'b', 'a', 'c']
        sorted_labels = sorted(unsorted_labels)

        original = Series([4, 7, -5, 3], index=unsorted_labels)
        reindexed = Series(original, index=sorted_labels)

        assert_series_equal(reindexed, original.sort_index())

    def test_constructor_iterable(self):
        """An object that only defines ``__iter__`` is accepted as data."""
        # GH 21987
        class Iter():
            def __iter__(self):
                for i in range(10):
                    yield i

        expected = Series(list(range(10)), dtype='int64')
        result = Series(Iter(), dtype='int64')
        assert_series_equal(result, expected)

    def test_constructor_sequence(self):
        """A ``range`` object is accepted as data."""
        # GH 21987
        expected = Series(list(range(10)), dtype='int64')
        result = Series(range(10), dtype='int64')
        assert_series_equal(result, expected)

    def test_constructor_single_str(self):
        """A bare string is one element, not a sequence of characters."""
        # GH 21987
        expected = Series(['abc'])
        result = Series('abc')
        assert_series_equal(result, expected)

    def test_constructor_list_like(self):
        """Different list-likes are coerced to the same int64 Series."""
        # make sure that we are coercing different
        # list-likes to standard dtypes and not
        # platform specific
        expected = Series([1, 2, 3], dtype='int64')
        list_likes = [[1, 2, 3], (1, 2, 3),
                      np.array([1, 2, 3], dtype='int64')]
        for obj in list_likes:
            result = Series(obj, index=[0, 1, 2])
            assert_series_equal(result, expected)

    @pytest.mark.parametrize('input_vals', [
        ([1, 2]),
        (['1', '2']),
        (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
        (list(pd.date_range('1/1/2011', periods=2, freq='H',
                            tz='US/Eastern'))),
        ([pd.Interval(left=0, right=5)]),
    ])
    def test_constructor_list_str(self, input_vals, string_dtype):
        """Constructing with a string dtype matches ``astype`` afterwards."""
        # GH 16605
        # Ensure that data elements from a list are converted to strings
        # when dtype is str, 'str', or 'U'
        result = Series(input_vals, dtype=string_dtype)
        expected = Series(input_vals).astype(string_dtype)
        assert_series_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):
        """NaN is kept as NaN when floats are built with a string dtype."""
        result = Series([1.0, 2.0, np.nan], dtype=string_dtype)
        expected = Series(['1.0', '2.0', np.nan], dtype=object)
        assert_series_equal(result, expected)
        assert np.isnan(result[2])

    def test_constructor_generator(self):
        """A generator is consumed as data, with or without an index."""
        expected = Series(lrange(10))

        result = Series(i for i in range(10))
        assert_series_equal(result, expected)

        result = Series((i for i in range(10)), index=lrange(10, 20))
        expected.index = lrange(10, 20)
        assert_series_equal(result, expected)

    def test_constructor_map(self):
        """A ``map`` object is consumed as data, with or without an index."""
        # GH8909
        expected = Series(lrange(10))

        result = Series(map(lambda x: x, range(10)))
        assert_series_equal(result, expected)

        result = Series(map(lambda x: x, range(10)), index=lrange(10, 20))
        expected.index = lrange(10, 20)
        assert_series_equal(result, expected)

    def test_constructor_categorical(self):
        """Categorical input is kept, or cast when a dtype is given."""
        cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                             fastpath=True)
        wrapped = Series(cat)
        tm.assert_categorical_equal(wrapped.values, cat)

        # can cast to a new dtype
        result = Series(pd.Categorical([1, 2, 3]), dtype='int64')
        expected = pd.Series([1, 2, 3], dtype='int64')
        tm.assert_series_equal(result, expected)

        # GH12574
        from_categorical = Series(pd.Categorical([1, 2, 3]),
                                  dtype='category')
        assert is_categorical_dtype(from_categorical)
        assert is_categorical_dtype(from_categorical.dtype)

        from_list = Series([1, 2, 3], dtype='category')
        assert is_categorical_dtype(from_list)
        assert is_categorical_dtype(from_list.dtype)

    def test_constructor_categorical_with_coercion(self):
        """Categoricals round-trip through Series and DataFrame columns."""
        factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        n_values = len(factor)

        # test basic creation / coercion of categoricals
        s = Series(factor, name='A')
        assert s.dtype == 'category'
        assert len(s) == n_values
        str(s.values)
        str(s)

        # in a frame
        df = DataFrame({'A': factor})
        tm.assert_series_equal(df['A'], s)
        tm.assert_series_equal(df.iloc[:, 0], s)
        assert len(df) == n_values
        str(df.values)
        str(df)

        df = DataFrame({'A': s})
        tm.assert_series_equal(df['A'], s)
        assert len(df) == n_values
        str(df.values)
        str(df)

        # multiples
        df = DataFrame({'A': s, 'B': s, 'C': 1})
        col_a = df['A']
        col_b = df['B']
        tm.assert_series_equal(col_a, s)
        tm.assert_series_equal(col_b, s, check_names=False)
        assert col_b.name == 'B'
        assert len(df) == n_values
        str(df.values)
        str(df)

        # GH8623
        people = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                            [1, 'John P. Doe']],
                           columns=['person_id', 'person_name'])
        # doing this breaks transform
        people['person_name'] = Categorical(people.person_name)

        expected = people.iloc[0].person_name
        assert people.person_name.iloc[0] == expected
        assert people.person_name[0] == expected
        assert people.person_name.loc[0] == expected

    def test_constructor_categorical_dtype(self):
        """``CategoricalDtype`` passed as dtype sets categories and order."""
        result = pd.Series(['a', 'b'],
                           dtype=CategoricalDtype(['a', 'b', 'c'],
                                                  ordered=True))
        assert is_categorical_dtype(result) is True
        tm.assert_index_equal(result.cat.categories,
                              pd.Index(['a', 'b', 'c']))
        assert result.cat.ordered

        result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a']))
        assert is_categorical_dtype(result)
        tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a']))
        assert result.cat.ordered is False

        # GH 19565 - Check broadcasting of scalar with Categorical dtype
        ordered_ab = CategoricalDtype(['a', 'b'], ordered=True)
        result = Series('a', index=[0, 1], dtype=ordered_ab)
        expected = Series(['a', 'a'], index=[0, 1],
                          dtype=CategoricalDtype(['a', 'b'], ordered=True))
        tm.assert_series_equal(result, expected, check_categorical=True)

    def test_categorical_sideeffects_free(self):
        """``copy=True`` isolates the Series from the Categorical."""
        # Passing a categorical to a Series and then changing values in either
        # the series or the categorical should not change the values in the
        # other one, IF you specify copy!
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat, copy=True)
        assert s.cat is not cat
        s.cat.categories = [1, 2, 3]
        renamed = np.array([1, 2, 3, 1], dtype=np.int64)
        untouched = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(s.__array__(), renamed)
        tm.assert_numpy_array_equal(cat.__array__(), untouched)

        # setting
        s[0] = 2
        after_set = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), after_set)
        tm.assert_numpy_array_equal(cat.__array__(), untouched)

        # however, copy is False by default
        # so this WILL change values
        cat = Categorical(["a", "b", "c", "a"])
        s = Series(cat)
        assert s.values is cat
        s.cat.categories = [1, 2, 3]
        renamed = np.array([1, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), renamed)
        tm.assert_numpy_array_equal(cat.__array__(), renamed)

        s[0] = 2
        after_set = np.array([2, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(s.__array__(), after_set)
        tm.assert_numpy_array_equal(cat.__array__(), after_set)

    def test_unordered_compare_equal(self):
        """A value outside the categories compares equal to NaN."""
        left = pd.Series(['a', 'b', 'c'],
                         dtype=CategoricalDtype(['a', 'b']))
        right = pd.Series(pd.Categorical(['a', 'b', np.nan],
                                         categories=['a', 'b']))
        tm.assert_series_equal(left, right)

    def test_constructor_maskedarray(self):
        """Masked entries become missing values for each tested dtype."""
        labels = ['a', 'b', 'c']

        # float
        masked = ma.masked_all((3, ), dtype=float)
        result = Series(masked)
        expected = Series([nan, nan, nan])
        assert_series_equal(result, expected)

        masked[0] = 0.0
        masked[2] = 2.0
        result = Series(masked, index=labels)
        expected = Series([0.0, nan, 2.0], index=labels)
        assert_series_equal(result, expected)

        masked[1] = 1.0
        result = Series(masked, index=labels)
        expected = Series([0.0, 1.0, 2.0], index=labels)
        assert_series_equal(result, expected)

        # int
        masked = ma.masked_all((3, ), dtype=int)
        result = Series(masked)
        expected = Series([nan, nan, nan], dtype=float)
        assert_series_equal(result, expected)

        masked[0] = 0
        masked[2] = 2
        result = Series(masked, index=labels)
        expected = Series([0, nan, 2], index=labels, dtype=float)
        assert_series_equal(result, expected)

        masked[1] = 1
        result = Series(masked, index=labels)
        expected = Series([0, 1, 2], index=labels, dtype=int)
        assert_series_equal(result, expected)

        # bool
        masked = ma.masked_all((3, ), dtype=bool)
        result = Series(masked)
        expected = Series([nan, nan, nan], dtype=object)
        assert_series_equal(result, expected)

        masked[0] = True
        masked[2] = False
        result = Series(masked, index=labels)
        expected = Series([True, nan, False], index=labels, dtype=object)
        assert_series_equal(result, expected)

        masked[1] = True
        result = Series(masked, index=labels)
        expected = Series([True, True, False], index=labels, dtype=bool)
        assert_series_equal(result, expected)

        # datetime64[ns]
        masked = ma.masked_all((3, ), dtype='M8[ns]')
        result = Series(masked)
        expected = Series([iNaT, iNaT, iNaT], dtype='M8[ns]')
        assert_series_equal(result, expected)

        masked[0] = datetime(2001, 1, 1)
        masked[2] = datetime(2001, 1, 3)
        result = Series(masked, index=labels)
        expected = Series([datetime(2001, 1, 1), iNaT,
                           datetime(2001, 1, 3)],
                          index=labels, dtype='M8[ns]')
        assert_series_equal(result, expected)

        masked[1] = datetime(2001, 1, 2)
        result = Series(masked, index=labels)
        expected = Series([datetime(2001, 1, 1), datetime(2001, 1, 2),
                           datetime(2001, 1, 3)],
                          index=labels, dtype='M8[ns]')
        assert_series_equal(result, expected)

    def test_constructor_maskedarray_hardened(self):
        """A hard-masked, fully masked float array gives all-NaN."""
        # Check numpy masked arrays with hard masks -- from GH24574
        hardened = ma.masked_all((3, ), dtype=float).harden_mask()
        result = pd.Series(hardened)
        expected = pd.Series([nan, nan, nan])
        tm.assert_series_equal(result, expected)

    def test_series_ctor_plus_datetimeindex(self):
        """The DatetimeIndex object passed as index is reused as-is."""
        rng = date_range('20090415', '20090519', freq='B')
        ones_by_date = {stamp: 1 for stamp in rng}

        result = Series(ones_by_date, index=rng)
        assert result.index is rng

    def test_constructor_default_index(self):
        """Without an index the labels are 0..n-1."""
        result = Series([0, 1, 2])
        tm.assert_index_equal(result.index, pd.Index(np.arange(3)))

    @pytest.mark.parametrize('input', [[1, 2, 3],
                                       (1, 2, 3),
                                       list(range(3)),
                                       pd.Categorical(['a', 'b', 'a']),
                                       (i for i in range(3)),
                                       map(lambda x: x, range(3))])
    def test_constructor_index_mismatch(self, input):
        """Data and index of different lengths raise ``ValueError``."""
        # GH 19342
        # test that construction of a Series with an index of different length
        # raises an error
        msg = 'Length of passed values is 3, index implies 4'
        with pytest.raises(ValueError, match=msg):
            Series(input, index=np.arange(4))

    def test_constructor_numpy_scalar(self):
        """A 0-d numpy array is broadcast like a Python scalar."""
        # GH 19342
        # construction with a numpy scalar
        # should not raise
        result = Series(np.array(100), index=np.arange(4), dtype='int64')
        expected = Series(100, index=np.arange(4), dtype='int64')
        tm.assert_series_equal(result, expected)

    def test_constructor_broadcast_list(self):
        """A single-element list is not broadcast over a longer index."""
        # GH 19342
        # construction with single-element container and index
        # should raise
        msg = "Length of passed values is 1, index implies 3"
        with pytest.raises(ValueError, match=msg):
            Series(['foo'], index=['a', 'b', 'c'])

    def test_constructor_corner(self):
        """A list of DataFrames can be wrapped in a Series."""
        frame = tm.makeTimeDataFrame()
        result = Series([frame, frame], index=[0, 1])
        assert isinstance(result, Series)

    def test_constructor_sanitize(self):
        """Float data with an int dtype is cast unless it holds NaN."""
        castable = Series(np.array([1., 1., 8.]), dtype='i8')
        assert castable.dtype == np.dtype('i8')

        with_nan = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8')
        assert with_nan.dtype == np.dtype('f8')

    def test_constructor_copy(self):
        """``copy=True`` with a dtype yields an independent copy."""
        # GH15125
        # test dtype parameter has no side effects on copy=True
        for data in [[1.], np.array([1.])]:
            source = Series(data)
            duplicate = pd.Series(source, copy=True, dtype=float)

            # copy=True maintains original data in Series
            tm.assert_series_equal(source, duplicate)

            # changes to origin of copy does not affect the copy
            source[0] = 2.
            assert not source.equals(duplicate)
            assert source[0] == 2.
            assert duplicate[0] == 1.
@pytest.mark.parametrize( "index", [ pd.date_range('20170101', periods=3, tz='US/Eastern'), pd.date_range('20170101', periods=3), pd.timedelta_range('1 day', periods=3), pd.period_range('2012Q1', periods=3, freq='Q'), pd.Index(list('abc')), pd.Int64Index([1, 2, 3]), pd.RangeIndex(0, 3)], ids=lambda x: type(x).__name__) def test_constructor_limit_copies(self, index): # GH 17449 # limit copies of input s = pd.Series(index) # we make 1 copy; this is just a smoke test here assert s._data.blocks[0].values is not index def test_constructor_pass_none(self): s = Series(None, index=lrange(5)) assert s.dtype == np.float64 s = Series(None, index=lrange(5), dtype=object) assert s.dtype == np.object_ # GH 7431 # inference on the index s = Series(index=np.array([None])) expected = Series(index=Index([None])) assert_series_equal(s, expected) def test_constructor_pass_nan_nat(self): # GH 13467 exp = Series([np.nan, np.nan], dtype=np.float64) assert exp.dtype == np.float64 tm.assert_series_equal(Series([np.nan, np.nan]), exp) tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([pd.NaT, pd.NaT]) assert exp.dtype == 'datetime64[ns]' tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp) tm.assert_series_equal(Series([pd.NaT, np.nan]), exp) tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp) tm.assert_series_equal(Series([np.nan, pd.NaT]), exp) tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp) def test_constructor_cast(self): msg = "could not convert string to float" with pytest.raises(ValueError, match=msg): Series(["a", "b", "c"], dtype=float) def test_constructor_unsigned_dtype_overflow(self, uint_dtype): # see gh-15832 msg = 'Trying to coerce negative values to unsigned integers' with pytest.raises(OverflowError, match=msg): Series([-1], dtype=uint_dtype) def test_constructor_coerce_float_fail(self, any_int_dtype): # see gh-15832 msg = "Trying to coerce float values to 
integers" with pytest.raises(ValueError, match=msg): Series([1, 2, 3.5], dtype=any_int_dtype) def test_constructor_coerce_float_valid(self, float_dtype): s = Series([1, 2, 3.5], dtype=float_dtype) expected = Series([1, 2, 3.5]).astype(float_dtype) assert_series_equal(s, expected) def test_constructor_dtype_no_cast(self): # see gh-1572 s = Series([1, 2, 3]) s2 = Series(s, dtype=np.int64) s2[1] = 5 assert s[1] == 5 def test_constructor_datelike_coercion(self): # GH 9477 # incorrectly inferring on dateimelike looking when object dtype is # specified s = Series([Timestamp('20130101'), 'NOV'], dtype=object) assert s.iloc[0] == Timestamp('20130101') assert s.iloc[1] == 'NOV' assert s.dtype == object # the dtype was being reset on the slicing and re-inferred to datetime # even thought the blocks are mixed belly = '216 3T19'.split() wing1 = '2T15 4H19'.split() wing2 = '416 4T20'.split() mat = pd.to_datetime('2016-01-22 2019-09-07'.split()) df = pd.DataFrame( {'wing1': wing1, 'wing2': wing2, 'mat': mat}, index=belly) result = df.loc['3T19'] assert result.dtype == object result = df.loc['216'] assert result.dtype == object def test_constructor_datetimes_with_nulls(self): # gh-15869 for arr in [np.array([None, None, None, None, datetime.now(), None]), np.array([None, None, datetime.now(), None])]: result = Series(arr) assert result.dtype == 'M8[ns]' def test_constructor_dtype_datetime64(self): s = Series(iNaT, dtype='M8[ns]', index=lrange(5)) assert isna(s).all() # in theory this should be all nulls, but since # we are not specifying a dtype is ambiguous s = Series(iNaT, index=lrange(5)) assert not isna(s).all() s = Series(nan, dtype='M8[ns]', index=lrange(5)) assert isna(s).all() s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype='M8[ns]') assert isna(s[1]) assert s.dtype == 'M8[ns]' s = Series([datetime(2001, 1, 2, 0, 0), nan], dtype='M8[ns]') assert isna(s[1]) assert s.dtype == 'M8[ns]' # GH3416 dates = [ np.datetime64(datetime(2013, 1, 1)), np.datetime64(datetime(2013, 
1, 2)), np.datetime64(datetime(2013, 1, 3)), ] s = Series(dates) assert s.dtype == 'M8[ns]' s.iloc[0] = np.nan assert s.dtype == 'M8[ns]' # GH3414 related expected = Series([ datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3), ], dtype='datetime64[ns]') result = Series( Series(dates).astype(np.int64) / 1000000, dtype='M8[ms]') tm.assert_series_equal(result, expected) result = Series(dates, dtype='datetime64[ns]') tm.assert_series_equal(result, expected) expected = Series([ pd.NaT, datetime(2013, 1, 2), datetime(2013, 1, 3), ], dtype='datetime64[ns]') result = Series([np.nan] + dates[1:], dtype='datetime64[ns]') tm.assert_series_equal(result, expected) dts = Series(dates, dtype='datetime64[ns]') # valid astype dts.astype('int64') # invalid casting msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" r" \[int32\]") with pytest.raises(TypeError, match=msg): dts.astype('int32') # ints are ok # we test with np.int64 to get similar results on # windows / 32-bit platforms result = Series(dts, dtype=np.int64) expected = Series(dts.astype(np.int64)) tm.assert_series_equal(result, expected) # invalid dates can be help as object result = Series([datetime(2, 1, 1)]) assert result[0] == datetime(2, 1, 1, 0, 0) result = Series([datetime(3000, 1, 1)]) assert result[0] == datetime(3000, 1, 1, 0, 0) # don't mix types result = Series([Timestamp('20130101'), 1], index=['a', 'b']) assert result['a'] == Timestamp('20130101') assert result['b'] == 1 # GH6529 # coerce datetime64 non-ns properly dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M') values2 = dates.view(np.ndarray).astype('datetime64[ns]') expected = Series(values2, index=dates) for dtype in ['s', 'D', 'ms', 'us', 'ns']: values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) result = Series(values1, dates) assert_series_equal(result, expected) # GH 13876 # coerce to non-ns to object properly expected = Series(values2, index=dates, dtype=object) for dtype in ['s', 'D', 'ms', 'us', 
'ns']: values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) result = Series(values1, index=dates, dtype=object) assert_series_equal(result, expected) # leave datetime.date alone dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object) series1 = Series(dates2, dates) tm.assert_numpy_array_equal(series1.values, dates2) assert series1.dtype == object # these will correctly infer a datetime s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001']) assert s.dtype == 'datetime64[ns]' # tz-aware (UTC and other tz's) # GH 8411 dr = date_range('20130101', periods=3) assert Series(dr).iloc[0].tz is None dr = date_range('20130101', periods=3, tz='UTC') assert str(Series(dr).iloc[0].tz) == 'UTC' dr = date_range('20130101', periods=3, tz='US/Eastern') assert str(Series(dr).iloc[0].tz) == 'US/Eastern' # non-convertible s = Series([1479596223000, -1479590, pd.NaT]) assert s.dtype == 'object' assert s[2] is pd.NaT assert 'NaT' in str(s) # if we passed a NaT it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), pd.NaT]) assert s.dtype == 'object' assert s[2] is pd.NaT assert 'NaT' in str(s) # if we passed a nan it remains s = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) assert s.dtype == 'object' assert s[2] is np.nan assert 'NaN' in str(s) def test_constructor_with_datetime_tz(self): # 8260 # support datetime64 with tz dr = date_range('20130101', periods=3, tz='US/Eastern') s = Series(dr) assert s.dtype.name == 'datetime64[ns, US/Eastern]' assert s.dtype == 'datetime64[ns, US/Eastern]' assert is_datetime64tz_dtype(s.dtype) assert 'datetime64[ns, US/Eastern]' in str(s) # export result = s.values assert isinstance(result, np.ndarray) assert result.dtype == 
'datetime64[ns]' exp = pd.DatetimeIndex(result) exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz) tm.assert_index_equal(dr, exp) # indexing result = s.iloc[0] assert result == Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', freq='D') result = s[0] assert result == Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', freq='D') result = s[Series([True, True, False], index=s.index)] assert_series_equal(result, s[0:2]) result = s.iloc[0:1] assert_series_equal(result, Series(dr[0:1])) # concat result = pd.concat([s.iloc[0:1], s.iloc[1:]]) assert_series_equal(result, s) # short str assert 'datetime64[ns, US/Eastern]' in str(s) # formatting with NaT result = s.shift() assert 'datetime64[ns, US/Eastern]' in str(result) assert 'NaT' in str(result) # long str t = Series(date_range('20130101', periods=1000, tz='US/Eastern')) assert 'datetime64[ns, US/Eastern]' in str(t) result = pd.DatetimeIndex(s, freq='infer') tm.assert_index_equal(result, dr) # inference s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]) assert s.dtype == 'datetime64[ns, US/Pacific]' assert lib.infer_dtype(s, skipna=True) == 'datetime64' s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')]) assert s.dtype == 'object' assert lib.infer_dtype(s, skipna=True) == 'datetime' # with all NaT s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) assert_series_equal(s, expected) @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) @pytest.mark.parametrize("dtype", ["M8", "m8"]) @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D']) def test_construction_to_datetimelike_unit(self, arr_dtype, dtype, unit): # tests all units # gh-19223 dtype = "{}[{}]".format(dtype, unit) arr = np.array([1, 2, 3], dtype=arr_dtype) s = Series(arr) result = 
s.astype(dtype) expected = Series(arr.astype(dtype)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize('arg', ['2013-01-01 00:00:00', pd.NaT, np.nan, None]) def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string result = Series([arg], dtype='datetime64[ns, CET]') expected = Series(pd.Timestamp(arg)).dt.tz_localize('CET') assert_series_equal(result, expected) def test_construction_interval(self): # construction from interval & array of intervals index = IntervalIndex.from_breaks(np.arange(3), closed='right') result = Series(index) repr(result) str(result) tm.assert_index_equal(Index(result.values), index) result = Series(index.values) tm.assert_index_equal(Index(result.values), index) def test_construction_consistency(self): # make sure that we are not re-localizing upon construction # GH 14928 s = Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) result = Series(s, dtype=s.dtype) tm.assert_series_equal(result, s) result = Series(s.dt.tz_convert('UTC'), dtype=s.dtype) tm.assert_series_equal(result, s) result = Series(s.values, dtype=s.dtype) tm.assert_series_equal(result, s) def test_constructor_infer_period(self): data = [pd.Period('2000', 'D'), pd.Period('2001', 'D'), None] result = pd.Series(data) expected = pd.Series(period_array(data)) tm.assert_series_equal(result, expected) assert result.dtype == 'Period[D]' data = np.asarray(data, dtype=object) tm.assert_series_equal(result, expected) assert result.dtype == 'Period[D]' def test_constructor_period_incompatible_frequency(self): data = [pd.Period('2000', 'D'), pd.Period('2001', 'A')] result = pd.Series(data) assert result.dtype == object assert result.tolist() == data def test_constructor_periodindex(self): # GH7932 # converting a PeriodIndex when put in a Series pi = period_range('20130101', periods=5, freq='D') s = Series(pi) assert s.dtype == 'Period[D]' expected = Series(pi.astype(object)) assert_series_equal(s, expected) def 
test_constructor_dict(self): d = {'a': 0., 'b': 1., 'c': 2.} result = Series(d, index=['b', 'c', 'd', 'a']) expected = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a']) assert_series_equal(result, expected) pidx = tm.makePeriodIndex(100) d = {pidx[0]: 0, pidx[1]: 1} result = Series(d, index=pidx) expected = Series(np.nan, pidx) expected.iloc[0] = 0 expected.iloc[1] = 1 assert_series_equal(result, expected) def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value d = {'b': 1, 'a': 0, 'c': 2} result = Series(d) if PY36: expected = Series([1, 0, 2], index=list('bac')) else: expected = Series([0, 1, 2], index=list('abc')) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) def test_constructor_dict_nan_key(self, value): # GH 18480 d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'} result = Series(d).sort_values() expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4]) assert_series_equal(result, expected) # MultiIndex: d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'} result = Series(d).sort_values() expected = Series(['a', 'b', 'c'], index=Index([(1, 1), (2, np.nan), (3, value)])) assert_series_equal(result, expected) def test_constructor_dict_datetime64_index(self): # GH 9456 dates_as_str = ['1984-02-19', '1988-11-06', '1989-12-03', '1990-03-15'] values = [42544017.198965244, 1234565, 40512335.181958228, -1] def create_data(constructor): return dict(zip((constructor(x) for x in dates_as_str), values)) data_datetime64 = create_data(np.datetime64) data_datetime = create_data(lambda x: datetime.strptime(x, '%Y-%m-%d')) data_Timestamp = create_data(Timestamp) expected = Series(values, (Timestamp(x) for x in dates_as_str)) result_datetime64 = Series(data_datetime64) result_datetime = Series(data_datetime) result_Timestamp = Series(data_Timestamp) assert_series_equal(result_datetime64, expected) 
# NOTE(review): the two calls below are the tail of a test whose beginning
# precedes this chunk; they are kept verbatim.
assert_series_equal(result_datetime, expected)
assert_series_equal(result_Timestamp, expected)


def test_constructor_list_of_tuples(self):
    """A list of tuples is stored element-wise and iterates back unchanged."""
    data = [(1, 1), (2, 2), (2, 3)]
    s = Series(data)
    assert list(s) == data


def test_constructor_tuple_of_tuples(self):
    """A tuple of tuples is stored element-wise and iterates back unchanged."""
    data = ((1, 1), (2, 2), (2, 3))
    s = Series(data)
    assert tuple(s) == data


def test_constructor_dict_of_tuples(self):
    """A dict keyed by tuples produces a Series with a MultiIndex."""
    data = {(1, 2): 3, (None, 5): 6}
    result = Series(data).sort_values()
    expected = Series([3, 6],
                      index=MultiIndex.from_tuples([(1, 2), (None, 5)]))
    tm.assert_series_equal(result, expected)


def test_constructor_set(self):
    """Unordered containers (set, frozenset) are rejected with TypeError."""
    values = {1, 2, 3, 4, 5}
    with pytest.raises(TypeError, match="'set' type is unordered"):
        Series(values)

    values = frozenset(values)
    with pytest.raises(TypeError, match="'frozenset' type is unordered"):
        Series(values)


# https://github.com/pandas-dev/pandas/issues/22698
@pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
def test_fromDict(self):
    """Dict input: index ordering and dtype inference / explicit coercion."""
    data = {'a': 0, 'b': 1, 'c': 2, 'd': 3}

    series = Series(data)
    assert tm.is_sorted(series.index)

    # mixed value types fall back to object
    data = {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()}
    series = Series(data)
    assert series.dtype == np.object_

    data = {'a': 0, 'b': '1', 'c': '2', 'd': '3'}
    series = Series(data)
    assert series.dtype == np.object_

    # an explicit dtype converts numeric strings
    data = {'a': '0', 'b': '1'}
    series = Series(data, dtype=float)
    assert series.dtype == np.float64


def test_fromValue(self, datetime_series):
    """A scalar value is broadcast over the supplied index."""
    # np.nan / np.float64 are used instead of the np.NaN / np.float_
    # aliases, which no longer exist in NumPy 2.0.
    nans = Series(np.nan, index=datetime_series.index)
    assert nans.dtype == np.float64
    assert len(nans) == len(datetime_series)

    strings = Series('foo', index=datetime_series.index)
    assert strings.dtype == np.object_
    assert len(strings) == len(datetime_series)

    d = datetime.now()
    dates = Series(d, index=datetime_series.index)
    assert dates.dtype == 'M8[ns]'
    assert len(dates) == len(datetime_series)

    # GH12336
    # Test construction of categorical series from value
    categorical = Series(0, index=datetime_series.index, dtype="category")
    expected = Series(0, index=datetime_series.index).astype("category")
    assert categorical.dtype == 'category'
    assert len(categorical) == len(datetime_series)
    tm.assert_series_equal(categorical, expected)


def test_constructor_dtype_timedelta64(self):
    """Timedelta-like inputs are inferred or coerced to timedelta64[ns]."""
    # basic
    td = Series([timedelta(days=i) for i in range(3)])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([timedelta(days=1)])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([timedelta(days=1), timedelta(days=2),
                 np.timedelta64(1, 's')])
    assert td.dtype == 'timedelta64[ns]'

    # mixed with NaT
    td = Series([timedelta(days=1), NaT], dtype='m8[ns]')
    assert td.dtype == 'timedelta64[ns]'

    td = Series([timedelta(days=1), np.nan], dtype='m8[ns]')
    assert td.dtype == 'timedelta64[ns]'

    td = Series([np.timedelta64(300000000), pd.NaT], dtype='m8[ns]')
    assert td.dtype == 'timedelta64[ns]'

    # improved inference
    # GH5689
    td = Series([np.timedelta64(300000000), NaT])
    assert td.dtype == 'timedelta64[ns]'

    # because iNaT is int, not coerced to timedelta
    td = Series([np.timedelta64(300000000), iNaT])
    assert td.dtype == 'object'

    td = Series([np.timedelta64(300000000), np.nan])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([pd.NaT, np.timedelta64(300000000)])
    assert td.dtype == 'timedelta64[ns]'

    td = Series([np.timedelta64(1, 's')])
    assert td.dtype == 'timedelta64[ns]'

    # these are frequency conversion astypes
    # for t in ['s', 'D', 'us', 'ms']:
    #     with pytest.raises(TypeError):
    #         td.astype('m8[%s]' % t)

    # valid astype
    td.astype('int64')

    # invalid casting
    msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
           r" \[int32\]")
    with pytest.raises(TypeError, match=msg):
        td.astype('int32')

    # this is an invalid casting
    msg = "Could not convert object to NumPy timedelta"
    with pytest.raises(ValueError, match=msg):
        Series([timedelta(days=1), 'foo'], dtype='m8[ns]')

    # leave as object here
    td = Series([timedelta(days=i) for i in range(3)] + ['foo'])
    assert td.dtype == 'object'

    # these will correctly infer a timedelta
    s = Series([None, pd.NaT, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'
    s = Series([np.nan, pd.NaT, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'
    s = Series([pd.NaT, None, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'
    s = Series([pd.NaT, np.nan, '1 Day'])
    assert s.dtype == 'timedelta64[ns]'


# GH 16406
def test_constructor_mixed_tz(self):
    """Mixing naive and tz-aware Timestamps yields an object-dtype Series."""
    s = Series([Timestamp('20130101'),
                Timestamp('20130101', tz='US/Eastern')])
    expected = Series([Timestamp('20130101'),
                       Timestamp('20130101', tz='US/Eastern')],
                      dtype='object')
    assert_series_equal(s, expected)


def test_NaT_scalar(self):
    """An iNaT entry in an M8[ns] Series reads back as missing and can be
    assigned to another position."""
    series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]')

    val = series[3]
    assert isna(val)

    series[2] = val
    assert isna(series[2])


def test_NaT_cast(self):
    """Casting a NaN float Series to M8[ns] produces NaT."""
    # GH10747
    result = Series([np.nan]).astype('M8[ns]')
    expected = Series([NaT])
    assert_series_equal(result, expected)


def test_constructor_name_hashable(self):
    """Hashable names of several types are stored unchanged on the Series."""
    for n in [777, 777., 'name', datetime(2001, 11, 11), (1, ), "\u05D0"]:
        for data in [[1, 2, 3], np.ones(3), {'a': 0, 'b': 1}]:
            s = Series(data, name=n)
            assert s.name == n


def test_constructor_name_unhashable(self):
    """Unhashable names (list, ndarray, dict) raise TypeError."""
    msg = r"Series\.name must be a hashable type"
    for n in [['name_list'], np.ones(2), {1: 2}]:
        for data in [['name_list'], np.ones(2), {1: 2}]:
            with pytest.raises(TypeError, match=msg):
                Series(data, name=n)


def test_auto_conversion(self):
    """A list built from a date_range is inferred as M8[ns]."""
    series = Series(list(date_range('1/1/2000', periods=10)))
    assert series.dtype == 'M8[ns]'


def test_convert_non_ns(self):
    """Non-nanosecond timedelta64/datetime64 arrays are converted on input."""
    # convert from a numpy array of non-ns timedelta64
    arr = np.array([1, 2, 3], dtype='timedelta64[s]')
    s = Series(arr)
    expected = Series(pd.timedelta_range('00:00:01', periods=3, freq='s'))
    assert_series_equal(s, expected)

    # convert from a numpy array of non-ns datetime64
    # note that creating a numpy datetime64 is in LOCAL time!!!!
    # seems to work for M8[D], but not for M8[s]

    s = Series(np.array(['2013-01-01', '2013-01-02', '2013-01-03'],
                        dtype='datetime64[D]'))
    assert_series_equal(s, Series(date_range('20130101', periods=3,
                                             freq='D')))

    # s = Series(np.array(['2013-01-01 00:00:01','2013-01-01
    # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]'))

    # assert_series_equal(s,date_range('20130101
    # 00:00:01',period=3,freq='s'))


@pytest.mark.parametrize(
    "index",
    [
        date_range('1/1/2000', periods=10),
        timedelta_range('1 day', periods=10),
        period_range('2000-Q1', periods=10, freq='Q')],
    ids=lambda x: type(x).__name__)
def test_constructor_cant_cast_datetimelike(self, index):
    """Datetime-like indexes cannot be cast to float but can be to int64."""
    # floats are not ok
    # Drop the "Index" suffix to convert e.g. PeriodIndex -> Period.
    # We don't care whether the error message says
    # PeriodIndex or PeriodArray.
    # str.rstrip("Index") is deliberately not used: it strips any trailing
    # run of the characters I/n/d/e/x, turning "DatetimeIndex" into
    # "Datetim" and "PeriodIndex" into "Perio".
    kind = type(index).__name__
    if kind.endswith("Index"):
        kind = kind[:-len("Index")]
    msg = "Cannot cast {}.*? to ".format(kind)
    with pytest.raises(TypeError, match=msg):
        Series(index, dtype=float)

    # ints are ok
    # we test with np.int64 to get similar results on
    # windows / 32-bit platforms
    result = Series(index, dtype=np.int64)
    expected = Series(index.astype(np.int64))
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "index",
    [
        date_range('1/1/2000', periods=10),
        timedelta_range('1 day', periods=10),
        period_range('2000-Q1', periods=10, freq='Q')],
    ids=lambda x: type(x).__name__)
def test_constructor_cast_object(self, index):
    """dtype=object construction matches ``Series(index).astype(object)``
    for each way of passing the index in."""
    s = Series(index, dtype=object)
    exp = Series(index).astype(object)
    tm.assert_series_equal(s, exp)

    s = Series(pd.Index(index, dtype=object), dtype=object)
    exp = Series(index).astype(object)
    tm.assert_series_equal(s, exp)

    s = Series(index.astype(object), dtype=object)
    exp = Series(index).astype(object)
    tm.assert_series_equal(s, exp)


@pytest.mark.parametrize("dtype", [
    np.datetime64,
    np.timedelta64,
])
def test_constructor_generic_timestamp_no_frequency(self, dtype):
    """Unit-less datetime64/timedelta64 dtypes raise ValueError."""
    # see gh-15524, gh-15987
    msg = "dtype has no unit. Please pass in"

    with pytest.raises(ValueError, match=msg):
        Series([], dtype=dtype)


@pytest.mark.parametrize("dtype,msg", [
    ("m8[ps]", "cannot convert timedeltalike"),
    ("M8[ps]", "cannot convert datetimelike"),
])
def test_constructor_generic_timestamp_bad_frequency(self, dtype, msg):
    """The ``[ps]`` unit is rejected with TypeError for both dtype kinds."""
    # see gh-15524, gh-15987

    with pytest.raises(TypeError, match=msg):
        Series([], dtype=dtype)


@pytest.mark.parametrize('dtype', [None, 'uint8', 'category'])
def test_constructor_range_dtype(self, dtype):
    """A ``range`` input honours the requested dtype (int64 when None)."""
    # GH 16804
    expected = Series([0, 1, 2, 3, 4], dtype=dtype or 'int64')
    result = Series(range(5), dtype=dtype)
    tm.assert_series_equal(result, expected)


def test_constructor_tz_mixed_data(self):
    """Naive plus tz-aware Timestamps are inferred as object dtype."""
    # GH 13051
    dt_list = [Timestamp('2016-05-01 02:03:37'),
               Timestamp('2016-04-30 19:03:37-0700', tz='US/Pacific')]
    result = Series(dt_list)
    expected = Series(dt_list, dtype=object)
    tm.assert_series_equal(result, expected)
def sdc_indexes_rename_impl(index, name):
    """Return an int64 pandas index built from ``index`` and named ``name``.

    ``pd.Int64Index`` was deprecated in pandas 1.4 and removed in 2.0, so
    the index is built through ``pd.Index`` with an explicit int64 dtype.

    NOTE(review): if this function is compiled by a JIT that only
    recognises ``pd.Int64Index``, confirm ``pd.Index`` is supported there.
    """
    return pd.Index(index, dtype="int64", name=name)