def test_getitem_setitem_datetime_tz_pytz(self):
    """Overwrite-and-restore through tz-aware keys must be a no-op (pytz).

    Each key is written twice (first ``0``, then ``ts[4]``) and the series
    is then required to equal the untouched original.
    """
    tm._skip_if_no_pytz()
    from pytz import timezone as tz
    from pandas import date_range

    n_points = 50
    # testing with timezone, GH #2785
    hourly = date_range('1/1/1990', periods=n_points, freq='H',
                        tz='US/Eastern')
    ts = Series(np.random.randn(n_points), index=hourly)

    keys = [
        # also test Timestamp tz handling, GH #2789
        "1990-01-01 09:00:00+00:00",
        "1990-01-01 03:00:00-06:00",
        # repeat with datetimes
        datetime(1990, 1, 1, 9, tzinfo=tz('UTC')),
        # comparison dates with datetime MUST be localized!
        tz('US/Central').localize(datetime(1990, 1, 1, 3)),
    ]
    for key in keys:
        result = ts.copy()
        result[key] = 0
        result[key] = ts[4]
        assert_series_equal(result, ts)
def test_set_axis_inplace(self):
    """Exercise ``Series.set_axis`` with the various ``inplace``/``axis`` forms."""
    # GH14636
    ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64')

    expected = ser.copy()
    expected.index = list('abcd')

    # inplace=True
    # The FutureWarning comes from the fact that we would like to have
    # inplace default to False some day
    inplace_cases = [(None, FutureWarning), (True, None)]
    for axis in (0, 'index'):
        for inplace, warn in inplace_cases:
            result = ser.copy()
            with tm.assert_produces_warning(warn):
                result.set_axis(list('abcd'), axis=axis, inplace=inplace)
            tm.assert_series_equal(result, expected)

    # inplace=False
    relabelled = ser.set_axis(list('abcd'), axis=0, inplace=False)
    tm.assert_series_equal(expected, relabelled)

    # omitting the "axis" parameter
    with tm.assert_produces_warning(None):
        relabelled = ser.set_axis(list('abcd'), inplace=False)
    tm.assert_series_equal(relabelled, expected)

    # wrong values for the "axis" parameter
    for bad_axis in (2, 'foo'):
        with tm.assert_raises_regex(ValueError, 'No axis named'):
            ser.set_axis(list('abcd'), axis=bad_axis, inplace=False)
def test_getitem_setitem_datetime_tz_dateutil(self):
    """Overwrite-and-restore through tz-aware keys must be a no-op (dateutil).

    Each key is written twice (first ``0``, then ``ts[4]``) and the series
    is then required to equal the untouched original.
    """
    tm._skip_if_no_dateutil()
    from dateutil.tz import tzutc
    from pandas.tslib import _dateutil_gettz as gettz

    def tz(name):
        # handle special case for utc in dateutil
        if name == 'UTC':
            return tzutc()
        return gettz(name)

    from pandas import date_range

    n_points = 50
    # testing with timezone, GH #2785
    hourly = date_range('1/1/1990', periods=n_points, freq='H',
                        tz='US/Eastern')
    ts = Series(np.random.randn(n_points), index=hourly)

    keys = [
        # also test Timestamp tz handling, GH #2789
        "1990-01-01 09:00:00+00:00",
        "1990-01-01 03:00:00-06:00",
        # repeat with datetimes
        datetime(1990, 1, 1, 9, tzinfo=tz('UTC')),
        datetime(1990, 1, 1, 3, tzinfo=tz('US/Central')),
    ]
    for key in keys:
        result = ts.copy()
        result[key] = 0
        result[key] = ts[4]
        assert_series_equal(result, ts)
def test_loc_getitem_setitem_integer_slice_keyerrors():
    """Slice an even-integer-labelled series by position and by label.

    Also checks that a label slice with a missing bound raises ``KeyError``
    once the index is reordered to be non-monotonic.
    """
    ser = Series(np.random.randn(10), index=lrange(0, 20, 2))

    # this is OK
    zeroed = ser.copy()
    zeroed.iloc[4:10] = 0
    assert (zeroed.iloc[4:10] == 0).all()

    # so is this
    zeroed = ser.copy()
    zeroed.iloc[3:11] = 0
    assert (zeroed.iloc[3:11] == 0).values.all()

    by_position = ser.iloc[2:6]
    by_label = ser.loc[3:11]
    expected = ser.reindex([4, 6, 8, 10])

    assert_series_equal(by_position, expected)
    assert_series_equal(by_label, expected)

    # non-monotonic, raise KeyError
    order = lrange(5) + lrange(5, 10)[::-1]
    shuffled = ser.iloc[order]
    with pytest.raises(KeyError, match=r"^3L?$"):
        shuffled.loc[3:11]
    with pytest.raises(KeyError, match=r"^3L?$"):
        shuffled.loc[3:11] = 0
def test_categorial_assigning_ops():
    """Assign existing categories (and NaN) into a categorical Series."""
    base = Series(Categorical(["b", "b"], categories=["a", "b"]))

    # a full slice overwrites every element
    ser = base.copy()
    ser[:] = "a"
    tm.assert_series_equal(
        ser, Series(Categorical(["a", "a"], categories=["a", "b"])))

    # scalar position, boolean array and boolean list each select only
    # the second element
    key_builders = (
        lambda obj: 1,
        lambda obj: obj.index > 0,
        lambda obj: [False, True],
    )
    for build_key in key_builders:
        ser = base.copy()
        ser[build_key(ser)] = "a"
        second_changed = Series(Categorical(["b", "a"],
                                            categories=["a", "b"]))
        tm.assert_series_equal(ser, second_changed)

    # assignment through a string label
    ser = base.copy()
    ser.index = ["x", "y"]
    ser["y"] = "a"
    labelled = Series(Categorical(["b", "a"], categories=["a", "b"]),
                      index=["x", "y"])
    tm.assert_series_equal(ser, labelled)

    # ensure that one can set something to np.nan
    ser = Series(Categorical([1, 2, 3]))
    with_gap = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3]))
    ser[1] = np.nan
    tm.assert_series_equal(ser, with_gap)
def test_inplace_ops_identity(self):
    """Augmented assignment must keep the same object (and ``_data``)."""
    # GH 5104
    # make sure that we are actually changing the object
    base_ser = Series([1, 2, 3])
    base_frame = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))

    # no dtype change
    ser = base_ser.copy()
    ser_alias = ser
    ser += 1
    assert_series_equal(ser, ser_alias)
    assert_series_equal(base_ser + 1, ser)
    assert ser is ser_alias
    assert ser._data is ser_alias._data

    frame = base_frame.copy()
    frame_alias = frame
    frame += 1
    assert_frame_equal(frame, frame_alias)
    assert_frame_equal(base_frame + 1, frame)
    assert frame is frame_alias
    assert frame._data is frame_alias._data

    # dtype change
    ser = base_ser.copy()
    ser_alias = ser
    ser += 1.5
    assert_series_equal(ser, ser_alias)
    assert_series_equal(base_ser + 1.5, ser)

    frame = base_frame.copy()
    frame_alias = frame
    frame += 1.5
    assert_frame_equal(frame, frame_alias)
    assert_frame_equal(base_frame + 1.5, frame)
    assert frame is frame_alias
    assert frame._data is frame_alias._data

    # mixed dtype
    arr = np.random.randint(0, 10, size=5)
    mixed = DataFrame({'A': arr.copy(), 'B': 'foo'})
    for step in (1, 1.5):
        frame = mixed.copy()
        frame_alias = frame
        frame['A'] += step
        expected = DataFrame({'A': arr.copy() + step, 'B': 'foo'})
        assert_frame_equal(frame, expected)
        assert_frame_equal(frame_alias, expected)
        assert frame._data is frame_alias._data
def test_inplace_ops_identity(self):
    """Augmented assignment must keep the same object (and ``_data``)."""
    # GH 5104
    # make sure that we are actually changing the object
    base_ser = Series([1, 2, 3])
    base_frame = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5))

    # no dtype change
    ser = base_ser.copy()
    ser_alias = ser
    ser += 1
    assert_series_equal(ser, ser_alias)
    assert_series_equal(base_ser + 1, ser)
    self.assertIs(ser, ser_alias)
    self.assertIs(ser._data, ser_alias._data)

    frame = base_frame.copy()
    frame_alias = frame
    frame += 1
    assert_frame_equal(frame, frame_alias)
    assert_frame_equal(base_frame + 1, frame)
    self.assertIs(frame, frame_alias)
    self.assertIs(frame._data, frame_alias._data)

    # dtype change
    ser = base_ser.copy()
    ser_alias = ser
    ser += 1.5
    assert_series_equal(ser, ser_alias)
    assert_series_equal(base_ser + 1.5, ser)

    frame = base_frame.copy()
    frame_alias = frame
    frame += 1.5
    assert_frame_equal(frame, frame_alias)
    assert_frame_equal(base_frame + 1.5, frame)
    self.assertIs(frame, frame_alias)
    self.assertIs(frame._data, frame_alias._data)

    # mixed dtype
    arr = np.random.randint(0, 10, size=5)
    mixed = DataFrame({"A": arr.copy(), "B": "foo"})
    for step in (1, 1.5):
        frame = mixed.copy()
        frame_alias = frame
        frame["A"] += step
        expected = DataFrame({"A": arr.copy() + step, "B": "foo"})
        assert_frame_equal(frame, expected)
        assert_frame_equal(frame_alias, expected)
        self.assertIs(frame._data, frame_alias._data)
def test_mask_inplace():
    """``Series.mask(..., inplace=True)`` must agree with the copying form."""
    values = Series(np.random.randn(5))
    positive = values > 0

    # without a replacement the masked slots become missing
    masked = values.copy()
    masked.mask(positive, inplace=True)
    assert_series_equal(masked.dropna(), values[~positive])
    assert_series_equal(masked, values.mask(positive))

    # with an explicit replacement
    masked = values.copy()
    masked.mask(positive, -values, inplace=True)
    assert_series_equal(masked, values.mask(positive, -values))
def test_iloc_setitem_pandas_object(self): # GH 17193, affecting old numpy (1.7 and 1.8) s_orig = Series([0, 1, 2, 3]) expected = Series([0, -1, -2, 3]) s = s_orig.copy() s.iloc[Series([1, 2])] = [-1, -2] tm.assert_series_equal(s, expected) s = s_orig.copy() s.iloc[pd.Index([1, 2])] = [-1, -2] tm.assert_series_equal(s, expected)
def test_setitem_ambiguous_keyerror():
    """Setting a missing integer label appends, via ``[]`` and via ``.loc``."""
    base = Series(lrange(10), index=lrange(0, 20, 2))

    # equivalent of an append
    accessors = (lambda obj: obj, lambda obj: obj.loc)
    for accessor in accessors:
        grown = base.copy()
        accessor(grown)[1] = 5
        expected = base.append(Series([5], index=[1]))
        assert_series_equal(grown, expected)
def test_indexing_with_datetimeindex_tz(self):
    """Get and set through ``[]`` and ``.loc`` on a tz-aware DatetimeIndex."""
    # GH 12050
    # indexing on a series with a datetimeindex with tz
    index = date_range('2015-01-01', periods=2, tz='utc')
    ser = Series(range(2), index=index, dtype='int64')

    # plain ``[]`` first, then ``.loc`` -- same order as the checks below
    accessors = (lambda obj: obj, lambda obj: obj.loc)

    # list-like indexing
    for sel in (index, list(index)):
        for accessor in accessors:
            # getitem
            tm.assert_series_equal(accessor(ser)[sel], ser)

            # setitem
            result = ser.copy()
            accessor(result)[sel] = 1
            expected = Series(1, index=index)
            tm.assert_series_equal(result, expected)

    # single element indexing
    last = index[1]
    for accessor in accessors:
        # getitem
        assert accessor(ser)[last] == 1

        # setitem
        result = ser.copy()
        accessor(result)[last] = 5
        expected = Series([0, 5], index=index)
        tm.assert_series_equal(result, expected)
def testSeriesNested(self):
    """A dict of Series must encode like its members, for every orient."""
    s = Series([10, 20, 30, 40, 50, 60], name="series",
               index=[6, 7, 8, 9, 10, 15])
    s.sort()

    nested = {'s1': s, 's2': s.copy()}

    # an empty mapping reproduces the call without an ``orient`` argument
    option_sets = [
        {},
        {'orient': "split"},
        {'orient': "records"},
        {'orient': "values"},
        {'orient': "index"},
    ]
    for options in option_sets:
        exp = {'s1': ujson.decode(ujson.encode(s, **options)),
               's2': ujson.decode(ujson.encode(s, **options))}
        self.assertTrue(
            ujson.decode(ujson.encode(nested, **options)) == exp)
def para_keys_modify(para):
    """Return ``para`` as a float64 array with its first 82 entries reordered.

    Entry 0 stays where it is, entry 41 moves to slot 1, entries 1..40 move
    to the odd slots 3, 5, ..., 81 and entries 42..81 move to the even slots
    2, 4, ..., 80.  Anything past index 81 is left untouched.
    """
    source = Series(np.float64(np.array(para)))
    reordered = source.copy()

    # entries 1..40 -> odd slots 3..81
    for dest, src in zip(range(3, 83, 2), range(1, 41)):
        reordered[dest] = source[src]

    reordered[1] = source[41]

    # entries 42..81 -> even slots 2..80
    for dest, src in zip(range(2, 82, 2), range(42, 82)):
        reordered[dest] = source[src]

    return reordered.values

#def truepara_key_modify():
#    root_directory=r'E:\EnRML_Gas_Modelling\true_obs'
#    with open(r'E:\EnRML_Gas_Modelling\true_obs\para_true.txt','r') as f:
#        content=f.readlines()
#
#    content=np.array(content)
#    content=np.float64(content)
#    content=Series(content)
#    content1=content.copy()
#    for i in range(1,41):
#        content1[i*2+1]=content[i]
#    content1[1]=content[41]
#
#    for i in range(42,82):
#        content1[(i-41)*2]=content[i]
#
#    para_distribution_map(content1,1681,root_directory)
#    np.savetxt(r'E:\EnRML_Gas_Modelling\true_obs\para_true.txt',content1)
def test_constructor_with_datetimelike(self, dtl):
    """Build a Categorical from a datetime-like Series, with and without NaT."""
    # see gh-12077
    # constructor with a datetimelike and NaT
    index_type = type(dtl)

    ser = Series(dtl)
    cat = Categorical(ser)

    expected = index_type(ser)
    expected.freq = None

    tm.assert_index_equal(cat.categories, expected)
    tm.assert_numpy_array_equal(cat.codes, np.arange(5, dtype="int8"))

    # with NaT
    with_nat = ser.copy()
    with_nat.iloc[-1] = NaT
    cat = Categorical(with_nat)

    expected = index_type(with_nat.dropna())
    expected.freq = None

    tm.assert_index_equal(cat.categories, expected)

    expected_codes = np.array([0, 1, 2, 3, -1], dtype=np.int8)
    tm.assert_numpy_array_equal(cat.codes, expected_codes)

    assert "NaT" in repr(cat)
def test_set_axis_inplace_axes(self, axis_series):
    """``set_axis`` relabels in place for ``inplace=None`` and ``inplace=True``."""
    # GH14636
    original = Series(np.arange(4), index=[1, 3, 5, 7], dtype='int64')

    expected = original.copy()
    expected.index = list('abcd')

    # inplace=True
    # The FutureWarning comes from the fact that we would like to have
    # inplace default to False some day
    cases = [(None, FutureWarning), (True, None)]
    for inplace, warn in cases:
        result = original.copy()
        with tm.assert_produces_warning(warn):
            result.set_axis(list('abcd'), axis=axis_series, inplace=inplace)
        tm.assert_series_equal(result, expected)
def test_constructor_with_datetimelike(self):
    """Build a Categorical from several datetime-like ranges, with and without NaT."""
    # 12077
    # constructor with a datetimelike and NaT
    candidates = [
        date_range('1995-01-01 00:00:00', periods=5, freq='s'),
        date_range('1995-01-01 00:00:00', periods=5, freq='s',
                   tz='US/Eastern'),
        timedelta_range('1 day', periods=5, freq='s'),
    ]

    for dtl in candidates:
        index_type = type(dtl)

        ser = Series(dtl)
        cat = Categorical(ser)
        expected = index_type(ser)
        expected.freq = None
        tm.assert_index_equal(cat.categories, expected)
        tm.assert_numpy_array_equal(cat.codes, np.arange(5, dtype='int8'))

        # with NaT
        with_nat = ser.copy()
        with_nat.iloc[-1] = NaT
        cat = Categorical(with_nat)
        expected = index_type(with_nat.dropna())
        expected.freq = None
        tm.assert_index_equal(cat.categories, expected)

        expected_codes = np.array([0, 1, 2, 3, -1], dtype=np.int8)
        tm.assert_numpy_array_equal(cat.codes, expected_codes)

        assert 'NaT' in repr(cat)
def test_timedelta64_nan(self):
    """Writing ``nan``, ``iNaT`` or ``NaT`` into a timedelta Series stores NaT."""
    from pandas import tslib

    td = Series([timedelta(days=i) for i in range(10)])

    # nan ops on timedeltas
    td1 = td.copy()
    for pos, missing in enumerate([np.nan, tslib.iNaT, tslib.NaT]):
        td1[pos] = missing
        self.assertTrue(isnull(td1[pos]))
        self.assertEqual(td1[pos].value, tslib.iNaT)

        # restoring the original value clears the missing marker
        td1[pos] = td[pos]
        self.assertFalse(isnull(td1[pos]))
def test_to_period(self):
    """``to_period`` on Series and DataFrame, with and without a source freq."""
    from pandas.tseries.period import period_range

    ts = _simple_ts('1/1/2000', '1/1/2001')

    converted = ts.to_period()
    exp = ts.copy()
    exp.index = period_range('1/1/2000', '1/1/2001')
    assert_series_equal(converted, exp)

    converted = ts.to_period('M')
    exp.index = exp.index.asfreq('M')
    tm.assert_index_equal(converted.index, exp.index.asfreq('M'))
    assert_series_equal(converted, exp)

    # GH 7606 without freq
    idx = DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03',
                         '2011-01-04'])
    exp_idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03',
                              '2011-01-04'], freq='D')

    ser = Series(np.random.randn(4), index=idx)
    expected = ser.copy()
    expected.index = exp_idx
    assert_series_equal(ser.to_period(), expected)

    frame = DataFrame(np.random.randn(4, 4), index=idx, columns=idx)

    # convert the row axis
    expected = frame.copy()
    expected.index = exp_idx
    assert_frame_equal(frame.to_period(), expected)

    # convert the column axis
    expected = frame.copy()
    expected.columns = exp_idx
    assert_frame_equal(frame.to_period(axis=1), expected)
def test_fillna_consistency(self):
    """Filling a tz-naive datetime Series with other kinds of values."""
    # GH 16402
    # fillna with a tz aware to a tz-naive, should result in object
    naive = Series([Timestamp('20130101'), pd.NaT])
    aware_fill = Timestamp('20130101', tz='US/Eastern')

    result = naive.fillna(aware_fill)
    expected = Series([Timestamp('20130101'),
                       Timestamp('2013-01-01', tz='US/Eastern')],
                      dtype='object')
    assert_series_equal(result, expected)

    # where (we ignore the errors=)
    # the same check is performed twice, as in the original test
    for _ in range(2):
        result = naive.where([True, False],
                             Timestamp('20130101', tz='US/Eastern'),
                             errors='ignore')
        assert_series_equal(result, expected)

    # with a non-datetime
    result = naive.fillna('foo')
    expected = Series([Timestamp('20130101'), 'foo'])
    assert_series_equal(result, expected)

    # assignment
    assigned = naive.copy()
    assigned[1] = 'foo'
    assert_series_equal(assigned, expected)
def test_copy(self): for deep in [None, False, True]: s = Series(np.arange(10), dtype='float64') # default deep is True if deep is None: s2 = s.copy() else: s2 = s.copy(deep=deep) s2[::2] = np.NaN if deep is None or deep is True: # Did not modify original Series assert np.isnan(s2[0]) assert not np.isnan(s[0]) else: # we DID modify the original Series assert np.isnan(s2[0]) assert np.isnan(s[0]) # GH 11794 # copy of tz-aware expected = Series([Timestamp('2012/01/01', tz='UTC')]) expected2 = Series([Timestamp('1999/01/01', tz='UTC')]) for deep in [None, False, True]: s = Series([Timestamp('2012/01/01', tz='UTC')]) if deep is None: s2 = s.copy() else: s2 = s.copy(deep=deep) s2[0] = pd.Timestamp('1999/01/01', tz='UTC') # default deep is True if deep is None or deep is True: # Did not modify original Series assert_series_equal(s2, expected2) assert_series_equal(s, expected) else: # we DID modify the original Series assert_series_equal(s2, expected2) assert_series_equal(s, expected2)
def test_fillna_inplace(self): x = Series([nan, 1.0, nan, 3.0, nan], ["z", "a", "b", "c", "d"]) y = x.copy() y.fillna(value=0, inplace=True) expected = x.fillna(value=0) assert_series_equal(y, expected)
def test_drop_duplicates_bool(keep, expected): tc = Series([True, False, True, False]) tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep=keep, inplace=True) tm.assert_series_equal(sc, tc[~expected])
def test_drop_duplicates_non_bool(any_numpy_dtype, keep, expected): tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype)) tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep=keep, inplace=True) tm.assert_series_equal(sc, tc[~expected])
def test_fillna_inplace(self): x = Series([nan, 1., nan, 3., nan], ['z', 'a', 'b', 'c', 'd']) y = x.copy() y.fillna(value=0, inplace=True) expected = x.fillna(value=0) assert_series_equal(y, expected)
def test_rank_modify_inplace(self): # GH 18521 # Check rank does not mutate series s = Series([Timestamp('2017-01-05 10:20:27.569000'), NaT]) expected = s.copy() s.rank() result = s assert_series_equal(result, expected)
def test_setitem_float_labels():
    """Setting by label ``1`` and by position ``2`` must hit the same slot."""
    # note labels are floats
    by_label = Series(['a', 'b', 'c'], index=[0, 0.5, 1])
    by_position = by_label.copy()

    by_label.loc[1] = 'zoo'
    by_position.iloc[2] = 'zoo'

    assert_series_equal(by_label, by_position)
def test_series_nested(self, orient):
    """A dict of Series must encode like its members for the given orient."""
    s = Series([10, 20, 30, 40, 50, 60], name="series",
               index=[6, 7, 8, 9, 10, 15]).sort_values()
    nested = {"s1": s, "s2": s.copy()}

    if orient is None:
        kwargs = {}
    else:
        kwargs = dict(orient=orient)

    def roundtrip(obj):
        return ujson.decode(ujson.encode(obj, **kwargs))

    exp = {"s1": roundtrip(s), "s2": roundtrip(s)}
    assert roundtrip(nested) == exp
def test_operators_datetimelike_invalid(self, all_arithmetic_operators):
    """Undefined datetime/timedelta arithmetic must raise ``TypeError``."""
    # these are all TypeError ops
    op_name = all_arithmetic_operators

    def expect_type_error(left, right):
        # check that we are getting a TypeError
        # with 'operate' (from core/ops.py) for the ops that are not
        # defined
        method = getattr(left, op_name, None)
        with tm.assert_raises_regex(TypeError, 'operate|cannot'):
            method(right)

    # ## timedelta64 ###
    td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
    td1.iloc[2] = np.nan

    # ## datetime64 ###
    dt1 = Series([Timestamp('20111230'), Timestamp('20120101'),
                  Timestamp('20120103')])
    dt1.iloc[2] = np.nan
    dt2 = Series([Timestamp('20111231'), Timestamp('20120102'),
                  Timestamp('20120104')])
    if op_name not in ['__sub__', '__rsub__']:
        expect_type_error(dt1, dt2)

    # ## datetime64 with timetimedelta ###
    # TODO(jreback) __rsub__ should raise?
    if op_name not in ['__add__', '__radd__', '__sub__']:
        expect_type_error(dt1, td1)

    # 8260, 10763
    # datetime64 with tz
    tz = 'US/Eastern'
    dt1 = Series(date_range('2000-01-01 09:00:00', periods=5, tz=tz),
                 name='foo')
    dt2 = dt1.copy()
    dt2.iloc[2] = np.nan
    td1 = Series(timedelta_range('1 days 1 min', periods=5, freq='H'))
    td2 = td1.copy()
    td2.iloc[1] = np.nan

    if op_name not in ['__add__', '__radd__', '__sub__', '__rsub__']:
        expect_type_error(dt2, td2)
def test_basic_setitem_with_labels(test_data):
    """Check that ``[]`` setitem with labels agrees with ``.loc`` setitem.

    ``test_data`` is a fixture; only its ``ts`` attribute (a Series) is read.
    """
    indices = test_data.ts.index[[5, 10, 15]]

    cp = test_data.ts.copy()
    exp = test_data.ts.copy()
    cp[indices] = 0
    exp.loc[indices] = 0
    assert_series_equal(cp, exp)

    cp = test_data.ts.copy()
    exp = test_data.ts.copy()
    cp[indices[0]:indices[2]] = 0
    exp.loc[indices[0]:indices[2]] = 0
    assert_series_equal(cp, exp)

    # integer indexes, be careful
    s = Series(np.random.randn(10), index=lrange(0, 20, 2))
    inds = [0, 4, 6]
    arr_inds = np.array([0, 4, 6])

    # The assignments must target the two copies.  Writing to ``s`` instead
    # would leave ``cp`` and ``exp`` as identical untouched copies and the
    # comparison below would pass without testing anything.
    cp = s.copy()
    exp = s.copy()
    cp[inds] = 0
    exp.loc[inds] = 0
    assert_series_equal(cp, exp)

    cp = s.copy()
    exp = s.copy()
    cp[arr_inds] = 0
    exp.loc[arr_inds] = 0
    assert_series_equal(cp, exp)

    inds_notfound = [0, 4, 5, 6]
    arr_inds_notfound = np.array([0, 4, 5, 6])
    msg = r"\[5\] not contained in the index"
    with pytest.raises(ValueError, match=msg):
        s[inds_notfound] = 0

    with pytest.raises(Exception, match=msg):
        s[arr_inds_notfound] = 0

    # GH12089
    # with tz for values
    s = Series(pd.date_range("2011-01-01", periods=3, tz="US/Eastern"),
               index=['a', 'b', 'c'])
    s2 = s.copy()
    expected = Timestamp('2011-01-03', tz='US/Eastern')
    s2.loc['a'] = expected
    result = s2.loc['a']
    assert result == expected

    s2 = s.copy()
    s2.iloc[0] = expected
    result = s2.iloc[0]
    assert result == expected

    s2 = s.copy()
    s2['a'] = expected
    result = s2['a']
    assert result == expected
def test_getitem_setitem_periodindex(self):
    """Get and set on an hourly PeriodIndex by string, slice, mask and Period."""
    from pandas import period_range

    n_points = 50
    hours = period_range('1/1/1990', periods=n_points, freq='H')
    ts = Series(np.random.randn(n_points), index=hours)

    # scalar string key
    self.assertEqual(ts["1990-01-01 04"], ts[4])

    result = ts.copy()
    result["1990-01-01 04"] = 0
    result["1990-01-01 04"] = ts[4]
    assert_series_equal(result, ts)

    # string slice (inclusive of both ends)
    assert_series_equal(ts["1990-01-01 04":"1990-01-01 07"], ts[4:8])

    result = ts.copy()
    result["1990-01-01 04":"1990-01-01 07"] = 0
    result["1990-01-01 04":"1990-01-01 07"] = ts[4:8]
    assert_series_equal(result, ts)

    # boolean mask built from string comparisons
    lb = "1990-01-01 04"
    rb = "1990-01-01 07"
    in_window = (ts.index >= lb) & (ts.index <= rb)
    assert_series_equal(ts[in_window], ts[4:8])

    # GH 2782
    self.assertEqual(ts[ts.index[4]], ts[4])

    assert_series_equal(ts[ts.index[4:8]], ts[4:8])

    result = ts.copy()
    result[ts.index[4:8]] = 0
    result[4:8] = ts[4:8]
    assert_series_equal(result, ts)
def test_slice_integer(self):
    """Slice integer-indexed Series with float and integer slice bounds.

    For each of three integer indexes, the test slices through ``.loc`` and
    ``.ix`` and compares against an expected positional slice via
    ``self.check``, then asserts that plain ``[]`` with float bounds raises
    ``TypeError``.
    """

    # same as above, but for Integer based indexes
    # these coerce to a like integer
    # oob indicates if we are out of bounds
    # of positional indexing
    for index, oob in [(tm.makeIntIndex(5), False),
                       (tm.makeRangeIndex(5), False),
                       (tm.makeIntIndex(5) + 10, True)]:

        # s is an in-range index
        s = Series(range(5), index=index)

        # getitem
        for l in [slice(3.0, 4),
                  slice(3, 4.0),
                  slice(3.0, 4.0)]:

            for idxr in [lambda x: x.loc,
                         lambda x: x.ix]:

                # warnings raised while indexing are recorded, not emitted
                with catch_warnings(record=True):
                    result = idxr(s)[l]

                # these are all label indexing
                # except getitem which is positional
                # empty
                if oob:
                    indexer = slice(0, 0)
                else:
                    indexer = slice(3, 5)
                self.check(result, s, indexer, False)

            # positional indexing
            # ``f`` closes over the current ``s`` and ``l`` and is called
            # immediately by ``pytest.raises``
            def f():
                s[l]

            pytest.raises(TypeError, f)

        # getitem out-of-bounds
        for l in [slice(-6, 6),
                  slice(-6.0, 6.0)]:

            for idxr in [lambda x: x.loc,
                         lambda x: x.ix]:

                with catch_warnings(record=True):
                    result = idxr(s)[l]

                # these are all label indexing
                # except getitem which is positional
                # empty
                if oob:
                    indexer = slice(0, 0)
                else:
                    indexer = slice(-6, 6)
                self.check(result, s, indexer, False)

        # positional indexing
        def f():
            s[slice(-6.0, 6.0)]

        pytest.raises(TypeError, f)

        # getitem odd floats
        for l, res1 in [(slice(2.5, 4), slice(3, 5)),
                        (slice(2, 3.5), slice(2, 4)),
                        (slice(2.5, 3.5), slice(3, 4))]:

            for idxr in [lambda x: x.loc,
                         lambda x: x.ix]:

                with catch_warnings(record=True):
                    result = idxr(s)[l]
                if oob:
                    res = slice(0, 0)
                else:
                    res = res1

                self.check(result, s, res, False)

            # positional indexing
            def f():
                s[l]

            pytest.raises(TypeError, f)

        # setitem
        for l in [slice(3.0, 4),
                  slice(3, 4.0),
                  slice(3.0, 4.0)]:

            for idxr in [lambda x: x.loc,
                         lambda x: x.ix]:
                # work on a copy so ``s`` stays intact for later iterations
                sc = s.copy()
                with catch_warnings(record=True):
                    idxr(sc)[l] = 0
                    result = idxr(sc)[l].values.ravel()
                assert (result == 0).all()

            # positional indexing
            def f():
                s[l] = 0

            pytest.raises(TypeError, f)
def _predict_core(self, s: pd.Series) -> pd.Series:
    """Compare rolling aggregates computed on two windows around each point.

    ``agg``, ``agg_params``, ``window`` and ``min_periods`` are read from
    ``self``; each may be a single value (used for both windows) or a
    2-tuple ``(left, right)``.  ``self.center`` selects whether the two
    windows sit on either side of the current point or back to back before
    it.  ``self.diff`` selects how the two aggregates are combined.

    Parameters
    ----------
    s
        Time series with a monotonic index.

    Returns
    -------
    pd.Series
        The combined difference between the right and left aggregates.

    Raises
    ------
    ValueError
        If the index of ``s`` is not monotonic, or ``self.diff`` is not a
        supported value for the aggregate type that was produced.
    PandasBugError
        If ``center`` is set, the left window is not an int and the
        installed pandas is older than 0.25.
    """
    if not (s.index.is_monotonic_increasing
            or s.index.is_monotonic_decreasing):
        raise ValueError("Time series must have a monotonic time index. ")

    def as_pair(value):
        # a scalar setting applies to both the left and the right window
        return value if isinstance(value, tuple) else (value, value)

    agg = as_pair(self.agg)
    agg_params = as_pair(
        self.agg_params if (self.agg_params is not None) else {})
    window = as_pair(self.window)
    min_periods = as_pair(self.min_periods)
    center = self.center
    diff = self.diff

    def make_aggregator(side):
        # side 0 is the left window, side 1 the right window
        return RollingAggregate(
            agg=agg[side],
            agg_params=agg_params[side],
            window=window[side],
            min_periods=min_periods[side],
            center=False,
        )

    if center:
        if isinstance(window[0], int):
            s_rolling_left = make_aggregator(0).transform(s.shift(1))
        else:
            ra = make_aggregator(0)
            if parse(pd.__version__) < parse("0.25"):
                raise PandasBugError()
            ra._closed = "left"
            s_rolling_left = ra.transform(s)
        if isinstance(window[1], int):
            # roll over the reversed series, then flip the result back
            s_rolling_right = (
                make_aggregator(1).transform(s.iloc[::-1]).iloc[::-1])
        else:
            # mirror the timestamps so that a trailing window on the
            # reversed series corresponds to a leading window on ``s``
            s_reversed = pd.Series(
                s.values[::-1],
                index=pd.DatetimeIndex([
                    s.index[0] + (s.index[-1] - s.index[i])
                    for i in range(len(s) - 1, -1, -1)
                ]),
            )
            s_rolling_right = pd.Series(
                make_aggregator(1).transform(s_reversed).iloc[::-1].values,
                index=s.index,
            )
            s_rolling_right.name = s.name
    else:
        if isinstance(window[1], int):
            s_rolling_left = make_aggregator(0).transform(
                s.shift(window[1]))
        else:
            s_shifted = pd.Series(s.values,
                                  s.index + pd.Timedelta(window[1]))
            # pd.concat replaces Series.append, which pandas 2.0 removed
            s_shifted = pd.concat(
                [s_shifted, pd.Series(index=s.index, dtype="float64")])
            # keep the first occurrence of every timestamp
            s_shifted = s_shifted.iloc[~s_shifted.index.duplicated()]
            s_shifted = s_shifted.sort_index()
            s_shifted.name = s.name
            s_rolling_left = make_aggregator(0).transform(s_shifted)
            if isinstance(s_rolling_left, pd.Series):
                s_rolling_left = s_rolling_left[s.index]
            else:
                s_rolling_left = s_rolling_left.loc[s.index, :]
        s_rolling_right = make_aggregator(1).transform(s)

    if isinstance(s_rolling_left, pd.Series):
        if diff in ["l1", "l2"]:
            return abs(s_rolling_right - s_rolling_left)
        if diff == "diff":
            return s_rolling_right - s_rolling_left
        if diff == "rel_diff":
            return (s_rolling_right - s_rolling_left) / s_rolling_left
        if diff == "abs_rel_diff":
            return abs(s_rolling_right - s_rolling_left) / s_rolling_left

    if isinstance(s_rolling_left, pd.DataFrame):
        if diff == "l1":
            return abs(s_rolling_right - s_rolling_left).sum(
                axis=1, skipna=False)
        if diff == "l2":
            return ((s_rolling_right - s_rolling_left)**2).sum(
                axis=1, skipna=False)**0.5

    if callable(diff):
        # user-supplied combiner, applied row by row
        s_rolling = s.copy()
        for i in range(len(s_rolling)):
            s_rolling.iloc[i] = diff(s_rolling_left.iloc[i],
                                     s_rolling_right.iloc[i])
        return s_rolling

    raise ValueError("Invalid value of diff")
def pattern_match(  # pylint: disable=too-many-arguments
    meta_col: pd.Series,
    values: Union[Iterable[str], str],
    level: Optional[Union[str, int]] = None,
    regexp: bool = False,
    has_nan: bool = True,
    separator: str = DEFAULT_SEPARATOR,
) -> np.ndarray:
    """
    Filter data by matching metadata columns to given patterns

    Parameters
    ----------
    meta_col
        Column to perform filtering on

    values
        Values to match

    level
        Passed to ``find_depth``. For usage, see docstring of ``find_depth``.

    regexp
        If True, match using regexp rather than pseudo regexp syntax developed
        by the `pyam <https://github.com/IAMconsortium/pyam>`_ developers.

    has_nan
        If True, convert all nan in ``meta_col`` to empty string before
        applying filters. This means that "" and "*" will match rows with
        ``np.nan``. If False, the conversion is not applied and so a search
        in a string column which contains ``np.nan`` will result in a
        ``TypeError``.

    separator
        String used to separate the hierarchy levels in values. Defaults to
        '|'

    Returns
    -------
    :obj:`np.array` of :obj:`bool`
        Array where True indicates a match

    Raises
    ------
    TypeError
        Filtering is performed on a string metadata column which contains
        ``np.nan`` and ``has_nan`` is ``False``.
    """
    matches = np.array([False] * len(meta_col))
    _values = (
        [values]
        if not isinstance(values, Iterable) or is_str(values)
        else values
    )

    # pyam issue (#40) with string-to-nan comparison, replace nan by empty
    # string
    # TODO: add docs and example of filtering/removing NaN given this internal
    # conversion
    _meta_col = meta_col.copy()
    if has_nan:
        _meta_col.loc[
            [np.isnan(i) if not is_str(i) else False for i in _meta_col]
        ] = ""

    for s in _values:
        if is_str(s):
            _regexp = (
                str(s)
                .replace("|", "\\|")
                .replace(".", r"\.")  # `.` has to be replaced before `*`
                .replace("*", ".*")
                .replace("+", r"\+")
                .replace("(", r"\(")
                .replace(")", r"\)")
                .replace("$", "\\$")
            ) + "$"
            pattern = re.compile(_regexp if not regexp else str(s))
            try:
                subset = [m for m in _meta_col if pattern.match(m)]
            except TypeError as e:
                # if it's not the cryptic message we expect, raise.
                # Compare by prefix: Python 3.11+ appends ", got '<type>'"
                # to this message, so an exact comparison never matches
                # there.
                msg = str(e)
                if not msg.startswith("expected string or bytes-like object"):
                    raise  # pragma: no cover # emergency valve

                error_msg = (
                    "String filtering cannot be performed on column '{}', which "
                    "contains NaN's, unless `has_nan` is True".format(_meta_col.name)
                )
                raise TypeError(error_msg) from e

            depth = (
                True
                if level is None
                else find_depth(_meta_col, str(s), level, separator=separator)
            )
            matches |= _meta_col.isin(subset) & depth
        else:
            # non-string values are compared against the unmodified column
            matches |= meta_col == s

    return matches
def test_partial_setting(self):
    """Setting with a missing label enlarges; ``iloc``/``iat`` do not.

    Covers Series first, then DataFrame rows and columns, then (GH 8473)
    enlarging a date-indexed frame by one row or one column.
    """

    # GH2578, allow ix and friends to partially set

    # series
    s_orig = Series([1, 2, 3])

    s = s_orig.copy()
    s[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s[5] = 5.0
    expected = Series([1, 2, 3, 5.0], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5.0
    expected = Series([1, 2, 3, 5.0], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    # iloc/iat raise
    s = s_orig.copy()

    with pytest.raises(IndexError):
        s.iloc[3] = 5.0

    with pytest.raises(IndexError):
        s.iat[3] = 5.0

    # ## frame ##

    df_orig = DataFrame(np.arange(6).reshape(3, 2), columns=["A", "B"], dtype="int64")

    # iloc/iat raise
    df = df_orig.copy()

    with pytest.raises(IndexError):
        df.iloc[4, 2] = 5.0

    with pytest.raises(IndexError):
        df.iat[4, 2] = 5.0

    # row setting where it exists
    expected = DataFrame(dict({"A": [0, 4, 4], "B": [1, 5, 5]}))
    df = df_orig.copy()
    df.iloc[1] = df.iloc[2]
    tm.assert_frame_equal(df, expected)

    expected = DataFrame(dict({"A": [0, 4, 4], "B": [1, 5, 5]}))
    df = df_orig.copy()
    df.loc[1] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # like 2578, partial setting with dtype preservation
    expected = DataFrame(dict({"A": [0, 2, 4, 4], "B": [1, 3, 5, 5]}))
    df = df_orig.copy()
    df.loc[3] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # single dtype frame, overwrite
    expected = DataFrame(dict({"A": [0, 2, 4], "B": [0, 2, 4]}))
    df = df_orig.copy()
    df.loc[:, "B"] = df.loc[:, "A"]
    tm.assert_frame_equal(df, expected)

    # mixed dtype frame, overwrite
    expected = DataFrame(dict({"A": [0, 2, 4], "B": Series([0, 2, 4])}))
    df = df_orig.copy()
    df["B"] = df["B"].astype(np.float64)
    df.loc[:, "B"] = df.loc[:, "A"]
    tm.assert_frame_equal(df, expected)

    # single dtype frame, partial setting
    expected = df_orig.copy()
    # NOTE(review): ``df`` here is still the frame left over from the
    # previous section, not a fresh copy of ``df_orig``
    expected["C"] = df["A"]
    df = df_orig.copy()
    df.loc[:, "C"] = df.loc[:, "A"]
    tm.assert_frame_equal(df, expected)

    # mixed frame, partial setting
    expected = df_orig.copy()
    expected["C"] = df["A"]
    df = df_orig.copy()
    df.loc[:, "C"] = df.loc[:, "A"]
    tm.assert_frame_equal(df, expected)

    # GH 8473
    dates = date_range("1/1/2000", periods=8)
    df_orig = DataFrame(np.random.randn(8, 4), index=dates, columns=["A", "B", "C", "D"])

    # enlarge by one row, labelled one ``freq`` step past the last date
    expected = pd.concat(
        [df_orig, DataFrame({"A": 7}, index=[dates[-1] + dates.freq])], sort=True)
    df = df_orig.copy()
    df.loc[dates[-1] + dates.freq, "A"] = 7
    tm.assert_frame_equal(df, expected)
    df = df_orig.copy()
    df.at[dates[-1] + dates.freq, "A"] = 7
    tm.assert_frame_equal(df, expected)

    # enlarge by one row and one new column (labelled ``0``) at once
    exp_other = DataFrame({0: 7}, index=[dates[-1] + dates.freq])
    expected = pd.concat([df_orig, exp_other], axis=1)

    df = df_orig.copy()
    df.loc[dates[-1] + dates.freq, 0] = 7
    tm.assert_frame_equal(df, expected)
    df = df_orig.copy()
    df.at[dates[-1] + dates.freq, 0] = 7
    tm.assert_frame_equal(df, expected)
def test_convert_objects(self):
    """Exercise the deprecated ``Series.convert_objects`` API.

    Every call is wrapped in ``tm.assert_produces_warning(FutureWarning)``.
    The results are checked for numeric coercion, datetime coercion and
    pass-through of data that is already non-object.
    """
    s = Series([1., 2, 3], index=['a', 'b', 'c'])
    with tm.assert_produces_warning(FutureWarning):
        result = s.convert_objects(convert_dates=False,
                                   convert_numeric=True)
    assert_series_equal(result, s)

    # force numeric conversion
    r = s.copy().astype('O')
    r['a'] = '1'
    with tm.assert_produces_warning(FutureWarning):
        result = r.convert_objects(convert_dates=False,
                                   convert_numeric=True)
    assert_series_equal(result, s)

    r = s.copy().astype('O')
    r['a'] = '1.'
    with tm.assert_produces_warning(FutureWarning):
        result = r.convert_objects(convert_dates=False,
                                   convert_numeric=True)
    assert_series_equal(result, s)

    r = s.copy().astype('O')
    r['a'] = 'garbled'
    expected = s.copy()
    expected['a'] = np.nan
    with tm.assert_produces_warning(FutureWarning):
        result = r.convert_objects(convert_dates=False,
                                   convert_numeric=True)
    assert_series_equal(result, expected)

    # GH 4119, not converting a mixed type (e.g.floats and object)
    s = Series([1, 'na', 3, 4])
    with tm.assert_produces_warning(FutureWarning):
        result = s.convert_objects(convert_numeric=True)
    expected = Series([1, np.nan, 3, 4])
    assert_series_equal(result, expected)

    s = Series([1, '', 3, 4])
    with tm.assert_produces_warning(FutureWarning):
        result = s.convert_objects(convert_numeric=True)
    expected = Series([1, np.nan, 3, 4])
    assert_series_equal(result, expected)

    # dates
    s = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0),
                datetime(2001, 1, 3, 0, 0)])
    s2 = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0),
                 datetime(2001, 1, 3, 0, 0), 'foo', 1.0, 1,
                 Timestamp('20010104'), '20010105'], dtype='O')
    with tm.assert_produces_warning(FutureWarning):
        result = s.convert_objects(convert_dates=True,
                                   convert_numeric=False)
    expected = Series([Timestamp('20010101'), Timestamp('20010102'),
                       Timestamp('20010103')], dtype='M8[ns]')
    assert_series_equal(result, expected)

    with tm.assert_produces_warning(FutureWarning):
        result = s.convert_objects(convert_dates='coerce',
                                   convert_numeric=False)
    # Previously this result was overwritten by the next call without ever
    # being checked; assert it so both coerce variants are covered.
    assert_series_equal(result, expected)
    with tm.assert_produces_warning(FutureWarning):
        result = s.convert_objects(convert_dates='coerce',
                                   convert_numeric=True)
    assert_series_equal(result, expected)

    expected = Series([Timestamp('20010101'), Timestamp('20010102'),
                       Timestamp('20010103'), lib.NaT, lib.NaT, lib.NaT,
                       Timestamp('20010104'), Timestamp('20010105')],
                      dtype='M8[ns]')
    with tm.assert_produces_warning(FutureWarning):
        result = s2.convert_objects(convert_dates='coerce',
                                    convert_numeric=False)
    assert_series_equal(result, expected)
    with tm.assert_produces_warning(FutureWarning):
        result = s2.convert_objects(convert_dates='coerce',
                                    convert_numeric=True)
    assert_series_equal(result, expected)

    # preserve all-nans (if convert_dates='coerce')
    s = Series(['foo', 'bar', 1, 1.0], dtype='O')
    with tm.assert_produces_warning(FutureWarning):
        result = s.convert_objects(convert_dates='coerce',
                                   convert_numeric=False)
    expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2)
    assert_series_equal(result, expected)

    # preserve if non-object
    s = Series([1], dtype='float32')
    with tm.assert_produces_warning(FutureWarning):
        result = s.convert_objects(convert_dates='coerce',
                                   convert_numeric=False)
    assert_series_equal(result, s)

    # dateutil parses some single letters into today's value as a date;
    # the series must come back unchanged for both letter cases.
    for x in 'abcdefghijklmnopqrstuvwxyz':
        s = Series([x])
        with tm.assert_produces_warning(FutureWarning):
            result = s.convert_objects(convert_dates='coerce')
        assert_series_equal(result, s)
        s = Series([x.upper()])
        with tm.assert_produces_warning(FutureWarning):
            result = s.convert_objects(convert_dates='coerce')
        assert_series_equal(result, s)
def func(X: pd.DataFrame,
         y: pd.Series,
         selection_times=3,
         title="RON_loss",
         del_abnormal=False,
         abnormal_threshold=0.08):
    """Select features, fit an LGBM regressor, cross-validate and save artefacts.

    Steps, in order:

    1. Quantile-transform ``X`` and run ``SelectFromModel`` (gradient
       boosting) ``selection_times`` times, then re-transform the surviving
       columns of the original ``X``.
    2. Fit ``LGBMRegressor`` on all rows and print train r2 / Pearson.
    3. If ``del_abnormal`` is true, drop rows whose absolute training error
       exceeds ``abnormal_threshold`` and save ``{title}_abnormal.pdf``.
    4. Run 5-fold cross-validation, plot each fold plus the train fit, and
       save ``{title}_cross-validation.pdf``.
    5. Write ``{title}_data.csv`` and ``{title}_model.bz2``.

    Returns ``None``; all results are printed or written to disk.
    """
    y = np.array(y)

    # --- iterative feature selection on quantile-transformed data ---
    selector = SelectFromModel(
        estimator=GradientBoostingRegressor(random_state=0))
    transformed = QuantileTransformer(n_quantiles=1000).fit_transform(X)
    X_ = pd.DataFrame(transformed, columns=X.columns)
    for _ in range(selection_times):
        kept = selector.fit_transform(X_, y)
        X_ = pd.DataFrame(kept, columns=X_.columns[selector.get_support()])
    # Re-fit the transformer on the surviving columns of the raw input.
    retransformed = QuantileTransformer(n_quantiles=1000).fit_transform(
        X[X_.columns])
    X_ = pd.DataFrame(retransformed, columns=X_.columns)
    print(f"{title} | {selection_times}次特征筛选后的X_.shape = {X_.shape}")
    print(f"{title} | 特征筛选后保留的列: {X_.columns.tolist()}")

    # --- fit on the full data set ---
    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    pipeline = LGBMRegressor(random_state=0,
                             n_estimators=100,
                             learning_rate=0.1)
    pipeline.fit(X_, y)
    y_pred = pipeline.predict(X_)
    train_score = r2_score(y, y_pred)
    pearson_correlation = pearsonr(y, y_pred)[0]
    print(
        f"{title} | 在训练集上,r2 = {train_score}, pearson 相关系数 = {pearson_correlation}"
    )
    # Snapshot for the "train-set" panel, taken before any rows are removed.
    y_ = y.copy()
    y_pred_ = y_pred.copy()

    # --- optional removal of badly fitted samples ---
    if del_abnormal:
        y_pred = pipeline.predict(X_)
        mask = np.abs(y - y_pred) > abnormal_threshold
        keep = ~mask
        print(f"{title} | 异常样本数 = {np.count_nonzero(mask)}")
        plt.rcParams['figure.figsize'] = (7, 4.5)
        plt.grid(alpha=0.2)
        plt.scatter(y[mask], y_pred[mask], label="abnormal samples", c="r")
        plt.scatter(y[keep], y_pred[keep], label="normal samples", c="b")
        plt.legend(loc="best")
        score_before = cross_val_score(pipeline, X_, y, cv=cv).mean()
        print(f"{title} | 删除异常样本前的表现 = {score_before}")
        X_ = X_.loc[keep, :]
        y = y[keep]
        score_after = cross_val_score(pipeline, X_, y, cv=cv).mean()
        print(f"{title} | 删除异常样本后的表现 = {score_after}")
        plt.title(f"{title} abnormal samples")
        plt.xlabel("y true")
        plt.ylabel("y pred")
        plt.savefig(f"{title}_abnormal.pdf")
        plt.close()

    # --- manual 5-fold cross-validation with one subplot per fold ---
    valid_scores = []
    plt.rcParams['figure.figsize'] = (18, 12)
    for fold, (train_ix, valid_ix) in enumerate(cv.split(X_, y)):
        X_train = X_.iloc[train_ix, :].copy()
        X_valid = X_.iloc[valid_ix, :].copy()
        y_train = y[train_ix].copy()
        y_valid = y[valid_ix].copy()
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_valid)
        plt.subplot(2, 3, fold + 1)
        fold_frame = pd.DataFrame({"y true": y_valid, "y pred": y_pred})
        sns.regplot(x="y true", y="y pred", data=fold_frame)
        plt.title(f"fold-{fold + 1}")
        valid_scores.append(r2_score(y_valid, y_pred))

    # Sixth panel: fit on the full training data captured earlier.
    plt.subplot(2, 3, 6)
    train_frame = pd.DataFrame({"y true": y_, "y pred": y_pred_})
    sns.regplot(x="y true", y="y pred", data=train_frame)
    plt.title("train-set")
    plt.suptitle(f"{title} cross-validation")
    print(f"{title} | 5折交叉验证后,在验证集上的平均r2 = {np.mean(valid_scores)}\n"
          f"{title} | 每折的r2 = {valid_scores}")
    plt.savefig(f"{title}_cross-validation.pdf")
    plt.close()

    # --- persist the selected data and the model ---
    X_["label"] = y
    X_.to_csv(f"{title}_data.csv", index=False)
    # NOTE(review): `pipeline` was last fitted inside the fold loop, so the
    # dumped model is the last fold's fit, not a fit on all rows - confirm
    # this is intended.
    dump(pipeline, f"{title}_model.bz2")
def _subtract_min_from_every_element(series: pd.Series):
    """Subtract the smallest ``.value`` in *series* from every element.

    The minimum is taken over the ``.value`` attribute of each element and
    then handed, together with each element, to
    ``PandasMunkres._subtract_value_from_object_value``.

    :param series: Series whose elements expose a ``.value`` attribute.
    :return: the Series produced by applying the subtraction helper.
    """
    # Series.apply builds a new Series and the lambda only reads `.value`,
    # so no defensive copy of the input is needed.
    min_value = series.apply(func=lambda element: element.value).min()
    return series.apply(
        func=PandasMunkres._subtract_value_from_object_value,
        args=(min_value, ),
    )
def test_partial_setting(self):
    """Setting on a missing label enlarges Series/DataFrame/Panel (GH2578).

    Label-based setters (``[]``, ``.loc``, ``.ix``, ``.at``) are expected to
    append the new label; positional setters (``.iloc``, ``.iat``) must
    raise ``IndexError``. ``.ix`` and ``Panel`` usages are wrapped in
    ``catch_warnings(record=True)`` so any warnings they emit are swallowed.
    """
    # ## series ##
    s_orig = Series([1, 2, 3])

    s = s_orig.copy()
    s[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s[5] = 5.
    expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5.
    expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    # iloc/iat raise
    s = s_orig.copy()
    with self.assertRaises(IndexError):
        s.iloc[3] = 5.
    with self.assertRaises(IndexError):
        s.iat[3] = 5.

    # ## frame ##
    df_orig = DataFrame(
        np.arange(6).reshape(3, 2), columns=['A', 'B'], dtype='int64')

    # iloc/iat raise
    df = df_orig.copy()
    with self.assertRaises(IndexError):
        df.iloc[4, 2] = 5.
    with self.assertRaises(IndexError):
        df.iat[4, 2] = 5.

    # row setting where it exists
    expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
    df = df_orig.copy()
    df.iloc[1] = df.iloc[2]
    tm.assert_frame_equal(df, expected)

    expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
    df = df_orig.copy()
    df.loc[1] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # like 2578, partial setting with dtype preservation
    expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]}))
    df = df_orig.copy()
    df.loc[3] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # single dtype frame, overwrite
    expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]}))
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'B'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # mixed dtype frame, overwrite
    expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])}))
    df = df_orig.copy()
    df['B'] = df['B'].astype(np.float64)
    with catch_warnings(record=True):
        df.ix[:, 'B'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # single dtype frame, partial setting
    # (the expectation is built from the frame left by the previous stanza)
    expected = df_orig.copy()
    expected['C'] = df['A']
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'C'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # mixed frame, partial setting
    expected = df_orig.copy()
    expected['C'] = df['A']
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'C'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    with catch_warnings(record=True):
        # ## panel ##
        # Built once; an identical second construction was redundant.
        p_orig = Panel(np.arange(16).reshape(2, 4, 2),
                       items=['Item1', 'Item2'],
                       major_axis=pd.date_range('2001/1/12', periods=4),
                       minor_axis=['A', 'B'],
                       dtype='float64')

        # panel setting via item
        expected = p_orig.copy()
        expected['Item3'] = expected['Item1']
        p = p_orig.copy()
        p.loc['Item3'] = p['Item1']
        tm.assert_panel_equal(p, expected)

        # panel with aligned series
        expected = p_orig.copy()
        expected = expected.transpose(2, 1, 0)
        expected['C'] = DataFrame(
            {
                'Item1': [30, 30, 30, 30],
                'Item2': [32, 32, 32, 32]
            },
            index=p_orig.major_axis)
        expected = expected.transpose(2, 1, 0)
        p = p_orig.copy()
        p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items)
        tm.assert_panel_equal(p, expected)

    # GH 8473
    dates = date_range('1/1/2000', periods=8)
    df_orig = DataFrame(np.random.randn(8, 4),
                        index=dates,
                        columns=['A', 'B', 'C', 'D'])

    expected = pd.concat(
        [df_orig, DataFrame({'A': 7}, index=[dates[-1] + 1])])
    df = df_orig.copy()
    df.loc[dates[-1] + 1, 'A'] = 7
    tm.assert_frame_equal(df, expected)
    df = df_orig.copy()
    df.at[dates[-1] + 1, 'A'] = 7
    tm.assert_frame_equal(df, expected)

    exp_other = DataFrame({0: 7}, index=[dates[-1] + 1])
    expected = pd.concat([df_orig, exp_other], axis=1)

    df = df_orig.copy()
    df.loc[dates[-1] + 1, 0] = 7
    tm.assert_frame_equal(df, expected)
    df = df_orig.copy()
    df.at[dates[-1] + 1, 0] = 7
    tm.assert_frame_equal(df, expected)
def test_getitem_setitem_datetimeindex():
    """Get/set on a tz-aware hourly DatetimeIndex (GH #2785).

    The same lookups are repeated with strings, naive ``datetime`` objects,
    localized ``Timestamp`` bounds and index elements, followed by partial
    date-string slicing.
    """
    N = 50
    # testing with timezone, GH #2785
    rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern')
    ts = Series(np.random.randn(N), index=rng)

    first_str = "1990-01-01 04:00:00"
    last_str = "1990-01-01 07:00:00"

    # --- string keys ---
    assert ts[first_str] == ts[4]

    roundtrip = ts.copy()
    roundtrip[first_str] = 0
    roundtrip[first_str] = ts[4]
    assert_series_equal(roundtrip, ts)

    assert_series_equal(ts[first_str:last_str], ts[4:8])

    roundtrip = ts.copy()
    roundtrip[first_str:last_str] = 0
    roundtrip[first_str:last_str] = ts[4:8]
    assert_series_equal(roundtrip, ts)

    # GH#18435 strings get a pass from tzawareness compat
    lb = "1990-01-01 04:00:00"
    rb = "1990-01-01 07:00:00"
    in_window = (ts.index >= lb) & (ts.index <= rb)
    assert_series_equal(ts[in_window], ts[4:8])

    lb = "1990-01-01 04:00:00-0500"
    rb = "1990-01-01 07:00:00-0500"
    in_window = (ts.index >= lb) & (ts.index <= rb)
    assert_series_equal(ts[in_window], ts[4:8])

    # --- repeat all the above with naive datetimes ---
    assert ts[datetime(1990, 1, 1, 4)] == ts[4]

    roundtrip = ts.copy()
    roundtrip[datetime(1990, 1, 1, 4)] = 0
    roundtrip[datetime(1990, 1, 1, 4)] = ts[4]
    assert_series_equal(roundtrip, ts)

    sliced = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)]
    assert_series_equal(sliced, ts[4:8])

    roundtrip = ts.copy()
    roundtrip[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0
    roundtrip[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8]
    assert_series_equal(roundtrip, ts)

    lb = datetime(1990, 1, 1, 4)
    rb = datetime(1990, 1, 1, 7)
    msg = "Cannot compare tz-naive and tz-aware datetime-like objects"
    with pytest.raises(TypeError, match=msg):
        # tznaive vs tzaware comparison is invalid
        # see GH#18376, GH#18162
        ts[(ts.index >= lb) & (ts.index <= rb)]

    # --- localized Timestamp bounds compare fine ---
    lb = pd.Timestamp(datetime(1990, 1, 1, 4)).tz_localize(rng.tzinfo)
    rb = pd.Timestamp(datetime(1990, 1, 1, 7)).tz_localize(rng.tzinfo)
    in_window = (ts.index >= lb) & (ts.index <= rb)
    assert_series_equal(ts[in_window], ts[4:8])

    # --- keys taken from the index itself ---
    assert ts[ts.index[4]] == ts[4]

    assert_series_equal(ts[ts.index[4:8]], ts[4:8])

    roundtrip = ts.copy()
    roundtrip[ts.index[4:8]] = 0
    roundtrip[4:8] = ts[4:8]
    assert_series_equal(roundtrip, ts)

    # also test partial date slicing
    assert_series_equal(ts["1990-01-02"], ts[24:48])

    roundtrip = ts.copy()
    roundtrip["1990-01-02"] = 0
    roundtrip["1990-01-02"] = ts[24:48]
    assert_series_equal(roundtrip, ts)
def QA_fetch_get_factor_groupby(factor: pd.Series,
                                industry_cls: str = "sw_l1",
                                detailed: bool = False) -> pd.DataFrame:
    """
    Attach industry membership to a factor series.

    The return value is a DataFrame holding the original factor values plus
    a ``group`` column with the industry name of each asset. (The JoinQuant
    local SDK must be imported and logged in by the caller beforehand; a
    ``UserWarning`` is emitted as a reminder on every call.)

    Parameters
    ---
    :param factor: factor values, indexed by ['datetime', 'code']
    :param industry_cls: industry classification, defaults to Shenwan
        level 1. Currently not used by the body.
    :param detailed: when True, fetch the industry snapshot for every factor
        date; when False (default), fetch only the last factor date

    Returns
    ---
    :return: DataFrame with columns ``factor`` and ``group``, indexed by
        ['datetime', 'code']
    """
    warnings.warn("请先自行导入聚宽本地 sdk 并登陆", UserWarning)
    # Normalise the factor layout.
    factor = QA_fmt_factor(factor)
    merged_data = pd.DataFrame(factor.copy().rename("factor"))
    # Normalise the stock codes.
    stock_list = QA_fmt_code_list(
        factor.index.get_level_values("code").drop_duplicates(),
    )
    factor_dates = factor.index.get_level_values("datetime")

    wanted = ["code", "industry_name"]
    if detailed:
        # One industry snapshot per factor date. Snapshots are collected in
        # a list and concatenated once, rather than growing a DataFrame
        # with repeated `append` calls.
        snapshots = []
        for cursor_date in factor_dates.drop_duplicates().tolist():
            snapshot = QA_fetch_industry_adv(
                code=stock_list, cursor_date=cursor_date)[wanted]
            snapshots.append(snapshot.assign(date=cursor_date))
        if snapshots:
            industry = pd.concat(snapshots)
        else:
            industry = pd.DataFrame(columns=wanted + ["date"])
    else:
        # Non-detailed mode: a single snapshot taken on the last factor date.
        end_time = str(max(factor_dates))[:10]
        # The selected columns carry no "date", so add it before indexing;
        # without it set_index(["date", "code"]) raises KeyError.
        industry = QA_fetch_industry_adv(stock_list, end_time)[wanted].assign(
            date=pd.Timestamp(end_time))
    ss = industry.set_index(["date", "code"])["industry_name"]

    # NOTE(review): merged_data["date"] holds datetime.date objects while
    # `ss` is keyed by Timestamps - confirm the two align as intended.
    merged_data["date"] = merged_data.index.get_level_values("datetime").map(
        lambda x: x.date())
    merged_data = (merged_data.reset_index().set_index([
        "date", "code"
    ]).assign(group=ss).reset_index().set_index(["datetime",
                                                 "code"]).drop("date", axis=1))
    # A stock may have no industry information early in its history;
    # back-fill from later dates.
    group = merged_data["group"].unstack().bfill().stack()
    merged_data["group"] = group
    return merged_data
def QA_fetch_factor_weight(factor: pd.Series,
                           weight_cls: str = "mktcap",
                           detailed: bool = True) -> pd.DataFrame:
    """
    Attach weighting information to a factor series.

    The return value is a DataFrame holding the original factor values plus
    a ``weight`` column.

    Parameters
    ---
    :param factor: factor values, indexed by ['datetime', 'code']
    :param weight_cls: weighting scheme, defaults to total market cap.
        Supported values: ``avg`` (equal weight 1.0), ``mktcap``,
        ``sqrt_mktcap``, ``ln_mktcap`` (column ``mv``), ``cmktcap``,
        ``sqrt_cmktcap``, ``ln_cmktcap`` (column ``liquidity_mv``)
    :param detailed: defaults to True (weights over the whole factor date
        range); when False only the last factor date is requested

    Returns
    ---
    :return: DataFrame with columns ``factor`` and ``weight``, indexed by
        ['datetime', 'code']

    Raises
    ---
    :raises ValueError: if ``weight_cls`` is not one of the supported values
    """
    # weight_cls -> (market-value column, optional element-wise transform)
    weight_specs = {
        "mktcap": ("mv", None),
        "sqrt_mktcap": ("mv", "sqrt"),
        # "log" (natural log) replaces "ln", which names no pandas/NumPy
        # function and therefore could not be resolved by `transform`.
        "ln_mktcap": ("mv", "log"),
        "cmktcap": ("liquidity_mv", None),
        "sqrt_cmktcap": ("liquidity_mv", "sqrt"),
        "ln_cmktcap": ("liquidity_mv", "log"),
    }

    # Normalise the factor layout.
    factor = QA_fmt_factor(factor)
    merged_data = pd.DataFrame(factor.copy().rename("factor"))
    code_list = factor.index.get_level_values(
        "code").drop_duplicates().tolist()

    factor_dates = factor.index.get_level_values("datetime")
    if detailed:
        date_range = factor_dates.drop_duplicates().tolist()
    else:
        # `end_time` used to be assigned only in the detailed branch, so
        # this path failed with an unbound local; compute it here.
        end_time = str(max(factor_dates))[:10]
        date_range = [pd.Timestamp(end_time)]

    if weight_cls == "avg":
        merged_data["weight"] = 1.0
        return merged_data
    if weight_cls not in weight_specs:
        # Reject unknown schemes before fetching any market data.
        raise ValueError(f"{weight_cls} 加权方式未实现")
    value_column, transform_name = weight_specs[weight_cls]

    df_local = QAAnalysis_block(code=code_list,
                                start=date_range[0],
                                end=date_range[-1]).market_value
    df_local = df_local.reset_index().pivot(index="date",
                                            columns="code",
                                            values=value_column)
    if transform_name is not None:
        df_local = df_local.transform(transform_name)

    merged_data["date"] = merged_data.index.get_level_values("datetime").map(
        lambda x: x.date())
    merged_data = (merged_data.reset_index().set_index(
        ["date",
         "code"]).assign(weight=df_local.stack()).reset_index().set_index(
             ["datetime", "code"]).drop("date", axis=1))
    # Back-fill weights missing on earlier dates from later ones.
    weight = merged_data["weight"].unstack().bfill().stack()
    merged_data["weight"] = weight
    return merged_data
# Build a DataFrame from `pop` (defined earlier in the script) and label
# both of its axes.
frame3 = DataFrame(pop)
frame3.index.name = 'year'
frame3.columns.name = 'state'
# Constructing a DataFrame from another DataFrame, first as-is and then
# with an explicit index.
frameTest = DataFrame(frame3)
frameTest = DataFrame(frame3, index=[1, 2, 3])

# pandas Index object
obj1 = Series(range(3), index=['a', 'b', 'c'])
index = obj1.index
# Rebind `index` to a fresh Index and build a Series on top of it.
index = pd.Index(range(3))
obj2 = Series([1.5, -2.5, 0], index=index)
# obj2 is obj2
# obj2.index is index
obj3 = obj2.copy()
# `obj3 is obj2` evaluates to False: `is` compares object identity, not the
# underlying data.

# Reindexing.
obj1 = Series([4.5, 7.2, -5.3, 3.7], index=['d', 'b', 'a', 'c'])
# Reindex onto a label set containing 'e', which obj1 does not have; the
# second call supplies fill_value=0 for it.
obj2 = obj1.reindex(['a', 'b', 'c', 'd', 'e'])
obj2 = obj1.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
frame = DataFrame(np.arange(9).reshape((3, 3)),
                  index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
# Reindex the rows (adds 'b'), then the columns (adds 'Utah', drops 'Ohio').
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
states = ['Texas', 'Utah', 'California']
frame3 = frame.reindex(columns=states)
def _lagged_values(X: pd.Series, p: int, ar_coef: list): """Helper Function to Calculate AutoRegressive(AR) Component""" return X if p == 0 else pd.concat([X.copy().shift(periods=i) for i in range(1, p + 1)], axis=1).dot(ar_coef)
def test_getitem_setitem_datetimeindex():
    """Get/set on a tz-aware hourly DatetimeIndex (GH #2785).

    String keys are accepted regardless of tz-awareness (GH#18435). Naive
    ``datetime`` keys are expected to emit a ``FutureWarning`` (GH#36148),
    and a naive-vs-aware comparison is expected to raise ``TypeError``.
    """
    N = 50
    # testing with timezone, GH #2785
    rng = date_range("1/1/1990", periods=N, freq="H", tz="US/Eastern")
    ts = Series(np.random.randn(N), index=rng)

    result = ts["1990-01-01 04:00:00"]
    expected = ts[4]
    assert result == expected

    result = ts.copy()
    result["1990-01-01 04:00:00"] = 0
    result["1990-01-01 04:00:00"] = ts[4]
    tm.assert_series_equal(result, ts)

    result = ts["1990-01-01 04:00:00":"1990-01-01 07:00:00"]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    result = ts.copy()
    result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = 0
    result["1990-01-01 04:00:00":"1990-01-01 07:00:00"] = ts[4:8]
    tm.assert_series_equal(result, ts)

    lb = "1990-01-01 04:00:00"
    rb = "1990-01-01 07:00:00"
    # GH#18435 strings get a pass from tzawareness compat
    result = ts[(ts.index >= lb) & (ts.index <= rb)]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    lb = "1990-01-01 04:00:00-0500"
    rb = "1990-01-01 07:00:00-0500"
    result = ts[(ts.index >= lb) & (ts.index <= rb)]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    # But we do not give datetimes a pass on tzawareness compat
    # TODO: do the same with Timestamps and dt64
    # (an unused `msg` assignment was removed here; it was overwritten
    # below before any use)
    naive = datetime(1990, 1, 1, 4)
    with tm.assert_produces_warning(FutureWarning):
        # GH#36148 will require tzawareness compat
        result = ts[naive]
    expected = ts[4]
    assert result == expected

    result = ts.copy()
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # GH#36148 will require tzawareness compat
        result[datetime(1990, 1, 1, 4)] = 0
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # GH#36148 will require tzawareness compat
        result[datetime(1990, 1, 1, 4)] = ts[4]
    tm.assert_series_equal(result, ts)

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # GH#36148 will require tzawareness compat
        result = ts[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    result = ts.copy()
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # GH#36148 will require tzawareness compat
        result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = 0
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # GH#36148 will require tzawareness compat
        result[datetime(1990, 1, 1, 4):datetime(1990, 1, 1, 7)] = ts[4:8]
    tm.assert_series_equal(result, ts)

    lb = datetime(1990, 1, 1, 4)
    rb = datetime(1990, 1, 1, 7)
    msg = r"Invalid comparison between dtype=datetime64\[ns, US/Eastern\] and datetime"
    with pytest.raises(TypeError, match=msg):
        # tznaive vs tzaware comparison is invalid
        # see GH#18376, GH#18162
        ts[(ts.index >= lb) & (ts.index <= rb)]

    lb = Timestamp(datetime(1990, 1, 1, 4)).tz_localize(rng.tzinfo)
    rb = Timestamp(datetime(1990, 1, 1, 7)).tz_localize(rng.tzinfo)
    result = ts[(ts.index >= lb) & (ts.index <= rb)]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    result = ts[ts.index[4]]
    expected = ts[4]
    assert result == expected

    result = ts[ts.index[4:8]]
    expected = ts[4:8]
    tm.assert_series_equal(result, expected)

    result = ts.copy()
    result[ts.index[4:8]] = 0
    result.iloc[4:8] = ts.iloc[4:8]
    tm.assert_series_equal(result, ts)

    # also test partial date slicing
    result = ts["1990-01-02"]
    expected = ts[24:48]
    tm.assert_series_equal(result, expected)

    result = ts.copy()
    result["1990-01-02"] = 0
    result["1990-01-02"] = ts[24:48]
    tm.assert_series_equal(result, ts)
def test_equals(self):
    """Check ``.equals`` on Series and DataFrame.

    Covers plain and NaN-containing Series, a MultiIndex Series, and a
    mixed-dtype DataFrame compared against copies that differ in dtype,
    index or column order, plus DatetimeIndex / MultiIndex variants.
    """
    s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
    s2 = s1.copy()
    self.assertTrue(s1.equals(s2))

    s1[1] = 99
    self.assertFalse(s1.equals(s2))

    # NaNs compare as equal
    s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
    s2 = s1.copy()
    self.assertTrue(s1.equals(s2))

    s2[0] = 9.9
    self.assertFalse(s1.equals(s2))

    idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
    s1 = Series([1, 2, np.nan], index=idx)
    s2 = s1.copy()
    self.assertTrue(s1.equals(s2))

    # Add object dtype column with nans
    index = np.random.random(10)
    df1 = DataFrame(
        np.random.random(10, ), index=index, columns=['floats'])
    df1['text'] = 'the sky is so blue. we could use more chocolate.'.split(
    )
    df1['start'] = date_range('2000-1-1', periods=10, freq='T')
    df1['end'] = date_range('2000-1-1', periods=10, freq='D')
    df1['diff'] = df1['end'] - df1['start']
    df1['bool'] = (np.arange(10) % 3 == 0)
    # Blank out every second row. The slice has no start/stop, so it is
    # purely positional and `.iloc` is the equivalent of the old `.ix`.
    df1.iloc[::2] = nan
    df2 = df1.copy()
    self.assertTrue(df1['text'].equals(df2['text']))
    self.assertTrue(df1['start'].equals(df2['start']))
    self.assertTrue(df1['end'].equals(df2['end']))
    self.assertTrue(df1['diff'].equals(df2['diff']))
    self.assertTrue(df1['bool'].equals(df2['bool']))
    self.assertTrue(df1.equals(df2))
    self.assertFalse(df1.equals(object))

    # different dtype
    different = df1.copy()
    different['floats'] = different['floats'].astype('float32')
    self.assertFalse(df1.equals(different))

    # different index
    different_index = -index
    different = df2.set_index(different_index)
    self.assertFalse(df1.equals(different))

    # different columns
    different = df2.copy()
    different.columns = df2.columns[::-1]
    self.assertFalse(df1.equals(different))

    # DatetimeIndex
    index = pd.date_range('2000-1-1', periods=10, freq='T')
    df1 = df1.set_index(index)
    df2 = df1.copy()
    self.assertTrue(df1.equals(df2))

    # MultiIndex
    df3 = df1.set_index(['text'], append=True)
    df2 = df1.set_index(['text'], append=True)
    self.assertTrue(df3.equals(df2))

    df2 = df1.set_index(['floats'], append=True)
    self.assertFalse(df3.equals(df2))

    # NaN in index
    df3 = df1.set_index(['floats'], append=True)
    df2 = df1.set_index(['floats'], append=True)
    self.assertTrue(df3.equals(df2))
def test_cython_transform_frame(op, args, targop):
    """Compare groupby ``op`` on a DataFrame against ``apply(targop)``.

    ``op``/``args`` name the groupby operation and its positional arguments;
    ``targop`` is the reference callable handed to ``apply``. The frame is
    checked with a default index and with a MultiIndex, grouped by an
    external label array, by index level 0 and by the ``string`` column.
    """
    floats = Series(np.random.randn(1000))
    floats_missing = floats.copy()
    floats_missing.iloc[2:10] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)
    strings = list("qwertyuiopasdfghjklz")
    strings_missing = list(strings)
    strings_missing[5] = np.nan

    column_order = [
        "float",
        "float_missing",
        "int",
        "datetime",
        "timedelta",
        "string",
        "string_missing",
    ]
    flat = DataFrame(
        {
            "float": floats,
            "float_missing": floats_missing,
            "int": [1, 1, 1, 1, 2] * 200,
            "datetime": pd.date_range("1990-1-1", periods=1000),
            "timedelta": pd.timedelta_range(1, freq="s", periods=1000),
            "string": strings * 50,
            "string_missing": strings_missing * 50,
        },
        columns=column_order,
    )
    flat["cat"] = flat["string"].astype("category")

    multi = flat.copy()
    multi.index = pd.MultiIndex.from_product([range(100), range(10)])

    numeric_columns = ["float", "int", "float_missing"]
    # by='string_missing' and by=['int', 'string'] are intentionally left out
    group_targets = [
        dict(by=labels),
        dict(level=0),
        dict(by="string"),
    ]

    # DataFrame - Single and MultiIndex,
    # group by values, index level, columns
    for frame in (flat, multi):
        for gb_target in group_targets:
            gb = frame.groupby(**gb_target)
            # whitelisted methods set the selection before applying
            # bit a of hack to make sure the cythonized shift
            # is equivalent to pre 0.17.1 behavior
            if op == "shift":
                gb._set_group_selection()

            if op != "shift" and "int" not in gb_target:
                # numeric apply fastpath promotes dtype so have
                # to apply separately and concat
                int_part = gb[["int"]].apply(targop)
                float_part = gb[["float", "float_missing"]].apply(targop)
                expected = pd.concat([float_part, int_part], axis=1)
            else:
                expected = gb.apply(targop)

            expected = expected.sort_index(axis=1)
            via_transform = gb.transform(op, *args).sort_index(axis=1)
            tm.assert_frame_equal(expected, via_transform)
            via_method = getattr(gb, op)(*args).sort_index(axis=1)
            tm.assert_frame_equal(expected, via_method)

            # individual columns
            for col in frame:
                if op == "shift" or col in numeric_columns:
                    expected = gb[col].apply(targop)
                    expected.name = col
                    tm.assert_series_equal(
                        expected, gb[col].transform(op, *args))
                    tm.assert_series_equal(
                        expected, getattr(gb[col], op)(*args))
                else:
                    msg = "No numeric types to aggregate"
                    with pytest.raises(DataError, match=msg):
                        gb[col].transform(op)
                    with pytest.raises(DataError, match=msg):
                        getattr(gb[col], op)()
def test_convert(self):
    """Exercise the private ``Series._convert`` helper.

    Covers coercion to datetime / numeric / timedelta, mixed-type input,
    the non-coercing pass-through paths and single-letter strings.
    """
    # Tests: All to nans, coerce, true
    # Test coercion returns correct type
    ser = Series(['a', 'b', 'c'])
    got = ser._convert(datetime=True, coerce=True)
    assert_series_equal(got, Series([lib.NaT] * 3))

    got = ser._convert(numeric=True, coerce=True)
    assert_series_equal(got, Series([np.nan] * 3))

    want = Series([lib.NaT] * 3, dtype=np.dtype('m8[ns]'))
    got = ser._convert(timedelta=True, coerce=True)
    assert_series_equal(got, want)

    dt = datetime(2001, 1, 1, 0, 0)
    td = dt - datetime(2000, 1, 1, 0, 0)

    # Test coercion with mixed types
    ser = Series(['a', '3.1415', dt, td])
    got = ser._convert(datetime=True, coerce=True)
    assert_series_equal(got, Series([lib.NaT, lib.NaT, dt, lib.NaT]))

    got = ser._convert(numeric=True, coerce=True)
    assert_series_equal(got, Series([nan, 3.1415, nan, nan]))

    got = ser._convert(timedelta=True, coerce=True)
    want = Series([lib.NaT, lib.NaT, lib.NaT, td],
                  dtype=np.dtype('m8[ns]'))
    assert_series_equal(got, want)

    # Test standard conversion returns original
    got = ser._convert(datetime=True)
    assert_series_equal(got, ser)
    got = ser._convert(numeric=True)
    assert_series_equal(got, Series([nan, 3.1415, nan, nan]))
    got = ser._convert(timedelta=True)
    assert_series_equal(got, ser)

    # test pass-through and non-conversion when other types selected
    ser = Series(['1.0', '2.0', '3.0'])
    got = ser._convert(datetime=True, numeric=True, timedelta=True)
    assert_series_equal(got, Series([1.0, 2.0, 3.0]))
    got = ser._convert(True, False, True)
    assert_series_equal(got, ser)

    ser = Series([datetime(2001, 1, 1, 0, 0),
                  datetime(2001, 1, 1, 0, 0)], dtype='O')
    got = ser._convert(datetime=True, numeric=True, timedelta=True)
    want = Series([datetime(2001, 1, 1, 0, 0),
                   datetime(2001, 1, 1, 0, 0)])
    assert_series_equal(got, want)
    got = ser._convert(datetime=False, numeric=True, timedelta=True)
    assert_series_equal(got, ser)

    td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0)
    ser = Series([td, td], dtype='O')
    got = ser._convert(datetime=True, numeric=True, timedelta=True)
    assert_series_equal(got, Series([td, td]))
    got = ser._convert(True, True, False)
    assert_series_equal(got, ser)

    ser = Series([1., 2, 3], index=['a', 'b', 'c'])
    got = ser._convert(numeric=True)
    assert_series_equal(got, ser)

    # force numeric conversion
    for text in ('1', '1.'):
        as_object = ser.copy().astype('O')
        as_object['a'] = text
        got = as_object._convert(numeric=True)
        assert_series_equal(got, ser)

    as_object = ser.copy().astype('O')
    as_object['a'] = 'garbled'
    got = as_object._convert(numeric=True)
    want = ser.copy()
    want['a'] = nan
    assert_series_equal(got, want)

    # GH 4119, not converting a mixed type (e.g.floats and object)
    ser = Series([1, 'na', 3, 4])
    got = ser._convert(datetime=True, numeric=True)
    want = Series([1, nan, 3, 4])
    assert_series_equal(got, want)

    ser = Series([1, '', 3, 4])
    got = ser._convert(datetime=True, numeric=True)
    assert_series_equal(got, want)

    # dates
    ser = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0),
                  datetime(2001, 1, 3, 0, 0)])
    mixed = Series([datetime(2001, 1, 1, 0, 0),
                    datetime(2001, 1, 2, 0, 0),
                    datetime(2001, 1, 3, 0, 0), 'foo', 1.0, 1,
                    Timestamp('20010104'), '20010105'], dtype='O')

    got = ser._convert(datetime=True)
    want = Series([Timestamp('20010101'), Timestamp('20010102'),
                   Timestamp('20010103')], dtype='M8[ns]')
    assert_series_equal(got, want)

    got = ser._convert(datetime=True, coerce=True)
    assert_series_equal(got, want)

    want = Series([Timestamp('20010101'), Timestamp('20010102'),
                   Timestamp('20010103'), lib.NaT, lib.NaT, lib.NaT,
                   Timestamp('20010104'), Timestamp('20010105')],
                  dtype='M8[ns]')
    got = mixed._convert(datetime=True, numeric=False, timedelta=False,
                         coerce=True)
    assert_series_equal(got, want)
    got = mixed._convert(datetime=True, coerce=True)
    assert_series_equal(got, want)

    ser = Series(['foo', 'bar', 1, 1.0], dtype='O')
    got = ser._convert(datetime=True, coerce=True)
    want = Series([lib.NaT] * 2 + [Timestamp(1)] * 2)
    assert_series_equal(got, want)

    # preserver if non-object
    ser = Series([1], dtype='float32')
    got = ser._convert(datetime=True, coerce=True)
    assert_series_equal(got, ser)

    # r = s.copy()
    # r[0] = np.nan
    # result = r._convert(convert_dates=True,convert_numeric=False)
    # assert result.dtype == 'M8[ns]'

    # dateutil parses some single letters into today's value as a date
    want = Series([lib.NaT])
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        for text in (letter, letter.upper()):
            ser = Series([text])
            got = ser._convert(datetime=True, coerce=True)
            assert_series_equal(got, want)
def test_cython_transform(self):
    """Compare cythonized groupby transforms with their reference lambdas.

    For every ``((name, args), lambda)`` entry in ``ops`` the test asserts
    that ``transform(name, *args)`` and the groupby method called ``name``
    give the same result as the paired lambda: first for two Series, then
    for DataFrames (default index and MultiIndex) grouped by values, by
    index level and by a column.
    """
    # GH 4095
    ops = [(('cumprod', ()), lambda x: x.cumprod()),
           (('cumsum', ()), lambda x: x.cumsum()),
           (('shift', (-1, )), lambda x: x.shift(-1)),
           (('shift', (1, )), lambda x: x.shift())]

    s = Series(np.random.randn(1000))
    s_missing = s.copy()
    s_missing.iloc[2:10] = np.nan
    labels = np.random.randint(0, 50, size=1000).astype(float)

    # series
    for (op, args), targop in ops:
        for data in [s, s_missing]:
            # print(data.head())
            # reference result: transform with the plain Python lambda
            expected = data.groupby(labels).transform(targop)

            tm.assert_series_equal(
                expected, data.groupby(labels).transform(op, *args))
            tm.assert_series_equal(
                expected, getattr(data.groupby(labels), op)(*args))

    strings = list('qwertyuiopasdfghjklz')
    strings_missing = strings[:]
    strings_missing[5] = np.nan
    df = DataFrame({
        'float': s,
        'float_missing': s_missing,
        'int': [1, 1, 1, 1, 2] * 200,
        'datetime': pd.date_range('1990-1-1', periods=1000),
        'timedelta': pd.timedelta_range(1, freq='s', periods=1000),
        'string': strings * 50,
        'string_missing': strings_missing * 50
    })
    df['cat'] = df['string'].astype('category')
    df2 = df.copy()
    df2.index = pd.MultiIndex.from_product([range(100), range(10)])

    # DataFrame - Single and MultiIndex,
    # group by values, index level, columns
    # NOTE: the loop variable below rebinds the outer name ``df``.
    for df in [df, df2]:
        for gb_target in [
                dict(by=labels),
                dict(level=0),
                dict(by='string')
        ]:  # dict(by='string_missing')]:
            # dict(by=['int','string'])]:

            gb = df.groupby(**gb_target)
            # whitelisted methods set the selection before applying
            # a bit of a hack to make sure the cythonized shift
            # is equivalent to pre 0.17.1 behavior
            # NOTE(review): ``op`` is not bound by an enclosing loop here; it
            # is the value left over from the last completed pass over
            # ``ops``, whose final entry is 'shift', so this branch is always
            # taken. Confirm whether that is intended.
            if op == 'shift':
                gb._set_group_selection()

            for (op, args), targop in ops:
                # ``gb_target`` only has the keys 'by' or 'level', so the
                # second condition is true for every target listed above.
                if op != 'shift' and 'int' not in gb_target:
                    # numeric apply fastpath promotes dtype so have
                    # to apply separately and concat
                    i = gb[['int']].apply(targop)
                    f = gb[['float', 'float_missing']].apply(targop)
                    expected = pd.concat([f, i], axis=1)
                else:
                    expected = gb.apply(targop)

                # column order is normalised before comparing
                expected = expected.sort_index(axis=1)
                tm.assert_frame_equal(
                    expected,
                    gb.transform(op, *args).sort_index(axis=1))
                tm.assert_frame_equal(expected, getattr(gb, op)(*args))
                # individual columns
                for c in df:
                    if c not in ['float', 'int', 'float_missing'
                                 ] and op != 'shift':
                        # any other column is expected to raise DataError
                        # for the cumulative ops
                        pytest.raises(DataError, gb[c].transform, op)
                        pytest.raises(DataError, getattr(gb[c], op))
                    else:
                        expected = gb[c].apply(targop)
                        expected.name = c
                        tm.assert_series_equal(expected,
                                               gb[c].transform(op, *args))
                        tm.assert_series_equal(expected,
                                               getattr(gb[c], op)(*args))
def predict_testcounts(
    testcounts: pandas.Series,
    *,
    country: str,
    region: typing.Optional[typing.Union[str, typing.List[str]]],
    regional_holidays: bool = False,
    keep_data: bool,
    ignore_before: typing.Optional[typing.Union[datetime.datetime, pandas.Timestamp, str]] = None,
    **kwargs,
) -> ForecastingResult:
    """ Predict/smooth missing test counts with Prophet.

    Implemented by Laura Helleckes and Michael Osthege.

    The input series is not modified: the function works on a sorted copy
    whose index is named "date" and whose name is "total".

    Parameters
    ----------
    testcounts : pandas.Series
        date-indexed series of observed testcounts
    country : str
        name or short code of country (as used by https://github.com/dr-prodigy/python-holidays)
    region : optional, [str]
        if None or []: only nation-wide
        if "all": nation-wide and all regions
        if "CA": nation-wide and those for region "CA"
        if ["CA", "NY", "FL"]: nation-wide and those for all listed regions
    regional_holidays: bool, default False
        if True, fetch regional holidays for each region; requires `region`
        to be "all" or to list more than one region.
        if False (default), fetch only national holidays (useful for countries
        where test data is available at the regional level, but which only
        have national holidays).
    keep_data : bool
        if True, existing entries are kept
        if False, existing entries are also predicted, resulting in a smoothed profile
    ignore_before : timestamp
        all dates before this are ignored
        Use this argument to prevent an unrealistic upwards trend due to initial testing ramp-up
        Defaults to the earliest date in `testcounts`.
    **kwargs
        optional kwargs for the `fbprophet.Prophet`. For example:
        * growth: 'linear' or 'logistic' (default)
        * seasonality_mode: 'additive' or 'multiplicative' (default)

    Returns
    -------
    result : pandas.Series
        the date-indexed series of smoothed/predicted testcounts
    m : fbprophet.Prophet
        the prophet model
    forecast : pandas.DataFrame
        contains the model prediction
    holidays : dict of { datetime : str }
        dictionary of the holidays that were used in the model

    Raises
    ------
    ValueError
        if `regional_holidays` is requested while predicting only at the
        national level or for a single region
    """
    # Sort first (so that index[0] really is the earliest date) and rename on
    # a copy, leaving the caller's series untouched.
    testcounts = testcounts.sort_index().rename_axis("date").rename("total")
    if not ignore_before:
        ignore_before = testcounts.index[0]

    mask_fit = testcounts.index >= ignore_before
    if keep_data:
        # only fill in the entries that are missing
        mask_predict = numpy.logical_and(
            testcounts.index >= ignore_before, numpy.isnan(testcounts.values))
    else:
        mask_predict = testcounts.index >= ignore_before

    # Every calendar year covered by the data, not just the first and last,
    # so that holidays of intermediate years are included as well.
    first_year = testcounts.index[0].year
    last_year = testcounts.index[-1].year
    years = set(range(first_year, last_year + 1))

    regions = numpy.atleast_1d(region)
    multiple_regions = region == "all" or len(regions) > 1
    if regional_holidays and not multiple_regions:
        raise ValueError(
            "Predicting test counts only at national level or for one region only. "
            "Can't ask for regional holiday. Set `regional_holidays` kwarg to False."
        )

    # need last condition because some countries only national holidays for all regions:
    if multiple_regions and regional_holidays:
        # distinguish between national and regional holidays
        all_holidays = get_holidays(country, region, years=years)
        national_holidays = get_holidays(country, region=None, years=years)
        holiday_df = pandas.DataFrame(
            data=[
                (date, name, "national" if date in national_holidays else "regional")
                for date, name in all_holidays.items()
            ],
            columns=["ds", "name", "holiday"],
        )
    else:
        # none, or only one region -> no distinction between national/regional holidays
        all_holidays = get_holidays(country, region=None, years=years)
        holiday_df = pandas.DataFrame(
            dict(
                holiday="holiday",
                name=list(all_holidays.values()),
                ds=pandas.to_datetime(list(all_holidays.keys())),
            ))

    # Config settings of forecast model
    days = (testcounts.index[-1] - testcounts.index[0]).days
    prophet_kwargs = dict(
        growth="logistic",
        seasonality_mode="multiplicative",
        daily_seasonality=False,
        weekly_seasonality=True,
        yearly_seasonality=False,
        holidays=holiday_df,
        mcmc_samples=500,
        # restrict number of potential changepoints:
        n_changepoints=int(numpy.ceil(days / 30)),
    )
    # override defaults with user-specified kwargs
    prophet_kwargs.update(kwargs)
    m = fbprophet.Prophet(**prophet_kwargs)

    use_logistic = prophet_kwargs["growth"] == "logistic"

    # fit only the selected subset of the data
    df_fit = testcounts.loc[mask_fit].reset_index().rename(
        columns={"date": "ds", "total": "y"})
    if use_logistic:
        # logistic growth needs explicit bounds; the cap is the observed maximum
        cap = numpy.max(testcounts)
        df_fit["floor"] = 0
        df_fit["cap"] = cap
    m.fit(df_fit)

    # predict for all dates in the input
    df_predict = testcounts.reset_index().rename(columns={"date": "ds"})
    if use_logistic:
        df_predict["floor"] = 0
        df_predict["cap"] = cap
    forecast = m.predict(df_predict)

    # make a series of the result that has the same index as the input
    result = pandas.Series(
        index=testcounts.index, data=testcounts.copy().values, name="testcount")
    # predictions are clipped at zero before being written into the result
    result.loc[mask_predict] = numpy.clip(
        forecast.set_index("ds").yhat, 0, forecast.yhat.max())
    # full-length result series, model and forecast are returned
    return result, m, forecast, all_holidays
def test_logical_ops_label_based(self):
    """Logical operators between Series align on labels (GH#4947)."""
    lhs = Series([True, False, True], list("bca"))
    rhs = Series([False, True, False], list("abc"))

    tm.assert_series_equal(
        lhs & rhs, Series([False, True, False], list("abc")))
    tm.assert_series_equal(
        lhs | rhs, Series([True, True, False], list("abc")))
    tm.assert_series_equal(
        lhs ^ rhs, Series([True, False, False], list("abc")))

    # the right-hand operand carries one more label
    lhs = Series([True, False, True], list("bca"))
    rhs = Series([False, True, False, True], list("abcd"))

    tm.assert_series_equal(
        lhs & rhs, Series([False, True, False, False], list("abcd")))
    tm.assert_series_equal(
        lhs | rhs, Series([True, True, False, False], list("abcd")))

    # filling
    # against an empty Series
    no_values = Series([], dtype=object)

    tm.assert_series_equal(
        lhs & no_values.copy(),
        Series([False, False, False], list("bca")))
    tm.assert_series_equal(
        lhs | no_values.copy(),
        Series([True, False, True], list("bca")))

    # against a Series that shares no label
    tm.assert_series_equal(
        lhs & Series([1], ["z"]),
        Series([False, False, False, False], list("abcz")))
    tm.assert_series_equal(
        lhs | Series([1], ["z"]),
        Series([True, True, False, False], list("abcz")))

    # identity
    # s[s|e] == s should hold for any e, whether empty or not
    others = [
        no_values.copy(),
        Series([1], ["z"]),
        Series(np.nan, rhs.index),
        Series(np.nan, lhs.index),
        Series(["z"]),
    ]
    for other in others:
        tm.assert_series_equal(lhs[lhs | other], lhs[lhs])

    # against scalars
    labels = list("bca")
    unlabelled = Series([True, False, True])

    for scalar in [True, 1, 2]:
        outcome = Series([True, False, True], index=labels) | scalar
        tm.assert_series_equal(
            outcome, Series([True, True, True], index=labels))

    pattern = "Cannot perform.+with a dtyped.+array and scalar of type"
    for scalar in [np.nan, "foo"]:
        with pytest.raises(TypeError, match=pattern):
            unlabelled | scalar

    for scalar in [False, 0]:
        outcome = Series([True, False, True], index=labels) | scalar
        tm.assert_series_equal(
            outcome, Series([True, False, True], index=labels))

    for scalar in [True, 1]:
        outcome = Series([True, False, True], index=labels) & scalar
        tm.assert_series_equal(
            outcome, Series([True, False, True], index=labels))

    for scalar in [False, 0]:
        outcome = Series([True, False, True], index=labels) & scalar
        tm.assert_series_equal(
            outcome, Series([False, False, False], index=labels))

    pattern = "Cannot perform.+with a dtyped.+array and scalar of type"
    for scalar in [np.nan]:
        with pytest.raises(TypeError, match=pattern):
            unlabelled & scalar
def psar(high, low, close=None, af=None, max_af=None, offset=None, **kwargs):
    """Indicator: Parabolic Stop and Reverse (PSAR)

    Args:
        high: Series of highs.
        low: Series of lows.
        close: Optional Series; when given, it seeds the SAR values instead
            of ``low``.
        af: Initial acceleration factor and its increment. Values that are
            falsy or not positive fall back to 0.02.
        max_af: Upper bound of the acceleration factor. Values that are
            falsy or not positive fall back to 0.2.
        offset: Number of periods to shift the result by.

    Kwargs:
        fillna: Value passed to ``fillna`` on every output column.
        fill_method: Method passed to ``fillna(method=...)`` on every output
            column.

    Returns:
        DataFrame with the columns ``PSARl`` (SAR while bullish, NaN
        otherwise), ``PSARs`` (SAR while bearish, NaN otherwise), ``PSARaf``
        (acceleration factor in effect) and ``PSARr`` (True on rows where
        the trend reversed), each suffixed with ``_{af}_{max_af}``.
    """
    # Validate Arguments
    high = verify_series(high)
    low = verify_series(low)
    af = float(af) if af and af > 0 else 0.02
    max_af = float(max_af) if max_af and max_af > 0 else 0.2
    offset = get_offset(offset)

    # Initialize
    m = high.shape[0]
    af0 = af
    bullish = True
    high_point = high.iloc[0]
    low_point = low.iloc[0]

    if close is not None:
        close = verify_series(close)
        sar = close.copy()
    else:
        sar = low.copy()

    long = Series(npNaN, index=sar.index)
    short = long.copy()
    reversal = Series(False, index=sar.index)
    _af = long.copy()
    _af.iloc[0:2] = af0

    # Calculate Result
    # All element access is positional (.iloc): plain ``series[i]`` is
    # label-based on integer indexes and would read or write the wrong rows
    # (or raise) whenever the index is not exactly 0..m-1.
    for i in range(2, m):
        reverse = False
        _af.iloc[i] = af

        if bullish:
            sar.iloc[i] = sar.iloc[i - 1] + af * (high_point - sar.iloc[i - 1])

            if low.iloc[i] < sar.iloc[i]:
                # price fell through the SAR: flip to bearish
                bullish, reverse, af = False, True, af0
                sar.iloc[i] = high_point
                low_point = low.iloc[i]
        else:
            sar.iloc[i] = sar.iloc[i - 1] + af * (low_point - sar.iloc[i - 1])

            if high.iloc[i] > sar.iloc[i]:
                # price rose through the SAR: flip to bullish
                bullish, reverse, af = True, True, af0
                sar.iloc[i] = low_point
                high_point = high.iloc[i]

        reversal.iloc[i] = reverse

        if not reverse:
            if bullish:
                if high.iloc[i] > high_point:
                    # new extreme: accelerate, bounded by max_af
                    high_point = high.iloc[i]
                    af = min(af + af0, max_af)
                # the SAR may not sit above the lows of the two prior bars
                if low.iloc[i - 1] < sar.iloc[i]:
                    sar.iloc[i] = low.iloc[i - 1]
                if low.iloc[i - 2] < sar.iloc[i]:
                    sar.iloc[i] = low.iloc[i - 2]
            else:
                if low.iloc[i] < low_point:
                    # new extreme: accelerate, bounded by max_af
                    low_point = low.iloc[i]
                    af = min(af + af0, max_af)
                # the SAR may not sit below the highs of the two prior bars
                if high.iloc[i - 1] > sar.iloc[i]:
                    sar.iloc[i] = high.iloc[i - 1]
                if high.iloc[i - 2] > sar.iloc[i]:
                    sar.iloc[i] = high.iloc[i - 2]

        if bullish:
            long.iloc[i] = sar.iloc[i]
        else:
            short.iloc[i] = sar.iloc[i]

    # Offset
    if offset != 0:
        _af = _af.shift(offset)
        long = long.shift(offset)
        short = short.shift(offset)
        reversal = reversal.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        _af.fillna(kwargs["fillna"], inplace=True)
        long.fillna(kwargs["fillna"], inplace=True)
        short.fillna(kwargs["fillna"], inplace=True)
        reversal.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        _af.fillna(method=kwargs["fill_method"], inplace=True)
        long.fillna(method=kwargs["fill_method"], inplace=True)
        short.fillna(method=kwargs["fill_method"], inplace=True)
        reversal.fillna(method=kwargs["fill_method"], inplace=True)

    # Prepare DataFrame to return
    _params = f"_{af0}_{max_af}"
    data = {
        f"PSARl{_params}": long,
        f"PSARs{_params}": short,
        f"PSARaf{_params}": _af,
        f"PSARr{_params}": reversal,
    }
    psardf = DataFrame(data)
    psardf.name = f"PSAR{_params}"
    psardf.category = long.category = short.category = "trend"

    return psardf
def test_slice_integer(self):
    """Float-valued slices on integer-based indexes.

    Label-based indexers (``.loc`` / ``.ix``) accept the float slice bounds,
    while plain ``[]`` is expected to raise ``TypeError``.
    """
    # ``out_of_range`` tells whether the index labels lie outside the range
    # of positional indexing
    cases = [
        (Int64Index(range(5)), False),
        (RangeIndex(5), False),
        (Int64Index(range(5)) + 10, True),
    ]
    label_getters = [lambda x: x.loc, lambda x: x.ix]

    for index, out_of_range in cases:
        s = Series(range(5), index=index)

        # getitem
        for slc in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]:
            for getter in label_getters:
                with catch_warnings(record=True):
                    result = getter(s)[slc]

                # label indexing; empty when the labels are out of range
                indexer = slice(0, 0) if out_of_range else slice(3, 5)
                self.check(result, s, indexer, False)

            # plain [] is positional and rejects the float bounds
            msg = (
                "cannot do slice indexing"
                r" on {klass} with these indexers \[(3|4)\.0\] of"
                " {kind}".format(klass=type(index), kind=str(float))
            )
            with pytest.raises(TypeError, match=msg):
                s[slc]

        # getitem out-of-bounds
        for slc in [slice(-6, 6), slice(-6.0, 6.0)]:
            for getter in label_getters:
                with catch_warnings(record=True):
                    result = getter(s)[slc]

                # label indexing; empty when the labels are out of range
                indexer = slice(0, 0) if out_of_range else slice(-6, 6)
                self.check(result, s, indexer, False)

        # plain [] is positional and rejects the float bounds
        msg = (
            "cannot do slice indexing"
            r" on {klass} with these indexers \[-6\.0\] of"
            " {kind}".format(klass=type(index), kind=str(float))
        )
        with pytest.raises(TypeError, match=msg):
            s[slice(-6.0, 6.0)]

        # getitem odd floats
        odd_cases = [
            (slice(2.5, 4), slice(3, 5)),
            (slice(2, 3.5), slice(2, 4)),
            (slice(2.5, 3.5), slice(3, 4)),
        ]
        for slc, in_range_indexer in odd_cases:
            for getter in label_getters:
                with catch_warnings(record=True):
                    result = getter(s)[slc]

                indexer = slice(0, 0) if out_of_range else in_range_indexer
                self.check(result, s, indexer, False)

            # plain [] is positional and rejects the float bounds
            msg = (
                "cannot do slice indexing"
                r" on {klass} with these indexers \[(2|3)\.5\] of"
                " {kind}".format(klass=type(index), kind=str(float))
            )
            with pytest.raises(TypeError, match=msg):
                s[slc]

        # setitem
        for slc in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]:
            for getter in label_getters:
                target = s.copy()
                with catch_warnings(record=True):
                    getter(target)[slc] = 0
                    written = getter(target)[slc].values.ravel()
                assert (written == 0).all()

            # plain [] is positional and rejects the float bounds
            msg = (
                "cannot do slice indexing"
                r" on {klass} with these indexers \[(3|4)\.0\] of"
                " {kind}".format(klass=type(index), kind=str(float))
            )
            with pytest.raises(TypeError, match=msg):
                s[slc] = 0
def test_rank(self, datetime_series):
    """Exercise ``Series.rank`` against scipy's ``rankdata`` and fixed cases."""
    from scipy.stats import rankdata

    datetime_series[::2] = np.nan
    datetime_series[:10][::3] = 4.0

    # ranking must not depend on float vs. object dtype
    float_ranks = datetime_series.rank()
    object_ranks = datetime_series.astype("O").rank()
    tm.assert_series_equal(float_ranks, object_ranks)

    nan_mask = np.isnan(datetime_series)
    no_nans = datetime_series.fillna(np.inf)

    # rankdata returns a ndarray
    want = Series(rankdata(no_nans), index=no_nans.index, name="ts")
    want[nan_mask] = np.nan
    tm.assert_series_equal(float_ranks, want)

    # integer input ranks like its float counterpart
    ser = Series(np.arange(5).repeat(2))
    tm.assert_series_equal(ser.rank(), ser.astype(float).rank())

    # percentage ranks
    ser = Series(np.arange(5)) + 1.0
    want = ser / 5.0
    tm.assert_series_equal(ser.rank(pct=True), want)

    ser = Series(np.repeat(1, 100))
    want = Series(np.repeat(0.505, 100))
    tm.assert_series_equal(ser.rank(pct=True), want)

    ser[1] = np.nan
    want = Series(np.repeat(50.0 / 99.0, 100))
    want[1] = np.nan
    tm.assert_series_equal(ser.rank(pct=True), want)

    ser = Series(np.arange(5)) + 1.0
    ser[4] = np.nan
    want = ser / 4.0
    tm.assert_series_equal(ser.rank(pct=True), want)

    ser = Series(np.repeat(np.nan, 100))
    want = ser.copy()
    tm.assert_series_equal(ser.rank(pct=True), want)

    ser = Series(np.arange(5)) + 1
    ser[4] = np.nan
    want = ser / 4.0
    tm.assert_series_equal(ser.rank(pct=True), want)

    # same with a date index
    rng = date_range("1/1/1990", periods=5)
    ser = Series(np.arange(5), rng) + 1
    ser.iloc[4] = np.nan
    want = ser / 4.0
    tm.assert_series_equal(ser.rank(pct=True), want)

    # very small magnitudes
    ser = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
    want = Series([2, 1, 3, 5, 4, 6.0])
    tm.assert_series_equal(ser.rank(), want)

    # GH 5968
    ser = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]")
    want = Series([3, 2, 1, np.nan])
    tm.assert_series_equal(ser.rank(), want)

    # ``ordered`` is ascending, so a shuffled copy must rank as
    # permutation + 1
    ordered = np.array(
        [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
        dtype="float64",
    )
    shuffle = np.random.permutation(len(ordered))
    ser = Series(ordered[shuffle])
    want = Series(shuffle + 1.0, dtype="float64")
    tm.assert_series_equal(ser.rank(), want)
def test_drop_duplicates_categorical_non_bool(self, dtype, ordered):
    """``duplicated`` / ``drop_duplicates`` on categorical Series.

    Two inputs are checked for the default ``keep``, ``keep="last"`` and
    ``keep=False``, both through the returning and the ``inplace`` variant
    of ``drop_duplicates``.
    """
    categories = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype))

    def build(values):
        # build the categorical Series and pre-empt the flaky datetime case
        ser = Series(
            Categorical(values, categories=categories, ordered=ordered))
        if dtype == "datetime64[D]":
            # pre-empt flaky xfail, values are seemingly-random
            if not (np.array(ser) == values).all():
                pytest.xfail(reason="GH#7996")
        return ser

    def verify(ser, scenarios):
        # each scenario pairs the keyword arguments with the expected mask
        for keep_kwargs, flags in scenarios:
            mask = Series(flags)
            tm.assert_series_equal(ser.duplicated(**keep_kwargs), mask)
            tm.assert_series_equal(
                ser.drop_duplicates(**keep_kwargs), ser[~mask])
            in_place = ser.copy()
            in_place.drop_duplicates(inplace=True, **keep_kwargs)
            tm.assert_series_equal(in_place, ser[~mask])

    # Test case 1
    input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype))
    tc1 = build(input1)
    verify(tc1, [
        ({}, [False, False, False, True]),
        ({"keep": "last"}, [False, False, True, False]),
        ({"keep": False}, [False, False, True, True]),
    ])

    # Test case 2
    input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype))
    tc2 = build(input2)
    verify(tc2, [
        ({}, [False, False, False, False, True, True, False]),
        ({"keep": "last"}, [False, True, True, False, False, False, False]),
        ({"keep": False}, [False, True, True, False, True, True, False]),
    ])
def test_rank(self):
    """Exercise ``Series.rank`` against scipy's ``rankdata`` and fixed cases.

    Skipped when scipy is not installed.
    """
    # importorskip takes a *module* name. The former targets
    # 'scipy.stats.special' and 'scipy.stats.rankdata' are not importable
    # modules (rankdata is a function), so the test was always skipped.
    stats = pytest.importorskip('scipy.stats')
    rankdata = stats.rankdata

    self.ts[::2] = np.nan
    self.ts[:10][::3] = 4.

    # ranking must not depend on float vs. object dtype
    ranks = self.ts.rank()
    oranks = self.ts.astype('O').rank()

    assert_series_equal(ranks, oranks)

    mask = np.isnan(self.ts)
    filled = self.ts.fillna(np.inf)

    # rankdata returns a ndarray
    exp = Series(rankdata(filled), index=filled.index, name='ts')
    exp[mask] = np.nan

    tm.assert_series_equal(ranks, exp)

    # integer input ranks like its float counterpart
    iseries = Series(np.arange(5).repeat(2))

    iranks = iseries.rank()
    exp = iseries.astype(float).rank()
    assert_series_equal(iranks, exp)

    # percentage ranks
    iseries = Series(np.arange(5)) + 1.0
    exp = iseries / 5.0
    iranks = iseries.rank(pct=True)

    assert_series_equal(iranks, exp)

    iseries = Series(np.repeat(1, 100))
    exp = Series(np.repeat(0.505, 100))
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    iseries[1] = np.nan
    exp = Series(np.repeat(50.0 / 99.0, 100))
    exp[1] = np.nan
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    iseries = Series(np.arange(5)) + 1.0
    iseries[4] = np.nan
    exp = iseries / 4.0
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    iseries = Series(np.repeat(np.nan, 100))
    exp = iseries.copy()
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    iseries = Series(np.arange(5)) + 1
    iseries[4] = np.nan
    exp = iseries / 4.0
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    # same with a date index
    rng = date_range('1/1/1990', periods=5)
    iseries = Series(np.arange(5), rng) + 1
    iseries.iloc[4] = np.nan
    exp = iseries / 4.0
    iranks = iseries.rank(pct=True)
    assert_series_equal(iranks, exp)

    # very small magnitudes
    iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
    exp = Series([2, 1, 3, 5, 4, 6.0])
    iranks = iseries.rank()
    assert_series_equal(iranks, exp)

    # GH 5968
    iseries = Series(['3 day', '1 day 10m', '-2 day', NaT],
                     dtype='m8[ns]')
    exp = Series([3, 2, 1, np.nan])
    iranks = iseries.rank()
    assert_series_equal(iranks, exp)

    # ``values`` is ascending, so a shuffled copy must rank as
    # permutation + 1
    values = np.array(
        [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
        dtype='float64')
    random_order = np.random.permutation(len(values))
    iseries = Series(values[random_order])
    exp = Series(random_order + 1.0, dtype='float64')
    iranks = iseries.rank()
    assert_series_equal(iranks, exp)
def QA_fetch_get_factor_groupby(factor: pd.Series,
                                industry_cls: str = "sw_l1",
                                detailed: bool = False) -> pd.DataFrame:
    """
    Get the industry exposure of a factor.

    The return value is a pd.DataFrame holding the original factor values
    plus one extra column with the industry of each entry. The JoinQuant
    local SDK has to be imported and logged in beforehand.

    Parameters
    ---
    :param factor: factor values, indexed by ['date', 'asset']
    :param industry_cls: industry classification, Shenwan level 1 by default
    :param detailed: whether to use detailed mode (one industry lookup per
        factor date). Default False, i.e. only the industry information of
        the last factor date is fetched

    Returns
    ---
    :return: factor data with the columns "factor" and "group" (industry
        name, "NA" where the SDK returned none for the classification)
    """
    warnings.warn("请先自行导入聚宽本地 sdk 并登陆", UserWarning)
    # normalize the factor format
    factor = QA_fmt_factor(factor)
    merged_data = pd.DataFrame(factor.copy().rename("factor"))
    # normalize the stock codes
    stock_list = QA_fmt_code_list(
        factor.index.get_level_values("code").drop_duplicates(), style="jq")

    def _industry_names(raw):
        # Map every stock to its industry name under ``industry_cls``.
        # A stock missing from ``raw`` (or lacking the classification) maps
        # to "NA" instead of raising AttributeError on ``None``.
        return {
            s: ((raw.get(s) or {}).get(industry_cls)
                or {}).get("industry_name", "NA")
            for s in stock_list
        }

    if detailed:
        # detailed mode: one lookup per distinct factor date
        date_range = (factor.index.get_level_values(
            "datetime").drop_duplicates().tolist())
        industries = {
            d: _industry_names(jqdatasdk.get_industry(stock_list, d))
            for d in date_range
        }
    else:
        # non-detailed mode: industry data as of the last factor date only
        end_time = str(max(factor.index.get_level_values("datetime")))[:10]
        date_range = [pd.Timestamp(end_time)]
        raw = jqdatasdk.get_industry(stock_list, end_time)
        industries = {d: _industry_names(raw) for d in date_range}

    # rows are dates, columns are the first six characters of the codes
    df_local = pd.DataFrame(industries).T.sort_index()
    df_local.columns = df_local.columns.map(str).str.slice(0, 6)
    ss_local = df_local.stack(level=-1)
    ss_local.index.names = ["date", "code"]

    # NOTE(review): this column holds ``datetime.date`` objects while the
    # "date" level of ``ss_local`` holds the values of ``date_range``;
    # confirm the two align as intended in ``assign`` below.
    merged_data["date"] = merged_data.index.get_level_values("datetime").map(
        lambda x: x.date())
    merged_data = (merged_data.reset_index().set_index(
        ["date", "code"]).assign(group=ss_local).reset_index().set_index(
            ["datetime", "code"]).drop("date", axis=1))

    # a stock may have no industry information in the past; back-fill it
    # with the later industry information
    group = merged_data["group"].unstack().bfill().stack()
    merged_data["group"] = group
    return merged_data
def _plot_discrete(
    self,
    data: pd.Series,
    prop: str,
    lineages: Optional[Union[str, Sequence[str]]] = None,
    cluster_key: Optional[str] = None,
    same_plot: bool = True,
    title: Optional[Union[str, List[str]]] = None,
    **kwargs,
) -> None:
    """
    Plot the states for each uncovered lineage.

    Parameters
    ----------
    data
        Categorical series with the states to plot.
    prop
        Name of the plotted property. Used in error messages, to select the
        colors and in the default title.
    lineages
        Plot only these lineages. If `None`, plot all lineages.
    cluster_key
        Key from :paramref:`adata` ``.obs`` for plotting categorical observations.
    same_plot
        Whether to plot the lineages on the same plot or separately.
    title
        The title of the plot.
    %(basis)s
    **kwargs
        Keyword arguments for :func:`scvelo.pl.scatter`.

    Returns
    -------
    %(just_plots)s
    """

    if data is None:
        raise RuntimeError(
            f"Compute `.{prop}` first as `.{F.COMPUTE.fmt(prop)}()`.")
    if not is_categorical_dtype(data):
        raise TypeError(
            f"Expected property `.{prop}` to be categorical, found `{type(data).__name__!r}`."
        )

    # NOTE(review): the first branch compares against ``.s`` while the second
    # compares against ``.v``; confirm that this difference is intended.
    if prop in (P.ABS_PROBS.s, P.TERM.s):
        colors = getattr(self, A.TERM_COLORS.v, None)
    elif prop == P.MACRO.v:
        colors = getattr(self, A.MACRO_COLORS.v, None)
    else:
        colors = None
    if colors is None:
        # also covers a missing color attribute, which would otherwise make
        # the ``zip`` below fail on ``None``
        logg.debug("No colors found. Creating new ones")
        colors = _create_categorical_colors(len(data.cat.categories))
    colors = dict(zip(data.cat.categories, colors))

    if (
            lineages is not None
    ):  # these are states per-se, but I want to keep the arg names for dispatch the same
        if isinstance(lineages, str):
            lineages = [lineages]

        for state in lineages:
            if state not in data.cat.categories:
                raise ValueError(
                    f"Invalid state `{state!r}`. Valid options are `{list(data.cat.categories)}`."
                )

        # work on a copy so the caller's series keeps all of its states
        data = data.copy()
        to_remove = list(set(data.cat.categories) - set(lineages))

        if len(to_remove) == len(data.cat.categories):
            raise RuntimeError(
                "Nothing to plot because empty subset has been selected.")

        for state in to_remove:
            data[data == state] = np.nan
        # reassign instead of ``inplace=True``, which newer pandas no longer
        # accepts on the categorical accessor
        data = data.cat.remove_categories(to_remove)

    # normalize ``cluster_key`` to a list so it can be concatenated below
    if cluster_key is None:
        cluster_key = []
    elif isinstance(cluster_key, str):
        cluster_key = [cluster_key]
    if not isinstance(cluster_key, list):
        cluster_key = list(cluster_key)

    # a single category is always drawn on one plot
    same_plot = same_plot or len(data.cat.categories) == 1
    kwargs["legend_loc"] = kwargs.get("legend_loc", "on data")

    with RandomKeys(self.adata,
                    None if same_plot else len(data.cat.categories),
                    where="obs") as keys:
        if same_plot:
            key = keys[0]
            self.adata.obs[key] = data
            self.adata.uns[f"{key}_colors"] = [
                colors[c] for c in data.cat.categories
            ]

            if title is None:
                title = (
                    f"{prop.replace('_', ' ')} "
                    f"({Direction.BACKWARD if self.kernel.backward else Direction.FORWARD})"
                )
            if isinstance(title, str):
                title = [title]

            scv.pl.scatter(
                self.adata,
                title=cluster_key + title,
                color=cluster_key + keys,
                **_filter_kwargs(scv.pl.scatter, **kwargs),
            )
        else:
            # one temporary ``.obs`` column per category, each keeping only
            # the cells of that category
            for key, cat in zip(keys, data.cat.categories):
                d = data.copy()
                d[data != cat] = None
                d = d.cat.set_categories([cat])

                self.adata.obs[key] = d
                self.adata.uns[f"{key}_colors"] = [colors[cat]]

            scv.pl.scatter(
                self.adata,
                color=cluster_key + keys,
                title=(cluster_key + [
                    f"{_initial if self.kernel.backward else _terminal} state {c}"
                    for c in data.cat.categories
                ]) if title is None else title,
                **_filter_kwargs(scv.pl.scatter, **kwargs),
            )
def test_partial_setting(self):
    """Setting with a missing label enlarges the object (GH2578, GH 8473).

    Label-based setters (``[]``, ``.loc``, ``.ix``, ``.at``) are expected to
    append the new label, while the positional setters ``.iloc`` / ``.iat``
    are expected to raise ``IndexError`` when out of bounds.
    """

    # GH2578, allow ix and friends to partially set

    # series
    s_orig = Series([1, 2, 3])

    s = s_orig.copy()
    s[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5
    expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    # same, but appending a float value
    s = s_orig.copy()
    s[5] = 5.
    expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    s = s_orig.copy()
    s.loc[5] = 5.
    expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5])
    tm.assert_series_equal(s, expected)

    # iloc/iat raise
    s = s_orig.copy()

    with pytest.raises(IndexError):
        s.iloc[3] = 5.

    with pytest.raises(IndexError):
        s.iat[3] = 5.

    # ## frame ##

    df_orig = DataFrame(
        np.arange(6).reshape(3, 2), columns=['A', 'B'], dtype='int64')

    # iloc/iat raise
    df = df_orig.copy()

    with pytest.raises(IndexError):
        df.iloc[4, 2] = 5.

    with pytest.raises(IndexError):
        df.iat[4, 2] = 5.

    # row setting where it exists
    expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
    df = df_orig.copy()
    df.iloc[1] = df.iloc[2]
    tm.assert_frame_equal(df, expected)

    expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]}))
    df = df_orig.copy()
    df.loc[1] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # like 2578, partial setting with dtype preservation
    expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]}))
    df = df_orig.copy()
    df.loc[3] = df.loc[2]
    tm.assert_frame_equal(df, expected)

    # single dtype frame, overwrite
    expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]}))
    df = df_orig.copy()
    # catch_warnings(record=True) keeps warnings raised in the block from
    # propagating
    with catch_warnings(record=True):
        df.ix[:, 'B'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # mixed dtype frame, overwrite
    expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])}))
    df = df_orig.copy()
    df['B'] = df['B'].astype(np.float64)
    with catch_warnings(record=True):
        df.ix[:, 'B'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # single dtype frame, partial setting
    expected = df_orig.copy()
    # NOTE: ``df`` on the next line is still the frame left over from the
    # previous section; it is only rebound afterwards.
    expected['C'] = df['A']
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'C'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # mixed frame, partial setting
    # NOTE(review): the statements of this section are identical to the
    # previous one; no mixed-dtype frame is built here.
    expected = df_orig.copy()
    expected['C'] = df['A']
    df = df_orig.copy()
    with catch_warnings(record=True):
        df.ix[:, 'C'] = df.ix[:, 'A']
    tm.assert_frame_equal(df, expected)

    # GH 8473
    dates = date_range('1/1/2000', periods=8)
    df_orig = DataFrame(np.random.randn(8, 4), index=dates,
                        columns=['A', 'B', 'C', 'D'])

    # enlarge by one row: the label is one ``freq`` step past the last date
    expected = pd.concat(
        [df_orig, DataFrame({'A': 7}, index=[dates[-1] + dates.freq])],
        sort=True)
    df = df_orig.copy()
    df.loc[dates[-1] + dates.freq, 'A'] = 7
    tm.assert_frame_equal(df, expected)
    df = df_orig.copy()
    df.at[dates[-1] + dates.freq, 'A'] = 7
    tm.assert_frame_equal(df, expected)

    # enlarge by a new row and a new column (label 0) at the same time
    exp_other = DataFrame({0: 7}, index=[dates[-1] + dates.freq])
    expected = pd.concat([df_orig, exp_other], axis=1)

    df = df_orig.copy()
    df.loc[dates[-1] + dates.freq, 0] = 7
    tm.assert_frame_equal(df, expected)
    df = df_orig.copy()
    df.at[dates[-1] + dates.freq, 0] = 7
    tm.assert_frame_equal(df, expected)
def test_convert(self):
    """Exercise ``Series._convert`` with and without ``coerce=True``.

    Covers all-string input coerced to missing values, mixed object input,
    pass-through of already-typed values, forced numeric conversion,
    datetime handling and single-letter strings.
    """
    # Tests: All to nans, coerce, true
    # Test coercion returns correct type
    letters = Series(["a", "b", "c"])

    converted = letters._convert(datetime=True, coerce=True)
    tm.assert_series_equal(converted, Series([NaT] * 3))

    converted = letters._convert(numeric=True, coerce=True)
    tm.assert_series_equal(converted, Series([np.nan] * 3))

    expected = Series([NaT] * 3, dtype=np.dtype("m8[ns]"))
    converted = letters._convert(timedelta=True, coerce=True)
    tm.assert_series_equal(converted, expected)

    stamp = datetime(2001, 1, 1, 0, 0)
    delta = stamp - datetime(2000, 1, 1, 0, 0)

    # Test coercion with mixed types
    mixed = Series(["a", "3.1415", stamp, delta])

    converted = mixed._convert(datetime=True, coerce=True)
    tm.assert_series_equal(converted, Series([NaT, NaT, stamp, NaT]))

    converted = mixed._convert(numeric=True, coerce=True)
    tm.assert_series_equal(converted, Series([np.nan, 3.1415, np.nan, np.nan]))

    converted = mixed._convert(timedelta=True, coerce=True)
    expected = Series([NaT, NaT, NaT, delta], dtype=np.dtype("m8[ns]"))
    tm.assert_series_equal(converted, expected)

    # Test standard conversion returns original
    # (asserted for datetime and timedelta; numeric is expected to give NaN
    # for the non-numeric entries)
    tm.assert_series_equal(mixed._convert(datetime=True), mixed)
    converted = mixed._convert(numeric=True)
    tm.assert_series_equal(converted, Series([np.nan, 3.1415, np.nan, np.nan]))
    tm.assert_series_equal(mixed._convert(timedelta=True), mixed)

    # test pass-through and non-conversion when other types selected
    numeric_text = Series(["1.0", "2.0", "3.0"])
    converted = numeric_text._convert(datetime=True, numeric=True, timedelta=True)
    tm.assert_series_equal(converted, Series([1.0, 2.0, 3.0]))
    converted = numeric_text._convert(True, False, True)
    tm.assert_series_equal(converted, numeric_text)

    stamp_objs = Series([stamp, stamp], dtype="O")
    converted = stamp_objs._convert(datetime=True, numeric=True, timedelta=True)
    tm.assert_series_equal(converted, Series([stamp, stamp]))
    converted = stamp_objs._convert(datetime=False, numeric=True, timedelta=True)
    tm.assert_series_equal(converted, stamp_objs)

    delta_objs = Series([delta, delta], dtype="O")
    converted = delta_objs._convert(datetime=True, numeric=True, timedelta=True)
    tm.assert_series_equal(converted, Series([delta, delta]))
    converted = delta_objs._convert(True, True, False)
    tm.assert_series_equal(converted, delta_objs)

    floats = Series([1.0, 2, 3], index=["a", "b", "c"])
    tm.assert_series_equal(floats._convert(numeric=True), floats)

    # force numeric conversion
    for text in ("1", "1."):
        as_object = floats.copy().astype("O")
        as_object["a"] = text
        tm.assert_series_equal(as_object._convert(numeric=True), floats)

    as_object = floats.copy().astype("O")
    as_object["a"] = "garbled"
    converted = as_object._convert(numeric=True)
    expected = floats.copy()
    expected["a"] = np.nan
    tm.assert_series_equal(converted, expected)

    # GH 4119, not converting a mixed type (e.g.floats and object)
    expected = Series([1, np.nan, 3, 4])
    for filler in ("na", ""):
        with_filler = Series([1, filler, 3, 4])
        converted = with_filler._convert(datetime=True, numeric=True)
        tm.assert_series_equal(converted, expected)

    # dates
    first_days = [datetime(2001, 1, day, 0, 0) for day in (1, 2, 3)]
    first_days_ts = [
        Timestamp("20010101"),
        Timestamp("20010102"),
        Timestamp("20010103"),
    ]
    date_ser = Series(first_days)
    noisy_dates = Series(
        first_days + ["foo", 1.0, 1, Timestamp("20010104"), "20010105"],
        dtype="O",
    )

    expected = Series(first_days_ts, dtype="M8[ns]")
    tm.assert_series_equal(date_ser._convert(datetime=True), expected)
    tm.assert_series_equal(date_ser._convert(datetime=True, coerce=True), expected)

    expected = Series(
        first_days_ts
        + [NaT, NaT, NaT, Timestamp("20010104"), Timestamp("20010105")],
        dtype="M8[ns]",
    )
    converted = noisy_dates._convert(
        datetime=True, numeric=False, timedelta=False, coerce=True
    )
    tm.assert_series_equal(converted, expected)
    converted = noisy_dates._convert(datetime=True, coerce=True)
    tm.assert_series_equal(converted, expected)

    text_and_numbers = Series(["foo", "bar", 1, 1.0], dtype="O")
    converted = text_and_numbers._convert(datetime=True, coerce=True)
    expected = Series([NaT] * 2 + [Timestamp(1)] * 2)
    tm.assert_series_equal(converted, expected)

    # preserve if non-object
    float32_ser = Series([1], dtype="float32")
    converted = float32_ser._convert(datetime=True, coerce=True)
    tm.assert_series_equal(converted, float32_ser)

    # FIXME: dont leave commented-out
    # r = s.copy()
    # r[0] = np.nan
    # result = r._convert(convert_dates=True,convert_numeric=False)
    # assert result.dtype == 'M8[ns]'

    # dateutil parses some single letters into today's value as a date
    expected = Series([NaT])
    for letter in "abcdefghijklmnopqrstuvwxyz":
        # lower case first, then upper case, for every letter
        for candidate in (letter, letter.upper()):
            single = Series([candidate])
            converted = single._convert(datetime=True, coerce=True)
            tm.assert_series_equal(converted, expected)
def test_convert(self):
    """Exercise ``Series._convert`` on mixed, typed and numeric-like input.

    GH#10265. Every call here uses only the ``datetime`` / ``numeric`` /
    ``timedelta`` switches, passed by keyword or positionally.
    """
    stamp = datetime(2001, 1, 1, 0, 0)
    delta = stamp - datetime(2000, 1, 1, 0, 0)

    # Test coercion with mixed types
    mixed = Series(["a", "3.1415", stamp, delta])

    got = mixed._convert(numeric=True)
    tm.assert_series_equal(got, Series([np.nan, 3.1415, np.nan, np.nan]))

    # Test standard conversion returns original
    got = mixed._convert(datetime=True)
    tm.assert_series_equal(got, mixed)

    # same call and expectation as the first numeric check above
    got = mixed._convert(numeric=True)
    tm.assert_series_equal(got, Series([np.nan, 3.1415, np.nan, np.nan]))

    got = mixed._convert(timedelta=True)
    tm.assert_series_equal(got, mixed)

    # test pass-through and non-conversion when other types selected
    numeric_text = Series(["1.0", "2.0", "3.0"])
    got = numeric_text._convert(datetime=True, numeric=True, timedelta=True)
    tm.assert_series_equal(got, Series([1.0, 2.0, 3.0]))
    got = numeric_text._convert(True, False, True)
    tm.assert_series_equal(got, numeric_text)

    stamp_objs = Series([stamp, stamp], dtype="O")
    got = stamp_objs._convert(datetime=True, numeric=True, timedelta=True)
    tm.assert_series_equal(got, Series([stamp, stamp]))
    got = stamp_objs._convert(datetime=False, numeric=True, timedelta=True)
    tm.assert_series_equal(got, stamp_objs)

    delta_objs = Series([delta, delta], dtype="O")
    got = delta_objs._convert(datetime=True, numeric=True, timedelta=True)
    tm.assert_series_equal(got, Series([delta, delta]))
    got = delta_objs._convert(True, True, False)
    tm.assert_series_equal(got, delta_objs)

    floats = Series([1.0, 2, 3], index=["a", "b", "c"])
    got = floats._convert(numeric=True)
    tm.assert_series_equal(got, floats)

    # force numeric conversion
    for text in ("1", "1."):
        as_object = floats.copy().astype("O")
        as_object["a"] = text
        got = as_object._convert(numeric=True)
        tm.assert_series_equal(got, floats)

    as_object = floats.copy().astype("O")
    as_object["a"] = "garbled"
    got = as_object._convert(numeric=True)
    wanted = floats.copy()
    wanted["a"] = np.nan
    tm.assert_series_equal(got, wanted)

    # GH 4119, not converting a mixed type (e.g.floats and object)
    wanted = Series([1, np.nan, 3, 4])
    for filler in ("na", ""):
        with_filler = Series([1, filler, 3, 4])
        got = with_filler._convert(datetime=True, numeric=True)
        tm.assert_series_equal(got, wanted)

    # dates
    date_ser = Series([datetime(2001, 1, day, 0, 0) for day in (1, 2, 3)])
    wanted = Series(
        [Timestamp("20010101"), Timestamp("20010102"), Timestamp("20010103")],
        dtype="M8[ns]",
    )
    got = date_ser._convert(datetime=True)
    tm.assert_series_equal(got, wanted)

    # same call and expectation as the check directly above
    got = date_ser._convert(datetime=True)
    tm.assert_series_equal(got, wanted)

    # preserve if non-object
    float32_ser = Series([1], dtype="float32")
    got = float32_ser._convert(datetime=True)
    tm.assert_series_equal(got, float32_ser)