def test_patsy_lazy_dict():
    """Fit and predict through the formula interface with a lazy dict.

    ``LazyDict`` stores no keys of its own: every lookup falls through to
    ``__missing__``, which returns the wrapped column as a numpy array.
    The round trip is checked once on complete data and once with a
    missing ``INCOME`` value in row 0.
    """
    class LazyDict(dict):
        def __init__(self, data):
            self.data = data

        def __missing__(self, key):
            return np.array(self.data[key])

    data = cpunish.load_pandas().data
    data = LazyDict(data)
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()
    res2 = res.predict(data)
    npt.assert_allclose(res.fittedvalues, res2)

    data = cpunish.load_pandas().data
    # Assign with a single .loc call.  The chained form
    # ``data['INCOME'].loc[0] = None`` may write to a temporary copy and
    # leave ``data`` unchanged, which would make the missing-value part of
    # this test vacuous.
    data.loc[0, 'INCOME'] = None
    data = LazyDict(data)
    # Give the lazy container an ``index`` attribute taken from a fresh
    # copy of the dataset.
    # NOTE(review): presumably used by predict to label its output - confirm.
    data.index = cpunish.load_pandas().data.index
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()
    res2 = res.predict(data)
    assert_equal(res.fittedvalues, res2)  # Should lose a record
    assert_equal(len(res2) + 1, len(cpunish.load_pandas().data))
def test_pandas_nodates_index():
    """Build a TimeSeriesModel on an index without frequency information.

    The model is constructed from a series indexed by integer-derived
    timestamps, the construction is expected to emit a ``ValueWarning``
    matching "No frequency information", and five prediction dates are
    expected for the range 0..4.
    """
    values = [988, 819, 964]
    labels = ['a', 'b', 'c']
    # A string-labelled series is still constructed here, although the
    # check that used it is disabled.
    series = pd.Series(values, index=labels)
    # TODO: Remove this, this is now valid
    # npt.assert_raises(ValueError, TimeSeriesModel, s)

    # A non-date index that does not raise an exception because it can be
    # coerced into a nanosecond DatetimeIndex.
    values = [988, 819, 964]
    # index=pd.date_range('1970-01-01', periods=3, freq='QS')
    ns_index = pd.to_datetime([100, 101, 102])
    series = pd.Series(values, index=ns_index)

    first = ns_index[0]
    stamp = first.strftime('%Y-%m-%d %H:%M:%S.%f') + str(first.value)
    assert_equal(stamp, '1970-01-01 00:00:00.000000100')

    with pytest.warns(ValueWarning, match="No frequency information"):
        model = TimeSeriesModel(series)

    # The four-way unpack also checks the shape of the returned tuple.
    _start, _end, _out_of_sample, _ = model._get_prediction_index(0, 4)
    n_predicted = len(model.data.predict_dates)
    assert_equal(n_predicted, 5)
def test_pandas_nodates_index():
    """Build a TimeSeriesModel on a nanosecond index coerced from integers.

    The nanosecond checks only run when the ``'N'`` offset probe succeeds;
    otherwise the test returns early without asserting anything.
    """
    data = [988, 819, 964]
    dates = ['a', 'b', 'c']
    s = pd.Series(data, index=dates)

    # TODO: Remove this, this is now valid
    # npt.assert_raises(ValueError, TimeSeriesModel, s)

    # Test with a non-date index that doesn't raise an exception because it
    # can be coerced into a nanosecond DatetimeIndex
    # (This test doesn't make sense for Numpy < 1.7 since they don't have
    # nanosecond support)
    # (This test also doesn't make sense for Pandas < 0.14 since we don't
    # support nanosecond index in Pandas < 0.14)
    try:
        # Check for Numpy < 1.7
        pd.to_offset('N')
    except Exception:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        # NOTE(review): if ``pd.to_offset`` is not exposed by the installed
        # pandas, the AttributeError also lands here and the checks below
        # never run - confirm the probe is still valid.
        return

    data = [988, 819, 964]
    # index=pd.date_range('1970-01-01', periods=3, freq='QS')
    index = pd.to_datetime([100, 101, 102])
    s = pd.Series(data, index=index)

    actual_str = (index[0].strftime('%Y-%m-%d %H:%M:%S.%f') +
                  str(index[0].value))
    assert_equal(actual_str, '1970-01-01 00:00:00.000000100')
    mod = TimeSeriesModel(s)
    start, end, out_of_sample, _ = mod._get_prediction_index(0, 4)
    assert_equal(len(mod.data.predict_dates), 5)
def test_formula_labels():
    """Check that index labels pass through patsy unchanged.

    The space-separated table below (``data(Duncan)`` from ``car`` in R)
    is parsed with ``read_table`` and the index of the fitted values is
    compared with the index of the parsed frame.
    """
    from pandas import read_table

    # One entry per line of the table; joined with newlines below.
    rows = (
        '"type" "income" "education" "prestige"',
        '"accountant" "prof" 62 86 82',
        '"pilot" "prof" 72 76 83',
        '"architect" "prof" 75 92 90',
        '"author" "prof" 55 90 76',
        '"chemist" "prof" 64 86 90',
        '"minister" "prof" 21 84 87',
        '"professor" "prof" 64 93 93',
        '"dentist" "prof" 80 100 90',
        '"reporter" "wc" 67 87 52',
        '"engineer" "prof" 72 86 88',
        '"undertaker" "prof" 42 74 57',
        '"lawyer" "prof" 76 98 89',
        '"physician" "prof" 76 97 97',
        '"welfare.worker" "prof" 41 84 59',
        '"teacher" "prof" 48 91 73',
        '"conductor" "wc" 76 34 38',
        '"contractor" "prof" 53 45 76',
        '"factory.owner" "prof" 60 56 81',
        '"store.manager" "prof" 42 44 45',
        '"banker" "prof" 78 82 92',
        '"bookkeeper" "wc" 29 72 39',
        '"mail.carrier" "wc" 48 55 34',
        '"insurance.agent" "wc" 55 71 41',
        '"store.clerk" "wc" 29 50 16',
        '"carpenter" "bc" 21 23 33',
        '"electrician" "bc" 47 39 53',
        '"RR.engineer" "bc" 81 28 67',
        '"machinist" "bc" 36 32 57',
        '"auto.repairman" "bc" 22 22 26',
        '"plumber" "bc" 44 25 29',
        '"gas.stn.attendant" "bc" 15 29 10',
        '"coal.miner" "bc" 7 7 15',
        '"streetcar.motorman" "bc" 42 26 19',
        '"taxi.driver" "bc" 9 19 10',
        '"truck.driver" "bc" 21 15 13',
        '"machine.operator" "bc" 21 20 24',
        '"barber" "bc" 16 26 20',
        '"bartender" "bc" 16 28 7',
        '"shoe.shiner" "bc" 9 17 3',
        '"cook" "bc" 14 22 16',
        '"soda.clerk" "bc" 12 30 6',
        '"watchman" "bc" 17 25 11',
        '"janitor" "bc" 7 20 8',
        '"policeman" "bc" 34 47 41',
        '"waiter" "bc" 8 32 10',
    )
    buffer = StringIO("\n".join(rows))
    frame = read_table(buffer, sep=" ")

    fitted = ols("prestige ~ income + education", frame).fit()
    assert_equal(fitted.fittedvalues.index, frame.index)
def test_formula_labels():
    """Check that index labels pass through patsy unchanged.

    The comma-separated table below (``data(Duncan)`` from ``car`` in R)
    is parsed with ``read_csv`` and the index of the fitted values is
    compared with the index of the parsed frame.
    """
    from pandas import read_csv

    # One entry per line of the table; joined with newlines below.
    rows = (
        '"type","income","education","prestige"',
        '"accountant","prof",62,86,82',
        '"pilot","prof",72,76,83',
        '"architect","prof",75,92,90',
        '"author","prof",55,90,76',
        '"chemist","prof",64,86,90',
        '"minister","prof",21,84,87',
        '"professor","prof",64,93,93',
        '"dentist","prof",80,100,90',
        '"reporter","wc",67,87,52',
        '"engineer","prof",72,86,88',
        '"undertaker","prof",42,74,57',
        '"lawyer","prof",76,98,89',
        '"physician","prof",76,97,97',
        '"welfare.worker","prof",41,84,59',
        '"teacher","prof",48,91,73',
        '"conductor","wc",76,34,38',
        '"contractor","prof",53,45,76',
        '"factory.owner","prof",60,56,81',
        '"store.manager","prof",42,44,45',
        '"banker","prof",78,82,92',
        '"bookkeeper","wc",29,72,39',
        '"mail.carrier","wc",48,55,34',
        '"insurance.agent","wc",55,71,41',
        '"store.clerk","wc",29,50,16',
        '"carpenter","bc",21,23,33',
        '"electrician","bc",47,39,53',
        '"RR.engineer","bc",81,28,67',
        '"machinist","bc",36,32,57',
        '"auto.repairman","bc",22,22,26',
        '"plumber","bc",44,25,29',
        '"gas.stn.attendant","bc",15,29,10',
        '"coal.miner","bc",7,7,15',
        '"streetcar.motorman","bc",42,26,19',
        '"taxi.driver","bc",9,19,10',
        '"truck.driver","bc",21,15,13',
        '"machine.operator","bc",21,20,24',
        '"barber","bc",16,26,20',
        '"bartender","bc",16,28,7',
        '"shoe.shiner","bc",9,17,3',
        '"cook","bc",14,22,16',
        '"soda.clerk","bc",12,30,6',
        '"watchman","bc",17,25,11',
        '"janitor","bc",7,20,8',
        '"policeman","bc",34,47,41',
        '"waiter","bc",8,32,10',
    )
    buffer = StringIO("\n".join(rows))
    frame = read_csv(buffer)

    fitted = ols("prestige ~ income + education", frame).fit()
    assert_equal(fitted.fittedvalues.index, frame.index)
def test_ar_select_order_tstat():
    """Expect lag order 0 from t-stat selection on seeded random draws."""
    n_obs = 25
    rng = np.random.RandomState(123)
    draws = rng.randn(n_obs)

    time_index = date_range(start='1/1/1990', periods=n_obs, freq='M')
    series = Series(draws, index=time_index)

    model = AR(series)
    chosen_lag = model.select_order(maxlag=5, ic='t-stat')
    assert_equal(chosen_lag, 0)
def test_period_index():
    """Check that a monthly PeriodIndex is accepted and its freq retained.

    Regression test for issue 1285.
    """
    # test 1285
    # Build the index with ``pd.period_range``; constructing a PeriodIndex
    # directly from ``start``/``periods`` keywords is not accepted by
    # current pandas releases.
    dates = pd.period_range(start="1/1/1990", periods=20, freq="M")
    x = np.arange(1, 21.)

    model = TimeSeriesModel(pd.Series(x, index=dates))
    assert_equal(model._index.freqstr, "M")
    model = TimeSeriesModel(pd.Series(x, index=dates))
    npt.assert_(model.data.freq == "M")
def test_ar_select_order_tstat():
    """Expect lag order 0 from t-stat selection on seeded random draws.

    Both constructing ``AR`` and calling ``select_order`` are expected to
    emit a ``FutureWarning``.
    """
    n_obs = 25
    rng = np.random.RandomState(123)
    draws = rng.randn(n_obs)

    time_index = date_range(start="1/1/1990", periods=n_obs, freq="M")
    series = Series(draws, index=time_index)

    with pytest.warns(FutureWarning):
        model = AR(series)
    with pytest.warns(FutureWarning):
        chosen_lag = model.select_order(maxlag=5, ic="t-stat")
    assert_equal(chosen_lag, 0)
def test_ar_dates():
    """Smoke-test date handling in AR fit and predict on the sunspots data.

    Prediction dates stored on the model and the index of the returned
    prediction must both equal the expected annual range.
    """
    # just make sure they work
    sunspots = sm.datasets.sunspots.load(as_pandas=False)
    n_obs = len(sunspots.endog)
    annual_index = date_range(start='1700', periods=n_obs, freq='A')
    endog = Series(sunspots.endog, index=annual_index)

    # Expected dates: first 11 annual stamps starting from 2005.
    expected = date_range(start='2005', end='2016', freq='A')[:11]

    model = sm.tsa.AR(endog, freq='A')
    fitted = model.fit(maxlag=9, method='mle', disp=-1)
    forecast = fitted.predict(start='2005', end='2015')

    assert_equal(fitted.data.predict_dates, expected)
    assert_equal(forecast.index, expected)
def test_ar_dates():
    """Smoke-test date handling in AR fit and predict on the sunspots data.

    Dates are generated with ``dates_from_range``; the expected prediction
    dates are wrapped in a ``DatetimeIndex`` with an inferred frequency.
    """
    # just make sure they work
    datetools = sm.tsa.datetools
    sunspots = sm.datasets.sunspots.load()
    n_obs = len(sunspots.endog)
    sample_dates = datetools.dates_from_range('1700', length=n_obs)
    endog = Series(sunspots.endog, index=sample_dates)

    model = sm.tsa.AR(endog, freq='A')
    fitted = model.fit(maxlag=9, method='mle', disp=-1)
    forecast = fitted.predict(start='2005', end='2015')

    expected = DatetimeIndex(datetools.dates_from_range('2005', '2015'),
                             freq='infer')
    assert_equal(fitted.data.predict_dates, expected)
    assert_equal(forecast.index, expected)
def test_ar_dates():
    """Smoke-test date handling in AR fit and predict on the sunspots data.

    The dates stored on the fitted model after ``predict`` and the index of
    the prediction itself must both match the expected 2005-2015 dates.
    """
    # just make sure they work
    sunspots = sm.datasets.sunspots.load()
    make_dates = sm.tsa.datetools.dates_from_range

    endog = Series(
        sunspots.endog,
        index=make_dates("1700", length=len(sunspots.endog)),
    )

    fitted = sm.tsa.AR(endog, freq="A").fit(
        maxlag=9, method="mle", disp=-1
    )
    forecast = fitted.predict(start="2005", end="2015")

    raw_expected = make_dates("2005", "2015")
    expected = DatetimeIndex(raw_expected, freq="infer")
    assert_equal(fitted.data.predict_dates, expected)
    assert_equal(forecast.index, expected)
def test_get_predict_start_end():
    """Check ``_get_prediction_index`` for equivalent start/end spellings.

    The start and end of the prediction range are each given as an integer
    position, a date string, a ``datetime`` and an index element; every
    combination must yield the same first three return values.
    """
    # Use ``pd.date_range``: building a DatetimeIndex directly from
    # ``start``/``end`` keywords is not accepted by current pandas releases.
    index = pd.date_range(start='1970-01-01', end='1990-01-01', freq='AS')
    # Only the first 10 stamps are in-sample; later ones are out of sample.
    endog = pd.Series(np.zeros(10), index[:10])
    model = TimeSeriesModel(endog)

    predict_starts = [1, '1971-01-01', datetime(1971, 1, 1), index[1]]
    predict_ends = [20, '1990-01-01', datetime(1990, 1, 1), index[-1]]

    desired = (1, 9, 11)
    for start in predict_starts:
        for end in predict_ends:
            assert_equal(model._get_prediction_index(start, end)[:3], desired)
def test_ar_dates():
    """Smoke-test date handling in AR fit and predict on the sunspots data.

    Constructing and fitting the model is expected to emit a
    ``FutureWarning``.
    """
    # just make sure they work
    sunspots = sm.datasets.sunspots.load(as_pandas=False)
    n_obs = len(sunspots.endog)
    annual_index = date_range(start="1700", periods=n_obs, freq="A")
    endog = Series(sunspots.endog, index=annual_index)

    with pytest.warns(FutureWarning):
        model = AR(endog, freq="A")
        fitted = model.fit(maxlag=9, method="mle", disp=-1)
    forecast = fitted.predict(start="2005", end="2015")

    # Expected dates: first 11 annual stamps starting from 2005.
    expected = date_range(start="2005", end="2016", freq="A")[:11]
    assert_equal(fitted.data.predict_dates, expected)
    assert_equal(forecast.index, expected)
def test_get_predict_start_end():
    """Check ``_get_prediction_index`` for equivalent start/end spellings.

    Each start and end is given as an integer position, a date string, a
    ``datetime`` and an index element; all sixteen combinations must give
    ``(1, 9, 11)`` as the first three return values.
    """
    full_index = pd.date_range(start='1970-01-01', end='1990-01-01',
                               freq='AS')
    in_sample = full_index[:10]
    model = TimeSeriesModel(pd.Series(np.zeros(10), in_sample))

    start_variants = [1, '1971-01-01', datetime(1971, 1, 1), full_index[1]]
    end_variants = [20, '1990-01-01', datetime(1990, 1, 1), full_index[-1]]
    expected = (1, 9, 11)

    # Same traversal order as nested loops: starts outer, ends inner.
    combos = [(begin, finish)
              for begin in start_variants
              for finish in end_variants]
    for begin, finish in combos:
        located = model._get_prediction_index(begin, finish)
        assert_equal(located[:3], expected)
def test_pandas_dates():
    """Check that the model's dates equal the datetime-converted index.

    The model is built from a DataFrame column whose index is a
    ``DatetimeIndex`` with ``freq='MS'``; its ``data.dates`` must equal the
    index obtained from ``pd.to_datetime`` on the same strings.
    """
    prices = [988, 819, 964]
    stamps = ['2016-01-01 12:00:00',
              '2016-02-01 12:00:00',
              '2016-03-01 12:00:00']

    # Reference series, indexed by the plain to_datetime conversion.
    reference = pd.Series(data=prices, index=pd.to_datetime(stamps),
                          name='price')

    freq_index = pd.DatetimeIndex(stamps, freq='MS')
    frame = pd.DataFrame(data={'price': prices}, index=freq_index)

    model = TimeSeriesModel(frame['price'])
    assert_equal(model.data.dates, reference.index)
def test_ar_dates():
    """Smoke-test date handling in AR fit and predict on the sunspots data.

    ``DatetimeIndex`` is imported locally and used to wrap the expected
    prediction dates with an inferred frequency.
    """
    # just make sure they work
    from pandas import DatetimeIndex  # pylint: disable-msg=E0611

    datetools = sm.tsa.datetools
    sunspots = sm.datasets.sunspots.load()
    sample_dates = datetools.dates_from_range('1700',
                                              length=len(sunspots.endog))
    endog = Series(sunspots.endog, index=sample_dates)

    model = sm.tsa.AR(endog, freq='A')
    fitted = model.fit(maxlag=9, method='mle', disp=-1)
    forecast = fitted.predict(start='2005', end='2015')

    raw_expected = datetools.dates_from_range('2005', '2015')
    expected = DatetimeIndex(raw_expected, freq='infer')
    assert_equal(fitted.data.predict_dates, expected)
    assert_equal(forecast.index, expected)
def test_patsy_missing_data():
    """Check missing-value handling for formula-based fit and predict.

    A missing ``INCOME`` value is placed in row 0, the model is fitted
    through the formula interface, and predictions are compared for pandas
    input and for record-array input.
    """
    # Test pandas-style first
    data = cpunish.load_pandas().data
    # Assign with a single .loc call.  The chained form
    # ``data['INCOME'].loc[0] = None`` may write to a temporary copy and
    # leave ``data`` unchanged.
    data.loc[0, 'INCOME'] = None
    res = ols('EXECUTIONS ~ SOUTH + INCOME', data=data).fit()
    res2 = res.predict(data)
    # First record will be dropped during fit, but not during predict
    assert_equal(res.fittedvalues, res2[1:])

    # Non-pandas version
    data = cpunish.load_pandas().data
    data.loc[0, 'INCOME'] = None
    data = data.to_records(index=False)
    with warnings.catch_warnings(record=True) as w:
        # Record every warning so the most recent one can be inspected.
        warnings.simplefilter("always")
        res2 = res.predict(data)
        assert 'ValueWarning' in repr(w[-1].message)
        assert 'nan values have been dropped' in repr(w[-1].message)
    # First record will be dropped in both cases
    assert_equal(res.fittedvalues, res2)
def test_predict_freq():
    """Check that predicted dates keep the frequency of the input index."""
    # test that predicted dates have same frequency
    freq = "A-APR"
    values = np.arange(1, 36.)
    # there's a bug in pandas up to 0.10.2 for YearBegin, hence "A-APR"
    # rather than "AS-APR" throughout:
    #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
    sample_index = pd.date_range("1972-4-30", "2006-4-30", freq=freq)
    model = TimeSeriesModel(pd.Series(values, index=sample_index))
    #npt.assert_(model.data.freq == "AS-APR")
    assert_equal(model._index.freqstr, freq)

    # The four-way unpack also checks the shape of the returned tuple.
    _start, _end, _out_of_sample, _ = model._get_prediction_index(
        "2006-4-30", "2016-4-30")

    #expected_dates = date_range("2006-12-31", "2016-12-31",
    #                            freq="AS-APR")
    expected = pd.date_range("2006-4-30", "2016-4-30", freq=freq)
    assert_equal(model.data.predict_dates, expected)
def test_predict_freq():
    """Check that predicted dates keep the frequency of the input index.

    The model is built on an ``"A-APR"`` index and the prediction dates
    for 2006-2016 must equal a date range with the same frequency.
    """
    # test that predicted dates have same frequency
    observations = np.arange(1, 36.)

    # there's a bug in pandas up to 0.10.2 for YearBegin
    #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
    history = pd.date_range("1972-4-30", "2006-4-30", freq="A-APR")
    ts_model = TimeSeriesModel(pd.Series(observations, index=history))

    #npt.assert_(model.data.freq == "AS-APR")
    freqstr = ts_model._index.freqstr
    assert_equal(freqstr, "A-APR")

    first_date, last_date = "2006-4-30", "2016-4-30"
    # The four-way unpack also checks the shape of the returned tuple.
    _s, _e, _oos, _ = ts_model._get_prediction_index(first_date, last_date)
    actual_dates = ts_model.data.predict_dates

    #expected_dates = date_range("2006-12-31", "2016-12-31",
    #                            freq="AS-APR")
    wanted_dates = pd.date_range(first_date, last_date, freq="A-APR")
    assert_equal(actual_dates, wanted_dates)
def test_predict_freq():
    """Check that predicted dates keep the frequency of the input index.

    Uses the separate ``_get_predict_start`` / ``_get_predict_end`` /
    ``_make_predict_dates`` calls, in that order.
    """
    # test that predicted dates have same frequency
    freq = "A-APR"
    values = np.arange(1, 36.)
    # there's a bug in pandas up to 0.10.2 for YearBegin
    #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
    sample_index = date_range("1972-4-30", "2006-4-30", freq=freq)
    model = TimeSeriesModel(Series(values, index=sample_index))
    #npt.assert_(model.data.freq == "AS-APR")
    npt.assert_(model.data.freq == freq)

    # Return values are not inspected; the calls are made in this order
    # before the prediction dates are built.
    _start = model._get_predict_start("2006-4-30")
    _end = model._get_predict_end("2016-4-30")
    model._make_predict_dates()

    #expected_dates = date_range("2006-12-31", "2016-12-31",
    #                            freq="AS-APR")
    expected = date_range("2006-4-30", "2016-4-30", freq=freq)
    assert_equal(model.data.predict_dates, expected)
def test_pandas_nodates_index():
    """Check TimeSeriesModel behaviour for non-date indexes.

    A string index must raise ``ValueError``.  An integer-derived
    nanosecond index is then tested only when the ``'N'`` frequency lookup
    succeeds; otherwise the test returns early.
    """
    data = [988, 819, 964]
    dates = ['a', 'b', 'c']
    s = pd.Series(data, index=dates)

    npt.assert_raises(ValueError, TimeSeriesModel, s)

    # Test with a non-date index that doesn't raise an exception because it
    # can be coerced into a nanosecond DatetimeIndex
    # (This test doesn't make sense for Numpy < 1.7 since they don't have
    # nanosecond support)
    # (This test also doesn't make sense for Pandas < 0.14 since we don't
    # support nanosecond index in Pandas < 0.14)
    try:
        # Check for Numpy < 1.7
        _freq_to_pandas['N']
    except Exception:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return

    data = [988, 819, 964]
    # index=pd.date_range('1970-01-01', periods=3, freq='QS')
    index = pd.to_datetime([100, 101, 102])
    s = pd.Series(data, index=index)

    # Alternate test for Pandas < 0.14
    # NOTE(review): ``distutils`` is no longer shipped with recent Python
    # releases - confirm this import is still available where this runs.
    from distutils.version import LooseVersion
    from pandas import __version__ as pd_version
    if LooseVersion(pd_version) < '0.14':
        # Use ``npt.assert_raises`` for consistency with the check at the
        # top of this test.
        npt.assert_raises(NotImplementedError, TimeSeriesModel, s)
    else:
        actual_str = (index[0].strftime('%Y-%m-%d %H:%M:%S.%f') +
                      str(index[0].value))
        assert_equal(actual_str, '1970-01-01 00:00:00.000000100')
        mod = TimeSeriesModel(s)
        start = mod._get_predict_start(0)
        end, out_of_sample = mod._get_predict_end(4)
        mod._make_predict_dates()
        assert_equal(len(mod.data.predict_dates), 5)
def test_pandas_nodates_index():
    """Build a TimeSeriesModel on a nanosecond index coerced from integers.

    Five prediction dates are expected for the range 0..4.
    """
    values = [988, 819, 964]
    labels = ['a', 'b', 'c']
    # A string-labelled series is still constructed here, although the
    # check that used it is disabled.
    series = pd.Series(values, index=labels)
    # TODO: Remove this, this is now valid
    # npt.assert_raises(ValueError, TimeSeriesModel, s)

    # A non-date index that doesn't raise an exception because it can be
    # coerced into a nanosecond DatetimeIndex.
    values = [988, 819, 964]
    # index=pd.date_range('1970-01-01', periods=3, freq='QS')
    ns_index = pd.to_datetime([100, 101, 102])
    series = pd.Series(values, index=ns_index)

    head = ns_index[0]
    formatted = head.strftime('%Y-%m-%d %H:%M:%S.%f')
    assert_equal(formatted + str(head.value),
                 '1970-01-01 00:00:00.000000100')

    model = TimeSeriesModel(series)
    # The four-way unpack also checks the shape of the returned tuple.
    _start, _end, _out_of_sample, _ = model._get_prediction_index(0, 4)
    assert_equal(len(model.data.predict_dates), 5)
def test_predict_freq():
    """Check that predicted dates keep the frequency of the input index.

    The model's ``data.freq`` must be ``"A-APR"`` and the prediction dates
    for 2006-2016 must equal a date range with that frequency.
    """
    # test that predicted dates have same frequency
    observations = np.arange(1, 36.)

    # there's a bug in pandas up to 0.10.2 for YearBegin
    #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR")
    history = date_range("1972-4-30", "2006-4-30", freq="A-APR")
    ts_model = TimeSeriesModel(Series(observations, index=history))

    #npt.assert_(model.data.freq == "AS-APR")
    has_expected_freq = ts_model.data.freq == "A-APR"
    npt.assert_(has_expected_freq)

    first_date, last_date = "2006-4-30", "2016-4-30"
    # Return values are not inspected; the calls are made in this order
    # before the prediction dates are built.
    _first = ts_model._get_predict_start(first_date)
    _last = ts_model._get_predict_end(last_date)
    ts_model._make_predict_dates()
    actual_dates = ts_model.data.predict_dates

    #expected_dates = date_range("2006-12-31", "2016-12-31",
    #                            freq="AS-APR")
    wanted_dates = date_range(first_date, last_date, freq="A-APR")
    assert_equal(actual_dates, wanted_dates)
def test_ar_select_order_tstat():
    """Expect lag order 0 from t-stat selection on seeded random draws."""
    # Imported locally so this block does not depend on a module-level
    # import of ``date_range``.
    from pandas import date_range

    rs = np.random.RandomState(123)
    tau = 25
    y = rs.randn(tau)
    # Use ``date_range``: building a DatetimeIndex directly from
    # ``start``/``periods`` keywords is not accepted by current pandas
    # releases.
    ts = Series(y, index=date_range(start="1/1/1990", periods=tau,
                                    freq="M"))

    ar = AR(ts)
    res = ar.select_order(maxlag=5, ic="t-stat")
    assert_equal(res, 0)