def test_demean_many_missing(mi_df):
    """Check entity/time demeaning when whole index labels are set to NaN.

    For each column a different stride is used to blank out every
    ``stride``-th label of the first index level and every ``stride``-th
    label of the second index level.  The demeaned output is compared with
    means computed by hand, where a mean is only formed for slices that
    contain at least one finite value.
    """
    first_level = mi_df.index.levels[0]
    second_level = mi_df.index.levels[1]
    strides = (3, 5, 2)
    for col, stride in zip(mi_df, strides):
        for label in first_level[::stride]:
            mi_df.loc[label, col] = np.nan
        # Swap the index levels so that .loc selects on the second level
        mi_df.index = mi_df.index.swaplevel()
        for label in second_level[::stride]:
            mi_df.loc[label, col] = np.nan
        # Restore the original level order
        mi_df.index = mi_df.index.swaplevel()

    panel_data = PanelData(mi_df)

    entity_dm = panel_data.demean("entity")
    # Every value missing in the input must still be missing after demeaning
    was_nan = np.isnan(panel_data.values3d.ravel())
    now_nan = np.isnan(entity_dm.values3d.ravel())
    assert np.all(now_nan[was_nan])

    expected = panel_data.values3d.copy()
    for k in range(3):
        layer = expected[k]
        ncol = layer.shape[1]
        col_means = np.ones(ncol) * np.nan
        for c in range(ncol):
            # Leave the mean as NaN when the whole column is missing
            if np.any(np.isfinite(layer[:, c])):
                col_means[c] = np.nanmean(layer[:, c])
        expected[k] -= col_means
    assert_allclose(entity_dm.values3d, expected)

    time_dm = panel_data.demean("time")
    expected = panel_data.values3d.copy()
    for k in range(3):
        layer = expected[k]
        nrow = layer.shape[0]
        row_means = np.ones((nrow, 1)) * np.nan
        for r in range(nrow):
            # Leave the mean as NaN when the whole row is missing
            if np.any(np.isfinite(layer[r])):
                row_means[r, 0] = np.nanmean(layer[r])
        expected[k] -= row_means
    assert_allclose(time_dm.values3d, expected)
def test_demean_against_dummy_regression(data):
    """Compare ``demean`` with residuals from least-squares fits on dummies.

    Entity, time and two-way demeaning are each checked against the
    residuals of ``data.x`` regressed on the matching dummy variables.
    Only magnitudes are compared (``1 + |value|``).
    """
    panel = PanelData(data.x)
    panel.drop(panel.isnull)
    frame = panel.dataframe
    flat = frame.reset_index()
    entity_col = frame.index.levels[0].name
    time_col = frame.index.levels[1].name

    def _dummies(column, drop_first):
        # Float dummy matrix built from one of the former index columns
        categories = Categorical(flat[column])
        return get_dummies(categories, drop_first=drop_first).astype(np.float64)

    def _residual(design):
        # Residual of the data after projecting on the dummy design
        coefs = lstsq(design, frame.values, rcond=None)[0]
        return frame.values - design @ coefs

    # Entity effects
    resid = _residual(_dummies(entity_col, False))
    entity_dm = panel.demean("entity")
    assert_allclose(1 + np.abs(entity_dm.values2d), 1 + np.abs(resid))

    # Time effects
    resid = _residual(_dummies(time_col, False))
    time_dm = panel.demean("time")
    assert_allclose(1 + np.abs(time_dm.values2d), 1 + np.abs(resid))

    # Both effects: the first time dummy is dropped
    entity_dummies = _dummies(entity_col, False)
    time_dummies = _dummies(time_col, True)
    design = np.c_[entity_dummies.values, time_dummies.values]
    resid = _residual(design)
    both_dm = panel.demean("both")
    assert_allclose(1 + np.abs(both_dm.values2d), 1 + np.abs(resid))
def test_demean_weighted(data):
    """Compare weighted entity/time demeaning with weighted dummy regressions.

    Observations missing in either ``data.x`` or ``data.w`` are dropped
    first.  The reference result is the residual of ``sqrt(w) * x`` regressed
    on ``sqrt(w)``-scaled dummies; only magnitudes are compared.
    """
    x = PanelData(data.x)
    w = PanelData(data.w)
    missing = x.isnull | w.isnull
    x.drop(missing)
    w.drop(missing)

    def _index_codes(level):
        # ``MultiIndex.labels`` was renamed to ``codes``; prefer the new
        # attribute and fall back to the old one only when it is absent.
        index = x.index
        codes = getattr(index, "codes", None)
        if codes is None:
            codes = index.labels
        return codes[level]

    for effect, level in (('entity', 0), ('time', 1)):
        demeaned = x.demean(effect, weights=w)
        d = pd.get_dummies(pd.Categorical(_index_codes(level)))
        d = d.values
        root_w = np.sqrt(w.values2d)
        wx = root_w * x.values2d
        wd = d * root_w
        # rcond=None avoids the FutureWarning raised by the legacy default
        mu = wd @ np.linalg.lstsq(wd, wx, rcond=None)[0]
        e = wx - mu
        assert_allclose(1 + np.abs(demeaned.values2d), 1 + np.abs(e))
def test_demean_many_missing(panel):
    """Check demeaning after blanking strided slices along both panel axes.

    Expected values use means formed only over slices that contain at least
    one finite value; fully missing slices keep a NaN mean.
    """
    # Item k gets every step-th slice along each of the two remaining axes
    # set to NaN
    for item, step in enumerate((3, 5, 2)):
        panel.iloc[item, ::step] = np.nan
        panel.iloc[item, :, ::step] = np.nan

    wrapped = PanelData(panel)

    entity_dm = wrapped.demean('entity')
    # Missing inputs must remain missing in the output
    was_nan = np.isnan(panel.values.ravel())
    now_nan = np.isnan(entity_dm.values3d.ravel())
    assert np.all(now_nan[was_nan])

    expected = panel.values.copy()
    for k in range(3):
        layer = expected[k]
        ncol = layer.shape[1]
        col_means = np.ones(ncol) * np.nan
        for c in range(ncol):
            if np.any(np.isfinite(layer[:, c])):
                col_means[c] = np.nanmean(layer[:, c])
        expected[k] -= col_means
    assert_allclose(entity_dm.values3d, expected)

    time_dm = wrapped.demean('time')
    expected = panel.values.copy()
    for k in range(3):
        layer = expected[k]
        nrow = layer.shape[0]
        row_means = np.ones((nrow, 1)) * np.nan
        for r in range(nrow):
            if np.any(np.isfinite(layer[r])):
                row_means[r, 0] = np.nanmean(layer[r])
        expected[k] -= row_means
    assert_allclose(time_dm.values3d, expected)
def test_demean_weighted(data):
    """Weighted demeaning must match a weighted regression on dummies.

    Rows missing in either the data or the weights are removed, then for the
    entity and the time dimension the demeaned values are compared (in
    magnitude) with residuals from regressing ``sqrt(w) * x`` on
    ``sqrt(w)``-scaled dummy variables.
    """
    x = PanelData(data.x)
    w = PanelData(data.w)
    to_drop = x.isnull | w.isnull
    x.drop(to_drop)
    w.drop(to_drop)

    for effect, level in (("entity", 0), ("time", 1)):
        demeaned = x.demean(effect, weights=w)
        dummies = get_dummies(Categorical(get_codes(x.index)[level])).values
        sqrt_w = np.sqrt(w.values2d)
        weighted_x = sqrt_w * x.values2d
        weighted_dummies = dummies * sqrt_w
        coefs = lstsq(weighted_dummies, weighted_x, rcond=None)[0]
        resid = weighted_x - weighted_dummies @ coefs
        assert_allclose(1 + np.abs(demeaned.values2d), 1 + np.abs(resid))
def test_general_unit_weighted_demean_oneway(mi_df):
    """``general_demean`` with unit weights must agree with the unweighted paths.

    Checked for entity ids, time ids, and a random grouping with ten levels
    (the latter against a dummy-variable regression).
    """
    y = PanelData(mi_df)

    # Group by entity
    by_effect = y.demean("entity")
    groups = PanelData(DataFrame(y.entity_ids, index=y.index))
    unit_weights = PanelData(groups).copy()
    unit_weights.dataframe.iloc[:, :] = 1
    with_weights = y.general_demean(groups, unit_weights)
    assert_allclose(by_effect.values2d, with_weights.values2d)
    without_weights = y.general_demean(groups)
    assert_allclose(without_weights.values2d, with_weights.values2d)

    # Group by time
    by_effect = y.demean("time")
    groups = PanelData(DataFrame(y.time_ids, index=y.index))
    with_weights = y.general_demean(groups, unit_weights)
    assert_allclose(by_effect.values2d, with_weights.values2d)
    without_weights = y.general_demean(groups)
    assert_allclose(without_weights.values2d, with_weights.values2d)

    # Random grouping, compared with a regression on the group dummies
    random_ids = np.random.randint(0, 10, groups.dataframe.shape)
    groups = PanelData(DataFrame(random_ids, index=y.index))
    with_weights = y.general_demean(groups, unit_weights)
    without_weights = y.general_demean(groups)
    categories = Categorical(groups.dataframe.iloc[:, 0])
    dummies = get_dummies(categories)
    direct = y.values2d - dummies @ lstsq(dummies, y.values2d, rcond=None)[0]
    assert_allclose(direct, with_weights.values2d)
    assert_allclose(without_weights.values2d, with_weights.values2d)
def test_general_weighted_demean_oneway(panel):
    """Weighted ``general_demean`` must match weighted ``demean`` and a direct fit.

    Random chi-square weights are used.  Entity and time groupings are
    compared with ``demean``; a random ten-level grouping is compared with
    the residual of a ``sqrt(w)``-weighted regression on group dummies.
    """
    y = PanelData(panel)
    weights = pd.DataFrame(
        np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10, index=y.index
    )
    w = PanelData(weights)

    dm1 = y.demean('entity', weights=w)
    g = PanelData(pd.DataFrame(y.entity_ids, index=y.index))
    dm2 = y.general_demean(g, w)
    assert_allclose(dm1.values2d, dm2.values2d)

    dm1 = y.demean('time', weights=w)
    g = PanelData(pd.DataFrame(y.time_ids, index=y.index))
    dm2 = y.general_demean(g, w)
    assert_allclose(dm1.values2d, dm2.values2d)

    g = PanelData(
        pd.DataFrame(np.random.randint(0, 10, g.dataframe.shape), index=y.index)
    )
    dm2 = y.general_demean(g, w)
    g = pd.Categorical(g.dataframe.iloc[:, 0])
    d = pd.get_dummies(g)
    wd = np.sqrt(w.values2d) * d
    wy = np.sqrt(w.values2d) * y.values2d
    # rcond=None avoids the FutureWarning raised by the legacy default
    dm1 = wy - wd @ np.linalg.lstsq(wd, wy, rcond=None)[0]
    assert_allclose(dm1, dm2.values2d, atol=1e-14)
def test_general_weighted_demean_oneway(mi_df):
    """Weighted ``general_demean`` must reproduce weighted one-way demeaning.

    Uses random chi-square weights.  Entity and time groupings are checked
    against ``demean``; a random ten-level grouping is checked against a
    ``sqrt(w)``-weighted regression on the group dummies.
    """
    y = PanelData(mi_df)
    nobs = y.dataframe.shape[0]
    raw_weights = DataFrame(np.random.chisquare(10, (nobs, 1)) / 10, index=y.index)
    w = PanelData(raw_weights)

    # Entity grouping
    reference = y.demean("entity", weights=w)
    groups = PanelData(DataFrame(y.entity_ids, index=y.index))
    general = y.general_demean(groups, w)
    assert_allclose(reference.values2d, general.values2d)

    # Time grouping
    reference = y.demean("time", weights=w)
    groups = PanelData(DataFrame(y.time_ids, index=y.index))
    general = y.general_demean(groups, w)
    assert_allclose(reference.values2d, general.values2d)

    # Random grouping against a weighted dummy regression
    random_ids = np.random.randint(0, 10, groups.dataframe.shape)
    groups = PanelData(DataFrame(random_ids, index=y.index))
    general = y.general_demean(groups, w)
    dummies = get_dummies(Categorical(groups.dataframe.iloc[:, 0]))
    weighted_dummies = np.sqrt(w.values2d) * dummies
    weighted_y = np.sqrt(w.values2d) * y.values2d
    coefs = lstsq(weighted_dummies, weighted_y, rcond=None)[0]
    direct = weighted_y - weighted_dummies @ coefs
    assert_allclose(direct, general.values2d, atol=1e-14)
def test_general_unit_weighted_demean_oneway(panel):
    """Unit-weighted ``general_demean`` must agree with the unweighted results.

    Entity and time groupings are compared with ``demean``; a random
    ten-level grouping is compared with a regression on group dummies.
    """
    y = PanelData(panel)

    dm1 = y.demean('entity')
    g = PanelData(pd.DataFrame(y.entity_ids, index=y.index))
    weights = PanelData(g).copy()
    weights.dataframe.iloc[:, :] = 1
    dm2 = y.general_demean(g, weights)
    assert_allclose(dm1.values2d, dm2.values2d)
    dm3 = y.general_demean(g)
    assert_allclose(dm3.values2d, dm2.values2d)

    dm1 = y.demean('time')
    g = PanelData(pd.DataFrame(y.time_ids, index=y.index))
    dm2 = y.general_demean(g, weights)
    assert_allclose(dm1.values2d, dm2.values2d)
    dm3 = y.general_demean(g)
    assert_allclose(dm3.values2d, dm2.values2d)

    g = PanelData(
        pd.DataFrame(np.random.randint(0, 10, g.dataframe.shape), index=y.index)
    )
    dm2 = y.general_demean(g, weights)
    dm3 = y.general_demean(g)
    g = pd.Categorical(g.dataframe.iloc[:, 0])
    d = pd.get_dummies(g)
    # rcond=None avoids the FutureWarning raised by the legacy default
    dm1 = y.values2d - d @ np.linalg.lstsq(d, y.values2d, rcond=None)[0]
    assert_allclose(dm1, dm2.values2d)
    assert_allclose(dm3.values2d, dm2.values2d)
def test_demean_missing_alt_types(data):
    """After dropping missing rows, ``demean`` must equal a groupby demean."""

    def _center(series):
        # Subtract the group mean from each member of the group
        return series - series.mean()

    xpd = PanelData(data.x)
    xpd.drop(xpd.isnull)

    # Index level 0 pairs with 'entity', level 1 with 'time'
    for level, effect in ((0, 'entity'), (1, 'time')):
        demeaned = xpd.demean(effect)
        expected = xpd.dataframe.groupby(level=level).transform(_center)
        assert_frame_equal(demeaned.dataframe, expected)
def test_demean(panel):
    """Entity and time demeaning of a complete panel equal plain mean removal."""
    wrapped = PanelData(panel)

    # Entity effects: remove the axis-0 mean of each of the first three items
    entity_dm = wrapped.demean('entity')
    expected = panel.values.copy()
    for k in range(3):
        expected[k] = expected[k] - expected[k].mean(0)
    assert_allclose(entity_dm.values3d, expected)

    # Time effects: remove the axis-1 mean, kept as a column for broadcasting
    time_dm = wrapped.demean('time')
    expected = panel.values.copy()
    for k in range(3):
        row_means = expected[k].mean(1)
        expected[k] = expected[k] - row_means[:, None]
    assert_allclose(time_dm.values3d, expected)
def test_demean(mi_df):
    """Entity and time demeaning of a complete panel equal plain mean removal."""
    panel_data = PanelData(mi_df)

    # Entity effects: remove the axis-0 mean of each of the first three layers
    entity_dm = panel_data.demean("entity")
    expected = panel_data.values3d.copy()
    for k in range(3):
        expected[k] = expected[k] - expected[k].mean(0)
    assert_allclose(entity_dm.values3d, expected)

    # Time effects: remove the axis-1 mean, kept as a column for broadcasting
    time_dm = panel_data.demean("time")
    expected = panel_data.values3d.copy()
    for k in range(3):
        row_means = expected[k].mean(1)
        expected[k] = expected[k] - row_means[:, None]
    assert_allclose(time_dm.values3d, expected)
def test_demean_against_groupby(data):
    """``demean`` must match subtracting group means via pandas ``groupby``."""
    panel = PanelData(data.x)
    frame = panel.dataframe

    # Index level 0 pairs with "entity", level 1 with "time"
    for level, effect in ((0, "entity"), (1, "time")):
        by_group = frame.groupby(level=level).transform(lambda col: col - col.mean())
        result = panel.demean(effect)
        assert_allclose(by_group.values, result.values2d)
def test_demean_missing(mi_df):
    """Demeaning with scattered NaNs must match nan-aware mean removal.

    Every 13th element (in flattened order) of the input is set to NaN
    before the panel is built.
    """
    # Writing through ``mi_df.values.flat`` only works when ``.values`` is a
    # writable view of the frame; under pandas Copy-on-Write it is read-only
    # and for a copy the write is silently lost.  Modify an explicit copy and
    # assign it back instead.
    # NOTE(review): assumes all columns of mi_df are floating point, as the
    # original in-place NaN write did - confirm against the fixture.
    values = mi_df.values.copy()
    values.flat[::13] = np.nan
    mi_df.iloc[:, :] = values

    data = PanelData(mi_df)

    fe = data.demean("entity")
    expected = data.values3d.copy()
    for i in range(3):
        expected[i] -= np.nanmean(expected[i], 0)
    assert_allclose(fe.values3d, expected)

    te = data.demean("time")
    expected = data.values3d.copy()
    for i in range(3):
        # Keep the row means as a column so they broadcast across each row
        expected[i] -= np.nanmean(expected[i], 1)[:, None]
    assert_allclose(te.values3d, expected)
def test_demean_simple_weighted(data):
    """Demeaning with weights that are all one must equal unweighted demeaning."""
    x = PanelData(data.x)
    w = PanelData(data.w)
    to_drop = x.isnull | w.isnull
    x.drop(to_drop)
    w.drop(to_drop)
    # Overwrite the first weight column with ones
    w.dataframe.iloc[:, 0] = 1

    for effect in ('entity', 'time'):
        plain = x.demean(effect)
        weighted = x.demean(effect, weights=w)
        assert_allclose(plain.dataframe, weighted.dataframe)
def test_demean_missing(panel):
    """Demeaning with scattered NaNs must match nan-aware mean removal.

    Every 13th element (in flattened order) of the panel values is set to
    NaN before wrapping the panel.
    """
    panel.values.flat[::13] = np.nan
    wrapped = PanelData(panel)

    # Entity effects: subtract the nan-aware axis-0 mean of each item
    entity_dm = wrapped.demean('entity')
    expected = panel.values.copy()
    for k in range(3):
        expected[k] = expected[k] - np.nanmean(expected[k], 0)
    assert_allclose(entity_dm.values3d, expected)

    # Time effects: subtract the nan-aware axis-1 mean of each item
    time_dm = wrapped.demean('time')
    expected = panel.values.copy()
    for k in range(3):
        row_means = np.nanmean(expected[k], 1)
        expected[k] = expected[k] - row_means[:, None]
    assert_allclose(time_dm.values3d, expected)
def test_demean_missing_alt_types(data):
    """After dropping missing rows, ``demean`` must equal a groupby demean.

    Index and column type checks in the frame comparison are only enabled
    when ``data.x`` is a ``DataFrame`` or a NumPy array.
    """
    strict = isinstance(data.x, (DataFrame, np.ndarray))
    compare_opts = dict(check_index_type=strict, check_column_type=strict)

    def _center(series):
        # Subtract the group mean from each member of the group
        return series - series.mean()

    xpd = PanelData(data.x)
    xpd.drop(xpd.isnull)

    # Index level 0 pairs with 'entity', level 1 with 'time'
    for level, effect in ((0, 'entity'), (1, 'time')):
        demeaned = xpd.demean(effect)
        expected = xpd.dataframe.groupby(level=level).transform(_center)
        assert_frame_equal(demeaned.dataframe, expected, **compare_opts)
def test_general_unit_weighted_demean_twoway(mi_df):
    """Weighted two-way ``general_demean`` must match ``demean("both")``.

    A second check uses two random ten-level groupings and compares with the
    residual of a ``sqrt(w)``-weighted regression on the combined dummies.
    The random generator is seeded so the weights and groups are repeatable.
    """
    np.random.seed(12345)
    y = PanelData(mi_df)
    nobs = y.dataframe.shape[0]
    raw_weights = DataFrame(np.random.chisquare(10, (nobs, 1)) / 10, index=y.index)
    w = PanelData(raw_weights)

    # Entity and time ids as the two grouping columns
    reference = y.demean("both", weights=w)
    groups = DataFrame(y.entity_ids, index=y.index)
    groups["column2"] = Series(y.time_ids.squeeze(), index=y.index)
    general = y.general_demean(groups, weights=w)
    gap = reference.values2d - general.values2d
    assert_allclose(gap, np.zeros_like(general.values2d), atol=1e-7)

    # Two random groupings against a weighted dummy regression
    groups = DataFrame(np.random.randint(0, 10, groups.shape), index=y.index)
    general = y.general_demean(groups, weights=w)
    first = get_dummies(Categorical(groups.iloc[:, 0]))
    # The first dummy of the second grouping is dropped
    second = get_dummies(Categorical(groups.iloc[:, 1]), drop_first=True)
    design = np.c_[first, second]
    weighted_design = np.sqrt(w.values2d) * design
    weighted_y = np.sqrt(w.values2d) * y.values2d
    coefs = lstsq(weighted_design, weighted_y, rcond=None)[0]
    direct = weighted_y - weighted_design @ coefs
    gap = direct - general.values2d
    assert_allclose(gap, np.zeros_like(general.values2d), atol=1e-7)
def test_general_demean_oneway(panel):
    """One-way ``general_demean`` must match ``demean`` and a dummy regression.

    Entity and time groupings are compared with ``demean``; a random
    ten-level grouping is compared with a regression on group dummies.
    """
    y = PanelData(panel)

    dm1 = y.demean('entity')
    g = pd.DataFrame(y.entity_ids, index=y.index)
    dm2 = y.general_demean(g)
    assert_allclose(dm1.values2d, dm2.values2d)

    dm1 = y.demean('time')
    g = pd.DataFrame(y.time_ids, index=y.index)
    dm2 = y.general_demean(g)
    assert_allclose(dm1.values2d, dm2.values2d)

    g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
    dm2 = y.general_demean(g)
    g = pd.Categorical(g.iloc[:, 0])
    d = pd.get_dummies(g)
    # rcond=None avoids the FutureWarning raised by the legacy default
    dm1 = y.values2d - d @ np.linalg.lstsq(d, y.values2d, rcond=None)[0]
    assert_allclose(dm1, dm2.values2d)
def test_general_demean_oneway(mi_df):
    """One-way ``general_demean`` must match ``demean`` and a dummy regression."""
    y = PanelData(mi_df)

    # Entity grouping
    reference = y.demean("entity")
    groups = DataFrame(y.entity_ids, index=y.index)
    general = y.general_demean(groups)
    assert_allclose(reference.values2d, general.values2d)

    # Time grouping
    reference = y.demean("time")
    groups = DataFrame(y.time_ids, index=y.index)
    general = y.general_demean(groups)
    assert_allclose(reference.values2d, general.values2d)

    # Random ten-level grouping against a regression on its dummies
    groups = DataFrame(np.random.randint(0, 10, groups.shape), index=y.index)
    general = y.general_demean(groups)
    dummies = get_dummies(Categorical(groups.iloc[:, 0]))
    coefs = lstsq(dummies, y.values2d, rcond=None)[0]
    direct = y.values2d - dummies @ coefs
    assert_allclose(direct, general.values2d)
def test_demean_many_missing_dropped(panel):
    """Entity demeaning after dropping missing rows equals per-entity mean removal."""
    # Blank a strided grid of values in the first item
    panel.iloc[0, ::3, ::3] = np.nan
    wrapped = PanelData(panel)
    wrapped.drop(wrapped.isnull)

    entity_dm = wrapped.demean('entity')

    expected = wrapped.values2d.copy()
    ids = wrapped.entity_ids.ravel()
    for entity in np.unique(ids):
        rows = ids == entity
        expected[rows] = expected[rows] - np.nanmean(expected[rows], 0)
    assert_allclose(entity_dm.values2d, expected)
def test_demean_both_large_t():
    """Two-way demeaning of a random (1, 100, 10) panel matches a dummy projection."""
    draws = np.random.standard_normal((1, 100, 10))
    wrapped = PanelData(pd.Panel(draws))
    demeaned = wrapped.demean('both')

    frame = wrapped.dataframe
    flat = frame.reset_index()
    first_name = frame.index.levels[0].name
    second_name = frame.index.levels[1].name
    # Full set of dummies for the first index level ...
    first = pd.get_dummies(pd.Categorical(flat[first_name]), drop_first=False)
    first = first.astype(np.float64)
    # ... and all but the first dummy for the second index level
    second = pd.get_dummies(pd.Categorical(flat[second_name]), drop_first=True)
    second = second.astype(np.float64)
    design = np.c_[first.values, second.values]

    # Residual after projecting onto the dummies via the pseudo-inverse
    resid = frame.values - design @ pinv(design) @ frame.values
    assert_allclose(1 + np.abs(demeaned.values2d), 1 + np.abs(resid))
def test_demean_many_missing(panel):
    """Check demeaning after blanking strided slices along both panel axes.

    Expected values are formed by subtracting ``np.nanmean`` along the
    relevant axis of each of the first three items.
    """
    # Item k gets every step-th slice along each of the two remaining axes
    # set to NaN
    for item, step in enumerate((3, 5, 2)):
        panel.iloc[item, ::step] = np.nan
        panel.iloc[item, :, ::step] = np.nan

    wrapped = PanelData(panel)

    entity_dm = wrapped.demean('entity')
    # Missing inputs must remain missing in the output
    was_nan = np.isnan(panel.values.ravel())
    now_nan = np.isnan(entity_dm.values3d.ravel())
    assert np.all(now_nan[was_nan])

    expected = panel.values.copy()
    for k in range(3):
        expected[k] = expected[k] - np.nanmean(expected[k], 0)
    assert_allclose(entity_dm.values3d, expected)

    time_dm = wrapped.demean('time')
    expected = panel.values.copy()
    for k in range(3):
        row_means = np.nanmean(expected[k], 1)
        expected[k] = expected[k] - row_means[:, None]
    assert_allclose(time_dm.values3d, expected)
def test_general_demean_twoway(mi_df):
    """Two-way ``general_demean`` must match ``demean("both")`` and a dummy fit."""
    y = PanelData(mi_df)

    # Entity and time ids as the two grouping columns
    reference = y.demean("both")
    groups = DataFrame(y.entity_ids, index=y.index)
    groups["column2"] = Series(y.time_ids.squeeze(), index=y.index)
    general = y.general_demean(groups)
    assert_allclose(reference.values2d, general.values2d)

    # Two random ten-level groupings against a regression on their dummies
    groups = DataFrame(np.random.randint(0, 10, groups.shape), index=y.index)
    general = y.general_demean(groups)
    first = get_dummies(Categorical(groups.iloc[:, 0]))
    # The first dummy of the second grouping is dropped
    second = get_dummies(Categorical(groups.iloc[:, 1]), drop_first=True)
    design = np.c_[first, second]
    coefs = lstsq(design, y.values2d, rcond=None)[0]
    direct = y.values2d - design @ coefs
    gap = direct - general.values2d
    assert_allclose(gap, np.zeros_like(general.values2d), atol=1e-7)
def test_demean_both_large_t():
    """Two-way demeaning of a random (1, 100, 10) panel matches a dummy projection."""
    draws = np.random.standard_normal((1, 100, 10))
    periods = date_range("1-1-2000", periods=100)
    entity_names = []
    for i in range(10):
        entity_names.append("entity.{0}".format(i))
    frame_in = panel_to_frame(draws, ["x"], periods, entity_names, swap=True)
    wrapped = PanelData(frame_in)
    demeaned = wrapped.demean("both")

    frame = wrapped.dataframe
    flat = frame.reset_index()
    first_name = frame.index.levels[0].name
    second_name = frame.index.levels[1].name
    # Full set of dummies for the first index level ...
    first = get_dummies(Categorical(flat[first_name]), drop_first=False)
    first = first.astype(np.float64)
    # ... and all but the first dummy for the second index level
    second = get_dummies(Categorical(flat[second_name]), drop_first=True)
    second = second.astype(np.float64)
    design = np.c_[first.values, second.values]

    # Residual after projecting onto the dummies via the pseudo-inverse
    resid = frame.values - design @ pinv(design) @ frame.values
    assert_allclose(1 + np.abs(demeaned.values2d), 1 + np.abs(resid))
def test_general_demean_twoway(panel):
    """Two-way ``general_demean`` must match ``demean('both')`` and a dummy fit.

    The second check uses two random ten-level groupings and compares with
    the residual of a regression on the combined dummies (first dummy of the
    second grouping dropped).
    """
    y = PanelData(panel)

    dm1 = y.demean('both')
    g = pd.DataFrame(y.entity_ids, index=y.index)
    g['column2'] = pd.Series(y.time_ids.squeeze(), index=y.index)
    dm2 = y.general_demean(g)
    assert_allclose(dm1.values2d, dm2.values2d)

    g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
    dm2 = y.general_demean(g)
    g1 = pd.Categorical(g.iloc[:, 0])
    d1 = pd.get_dummies(g1)
    g2 = pd.Categorical(g.iloc[:, 1])
    d2 = pd.get_dummies(g2, drop_first=True)
    d = np.c_[d1, d2]
    # rcond=None avoids the FutureWarning raised by the legacy default
    dm1 = y.values2d - d @ np.linalg.lstsq(d, y.values2d, rcond=None)[0]
    assert_allclose(dm1 - dm2.values2d, np.zeros_like(dm2.values2d), atol=1e-7)
def test_demean_many_missing_dropped(mi_df):
    """Entity demeaning after dropping missing rows equals per-entity mean removal.

    Every third label of each index level is set to NaN in the first column
    before the missing observations are dropped.
    """
    first_level = mi_df.index.levels[0]
    second_level = mi_df.index.levels[1]
    target = mi_df.columns[0]
    for label in first_level[::3]:
        mi_df.loc[label, target] = np.nan
    # Swap the index levels so that .loc selects on the second level
    mi_df.index = mi_df.index.swaplevel()
    for label in second_level[::3]:
        mi_df.loc[label, target] = np.nan
    # Restore the original level order
    mi_df.index = mi_df.index.swaplevel()

    panel_data = PanelData(mi_df)
    panel_data.drop(panel_data.isnull)
    entity_dm = panel_data.demean("entity")

    expected = panel_data.values2d.copy()
    ids = panel_data.entity_ids.ravel()
    for entity in np.unique(ids):
        rows = ids == entity
        expected[rows] = expected[rows] - np.nanmean(expected[rows], 0)
    assert_allclose(entity_dm.values2d, expected)
# Exploratory checks of weighted PanelOLS fits and weighted demeaning.
# NOTE(review): ``data``, ``y``, ``x`` and ``w`` are defined outside this
# fragment - confirm their construction against the surrounding code.

# Fit the weighted model with each combination of effects; the fitted
# results are discarded.
mod = PanelOLS(data.y, data.x, weights=data.w)
mod.fit()
mod = PanelOLS(y, x, weights=data.w, entity_effects=True)
mod.fit()
mod = PanelOLS(data.y, data.x, weights=data.w, time_effects=True)
mod.fit()
mod = PanelOLS(data.y, data.x, weights=data.w, time_effects=True, entity_effects=True)
mod.fit()

# Drop observations that are missing in any of y, x or w
missing = y.isnull | x.isnull | w.isnull
y.drop(missing)
x.drop(missing)
w.drop(missing)

# Overwrite the first column of x with ones
x.dataframe.iloc[:, 0] = 1

ydw = y.demean(weights=w)
xdw = x.demean(weights=w)

# Direct computation: residual of sqrt(w) * x regressed on sqrt(w)-scaled
# entity dummies
d = x.dummies('entity', drop_first=False)
root_w = np.sqrt(w.values2d)
wd = root_w * d
# rcond=None avoids the FutureWarning raised by the legacy default
wdx_direct = root_w * x.values2d - wd @ np.linalg.lstsq(wd, root_w * x.values2d, rcond=None)[0]
print(np.abs(wdx_direct[0] - xdw.values2d[0]) > 1e-14)

# Weighted means of x and y, added back (scaled by sqrt(w)) to the
# demeaned data
mux = (w.values2d * x.values2d).sum(0) / w.values2d.sum()
muy = (w.values2d * y.values2d).sum(0) / w.values2d.sum()
xx = xdw.values2d + root_w * mux
yy = ydw.values2d + root_w * muy.squeeze()
print(np.linalg.lstsq(xx, yy, rcond=None)[0])

# sqrt(w)-scaled y and x; not used within this fragment
yyy = root_w * y.values2d
xxx = root_w * x.values2d
def test_demean_invalid(mi_df):
    """An unrecognized demeaning group must raise ``ValueError``."""
    panel_data = PanelData(mi_df)
    with pytest.raises(ValueError):
        panel_data.demean("unknown")
def test_demean_invalid(panel):
    """An unrecognized demeaning group must raise ``ValueError``."""
    wrapped = PanelData(panel)
    with pytest.raises(ValueError):
        wrapped.demean('unknown')