def test_mean_weighted(data):
    """Weighted entity/time means agree with weighted dummy-variable projections.

    The entity means are checked against a weighted least-squares fit on
    entity dummies; the time means against the pseudo-inverse projection on
    time dummies.
    """
    x = PanelData(data.x)
    w = PanelData(data.w)
    dropped = x.isnull | w.isnull
    x.drop(dropped)
    w.drop(dropped)

    # Shared weighted quantities: sqrt-weights and weighted observations.
    sqrt_w = np.sqrt(w.values2d)
    weighted_x = sqrt_w * x.values2d

    # Entity means: weighted LS regression of x on entity dummies.
    entity_mean = x.mean("entity", weights=w)
    labels = x.index.levels[0][get_codes(x.index)[0]]
    dummies = get_dummies(Categorical(labels, ordered=True))
    dummies = dummies[entity_mean.index].values
    expected = lstsq(sqrt_w * dummies, weighted_x, rcond=None)[0]
    assert_allclose(entity_mean, expected)

    # Time means: pseudo-inverse projection on the weighted time dummies.
    time_mean = x.mean("time", weights=w)
    labels = x.index.levels[1][get_codes(x.index)[1]]
    dummies = get_dummies(Categorical(labels, ordered=True))
    dummies = dummies[list(time_mean.index)].values
    expected = pinv(sqrt_w * dummies) @ weighted_x
    assert_allclose(time_mean, expected)
def test_mean_weighted(data):
    """Weighted entity/time means agree with weighted dummy-variable projections.

    Entity means are compared against a weighted least-squares fit on entity
    dummies; time means against the pseudo-inverse projection on time dummies.
    """
    x = PanelData(data.x)
    w = PanelData(data.w)
    missing = x.isnull | w.isnull
    x.drop(missing)
    w.drop(missing)

    entity_mean = x.mean('entity', weights=w)
    # MultiIndex.labels was removed in pandas 1.0; .codes is the replacement.
    c = x.index.levels[0][x.index.codes[0]]
    d = pd.get_dummies(pd.Categorical(c, ordered=True))
    d = d[entity_mean.index]
    d = d.values
    root_w = np.sqrt(w.values2d)
    wx = root_w * x.values2d
    wd = d * root_w
    # rcond=None selects the modern default and avoids the FutureWarning
    # emitted when rcond is omitted.
    mu = np.linalg.lstsq(wd, wx, rcond=None)[0]
    assert_allclose(entity_mean, mu)

    time_mean = x.mean('time', weights=w)
    c = x.index.levels[1][x.index.codes[1]]
    d = pd.get_dummies(pd.Categorical(c, ordered=True))
    d = d[time_mean.index]
    d = d.values
    root_w = np.sqrt(w.values2d)
    wx = root_w * x.values2d
    wd = d * root_w
    mu = pinv(wd) @ wx
    assert_allclose(time_mean, mu)
def test_mean_weighted(data):
    """Weighted entity/time means agree with weighted dummy-variable projections.

    Entity means are compared against a weighted least-squares fit on entity
    dummies; time means against the pseudo-inverse projection on time dummies.
    """
    x = PanelData(data.x)
    w = PanelData(data.w)
    missing = x.isnull | w.isnull
    x.drop(missing)
    w.drop(missing)

    entity_mean = x.mean('entity', weights=w)
    # MultiIndex.labels was removed in pandas 1.0; .codes is the replacement.
    c = x.index.levels[0][x.index.codes[0]]
    d = pd.get_dummies(pd.Categorical(c, ordered=True))
    d = d[entity_mean.index]
    d = d.values
    root_w = np.sqrt(w.values2d)
    wx = root_w * x.values2d
    wd = d * root_w
    # rcond=None selects the modern default and avoids the FutureWarning
    # emitted when rcond is omitted.
    mu = np.linalg.lstsq(wd, wx, rcond=None)[0]
    assert_allclose(entity_mean, mu)

    time_mean = x.mean('time', weights=w)
    c = x.index.levels[1][x.index.codes[1]]
    d = pd.get_dummies(pd.Categorical(c, ordered=True))
    # get_indexer_for returns a length-1 ndarray here; index [0] before int()
    # since int() on a size-1 array is deprecated in NumPy >= 1.25.
    ilocs = [int(d.columns.get_indexer_for([i])[0]) for i in time_mean.index]
    d = d.iloc[:, ilocs]
    # TODO: Restore when fixed in pandas
    # d = d[time_mean.index]
    d = d.values
    root_w = np.sqrt(w.values2d)
    wx = root_w * x.values2d
    wd = d * root_w
    mu = pinv(wd) @ wx
    assert_allclose(time_mean, mu)
def test_mean_missing(data):
    """Group means after dropping missing rows equal pandas groupby means."""
    panel = PanelData(data.x)
    panel.drop(panel.isnull)

    # Entity (level 0) means, reordered to the panel's entity ordering.
    by_entity = panel.mean("entity")
    target = panel.dataframe.groupby(level=0).mean().loc[panel.entities]
    target.columns.name = None
    assert_frame_equal(by_entity, target)

    # Time (level 1) means, reordered to the panel's time ordering.
    by_time = panel.mean("time")
    target = panel.dataframe.groupby(level=1).mean().loc[panel.time]
    target.columns.name = None
    assert_frame_equal(by_time, target)
dfp.count() == dfp.nobs `all()`は,列に対して全ての要素が`True`の場合のみ`True`を返すので,これを使い確認できる。 (dfp.count() == dfp.nobs).all() `( )`はその中を先に評価する,という意味(数学と同じ)。変数が多い場合,`all()`を2回使うと全ての変数に対して評価するので便利である。 (dfp.count() == dfp.nobs).all().all() `False`なので unbalanced panel data ということが確認できた。 --- 変数の観察単位毎の平均の計算 dfp.mean() --- 変数の時間毎の平均の計算 dfp.mean('time') --- 変数の平均からの乖離 $x-\bar{x}$,$\bar{x}$は平均。 dfp.demean() --- 変数の1階階差の計算 $x_t-x_{t-1}$ dfp.first_difference()