def test_mean_missing(data):
    """Entity and time means agree with pandas groupby means after dropping NaNs."""
    panel = PanelData(data.x)
    panel.drop(panel.isnull)
    for dim, level, labels in (
        ("entity", 0, panel.entities),
        ("time", 1, panel.time),
    ):
        computed = panel.mean(dim)
        expected = panel.dataframe.groupby(level=level).mean().loc[labels]
        expected.columns.name = None
        assert_frame_equal(computed, expected)
def data(request):
    """Fixture building a fitted panel model from an encoded parameter string.

    ``request.param`` has the form ``"model-vcv-weights-missing"`` and selects
    the estimator, covariance estimator, weighting scheme and whether the
    missing-value variants of the simulated variables are used.  Returns an
    AttrDict bundling the fitted result, the model, its options and the
    matching Stata reference results for comparison.
    """
    model, vcv, weights, missing = request.param.split('-')
    y_vars = ['y']
    x_vars = ['x1', 'x2', 'x3', 'x4', 'x5']
    vars = y_vars + x_vars
    # Non-empty ``missing`` switches to the variable variants containing NaNs
    # (e.g. 'y' -> 'y_missing'), keeping the first column as the dependent.
    if missing:
        for i, v in enumerate(vars):
            vars[i] = v + missing
        y_vars = vars[:1]
        x_vars = vars[1:]
    y = sim_data[y_vars]
    x = sim_data[['intercept'] + x_vars]
    mod = MODELS[model]
    mod_options = {}
    if model == 'fixed_effect':
        mod_options = {'entity_effects': True}
    if weights == 'weighted':
        mod_options.update({'weights': sim_data['w']})
    # Debiased (small-sample corrected) covariance to match the Stata output.
    fit_options = {'debiased': True}
    if weights == 'wls':
        fit_options.update({'reweight': True})
    if vcv == 'robust' and model not in ('fixed_effect', 'random_effect'):
        fit_options.update({'cov_type': 'robust'})
    elif vcv in ('cluster', 'robust'):
        # For effects models, 'robust' is implemented as entity-clustered.
        y_data = PanelData(y)
        eid = y_data.entity_ids
        entities = pd.DataFrame(eid, index=y_data.index, columns=['firm_ids'])
        fit_options.update({'cov_type': 'clustered', 'clusters': entities})
    else:
        fit_options.update({'cov_type': 'unadjusted'})
    # Group debiasing applied whenever clustering is in effect.
    if vcv == 'cluster' or (
            model in ('fixed_effect', 'random_effect') and vcv == 'robust'):
        fit_options.update({'group_debias': True})
    spec_mod = mod(y, x, **mod_options)
    fit = spec_mod.fit(**fit_options)
    return AttrDict(fit=fit, model=spec_mod, model_options=mod_options, y=y,
                    x=x, stata=STATA_RESULTS[request.param],
                    fit_options=fit_options, model_name=model, vcv=vcv,
                    weights=weights, missing=missing)
def test_general_demean_oneway(mi_df):
    """general_demean with one grouping matches demean and a dummy regression."""
    pdata = PanelData(mi_df)
    # Entity grouping reproduces demean("entity")
    groups = DataFrame(pdata.entity_ids, index=pdata.index)
    assert_allclose(pdata.demean("entity").values2d,
                    pdata.general_demean(groups).values2d)
    # Time grouping reproduces demean("time")
    groups = DataFrame(pdata.time_ids, index=pdata.index)
    assert_allclose(pdata.demean("time").values2d,
                    pdata.general_demean(groups).values2d)
    # Arbitrary grouping matches residuals from an explicit dummy projection
    groups = DataFrame(np.random.randint(0, 10, groups.shape), index=pdata.index)
    demeaned = pdata.general_demean(groups)
    dummies = get_dummies(Categorical(groups.iloc[:, 0]))
    resid = pdata.values2d - dummies @ lstsq(dummies, pdata.values2d, rcond=None)[0]
    assert_allclose(resid, demeaned.values2d)
def test_general_demean_oneway(panel):
    """general_demean with one grouping matches demean and a dummy regression."""
    y = PanelData(panel)
    dm1 = y.demean('entity')
    g = pd.DataFrame(y.entity_ids, index=y.index)
    dm2 = y.general_demean(g)
    assert_allclose(dm1.values2d, dm2.values2d)
    dm1 = y.demean('time')
    g = pd.DataFrame(y.time_ids, index=y.index)
    dm2 = y.general_demean(g)
    assert_allclose(dm1.values2d, dm2.values2d)
    # Arbitrary grouping matches residuals from an explicit dummy projection
    g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
    dm2 = y.general_demean(g)
    g = pd.Categorical(g.iloc[:, 0])
    d = pd.get_dummies(g)
    # BUG FIX: pass rcond=None explicitly -- omitting it raises a
    # FutureWarning in NumPy and uses the old, less accurate default cutoff.
    dm1 = y.values2d - d @ np.linalg.lstsq(d, y.values2d, rcond=None)[0]
    assert_allclose(dm1, dm2.values2d)
def test_nested_effects(data):
    """PanelOLS rejects other_effects nested within (or nesting) model effects."""
    dep = PanelData(data.y)
    # Coarser-than-entity grouping is nested in entity effects
    nested = pd.DataFrame(dep.entity_ids // 2, index=dep.index)
    with pytest.raises(ValueError) as exception:
        PanelOLS(data.y, data.x, entity_effects=True, other_effects=nested)
    assert 'entity effects' in str(exception.value)
    # Coarser-than-time grouping is nested in time effects
    nested = pd.DataFrame(dep.time_ids // 2, index=dep.index)
    with pytest.raises(ValueError) as exception:
        PanelOLS(data.y, data.x, time_effects=True, other_effects=nested)
    assert 'time effects' in str(exception.value)
    # Two mutually nested other effects are rejected on their own
    coarse = pd.Series(dep.entity_ids.squeeze() // 2, index=dep.index)
    coarser = pd.Series(dep.entity_ids.squeeze() // 4, index=dep.index)
    nested = pd.DataFrame({'eff1': coarse, 'eff2': coarser})
    with pytest.raises(ValueError) as exception:
        PanelOLS(data.y, data.x, other_effects=nested)
    assert 'by other effects' in str(exception.value)
    assert 'time effects' not in str(exception.value)
    assert 'entity effects' not in str(exception.value)
def test_string_conversion():
    """String-valued panel columns are expanded into dummy columns.

    NOTE(review): relies on ``pd.Panel``, which was removed in pandas >= 1.0;
    this test can only run against legacy pandas.
    """
    t, n = 3, 1000
    string = np.random.choice(['a', 'b', 'c'], (t, n))
    num = np.random.randn(t, n)
    p = pd.Panel({'a': string, 'b': num})
    p = p[['a', 'b']]
    panel = PanelData(p, var_name='OtherEffect')
    df = panel.dataframe
    assert df.shape == (3000, 3)
    s = string.T.ravel()
    a_locs = np.where(s == 'a')
    b_locs = np.where(s == 'b')
    c_locs = np.where(s == 'c')
    # 'a' is the dropped base category: a.b and a.c are indicators for b and c
    assert np.all(df.loc[:, 'a.b'].values[a_locs] == 0.0)
    assert np.all(df.loc[:, 'a.b'].values[b_locs] == 1.0)
    assert np.all(df.loc[:, 'a.b'].values[c_locs] == 0.0)
    assert np.all(df.loc[:, 'a.c'].values[a_locs] == 0.0)
    assert np.all(df.loc[:, 'a.c'].values[b_locs] == 0.0)
    assert np.all(df.loc[:, 'a.c'].values[c_locs] == 1.0)
def test_valid_weight_shape(data):
    """Accepted weight shapes (nt, nobs, nentity, full panel) broadcast correctly.

    In every case the stored weights are the input weights restricted to
    non-missing observations and normalized to have mean one.
    """
    # Same size
    n = np.prod(data.y.shape)
    weights = 1 + np.random.random_sample(n)
    mod = PanelOLS(data.y, data.x, weights=weights)
    mod.fit()
    w = mod.weights.values2d
    # ``missing`` is reused by every later section below
    missing = PanelData(data.y).isnull | PanelData(data.x).isnull
    expected = weights[~missing.squeeze()][:, None]
    expected = expected / expected.mean()
    assert_equal(w, expected)
    # Per time
    n = data.y.shape[0]
    weights = 1 + np.random.random_sample(n)
    mod = PanelOLS(data.y, data.x, weights=weights)
    mod.fit()
    w = mod.weights.values2d
    # Broadcast a single weight per period across entities
    expected = weights[:, None] @ np.ones((1, data.y.shape[1]))
    expected = expected.T.ravel()
    expected = expected[~missing.squeeze()][:, None]
    expected = expected / expected.mean()
    assert_equal(w, expected)
    # Per entity
    n = data.y.shape[1]
    weights = 1 + np.random.random_sample(n)
    mod = PanelOLS(data.y, data.x, weights=weights)
    mod.fit()
    w = mod.weights.values2d
    # Broadcast a single weight per entity across periods
    expected = np.ones((data.y.shape[0], 1)) @ weights[None, :]
    expected = expected.T.ravel()
    expected = expected[~missing.squeeze()][:, None]
    expected = expected / expected.mean()
    assert_equal(w, expected)
    # Full (time x entity) weight array
    weights = 1 + np.random.random_sample(data.y.shape)
    mod = PanelOLS(data.y, data.x, weights=weights)
    mod.fit()
    w = mod.weights.values2d
    expected = weights.T.ravel()
    expected = expected[~missing.squeeze()][:, None]
    expected = expected / expected.mean()
    assert_equal(w, expected)
def test_numpy_3d():
    """A 3-d ndarray is interpreted as (nvar, nobs, nentity) panel data.

    NOTE(review): uses ``pd.Panel``, removed in pandas >= 1.0; this variant
    only runs against legacy pandas.
    """
    n, t, k = 11, 7, 3
    x = np.random.random((k, t, n))
    dh = PanelData(x)
    assert_equal(x, dh.values3d)
    assert dh.nentity == n
    assert dh.nobs == t
    assert dh.nvar == k
    # 2-d layout stacks entities in the outer dimension
    assert_equal(np.reshape(x.T, (n * t, k)), dh.values2d)
    items = ['entity.{0}'.format(i) for i in range(n)]
    obs = [i for i in range(t)]
    var_names = ['x.{0}'.format(i) for i in range(k)]
    expected = pd.Panel(np.reshape(x, (k, t, n)), items=var_names,
                        major_axis=obs, minor_axis=items)
    expected_frame = expected.swapaxes(1, 2).to_frame()
    expected_frame.index.levels[0].name = 'entity'
    expected_frame.index.levels[1].name = 'time'
    assert_frame_equal(dh.dataframe, expected_frame)
def test_methods_equivalent(data, lsdv_config):
    """Default, LSDV and LSMR estimation paths produce matching results."""
    if lsdv_config.other_effects == 1:
        other = PanelData(data.c).dataframe.iloc[:, [0]]
    elif lsdv_config.other_effects == 2:
        other = data.c
    else:
        other = None
    mod = PanelOLS(
        data.y,
        data.x,
        weights=data.w if lsdv_config.weights else None,
        entity_effects=lsdv_config.entity_effects,
        time_effects=lsdv_config.time_effects,
        other_effects=other,
    )
    base = mod.fit()
    lsdv = mod.fit(use_lsdv=True)
    lsmr = mod.fit(use_lsmr=True)
    assert_results_equal(base, lsdv)
    # LSMR is iterative, so only approximate agreement is required
    assert_results_equal(lsdv, lsmr, strict=False)
def test_demean_missing_alt_types(data):
    """demean matches a pandas groupby transform once missing rows are dropped."""
    # Only enforce strict index/column dtypes for pandas/ndarray inputs
    strict = isinstance(data.x, (DataFrame, np.ndarray))
    panel = PanelData(data.x)
    panel.drop(panel.isnull)
    for dim, level in (('entity', 0), ('time', 1)):
        demeaned = panel.demean(dim)
        target = panel.dataframe.groupby(level=level).transform(
            lambda s: s - s.mean())
        assert_frame_equal(demeaned.dataframe, target,
                           check_index_type=strict,
                           check_column_type=strict)
def test_lsdv_options(data):
    """fit() and fit(use_lsdv=True) agree across effect/weight combinations.

    BUG FIX: the original passed ``entity_effect=``/``time_effect=``
    (singular); PanelOLS's keywords are ``entity_effects``/``time_effects``
    (plural), as used everywhere else in this file, so the singular forms
    raise TypeError.
    """
    mod = PanelOLS(data.y, data.x, weights=data.w)
    res1 = mod.fit()
    res2 = mod.fit(use_lsdv=True)
    assert_results_equal(res1, res2)
    mod = PanelOLS(data.y, data.x, weights=data.w, entity_effects=True)
    res1 = mod.fit()
    res2 = mod.fit(use_lsdv=True)
    assert_results_equal(res1, res2)
    mod = PanelOLS(data.y, data.x, time_effects=True)
    res1 = mod.fit()
    res2 = mod.fit(use_lsdv=True)
    assert_results_equal(res1, res2)
    mod = PanelOLS(data.y, data.x, time_effects=True, entity_effects=True)
    res1 = mod.fit()
    res2 = mod.fit(use_lsdv=True)
    assert_results_equal(res1, res2)
    c1 = PanelData(data.c).dataframe.iloc[:, [0]]
    mod = PanelOLS(data.y, data.x, entity_effects=True, other_effects=c1)
    res1 = mod.fit()
    res2 = mod.fit(use_lsdv=True)
    assert_results_equal(res1, res2)
    mod = PanelOLS(data.y, data.x, time_effects=True, other_effects=c1)
    res1 = mod.fit()
    res2 = mod.fit(use_lsdv=True)
    assert_results_equal(res1, res2)
    mod = PanelOLS(data.y, data.x, weights=data.w, time_effects=True,
                   other_effects=c1)
    res1 = mod.fit()
    res2 = mod.fit(use_lsdv=True)
    assert_results_equal(res1, res2)
    mod = PanelOLS(data.y, data.x, weights=data.w, other_effects=data.c)
    res1 = mod.fit()
    res2 = mod.fit(use_lsdv=True)
    assert_results_equal(res1, res2)
def test_singleton_removal_mixed(singleton_data, other_effects):
    """Dropping singleton groups leaves parameter estimates unchanged."""
    if other_effects == 1:
        other_effects = PanelData(singleton_data.c).dataframe.iloc[:, [0]]
    elif other_effects == 2:
        other_effects = singleton_data.c
    keep = PanelOLS(
        singleton_data.y, singleton_data.x, other_effects=other_effects
    ).fit(use_lsmr=True)
    drop_mod = PanelOLS(
        singleton_data.y,
        singleton_data.x,
        other_effects=other_effects,
        singletons=False,
    )
    dropped = drop_mod.fit(
        cov_type="clustered", clusters=singleton_data.vc2, use_lsmr=True
    )
    assert_allclose(keep.params, dropped.params)
    # Singleton removal can only reduce the effective sample
    assert dropped.nobs <= keep.nobs
def test_string_input(data):
    """String-valued other_effects and cluster variables are accepted."""
    y = PanelData(data.y)
    nobs = y.values2d.shape[0]
    letters = ['a', 'b', 'c', 'd', 'e']
    cols = {}
    for i in range(2):
        label = 'effect.' + str(i)
        cols[label] = pd.Series(np.random.choice(letters, size=nobs),
                                index=y.index, name=label)
    effects = pd.DataFrame(cols, index=y.index)
    mod = PanelOLS(data.y, data.x, other_effects=effects)
    mod.fit()
    clusters = np.random.randint(0, y.shape[2] // 2, size=(nobs, 2))
    # Two-letter string codes as cluster identifiers
    pairs = list(map(lambda s: ''.join(s),
                     list(product(ascii_lowercase, ascii_lowercase))))
    cols = {}
    for i in range(clusters.shape[1]):
        label = 'effect.' + str(i)
        cols[label] = pd.Series(np.random.choice(pairs, size=nobs),
                                index=y.index, name=label)
    mod.fit(cov_type='clustered', clusters=pd.DataFrame(cols, index=y.index))
def test_numpy_3d():
    """A 3-d ndarray maps onto a (var, time, entity) panel structure."""
    nentity, nobs, nvar = 11, 7, 3
    arr = np.random.random((nvar, nobs, nentity))
    dh = PanelData(arr)
    assert_equal(arr, dh.values3d)
    assert dh.nentity == nentity
    assert dh.nobs == nobs
    assert dh.nvar == nvar
    # 2-d layout stacks entities in the outer dimension
    assert_equal(np.reshape(arr.T, (nentity * nobs, nvar)), dh.values2d)
    entities = ["entity.{0}".format(i) for i in range(nentity)]
    periods = [i for i in range(nobs)]
    variables = ["x.{0}".format(i) for i in range(nvar)]
    expected_frame = panel_to_frame(
        np.reshape(arr, (nvar, nobs, nentity)),
        items=variables,
        major_axis=periods,
        minor_axis=entities,
        swap=True,
    )
    expected_frame.index.set_names(["entity", "time"], inplace=True)
    assert_frame_equal(dh.dataframe, expected_frame)
def test_random_effects_small_sample(data):
    """small_sample alters the variance decomposition only when rows drop."""
    y = PanelData(data.y)
    for kwargs in ({}, {'weights': data.w}):
        mod = RandomEffects(data.y, data.x, **kwargs)
        base = mod.fit()
        small = mod.fit(small_sample=True)
        if y.dataframe.shape[0] == mod.dependent.dataframe.shape[0]:
            # No observations were dropped: correction is a no-op
            assert (small.variance_decomposition.Effects ==
                    base.variance_decomposition.Effects)
        else:
            assert (small.variance_decomposition.Effects !=
                    base.variance_decomposition.Effects)
def test_count(data):
    """count matches pandas groupby counts on both panel dimensions."""
    panel = PanelData(data.x)
    panel.drop(panel.isnull)
    for dim, level, labels in (
        ("entity", 0, panel.entities),
        ("time", 1, panel.time),
    ):
        counts = panel.count(dim)
        expected = panel.dataframe.groupby(level=level).count().loc[labels]
        expected.columns.name = None
        # Counts are returned as int64 regardless of platform default
        assert_frame_equal(counts, expected.astype(np.int64))
def test_mixed_input(data):
    """Mixed categorical/string effects and mixed-type clusters are accepted."""
    y = PanelData(data.y)
    nobs = y.values2d.shape[0]
    codes = np.random.randint(0, 5, size=nobs)
    letters = ["a", "b", "c", "d", "e"]
    effects = pd.DataFrame(
        {
            "effect.0": pd.Categorical(pd.Series(codes, index=y.index)),
            "effect.1": pd.Series(
                np.random.choice(letters, size=nobs), index=y.index
            ),
        },
        index=y.index,
    )
    mod = PanelOLS(data.y, data.x, other_effects=effects)
    mod.fit()
    ints = np.random.randint(0, y.shape[2] // 2, size=(nobs, 2))
    # Two-letter string codes for one cluster dimension, ints for the other
    pairs = list(
        map(lambda s: "".join(s), list(product(ascii_lowercase, ascii_lowercase)))
    )
    cluster_cols = {
        "var.cluster.0": pd.Series(
            np.random.choice(pairs, size=nobs), index=y.index
        ),
        "var.cluster.1": pd.Series(ints[:, 1], index=y.index),
    }
    mod.fit(
        cov_type="clustered",
        clusters=pd.DataFrame(cluster_cols, index=y.index),
    )
def test_mixed_input(data):
    """Mixed categorical/string effects and mixed-type clusters are accepted."""
    y = PanelData(data.y)
    nt = y.values2d.shape[0]
    int_codes = np.random.randint(0, 5, size=nt)
    labels = ['a', 'b', 'c', 'd', 'e']
    columns = {
        'effect.0': pd.Categorical(pd.Series(int_codes, index=y.index)),
        'effect.1': pd.Series(np.random.choice(labels, size=nt), index=y.index),
    }
    mod = PanelOLS(data.y, data.x,
                   other_effects=pd.DataFrame(columns, index=y.index))
    mod.fit()
    raw_clusters = np.random.randint(0, y.shape[2] // 2, size=(nt, 2))
    # Two-letter string codes for one cluster dimension, ints for the other
    two_letter = list(
        map(lambda s: ''.join(s),
            list(product(ascii_lowercase, ascii_lowercase))))
    columns = {
        'var.cluster.0': pd.Series(np.random.choice(two_letter, size=nt),
                                   index=y.index),
        'var.cluster.1': pd.Series(raw_clusters[:, 1], index=y.index),
    }
    mod.fit(cov_type='clustered',
            clusters=pd.DataFrame(columns, index=y.index))
def test_results_access(data):
    """Every public results attribute is reachable for each model variant."""
    specs = [
        {"entity_effects": True},
        {"other_effects": data.c},
        {"time_effects": True, "entity_effects": True},
        {},
    ]
    for kwargs in specs:
        res = PanelOLS(data.y, data.x, **kwargs).fit()
        access_attributes(res)
    # Constant-only regressor exercises the has_constant code paths
    const = PanelData(data.y).copy()
    const.dataframe.iloc[:, :] = 1
    const.dataframe.columns = ["const"]
    access_attributes(PanelOLS(data.y, const).fit())
def test_multiple_obs_per_entity(data):
    """BetweenOLS matches IV2SLS run on entity means under each cov_type."""
    mod = BetweenOLS(data.y, data.x)
    res = mod.fit(reweight=True, debiased=False)
    dep = mod.dependent.values3d.mean(1).T
    exog = pd.DataFrame(mod.exog.values3d.mean(1).T, columns=mod.exog.vars)
    ols = IV2SLS(dep, exog, None, None)
    assert_results_equal(res, ols.fit(cov_type="unadjusted"))
    res = mod.fit(cov_type="robust", debiased=False)
    assert_results_equal(res, ols.fit(cov_type="robust", debiased=False))
    # Constant-within-entity clusters, collapsed to one value per entity
    clusters = mod.dependent.dataframe.copy()
    clusters.loc[:, :] = 0
    clusters = clusters.astype(np.int32)
    for entity in mod.dependent.entities:
        clusters.loc[entity] = np.random.randint(9)
    ols_clusters = PanelData(clusters).values3d.mean(1).T.astype(np.int32)
    res = mod.fit(cov_type="clustered", clusters=clusters, debiased=False)
    assert_results_equal(
        res, ols.fit(cov_type="clustered", clusters=ols_clusters)
    )
def test_two_way_clustering(data):
    """Two-way clustered covariance agrees with IV2SLS on the pooled data."""
    mod = PooledOLS(data.y, data.x)
    y = PanelData(data.y)
    entity_clusters = pd.DataFrame(y.entity_ids, index=y.index)
    clusters = PanelData(data.vc1).copy()
    clusters.dataframe['var.cluster.entity'] = entity_clusters
    clusters._frame = clusters._frame.astype(np.int64)
    res = mod.fit(cov_type='clustered', clusters=clusters, debiased=False)
    # Re-run through IV2SLS on a flat integer index for comparison
    dep = mod.dependent.dataframe.copy()
    exog = mod.exog.dataframe.copy()
    dep.index = np.arange(len(dep))
    exog.index = dep.index
    reformatted = mod.reformat_clusters(clusters)
    ols_res = IV2SLS(dep, exog, None, None).fit(
        cov_type='clustered', clusters=reformatted.dataframe)
    assert_results_equal(res, ols_res)
def test_incorrect_time_axis():
    """Non-time-like time axes are rejected for Panel, DataFrame and DataArray.

    NOTE(review): still depends on ``pd.Panel`` (removed in pandas >= 1.0),
    so the test as a whole only runs against legacy pandas.
    """
    x = np.random.randn(3, 3, 1000)
    entities = ['entity.{0}'.format(i) for i in range(1000)]
    # BUG FIX: the original comprehensions never called .format(i), producing
    # three identical 'time.{0}' / 'var.{0}' labels.
    time = ['time.{0}'.format(i) for i in range(3)]
    vars = ['var.{0}'.format(i) for i in range(3)]
    p = pd.Panel(x, items=vars, major_axis=time, minor_axis=entities)
    with pytest.raises(ValueError):
        PanelData(p)
    df = p.swapaxes(1, 2).swapaxes(0, 1).to_frame()
    with pytest.raises(ValueError):
        PanelData(df)
    da = xr.DataArray(x,
                      coords={'entities': entities, 'time': time,
                              'vars': vars},
                      dims=['vars', 'time', 'entities'])
    with pytest.raises(ValueError):
        PanelData(da)
    # BUG FIX: pd.datetime was removed in pandas >= 2.0; pd.Timestamp is a
    # datetime subclass, preserving the mixed-type axis this test needs.
    time = [1, pd.Timestamp(1960, 1, 1), 'a']
    vars = ['var.{0}'.format(i) for i in range(3)]
    p = pd.Panel(x, items=vars, major_axis=time, minor_axis=entities)
    with pytest.raises(ValueError):
        PanelData(p)
    df = p.swapaxes(1, 2).swapaxes(0, 1).to_frame()
    with pytest.raises(ValueError):
        PanelData(df)
    da = xr.DataArray(x,
                      coords={'entities': entities, 'time': time,
                              'vars': vars},
                      dims=['vars', 'time', 'entities'])
    with pytest.raises(ValueError):
        PanelData(da)
def test_general_unit_weighted_demean_twoway(panel):
    """Weighted two-way general_demean matches demean('both') and a weighted
    dummy regression."""
    np.random.seed(12345)
    y = PanelData(panel)
    weights = pd.DataFrame(
        np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10,
        index=y.index)
    w = PanelData(weights)
    dm1 = y.demean('both', weights=w)
    g = pd.DataFrame(y.entity_ids, index=y.index)
    g['column2'] = pd.Series(y.time_ids.squeeze(), index=y.index)
    dm2 = y.general_demean(g, weights=w)
    assert_allclose(dm1.values2d - dm2.values2d,
                    np.zeros_like(dm2.values2d), atol=1e-7)
    # Arbitrary two-way grouping versus explicit weighted dummy projection
    g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
    dm2 = y.general_demean(g, weights=w)
    g1 = pd.Categorical(g.iloc[:, 0])
    d1 = pd.get_dummies(g1)
    g2 = pd.Categorical(g.iloc[:, 1])
    d2 = pd.get_dummies(g2, drop_first=True)
    d = np.c_[d1, d2]
    wd = np.sqrt(w.values2d) * d
    wy = np.sqrt(w.values2d) * y.values2d
    # BUG FIX: pass rcond=None explicitly -- omitting it raises a
    # FutureWarning in NumPy and uses the old, less accurate default cutoff.
    dm1 = wy - wd @ np.linalg.lstsq(wd, wy, rcond=None)[0]
    assert_allclose(dm1 - dm2.values2d, np.zeros_like(dm2.values2d),
                    atol=1e-7)
def test_numpy_1d():
    """A 1-d array cannot form a panel and must be rejected."""
    vector = np.random.random(11)
    with pytest.raises(ValueError):
        PanelData(vector)
def test_general_weighted_demean_oneway(mi_df):
    """Weighted one-way general_demean matches weighted demean and a weighted
    dummy regression."""
    pdata = PanelData(mi_df)
    wts = DataFrame(
        np.random.chisquare(10, (pdata.dataframe.shape[0], 1)) / 10,
        index=pdata.index)
    w = PanelData(wts)
    for dim, ids in (("entity", pdata.entity_ids),
                     ("time", pdata.time_ids)):
        direct = pdata.demean(dim, weights=w)
        groups = PanelData(DataFrame(ids, index=pdata.index))
        assert_allclose(direct.values2d,
                        pdata.general_demean(groups, w).values2d)
    # Arbitrary grouping versus explicit weighted dummy projection
    groups = PanelData(
        DataFrame(np.random.randint(0, 10, groups.dataframe.shape),
                  index=pdata.index))
    general = pdata.general_demean(groups, w)
    dummies = get_dummies(Categorical(groups.dataframe.iloc[:, 0]))
    wd = np.sqrt(w.values2d) * dummies
    wy = np.sqrt(w.values2d) * pdata.values2d
    direct = wy - wd @ lstsq(wd, wy, rcond=None)[0]
    assert_allclose(direct, general.values2d, atol=1e-14)
def test_general_unit_weighted_demean_oneway(mi_df):
    """Unit weights in general_demean reproduce the unweighted result."""
    pdata = PanelData(mi_df)
    direct = pdata.demean("entity")
    groups = PanelData(DataFrame(pdata.entity_ids, index=pdata.index))
    unit = PanelData(groups).copy()
    unit.dataframe.iloc[:, :] = 1
    weighted = pdata.general_demean(groups, unit)
    assert_allclose(direct.values2d, weighted.values2d)
    unweighted = pdata.general_demean(groups)
    assert_allclose(unweighted.values2d, weighted.values2d)
    direct = pdata.demean("time")
    groups = PanelData(DataFrame(pdata.time_ids, index=pdata.index))
    weighted = pdata.general_demean(groups, unit)
    assert_allclose(direct.values2d, weighted.values2d)
    unweighted = pdata.general_demean(groups)
    assert_allclose(unweighted.values2d, weighted.values2d)
    # Arbitrary grouping versus explicit dummy projection
    groups = PanelData(
        DataFrame(np.random.randint(0, 10, groups.dataframe.shape),
                  index=pdata.index))
    weighted = pdata.general_demean(groups, unit)
    unweighted = pdata.general_demean(groups)
    dummies = get_dummies(Categorical(groups.dataframe.iloc[:, 0]))
    direct = pdata.values2d - dummies @ lstsq(dummies, pdata.values2d,
                                              rcond=None)[0]
    assert_allclose(direct, weighted.values2d)
    assert_allclose(unweighted.values2d, weighted.values2d)
def test_repr_html(mi_df):
    """The HTML repr contains HTML line breaks."""
    rendered = PanelData(mi_df)._repr_html_()
    assert "<br/>" in rendered
def test_demean_simple_weighted(data):
    """Demeaning with unit weights equals unweighted demeaning."""
    x = PanelData(data.x)
    w = PanelData(data.w)
    dropped = x.isnull | w.isnull
    x.drop(dropped)
    w.drop(dropped)
    # Force every weight to one
    w.dataframe.iloc[:, 0] = 1
    for dim in ("entity", "time"):
        plain = x.demean(dim)
        weighted = x.demean(dim, weights=w)
        assert_allclose(plain.dataframe, weighted.dataframe)
def test_first_difference(data):
    """Smoke test: first_difference runs without error."""
    PanelData(data.x).first_difference()
def test_invalid_seires(mi_df):
    # NOTE(review): name typo -- "seires" should be "series"; kept unchanged
    # to preserve the collected test id.
    """A single column from a reset index is not panel-shaped and must raise."""
    flat = mi_df.reset_index()
    with pytest.raises(ValueError):
        PanelData(flat.iloc[:, 0])