def test_concat_sort(data):
    """Check the column order produced by column-wise ``concat`` calls.

    ``data`` must expose ``df1``, ``df2`` and ``s``.  The axis is passed by
    keyword because pandas 2.0 made every ``concat`` argument after ``objs``
    keyword-only; a positional ``1`` raises ``TypeError`` there.
    """
    a = concat([data.df1, data.df2], axis=1)
    b = concat([data.df1, data.df2, data.s], axis=1)
    c = concat([data.df1, data.df2, data.s], axis=1, sort=True)
    d = concat([data.df2, data.df1, data.s], axis=1, sort=False)

    assert list(a.columns) == ['A', 'B', 'C']
    assert list(b.columns) == ['A', 'B', 'C', 'D']
    assert list(c.columns) == ['A', 'B', 'C', 'D']
    # Swapping df1 and df2 in the input list changes the output column order.
    assert list(d.columns) == ['B', 'C', 'A', 'D']
def test_concat_sort(data):
    """Verify column ordering of ``concat`` along axis 1, with and without ``sort``.

    ``data`` must provide ``df1``, ``df2`` and ``s``.  ``axis`` is given as a
    keyword: pandas 2.0 only accepts ``objs`` positionally, so the former
    ``concat(objs, 1)`` spelling fails with ``TypeError``.
    """
    a = concat([data.df1, data.df2], axis=1)
    b = concat([data.df1, data.df2, data.s], axis=1)
    c = concat([data.df1, data.df2, data.s], axis=1, sort=True)
    d = concat([data.df2, data.df1, data.s], axis=1, sort=False)

    assert list(a.columns) == ["A", "B", "C"]
    assert list(b.columns) == ["A", "B", "C", "D"]
    assert list(c.columns) == ["A", "B", "C", "D"]
    # Input order (df2 before df1) is reflected in the resulting columns.
    assert list(d.columns) == ["B", "C", "A", "D"]
def fit(request):
    """Fit the system model selected by ``request.param``.

    Returns
    -------
    tuple
        ``(stata, res, rtol)`` where ``stata`` is ``results[method]``, ``res``
        is the fitted model result and ``rtol`` is the relative tolerance
        chosen for the method.
    """
    method = request.param
    data = generate_simultaneous_data()
    if "ols" in method or "sur" in method:
        model_cls = SUR
        # For the SUR path, merge the endogenous columns into exog and remove
        # the endog/instruments entries before the model is constructed.
        for key in data:
            eqn = data[key]
            # axis must be a keyword: pandas 2.0 rejects a positional axis.
            eqn["exog"] = concat([eqn["exog"], eqn["endog"]], axis=1)
            del eqn["endog"]
            del eqn["instruments"]
    else:
        model_cls = IV3SLS

    fit_method = "ols" if ("ols" in method or "2sls" in method) else "gls"
    model = model_cls(data)
    iterate = "ireg3" in method
    stata = results[method]
    debiased = method in ("ols", "2sls")
    # A looser tolerance is used for the iterated ("ireg3") methods.
    decimal = 3 if "ireg3" in method else 5
    rtol = 10**-decimal
    res = model.fit(
        cov_type="unadjusted",
        method=fit_method,
        debiased=debiased,
        iterate=iterate,
    )
    return stata, res, rtol
def _get_series_property(self, name: str) -> DataFrame: out: List[Tuple[str, Series]] = [(k, getattr(v, name)) for k, v in self._results.items()] cols = [v[0] for v in out] values = concat([v[1] for v in out], axis=1) values.columns = cols return values
def fit(request):
    """Fit the system model selected by ``request.param``.

    Returns
    -------
    tuple
        ``(stata, res, rtol)``: the ``results`` entry for the method, the
        fitted model result, and the relative tolerance for the method.
    """
    method = request.param
    data = generate_simultaneous_data()
    if 'ols' in method or 'sur' in method:
        model_cls = SUR
        # For the SUR path, merge the endogenous columns into exog and remove
        # the endog/instruments entries before the model is constructed.
        for key in data:
            eqn = data[key]
            # axis must be a keyword: pandas 2.0 rejects a positional axis.
            eqn['exog'] = concat([eqn['exog'], eqn['endog']], axis=1)
            del eqn['endog']
            del eqn['instruments']
    else:
        model_cls = IV3SLS

    fit_method = 'ols' if ('ols' in method or '2sls' in method) else 'gls'
    model = model_cls(data)
    iterate = 'ireg3' in method
    stata = results[method]
    debiased = method in ('ols', '2sls')
    # A looser tolerance is used for the iterated ("ireg3") methods.
    decimal = 2 if 'ireg3' in method else 5
    rtol = 10**-decimal
    res = model.fit(cov_type='unadjusted', method=fit_method,
                    debiased=debiased, iterate=iterate)
    return stata, res, rtol
def data():
    """Generate pandas-format test data and attach a ``joined`` frame.

    ``joined`` is ``out.factors`` and ``out.portfolios`` concatenated
    column-wise.  ``axis`` is passed by keyword because pandas 2.0 no longer
    accepts it positionally.
    """
    premia = np.array([.1, .1, .1])
    out = generate_data(nportfolio=10, output='pandas', alpha=True,
                        premia=premia)
    out['joined'] = concat([out.factors, out.portfolios], axis=1)
    return out
def test_formula_equivalence_weights(data):
    """Check that weighted array and formula ``IVSystemGMM`` models agree.

    Random weights are attached to a copy of every equation for the array
    model; the same weights are passed to ``from_formula`` together with a
    frame holding uniquely named columns for every equation.  The fitted
    parameters of both models must match.
    """
    weights = AttrDict()
    eqn_copy = AttrDict()
    for key in data.eqns:
        # Shallow copy so the shared fixture is not modified.
        eqn = dict(data.eqns[key].items())
        nobs = eqn["dependent"].shape[0]
        w = np.random.chisquare(2, (nobs, 1)) / 2
        weights[key] = w
        eqn["weights"] = w
        eqn_copy[key] = eqn

    mod = IVSystemGMM(eqn_copy, weight_type="unadjusted")
    df = []
    formulas = {}
    for i, key in enumerate(data.eqns):
        eqn = data.eqns[key]
        ex = eqn.exog
        en = eqn.endog
        instr = eqn.instruments
        dep = DataFrame(eqn.dependent, columns=["dep_{0}".format(i)])
        has_const = False
        if np.any(np.all(ex == 1, 0)):
            # An all-ones column is taken to be a leading constant: column 0
            # is dropped and an explicit "1" term is added to the formula.
            ex = ex[:, 1:]
            has_const = True
        ex = DataFrame(
            ex, columns=["ex_{0}_{1}".format(i, j) for j in range(ex.shape[1])]
        )
        en = DataFrame(
            en, columns=["en_{0}_{1}".format(i, j) for j in range(en.shape[1])]
        )
        # Size the names from the instruments themselves, not from exog, so
        # equations with differing column counts do not raise.
        instr = DataFrame(
            instr,
            columns=["instr_{0}_{1}".format(i, j) for j in range(instr.shape[1])],
        )
        fmla = "".join(dep.columns) + " ~ "
        if has_const:
            fmla += " 1 + "
        fmla += " + ".join(ex.columns) + " + ["
        fmla += " + ".join(en.columns) + " ~ "
        fmla += " + ".join(instr.columns) + " ] "
        formulas[key] = fmla
        df.extend([dep, ex, en, instr])

    # axis must be a keyword: pandas 2.0 rejects a positional axis.
    df = concat(df, axis=1)
    formula_mod = IVSystemGMM.from_formula(
        formulas, df, weights=weights, weight_type="unadjusted"
    )
    res = mod.fit(cov_type="unadjusted")
    formula_res = formula_mod.fit(cov_type="unadjusted")
    assert_allclose(res.params, formula_res.params)
def test_formula_equivalence(data):
    """Check that array- and formula-specified ``IVSystemGMM`` models agree.

    Every equation's arrays are wrapped in uniquely named DataFrames and a
    matching formula string is built; both models are fit with
    ``cov_type="unadjusted"`` and their parameters compared.
    """
    mod = IVSystemGMM(data.eqns, weight_type="unadjusted")
    formula = []
    df = []
    for i, key in enumerate(data.eqns):
        eqn = data.eqns[key]
        ex = eqn.exog
        en = eqn.endog
        instr = eqn.instruments
        dep = DataFrame(eqn.dependent, columns=["dep_{0}".format(i)])
        has_const = False
        if np.any(np.all(ex == 1, 0)):
            # An all-ones column is taken to be a leading constant: column 0
            # is dropped and an explicit "1" term is added to the formula.
            ex = ex[:, 1:]
            has_const = True
        ex = DataFrame(
            ex, columns=["ex_{0}_{1}".format(i, j) for j in range(ex.shape[1])]
        )
        en = DataFrame(
            en, columns=["en_{0}_{1}".format(i, j) for j in range(en.shape[1])]
        )
        # Size the names from the instruments themselves, not from exog, so
        # equations with differing column counts do not raise.
        instr = DataFrame(
            instr,
            columns=["instr_{0}_{1}".format(i, j) for j in range(instr.shape[1])],
        )
        fmla = "".join(dep.columns) + " ~ "
        if has_const:
            fmla += " 1 + "
        fmla += " + ".join(ex.columns) + " + ["
        fmla += " + ".join(en.columns) + " ~ "
        fmla += " + ".join(instr.columns) + " ] "
        formula.append(fmla)
        df.extend([dep, ex, en, instr])

    formulas = {"eq{0}".format(i): f for i, f in enumerate(formula)}
    # axis must be a keyword: pandas 2.0 rejects a positional axis.
    df = concat(df, axis=1)
    formula_mod = IVSystemGMM.from_formula(formulas, df, weight_type="unadjusted")
    res = mod.fit(cov_type="unadjusted")
    formula_res = formula_mod.fit(cov_type="unadjusted")
    assert_allclose(res.params, formula_res.params)
def test_predict_formula_function(data, model_and_func):
    """Check that predictions agree between array inputs and ``data=``.

    The formula applies ``sigmoid`` and ``np.exp`` to regressors; the same
    transformed regressors are assembled by hand and passed to ``predict``.
    Results from the model class and from the functional interface are both
    compared with the first prediction.
    """
    model, func = model_and_func
    fmla = 'y ~ 1 + sigmoid(x3) + x4 + [x1 + x2 ~ z1 + z2 + z3] + np.exp(x5)'
    mod = model.from_formula(fmla, data)
    res = mod.fit()

    exog_parts = [data[['Intercept']], sigmoid(data[['x3']]),
                  data[['x4']], np.exp(data[['x5']])]
    # axis must be a keyword: pandas 2.0 rejects a positional axis.
    exog = concat(exog_parts, axis=1)
    endog = data[['x1', 'x2']]

    pred = res.predict(exog, endog)
    pred2 = res.predict(data=data)
    assert_frame_equal(pred, pred2)
    assert_allclose(res.fitted_values, pred)

    res2 = func(fmla, data).fit()
    pred3 = res2.predict(exog, endog)
    pred4 = res2.predict(data=data)
    assert_frame_equal(pred, pred3)
    assert_frame_equal(pred, pred4)
def test_formula_function(data, model_and_func):
    """Check formula models with function terms against array-built models.

    The parameters of the formula fit, the array fit and the functional
    interface fit must be equal.  The array-built result is also expected to
    raise ``ValueError`` when asked to predict from ``data=``.
    """
    model, func = model_and_func
    fmla = 'y ~ 1 + sigmoid(x3) + x4 + [x1 + x2 ~ z1 + z2 + z3] + np.exp(x5)'
    mod = model.from_formula(fmla, data)
    res = mod.fit()

    dep = data.y
    exog_parts = [data[['Intercept']], sigmoid(data[['x3']]),
                  data[['x4']], np.exp(data[['x5']])]
    # axis must be a keyword: pandas 2.0 rejects a positional axis.
    exog = concat(exog_parts, axis=1)
    endog = data[['x1', 'x2']]
    instr = data[['z1', 'z2', 'z3']]
    mod = model(dep, exog, endog, instr)
    res2 = mod.fit()
    assert_equal(res.params.values, res2.params.values)

    res3 = func(fmla, data).fit()
    assert_equal(res.params.values, res3.params.values)

    with pytest.raises(ValueError):
        res2.predict(data=data)
def test_formula_function(data, model_and_func):
    """Compare a formula model using function terms with its array equivalent.

    Parameters from the formula fit, the array fit and the functional
    interface fit must be equal.  Predicting from ``data=`` with the
    array-built result is expected to raise ``ValueError``.
    """
    model, func = model_and_func
    fmla = "y ~ 1 + sigmoid(x3) + x4 + [x1 + x2 ~ z1 + z2 + z3] + np.exp(x5)"
    mod = model.from_formula(fmla, data)
    res = mod.fit()

    dep = data.y
    exog_parts = [
        data[["Intercept"]],
        sigmoid(data[["x3"]]),
        data[["x4"]],
        np.exp(data[["x5"]]),
    ]
    # axis must be a keyword: pandas 2.0 rejects a positional axis.
    exog = concat(exog_parts, axis=1)
    endog = data[["x1", "x2"]]
    instr = data[["z1", "z2", "z3"]]
    mod = model(dep, exog, endog, instr)
    res2 = mod.fit()
    assert_equal(res.params.values, res2.params.values)

    res3 = func(fmla, data).fit()
    assert_equal(res.params.values, res3.params.values)

    with pytest.raises(ValueError):
        res2.predict(data=data)
def test_formula_equivalence(data):
    """Check that array- and formula-specified ``IVSystemGMM`` models agree.

    Every equation's arrays are wrapped in uniquely named DataFrames and a
    matching formula string is built; both models are fit with
    ``cov_type='unadjusted'`` and their parameters compared.
    """
    from collections import OrderedDict

    mod = IVSystemGMM(data.eqns, weight_type='unadjusted')
    formula = []
    df = []
    for i, key in enumerate(data.eqns):
        eqn = data.eqns[key]
        ex = eqn.exog
        en = eqn.endog
        instr = eqn.instruments
        dep = DataFrame(eqn.dependent, columns=['dep_{0}'.format(i)])
        has_const = False
        if np.any(np.all(ex == 1, 0)):
            # An all-ones column is taken to be a leading constant: column 0
            # is dropped and an explicit "1" term is added to the formula.
            ex = ex[:, 1:]
            has_const = True
        ex = DataFrame(ex, columns=['ex_{0}_{1}'.format(i, j)
                                    for j in range(ex.shape[1])])
        en = DataFrame(en, columns=['en_{0}_{1}'.format(i, j)
                                    for j in range(en.shape[1])])
        # Size the names from the instruments themselves, not from exog, so
        # equations with differing column counts do not raise.
        instr = DataFrame(instr, columns=['instr_{0}_{1}'.format(i, j)
                                          for j in range(instr.shape[1])])
        fmla = ''.join(dep.columns) + ' ~ '
        if has_const:
            fmla += ' 1 + '
        fmla += ' + '.join(ex.columns) + ' + ['
        fmla += ' + '.join(en.columns) + ' ~ '
        fmla += ' + '.join(instr.columns) + ' ] '
        formula.append(fmla)
        df.extend([dep, ex, en, instr])

    # Equation labels follow the order in which the formulas were built.
    formulas = OrderedDict()
    for i, f in enumerate(formula):
        formulas['eq{0}'.format(i)] = f
    # axis must be a keyword: pandas 2.0 rejects a positional axis.
    df = concat(df, axis=1)
    formula_mod = IVSystemGMM.from_formula(formulas, df,
                                           weight_type='unadjusted')
    res = mod.fit(cov_type='unadjusted')
    formula_res = formula_mod.fit(cov_type='unadjusted')
    assert_allclose(res.params, formula_res.params)
w = w / w.mean() items = ['x' + str(i) for i in range(1, k + 1)] items = ['intercept'] + items major = pd.date_range('12-31-1999', periods=t, freq='A-DEC') minor = ['firm.' + str(i) for i in range(1, n + 1)] x = panel_to_frame(x, items, major, minor, swap=True) y = panel_to_frame(y[None, :], ['y'], major, minor, swap=True) w = panel_to_frame(w[None, :], ['w'], major, minor, swap=True) x = PanelData(x) y = PanelData(y) w = PanelData(w) z = concat([x.dataframe, y.dataframe, w.dataframe], 1) final_index = pd.MultiIndex.from_product([minor, major]) final_index.levels[0].name = 'firm' z = z.reindex(final_index) z.index.levels[0].name = 'firm' z.index.levels[1].name = 'time' z = z.reset_index() z['firm_id'] = z.firm.astype('category') z['firm_id'] = z.firm_id.cat.codes vars = ['y', 'x1', 'x2', 'x3', 'x4', 'x5'] missing = 0.05 for v in vars: locs = np.random.choice(n * t, int(n * t * missing)) temp = z[v].copy()
def _get_series_property(self, name): out = ([(k, getattr(v, name)) for k, v in self._results.items()]) cols = [v[0] for v in out] values = concat([v[1] for v in out], 1) values.columns = cols return values
if np.any(locs): dep.flat[locs] = np.nan exog = missing_data[key]['exog'] locs = np.where(np.random.random_sample(np.prod(exog.shape)) < 0.02)[0] if np.any(locs): exog.flat[locs] = np.nan out = [] for i, dataset in enumerate((basic_data, common_data, missing_data)): base = 'mod_{0}'.format(i) for j, key in enumerate(dataset): dep = dataset[key]['dependent'] dep = pd.DataFrame(dep, columns=[base + '_y_{0}'.format(j)]) dataset[key]['dependent'] = dep exog = dataset[key]['exog'][:, 1:] exog_cols = [ base + '_x_{0}{1}'.format(j, k) for k in range(exog.shape[1]) ] exog = pd.DataFrame(exog, columns=exog_cols) exog = exog.copy() exog['cons'] = 1.0 dataset[key]['exog'] = exog if i != 1 or j == 0: out.extend([dep, exog]) else: out.extend([dep]) if __name__ == '__main__': df = concat(out, 1) df.to_stata('simulated-sur.dta')
def generate_panel_data(
    nentity: int = 971,
    ntime: int = 7,
    nexog: int = 5,
    const: bool = False,
    missing: float = 0,
    other_effects: int = 2,
    ncats: Union[int, List[int]] = 4,
    rng: Optional[np.random.RandomState] = None,
) -> PanelModelData:
    """
    Simulate panel data with weights, other effects and cluster variables.

    Parameters
    ----------
    nentity : int, default 971
        The number of entities in the panel.
    ntime : int, default 7
        The number of time periods in the panel.
    nexog : int, default 5
        The number of explanatory variables in the dataset.
    const : bool, default False
        Flag indicating that the model should include a constant.
    missing : float, default 0
        The share of values that are set to missing. The implementation
        draws ``int(n * t * missing)`` positions, i.e. the value is used as
        a fraction, so it should be between 0 and 1.
    other_effects : int, default 2
        The number of other effects generated.
    ncats : Union[int, Sequence[int]], default 4
        The number of categories to use in other_effects and variance
        clusters. If list-like, then it must have as many elements
        as other_effects.
    rng : RandomState, default None
        A NumPy RandomState instance. If not provided, one is initialized
        using a fixed seed.

    Returns
    -------
    PanelModelData
        A namedtuple derived class containing 4 DataFrames:

        * `data` - A simulated data with variables y and x# for # in 0,...,4.
          If const is True, then also contains a column named const.
        * `weights` - Simulated non-negative weights.
        * `other_effects` - Simulated effects.
        * `clusters` - Simulated data to use in clustered covariance
          estimation.
    """
    if rng is None:
        # Fixed seed values so repeated calls without ``rng`` start from the
        # same generator state.
        rng = np.random.RandomState(
            [
                0xA14E2429,
                0x448D2E51,
                0x91B558E7,
                0x6A3F5CD2,
                0x22B43ABB,
                0xE746C92D,
                0xCE691A7D,
                0x66746EE7,
            ]
        )

    n, t, k = nentity, ntime, nexog
    # The constant, when requested, occupies one extra regressor slot.
    k += int(const)
    x = rng.standard_normal((k, t, n))
    # Coefficients 1/k, 2/k, ..., 1, broadcast over the (t, n) dimensions.
    beta = np.arange(1, k + 1)[:, None, None] / k
    # The (1, n) draw is broadcast across the t rows, so each column of y
    # shares one additive term over time.
    y = (
        (x * beta).sum(0)
        + rng.standard_normal((t, n))
        + 2 * rng.standard_normal((1, n))
    )
    w = rng.chisquare(5, (t, n)) / 5
    c = None
    cats = [f"cat.{i}" for i in range(other_effects)]
    if other_effects:
        # A scalar ncats is applied to every effect.
        if not isinstance(ncats, list):
            ncats = [ncats] * other_effects
        c = []
        for i in range(other_effects):
            nc = ncats[i]
            c.append(rng.randint(0, nc, (1, t, n)))
        c = np.concatenate(c, 0)

    vcats = [f"varcat.{i}" for i in range(2)]
    # The (2, 1, n) integer draw is repeated along the t dimension by the
    # matrix product with a column of ones.
    vc2 = np.ones((2, t, 1)) @ rng.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]
    if const:
        # Overwrite the first regressor with the constant.
        x[0] = 1.0

    if missing > 0:
        locs = rng.choice(n * t, int(n * t * missing))
        y.flat[locs] = np.nan
        locs = rng.choice(n * t * k, int(n * t * k * missing))
        x.flat[locs] = np.nan

    entities = [f"firm{i}" for i in range(n)]
    time = date_range("1-1-1900", periods=t, freq="A-DEC")
    var_names = [f"x{i}" for i in range(k)]
    if const:
        # Shift the x# names right by one and label the first column "const".
        var_names[1:] = var_names[:-1]
        var_names[0] = "const"
    # Every frame below is reindexed to the index of y_df so rows align.
    y_df = panel_to_frame(
        y[None], items=["y"], major_axis=time, minor_axis=entities, swap=True
    )
    index = y_df.index
    w_df = panel_to_frame(
        w[None], items=["w"], major_axis=time, minor_axis=entities, swap=True
    )
    w_df = w_df.reindex(index)
    x_df = panel_to_frame(
        x, items=var_names, major_axis=time, minor_axis=entities, swap=True
    )
    x_df = x_df.reindex(index)
    # NOTE(review): c is None when other_effects == 0; this relies on
    # panel_to_frame accepting None - confirm.
    c_df = panel_to_frame(
        c, items=cats, major_axis=time, minor_axis=entities, swap=True
    )
    other_eff = c_df.reindex(index)
    vc1_df = panel_to_frame(
        vc1, items=vcats[:1], major_axis=time, minor_axis=entities, swap=True
    )
    vc1_df = vc1_df.reindex(index)
    vc2_df = panel_to_frame(
        vc2, items=vcats, major_axis=time, minor_axis=entities, swap=True
    )
    vc2_df = vc2_df.reindex(index)
    # NOTE(review): no axis is given, so the two cluster frames are stacked
    # row-wise, unlike ``data`` below which is joined with axis=1 - confirm
    # this is intended.
    clusters = concat([vc1_df, vc2_df])
    data = concat([y_df, x_df], axis=1)
    return PanelModelData(data, w_df, other_eff, clusters)
def expand_categoricals(x, drop_first):
    """Run every column of ``x`` through ``convert_columns`` and rejoin them.

    A frame without columns is returned unchanged; otherwise the converted
    columns are concatenated along ``axis=1`` in their original order.
    """
    if x.shape[1] != 0:
        pieces = [convert_columns(x[col], drop_first) for col in x.columns]
        return concat(pieces, axis=1)
    return x
from linearmodels.tests.system._utility import generate_simultaneous_data data = generate_simultaneous_data() all_cols = [] out = [] for key in data: eqn = data[key] for key in ('exog', 'endog'): vals = eqn[key] for col in vals: if col in all_cols: continue else: out.append(vals[col]) all_cols.append(col) out = concat(out, 1) if 'const' in out: out.pop('const') out.to_stata('simulated-3sls.dta', write_index=False) SEP = """ file open myfile using {outfile}, write append file write myfile "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! {method} !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" _n file close myfile """ # add , 2sls to get non GLS estimator CMD = """ reg3 (dependent_0 dependent_1 dependent_2 exog_1 exog_2 exog_3 exog_4 exog_5) /// (dependent_1 dependent_0 dependent_2 exog_1 exog_2 exog_3 exog_6 exog_7) ///
def data(): premia = np.array([0.1, 0.1, 0.1]) out = generate_data(nportfolio=10, output="pandas", alpha=True, premia=premia) out["joined"] = concat([out.factors, out.portfolios], 1) return out
def expand_categoricals(x: AnyPandas, drop_first: bool) -> AnyPandas:
    """Run every column of ``x`` through ``convert_columns`` and rejoin them.

    Parameters
    ----------
    x : AnyPandas
        Input whose columns are converted one at a time.
    drop_first : bool
        Forwarded unchanged to ``convert_columns``.

    Returns
    -------
    AnyPandas
        ``x`` itself when it has no columns, otherwise the converted columns
        concatenated along ``axis=1``.
    """
    has_columns = x.shape[1] != 0
    if not has_columns:
        return x
    converted = []
    for col in x.columns:
        converted.append(convert_columns(x[col], drop_first))
    return concat(converted, axis=1)
def expand_categoricals(x: DataFrame, drop_first: bool) -> DataFrame:
    """Run every column of ``x`` through ``convert_columns`` and rejoin them.

    Parameters
    ----------
    x : DataFrame
        Frame whose columns are converted one at a time.
    drop_first : bool
        Forwarded unchanged to ``convert_columns``.

    Returns
    -------
    DataFrame
        ``x`` itself when it has no columns, otherwise the converted columns
        concatenated along ``axis=1``.
    """
    # concat raises ValueError on an empty list, so a frame with no columns
    # is handed back as-is instead of being expanded.
    if x.shape[1] == 0:
        return x
    return concat([convert_columns(x[c], drop_first) for c in x.columns], axis=1)