def test_incorrect_time_axis():
    """Check that PanelData rejects time labels that are not numeric or dates.

    Two frames are built: one whose time labels are strings and one whose
    time level mixes an int, a datetime and a string.  Constructing a
    PanelData from either is expected to raise ValueError.
    """
    x = np.random.randn(3, 3, 1000)
    entities = [f"entity.{i}" for i in range(1000)]
    var_names = [f"var.{i}" for i in range(3)]

    # Case 1: string labels on the time axis.
    time = [f"time.{i}" for i in range(3)]
    p = panel_to_frame(
        x, items=var_names, major_axis=time, minor_axis=entities, swap=True
    )
    with pytest.raises(ValueError):
        PanelData(p)

    # Case 2: start from valid integer labels, then replace level 1 of the
    # index with mixed-type labels.
    time = [1, 2, 3]
    var_names = [f"var.{i}" for i in range(3)]
    p = panel_to_frame(
        x, items=var_names, major_axis=time, minor_axis=entities, swap=True
    )
    # ``level`` has to be passed by keyword: the positional form was
    # deprecated in pandas 1.3 and is rejected from pandas 2.0 onwards.
    p.index = p.index.set_levels([1, datetime(1960, 1, 1), "a"], level=1)
    with pytest.raises(ValueError):
        PanelData(p)
def test_existing_panel_data():
    """Wrapping a PanelData in PanelData must give an equal dataframe."""
    n, t, k = 11, 7, 3
    values = np.random.random((k, t, n))

    # Axis labels for the (variables, time, entities) array.
    time_labels = date_range("12-31-1999", periods=7)
    var_labels = [f"var.{i}" for i in range(1, k + 1)]
    entity_labels = [f"entities.{i}" for i in range(1, n + 1)]

    frame = panel_to_frame(
        values,
        items=var_labels,
        major_axis=time_labels,
        minor_axis=entity_labels,
        swap=True,
    )

    first = PanelData(frame)
    second = PanelData(first)
    assert_frame_equal(first.dataframe, second.dataframe)
def test_panel_to_midf():
    """Compare panel_to_frame output against hand-built MultiIndex frames.

    Three cases are checked: the default layout, the layout produced when the
    fifth positional argument is True, and that same layout with random
    string entity labels and date time labels.
    """
    x = np.random.standard_normal((3, 7, 100))

    # Case 1: default layout, integer labels on every axis.
    df = panel_to_frame(x, list(range(3)), list(range(7)), list(range(100)))
    index = pd.MultiIndex.from_product([list(range(7)), list(range(100))])
    expected = pd.DataFrame(index=index, columns=[0, 1, 2])
    for column, plane in enumerate(x):
        expected[column] = plane.ravel()
    expected.index.names = ["major", "minor"]
    pd.testing.assert_frame_equal(df, expected)

    # Case 2: same data with the final positional flag set.  The expected
    # frame is the first one re-sorted and with its index levels exchanged.
    expected2 = expected.copy().sort_index(level=[1, 0])
    expected2.index = expected2.index.swaplevel(0, 1)
    expected2.index.names = ["major", "minor"]
    df2 = panel_to_frame(
        x, list(range(3)), list(range(7)), list(range(100)), True
    )
    pd.testing.assert_frame_equal(df2, expected2)

    # Case 3: random 10-letter entity names and year-end dates.
    entities = [
        "".join(random.choice(string.ascii_lowercase) for __ in range(10))
        for _ in range(100)
    ]
    times = pd.date_range("1999-12-31", freq="A-DEC", periods=7)
    var_names = [f"x.{i}" for i in range(1, 4)]
    df3 = panel_to_frame(x, var_names, times, entities, True)

    index = pd.MultiIndex.from_product([times, entities])
    expected3 = pd.DataFrame(index=index, columns=var_names)
    for name, plane in zip(var_names, x):
        expected3[name] = plane.ravel()
    expected3.index = expected3.index.swaplevel(0, 1)
    # Re-order the rows so that entities vary slowest.
    index = pd.MultiIndex.from_product([entities, times])
    expected3 = expected3.loc[index]
    expected3.index.names = ["major", "minor"]
    pd.testing.assert_frame_equal(df3, expected3)
def test_string_nonconversion():
    """With convert_dummies=False a string column must stay a string column."""
    t, n = 3, 1000
    labels = np.random.choice(["a", "b", "c"], (t, n))
    numbers = np.random.randn(t, n)

    dates = date_range("1-1-2000", periods=t)
    entity_names = [f"entity.{i}" for i in range(n)]

    # Build an empty two-column frame and then fill the columns by hand.
    frame = panel_to_frame(
        None,
        items=["a", "b"],
        major_axis=dates,
        minor_axis=entity_names,
        swap=True,
    )
    flat_labels = labels.T.ravel()
    frame["a"] = flat_labels
    frame["b"] = numbers.T.ravel()

    panel = PanelData(frame, var_name="OtherEffect", convert_dummies=False)
    column = panel.dataframe["a"]
    assert is_string_dtype(column.dtype)
    assert np.all(column == labels.T.ravel())
def test_demean_both_large_t():
    """Check demean("both") against an explicit dummy-variable projection.

    The reference result removes the projection of the data onto a full set
    of dummies for index level 0 plus a drop-first set for index level 1.
    Absolute values (shifted by one) are compared.
    """
    values = np.random.standard_normal((1, 100, 10))
    dates = date_range("1-1-2000", periods=100)
    entity_names = [f"entity.{i}" for i in range(10)]
    panel = PanelData(
        panel_to_frame(values, ["x"], dates, entity_names, swap=True)
    )
    demeaned = panel.demean("both")

    frame = panel.dataframe
    flat = frame.reset_index()
    levels = frame.index.levels

    # Full dummy set for level 0, drop-first dummy set for level 1.
    first_cat = Categorical(flat[levels[0].name])
    first_dummies = get_dummies(first_cat, drop_first=False).astype(np.float64)
    second_cat = Categorical(flat[levels[1].name])
    second_dummies = get_dummies(second_cat, drop_first=True).astype(
        np.float64
    )

    design = np.c_[first_dummies.values, second_dummies.values]
    expected = frame.values - design @ pinv(design) @ frame.values
    assert_allclose(1 + np.abs(demeaned.values2d), 1 + np.abs(expected))
def first_difference(self) -> "PanelData":
    """
    Compute first differences of variables

    Returns
    -------
    PanelData
        Differenced values
    """
    panel = self.panel
    levels = panel.values
    # Difference along axis 1: each slice minus the slice before it.
    later, earlier = levels[:, 1:], levels[:, :-1]
    frame = panel_to_frame(
        later - earlier,
        panel.items,
        panel.major_axis[1:],  # the first label has no predecessor
        panel.minor_axis,
        True,
    )
    # Align with the stored frame's index, then drop rows with any NaN.
    aligned = frame.reindex(self._frame.index)
    return PanelData(aligned.dropna(how="any"))
def test_numpy_3d():
    """A 3-d ndarray input must round-trip through PanelData.

    Checks the 3-d and 2-d value views, the reported dimensions, and the
    generated dataframe (including the default labels and index names).
    """
    n, t, k = 11, 7, 3
    x = np.random.random((k, t, n))
    dh = PanelData(x)

    assert_equal(x, dh.values3d)
    assert dh.nentity == n
    assert dh.nobs == t
    assert dh.nvar == k
    assert_equal(x.T.reshape((n * t, k)), dh.values2d)

    # Labels expected for unlabeled array input.
    entity_labels = [f"entity.{i}" for i in range(n)]
    time_labels = list(range(t))
    var_labels = [f"x.{i}" for i in range(k)]
    expected_frame = panel_to_frame(
        x.reshape((k, t, n)),
        items=var_labels,
        major_axis=time_labels,
        minor_axis=entity_labels,
        swap=True,
    )
    expected_frame.index = expected_frame.index.set_names(["entity", "time"])
    assert_frame_equal(dh.dataframe, expected_frame)
def __init__(
    self,
    x: "PanelDataLike",
    var_name: str = "x",
    convert_dummies: bool = True,
    drop_first: bool = True,
    copy: bool = True,
) -> None:
    """
    Normalise the input into a DataFrame stored in ``self._frame``.

    Parameters
    ----------
    x : PanelDataLike
        A PanelData, Series, DataFrame, ndarray or (when xarray can be
        imported) DataArray.
    var_name : str
        Column name used when a DataFrame without a MultiIndex is stacked,
        and prefix for generated variable names with ndarray input.
    convert_dummies : bool
        If True, the frame is passed through ``expand_categoricals`` and
        then cast to float64.
    drop_first : bool
        Forwarded to ``expand_categoricals``.
    copy : bool
        If True, a DataFrame input that has a MultiIndex is copied.

    Raises
    ------
    ValueError
        If a DataArray or ndarray is not 2-d or 3-d, a Series lacks a
        MultiIndex, a DataFrame MultiIndex does not have 2 levels, or the
        time level is neither numeric nor datetime-like.
    TypeError
        If the input is none of the supported types.
    """
    self._var_name = var_name
    self._convert_dummies = convert_dummies
    self._drop_first = drop_first
    self._panel: Optional[_Panel] = None
    self._shape: Optional[Tuple[int, int, int]] = None
    # Default index level names; may be replaced by the input's own names.
    index_names = ["entity", "time"]
    # An existing PanelData is unwrapped to its dataframe.
    if isinstance(x, PanelData):
        x = x.dataframe
    self._original = x

    if not isinstance(x, (Series, DataFrame, np.ndarray)):
        # xarray is imported lazily.  If the import fails, x is left
        # untouched and ends up in the TypeError branch below.
        try:
            from xarray import DataArray

            if isinstance(x, DataArray):
                if x.ndim not in (2, 3):
                    raise ValueError(
                        "Only 2-d or 3-d DataArrays are supported")
                if x.ndim == 2:
                    x = x.to_pandas()
                else:
                    # Coordinates of the three dims, in dim order, become
                    # the items, major and minor labels.
                    items: List[Hashable] = np.asarray(
                        x.coords[x.dims[0]]).tolist()
                    major: List[Hashable] = np.asarray(
                        x.coords[x.dims[1]]).tolist()
                    minor: List[Hashable] = np.asarray(
                        x.coords[x.dims[2]]).tolist()
                    values = x.values
                    x = panel_to_frame(values, items, major, minor, True)
        except ImportError:
            pass

    # A Series is only accepted when it carries a MultiIndex.
    if isinstance(x, Series) and isinstance(x.index, MultiIndex):
        x = DataFrame(x)
    elif isinstance(x, Series):
        raise ValueError(
            "Series can only be used with a 2-level MultiIndex")

    if isinstance(x, DataFrame):
        if isinstance(x.index, MultiIndex):
            if len(x.index.levels) != 2:
                raise ValueError("DataFrame input must have a "
                                 "MultiIndex with 2 levels")
            # Keep the caller's level names where set; self._original is
            # never a PanelData here because it was unwrapped above.
            if isinstance(self._original, (DataFrame, PanelData, Series)):
                for i in range(2):
                    index_names[
                        i] = x.index.levels[i].name or index_names[i]
            self._frame = x
            if copy:
                self._frame = self._frame.copy()
        else:
            # No MultiIndex: transpose and stack into a single column
            # named var_name, keeping missing values.
            self._frame = DataFrame({var_name: x.T.stack(dropna=False)})
    elif isinstance(x, np.ndarray):
        if x.ndim not in (2, 3):
            raise ValueError("2 or 3-d array required for numpy input")
        if x.ndim == 2:
            # Add a leading axis of length one.
            x = x[None, :, :]

        k, t, n = x.shape
        # Generated labels are zero-padded; the pad width is
        # int(log10(count) + 0.01).
        var_str = var_name + ".{0:0>" + str(int(np.log10(k) + 0.01)) + "}"
        variables = [var_name] if k == 1 else [
            var_str.format(i) for i in range(k)
        ]
        entity_str = "entity.{0:0>" + str(int(np.log10(n) + 0.01)) + "}"
        entities = [entity_str.format(i) for i in range(n)]
        time = list(range(t))
        # NOTE(review): presumably only narrows the type for static
        # checkers; this branch already guarantees an ndarray.
        assert isinstance(x, np.ndarray)
        x = x.astype(np.float64, copy=False)
        panel = _Panel.from_array(x,
                                  items=variables,
                                  major_axis=time,
                                  minor_axis=entities)
        self._fake_panel = panel
        self._frame = panel.to_frame()
    else:
        raise TypeError("Only ndarrays, DataFrames or DataArrays are "
                        "supported")
    if convert_dummies:
        self._frame = expand_categoricals(self._frame, drop_first)
        self._frame = self._frame.astype(np.float64, copy=False)

    # Level 1 of the index is validated as the time dimension.
    # NOTE(review): self.index is defined outside this block.
    time_index = Series(self.index.levels[1])
    if not (is_numeric_dtype(time_index.dtype)
            or is_datetime64_any_dtype(time_index.dtype)):
        raise ValueError("The index on the time dimension must be either "
                         "numeric or date-like")
    # self._k, self._t, self._n = self.panel.shape
    self._k, self._t, self._n = self.shape
    self._frame.index.set_names(index_names, inplace=True)
def generate_panel_data(
    nentity: int = 971,
    ntime: int = 7,
    nexog: int = 5,
    const: bool = False,
    missing: float = 0,
    other_effects: int = 2,
    ncats: Union[int, List[int]] = 4,
    rng: Optional[np.random.RandomState] = None,
) -> PanelModelData:
    """
    Simulate panel data

    Parameters
    ----------
    nentity : int, default 971
        The number of entities in the panel.
    ntime : int, default 7
        The number of time periods in the panel.
    nexog : int, default 5
        The number of explanatory variables in the dataset.
    const : bool, default False
        Flag indicating that the model should include a constant.
    missing : float, default 0
        The fraction of values that are missing. Should be between 0 and 1.
        Locations are drawn with replacement, so the realized share of
        missing values can be smaller.
    other_effects : int, default 2
        The number of other effects generated.
    ncats : Union[int, Sequence[int]], default 4
        The number of categories to use in other_effects and variance
        clusters. If a list or tuple, then it must have at least as many
        elements as other_effects.
    rng : RandomState, default None
        A NumPy RandomState instance. If not provided, one is initialized
        using a fixed seed.

    Returns
    -------
    PanelModelData
        A namedtuple derived class containing 4 DataFrames:

        * `data` - A simulated data with variables y and x# for # in
          0,...,nexog-1. If const is True, then also contains a column
          named const.
        * `weights` - Simulated non-negative weights.
        * `other_effects` - Simulated effects.
        * `clusters` - Simulated data to use in clustered covariance
          estimation.
    """
    if rng is None:
        rng = np.random.RandomState(
            [
                0xA14E2429,
                0x448D2E51,
                0x91B558E7,
                0x6A3F5CD2,
                0x22B43ABB,
                0xE746C92D,
                0xCE691A7D,
                0x66746EE7,
            ]
        )

    n, t, k = nentity, ntime, nexog
    k += int(const)
    x = rng.standard_normal((k, t, n))
    beta = np.arange(1, k + 1)[:, None, None] / k
    y: NDArray = (
        (x * beta).sum(0)
        + rng.standard_normal((t, n))
        + 2 * rng.standard_normal((1, n))
    )
    w = rng.chisquare(5, (t, n)) / 5

    c: Optional[NDArray] = None
    cats = [f"cat.{i}" for i in range(other_effects)]
    if other_effects:
        # Tuples are accepted as well as lists; any other value is treated
        # as a single count shared by every effect.
        if isinstance(ncats, (list, tuple)):
            cat_counts = list(ncats)
        else:
            cat_counts = [ncats] * other_effects
        _c = [
            rng.randint(0, cat_counts[i], (1, t, n))
            for i in range(other_effects)
        ]
        c = np.concatenate(_c, 0)

    vcats = [f"varcat.{i}" for i in range(2)]
    # One integer draw per entity, repeated across all time periods.
    vc2 = np.ones((2, t, 1)) @ rng.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]

    if const:
        x[0] = 1.0

    if missing > 0:
        locs = rng.choice(n * t, int(n * t * missing))
        # TODO:: Fix typing in later version of numpy
        y.flat[locs] = np.nan  # type: ignore
        locs = rng.choice(n * t * k, int(n * t * k * missing))
        # TODO:: Fix typing in later version of numpy
        x.flat[locs] = np.nan  # type: ignore

    entities = [f"firm{i}" for i in range(n)]
    time = date_range("1-1-1900", periods=t, freq="A-DEC")
    var_names = [f"x{i}" for i in range(k)]
    if const:
        # Shift the names so that they read const, x0, x1, ...
        var_names[1:] = var_names[:-1]
        var_names[0] = "const"

    # y = DataFrame(y, index=time, columns=entities)
    y_df = panel_to_frame(
        y[None], items=["y"], major_axis=time, minor_axis=entities, swap=True
    )
    index = y_df.index
    w_df = panel_to_frame(
        w[None], items=["w"], major_axis=time, minor_axis=entities, swap=True
    )
    w_df = w_df.reindex(index)
    x_df = panel_to_frame(
        x, items=var_names, major_axis=time, minor_axis=entities, swap=True
    )
    x_df = x_df.reindex(index)
    c_df = panel_to_frame(
        c, items=cats, major_axis=time, minor_axis=entities, swap=True
    )
    other_eff = c_df.reindex(index)
    vc1_df = panel_to_frame(
        vc1, items=vcats[:1], major_axis=time, minor_axis=entities, swap=True
    )
    vc1_df = vc1_df.reindex(index)
    vc2_df = panel_to_frame(
        vc2, items=vcats, major_axis=time, minor_axis=entities, swap=True
    )
    vc2_df = vc2_df.reindex(index)
    # NOTE(review): no axis is given here, so the two cluster frames are
    # concatenated along the default axis, unlike ``data`` below.
    clusters = concat([vc1_df, vc2_df], sort=False)
    data = concat([y_df, x_df], axis=1, sort=False)
    return PanelModelData(data, w_df, other_eff, clusters)
def generate_data(
    missing,
    datatype,
    const=False,
    ntk=(971, 7, 5),
    other_effects=0,
    rng=None,
    num_cats=4,
):
    """
    Simulate panel data as arrays, DataFrames or DataArrays

    Parameters
    ----------
    missing : float
        Scales the number of locations set to NaN in y and x; locations
        are drawn with replacement. No values are removed unless positive.
    datatype : str
        "pandas" converts the outputs to DataFrames and "xarray" converts
        them to DataArrays. Any other value leaves them as arrays.
    const : bool
        If True, one extra variable is added and the first variable is set
        to 1.0.
    ntk : tuple
        Number of entities, time periods and variables, in that order.
    other_effects : int
        Number of categorical effects to simulate.
    rng : RandomState, optional
        If given, its state is copied into the global NumPy generator
        before simulating and updated from it afterwards. Otherwise the
        global generator is seeded with 12345.
    num_cats : int or list
        Upper bound(s) passed to randint for the categorical effects.

    Returns
    -------
    AttrDict
        Holds y, x, w, c, vc1 and vc2. With array output, c is None when
        other_effects is 0.
    """
    if rng is not None:
        np.random.set_state(rng.get_state())
    else:
        np.random.seed(12345)

    n, t, k = ntk
    k += const
    x = standard_normal((k, t, n))
    beta = np.arange(1, k + 1)[:, None, None] / k
    fitted = (x * beta).sum(0)
    noise = standard_normal((t, n))
    entity_noise = standard_normal((1, n))
    y = fitted + noise + 2 * entity_noise
    w = np.random.chisquare(5, (t, n)) / 5

    cats = (
        ["Industries"]
        if other_effects == 1
        else [f"cat.{i}" for i in range(other_effects)]
    )
    c = None
    if other_effects:
        counts = num_cats
        if not isinstance(counts, list):
            counts = [counts] * other_effects
        draws = [
            np.random.randint(0, counts[i], (1, t, n))
            for i in range(other_effects)
        ]
        c = np.concatenate(draws, 0)

    vcats = [f"varcat.{i}" for i in range(2)]
    vc2 = np.ones((2, t, 1)) @ np.random.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]

    if const:
        x[0] = 1.0

    if missing > 0:
        y_locs = np.random.choice(n * t, int(n * t * missing))
        y.flat[y_locs] = np.nan
        x_locs = np.random.choice(n * t * k, int(n * t * k * missing))
        x.flat[x_locs] = np.nan

    if datatype in ("pandas", "xarray"):
        entities = [f"firm{i}" for i in range(n)]
        time = date_range("1-1-1900", periods=t, freq="A-DEC")
        var_names = [f"x{i}" for i in range(k)]

        def as_frame(values, names):
            # Shared axis labels; only the values and item names differ.
            return panel_to_frame(
                values,
                items=names,
                major_axis=time,
                minor_axis=entities,
                swap=True,
            )

        # y = DataFrame(y, index=time, columns=entities)
        y = as_frame(y[None], ["y"])
        w = as_frame(w[None], ["w"]).reindex(y.index)
        x = as_frame(x, var_names).reindex(y.index)
        c = as_frame(c, cats).reindex(y.index)
        vc1 = as_frame(vc1, vcats[:1]).reindex(y.index)
        vc2 = as_frame(vc2, vcats).reindex(y.index)

        if datatype == "xarray":
            # TODO: This is broken now, need to transform MultiIndex to xarray 3d
            import xarray as xr

            def as_data_array(frame, names):
                return xr.DataArray(
                    PanelData(frame).values3d,
                    coords={"entities": entities, "time": time, "vars": names},
                    dims=["vars", "time", "entities"],
                )

            x = as_data_array(x, var_names)
            y = as_data_array(y, ["y"])
            w = as_data_array(w, ["w"])
            # A frame without columns is returned as a DataFrame.
            if c.shape[1] > 0:
                c = as_data_array(c, c.columns)
            vc1 = as_data_array(vc1, vc1.columns)
            vc2 = as_data_array(vc2, vc2.columns)

    if rng is not None:
        rng.set_state(np.random.get_state())

    return AttrDict(y=y, x=x, w=w, c=c, vc1=vc1, vc2=vc2)
# Simulate a panel with an intercept and k regressors, then flatten it into
# one DataFrame ``z`` with "firm" and "time" columns.
# NOTE(review): x, k, t and n are defined before this fragment.
x[0, :, :] = 1
beta = np.arange(1, k + 2) / (k + 1)
eps = np.random.randn(t, n)
beta.shape = (k + 1, 1, 1)
y = (beta * x).sum(0) + eps
# Add one extra shock per entity, shared across all time periods.
y += np.random.randn(1, n)

# Weights are constant over time for each entity and scaled to mean one.
w = np.random.chisquare(10, size=(1, n)) / 10.0
w = np.ones((t, 1)) @ w
w = w / w.mean()

items = ["x" + str(i) for i in range(1, k + 1)]
items = ["intercept"] + items
major = pd.date_range("12-31-1999", periods=t, freq="A-DEC")
minor = ["firm." + str(i) for i in range(1, n + 1)]

x = panel_to_frame(x, items, major, minor, swap=True)
y = panel_to_frame(y[None, :], ["y"], major, minor, swap=True)
w = panel_to_frame(w[None, :], ["w"], major, minor, swap=True)

x = PanelData(x)
y = PanelData(y)
w = PanelData(w)

# ``axis`` must be passed by keyword: pandas 2.0 made every concat argument
# after ``objs`` keyword-only.
z = pd.concat([x.dataframe, y.dataframe, w.dataframe], axis=1, sort=False)

# Name the levels with set_names; assigning ``.name`` on an element of
# ``MultiIndex.levels`` raises RuntimeError on pandas >= 1.0.
final_index = pd.MultiIndex.from_product([minor, major]).set_names(
    ["firm", "time"]
)
z = z.reindex(final_index)
z.index = z.index.set_names(["firm", "time"])
z = z.reset_index()
def generate_data(
    missing: float,
    datatype: Literal["pandas", "xarray", "numpy"],
    const: bool = False,
    ntk: tuple[int, int, int] = (971, 7, 5),
    other_effects: int = 0,
    rng: RandomState | None = None,
    num_cats: int | list[int] = 4,
):
    """
    Simulate panel data as arrays, DataFrames or DataArrays

    Parameters
    ----------
    missing : float
        Scales the number of locations set to NaN in y and x; locations
        are drawn with replacement. No values are removed unless positive.
    datatype : {"pandas", "xarray", "numpy"}
        Container type of the returned values.
    const : bool
        If True, one extra variable is added and the first variable is set
        to 1.0.
    ntk : tuple[int, int, int]
        Number of entities, time periods and variables, in that order.
    other_effects : int
        Number of categorical effects to simulate.
    rng : RandomState, optional
        If given, its state is copied into the global NumPy generator
        before simulating and updated from it afterwards. Otherwise the
        global generator is seeded with 12345.
    num_cats : int or list[int]
        Upper bound(s) passed to randint for the categorical effects.

    Returns
    -------
    AttrDict
        Holds y, x, w, c, vc1 and vc2 in the requested container type.
    """
    if rng is None:
        np.random.seed(12345)
    else:
        np.random.set_state(rng.get_state())

    n, t, k = ntk
    k += const
    x = standard_normal((k, t, n))
    beta = np.arange(1, k + 1)[:, None, None] / k
    y = np.empty((t, n), dtype=np.float64)
    y[:, :] = (
        (x * beta).sum(0)
        + standard_normal((t, n))
        + 2 * standard_normal((1, n))
    )
    w = np.random.chisquare(5, (t, n)) / 5

    # Placeholder with zero columns; ``c.shape[1]`` is used below to tell
    # whether any effects were simulated.
    c = np.empty((y.size, 0), dtype=int)
    if other_effects == 1:
        cats = ["Industries"]
    else:
        cats = ["cat." + str(i) for i in range(other_effects)]
    if other_effects:
        counts = num_cats
        if isinstance(counts, int):
            counts = [counts] * other_effects
        oe = [
            np.random.randint(0, counts[i], (1, t, n))
            for i in range(other_effects)
        ]
        c = np.concatenate(oe, 0)

    vcats = ["varcat." + str(i) for i in range(2)]
    # One integer draw per entity, repeated across all time periods.
    vc2 = np.ones((2, t, 1)) @ np.random.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]

    if const:
        x[0] = 1.0

    if missing > 0:
        locs = np.random.choice(n * t, int(n * t * missing))
        y.flat[locs] = np.nan
        locs = np.random.choice(n * t * k, int(n * t * k * missing))
        x.flat[locs] = np.nan

    # All random draws are done; hand the advanced state back to the caller.
    if rng is not None:
        rng.set_state(np.random.get_state())

    if datatype == "numpy":
        return AttrDict(y=y, x=x, w=w, c=c, vc1=vc1, vc2=vc2)

    entities = ["firm" + str(i) for i in range(n)]
    time = date_range("1-1-1900", periods=t, freq="A-DEC")
    var_names = ["x" + str(i) for i in range(k)]

    def as_frame(values, names):
        # Shared axis labels; only the values and item names differ.
        return panel_to_frame(
            values,
            items=names,
            major_axis=time,
            minor_axis=entities,
            swap=True,
        )

    # y = DataFrame(y, index=time, columns=entities)
    y_df = as_frame(y[None], ["y"])
    w_df = as_frame(w[None], ["w"]).reindex(y_df.index)
    x_df = as_frame(x, var_names).reindex(y_df.index)
    if c.shape[1]:
        c_df = as_frame(c, cats)
    else:
        c_df = DataFrame(index=y_df.index)
    c_df = c_df.reindex(y_df.index)
    vc1_df = as_frame(vc1, vcats[:1]).reindex(y_df.index)
    vc2_df = as_frame(vc2, vcats).reindex(y_df.index)

    if datatype == "pandas":
        return AttrDict(y=y_df, x=x_df, w=w_df, c=c_df, vc1=vc1_df, vc2=vc2_df)

    assert datatype == "xarray"
    import xarray as xr
    from xarray.core.dtypes import NA

    def as_data_array(values, names):
        return xr.DataArray(
            values,
            coords={"entities": entities, "time": time, "vars": names},
            dims=["vars", "time", "entities"],
        )

    x_xr = as_data_array(PanelData(x_df).values3d, var_names)
    y_xr = as_data_array(PanelData(y_df).values3d, ["y"])
    w_xr = as_data_array(PanelData(w_df).values3d, ["w"])
    # Without simulated effects the array is built from the NA scalar.
    c_vals = PanelData(c_df).values3d if c.shape[1] else NA
    c_xr = as_data_array(c_vals, c_df.columns)
    vc1_xr = as_data_array(PanelData(vc1_df).values3d, vc1_df.columns)
    vc2_xr = as_data_array(PanelData(vc2_df).values3d, vc2_df.columns)

    return AttrDict(y=y_xr, x=x_xr, w=w_xr, c=c_xr, vc1=vc1_xr, vc2=vc2_xr)