Example #1
0
def test_incorrect_time_axis():
    """PanelData raises ValueError for string or mixed-type time levels."""
    x = np.random.randn(3, 3, 1000)
    entities = ["entity.{0}".format(i) for i in range(1000)]
    var_names = ["var.{0}".format(i) for i in range(3)]

    # Case 1: the time axis consists of strings.
    time = ["time.{0}".format(i) for i in range(3)]
    p = panel_to_frame(x,
                       items=var_names,
                       major_axis=time,
                       minor_axis=entities,
                       swap=True)
    with pytest.raises(ValueError):
        PanelData(p)

    # Case 2: start from a numeric time axis, then replace level 1 with
    # values of mixed type (int, datetime, str).
    time = [1, 2, 3]
    p = panel_to_frame(x,
                       items=var_names,
                       major_axis=time,
                       minor_axis=entities,
                       swap=True)
    # ``level`` must be passed by keyword: it is keyword-only in pandas 2.x,
    # and the keyword form is also accepted by older releases.
    p.index = p.index.set_levels([1, datetime(1960, 1, 1), "a"], level=1)
    with pytest.raises(ValueError):
        PanelData(p)
Example #2
0
def test_existing_panel_data():
    """Wrapping a PanelData in PanelData yields an equal dataframe."""
    n, t, k = 11, 7, 3
    values = np.random.random((k, t, n))
    dates = date_range("12-31-1999", periods=7)
    var_labels = [f"var.{i}" for i in range(1, k + 1)]
    entity_labels = [f"entities.{i}" for i in range(1, n + 1)]
    frame = panel_to_frame(
        values,
        items=var_labels,
        major_axis=dates,
        minor_axis=entity_labels,
        swap=True,
    )
    original = PanelData(frame)
    rewrapped = PanelData(original)
    assert_frame_equal(original.dataframe, rewrapped.dataframe)
def test_panel_to_midf():
    """Compare panel_to_frame output against hand-built MultiIndex frames."""
    x = np.random.standard_normal((3, 7, 100))

    def int_axes():
        # Build fresh label lists on every use.
        return list(range(3)), list(range(7)), list(range(100))

    # Default orientation with integer labels.
    df = panel_to_frame(x, *int_axes())
    _, majors, minors = int_axes()
    mi = pd.MultiIndex.from_product([majors, minors])
    expected = pd.DataFrame(index=mi, columns=[0, 1, 2])
    for col, plane in enumerate(x):
        expected[col] = plane.ravel()
    expected.index.names = ["major", "minor"]
    pd.testing.assert_frame_equal(df, expected)

    # Swapped orientation: sort by the second level first, then swap levels.
    expected2 = expected.copy().sort_index(level=[1, 0])
    expected2.index = expected2.index.swaplevel(0, 1)
    expected2.index.names = ["major", "minor"]
    df2 = panel_to_frame(x, *int_axes(), True)
    pd.testing.assert_frame_equal(df2, expected2)

    # Swapped orientation with random 10-letter entity names and dates.
    entities = [
        "".join(random.choice(string.ascii_lowercase) for __ in range(10))
        for _ in range(100)
    ]
    times = pd.date_range("1999-12-31", freq="A-DEC", periods=7)
    var_names = [f"x.{i}" for i in range(1, 4)]
    df3 = panel_to_frame(x, var_names, times, entities, True)
    mi = pd.MultiIndex.from_product([times, entities])
    expected3 = pd.DataFrame(index=mi, columns=var_names)
    for name, plane in zip(var_names, x):
        expected3[name] = plane.ravel()
    expected3.index = expected3.index.swaplevel(0, 1)
    mi = pd.MultiIndex.from_product([entities, times])
    expected3 = expected3.loc[mi]
    expected3.index.names = ["major", "minor"]
    pd.testing.assert_frame_equal(df3, expected3)
Example #4
0
def test_string_nonconversion():
    """With convert_dummies=False a string column keeps its string dtype."""
    t, n = 3, 1000
    letters = np.random.choice(["a", "b", "c"], (t, n))
    numbers = np.random.randn(t, n)
    dates = date_range("1-1-2000", periods=t)
    entity_labels = [f"entity.{i}" for i in range(n)]
    # Start from an empty frame and fill the two columns by hand.
    frame = panel_to_frame(
        None,
        items=["a", "b"],
        major_axis=dates,
        minor_axis=entity_labels,
        swap=True,
    )
    frame["a"] = letters.T.ravel()
    frame["b"] = numbers.T.ravel()
    panel = PanelData(frame, var_name="OtherEffect", convert_dummies=False)
    column = panel.dataframe["a"]
    assert is_string_dtype(column.dtype)
    assert np.all(panel.dataframe["a"] == letters.T.ravel())
Example #5
0
def test_demean_both_large_t():
    """Compare demean("both") with residuals from a dummy projection."""
    values = np.random.standard_normal((1, 100, 10))
    dates = date_range("1-1-2000", periods=100)
    entity_labels = [f"entity.{i}" for i in range(10)]
    data = PanelData(
        panel_to_frame(values, ["x"], dates, entity_labels, swap=True))
    demeaned = data.demean("both")

    df = data.dataframe
    no_index = df.reset_index()
    # Full dummy set for index level 0; the first dummy is dropped for level 1.
    blocks = []
    for level, drop in ((0, False), (1, True)):
        cat = Categorical(no_index[df.index.levels[level].name])
        dummies = get_dummies(cat, drop_first=drop).astype(np.float64)
        blocks.append(dummies.values)
    d = np.hstack(blocks)
    dummy_demeaned = df.values - d @ pinv(d) @ df.values
    # Compare magnitudes shifted by one, as the original assertion does.
    assert_allclose(1 + np.abs(demeaned.values2d), 1 + np.abs(dummy_demeaned))
Example #6
0
    def first_difference(self) -> "PanelData":
        """
        Compute first differences of variables

        Returns
        -------
        PanelData
            Differenced values, with rows containing any missing value
            removed
        """
        panel = self.panel
        values = panel.values
        # Difference consecutive slices along axis 1; the first label of the
        # major axis has no predecessor and is dropped.
        changes = values[:, 1:] - values[:, :-1]
        frame = panel_to_frame(
            changes,
            panel.items,
            panel.major_axis[1:],
            panel.minor_axis,
            True,
        )
        # Align to the stored frame's index, then drop incomplete rows.
        aligned = frame.reindex(self._frame.index)
        return PanelData(aligned.dropna(how="any"))
Example #7
0
def test_numpy_3d():
    """PanelData built from a 3-d array reports matching shape and frame."""
    n, t, k = 11, 7, 3
    x = np.random.random((k, t, n))
    dh = PanelData(x)
    assert_equal(x, dh.values3d)
    assert dh.nentity == n
    assert dh.nobs == t
    assert dh.nvar == k
    assert_equal(np.reshape(x.T, (n * t, k)), dh.values2d)

    # Rebuild the expected frame from the same values.
    entity_labels = [f"entity.{i}" for i in range(n)]
    time_labels = list(range(t))
    var_labels = [f"x.{i}" for i in range(k)]
    expected_frame = panel_to_frame(
        x.reshape((k, t, n)),
        items=var_labels,
        major_axis=time_labels,
        minor_axis=entity_labels,
        swap=True,
    )
    expected_frame.index = expected_frame.index.set_names(["entity", "time"])
    assert_frame_equal(dh.dataframe, expected_frame)
Example #8
0
    def __init__(
        self,
        x: "PanelDataLike",
        var_name: str = "x",
        convert_dummies: bool = True,
        drop_first: bool = True,
        copy: bool = True,
    ):
        """
        Build the internal 2-level MultiIndex DataFrame from the input

        Parameters
        ----------
        x : PanelDataLike
            Input data. Handled here are another PanelData, a Series or
            DataFrame, a 2-d or 3-d ndarray and, when xarray can be
            imported, a 2-d or 3-d DataArray.
        var_name : str, default "x"
            Column name used when a DataFrame without a MultiIndex is
            stacked, and the variable name (or name prefix) used for
            ndarray input.
        convert_dummies : bool, default True
            If True, the frame is passed through ``expand_categoricals``
            and then cast to float64.
        drop_first : bool, default True
            Forwarded to ``expand_categoricals``.
        copy : bool, default True
            If True, a DataFrame that already has a MultiIndex is copied
            before it is stored.

        Raises
        ------
        ValueError
            If a DataArray or ndarray is not 2-d or 3-d, if a Series has
            no MultiIndex, if a MultiIndex does not have exactly 2
            levels, or if the second index level is neither numeric nor
            datetime-like.
        TypeError
            If ``x`` is none of the supported input types.
        """
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first
        self._panel: Optional[_Panel] = None
        self._shape: Optional[Tuple[int, int, int]] = None
        # Default index level names; overridden below by names from the input.
        index_names = ["entity", "time"]
        if isinstance(x, PanelData):
            # Unwrap an existing PanelData and continue with its frame.
            x = x.dataframe
        self._original = x

        if not isinstance(x, (Series, DataFrame, np.ndarray)):
            # xarray is optional: if it cannot be imported the input falls
            # through to the TypeError at the end of the type dispatch.
            try:
                from xarray import DataArray

                if isinstance(x, DataArray):
                    if x.ndim not in (2, 3):
                        raise ValueError(
                            "Only 2-d or 3-d DataArrays are supported")
                    if x.ndim == 2:
                        x = x.to_pandas()
                    else:
                        # Coordinates of dims 0, 1, 2 become the items, major
                        # and minor labels passed to panel_to_frame.
                        items: List[Hashable] = np.asarray(
                            x.coords[x.dims[0]]).tolist()
                        major: List[Hashable] = np.asarray(
                            x.coords[x.dims[1]]).tolist()
                        minor: List[Hashable] = np.asarray(
                            x.coords[x.dims[2]]).tolist()
                        values = x.values
                        x = panel_to_frame(values, items, major, minor, True)
            except ImportError:
                pass

        # A Series is only accepted when it carries a MultiIndex.
        if isinstance(x, Series) and isinstance(x.index, MultiIndex):
            x = DataFrame(x)
        elif isinstance(x, Series):
            raise ValueError(
                "Series can only be used with a 2-level MultiIndex")

        if isinstance(x, DataFrame):
            if isinstance(x.index, MultiIndex):
                if len(x.index.levels) != 2:
                    raise ValueError("DataFrame input must have a "
                                     "MultiIndex with 2 levels")
                if isinstance(self._original, (DataFrame, PanelData, Series)):
                    # Keep level names supplied with pandas input, falling
                    # back to the defaults for unnamed levels.
                    for i in range(2):
                        index_names[
                            i] = x.index.levels[i].name or index_names[i]
                self._frame = x
                if copy:
                    self._frame = self._frame.copy()
            else:
                # Plain 2-d frame: transpose and stack into a single column
                # named ``var_name``, keeping missing values.
                self._frame = DataFrame({var_name: x.T.stack(dropna=False)})
        elif isinstance(x, np.ndarray):
            if x.ndim not in (2, 3):
                raise ValueError("2 or 3-d array required for numpy input")
            if x.ndim == 2:
                # Add a leading axis so a 2-d array is a single variable.
                x = x[None, :, :]

            k, t, n = x.shape
            # Zero-padded labels; the pad width is derived from log10 of
            # the number of variables / entities.
            var_str = var_name + ".{0:0>" + str(int(np.log10(k) + 0.01)) + "}"
            variables = [var_name] if k == 1 else [
                var_str.format(i) for i in range(k)
            ]
            entity_str = "entity.{0:0>" + str(int(np.log10(n) + 0.01)) + "}"
            entities = [entity_str.format(i) for i in range(n)]
            time = list(range(t))
            assert isinstance(x, np.ndarray)
            x = x.astype(np.float64, copy=False)
            panel = _Panel.from_array(x,
                                      items=variables,
                                      major_axis=time,
                                      minor_axis=entities)
            self._fake_panel = panel
            self._frame = panel.to_frame()
        else:
            raise TypeError("Only ndarrays, DataFrames or DataArrays are "
                            "supported")
        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64, copy=False)

        # NOTE(review): ``self.index`` and ``self.shape`` are defined outside
        # this method; presumably they are derived from ``self._frame``.
        time_index = Series(self.index.levels[1])
        if not (is_numeric_dtype(time_index.dtype)
                or is_datetime64_any_dtype(time_index.dtype)):
            raise ValueError("The index on the time dimension must be either "
                             "numeric or date-like")
        # self._k, self._t, self._n = self.panel.shape
        self._k, self._t, self._n = self.shape
        # Applied in place on the stored frame's index.
        self._frame.index.set_names(index_names, inplace=True)
Example #9
0
def generate_panel_data(
    nentity: int = 971,
    ntime: int = 7,
    nexog: int = 5,
    const: bool = False,
    missing: float = 0,
    other_effects: int = 2,
    ncats: Union[int, List[int]] = 4,
    rng: Optional[np.random.RandomState] = None,
) -> PanelModelData:
    """
    Simulate panel data with regressors, weights, effects and clusters

    Parameters
    ----------
    nentity : int, default 971
        The number of entities in the panel.
    ntime : int, default 7
        The number of time periods in the panel.
    nexog : int, default 5
        The number of explanatory variables in the dataset.
    const : bool, default False
        Flag indicating that the model should include a constant. When
        True, one extra variable named const is added and set to 1.
    missing : float, default 0
        Controls how many values are set to NaN in y and in x. The number
        of positions drawn is ``int(size * missing)``, so this acts as a
        fraction of the array size. Positions are drawn using
        ``rng.choice`` with its default arguments.
    other_effects : int, default 2
        The number of other effects generated.
    ncats : Union[int, List[int]], default 4
        The number of categories to use in other_effects. A value that is
        not a list is repeated for every effect. A list is indexed once
        per effect.
    rng : RandomState, default None
        A NumPy RandomState instance. If not provided, one is initialized
        using a fixed seed.

    Returns
    -------
    PanelModelData
        Constructed from four DataFrames, in this order:

        * `data` - Simulated data with a column y followed by the x#
          columns. If const is True, the first regressor is named const.
        * `weights` - Simulated weights in a column named w.
        * `other_effects` - Simulated effects.
        * `clusters` - Simulated data to use in clustered covariance
          estimation.
    """
    if rng is None:
        seed = [
            0xA14E2429,
            0x448D2E51,
            0x91B558E7,
            0x6A3F5CD2,
            0x22B43ABB,
            0xE746C92D,
            0xCE691A7D,
            0x66746EE7,
        ]
        rng = np.random.RandomState(seed)

    n, t = nentity, ntime
    k = nexog + int(const)

    # The order of the random draws below determines the simulated values.
    x = rng.standard_normal((k, t, n))
    beta = np.arange(1, k + 1)[:, None, None] / k
    y: NDArray = (
        (x * beta).sum(0)
        + rng.standard_normal((t, n))
        + 2 * rng.standard_normal((1, n))
    )
    w = rng.chisquare(5, (t, n)) / 5

    c: Optional[NDArray] = None
    cats = [f"cat.{i}" for i in range(other_effects)]
    if other_effects:
        if not isinstance(ncats, list):
            ncats = [ncats] * other_effects
        c = np.concatenate(
            [rng.randint(0, ncats[i], (1, t, n)) for i in range(other_effects)],
            0,
        )

    vcats = [f"varcat.{i}" for i in range(2)]
    # One draw per entity, repeated across all time periods.
    vc2 = np.ones((2, t, 1)) @ rng.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]

    if const:
        x[0] = 1.0

    if missing > 0:
        y_locs = rng.choice(n * t, int(n * t * missing))
        # TODO:: Fix typing in later version of numpy
        y.flat[y_locs] = np.nan  # type: ignore
        x_locs = rng.choice(n * t * k, int(n * t * k * missing))
        # TODO:: Fix typing in later version of numpy
        x.flat[x_locs] = np.nan  # type: ignore

    entities = [f"firm{i}" for i in range(n)]
    time = date_range("1-1-1900", periods=t, freq="A-DEC")
    var_names = [f"x{i}" for i in range(nexog)]
    if const:
        var_names.insert(0, "const")

    def to_frame(values, names):
        # Shared conversion used for every simulated array.
        return panel_to_frame(
            values, items=names, major_axis=time, minor_axis=entities, swap=True
        )

    y_df = to_frame(y[None], ["y"])
    index = y_df.index
    w_df = to_frame(w[None], ["w"]).reindex(index)
    x_df = to_frame(x, var_names).reindex(index)
    other_eff = to_frame(c, cats).reindex(index)
    vc1_df = to_frame(vc1, vcats[:1]).reindex(index)
    vc2_df = to_frame(vc2, vcats).reindex(index)
    # NOTE(review): no axis is given, so the two cluster frames are stacked
    # row-wise - confirm this is intended rather than axis=1.
    clusters = concat([vc1_df, vc2_df], sort=False)
    data = concat([y_df, x_df], axis=1, sort=False)
    return PanelModelData(data, w_df, other_eff, clusters)
Example #10
0
def generate_data(
        missing,
        datatype,
        const=False,
        ntk=(971, 7, 5),
        other_effects=0,
        rng=None,
        num_cats=4,
):
    """
    Simulate panel test data as arrays, pandas frames or xarray DataArrays

    Parameters
    ----------
    missing : float
        When positive, ``int(size * missing)`` positions of y and of x are
        set to NaN.
    datatype : str
        "pandas" and "xarray" convert the simulated arrays. Any other
        value leaves them as numpy arrays.
    const : bool
        If True, one variable is added and the first variable is set to 1.
    ntk : tuple
        Number of entities, time periods and variables.
    other_effects : int
        Number of categorical effects to simulate.
    rng : RandomState, optional
        If given, its state is loaded into the global numpy generator
        before drawing and updated from it afterwards. Otherwise the
        global generator is seeded with 12345.
    num_cats : int or list
        Number of categories per effect. A non-list value is repeated.

    Returns
    -------
    AttrDict
        With keys y, x, w, c, vc1 and vc2.
    """
    if rng is not None:
        np.random.set_state(rng.get_state())
    else:
        np.random.seed(12345)

    n, t, k = ntk
    k = k + const

    # The order of the random draws below determines the simulated values.
    x = standard_normal((k, t, n))
    beta = np.arange(1, k + 1)[:, None, None] / k
    y = (x * beta).sum(0) + standard_normal((t, n)) + 2 * standard_normal(
        (1, n))
    w = np.random.chisquare(5, (t, n)) / 5

    if other_effects == 1:
        cats = ["Industries"]
    else:
        cats = ["cat." + str(i) for i in range(other_effects)]
    c = None
    if other_effects:
        if not isinstance(num_cats, list):
            num_cats = [num_cats] * other_effects
        c = np.concatenate(
            [
                np.random.randint(0, num_cats[i], (1, t, n))
                for i in range(other_effects)
            ],
            0,
        )

    vcats = ["varcat." + str(i) for i in range(2)]
    # One draw per entity, repeated across all time periods.
    vc2 = np.ones((2, t, 1)) @ np.random.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]

    if const:
        x[0] = 1.0

    if missing > 0:
        y_locs = np.random.choice(n * t, int(n * t * missing))
        y.flat[y_locs] = np.nan
        x_locs = np.random.choice(n * t * k, int(n * t * k * missing))
        x.flat[x_locs] = np.nan

    if datatype in ("pandas", "xarray"):
        entities = ["firm" + str(i) for i in range(n)]
        time = date_range("1-1-1900", periods=t, freq="A-DEC")
        var_names = ["x" + str(i) for i in range(k)]

        def as_frame(values, names):
            # Shared conversion used for every simulated array.
            return panel_to_frame(values,
                                  items=names,
                                  major_axis=time,
                                  minor_axis=entities,
                                  swap=True)

        y = as_frame(y[None], ["y"])
        w = as_frame(w[None], ["w"]).reindex(y.index)
        x = as_frame(x, var_names).reindex(y.index)
        c = as_frame(c, cats).reindex(y.index)
        vc1 = as_frame(vc1, vcats[:1]).reindex(y.index)
        vc2 = as_frame(vc2, vcats).reindex(y.index)

    if datatype == "xarray":
        # TODO: This is broken now, need to transform MultiIndex to xarray 3d
        import xarray as xr

        def as_xr(frame, labels):
            # Wrap the 3-d values of a frame with labelled coordinates.
            return xr.DataArray(
                PanelData(frame).values3d,
                coords={
                    "entities": entities,
                    "time": time,
                    "vars": labels
                },
                dims=["vars", "time", "entities"],
            )

        x = as_xr(x, var_names)
        y = as_xr(y, ["y"])
        w = as_xr(w, ["w"])
        if c.shape[1] > 0:
            c = as_xr(c, c.columns)
        vc1 = as_xr(vc1, vc1.columns)
        vc2 = as_xr(vc2, vc2.columns)

    if rng is not None:
        # Hand the advanced global state back to the caller's generator.
        rng.set_state(np.random.get_state())

    return AttrDict(y=y, x=x, w=w, c=c, vc1=vc1, vc2=vc2)
Example #11
0
# NOTE(review): x, k, t and n are defined before this fragment; x is
# presumably a (k + 1, t, n) array - confirm against the preceding code.

# The first slice of x is the intercept, so it is set to a constant.
x[0, :, :] = 1
beta = np.arange(1, k + 2) / (k + 1)
eps = np.random.randn(t, n)
beta.shape = (k + 1, 1, 1)
y = (beta * x).sum(0) + eps
# Add one draw per column, shared across all rows.
y += np.random.randn(1, n)
# Weights are constant down each column and normalised to have mean 1.
w = np.random.chisquare(10, size=(1, n)) / 10.0
w = np.ones((t, 1)) @ w
w = w / w.mean()

items = ["x" + str(i) for i in range(1, k + 1)]
items = ["intercept"] + items
major = pd.date_range("12-31-1999", periods=t, freq="A-DEC")
minor = ["firm." + str(i) for i in range(1, n + 1)]

x = panel_to_frame(x, items, major, minor, swap=True)
y = panel_to_frame(y[None, :], ["y"], major, minor, swap=True)
w = panel_to_frame(w[None, :], ["w"], major, minor, swap=True)

x = PanelData(x)
y = PanelData(y)
w = PanelData(w)

# ``axis`` must be passed by keyword; pandas 2.x rejects a positional axis.
z = pd.concat([x.dataframe, y.dataframe, w.dataframe], axis=1, sort=False)
# Name the levels through the MultiIndex itself: assigning ``.name`` on
# ``index.levels[i]`` raises in pandas >= 1.0.
final_index = pd.MultiIndex.from_product([minor, major],
                                         names=["firm", "time"])
z = z.reindex(final_index)
z.index = z.index.set_names(["firm", "time"])

# Turn the two index levels into ordinary columns named firm and time.
z = z.reset_index()
Example #12
0
def generate_data(
    missing: float,
    datatype: Literal["pandas", "xarray", "numpy"],
    const: bool = False,
    ntk: tuple[int, int, int] = (971, 7, 5),
    other_effects: int = 0,
    rng: RandomState | None = None,
    num_cats: int | list[int] = 4,
):
    """
    Simulate panel test data as arrays, pandas frames or xarray DataArrays

    Parameters
    ----------
    missing : float
        When positive, ``int(size * missing)`` positions of y and of x are
        set to NaN, so this acts as a fraction of the array size.
    datatype : {"pandas", "xarray", "numpy"}
        Container type of the returned values.
    const : bool, default False
        If True, one variable is added and the first variable is set to 1.
    ntk : tuple[int, int, int]
        Number of entities, time periods and variables.
    other_effects : int, default 0
        Number of categorical effects to simulate.
    rng : RandomState, optional
        If given, its state is loaded into the global numpy generator
        before drawing and updated from it afterwards. Otherwise the
        global generator is seeded with 12345.
    num_cats : int or list[int], default 4
        Number of categories per effect. An int is repeated for each effect.

    Returns
    -------
    AttrDict
        With keys y, x, w, c, vc1 and vc2.
    """
    if rng is not None:
        np.random.set_state(rng.get_state())
    else:
        np.random.seed(12345)

    n, t, k = ntk
    k += const

    # The order of the random draws below determines the simulated values.
    x = standard_normal((k, t, n))
    beta = np.arange(1, k + 1)[:, None, None] / k
    y = np.empty((t, n), dtype=np.float64)
    y[:, :] = (x * beta).sum(0) + standard_normal(
        (t, n)) + 2 * standard_normal((1, n))
    w = np.random.chisquare(5, (t, n)) / 5

    # Placeholder with zero columns; ``c.shape[1]`` is used below to detect
    # whether any effects were generated.
    c = np.empty((y.size, 0), dtype=int)
    if other_effects == 1:
        cats = ["Industries"]
    else:
        cats = ["cat." + str(i) for i in range(other_effects)]
    if other_effects:
        if isinstance(num_cats, int):
            num_cats = [num_cats] * other_effects
        oe = [
            np.random.randint(0, num_cats[i], (1, t, n))
            for i in range(other_effects)
        ]
        c = np.concatenate(oe, 0)

    vcats = ["varcat." + str(i) for i in range(2)]
    # One draw per entity, repeated across all time periods.
    vc2 = np.ones((2, t, 1)) @ np.random.randint(0, n // 2, (2, 1, n))
    vc1 = vc2[[0]]

    if const:
        x[0] = 1.0

    if missing > 0:
        y_locs = np.random.choice(n * t, int(n * t * missing))
        y.flat[y_locs] = np.nan
        x_locs = np.random.choice(n * t * k, int(n * t * k * missing))
        x.flat[x_locs] = np.nan
    if rng is not None:
        # Hand the advanced global state back to the caller's generator.
        rng.set_state(np.random.get_state())
    if datatype == "numpy":
        return AttrDict(y=y, x=x, w=w, c=c, vc1=vc1, vc2=vc2)

    entities = ["firm" + str(i) for i in range(n)]
    time = date_range("1-1-1900", periods=t, freq="A-DEC")
    var_names = ["x" + str(i) for i in range(k)]

    def as_frame(values, names):
        # Shared conversion used for every simulated array.
        return panel_to_frame(values,
                              items=names,
                              major_axis=time,
                              minor_axis=entities,
                              swap=True)

    y_df = as_frame(y[None], ["y"])
    w_df = as_frame(w[None], ["w"]).reindex(y_df.index)
    x_df = as_frame(x, var_names).reindex(y_df.index)
    if c.shape[1]:
        c_df = as_frame(c, cats)
    else:
        # No effects: an empty frame that shares the index of y.
        c_df = DataFrame(index=y_df.index)
    c_df = c_df.reindex(y_df.index)
    vc1_df = as_frame(vc1, vcats[:1]).reindex(y_df.index)
    vc2_df = as_frame(vc2, vcats).reindex(y_df.index)
    if datatype == "pandas":
        return AttrDict(y=y_df, x=x_df, w=w_df, c=c_df, vc1=vc1_df, vc2=vc2_df)

    assert datatype == "xarray"
    import xarray as xr
    from xarray.core.dtypes import NA

    def as_xr(values, labels):
        # Wrap values with labelled vars / time / entities coordinates.
        return xr.DataArray(
            values,
            coords={
                "entities": entities,
                "time": time,
                "vars": labels
            },
            dims=["vars", "time", "entities"],
        )

    x_xr = as_xr(PanelData(x_df).values3d, var_names)
    y_xr = as_xr(PanelData(y_df).values3d, ["y"])
    w_xr = as_xr(PanelData(w_df).values3d, ["w"])
    c_vals = PanelData(c_df).values3d if c.shape[1] else NA
    c_xr = as_xr(c_vals, c_df.columns)
    vc1_xr = as_xr(PanelData(vc1_df).values3d, vc1_df.columns)
    vc2_xr = as_xr(PanelData(vc2_df).values3d, vc2_df.columns)
    return AttrDict(y=y_xr, x=x_xr, w=w_xr, c=c_xr, vc1=vc1_xr, vc2=vc2_xr)