def test_demean_weighted(data):
    """Compare weighted entity and time demeaning with a dummy-variable projection.

    For each grouping the expected values are the residuals from regressing
    the root-weighted data on root-weighted group dummies.
    """
    x = PanelData(data.x)
    w = PanelData(data.w)
    missing = x.isnull | w.isnull
    # drop is called for its effect on x and w; its return value is unused
    x.drop(missing)
    w.drop(missing)

    # Index level 0 is paired with "entity", level 1 with "time"
    for level, group in enumerate(("entity", "time")):
        demeaned = x.demean(group, weights=w)
        indicators = get_dummies(Categorical(get_codes(x.index)[level])).values
        sqrt_w = np.sqrt(w.values2d)
        weighted_x = sqrt_w * x.values2d
        weighted_d = indicators * sqrt_w
        projection = weighted_d @ lstsq(weighted_d, weighted_x, rcond=None)[0]
        resid = weighted_x - projection
        # Comparing 1 + |value| rather than the raw values, as in the original
        assert_allclose(1 + np.abs(demeaned.values2d), 1 + np.abs(resid))
def test_mean_weighted(data):
    """Compare weighted entity and time means with least-squares dummy coefficients."""
    x = PanelData(data.x)
    w = PanelData(data.w)
    missing = x.isnull | w.isnull
    # drop is called for its effect on x and w; its return value is unused
    x.drop(missing)
    w.drop(missing)

    def _weighted_system(dummy_frame):
        # Scale both the dummies and the data by the square root of the weights
        sqrt_w = np.sqrt(w.values2d)
        return dummy_frame.values * sqrt_w, sqrt_w * x.values2d

    # Entity means: coefficients from lstsq on the weighted dummies
    entity_mean = x.mean("entity", weights=w)
    entity_labels = x.index.levels[0][get_codes(x.index)[0]]
    entity_dummies = get_dummies(Categorical(entity_labels, ordered=True))
    # Reorder the dummy columns to follow the order of the computed means
    wd, wx = _weighted_system(entity_dummies[entity_mean.index])
    assert_allclose(entity_mean, lstsq(wd, wx, rcond=None)[0])

    # Time means: coefficients from the pseudo-inverse of the weighted dummies
    time_mean = x.mean("time", weights=w)
    time_labels = x.index.levels[1][get_codes(x.index)[1]]
    time_dummies = get_dummies(Categorical(time_labels, ordered=True))
    wd, wx = _weighted_system(time_dummies[list(time_mean.index)])
    assert_allclose(time_mean, pinv(wd) @ wx)
def __init__(self, df: DataFrame):
    """
    Store a MultiIndex DataFrame together with a dense 3-d array of its values

    Parameters
    ----------
    df : DataFrame
        Frame with a 2-level MultiIndex. The frame is reindexed to the
        full product of the observed level-0 and level-1 labels.
    """
    self._items = df.columns
    source_index = df.index
    codes = get_codes(source_index)
    # Observed labels, in order of first appearance, for each index level
    self._major_axis = Index(source_index.levels[1][codes[1]]).unique()
    self._minor_axis = Index(source_index.levels[0][codes[0]]).unique()
    self._full_index = MultiIndex.from_product([self._minor_axis, self._major_axis])

    balanced = df.reindex(self._full_index)
    balanced.index.names = df.index.names
    self._frame = balanced

    n_items = len(self._items)
    n_major = len(self._major_axis)
    n_minor = len(self.minor_axis)
    self._shape = (n_items, n_major, n_minor)
    # Rows of the reindexed frame are ordered minor-then-major, so reshape to
    # (items, minor, major) first and then swap the last two axes
    by_item = np.asarray(balanced).copy().T
    stacked = np.reshape(by_item, (n_items, n_minor, n_major))
    self._values = np.swapaxes(stacked, 1, 2)
def dummies(self, group: str = "entity", drop_first: bool = False) -> DataFrame:
    """
    Generate entity or time dummies

    Parameters
    ----------
    group : {'entity', 'time'}, optional
        Type of dummies to generate
    drop_first : bool, optional
        Flag indicating that the dummy column corresponding to the first
        entity or time period should be dropped

    Returns
    -------
    DataFrame
        Dummy variables

    Raises
    ------
    ValueError
        If group is not 'entity' or 'time'
    """
    if group not in ("entity", "time"):
        # Name the accepted values so the caller can see what went wrong
        raise ValueError("group must be either 'entity' or 'time'")
    axis = 0 if group == "entity" else 1
    labels = get_codes(self._frame.index)
    levels = self._frame.index.levels
    cat = Categorical(levels[axis][labels[axis]])
    dummy_frame = get_dummies(cat, drop_first=drop_first)
    cols = self.entities if group == "entity" else self.time
    # Keep only labels that survived get_dummies (drop_first may remove one),
    # ordered as in self.entities / self.time
    selected = [c for c in cols if c in dummy_frame]
    return dummy_frame[selected].astype(np.float64, copy=False)
def test_absorbing_regressors(cat, cont, interact, weights):
    """Check the regressor matrix and approximate rank of AbsorbingRegressor.

    The expected matrix stacks, in order, the categorical dummies, the
    continuous columns and each interaction's sparse matrix, scaled by the
    square root of the weights when weights are given.
    """
    areg = AbsorbingRegressor(cat=cat, cont=cont, interactions=interact, weights=weights)
    rank = areg.approx_rank

    # Each categorical after the first contributes one fewer column to the rank
    expected_rank = sum(
        pd.Series(get_codes(cat[col].cat)).nunique() - (pos > 0)
        for pos, col in enumerate(cat)
    )
    expected_rank += cont.shape[1]
    blocks = [dummy_matrix(cat, precondition=False)[0], csc_matrix(cont)]

    if interact is not None:
        for inter in interact:
            inter_block = inter.sparse
            expected_rank += inter_block.shape[1]
            blocks.append(inter_block)

    expected = sp.hstack(blocks, format="csc")
    if weights is not None:
        scale = sp.diags(np.sqrt(weights))
        expected = scale.dot(expected).asformat("csc")

    actual = areg.regressors
    assert expected.shape == actual.shape
    # Identical sparsity structure, then identical values
    assert_array_equal(expected.indptr, actual.indptr)
    assert_array_equal(expected.indices, actual.indices)
    assert_allclose(expected.A, actual.A)
    assert expected_rank == rank
def category_continuous_interaction(cat: AnyPandas, cont: AnyPandas, precondition: bool = True) -> csc_matrix:
    """
    Build the sparse interaction of categorical dummies with continuous values

    Parameters
    ----------
    cat : Series
        Categorical series to convert to dummy variables
    cont : {Series, DataFrame}
        Continuous variable values to use in the dummy interaction
    precondition : bool
        Flag whether dummies should be preconditioned

    Returns
    -------
    csc_matrix
        Sparse matrix of dummy interactions. When precondition is True the
        matrix is passed through preconditioner before being returned.
    """
    codes = get_codes(category_product(cat).cat)
    rows = arange(codes.shape[0])
    # Row i holds the continuous value of observation i in the column of its category code
    interact = csc_matrix((to_numpy(cont).flat, (rows, codes)))
    if precondition:
        conditioned = preconditioner(interact)[0]
        assert isinstance(conditioned, csc_matrix)
        return conditioned
    return interact
def hash(self) -> Tuple[Tuple[str, ...], ...]:
    """
    Sorted tuple of digests identifying the regressor inputs

    One single-element tuple is produced per categorical column (from its
    codes) and per continuous column, followed by the hashes reported by
    each interaction and, when present, a digest of the weights.
    """
    digests: List[Tuple[str, ...]] = []
    hasher = hash_func()

    # Gather the per-column arrays first: categorical codes, then continuous values
    column_arrays = []
    if self._cat is not None:
        column_arrays += [to_numpy(get_codes(self._cat[name].cat)) for name in self._cat]
    if self._cont is not None:
        column_arrays += [to_numpy(self._cont[name]) for name in self._cont]

    for values in column_arrays:
        hasher.update(ascontiguousarray(values.data))
        digests.append((hasher.hexdigest(),))
        # Start from a clean state so each digest covers exactly one column
        hasher = _reset(hasher)

    if self._interactions is not None:
        for interact in self._interactions:
            digests.extend(interact.hash)

    # Add weight hash if provided
    if self._weights is not None:
        weight_hasher = hash_func()
        weight_hasher.update(ascontiguousarray(self._weights.data))
        digests.append((weight_hasher.hexdigest(),))

    return tuple(sorted(digests))
def test_fitted_effects_residuals(both_data_types):
    """Check fitted values, estimated effects and idiosyncratic errors of BetweenOLS."""
    mod = BetweenOLS(both_data_types.y, both_data_types.x)
    res = mod.fit(reweight=True, debiased=False)

    # Fitted values: exog times the estimated parameters
    fitted = pd.DataFrame(
        mod.exog.values2d @ res.params.values,
        mod.dependent.index,
        columns=["fitted_values"],
    )
    assert_allclose(fitted, res.fitted_values)
    assert_frame_similar(res.fitted_values, fitted)

    # Estimated effects: residuals looked up by each observation's level-0 label
    full_index = mod.dependent.dataframe.index
    level0_labels = full_index.levels[0][get_codes(full_index)[0]]
    expanded = res.resids.copy().reindex(level0_labels)
    expanded.index = full_index
    expected = pd.DataFrame(expanded)
    expected.columns = ["estimated_effects"]
    assert_allclose(expected, res.estimated_effects)
    assert_frame_similar(res.estimated_effects, expected)

    # Idiosyncratic part: dependent minus fitted values and estimated effects
    combined = res.fitted_values.values + res.estimated_effects.values
    expected.iloc[:, 0] = mod.dependent.values2d - combined
    expected.columns = ["idiosyncratic"]
    assert_allclose(expected, res.idiosyncratic, atol=1e-8)
    assert_frame_similar(res.idiosyncratic, expected)
def category_product(cats: AnyPandas) -> Series:
    """
    Combine several categorical columns into a single categorical series

    Parameters
    ----------
    cats : {Series, DataFrame}
        DataFrame containing categorical variables. If cats is a Series,
        cats is returned unmodified.

    Returns
    -------
    Series
        Categorical series whose codes pack the codes of every column of
        cats into one integer, each column occupying its own run of bits

    Raises
    ------
    TypeError
        If a column of cats is not categorical
    ValueError
        If the packed codes would need 63 or more bits
    """
    if isinstance(cats, Series):
        return cats

    bit_widths = []
    for name in cats:
        if not is_categorical(cats[name]):
            raise TypeError("cats must contain only categorical variables")
        largest = get_codes(cats[name].cat).max()
        # Smallest width (at least 1) with largest < 2 ** width
        width = 1
        while largest >= 2**width:
            width += 1
        bit_widths.append(width)

    nobs = cats.shape[0]
    total_bits = sum(bit_widths)
    if total_bits >= 63:
        raise ValueError(
            "There are too many cats with too many states to use this method.")

    # Candidates are ascending, so the first match is the narrowest integer type
    dtype_bits = next(v for v in (8, 16, 32, 64) if total_bits < (v - 1))
    dtype_str = "int{0:d}".format(dtype_bits)
    dtype_val = dtype(dtype_str)

    packed = zeros(nobs, dtype=dtype_val)
    offset = 0
    for name, width in zip(cats, bit_widths):
        column_codes = get_codes(cats[name].cat).astype(dtype_val)
        # Shift each column's codes past the bits used by the earlier columns
        packed += column_codes << SCALAR_DTYPES[dtype_str](offset)
        offset += width
    return Series(Categorical(packed), index=cats.index)
def entity_ids(self) -> NDArray:
    """
    Get array containing entity group membership information

    Returns
    -------
    ndarray
        Level-0 index codes of the dataframe view, with a trailing axis
        added so the result is a column
    """
    level0_codes = get_codes(self._frame.index)[0]
    return np.asarray(level0_codes)[:, None]
def time_ids(self) -> NDArray:
    """
    Get array containing time membership information

    Returns
    -------
    ndarray
        Level-1 index codes of the dataframe view, with a trailing axis
        added so the result is a column
    """
    level1_codes = get_codes(self._frame.index)[1]
    return np.asarray(level1_codes)[:, None]
def absorbed_data(request):
    """Generate panel data with one extra regressor derived from the entity dimension.

    ``request.param`` selects the data type passed to generate_data. For
    ndarray input an extra variable equal to the position along the last
    axis is appended; for DataFrame input a column ``x_absorbed`` holding
    the level-0 index codes is added.
    """
    rng = np.random.RandomState(12345)
    data = generate_data(0, request.param, ntk=(131, 4, 3), rng=rng)
    x = data.x
    if isinstance(x, np.ndarray):
        # Repeat 0..n-1 across axis 1 so the block matches x on the last two axes
        positions = np.arange(x.shape[2])
        extra = np.tile(positions, (1, x.shape[1], 1))
        data.x = np.concatenate([x, extra])
    elif isinstance(x, pd.DataFrame):
        level0_codes = get_codes(x.index)[0]
        x["x_absorbed"] = np.array(level0_codes).astype(np.double)
    return data
def category_interaction(cat: Series, precondition: bool = True) -> csc_matrix:
    """
    Convert a categorical (or product of categoricals) to a sparse dummy matrix

    Parameters
    ----------
    cat : Series
        Categorical series to convert to dummy variables
    precondition : bool
        Flag whether dummies should be preconditioned

    Returns
    -------
    dummies : csc_matrix
        First element of the result of dummy_matrix applied to the codes
    """
    combined = category_product(cat)
    # dummy_matrix is given the codes as a single column
    code_column = get_codes(combined.cat)[:, None]
    return dummy_matrix(code_column, precondition=precondition)[0]
def test_absorbing_regressors_hash(cat, cont, interact, weights):
    """Check AbsorbingRegressor.hash against digests built directly from the inputs."""
    areg = AbsorbingRegressor(cat=cat, cont=cont, interactions=interact, weights=weights)

    # Build hash: one single-element tuple per categorical and continuous column
    cat_digests = [(hasher.single(to_numpy(get_codes(cat[name].cat)).data),) for name in cat]
    cont_digests = [(hasher.single(to_numpy(cont[name]).data),) for name in cont]
    expected = sorted(cat_digests + cont_digests)

    if interact is not None:
        for inter in interact:
            expected.extend(inter.hash)
    if weights is not None:
        expected.append((hasher.single(weights.data),))

    expected = tuple(sorted(expected))
    assert expected == areg.hash
def hash(self):
    """
    Construct a hash that will be invariant for any permutation of
    inputs that produce the same fit when used as regressors

    Returns a sorted list with one tuple per continuous column: the sorted
    categorical digests followed by that column's digest.
    """
    hasher = hash_func()

    # Sorted hashes of any categoricals
    cat_digests = []
    for name in self.cat:
        codes = to_numpy(get_codes(self.cat[name].cat))
        hasher.update(ascontiguousarray(codes.data))
        cat_digests.append(hasher.hexdigest())
        # Start from a clean state so each digest covers exactly one column
        hasher = _reset(hasher)
    cat_key = tuple(sorted(cat_digests))

    combined = []
    cont = self.cont
    for name in cont:
        values = to_numpy(cont[name])
        hasher.update(ascontiguousarray(values.data))
        combined.append(cat_key + (hasher.hexdigest(),))
        hasher = _reset(hasher)

    return sorted(combined)
def time(self) -> List[Label]:
    """List of time index names"""
    frame_index = self._frame.index
    level1_codes = get_codes(frame_index)[1]
    # Map codes back to labels, then keep each label once in order of appearance
    labels = frame_index.levels[1][level1_codes]
    return list(labels.unique())
def entities(self) -> List[Label]:
    """List of entity index names"""
    frame_index = self._frame.index
    level0_codes = get_codes(frame_index)[0]
    # Map codes back to labels, then keep each label once in order of appearance
    labels = frame_index.levels[0][level0_codes]
    return list(labels.unique())