def _check_weights(self) -> None:
    """Set ``_is_weighted`` and ``_weight_data`` from ``self._weights``.

    When no weights were supplied, a vector of ones (one entry per row of
    the dependent variable) is stored. Otherwise the supplied weights are
    divided by their NaN-ignoring mean before being stored.
    """
    if self._weights is not None:
        self._is_weighted = True
        raw = IVData(self._weights).ndarray
        # Rescale so that the non-missing weights average to one
        scaled = raw / nanmean(raw)
        self._weight_data = IVData(scaled, var_name="weights", nobs=self._nobs)
        return

    nobs = self._dependent.shape[0]
    self._is_weighted = False
    self._weight_data = IVData(ones(nobs), "weights")
def test_string_cat_equiv(self):
    """A string column and its categorical version give the same frame."""
    letters = ['a', 'b', 'a', 'b', 'c', 'd', 'a', 'b']
    fruits = ['apple', 'banana', 'apple', 'banana',
              'cherry', 'date', 'apple', 'banana']
    frame = pd.DataFrame({
        'string': pd.Series(letters),
        'number': pd.Series(np.arange(8.0)),
        'other_string': pd.Series(fruits),
    })
    from_strings = IVData(frame)

    # Same data, but with the first string column stored as a categorical
    frame_cat = frame.copy()
    frame_cat['string'] = frame_cat['string'].astype('category')
    from_categorical = IVData(frame_cat)

    assert_frame_equal(from_strings.pandas, from_categorical.pandas)
def test_invalid_types(self):
    """3-d arrays raise ValueError; an object exposing only ndim raises TypeError."""
    for bad_shape in ((1, 1, 1), (10, 2, 2)):
        with pytest.raises(ValueError):
            IVData(np.empty(bad_shape))

    class a(object):
        # Reports two dimensions but is not one of the supported containers
        @property
        def ndim(self):
            return 2

    with pytest.raises(TypeError):
        IVData(a())
def test_existing_datahandler(self):
    """Wrapping an IVData in IVData gives a distinct but equivalent object."""
    dates = pd.date_range('2017-01-01', periods=10)
    frame = pd.DataFrame(np.empty((10, 2)), columns=['a', 'b'], index=dates)
    first = IVData(frame)
    second = IVData(first)

    assert first is not second
    for attr in ('cols', 'rows'):
        assert getattr(first, attr) == getattr(second, attr)
    assert_equal(first.ndarray, second.ndarray)
    assert first.ndim == second.ndim
    assert_frame_equal(first.pandas, second.pandas)
def test_string_cat_equiv() -> None:
    """String and categorical versions of a column produce equal frames."""
    columns = {
        "string": pd.Series(["a", "b", "a", "b", "c", "d", "a", "b"]),
        "number": pd.Series(np.arange(8.0)),
        "other_string": pd.Series(
            ["apple", "banana", "apple", "banana", "cherry", "date", "apple", "banana"]
        ),
    }
    raw = pd.DataFrame(columns)
    raw_handler = IVData(raw)

    # Identical data except that "string" is converted to a categorical dtype
    converted = raw.copy()
    converted["string"] = converted["string"].astype("category")
    converted_handler = IVData(converted)

    assert_frame_equal(raw_handler.pandas, converted_handler.pandas)
def test_invalid_types(self) -> None:
    """3-d arrays raise ValueError; an object exposing only ndim raises TypeError."""
    three_dim_shapes = [(1, 1, 1), (10, 2, 2)]
    for shape in three_dim_shapes:
        with pytest.raises(ValueError):
            IVData(np.empty(shape))

    class AnotherClass(object):
        # Looks two-dimensional through ndim, but is not a supported input type
        @property
        def ndim(self) -> int:
            return 2

    with pytest.raises(TypeError):
        IVData(AnotherClass())
def __init__(
    self,
    dependent: ArrayLike,
    exog: OptionalArrayLike = None,
    *,
    absorb: InteractionVar = None,
    interactions: Union[InteractionVar, Iterable[InteractionVar]] = None,
    weights: OptionalArrayLike = None,
    drop_absorbed: bool = False,
) -> None:
    """Store the inputs and run the model's validation helpers.

    Parameters
    ----------
    dependent : array-like
        Dependent variable; wrapped in ``IVData``. Its first dimension
        defines the number of observations.
    exog : array-like, optional
        Exogenous regressors; wrapped in ``IVData`` with the same ``nobs``.
    absorb : {DataFrame, Interaction}, optional
        Variables to absorb. A DataFrame is converted with
        ``Interaction.from_frame``; ``None`` produces an ``Interaction``
        built from no data; an ``Interaction`` is used as-is.
    interactions : optional
        Stored and handed to ``_prepare_interactions``.
    weights : array-like, optional
        Observation weights, processed by ``_check_weights``.
    drop_absorbed : bool
        Stored on the instance as ``_drop_absorbed``.

    Raises
    ------
    TypeError
        If ``absorb`` is not None, a DataFrame or an Interaction.
    """
    self._dependent = IVData(dependent, "dependent")
    self._nobs = nobs = self._dependent.shape[0]
    self._exog = IVData(exog, "exog", nobs=self._nobs)
    self._absorb = absorb
    if isinstance(absorb, DataFrame):
        self._absorb_inter = Interaction.from_frame(absorb)
    elif absorb is None:
        self._absorb_inter = Interaction(None, None, nobs)
    elif isinstance(absorb, Interaction):
        self._absorb_inter = absorb
    else:
        raise TypeError("absorb must be a DataFrame or an Interaction")
    self._weights = weights
    self._is_weighted = False
    self._drop_absorbed = drop_absorbed
    # The helper calls below read attributes assigned above; keep the order.
    self._check_weights()

    self._interactions = interactions
    self._interaction_list: List[Interaction] = []
    self._prepare_interactions()
    self._absorbed_dependent: Optional[DataFrame] = None
    self._absorbed_exog: Optional[DataFrame] = None

    self._check_shape()
    # Capture the index before _drop_missing runs
    self._original_index = self._dependent.pandas.index
    self._drop_locs = self._drop_missing()

    self._columns = self._exog.cols
    self._index = self._dependent.rows
    self._method = "Absorbing LS"

    self._const_col = 0
    self._has_constant = False
    self._has_constant_exog = self._check_constant()
    self._constant_absorbed = False
    self._num_params = 0
    self._regressors: Optional[sp.csc_matrix] = None
    self._regressors_hash: Optional[Tuple[Tuple[str, ...], ...]] = None
def __init__(self,
             dependent: ArrayLike,
             exog: OptionalArrayLike = None,
             *,
             absorb: InteractionVar = None,
             interactions: Union[InteractionVar, Iterable[InteractionVar]] = None,
             weights: OptionalArrayLike = None):
    """Store the inputs and run the model's validation helpers.

    Parameters
    ----------
    dependent : array-like
        Dependent variable; wrapped in ``IVData``. Its first dimension
        defines the number of observations.
    exog : array-like, optional
        Exogenous regressors; wrapped in ``IVData`` with the same ``nobs``.
    absorb : {DataFrame, Interaction}, optional
        Variables to absorb. A DataFrame is converted with
        ``Interaction.from_frame``; ``None`` produces an ``Interaction``
        built from no data; an ``Interaction`` is used as-is.
    interactions : optional
        Stored and handed to ``_prepare_interactions``.
    weights : array-like, optional
        Observation weights, processed by ``_check_weights``.

    Raises
    ------
    TypeError
        If ``absorb`` is not None, a DataFrame or an Interaction.
    """
    self._dependent = IVData(dependent, 'dependent')
    self._nobs = nobs = self._dependent.shape[0]
    self._exog = IVData(exog, 'exog', nobs=self._nobs)
    self._absorb = absorb
    if isinstance(absorb, DataFrame):
        self._absorb_inter = Interaction.from_frame(absorb)
    elif absorb is None:
        self._absorb_inter = Interaction(None, None, nobs)
    elif isinstance(absorb, Interaction):
        self._absorb_inter = absorb
    else:
        raise TypeError('absorb must be a DataFrame or an Interaction')
    self._weights = weights
    self._is_weighted = False
    # The helper calls below read attributes assigned above; keep the order.
    self._check_weights()

    self._interactions = interactions
    self._interaction_list = []  # type: List[Interaction]
    self._prepare_interactions()
    self._absorbed_dependent = None
    self._absorbed_exog = None
    self._x = None

    self._check_shape()
    # Capture the index before _drop_missing runs
    self._original_index = self._dependent.pandas.index
    self._drop_locs = self._drop_missing()

    self._columns = self._exog.cols
    self._index = self._dependent.rows
    self._method = 'Absorbing LS'

    self._const_col = 0
    self._has_constant = False
    self._has_constant_exog = self._check_constant()
    self._constant_absorbed = False
    self._num_params = 0
    self._regressors = None
    self._regressors_hash = None
def multivariate_ls(cls, dependent, exog=None, endog=None, instruments=None):
    """
    Interface for specification of multivariate IV models

    Parameters
    ----------
    dependent : array-like
        nobs by ndep array of dependent variables
    exog : array-like, optional
        nobs by nexog array of exogenous regressors common to all models
    endog : array-like, optional
        nobs by nendog array of endogenous regressors common to all models
    instruments : array-like, optional
        nobs by ninstr array of instruments to use in all equations

    Returns
    -------
    model : IV3SLS
        Model instance

    Raises
    ------
    ValueError
        If both exog and endog are None.

    Notes
    -----
    At least one of exog or endog must be provided.

    Utility function to simplify the construction of multivariate IV
    models which all use the same regressors and instruments. Constructs
    the dictionary of equations from the variables using the common
    exogenous, endogenous and instrumental variables.
    """
    equations = OrderedDict()
    dependent = IVData(dependent, var_name='dependent')
    if exog is None and endog is None:
        raise ValueError('At least one of exog or endog must be provided')
    nobs = dependent.shape[0]
    # exog is optional, so it is given nobs exactly like endog and
    # instruments. NOTE(review): this assumes IVData can only wrap None when
    # nobs is supplied, as the other IVData(None, ..., nobs=...) calls suggest.
    exog = IVData(exog, var_name='exog', nobs=nobs)
    endog = IVData(endog, var_name='endog', nobs=nobs)
    instr = IVData(instruments, var_name='instruments', nobs=nobs)
    # One equation per dependent column, all sharing the same regressors
    for col in dependent.pandas:
        equations[col] = (dependent.pandas[[col]], exog.pandas,
                          endog.pandas, instr.pandas)
    return cls(equations)
def _check_data(self):
    """Validate ``cat``/``cont`` and store them as ``IVData`` containers.

    When both inputs are None, ``nobs`` must already be set and both
    containers are built from no data. Otherwise ``nobs`` is taken from the
    larger leading dimension of the two inputs, and every column of the
    categorical data is converted to a categorical dtype if any column is
    not already categorical.

    Raises
    ------
    ValueError
        If both inputs are None and ``nobs`` is unknown, or if both inputs
        have zero columns.
    """
    # is_categorical was removed in pandas 2.0; is_categorical_dtype is the
    # check the other copies of this method use. Imported here so this block
    # does not rely on a module-level import.
    from pandas.api.types import is_categorical_dtype

    cat, cont = self._cat, self._cont
    # Inputs without a ``shape`` attribute (including None) count as 0 rows
    cat_nobs = getattr(cat, 'shape', (0, ))[0]
    cont_nobs = getattr(cont, 'shape', (0, ))[0]
    nobs = max(cat_nobs, cont_nobs)
    if cat is None and cont is None:
        if self._nobs is not None:
            self._cont_data = self._cat_data = IVData(None, 'none',
                                                      nobs=self._nobs)
        else:
            raise ValueError(
                'nobs must be provided when cat and cont are None')
        return
    self._nobs = nobs

    self._cat_data = IVData(cat, 'cat', nobs=nobs, convert_dummies=False)
    self._cont_data = IVData(cont, 'cont', nobs=nobs, convert_dummies=False)
    if self._cat_data.shape[1] == self._cont_data.shape[1] == 0:
        raise ValueError('Both cat and cont are empty arrays')
    cat_data = self._cat_data.pandas
    convert = [
        col for col in cat_data if not is_categorical_dtype(cat_data[col])
    ]
    if convert:
        # Converting every column is safe: astype('category') leaves columns
        # that are already categorical as categorical.
        cat_data = DataFrame(
            {col: cat_data[col].astype('category') for col in cat_data})
        self._cat_data = IVData(cat_data, 'cat', convert_dummies=False)
def _check_data(self) -> None:
    """Validate ``cat``/``cont`` and store them as ``IVData`` containers.

    With both inputs None, ``nobs`` must already be known. Otherwise
    ``nobs`` is the larger leading dimension of the inputs, and all columns
    of the categorical data are converted to a categorical dtype whenever at
    least one column is not categorical.
    """
    cat, cont = self._cat, self._cont
    # Inputs without a ``shape`` attribute (including None) count as 0 rows
    nobs = max(getattr(cat, "shape", (0, ))[0], getattr(cont, "shape", (0, ))[0])

    if cat is None and cont is None:
        if self._nobs is None:
            raise ValueError(
                "nobs must be provided when cat and cont are None")
        self._cont_data = self._cat_data = IVData(None, "none", nobs=self._nobs)
        return

    self._nobs = nobs
    self._cat_data = IVData(cat, "cat", nobs=nobs, convert_dummies=False)
    self._cont_data = IVData(cont, "cont", nobs=nobs, convert_dummies=False)
    widths = (self._cat_data.shape[1], self._cont_data.shape[1])
    if widths == (0, 0):
        raise ValueError("Both cat and cont are empty arrays")

    frame = self._cat_data.pandas
    all_categorical = all(is_categorical_dtype(frame[name]) for name in frame)
    if not all_categorical:
        as_category = {name: frame[name].astype("category") for name in frame}
        self._cat_data = IVData(DataFrame(as_category), "cat", convert_dummies=False)
def test_xarray_2d() -> None:
    """2-d DataArrays round-trip through IVData, with and without coords."""
    values = np.random.randn(10, 2)

    # No coordinates: integer rows and generated "x.<i>" column names
    handler = IVData(xr.DataArray(values))
    assert_equal(handler.ndarray, values)
    assert handler.rows == list(np.arange(10))
    assert handler.cols == ["x.0", "x.1"]
    expected = pd.DataFrame(values, columns=handler.cols, index=handler.rows)
    assert_frame_equal(expected, handler.pandas)

    # Named coordinates: rows and columns come from the coords
    dates = pd.date_range("2017-01-01", periods=10)
    coords = [("time", dates), ("variables", ["apple", "banana"])]
    handler = IVData(xr.DataArray(values, coords))
    assert_equal(handler.ndarray, values)
    assert_series_equal(pd.Series(handler.rows), pd.Series(list(dates)))
    assert handler.cols == ["apple", "banana"]
    expected = pd.DataFrame(values, columns=handler.cols, index=handler.rows)
    assert_frame_equal(expected, handler.pandas)
def test_xarray_1d(self):
    """1-d DataArrays become a single column named after ``var_name``."""
    values = np.random.randn(10)
    as_column = values[:, None]

    # No coordinates: integer rows
    handler = IVData(xr.DataArray(values), 'some_variable')
    assert_equal(handler.ndarray, as_column)
    assert handler.rows == list(np.arange(10))
    assert handler.cols == ['some_variable.0']
    expected = pd.DataFrame(values, columns=handler.cols, index=handler.rows)
    assert_frame_equal(expected, handler.pandas)

    # Date coordinate: rows come from the coordinate
    dates = pd.date_range('2017-01-01', periods=10)
    handler = IVData(xr.DataArray(values, [('time', dates)]), 'some_variable')
    assert_equal(handler.ndarray, as_column)
    assert_series_equal(pd.Series(handler.rows), pd.Series(list(dates)))
    assert handler.cols == ['some_variable.0']
    expected = pd.DataFrame(as_column, columns=handler.cols, index=handler.rows)
    assert_frame_equal(expected, handler.pandas)
def test_xarray_2d(self):
    """2-d DataArrays round-trip through IVData, with and without coords."""
    data = np.random.randn(10, 2)

    # Without coordinates the labels are generated
    wrapped = IVData(xr.DataArray(data))
    assert_equal(wrapped.ndarray, data)
    assert wrapped.rows == list(np.arange(10))
    assert wrapped.cols == ['x.0', 'x.1']
    reference = pd.DataFrame(data, columns=wrapped.cols, index=wrapped.rows)
    assert_frame_equal(reference, wrapped.pandas)

    # With coordinates the labels are taken from them
    dates = pd.date_range('2017-01-01', periods=10)
    labelled = xr.DataArray(
        data, [('time', dates), ('variables', ['apple', 'banana'])])
    wrapped = IVData(labelled)
    assert_equal(wrapped.ndarray, data)
    assert_series_equal(pd.Series(wrapped.rows), pd.Series(list(dates)))
    assert wrapped.cols == ['apple', 'banana']
    reference = pd.DataFrame(data, columns=wrapped.cols, index=wrapped.rows)
    assert_frame_equal(reference, wrapped.pandas)
def test_numpy_1d(self):
    """A 1-d array is exposed as a single column named 'x'."""
    values = np.empty(10)
    handler = IVData(values)
    as_column = values[:, None]

    assert handler.ndim == 2
    assert handler.cols == ['x']
    assert handler.rows == list(np.arange(10))
    assert_equal(handler.ndarray, as_column)
    expected = pd.DataFrame(as_column, columns=handler.cols, index=handler.rows)
    assert_frame_equal(handler.pandas, expected)
    assert handler.shape == (10, 1)
def test_categorical_no_conversion(self):
    """A categorical Series stays one column when convert_dummies is False."""
    index = pd.date_range('2017-01-01', periods=10)
    cat = pd.Categorical(['a', 'b', 'a', 'b', 'a', 'a', 'b', 'c', 'c', 'a'])
    # Pass the Categorical itself as the data. Wrapping it in a dict keyed by
    # 'cat' makes pandas align the dict keys against the date index, so the
    # resulting Series would not contain the categorical values at all.
    s = pd.Series(cat, index=index, name='cat')
    dh = IVData(s, convert_dummies=False)
    assert dh.ndim == 2
    assert dh.shape == (10, 1)
    assert dh.cols == ['cat']
    assert dh.rows == list(index)

    df = pd.DataFrame(s)
    assert_frame_equal(dh.pandas, df)
def test_categorical_series(self):
    """A categorical Series with 3 levels expands to 2 dummy columns."""
    index = pd.date_range('2017-01-01', periods=10)
    cat = pd.Categorical(['a', 'b', 'a', 'b', 'a', 'a', 'b', 'c', 'c', 'a'])
    s = pd.Series(cat, name='cat', index=index)
    dh = IVData(s)
    assert dh.ndim == 2
    assert dh.shape == (10, 2)
    assert sorted(dh.cols) == sorted(['cat.b', 'cat.c'])
    assert dh.rows == list(index)
    # The np.float alias was removed in NumPy 1.24; the builtin float is
    # the same dtype.
    assert_equal(dh.pandas['cat.b'].values, (cat == 'b').astype(float))
    assert_equal(dh.pandas['cat.c'].values, (cat == 'c').astype(float))
def test_numpy_2d(self):
    """A 2-d array keeps its shape and gets generated 'x.<i>' column names."""
    values = np.empty((10, 2))
    handler = IVData(values)

    assert handler.ndim == values.ndim
    assert handler.cols == ['x.0', 'x.1']
    assert handler.rows == list(np.arange(10))
    assert_equal(handler.ndarray, values)
    expected = pd.DataFrame(values, columns=handler.cols, index=handler.rows)
    assert_frame_equal(handler.pandas, expected)
    assert handler.shape == (10, 2)
    # labels maps each axis number to that axis' labels
    expected_labels = {0: handler.rows, 1: handler.cols}
    assert handler.labels == expected_labels
def test_categorical_series() -> None:
    """A categorical Series with 3 levels expands to 2 dummy columns."""
    dates = pd.date_range("2017-01-01", periods=10)
    levels = pd.Categorical(["a", "b", "a", "b", "a", "a", "b", "c", "c", "a"])
    handler = IVData(pd.Series(levels, name="cat", index=dates))

    assert handler.ndim == 2
    assert handler.shape == (10, 2)
    assert sorted(handler.cols) == sorted(["cat.b", "cat.c"])
    assert handler.rows == list(dates)
    # Each dummy column is the 0/1 indicator of its level
    for level in ("b", "c"):
        indicator = (levels == level).astype(float)
        assert_equal(handler.pandas["cat." + level].values, indicator)
def test_categorical_no_conversion() -> None:
    """A categorical Series stays one column when convert_dummies is False."""
    dates = pd.date_range("2017-01-01", periods=10)
    levels = pd.Categorical(["a", "b", "a", "b", "a", "a", "b", "c", "c", "a"])
    series = pd.Series(levels, index=dates, name="cat")
    handler = IVData(series, convert_dummies=False)

    assert handler.ndim == 2
    assert handler.shape == (10, 1)
    assert handler.cols == ["cat"]
    assert handler.rows == list(dates)
    # The stored frame is just the Series promoted to a DataFrame
    assert_frame_equal(handler.pandas, pd.DataFrame(series))
def multivariate_ls(cls, dependent, exog):
    """
    Build a multivariate regression where all equations share regressors

    Parameters
    ----------
    dependent : array-like
        nobs by ndep array of dependent variables
    exog : array-like
        nobs by nvar array of exogenous regressors common to all models

    Returns
    -------
    model : SUR
        Model instance

    Notes
    -----
    Convenience constructor for multivariate regressions in which every
    equation uses the same regressors. One equation is created for each
    column of the dependent variables, paired with the common exogenous
    variables.

    Examples
    --------
    A simple CAP-M can be estimated as a multivariate regression

    >>> from linearmodels.datasets import french
    >>> from linearmodels.system import SUR
    >>> data = french.load()
    >>> portfolios = data[['S1V1','S1V5','S5V1','S5V5']]
    >>> factors = data[['MktRF']].copy()
    >>> factors['alpha'] = 1
    >>> mod = SUR.multivariate_ls(portfolios, factors)
    """
    lhs = IVData(dependent, var_name='dependent')
    rhs = IVData(exog, var_name='exog')
    # Keyed by dependent column name, in column order
    equations = OrderedDict(
        (name, (lhs.pandas[[name]], rhs.pandas)) for name in lhs.pandas
    )
    return cls(equations)
def test_pandas_series_numeric(self):
    """A numeric Series becomes one column that keeps its name and index."""
    values = np.empty(10)
    dates = pd.date_range('2017-01-01', periods=10)
    series = pd.Series(values, name='charlie', index=dates)
    handler = IVData(series)
    as_column = values[:, None]

    assert handler.ndim == 2
    assert handler.cols == [series.name]
    assert handler.rows == list(series.index)
    assert_equal(handler.ndarray, as_column)
    expected = pd.DataFrame(as_column, columns=handler.cols, index=handler.rows)
    assert_frame_equal(handler.pandas, expected)
    assert handler.shape == (10, 1)
def test_pandas_df_numeric(self):
    """A numeric DataFrame keeps its columns, index and values."""
    values = np.empty((10, 2))
    dates = pd.date_range('2017-01-01', periods=10)
    frame = pd.DataFrame(values, columns=['a', 'b'], index=dates)
    handler = IVData(frame)

    assert handler.ndim == 2
    assert handler.cols == list(frame.columns)
    assert handler.rows == list(frame.index)
    assert_equal(handler.ndarray, values)
    expected = pd.DataFrame(values, columns=handler.cols, index=handler.rows)
    assert_frame_equal(handler.pandas, expected)
    assert handler.shape == (10, 2)
def test_pandas_df_numeric() -> None:
    """A numeric DataFrame keeps its columns, index and values."""
    values = np.empty((10, 2))
    dates = pd.date_range("2017-01-01", periods=10)
    frame = pd.DataFrame(values, columns=["a", "b"], index=dates)
    handler = IVData(frame)

    assert handler.ndim == 2
    assert handler.cols == list(frame.columns)
    assert handler.rows == list(frame.index)
    assert_equal(handler.ndarray, values)
    # Rebuilding from the row list loses the daily frequency; restore it so
    # the index matches the handler's frame
    rebuilt = pd.DataFrame(values, columns=handler.cols, index=handler.rows)
    expected = rebuilt.asfreq("D")
    assert_frame_equal(handler.pandas, expected)
    assert handler.shape == (10, 2)
def test_categorical(self):
    """A frame with one categorical and one numeric column gets 2 dummies + 1."""
    index = pd.date_range('2017-01-01', periods=10)
    cat = pd.Categorical(['a', 'b', 'a', 'b', 'a', 'a', 'b', 'c', 'c', 'a'])
    num = np.empty(10)
    df = pd.DataFrame(OrderedDict(cat=cat, num=num), index=index)
    dh = IVData(df)
    assert dh.ndim == 2
    assert dh.shape == (10, 3)
    assert sorted(dh.cols) == sorted(['cat.b', 'cat.c', 'num'])
    assert dh.rows == list(index)
    assert_equal(dh.pandas['num'].values, num)
    # The np.float alias was removed in NumPy 1.24; the builtin float is
    # the same dtype.
    assert_equal(dh.pandas['cat.b'].values, (cat == 'b').astype(float))
    assert_equal(dh.pandas['cat.c'].values, (cat == 'c').astype(float))
def test_categorical() -> None:
    """A frame with one categorical and one numeric column gets 2 dummies + 1."""
    dates = pd.date_range("2017-01-01", periods=10)
    levels = pd.Categorical(["a", "b", "a", "b", "a", "a", "b", "c", "c", "a"])
    numbers = np.empty(10)
    frame = pd.DataFrame(dict(cat=levels, num=numbers), index=dates)
    handler = IVData(frame)

    assert handler.ndim == 2
    assert handler.shape == (10, 3)
    assert sorted(handler.cols) == sorted(["cat.b", "cat.c", "num"])
    assert handler.rows == list(dates)
    assert_equal(handler.pandas["num"].values, numbers)
    # Each dummy column is the 0/1 indicator of its level
    for level in ("b", "c"):
        indicator = (levels == level).astype(float)
        assert_equal(handler.pandas["cat." + level].values, indicator)
def test_drop_missing(data):
    """Fitting with NaN rows matches fitting after dropping them by hand."""
    portfolios = data.portfolios
    # DataFrames need positional .iloc for the row slice; arrays slice directly
    target = portfolios.iloc if isinstance(portfolios, pd.DataFrame) else portfolios
    target[::33] = np.nan

    automatic = TradedFactorModel(portfolios, data.factors).fit()

    port_data = IVData(portfolios)
    factor_data = IVData(data.factors)
    missing = port_data.isnull | factor_data.isnull
    for container in (port_data, factor_data):
        container.drop(missing)
    manual = TradedFactorModel(port_data, factor_data).fit()

    assert_equal(np.asarray(automatic.params), np.asarray(manual.params))
def test_fitted_predict(data, model):
    """Fitted values equal y minus residuals; predict returns expected shapes."""
    fitted_model = model(data.dep, None, data.endog, data.instr)
    res = fitted_model.fit()
    assert_series_equal(res.idiosyncratic, res.resids)

    y = fitted_model.dependent.pandas
    fitted = y.values - res.resids.values[:, None]
    expected = DataFrame(fitted, index=y.index, columns=['fitted_values'])
    assert_frame_similar(expected, res.fitted_values)
    assert_allclose(expected, res.fitted_values)

    # Default prediction: one column, one row per residual
    pred = res.predict()
    assert isinstance(pred, DataFrame)
    assert pred.shape == (res.resids.shape[0], 1)

    # With missing=True the row count follows the original dependent data
    pred = res.predict(idiosyncratic=True, missing=True)
    full_nobs = IVData(data.dep).pandas.shape[0]
    assert pred.shape == (full_nobs, 2)
    assert list(pred.columns) == ['fitted_values', 'residual']
def instruments(self) -> IVData:
    """Return an IVData built from no data, sized to the dependent variable."""
    nobs = self._dependent.shape[0]
    return IVData(None, "instrument", nobs=nobs)
class Interaction(object):
    """
    Class that simplifies specifying interactions

    Parameters
    ----------
    cat : {ndarray, Series, DataFrame, DataArray}, optional
        Variables to treat as categoricals. Best format is a Categorical
        Series or DataFrame containing Categorical Series. Other formats
        are converted to Categorical Series, column-by-column. cats has
        shape (nobs, ncat).
    cont : {ndarray, Series, DataFrame, DataArray}, optional
        Variables to treat as continuous, (nobs, ncont).
    nobs : int, optional
        Number of observations. Required when both `cat` and `cont` are
        None.

    Notes
    -----
    For each variable in `cont`, computes the interaction of the variable
    and the cartesian product of the categories.

    Examples
    --------
    >>> import numpy as np
    >>> from linearmodels.iv.absorbing import Interaction
    >>> rs = np.random.RandomState(0)
    >>> n = 100000
    >>> cats = rs.randint(2, size=n)  # binary dummy
    >>> cont = rs.standard_normal((n, 3))
    >>> interact = Interaction(cats, cont)
    >>> interact.sparse.shape  # Get the shape of the dummy matrix
    (100000, 6)

    >>> rs = np.random.RandomState(0)
    >>> import pandas as pd
    >>> cats_df = pd.concat([pd.Series(pd.Categorical(rs.randint(5,size=n)))
    ...                     for _ in range(4)], axis=1)
    >>> cats_df.describe()
                 0       1       2       3
    count   100000  100000  100000  100000
    unique       5       5       5       5
    top          3       3       0       4
    freq     20251   20195   20331   20158

    >>> interact = Interaction(cats_df, cont)
    >>> interact.sparse.shape  # Cart product of all cats, 5**4, times ncont, 3
    (100000, 1875)
    """

    # Placeholder container assigned in __init__ until _check_data replaces it
    _iv_data = IVData(None, "none", 1)

    def __init__(
        self,
        cat: OptionalArrayLike = None,
        cont: OptionalArrayLike = None,
        nobs: Optional[int] = None,
    ) -> None:
        self._cat = cat
        self._cont = cont
        self._cat_data = self._iv_data
        self._cont_data = self._iv_data
        self._nobs = nobs
        self._check_data()

    @property
    def nobs(self) -> int:
        """Number of observations"""
        assert self._nobs is not None
        return self._nobs

    def _check_data(self) -> None:
        """Validate ``cat``/``cont`` and store them as ``IVData`` containers.

        Raises
        ------
        ValueError
            If both inputs are None and ``nobs`` is unknown, or if both
            inputs have zero columns.
        """
        cat, cont = self._cat, self._cont
        # Inputs without a ``shape`` attribute (including None) count as 0 rows
        cat_nobs = getattr(cat, "shape", (0, ))[0]
        cont_nobs = getattr(cont, "shape", (0, ))[0]
        nobs = max(cat_nobs, cont_nobs)
        if cat is None and cont is None:
            if self._nobs is not None:
                self._cont_data = self._cat_data = IVData(None, "none",
                                                          nobs=self._nobs)
            else:
                raise ValueError(
                    "nobs must be provided when cat and cont are None")
            return
        self._nobs = nobs

        self._cat_data = IVData(cat, "cat", nobs=nobs, convert_dummies=False)
        self._cont_data = IVData(cont, "cont", nobs=nobs, convert_dummies=False)
        if self._cat_data.shape[1] == self._cont_data.shape[1] == 0:
            raise ValueError("Both cat and cont are empty arrays")
        cat_data = self._cat_data.pandas
        convert = [
            col for col in cat_data if not (is_categorical_dtype(cat_data[col]))
        ]
        if convert:
            # Converting every column is safe: astype("category") leaves
            # columns that are already categorical as categorical.
            cat_data = DataFrame(
                {col: cat_data[col].astype("category") for col in cat_data})
            self._cat_data = IVData(cat_data, "cat", convert_dummies=False)

    @property
    def cat(self) -> DataFrame:
        """Categorical Variables"""
        return self._cat_data.pandas

    @property
    def cont(self) -> DataFrame:
        """Continuous Variables"""
        return self._cont_data.pandas

    @property
    def isnull(self) -> Series:
        """Rows with a null in any categorical or continuous column"""
        # axis must be passed by keyword: pandas 2.x rejects a positional axis
        return self.cat.isnull().any(axis=1) | self.cont.isnull().any(axis=1)

    def drop(self, locs: BoolArray) -> None:
        """Forward ``locs`` to ``drop`` on both the cat and cont containers"""
        self._cat_data.drop(locs)
        self._cont_data.drop(locs)

    @property
    def sparse(self) -> sp.csc_matrix:
        r"""
        Construct a sparse interaction matrix

        Returns
        -------
        csc_matrix
            Dummy interaction constructed from the cartesian product of
            the categories and each of the continuous variables.

        Notes
        -----
        The number of columns in `dummy_interact` is

        .. math::

            ncont \times \prod_{i=1}^{ncat} |c_i|

        where :math:`|c_i|` is the number distinct categories in column i.
        """
        if self.cat.shape[1] and self.cont.shape[1]:
            # One block of columns per continuous variable
            out = []
            for col in self.cont:
                out.append(
                    category_continuous_interaction(self.cat,
                                                    self.cont[col],
                                                    precondition=False))
            return sp.hstack(out, format="csc")
        elif self.cat.shape[1]:
            return category_interaction(category_product(self.cat),
                                        precondition=False)
        elif self.cont.shape[1]:
            return sp.csc_matrix(self._cont_data.ndarray)
        else:  # empty interaction
            return sp.csc_matrix(empty((self._cat_data.shape[0], 0)))

    @property
    def hash(self) -> List[Tuple[str, ...]]:
        """
        Construct a hash that will be invariant for any permutation of
        inputs that produce the same fit when used as regressors"""
        # Sorted hashes of any categoricals
        hasher = hash_func()
        cat_hashes = []
        cat = self.cat
        for col in cat:
            hasher.update(
                ascontiguousarray(self.cat[col].cat.codes.to_numpy().data))
            cat_hashes.append(hasher.hexdigest())
            hasher = _reset(hasher)
        sorted_hashes = tuple(sorted(cat_hashes))

        # One entry per continuous column: the categorical hashes followed
        # by the hash of that column
        hashes = []
        cont = self.cont
        for col in cont:
            hasher.update(ascontiguousarray(cont[col].to_numpy()).data)
            hashes.append(sorted_hashes + (hasher.hexdigest(), ))
            hasher = _reset(hasher)

        return sorted(hashes)

    @staticmethod
    def from_frame(frame: DataFrame) -> "Interaction":
        """
        Convenience function that simplifies using a DataFrame

        Parameters
        ----------
        frame : DataFrame
            Frame containing categorical and continuous variables. All
            categorical variables are passed to `cat` and all other
            variables are passed as `cont`.

        Returns
        -------
        Interaction
            Instance using the columns of frame

        Examples
        --------
        >>> import numpy as np
        >>> from linearmodels.iv.absorbing import Interaction
        >>> import pandas as pd
        >>> rs = np.random.RandomState(0)
        >>> n = 100000
        >>> cats = pd.concat([pd.Series(pd.Categorical(rs.randint(i+2,size=n)))
        ...                  for i in range(4)], axis=1)
        >>> cats.columns = ['cat{0}'.format(i) for i in range(4)]
        >>> columns = ['cont{0}'.format(i) for i in range(6)]
        >>> cont = pd.DataFrame(rs.standard_normal((n, 6)), columns=columns)
        >>> frame = pd.concat([cats, cont], axis=1)
        >>> interact = Interaction.from_frame(frame)
        >>> interact.sparse.shape  # Cart product of all cats, 5!, times ncont, 6
        (100000, 720)
        """
        cat_cols = [col for col in frame if is_categorical_dtype(frame[col])]
        cont_cols = [col for col in frame if col not in cat_cols]
        return Interaction(frame[cat_cols],
                           frame[cont_cols],
                           nobs=frame.shape[0])