def test_dummy_precondition():
    """Preconditioned dummies must agree across array, CSC and CSR output.

    Builds two categorical columns (3 and 5 levels, 15 observations), requests
    preconditioned dummies in each output format and checks that

    * the dense and CSC outputs have columns whose squared entries sum to one,
    * the conditioning vectors returned for all three formats match,
    * the CSR request actually yields a ``csr_matrix``.
    """
    c1 = pd.Series(pd.Categorical(["a"] * 5 + ["b"] * 5 + ["c"] * 5))
    c2 = pd.Series(pd.Categorical(["A", "B", "C", "D", "E"] * 3))
    # ``axis`` must be given by keyword: pandas 2.0 made every argument of
    # ``concat`` other than ``objs`` keyword-only.
    cats = pd.concat([c1, c2], axis=1)
    out_arr, cond_arr = dummy_matrix(cats,
                                     output_format="array",
                                     drop="last",
                                     precondition=True)
    csc = dummy_matrix(cats,
                       output_format="csc",
                       drop="last",
                       precondition=True)
    out_csc: csc_matrix = csc[0]
    cond_csc: np.ndarray = csc[1]
    csr = dummy_matrix(cats,
                       output_format="csr",
                       drop="last",
                       precondition=True)
    out_csr: csr_matrix = csr[0]
    cond_csr: np.ndarray = csr[1]
    # Unit column norm: squared entries of every column sum to one.
    assert_allclose((out_arr**2).sum(0), np.ones(out_arr.shape[1]))
    assert_allclose((out_csc.multiply(out_csc)).sum(0).A1,
                    np.ones(out_arr.shape[1]))
    # The conditioning vector must not depend on the requested format.
    assert_allclose(cond_arr, cond_csc)
    assert_allclose(cond_csr, cond_csc)
    assert isinstance(out_csr, scipy.sparse.csr_matrix)
def test_absorbing_regressors(cat, cont, interact, weights):
    """Rebuild the regressor matrix by hand and compare with AbsorbingRegressor.

    The expected matrix is the horizontal stack of the categorical dummies,
    the continuous columns and each interaction's sparse matrix, with rows
    scaled by ``sqrt(weights)`` when weights are given. Shape, CSC structure
    (``indptr``/``indices``), values and the approximate rank are compared.
    """
    areg = AbsorbingRegressor(cat=cat,
                              cont=cont,
                              interactions=interact,
                              weights=weights)
    rank = areg.approx_rank

    # Each categorical contributes its number of distinct codes; every
    # categorical after the first contributes one fewer.
    expected_rank = sum(
        pd.Series(cat[name].cat.codes).nunique() - (pos > 0)
        for pos, name in enumerate(cat)
    )
    expected_rank += cont.shape[1]

    blocks = [dummy_matrix(cat, precondition=False)[0], csc_matrix(cont)]
    if interact is not None:
        sparse_terms = [inter.sparse for inter in interact]
        expected_rank += sum(term.shape[1] for term in sparse_terms)
        blocks.extend(sparse_terms)

    expected = sp.hstack(blocks, format="csc")
    if weights is not None:
        root_w = sp.diags(np.sqrt(weights))
        expected = root_w.dot(expected).asformat("csc")

    actual = areg.regressors
    assert expected.shape == actual.shape
    assert_array_equal(expected.indptr, actual.indptr)
    assert_array_equal(expected.indices, actual.indices)
    assert_allclose(expected.A, actual.A)
    assert expected_rank == rank
def test_drop_singletons_slow():
    """Check in_2core_graph_slow against repeated per-column filtering.

    The joint call on both columns must retain exactly the rows that survive
    applying the function to each column in turn until no more rows are lost.
    """
    nobs = 40000
    rng = np.random.RandomState(0)
    first = rng.randint(0, 10000, (nobs, 1))
    second = rng.randint(0, 20000, (nobs, 1))
    cats = np.hstack([first, second])
    retain = in_2core_graph_slow(cats)
    nonsingletons = cats[retain]

    # No retained row may carry a value that occurs only once in its column.
    for column in (first, second):
        values, freq = np.unique(column, return_counts=True)
        singles = values[freq == 1]
        assert not np.isin(column[retain], singles).any()

    idx = np.arange(nobs)
    names = ("c1", "c2")
    cols = {"c1": first.copy(), "c2": second.copy()}
    # Filter column by column until a full pass removes nothing.
    for _ in range(nobs):
        size_before = cols["c1"].shape[0]
        for name in names:
            keep = in_2core_graph_slow(cols[name])
            for other in names:
                cols[other] = cols[other][keep]
            idx = idx[keep]
        if cols["c1"].shape[0] == size_before:
            break

    # Surviving rows, both via the tracked index and via the filtered columns.
    expected = np.hstack([first[idx], second[idx]])
    assert_array_equal(nonsingletons, expected)
    expected = np.hstack([cols["c1"], cols["c2"]])
    assert_array_equal(nonsingletons, expected)

    dummies, _ = dummy_matrix(cats, output_format="csr", precondition=False)
    dropped = ~retain
    # Every dropped row must have exactly two dummy entries in total.
    assert dummies[dropped].sum() == 2 * dropped.sum()
def test_dummy_pandas():
    """dummy_matrix accepts a DataFrame of categoricals and returns CSC dummies.

    With ``drop="last"`` the 3-level and 5-level columns give 3 + 5 - 1
    dummy columns, whose column sums are the per-level observation counts.
    """
    c1 = pd.Series(pd.Categorical(["a"] * 5 + ["b"] * 5 + ["c"] * 5))
    c2 = pd.Series(pd.Categorical(["A", "B", "C", "D", "E"] * 3))
    # ``axis`` must be given by keyword: pandas 2.0 made every argument of
    # ``concat`` other than ``objs`` keyword-only.
    cats = pd.concat([c1, c2], axis=1)
    out, _ = dummy_matrix(cats, drop="last", precondition=False)
    # Use the public class path; the ``scipy.sparse.csc`` submodule is deprecated.
    assert isinstance(out, scipy.sparse.csc_matrix)
    assert out.shape == (15, 3 + 5 - 1)
    expected = np.array([5, 5, 5, 3, 3, 3, 3], dtype=np.int32)
    assert_array_equal(np.squeeze(np.asarray(out.sum(0), dtype=np.int32)), expected)
def test_dummy_pandas():
    """dummy_matrix accepts a DataFrame of categoricals and returns CSC dummies.

    With ``drop='last'`` the 3-level and 5-level columns give 3 + 5 - 1
    dummy columns, whose column sums are the per-level observation counts.
    """
    c1 = pd.Series(pd.Categorical(['a'] * 5 + ['b'] * 5 + ['c'] * 5))
    c2 = pd.Series(pd.Categorical(['A', 'B', 'C', 'D', 'E'] * 3))
    # ``axis`` must be given by keyword: pandas 2.0 made every argument of
    # ``concat`` other than ``objs`` keyword-only.
    cats = pd.concat([c1, c2], axis=1)
    out, _ = dummy_matrix(cats, drop='last', precondition=False)
    # Use the public class path; the ``scipy.sparse.csc`` submodule is deprecated.
    assert isinstance(out, scipy.sparse.csc_matrix)
    assert out.shape == (15, 3 + 5 - 1)
    expected = np.array([5, 5, 5, 3, 3, 3, 3], dtype=np.int32)
    assert_array_equal(np.squeeze(np.asarray(out.sum(0), dtype=np.int32)),
                       expected)
def test_dummy_last():
    """``drop="last"`` removes the final level of the later column.

    The first column has three levels with five observations each. The second
    column cycles through five levels, except that the last row is moved from
    level 4 to level 0, so level 0 has four observations. The expected column
    sums are the three first-column counts followed by levels 0-3 of the
    second column.
    """
    cats = np.zeros([15, 2], dtype=np.int8)
    cats[5:, 0] = 1
    cats[10:, 0] = 2
    cats[:, 1] = np.arange(15) % 5
    cats[-1, 1] = 0
    out, _ = dummy_matrix(cats, drop="last", precondition=False)
    # Use the public class path; the ``scipy.sparse.csc`` submodule is deprecated.
    assert isinstance(out, scipy.sparse.csc_matrix)
    assert out.shape == (15, 3 + 5 - 1)
    expected = np.array([5, 5, 5, 4, 3, 3, 3], dtype=np.int32)
    assert_array_equal(np.squeeze(np.asarray(out.sum(0), dtype=np.int32)), expected)
def test_dummy_format(dummy_format):
    """Each supported ``output_format`` code yields the matching container type.

    ``dummy_format`` is a ``(code, expected_type)`` pair. Without
    preconditioning, the column sums are the per-level counts and the
    returned conditioning vector is all ones.
    """
    fmt_code, container_type = dummy_format
    # Column 0: three levels in blocks of five; column 1: five levels cycling.
    level_blocks = np.repeat(np.arange(3), 5)
    level_cycle = np.tile(np.arange(5), 3)
    cats = np.column_stack([level_blocks, level_cycle]).astype(np.int8)

    out, cond = dummy_matrix(cats, output_format=fmt_code, precondition=False)

    assert isinstance(out, container_type)
    n_cols = 3 + 5 - 1
    assert out.shape == (15, n_cols)
    col_totals = np.squeeze(np.asarray(out.sum(0), dtype=np.int32))
    assert_array_equal(col_totals,
                       np.array([5, 5, 5, 3, 3, 3, 3], dtype=np.int32))
    assert_array_equal(cond, np.ones(out.shape[1]))
def test_dummy_precondition():
    """Preconditioned dummies must agree across array, CSC and CSR output.

    Checks that the dense and CSC outputs have columns whose squared entries
    sum to one, that the conditioning vectors match across all three formats,
    and that the CSR request yields a ``csr_matrix``.
    """
    c1 = pd.Series(pd.Categorical(['a'] * 5 + ['b'] * 5 + ['c'] * 5))
    c2 = pd.Series(pd.Categorical(['A', 'B', 'C', 'D', 'E'] * 3))
    # ``axis`` must be given by keyword: pandas 2.0 made every argument of
    # ``concat`` other than ``objs`` keyword-only.
    cats = pd.concat([c1, c2], axis=1)
    # NOTE(review): this snippet passes ``format=`` to dummy_matrix; confirm the
    # keyword name against the dummy_matrix version this test targets.
    out_arr, cond_arr = dummy_matrix(cats,
                                     format='array',
                                     drop='last',
                                     precondition=True)
    out_csc, cond_csc = dummy_matrix(cats,
                                     format='csc',
                                     drop='last',
                                     precondition=True)
    out_csr, cond_csr = dummy_matrix(cats,
                                     format='csr',
                                     drop='last',
                                     precondition=True)
    # Unit column norm: squared entries of every column sum to one.
    assert_allclose((out_arr**2).sum(0), np.ones(out_arr.shape[1]))
    assert_allclose((out_csc.multiply(out_csc)).sum(0).A1,
                    np.ones(out_arr.shape[1]))
    # The conditioning vector must not depend on the requested format.
    assert_allclose(cond_arr, cond_csc)
    assert_allclose(cond_csr, cond_csc)
    assert isinstance(out_csr, scipy.sparse.csr_matrix)
Exemple #9
0
def category_interaction(cat: Series, precondition: bool = True) -> csc_matrix:
    """
    Convert the category product of ``cat`` into a sparse dummy matrix.

    Parameters
    ----------
    cat : Series
        Categorical series to convert to dummy variables
    precondition : bool
        Flag whether dummies should be preconditioned; forwarded unchanged
        to ``dummy_matrix``

    Returns
    -------
    dummies : csc_matrix
        Sparse matrix of dummies (presumably with unit column norm only when
        ``precondition`` is True -- confirm against ``dummy_matrix``)
    """
    product = category_product(cat)
    codes = get_codes(product.cat)
    # dummy_matrix expects a 2-d input, so the codes become a single column.
    result = dummy_matrix(codes[:, None], precondition=precondition)
    return result[0]
Exemple #10
0
def category_interaction(cat: Series, precondition: bool = True) -> sp.csc_matrix:
    """
    Convert the category product of ``cat`` into a sparse dummy matrix.

    Parameters
    ----------
    cat : Series
        Categorical series to convert to dummy variables
    precondition : bool
        Flag whether dummies should be preconditioned; forwarded unchanged
        to ``dummy_matrix``

    Returns
    -------
    csc_matrix
        Sparse matrix of dummies (presumably with unit column norm only when
        ``precondition`` is True -- confirm against ``dummy_matrix``)
    """
    product_codes = category_product(cat).cat.codes
    # dummy_matrix expects a 2-d input, so the codes become a single column.
    column = asarray(product_codes)[:, None]
    dummies = dummy_matrix(column, precondition=precondition)[0]
    # Narrow the type for static checkers; the default format is expected to be CSC.
    assert isinstance(dummies, sp.csc_matrix)
    return dummies
def test_against_ols(ols_data):
    """Compare AbsorbingLS with OLS that includes the absorbed terms explicitly.

    The absorbed continuous columns, categorical dummies and interaction
    matrices are materialised as dense columns, reduced to full column rank
    if necessary, appended to ``x`` and fitted with ``_OLS``. The two result
    objects are then compared with ``assert_results_equal``.
    """
    mod = AbsorbingLS(
        ols_data.y,
        ols_data.x,
        absorb=ols_data.absorb,
        interactions=ols_data.interactions,
        weights=ols_data.weights,
    )
    res = mod.fit()

    # Collect dense versions of everything that is being absorbed.
    pieces = []
    has_dummy = False
    absorbed = ols_data.absorb
    if absorbed is not None:
        pieces.append(absorbed.cont.to_numpy())
        has_dummy = absorbed.cat.shape[1] > 0
        if has_dummy:
            dummies = dummy_matrix(absorbed.cat, precondition=False)[0]
            assert isinstance(dummies, sp.csc_matrix)
            pieces.append(dummies.A)
    if ols_data.interactions is not None:
        pieces.extend(interact.sparse.A for interact in ols_data.interactions)

    exog = ols_data.x
    if pieces:
        absorb = np.column_stack(pieces)
        # x has a zero-range (constant) column and dummies are present:
        # pass the absorbed block through annihilate against a column of
        # ones (or the root weights) first.
        if np.any(np.ptp(exog, 0) == 0) and has_dummy:
            if ols_data.weights is None:
                ones = np.ones((absorb.shape[0], 1))
                absorb = annihilate(absorb, ones)
            else:
                root_w = np.sqrt(mod.weights.ndarray)
                absorb = (1.0 / root_w) * annihilate(root_w * absorb, root_w)
        rank = np.linalg.matrix_rank(absorb)
        if rank < absorb.shape[1]:
            # Rank deficient: rotate by the eigenvectors of absorb'absorb,
            # ordered by decreasing eigenvalue, and keep the first `rank` columns.
            eigvals, eigvecs = np.linalg.eig(absorb.T @ absorb)
            descending = np.argsort(eigvals)[::-1]
            absorb = (absorb @ eigvecs[:, descending])[:, :rank]
        exog = np.column_stack([exog, absorb])

    ols_res = _OLS(ols_data.y, exog, weights=ols_data.weights).fit()

    assert_results_equal(ols_res, res)
Exemple #12
0
    def _regressors(self) -> csc_matrix:
        """Assemble the sparse regressor matrix and record its column count.

        Stacks, in order, the categorical dummies (not preconditioned), the
        continuous columns and each interaction's sparse matrix. The number
        of stacked columns is stored in ``self._approx_rank``. When weights
        are set, rows are scaled by the square root of the weights.

        Returns
        -------
        csc_matrix
            The stacked regressors, or an empty 0 x 0 matrix when there is
            nothing to stack.
        """
        blocks = []
        if self._cat is not None and self._cat.shape[1] > 0:
            blocks.append(dummy_matrix(self._cat, precondition=False)[0])
        if self._cont is not None and self._cont.shape[1] > 0:
            blocks.append(csc_matrix(to_numpy(self._cont)))
        if self._interactions is not None:
            blocks.extend(term.sparse for term in self._interactions)

        # Nothing to absorb: report rank zero and an empty matrix.
        if not blocks:
            self._approx_rank = 0
            return csc_matrix(empty((0, 0)))

        stacked = sp.hstack(blocks, format='csc')
        self._approx_rank = stacked.shape[1]
        if self._weights is None:
            return stacked
        root_w = sqrt(self._weights.squeeze())
        return sp.diags(root_w).dot(stacked).asformat('csc')
def test_invalid_format():
    """dummy_matrix raises ValueError for an unrecognised ``output_format``."""
    # Single column with two levels, five observations each.
    cats = np.repeat(np.array([0, 1], dtype=np.int8), 5).reshape(10, 1)
    with pytest.raises(ValueError):
        dummy_matrix(cats, output_format="unknown", precondition=False)