Example 1
def test_isin():
    a = np.arange(5)
    b = np.arange(3)
    expected = np.array([1, 1, 1, 0, 0], dtype=bool)
    assert_array_equal(isin(a, b), expected)

    a = np.column_stack([a] * 3)
    expected = np.column_stack([expected] * 3)
    assert_array_equal(isin(a, b), expected)
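
A minimal standalone sketch, assuming this isin behaves like np.isin: the membership test preserves the shape of its first argument, which is why column-stacking the expected array alongside a is enough for the 2-D case.

import numpy as np

a = np.arange(5)
b = np.arange(3)
# np.isin returns a boolean array with the same shape as its first argument
print(np.isin(a, b))                # [ True  True  True False False]
a2d = np.column_stack([a] * 3)      # shape (5, 3)
print(np.isin(a2d, b).shape)        # (5, 3)
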
Example 2
def test_drop_singletons_slow():
    rs = np.random.RandomState(0)
    c1 = rs.randint(0, 10000, (40000, 1))
    c2 = rs.randint(0, 20000, (40000, 1))
    cats = np.concatenate([c1, c2], 1)
    retain = in_2core_graph_slow(cats)
    nonsingletons = cats[retain]
    for col in (c1, c2):
        uniq, counts = np.unique(col, return_counts=True)
        assert not np.any(isin(col[retain], uniq[counts == 1]))

    idx = np.arange(40000)

    cols = {'c1': c1.copy(), 'c2': c2.copy()}
    for i in range(40000):
        last = cols['c1'].shape[0]
        for col in cols:
            keep = in_2core_graph_slow(cols[col])
            for col2 in cols:
                cols[col2] = cols[col2][keep]
            idx = idx[keep]
        if cols['c1'].shape[0] == last:
            break

    expected = np.concatenate([c1[idx], c2[idx]], 1)
    assert_array_equal(nonsingletons, expected)
    expected = np.concatenate([cols['c1'], cols['c2']], 1)
    assert_array_equal(nonsingletons, expected)

    dummies, _ = dummy_matrix(cats, format='csr', precondition=False)
    to_drop = dummies[~retain]
    assert to_drop.sum() == 2 * (~retain).sum()
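
The final assertion relies on each row of the dummy matrix containing exactly one nonzero entry per categorical column, so every dropped observation contributes 2 ones. A small sketch of that row structure, built directly with scipy.sparse on a hypothetical two-column code array rather than with dummy_matrix:

import numpy as np
from scipy import sparse

codes = np.array([[0, 1],
                  [1, 0],
                  [0, 0]])
nobs = codes.shape[0]
blocks = []
for col in range(codes.shape[1]):
    ncat = codes[:, col].max() + 1
    # One-hot encode this column: exactly one 1 per row
    blocks.append(sparse.csr_matrix(
        (np.ones(nobs), (np.arange(nobs), codes[:, col])),
        shape=(nobs, ncat)))
dummies = sparse.hstack(blocks, format='csr')
print(np.asarray(dummies.sum(1)).ravel())   # [2. 2. 2.]
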
Example 3
def test_drop_singletons_single():
    rs = np.random.RandomState(0)
    cats = rs.randint(0, 10000, (40000, 1))
    retain = in_2core_graph_slow(cats)
    nonsingletons = cats[retain]
    cats = pd.Series(cats[:, 0])
    vc = cats.value_counts()
    expected = np.sort(np.asarray(vc.index[vc > 1]))
    assert_array_equal(np.unique(nonsingletons), expected)
    assert vc[vc > 1].sum() == nonsingletons.shape[0]
    singletons = np.asarray(vc.index[vc == 1])
    assert nonsingletons.shape[0] == (40000 - singletons.shape[0])
    assert not np.any(isin(nonsingletons, singletons))
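
The expected values above come from pandas.Series.value_counts, which agrees (up to ordering) with np.unique(..., return_counts=True), the primitive used inside in_2core_graph; a minimal sketch of that equivalence:

import numpy as np
import pandas as pd

x = np.array([3, 1, 3, 2, 1, 3])
u, counts = np.unique(x, return_counts=True)
vc = pd.Series(x).value_counts()
# Both identify the values that occur more than once
assert set(u[counts > 1]) == set(vc.index[vc > 1])   # {1, 3}
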
Example 4
def in_2core_graph_slow(cats):
    """
    Parameters
    ----------
    cats : {DataFrame, ndarray}
        Array containing the category codes of pandas categoricals,
        shape (nobs, ncats)

    Returns
    -------
    retain : ndarray
        Boolean array that marks non-singleton entries as True

    Notes
    -----
    This is a reference implementation that can be very slow when removing
    all singleton nodes from some graphs.
    """
    if isinstance(cats, pd.DataFrame):
        cats = np.column_stack([np.asarray(cats[c].cat.codes) for c in cats])
    if cats.shape[1] == 1:
        return in_2core_graph(cats)
    nobs, ncats = cats.shape
    retain_idx = np.arange(cats.shape[0])
    num_singleton = 1
    while num_singleton > 0 and cats.shape[0] > 0:
        singleton = np.zeros(cats.shape[0], dtype=bool)
        for i in range(ncats):
            ucats, counts = np.unique(cats[:, i], return_counts=True)
            singleton |= isin(cats[:, i], ucats[counts == 1])
        num_singleton = singleton.sum()
        if num_singleton:
            cats = cats[~singleton]
            retain_idx = retain_idx[~singleton]
    retain = np.zeros(nobs, dtype=bool)
    retain[retain_idx] = True
    return retain
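
A usage sketch for in_2core_graph_slow on a tiny, hypothetical two-column code array, illustrating the cascading removal performed by the loop above:

import numpy as np

cats = np.array([[0, 0],
                 [0, 0],
                 [1, 0],
                 [1, 1]])
retain = in_2core_graph_slow(cats)
# Row 3 is a singleton in column 1; dropping it leaves row 2 a singleton
# in column 0, so both rows are removed.
print(retain)   # [ True  True False False]
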
Example 5
def in_2core_graph(cats):
    """
    Parameters
    ----------
    cats : {DataFrame, ndarray}
        Array containing the category codes of pandas categoricals,
        shape (nobs, ncats)

    Returns
    -------
    retain : ndarray
        Boolean array that marks non-singleton entries as True
    """
    if isinstance(cats, pd.DataFrame):
        cats = np.column_stack([np.asarray(cats[c].cat.codes) for c in cats])
    if cats.shape[1] == 1:
        # Fast, simple path
        ucats, counts = np.unique(cats, return_counts=True)
        retain = ucats[counts > 1]
        return isin(cats, retain).ravel()

    nobs, ncats = cats.shape
    zero_cats = []
    # Switch to 0-based indexing
    for col in range(ncats):
        u, inv = np.unique(cats[:, col], return_inverse=True)
        zero_cats.append(np.arange(u.shape[0])[inv])
    zero_cats = np.column_stack(zero_cats)
    # Two tables:
    # a. origin_id, dest_id
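    # Shift codes so that category ids from different columns never collide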
    max_cat = zero_cats.max(0)
    shift = np.r_[0, max_cat[:-1] + 1]
    zero_cats += shift
    orig_dest = []
    for i in range(ncats):
        col_order = list(range(ncats))
        col_order.remove(i)
        col_order = [i] + col_order
        temp = zero_cats[:, col_order]
        idx = np.argsort(temp[:, 0])
        orig_dest.append(temp[idx])
        if i == 0:
            inverter = np.empty_like(zero_cats[:, 0])
            inverter[idx] = np.arange(nobs)
    orig_dest = np.concatenate(orig_dest, 0)
    # b. node_id, count, offset
    node_id, count = np.unique(orig_dest[:, 0], return_counts=True)
    offset = np.r_[0, np.where(np.diff(orig_dest[:, 0]) != 0)[0] + 1]

    def min_dtype(*args):
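        # Smallest signed integer dtype wide enough for the largest value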
        bits = max([np.log2(max(arg.max(), 1)) for arg in args])
        return 'int{0}'.format(
            min([i for i in (8, 16, 32, 64) if bits < (i - 1)]))

    dtype = min_dtype(offset, node_id, count, orig_dest)
    meta = np.column_stack(
        [node_id.astype(dtype),
         count.astype(dtype),
         offset.astype(dtype)])
    orig_dest = orig_dest.astype(dtype)

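    # Iteratively prune edges attached to nodes with a count of 1 until no
    # singleton nodes remain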
    singletons = np.any(meta[:, 1] == 1)
    while singletons:
        _drop_singletons(meta, orig_dest)
        singletons = np.any(meta[:, 1] == 1)

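    # The first nobs rows of orig_dest are the i == 0 block, sorted by the
    # column-0 node id; inverter undoes that sort so retain is in the
    # original observation order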
    sorted_cats = orig_dest[:nobs]
    unsorted_cats = sorted_cats[inverter]
    retain = unsorted_cats[:, 1] > 0

    return retain
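
A sketch, mirroring test_drop_singletons_slow above and assuming _drop_singletons (not shown here) is available alongside in_2core_graph, that checks the fast implementation against the reference one on random category codes:

import numpy as np
from numpy.testing import assert_array_equal

rs = np.random.RandomState(0)
cats = np.column_stack([rs.randint(0, 100, 1000),
                        rs.randint(0, 200, 1000)])
assert_array_equal(in_2core_graph(cats), in_2core_graph_slow(cats))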