Ejemplo n.º 1
0
 def test_DCER_fit(self):
     """Fit a DCSBMEstimator without labels and check the estimated P matrix.

     Seeds NumPy's global RNG, fits a directed, loop-free estimator on
     ``self.graph`` and requires ``p_mat_`` to agree with ``self.p_mat``
     within an absolute tolerance of 0.12.
     """
     np.random.seed(8888)
     adjacency = self.graph
     expected_p = self.p_mat
     fitted = DCSBMEstimator(directed=True, loops=False)
     fitted.fit(adjacency)
     assert_allclose(expected_p, fitted.p_mat_, atol=0.12)
Ejemplo n.º 2
0
 def test_DCSBM_fit_supervised(self):
     """Fit a DCSBMEstimator with known labels and check the estimated P matrix.

     The vertex labels from ``self.labels`` are supplied as ``y``; the
     resulting ``p_mat_`` must agree with ``self.p_mat`` within an absolute
     tolerance of 0.1.
     """
     expected_p = self.p_mat
     communities = self.labels
     adjacency = self.g
     model = DCSBMEstimator(directed=True, loops=False)
     model.fit(adjacency, y=communities)
     assert_allclose(model.p_mat_, expected_p, atol=0.1)
Ejemplo n.º 3
0
    def test_DCSBM_score(self):
        """Run the shared score check and verify a sub-graph is rejected.

        ``_test_score`` is invoked with a default-constructed estimator, the
        reference ``self.p_mat`` and the graph ``self.g``. Afterwards,
        ``score_samples`` on the ``[1:100, 1:100]`` slice of the graph must
        raise ``ValueError``.
        """
        expected_p = self.p_mat
        adjacency = self.g
        model = DCSBMEstimator()
        _test_score(model, expected_p, adjacency)

        # NOTE(review): presumably rejected because the slice no longer has
        # the size the model was fitted on -- confirm against the estimator.
        with pytest.raises(ValueError):
            model.score_samples(graph=adjacency[1:100, 1:100])
Ejemplo n.º 4
0
    def test_DCSBM_fit_unsupervised(self):
        """Recover communities and P matrix from a sampled 3-block graph.

        Builds a 1500-vertex probability matrix from a 3x3 block matrix scaled
        by the outer product of Beta(4, 1) draws, samples a directed loop-free
        graph from it, fits without labels, and checks both the adjusted Rand
        score of the assignments (> 0.95) and the estimated P matrix
        (absolute tolerance 0.12).
        """
        np.random.seed(12345)
        n_verts = 1500

        # First RNG draw after seeding; keep it ahead of the edge sampling.
        degree_weights = np.random.beta(4, 1, n_verts)
        block_probs = np.array(
            [[0.7, 0.1, 0.1], [0.1, 0.9, 0.1], [0.05, 0.1, 0.75]]
        )
        block_sizes = np.array([500, 500, 500])
        labels = _n_to_labels(block_sizes)

        p_mat = _block_to_full(block_probs, labels, (n_verts, n_verts))
        p_mat = p_mat * np.outer(degree_weights, degree_weights)
        # No self-loops: zero the diagonal of the probability matrix.
        np.fill_diagonal(p_mat, 0)

        graph = sample_edges(p_mat, directed=True, loops=False)
        model = DCSBMEstimator(directed=True, loops=False)
        model.fit(graph)

        ari = adjusted_rand_score(labels, model.vertex_assignments_)
        assert ari > 0.95
        assert_allclose(p_mat, model.p_mat_, atol=0.12)
def dcsbm_pvalue(G1,
                 G2,
                 max_comm,
                 num_perm,
                 pooled_variance=True,
                 min_comm=1,
                 epsilon1=1e-3,
                 epsilon2=1e-3,
                 Z1=None,
                 Z2=None):
    """
    Estimate a p-value for the DC-SBM correlation statistic by parametric bootstrap.

    The observed statistic is ``gcorr_dcsbm(G1, G2, ...)``. A DC-SBM is then
    fitted to each graph, ``num_perm`` graphs are sampled from each fitted
    model, and the statistic is recomputed on every bootstrap pair to form the
    null distribution.

    Parameters
    ----------
    G1, G2 : graphs passed to ``gcorr_dcsbm`` and ``DCSBMEstimator.fit``.
    max_comm, min_comm : community-count bounds forwarded to both the
        statistic and the estimators. When they are equal, the same value is
        also used as ``n_components``; otherwise ``n_components`` is ``None``.
    num_perm : number of bootstrap samples drawn from each fitted model.
    pooled_variance, epsilon1, epsilon2 : forwarded unchanged to
        ``gcorr_dcsbm``.
    Z1, Z2 : optional labels passed as ``y`` when fitting the bootstrap
        models. They are not passed to ``gcorr_dcsbm`` for the observed
        statistic.

    Returns
    -------
    float
        ``(2 * m + 1) / (num_perm + 1)`` where ``m`` is the smaller of the
        number of null statistics ``>=`` the observed one and its complement.
    """
    # A fixed community count also fixes the embedding dimension; otherwise
    # the estimator is left to choose it.
    K = min_comm if min_comm == max_comm else None

    stat_kwargs = dict(min_comm=min_comm,
                       max_comm=max_comm,
                       pooled_variance=pooled_variance,
                       epsilon1=epsilon1,
                       epsilon2=epsilon2)
    obs_test_stat = gcorr_dcsbm(G1, G2, **stat_kwargs)

    # Fit both models first, then sample from both (same order as before).
    fitted_models = []
    for graph, labels in ((G1, Z1), (G2, Z2)):
        estimator = DCSBMEstimator(directed=False,
                                   min_comm=min_comm,
                                   max_comm=max_comm,
                                   n_components=K)
        fitted_models.append(estimator.fit(graph, y=labels))
    first_model, second_model = fitted_models

    first_bootstrap = first_model.sample(n_samples=num_perm)
    second_bootstrap = second_model.sample(n_samples=num_perm)

    null_test_stats = np.zeros(num_perm)
    for i in tqdm(range(num_perm)):
        null_test_stats[i] = gcorr_dcsbm(first_bootstrap[i],
                                         second_bootstrap[i],
                                         **stat_kwargs)

    num_extreme = np.count_nonzero(null_test_stats >= obs_test_stat)
    # Use whichever tail (T >= t or T < t under H0) holds fewer null draws.
    smaller_tail = min(num_extreme, num_perm - num_extreme)
    return (2 * smaller_tail + 1) / (num_perm + 1)
Ejemplo n.º 6
0
def dcsbm_corr(n,
               p,
               r,
               theta,
               epsilon1=1e-3,
               epsilon2=1e-3,
               directed=False,
               loops=False):
    '''
    Sample a pair of correlated graphs that share one estimated DC-SBM P matrix.

    A graph is first drawn with ``sbm(n, p, dc=theta)``. An undirected
    ``DCSBMEstimator`` is fitted to it using labels built from the block
    sizes, and its ``p_mat_`` is trimmed in place to
    ``[epsilon1, 1 - epsilon2]``. The trimmed matrix and a constant matrix
    filled with ``r`` are then handed to ``sample_edges_corr``.

    ``directed`` and ``loops`` are only forwarded to ``sample_edges_corr``;
    the intermediate ``sbm`` draw and the estimator do not receive them.

    Returns the two graphs produced by ``sample_edges_corr``.
    '''
    n_blocks = np.array(n).size
    n_verts = np.sum(n)
    block_labels = np.repeat(np.arange(n_blocks), n)
    corr_mat = r * np.ones((n_verts, n_verts))

    # Draw one DC-SBM graph and estimate its P matrix with the known labels.
    seed_graph = sbm(n, p, dc=theta)
    fitted = DCSBMEstimator(directed=False).fit(seed_graph, y=block_labels)
    p_mat = fitted.p_mat_

    # The estimate may fall outside the allowed range; trim it in place.
    np.clip(p_mat, epsilon1, 1 - epsilon2, out=p_mat)

    first, second = sample_edges_corr(p_mat, corr_mat, directed, loops)
    return first, second
Ejemplo n.º 7
0
    def test_DCSBM_nparams(self):
        """Check ``_n_parameters()`` for four estimator configurations.

        Uses ``self.g`` and ``self.labels`` with the expected counts written
        in terms of 3000 vertices and 4 classes.
        """
        n_verts = 3000
        n_class = 4
        graph = self.g
        labels = self.labels

        # Directed, labels estimated from the graph.
        unsupervised = DCSBMEstimator(directed=True)
        unsupervised.fit(graph)
        expected = n_verts + n_class - 1 + n_class ** 2
        assert unsupervised._n_parameters() == expected

        # Directed, labels supplied.
        supervised = DCSBMEstimator(directed=True)
        supervised.fit(graph, y=labels)
        assert supervised._n_parameters() == n_verts + n_class ** 2

        # Directed with degree_directed=True: the per-vertex term doubles.
        degree_directed = DCSBMEstimator(directed=True, degree_directed=True)
        degree_directed.fit(graph, y=labels)
        assert degree_directed._n_parameters() == 2 * n_verts + n_class ** 2

        # Undirected, labels supplied: n_class * (n_class + 1) / 2 == 10
        # (presumably the distinct entries of a symmetric block matrix).
        undirected = DCSBMEstimator(directed=False)
        undirected.fit(graph, y=labels)
        n_block_params = n_class * (n_class + 1) // 2
        assert undirected._n_parameters() == n_verts + n_block_params
Ejemplo n.º 8
0
    def test_DCSBM_sample(self):
        """Exercise ``sample`` error handling and the shared sampling check.

        ``sample`` must raise ``NotFittedError`` before fitting, ``ValueError``
        for ``n_samples=-1`` and ``TypeError`` for a string ``n_samples``.
        Finally ``p_mat_`` is overwritten with the true P matrix and
        ``_test_sample`` is run with 1000 samples and ``atol=0.1``.
        """
        np.random.seed(8888)
        model = DCSBMEstimator(directed=True, loops=False)

        n_verts = 100
        block_probs = np.array([[0.9, 0.1], [0.1, 0.9]])
        degree_weights = np.random.uniform(0.25, 0.75, size=n_verts)
        labels = _n_to_labels([50, 50])

        p_mat = _block_to_full(block_probs, labels, (n_verts, n_verts))
        p_mat = p_mat * np.outer(degree_weights, degree_weights)
        # No self-loops: zero the diagonal of the probability matrix.
        np.fill_diagonal(p_mat, 0)
        graph = sample_edges(p_mat, directed=True)

        with pytest.raises(NotFittedError):
            model.sample()

        model.fit(graph, y=labels)

        with pytest.raises(ValueError):
            model.sample(n_samples=-1)

        with pytest.raises(TypeError):
            model.sample(n_samples="nope")

        # Sample from the true P matrix rather than the fitted estimate.
        model.p_mat_ = p_mat
        _test_sample(model, p_mat, n_samples=1000, atol=0.1)
Ejemplo n.º 9
0
    def test_DCSBM_inputs(self):
        """Check that invalid constructor and ``fit`` arguments are rejected.

        Each constructor case pairs the expected exception type with the
        keyword arguments that must trigger it. The ``fit`` cases use a
        100-vertex ``er_np`` graph with a wrong-length label vector, a
        non-square slice and a 3-D array.
        """
        bad_constructor_kwargs = (
            (TypeError, {"directed": "hey"}),
            (TypeError, {"loops": 6}),
            (TypeError, {"n_components": "XD"}),
            (ValueError, {"n_components": -1}),
            (TypeError, {"min_comm": "1"}),
            (ValueError, {"min_comm": -1}),
            (TypeError, {"max_comm": "ay"}),
            (ValueError, {"max_comm": -1}),
            (ValueError, {"min_comm": 4, "max_comm": 2}),
        )
        for expected_error, kwargs in bad_constructor_kwargs:
            with pytest.raises(expected_error):
                DCSBMEstimator(**kwargs)

        graph = er_np(100, 0.5)
        wrong_length_labels = np.zeros(99)
        model = DCSBMEstimator()

        # Label vector shorter than the number of vertices.
        with pytest.raises(ValueError):
            model.fit(graph, y=wrong_length_labels)

        # Non-square adjacency matrix.
        with pytest.raises(ValueError):
            model.fit(graph[:, :99])

        # Adjacency array with an extra trailing axis.
        with pytest.raises(ValueError):
            model.fit(graph[..., np.newaxis])

        for kwargs in ({"cluster_kws": 1}, {"embed_kws": 1}):
            with pytest.raises(TypeError):
                DCSBMEstimator(**kwargs)
Ejemplo n.º 10
0
def gcorr_dcsbm(G1,
                G2,
                max_comm,
                pooled_variance=True,
                min_comm=1,
                epsilon1=1e-3,
                epsilon2=1e-3,
                Z1=None,
                Z2=None,
                return_fit=False,
                seed=None):
    """
    Compute a correlation test statistic from DC-SBM fits of two graphs.

    An undirected ``DCSBMEstimator`` is fitted to each graph. The off-diagonal
    residuals between each graph and its trimmed estimated P matrix are then
    combined into a single statistic.

    Parameters
    ----------
    G1, G2 : graphs to compare; ``G1.shape[0]`` is used as the vertex count
        in the non-pooled statistic.
    max_comm, min_comm : community-count bounds for the estimators. When they
        are equal, the same value is also used as ``n_components``; otherwise
        ``n_components`` is ``None``.
    pooled_variance : if true, normalise the summed residual products by the
        square root of the product of the summed squared residuals; otherwise
        scale each product by ``sqrt(p(1-p)q(1-q))`` and divide the sum by
        ``n * (n - 1)``.
    epsilon1, epsilon2 : the estimated probabilities are trimmed to
        ``[epsilon1, 1 - epsilon2]``.
    Z1, Z2 : optional labels passed as ``y`` to the respective ``fit`` call.
    return_fit : if true, also return the two fitted estimators.
    seed : passed to each estimator as ``cluster_kws['random_state']``.

    Returns
    -------
    T, or ``(T, {'G1': fit1, 'G2': fit2})`` when ``return_fit`` is true.
    """
    # A fixed community count also fixes the embedding dimension; otherwise
    # the estimator is left to choose it.
    K = min_comm if min_comm == max_comm else None

    fits = []
    for graph, labels in ((G1, Z1), (G2, Z2)):
        estimator = DCSBMEstimator(directed=False,
                                   min_comm=min_comm,
                                   max_comm=max_comm,
                                   n_components=K,
                                   cluster_kws={'random_state': seed})
        fits.append(estimator.fit(graph, y=labels))
    G1_dcsbm, G2_dcsbm = fits

    # Diagonal entries are forced to zero in loop-free graphs, so they are
    # left out of the correlation.
    g1 = off_diag(G1)
    g2 = off_diag(G2)
    phat = off_diag(G1_dcsbm.p_mat_)
    qhat = off_diag(G2_dcsbm.p_mat_)

    # Trim the estimated probabilities in place.
    upper = 1 - epsilon2
    np.clip(phat, epsilon1, upper, out=phat)
    np.clip(qhat, epsilon1, upper, out=qhat)

    resid1 = g1 - phat
    resid2 = g2 - qhat
    if pooled_variance:
        scale = np.sqrt(np.sum(np.square(resid1)) * np.sum(np.square(resid2)))
        T = np.sum(resid1 * resid2) / scale
    else:
        num_vertices = G1.shape[0]
        per_edge_sd = np.sqrt(phat * (1 - phat) * qhat * (1 - qhat))
        n_pairs = num_vertices * (num_vertices - 1)
        T = np.sum(resid1 * resid2 / per_edge_sd) / n_pairs

    if return_fit:
        return T, {'G1': G1_dcsbm, 'G2': G2_dcsbm}
    else:
        return T
        # NOTE(review): this is the interior of a loop/function whose header is
        # not visible here; `args`, `n`, `p`, `max_comm`, `i`, `rep`, and the
        # `test_stats_alt` / `test_stats_null` containers are presumably
        # defined by the enclosing script -- confirm before editing.

        # Simulate a pair of graphs from the model selected by args.sim,
        # passing args.rho through to the sampler.
        # NOTE(review): there is no `else` branch, so any other args.sim value
        # leaves G1/G2 unset (or stale from an earlier iteration).
        if args.sim == 'sbm':
            G1, G2 = sbm_corr(n, p, args.rho)
        elif args.sim == 'dcsbm':
            # Weights decreasing linearly from 100 to 1 over n[0] entries,
            # normalised to sum to 1, then repeated once (two copies).
            theta = np.linspace(100, 1, n[0])
            theta /= theta.sum()
            theta = np.concatenate([theta, theta])
            G1, G2 = dcsbm_corr(n, p, args.rho, theta)

        # Null by block permutation: estimate labels Z from both graphs, then
        # permute G2 using those labels.
        Z = community_estimation(G1, G2, min_components=max_comm)
        # Z = np.repeat([0, 1], n)
        G2_block_perm = block_permutation(G2, Z)

        # Null by parametric bootstrap: fit a DC-SBM to each graph separately
        # and keep the first graph returned by each model's sample().
        G1_dcsbm = DCSBMEstimator(directed=False).fit(G1)
        G2_dcsbm = DCSBMEstimator(directed=False).fit(G2)
        G1_bootstrap = G1_dcsbm.sample()[0]
        G2_bootstrap = G2_dcsbm.sample()[0]

        # Store the alternative (observed pair) and null statistics for every
        # statistic / null-construction combination at position [i, rep].
        # NOTE(review): both gcorr *_alt entries come from the same
        # gcorr(G1, G2, Z) call evaluated twice, and likewise both gcorrDC
        # *_alt entries from gcorr_dcsbm(G1, G2, max_comm).
        test_stats_alt['gcorr_block_perm'][i, rep] = gcorr(G1, G2, Z)
        test_stats_null['gcorr_block_perm'][i, rep] = gcorr(G1, G2_block_perm, Z)
        test_stats_alt['gcorr_param_bootstrap'][i, rep] = gcorr(G1, G2, Z)
        test_stats_null['gcorr_param_bootstrap'][i, rep] = gcorr(G1_bootstrap, G2_bootstrap, Z)
        test_stats_alt['gcorrDC_param_bootstrap'][i, rep] = gcorr_dcsbm(G1, G2, max_comm)
        test_stats_null['gcorrDC_param_bootstrap'][i, rep] = gcorr_dcsbm(G1_bootstrap, G2_bootstrap, max_comm)
        test_stats_alt['gcorrDC_block_perm'][i, rep] = gcorr_dcsbm(G1, G2, max_comm)
        test_stats_null['gcorrDC_block_perm'][i, rep] = gcorr_dcsbm(G1, G2_block_perm, max_comm)


# compute power