def sample_edges_corr_diffmarg(P, Q, R, directed=False, loops=False):
    """
    Generate a pair of correlated graphs with Bernoulli distribution.
    Both G1 and G2 are binary matrices. Allows for different marginal
    distributions.

    Parameters
    ----------
    P: np.ndarray, shape (n_vertices, n_vertices)
        Matrix of probabilities (between 0 and 1) for the first random graph.

    Q: np.ndarray, shape (n_vertices, n_vertices)
        Matrix of probabilities (between 0 and 1) for the second random graph.

    R: np.ndarray, shape (n_vertices, n_vertices)
        Matrix of correlation (between 0 and 1) between graph pairs.

    directed: boolean, optional (default=False)
        If False, output adjacency matrix will be symmetric. Otherwise, output
        adjacency matrix will be asymmetric.

    loops: boolean, optional (default=False)
        If False, no edges will be sampled in the diagonal. Otherwise, edges
        are sampled in the diagonal.

    Returns
    -------
    G1: ndarray (n_vertices, n_vertices)
        Adjacency matrix the same size as P representing a random graph.

    G2: ndarray (n_vertices, n_vertices)
        Adjacency matrix the same size as P representing a random graph.

    Raises
    ------
    TypeError
        If P, Q or R is not exactly a ``numpy.ndarray`` (subclasses are
        rejected).
    ValueError
        If P is not a square 2-dimensional matrix, or if Q or R is not
        2-dimensional with the same shape as P.
    """
    # test input
    # check P
    if type(P) is not np.ndarray:
        raise TypeError("P must be numpy.ndarray")
    if len(P.shape) != 2:
        raise ValueError("P must have dimension 2 (n_vertices, n_vertices)")
    if P.shape[0] != P.shape[1]:
        raise ValueError("P must be a square matrix")

    # check Q, then R, against P (same checks, same order as for P)
    for name, mat in (("Q", Q), ("R", R)):
        if type(mat) is not np.ndarray:
            raise TypeError(f"{name} must be numpy.ndarray")
        if len(mat.shape) != 2:
            raise ValueError(
                f"{name} must have dimension 2 (n_vertices, n_vertices)"
            )
        if mat.shape != P.shape:
            raise ValueError(f"{name} must have the same shape as P")

    # check directed and loops
    check_dirloop(directed, loops)

    G1 = sample_edges(P, directed=directed, loops=loops)
    has_edge = G1 == 1

    # np.where receives both branch arrays fully evaluated, so an entry with
    # P == 0 (or P == 1) would divide by zero in the branch that is NOT chosen
    # for it and emit a spurious RuntimeWarning. Entries a branch does not
    # apply to get a neutral denominator of 1; the entries that are actually
    # selected use exactly the same denominators as before.
    denom_edge = np.where(has_edge, P, 1.0)
    denom_no_edge = np.where(has_edge, 1.0, 1 - P)
    P2 = np.where(
        has_edge,
        # conditional probability of an edge in G2 given an edge in G1
        Q + R * np.sqrt((1 - P) * Q * (1 - Q) / denom_edge),
        # conditional probability of an edge in G2 given no edge in G1
        Q - R * np.sqrt(P * Q * (1 - Q) / denom_no_edge),
    )
    G2 = sample_edges(P2, directed=directed, loops=loops)
    return G1, G2
def sample_null_distribution(p_mat, tstat_func, n_samples=1000, parallel=True):
    """
    Build a sorted null distribution of a test statistic.

    Repeatedly samples a directed, loop-free graph from ``p_mat`` with
    ``sample_edges``, evaluates ``tstat_func`` on it and collects the results.

    Parameters
    ----------
    p_mat: array-like
        Edge probabilities; converted with ``np.array`` before every draw.
    tstat_func: callable
        Called with the sampled adjacency matrix; its return value is the
        statistic stored for that sample.
    n_samples: int, optional (default=1000)
        Number of graphs to sample.
    parallel: boolean, optional (default=True)
        If True, samples are dispatched through ``Parallel``/``delayed``
        (``n_jobs=-2``), each with its own seed drawn from the global NumPy
        RNG. If False, samples are drawn sequentially under a ``tqdm``
        progress bar without reseeding.

    Returns
    -------
    null: np.ndarray
        The ``n_samples`` statistics, sorted with ``np.sort``.

    Notes
    -----
    A sample that is not fully connected is redrawn up to 10 times. If none of
    the redraws is connected a message is printed and the last draw is used
    anyway.
    """

    def sample_and_tstat(seed=None):
        # One draw (with retries) shared by the parallel and serial paths so
        # both treat p_mat identically.
        if seed is not None:
            np.random.seed(seed)
        A = sample_edges(np.array(p_mat), directed=True, loops=False)
        if not is_fully_connected(A):
            print("Original sample was not fully connected, trying again...")
            for _ in range(10):
                A = sample_edges(np.array(p_mat), directed=True, loops=False)
                if is_fully_connected(A):
                    break
            else:
                # all 10 redraws were disconnected; fall through with the last
                print("Did not sample connected graph after 10 tries.")
        return tstat_func(A)

    if parallel:
        # Distinct seeds per task so workers do not repeat the same stream.
        seeds = np.random.randint(1e8, size=n_samples)
        null = Parallel(n_jobs=-2, verbose=10)(
            delayed(sample_and_tstat)(seed) for seed in seeds
        )
    else:
        null = [sample_and_tstat() for _ in tqdm(range(n_samples))]

    null = np.array(null)
    null = np.sort(null)
    return null
def sample_and_tstat(seed=None):
    """
    Draw one directed, loop-free graph from ``p_mat`` and return its statistic.

    ``p_mat`` and ``tstat_func`` are not parameters; they are resolved from
    the surrounding scope. When ``seed`` is given, the global NumPy RNG is
    reseeded with it first. A draw that is not fully connected is redrawn up
    to 10 times; if every redraw is disconnected, the last one is used.
    """
    if seed is not None:
        np.random.seed(seed)

    graph = sample_edges(np.array(p_mat), directed=True, loops=False)
    if not is_fully_connected(graph):
        print("Original sample was not fully connected, trying again...")
        for _ in range(10):
            graph = sample_edges(np.array(p_mat), directed=True, loops=False)
            if is_fully_connected(graph):
                break
        else:
            # reached only when no redraw was connected
            print("Did not sample connected graph after 10 tries.")

    return tstat_func(graph)
def setup_class(cls):
    """Seed the RNG, build p * outer(dc, dc) with a zero diagonal, sample it once."""
    np.random.seed(8888)
    n_verts, base_prob = 1000, 0.5
    degree_corrections = np.random.beta(2, 5, size=n_verts)

    probs = np.full((n_verts, n_verts), base_prob) * np.outer(
        degree_corrections, degree_corrections
    )
    # no self-loops: zero out the diagonal
    np.fill_diagonal(probs, 0)

    cls.p_mat = probs
    cls.graph = sample_edges(probs, directed=True, loops=False)
def setup_class(cls):
    """Seed the RNG and sample a graph from two groups of identical latent points."""
    np.random.seed(8888)
    group_size = 500
    # first `group_size` rows are (0.1, 0.9), the next `group_size` are (0.9, 0.1)
    positions = np.array([[0.1, 0.9], [0.9, 0.1]])
    latent = np.repeat(positions, group_size, axis=0)

    probs = latent @ latent.T
    # no self-loops: zero out the diagonal
    np.fill_diagonal(probs, 0)

    cls.p_mat = probs
    cls.graph = sample_edges(probs)
def test_SBM_score(self):
    """Exercise score() and score_samples() on a two-block probability matrix."""
    block_probs = np.array([[0.75, 0.25], [0.25, 0.75]])
    n_verts = 100
    n_total = n_verts * 2
    labels = _n_to_labels(np.array([n_verts, n_verts]))
    p_mat = _block_to_full(block_probs, labels, shape=(n_total, n_total))
    graph = sample_edges(p_mat, directed=True)

    estimator = SBMEstimator(max_comm=4)
    _test_score(estimator, p_mat, graph)

    # a graph of a different size than the one scored above must be rejected
    smaller_graph = graph[1:100, 1:100]
    with pytest.raises(ValueError):
        estimator.score_samples(graph=smaller_graph)
def setUp(self) -> None:
    """Seed the RNG and store a probability matrix plus one graph sampled from it."""
    np.random.seed(8888)
    n_verts = 500
    anchors = (np.array([0.1, 0.9]), np.array([0.9, 0.1]))
    # stack n_verts copies of each anchor point, first anchor first
    latent = np.concatenate(
        [np.tile(anchor, reps=(n_verts, 1)) for anchor in anchors], axis=0
    )

    probs = latent @ latent.T
    # no self-loops: zero out the diagonal
    np.fill_diagonal(probs, 0)

    self.p_mat = probs
    self.graph = sample_edges(probs)
def test_RDPG_fit(self):
    """Fit an RDPGEstimator on a sampled graph and compare p_mat_ with the truth."""
    np.random.seed(8888)
    n_points = 2000
    latent = hardy_weinberg(np.random.uniform(0, 1, n_points))

    true_probs = latent @ latent.T
    # no self-loops: zero out the diagonal
    np.fill_diagonal(true_probs, 0)
    sampled = sample_edges(true_probs)

    estimator = RDPGEstimator(loops=False, n_components=3)
    estimator.fit(sampled)
    assert_allclose(estimator.p_mat_, true_probs, atol=0.2)
def test_SBM_fit_unsupervised(self):
    """Fit an SBMEstimator without labels; check recovered blocks and p_mat_."""
    np.random.seed(12345)
    n_verts = 1500
    block_probs = np.array(
        [
            [0.7, 0.1, 0.1],
            [0.1, 0.9, 0.1],
            [0.05, 0.1, 0.75],
        ]
    )
    true_labels = _n_to_labels(np.array([500, 500, 500]))

    true_probs = _block_to_full(block_probs, true_labels, (n_verts, n_verts))
    # no self-loops: zero out the diagonal
    np.fill_diagonal(true_probs, 0)
    sampled = sample_edges(true_probs, directed=True, loops=False)

    model = SBMEstimator(directed=True, loops=False)
    model.fit(sampled)

    assert adjusted_rand_score(true_labels, model.vertex_assignments_) > 0.95
    assert_allclose(true_probs, model.p_mat_, atol=0.12)
def setUpClass(cls) -> None:
    """Seed the RNG and store a four-block, degree-scaled p_mat, its labels and a sample."""
    np.random.seed(8888)
    block_probs = np.array(
        [
            [0.9, 0.2, 0.05, 0.1],
            [0.1, 0.7, 0.1, 0.1],
            [0.2, 0.4, 0.8, 0.5],
            [0.1, 0.2, 0.1, 0.7],
        ]
    )
    block_sizes = np.array([1000, 1000, 500, 500])
    n_total = block_sizes.sum()

    # drawn before anything else touches the RNG, as in the original order
    degree_corrections = np.random.beta(2, 5, size=n_total)
    labels = _n_to_labels(block_sizes)

    probs = _block_to_full(block_probs, labels, (n_total, n_total))
    probs = probs * np.outer(degree_corrections, degree_corrections)
    # no self-loops: zero out the diagonal
    np.fill_diagonal(probs, 0)

    cls.p_mat = probs
    cls.labels = labels
    cls.g = sample_edges(probs, directed=True, loops=False)
def test_DCER_sample(self):
    """
    Check DCEREstimator.sample(): it must fail before fitting, reject invalid
    ``n_samples``, and produce samples consistent with a known p_mat.
    """
    np.random.seed(8888)
    estimator = DCEREstimator(directed=True, loops=False)
    g = self.graph

    # sampling before fit is an error
    with pytest.raises(NotFittedError):
        estimator.sample()

    estimator.fit(g)
    with pytest.raises(ValueError):
        estimator.sample(n_samples=-1)

    with pytest.raises(TypeError):
        estimator.sample(n_samples="nope")

    # Build a fresh, smaller model for the sampling check. (The fixture's
    # self.p_mat is not needed here: p_mat is defined only at this point.)
    B = 0.5
    dc = np.random.uniform(0.25, 0.75, size=100)
    p_mat = np.outer(dc, dc) * B
    p_mat -= np.diag(np.diag(p_mat))
    g = sample_edges(p_mat, directed=True)

    estimator.fit(g)
    # overwrite the fitted matrix with the true one so that sampling is
    # compared against known probabilities
    estimator.p_mat_ = p_mat
    _test_sample(estimator, p_mat, n_samples=1000, atol=0.2)
def test_DCSBM_sample(self):
    """
    Check DCSBMEstimator.sample(): it must fail before fitting, reject invalid
    ``n_samples``, and produce samples consistent with a known p_mat.
    """
    np.random.seed(8888)
    estimator = DCSBMEstimator(directed=True, loops=False)

    block_probs = np.array([[0.9, 0.1], [0.1, 0.9]])
    degree_corrections = np.random.uniform(0.25, 0.75, size=100)
    labels = _n_to_labels([50, 50])
    true_probs = _block_to_full(block_probs, labels, (100, 100)) * np.outer(
        degree_corrections, degree_corrections
    )
    true_probs -= np.diag(np.diag(true_probs))
    sampled = sample_edges(true_probs, directed=True)

    # sampling before fit is an error
    with pytest.raises(NotFittedError):
        estimator.sample()

    estimator.fit(sampled, y=labels)
    for expected_error, bad_n_samples in ((ValueError, -1), (TypeError, "nope")):
        with pytest.raises(expected_error):
            estimator.sample(n_samples=bad_n_samples)

    # overwrite the fitted matrix with the true one before comparing samples
    estimator.p_mat_ = true_probs
    _test_sample(estimator, true_probs, n_samples=1000, atol=0.1)
def sample_upset():
    """
    Return one directed, loop-free sample of ``construct_feedforward_P(n, p=p, delta=0)``.

    ``n`` and ``p`` are read from the enclosing scope.
    """
    return sample_edges(
        construct_feedforward_P(n, p=p, delta=0),
        directed=True,
        loops=False,
    )
triu_inds = np.triu_indices(n, k=1) p_upper = p + delta p_lower = p - delta P = np.zeros((n, n)) P[triu_inds] = p_upper P[triu_inds[::-1]] = p_lower return P n = 30 p = 0.5 delta = 0.1 P = construct_feedforward_P(n, p=p, delta=delta) A = sample_edges(P, directed=True, loops=False) fig, axs = plt.subplots(1, 3, figsize=(12, 4)) # TODO make a plot of Phat title = r"$P$" + "\n" title += r"$p = $" + f"{p}, " + r"$\delta = $" + f"{delta}" ax = axs[0] heatmap(P, vmin=0, vmax=1, cbar=False, ax=ax, title=title) ax.text(n / 4, 3 * n / 4, r"$p - \delta$", ha="center", va="center") ax.text(3 * n / 4, n / 4, r"$p - \delta$", ha="center", va="center", color="white")
def sample_edges_corr(P, R, directed=False, loops=False):
    """
    Generate a pair of correlated graphs with Bernoulli distribution.
    Both G1 and G2 are binary matrices.

    Parameters
    ----------
    P: np.ndarray, shape (n_vertices, n_vertices)
        Matrix of probabilities (between 0 and 1) for a random graph.

    R: np.ndarray, shape (n_vertices, n_vertices)
        Matrix of correlation (between 0 and 1) between graph pairs.

    directed: boolean, optional (default=False)
        If False, output adjacency matrix will be symmetric. Otherwise, output
        adjacency matrix will be asymmetric.

    loops: boolean, optional (default=False)
        If False, no edges will be sampled in the diagonal. Otherwise, edges
        are sampled in the diagonal.

    Returns
    -------
    G1: ndarray (n_vertices, n_vertices)
        Adjacency matrix the same size as P representing a random graph.

    G2: ndarray (n_vertices, n_vertices)
        Adjacency matrix the same size as P representing a random graph.

    Raises
    ------
    TypeError
        If P or R is not exactly a ``numpy.ndarray`` (subclasses are
        rejected).
    ValueError
        If P or R is not a square 2-dimensional matrix, or if R does not have
        the same shape as P.

    References
    ----------
    .. [1] Vince Lyzinski, et al. "Seeded Graph Matching for Correlated
       Erdos-Renyi Graphs", Journal of Machine Learning Research 15, 2014

    Examples
    --------
    >>> np.random.seed(1)
    >>> p = 0.5
    >>> r = 0.3
    >>> R = r * np.ones((5, 5))
    >>> P = p * np.ones((5, 5))

    To sample a correlated graph pair based on P and R matrices:

    >>> sample_edges_corr(P, R, directed = False, loops = False)
    (array([[0., 1., 0., 0., 0.],
            [1., 0., 0., 0., 0.],
            [0., 0., 0., 0., 1.],
            [0., 0., 0., 0., 1.],
            [0., 0., 1., 1., 0.]]), array([[0., 1., 0., 0., 0.],
            [1., 0., 1., 0., 1.],
            [0., 1., 0., 1., 1.],
            [0., 0., 1., 0., 1.],
            [0., 1., 1., 1., 0.]]))
    """
    # test input
    # check P
    if type(P) is not np.ndarray:
        raise TypeError("P must be numpy.ndarray")
    if len(P.shape) != 2:
        raise ValueError("P must have dimension 2 (n_vertices, n_vertices)")
    if P.shape[0] != P.shape[1]:
        raise ValueError("P must be a square matrix")

    # check R
    if type(R) is not np.ndarray:
        raise TypeError("R must be numpy.ndarray")
    if len(R.shape) != 2:
        raise ValueError("R must have dimension 2 (n_vertices, n_vertices)")
    # Comparing only R.shape[0] with P.shape[1] let a non-square R such as
    # (n, m) through; check squareness and agreement with P explicitly.
    if R.shape[0] != R.shape[1]:
        raise ValueError("R must be a square matrix")
    if R.shape != P.shape:
        raise ValueError("R must have the same shape as P")

    # check directed and loops
    check_dirloop(directed, loops)

    G1 = sample_edges(P, directed=directed, loops=loops)
    # Edge probability for G2 conditioned on G1: P + R * (1 - P) where G1 has
    # an edge, P * (1 - R) where it does not.
    P2 = np.where(G1 == 1, P + R * (1 - P), P * (1 - R))
    G2 = sample_edges(P2, directed=directed, loops=loops)
    return G1, G2
#%%
# NOTE(review): `np` and `plt` are used below but not imported in the visible
# cells — presumably imported earlier in the file; confirm.
from graspologic.simulations import sample_edges, sbm
from graspologic.utils import cartprod
import seaborn as sns

# Three equally sized communities; only the labels of the sbm draw are kept.
n_per_comm = 50
B = np.array(
    [
        [0.8, 0.1, 0.1],
        [0.1, 0.75, 0.05],
        [0.1, 0.05, 0.6],
    ]
)
_, labels = sbm([n_per_comm] * 3, B, return_labels=True)
# Expand the block matrix to a full vertex-by-vertex probability matrix.
P = B[labels][:, labels]
sns.heatmap(P)

#%%
# Pool the eigenvalues of many sampled graphs and mark the three largest
# eigenvalues of P on the histogram.
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
true_eigvals = np.linalg.eigvalsh(P)

n_sims = 1000
all_estimated_eigvals = []
for i in range(n_sims):
    A = sample_edges(P, directed=False, loops=True)
    estimated_eigvals = np.linalg.eigvalsh(A)
    all_estimated_eigvals.extend(estimated_eigvals)

sns.histplot(all_estimated_eigvals, ax=ax, stat="density")

# eigvalsh returns ascending order, so the last three are the largest
for true_eigval in true_eigvals[-1:-4:-1]:
    ax.axvline(true_eigval, color="darkred")

#%%
# Spectral norm of the difference between P and the last sampled graph.
np.linalg.norm(P - A, ord=2)