Example No. 1
import matplotlib
import matplotlib.pyplot as plt
from graspologic.plot import heatmap


def plot_heatmap(A,
                 ax,
                 title="A Stochastic Block Model With 3 Communities",
                 show_cbar=True):
    # Binary colormap: 0 (no edge) -> white, 1 (edge) -> black.
    cmap = matplotlib.colors.ListedColormap(["white", "black"])
    # `labels` is assumed to be a module-level array of community labels.
    ax = heatmap(A,
                 cmap=cmap,
                 ax=ax,
                 inner_hier_labels=labels,
                 title=title,
                 center=None,
                 cbar=False)
    ax.set_frame_on(False)
    if show_cbar:
        fig = plt.gcf()
        cax = fig.add_axes([0.95, 0.4, 0.05, 0.2])
        colorbar = fig.colorbar(ax.imshow(A, cmap=cmap), cax=cax)
        # Tick labels ordered to match the colormap: bottom (white) is
        # "No Edge", top (black) is "Edge".
        colorbar.set_ticks([0.25, 0.75])
        colorbar.set_ticklabels(["No Edge", "Edge"])
        cax.set_frame_on(True)
    return ax
Example No. 2

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
from graspologic.plot import heatmap

# `Zhat`, `Ztrue`, `participants`, `args`, and the helper `flip_diag` are
# assumed to be defined earlier in the script this snippet comes from.

# number of clusters estimated for each pair of graphs
num_clusters = np.zeros(Zhat.shape[0])
for i in range(Zhat.shape[0]):
    num_clusters[i] = np.unique(Zhat[i, :]).size
pair_clusters = flip_diag(num_clusters)

fig, axs = plt.subplots(1, 2, figsize=(20, 8))
cluster, count = np.unique(num_clusters, return_counts=True)
axs[0].bar(cluster, count)
axs[0].set_xlabel('number of estimated clusters')
axs[0].set_ylabel('number of pairs of graphs')

heatmap(pair_clusters,
        inner_hier_labels=participants['genotype'].values,
        ax=axs[1],
        title='number of estimated clusters')
plt.savefig('figures/mouse_num_clusters_{}'.format(args.transformation))

# clustering metrics
num_tests = Zhat.shape[0]
adj_rand = np.zeros(num_tests)
adj_mi = np.zeros(num_tests)

for i in range(num_tests):
    adj_rand[i] = adjusted_rand_score(Ztrue, Zhat[i, :])
    adj_mi[i] = adjusted_mutual_info_score(Ztrue, Zhat[i, :])

adj_rand = flip_diag(adj_rand)
adj_mi = flip_diag(adj_mi)
Example No. 3

import numpy as np
import matplotlib.pyplot as plt
from graspologic.simulations import sample_edges
from graspologic.plot import heatmap

# `construct_feedforward_P`, `calculate_p_upper`, and `rank_graph_match_flow`
# are assumed to be project-specific helpers defined elsewhere.
n = 30
p = 0.5
delta = 0.1

P = construct_feedforward_P(n, p=p, delta=delta)

A = sample_edges(P, directed=True, loops=False)

fig, axs = plt.subplots(1, 3, figsize=(12, 4))

# TODO make a plot of Phat
title = r"$P$" + "\n"
title += r"$p = $" + f"{p}, " + r"$\delta = $" + f"{delta}"
ax = axs[0]
heatmap(P, vmin=0, vmax=1, cbar=False, ax=ax, title=title)
# Label the off-diagonal blocks, assuming construct_feedforward_P puts
# p + delta on the upper triangle (the feedforward direction) and
# p - delta on the lower triangle.
ax.text(n / 4, 3 * n / 4, r"$p - \delta$", ha="center", va="center")
ax.text(3 * n / 4,
        n / 4,
        r"$p + \delta$",
        ha="center",
        va="center",
        color="white")

p_upper = calculate_p_upper(A)
title = "A (original permutation)\n"
title += r"$p_{upper} = $" + f"{p_upper:0.2f}"
heatmap(A, cbar=False, ax=axs[1], title=title)

perm_inds = rank_graph_match_flow(A)
p_upper = calculate_p_upper(A[np.ix_(perm_inds, perm_inds)])
Example No. 4
import numpy as np

from graspologic.simulations import sbm
from graspologic.plot import heatmap

np.random.seed(42)

# Start with some simple parameters
N = 1500  # Total number of nodes
n = N // 3  # Nodes per community
p, q = .3, .15  # Within- and between-community edge probabilities
B = np.array([[p, p, q],
              [p, p, q],
              [q, q, p]])  # Our block probability matrix

# Make and visualize our Stochastic Block Model
A, labels = sbm([n, n, n], B, return_labels=True)
heatmap(A, title="A Stochastic Block Model");

There are three communities (we promise), but the first two are impossible to distinguish using only our network. The third community is distinct: its nodes are unlikely to connect to nodes in the first two communities, and very likely to connect to each other. If we embedded this network using our Laplacian or Adjacency Spectral Embedding methods, we'd find the first and second communities layered on top of each other.

from graspologic.embed import LaplacianSpectralEmbed as LSE
from graspologic.utils import to_laplacian
import matplotlib.pyplot as plt
import seaborn as sns


def plot_latents(latent_positions, *, title, labels, ax=None):
    if ax is None:
        ax = plt.gca()
    # seaborn requires x= and y= as keyword arguments in recent versions
    plot = sns.scatterplot(x=latent_positions[:, 0], y=latent_positions[:, 1],
                           hue=labels, palette="Set1", linewidth=0, s=10, ax=ax)
    plot.set_title(title, wrap=True);
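
With these pieces in place, we can check the claim above. The sketch below (assuming the `A` and `labels` from the SBM sampled earlier) embeds the network with LSE and plots the latent positions; the first two communities land on top of one another:

# Embed the adjacency matrix into two dimensions, then plot the latent
# positions colored by their true community.
lse = LSE(n_components=2)
latents = lse.fit_transform(A)
plot_latents(latents, title="LSE of the three-community SBM", labels=labels)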
Example No. 5
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from graspologic.simulations import sbm
from graspologic.plot import heatmap


def eig(A):
    # Eigendecomposition with eigenvalues (and matching eigenvectors)
    # sorted in ascending order.
    evals, evecs = np.linalg.eig(A)
    sort_inds = np.argsort(evals)
    evals = evals[sort_inds]
    evecs = evecs[:, sort_inds]
    return evals, evecs


#%%
B = np.array([[0.8, 0.05], [0.05, 0.8]])
A = sbm([10, 10], B)
heatmap(A)

#%%
sns.set_context("talk")
degrees = np.sum(A, axis=0)
D = np.diag(degrees)
L = D - A
evals, evecs = eig(L)
fig = plt.figure()
sns.scatterplot(y=evals, x=np.arange(len(evals)))

#%%
# Scan the between-block probability p and, for each p, sample several SBMs,
# recording the second-smallest eigenvalue of L (the algebraic connectivity).
# The loop body past the construction of B is an assumed completion; the
# original snippet is truncated there.
rows = []
for p in np.linspace(0, 0.8, 20):
    for i in range(10):
        B = np.array([[0.8, p], [p, 0.8]])
        A = sbm([10, 10], B)
        L = np.diag(np.sum(A, axis=0)) - A
        evals, _ = eig(L)
        rows.append({"p": p, "lambda_2": evals[1]})
Example No. 6
        ))
    ranking_stats.append({
        "p_upper": p_upper,
        "season": season,
        "season_start": int(season.split("/")[0]),
    })
ranking_stats = pd.DataFrame(ranking_stats)
rankings = pd.DataFrame(rankings).T
rankings.index.name = "team"
# Fill seasons where a team is unranked with 30 (worse than any observed
# rank) before averaging.
rankings["mean"] = rankings.fillna(30).mean(axis=1)
rankings = rankings.sort_values("mean")
rankings
#%%
from graspologic.plot import heatmap

heatmap(adj)
heatmap(adj[np.ix_(perm_inds, perm_inds)])
season_nodes[perm_inds]

#%%
fig, ax = plt.subplots(1, 1, figsize=(8, 6))

sns.lineplot(data=ranking_stats, x="season_start", y="p_upper")

#%%
# rankings = rankings.fillna(2)
import colorcet as cc

fig, ax = plt.subplots(1, 1, figsize=(20, 10))
pd.plotting.parallel_coordinates(
    rankings.fillna(30).reset_index().drop("mean", axis=1),
    "team",  # class column; assumed completion - the original is cut off here
    color=cc.glasbey,
    ax=ax,
)
Example No. 7
\begin{align*}
\mathbb{E}[\text{deg}(v_i)] &= \sum_{j \neq i} \mathbb{E}[\pmb a_{ij}] = (n - 1)p,
\end{align*}
which follows from the fact that all of the $n - 1$ possible edges incident to vertex $v_i$ have the same expected probability of occurrence, $p$, governed by the parameter of the $ER_n(p)$ model. This tractability makes $ER_n(p)$ an ideal candidate graph for describing the properties we should expect of a network if it is $ER_n(p)$. Conversely, we can easily invert these properties to study when a graph is *not* an $ER_n(p)$ random graph and may merit more careful inferential tasks. On another front, when one wishes to devise new computational techniques and assess their efficiency or effectiveness on a network with a given number of nodes and edges, without concern for how the technique fares when the network displays other (potentially exploitable) properties, the $ER_n(p)$ model also makes a good candidate for analysis. This is particularly common when dealing with graphs which are known to be sparse; that is, where $p$ is very small (usually on the order of, or less than, $1/n$).
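
To make the sparsity remark concrete, here is a minimal sketch (the variable names and the choice $p = 1/n$ are ours, purely for illustration) that samples a sparse $ER_n(p)$ network and compares its edge count to the expected value $\binom{n}{2}p$:

import numpy as np
from graspologic.simulations import er_np

n_sparse = 1000
p_sparse = 1 / n_sparse  # on the order of 1/n, so the network is sparse

# sample an undirected, loopless ER network and count its edges
A_sparse = er_np(n=n_sparse, p=p_sparse)
n_edges = A_sparse.sum() / 2  # each undirected edge appears twice in A
expected = n_sparse * (n_sparse - 1) / 2 * p_sparse
print(f"observed edges: {n_edges:.0f}, expected: {expected:.1f}")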

The following Python code generates and visualizes a graph drawn from the $ER_n(p)$ model. Here, we let $n = 50$ vertices and set the probability of an edge to $p = 0.3$:

from graspologic.plot import heatmap
from graspologic.simulations import er_np

n = 50  # graph with 50 vertices
ps = 0.3  # probability of an edge existing is .3

# sample a single adj. mtx from ER(50, .3)
As = er_np(n=n, p=ps, directed=True, loops=True)

# and plot it
heatmap(As, title="ER(50, 0.3) Simulation")

In the simple simulation above, we sample a single directed network with loops, with adjacency matrix $\pmb A^{(s)}$. We visualize the network using a heatmap, where we recall that a square is dark red if an edge is present, and white if no edge is present.

Given a graph with an adjacency matrix $\pmb A^{(s)}$, we can also use graspologic to estimate the probability parameter of the $ER_n(p)$ model:

from graspologic.models import EREstimator

# instantiate an ER Estimator which is directed with loops
er = EREstimator(directed=True, loops=True)
# fit an ER model to As
er.fit(As)
print(f"ER \"p\" parameter: {er.p_}")

As we can see, the probability parameter for a directed network with loops is simply the average edge weight within the network:
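
A minimal check of this claim, assuming the `As` and `er` objects from the snippets above:

import numpy as np

# For a directed network with loops, every one of the n^2 entries of the
# adjacency matrix is a potential edge, so the estimate is the mean entry.
print(f"mean edge weight: {np.mean(As)}")  # should match er.p_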
Example No. 8

    plt.figure(figsize=(14, 10))
    sns.set(font_scale=2)
    sns.lineplot(data=median_ts_all, x=xlab, y=ylab, hue=klab)
    plt.title(
        'Median Test Statistic on Enron Email Dataset with Various Numbers of Communities',
        fontsize=fontsize)
    plt.savefig('figures/enron_teststats_median.png')

elif args.setting == 'auto':
    xlab = 'Difference in Time'
    ylab = 'Test Statistic'
    fontsize = 20

    with open(
            'outputs/enron_gcorrDC_teststats_unpooled_Zestimated_untransformed_fixseed.pkl',
            'rb') as f:
        ts = pickle.load(f)
    ts_pair_result = get_pair_result(ts, xlab, ylab)
    fig, axs = plt.subplots(1, 2, figsize=(20, 6))
    heatmap(ts, ax=axs[0])
    axs[0].set_xlabel('Time Point', fontsize=fontsize)
    axs[0].set_ylabel('Time Point', fontsize=fontsize)
    axs[0].set_title(
        'Test Statistic (Number of Communities Chosen Automatically)',
        fontsize=fontsize)
    p = sns.boxplot(x=xlab, y=ylab, data=ts_pair_result, ax=axs[1])
    p.set_xlabel(xlab, fontsize=fontsize)
    p.set_ylabel(ylab, fontsize=fontsize)
    axs[1].set_title('Test Statistic vs. Difference in Time',
                     fontsize=fontsize)
    plt.savefig('figures/enron_teststats_autoK.png')
Example No. 9
# Below, you can see the Laplacian we generated earlier next to $YY^\top$. Remember, each matrix contains information about our communities that the other doesn't have - and our goal is to combine them in a way that lets us distinguish between all three communities.

# In[10]:

# plot
fig, axs = plt.subplots(nrows=1,
                        ncols=2,
                        figsize=(10, 5),
                        constrained_layout=True)
L_ax = plot_heatmap(L,
                    title=r"Regularized Laplacian",
                    ax=axs[0],
                    show_cbar=False)
X_ax = heatmap(YYt,
               title=r"Covariate matrix times its transpose ($YY^\top$)",
               ax=axs[1])

# The way we'll combine the two matrices is simply a weighted sum - this is what CASE is doing under the hood. The weight (here called $\alpha$) is multiplied by $YY^\top$ so that, with a well-chosen weight, both matrices contribute a comparable amount of useful information to the embedding.
#
# $$
# L + \alpha YY^\top
# $$

# ### Exploring Possible Weights

# An obvious question here is how to weight the covariates. If we simply summed the two matrices as-is, whichever matrix contained larger values would unfortunately dominate the other. In our current setup, without a weight on $YY^\top$, the covariates of our network would dominate over its topology.
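
# To see the weighted sum directly, here is a minimal sketch - the value of
# alpha below is purely illustrative, not the tuned weight CASE would use -
# combining the L and YYt defined above:

alpha = 0.01  # illustrative weight, not tuned
L_combined = L + alpha * YYt
heatmap(L_combined, title=r"$L + \alpha YY^\top$");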

# In[11]:

fig, axs = plt.subplots(1, 2, figsize=(10, 5))
Example No. 10
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ortho_group

# `Xs` is assumed to be a list of latent-position matrices defined earlier.
new_Xs = []  # rotated copies of each X
for i, X in enumerate(Xs):
    Q = ortho_group.rvs(X.shape[1])  # random orthogonal matrix
    X = X @ Q
    new_Xs.append(X)

Z_w = np.concatenate(new_Xs, axis=1)
Z = np.concatenate(Xs, axis=1)
# ~0: ZZ^T is invariant to a per-block orthogonal rotation of the factors
np.linalg.norm(Z_w @ Z_w.T - Z @ Z.T)

from graspologic.plot import heatmap

plt.figure()
sns.heatmap(Z_w)
plt.figure()
sns.heatmap(Z)
heatmap(Z_w @ Z_w.T)
heatmap(Z @ Z.T)

#%%
# By contrast, rotating only one factor changes the cross term:
Q = ortho_group.rvs(Xs[0].shape[1])
XWYt = Xs[0] @ Q @ Xs[1].T
XYt = Xs[0] @ Xs[1].T
np.linalg.norm(XWYt - XYt)  # generally far from zero
#%%
block = [[Xs[0] @ Xs[0].T, Xs[0] @ Xs[1].T],
         [Xs[1] @ Xs[0].T, Xs[1] @ Xs[1].T]]
ZZt_mine = np.block(block)
heatmap(ZZt_mine)

#%%
ZZt_w = Z_w @ Z_w.T
Example No. 11
            input_file = 'mouse_gcorr_teststats_unpooled_Z{}_{}.pkl'.format(
                z, tf)
        elif test == 'DCSBM Gcorr Pooled':
            input_file = 'mouse_gcorrDC_teststats_pooled_Z{}_{}.pkl'.format(
                z, tf)
        elif test == 'DCSBM Gcorr Unpooled':
            input_file = 'mouse_gcorrDC_teststats_unpooled_Z{}_{}.pkl'.format(
                z, tf)

        with open('outputs/{}'.format(input_file), 'rb') as f:
            result = pickle.load(f)
        result += result.T  # symmetrize the test-statistic matrix
        heatmap(result,
                ax=axs[i, j],
                vmax=0.7,
                center=0.35,
                inner_hier_labels=participants['genotype'],
                hier_label_fontsize=25,
                cbar=not bool(i + j))

pad = 60
label_size = 40
rows = ['Z Given', 'Z Estimated']
for ax, row in zip(axs[:, 0], rows):
    ax.annotate(row,
                xy=(0, 0.5),
                xytext=(-ax.yaxis.labelpad - pad, 0),
                xycoords=ax.yaxis.label,
                textcoords='offset points',
                size=label_size,
                ha='right',
                va='center')