Example #1
def estimate_assignments(graph,
                         n_communities,
                         n_components=None,
                         method="gc",
                         metric=None):
    """Given a graph and n_comunities, sweeps over covariance structures
    Not deterministic
    Not using graph bic or mse to calculate best

    1. Does an embedding on the raw graph
    2. GaussianCluster on the embedding. This will sweep covariance structure for the 
       given n_communities
    3. Returns n_parameters based on the number used in GaussianCluster

    method can be "gc" or "bc" 

    method 
    "gc" : use graspy GaussianCluster
        this defaults to full covariance
    "bc" : tommyclust with defaults
        so sweep covariance, agglom, linkage
    "bc-metric" : tommyclust with custom metric
        still sweep everything
    "bc-none" : mostly for testing, should behave just like GaussianCluster

    """
    embed_graph = graph.copy()
    latent = AdjacencySpectralEmbed(
        n_components=n_components).fit_transform(embed_graph)
    if isinstance(latent, tuple):
        latent = np.concatenate(latent, axis=1)
    if method == "gc":
        gc = GaussianCluster(
            min_components=n_communities,
            max_components=n_communities,
            covariance_type="all",
        )
        vertex_assignments = gc.fit_predict(latent)
        n_params = gc.model_._n_parameters()
    elif method == "bc":
        vertex_assignments, n_params = brute_cluster(latent, [n_communities])
    elif method == "bc-metric":
        vertex_assignments, n_params = brute_cluster(latent, [n_communities],
                                                     metric=metric)
    elif method == "bc-none":
        vertex_assignments, n_params = brute_cluster(
            latent,
            [n_communities],
            affinities=["none"],
            linkages=["none"],
            covariance_types=["full"],
        )
    else:
        raise ValueError("Unspecified clustering method")
    return (vertex_assignments, n_params)
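

# Usage sketch (illustrative, not part of the original example): cluster a
# simulated two-block SBM with the default "gc" method. Assumes the graspy
# imports used above plus graspy.simulations.sbm; the block sizes and
# probabilities below are arbitrary placeholder values.
from graspy.simulations import sbm

example_graph = sbm([50, 50], [[0.3, 0.05], [0.05, 0.3]])
example_labels, example_n_params = estimate_assignments(example_graph,
                                                        n_communities=2,
                                                        method="gc")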


def brute_graspy_cluster(Ns,
                         x,
                         covariance_types,
                         ks,
                         c_true,
                         savefigs=None,
                         graphList=None):
    """For each N in Ns, run GaussianCluster N times with different random states,
    sweeping `ks` components and `covariance_types`; track the best-BIC model
    overall (and its ARI against c_true) and optionally plot the results."""
    if graphList is not None and 'all_bics' in graphList:
        _, ((ax0, ax1), (ax2, ax3)) = plt.subplots(2,
                                                   2,
                                                   sharey='row',
                                                   sharex='col',
                                                   figsize=(10, 10))
    titles = ['full', 'tied', 'diag', 'spherical']
    best_bic = -np.inf
    for N in Ns:
        bics = np.zeros([len(ks), len(covariance_types), N])
        aris = np.zeros([len(ks), len(covariance_types), N])
        for i in np.arange(N):
            graspy_gmm = GaussianCluster(min_components=ks[0],
                                         max_components=ks[len(ks) - 1],
                                         covariance_type=covariance_types,
                                         random_state=i)
            c_hat, ari = graspy_gmm.fit_predict(x, y=c_true)
            bic_values = -graspy_gmm.bic_.values
            ari_values = graspy_gmm.ari_.values
            bics[:, :, i] = bic_values
            aris[:, :, i] = ari_values
            bic = bic_values.max()

            if bic > best_bic:
                idx = np.argmax(bic_values)
                idxs = np.unravel_index(idx, bic_values.shape)
                best_ari_bic = ari
                best_bic = bic
                best_k_bic = ks[idxs[0]]
                best_cov_bic = titles[3 - idxs[1]]
                best_c_hat_bic = c_hat

        max_bics = np.amax(bics, axis=2)
        title = 'N=' + str(N)
        if graphList is not None and 'all_bics' in graphList:
            ax0.plot(np.arange(1, len(ks) + 1), max_bics[:, 3])
            ax1.plot(np.arange(1, len(ks) + 1), max_bics[:, 2], label=title)
            ax2.plot(np.arange(1, len(ks) + 1), max_bics[:, 1])
            ax3.plot(np.arange(1, len(ks) + 1), max_bics[:, 0])

    if graphList is not None and 'best_bic' in graphList:
        # Plot with best BIC
        if c_true is None:
            best_ari_bic_str = 'NA'
        else:
            best_ari_bic_str = '%1.3f' % best_ari_bic

        fig_bestbic = plt.figure(figsize=(8, 8))
        ax_bestbic = fig_bestbic.add_subplot(1, 1, 1)
        #ptcolors = [colors[i] for i in best_c_hat_bic]
        ax_bestbic.scatter(x[:, 0], x[:, 1], c=best_c_hat_bic)
        #mncolors = [colors[i] for i in np.arange(best_k_bic)]
        mncolors = [i for i in np.arange(best_k_bic)]
        ax_bestbic.set_title(
            "py(agg-gmm) BIC %3.0f from " % best_bic + str(best_cov_bic) +
            " k=" + str(best_k_bic) + ' ari=' +
            best_ari_bic_str)  # + "iter=" + str(best_iter_bic))
        ax_bestbic.set_xlabel("First feature")
        ax_bestbic.set_ylabel("Second feature")
        if savefigs is not None:
            plt.savefig(savefigs + '_python_bestbic.jpg')

    if graphList is not None and 'all_bics' in graphList:
        # Plot of all BICs
        titles = ['full', 'tied', 'diag', 'spherical']
        #ax0.set_title(titles[0],fontsize=20,fontweight='bold')
        #ax0.set_ylabel('BIC',fontsize=20)
        ax0.locator_params(axis='y', tight=True, nbins=4)
        ax0.set_yticklabels(ax0.get_yticks(), fontsize=14)

        #ax1.set_title(titles[1],fontsize=20,fontweight='bold')
        legend = ax1.legend(loc='best', title='Number of\nRuns', fontsize=12)
        plt.setp(legend.get_title(), fontsize=14)

        #ax2.set_title(titles[2],fontsize=20,fontweight='bold')
        #ax2.set_xlabel('Number of components',fontsize=20)
        ax2.set_xticks(np.arange(0, 21, 4))
        ax2.set_xticklabels(ax2.get_xticks(), fontsize=14)
        #ax2.set_ylabel('BIC',fontsize=20)
        ax2.locator_params(axis='y', tight=True, nbins=4)
        ax2.set_yticklabels(ax2.get_yticks(), fontsize=14)

        #ax3.set_title(titles[3],fontsize=20,fontweight='bold')
        #ax3.set_xlabel('Number of components',fontsize=20)
        ax3.set_xticks(np.arange(0, 21, 4))
        ax3.set_xticklabels(ax3.get_xticks(), fontsize=14)

        if savefigs is not None:
            plt.savefig('.\\figures\\25_6_19_paperv2\\' + savefigs +
                        '_graspy_bicplot2.jpg')
    plt.show()

    return best_c_hat_bic, best_cov_bic, best_k_bic, best_ari_bic, best_bic
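

# Usage sketch (illustrative, not part of the original example): sweep 1-10
# components over all four covariance structures on synthetic blob data,
# repeating each sweep 1 and 5 times. The make_blobs arguments are arbitrary
# placeholder values.
from sklearn.datasets import make_blobs

toy_x, toy_c = make_blobs(n_samples=300, n_features=2, centers=3)
best_labels, best_cov, best_k, best_ari, best_bic_val = brute_graspy_cluster(
    Ns=[1, 5],
    x=toy_x,
    covariance_types=["full", "tied", "diag", "spherical"],
    ks=np.arange(1, 11),
    c_true=toy_c,
    graphList=["best_bic", "all_bics"],
)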
Example #3
def lse(adj, n_components, regularizer=None):
    # signature reconstructed from the lse() call below; the original snippet
    # begins mid-function
    adj = pass_to_ranks(adj)
    lap = to_laplace(adj, form="R-DAD")
    ase = AdjacencySpectralEmbed(n_components=n_components)
    latent = ase.fit_transform(lap)
    if isinstance(latent, tuple):  # directed case: concatenate out/in embeddings
        latent = np.concatenate(latent, axis=-1)
    return latent


n_components = None
k = 30

latent = lse(adj, n_components, regularizer=None)

gmm = GaussianCluster(min_components=k, max_components=k)

pred_labels = gmm.fit_predict(latent)

stacked_barplot(pred_labels, class_labels, palette="tab20")

# %% [markdown]
# # verify on sklearn toy dataset
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

X, y = make_blobs(n_samples=200, n_features=3, centers=None, cluster_std=3)
# y = y.astype(int).astype(str)
data_df = pd.DataFrame(
    data=np.concatenate((X, y[:, np.newaxis]), axis=-1),
    columns=("Dim 0", "Dim 1", "Dim 2", "Labels"),
)
# data_df["Labels"] = data_df["Labels"].values.astype("<U10")
    #- BIC
    bic_ = 2 * likeli - temp_n_params * np.log(n)

    #- ARI
    ari_ = ari(true_labels, temp_c_hat)

    return [combo, likeli, ari_, bic_]


np.random.seed(16661)
A = binarize(right_adj)
X_hat = np.concatenate(ASE(n_components=3).fit_transform(A), axis=1)
n, d = X_hat.shape

gclust = GCLUST(max_components=15)
est_labels = gclust.fit_predict(X_hat)

loglikelihoods = [np.sum(gclust.model_.score_samples(X_hat))]
combos = [None]
aris = [ari(right_labels, est_labels)]
bic = [gclust.model_.bic(X_hat)]

unique_labels = np.unique(est_labels)

class_idx = np.array([np.where(est_labels == u)[0] for u in unique_labels])

# enumerate every non-empty subset of the estimated cluster labels
for k in range(len(unique_labels)):
    for combo in list(combinations(np.unique(est_labels), k + 1)):
        combo = np.array(list(combo)).astype(int)
        combos.append(combo)
Example #5
n = 1000
pi = 0.9

A, counts = generate_cyclops(X, n, pi, None)
c = [0] * counts[0]
c += [1] * counts[1]

ase = ASE(n_components=3)
X_hat = ase.fit_transform(A)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_hat[:, 0], X_hat[:, 1], X_hat[:, 2], c=c)

gclust = GCLUST(max_components=4)
c_hat = gclust.fit_predict(X_hat)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_hat[:, 0], X_hat[:, 1], X_hat[:, 2], c=c_hat)


def quadratic(data, params):
    if data.ndim == 1:
        sum_ = np.sum(data[:-1]**2 * params[:-1]) + params[-1]
        return sum_
    elif data.ndim == 2:
        sums = np.sum(data[:, :-1]**2 * params[:-1], axis=1) + params[-1]
        return sums
    else:
        raise ValueError("unsupported data")
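

# Usage sketch (illustrative, not part of the original example): `quadratic`
# weights the squared features by params[:-1] and adds params[-1] as an
# intercept; the last entry/column of `data` is ignored. Values below are
# arbitrary placeholders.
demo_params = np.array([1.0, 2.0, 0.5])
demo_point = np.array([3.0, 4.0, 1.0])
demo_batch = np.array([[3.0, 4.0, 1.0],
                       [1.0, 2.0, 1.0]])
print(quadratic(demo_point, demo_params))  # 1*9 + 2*16 + 0.5 = 41.5
print(quadratic(demo_batch, demo_params))  # [41.5  9.5]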
unknown = classes == "Other"
plot_unknown = np.tile(unknown, n_graphs)
pairplot(plot_latent, labels=plot_unknown, alpha=0.3, legend_name="Unknown")


clust_latent = np.concatenate(list(latent), axis=-1)
clust_latent.shape
#%%
gc = GaussianCluster(min_components=2, max_components=15, covariance_type="all")

filterwarnings("ignore")
n_init = 50
sim_mat = np.zeros((n_verts, n_verts))

# co-clustering frequency: for each of n_init runs, increment the pairwise
# similarity count for every pair of vertices assigned to the same cluster
for i in tqdm(range(n_init)):
    assignments = gc.fit_predict(clust_latent)
    for c in np.unique(assignments):
        inds = np.where(assignments == c)[0]
        sim_mat[np.ix_(inds, inds)] += 1


sim_mat -= np.diag(np.diag(sim_mat))  # zero the diagonal (self-pairs)
sim_mat = sim_mat / n_init  # counts -> co-clustering frequencies
heatmap(sim_mat)


#%%
# binarize co-clustering frequencies at 0.5 (entries exactly equal to 0.5 are
# left unchanged by the two strict inequalities)
thresh_sim_mat = sim_mat.copy()
thresh_sim_mat[thresh_sim_mat > 0.5] = 1
thresh_sim_mat[thresh_sim_mat < 0.5] = 0