Example #1
0
 def fit(self, X, y=None):
     """Fit a one-vs-two component mixture to ``X`` and recurse on a split.

     When ``X`` has more than ``self.min_split_samples`` rows, a clustering
     model chosen by ``self.cluster_method`` is fit with between one and two
     components. If the selected model has more than one component, the
     samples are divided by predicted label and a child ``DivisiveCluster``
     is fit on each part.

     Parameters
     ----------
     X : array-like with a ``shape`` attribute, indexed as ``X[mask, :]``
         Samples to cluster, one per row.
     y : ignored
         Accepted but never read by this method.

     Returns
     -------
     self
         The fitted node. ``n_samples_`` and ``cum_dist_`` are always set;
         ``pred_labels_`` and ``model_`` are set when a model was fit;
         ``bics_`` and ``bic_ratio_`` when that model exposes ``bic_``;
         ``X_children_`` and ``children`` when the node was split.

     Raises
     ------
     NotImplementedError
         If ``cluster_method`` is ``"vmm"``, which has no implementation yet.
     ValueError
         If ``cluster_method`` is not a recognised method name.
     """
     n_samples = X.shape[0]
     self.n_samples_ = n_samples
     self.cum_dist_ = 0

     # Too few samples to attempt a split: this node stays a leaf.
     if n_samples <= self.min_split_samples:
         return self

     if self.cluster_method == "graspy-gmm":
         cluster = GaussianCluster(
             min_components=1,
             max_components=2,
             n_init=self.n_init,
             covariance_type="all",
         )
     elif self.cluster_method == "auto-gmm":
         cluster = AutoGMMCluster(
             min_components=1, max_components=2, max_agglom_size=None
         )
     elif self.cluster_method == "vmm":
         # Previously this branch fell through with `cluster` unbound, which
         # surfaced as an UnboundLocalError at `cluster.fit(X)`.
         # cluster = VonMisesFisherMixture(n)
         raise NotImplementedError(
             "`cluster_method='vmm'` is not implemented yet"
         )
     else:
         # NOTE(review): `valid_methods` is not defined in this method; it is
         # presumably a module-level name - confirm it exists.
         raise ValueError(f"`cluster_method` must be one of {valid_methods}")

     cluster.fit(X)
     pred_labels = cluster.predict(X)
     self.pred_labels_ = pred_labels
     self.model_ = cluster

     if hasattr(cluster, "bic_"):
         bics = cluster.bic_
         self.bics_ = bics
         # Ratio of the best 2-component BIC to the best 1-component BIC;
         # assumes `bic_` supports `.loc` lookups keyed by component count.
         self.bic_ratio_ = bics.loc[2].min() / bics.loc[1].min()

     if cluster.n_components_ != 1:  # recurse
         # Label 0 goes to the first child, everything else to the second.
         indicator = pred_labels == 0
         self.X_children_ = (X[indicator, :], X[~indicator, :])
         children = []
         for i, X_child in enumerate(self.X_children_):
             child = DivisiveCluster(
                 name=self.name + str(i),
                 parent=self,
                 min_split_samples=self.min_split_samples,
                 n_init=self.n_init,
                 cluster_method=self.cluster_method,
             )
             children.append(child.fit(X_child))
         self.children = children
     return self
Example #2
0
def one_iteration(start_labels, class_key="Merge Class"):
    """Run one walk -> embed -> cluster pass and return the cluster labels.

    Parameters
    ----------
    start_labels
        Forwarded unchanged to ``random_walk_classes``.
    class_key : str, default "Merge Class"
        Forwarded to ``random_walk_classes`` as ``class_key``.

    Returns
    -------
    The labels produced by ``AutoGMMCluster.fit_predict`` on the 8-component
    PCA embedding of the log-scaled walk data.

    Notes
    -----
    Several figures are drawn as side effects: a path clustermap, two
    pairplots, a scatter of ``bic/aic`` against ``n_components`` and a
    stacked barplot.
    """
    # generate the walks (seed is passed as None)
    walk_data, walk_bins, known_classes = random_walk_classes(
        start_labels, seed=None, class_key=class_key
    )
    log_walks = np.log10(walk_data + 1)

    # clustermap of the log-scaled data
    path_clustermap(log_walks, known_classes, walk_bins)

    # PCA embedding, shown coloured by the known classes
    pca = PCA(n_components=8)
    pca_embedding = pca.fit_transform(log_walks)
    pairplot(pca_embedding, labels=known_classes, palette=CLASS_COLOR_DICT)

    # model-based clustering of the embedding
    gmm_kws = dict(min_components=2, max_components=20, n_jobs=-1, verbose=10)
    auto_gmm = AutoGMMCluster(**gmm_kws)
    cluster_labels = auto_gmm.fit_predict(pca_embedding)

    # model-selection curve on its own figure
    plt.figure()
    sns.scatterplot(data=auto_gmm.results_, x="n_components", y="bic/aic")

    # embedding coloured by predicted cluster
    pairplot(pca_embedding, labels=cluster_labels, palette=cc.glasbey_light)

    # predicted clusters broken down by known class
    stacked_barplot(cluster_labels, known_classes, color_dict=CLASS_COLOR_DICT)
    return cluster_labels
Example #3
0
# Build a pairwise Jaccard distance matrix from `path_mat`; the helper and
# `path_mat` are defined outside this excerpt.
print("Finding pairwise jaccard distances")
pdist_sparse = pairwise_sparse_jaccard_distance(path_mat)

print(pdist_sparse.shape)

# Embed the precomputed distances with classical MDS.
print("Embedding with MDS")
mds = ClassicalMDS(dissimilarity="precomputed")
# mds = MDS(dissimilarity="precomputed", n_components=6, n_init=16, n_jobs=-2)
jaccard_embedding = mds.fit_transform(pdist_sparse)

# %% [markdown]
# #

# Cluster the embedding with AutoGMM, searching 10-40 components and
# restricted to euclidean affinity with single linkage.
print("Clustering embedding")
agmm = AutoGMMCluster(min_components=10,
                      max_components=40,
                      affinity="euclidean",
                      linkage="single")
labels = agmm.fit_predict(jaccard_embedding)

# Plot the embedding coloured by predicted cluster and save the figure.
pairplot(jaccard_embedding,
         title="AGMM o CMDS o Jaccard o Sensorimotor Paths",
         labels=labels)
savefig("AGMM-CMDS-jaccard-sm-path")

# For each predicted cluster, take its rows of `path_mat` and average them
# over axis 0 (one mean per column).
print("Finding mean paths")
mean_paths = []
uni_labels = np.unique(labels)
for ul in uni_labels:
    inds = np.where(labels == ul)[0]
    paths = path_mat[inds, :]
    # np.array(...) presumably normalises a matrix-typed mean to an ndarray
    # - TODO confirm the type of `path_mat`.
    mean_path = np.array(np.mean(paths, axis=0))
Example #4
0
    palette=CLASS_COLOR_DICT,
)
stashfig("raw-response-pairs" + basename)

# %% [markdown]
# # Cluster each thing separately
from graspy.cluster import AutoGMMCluster

# Cluster each group's block of histogram columns separately and collect the
# predicted labels, one entry per group, in `pred`.
pred = []
for i, name in enumerate(from_group_names):
    print(name)
    # The i-th contiguous block of `n_bins` columns.
    o = log_collapsed_hist[:, i * n_bins:(i + 1) * n_bins]
    agmm = AutoGMMCluster(
        min_components=2,
        max_components=20,
        n_jobs=-2,
        verbose=10,
        affinity=["euclidean", "manhattan", "none"],
    )
    pred_labels = agmm.fit_predict(o)
    # First five columns, coloured by the known "Merge Class" labels.
    pairplot(o[:, :5],
             labels=meta["Merge Class"].values,
             palette=CLASS_COLOR_DICT)
    stashfig(f"from-g{name}-known" + basename)
    # Same five columns, coloured by the predicted clusters.
    pairplot(o[:, :5], labels=pred_labels, palette=cc.glasbey_light)
    stashfig(f"from-g{name}-predicted" + basename)
    # Number of distinct predicted clusters for this group.
    print(len(np.unique(pred_labels)))
    pred.append(pred_labels)
    print()
# generate_cascade_paths(start_ind, probs, 1, stop_inds=out_inds, max_depth=10)
Example #5
0
# For every source group, cluster its hop histogram with both AutoGMM and
# GaussianCluster. The loop body may continue past this excerpt.
for i, (fg, fg_name) in enumerate(zip(from_groups, from_group_names)):
    print(f"Clustering for {fg_name}")

    # run the clustering on histogram
    hop_hist = fg_hop_hists[i]

    # Transpose so that the histogram's columns become the rows of X.
    X = hop_hist.T
    if normalize:
        # Divide every row by its sum; zero sums are replaced by 1 so that
        # all-zero rows are left unchanged instead of dividing by zero.
        sums = X.sum(axis=1)
        sums[sums == 0] = 1
        X = X / sums[:, None]

    if log_cluster:
        X = np.log10(X + 1)

    # AutoGMM with externally supplied keyword arguments; the labels are
    # stored on this group's column metadata and the model is kept.
    agmm = AutoGMMCluster(**cluster_kws)
    pred_labels = agmm.fit_predict(X)
    results = agmm.results_
    fg_col_meta[i]["pred_labels"] = pred_labels
    fg_autoclusters.append(agmm)

    # Diagonal-covariance GaussianCluster sweep over 10-40 components.
    ggmm = GaussianCluster(min_components=10,
                           max_components=40,
                           n_init=20,
                           covariance_type="diag")
    ggmm.fit(X)
    fg_graspyclusters.append(ggmm)

    gbics = ggmm.bic_

    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
Example #6
0
# Finish the scree plot: `pc`, `ax`, `rng` and `cmds` come from code above
# this excerpt.
pc.set_zorder(10)
ax.plot(rng, cmds.singular_values_, "o-")
ax.legend()
stashfig("cmds-screeplot" + basename)

# %% [markdown]
# ##

# Pairplot of the full path embedding; low alpha, presumably because there
# are many overlapping points.
pairplot(path_embed, alpha=0.02)
stashfig("cmds-pairs-all" + basename)
# %% [markdown]
# ##
print("Running AGMM on CMDS embedding")
# Only the first `n_components` columns of the embedding are clustered.
n_components = 4

agmm = AutoGMMCluster(max_components=40, n_jobs=-2)
pred = agmm.fit_predict(path_embed[:, :n_components])

print(f"Number of clusters: {agmm.n_components_}")

# %% [markdown]
# ##
# Same columns, coloured by predicted cluster.
pairplot(
    path_embed[:, :n_components],
    alpha=0.02,
    labels=pred,
    palette=cc.glasbey_light,
    legend_name="Cluster",
)
stashfig("pairplot-agmm-cmds" + basename)
Example #7
0
# Scree plots of `all_hop_hist`: first 40 components and all components
# (both saved), then two plots of the log10(x + 1) data (not saved here).
screeplot(all_hop_hist, show_first=40)
stashfig("scree-first-40")
screeplot(all_hop_hist, show_first=None)
stashfig("scree-all")
screeplot(np.log10(all_hop_hist + 1), show_first=100)
screeplot(np.log10(all_hop_hist + 1), show_first=100, cumulative=True)

# %% [markdown]
# ##

from graspy.cluster import AutoGMMCluster

# Fit AutoGMM on the transposed histogram, searching 2-50 components over
# two affinities.
agmm = AutoGMMCluster(
    min_components=2,
    max_components=50,
    affinity=["euclidean", "manhattan"],
    max_agglom_size=3000,
    n_jobs=-2,
    verbose=10,
)
agmm.fit(all_hop_hist.T)

# %% [markdown]
# ##

from graspy.embed import select_dimension

# The return value is not captured; in a notebook cell it would be shown as
# the cell output.
select_dimension(all_hop_hist.T, n_elbows=5)
#%%
from graspy.embed import selectSVD
from graspy.plot import pairplot
    # NOTE(review): this fragment is the body of an enclosing loop that is
    # not visible here; `side`, `side_mgs` and `label_map` come from it.
    side_mb_mg = side_mgs[side]
    labels = side_mb_mg.meta["class1"].values
    # Map every raw class label through `label_map`.
    labels = np.vectorize(label_map.get)(labels)
    plot_labels = side_mb_mg.meta["merge_class"].values

    # embed
    ase = AdjacencySpectralEmbed(n_components=None, algorithm="randomized")
    embed = ase.fit_transform(pass_to_ranks(side_mb_mg.adj))
    # Join the returned pieces column-wise; presumably the out/in latent
    # positions of a directed graph - TODO confirm.
    embed = np.concatenate(embed, axis=1)

    # cluster using AutoGMM
    method = "AutoGMM"
    agmm = AutoGMMCluster(
        min_components=2,
        max_components=10,
        affinity=["euclidean", "manhattan", "cosine"],
        covariance_type=["full"],
        n_jobs=-1,
    )
    # The known labels are passed as `y`.
    agmm.fit(embed, labels)
    # Work on a copy so the in-place sort does not reorder `agmm.results_`.
    agmm_results = agmm.results_.copy()
    agmm_results.sort_values("bic/aic", inplace=True)
    agmm_model = agmm.model_
    agmm_pred_labels = agmm_model.predict(embed)
    # ARI against the known labels, overall and with "KC" entries removed.
    ari = adjusted_rand_score(labels, agmm_pred_labels)
    ari_no_kc = adjusted_rand_score(
        labels[labels != "KC"], agmm_pred_labels[labels != "KC"]
    )
    row = dict(
        ari=ari,
        ari_no_kc=ari_no_kc,
Example #9
0
# 8-component PCA embedding of the raw histogram data, plotted with the
# known "Merge Class" labels.
embedding = PCA(n_components=8).fit_transform(raw_hist_data)
pairplot(embedding, labels=dfs[0]["Merge Class"].values, palette=CLASS_COLOR_DICT)
# %% [markdown]
# #
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering is run on the raw data, not on the embedding; the
# embedding is only used to display the resulting labels.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric` -
# confirm against the pinned version.
agg = AgglomerativeClustering(n_clusters=10, affinity="euclidean", linkage="average")
labels = agg.fit_predict(raw_hist_data)
pairplot(embedding, labels=labels, palette=cc.glasbey_light)

# %% [markdown]
# #

from graspy.cluster import AutoGMMCluster

# Fit AutoGMM on `embedding`, searching between 2 and 20 mixture components.
# The constructor call was missing its closing parenthesis, which made the
# file a syntax error.
agm = AutoGMMCluster(min_components=2, max_components=20, n_jobs=-1)
agm.fit(embedding)

# %% [markdown] 
# # 
# agm.results_.groupby(["affinity", "covariance_type", "linkage"])
# Model-selection criterion against number of components, one point per row
# of `agm.results_`.
sns.scatterplot(data=agm.results_, x='n_components', y='bic/aic')

# %% [markdown]
# #

# Assign every embedded sample to a cluster of the fitted model.
new_groups = agm.predict(embedding)

# Break the predicted groups down by the known "Merge Class" labels.
stacked_barplot(new_groups, meta["Merge Class"].values, color_dict=CLASS_COLOR_DICT)
# %% [markdown] 
# # 
Example #10
0
else:
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    sns.scatterplot(path_embed[:, 0], path_embed[:, 1], s=30, alpha=0.2)
    ax.axis("off")

stashfig(f"pairs-all" + basename)

# %% [markdown]
# ## Cluster and plot on the embedding
print("Running AGMM on path embedding")
# Use the first entry of `elbows` (computed outside this excerpt) as the
# number of embedding columns to cluster.
n_components = elbows[0]
# n_components = 2d
print(f"Using {n_components} dimensions")

agmm = AutoGMMCluster(max_components=30, n_jobs=-2)
pred = agmm.fit_predict(path_embed[:, :n_components]
                        )  # + np.random.normal(0, 0.01, size=path_embed.shape)

print(f"Number of clusters: {agmm.n_components_}")

# Pairplot of the clustered columns, coloured by predicted cluster.
pg = pairplot(
    path_embed[:, :n_components],
    alpha=0.1,
    labels=pred,
    palette=cc.glasbey_light,
    legend_name="Cluster",
)
# Make the legend markers fully opaque even though the points use alpha=0.1.
# NOTE(review): newer matplotlib releases rename `legendHandles` to
# `legend_handles` - confirm against the pinned version.
leg = pg._legend
for lh in leg.legendHandles:
    lh.set_alpha(1)
Example #11
0
    # NOTE(review): this fragment is the body of an enclosing loop that is
    # not visible here; `mg`, `threshold`, `true_labels`, `n_components`,
    # `ptr` and `rows` come from it.
    heatmap(
        mg.adj,
        transform="simple-all",
        title=f"MB, threshold={threshold}",
        inner_hier_labels=true_labels,
        hier_label_fontsize=10,
        sort_nodes=True,
    )

    # Embed the adjacency matrix; `ase` here is a helper function defined
    # outside this excerpt.
    latent = ase(mg.adj, n_components, ptr=ptr)
    # cluster = GaussianCluster(
    #     min_components=2, max_components=10, covariance_type="all", n_init=100
    # )

    # Cluster the embedding and score it against the true labels.
    cluster = AutoGMMCluster(min_components=2, max_components=10)
    pred_labels = cluster.fit_predict(latent)
    ari = adjusted_rand_score(true_labels, pred_labels)
    row = {"ARI": ari, "Threshold": threshold, "Method": "GMMoASE"}
    rows.append(row)

    # do the MCMC
    # Block model with the "discrete-poisson" weight model.
    block_series = run_minimize_blockmodel(mg, weight_model="discrete-poisson")
    ari = adjusted_rand_score(true_labels, block_series.values)
    row = {"ARI": ari, "Threshold": threshold, "Method": "GT-dp"}
    rows.append(row)

    # do the MCMC
    # Block model with no weight model.
    block_series = run_minimize_blockmodel(mg, weight_model=None)
    ari = adjusted_rand_score(true_labels, block_series.values)
    row = {"ARI": ari, "Threshold": threshold, "Method": "GT-None"}
Example #12
0
# File-name suffix identifying the run; `threshold`, `graph_type`, `run_name`
# and `embed_name` are defined outside this excerpt.
cutoff = 8
base = f"-c{cutoff}-t{threshold}-{graph_type}"

# Load the metadata and the path matrix written by the `run_name` notebook.
base_path = Path(f"./maggot_models/notebooks/outs/{run_name}/csvs")
meta = pd.read_csv(base_path / str("meta" + base + ".csv"), index_col=0)
path_mat = pd.read_csv(base_path / str("prob-path-mat" + base + ".csv"),
                       index_col=0).values

# Load the MDS embedding written by the `embed_name` notebook.
base_path = Path(f"./maggot_models/notebooks/outs/{embed_name}/csvs")
embed_mat = pd.read_csv(base_path / str("euclid-mds-embed.csv"), index_col=0)

# AutoGMM sweep over 10-50 components with "all" affinities, linkages and
# covariance types.
gmm = AutoGMMCluster(
    min_components=10,
    max_components=50,
    affinity="all",
    linkage="all",
    covariance_type="all",
    n_jobs=-2,
    verbose=30,
)
labels = gmm.fit_predict(embed_mat.values)

# Save the predicted labels as a CSV.
label_df = pd.DataFrame(data=labels)
stashcsv(label_df, "labels")

# For each predicted cluster, select its rows of `path_mat`; the loop body
# continues past this excerpt.
print("Finding mean paths")
mean_paths = []
uni_labels = np.unique(labels)
for ul in uni_labels:
    inds = np.where(labels == ul)[0]
    paths = path_mat[inds, :]