Example #1
import numpy as np
import pandas as pd
from sklearn.metrics import fbeta_score, normalized_mutual_info_score
from spherecluster import SphericalKMeans


def cluster_test(self, test_file, clusters=(10,)):
    """Cluster word vectors and score the result against section labels."""
    df_test1 = pd.read_csv(test_file)
    output = {}
    for K in clusters:  # `clusters` must be an iterable, not a bare int
        vectors = []
        y_true = []
        sections = {}
        idx = 0
        for word, section, y in df_test1.values:
            sliceIdx = self.yearDict[str(y)]
            if word in self.vocabularies[sliceIdx]:
                # Assign each section a stable integer id for scoring.
                if section not in sections:
                    sections[section] = idx
                    idx += 1
                y_true.append(sections[section])
                vectors.append(self.matrices_norm[sliceIdx][
                    self.vocabularies[sliceIdx][word]])
        X = np.array(vectors)
        skm = SphericalKMeans(n_clusters=K, max_iter=100000)
        skm.fit(X)
        y_pred = skm.predict(X)  # predict once and reuse below
        metric = normalized_mutual_info_score(y_true, y_pred,
                                              average_method='arithmetic')
        # Pairwise same-cluster indicators for the pair-counting F_beta score.
        y_true_bool = [(t1 == t2) for t2 in y_true for t1 in y_true]
        y_pred_bool = [(t1 == t2) for t2 in y_pred for t1 in y_pred]
        metric2 = fbeta_score(y_true_bool, y_pred_bool, beta=5)
        output[f'NMI({K})'] = metric
        output[f'F_beta-score({K})'] = metric2
    return output
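The method above depends on class state (self.yearDict, self.vocabularies, self.matrices_norm), so it is not runnable on its own. A minimal self-contained sketch of the same scoring pattern, assuming the spherecluster package and using random unit vectors with made-up labels as stand-in data:

import numpy as np
from sklearn.metrics import fbeta_score, normalized_mutual_info_score
from spherecluster import SphericalKMeans

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 50))
X /= np.linalg.norm(X, axis=1, keepdims=True)  # spherical K-means expects unit vectors
y_true = rng.integers(0, 10, size=200)         # stand-in "section" labels

skm = SphericalKMeans(n_clusters=10).fit(X)
y_pred = skm.predict(X)

nmi = normalized_mutual_info_score(y_true, y_pred, average_method='arithmetic')
# Pair-counting evaluation: do two points share a label / share a cluster?
pairs_true = [a == b for a in y_true for b in y_true]
pairs_pred = [a == b for a in y_pred for b in y_pred]
f5 = fbeta_score(pairs_true, pairs_pred, beta=5)
print(nmi, f5)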
Example #2
def SphericalKMeans_model(vocab_embeddings, vocab, topics, rerank, rand,
                          weights):
    # Fit one spherical cluster per topic on the (weighted) word embeddings.
    spkmeans = SphericalKMeans(n_clusters=topics,
                               random_state=rand).fit(vocab_embeddings,
                                                      sample_weight=weights)
    m_clusters = spkmeans.predict(vocab_embeddings, sample_weight=weights)
    centers = np.array(spkmeans.cluster_centers_)

    indices = []

    for i in range(topics):
        # Rank cluster i's words by cosine similarity to its center.
        topk_vals = sort_closest_cossine_center(centers[i], m_clusters,
                                                vocab_embeddings, i)
        # With reranking, keep a larger candidate pool (100 words);
        # otherwise take the top 10 directly.
        if rerank:
            indices.append(find_top_k_words(100, topk_vals, vocab))
        else:
            indices.append(find_top_k_words(10, topk_vals, vocab))
    return m_clusters, indices
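The helpers sort_closest_cossine_center and find_top_k_words are not shown. A plausible minimal version, assuming vocab_embeddings is a NumPy array and that the helpers rank a cluster's members by cosine similarity to the center and map the best rows back to vocabulary words (signatures inferred from the call sites above):

import numpy as np

def sort_closest_cossine_center(center, m_clusters, vocab_embeddings, i):
    # Rows assigned to cluster i.
    members = np.where(m_clusters == i)[0]
    emb = vocab_embeddings[members]
    # Cosine similarity of each member to the cluster center.
    sims = emb @ center / (np.linalg.norm(emb, axis=1) * np.linalg.norm(center))
    order = np.argsort(-sims)  # best match first
    return list(zip(members[order], sims[order]))

def find_top_k_words(k, topk_vals, vocab):
    # Map the k best (row index, similarity) pairs back to words.
    return [vocab[idx] for idx, _ in topk_vals[:k]]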
Example #3
def analyse(methode,
            preproc,
            true_label,
            nb_clusters=3,
            normalizer=True,
            scikit=True):
    # Optionally reduce/transform the input with a scikit-learn estimator.
    if scikit:
        data = methode.fit_transform(preproc)
    else:
        data = preproc
    # L2-normalize rows so Euclidean distances reflect cosine similarity.
    if normalizer:
        data = Normalizer(norm='l2', copy=False).fit_transform(data)
    skplt.cluster.plot_elbow_curve(SphericalKMeans(random_state=42, n_jobs=-1),
                                   data,
                                   title="Elbow Curve with Spherical K-means",
                                   cluster_ranges=range(1, 15))
    skplt.cluster.plot_elbow_curve(KMeans(random_state=42,
                                          n_jobs=-1,
                                          precompute_distances=True),
                                   data,
                                   title="Elbow Curve with K-means",
                                   cluster_ranges=range(1, 15))
    print("Fitting Spherical K-means for", nb_clusters, "clusters ...")
    skmeans = SphericalKMeans(n_clusters=nb_clusters,
                              random_state=42,
                              n_jobs=-1).fit(data)
    print("Fitting K-means for", nb_clusters, "clusters ...")
    kmeans = KMeans(n_clusters=nb_clusters,
                    random_state=42,
                    n_jobs=-1,
                    precompute_distances=True).fit(data)
    y_pred_skmeans = skmeans.predict(data)
    y_pred_kmeans = kmeans.predict(data)
    print("Results from Spherical K-means")
    scoring_cluster(skmeans, true_label, y_pred_skmeans)
    print("Results from K-means")
    scoring_cluster(kmeans, true_label, y_pred_kmeans)
    return methode, skmeans, kmeans, data
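The scoring_cluster helper is not shown in the excerpt. A plausible minimal version, assuming it prints standard external clustering metrics from scikit-learn (the signature is inferred from the call sites above):

from sklearn import metrics

def scoring_cluster(model, true_label, y_pred):
    # Standard external clustering metrics against the known labels.
    print("Homogeneity:", metrics.homogeneity_score(true_label, y_pred))
    print("Completeness:", metrics.completeness_score(true_label, y_pred))
    print("V-measure:", metrics.v_measure_score(true_label, y_pred))
    print("Adjusted Rand Index:", metrics.adjusted_rand_score(true_label, y_pred))
    print("Inertia:", model.inertia_)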
Example #4
# Laplacian spectral embedding of the summed adjacency matrix into 4 dims.
lse_latent = lse(sum_adj, 4, regularizer=None)

latent = lse_latent
pairplot(latent, labels=simple_class_labels, title=embed)

for k in range(MIN_CLUSTERS, MAX_CLUSTERS + 1):
    run_name = f"k = {k}, {cluster}, {embed}, right hemisphere (sum), PTR, raw"
    print(run_name)
    print()

    # Cluster
    # gmm = GaussianCluster(min_components=k, max_components=k, **gmm_params)
    # gmm.fit(latent)
    skmeans = SphericalKMeans(n_clusters=k, **skmeans_params)
    skmeans.fit(latent)
    pred_labels = skmeans.predict(latent)

    # ARI
    base_dict = {
        "K": k,
        "Cluster": cluster,
        "Embed": embed,
        "Method": f"{cluster} o {embed}",
        "Score": skmeans.inertia_,
    }
    mb_ari = sub_ari(known_inds, mb_labels, pred_labels)
    mb_ari_dict = base_dict.copy()
    mb_ari_dict["ARI"] = mb_ari
    mb_ari_dict["Metric"] = "MB ARI"
    out_dicts.append(mb_ari_dict)
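The sub_ari helper is not defined in the excerpt. A plausible reading, assuming mb_labels and pred_labels are full-length NumPy arrays and that the score is the adjusted Rand index restricted to the nodes whose true labels are known:

from sklearn.metrics import adjusted_rand_score

def sub_ari(known_inds, true_labels, pred_labels):
    # ARI computed only over the indices with known ground-truth labels.
    return adjusted_rand_score(true_labels[known_inds], pred_labels[known_inds])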
Example #5
    # NOTE: this excerpt starts mid-loop over imagePaths; the blob is assumed
    # to come from cv2.dnn.blobFromImage on each face (the call's start is
    # cut off in the source).
    faceBlob = cv2.dnn.blobFromImage(face, 1.0 / 255, (96, 96), (0, 0, 0),
                                     swapRB=True,
                                     crop=False)
    # Run the face through the embedder network to get a 128-d vector.
    embedder.setInput(faceBlob)
    vec = embedder.forward()
    vectors.append(vec.flatten())
    identities.append(int(identity[1:]))
    frames.append(int(frame))

identities = np.array(identities)
frames = np.array(frames)
vectors = np.array(vectors)
print(vectors.shape, frames.shape, identities.shape, len(imagePaths))

# One row per face: 128 embedding dimensions plus frame/identity metadata.
df = pd.DataFrame(vectors, columns=[str(k) for k in range(0, 128)])
df['frames'] = frames
df['identities'] = identities
X = df.loc[:, '0':'127']
Y = df['identities']

print("[INFO] Finding cluster centroids ...")
skm = SphericalKMeans(n_clusters=100, verbose=1, n_jobs=-2, random_state=1)
skm.fit(X)
labels = skm.predict(X)
df['labels'] = labels
df = df.sort_values(by='identities')

df.to_csv(tracks_path + "/embedding.csv", index=False)

print("[INFO] embeddings complete ...")
print("[INFO] Linking ...")