def mandatory_questions():
    # 1.2.1
    # ang_data = unistroke_to_angular(read_data(""))
    # plt.hist(ang_data[:, 0], 150, density=True)
    # plt.hist(ang_data[:, 1], 150, density=True)

    # 1.2.5
    mix = VonMisesFisherMixture(2)
    data = unistroke_to_angular(read_data(""))
    mix = mix.fit(data)
def label_expansion(class_labels, write_path, vocabulary_inv, embedding_mat):
    print("Retrieving top-t nearest words...")
    n_classes = len(class_labels)
    prob_sup_array = []
    current_szes = []
    all_class_labels = []
    for class_label in class_labels:
        current_sz = len(class_label)
        current_szes.append(current_sz)
        prob_sup_array.append([1 / current_sz] * current_sz)
        all_class_labels += list(class_label)
    current_sz = np.min(current_szes)

    # Grow the expansion size until two classes share a keyword, then back off by one.
    while len(all_class_labels) == len(set(all_class_labels)):
        current_sz += 1
        expanded_array = seed_expansion(class_labels, prob_sup_array, current_sz,
                                        None, vocabulary_inv, embedding_mat)
        all_class_labels = [w for w_class in expanded_array for w in w_class]
    expanded_array = seed_expansion(class_labels, prob_sup_array, current_sz - 1,
                                    None, vocabulary_inv, embedding_mat)
    print("Final expansion size t = {}".format(len(expanded_array[0])))

    centers = []
    kappas = []
    print("Top-t nearest words for each class:")
    for i in range(n_classes):
        expanded_class = expanded_array[i]
        vocab_expanded = [vocabulary_inv[w] for w in expanded_class]
        print("Class {}:".format(i))
        print(vocab_expanded)
        expanded_mat = embedding_mat[np.asarray(expanded_class)]
        vmf_soft = VonMisesFisherMixture(n_clusters=1)
        vmf_soft.fit(expanded_mat)
        center = vmf_soft.cluster_centers_[0]
        kappa = vmf_soft.concentrations_[0]
        centers.append(center)
        kappas.append(kappa)

    for j, expanded_class in enumerate(expanded_array):
        if write_path is not None:
            if not os.path.exists(write_path):
                os.makedirs(write_path)
            with open(write_path + 'class' + str(j) + '.txt', 'w') as f:
                for word in expanded_class:
                    f.write(vocabulary_inv[word] + ' ')
    print("Finished vMF distribution fitting.")
    return expanded_array, centers, kappas
def estimate_vMF_params(self):
    """Fit a single vMF distribution to the loaded orientations and store
    the concentration parameter (kappa) and the mean direction."""
    vmf_soft = VonMisesFisherMixture(n_clusters=1, posterior_type='soft')
    try:
        vmf_soft.fit(self.samples_xyz)
        self.kappa = vmf_soft.concentrations_[0]
        self.mean = vmf_soft.cluster_centers_[0]
        print('concentration parameter', self.kappa.astype(int),
              'mean direction', self._cartesian2spherical(self.mean).astype(int))
    except AttributeError:
        print('object has no orientations. Use add_orientations to load '
              'orientation data manually or sample from a vMF distribution '
              'with the .sample method')
def VonMisesFisherMixture_Model(vocab_embeddings, vocab, topics, rerank, rand):
    # vmf_soft = VonMisesFisherMixture(n_clusters=topics, posterior_type='hard',
    #                                  n_jobs=-1, random_state=rand).fit(vocab_embeddings)
    print("fitting vmf...")
    vmf_soft = VonMisesFisherMixture(n_clusters=topics, posterior_type='soft',
                                     n_jobs=-1, random_state=rand).fit(vocab_embeddings)
    llh = vmf_soft.log_likelihood(vocab_embeddings)
    indices = []
    for i in range(topics):
        # Sort words by per-cluster log-likelihood, highest first.
        topk_vals = llh[i, :].argsort()[::-1].astype(int)
        if rerank:
            indices.append(find_top_k_words(100, topk_vals, vocab))
        else:
            indices.append(find_top_k_words(10, topk_vals, vocab))
    return vmf_soft.predict(vocab_embeddings), indices
def fit_movmf(theta, **kwargs):
    """
    Fit a mixture of 2 von Mises-Fisher distributions.

    Args:
        theta (np.array): angles
        **kwargs: keyword arguments to spherecluster.VonMisesFisherMixture()

    Returns:
        pi, mu, kappa
    """
    # Double the angles and embed them on the unit circle (axial data).
    X = np.array([np.cos(theta * 2), np.sin(theta * 2)]).T
    vmf_soft = VonMisesFisherMixture(n_clusters=2, **kwargs)
    vmf_soft.fit(X)
    mu = np.arctan2(vmf_soft.cluster_centers_[:, 1], vmf_soft.cluster_centers_[:, 0])
    pi = vmf_soft.weights_
    kappa = vmf_soft.concentrations_
    return pi, mu, kappa
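# --- Usage sketch (not from the original source): calling fit_movmf above on
# synthetic axial angles. Assumes numpy and spherecluster are installed; the
# two angular modes below are illustrative fake data.
import numpy as np

rng = np.random.default_rng(0)
theta = np.concatenate([
    rng.normal(loc=0.3, scale=0.1, size=200),  # first angular mode
    rng.normal(loc=2.0, scale=0.1, size=200),  # second angular mode
])
pi, mu, kappa = fit_movmf(theta, posterior_type='soft', n_init=5)
print("weights:", pi)
print("mean angles:", mu / 2)  # undo the angle doubling done inside fit_movmf
print("concentrations:", kappa)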
def test_integration_dense(params_in):
    n_clusters = 5
    n_examples = 20
    n_features = 100
    X = np.random.randn(n_examples, n_features)
    for ee in range(n_examples):
        X[ee, :] /= np.linalg.norm(X[ee, :])

    params_in.update({'n_clusters': n_clusters})
    movmf = VonMisesFisherMixture(**params_in)
    movmf.fit(X)

    assert movmf.cluster_centers_.shape == (n_clusters, n_features)
    assert len(movmf.concentrations_) == n_clusters
    assert len(movmf.weights_) == n_clusters
    assert len(movmf.labels_) == n_examples

    for center in movmf.cluster_centers_:
        assert_almost_equal(np.linalg.norm(center), 1.0)

    for concentration in movmf.concentrations_:
        assert concentration > 0

    for weight in movmf.weights_:
        assert not np.isnan(weight)

    plabels = movmf.predict(X)
    assert_array_equal(plabels, movmf.labels_)

    ll = movmf.log_likelihood(X)
    ll_labels = np.zeros(movmf.labels_.shape)
    for ee in range(n_examples):
        ll_labels[ee] = np.argmax(ll[:, ee])
    assert_array_equal(ll_labels, movmf.labels_)
def soft_clustering(terms: List[str], wv: Dict[str, np.ndarray],
                    n_clusters: int) -> Tuple[np.ndarray, List[str], VonMisesFisherMixture]:
    """Cluster word vectors with a soft mixture of von Mises-Fisher distributions."""
    X = []
    X_terms = []
    n_out_of_vocab = 0
    for term in terms:
        try:
            emb = wv[term]
            X.append(emb)
            X_terms.append(term)
        except KeyError:
            n_out_of_vocab += 1
    logger.debug(f"{n_out_of_vocab} / {len(terms)} words out of vocab")
    logger.debug(f"Clustering {len(X)} words")
    vmf_soft = VonMisesFisherMixture(n_clusters=n_clusters, posterior_type='soft')
    vmf_soft.fit(np.asarray(X))
    return vmf_soft.predict(np.asarray(X)), X_terms, vmf_soft
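# --- Usage sketch (not from the original source): calling soft_clustering on
# a toy word-vector dict. Real callers would pass gensim-style vectors; the
# module-level `logger` is assumed configured. Vectors are unit-normalized
# because vMF mixtures model points on the unit sphere.
import numpy as np

rng = np.random.default_rng(0)
toy_wv = {}
for w in ["cat", "dog", "fish", "car", "bus", "train"]:
    v = rng.normal(size=16)
    toy_wv[w] = v / np.linalg.norm(v)

labels, kept_terms, model = soft_clustering(list(toy_wv) + ["zebra"], toy_wv,
                                            n_clusters=2)
print(dict(zip(kept_terms, labels)))  # "zebra" is skipped as out-of-vocab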
def test_integration_sparse(params_in):
    n_clusters = 5
    n_examples = 20
    n_features = 100
    n_nonzero = 10
    X = sp.sparse.csr_matrix((n_examples, n_features))
    for ee in range(n_examples):
        ridx = np.random.randint(n_features, size=n_nonzero)
        random_values = np.random.randn(n_nonzero)
        random_values = random_values / np.linalg.norm(random_values)
        X[ee, ridx] = random_values

    params_in.update({"n_clusters": n_clusters})
    movmf = VonMisesFisherMixture(**params_in)
    movmf.fit(X)

    assert movmf.cluster_centers_.shape == (n_clusters, n_features)
    assert len(movmf.concentrations_) == n_clusters
    assert len(movmf.weights_) == n_clusters
    assert len(movmf.labels_) == n_examples
    assert len(movmf.posterior_) == n_clusters

    for center in movmf.cluster_centers_:
        assert_almost_equal(np.linalg.norm(center), 1.0)

    for concentration in movmf.concentrations_:
        assert concentration > 0

    for weight in movmf.weights_:
        assert not np.isnan(weight)

    plabels = movmf.predict(X)
    assert_array_equal(plabels, movmf.labels_)

    ll = movmf.log_likelihood(X)
    ll_labels = np.zeros(movmf.labels_.shape)
    for ee in range(n_examples):
        ll_labels[ee] = np.argmax(ll[:, ee])
    assert_array_equal(ll_labels, movmf.labels_)
print()
table.append([
    'spherical k-means',
    metrics.homogeneity_score(labels, skm.labels_),
    metrics.completeness_score(labels, skm.labels_),
    metrics.v_measure_score(labels, skm.labels_),
    metrics.adjusted_rand_score(labels, skm.labels_),
    metrics.adjusted_mutual_info_score(labels, skm.labels_),
    metrics.silhouette_score(X, skm.labels_, metric='cosine')])

###############################################################################
# Mixture of von Mises Fisher clustering (soft)

vmf_soft = VonMisesFisherMixture(n_clusters=true_k, posterior_type='soft',
                                 init='random-class', n_init=20,
                                 force_weights=np.ones((true_k,)) / true_k)
print("Clustering with %s" % vmf_soft)
vmf_soft.fit(X)
print()
print('weights: {}'.format(vmf_soft.weights_))
print('concentrations: {}'.format(vmf_soft.concentrations_))
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, vmf_soft.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, vmf_soft.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, vmf_soft.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, vmf_soft.labels_))
print("Adjusted Mutual Information: %.3f"
      % metrics.adjusted_mutual_info_score(labels, vmf_soft.labels_))
print("Silhouette Coefficient (euclidean): %0.3f"
      % metrics.silhouette_score(X, vmf_soft.labels_))
    return args.x_s, args.y_s, args.mu_s, args.mode, args.verbose

input_X_s, input_Y_s, input_mu_s, processMode, verboseMode = parseArguments()

X_s = np.load(input_X_s)
Y_s = np.load(input_Y_s)
mu_s = np.load(input_mu_s)

num_clusters = np.unique(Y_s).size
print(num_clusters)

if processMode == 'soft':
    vmf_model = VonMisesFisherMixture(n_clusters=num_clusters, posterior_type='soft')
    vmf_model.fit(X_s)
elif processMode == 'hard':
    vmf_model = VonMisesFisherMixture(n_clusters=num_clusters, posterior_type='hard')
    vmf_model.fit(X_s)

estimated_kappa_s = vmf_model.concentrations_
estimated_mu_s = vmf_model.cluster_centers_
print(estimated_kappa_s)

# Cosine distances between estimated and true mean directions
cross_distance_s = cdist(estimated_mu_s, mu_s, metric='cosine')
print(cross_distance_s)
# print(vmf_model.labels_)
# coding: utf-8

# In[6]:

import scipy.io as sio
from spherecluster import VonMisesFisherMixture
# using the spherecluster package: https://pypi.org/project/spherecluster/0.1.2/

# In[10]:

train = sio.loadmat('./vmf_data.mat')  # load segmented frames or superframes
data = train['ps']
print(data.shape)

# In[11]:

vmf_soft = VonMisesFisherMixture(n_clusters=15, posterior_type='soft')
vmf_soft.fit(data)

# In[15]:

mean_directions = vmf_soft.cluster_centers_
print(mean_directions.shape)

# In[18]:

# Keep the mean directions of the 10 most concentrated clusters.
concen = vmf_soft.concentrations_
ind = concen.argsort()[-10:][::-1]
print(concen)
print(ind)
mean_directions = mean_directions[ind, :]
print(mean_directions.shape)
def sim_multimodal():
    to_save = []
    data_cells = []

    # Data pre-processing
    for i in range(40):
        read_sim(i, f_in='datasets/multimodal_sim2.npy',
                 f_out='datasets/transformed/multimodal_sim2_cell' + str(i))

    # Angles to query
    Thq = np.linspace(-np.pi, np.pi, 360)[:, None]
    Xq = np.hstack((np.cos(Thq), np.sin(Thq)))

    # Fit one cell at a time
    for i in range(40):
        print('\ncell no={}'.format(i))
        try:
            # Read data
            read_data = np.load('datasets/transformed/multimodal_sim2_cell' + str(i) + '.npz')
            data, xx, yy = read_data['data'], read_data['xx'], read_data['yy']
            if data.shape[0] <= 1:
                continue

            # Embed angles on the unit circle
            Th = data[:, 4][:, None]
            X = np.hstack((np.cos(Th), np.sin(Th)))

            # Estimate the number of modes with DBSCAN
            db = DBSCAN().fit(X)
            core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
            core_samples_mask[db.core_sample_indices_] = True
            labels = db.labels_

            # Number of clusters in labels, ignoring noise if present.
            n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
            unique_labels = set(labels)
            print("n_clusters_={}, labels={}".format(n_clusters_, unique_labels))

            # Use the DBSCAN cluster means to initialize the vMF mixture
            for k in unique_labels:
                if k == -1:  # noisy samples
                    continue
                class_member_mask = (labels == k)
                xy = X[class_member_mask & core_samples_mask]
                if k == 0:
                    db_centers = np.mean(xy, axis=0)[None, :]
                else:
                    db_centers = np.concatenate(
                        (db_centers, np.mean(xy, axis=0)[None, :]), axis=0)
            print("db_centers=", db_centers)

            # NOTE: play with max_iter if you get the denom=inf error.
            # Mixture of von Mises Fisher clustering (soft)
            vmf_soft = VonMisesFisherMixture(n_clusters=n_clusters_,
                                             posterior_type='soft',
                                             init=db_centers, n_init=1,
                                             verbose=True, max_iter=20)
            vmf_soft.fit(X)

            # Evaluate the fitted mixture density at the query angles
            y = 0
            for cn in range(n_clusters_):
                y += vmf_soft.weights_[cn] * np.exp(
                    von_mises_fisher_mixture._vmf_log(
                        Xq, vmf_soft.concentrations_[cn],
                        vmf_soft.cluster_centers_[cn]))
            yq = np.array(y)[:, None]
            to_save.append(yq)
            data_cells.append(i)

            # Plot
            pl.figure(figsize=(15, 4))
            pl.subplot(131)
            mesh = np.vstack((xx.ravel(), yy.ravel())).T
            pl.scatter(mesh[:, 0], mesh[:, 1], c='k', marker='.')
            pl.scatter(data[:, 1], data[:, 2], c=data[:, 0], marker='*', cmap='jet')
            pl.colorbar()
            pl.xlim([0, 20])
            pl.ylim([-5, 30])
            pl.title('data')
            pl.subplot(132)
            pl.scatter(Xq[:, 0], Xq[:, 1], c=yq[:], cmap='jet')
            pl.colorbar()
            pl.scatter(X[:, 0] * 0.9, X[:, 1] * 0.9, c='k', marker='+')
            pl.title('data and estimated distribution')
            pl.subplot(133, projection='polar')
            pl.polar(Thq, yq)
            pl.title('polar plot')
            pl.savefig('outputs/multimodal_sim2_cell{}'.format(i))
            # pl.show()
        except Exception:
            print(' skipped...')
            continue
def label_expansion(relevant_nodes, write_path, vocabulary_inv, embedding_mat,
                    manual_num=None, fitting='mix'):
    print("Retrieving top-t nearest words...")
    vocab_dict = {v: k for k, v in vocabulary_inv.items()}
    prob_sup_array = []
    current_szes = []
    all_class_keywords = []
    children_nodes = []
    for relevant_node in relevant_nodes:
        if relevant_node.children:
            children_nodes += relevant_node.children
        else:
            children_nodes += [relevant_node]
    for children_node in children_nodes:
        current_sz = len(children_node.keywords)
        current_szes.append(current_sz)
        prob_sup_array.append([1 / current_sz] * current_sz)
        all_class_keywords += children_node.keywords
    current_sz = np.min(current_szes)

    if manual_num is None:
        # Grow the expansion size until two classes share a keyword, then back off by one.
        while len(all_class_keywords) == len(set(all_class_keywords)):
            print(f'current_sz: {current_sz}')
            current_sz += 1
            # print(f'len_kw: {len(all_class_keywords)}')
            seed_expansion(children_nodes, prob_sup_array, current_sz,
                           vocab_dict, embedding_mat)
            all_class_keywords = [
                w for relevant_node in children_nodes
                for w in relevant_node.expanded
            ]
        seed_expansion(children_nodes, prob_sup_array, current_sz - 1,
                       vocab_dict, embedding_mat)
        # seed_expansion(children_nodes, prob_sup_array, current_sz, vocab_dict, embedding_mat)
    else:
        seed_expansion(children_nodes, prob_sup_array, manual_num,
                       vocab_dict, embedding_mat)

    if manual_num is None:
        print(f"Final expansion size t = {len(children_nodes[0].expanded)}")
    else:
        print(f"Manual expansion size t = {manual_num}")

    centers = []
    kappas = []
    weights = []
    if write_path is not None:
        if not os.path.exists(write_path):
            os.makedirs(write_path)
        else:
            f = open(os.path.join(write_path, 'expanded.txt'), 'w')
            f.close()

    for relevant_node in relevant_nodes:
        children_nodes = relevant_node.children if relevant_node.children else [relevant_node]
        num_children = len(children_nodes)
        expanded_class = []
        if fitting == 'mix':
            # Fit one vMF mixture over the pooled expansions of all children.
            for child in children_nodes:
                # assert child.expanded != []
                expanded_class = np.concatenate((expanded_class, child.expanded))
                print([vocabulary_inv[w] for w in child.expanded])
            vocab_expanded = [vocabulary_inv[w] for w in expanded_class]
            expanded_mat = embedding_mat[np.asarray(list(set(expanded_class)),
                                                    dtype='int32')]
            vmf_soft = VonMisesFisherMixture(n_clusters=num_children,
                                             n_jobs=15, random_state=0)
            vmf_soft.fit(expanded_mat)
            center = vmf_soft.cluster_centers_
            kappa = vmf_soft.concentrations_
            weight = vmf_soft.weights_
            print(f'weight: {weight}')
            print(f'kappa: {kappa}')
            centers.append(center)
            kappas.append(kappa)
            weights.append(weight)
        elif fitting == 'separate':
            # Fit a single vMF per child and weight the children uniformly.
            center = []
            kappa = []
            weight = []
            for child in children_nodes:
                assert child.expanded != []
                expanded_class = np.concatenate((expanded_class, child.expanded))
                expanded_mat = embedding_mat[np.asarray(child.expanded, dtype='int32')]
                vmf_soft = VonMisesFisherMixture(n_clusters=1, n_jobs=15, random_state=0)
                vmf_soft.fit(expanded_mat)
                center.append(vmf_soft.cluster_centers_[0])
                kappa.append(vmf_soft.concentrations_[0])
                weight.append(1 / num_children)
                expanded = np.dot(embedding_mat, center[-1])
                word_expanded = sorted(range(len(expanded)),
                                       key=lambda k: expanded[k], reverse=True)
            vocab_expanded = [vocabulary_inv[w] for w in expanded_class]
            print(f'Class {relevant_node.name}:')
            print(vocab_expanded)
            print(f'weight: {weight}')
            print(f'kappa: {kappa}')
            centers.append(center)
            kappas.append(kappa)
            weights.append(weight)
        if write_path is not None:
            f = open(os.path.join(write_path, 'expanded.txt'), 'a')
            f.write(relevant_node.name + '\t')
            f.write(' '.join(vocab_expanded) + '\n')
            f.close()
    print("Finished vMF distribution fitting.")
    return centers, kappas, weights
def sim_unimodal():
    to_save = []
    data_cells = []

    # Data pre-processing
    for i in range(40):
        read_sim(i, f_in='datasets/unimodal_sim1.npy',
                 f_out='datasets/transformed/unimodal_sim1_cell' + str(i))

    # Angles to query
    Thq = np.linspace(-np.pi, np.pi, 360)[:, None]
    Xq = np.hstack((np.cos(Thq), np.sin(Thq)))

    # Fit one cell at a time
    for i in range(40):
        print('cell no={}'.format(i))
        try:
            # Read data
            read_data = np.load('datasets/transformed/unimodal_sim1_cell' + str(i) + '.npz')
            data, xx, yy = read_data['data'], read_data['xx'], read_data['yy']
            if data.shape[0] <= 1:
                continue

            # Embed angles on the unit circle
            Th = data[:, 4][:, None]
            X = np.hstack((np.cos(Th), np.sin(Th)))

            # von Mises-Fisher clustering (soft, single component)
            vmf_soft = VonMisesFisherMixture(n_clusters=1, posterior_type='soft', n_init=20)
            vmf_soft.fit(X)

            # Query the fitted density at the query angles
            y0 = np.exp(von_mises_fisher_mixture._vmf_log(
                Xq, vmf_soft.concentrations_[0], vmf_soft.cluster_centers_[0]))
            y = y0 * vmf_soft.weights_[0]
            yq = np.array(y)[:, None]
            to_save.append(yq)
            data_cells.append(i)

            # Plot
            pl.figure(figsize=(15, 4))
            pl.subplot(131)
            mesh = np.vstack((xx.ravel(), yy.ravel())).T
            pl.scatter(mesh[:, 0], mesh[:, 1], c='k', marker='.')
            pl.scatter(data[:, 1], data[:, 2], c=data[:, 0], marker='*', cmap='jet')
            pl.colorbar()
            pl.xlim([0, 20])
            pl.ylim([-5, 30])
            pl.title('data')
            pl.subplot(132)
            pl.scatter(Xq[:, 0], Xq[:, 1], c=y0[:], cmap='jet')
            pl.colorbar()
            pl.scatter(X[:, 0] * 0.9, X[:, 1] * 0.9, c='k', marker='+')
            pl.title('data and estimated distribution')
            pl.subplot(133, projection='polar')
            pl.polar(Thq, yq)
            pl.title('polar plot')
            # pl.show()
            pl.savefig('outputs/unimodal_sim1_cell{}'.format(i))
        except Exception:
            print(' skipped...')
            continue
skm.fit(X)

# Match estimated spherical k-means centers to the true means
cdists = []
for center in skm.cluster_centers_:
    cdists.append(np.linalg.norm(mus[0] - center))
skm_mu_0_idx = np.argmin(cdists)
skm_mu_1_idx = 1 - skm_mu_0_idx
skm_mu_0_error = np.linalg.norm(mus[0] - skm.cluster_centers_[skm_mu_0_idx])
skm_mu_1_error = np.linalg.norm(mus[1] - skm.cluster_centers_[skm_mu_1_idx])

###############################################################################
# Mixture of von Mises Fisher clustering (soft)

vmf_soft = VonMisesFisherMixture(n_clusters=2, posterior_type='soft', n_init=20)
vmf_soft.fit(X)

cdists = []
for center in vmf_soft.cluster_centers_:
    cdists.append(np.linalg.norm(mus[0] - center))
vmf_soft_mu_0_idx = np.argmin(cdists)
vmf_soft_mu_1_idx = 1 - vmf_soft_mu_0_idx
vmf_soft_mu_0_error = np.linalg.norm(
    mus[0] - vmf_soft.cluster_centers_[vmf_soft_mu_0_idx])
vmf_soft_mu_1_error = np.linalg.norm(
    mus[1] - vmf_soft.cluster_centers_[vmf_soft_mu_1_idx])
class Clusterer:
    def __init__(self, data, n_cluster, method="soft-movMF",
                 init="random-class", n_init=10, n_jobs=1):
        self.data = data
        self.n_cluster = n_cluster
        self.method = method
        if method == "spk":
            self.clus = SphericalKMeans(n_clusters=n_cluster)
        elif method == "hard-movMF":
            self.clus = VonMisesFisherMixture(n_clusters=n_cluster,
                                              posterior_type='hard',
                                              init=init, n_init=n_init,
                                              n_jobs=n_jobs)
        elif method == "soft-movMF":
            self.clus = VonMisesFisherMixture(n_clusters=n_cluster,
                                              posterior_type='soft',
                                              init=init, n_init=n_init,
                                              n_jobs=n_jobs)
        self.clusters = {}  # cluster id -> dict(element_id: distance to center)
        self.clusters_phrase = {}  # cluster id -> representative words
        self.membership = None  # a list containing the membership of the data points
        self.center_ids = None  # a list containing the ids of the cluster centers
        self.inertia_scores = None

    def fit(self, debug=False):
        start = time.time()
        self.clus.fit(self.data)
        end = time.time()
        print("Finished fitting data of size %s in %s seconds"
              % (self.data.shape, (end - start)))
        self.inertia_scores = self.clus.inertia_
        print('Clustering inertia score (smaller is better):', self.inertia_scores)
        labels = self.clus.labels_
        self.membership = labels
        if debug:
            print("Labels:", labels)
            # print("cluster_centers_:", self.clus.cluster_centers_)
            if self.method != "spk":
                print("concentrations_:", self.clus.concentrations_)
                print("weights_:", self.clus.weights_)
                print("posterior_:", self.clus.posterior_)
        for idx, label in enumerate(labels):
            cluster_center = self.clus.cluster_centers_[int(label)]
            cosine_sim = self.calc_cosine(self.data[idx], cluster_center)
            if label not in self.clusters:
                self.clusters[label] = {}
            self.clusters[label][idx] = cosine_sim
        for cluster_id in range(self.n_cluster):
            self.clusters_phrase[cluster_id] = sorted(
                self.clusters[cluster_id].items(), key=lambda x: -x[1])
        # self.center_ids = self.gen_center_idx()  # find the idx of each cluster center

    def gen_center_idx(self):
        ret = []
        for cluster_id in range(self.n_cluster):
            center_idx = self.find_center_idx_for_one_cluster(cluster_id)
            ret.append((cluster_id, center_idx))
        return ret

    def find_center_idx_for_one_cluster(self, cluster_id):
        query_vec = self.clus.cluster_centers_[cluster_id]
        members = self.clusters[cluster_id]
        best_similarity, ret = -1, -1
        for member_idx in members:
            member_vec = self.data[member_idx]
            cosine_sim = self.calc_cosine(query_vec, member_vec)
            if cosine_sim > best_similarity:
                best_similarity = cosine_sim
                ret = member_idx
        return ret

    def calc_cosine(self, vec_a, vec_b):
        return 1 - cosine(vec_a, vec_b)

    def find_phrase_rank(self, phrase, cluster_id):
        for idx, ele in enumerate(self.clusters_phrase[cluster_id]):
            if ele[0] == phrase:
                return idx + 1
        return -1

    def sanity_check(self):
        for phrase_id in range(self.data.shape[0]):
            cluster_member = self.membership[phrase_id]
            result = []
            for cluster_id in range(self.n_cluster):
                cluster_rank = self.find_phrase_rank(phrase_id, cluster_id)
                sim = self.calc_cosine(self.data[phrase_id],
                                       self.clus.cluster_centers_[cluster_id])
                if sim < 0:
                    print(phrase_id, sim)
                    return
                # result.append((cluster_id, cluster_rank, sim))
            # print("Put in cluster: %s" % cluster_member)
            # print("Rank information in all clusters: %s" % str(result))

    def explore(self, keyword2id=None, id2keyword=None, interactive=False):
        for cluster_id in range(self.n_cluster):
            print("Cluster %s top keywords" % cluster_id)
            for rank, keyword_id in enumerate(self.clusters_phrase[cluster_id][0:10]):
                print("Rank:%s keywords:%s (score=%s)"
                      % (rank + 1, id2keyword[keyword_id[0]], keyword_id[1]))
            print("=" * 80)
        if interactive:
            while True:
                phrase = input('Input keyword (use "_" to concat tokens for phrase): ')
                if len(phrase) == 0:
                    break
                if phrase not in keyword2id:
                    print("Out of vocabulary keyword, please try again")
                    continue
                phrase_id = keyword2id[phrase]
                cluster_member = self.membership[phrase_id]
                result = []
                for cluster_id in range(self.n_cluster):
                    cluster_rank = self.find_phrase_rank(phrase_id, cluster_id)
                    sim = self.calc_cosine(self.data[phrase_id],
                                           self.clus.cluster_centers_[cluster_id])
                    result.append((cluster_id, cluster_rank, sim))
                print("Put in cluster: %s" % cluster_member)
                print("Rank information in all clusters: %s" % str(result))
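# --- Usage sketch (not from the original source): clustering random unit
# vectors with the Clusterer above. Assumes numpy, scipy, and spherecluster
# are importable; the data and the id-to-keyword mapping are illustrative.
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(50, 16))
data /= np.linalg.norm(data, axis=1, keepdims=True)  # project onto the unit sphere

clusterer = Clusterer(data, n_cluster=3, method="soft-movMF", n_init=2)
clusterer.fit(debug=False)
id2keyword = {i: "term_%d" % i for i in range(data.shape[0])}
keyword2id = {v: k for k, v in id2keyword.items()}
clusterer.explore(keyword2id=keyword2id, id2keyword=id2keyword, interactive=False)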
sns.scatterplot(
    data=scatter_df,
    x="x",
    y="y",
    hue="Label",
    ax=ax,
    legend="full",
    hue_order=true_color_dict.keys(),
    palette=true_color_dict.values(),
)
plt.legend(bbox_to_anchor=(1.0, 1), loc=2, borderaxespad=0.0)

# %% [markdown]
# #

vmm = VonMisesFisherMixture(n_clusters=2, init="spherical-k-means")
pred_labels = vmm.fit_predict(latent)
pred_color_dict = get_color_dict(pred_labels, pal=cc.glasbey_warm)

# %% [markdown]
# #

fig, ax = plt.subplots(1, 1, figsize=(20, 20))
scatter_df["Clusters"] = pred_labels
sns.scatterplot(
    data=scatter_df,
    x="x",
    y="y",
    hue="Clusters",
    ax=ax,
    legend="full",
    hue_order=pred_color_dict.keys(),
    palette=pred_color_dict.values(),
)