def term_clustering(terms: List[str], wv: Dict[str, np.ndarray], n_clusters: int) -> Tuple[List[int], List[str]]:
    """Cluster the in-vocabulary terms with spherical k-means.

    Terms that have no entry in ``wv`` are dropped (and counted) before
    clustering, so the returned term list may be shorter than ``terms``.

    Args:
        terms: Candidate terms to cluster.
        wv: Mapping from a term to its embedding vector.
        n_clusters: Number of clusters to fit.

    Returns:
        labels: The fitted ``labels_`` of the clusterer, one entry per kept term.
        terms: The terms that were found in ``wv``, aligned with ``labels``.
    """
    logger.debug(f"#wv {len(wv)}")
    logger.debug(terms[:20])

    vectors = []
    kept_terms = []
    missing = 0
    for term in terms:
        try:
            vector = wv[term]
        except KeyError:
            # Out-of-vocabulary term: count it and move on.
            missing += 1
        else:
            vectors.append(vector)
            kept_terms.append(term)

    logger.warning(f"{missing} / {len(terms)} words out of vocab")
    logger.info(f"Clustering {len(vectors)} words")

    model = SphericalKMeans(n_clusters=n_clusters)
    model.fit(vectors)
    logger.info("Clustering complete")
    return model.labels_, kept_terms
def testSpericalKMeans():
    """Fit a 3-cluster spherical k-means on ``X`` and print the labels.

    ``X`` is not a parameter; it is resolved from the enclosing scope and is
    expected to be a data matrix (n_examples x n_features).
    """
    model = SphericalKMeans(n_clusters=3)
    model.fit(X)
    labels = model.labels_
    print(labels)
def fit(self, data, norm=False):
    '''Cluster ``data`` with spherical k-means and select a "main" cluster.

    Args:
        data numpy.ndarray: [m, n], m samples with n dimensions each.
        norm boolean: False by default. When False, the data is mean-centered
            (the mean is kept on ``self.mean``) and passed through
            ``self.l2norm`` before clustering. When True, ``data`` is
            clustered exactly as given.

    Returns:
        tuple: ``(self.clusters, self.clusterid, self.main_id)`` -- the
        cluster centers, the per-sample labels, and the index of the cluster
        whose members have the lowest mean dot product with their center.
    '''
    if norm:
        samples = data
    else:
        self.mean = np.mean(data, axis=0)
        samples = self.l2norm(data - self.mean)

    model = SphericalKMeans(n_clusters=self.ncluster, verbose=0)
    model.fit(samples)
    self.clusters = model.cluster_centers_
    self.clusterid = model.labels_
    self.loss = model.inertia_

    def mean_alignment(cluster_idx):
        # Average dot product between the cluster's members and its center.
        members = np.where(self.clusterid == cluster_idx)[0].tolist()
        center = self.clusters[cluster_idx, :].T
        return np.mean(np.dot(samples[members, :], center))

    scores = [mean_alignment(idx) for idx in range(self.ncluster)]

    # The "main" cluster is the one with the smallest mean alignment score.
    self.main_id = np.argmin(scores)
    print(self.main_id)
    return self.clusters, self.clusterid, self.main_id
def calc_logit_regress_stats(inputs, outputs, plot_name, K_SIZE):
    """Save a stacked bar chart of output labels per spherical k-means cluster.

    ``inputs`` is clustered into ``K_SIZE`` clusters. For every distinct value
    in ``outputs`` the number of samples falling in each cluster is counted,
    and the counts are drawn as one stacked bar per cluster and written to
    ``plot_name``. The label-to-row mapping is printed.
    """
    model = SphericalKMeans(n_clusters=K_SIZE)
    model.fit(inputs)
    cluster_of_input = model.labels_

    out_keys = list(set(outputs))
    out_idx_mapping = dict((out, idx) for idx, out in enumerate(out_keys))
    print(out_idx_mapping)

    # counts[label_row][cluster] = number of samples with that label/cluster
    counts = [[0] * K_SIZE for _ in out_keys]
    for cluster_idx, label in zip(cluster_of_input, outputs):
        counts[out_idx_mapping[label]][cluster_idx] += 1
    counts = np.asarray(counts)

    positions = np.arange(K_SIZE)
    stack_base = np.zeros(K_SIZE)
    bars = []
    for row in counts:
        # Each label's bar segment sits on top of the previous ones.
        bars.append(plt.bar(positions, row, bottom=stack_base))
        stack_base += row

    plt.title('Song genres in spherical k-means clusters')
    tick_labels = ["K" + str(i + 1) for i in range(K_SIZE)]
    plt.xticks(positions, tick_labels)
    plt.legend(bars, out_keys)
    plt.savefig(plot_name)
def cluster_test(self, test_file, clusters=10):
    """Evaluate spherical k-means clusterings of test words against sections.

    Args:
        test_file: Path of a CSV whose rows unpack as ``(word, section, y)``.
        clusters: Either a single cluster count or an iterable of cluster
            counts; one clustering is evaluated per count.

    Returns:
        dict: For every cluster count ``K`` two entries, ``'NMI(K)'`` (the
        normalized mutual information between predicted clusters and
        sections) and ``'F_beta-score(K)'`` (F-beta with beta=5 over all
        ordered pairs of samples: "same section" vs. "same cluster").
    """
    # The default used to be the int 10 while the body iterated over
    # `clusters`, which raised TypeError. Accept a bare int as well.
    if isinstance(clusters, (int, np.integer)):
        clusters = [clusters]

    df_test1 = pd.read_csv(test_file)

    # The evaluation set does not depend on K, so build it once.
    vectors = []
    y_true = []
    sections = {}
    for word, section, y in df_test1.values:
        sliceIdx = self.yearDict[str(y)]
        vocabulary = self.vocabularies[sliceIdx]
        if word not in vocabulary:
            continue  # words missing from that slice are skipped
        if section not in sections:
            # Sections get consecutive ids in order of first appearance.
            sections[section] = len(sections)
        y_true.append(sections[section])
        vectors.append(self.matrices_norm[sliceIdx][vocabulary[word]])

    data = np.array(vectors)
    # Pairwise ground truth: True where two samples share a section.
    y_true_bool = [(triplet1 == triplet2)
                   for triplet2 in y_true for triplet1 in y_true]

    output = {}
    for K in clusters:
        skm = SphericalKMeans(n_clusters=K, max_iter=100000)
        skm.fit(data)
        y_pred = skm.predict(data)

        metric = normalized_mutual_info_score(
            y_pred, y_true, average_method='arithmetic')

        # Pairwise prediction: True where two samples share a cluster.
        y_pred_bool = [(triplet1 == triplet2)
                       for triplet2 in y_pred for triplet1 in y_pred]
        metric2 = fbeta_score(y_true_bool, y_pred_bool, beta=5)

        output[f'NMI({K})'] = metric
        output[f'F_beta-score({K})'] = metric2
    return output
def doc_clustering(model, cluster_num):
    """Fit spherical k-means on the model's document vectors.

    Document vectors are looked up under the tags ``'a_1'``, ``'a_2'``, ...
    up to the number of entries in ``model.docvecs.doctags``.

    Returns:
        The fitted clusterer.
    """
    n_docs = len(model.docvecs.doctags.keys())

    rows = []
    for position in range(1, n_docs + 1):
        rows.append(model.docvecs['a_' + str(position)])
    train_data = np.array(rows)

    clusterer = SphericalKMeans(cluster_num)
    print('Start clustering...')
    clusterer.fit(train_data)
    print('Done.')
    return clusterer
def _init_match(self):
    """Cluster the question embeddings and attach the labels to ``self.data``.

    Rows whose ``qs_embed`` vector has zero norm are left out of the
    clustering. After the call ``self.data`` has an ``skm_label`` column
    (``-1`` for rows that were not clustered) and ``self._cluster_centers``
    holds the fitted cluster centers.
    """
    skm = SphericalKMeans(n_clusters=self.config['cluster_nums'],
                          init='k-means++',
                          n_init=20)

    has_direction = self.data['qs_embed'].apply(
        lambda x: np.linalg.norm(x) > 0)
    # Work on an explicit copy: assigning a column to a filtered slice of
    # self.data is chained assignment and triggers SettingWithCopyWarning.
    data = self.data[has_direction].copy()

    skm.fit(data['qs_embed'].tolist())
    data['skm_label'] = skm.labels_
    data = data[['qid', 'skm_label']]

    # Left merge keeps every original row; unclustered rows get NaN -> -1.
    self.data = pd.merge(self.data, data, how='left', on=['qid'])
    self.data['skm_label'] = self.data['skm_label'].fillna(-1)
    self._cluster_centers = skm.cluster_centers_
def kmeans_codebook(patches, k=30):
    """Learn a codebook of ``k`` patch-shaped centers with spherical k-means.

    Every patch is flattened to a row vector, the rows are clustered, and
    the cluster centers are reshaped back to the shape of a single patch.
    """
    patch_shape = patches[0].shape
    rows, cols = patch_shape[0], patch_shape[1]

    flat = patches.reshape(-1, rows * cols)

    est = SphericalKMeans(k)
    est.fit(flat)
    return est.cluster_centers_.reshape(-1, rows, cols)
def SphericalkMeansCluster(X,nfclusters):
    """Return the spherical k-means labels of ``X`` for ``nfclusters`` clusters.

    ``X`` is the data matrix (n_examples x n_features).
    """
    model = SphericalKMeans(nfclusters)
    model.fit(X)
    labels = model.labels_
    return labels
class Clusterer:
    """Spherical k-means over ``data`` plus per-cluster bookkeeping.

    After :meth:`fit`, the instance exposes the members of every cluster,
    the label of every data point, and, for every cluster, the index of the
    data point most similar (by cosine) to the cluster center.
    """

    def __init__(self, data, n_cluster):
        self.data = data
        self.n_cluster = n_cluster
        self.clus = SphericalKMeans(n_cluster)
        self.clusters = defaultdict(list)  # cluster id -> member indices
        self.membership = None  # cluster label of every data point
        self.center_ids = None  # list of (cluster id, index of center point)
        self.inertia_scores = None

    def fit(self):
        """Fit the clusterer and populate all bookkeeping attributes."""
        self.clus.fit(self.data)
        labels = self.clus.labels_

        # Start from an empty mapping so that calling fit() again does not
        # append the same members a second time.
        self.clusters = defaultdict(list)
        for idx, label in enumerate(labels):
            self.clusters[label].append(idx)

        self.membership = labels
        self.center_ids = self.gen_center_idx()
        self.inertia_scores = self.clus.inertia_
        print('Clustering concentration score:', self.inertia_scores)

    def gen_center_idx(self):
        """Return ``[(cluster_id, center_point_index), ...]`` for all clusters."""
        return [
            (cluster_id, self.find_center_idx_for_one_cluster(cluster_id))
            for cluster_id in range(self.n_cluster)
        ]

    def find_center_idx_for_one_cluster(self, cluster_id):
        """Return the index of the member closest to the cluster center.

        Returns ``-1`` when the cluster has no members.
        """
        query_vec = self.clus.cluster_centers_[cluster_id]
        # Start below any reachable similarity so that even a member with
        # similarity exactly -1 can be selected.
        best_similarity, ret = float('-inf'), -1
        for member_idx in self.clusters[cluster_id]:
            cosine_sim = self.calc_cosine(query_vec, self.data[member_idx])
            if cosine_sim > best_similarity:
                best_similarity = cosine_sim
                ret = member_idx
        return ret

    def calc_cosine(self, vec_a, vec_b):
        """Cosine similarity (``cosine`` returns a distance, hence ``1 -``)."""
        return 1 - cosine(vec_a, vec_b)
def cluster_doc(doc_emb, K, method):
    """Cluster document embeddings into ``K`` clusters.

    Args:
        doc_emb: Document embeddings to cluster.
        K: Number of clusters.
        method: ``"kmeans"`` for k-means or ``"skmeans"`` for spherical
            k-means. Any other value performs no clustering.

    Returns:
        The fitted ``labels_`` of the chosen clusterer, or an empty list
        when ``method`` is not recognised.
    """
    if method == "kmeans":
        # k-means
        print("Clustering using K-Means")
        from sklearn.cluster import KMeans
        model = KMeans(n_clusters=K, n_init=1)
    elif method == "skmeans":
        # spherical k-means
        print("Clustering using Spherical K-Means")
        from spherecluster import SphericalKMeans
        model = SphericalKMeans(n_clusters=K, n_init=1)
    else:
        return []

    model.fit(doc_emb)
    return model.labels_
def initialize(self):
    """Initialise G1, G2 and S12 from spherical k-means cluster centers.

    ``NMTF1.R12`` is masked element-wise with ``self.M``. G1 comes from
    clustering the transposed masked matrix and G2 from clustering the
    masked matrix itself. Both factors are also stored on ``NMTF1`` so that
    later models can reuse them.
    """
    self.R12_train = np.multiply(NMTF1.R12, self.M)

    # spherical k-means
    skm_on_transpose = SphericalKMeans(n_clusters=self.K[0])
    skm_on_transpose.fit(self.R12_train.transpose())
    skm_direct = SphericalKMeans(n_clusters=self.K[1])
    skm_direct.fit(self.R12_train)

    # Factor matrices are the transposed cluster centers.
    self.G1 = skm_on_transpose.cluster_centers_.transpose()
    self.G2 = skm_direct.cluster_centers_.transpose()

    chain = [self.G1.transpose(), self.R12_train, self.G2]
    self.S12 = np.linalg.multi_dot(chain)

    # Save the factor matrices for the next models
    NMTF1.G1 = self.G1
    NMTF1.G2 = self.G2
def semantic_sim_driver(self,time_mapping,log_filename = "yao_test1.txt",):
    """Score spherical k-means clusterings of the Yao test set and log them.

    Reads ``eval/yao/testset_1.csv`` (columns ``word``, ``year`` and
    ``label`` are used), maps every year through ``time_mapping``, fetches
    the embeddings of the words via ``self.get_embedding_in_a_year`` and,
    for 10, 15 and 20 clusters, appends ``get_score`` and ``get_score1`` of
    the predicted clusters against the label ids of the known words.

    Args:
        time_mapping: Mapping from a year key (string) to a time-slice id.
            If the plain ``str(year)`` lookup fails, decade keys such as
            ``"1990s"`` are tried instead.
        log_filename: File that receives the tab-separated scores.

    Returns:
        None. The scores are printed and written to ``log_filename``.
    """
    df = pd.read_csv("eval/yao/testset_1.csv")

    # New columns must be created with bracket assignment; `df.real_year = ...`
    # only sets an instance attribute and makes pandas emit a UserWarning.
    try:
        df["real_year"] = df.year.apply(lambda x: int(time_mapping[str(x)]))
    except Exception as e:
        # Best-effort fallback kept from the original: report what failed,
        # then retry with decade keys ("1990s").
        print(e)
        print(time_mapping.keys())
        print(df.year.unique())
        df["real_year"] = df.year.apply(
            lambda x: int(time_mapping[str(x // 10 * 10) + "s"]))

    labels = set(df.label.unique())
    labels_mapping = {label: index for index, label in enumerate(labels)}
    df["label_id"] = df.label.apply(lambda x: labels_mapping[x])

    embeddings, known_index = self.get_embedding_in_a_year(
        df.word, df.real_year.tolist(), return_known_index=True)

    from spherecluster import SphericalKMeans

    scores = []
    known_label_ids = df.label_id[known_index]
    for n in [10, 15, 20]:
        skm = SphericalKMeans(n_clusters=n)
        skm.fit(embeddings)
        scores.append(get_score(skm.labels_, known_label_ids))
        scores.append(get_score1(skm.labels_, known_label_ids))
    print(scores)

    with open(log_filename, "w", encoding="utf-8") as f:
        line = "\t".join(["{0:.4f}".format(s) for s in scores]) + "\n"
        print(line)
        f.write(line)
    return None
def initialize(self):
    """Initialise the factors of the second model.

    G1 and G2 are reloaded from ``NMTF1``. G3 is the transposed set of
    spherical k-means centers fitted on ``NMTF2.R23`` and is stored on
    ``NMTF2`` for later models. S12 and S23 are derived from the factors.
    """
    self.R12_train = np.multiply(NMTF2.R12, self.M)

    # spherical k-means
    skm3 = SphericalKMeans(n_clusters=self.K[2])
    skm3.fit(NMTF2.R23)

    # Reload matrices that have already been computed by the first model.
    self.G1 = NMTF1.G1
    self.G2 = NMTF1.G2
    self.G3 = skm3.cluster_centers_.transpose()

    g1_t = self.G1.transpose()
    g2_t = self.G2.transpose()
    self.S12 = np.linalg.multi_dot([g1_t, self.R12_train, self.G2])
    self.S23 = np.linalg.multi_dot([g2_t, NMTF2.R23, self.G3])

    # Save G3 for the next models
    NMTF2.G3 = self.G3
def get_topic_vecs(model, n_topics=20):
    """
    Computes and returns the topic vectors of a doc2vec model.

    The document vectors are clustered with spherical k-means and the
    cluster centers are returned as topic vectors, i.e. "virtual" documents
    that summarise a group of similar documents.

    Arguments:
        - (gensim.models.doc2vec.Doc2Vec) model: A doc2vec model
        - (int) n_topics: The number of topics (clusters) to find,
          defaults to 20.

    Returns:
        - (numpy.ndarray) topics: The topic vectors of the model
    """
    from spherecluster import SphericalKMeans

    clusterer = SphericalKMeans(n_clusters=n_topics)
    # document vectors of the model as a numpy array
    doc_vectors = model.docvecs.vectors_docs
    # group documents by topic
    clusterer.fit(doc_vectors)
    # the topic vectors are the cluster centers
    topics = clusterer.cluster_centers_
    return topics
def initialize(self):
    """Initialise the factors of the fifth model.

    G1 to G4 are reloaded from the earlier models (``NMTF1``, ``NMTF2``,
    ``NMTF3``). G5 is the transposed set of spherical k-means centers
    fitted on ``NMTF5.R25``. S12, S23, S34 and S25 are derived from them.
    """
    self.R12_train = np.multiply(NMTF5.R12, self.M)

    # spherical k-means
    skm5 = SphericalKMeans(n_clusters=self.K[4])
    skm5.fit(NMTF5.R25)

    # Factors already computed by the earlier models.
    self.G1 = NMTF1.G1
    self.G2 = NMTF1.G2
    self.G3 = NMTF2.G3
    self.G4 = NMTF3.G4
    self.G5 = skm5.cluster_centers_.transpose()

    g1_t = self.G1.transpose()
    self.S12 = np.linalg.multi_dot([g1_t, self.R12_train, self.G2])
    g2_t = self.G2.transpose()
    self.S23 = np.linalg.multi_dot([g2_t, NMTF5.R23, self.G3])
    g3_t = self.G3.transpose()
    self.S34 = np.linalg.multi_dot([g3_t, NMTF5.R34, self.G4])
    self.S25 = np.linalg.multi_dot([self.G2.transpose(), NMTF5.R25, self.G5])
def cluster(self, docs, k):
    """Cluster documents into ``k`` groups by their keyword vectors.

    For every document the keywords are extracted with
    ``self.extract_keywords`` and embedded with ``self.sent2vec``; the
    resulting vectors are clustered with spherical k-means.

    Returns:
        tuple: ``(labels, words)`` where ``labels`` are the fitted cluster
        labels and ``words`` holds the extracted keywords per document.
    """
    words = []
    vecs = []
    cnt = 0
    for cnt, doc in enumerate(docs, start=1):
        keywords = self.extract_keywords(doc)
        words.append(keywords)
        vecs.append(self.sent2vec(keywords))
    print('processing doc {} over.'.format(cnt))

    fitted = SphericalKMeans(n_clusters=k).fit(np.array(vecs))
    return fitted.labels_, words
def initialize(self):
    """Initialise the factors of the third model.

    G4 is the transposed set of spherical k-means centers fitted on
    ``NMTF3.R34`` and is stored on ``NMTF3`` for later models. G1 to G3 are
    reloaded from ``NMTF1`` and ``NMTF2``. S12, S23 and S34 are derived
    from the factors.
    """
    self.R12_train = np.multiply(NMTF3.R12, self.M)

    # spherical k-means
    skm4 = SphericalKMeans(n_clusters=self.K[3])
    skm4.fit(NMTF3.R34)
    self.G4 = skm4.cluster_centers_.transpose()

    # Use the same matrices as those previously computed
    self.G1 = NMTF1.G1
    self.G2 = NMTF1.G2
    self.G3 = NMTF2.G3

    g1_t = self.G1.transpose()
    self.S12 = np.linalg.multi_dot([g1_t, self.R12_train, self.G2])
    g2_t = self.G2.transpose()
    self.S23 = np.linalg.multi_dot([g2_t, NMTF3.R23, self.G3])
    g3_t = self.G3.transpose()
    self.S34 = np.linalg.multi_dot([g3_t, NMTF3.R34, self.G4])

    # Save G4 for next models
    NMTF3.G4 = self.G4
def _calculate_gap(self, X: Union[pd.DataFrame, np.ndarray], n_refs: int, n_clusters: int) -> Tuple[float, int]: """ Calculate the gap value of the given data, n_refs, and number of clusters. Return the resutling gap value and n_clusters """ # Holder for reference dispersion results ref_dispersions = np.zeros(n_refs) # type: np.ndarray # For n_references, generate random sample and perform kmeans getting resulting dispersion of each loop # print(0, n_refs) for i in range(n_refs): # Create new random reference set random_data = random_sample_data( X, random_sampling=self.random_sampling) # Fit to it, getting the centroids and labels, and add to accumulated reference dispersions array. if self.algo == "kmeans2": centroids, labels = kmeans2( data=random_data, k=n_clusters, iter=10, minit='points') # type: Tuple[np.ndarray, np.ndarray] dispersion = self._calculate_dispersion( X=random_data, labels=labels, centroids=centroids) # type: float elif self.algo == "kmeans": centroids, dispersion = kmeans( obs=random_data, k_or_guess=n_clusters, iter=10) # type: Tuple[np.ndarray, np.ndarray] elif self.algo == "skl-kmeans": km = KMeans(n_clusters=n_clusters, random_state=0) km.fit(random_data) centroids, labels = km.cluster_centers_, km.labels_ dispersion = km.inertia_ elif self.algo == "sph-kmeans": skm = SphericalKMeans(n_clusters=n_clusters, random_state=0) skm.fit(random_data) centroids, labels = skm.cluster_centers_, skm.labels_ dispersion = skm.inertia_ ref_dispersions[i] = dispersion # Fit cluster to original data and create dispersion calc. 
if self.algo == "kmeans2": centroids, labels = kmeans2(data=X, k=n_clusters, iter=10, minit='points') dispersion = self._calculate_dispersion(X=X, labels=labels, centroids=centroids) elif self.algo == "kmeans": centroids, dispersion = kmeans( obs=X, k_or_guess=n_clusters, iter=10) # type: Tuple[np.ndarray, np.ndarray] elif self.algo == "skl-kmeans": km = KMeans(n_clusters=n_clusters, random_state=0) km.fit(X) centroids, labels = km.cluster_centers_, km.labels_ dispersion = km.inertia_ elif self.algo == "sph-kmeans": skm = SphericalKMeans(n_clusters=n_clusters, random_state=0) skm.fit(X) centroids, labels = skm.cluster_centers_, skm.labels_ dispersion = skm.inertia_ # Calculate gap statistic ref_log_dispersion = np.mean(np.log(ref_dispersions)) log_dispersion = np.log(dispersion) gap_value = ref_log_dispersion - log_dispersion # compute standard deviation sdk = np.sqrt( np.mean((np.log(ref_dispersions) - ref_log_dispersion)**2.)) sk = np.sqrt(1. + 1. / n_refs) * sdk return gap_value, int( n_clusters), log_dispersion, ref_log_dispersion, sk
data = f.read() text.append(clean_str(data)) cat_list.append(start) result = np.zeros((1, len(cat_list)), dtype=np.int) result = result.tolist()[0] vectorizer = CountVectorizer(min_df=1, stop_words='english', strip_accents='ascii') count_vectorizer = vectorizer.fit_transform(text) transformer = TfidfTransformer(smooth_idf=True) tfidf = transformer.fit_transform(count_vectorizer) km = SphericalKMeans(n_clusters=len(categories)) clusters = km.fit(tfidf) centroids = km.cluster_centers_ labels = km.labels_ for c in range(len(categories)): idx = np.where(labels == c)[0] for l in idx: result[l] = c + 1 print len(result) print result score = normalized_mutual_info_score(cat_list, result) adjusted_score = adjusted_mutual_info_score(cat_list, result) print score print adjusted_score
def sphe_kmeans(matrix, n_clusters, nb_init):
    """Fit spherical k-means on ``matrix`` and return the result of ``fit``.

    Args:
        matrix: Data to cluster.
        n_clusters: Number of clusters.
        nb_init: Value passed as ``n_init`` to the clusterer.
    """
    settings = dict(n_clusters=n_clusters, n_init=nb_init, max_iter=100)
    labeler = SphericalKMeans(**settings)
    print("sphe_kmeans")
    fitted = labeler.fit(matrix)
    return fitted
km_mu_0_idx = np.argmin(cdists) km_mu_1_idx = 1 - km_mu_0_idx km_mu_0_error = np.linalg.norm(mus[0] - km.cluster_centers_[km_mu_0_idx]) km_mu_1_error = np.linalg.norm(mus[1] - km.cluster_centers_[km_mu_1_idx]) km_mu_0_error_norm = np.linalg.norm( mus[0] - km.cluster_centers_[km_mu_0_idx] / np.linalg.norm(km.cluster_centers_[km_mu_0_idx])) km_mu_1_error_norm = np.linalg.norm( mus[1] - km.cluster_centers_[km_mu_1_idx] / np.linalg.norm(km.cluster_centers_[km_mu_1_idx])) ############################################################################### # Spherical K-Means clustering skm = SphericalKMeans(n_clusters=2, init='k-means++', n_init=20) skm.fit(X) cdists = [] for center in skm.cluster_centers_: cdists.append(np.linalg.norm(mus[0] - center)) skm_mu_0_idx = np.argmin(cdists) skm_mu_1_idx = 1 - skm_mu_0_idx skm_mu_0_error = np.linalg.norm(mus[0] - skm.cluster_centers_[skm_mu_0_idx]) skm_mu_1_error = np.linalg.norm(mus[1] - skm.cluster_centers_[skm_mu_1_idx]) ############################################################################### # Mixture of von Mises Fisher clustering (soft) vmf_soft = VonMisesFisherMixture(n_clusters=2, posterior_type='soft',
from sklearn import metrics

# Load the saved table and name its four columns.
nps = np.load("/home/psrivastava/Intern_Summer/data/tfs_encode.npy")
df = pd.DataFrame(nps, columns=['embds', 'title', 'sets', 'catg'])

# Only the first 20000 rows are used.
embed = [np.array(x) for x in df.iloc[:20000, 0].to_list()]
sets = list(df.iloc[:20000, 2].to_list())
catg = list(df.iloc[:20000, 3].to_list())
title = list(df.iloc[:20000, 1].to_list())

# Stack the embeddings once and drop singleton axes.
embed_matrix = np.squeeze(np.array(embed))
print(embed_matrix.shape)

# Spherical k-means with 11 clusters on the embeddings.
skm = SphericalKMeans(n_clusters=11)
skm.fit(embed_matrix)

# 2-D t-SNE projection using cosine distance.
X_embeded = TSNE(n_components=2, metric="cosine").fit_transform(embed_matrix)
dim_reduc = X_embeded

uni, coun = np.unique(np.array(sets), return_counts=True)
print("Dimension reduc", X_embeded.shape)
print(dict(zip(uni, coun)))

la = skm.labels_
print(metrics.silhouette_score(embed_matrix, la, metric="cosine"))
def initialize(self, initialize_strategy,verbose):
    """Initialise the factor matrices ``G_left``/``G_right`` and the core ``S``.

    A factor that is already set (not ``None``) is left untouched; a factor
    created here is flagged through ``G_left_primary``/``G_right_primary``.

    Args:
        initialize_strategy: One of

            * ``"random"``    -- uniform random factors;
            * ``"oldkmeans"`` -- entry (row, col) is the Euclidean distance
              between a row of the (transposed, for the right factor)
              association matrix and a k-means cluster center;
            * ``"kmeans"``    -- columns are the means of the association
              matrix columns/rows grouped by their k-means label;
            * ``"skmeans"``   -- transposed spherical k-means centers.

            Any other value creates no factor in this method.
        verbose: When equal to ``True``, prints a summary of the matrix and
            the resulting factor shapes.

    After the factors are set they are propagated to the dependent
    association matrices listed in the ``dep_own_*`` attributes (only where
    the dependent factor is still ``None``), and ``S`` is computed as
    ``G_left^T * association_matrix * G_right``.
    """
    if initialize_strategy == "random":
        if verbose==True:
            print("Association matrix filename: " + self.filename)
            print("Used parameters: " + '\033[1m' + " k\u2081 = " + str(self.k1) + " and" + " k\u2082 = " + str(self.k2) + '\033[0m')
            print("Non-zero elements of the association matrix = " + '\033[1m' + "{}".format(np.count_nonzero(self.association_matrix)) + '\033[0m')
        if self.G_left is None:
            self.G_left = np.random.rand(self.association_matrix.shape[0], self.k1)
            self.G_left_primary = True
        if self.G_right is None:
            self.G_right = np.random.rand(self.association_matrix.shape[1], self.k2)
            self.G_right_primary = True
    elif initialize_strategy == "oldkmeans":
        if verbose==True:
            print("Association matrix filename: " + self.filename)
            print("Used parameters: " + '\033[1m' + " k\u2081 = " + str(self.k1) + " and" + " k\u2082 = " + str(self.k2) + '\033[0m')
            print("Non-zero elements of the association matrix = " + '\033[1m' + "{}".format(np.count_nonzero(self.association_matrix)) + '\033[0m')
        if self.G_left is None:
            # k-means on the rows; the factor holds row-to-center distances
            with suppress_stdout():
                km = KMeans(n_clusters=self.k1).fit(self.association_matrix)
            self.G_left = np.zeros((self.association_matrix.shape[0], self.k1))
            for row in range(self.association_matrix.shape[0]):
                for col in range(self.k1):
                    self.G_left[row,col] = np.linalg.norm(self.association_matrix[row] - km.cluster_centers_[col])
            self.G_left_primary = True
        if self.G_right is None:
            # same construction on the transposed matrix
            with suppress_stdout():
                km = KMeans(n_clusters=self.k2).fit(self.association_matrix.transpose())
            self.G_right = np.zeros((self.association_matrix.shape[1], self.k2))
            for row in range(self.association_matrix.shape[1]):
                for col in range(self.k2):
                    self.G_right[row,col] = np.linalg.norm(self.association_matrix.transpose()[row] - km.cluster_centers_[col])
            self.G_right_primary = True
    elif initialize_strategy == "kmeans":
        if verbose==True:
            print("Association matrix filename: " + self.filename)
            print("Used parameters: " + '\033[1m' + " k\u2081 = " + str(self.k1) + " and" + " k\u2082 = " + str(self.k2) + '\033[0m')
            print("Non-zero elements of the association matrix = " + '\033[1m' + "{}".format(np.count_nonzero(self.association_matrix)) + '\033[0m')
        if self.G_left is None:
            # `km` holds the k-means label of every column; each factor
            # column is the mean of the matrix columns sharing a label
            with suppress_stdout():
                km = KMeans(n_clusters=self.k1, n_init = 10).fit_predict(self.association_matrix.transpose())
            self.G_left = np.array([np.mean([self.association_matrix[:,i] for i in range(len(km)) if km[i] == p], axis = 0) for p in range(self.k1)]).transpose()
            self.G_left_primary = True
        if self.G_right is None:
            # `km` holds the k-means label of every row; each factor
            # column is the mean of the matrix rows sharing a label
            with suppress_stdout():
                km = KMeans(n_clusters=self.k2, n_init = 10).fit_predict(self.association_matrix)
            self.G_right = np.array([np.mean([self.association_matrix[i] for i in range(len(km)) if km[i] == p], axis = 0) for p in range(self.k2)]).transpose()
            self.G_right_primary = True
    elif initialize_strategy == "skmeans":
        if verbose==True:
            print("Association matrix filename: " + self.filename)
            print("Used parameters: " + '\033[1m' + " k\u2081 = " + str(self.k1) + " and" + " k\u2082 = " + str(self.k2) + '\033[0m')
            print("Non-zero elements of the association matrix = " + '\033[1m' + "{}".format(np.count_nonzero(self.association_matrix)) + '\033[0m')
        #with suppress_stdout():
        if self.G_left is None:
            with suppress_stdout():
                skm = SphericalKMeans(n_clusters=self.k1)
                skm = skm.fit(self.association_matrix.transpose())
            #Factor matrices are initialized with the center coordinates
            self.G_left = skm.cluster_centers_.transpose()
            self.G_left_primary = True
        if self.G_right is None:
            with suppress_stdout():
                skm = SphericalKMeans(n_clusters=self.k2).fit(self.association_matrix)
            #Factor matrices are initialized with the center coordinates
            self.G_right = skm.cluster_centers_.transpose()
            self.G_right_primary = True
    # Share this matrix's factors with dependent association matrices that
    # do not have the corresponding factor yet. The attribute name states
    # which own factor (left/right) fills which factor of the other matrix.
    for am in self.dep_own_left_other_left:
        if am.G_left is None:
            am.G_left = self.G_left
    for am in self.dep_own_left_other_right:
        if am.G_right is None:
            am.G_right = self.G_left
    for am in self.dep_own_right_other_left:
        if am.G_left is None:
            am.G_left = self.G_right
    for am in self.dep_own_right_other_right:
        if am.G_right is None:
            am.G_right = self.G_right
    if verbose==True:
        print(self.leftds, self.rightds, self.association_matrix.shape)
        print("Shape Factor Matrix left " + str(self.G_left.shape))
        print("Shape Factor Matrix right " + str(self.G_right.shape) + "\n")
    # Core matrix: G_left^T * association_matrix * G_right
    self.S = np.linalg.multi_dot([self.G_left.transpose(), self.association_matrix, self.G_right])
# Seed both NumPy and TensorFlow from the command-line seed.
np.random.seed(args.seed)
tf.set_random_seed(args.seed)
if args.random:
    # Use randomly chosen training rows (without replacement) as centers.
    centers = Xtrain[np.random.choice(Xtrain.shape[0], replace=False, size=args.num_centers)]
else:
    kmeans = SphericalKMeans(args.num_centers, verbose=1, n_init=5)
    # Centers are cached on disk, keyed by the number of centers and seed.
    dump_path = Path("out") / ("kmeans-cifar-%u-%u.pkl" % (args.num_centers, args.seed))
    if dump_path.exists():
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only safe as long as the cache directory is trusted.
        with open(dump_path, 'rb') as fp:
            centers = pickle.load(fp)
    else:
        kmeans.fit(Xtrain)
        centers = kmeans.cluster_centers_
        with open(dump_path, 'wb') as fp:
            pickle.dump(centers, fp)
print("Calculating distances...")
# Replace the features by their cosine similarity to every center
# (cdist returns cosine distance, hence the `1 -` below).
Xtrain = cdist(Xtrain, centers, "cosine")
Xtest = cdist(Xtest, centers, "cosine")
Xtrain, Xtest = 1 - Xtrain, 1 - Xtest
print("Sorting...")
if args.top_centers is not None:
    # Column indices sorted by decreasing similarity, with the first
    # `top_centers` (most similar) columns dropped.
    furthest = (-Xtrain).argsort(axis=1)[:, args.top_centers:]
    # Row index repeated once per remaining column, flattened to pair up
    # with `cols`.
    rows = np.array([[i] * furthest.shape[1] for i in range(Xtrain.shape[0])]).ravel()
    cols = furthest.ravel()
strip_accents='ascii') count_vectorizer = vectorizer.fit_transform(text) transformer = TfidfTransformer(smooth_idf=True) tfidf = transformer.fit_transform(count_vectorizer) #LSA svd = TruncatedSVD(100) normalizer = Normalizer(copy=False) lsa = make_pipeline(svd, normalizer) reduced_matrix = lsa.fit_transform(tfidf) print reduced_matrix.shape km = SphericalKMeans(n_clusters=len(categories)) clusters = km.fit(reduced_matrix) centroids = km.cluster_centers_ labels = km.labels_ for c in range(len(categories)): idx = np.where(labels == c)[0] for l in idx: result[l] = c + 1 print len(result) print result score = normalized_mutual_info_score(cat_list, result) adjusted_score = adjusted_mutual_info_score(cat_list, result) print score print adjusted_score
centroid_file.write(str(len(doc_centroids[0])) + "\n") for cen in doc_centroids: for i in range(len(cen)): centroid_file.write(str(cen[i]) + "\n") centroid_file.close() ############################################################################################ centroids = [list() for x in range(11)] number_of_clusters = 20 for i in range(11): number_of_clusters = min(number_of_clusters, len(documents[i])) kmeans_clustering = SphericalKMeans(number_of_clusters) for i in range(11): idx = kmeans_clustering.fit(documents[i]) centroids[i] = idx.cluster_centers_ writeDocVectorCentroids(centroids[i], i) print str(i) + " clustered" ########################################################################################### ###################################Cosine Distance######################################### def cosineDistance(vector1, vector2, vectorSize): res = 0 norm1 = 0 norm2 = 0 for i in range(vectorSize): res += vector1[i] * vector2[i] for i in range(vectorSize):
def initialize(self):
    """Initialise the factors G1..G5 and the core matrices S12, S23, S34, S25.

    ``NMTF.R12`` is masked element-wise with ``self.M`` first. The factors
    are then built according to ``self.init_method``:

    * ``'random'``  -- uniform random matrices;
    * ``'skmeans'`` -- transposed spherical k-means centers of the data;
    * ``'acol'``    -- means over ``K`` random batches of columns;
    * ``'kmeans'``  -- means of the rows/columns grouped by k-means label.
    """
    self.R12_train = np.multiply(NMTF.R12, self.M)
    method = self.init_method
    ranks = self.K

    if method == 'random':
        # Random uniform
        self.G1 = np.random.rand(NMTF.n1, ranks[0])
        self.G2 = np.random.rand(NMTF.n2, ranks[1])
        self.G3 = np.random.rand(NMTF.n3, ranks[2])
        self.G4 = np.random.rand(NMTF.n4, ranks[3])
        self.G5 = np.random.rand(NMTF.n5, ranks[4])

    elif method == 'skmeans':
        # Spherical k-means clustering is done on the initial data; the
        # factor matrices are the (transposed) center coordinates.
        inputs = (self.R12_train.transpose(), self.R12_train,
                  NMTF.R23, NMTF.R34, NMTF.R25)
        centers = []
        for position, matrix in enumerate(inputs):
            skm = SphericalKMeans(n_clusters=ranks[position])
            skm.fit(matrix)
            centers.append(skm.cluster_centers_.transpose())
        self.G1, self.G2, self.G3, self.G4, self.G5 = centers

    elif method == 'acol':
        # random ACOL: "shuffle" the columns of the R matrices and take the
        # mean of k batches. All permutations are drawn before any factor
        # is built.
        sizes = (NMTF.n2, NMTF.n1, NMTF.n2, NMTF.n3, NMTF.n2)
        permutations = [np.random.permutation(size) for size in sizes]
        sources = (self.R12_train,
                   self.R12_train.transpose(),
                   NMTF.R23.transpose(),
                   NMTF.R34.transpose(),
                   NMTF.R25.transpose())
        factors = []
        for position, (perm, source) in enumerate(zip(permutations, sources)):
            batch_means = [np.mean(source[:, batch], axis = 1)
                           for batch in np.array_split(perm, ranks[position])]
            factors.append(np.array(batch_means).transpose())
        self.G1, self.G2, self.G3, self.G4, self.G5 = factors

    elif method == 'kmeans':
        # k-means: as for spherical k-means, the factor matrices are
        # initialised with the centers (group means) of the clusters.
        km1 = KMeans(n_clusters=ranks[0], n_init = 10).fit_predict(self.R12_train.transpose())
        km2 = KMeans(n_clusters=ranks[1], n_init = 10).fit_predict(self.R12_train)
        km3 = KMeans(n_clusters=ranks[2], n_init = 10).fit_predict(self.R23)
        km4 = KMeans(n_clusters=ranks[3], n_init = 10).fit_predict(self.R34)
        km5 = KMeans(n_clusters=ranks[4], n_init = 10).fit_predict(self.R25)

        def group_means(labels, pick, n_groups):
            # One column per cluster: the mean of the picked vectors whose
            # label equals that cluster.
            return np.array([
                np.mean([pick(i) for i, label in enumerate(labels) if label == p], axis = 0)
                for p in range(n_groups)
            ]).transpose()

        self.G1 = group_means(km1, lambda i: self.R12_train[:,i], ranks[0])
        self.G2 = group_means(km2, lambda i: self.R12_train[i], ranks[1])
        self.G3 = group_means(km3, lambda i: self.R23[i], ranks[2])
        self.G4 = group_means(km4, lambda i: self.R34[i], ranks[3])
        self.G5 = group_means(km5, lambda i: self.R25[i], ranks[4])

    # Core matrices derived from the factors.
    g2_t = self.G2.transpose()
    self.S12 = np.linalg.multi_dot([self.G1.transpose(), self.R12_train, self.G2])
    self.S23 = np.linalg.multi_dot([g2_t, self.R23, self.G3])
    self.S34 = np.linalg.multi_dot([self.G3.transpose(), self.R34, self.G4])
    self.S25 = np.linalg.multi_dot([g2_t, self.R25, self.G5])
# Build each document embedding as the running mean of its weighted term
# embeddings (weight = the document-term matrix entry).
docs, terms = doc_term_matrix.nonzero()
for doc, term in zip(docs, terms):
    # Inefficient way of calculating the average...
    doc_num_terms[doc] = doc_num_terms[doc] + 1
    seen = doc_num_terms[doc]
    # Running mean: new = (old * (n - 1) + x) / n.  The previous code
    # computed `old * n - 1`, which subtracted 1 from every component
    # instead of re-weighting the old mean.
    previous_sum = doc_embed_matrix[doc, :] * (seen - 1)
    contribution = doc_term_matrix[doc, term] * embed[term]
    doc_embed_matrix[doc, :] = (previous_sum + contribution) / seen
print('Formed document embedding matrix in {:.6f}s'.format(time() - t0))

# Cluster documents by spherical K-means (cluster centroids are projected
# onto the unit hypersphere)
t0 = time()
skm = SphericalKMeans(n_clusters=n_clusters)
skm.fit(doc_embed_matrix)
print('Clustered documents by spherical K-means in {:.6f}s'.format(time() - t0))

# Add column to dataframe to hold assigned cluster and output to CSV
grants['cluster'] = skm.labels_.tolist()
grants.to_csv(path_or_buf=output_data)

# Find terms associated with each cluster by applying Tf-idf to a corpus of
# documents that are the concatenation of all grants assigned to each cluster.
t0 = time()
cluster_docs = grants.groupby('cluster')['document'].agg(
    lambda x: ' '.join(x))
cluster_vectorizer = TfidfVectorizer(stop_words='english')
def kb_doc_clustering(train_data, cluster_num):
    """Fit spherical k-means with ``cluster_num`` clusters on ``train_data``.

    Progress messages are printed before and after fitting.

    Returns:
        The fitted clusterer.
    """
    model = SphericalKMeans(cluster_num)
    print('Start clustering...')
    model.fit(train_data)
    print('Done.')
    return model