def get_domi_color_new_image(image, n_clusters=2):
    '''
    INPUT:
        image: numpy array
        n_clusters: integer
    OUTPUT:
        domi_color: numpy array
    '''
    if len(image.shape) == 3:
        image = transform.resize(image, (300, 300, 3))
    else:
        return -1
    # Flatten the image matrix:
    nrow, ncol, depth = image.shape
    lst_of_pixels = [image[irow][icol] for irow in range(nrow) for icol in range(ncol)]
    # Clustering the colors of each pixel:
    kmean = KMeans(n_clusters=n_clusters)
    kmean.fit_transform(lst_of_pixels)
    domi_colors = kmean.cluster_centers_
    # Get the dominant color of the furniture (darker than the background):
    if np.mean(domi_colors[0]) < np.mean(domi_colors[1]):
        domi_color = domi_colors[0]
    else:
        domi_color = domi_colors[1]
    return domi_color
def mfcc_clustering(file_name, n_clusters):
    """
    From Prem
    :return:
    """
    clusterer = KMeans(n_clusters=n_clusters)
    print(file_name)
    mix, sr = librosa.load(file_name)
    mix_stft = librosa.stft(mix)
    comps, acts = find_template(mix_stft, sr, 100, 101, 0, mix_stft.shape[1])
    cluster_comps = librosa.feature.mfcc(S=comps)[1:14]
    save_mfcc_img(file_name[:-4] + "_mfcc.png", np.flipud(cluster_comps))
    clusterer.fit_transform(cluster_comps.T)
    labels = clusterer.labels_
    # print(labels)
    sources = []
    for cluster_index in range(n_clusters):
        indices = np.where(labels == cluster_index)[0]
        template, residual = extract_template(comps[:, indices], mix_stft)
        t = librosa.istft(template)
        sources.append(t)
    return np.array(sources)
def run_kmeans(vector=None, links=None, iters=500, clusters=8):
    if links is None:
        links = []
    # KMeans takes max_iter (not max_iters); fit on the vector argument
    km = KMeans(n_clusters=clusters, max_iter=iters)
    km.fit_transform(vector)
    cluster_links = defaultdict(list)
    for i in range(len(links)):
        cluster_links[km.labels_[i]].append(links[i])
    for x in cluster_links:
        print(x, cluster_links[x])
    return km.labels_
def get_kmean_clusters(self, X):
    '''
    Returns labels of kmeans clustering
    INPUTS: X = feature matrix as 2d numpy float array
    OUTPUTS: KMeans cluster labels as 1d numpy array of integers
    '''
    kmeans = KMeans(5)
    kmeans.fit_transform(X)
    return kmeans.labels_
def wrapper_scikit(K):
    pics_t = np.empty((pics.shape[0], np.power(pics.shape[1], 2)))
    for i in range(pics_t.shape[0]):
        pics_t[i] = pics[i].flatten()
    time1 = time.time()
    kmean = KMeans(init='random', n_clusters=K)
    kmean.fit_transform(pics_t)
    time2 = time.time()
    return (time2 - time1) * 1000.
def findElbow(features, n=10):
    error = []
    for i in range(n):
        km = KMeans(n_clusters=i + 1)
        km.fit_transform(features)
        error.append(kmeansError(features, km))
    plt.figure(figsize=(10, 10))
    plt.plot(range(1, n + 1), error, 'k', linewidth=10)
    plt.plot(range(1, n + 1), error, 'ko', markersize=25)
    plt.show()
def get_kmean_model(X, true_k, n_init=10, verbose=False):
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100,
                n_init=n_init, verbose=verbose)
    km.fit_transform(X)
    return km
def train_model(texts, points, num_classses, model_dir, text_encoding='utf-8'):
    """
    Given an iterable of (text, lat, lon) items, cluster the points into #num_classses
    and use them as labels, then extract unigram features, train a classifier and save
    it in model_dir for future use.

    Args:
        texts -- an iterable (e.g. a list) of texts e.g. ['this is the first text', 'this is the second text'].
        points -- an iterable (e.g. a list) of tuples in the form of (lat, lon) where coordinates are of type float
                  e.g. [(1.2343, -10.239834), (5.634534, -12.47563)]
        num_classses -- the number of desired clusters/labels/classes of the model.
        model_dir -- the directory within models/ where the model will be saved.
    """
    if os.path.exists(model_dir):
        logging.error("Model directory " + model_dir + " already exists, please try another address.")
        sys.exit(-1)
    else:
        os.mkdir(model_dir)

    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import SGDClassifier
    from scipy.sparse import csr_matrix

    kmeans = KMeans(n_clusters=num_classses, random_state=0)
    points_arr = numpy.array(points)
    kmeans.fit_transform(points_arr)
    cluster_centers = kmeans.cluster_centers_
    sample_clusters = kmeans.labels_
    label_coordinate = {}
    for i in range(cluster_centers.shape[0]):
        lat, lon = cluster_centers[i, 0], cluster_centers[i, 1]
        label_coordinate[i] = (lat, lon)

    logging.info('extracting features from text...')
    vectorizer = TfidfVectorizer(encoding=text_encoding, stop_words='english', ngram_range=(1, 1),
                                 max_df=0.5, min_df=0, binary=True, norm='l2',
                                 use_idf=True, smooth_idf=True, sublinear_tf=True)
    X_train = vectorizer.fit_transform(texts)
    Y_train = sample_clusters
    vectorizer.stop_words_ = None
    logging.info('the number of samples is %d and the number of features is %d' % (X_train.shape[0], X_train.shape[1]))

    logging.info('training the classifier...')
    logging.warning('Note that alpha (regularisation strength) should be tuned based on the performance on validation data.')
    # n_iter was renamed to max_iter in current scikit-learn
    clf = SGDClassifier(loss='log', penalty='elasticnet', alpha=5e-5, l1_ratio=0.9,
                        fit_intercept=True, max_iter=5, n_jobs=2, random_state=0, learning_rate="optimal")
    clf.fit(X_train, Y_train)
    clf.coef_ = csr_matrix(clf.coef_)

    logging.info('retrieving address of the given points using geopy (requires internet access).')
    coordinate_address = retrieve_location_from_coordinates(label_coordinate.values())

    logging.info('dumping the vectorizer, clf (trained model), label_coordinates and coordinate_locations into pickle files in ' + model_dir)
    dump_model(clf, vectorizer, coordinate_address, label_coordinate, model_dir)
def kmeans(embedding, n_components, mask):
    import numpy as np
    from sklearn.cluster import KMeans
    all_vertex = range(embedding.shape[0])
    masked_embedding = np.delete(embedding, mask, 0)
    cortex = np.delete(all_vertex, mask)
    est = KMeans(n_clusters=n_components, n_jobs=-2, init='k-means++', n_init=300)
    est.fit_transform(masked_embedding)
    labels = est.labels_
    kmeans_results = labels.astype(np.float)
    kmeans_recort = recort(len(all_vertex), kmeans_results, cortex, 1)
    return kmeans_recort
def best_lda_cluster_wine(self):
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()

    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ##
    ## K-Means
    ##
    km = KMeans(n_clusters=4, algorithm='full')
    X_train_transformed = km.fit_transform(X_train_scl)
    X_test_transformed = km.transform(X_test_scl)

    # save
    filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)

    ##
    ## GMM
    ##
    gmm = GaussianMixture(n_components=4, covariance_type='full')
    # use the fitted GMM (the original mistakenly reused the K-Means model here);
    # GaussianMixture has no fit_transform, so use the per-component probabilities
    gmm.fit(X_train_scl)
    X_train_transformed = gmm.predict_proba(X_train_scl)
    X_test_transformed = gmm.predict_proba(X_test_scl)

    # save
    filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def decompose_map(map1, method, r=40, out='inter'): map1.reset_solution() if method == 'EIG': map1.decompose('EIG', dim_num=r) elif method == 'PCA': map1.decompose('PCA', dim_num=r) elif method == 'ICE': map1.decompose('ICE', dim_num=r) elif method == 'K-means': from k_means_pdist import kmeanssample DIST = -np.array(map1.contact_map) ## simi to dist centres, xtoc, dist = kmeanssample(DIST, np.eye(DIST.shape[0]), r, nsample=0, delta=0.001, maxiter=20, verbose=0) map1.contact_group = -np.matrix(dist) ## dist to simi elif method == '3D-K-means': km = KMeans(n_clusters=r) dfile = 'pdb.txt' pb, vx = map1.get_locations(dfile, st=1, ch=0, po=1, nm=2, add=0) pb, vy = map1.get_locations(dfile, st=1, ch=0, po=1, nm=3, add=0) pb, vz = map1.get_locations(dfile, st=1, ch=0, po=1, nm=4, add=0) X = np.zeros((map1.contact_map.shape[0], 3)) C = np.zeros(map1.contact_map.shape[0]) for i,x,y,z in zip(pb,vx,vy,vz): X[i,0] = x X[i,1] = y X[i,2] = z C[i] += 1 C[C==0] = 1 X /= C[:,np.newaxis] map1.contact_group = -np.matrix(km.fit_transform(X)) elif method == 'NMF': map1.decompose('NND', dim_num=r) map1.decompose('NMF-Gaussian', dim_num=r) map1.contact_group = np.dot(map1.contact_group, map1.group_map) elif method == 'BNMF': map1.decompose('NND', dim_num=r) map1.decompose('NMF-PoissonManifoldEqual', dim_num=r, par_lam=0) map1.contact_group = np.dot(map1.contact_group, map1.group_map) elif method == 'Random': n = map1.contact_map.shape[0] map1.contact_group = np.zeros((n,r)) from math import ceil size = int(ceil(n/float(r))) for i in xrange(n): map1.contact_group[i, i/size] = 1 elif method == 'Armatus': from run_armatus import Armatus map1.save() map2 = Armatus('../tools/armatus2.1/armatus', name=map1.name) map2.load() map2.decompose() map1.contact_group = map2.contact_group elif method == 'TAD': from run_domaincall import DomainCall map1.save() map2 = DomainCall('../tools/domaincall/', name=map1.name) map2.load() map2.decompose() map1.contact_group = map2.contact_group else: raise ValueError('Unknow method name '+method)
def run(lines, vectorizerCls):
    print(TIMENOW(), 'VECTORIZE', '-' * 42)
    vectorizer = vectorizerCls(stop_words=['le', 'de', 'la', 'les', 'je', 'un', 'une', 'des',
                                           'est', 'et', 'il', 'elle', 'du', 'ai', 'au'])
    data = vectorizer.fit_transform(lines)
    num_samples, num_features = data.shape
    print("#samples: %d, #features: %d" % (num_samples, num_features))
    # e.g. "#samples: 5, #features: 25" or "#samples: 2, #features: 37"
    print(TIMENOW(), 'KMEANS', '-' * 42)
    km = KMeans(n_clusters=n_clusters)
    res = km.fit_transform(data)
    labels = km.labels_
    labels_shape = km.labels_.shape
    print("labels : ", labels)
    print("labels_shape : ", labels_shape)
    print(TIMENOW(), 'DONE', '-' * 42)
    print("Top terms per cluster:")
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    result = dict()
    for i in range(n_clusters):
        result[i] = list()
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :25]:
            print(' %s' % terms[ind], end='\n')
            result[i].append(terms[ind])
        print()
    return result
def KinKmeans(var, nk=False, tol=1e-4, n_init=100): ''' Uses pseudo-F to estimate the best number of K in K-Means From MJCarvalho GapStatistics :param numpy var: Numpy array with input data :param int nk: Initial number of K :param float tol: Tolerance for K-Means :param int n_init: Number of initializations for K-Means :return int: Number of K and f statistic ''' from sklearn.cluster import KMeans Nd = np.size(var, axis=0) S = np.zeros(Nd) f = np.zeros(Nd) alpha = np.zeros(Nd) if not nk: term = 3 else: term = nk kink = [0] i = 0 while len(kink) <= term: ## Kmeans kmeans = KMeans(init='k-means++', n_clusters=i+1, n_init=n_init, tol=tol) T = kmeans.fit_transform(var, y=None) I = np.nansum(T**2, axis=0) S[i] = np.nansum(I, axis=0) ## Det. Alpha if i == 1: alpha[i] = 1.0 - (3.0/(4.0*Nd)) elif i > 1: alpha[i] = alpha[i-1] + (1-alpha[i-1])/6.0 ## Det. f(k) if i == 0: f[i] = 1 else: f[i] = S[i] / (alpha[i] * S[i-1]) if not nk: kink = np.arange(len(f))[ np.r_[True, f[1:] < f[:-1]] & np.r_[f[:-1] <= f[1:], True] | np.r_[True, f[1:] <= f[:-1]] & np.r_[f[:-1] < f[1:], True] ] else: kink.append(0) i += 1 return kink[1], f
def clusterGoalies(df, idx, numOfClusters):
    model = KMeans(n_clusters=numOfClusters, n_init=20)
    distMat = model.fit_transform(df)
    resultList = [[] for i in range(numOfClusters)]
    for i, rowList in enumerate(distMat):
        minIndex = min(enumerate(rowList), key=lambda x: x[1])[0]
        resultList[minIndex].append(idx[i])
    return resultList
def make_cluster(df):
    cluster_df = pd.DataFrame()
    clusters = KMeans(n_clusters=4)
    # cluster the frame passed in (the original referenced a global cust_data_transform instead of df)
    distance_matrix = clusters.fit_transform(df)
    cluster_df["cluster"] = clusters.labels_
    # Finding the euclidean distance from the point to its cluster center
    cluster_df["dist"] = [min(x) for x in distance_matrix]
    return cluster_df, clusters.cluster_centers_
def vectorize(self, term_docs, n_clusters=8):
    self.n_clusters = n_clusters
    tf = TfidfVectorizer()
    X = tf.fit_transform(term_docs)
    km = KMeans(n_clusters=n_clusters)
    x = km.fit_transform(X)
    self.labels = km.labels_
    return km.labels_
def kcluster(dataframe, n=3, n_clusters=5):
    X_centered = preprocessing.scale(dataframe.fillna(0))
    pca = decomposition.PCA(n_components=n)
    X_pca = pca.fit_transform(X_centered)
    kpy.plot_k_sse(X_pca)
    k = KMeans(n_clusters=n_clusters)
    km = k.fit_transform(X_pca)
    plt.hist(k.labels_)
    return pca, X_pca, k, km
def make_clustering(data_frame, number_of_clusters):
    # initializing KMeans object, computing clustering and transforming X to cluster-distance space
    k_means_model = KMeans(n_clusters=number_of_clusters)
    distances = k_means_model.fit_transform(data_frame.iloc[:, 2:])
    # adding to our data frame information about each unit's cluster and distances to every cluster
    data_frame["cluster"] = k_means_model.labels_
    for i in range(number_of_clusters):
        data_frame["dist " + str(i) + " cluster"] = distances[:, i]
    return data_frame
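# A brief usage sketch for make_clustering above; the column names, values and
# cluster count are invented for illustration, and pandas/KMeans are assumed to
# be imported as in the surrounding snippets.
example_frame = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['a', 'b', 'c', 'd'],
    'f1': [0.1, 0.2, 5.0, 5.1],
    'f2': [1.0, 1.1, 9.0, 9.2],
})
# adds a "cluster" column plus one "dist i cluster" column per cluster
print(make_clustering(example_frame, number_of_clusters=2))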
def runKMeansSKLearn(X, k=None):
    kmeans = KMeans(n_clusters=k, n_jobs=-1)
    # fit once; calling fit_predict and fit_transform separately re-initialises
    # the model and can yield inconsistent cluster assignments
    cluster_distance = kmeans.fit_transform(X).tolist()
    clusters = kmeans.labels_.tolist()
    cluster_centers = kmeans.cluster_centers_
    coords = []
    for cluster in clusters:
        coord = [cluster_centers[cluster, 0], cluster_centers[cluster, 1]]
        coords.append(coord)
    return [None, coords]
def getWordCentroidMap(self, model, num_clusters):
    start = time.time()
    word_vectors = model.syn0
    print("begin to clustering to gaining code book")
    kmeans_clustering = KMeans(n_clusters=num_clusters)
    # fit_transform returns distances; the word -> centroid map needs the cluster labels
    idx = kmeans_clustering.fit_predict(word_vectors)
    end = time.time()
    elapsed = end - start
    print("Time takes for K means clustering: {} seconds".format(elapsed))
    return dict(zip(model.index2word, idx))
def cluster_points(points, number_of_clusters):
    '''This function should take a list of points (in two dimensions) and
    return a list of clusters, each of which is a list of points. For example,
    if you passed in [(0, 0), (-0.1, 0.1), (2, 3), (2.1, 3)] with
    number_of_clusters set to 2, it should return
    [[(0, 0), (-0.1, 0.1)], [(2, 3), (2.1, 3)]].'''
    model = KMeans(n_clusters=number_of_clusters)
    distMat = model.fit_transform(points)
    resultList = [[] for i in range(number_of_clusters)]
    for i, rowList in enumerate(distMat):
        minIndex = min(enumerate(rowList), key=lambda x: x[1])[0]
        resultList[minIndex].append(points[i])
    return resultList
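# A minimal usage sketch for cluster_points above, assuming KMeans is imported
# as in the rest of these snippets; it reproduces the example from the docstring
# (the ordering of the two returned clusters depends on the KMeans initialisation).
example_points = [(0, 0), (-0.1, 0.1), (2, 3), (2.1, 3)]
example_clusters = cluster_points(example_points, number_of_clusters=2)
print(example_clusters)  # e.g. [[(0, 0), (-0.1, 0.1)], [(2, 3), (2.1, 3)]]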
def Analysis(vector, K=2):
    arr = np.array(vector)
    # mean normalization of the data: converts each feature to zero mean, unit variance
    sc = StandardScaler()
    x = sc.fit_transform(arr)
    # Breaking into principal components
    pca = PCA(n_components=2)
    components = pca.fit_transform(x)
    # Applying kmeans algorithm for finding centroids
    kmeans = KMeans(n_clusters=K, n_jobs=-1)
    kmeans.fit_transform(components)
    print("labels: ", kmeans.labels_)
    centers = kmeans.cluster_centers_
    # labels are assigned by the algorithm; with 2 clusters the labels are 0 or 1
    labels = kmeans.labels_
    colors = ["r.", "g.", "b.", "y.", "c."]
    colors = colors[:K + 1]
    for i in range(len(components)):
        plt.plot(components[i][0], components[i][1], colors[labels[i]], markersize=10)
    plt.scatter(centers[:, 0], centers[:, 1], marker="x", s=150, linewidths=10, zorder=15)
    plt.xlabel("1st Principal Component")
    plt.ylabel("2nd Principal Component")
    title = "Styles Clusters"
    plt.title(title)
    plt.savefig("Results" + ".png")
    plt.show()
def main():
    listCorpus, listSize = readFile()
    embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    # Corpus with example sentences
    corpus = [
        'A man is eating food.',
        'A man is eating a piece of bread.',
        'A man is eating pasta.',
        'The girl is carrying a baby.',
        'The baby is carried by the woman',
        'A man is riding a horse.',
        'A man is riding a white horse on an enclosed ground.',
        'A monkey is playing drums.',
        'Someone in a gorilla costume is playing a set of drums.',
        'A cheetah is running behind its prey.',
        'A cheetah chases prey on across a field.'
    ]
    # listCorpus = corpus
    # listSize = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    corpus_embeddings = embedder.encode(listCorpus)
    num_clusters = 150
    clustering_model = KMeans(n_clusters=num_clusters)
    cluster_dist = clustering_model.fit_transform(corpus_embeddings)
    cluster_dist = cluster_dist.min(1)
    cluster_assignment = clustering_model.labels_
    final_assignment = -1 * np.ones(len(listCorpus))
    keywords_list = []
    for i in range(0, num_clusters):
        theta = (cluster_dist * (cluster_assignment == i))
        if len(np.nonzero(theta)[0]) == 0:
            continue
        idx = np.where(theta == np.min(theta[np.nonzero(theta)]))
        final_assignment[cluster_assignment == i] = idx[0][0]
        keywords_list.append(listCorpus[idx[0][0]])
    final_assignment1 = [listCorpus[int(i)] for i in final_assignment]
    start = 0
    line_no = 0
    # with open('citi_file_cluster.csv', 'w', newline='') as write_file:
    #     for i in range(0, len(listSize)):
    #         writer = csv.writer(write_file)
    #         col1 = "/".join(listCorpus[start:start + listSize[line_no]])
    #         col2 = "/".join(final_assignment1[start:start + listSize[line_no]])
    #         col3 = "/".join(list(set(final_assignment1[start:start + listSize[line_no]])))
    #         start += listSize[line_no]
    #         writer.writerow([col1, col2, col3])
    #         line_no += 1
    # this with-block must stay active so write_file is defined for the writer calls below
    with open('keywords_list.csv', 'w', newline='') as write_file:
        writer = csv.writer(write_file)
        writer.writerow(keywords_list)
def do_nmf_and_clustering(input_file, n_clusters):
    """
    :return: (separated sources, MFCC components used for clustering)
    """
    clusterer = KMeans(n_clusters=n_clusters)
    mix, sr = librosa.load(input_file)
    mix_stft = librosa.stft(mix)
    comps, acts = find_template(mix_stft, sr, 100, 101, 0, mix_stft.shape[1])
    cluster_comps = librosa.feature.mfcc(S=comps)[1:14]
    clusterer.fit_transform(cluster_comps.T)
    labels = clusterer.labels_
    sources = []
    for cluster_index in range(n_clusters):
        indices = np.where(labels == cluster_index)[0]
        template, residual = extract_template(comps[:, indices], mix_stft)
        t = librosa.istft(template)
        sources.append(t)
    return np.array(sources), cluster_comps
def kmeans_():
    # use features for clustering
    from sklearn.cluster import KMeans
    km = KMeans(n_clusters=N, init='k-means++')
    # features = np.reshape(x_train, newshape=(features.shape[0], -1))
    km_trans = km.fit_transform(x_train)
    pred = km.predict(x_train)
    print(pred.shape)
    print('acc=', met.acc(y_train, pred), 'nmi=', met.nmi(y_train, pred), 'ari=', met.ari(y_train, pred))
    return km_trans, pred
def CQ_ABC(img, K):
    # init
    n_solutions = 100
    solutions = np.random.randint(0, 255, size=(n_solutions, K, 3))
    MCN = 100
    pixels = np.reshape(img, newshape=(img.shape[0] * img.shape[1], 3))
    fitness_array = np.zeros(shape=n_solutions)

    for n in range(MCN):
        # employed bee
        for solution in solutions:
            k_means = KMeans(n_clusters=K, init=solution, max_iter=5)
            # mapping and evaluation
            mapped = k_means.fit_transform(pixels)
def vectorize(self, term_docs, n_clusters=8):
    self.n_clusters = n_clusters
    tf = TfidfVectorizer()
    X = tf.fit_transform(term_docs)
    km = KMeans(n_clusters=n_clusters)
    artist_distance = km.fit_transform(X)
    # ipdb.set_trace()
    self.labels = km.labels_
    self.km = km
    return km.labels_, artist_distance
def clustering(atributes, amount_centroides):
    centroides = atributes[np.random.choice(atributes.shape[0], amount_centroides, replace=False)]
    kmeans = KMeans(n_clusters=amount_centroides, init=centroides, max_iter=500, n_init=1, random_state=0)
    distances = kmeans.fit_transform(atributes)
    centros = kmeans.cluster_centers_
    return kmeans, distances, centros
def add_kmeans(clusters_list, group, X, results_df):
    for clusters in clusters_list:
        km = KMeans(n_clusters=clusters, random_state=0, n_init=10, algorithm='auto', n_jobs=-1)
        y_km_transform = km.fit_transform(X)
        y_km_labels = km.labels_
        results_df[group + ' ' + str(clusters) + ' K-Means'] = y_km_labels
    return results_df
def Semi_supervised_learning(self):
    from sklearn.datasets import load_digits
    X_digits, y_digits = load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits, test_size=0.33, random_state=42)

    from sklearn.linear_model import LogisticRegression
    n_labeled = 50
    log_reg = LogisticRegression(random_state=42)
    print(X_train.shape)
    log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])
    print(log_reg.score(X_test, y_test))

    kmeans = KMeans(n_clusters=n_labeled, random_state=42)
    X_digits_dist = kmeans.fit_transform(X_train)
    print(X_digits_dist.shape)
    # argmin over the sample axis: the training instance closest to each centroid
    representative_digits_idx = np.argmin(X_digits_dist, axis=0)
    X_representative_digits = X_train[representative_digits_idx]

    plt.figure(figsize=(8, 2))
    for index, X_representative_digit in enumerate(X_representative_digits):
        plt.subplot(n_labeled // 10, 10, index + 1)
        plt.imshow(X_representative_digit.reshape(8, 8), cmap="binary", interpolation="bilinear")
        plt.axis('off')
    # plt.show()

    # retrain using only the instances closest to the kmeans centroids
    log_reg = LogisticRegression(random_state=42)
    log_reg.fit(X_train[representative_digits_idx], y_train[representative_digits_idx])
    print(log_reg.score(X_test, y_test))

    # manually assigned labels for the representative digits, propagated through the kmeans clusters
    y_representative_digits = np.array([
        4, 8, 0, 6, 8, 3, 7, 7, 9, 2,
        5, 5, 8, 5, 2, 1, 2, 9, 6, 1,
        1, 6, 9, 0, 8, 3, 0, 7, 4, 1,
        6, 5, 2, 4, 1, 8, 6, 3, 9, 2,
        4, 2, 9, 4, 7, 6, 2, 3, 1, 1
    ])
    y_train_propagated = np.empty(len(X_train), dtype=np.int32)
    for i in range(50):
        y_train_propagated[kmeans.labels_ == i] = y_representative_digits[i]
def do_kmeans_analysis(data_frame: DataFrame, clusters_number: int) -> KMeansAnalysisResult:
    k_means: KMeans = None
    if clusters_number > 0:
        k_means = KMeans(n_clusters=clusters_number)
    else:
        k_means = KMeans()
    labels_mapping = normalise_data_frame(data_frame.iloc[:, :3])
    columns = labels_mapping.columns
    k_means.fit_transform(labels_mapping)
    labels_mapping['Label'] = MinMaxScaler().fit_transform(k_means.labels_.reshape(-1, 1))
    centroids = DataFrame(data=k_means.cluster_centers_, columns=columns)
    return KMeansAnalysisResult(labels_mapping=labels_mapping,
                                labels_mapping_labels=get_columns_labels(data_frame, columns),
                                centroids=centroids)
def kmeans_cleaning(self, use_cache, cache): if self.USE_KMEANS and not use_cache: tfidf_info, tdmatrix, _ = self.makeTermDocMatrix(self.X_train) # print(tdmatrix) kmeans = KMeans(self.N_CLUSTERS, n_jobs=-1) X_new = kmeans.fit_transform( tdmatrix ) # hope they don't mess with indices of training data. corressponding_dists_with_indices_not_messed_hopefully = [ X_new[(i, x)] for i, x in enumerate(kmeans.labels_) ] print("len of list", len(corressponding_dists_with_indices_not_messed_hopefully)) print("max threshold,",max(corressponding_dists_with_indices_not_messed_hopefully),"\n",\ "min threshold:", min(corressponding_dists_with_indices_not_messed_hopefully)) # plt.hist(corressponding_dists_with_indices_not_messed_hopefully) # plt.show() self.cleanedId = (np.where( np.array(corressponding_dists_with_indices_not_messed_hopefully ) < self.THRESHOLD)[0]).tolist() for cl in self.classes: self.docsId[cl] = set(self.docsId[cl]).intersection( self.cleanedId) # updating docsId # need to update X_train, y_train also. # print(type(cleanedId)) print("Number of new documents:", len(self.cleanedId)) # print(self.cleanedId[:5]) self.X_train = (np.array(self.X_train)[self.cleanedId]).tolist() self.y_train = (np.array(self.y_train)[self.cleanedId]).tolist() elif self.USE_KMEANS and use_cache: self.X_train, self.y_train, self.cleanedId = cache self.docs = { cl: [self.idToDoc[x] for x in self.docsId[cl]] for cl in self.classes } self.LEN_OF_CLASS = {} for cl in self.classes: self.LEN_OF_CLASS[cl] = len(self.docs[cl]) self.TOTAL_DOCS = sum(len(self.docs[cl]) for cl in self.classes) # to be collected outside or may be inside. return self.X_train, self.y_train, self.cleanedId
def evaluate_components(A, Yr, psx):
    # clustering components
    Ys = Yr
    psx = cse.pre_processing.get_noise_fft(Ys, get_spectrum=True)
    # [sn,psx] = get_noise_fft(Ys,options); P.sn = sn(:); fprintf(' done \n');  (original MATLAB)
    psdx = np.sqrt(psx[:, 3:])
    X = psdx[:, 1:np.minimum(np.shape(psdx)[1], 1500)]
    # P.psdx = X
    X = X - np.mean(X, axis=1)[:, np.newaxis]            # center
    X = X / (1e-5 + np.std(X, axis=1)[:, np.newaxis])

    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA, NMF
    from sklearn.mixture import GMM

    pc = PCA(n_components=5)
    nmf = NMF(n_components=2)
    nmr = nmf.fit_transform(X)
    cp = pc.fit_transform(X)

    gmm = GMM(n_components=2)
    Cx1 = gmm.fit_predict(cp)
    L = gmm.predict_proba(cp)

    # k-means on the PCA projection (the original also fit on X, then discarded that result)
    km = KMeans(n_clusters=2)
    km.fit_transform(cp)
    Cx = km.cluster_centers_
    L = km.labels_
    ind = np.argmin(np.mean(Cx[:, -49:], axis=1))
    active_pixels = (L == ind)
    centroids = Cx
def clusterKMeans(indexer, function, clusters, seeds):
    # First, set up the data correctly.
    # Import the vectorization as done by the indexing.
    normalized = indexer.get_normalized_paper_values("paper_text", function)
    # Use the results found in the indexing as the vector.
    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(normalized.values())

    # Cluster documents
    model = KMeans(n_clusters=clusters, init='k-means++', n_init=seeds)
    model.fit_transform(X)

    # Top terms per cluster: getting and sorting centroids and terms
    print("Top terms per cluster:")
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()

    # Top terms per cluster: actually printing them
    for i in range(clusters):
        print("Cluster %d:" % i, '\n')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], )
        print('\n')

    # Getting and printing the clusters and amount of points in them
    labels, counts = np.unique(model.labels_[model.labels_ >= 0], return_counts=True)
    for i in range(clusters):
        print('Cluster %d has %d points in it.' % (labels[i], counts[i]))

    # Computing and printing silhouette score
    sil_coeff = silhouette_score(X, model.labels_, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(clusters, sil_coeff))
    return X, model
def tfIdf_Kmeans(texts, clusters):
    """
    Transform texts to Tf-Idf coordinates and cluster texts using K-Means
    """
    print("def tfIdf_Kmeans(texts, clusters):")
    # vectorizer = TfidfVectorizer(tokenizer=process_text,
    #                              stop_words=stopwords.words('portuguese'),
    #                              max_df=0.5,
    #                              min_df=0.1,
    #                              lowercase=True)

    # experiment 1
    vectorizer = TfidfVectorizer()
    # experiment 2
    # vectorizer = TfidfVectorizer(max_df=0.6, min_df=0.3)
    # experiment 3
    # vectorizer = TfidfVectorizer(max_df=0.6, min_df=0.3)

    tfidf_model = vectorizer.fit_transform(texts)
    # km_model = MiniBatchKMeans(n_clusters=clusters)

    # ideal value, after experiments = 100000
    km_model = KMeans(n_clusters=clusters, n_init=100000)
    # value for testing only:
    # km_model = KMeans(n_clusters=clusters, n_init=1)

    km_model.fit_transform(tfidf_model)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    return clustering
def GP_SE(dealer):
    df = data_dropped[data_dropped['dealer'] == dealer]
    df.sort_values(['year', 'month'], inplace=True)
    # If a column is all nan, the imputer would drop the feature and the dimensions wouldn't match.
    try:
        df.iloc[:, 1:] = SimpleImputer(missing_values=np.nan, strategy='median').fit_transform(np.array(df.iloc[:, 1:]))
    except:
        return [dealer]

    low_whisker = df.iloc[:, 1:5].quantile(0.01)
    mask_1 = (df.iloc[:, 1:5] < low_whisker)
    df.iloc[:, 1:5] = np.where(mask_1, low_whisker, df.iloc[:, 1:5])
    high_whisker = df.iloc[:, 1:5].quantile(0.99)
    mask_2 = (df.iloc[:, 1:5] > high_whisker)
    df.iloc[:, 1:5] = np.where(mask_2, high_whisker, df.iloc[:, 1:5])

    df['GPNV'] = df['R27'] / df['U27']
    df['SE'] = df['MV1'] / df['EXP1']
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)
    corr = df[['GPNV', 'SE']].corr().iloc[0, 1]

    X_GPNV = MinMaxScaler().fit_transform(np.array(df['GPNV']).reshape(-1, 1))
    y_SE = MinMaxScaler().fit_transform(np.array(df['SE']).reshape(-1, 1))
    df['GPNV_scaled'] = X_GPNV
    df['SE_scaled'] = y_SE

    X_2 = PolynomialFeatures(degree=2).fit_transform(X_GPNV)
    regress_2 = linear_model.LinearRegression()
    regress_2.fit(X_2, y_SE)
    X_3 = PolynomialFeatures(degree=3).fit_transform(X_GPNV)
    regress_3 = linear_model.LinearRegression()
    regress_3.fit(X_3, y_SE)

    k = KMeans(1)
    radius = k.fit_transform(df[['GPNV_scaled', 'SE_scaled']]).max()
    centroid = k.cluster_centers_
    # cluster_centers_ has shape (1, 2); index the single row for the x and y coordinates
    collector = [
        dealer, df.shape[0], df['SE'].mean(), df['GPNV'].mean(), corr, radius,
        centroid[0, 0], centroid[0, 1]
    ]
    collector.extend(list(regress_2.coef_[0, :]))
    collector.append(regress_2.score(X_2, np.array(df['SE'])))
    collector.extend(list(regress_3.coef_[0, :]))
    collector.append(regress_3.score(X_3, np.array(df['SE'])))
    return df, collector
def begin(self, inarray):
    inarray = self.make2d(inarray)
    inarray = normalize(inarray)
    inarray = self.make2d(inarray)
    for i in range(2, 12):
        kd = KMeans(n_clusters=i)
        temparray = inarray
        result = kd.fit_transform(temparray)
        params = kd.get_params()
        print('\n\n\n======================================\n\n\n')
        print("variance = {0}".format(result.var()))
        print(params)
        # include the cluster count in the saved name (the original format call had no placeholder)
        fIO.FileIO().saveWork((result, params, kd), 'kmeansfit_fulldata_{0}'.format(i), 2)
    input("press any key to exit...")
def run_Kmeans(X_train, X_test, y_train, y_test, experiment_number, dataset_name, neighbors=None):
    if neighbors:
        algorithm = KMeans(random_state=0, n_clusters=neighbors)
    else:
        algorithm = KMeans(random_state=0)
    # fit on the training data only, then transform the test data with the same centroids
    transformed_X_train = algorithm.fit_transform(X_train)
    transformed_X_test = algorithm.transform(X_test)
    df = pd.DataFrame()
    title = "Kmeans"
    confidence = algorithm.score(X_test, y_test)
    inertia = algorithm.inertia_
    if neighbors and neighbors >= 2:
        df['label'] = pd.Series([i[0] for i in y_train.tolist()])
        df['comp-one'] = transformed_X_train[:, 0]
        df['comp-two'] = transformed_X_train[:, 1]
        transformed_df = pd.DataFrame(transformed_X_train)
        c = transformed_df.corr().abs()
        s = c.unstack()
        so = s.sort_values(kind="quicksort")
        so = so[so != 1]
        max_cross_section = so.idxmax()
        min_cross_section = so.idxmin()
        for cross_section in [max_cross_section, min_cross_section]:
            plot_cross_section(transformed_X_train, cross_section, title, neighbors,
                               experiment_number, dataset_name)
    return confidence, inertia, transformed_X_train, transformed_X_test, df
class KMeansClustering(AnomalyModel):
    def __init__(self, anomaly_dict, settings, features):
        super().__init__(anomaly_dict, settings, features)
        self.kmeans = KMeans(n_clusters=settings.n_clusters,
                             init=settings.init,
                             n_init=settings.n_init,
                             max_iter=settings.max_iter,
                             tol=settings.tol,
                             verbose=settings.verbose)  # algorithm='full'

    def fit_Kmeans(self):
        self.kmeans.fit_transform(self.X.transpose())
        # print(self.kmeans.labels_)

    def predict(self, x):
        x_dict = x.get_feature_dict()
        return self.kmeans.predict(
            np.array([x_dict[feature] for feature in self.features]).reshape(1, -1))

    def send_labels(self):
        return self.kmeans.labels_
def add_clusters_to_data_kmeans():
    x_train = data.DATA['fashion']['base']['x_train']
    x_test = data.DATA['fashion']['base']['x_test']

    # KMeans (k = 4) on train
    kmeans = KMeans(n_clusters=4, random_state=SEED)
    x_train_transformed = kmeans.fit_transform(x_train)
    x_train_new = pd.concat([x_train, pd.DataFrame(x_train_transformed)], axis=1)
    scaler_train = StandardScaler()
    x_train_new_scaled = scaler_train.fit_transform(x_train_new)
    pd.DataFrame(x_train_new_scaled).to_csv(f'{DATA_FOLDER}/fashion_aug_kmeans_x_train.csv')

    # KMeans (k = 4) on test
    kmeans = KMeans(n_clusters=4, random_state=SEED)
    x_test_transformed = kmeans.fit_transform(x_test)
    x_test_new = pd.concat([x_test, pd.DataFrame(x_test_transformed)], axis=1)
    scaler_test = StandardScaler()
    x_test_new_scaled = scaler_test.fit_transform(x_test_new)
    pd.DataFrame(x_test_new_scaled).to_csv(f'{DATA_FOLDER}/fashion_aug_kmeans_x_test.csv')
def simple_k_means(X: pd.DataFrame, n_clusters=3, score_metric='euclidean') -> Dict:
    model = KMeans(n_clusters=n_clusters)
    clusters = model.fit_transform(X)
    labels = model.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print(labels)
    print('Estimated number of clusters:', n_clusters)

    # There are many methods of deciding a score of a cluster model. Here is one example:
    score = metrics.silhouette_score(X, model.labels_, metric=score_metric)

    return dict(model=model, score=score, clusters=clusters, labels=labels)
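# A hedged usage sketch for simple_k_means above; the iris data is only an
# illustrative stand-in (it is not referenced by the original snippet), and the
# call assumes sklearn's `metrics` module and pandas are imported as the function
# body expects.
from sklearn import datasets, metrics
import pandas as pd

iris = datasets.load_iris()
result = simple_k_means(pd.DataFrame(iris.data), n_clusters=3)
print(result['score'])           # silhouette score of the fitted model
print(result['clusters'].shape)  # (n_samples, n_clusters) cluster-distance matrix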
def create_tensor(self, tuples, k):
    m, l, k = self.lengths['user'], self.lengths['image'], self.lengths['location']
    tensor = np.zeros((m, l, k))
    # count every tuple (the original range stopped one short of the last tuple)
    for i in range(len(tuples)):
        tensor[tuples[i][0]][tuples[i][1]][tuples[i][2]] += 1
    factors = parafac(tensor, rank=k, init='random', tol=10e-2)
    k = []
    for j in range(len(factors)):
        wcss = []
        slope = 1
        slopeA = [1, 1, 1, 1, 1]
        i = 1
        val = 1
        while True:
            kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
            kmeans.fit(factors[j])
            wcss.append(kmeans.inertia_)
            if i > 5:
                slope, intercept = np.polyfit(range(1, i + 1), wcss, 1)
                slopeA.append(slope)
                term1 = abs(slopeA[i - 1])
                term2 = abs(slopeA[i - 2])
                val = (term1 - term2) / term2
                print(i)
            if len(factors[j]) <= i:
                break
            if abs(val) < 0.05:
                break
            i += 1
        k.append(i)
        # plt.plot(range(1, 10), wcss)
        # plt.title('the elbow method')
        # plt.xlabel('Number of clusters')
        # plt.show()
    print(k)
    # print(elapsed_time)
    final = []
    for x in range(len(k)):
        kmeans = KMeans(n_clusters=k[x], init='k-means++', max_iter=300, n_init=10, random_state=0)
        # cluster the factor matrix this k was estimated for (the original reused a stale index j)
        final.append(kmeans.fit_transform(factors[x]))
    # print(final[0][0], "\n\n\n", final[1][0])
    print(final)
def analyze(n_preview=10): global vectorizer, km # Encode: logger.info('Encoding...') vectorizer = TfidfVectorizer(max_df=0.5, max_features=common.n_features, min_df=2, stop_words='english') common.X = vectorizer.fit_transform(common.doc_texts) common.save_pickle(vectorizer, 'vectorizer.pickle') common.vocab = np.array(vectorizer.get_feature_names()) logger.info(f'X: {common.X.shape}') common.save_encoded_vocab() logger.info('Clustering...') # km = MiniBatchKMeans(n_clusters=common.n_topics, init=init_centroids(), init_size=1000, batch_size=1000, # verbose=0, random_state=common.random_seed) # km = MiniBatchKMeans(n_clusters=common.n_topics, verbose=1, random_state=1) km = KMeans(n_clusters=common.n_topics, init=init_centroids(), max_iter=3, verbose=1, random_state=2) # Analyze: common.doc_topics = km.fit_transform(common.X) # the smaller, the closer common.doc_topics_reduced = np.argmin(common.doc_topics, axis=1) common.topics = km.cluster_centers_ common.save_pickle(km, 'km.pickle') logger.info(f'doc_topics: {common.doc_topics.shape}') logger.info(f'topics: {common.topics.shape}') print() print('----------------') for i, topic_dist in enumerate(common.topics): top_words = common.vocab[np.argsort(topic_dist)[-10:][::-1]] print(f"Topic {i}: {' '.join(top_words)}") print() print('----------------') for i in range(n_preview): print( f'Article {i} (topic: {common.doc_topics_reduced[i]}), {common.doc_titles[i]}' ) print() common.save_analyze_result()
def k_means(self):
    # returns the k cluster centroids
    model = KMeans()
    visualizer = KElbowVisualizer(model, metric='calinski_harabasz', k=(3, 100))
    visualizer.fit(self.reduced_new_lst)
    # visualizer.show()
    K = visualizer.elbow_value_
    if K is None:
        K = 50
    print('K= ', K)
    model = KMeans(init="k-means++", n_clusters=K, random_state=0)
    xys = model.fit_transform(self.reduced_new_lst)
    y_kmeans = model.predict(self.reduced_new_lst)
    # print(xys)
    word_vector = self.embedding_model.wv
    keys = word_vector.vocab.keys()
    xs = xys[:, 0]
    ys = xys[:, 1]
    # self.plot_2d_graph(keys, xs, ys)

    # the block below builds a dataframe for display
    pd_reduced_new_lst = pd.DataFrame(self.reduced_new_lst)
    keys = [k for k in keys]
    pd_keys = pd.DataFrame(keys)
    pd_keys = pd_keys.rename(columns={0: "keyword"})
    df = pd.concat([pd_reduced_new_lst, pd_keys], axis=1)
    # print(df)

    plt.figure()
    plt.scatter(xs, ys, c=y_kmeans, s=50, cmap='viridis')
    words = df['keyword']
    for i, word in enumerate(words):
        plt.annotate(word, xy=(xs[i], ys[i]))
    centers = model.cluster_centers_
    # plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    pd_centers = pd.DataFrame(centers)
    # print(pd_centers)
    new = pd.concat([pd_centers, pd_keys], axis=1, join='inner')
    print(new)
    # plt.show()
    return new
def cluster(self, dataloader, clusters):
    names, features = self.pca(dataloader, dataloader, 1024)
    algo = KMeans(clusters, algorithm="elkan", init="random", random_state=42)
    dist = algo.fit_transform(features)
    dist = dist.argmin(axis=1)
    for i in range(clusters):
        os.mkdir(f"{'/'.join(dataloader.dataset.path.split('/')[:-1])}/{i}")
    for idx, i in enumerate(dist):
        shutil.copy2(
            names[idx],
            f"{'/'.join(dataloader.dataset.path.split('/')[:-1])}/{i}/{names[idx].split('/')[-1]}"
        )
def create_sampling_distribution(self, base_learner, data, fold_results):
    k_means = KMeans(n_clusters=self.configs.active_items_per_iteration * 2)
    I = data.is_train.nonzero()[0]
    X_cluster_space = k_means.fit_transform(data.x[I])
    # cluster_inds = k_means.fit_predict(data.x[I])
    centroid_inds = self.get_cluster_centroids(X_cluster_space)
    permuted_inds = np.random.permutation(centroid_inds)
    # integer division so the reshape target is a valid shape under Python 3
    centroid_pairs = np.reshape(permuted_inds, (permuted_inds.size // 2, 2))
    for ind, (idx1, idx2) in enumerate(centroid_pairs):
        if data.true_y[idx1] <= data.true_y[idx2]:
            continue
        centroid_pairs[ind] = centroid_pairs[ind, ::-1]
    d = np.zeros(centroid_pairs.shape[0])
    d[:] = 1
    d = d / d.sum()
    return d, centroid_pairs
def kmean_distance(filename, group):
    k = KMeans(n_clusters=group, tol=0.000000001, init='random')
    rowname = filename[:, 0]
    filename = filename[:, 1:]
    # fit once, then read both the labels and the point-to-center distances from the same model
    distance = k.fit_transform(filename)   # distance between each point and every group center
    g = k.labels_                          # group assignment
    g = np.column_stack((rowname, g, np.zeros((len(filename),))))  # combine with the raw row names
    for nrow in range(len(g)):
        id = int(g[nrow, 1])               # catch the group id
        d = distance[nrow, id]             # get the distance to the point's own group center
        g[nrow, 2] = d                     # combine
    # g is the result
    cnt = Counter(g[:, 1])
    cnt = sorted(cnt.items(), key=itemgetter(0))
    print("total group: %s" % (group))
    print("cnt of each group %s" % (cnt))
    return g
def main():
    args = parse_args()
    df = build_dataframe()
    df = df.sample(n=args.n_datapoints, random_state=args.seed)
    model = build_model(classes=None, input_shape=(args.res, args.res, 3), base_weights=args.model)
    image_shape = (args.res, args.res)
    df["feature_vector"] = df[["image_path"]].apply(
        lambda x: compute_features(model, x[0], image_shape), axis=1)
    kmeans = KMeans(n_clusters=args.n_clusters)
    distances = kmeans.fit_transform(list(df["feature_vector"]))
    df["distance"] = distances.min(axis=1)
    df["cluster"] = kmeans.labels_
    df.to_json(args.save_path)
def week9(csv, x_1, y_1, x_2, y_2, x_3, y_3):
    data = pd.read_csv(csv, delimiter=',', index_col='Object')
    coords = data.drop('Cluster', axis=1)
    centroid = np.array([[x_1, y_1], [x_2, y_2], [x_3, y_3]])
    kmeans = KMeans(n_clusters=3, init=centroid, max_iter=100, n_init=1)
    model = kmeans.fit(coords)
    answers = model.labels_.tolist()
    # reuse the fitted model instead of fitting a second time
    dist = kmeans.transform(coords)
    my_cluster = []
    for i in range(len(dist)):
        if answers[i] == 0:
            my_cluster.append(dist[i][0].tolist())
    return answers, round(np.mean(my_cluster), 3)
def k_means(self, clusters):
    '''K-means'''
    self.algorithm = "kmeans"
    kmeans = KMeans(n_clusters=clusters)
    self.X_dist_matrix = kmeans.fit_transform(self.X)
    self.labels = kmeans.labels_
    labels = ["X", "Y"]
    self.num_clusters = clusters
    self.df = pd.DataFrame(data=self.X, columns=labels)
    self.labels_df = pd.DataFrame(data=self.labels)
    self.df['labels'] = self.labels_df
    # self.X_labeled = np.append(self.X, self.labels, axis=1)
    print("Kmeans complete")
def k_means(fileName, dimensions):
    f = open(fileName, 'r')
    fw = open(fileName + '.km', 'w')
    data = []
    video_info = []
    while 1:
        features = split_line_into_tokens(f.readline())
        if not features:
            break
        data.append(features[3:])
        video_info.append(features[:3])
    kmeans = KMeans(dimensions)
    transformedData = kmeans.fit_transform(data).tolist()
    index = 0
    for row in transformedData:
        finalfeatures = video_info[index] + row
        fw.write("; ".join(map(lambda x: str(x), finalfeatures)) + "\n")
        index = index + 1
def main():
    # Create a database connection
    connection = sqlite3.connect("wildfires.sqlite")
    df = pd.read_sql_query("SELECT LATITUDE,LONGITUDE FROM 'Fires'", connection)
    attributes = ['LATITUDE', 'LONGITUDE']
    # df = df.drop(['LATITUDE','LONGITUDE'], axis=1)
    # df['LATITUDE'] = df['LATITUDE'].fillna(df['LATITUDE'].median())
    # df['LONGITUDE'] = df['LONGITUDE'].fillna(df['LONGITUDE'].median())
    data_attributes = df[attributes]
    kmeans_model = KMeans(n_clusters=2, random_state=1)
    distances = kmeans_model.fit_transform(data_attributes)
    labels = kmeans_model.labels_
    plt.scatter(distances[:, 0], distances[:, 1], c=labels)
    plt.title('K-means')
    plt.show()
def CreateDataset(X, Xtest, datasets=[]):
    for dataset in datasets:
        if dataset == 'text':
            X, Xtest = TextTransform(X, Xtest)
        elif dataset == 'log':
            X, Xtest = np.log10(X + 1), np.log10(Xtest + 1)
        elif dataset == 'original':
            pass
        elif dataset == 'kmeans':
            clf = KMeans(n_clusters=200, n_init=40, max_iter=300, verbose=1, n_jobs=-1)
            # remember the train length before stacking, so the split below is correct
            n_train = len(X)
            stacked = np.vstack([X, Xtest])
            XX = clf.fit_transform(stacked)
            X = XX[:n_train]
            Xtest = XX[n_train:]
        else:
            logging.warning("Datasets must be one of: text, original, log, kmeans")
        SaveDataset(dataset, X, Xtest)
def clustering(X, clusters=4, max_iter=100, slow=False, init_size=2000, batch_size=2000, cluster_type="kmeans"): """ Takes a tf-idf matrix and clusters it. Parameters: X: A sparse tf-idf matrix clusters: Integer. Number of clusters to be used max_iter: Integer. How many iterations to go before stopping slow: bool. Not used anymore. Use cluster_type instead init_size: Integer. If using the mini-kmeans batch, this determines the initialize size batch_size: Integer. Used for mini-kmeans batch cluster_type: String. Takes "kmeans" or "agg" currently. If the name isn't recognized your stuck with mini- kmeans Return: clustered_distances: a numpy array with two columns and rows for each document representing the distance between docs cluster_labels: a list with the cluster name for each doc. """ time1 = time() if cluster_type == "kmeans": km = KMeans(n_clusters=clusters, max_iter=max_iter) elif cluster_type == "agg": cluster = AgglomerativeClustering(n_clusters=clusters, affinity="cosine", linkage="average") cluster.fit(X) print('Agglomerative clustering done in {}s'.format(time() - time1)) return [], cluster.labels_ else: km = MiniBatchKMeans(n_clusters=clusters, max_iter=max_iter, init_size=init_size, batch_size=batch_size) clustered_distances = km.fit_transform(X) cluster_labels = km.labels_ print('KMeans clustering done in {}s'.format(time() - time1)) return clustered_distances, cluster_labels
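# A small usage sketch for the clustering() helper above, assuming KMeans, time()
# and TfidfVectorizer are imported as the function body expects; the toy documents
# are illustrative only.
from sklearn.feature_extraction.text import TfidfVectorizer

toy_docs = ["the cat sat on the mat", "dogs and cats play", "stock markets fell", "markets rallied today"]
X_tfidf = TfidfVectorizer().fit_transform(toy_docs)
doc_distances, doc_labels = clustering(X_tfidf, clusters=2, cluster_type="kmeans")
print(doc_labels)  # one cluster id per document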
def km_mlp(X, y): start = time.time() kmeans = KMeans(n_clusters=2) X_km = kmeans.fit_transform(X) train_sizes = [50, 100, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000] cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0) estimator = MLPClassifier(hidden_layer_sizes=(40,), max_iter=10000, activation='relu', solver='adam', random_state=0) title = 'Learning curve for KM + MLP Classifier on Wave Data' print("Plotting", title) train_sizes, train_scores, valid_scores = learning_curve(estimator=estimator, X=X_km, y=y, train_sizes=train_sizes, cv=cv, scoring='neg_mean_squared_error') train_scores_mean = -train_scores.mean(axis=1) valid_scores_mean = -valid_scores.mean(axis=1) end = time.time() total = end - start print("TOTAL TIME TAKEN : ", total) plt.style.use('seaborn') plt.plot(train_sizes, train_scores_mean, marker='.', label='Training error') plt.plot(train_sizes, valid_scores_mean, marker='.', label='Validation error') plt.ylabel('MSE', fontsize=14) plt.xlabel('Training set size', fontsize=14) plt.title(title, fontsize=16, y=1.03) plt.legend() # plt.ylim(0, ) plt.savefig('KM_MLP_LC_2.png') plt.clf() return total
def graph_clustering(A_matrix,method,n_clusters,ratio=None,graph_num=None,plotting=True,Mean=False): if(graph_num==None): graph_num = random.randint(1,len(A_matrix))-1 if(Mean): graph_num = 0; A_matrix = np.mean(A_matrix,axis=0,keepdims=True) n = A_matrix.shape[1] if(method=='kmeans'): #kmeans on first n vectors with nonzero eigenvalues _, vecs = graph_representation(train_A=A_matrix,graph_num=graph_num,Prop='Spectral',plotting=False) kmeans = KMeans(n_clusters=n_clusters) kmeans.fit(vecs[:,1:n_clusters].reshape(-1,n_clusters-1)) if(ratio==None): return kmeans.labels_ num = np.sum(kmeans.labels_) ind = 0 if num>(n//2) else 1 prob = (kmeans.fit_transform(vecs[:,1:n_clusters].reshape(-1,n_clusters-1))) thresh = np.quantile(prob[:,ind], ratio) return (prob[:,ind] >= thresh) elif(method=='Spectral_clustering'): adjacency_matrix = A_matrix[graph_num].reshape(n,n) sc = SpectralClustering(n_clusters, affinity='precomputed', n_init=100, assign_labels='discretize') Class = sc.fit_predict(adjacency_matrix) if(plotting): Ab_matrix = A_binarize(A_matrix) G = nx.Graph(Ab_matrix[graph_num]) plt.figure(); nx.draw(G, node_size=200, pos=nx.spring_layout(G)); plt.show() plt.figure(); nx.draw(G, node_color=Class, node_size=200, pos=nx.spring_layout(G)); plt.show() return Class elif(method=='Affinity_propagation'): _, vecs = graph_representation(train_A=A_matrix,graph_num=graph_num,Prop='Spectral',plotting=False) clustering = AffinityPropagation().fit(vecs[:,1:n_clusters]) elif(method=='Agglomerative_clustering'): _, vecs = graph_representation(train_A=A_matrix,graph_num=graph_num,Prop='Spectral',plotting=False) clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(vecs[:,1:n_clusters].reshape(-1,n_clusters-1)) elif(method=='Graclus'): sA = sparse.csr_matrix(A_matrix[graph_num]) edge_index, edge_weight = g_utils.from_scipy_sparse_matrix(sA) cluster = graclus_cluster(edge_index[0], edge_index[1], edge_weight) return cluster.numpy() else: raise Exception("non-existing clustering method") return clustering.labels_
colors = datainfo.colors
f = h5py.File(datainfo.dpath + datainfo.name + ext + '.hdf5', 'r')

for s, nclusters in zip(datainfo.sensors, datainfo.clusters):
    print(s)
    ldata = []
    for dfiles in datainfo.datafiles:
        d = f[dfiles + '/' + s + '/' + 'PeaksResamplePCA']
        dataf = d[()]
        ldata.append(dataf)

    data = ldata[0]  # np.concatenate(ldata)

    km = KMeans(n_clusters=nclusters, n_jobs=-1)
    km.fit_transform(data)
    lsignals = []
    cnt = Counter(list(km.labels_))

    lmax = []
    for i in range(km.n_clusters):
        lmax.append((i, np.max(km.cluster_centers_[i])))
    lmax = sorted(lmax, key=itemgetter(1))
    print(lmax)
    print(data.shape)

    lhisto = []
    for dataf, ndata in zip(ldata, datainfo.datafiles):
        histo = np.zeros(nclusters)
        for i in range(dataf.shape[0]):
def clustering(self):
    kmeans = KMeans(n_clusters=26)
    kmeans.fit_transform(self.train)
    return kmeans
def kmeans(embedding, n_components):
    est = KMeans(n_clusters=n_components, n_jobs=-1, init='k-means++', n_init=300)
    est.fit_transform(embedding)
    labels = est.labels_
    data = labels.astype(np.float)
    return data