def setup_data():
    X, labels_true = make_blobs(n_samples=700, centers=N, n_features=20,
                                random_state=0)
    xm_clust = XMeans()
    xm_clust.fit(X)
    yield xm_clust, labels_true
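# A minimal sketch of how the fixture above might be consumed, assuming
# pytest is in use and setup_data is registered with @pytest.fixture.
# The agreement threshold is illustrative, not from the original code.
def test_xmeans_recovers_blobs(setup_data):
    from sklearn.metrics import adjusted_rand_score
    xm_clust, labels_true = setup_data
    # x-means labels should largely agree with the generating blob labels
    assert adjusted_rand_score(labels_true, xm_clust.labels_) > 0.5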
def fit(self, data):
    # initialize x-means
    xmeans = XMeans(kmin=self.kmin, kmax=self.kmax,
                    distance_function=self.xmeans_df, verbose=self.verbose)
    # fit x-means on the dataset
    xmeans.fit(data)
    # store the results: labels, clusters and centers
    xmeans_labels = xmeans.labels_
    # create a dict from cluster label to the points in it
    xmeans_clusters = points_labels_to_clusters(data, xmeans_labels)
    # compute the medoids of the clusters
    xmeans_centers = get_clusters_medoids(xmeans_clusters, euclidean_distance)
    # create a dictionary from points to cluster labels
    inverse_xmeans_clusters = dict()
    for cluster_id, points_cluster in xmeans_clusters.items():
        for p in points_cluster:
            inverse_xmeans_clusters[(p[0], p[1])] = cluster_id
    # compute the pairwise distances between the x-means centers
    data_dist = pdist(xmeans_centers, metric=self.singlelinkage_df)
    # perform single-linkage hierarchical clustering
    data_link = linkage(data_dist, method='single',
                        metric=self.singlelinkage_df)
    # compute the height at which to cut the dendrogram
    cut_dist = get_cut_distance(data_link, is_outlier=self.is_outlier,
                                min_k=self.kmin, min_dist=self.min_dist)
    self.cut_dist_ = cut_dist
    # form flat clusters from the hierarchical clustering defined by the
    # linkage matrix
    singlelinkage_labels = fcluster(data_link, cut_dist, 'distance')
    # create a dict from cluster label to the points in it
    singlelinkage_clusters = points_labels_to_clusters(xmeans_centers,
                                                       singlelinkage_labels)
    # aggregate clusters according to the cut distance
    tosca_clusters = defaultdict(list)
    for sl_cid, sl_pl in singlelinkage_clusters.items():
        # for each point in the cluster
        for p in sl_pl:
            # take the cluster label assigned by x-means
            xm_cid = inverse_xmeans_clusters[(p[0], p[1])]
            # merge the corresponding x-means clusters
            tosca_clusters[sl_cid].extend(xmeans_clusters[xm_cid])
    # compute the medoids of the new clusters
    tosca_centers = get_clusters_medoids(tosca_clusters, euclidean_distance)
    # get the list of points and cluster labels
    data, tosca_labels = clusters_to_points_labels(tosca_clusters)
    # final value of k after cluster aggregation
    tosca_k = len(tosca_centers)
    self.cluster_centers_ = tosca_centers
    self.labels_ = tosca_labels
    self.k_ = tosca_k
    self.data_ = data
    return self
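# A hedged usage sketch for the fit method above, assuming it belongs to a
# TOSCA-like estimator class whose constructor sets the attributes the method
# reads (kmin, kmax, xmeans_df, singlelinkage_df, is_outlier, min_dist,
# verbose). The class name "Tosca" and its constructor signature are
# hypothetical; the 2-D point data is synthetic.
import numpy as np

points = [tuple(p) for p in np.random.rand(200, 2)]
tosca = Tosca(kmin=2, kmax=20)      # hypothetical constructor signature
tosca.fit(points)
print(tosca.k_)                     # number of clusters after aggregation
print(len(tosca.cluster_centers_))  # one medoid per aggregated cluster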
def XMeans_duplicate_removal(dataframe):
    # Note: this method takes a dataframe as input
    if len(dataframe) < 2:
        # nothing to do
        return dataframe
    Crater_data = dataframe
    # extract the columns: x/y coordinates, radius and prediction confidence
    x = Crater_data[0].values.tolist()
    y = Crater_data[1].values.tolist()
    r = Crater_data[2].values.tolist()
    p = Crater_data[3].values.tolist()
    Points = []
    # cluster the detections on their (x, y) positions
    X = np.column_stack((x, y))
    xm_clust = XMeans()
    xm_clust.fit(X)
    groups_pred = xm_clust.labels_
    # group the detections by cluster label
    for c in set(groups_pred):
        idx = [i for i, e in enumerate(groups_pred) if e == c]
        Group_x, Group_y, Group_r, Group_p = [], [], [], []
        for i in idx:
            Group_x.append(x[i])
            Group_y.append(y[i])
            Group_r.append(r[i])
            Group_p.append(p[i])
        # after the group is defined, store its elements
        Points.append([Group_x, Group_y, Group_r, Group_p])
    # reduce each group to a single detection
    center_size = []
    for Xs, Ys, Rr, Ps in Points:
        # keep the point with the best prediction confidence
        best_index = np.argmax(Ps)
        center_size.append([Xs[best_index], Ys[best_index],
                            Rr[best_index], Ps[best_index]])
    return pd.DataFrame(center_size)
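# Toy example for the duplicate-removal routine above: two nearly coincident
# crater detections plus one distant detection, as a header-less dataframe
# with integer columns (x, y, radius, confidence). All values are
# illustrative.
import pandas as pd

detections = pd.DataFrame([
    [10.0, 10.0, 3.0, 0.90],   # detection A
    [10.2,  9.9, 3.1, 0.95],   # near-duplicate of A, higher confidence
    [50.0, 50.0, 4.0, 0.80],   # distinct crater
])
deduped = XMeans_duplicate_removal(detections)
# expected: one row per cluster, keeping the most confident detection
print(deduped)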
def compute_similarity(PCP, bound_idxs, xmeans=False, k=5, N=32, seed=None):
    """Main function to compute the segment similarity of file file_struct."""
    # Get PCP segments
    pcp_segments = get_pcp_segments(PCP, bound_idxs)

    # Get the 2D-FMC segments
    fmcs = pcp_segments_to_2dfmc_fixed(pcp_segments, N=N)
    if fmcs is None or len(fmcs) == 0:
        return np.arange(len(bound_idxs) - 1)

    # Compute the labels using k-means (estimating k with x-means if requested)
    if xmeans:
        xm = XMeans(fmcs, plot=False, seed=seed)
        k = xm.estimate_K_knee(th=0.01, maxK=8)
    est_labels = compute_labels_kmeans(fmcs, k=k)

    # Plot results
    #plot_pcp_wgt(PCP, bound_idxs)

    return est_labels
def compute_similarity(PCP, bound_idxs, dirichlet=False, xmeans=False, k=5):
    """Main function to compute the segment similarity of file file_struct."""
    # Get PCP segments
    pcp_segments = get_pcp_segments(PCP, bound_idxs)

    # Get the 2D-FMC segments
    fmcs = pcp_segments_to_2dfmc_max(pcp_segments)
    if len(fmcs) == 0:
        return np.arange(len(bound_idxs) - 1)

    # Compute the labels with a Dirichlet process GMM, x-means, or plain k-means
    if dirichlet:
        k_init = np.min([fmcs.shape[0], k])
        # Only use the Dirichlet method if the FMC dimensionality is small enough
        if fmcs.shape[1] > 500:
            labels_est = compute_labels_kmeans(fmcs, k=k)
        else:
            dpgmm = mixture.DPGMM(n_components=k_init, covariance_type='full')
            #dpgmm = mixture.VBGMM(n_components=k_init, covariance_type='full')
            dpgmm.fit(fmcs)
            k = len(dpgmm.means_)
            labels_est = dpgmm.predict(fmcs)
            #print("Estimated with Dirichlet Process:", k)
    elif xmeans:
        xm = XMeans(fmcs, plot=False)
        k = xm.estimate_K_knee(th=0.01, maxK=8)
        labels_est = compute_labels_kmeans(fmcs, k=k)
        #print("Estimated with Xmeans:", k)
    else:
        labels_est = compute_labels_kmeans(fmcs, k=k)

    # Plot results
    #plot_pcp_wgt(PCP, bound_idxs)

    return labels_est
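# A minimal sketch of calling the similarity functions above, assuming the
# helpers they rely on (get_pcp_segments, pcp_segments_to_2dfmc_max,
# compute_labels_kmeans) are defined in the same module. The random PCP
# matrix and boundary indices are placeholders, not real chroma features.
import numpy as np

PCP = np.random.rand(1000, 12)                   # fake pitch-class profiles
bound_idxs = np.array([0, 250, 500, 750, 1000])  # fake segment boundaries
labels = compute_similarity(PCP, bound_idxs, xmeans=True)
print(labels)                                    # one label per segment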
def test_3(visualize=False):
    X, y = make_blobs(n_samples=500, n_features=2, centers=8,
                      cluster_std=1.5, center_box=(-10.0, 10.0),
                      shuffle=True, random_state=1)
    x, y = X[:, 0], X[:, 1]
    X = np.c_[x, y]

    st = time.time()
    x_means = XMeans(random_state=1).fit(X)
    et = time.time() - st

    if visualize:
        print(x_means.labels_)
        print(x_means.cluster_centers_)

        plt.scatter(x, y, c='black', marker='o', s=50)
        plt.title("x-means_sample")
        plt.grid()
        plt.show()

        # one distinct color per possible cluster label
        colors = ["g", "b", "c", "m", "y", "k", "orange", "purple"]
        for label in range(x_means.labels_.max() + 1):
            plt.scatter(x[x_means.labels_ == label],
                        y[x_means.labels_ == label],
                        c=colors[label], label="sample", s=30)
        plt.scatter(x_means.cluster_centers_[:, 0],
                    x_means.cluster_centers_[:, 1],
                    c="r", marker="+", label="center", s=250)
        plt.title("x-means_test")
        plt.legend()
        plt.grid()
        plt.show()

    return et
def test_1(visualize=False):
    # four Gaussian blobs centered at (1, 1), (1, 2), (2, 1) and (2, 2)
    x = np.array([np.random.normal(loc, 0.1, 20)
                  for loc in np.repeat([1, 2], 2)]).flatten()
    y = np.array([np.random.normal(loc, 0.1, 20)
                  for loc in np.tile([1, 2], 2)]).flatten()

    st = time.time()
    x_means = XMeans(random_state=1).fit(np.c_[x, y])
    et = time.time() - st

    if visualize:
        print(x_means.labels_)
        print(x_means.cluster_centers_)

        # one distinct color per possible cluster label
        colors = ["g", "b", "c", "m", "y", "k", "orange", "purple"]
        for label in range(x_means.labels_.max() + 1):
            plt.scatter(x[x_means.labels_ == label],
                        y[x_means.labels_ == label],
                        c=colors[label], label="sample", s=30)
        plt.scatter(x_means.cluster_centers_[:, 0],
                    x_means.cluster_centers_[:, 1],
                    c="r", marker="+", label="center", s=100)
        plt.xlim(0, 3)
        plt.ylim(0, 3)
        plt.title("x-means_test")
        plt.legend()
        plt.grid()
        plt.show()

    return et
def xmeans(D_matrix):
    return XMeans(random_state=1).fit_predict(D_matrix)
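# Example use of the wrapper above, treating the input as a row-wise feature
# matrix: cluster the rows and get one label per row. The two-blob data is
# synthetic and purely illustrative.
import numpy as np

D_matrix = np.vstack([np.random.randn(50, 4),
                      np.random.randn(50, 4) + 5.0])
labels = xmeans(D_matrix)
print(np.unique(labels))  # the distinct cluster labels found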
Y_axis.append(float(row[1]))
W_size.append(float(row[2]))
probs.append(float(row[3]))

# normalize each axis to [0, 1] by its maximum
xmax = np.max(X_axis)
ymax = np.max(Y_axis)
wmax = np.max(W_size)
X_axis = np.asarray(X_axis, dtype=np.float64) / xmax
Y_axis = np.asarray(Y_axis, dtype=np.float64) / ymax
W_size = np.asarray(W_size, dtype=np.float64) / wmax

# cluster the normalized (x, y, size) triples
datafit = np.c_[X_axis, Y_axis, W_size]
# fixed-k k-means baseline
kmeans = KMeans(n_clusters=214, max_iter=1000, tol=0.0001,
                algorithm='auto').fit(datafit)
x_means = XMeans(random_state=1).fit(datafit)
# print(x_means.labels_)
# print(x_means.cluster_centers_)
# print(x_means.cluster_log_likelihoods_)
# print(x_means.cluster_sizes_)

# de-normalize the cluster centers back to the original scale
removed_list_cnn = []
for row in x_means.cluster_centers_:
    xc = row[0] * xmax
    yc = row[1] * ymax
    ws = row[2] * wmax
    removed_list_cnn.append([xc, yc, ws])

removed_file = open("crater_25_cnn_removed.csv", "w", newline='')
with removed_file:
    writer = csv.writer(removed_file, delimiter=',')