def test_transform_match_across_dtypes():
    """Check that Birch's fit_transform output agrees for float64 and float32 input.

    The same estimator instance is refitted on a float32 copy of the data and
    both transformed outputs are compared with an absolute tolerance of 1e-6.
    """
    features, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
    features_f32 = features.astype(np.float32)

    model = Birch(n_clusters=4)
    transformed_f64 = model.fit_transform(features)
    # Same estimator object is deliberately reused for the second fit.
    transformed_f32 = model.fit_transform(features_f32)

    assert_allclose(transformed_f64, transformed_f32, atol=1e-6)
def getClusters(dt_all, cols_cat):
    """Append t-SNE, MDS, Birch, KMeans and DBSCAN derived columns to ``dt_all``.

    A fixed list of columns is standardized and fed to t-SNE, MDS and Birch.
    The t-SNE embedding is standardized again and clustered with KMeans and
    DBSCAN. New columns are written into ``dt_all`` in place:

    * ``CL_TSNE_<k>`` / ``CL_MDS_<k>`` for each embedding component,
    * ``CL_BIRCH_<k>`` for the first columns of ``Birch.fit_transform``,
    * ``CL_Kmeans_<label>`` / ``CL_DBSCAN_<label>`` as 0/1 (int64) indicator
      columns, one per distinct label.

    :param dt_all: table that must contain the columns listed in
        ``feature_cols``; it is modified in place.
    :param cols_cat: not used by this function.
    :return: the same ``dt_all`` object, with the extra columns added.
    """
    # NOTE(review): 'X232' appears twice in this list, so that column is
    # selected twice -- kept as in the original; confirm whether intended.
    feature_cols = [
        'X118', 'X127', 'X47', 'X315', 'X311', 'X179',
        'X314', 'X232', 'X29', 'X232', 'X261',
    ]
    n_embed_components = 2
    n_birch_columns = 2

    scaled = StandardScaler().fit_transform(dt_all[feature_cols])

    # t-SNE embedding, followed by a second standardization of the embedding.
    embedder = TSNE(random_state=2016, perplexity=50, verbose=2)
    embedding = embedder.fit_transform(scaled)
    embedding_frame = pd.DataFrame({"x1": embedding[:, 0], "x2": embedding[:, 1]})
    embedding_scaled = StandardScaler().fit_transform(embedding_frame)

    mds_coords = MDS(n_components=n_embed_components, random_state=888).fit_transform(scaled)

    # NOTE(review): Birch.fit_transform presumably yields one column per
    # subcluster rather than per final cluster; only the first
    # ``n_birch_columns`` columns are used below -- confirm this is intended.
    birch_output = Birch(n_clusters=n_birch_columns).fit_transform(scaled)

    kmeans_labels = KMeans(n_clusters=4, random_state=0).fit(embedding_scaled).labels_
    dbscan_labels = DBSCAN(eps=0.196, min_samples=100).fit(embedding_scaled).labels_

    # Column names are 1-based; the TSNE/MDS columns are interleaved per component.
    for comp in range(n_embed_components):
        suffix = str(comp + 1)
        dt_all['CL_TSNE_' + suffix] = embedding[:, comp]
        dt_all['CL_MDS_' + suffix] = mds_coords[:, comp]

    for col in range(n_birch_columns):
        dt_all['CL_BIRCH_' + str(col + 1)] = birch_output[:, col]

    # One indicator column per distinct cluster label.
    for prefix, labels in (('CL_Kmeans_', kmeans_labels), ('CL_DBSCAN_', dbscan_labels)):
        for label in np.unique(labels):
            dt_all[prefix + str(label)] = (labels == label).astype("int64")

    return dt_all
def birch_clustering(values, branching_factor=50, threshold=0.5):
    """
    Fit a Birch estimator on ``values`` and return its outputs.

    The estimator is created with ``n_clusters=None`` and
    ``compute_labels=True``.

    :param values: data passed to ``Birch.fit_transform``
    :param branching_factor: forwarded to ``Birch`` as ``branching_factor``
    :type branching_factor: int
    :param threshold: forwarded to ``Birch`` as ``threshold``, default=0.5
    :return: ``[labels, subcluster centers, fitted Birch estimator,
        result of fit_transform]``
    :rtype: list
    """
    model = Birch(
        threshold=threshold,
        branching_factor=branching_factor,
        compute_labels=True,
        n_clusters=None,
    )
    transformed = model.fit_transform(values)
    return [model.labels_, model.subcluster_centers_, model, transformed]
def build_model(df, cluster_type="kmeans", seed=1):
    """Cluster the rows of ``df`` and pair each index entry with its cluster label.

    The number of clusters/topics is taken from the module-level
    ``N_CLUSTERS``. Supported ``cluster_type`` values:

    * ``"birch"``     -- ``Birch.fit_predict``
    * ``"minibatch"`` -- ``MiniBatchKMeans.fit_predict``
    * ``"em"``        -- ``mixture.GMM`` fit followed by ``predict``
    * ``"lda"``       -- ``lda.LDA``; each row gets the index of its largest
      topic weight (hard assignment)
    * anything else   -- ``KMeans.fit_predict``; this branch also writes a
      ``"distance_from_cluster"`` column into ``df`` (in place) holding
      ``calc_distance(row, center_of_assigned_cluster)`` for every row.

    A summary line with the label counts is printed.

    :param df: data to cluster; ``df.index`` supplies the keys of the result.
    :param cluster_type: name of the algorithm to use (see above).
    :param seed: random state for the algorithms that accept one.
    :return: ``zip(df.index, labels)`` where every label is a string.
    """
    if cluster_type == "birch":
        model = Birch(n_clusters=N_CLUSTERS)
        res = model.fit_predict(df)
    elif cluster_type == "minibatch":
        model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    elif cluster_type == "em":
        model = mixture.GMM(n_components=N_CLUSTERS)
        model.fit(df)
        res = model.predict(df)
    elif cluster_type == 'lda':
        model = lda.LDA(n_topics=N_CLUSTERS, n_iter=1500, random_state=seed)
        data_to_cluster = np.array(df).astype(int)
        lda_res = model.fit_transform(data_to_cluster)
        # For now do hard clustering: take the topic with the highest probability.
        res = [topic_weights.argmax() for topic_weights in lda_res]
    else:
        model = KMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
        # Snapshot the feature values before the distance column is added.
        df_array = np.array(df)
        # The original read an unassigned name ``clusters_centers`` here; the
        # centers must come from the model that was just fitted.
        centers = model.cluster_centers_
        df["distance_from_cluster"] = [
            calc_distance(row, centers[label])
            for row, label in zip(df_array, model.labels_)
        ]

    print(""">>>> model is: %s, # of clusters:%s, and %s""" %(cluster_type,N_CLUSTERS,Counter(res)))
    res = [str(i) for i in res]
    docs_clusteres = zip(df.index, res)
    return docs_clusteres
#df_new.fillna(df_new.mean()) #X = StandardScaler().fit_transform(df_new) # In[59]: # Compute DBSCAN #db = DBSCAN(eps=.8, min_samples=10).fit(X) #normalize the data min_max_scalar = preprocessing.MinMaxScaler() x_scaled = min_max_scalar.fit_transform(df_new) df_norm = pd.DataFrame(x_scaled) db = Birch(branching_factor=50, n_clusters=5, threshold=25, compute_labels=True) db.fit_transform(df_new) #core_samples_mask = np.zeros_like(db.labels_, dtype=bool) #core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ cluster_centers = db.subcluster_centers_ # In[60]: import matplotlib.pyplot as plt cluster_centers = db.subcluster_centers_ labels = db.labels_ fig = plt.figure(figsize=(15, 15), dpi=200) ax = fig.add_subplot(111) ax.set_title("Cluter centers on 2 Component PCA in Disease Dataset on Birch") for x, y, lab in zip(cluster_centers[:, 0], cluster_centers[:, 1], labels):
def qmrf_regions(data, edges, nbow=20, lamda=1, sampling='random', nsamples=10000,
                 label_potential='l1', unary_sq=True, online=True, gamma=None,
                 max_iter=5, truncated=False, rng=42, verbose=True,
                 return_centers=False, return_edge_costs=True):
    """Label graph nodes by clustering ``data`` and minimizing a pairwise energy.

    Steps (each wrapped in a ``Timer``):

    1. Fit a clustering model (Birch, MiniBatchKMeans or KMeans) on ``data``
       or on a subsample of it, and take its centers as the label set.
    2. Build per-node unary costs, either from the model's ``transform``
       output or from ``colordiff`` against each center.
    3. Build the label-to-label cost matrix, scaled by ``lamda``.
    4. Build per-edge costs from ``gamma``.
    5. Solve with ``solve_binary`` (exactly two labels) or
       ``solve_aexpansion`` (otherwise).

    :param data: 2-D array of node features; rows are indexed by ``edges``.
    :param edges: 2-column integer array of node index pairs.
    :param nbow: number of clusters, or the string ``'birch'`` to use Birch.
    :param lamda: multiplier applied to the label cost matrix.
    :param sampling: ``'random'`` for random row sampling; any other value
        selects grid sampling.
    :param nsamples: number of rows used for fitting, or ``None`` to fit on
        all of ``data``.
    :param label_potential: ``'l1'``, ``'l2'``, ``'potts'`` or ``'color'``.
    :param unary_sq: if true, the unary costs are squared.
    :param online: choose MiniBatchKMeans (true) or KMeans (false) when
        ``nbow`` is not ``'birch'``.
    :param gamma: real number (edge cost ``exp(-gamma * diff)``), ``'auto'``,
        ``'color'``, or anything else for constant edge costs of 1.
    :param max_iter: not used by this function.
    :param truncated: if true, label costs are passed through
        ``np.maximum(1, ...)`` before scaling.
    :param rng: random state handed to the KMeans variants.
    :param verbose: print a summary and forward verbosity to KMeans variants.
    :param return_centers: if true, also return the cluster centers.
    :param return_edge_costs: not used by this function.
    :return: ``(labels, label_cost)`` or ``(labels, label_cost, centers)``.
    :raises ValueError: if ``label_potential`` is not a supported value.
    """
    import numbers

    with Timer('Colors'):
        if nbow == 'birch':
            clf = Birch(threshold=0.8, branching_factor=100)
        elif online:
            clf = MiniBatchKMeans(n_clusters=nbow, verbose=verbose,
                                  random_state=rng, batch_size=100,
                                  max_iter=100, max_no_improvement=10)
        else:
            clf = KMeans(n_clusters=nbow, verbose=verbose, random_state=rng)

        if nsamples is None:
            dist = clf.fit_transform(data)
        else:
            if sampling == 'random':
                # NOTE(review): this draw uses the global NumPy RNG and does
                # not honor ``rng`` -- confirm whether that is intended.
                idx = np.random.choice(data.shape[0], nsamples, replace=False)
            else:
                # NOTE(review): ``image`` is neither a parameter nor assigned
                # in this function; this branch depends on a name defined
                # elsewhere -- confirm it exists at module level.
                n = np.sqrt(nsamples)
                ratio = image.shape[0] / float(image.shape[1])
                ny = int(n * ratio)
                nx = int(n / ratio)
                y = np.linspace(0, image.shape[0], ny, endpoint=False) \
                    + (image.shape[0] // ny // 2)
                x = np.linspace(0, image.shape[1], nx, endpoint=False) \
                    + (image.shape[1] // nx // 2)
                xx, yy = np.meshgrid(x, y)
                idx = np.round(yy * image.shape[1] + xx).astype(int).flatten()

            clf.fit(data[idx])
            dist = clf.transform(data)

        if nbow == 'birch':
            centers = clf.subcluster_centers_
        else:
            centers = clf.cluster_centers_

    with Timer('Unary'):
        K = centers.shape[0]

        if label_potential == 'color':
            unary_cost = np.zeros((data.shape[0], centers.shape[0]), np.float32)
            for i in range(centers.shape[0]):
                unary_cost[:, i] = colordiff(data, centers[i:i + 1])
        else:
            unary_cost = dist.astype(np.float32)

        if unary_sq:
            unary_cost **= 2

    with Timer('Pairwise'):
        if label_potential == 'l1':
            label_cost = np.abs(centers[:, None, :] - centers[None, ...]).sum(-1)
        elif label_potential == 'l2':
            label_cost = np.sqrt(
                ((centers[:, None, :] - centers[None, ...])**2).sum(-1))
        elif label_potential == 'potts':
            label_cost = np.ones((K, K), int) - np.eye(K, dtype=int)
        elif label_potential == 'color':
            label_cost = np.zeros((centers.shape[0], centers.shape[0]),
                                  np.float32)
            for i in range(centers.shape[0]):
                label_cost[:, i] = colordiff(centers, centers[i:i + 1])
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on ``label_cost``.
            raise ValueError(
                "unknown label_potential: %r" % (label_potential,))

        if truncated:
            # NOTE(review): this puts a floor of 1 on every entry, including
            # the diagonal; a truncated potential would normally cap values
            # instead -- kept as in the original, confirm intent.
            label_cost = np.maximum(1, label_cost)

        label_cost = (label_cost * lamda).astype(np.float32)

    if verbose:
        print("=================")
        print("Minimizing graph:")
        print("Nodes: %d, edges: %d, labels: %d" %
              (unary_cost.shape[0], edges.shape[0], label_cost.shape[0]))
        print("UnarySq: %s, LabelPotential: %s, EdgeCost: %s" %
              (unary_sq, label_potential, (gamma is not None)))
        print("#################")

    with Timer('Edge Cost'):
        diff = ((data[edges[:, 0]] - data[edges[:, 1]])**2).sum(axis=1)

        # Accept any real number, including NumPy scalars, which the former
        # exact ``type(gamma) in [int, float]`` test silently ignored. bool is
        # excluded so True/False keep falling through as before.
        gamma_is_number = (isinstance(gamma, numbers.Real)
                           and not isinstance(gamma, bool))

        if gamma_is_number:
            edge_costs = np.exp(-gamma * diff).astype(np.float32)
        elif gamma == 'auto':
            edge_costs = np.exp(-diff.mean() * diff).astype(np.float32)
        elif gamma == 'color':
            edge_costs = 1. / (1. + colordiff(data[edges[:, 0]],
                                              data[edges[:, 1]]))
            edge_costs = edge_costs.astype(np.float32)
        else:
            edge_costs = np.ones(edges.shape[0], dtype=np.float32)

    with Timer('Minimize'):
        if label_cost.shape[0] == 2:
            labels = solve_binary(edges, unary_cost, edge_costs, label_cost)
        else:
            labels = solve_aexpansion(edges, unary_cost, edge_costs, label_cost)

    if return_centers:
        return labels, label_cost, centers

    return labels, label_cost