def birch_clustering(data):
    """Cluster ``data`` into three groups with BIRCH and show a 3-D scatter plot.

    Parameters
    ----------
    data : numpy.ndarray
        Samples to cluster.  Rows are selected with boolean masks and the
        first three columns are used as the x, y and z plot coordinates, so
        a 2-D array with at least three columns is required.

    Returns
    -------
    tuple
        ``(x0, x1, x2)``: the rows of ``data`` assigned to cluster 0, 1 and 2.
    """
    # Fit the BIRCH model; fit_predict returns the fitted ``labels_``.
    birch = Birch(n_clusters=3)
    label_pred = birch.fit_predict(data)

    x0 = data[label_pred == 0]
    x1 = data[label_pred == 1]
    x2 = data[label_pred == 2]

    fig = plt.figure()
    # Axes3D(fig) no longer attaches itself to the figure on current
    # Matplotlib releases (the window comes out blank), so the 3-D axes are
    # requested from the figure instead.  This form also works on older
    # releases as long as mpl_toolkits.mplot3d has been imported.
    ax = fig.add_subplot(111, projection='3d')
    groups = (
        (x0, "red", 'o', 'label0'),
        (x1, "green", '*', 'label1'),
        (x2, "blue", '+', 'label2'),
    )
    for points, colour, marker, name in groups:
        ax.scatter(points[:, 0], points[:, 1], points[:, 2],
                   c=colour, marker=marker, label=name)
    ax.legend(loc=2)
    plt.show()
    return x0, x1, x2
def do_birch(self, values, threshold):
    """Tell whether BIRCH splits ``values`` into more than one cluster.

    ``values`` is normalised as a single row with ``preprocessing.normalize``,
    reshaped into a column of one-feature samples and clustered with
    ``n_clusters=None`` and the given ``threshold``.

    Returns True when the fitted labels contain more than one distinct value.
    """
    series = np.array(values)
    column = preprocessing.normalize([np.array(series)]).reshape(-1, 1)
    model = Birch(
        branching_factor=50,
        n_clusters=None,
        threshold=threshold,
        compute_labels=True,
    )
    model.fit_predict(column)
    distinct_labels = np.unique(model.labels_)
    return distinct_labels.size > 1
def get_personalization(self, service):
    """Score ``service`` and list the host KPIs best correlated with its anomaly.

    The score is the summed weight of all incoming edges of ``service`` plus
    its outgoing edges whose destination node has type ``'service'`` (taken
    from ``self.anomalous_subgraph``), multiplied by the largest accepted
    correlation (never below 0.01).

    For every host (``cmdb_id``) that ``self.trace_data`` lists for the
    service and every KPI (``name``) that ``self.host_data`` holds for that
    host, the KPI values are correlated with the ``elapsedTime`` samples of
    the service's anomalous edge on that host.  A correlation only counts
    when both series vary and BIRCH (threshold 0.005) splits the normalised
    KPI values into more than one cluster.

    Returns
    -------
    tuple
        ``(score, metrics)`` where ``metrics`` is a list of
        ``(host, KPI, correlation)`` tuples sorted by descending correlation:
        the top two when the runner-up reaches more than 90 % of the best,
        otherwise at most the top one.
    """
    # NOTE(review): the original named this "weight_average" and counted the
    # contributing edges, but never divided by that count, so the value is a
    # plain sum.  Behaviour is kept as is; confirm whether an average was
    # intended.
    weight_total = 0.0
    max_corr = 0.01
    metrics = []

    for _, _, edge in self.anomalous_subgraph.in_edges(service, data=True):
        weight_total += edge['weight']
    for _, destination, edge in self.anomalous_subgraph.out_edges(
            service, data=True):
        if self.anomalous_subgraph.nodes[destination]['type'] == 'service':
            weight_total += edge['weight']

    hosts = self.trace_data.loc[
        self.trace_data.serviceName == service].cmdb_id.unique()
    host_groups = self.host_data[self.host_data['cmdb_id'].isin(
        hosts)].groupby('cmdb_id')[['name', 'value']]

    for host, host_data_subset in host_groups:
        # The anomalous-edge timings depend only on the host, so they are
        # extracted once per host rather than once per KPI.
        anomalous_data = pd.Series(
            list(self.trace_data.loc[
                (self.trace_data.path == self.anomalous_edges[service])
                & (self.trace_data.cmdb_id == host)]['elapsedTime']))
        anomalous_varies = len(set(anomalous_data)) > 1

        for KPI, values in host_data_subset.groupby('name')['value']:
            values = pd.Series(list(values))
            correlation = 0
            if anomalous_varies and len(set(values)) > 1:
                correlation = abs(anomalous_data.corr(values))
                normalized = preprocessing.normalize(
                    [np.array(values)]).reshape(-1, 1)
                birch = Birch(branching_factor=50, n_clusters=None,
                              threshold=0.005, compute_labels=True)
                birch.fit_predict(normalized)
                # Zero the correlation unless the KPI itself shows more than
                # one cluster.
                coefficient = int(np.unique(birch.labels_).size > 1)
                correlation = coefficient * correlation
            if pd.isna(correlation):
                correlation = 0
            # Only a new running maximum is recorded.
            if correlation > max_corr:
                metrics.append((host, KPI, correlation))
                max_corr = correlation

    personalization = weight_total * max_corr
    metrics.sort(key=lambda tup: tup[2], reverse=True)
    # Every recorded correlation exceeds 0.01, so the ratio is well defined.
    if len(metrics) > 1 and metrics[1][2] / metrics[0][2] > 0.9:
        return personalization, metrics[:2]
    return personalization, metrics[:1]
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    """Cluster the URLs and derive a per-cluster quota from the QLINK share.

    All URLs are unquoted, parsed and vectorised with ``DictVectorizer`` on
    the output of ``extract_features``.  Features whose column sum is not
    above 8 are dropped, the samples are clustered with BIRCH (one cluster
    per remaining feature), the best features (at most 30) are selected with
    ``SelectKBest`` against those cluster labels, and the reduced data is
    clustered again.

    Results are published through module-level globals rather than returned:
    ``v`` (restricted vectorizer), ``brc`` (fitted BIRCH model), ``select``
    (fitted selector), ``quota`` (reset to 10000) and
    ``quota_for_each_cluster`` (for every cluster: twice ``QUOTA`` times the
    fraction of QLINK URLs that landed in it).

    :param QLINK_URLS: list of URLs counted towards the quotas; they must
        come first in the combined sample order.
    :param UNKNOWN_URLS: list of additional URLs that only take part in
        clustering.
    :param QUOTA: total quota to distribute.
    """
    global quota_for_each_cluster
    global brc
    global v
    global quota
    global select

    quota = 10000
    n_qlinks = len(QLINK_URLS)

    parsed_urls = [urlparse.urlparse(unquote(url.strip()))
                   for url in QLINK_URLS + UNKNOWN_URLS]

    v = DictVectorizer(sparse=False)
    data = v.fit_transform(extract_features(parsed_urls))

    # Keep only the features that occur often enough across all URLs.
    low_bound = 8
    keep = np.sum(data, axis=0) > low_bound
    v = v.restrict(keep)
    data = data[:, keep]

    best_cou_clusters = data.shape[1]
    brc = Birch(branching_factor=50, n_clusters=best_cou_clusters,
                threshold=0.2, compute_labels=True)
    clust = brc.fit_predict(data)

    select = SelectKBest(k=min(data.shape[1], 30))
    data = select.fit_transform(data, clust)
    clust = brc.fit_predict(data)

    # The QLINK rows are the first len(QLINK_URLS) samples.  The count used
    # to be hard-coded as 500, which is only right for exactly 500 QLINK URLs.
    quota_for_each_cluster = np.zeros(best_cou_clusters)
    if n_qlinks:
        clust_qlink = list(clust[:n_qlinks])
        for i in range(best_cou_clusters):
            quota_for_each_cluster[i] = (
                clust_qlink.count(i) / float(n_qlinks) * QUOTA)
    quota_for_each_cluster *= 2.0
def birch(data):
    """Tune BIRCH with hyperopt (TPE) and cluster ``data`` with the best setting.

    The search minimises ``hyper_birch`` over ``threshold`` in [0, 1] and
    ``branching_factor`` in 25..74 for 50 evaluations.

    NOTE(review): ``hyper_birch`` is not passed ``data``; which data set the
    search evaluates is decided by that function -- confirm it matches.

    Returns
    -------
    tuple
        ``(best, labels, score, model)``: the raw ``fmin`` result, the
        cluster labels of ``data``, their ``sil_score`` and the fitted model.
    """
    space = {
        'threshold': hp.uniform('threshold', 0, 1),
        'branching_factor': hp.choice('branching_factor', range(25, 75)),
    }
    algo = partial(tpe.suggest, n_startup_jobs=10)
    best = fmin(hyper_birch, space, algo=algo, max_evals=50)

    # fmin reports an hp.choice pick as an index into the options, hence the
    # +25 offset to get back to the actual branching factor.
    model = Birch(threshold=best['threshold'],
                  branching_factor=int(best['branching_factor'] + 25))

    # Fit once and reuse the labels; the model used to be refitted three
    # times on the same data to produce identical results.
    labels = model.fit_predict(data)
    return best, labels, sil_score(data, labels), model
def skLearnBirch(data):
    """Fit BIRCH on ``data`` using the value chosen by the silhouette search.

    The value returned by ``getOptimalClustersSilhoutte`` for the BIRCH
    algorithm is passed to ``Birch`` as its ``threshold``.

    Returns ``(number_of_subclusters, fitted_model)``.
    """
    chosen_threshold = getOptimalClustersSilhoutte(
        data, ClusteringAlgorithm.skLearnBirch)
    model = Birch(
        branching_factor=50,
        n_clusters=None,
        threshold=chosen_threshold,
        compute_labels=True,
    )
    model.fit_predict(data)
    return (len(model.subcluster_centers_), model)
def birch(tfidf_matrix):
    """Cluster ``tfidf_matrix`` with BIRCH and print its NMI against ``cluster``.

    ``cluster`` is read from module scope and is used as the reference
    labelling (presumably the ground-truth labels -- confirm).  Nothing is
    returned.
    """
    b_cluster = Birch(n_clusters=90, threshold=0.7)
    result = b_cluster.fit_predict(tfidf_matrix)
    # 'warn' was only a transitional default that resolved to 'geometric';
    # current scikit-learn rejects it, so the averaging is named explicitly.
    rbirch = sklearn.metrics.normalized_mutual_info_score(
        cluster, result, average_method='geometric')
    print(rbirch)
def hyper_birch(args):
    """Objective for the hyper-parameter search: negated silhouette of BIRCH.

    ``args`` supplies ``'threshold'`` and ``'branching_factor'``; the data
    comes from the module-level ``data_file.data``.  The score is negated so
    that a minimiser maximises ``sil_score``.
    """
    global data_file
    model = Birch(
        threshold=args['threshold'],
        branching_factor=int(args['branching_factor']),
    )
    assignments = model.fit_predict(data_file.data)
    return -sil_score(data_file.data, assignments)
def compute_optimal_birch_clustering(node_path_counts: np.ndarray,
                                     pca_target_dimension: int,
                                     number_of_walks: int,
                                     significance_level: float):
    """
    Given an array of node path counts, cluster the nodes into an optimal
    number of clusters using BIRCH clustering.

    The number of clusters is increased step by step, starting at 2.  The
    optimal number is the smallest one for which ``test_quality_of_clusters``
    accepts the clusters, i.e. for which the clusters have statistically
    similar path count distributions at the given significance level.

    Each row of ``node_path_counts`` is centred on and scaled by its own
    mean, the result is transposed and reduced with
    ``compute_principal_components`` before clustering.
    NOTE(review): a row whose mean is zero yields inf/NaN features.

    :return: the labels of the first accepted clustering; when none is
        accepted, the labels of the last clustering tried; ``None`` when
        there are fewer than three feature vectors, so nothing is tried.
    """
    # Computed once; the row means were evaluated twice before.
    row_means = np.mean(node_path_counts, axis=1)[:, None]
    standardized_path_counts = ((node_path_counts - row_means) / row_means).T

    feature_vectors = compute_principal_components(
        feature_vectors=standardized_path_counts,
        target_dimension=pca_target_dimension)
    number_of_feature_vectors = feature_vectors.shape[0]

    cluster_labels = None
    # Start from 2 since zero or one cluster is invalid.
    for number_of_clusters in range(2, number_of_feature_vectors):
        clusterer = Birch(n_clusters=number_of_clusters, threshold=0.05)
        cluster_labels = clusterer.fit_predict(feature_vectors)
        node_path_counts_of_clusters = get_node_path_counts_of_clusters(
            node_path_counts, cluster_labels)
        if test_quality_of_clusters(node_path_counts_of_clusters,
                                    number_of_walks, significance_level):
            return cluster_labels
    return cluster_labels
def birch(data):
    """Split ``data`` into two BIRCH clusters, print the cluster sizes and return the labels."""
    model = Birch(n_clusters=2, threshold=0.5)
    # Train the model and label every sample.
    assignments = model.fit_predict(data)
    print(Counter(assignments))
    return assignments
def birch(X_input, k):
    """Cluster ``X_input`` into ``k`` BIRCH clusters (threshold 1), print and return the labels."""
    from sklearn.cluster import Birch

    print('start birch cluster:')
    assignments = Birch(n_clusters=k, threshold=1).fit_predict(X_input)
    print(assignments)
    return assignments
def birch_classer():
    """Cluster the feature data with BIRCH, persist the model and dump the labels.

    The features come from ``get_feature()``.  For the single cluster count
    100 the model is fitted, pickled with joblib under ``config.MODEL_PATH``,
    scored with the Calinski-Harabasz index, and the labels (one per line)
    followed by the score are written to
    ``web_fingerprint_birch_10w_<clusters>.txt`` in the working directory.
    Progress is reported through ``libs.logger.log``.  Nothing is returned.
    """
    data = get_feature()
    libs.logger.log(data)
    libs.logger.log('birch cluster begining......')

    # scikit-learn renamed the misspelled ``calinski_harabaz_score`` and later
    # removed the old name; prefer the new one, fall back for old installs.
    # (assumes ``metrics`` is sklearn.metrics -- confirm against the imports)
    score_fn = getattr(metrics, 'calinski_harabasz_score', None)
    if score_fn is None:
        score_fn = metrics.calinski_harabaz_score

    for clust in range(100, 101):
        libs.logger.log('clust [' + str(clust) + '] is begining.....')
        birch_cluster = Birch(n_clusters=clust)
        result = birch_cluster.fit_predict(data)

        # Store the fitted model.
        joblib.dump(
            birch_cluster,
            config.MODEL_PATH + 'web_fingerprint_birch_cluster_fit_result.pkl')

        calinski_harabasz_ccore = score_fn(data, result)

        # The context manager closes the file even if a write fails.
        with open('web_fingerprint_birch_10w_' + str(clust) + '.txt',
                  'w') as f:
            f.writelines(str(label) + '\n' for label in result)
            f.write('calinski_harabasz_ccore:' + str(calinski_harabasz_ccore))

        libs.logger.log('clust [' + str(clust) + '] finish')
        libs.logger.log('calinski_harabasz_ccore: ' +
                        str(calinski_harabasz_ccore))
        libs.logger.log(result)
def visual(c, X, y):
    """Cluster ``X`` with default BIRCH, print evaluations and plot both labelings.

    Two scatter plots of the first two columns of ``X`` are shown one after
    the other: first coloured by the given labels ``y`` ('Dataset'), then by
    the predicted clusters ('Clusters').

    :param c: unused; kept for interface compatibility.
    :param X: 2-D numpy array of samples with at least two columns.
    :param y: numpy array with the reference label of every sample.
    """
    from sklearn.cluster import Birch

    cluster_object = Birch()
    y_pred = cluster_object.fit_predict(X)

    print("Cluster Labels")
    print(np.unique(y_pred))
    print("Evaluation")
    evaluation_labels(y, y_pred)
    evaluation(X, y_pred)

    for labels, title in ((y, 'Dataset'), (y_pred, 'Clusters')):
        for cluster in np.unique(labels):
            rows = np.where(labels == cluster)[0]
            # Each group gets a label: plt.legend() used to be called with
            # no labelled artists, which only produced a warning and an
            # empty legend.
            plt.scatter(X[rows, 0], X[rows, 1], label=str(cluster))
        plt.title(title)
        plt.xlabel('X1')
        plt.ylabel('X2')
        plt.legend()
        plt.show()
def birch(X, y, n):
    """
    Clustering with Birch.

    Parameters
    ----------
    X : numpy array
        Data.
    y : numpy array
        Ground-truth labels; ``y[0]`` must hold one label per sample, since
        the predictions are reshaped to ``(1, len(y[0]))``.
    n : int
        Number of clusters.

    Returns
    -------
    acc_br : float
        Accuracy as computed by ``acc``.
    time_br : float
        Execution time of the fit in seconds, rounded to two decimals.
    """
    # ``n`` is the documented cluster count; it used to be ignored in favour
    # of a hard-coded 2.
    br = Birch(n_clusters=n)

    start_br = time.time()
    y_br = br.fit_predict(X)
    end_br = time.time()

    y_br = np.reshape(y_br, (1, len(y[0])))
    acc_br, _, _ = acc(y_br, y)
    time_br = round(end_br - start_br, 2)
    make_graph(X, y_br, n, "Birch")
    return acc_br, time_br
def choose_stocks_index(self):
    """Pick the stocks that most often sit closest to a cluster centre.

    Every slice ``self.__X[i]`` is clustered with BIRCH into
    ``self.__n_clusters`` clusters.  In each non-empty cluster the stock
    nearest to the cluster centre (mean of the cluster's members) gets one
    vote.  The votes are summed over all slices.

    Returns
    -------
    list of int
        Up to ``self.__n_clusters`` stock indices, most-voted first; ties
        keep the order in which the stocks first received a vote.
    """
    stock_choosen_num = {}
    for i in range(self.__X.shape[0]):
        window = self.__X[i, :, :]
        birch = Birch(threshold=0.001, n_clusters=self.__n_clusters)
        y_pred = birch.fit_predict(window)

        # Only the first ``__stock_num`` rows are candidates, as before.
        candidate_labels = y_pred[:self.__stock_num]
        for label in np.unique(candidate_labels):
            members = np.flatnonzero(candidate_labels == label)
            # The labels are *global* cluster ids, so they must not be used
            # to index ``subcluster_centers_`` (one row per CF subcluster);
            # the centre is computed from the cluster's members instead.
            center = window[y_pred == label].mean(axis=0)
            distances = np.linalg.norm(window[members] - center, axis=1)
            chosen = int(members[np.argmin(distances)])
            # Empty clusters no longer cast a spurious vote for stock 0.
            stock_choosen_num[chosen] = stock_choosen_num.get(chosen, 0) + 1

    ranked = sorted(stock_choosen_num.items(),
                    key=lambda x: x[1],
                    reverse=True)
    return [stock for stock, _ in ranked[:self.__n_clusters]]
def birch(X, k):
    """Return the BIRCH cluster label of every point.

    :param X: matrix of points to cluster.
    :param k: number of clusters.
    """
    from sklearn.cluster import Birch

    return Birch(n_clusters=k).fit_predict(X)
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    """Cluster the URLs of one site and store its per-cluster quotas in ``algos``.

    Each URL is parsed with ``parse_url`` and turned into a dict of feature
    counts (first element of every item returned by ``extract_features``).
    Features whose total count is not above 5 are dropped, the samples are
    clustered with BIRCH (one cluster per remaining feature), the best
    features (at most ``KBEST_K``) are selected against those labels and the
    reduced data is clustered again.

    The quota of a cluster is ``int(share * QUOTA * 2)`` where ``share`` is
    the fraction of QLINK URLs assigned to it.  The fitted objects and the
    quotas are stored in ``algos`` under the network location of the first
    QLINK URL; nothing is returned.

    :param QLINK_URLS: URLs counted towards the quotas (must not be empty).
    :param UNKNOWN_URLS: additional URLs that only take part in clustering.
    :param QUOTA: total quota to distribute.
    """
    # URL strings -> parsed objects.  Lists, not lazy map objects, because
    # the results are iterated and indexed below.
    qlinks = [parse_url(url) for url in QLINK_URLS]
    ulinks = [parse_url(url) for url in UNKNOWN_URLS]

    def feature_counts(link):
        # list(...) makes the zip result subscriptable on Python 3 as well.
        return dict(Counter(list(zip(*extract_features([link], 0)))[0]))

    qlinks_f = [feature_counts(link) for link in qlinks]
    ulinks_f = [feature_counts(link) for link in ulinks]

    v = DictVectorizer(sparse=False)
    x_ = v.fit_transform(qlinks_f + ulinks_f)

    # Keep only the features that occur often enough across all URLs.
    best_features = np.sum(x_, axis=0) > 5
    m_features = int(np.sum(best_features))
    v = v.restrict(best_features)
    x_ = x_[:, best_features]

    clustering = Birch(branching_factor=BIRCH_BRANCHING_FACTOR,
                       n_clusters=m_features,
                       threshold=BIRCH_THRESHOLD,
                       compute_labels=True)
    y_ = clustering.fit_predict(x_)
    sel = SelectKBest(k=min(m_features, KBEST_K))
    x = sel.fit_transform(x_, y_)
    y = clustering.fit_predict(x)

    # The QLINK samples are the first len(QLINK_URLS) rows.
    n_qlinks = len(QLINK_URLS)
    qlink_labels = y[:n_qlinks]
    quota = {}
    for c in np.unique(y):
        share = np.sum(qlink_labels == c) / float(n_qlinks)
        quota[c] = int(share * QUOTA * 2)

    algos[qlinks[0].netloc] = {
        "clustering": clustering,
        "quota": quota,
        "sel": sel,
        "vect": v,
        "total_quota": QUOTA,
    }
def clusteringBirch(X, nclusters, paramlist):
    """Return the BIRCH cluster labels of ``X`` for ``nclusters`` clusters.

    ``paramlist`` is accepted but not used; threshold 0.5 and branching
    factor 50 are fixed.
    """
    settings = {
        'threshold': 0.5,
        'branching_factor': 50,
        'n_clusters': nclusters,
        'compute_labels': True,
        'copy': True,
    }
    return Birch(**settings).fit_predict(X)
def main():
    """Load ``dataset.txt``, cluster it into seven BIRCH clusters and plot the result."""
    points = convert_to_int(load_input('dataset.txt'))
    model = Birch(
        branching_factor=50,
        n_clusters=7,
        threshold=0.5,
        compute_labels=True,
    )
    assignments = model.fit_predict(points)
    plot_points(assignments, points)
def birch(data,threshold,branching_factor):
    """Cluster ``data`` with BIRCH and report the silhouette score.

    :param data: samples to cluster.
    :param threshold: BIRCH ``threshold``.
    :param branching_factor: BIRCH ``branching_factor``.
    :return: ``(model, labels, score)``: the fitted model, the label of
        every sample and the ``sil_score`` of that labelling, which is also
        printed.
    """
    db = Birch(threshold=threshold, branching_factor=branching_factor)
    # fit_predict fits the model itself; the separate fit() call that used
    # to precede it only built the same tree twice.
    pred = db.fit_predict(data)
    score = sil_score(data, pred)
    print(score)
    return db, pred, score
def hyper_birch(args):
    """Objective for the hyper-parameter search: negated silhouette of BIRCH.

    ``args`` supplies ``'threshold'`` and ``'branching_factor'``.  The model
    is fitted on the module-level ``basic_data`` while the labels are scored
    against the module-level ``all_data``.
    NOTE(review): clustering and scoring use different data sets --
    presumably the same rows with different feature sets; confirm this is
    intended.
    """
    global basic_data
    global all_data
    model = Birch(
        threshold=args['threshold'],
        branching_factor=int(args['branching_factor']),
    )
    assignments = model.fit_predict(basic_data)
    return -sil_score(all_data, assignments)
def birch(test_arr, testDt_List, T, B):
    """Cluster ``test_arr`` with BIRCH and pair every sample tag with its label.

    :param test_arr: samples to cluster.
    :param testDt_List: one tag per sample, in sample order.
    :param T: BIRCH ``threshold`` (may need tuning).
    :param B: BIRCH ``branching_factor``.
    :return: list of ``(tag, label)`` tuples, one per sample.
    """
    cluster = Birch(n_clusters=None, threshold=T, branching_factor=B)
    y = cluster.fit_predict(test_arr)
    print(y)
    # Pair every sample with its label.  The old index loop ran over
    # range(1, n) with i - 1 and therefore dropped the last sample.
    return list(zip(testDt_List, y))
def getOptimalClustersSilhoutte(data,
                                algorithm=ClusteringAlgorithm.skLearnKMeans):
    """Pick the clustering setting with the best silhouette score.

    * ``customKMeans``: tries 2..19 clusters with the project's spherical
      k-means; failing runs are skipped.  Returns
      ``(cluster_count, rotation)`` of the best run.
    * ``skLearnKMeans`` (default): tries 2..19 clusters with scikit-learn's
      ``KMeans``.  Returns the best cluster count.
    * ``skLearnBirch``: tries thresholds 0.02..0.99 with ``n_clusters=None``;
      scores are keyed by the resulting number of clusters, a later
      threshold replacing an earlier one with the same count.  Returns the
      threshold that produced the best-scoring count.

    Raises ``IndexError`` when no configuration could be scored.
    """
    silhoutteScores = {}
    rotationStored = {}
    thresholdValues = {}

    if algorithm == ClusteringAlgorithm.customKMeans:
        for clusterKmeansNumber in range(2, 20):
            try:
                clf = kMeans.K_Means(clusterKmeansNumber,
                                     tolerance=0.00001,
                                     max_iterations=800)
                rotation = randamozieSeed(data, clusterKmeansNumber)
                clf.fit(data, spherical=True, rotationArray=rotation)
                labels = clf.getLabels(data)
                silhouette_avg = silhouette_score(data, labels)
            except Exception:
                # Best effort: skip configurations that fail, but no longer
                # swallow KeyboardInterrupt/SystemExit as bare except did.
                continue
            silhoutteScores[clusterKmeansNumber] = silhouette_avg
            rotationStored[clusterKmeansNumber] = rotation
    elif algorithm == ClusteringAlgorithm.skLearnKMeans:
        for clusterKmeansNumber in range(2, 20):
            clf = KMeans(n_clusters=clusterKmeansNumber)
            labels = clf.fit_predict(data)
            silhoutteScores[clusterKmeansNumber] = silhouette_score(
                data, labels)
    elif algorithm == ClusteringAlgorithm.skLearnBirch:
        for i in range(2, 100):
            threshold = 0.01 * i
            brc = Birch(branching_factor=50,
                        n_clusters=None,
                        threshold=threshold,
                        compute_labels=True)
            labels = brc.fit_predict(data)
            print(len(labels))
            try:
                silhouette_avg = silhouette_score(data, labels)
            except Exception:
                # Labelings that cannot be scored are skipped.
                continue
            clusterNumber = len(set(labels))
            silhoutteScores[clusterNumber] = silhouette_avg
            thresholdValues[clusterNumber] = threshold

    sortedSil = sorted(silhoutteScores.items(), key=itemgetter(1))
    selectedClusterNumber = sortedSil[-1][0]
    print("selected number of clusters=", selectedClusterNumber)

    if algorithm == ClusteringAlgorithm.customKMeans:
        return (selectedClusterNumber, rotationStored[selectedClusterNumber])
    # These two branches used to be swapped: BIRCH got a cluster count back
    # and the default k-means path always raised KeyError on the empty
    # threshold table.
    if algorithm == ClusteringAlgorithm.skLearnBirch:
        return thresholdValues[selectedClusterNumber]
    return selectedClusterNumber
def find_anomalous_edges(self):
    """Record every edge whose elapsed times fall into more than one BIRCH cluster.

    For each edge in ``self.edges`` the ``elapsedTime`` values of the
    matching rows of ``self.trace_data`` are normalised as one row, reshaped
    to a column and clustered with ``n_clusters=None``.  The threshold is
    0.05 when ``self.take_minute_averages_of_trace_data`` is set, otherwise
    0.001.  An edge that yields more than one label is stored in
    ``self.anomalous_edges`` under the second ``'-'``-separated part of its
    name.
    """
    for edge in self.edges:
        matching_rows = self.trace_data[self.trace_data.path == edge]
        durations = np.array(list(matching_rows['elapsedTime']))
        column = preprocessing.normalize([durations]).reshape(-1, 1)

        threshold = (0.05 if self.take_minute_averages_of_trace_data
                     else 0.001)
        model = Birch(branching_factor=50,
                      n_clusters=None,
                      threshold=threshold,
                      compute_labels=True)
        model.fit_predict(column)

        if np.unique(model.labels_).size > 1:
            target = edge.split('-')[1]
            self.anomalous_edges[target] = edge
def julei(word, weight):
    """Cluster ``weight`` into three BIRCH clusters and write word/label pairs to disk.

    One ``"<word> <label>"`` line per sample is written to each of the first
    14 paths of the module-level ``file3``; every file receives the same
    content.

    :param word: sequence with one word per row of ``weight``.
    :param weight: feature matrix to cluster.
    :return: the cluster label of every row.
    """
    clusterer = Birch(n_clusters=3)
    y = clusterer.fit_predict(weight)
    print(y)
    print(y.shape)

    # The content is identical for every file, so it is built once.
    lines = [word[j] + " " + str(y[j]) + "\n" for j in range(len(y))]
    for i in range(14):
        # The context manager closes the file even if a write fails.
        with open(file3[i], 'w+') as f2:
            f2.writelines(lines)
    return y
def build_model(df, cluster_type="kmeans", seed=1):
    """Cluster the rows of ``df`` and pair every row index with its cluster id.

    ``cluster_type`` selects the model: ``"birch"``, ``"minibatch"``
    (MiniBatchKMeans), ``"em"`` (``mixture.GMM``), ``'lda'`` (hard assignment
    to the most probable topic) or anything else for plain KMeans.  All of
    them use ``N_CLUSTERS`` clusters; ``seed`` is passed as ``random_state``
    where the model is given one.

    Side effects: adds a ``"distance_from_cluster"`` column to ``df`` and
    prints a summary line with the cluster sizes.

    Returns ``zip(df.index, labels)`` with the labels converted to strings.
    """
    if cluster_type == "birch":
        model = Birch(n_clusters=N_CLUSTERS)
        res = model.fit_predict(df)
    elif cluster_type == "minibatch":
        model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    elif cluster_type == "em":
        # NOTE(review): mixture.GMM only exists in old scikit-learn releases.
        model = mixture.GMM(n_components=N_CLUSTERS)
        model.fit(df)
        res = model.predict(df)
    elif cluster_type == 'lda':
        model = lda.LDA(n_topics=N_CLUSTERS, n_iter=1500, random_state=seed)
        data_to_cluster = np.array(df).astype(int)
        lda_res = model.fit_transform(data_to_cluster)
        res = []
        for i in lda_res:
            # For now do hard clustering: take the highest probability.
            res.append(i.argmax())
    else:
        model = KMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    # Distance of every row to the centre of its assigned cluster.
    # NOTE(review): ``clusters_centers`` is not assigned anywhere in this
    # function, so it must come from module scope or this raises NameError;
    # ``model.labels_`` is also not something every branch above obviously
    # provides.  Confirm whether this section was meant to run only for the
    # k-means models (e.g. with ``model.cluster_centers_``).
    df_array = np.array(df)
    dis_dict = {}
    for i in range(N_CLUSTERS):
        dis_dict[i] = clusters_centers[i]
    all_dist = []
    for line_idx in range(len(df_array)):
        label = model.labels_[line_idx]
        dist = calc_distance(df_array[line_idx],dis_dict[label])
        all_dist.append(dist)
    df["distance_from_cluster"] = all_dist
    #clusters = model.labels_.tolist()
    #print ("clusters are:",clusters)
    print(""">>>> model is: %s, # of clusters:%s, and %s""" %(cluster_type,N_CLUSTERS,Counter(res)))
    # Labels are returned as strings.
    res = [str(i) for i in res]
    docs_clusteres = zip(df.index,res)
    return docs_clusteres
def make_birch_clustering(self, short_filenames, input_texts):
    """Cluster the documents with BIRCH, then report, save and plot the result.

    Output goes to the ``birch/`` sub-directory of ``self.output_dir``, which
    is created when missing.  Depending on ``self.need_tf_idf`` and
    ``self.need_tf_idf_formula`` the TF-IDF tables are written first.  The
    texts are vectorised with ``CountVectorizer``, reduced to two dimensions
    (``TruncatedSVD`` followed by ``Normalizer``) and clustered with the
    BIRCH parameters stored on ``self``.  The per-cluster document listing
    is emitted through ``self.signals.PrintInfo``, written to
    ``clusters.txt`` and passed on to ``self.draw_clusters_plot``.

    :param short_filenames: display name of every document, in input order.
    :param input_texts: the document texts.
    """
    emit = self.signals.PrintInfo.emit

    output_dir = self.output_dir + 'birch/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        emit("Расчет TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        emit(self.calculate_and_write_tf_idf(idf_filename, input_texts,
                                             self.tf_idf_norm,
                                             self.tf_idf_is_smooth_idf,
                                             self.tf_idf_sublinear_tf))

    if self.need_tf_idf_formula:
        emit("Расчет TF-IDF по формуле на изображении...")
        idf_filename = output_dir + 'tf_idf_formula.csv'
        emit(self.calculate_and_write_tf_idf_formula(idf_filename,
                                                     input_texts))

    # Bag of words -> two latent dimensions -> unit-length rows.
    term_counts = CountVectorizer().fit_transform(input_texts)
    lsa = make_pipeline(TruncatedSVD(2), Normalizer(copy=False))
    X = lsa.fit_transform(term_counts)

    birch = Birch(threshold=self.birch_threshold,
                  branching_factor=self.birch_branching_factor,
                  n_clusters=self.birch_clusters_count)
    predict_result = birch.fit_predict(X)

    emit('\nПрогноз по документам:\n')

    # One section per cluster id: header, member documents, blank line.
    sections = []
    for claster_index in range(max(predict_result) + 1):
        members = [
            ' ' + str(document) + '\n'
            for predict, document in zip(predict_result, short_filenames)
            if predict == claster_index
        ]
        sections.append('Кластер ' + str(claster_index) + ':\n' +
                        ''.join(members) + '\n')
    clasters_output = ''.join(sections)

    emit(clasters_output)
    emit('Сохранено в:' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')
    self.draw_clusters_plot(X, predict_result, short_filenames)
def BIR(data_matrix, C=None, model_path=None):
    '''
    Cluster the rows of the input matrix with BIRCH.

    :param data_matrix: input matrix
    :param C: number of clusters (passed to Birch as n_clusters)
    :param model_path: when not None, the fitted model is dumped to this
        path with joblib
    :return: the cluster label of every row
    '''
    model = Birch(n_clusters=C)
    assignments = model.fit_predict(data_matrix)
    if model_path is None:
        return assignments
    joblib.dump(value=model, filename=model_path)
    return assignments
def Bir(data, Data_for_Cluster, k, threshold, branching_factor):
    """Cluster ``Data_for_Cluster`` with BIRCH and store the labels in ``data``.

    Parameter choices tried for BIRCH:
    ``k`` in 4..15 or None, ``threshold`` in 0.5/0.3/0.1,
    ``branching_factor`` in 50/20/10.

    :param data: DataFrame that receives the ``'clustering'`` column; it is
        modified in place and also returned.
    :param Data_for_Cluster: feature matrix with one row per row of ``data``,
        in the same order.
    :param k: number of clusters.
    :param threshold: BIRCH ``threshold``.
    :param branching_factor: BIRCH ``branching_factor``.
    :return: ``data`` with the added ``'clustering'`` column.
    """
    print('Bir', '聚类数:', k, 'threshold:', threshold, 'branching_factor:',
          branching_factor)
    Birmod = Birch(n_clusters=k,
                   threshold=threshold,
                   branching_factor=branching_factor)
    pred = Birmod.fit_predict(Data_for_Cluster)
    # Assign by position.  The old per-row loop used every index *label* as
    # a position into ``pred``, which is only right for a default 0..n-1
    # index, and it stored the integer labels as floats.
    data['clustering'] = pred
    return data
def cluster_birch(n_clusters):
    """
    BIRCH clustering of the PCA-processed feature vectors.

    The vectors are loaded from ``../data/feature_vector_pca.csv``, the
    Calinski-Harabasz score and the labels are printed, and the labels are
    handed to ``data_labeled_to_csv`` for ``data/data_labeld_birch.csv``.

    :param n_clusters: number of clusters (the original note calls it the
        number of centroids)
    :return: None
    """
    data = get_data("../data/feature_vector_pca.csv")
    birch = Birch(n_clusters=n_clusters, threshold=0.4, branching_factor=50)
    clusters = birch.fit_predict(data)

    # scikit-learn renamed the misspelled ``calinski_harabaz_score`` and later
    # removed the old name; prefer the new one, fall back for old installs.
    # (assumes ``metrics`` is sklearn.metrics -- confirm against the imports)
    score_fn = getattr(metrics, 'calinski_harabasz_score', None)
    if score_fn is None:
        score_fn = metrics.calinski_harabaz_score

    print("Calinski-Harabasz Score", score_fn(data, clusters))
    print("每个样本点所属类别索引", clusters)
    data_labeled_to_csv(clusters, "data/data_labeld_birch.csv")
def birch_cluster(init_ds,ts_flag = False):
    '''
    Label every row of a 2D list with its BIRCH cluster.

    Parameters:
        init_ds - 2D list of data
        ts_flag - boolean specifying if the first column of init_ds is a
                  datetime object or not; when set, that column is removed
                  before clustering and is not part of the returned rows

    Returns:
        2D list with additional column denoting which cluster said row falls into
    '''
    rows = [row[1:] for row in init_ds] if ts_flag else init_ds
    assignments = Birch().fit_predict(rows)
    return [row + [label] for row, label in zip(rows, assignments)]
def birch(filename,output,ktype):
    """
    Use BIRCH cluster training.

    NOTE(review): the preprocessing that used to build the training input
    (word2vec, k-means, Simhash) is commented out below, and none of the
    parameters is read by the remaining code.
    """
    pass
    # model, word_vectors = w2v(filename)
    # n_words = word_vectors.shape[0]
    # vec_size = word_vectors.shape[1]
    # # K means training
    # kmeans = KMeans(n_clusters= ktype, n_jobs=-1, random_state=0)
    # idx = kmeans.fit_predict(word_vectors)
    # # Use Simhash
    # word_centroid_list = list(zip(model.wv.index2word, idx))
    # word_centroid_list_simhash = [(Simhash(get_features(item[0])).value,item[1]) for item in word_centroid_list]
    # Use BIRCH training
    # better: cf is 4, sample in cs is 20
    brc = Birch(branching_factor=50, n_clusters=None, threshold=0.5,compute_labels=True)
    # brc.fit(word_centroid_list_simhash)
    # NOTE(review): ``word_centroid_list_simhash`` is only assigned in the
    # commented-out code above, so unless a module-level variable of that
    # name exists this call raises NameError.  The labels are also discarded.
    brc.fit_predict(word_centroid_list_simhash)
def birch_sample(self):
    """Applies Birch and DBSCAN to the image.

    Not for consumer use.

    BIRCH (threshold ``self.eps_filter / 10``) first condenses ``self.xyz``
    into subcluster centres; DBSCAN then labels those centres.

    Side effects: sets ``self.birch_thr``, ``self.divide_labels`` (the BIRCH
    label of every row of ``self.xyz``) and ``self.dbs_samp_frac`` (the
    number of centres DBSCAN did not mark as noise, as a percentage of the
    product of the first two dimensions of ``self.img_original_reshape``).

    Returns
    -------
    tuple
        ``(lab_out, tmp_brc)``: the int32 DBSCAN label of every subcluster
        centre and the centres themselves.
    """
    self.birch_thr = self.eps_filter/10.
    brc = Birch(branching_factor=50, n_clusters=None,
                threshold=self.birch_thr, compute_labels=True)
    self.divide_labels = brc.fit_predict(self.xyz)
    tmp_brc = brc.subcluster_centers_

    # The fraction based on the raw centre count and the np.ones
    # placeholder were both overwritten before being read; they are gone.
    agal = DBSCAN(eps=self.eps_filter,
                  min_samples=self.min_samples_reduce,
                  algorithm='ball_tree', n_jobs=-1)
    lab_out = agal.fit_predict(tmp_brc).astype(np.int32)

    n_pixels = np.float64(self.img_original_reshape.shape[0]
                          * self.img_original_reshape.shape[1])
    # DBSCAN marks noise with -1; everything else counts as kept.
    self.dbs_samp_frac = 100.*np.sum(lab_out > -1)/n_pixels
    return lab_out, tmp_brc
print(data, citypos.shape) # KMeans km = KMeans(n_clusters=100, n_init=1) itime = time.perf_counter() kmlabels = km.fit_predict(citypos) etime = time.perf_counter() print ('K-means Time = ', etime-itime) # Minibatch Kmeans itime = time.perf_counter() mbkm = MiniBatchKMeans(n_clusters=100, batch_size=1000, n_init=1, max_iter=5000) mbkmlabels = mbkm.fit_predict(citypos) etime = time.perf_counter() print ('MB K-means Time = ', etime-itime) print('Similarity Km vs MBKm', adjusted_mutual_info_score(kmlabels, mbkmlabels)) # Birch itime = time.perf_counter() birch = Birch(threshold=0.02, n_clusters=100, branching_factor=100) birchlabels = birch.fit_predict(citypos) etime = time.perf_counter() print ('BIRCH Time = ',etime-itime) print('Similarity Km vs BIRCH',adjusted_mutual_info_score(kmlabels, birchlabels))
import numpy as np from sklearn.cluster import Birch import cluster import csv clusters = 20 submit_file = 'submit_birch.csv' X, plays = cluster.get_matrix() brc = Birch() X = np.array(X, dtype=float) plays = np.array(plays, dtype=float) # print X.shape print "Running Birch on training data...", brc = Birch(branching_factor=50, n_clusters=clusters, threshold=0.5, compute_labels=True) labels = brc.fit_predict(X) print "Done!" print labels # plays_sums = [0] * clusters # cluster_size = [0] * clusters plays_sums = {} # Median for idx, label in enumerate(labels): if label in plays_sums: plays_sums[label].append(plays[idx]) else: plays_sums[label] = [plays[idx]] # cluster_size[label] += 1
# BIRCH clustering of the standardised orbit/rate features for 2..9 clusters.
# (Python 2 script: print statements.)
np.random.seed(0)
# One column per feature of ``data_thr``.
X = np.c_[data_thr.orbit, data_thr.rate, data_thr.rateA, data_thr.rateB, data_thr.rateC, data_thr.rateCA]
# NOTE(review): no matching close() is visible in this part of the script.
Html_file = open("clustering_files/birch.html", "w")
scaler = StandardScaler()
X = scaler.fit_transform(X)
for n_clusters in range(2, 10):
    # The variable is called ``km`` but holds a Birch model.
    km = Birch(n_clusters=n_clusters)
    preds = km.fit_predict(X)
    # Distinct labels, then the number of samples per label.
    print "components:", set(preds)
    print np.bincount(preds)
    data_thr['preds'] = pd.Series(preds).astype("category")
    color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink", "brown", "green", "orange"] * 2 # Spectral9
    # color_key = color_key[:len(set(preds))+2]
    # single plot rateCA vs rate with predicted classes and ellipses:
    single_plot = bokeh_datashader_plot(data_thr, covs=None, means=None, x_name='rateCA',
# set up clustering algorithms
db = DBSCAN(eps=0.3, min_samples=5)
# Euclidean distance is the default.  The ``affinity`` keyword that used to
# spell it out has been removed from scikit-learn, so it is left out to keep
# this working on both old and new releases.
ac = AgglomerativeClustering(n_clusters=2, linkage='average')
#km = MiniBatchKMeans(n_clusters=2, random_state=1, n_init=15)
bc = Birch(n_clusters=2)
#sp = SpectralClustering(n_clusters=2, eigen_solver='arpack', random_state=1)
#bandwidth = estimate_bandwidth(X, quantile=0.3)
#ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
#ap= AffinityPropagation(damping=.9, preference=-200)

#y_km = km.fit_predict(X)
y_ac = ac.fit_predict(X)
utils.swap_label(y_ac)
y_bc = bc.fit_predict(X)
utils.swap_label(y_bc)
y_db = db.fit_predict(X)
# DBSCAN marks noise with -1; noise points are relabelled as 1.
y_db[y_db==-1] = 1
#print np.unique(y_db)
#y_sp = sp.fit_predict(X)
#y_ms = ms.fit_predict(X)
#y_ap = ap.fit_predict(X)

# Results to plot, keyed by algorithm name (insertion order is kept).
labels = {'AgglomerativeClustering':y_ac}
#labels['MiniBatchKMeans'] = y_km
labels['DBSCAN'] = y_db
labels['Birch'] = y_bc

# make plot about the clustering results
fig, axes = plt.subplots(3,len(labels), figsize=(17,10))