def get_new_features_by_clustering_features(self, n_clusters):
    """Use k-means to group features. As the features in each group can be
    considered similar, replace each group of "similar" features with its
    cluster centroid, which becomes a new feature."""
    clusterer = sk_KMeans(n_clusters=n_clusters).fit(self.X_df.values.T)
    return clusterer.cluster_centers_.T
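# Usage sketch (hedged): `FeatureReducer` is a hypothetical name for the class
# that owns get_new_features_by_clustering_features and keeps its data in
# self.X_df. Fitting on X_df.values.T clusters the *columns* (features), so
# the returned matrix has one centroid column per cluster.
import numpy as np
import pandas as pd

reducer = FeatureReducer()                              # hypothetical constructor
reducer.X_df = pd.DataFrame(np.random.rand(500, 100))   # 500 samples, 100 features
new_X = reducer.get_new_features_by_clustering_features(n_clusters=10)
assert new_X.shape == (500, 10)                         # samples x centroid features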
def __init__(self, path_to_data_csv, model_params, random_state=0):
    self._kmeans = sk_KMeans(
        n_clusters=model_params['num_clusters'],
        n_init=model_params['n_init'],
        random_state=random_state,
    )
    self.concat_data, self.labels, self.data_by_sample = \
        self._load_data(path_to_data_csv, model_params)
    # The l1 penalty requires a compatible solver (e.g. liblinear) on recent
    # scikit-learn versions; the very large C effectively disables regularization.
    self.logistic_regressor = LogisticRegression(penalty='l1', C=1e10,
                                                 solver='liblinear')
def test_k_means_python(benchmark, make_data):
    dataset, cluster_index = make_data
    model = sk_KMeans(n_clusters=3, init="random", algorithm="full",
                      max_iter=100, tol=1e-4, n_init=1)
    labels = benchmark(model.fit_predict, dataset)
    assert len(labels) == len(cluster_index)
def fit(x_, **kmeans_params_):
    if batch_version is False:
        try:
            model = sk_KMeans(**kmeans_params_).fit(x_)
        except MemoryError:
            warnings.warn(
                "A MemoryError occurred while running the non-batch version of "
                "k-means; falling back to MiniBatchKMeans. Make sure that all "
                "parameters provided via `kmeans_params` are accepted by the "
                "batch version and have the desired values.")
            model = sk_MiniBatchKMeans(**kmeans_params_).fit(x_)
    else:
        model = sk_MiniBatchKMeans(**kmeans_params_).fit(x_)
    return model.cluster_centers_, model.labels_
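# Usage sketch: `batch_version` is assumed to be a flag visible to fit() (it
# is referenced but not defined in the snippet above). With it set to False,
# a MemoryError in the full k-means triggers the MiniBatchKMeans fallback.
import numpy as np

batch_version = False                   # force the non-batch path for this demo
x = np.random.rand(1000, 8)
centers, labels = fit(x, n_clusters=5, n_init=10, random_state=0)
print(centers.shape, labels.shape)      # (5, 8) (1000,)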
def fit(self, X):
    """Fit the model on the training data.

    Args:
        X: training data
    """
    model = sk_KMeans(n_clusters=self.n_clusters).fit(X)
    self.model = model
    self.cluster_centers_ = model.cluster_centers_
    self.labels_ = model.labels_
    self.inertia_ = model.inertia_
    self.data = X
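# Usage sketch: `KMeansWrapper` is a hypothetical name for the class defining
# fit() above; it is assumed to take n_clusters in its constructor and to
# mirror the fitted sklearn attributes.
import numpy as np

wrapper = KMeansWrapper(n_clusters=3)   # hypothetical constructor
wrapper.fit(np.random.rand(200, 4))
print(wrapper.cluster_centers_.shape)   # (3, 4)
print(wrapper.inertia_)                 # within-cluster sum of squared distances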
def select_clusters(self, X, beginning_clusters=2, end_clusters=10):
    params = self.get_params()
    n_clusters = []
    inertia = []
    for i in range(beginning_clusters, end_clusters + 1):
        params['n_clusters'] = i
        k_means_model = sk_KMeans(**params)
        k_means_model.fit(X)
        n_clusters.append(i)
        inertia.append(k_means_model.inertia_)
    return n_clusters, inertia
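# Usage sketch: plot the (n_clusters, inertia) pairs returned by
# select_clusters as an elbow curve. `KMeansSelector` is a hypothetical name
# for the estimator defining select_clusters; its get_params() is assumed to
# return keyword arguments accepted by sk_KMeans.
import numpy as np
import matplotlib.pyplot as plt

selector = KMeansSelector()             # hypothetical constructor
ks, inertias = selector.select_clusters(np.random.rand(300, 5), 2, 10)
plt.plot(ks, inertias, marker="o")
plt.xlabel("n_clusters")
plt.ylabel("inertia")
plt.show()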
def execute(self):
    """Builds the clustering model."""
    self.model = sk_KMeans(n_clusters=self.n_clusters,
                           init=self.init,
                           n_init=self.n_init,
                           max_iter=self.max_iter,
                           tol=self.tol,
                           precompute_distances=self.precompute_distances,
                           verbose=self.verbose,
                           random_state=self.random_state,
                           copy_x=self.copy_x,
                           n_jobs=self.n_jobs,
                           algorithm=self.algorithm).fit(self.data)
    self.clusters = super().make_clusters(self.data, self.model.labels_)
def find_k(presentence_embedding, max_k=10):
    """Pick the elbow point via the silhouette score.

    https://stackoverflow.com/questions/19197715/scikit-learn-k-means-elbow-criterion
    """
    from sklearn.metrics import silhouette_score
    X = presentence_embedding
    p_y = []
    p_x = []
    for n_cluster in range(2, max_k):
        kmeans = sk_KMeans(n_clusters=n_cluster).fit(X)
        label = kmeans.labels_
        sil_coeff = silhouette_score(X, label, metric='euclidean')
        print("For n_clusters={}, The Silhouette Coefficient is {}".format(
            n_cluster, sil_coeff))
        p_y.append(sil_coeff)
        p_x.append(n_cluster)
    plt.figure()
    plt.plot(p_x, p_y)
    plt.xlabel("k")
    plt.ylabel("Silhouette Coefficient")  # the plotted metric is the silhouette score, not SSE
    plt.show()
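# Usage sketch for find_k: score k = 2..9 on synthetic blob "embeddings" and
# inspect the silhouette curve the function plots; the best k is the peak.
import numpy as np
from sklearn.datasets import make_blobs

embeddings, _ = make_blobs(n_samples=200, centers=4, n_features=16, random_state=0)
find_k(embeddings, max_k=10)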
def fit2(self, markings, user_ids, jpeg_file=None, debug=False):
    for n in range(1, len(markings)):
        kmeans = sk_KMeans(init='k-means++', n_clusters=n, n_init=10).fit(markings)
        labels = kmeans.labels_
        unique_labels = set(labels)
        # need to check if all clusters are either "clean" or noise
        clean = True
        for k in unique_labels:
            users = [ip for index, ip in enumerate(user_ids) if labels[index] == k]
            if len(users) < self.min_samples:
                continue
            # a "clean" - final - cluster contains each user at most once
            if len(set(users)) != len(users):
                clean = False
                break
        if clean:
            break
    print(n)
    return None, None, None
def serve(model):
    # the server construction is not in the original snippet; a standard
    # thread-pool gRPC server is assumed here
    grpc_server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    add_ClusteringServiceServicer_to_server(ClusteringService(model), grpc_server)
    # Start GRPC Server
    grpc_server.add_insecure_port('[::]:5001')
    grpc_server.start()
    # Keep application alive
    try:
        while True:
            time.sleep(60 * 60 * 24)
    except KeyboardInterrupt:
        grpc_server.stop(0)


if __name__ == "__main__":
    logging.basicConfig()
    n_clusters = 100
    (dataset, labels) = make_blobs(n_clusters)
    if os.getenv("RUST", None) is None:
        model = sk_KMeans(n_clusters=n_clusters, init="random",
                          algorithm="full", max_iter=100)
        model.fit(dataset)
        log(30, "Python model has been loaded")
    else:
        model = KMeans.load("data/rust_k_means_model.json")
        log(30, "Rust model has been loaded")
    serve(model)
def Pre_KMeans(new_text_list, tokenizer, model, n_cluster=10):
    presentence_embedding, text_list, _ = get_embedding_np(
        new_text_list, [], tokenizer, model)
    kmeans = sk_KMeans(n_clusters=n_cluster).fit(presentence_embedding)
    # print(kmeans)
    return kmeans.labels_
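# Usage sketch for Pre_KMeans (hedged): get_embedding_np is assumed to return
# a (n_sentences, dim) embedding matrix from a transformers tokenizer/model
# pair; the model name below is an assumption, not from the source.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
labels = Pre_KMeans(["first sentence", "second sentence"], tokenizer, model,
                    n_cluster=2)
print(labels)                           # one cluster id per input sentence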
def KMeans(vectors, n_clusters, max_iter):
    km = sk_KMeans(n_clusters=n_clusters, precompute_distances=False,
                   init='k-means++', max_iter=max_iter, n_init=1)
    predict = km.fit_predict(vectors)
    return predict
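# Usage sketch: the wrapper returns one cluster id per input row.
import numpy as np

vectors = np.random.rand(100, 16)
cluster_ids = KMeans(vectors, n_clusters=4, max_iter=100)
print(np.bincount(cluster_ids))         # cluster sizes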
def test_set():
    data = {}
    timer = Timer()
    for test in range(0, 1):
        path = './data/test' + str(test) + '.dot'
        print("Extracting data from : ", path)
        vertex_set, edge_set = dot_extract(path=path)
        print("Number Of Items : ", len(vertex_set.values()))
        for n_clusters in [3, 5]:
            print("NEW CLUSTER SIZE", n_clusters)
            for seed in range(10):
                print("CLUSTER SIZE", n_clusters, "Iteration : ", seed)

                print("Model 1")
                model_1 = KMeans(random_state=seed, n_clusters=n_clusters,
                                 n_init=10, max_iter=300)
                timer.start()
                k_1 = model_1.fit(vertex_set, edge_set)
                timer.stop()
                c_1_dist = evaluate_model(vertex_set, edge_set, k_1)
                l_1_1_norm = np.sum(c_1_dist)
                print("vertices chosen")
                print(k_1)
                print("centroid distance")
                print(c_1_dist)
                print("l1 norm")
                print(l_1_1_norm)
                print("time")
                print(timer.get_time())
                name = "Model_1_test" + str(test) + "_" + "f_" + str(n_clusters) + "_"
                data[name + "dist"] = data.get(name + "dist", []) + c_1_dist.tolist()
                data[name + "l1"] = data.get(name + "l1", []) + [l_1_1_norm]
                data[name + "t"] = data.get(name + "t", []) + [timer.get_time()]

                print("Model 2")
                model_2 = VoronoiFacilitySelection(random_state=seed, max_iter=300,
                                                   n_cells=n_clusters)
                timer.start()
                k_2 = model_2.fit(vertex_set, edge_set)
                timer.stop()
                c_2_dist = evaluate_model(vertex_set, edge_set, k_2)
                l_1_2_norm = np.sum(c_2_dist)
                print("vertices chosen")
                print(k_2)
                print("centroid distance")
                print(c_2_dist)
                print("l1 norm")
                print(l_1_2_norm)
                print("time")
                print(timer.get_time())
                name = "Model_2_test" + str(test) + "_" + "f_" + str(n_clusters) + "_"
                data[name + "dist"] = data.get(name + "dist", []) + c_2_dist.tolist()
                data[name + "l1"] = data.get(name + "l1", []) + [l_1_2_norm]
                data[name + "t"] = data.get(name + "t", []) + [timer.get_time()]

                print("Model 3")
                timer.start()
                model_3 = sk_KMeans(random_state=seed, n_clusters=n_clusters).fit(
                    list(vertex_set.values()))
                timer.stop()
                # snap each sklearn centroid to its nearest actual vertex
                NN = NearestNeighbors(n_neighbors=1, radius=0.0000000000001).fit(
                    np.array(list(vertex_set.values())))
                Y = NN.kneighbors(model_3.cluster_centers_, 1, return_distance=False)
                k_3 = [list(vertex_set.keys())[i[0]] for i in Y]
                c_3_dist = evaluate_model(vertex_set, edge_set, k_3)
                l_1_3_norm = np.sum(c_3_dist)
                print("vertices chosen")
                print(k_3)
                print("centroid distance")
                print(c_3_dist)
                print("l1 norm")
                print(l_1_3_norm)
                print("time")
                print(timer.get_time())
                name = "Model_3_test" + str(test) + "_" + "f_" + str(n_clusters) + "_"
                data[name + "dist"] = data.get(name + "dist", []) + c_3_dist.tolist()
                data[name + "l1"] = data.get(name + "l1", []) + [l_1_3_norm]
                data[name + "t"] = data.get(name + "t", []) + [timer.get_time()]

    # pprint(data)
    print("Writing to csv")
    for key, item in data.items():
        with open('data/' + key + '.csv', 'w+', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            writer.writerow([key])
            for d in item:
                writer.writerow([d])