def plot_network(n_clusters, subset_job, subset_edu, no_jobs, no_edu):
    """Cluster jobs and employment/education points and draw the linking network.

    Two independent KMeans models (``n_clusters`` each) are fitted, one on
    ``subset_job`` and one on ``subset_edu``. Each cluster's share of the
    total ``no_jobs`` / ``no_edu`` is computed with ``sum_cluster`` and handed,
    together with both sets of cluster centres, to ``min_span_tree``. Every
    non-zero entry ``result[i][j]`` of the returned matrix is drawn as a blue
    line between two hollow blue markers on top of the California base maps.

    Node indices below ``n_clusters`` refer to job centres; the remaining
    indices refer to employment/education centres.
    """
    plt.figure(figsize=(6, 8))

    job_kmeans = KMeans(n_clusters=n_clusters)
    job_predict = job_kmeans.fit_predict(subset_job)
    empl_edu_kmean = KMeans(n_clusters=n_clusters)
    empl_predict = empl_edu_kmean.fit_predict(subset_edu)

    # Fraction of the overall total that falls into each cluster.
    cluster_sum_jobs = []
    cluster_sum_employ_edu = []
    for cluster_id in range(n_clusters):
        edu_share = sum_cluster(empl_predict, cluster_id, no_edu) / sum(no_edu)
        cluster_sum_employ_edu.append(edu_share)
        job_share = sum_cluster(job_predict, cluster_id, no_jobs) / sum(no_jobs)
        cluster_sum_jobs.append(job_share)

    jobs_centres = job_kmeans.cluster_centers_
    emp_edu_centres = empl_edu_kmean.cluster_centers_
    result, all_coords = min_span_tree(jobs_centres, emp_edu_centres,
                                       cluster_sum_jobs,
                                       cluster_sum_employ_edu)

    city_labels()
    plot_california_counties()
    plot_california()

    def _centre_xy(node):
        # Map a combined node index onto the matching centre array.
        if node < n_clusters:
            centre = jobs_centres[node]
        else:
            centre = emp_edu_centres[node - n_clusters]
        return centre[0], centre[1]

    for src, row in enumerate(result):
        for dst, link in enumerate(row):
            if link == 0:  # NO LINK
                continue
            src_x, src_y = _centre_xy(src)
            dst_x, dst_y = _centre_xy(dst)
            plt.scatter(src_x, src_y, edgecolors='b', facecolors='none')
            plt.scatter(dst_x, dst_y, edgecolors='b', facecolor='none')
            plt.plot((src_x, dst_x), (src_y, dst_y), 'b-')
    plt.show()
def main(argv=None):
    """Cluster the points in ``args.data_points`` and store the results.

    Args:
        argv: Command-line arguments to parse; defaults to ``sys.argv[1:]``.

    The data is loaded with ``np.loadtxt``; when ``args.root`` is set the
    element-wise square root is taken first. The predicted cluster index of
    every point is written to ``args.outfile``, one per line. The fitted
    model and the cluster centres are additionally saved when ``args.model``
    / ``args.centroids`` are given.
    """
    if argv is None:
        argv = sys.argv[1:]
    args = parser.parse_args(argv)
    log.info('start parameters: %s', args)

    log.info('loading data')
    data = np.loadtxt(args.data_points)
    if args.root is not None:
        data = np.sqrt(data)
    (k, initial_points) = get_initial_centers(args.clusters, args.start_points)

    log.info('calculate center points')
    # Keyword arguments: scikit-learn >= 1.0 rejects positional arguments
    # after ``n_clusters``; the keyword form works on older releases too.
    kmeans = KMeans(n_clusters=k, init=initial_points, n_init=1,
                    max_iter=args.max_iter, copy_x=False)
    predict = kmeans.fit_predict(data)

    log.info('storing results')
    if args.model:
        save_object_to_file(kmeans, args.model)
    with utf8_file_open(args.outfile, 'w') as outfile:
        # Iterate the labels directly instead of ``xrange`` indexing, which
        # does not exist on Python 3.
        for label in predict:
            outfile.write(u'%d\n' % label)
    if args.centroids:
        np.savetxt(args.centroids, kmeans.cluster_centers_)
    log.info('finished')
def test_KMeansConstrained_parity_digits():
    """Unconstrained KMeansConstrained must reproduce plain KMeans.

    Both estimators are fitted on the iris data (despite the test name) with
    the same ``n_clusters`` and ``random_state``; labels, cluster centres and
    inertia are then compared.
    """
    X = datasets.load_iris().data
    k = 8
    random_state = 1

    # No restrictions and so should produce same result
    size_min = None
    size_max = None

    clf_constrained = KMeansConstrained(n_clusters=k,
                                        size_min=size_min,
                                        size_max=size_max,
                                        random_state=random_state)
    y_constrained = clf_constrained.fit_predict(X)

    clf_kmeans = KMeans(n_clusters=k, random_state=random_state)
    y_kmeans = clf_kmeans.fit_predict(X)

    assert_array_equal(y_constrained, y_kmeans)
    assert_almost_equal(clf_constrained.cluster_centers_,
                        clf_kmeans.cluster_centers_)
    assert_almost_equal(clf_constrained.inertia_, clf_kmeans.inertia_)
def main(argv=None):
    """Run KMeans on the points in ``args.data_points`` and persist the output.

    Args:
        argv: Command-line arguments to parse; defaults to ``sys.argv[1:]``.

    Steps: parse the arguments, load the data with ``np.loadtxt`` (taking the
    element-wise square root when ``args.root`` is set), fit KMeans from the
    centres returned by ``get_initial_centers``, then write one predicted
    cluster index per line to ``args.outfile``. The model and the cluster
    centres are saved as well when ``args.model`` / ``args.centroids`` are
    given.
    """
    if argv is None:
        argv = sys.argv[1:]
    args = parser.parse_args(argv)
    log.info('start parameters: %s', args)

    log.info('loading data')
    data = np.loadtxt(args.data_points)
    if args.root is not None:
        data = np.sqrt(data)
    (k, initial_points) = get_initial_centers(args.clusters, args.start_points)

    log.info('calculate center points')
    # Keyword arguments: scikit-learn >= 1.0 rejects positional arguments
    # after ``n_clusters``; the keyword form works on older releases too.
    kmeans = KMeans(n_clusters=k, init=initial_points, n_init=1,
                    max_iter=args.max_iter, copy_x=False)
    predict = kmeans.fit_predict(data)

    log.info('storing results')
    if args.model:
        save_object_to_file(kmeans, args.model)
    with utf8_file_open(args.outfile, 'w') as outfile:
        for label in predict:
            outfile.write('%d\n' % label)
    if args.centroids:
        np.savetxt(args.centroids, kmeans.cluster_centers_)
    log.info('finished')
def plot_employment_edu_cluster(n_clusters, no_edu, subset_edu, kmeans=None):
    """Scatter-plot ``subset_edu`` coloured by KMeans cluster.

    Args:
        n_clusters: Number of clusters to draw (and to fit when no model is
            supplied).
        no_edu: Per-point values whose per-cluster mean is shown in the legend.
        subset_edu: Points to cluster; element ``[0]`` is plotted on the x axis
            (labelled "Longitude") and ``[1]`` on the y axis ("Latitude").
        kmeans: Optional estimator to use instead of a fresh ``KMeans``; it is
            (re)fitted on ``subset_edu`` either way.
    """
    model = kmeans if kmeans else KMeans(n_clusters=n_clusters)
    empl_predict = model.fit_predict(subset_edu)

    plot_california()
    plot_california_counties()

    for cluster_id in range(n_clusters):
        cluster_scores = [
            no_edu[idx] for idx in range(len(no_edu))
            if empl_predict[idx] == cluster_id
        ]
        mean_employment_score = np.mean(cluster_scores)

        # Row positions of the points assigned to this cluster.
        members = [
            idx for idx in range(len(subset_edu))
            if empl_predict[idx] == cluster_id
        ]
        xs = [subset_edu[idx][0] for idx in members]
        ys = [subset_edu[idx][1] for idx in members]
        plt.scatter(
            xs, ys,
            label=f"Mean Employment Score:{mean_employment_score:.5f}",
            s=4.5)

    plt.legend()
    axes = plt.gca()
    axes.set_xlabel("Longitude")
    axes.set_ylabel("Latitude")
    plt.xlim((-120, -116))
    plt.ylim((33, 35))
    plt.axis('equal')
    plt.show()
def k_means_clustering(self, matrix):
    """Record the silhouette score of KMeans on ``matrix`` for k = 3..9.

    One ``[k, score]`` pair per candidate ``k`` is appended to
    ``self.cluster_number``. Returns ``self`` so calls can be chained.
    """
    for n_clusters in range(3, 10):
        model = KMeans(n_clusters=n_clusters)
        assignments = model.fit_predict(matrix)
        score = silhouette_score(matrix, assignments)
        self.cluster_number.append([n_clusters, score])
    return self
def perform_cluster(data, params):
    """Cluster the texts in ``data[1]`` and group ``data[0]`` by cluster label.

    Args:
        data: Indexable pair; ``data[1]`` holds the documents that are TF-IDF
            vectorised and clustered, ``data[0]`` holds the parallel items that
            are returned.
        params: Keyword parameters applied to the ``KMeans`` estimator via
            ``set_params``.

    Returns:
        Dict mapping each cluster label to the list of ``data[0]`` entries
        whose document received that label.
    """
    items, documents = data[0], data[1]

    km = KMeans()
    km.set_params(**params)
    vectorizer = TfidfVectorizer()
    print(data[1][0])

    labels = km.fit_predict(vectorizer.fit_transform(documents))

    result = dict((label, []) for label in set(labels))
    for position, label in enumerate(labels):
        result[label].append(items[position])
    return result
def simulation(n, n_clusters, k_range, dim, runs=100):
    """Score KMeans for a range of k on repeatedly generated blob data.

    Args:
        n: Number of samples per generated data set.
        n_clusters: Number of blob centres used to generate the data.
        k_range: ``(k_low, k_hi)`` inclusive range of cluster counts to try.
        dim: Number of features of the generated data.
        runs: Number of independent data sets to generate.

    Returns:
        A DataFrame with one row per (run, k) and the columns
        ``n``, ``n_clusters``, ``k``, ``dim`` and ``avg_score``.
        ``avg_score`` is the silhouette score, or ``-1`` when fitting or
        scoring fails with ``ValueError`` (for example ``k == 1``, for which
        the silhouette score is undefined).
    """
    all_data = []
    k_low, k_hi = k_range
    for _ in range(runs):
        # The generating labels are not needed; only predicted labels are scored.
        data, _ = make_blobs(n_samples=n, n_features=dim,
                             centers=n_clusters, cluster_std=0.1,
                             center_box=(-1.0, 1.0))
        for k in range(k_low, k_hi + 1):
            # Get a model specified, fit to data, score for error,
            # mark error as -1 if fails
            model = KMeans(n_clusters=k, random_state=0)
            try:
                avg_score = silhouette_score(data, model.fit_predict(data))
            except ValueError:
                # One failing k must not abort the whole simulation.
                avg_score = -1
            all_data.append([n, n_clusters, k, dim, avg_score])
    df = pd.DataFrame(all_data,
                      columns=['n', 'n_clusters', 'k', 'dim', 'avg_score'])
    return df
def bisection(max_k: int, data: np.ndarray) -> tree_node:
    """Build a bisecting k-means tree with ``max_k`` leaves.

    Starting from a single root that covers all of ``data``, the leaf with
    the largest sum of squared errors is repeatedly split in two with
    ``KMeans(2)`` until ``max_k`` leaves exist.

    Args:
        max_k: Number of leaf clusters to produce.
        data: 2-D array with one sample per row.

    Returns:
        The root ``tree_node``. Split nodes have ``split_order``,
        ``left_child`` and ``right_child`` set; ``_assign_leaf_ids`` is run on
        the root before it is returned.

    Raises:
        ValueError: If a 2-means split leaves one side without samples.
    """
    current_k = 1
    data_centroid = np.mean(data, 0)
    root = tree_node(0, data_centroid)
    root_sse = sum_square_error(data_centroid, data)
    next_split_order = 1
    next_node_id = 1

    # Entries are (-sse, node_id, node, rows). Negating the SSE makes the
    # min-queue return the worst leaf first. The unique node id breaks ties,
    # so equal SSE values never fall through to comparing nodes or arrays.
    queue = PriorityQueue()
    queue.put((-1.0 * root_sse, 0, root, data))

    while current_k < max_k:
        _, _, leaf_to_split, split_data = queue.get()
        leaf_to_split.split_order = next_split_order
        next_split_order += 1

        labels = np.asarray(KMeans(2).fit_predict(split_data))

        # Label 0 becomes the left child, label 1 the right child.
        for side, label_value in (('left_child', 0), ('right_child', 1)):
            child_data = split_data[labels == label_value, :]
            if child_data.shape[0] == 0:
                raise ValueError(
                    '2-means split produced an empty %s' % side)
            child = tree_node(next_node_id, np.mean(child_data, 0))
            setattr(leaf_to_split, side, child)
            child_sse = sum_square_error(child.centroid, child_data)
            queue.put((-1.0 * child_sse, next_node_id, child, child_data))
            next_node_id += 1

        current_k += 1  # it is only one leaf node more

    _assign_leaf_ids(root)
    return root
def plot_job_cluster(n_clusters, no_jobs, subset, kmeans=None):
    """Scatter-plot ``subset`` coloured by KMeans cluster over the county map.

    Args:
        n_clusters: Number of clusters to draw (and to fit when no model is
            supplied).
        no_jobs: Per-point values whose per-cluster mean is shown in the legend.
        subset: Points to cluster; element ``[0]`` is plotted on the x axis
            (labelled "Longitude") and ``[1]`` on the y axis ("Latitude").
        kmeans: Optional estimator to use instead of a fresh ``KMeans``; it is
            (re)fitted on ``subset`` either way.
    """
    model = kmeans if kmeans else KMeans(n_clusters=n_clusters)
    job_predict = model.fit_predict(subset)

    plot_california_counties()

    for cluster_id in range(n_clusters):
        cluster_jobs = [
            no_jobs[idx] for idx in range(len(no_jobs))
            if job_predict[idx] == cluster_id
        ]
        mean_jobs = np.mean(cluster_jobs)

        # Row positions of the points assigned to this cluster.
        members = [
            idx for idx in range(len(subset))
            if job_predict[idx] == cluster_id
        ]
        xs = [subset[idx][0] for idx in members]
        ys = [subset[idx][1] for idx in members]
        plt.scatter(xs, ys, label=f"Mean No. Jobs:{mean_jobs:.0f}", s=4.5)

    # city_labels()
    plt.legend()
    axes = plt.gca()
    axes.set_xlabel("Longitude")
    axes.set_ylabel("Latitude")
    plt.xlim((-120, -116))
    plt.ylim((33, 35))
    plt.axis('equal')
    plt.show()
def test_KMeansConstrained_parity_digits():
    """Unconstrained KMeansConstrained must match KMeans up to cluster order.

    Both estimators are fitted on the iris data (despite the test name) with
    identical hyper-parameters. Cluster sizes and sorted cluster centres are
    compared instead of raw labels.
    """
    X = datasets.load_iris().data
    k = 8
    random_state = 1

    # Hyper-parameters shared by both estimators.
    shared_params = dict(n_clusters=k, random_state=random_state,
                         init='k-means++', n_init=10, max_iter=300, tol=1e-4)

    # No restrictions and so should produce same result
    size_min = None
    size_max = None

    clf_constrained = KMeansConstrained(size_min=size_min, size_max=size_max,
                                        **shared_params)
    y_constrained = clf_constrained.fit_predict(X)

    # TODO: Testing requires scikit-learn to be pinned to v0.19 because of a
    # discrepancy in scikit-learn v0.22:
    # https://github.com/scikit-learn/scikit-learn/issues/16623
    clf_kmeans = KMeans(**shared_params)
    y_kmeans = clf_kmeans.fit_predict(X)

    # Each cluster should have the same number of datapoints assigned to it
    constrained_ndp = pd.Series(y_constrained).value_counts().values
    kmeans_ndp = pd.Series(y_kmeans).value_counts().values
    assert_almost_equal(constrained_ndp, kmeans_ndp)

    # Sort the cluster coordinates (otherwise in a random order)
    constrained_cluster_centers = sort_coordinates(
        clf_constrained.cluster_centers_)
    kmean_cluster_centers = sort_coordinates(clf_kmeans.cluster_centers_)
    assert_almost_equal(constrained_cluster_centers, kmean_cluster_centers)
arraylines = fr.readlines() label = [] for line in arraylines: label.extend(line.split(" ")) return label regionslabel = loadlabel("../imagelabel/0103468.regions.txt") layerslabel = loadlabel("../imagelabel/0103468.layers.txt") surfaceslabel = loadlabel("../imagelabel/0103468.surfaces.txt") imgData, row, col = loadData('../slic_segment/0103468.jpg') #加载数据 km = KMeans(n_clusters=5) #聚类获得每个像素所属的类别 originallabel = km.fit_predict(imgData) label = originallabel.reshape([row, col]) #创建一张新的灰度图以保存聚类后的结果 pic_new = image.new("L", (row, col)) #根据类别向图片中添加灰度值 for i in range(row): for j in range(col): pic_new.putpixel((i, j), int(256 / (label[i][j] + 1))) pbmlabel = [] pbmlabel.append(originallabel) result, pbmValue = computePBM(imgData, pbmlabel) print 'pbm值为%s' % pbmValue regionslabel = normalized_mutual_info_score(regionslabel, originallabel) layerslabel = normalized_mutual_info_score(layerslabel, originallabel) surfaceslabel = normalized_mutual_info_score(surfaceslabel, originallabel)
def main():
    """Cluster per-class feature files with k-means and save clusters and a map.

    Every ``.txt`` file found under ``working_dir`` is read as lines of
    space-separated ``feature value`` pairs and becomes one row of the data
    matrix. All rows are clustered with KMeans. The clusters are saved with
    ``saveClusters``, and a map of the rows whose file name is a recognised
    US state abbreviation is saved with ``saveMap``.
    """
    # --- CONFIGURATION ---
    num_clusters = 5  # Number of clusters
    # If True, clusters are assigned to the states at random; if False, the
    # clusters are actually computed.
    assign_randomly = False
    # The input directory, which has 1 file per class. Each file contains the
    # results of the linguistic ethnography tool.
    working_dir = "/home/jmaxk/proj/geoloc/cluster/fb1/"
    # --- END CONFIGURATION ---

    if assign_randomly:
        saveFiles = getSaveFiles(working_dir + 'results/random')
    else:
        saveFiles = getSaveFiles(working_dir + 'results/real')
    clusterFile = saveFiles[0]
    mapFile = saveFiles[1]

    featureIndeces = dict()  # feature name -> column index, shared by all files
    sparse_rows = []         # per file: {column index: value}
    line_counts = []         # per file: number of lines read
    row_states = []          # per file: state name, or None if not a state

    # Turn each file into a vector to be clustered.
    for root, _dirs, files in os.walk(working_dir):
        for f in files:
            fullpath = os.path.join(root, f)
            if os.path.splitext(fullpath)[1] != '.txt':
                continue
            with open(fullpath) as fp:
                lines = fp.readlines()

            row = {}
            for line in lines:
                featVals = line.split(' ')
                key = featVals[0]
                val = featVals[1]
                if key not in featureIndeces:
                    featureIndeces[key] = len(featureIndeces)
                row[featureIndeces[key]] = float(val)
            sparse_rows.append(row)
            line_counts.append(len(lines))

            # We only want to save actual states, but every row is clustered.
            # Keeping a placeholder for non-state files keeps names and
            # cluster labels aligned.
            abbr = os.path.basename(fullpath).split(".")[0]
            state = us.states.lookup(abbr)
            row_states.append(str(state.name) if state is not None else None)

    # Transform data into a numpy array. Column indices are global across
    # files, so the matrix is sized only after every feature is known. It is
    # never narrower than the historical per-file width (line count + 1).
    width = max([len(featureIndeces)] + [count + 1 for count in line_counts])
    data = numpy.zeros((len(sparse_rows), width))
    for row_index, row in enumerate(sparse_rows):
        for column, value in row.items():
            data[row_index, column] = value

    # Cluster with kmeans, and save the clusters.
    km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
                n_init=10, verbose=False)
    raw_results = km.fit_predict(data)
    results = {
        state: label
        for state, label in zip(row_states, raw_results)
        if state is not None
    }
    saveClusters(data, km, clusterFile)  # this doesn't work with random

    # Save the map.
    if assign_randomly:
        # NOTE(review): the bounds are hard-coded instead of derived from
        # num_clusters, and whether 5 is inclusive depends on which `randint`
        # is imported - confirm.
        map_results = dict()
        for key in results:
            map_results[key] = randint(0,5)
    else:
        map_results = results
    colors = genColors(map_results)
    saveMap(map_results, colors, mapFile)
### you'll want to change this line to ### for f1, f2, _ in finance_features: ### (as it's currently written, the line below assumes 2 features) # for f1, f2 in finance_features: # plt.scatter( f1, f2 ) # plt.show() for f1, f2, f3 in finance_features: # Clustering with 3 Features plt.scatter(f1, f2) plt.show() ### cluster here; create predictions of the cluster labels ### for the data and store them to a list called pred myClassifier = KMeans(n_clusters=2) pred = myClassifier.fit_predict(finance_features) print("type(pred) - {}\n".format(type(pred))) print("pred - {}\n".format(pred)) ### rename the "name" parameter when you change the number of features ### so that the figure gets saved to a different file try: Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2) except NameError: # print "no predictions object named pred found, no clusters to plot"
# Fit KMeansConstrained without size limits on uniform random data.
# NOTE(review): `seed`, `n_X`, `d` and `n_cluster` are not defined in this
# fragment - presumably set earlier in the script; confirm.
np.random.seed(seed=seed)
X = np.random.rand(n_X, d)
clf = KMeansConstrained(n_cluster, size_min=None, size_max=None, init='k-means++', n_init=10, max_iter=300, tol=1e-4, verbose=False, random_state=seed, copy_x=True, n_jobs=1)
y = clf.fit_predict(X)
# time = timeit('y = clf.fit_predict(X)', number=1, globals=globals())

# Second experiment: plain KMeans on the iris data. `X` and `y` from the
# experiment above are overwritten here.
from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd
iris = datasets.load_iris()
X = iris.data
k = 8
random_state = 1
clf_kmeans = KMeans(n_clusters=k, random_state=random_state, algorithm='full')
y = clf_kmeans.fit_predict(X)
# Count number of data points for each cluster and sort
ndp = pd.Series(y).value_counts().values