def km(tx, ty, rx, ry, add="", times=5):
    # this does the exact same thing as the above
    errs = []
    # cluster the true labels to get a reference clustering to compare against
    # (KMeans expects 2-D input, so reshape the label vector)
    checker = KM(n_clusters=2)
    checker.fit(ry.reshape(-1, 1))
    truth = checker.predict(ry.reshape(-1, 1))
    # so we do this a bunch of times
    for i in range(2, times):
        clusters = {x: [] for x in range(i)}
        clf = KM(n_clusters=i)
        clf.fit(tx)  # fit it to our data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set
        for index, val in enumerate(result):
            clusters[val].append(index)
        # map each cluster to the majority truth label of its members
        mapper = {x: round(sum(truth[v] for v in clusters[x]) / float(len(clusters[x])))
                  if clusters[x] else 0
                  for x in range(i)}
        # must be an ndarray, not a list, for the elementwise arithmetic below
        processed = np.array([mapper[val] for val in result])
        errs.append(sum((processed - truth) ** 2) / float(len(ry)))
    plot([0, times, min(errs) - .1, max(errs) + .1],
         [range(2, times), errs, "ro"],
         "Number of Clusters", "Error Rate",
         "KMeans clustering error", "KM" + add)
    # append the cluster assignments as an extra feature column
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)
    nn(newtx, ty, newrx, ry, add="onKM" + add)
def KMeansRatio(self):
    '''
    Type:   K-Means
    Y-axis: No Reaction
    X-axis: Reaction
    '''
    if self.authenticated:
        from sklearn.cluster import KMeans as KM
        algorithm = KM(n_clusters=2)
        categories = algorithm.fit_predict(self.allCoord)
        plt.scatter(self.allCoord[categories == 0, 0],
                    self.allCoord[categories == 0, 1], c="green")
        plt.scatter(self.allCoord[categories == 1, 0],
                    self.allCoord[categories == 1, 1], c="red")
        plt.scatter(algorithm.cluster_centers_[:, 0],
                    algorithm.cluster_centers_[:, 1], c="black", marker="*")
        for i, txt in enumerate(self.labels):
            plt.annotate(txt, (self.allCoord[i][0], self.allCoord[i][1]))
        plt.ylabel("NO REACTION")
        plt.xlabel("REACTION")
        plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
        plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
        plt.title("K-Means: Reaction, No Reaction")
        plt.show()
def KMeansPartPercent(self):
    '''
    Type:   K-Means
    Y-axis: % Reactions
    X-axis: # Reactions
    '''
    if self.authenticated:
        from sklearn.cluster import KMeans as KM
        algorithm = KM(n_clusters=2)
        # partPercent = np.array([np.array([x, percent]) for j in self.stuff for _, x, _, percent in j])
        categories = algorithm.fit_predict(self.partPercent)
        plt.scatter(self.partPercent[categories == 0, 0],
                    self.partPercent[categories == 0, 1], c="green")
        plt.scatter(self.partPercent[categories == 1, 0],
                    self.partPercent[categories == 1, 1], c="red")
        plt.scatter(algorithm.cluster_centers_[:, 0],
                    algorithm.cluster_centers_[:, 1], c="black", marker="*")
        for i, txt in enumerate(self.labels):
            plt.annotate(txt, (self.partPercent[i][0], self.partPercent[i][1]))
        plt.ylabel("PERCENT")
        plt.xlabel("NUM OF INFLAMS")
        plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
        plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
        plt.title("K-Means: # Reactions, % Reactions")
        plt.show()
def extractColors(stream, maxColor):
    # cluster the image's pixels and use the centroids as the color theme
    pixData = np.array(stream)
    h, w, d = pixData.shape
    data = np.reshape(pixData, (h * w, d))
    km = KM(n_clusters=maxColor)
    km.fit(data)
    theme = np.array(km.cluster_centers_, dtype=np.uint8).tolist()
    return getrgbAndHex(theme)
def cluster(self, inputs):
    t = time()
    helper._print('Training clusters (KMeans)...')
    kmeans = KM(n_clusters=self.num_clusters, init=self.cluster_init,
                max_iter=1000, tol=0.000001)
    cluster_pred = kmeans.fit_predict(inputs)
    helper._print(f'Done training clusters. Finished in {int((time() - t)/60)} minutes '
                  f'and {int((time() - t) % 60)} seconds!')
    return cluster_pred
def __init__(self, data, k, t, iter, maxE):
    # if t == 0:
    #     t = 'k-means++'
    # else:
    #     t = 'random'
    # Pass the arguments by keyword (the mapping below is inferred from the
    # parameter names); recent scikit-learn no longer accepts KMeans
    # parameters positionally.
    self.kmean = KM(n_clusters=k, init=t, max_iter=iter, tol=maxE).fit(np.array(data))
    self.labels = self.kmean.labels_
    self.clusters = self.kmean.cluster_centers_
def __init__(self, pixData, maxColor, useSklearn=True):
    super(KMeans, self).__init__()
    h, w, d = pixData.shape
    self.pixData = np.reshape(pixData, (h * w, d))
    self.maxColor = maxColor
    if useSklearn:
        self._KMeans = KM(n_clusters=maxColor)
    else:
        self._KMeans = KMDiy(n_clusters=maxColor)
def do_KM(self, letter, k, iter=200, dump=True, parr=-2):
    if type(letter) == str:
        l, v = self.find_by_start(letter)
        km_obj = KM(n_clusters=k, max_iter=iter, n_jobs=parr)
        results = km_obj.fit(v)
        if dump:
            filename = (DT.now().strftime('%d%m%y-%H%M%S')
                        + self.generate_filename() + '-k' + str(k))
            self.dump(l, results.labels_, filename)
        self.km_labels = l
        self.km_clusters = results.labels_
        print('Stored KM output in self.km_labels, self.km_clusters')
    if type(letter) == list:
        l, v = self.find_by_list(letter)
        km_obj = KM(n_clusters=k, max_iter=iter, n_jobs=parr)
        results = km_obj.fit(v)
        self.predicted_clusters = {}
        # use fresh loop names: reusing v and k here would clobber the data
        # and the cluster count (str(k) is still needed for the dump filename)
        for label, cluster in zip(l, results.labels_):
            self.predicted_clusters[label] = cluster
        self.pred_prep = sorted(self.predicted_clusters.items(), key=itemgetter(0))
        self.predicted = [i[1] for i in self.pred_prep]
        if dump:
            filename = (DT.now().strftime('%d%m%y-%H%M%S')
                        + self.generate_filename() + '-k' + str(k))
            self.dump(l, results.labels_, filename)
        self.km_labels = l
        self.km_clusters = results.labels_
        print('Stored KM output in self.km_labels, self.km_clusters')
def set_bin_ranges_for_property(
        self, property_to_entities: Dict[int, Set[Entity]],
        class_to_entities: Dict[int, Set[Entity]],
        property_to_timestamps: Dict[int, List[TimeStamp]],
        property_id: int):
    if not property_to_timestamps:
        self.load_property_to_timestamps(property_to_timestamps, property_id)
    property_values = np.array([
        ts.value for ts in property_to_timestamps[property_id]
    ]).reshape(-1, 1)
    # use the sorted cluster centroids as the cut points between bins
    kmeans = KM(n_clusters=self.bin_count - 1).fit(property_values)
    return sorted(centroid[0] for centroid in kmeans.cluster_centers_)
def findClusterCenters(data):
    # prepare data
    nonzero_xy = np.nonzero(data)
    if len(nonzero_xy) != 2:
        return
    if nonzero_xy[0].size < 8:
        return
    processed_data = np.vstack((nonzero_xy[0], nonzero_xy[1])).transpose()
    # K-Means algorithm
    kmeans = KM(n_clusters=8, max_iter=20, n_init=5)
    kmeans.fit(processed_data)
    # flip (row, col) centroids back to (x, y) order
    center_points = np.fliplr(kmeans.cluster_centers_)
    return center_points
def KMeans(self):
    return1 = self.printPoints()
    if not return1:
        return
    algorithm = KM(n_clusters=2)
    categories = algorithm.fit_predict(self.allCoord)
    print(self.allCoord)
    print(categories)
    plt.scatter(self.allCoord[categories == 0, 0],
                self.allCoord[categories == 0, 1], c="green")
    plt.scatter(self.allCoord[categories == 1, 0],
                self.allCoord[categories == 1, 1], c="red")
    plt.scatter(algorithm.cluster_centers_[:, 0],
                algorithm.cluster_centers_[:, 1], c="black", marker="*")
    print(len(self.labels), len(self.allCoord))
    for i, txt in enumerate(self.labels):
        plt.annotate(txt, (self.allCoord[i][0], self.allCoord[i][1]))
    plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
    plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
    plt.savefig("static/" + NAME)
    self.src = NAME
def KMeansPercentTotal(self):
    '''
    Type:   K-Means
    Y-axis: % Reactions
    X-axis: # Observations
    '''
    if self.authenticated:
        from sklearn.cluster import KMeans as KM
        algorithm = KM(n_clusters=2)
        fig = plt.figure()
        # partPercent = np.array([np.array([x, percent]) for j in self.stuff for _, x, _, percent in j])
        categories = algorithm.fit_predict(self.percentTotal)
        plt.scatter(self.percentTotal[categories == 0, 0],
                    self.percentTotal[categories == 0, 1], c="green")
        plt.scatter(self.percentTotal[categories == 1, 0],
                    self.percentTotal[categories == 1, 1], c="red")
        plt.scatter(algorithm.cluster_centers_[:, 0],
                    algorithm.cluster_centers_[:, 1], c="black", marker="*")
        for i, txt in enumerate(self.labels):
            plt.annotate(txt, (self.percentTotal[i][0], self.percentTotal[i][1]))
        plt.ylabel("PERCENT")
        plt.xlabel("TOTAL")
        plt.annotate("NO INFLAMMATION", algorithm.cluster_centers_[0])
        plt.annotate("CAUSES INFLAMMATION", algorithm.cluster_centers_[1])
        plt.title("K-Means: # Observations, % Reactions")
        # render the figure to an inline base64 <img> instead of plt.show()
        tmpfile = BytesIO()
        fig.savefig(tmpfile, format='png')
        encoded = base64.b64encode(tmpfile.getvalue())
        html = '<img src=\'data:image/png;base64,{}\'>'.format(
            encoded.decode("utf-8"))
        with open('KMeansPercentTotal.html', 'w') as f:
            f.write(html)
def ttsf(ID, CONTENTS):
    length = len(ID)
    wei = []
    for content in CONTENTS:
        # segment the text with jieba, then join with spaces for the vectorizer
        cut = jieba.cut(content, cut_all=False)
        words = ' '.join(cut)
        wei.append(words)
    vector = TfidfVectorizer()
    tfidf = vector.fit_transform(wei)
    d = tfidf.toarray()
    k = 3
    clf = KM(k)
    hehe = clf.fit_predict(d)
    # collect the IDs that land in the same cluster as the first document
    results = []
    for i in range(length):
        if hehe[i] == hehe[0] and i != 0:
            results.append(ID[i])
    print(results)
    return results
def passl_local_graph_partial(site, loc_param_indices, params):
    X = site.buff[loc_param_indices[0]]
    (K, rbf_sigma, local_graph_index, n_cluster, centers_index,
     point_cluster_index, inter_graph_index, member_id_index) = params
    # k-NN graph with RBF-weighted edges and zeroed diagonal
    nins = NN(K + 1, None, metric='euclidean').fit(X)
    W = nins.kneighbors_graph(nins._fit_X, K + 1, mode='distance')
    # W.data = W.data**2
    W.data = np.exp(-W.data**2 / rbf_sigma)
    W[np.diag_indices(W.shape[0])] = 0
    site.buff[local_graph_index] = W
    kins = KM(n_cluster)
    point_cluster = kins.fit_predict(X)
    site.buff[point_cluster_index] = point_cluster
    site.buff[centers_index] = kins.cluster_centers_
    # print(kins.cluster_centers_)
    site.buff[inter_graph_index] = {}
    member_id = []
    for i in range(n_cluster):
        member_id.append(np.where(point_cluster == i)[0])
        # print(member_id[-1])
    site.buff[member_id_index] = member_id
def cluster_list_k(self, k_list, data):
    data = sp.vstack(data, format='csr')
    self.k_index = 0
    self.labels_list = []
    self.RSS_list = []
    self.centroids_list = []
    ks = []
    for k in k_list:
        print('\n' + str(k) + ":", end=" ")
        # alternative hand-rolled path, disabled in favor of sklearn:
        # labels, RSS, cents = KMeans.cluster_with_k(k, data)
        sk = KM(k, n_init=1, max_iter=30).fit(data)
        self.labels_list.append(np.array(sk.labels_))
        self.centroids_list.append(sk.cluster_centers_)
        self.RSS_list.append(sk.inertia_)
        ks.append(k)
    plt.plot(ks, self.RSS_list)
    plt.savefig('./cluster_rss')
    plt.show()
    np.save("index" + file_post, [self.k_index])
    np.save("label" + file_post, self.labels_list)
    np.save("rss" + file_post, self.RSS_list)
    np.save("cent" + file_post, self.centroids_list)
def em(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    clf = EM(n_components=times)
    clf.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # centroids = clf.cluster_centers_
    # plt.scatter(centroids[:, 0], centroids[:, 1],
    #             marker='x', s=169, linewidths=3,
    #             color='w', zorder=10)
    plt.title(dataset + ': EM clustering (' + alg + '-reduced data)')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    clf = EM(n_components=times)
    clf.fit(tx)  # fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)
    checker = EM(n_components=times)
    ry = ry.reshape(-1, 1)
    checker.fit(ry)
    truth = checker.predict(ry)
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    # newtx = np.append(tx, td, 1)
    # newrx = np.append(rx, rd, 1)
    myNN(test, ty, result, ry, alg="EM_" + alg)

    errs = []
    scores = []
    # this is what we will compare to
    checker = EM(n_components=2)
    checker.fit(ry)
    truth = checker.predict(ry)
    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []
    # so we do this a bunch of times
    for i in range(2, times):
        clusters = {x: [] for x in range(i)}
        # create a clusterer
        clf = EM(n_components=i)
        clf.fit(tx)  # fit it to our data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set
        for index, val in enumerate(result):
            clusters[val].append(index)
        mapper = {x: round(sum(truth[v] for v in clusters[x]) / float(len(clusters[x])))
                  if clusters[x] else 0
                  for x in range(i)}
        # ndarray, not list, so the elementwise arithmetic below works
        processed = np.array([mapper[val] for val in result])
        errs.append(sum((processed - truth) ** 2) / float(len(ry)))
        scores.append(clf.score(tx, ty))
        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))

    # plot([0, times, min(scores)-.1, max(scores)+.1], [range(2, times), scores, "-"],
    #      "Number of Clusters", "Log Likelihood",
    #      dataset + ": EM Log Likelihood - " + alg, dataset + "_EM_" + alg)

    # other metrics
    # names = ["Adjusted Random", "V Measure", "Mutual Info", "Adjusted Mutual Info"]
    plt.figure()
    plt.title(dataset + ": EM Clustering measures - " + alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2, times), adj_rand, label="Adjusted Random")
    plt.plot(range(2, times), v_meas, label="V Measure")
    plt.plot(range(2, times), mutual_info, label="Fowlkes Mallows Score")
    plt.plot(range(2, times), adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.savefig("EMMetrics" + dataset + "_" + alg + ".png")

    kmeans = KM(n_clusters=2)
    kmeans.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    # Plot the decision boundary. For that, we will assign a color to each point.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title(dataset + ': EM clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
    categories=categories, shuffle=True, random_state=None,
    remove=('headers', 'footers'))

fp = open('Result/metrics.txt', 'w+')

# Question 1
data = dataset.data
target = dataset.target
data_matrix = tf_idf_matrix(data)
fp.write('tf_idf_matrix shape: {}\n'.format(data_matrix.shape))

# Question 2: collapse the categories into two classes and cluster
target = [1 if i > 3 else 0 for i in target]
cluster = KM(n_clusters=2, random_state=0, max_iter=1000, n_init=30)
cluster.fit(data_matrix)
predict_target = cluster.labels_
fp.write("Contingency table: \n{}\n".format(
    contingency_matrix(target, predict_target)))

# Question 3
metrics_dict = {
    "homogeneity_score": homogeneity_score,
    "completeness_score": completeness_score,
    "v_measure_score": v_measure_score,
    "adjusted_rand_score": adjusted_rand_score,
    "adjusted_mutual_info_score": adjusted_mutual_info_score
}
for metrics_name in metrics_dict.keys():
# SCALE THE DATA
scaler = SS()
scaler.fit(df)
scaled_data = scaler.transform(df)

# PCA
pca = PCA(n_components=2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

# K MEANS - MAKES A LOT OF SENSE TO APPLY THIS ON TOP OF PCA
from sklearn.cluster import KMeans as KM

model = KM(n_clusters=2)
model.fit(df)

fig, axes = plt.subplots(1, 2, figsize=(10, 6))
fig.suptitle('Breast Cancer', fontsize=20)
# left panel: the true diagnosis labels; right panel: the k-means clusters
# (the colorings were swapped relative to the titles in the original)
axes[0].set_title('Diagnosis')
axes[0].scatter(x_pca[:, 0], x_pca[:, 1], c=cancer['target'], cmap='coolwarm')
axes[1].set_title('KMC')
axes[1].scatter(x_pca[:, 0], x_pca[:, 1], c=model.labels_, cmap='coolwarm')
plt.show()
def km(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []
    inertia = []
    for i in range(2, times):
        clf = KM(n_clusters=i)
        clf.fit(tx)
        test = clf.predict(tx)
        result = clf.predict(rx)
        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))
        inertia.append(clf.inertia_)

    plt.figure()
    plt.title(dataset + ": KM Clustering measures - " + alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2, times), adj_rand, label="Adjusted Random")
    plt.plot(range(2, times), v_meas, label="V Measure")
    plt.plot(range(2, times), mutual_info, label="Fowlkes Mallows Score")
    plt.plot(range(2, times), adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.ylim(-0.05, 1.05)
    plt.savefig("KMeansMetric" + dataset + "_" + alg + ".png")

    plt.figure()
    plt.title(dataset + ": KMeans Inertia - " + alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.plot(range(2, times), inertia)
    plt.savefig("KM-Inertia-" + dataset + "-" + alg + ".png")

    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)

    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    # Plot the decision boundary. For that, we will assign a color to each
    # point of the 2-D projection. (The original used an undefined X; the
    # reduced_data argument is assumed to be what was meant.)
    X = reduced_data
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    best_clusterer = KM(n_clusters=4)
    best_clusterer.fit(X)
    Z = best_clusterer.predict(X)
    print(len(Z))
    print(len(X))
    plt.figure(1)
    plt.clf()
    colors = ['r', 'g', 'b', 'y', 'c', 'm', '#eeefff', '#317c15', '#4479b4',
              '#6b2b9c', '#63133b', '#6c0d22', '#0c7c8c', '#67c50e', '#c5670e',
              '#946c47', '#58902a', '#54b4e4', '#e4549e', '#2b2e85']
    for i in range(0, len(X)):
        plt.plot(X[i][0], X[i][1], marker='.', color=colors[Z[i]], markersize=2)
    # plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)
    # Plot the centroids as a black X
    centroids = best_clusterer.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='k', zorder=10)
    plt.title('K-means Clusters ' + alg)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    kmeans = KM(n_clusters=3)
    kmeans.fit(tx)
    result = pd.DataFrame(kmeans.transform(tx),
                          columns=['KM%i' % i for i in range(3)])
    my_color = pd.Series(ty).astype('category').cat.codes
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(result['KM0'], result['KM1'], result['KM2'],
               c=my_color, cmap="Dark2_r", s=60)
    plt.show()

    reduced_data = PCA(n_components=2).fit_transform(tx)
    kmeans = KM(n_clusters=4)
    kmeans.fit(reduced_data)
    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.title(dataset + ': K-means clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    checker = KM(n_clusters=2)
    ry = ry.reshape(-1, 1)
    checker.fit(ry)
    truth = checker.predict(ry)
    clusters = {x: [] for x in range(4)}
    clf = KM(n_clusters=4)
    clf.fit(tx)  # fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)  # and test it on the testing set
    for index, val in enumerate(result):
        clusters[val].append(index)
    mapper = {x: round(sum(truth[v] for v in clusters[x]) / float(len(clusters[x])))
              if clusters[x] else 0
              for x in range(4)}
    # ndarray, not list, so the elementwise arithmetic below works
    processed = np.array([mapper[val] for val in result])
    print(sum((processed - truth) ** 2) / float(len(ry)))

    clf = KM(n_clusters=times)
    clf.fit(tx)  # fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)
    checker = KM(n_clusters=times)
    checker.fit(ry)
    truth = checker.predict(ry)
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    # np.append needs the base array and an axis, as in the earlier function
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)
    myNN(test, ty, result, ry, alg="KM_" + alg)
    nn(newtx, ty, newrx, ry, add="onKM" + add)
        best_model = model
        best_error = model.error
    else:
        if best_error > model.error:
            best_error = model.error
            best_model = model

label = best_model.predict(X)
center = best_model.center

####### plot ######################
plt.figure(1)
plt.subplot(121)
plt.title('my KMeans')
plt.scatter(X[:, 0], X[:, 1], c=label)
plt.scatter(center[:, 0], center[:, 1], c='r', marker='+')

####### compare to scikit ########
from sklearn.cluster import KMeans as KM

km = KM(n_clusters=k)
km.fit(X)
plt.subplot(122)
plt.title('scikit KMeans')
plt.scatter(X[:, 0], X[:, 1], c=km.labels_)
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
            c='r', marker='+')
plt.show()
def __init__(self, imgPixels, K):
    self.imgPixels = imgPixels
    self.KM = KM(n_clusters=K, random_state=0).fit(self.imgPixels)
def kmtable(tx, ty, rx, ry, dataset=""):
    # n_components must be an int: tx[1].size / 2 is a float in Python 3,
    # so use floor division
    n_comp = tx[1].size // 2

    compressor = PCA(n_components=n_comp)
    compressor.fit(tx, y=ty)
    pcatx = compressor.transform(tx)
    pcarx = compressor.transform(rx)
    p = []

    compressor = ICA(n_components=n_comp)
    compressor.fit(tx, y=ty)
    icatx = compressor.transform(tx)
    icarx = compressor.transform(rx)
    ic = []

    compressor = RandomProjection(n_comp)
    compressor.fit(tx, y=ty)
    rptx = compressor.transform(tx)
    rprx = compressor.transform(rx)
    r = []

    compressor = best(k=n_comp)
    compressor.fit(tx, y=ty)
    kbtx = compressor.transform(tx)
    kbrx = compressor.transform(rx)
    k = []

    # score each dimensionality-reduction method by V-measure across k
    for i in range(2, 8):
        clf = KM(n_clusters=i)
        clf.fit(pcatx)
        result = clf.predict(pcarx)
        p.append(metrics.v_measure_score(ry.ravel(), result))

        clf = KM(n_clusters=i)
        clf.fit(icatx)
        result = clf.predict(icarx)
        ic.append(metrics.v_measure_score(ry.ravel(), result))

        clf = KM(n_clusters=i)
        clf.fit(rptx)
        result = clf.predict(rprx)
        r.append(metrics.v_measure_score(ry.ravel(), result))

        clf = KM(n_clusters=i)
        clf.fit(kbtx)
        result = clf.predict(kbrx)
        k.append(metrics.v_measure_score(ry.ravel(), result))

    plt.figure()
    plt.title(dataset + ": KM Clustering & DR")
    plt.xlabel('Number of clusters')
    plt.ylabel('V Measure Score value')
    plt.plot(range(2, 8), p, label="PCA")
    plt.plot(range(2, 8), ic, label="ICA")
    plt.plot(range(2, 8), r, label="RP")
    plt.plot(range(2, 8), k, label="KB")
    plt.legend()
    plt.ylim(-0.05, 0.5)
    plt.savefig("KM_DR_" + dataset + "_VM.png", dpi=300)
lower conversion price and more dilution to BankAmerica stock holders,
noted Daniel Williams, analyst with Sutro Group. Several analysts said
that while they believe the Brazilian debt problem will continue to hang
over the banking industry through the quarter, the initial shock reaction
is likely to ease over the coming weeks. Nevertheless, BankAmerica, which
holds about 2.70 billion dlrs in Brazilian loans, stands to lose 15-20
mln dlrs if the interest rate is reduced on the debt, and as much as 200
mln dlrs if Brazil pays no interest for a year, said Joseph Arsenio,
analyst with Birr, Wilson and Co. He noted, however, that any potential
losses would not show up in the current quarter.
'''

kmeans = KM(n_clusters=32, init='random', n_init=1, verbose=1)
kmeans.fit(features)
print(kmeans.labels_)

new_post_vector = vectorizer.transform([new_post_2])
new_post_label = kmeans.predict(new_post_vector)[0]
print("new posts label", new_post_label)

# collect the posts that fall in the same cluster as the new post
similar_indices = (kmeans.labels_ == new_post_label).nonzero()[0]
similar = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vector - features[i]).toarray())
if __name__ == '__main__':
    dir_from = '../drosophila_kc167_1_images'
    dir_to = '../KMeansResults'
    number_clusters = list(range(2, 70, 2))  # [2, 4, 6, 8, 10, 12, 14, 16]
    for file_name in os.listdir(dir_from):
        array_time = []
        path_file = dir_from + '/' + file_name
        name_without_extension = file_name[:-4]
        dir_save = dir_to + '/' + name_without_extension + '/'
        # os.mkdir(dir_save)
        data = KMeans(path_file).pixels
        sse = []
        for K in number_clusters:
            path_file = '../drosophila_kc167_1_images/CPvalid1_48_40x_Tiles_p0003DAPI.TIF'
            # label, center = KMeans(path_file, K).kmeans()
            km = KM(K)
            km.fit(data)
            sse.append(km.inertia_)
        # Plot sse against k
        plt.figure(figsize=(6, 6))
        plt.plot(number_clusters, sse, '-o')
        plt.xlabel(r'Number of clusters *k*')
        plt.ylabel('Sum of squared distance')
        plt.show()
        exit()
        # center = np.uint8(center)
        # res = center[label].reshape((512, 512, 3))
        # Image.fromarray(res, 'RGB').save(dir_save + str(K) + '.bmp')
X_train_people_nmf = nmf_decomp.transform(X_train_people)
X_test_people_nmf = nmf_decomp.transform(X_test_people)

nmf_decomp = NMF(n_components=5, init='nndsvd', random_state=37).fit(X_train_energy)
X_train_energy_nmf = nmf_decomp.transform(X_train_energy)
X_test_energy_nmf = nmf_decomp.transform(X_test_energy)

nmf_decomp = NMF(n_components=100, random_state=37).fit(X_train_mnist)
X_train_mnist_nmf = nmf_decomp.transform(X_train_mnist)
X_test_mnist_nmf = nmf_decomp.transform(X_test_mnist)
print('nmf')

# k-Means Clustering
from sklearn.cluster import KMeans as KM

km_cluster = KM(n_clusters=10, random_state=37).fit(X_train_people)
X_train_people_km = km_cluster.transform(X_train_people)
X_test_people_km = km_cluster.transform(X_test_people)

km_cluster = KM(n_clusters=10, n_init=5, random_state=37).fit(X_train_energy)
X_train_energy_km = km_cluster.transform(X_train_energy)
X_test_energy_km = km_cluster.transform(X_test_energy)

km_cluster = KM(n_clusters=10, random_state=37).fit(X_train_mnist)
X_train_mnist_km = km_cluster.transform(X_train_mnist)
X_test_mnist_km = km_cluster.transform(X_test_mnist)
print('km\n')

"""
##############################Classification##############################
"""
from sklearn.neighbors import KNeighborsClassifier as KNC
def __init__(self):
    self.ma1 = TF.fit_transform(ma)
    self.model = KM(6)
    self.res = self.model.fit_predict(self.ma1)
# Randomly drop a few points into the data to serve as centers, then compute
# each point's distance to those centers. Each center collects a boundary of
# nearest points; the center is then shifted to the mean position of its
# members, forming a new boundary. After several iterations the centers
# stabilize.
import pandas as pda
import numpy as np
from sklearn.cluster import KMeans as KM

if __name__ == '__main__':
    data = pda.read_csv('luqu.csv')
    # as_matrix() was removed from pandas; to_numpy() is the replacement
    x = data.iloc[:, 1:4].to_numpy()
    # n_jobs was removed from KMeans in scikit-learn 1.0, so it is dropped here
    km = KM(n_clusters=2)
    print(km.fit_predict(x))
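# The comment above describes Lloyd's iteration in words. The sketch below is
# a minimal from-scratch version of that loop (naive_kmeans is a hypothetical
# helper, not part of the original code): scatter random centers, assign each
# point to its nearest center, move each center to the mean of its members,
# and repeat until the centers stop moving.
import numpy as np

def naive_kmeans(x, n_clusters=2, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # drop the initial centers on randomly chosen data points
    centers = x[rng.choice(len(x), n_clusters, replace=False)]
    for _ in range(n_iter):
        # assign every point to its nearest center
        dists = np.linalg.norm(x[:, None, :] - centers[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # shift each center to the mean of its members (keep a center in
        # place if it ended up with no members)
        new_centers = np.array([
            x[labels == i].mean(axis=0) if np.any(labels == i) else centers[i]
            for i in range(n_clusters)
        ])
        if np.allclose(new_centers, centers):
            break  # the centers have stabilized
        centers = new_centers
    return labels, centers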
    for key, name in webcolors.css21_hex_to_names.items():
        r_c, g_c, b_c = webcolors.hex_to_rgb(key)
        rd = (r_c - rgb_[0]) ** 2
        gd = (g_c - rgb_[1]) ** 2
        bd = (b_c - rgb_[2]) ** 2
        min_colors[(rd + gd + bd)] = name
    return min_colors[min(min_colors.keys())]


# Construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("--image", required=True, help="Input Image Path")
ap.add_argument("--clusters", required=True, type=int, help="# of clusters")
args = vars(ap.parse_args())

image = cv2.imread(args["image"])
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Reshape the image to be a list of pixels
image = image.reshape((image.shape[0] * image.shape[1], 3))

# K-Means Clustering
clt = KM(n_clusters=args["clusters"])
clt.fit(image)

# Get dominant colors
colors = clt.cluster_centers_.astype("uint8").tolist()
for rgb in colors:
    color_name = get_color_name(rgb)
    print("Dominant color :", color_name)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One-hot encode target values
one_hot = OneHotEncoder()
y_train_hot = one_hot.fit_transform(y_train.reshape(-1, 1)).todense()
y_test_hot = one_hot.transform(y_test.reshape(-1, 1)).todense()
labels = y_train

dim = [2, 3, 4, 5]
km = KM(random_state=42)
gmm = GMM(random_state=42)

Score = defaultdict(list)
adjMI = defaultdict(list)
S_homog = defaultdict(list)
S_adjMI = defaultdict(list)
S_vm = defaultdict(list)

for i in dim:
    reduced_X = PCA(n_components=i, random_state=42).fit_transform(X_train_scaled)
    k = 30
    km.set_params(n_clusters=k)
    gmm.set_params(n_components=k)
    km.fit(reduced_X)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as KM

file = pd.read_csv("Mall_Customers.csv")
X = file.iloc[:, 3:5].values

# Elbow method: plot the inertia (cost) for k = 1..10
elbow = []
for i in range(10):
    km = KM(n_clusters=i + 1)
    km.fit(X)
    elbow.append(km.inertia_)
plt.plot(range(1, 11), elbow)
plt.xlabel("No. of Clusters")
plt.ylabel("Cost")
plt.show()

km = KM(n_clusters=5)
res = km.fit_predict(X)
colors = ["red", "blue", "green", "yellow", "silver"]
for i in range(5):
    plt.scatter(X[res == i, 0], X[res == i, 1], c=colors[i])
# hide the axes on the current figure; calling plt.axes() here would create
# a new, empty Axes on top of the scatter plot
ax = plt.gca()
ax.get_xaxis().set_visible(False)
ax.get_yaxis().set_visible(False)
plt.show()