def calc_anchors():
    """Compute anchors from the dataset's bounding-box statistics."""
    ratios = []
    sizes = []
    bbx = []
    root = '/media/hvt/95f846d8-d39c-4a04-8b28-030feb1957c6/dataset/充电宝/遮挡问题/core'
    for label_file_path in sorted(os.listdir(osp.join(root, 'Annotation'))):
        labels = pd.read_csv(osp.join(root, 'Annotation', label_file_path),
                             delimiter=' ', header=None).values
        for label in labels:
            _, name, xmin, ymin, xmax, ymax = label
            if name not in label_dict.keys() or ymax - ymin < 2:
                # print(label_file_path, label)
                continue
            ratios.append((xmax - xmin) / (ymax - ymin))
            sizes.append([xmax - xmin, ymax - ymin])
            bbx.append([xmin, ymin, xmax, ymax])
    # Cluster the box sizes to derive anchors
    cluster = KMeans(n_clusters=3).fit(sizes)
    # Rescale between the two input resolutions
    print(cluster.cluster_centers_ * (1333 / 2000))
    cluster = KMeans(n_clusters=2).fit(np.array(ratios).reshape(-1, 1))
    print(cluster.cluster_centers_)
    plt.hist(ratios, bins=10)
    plt.show()
    plt.hist2d(np.array(sizes)[:, 0], np.array(sizes)[:, 1], bins=20)
    plt.show()
def train_k_means_by_step(n_clusters, init_cluster_centers, x_array, eps):
    """Run KMeans one iteration at a time until the centers move less than eps."""
    # Note: n_jobs was removed from KMeans in scikit-learn 1.0; drop it there.
    prev_centers = init_cluster_centers
    clf = KMeans(init=prev_centers, n_clusters=n_clusters, n_init=1,
                 n_jobs=-1, tol=eps, max_iter=1)
    clf.fit(x_array)
    new_centers = clf.cluster_centers_
    centers_list = [prev_centers, new_centers]
    args = [1]
    values = [clf.inertia_]
    while get_distance(prev_centers, new_centers) > eps:
        prev_centers = new_centers
        clf = KMeans(init=prev_centers, n_clusters=n_clusters, n_init=1,
                     n_jobs=-1, tol=eps, max_iter=1).fit(x_array)
        new_centers = clf.cluster_centers_
        args.append(len(args) + 1)
        values.append(clf.inertia_)
        centers_list.append(new_centers)
    return args, values, centers_list
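# Usage sketch for train_k_means_by_step, assuming the older scikit-learn that
# still accepts n_jobs. get_distance is an external helper in this codebase;
# the definition and random data below are illustrative assumptions only.
import numpy as np

def get_distance(a, b):
    # Largest movement of any single center between two iterations.
    return np.max(np.linalg.norm(np.asarray(a) - np.asarray(b), axis=1))

rng = np.random.RandomState(0)
x = rng.rand(500, 2)
init = x[rng.choice(len(x), 4, replace=False)]
steps, inertias, _ = train_k_means_by_step(4, init, x, eps=1e-4)
plt.plot(steps, inertias, marker='o')
plt.xlabel('iteration')
plt.ylabel('inertia')
plt.show()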
def plot_network(n_clusters, subset_job, subset_edu, no_jobs, no_edu):
    plt.figure(figsize=(6, 8))
    job_kmeans = KMeans(n_clusters=n_clusters)
    job_predict = job_kmeans.fit_predict(subset_job)
    empl_edu_kmean = KMeans(n_clusters=n_clusters)
    empl_predict = empl_edu_kmean.fit_predict(subset_edu)
    cluster_sum_jobs, cluster_sum_employ_edu = [], []
    for i in range(n_clusters):
        cluster_sum_employ_edu.append(
            sum_cluster(empl_predict, i, no_edu) / sum(no_edu))
        cluster_sum_jobs.append(
            sum_cluster(job_predict, i, no_jobs) / sum(no_jobs))
    jobs_centres = job_kmeans.cluster_centers_
    emp_edu_centres = empl_edu_kmean.cluster_centers_
    result, all_coords = min_span_tree(jobs_centres, emp_edu_centres,
                                       cluster_sum_jobs,
                                       cluster_sum_employ_edu)
    city_labels()
    plot_california_counties()
    plot_california()

    def centre(idx):
        # The first n_clusters indices refer to job centres, the rest to edu centres.
        return (jobs_centres[idx] if idx < n_clusters
                else emp_edu_centres[idx - n_clusters])

    for i in range(len(result)):
        for j in range(len(result[i])):
            if result[i][j] == 0:  # no link
                continue
            plt.scatter(centre(i)[0], centre(i)[1],
                        edgecolors='b', facecolors='none')
            plt.scatter(centre(j)[0], centre(j)[1],
                        edgecolors='b', facecolors='none')
            plt.plot((centre(i)[0], centre(j)[0]),
                     (centre(i)[1], centre(j)[1]), 'b-')
    plt.show()
def plot_employment_edu_cluster(n_clusters, no_edu, subset_edu, kmeans=None):
    if not kmeans:
        kmeans = KMeans(n_clusters=n_clusters)
    empl_predict = kmeans.fit_predict(subset_edu)
    plot_california()
    plot_california_counties()
    for i in range(n_clusters):
        mean_employment_score = np.mean(
            [no_edu[j] for j in range(len(no_edu)) if empl_predict[j] == i])
        plt.scatter(
            [subset_edu[j][0] for j in range(len(subset_edu)) if empl_predict[j] == i],
            [subset_edu[j][1] for j in range(len(subset_edu)) if empl_predict[j] == i],
            label=f"Mean Employment Score:{mean_employment_score:.5f}",
            s=4.5)
    plt.legend()
    plt.gca().set_xlabel("Longitude")
    plt.gca().set_ylabel("Latitude")
    plt.xlim((-120, -116))
    plt.ylim((33, 35))
    plt.axis('equal')  # note: this may override the explicit xlim/ylim above
    plt.show()
def initial_kmeans(k, rand_state, data, reallabels):
    min_clusters, max_clusters = k_range(k)  # cluster-count range derived from the number of true labels
    bestAri_arr = []  # best ARI found for each cluster count
    kmeans_labels = []  # best partition found for a given cluster count
    kmeans_labels_arr = []  # best partition for each cluster count
    for clusters in range(min_clusters, max_clusters):
        bestAri = 0  # best ARI seen for this cluster count
        for i in range(ini_generation):
            y_kmeans = KMeans(n_clusters=clusters,
                              random_state=rand_state).fit_predict(data)
            kmeans_ari = adjusted_rand_score(reallabels, y_kmeans)
            if kmeans_ari > bestAri:
                bestAri = kmeans_ari
                kmeans_labels = y_kmeans
        bestAri_arr.append(bestAri)
        ind_kmeans = creator.Individual(kmeans_labels)
        kmeans_labels_arr.append(ind_kmeans)
    return kmeans_labels_arr, bestAri_arr
def k_means_clustering(self, matrix):
    # Record the silhouette score for each candidate cluster count.
    for k in range(3, 10):
        km = KMeans(n_clusters=k)
        self.cluster_number.append(
            [k, silhouette_score(matrix, km.fit_predict(matrix))])
    return self
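# Usage sketch for the method above: pick the k with the highest silhouette
# score. The _Demo host class and the blob data are illustrative assumptions;
# any object with an empty cluster_number list would do.
from sklearn.datasets import make_blobs

class _Demo:
    cluster_number = []
    k_means_clustering = k_means_clustering  # reuse the function defined above

X, _ = make_blobs(n_samples=200, centers=5, random_state=0)
demo = _Demo()
demo.k_means_clustering(X)
best_k, best_score = max(demo.cluster_number, key=lambda pair: pair[1])
print("best k:", best_k, "silhouette: %.3f" % best_score)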
def rsnn(sampledData, remainedData, sampledIndex, remainedIndex, singleName):
    predicted_labelAll = []
    for i in range(len(sampledData)):
        clusters = random.randint(2, 11)  # random cluster count (stdlib randint is inclusive of both ends)
        if singleName == 'kmeans':
            predicted_label = KMeans(n_clusters=clusters).fit_predict(
                sampledData[i])
        elif singleName in ('ward', 'complete', 'average'):
            predicted_label = AgglomerativeClustering(
                linkage=singleName,
                n_clusters=clusters).fit_predict(sampledData[i])
        predicted_labelAll.append(predicted_label.tolist())  # predicted labels for the sampled datasets
    assinALLNnLabels = []  # all labels assigned via nearest neighbours
    # remainedData and sampledData hold the same number of datasets, so j can
    # range over the length of either.
    for j in range(len(remainedData)):
        assinNnLabels = []  # labels assigned via nearest neighbours
        for m in range(len(remainedData[j])):
            minDist = inf
            minindex = -1
            for k in range(len(sampledData[j])):
                distJI = distEclud(remainedData[j][m], sampledData[j][k])
                if distJI < minDist:
                    minDist = distJI
                    minindex = k
            assinNnLabels.append(predicted_labelAll[j][minindex])  # label each non-sampled point via its nearest sampled neighbour
        assinALLNnLabels.append(assinNnLabels)
    # Combine the two predicted label sets and their index sequences.
    combineIndex = []
    combinedLables = []
    for column in range(len(predicted_labelAll)):
        combineIndexOne = sampledIndex[column] + remainedIndex[column]
        combinedLablesOne = predicted_labelAll[column] + assinALLNnLabels[column]
        combineIndex.append(combineIndexOne)
        combinedLables.append(combinedLablesOne)
    # Sort the shuffled indices into ascending order.
    seqIndexAll = []
    for combineIndex1 in combineIndex:
        seqIndex = []
        for seq in range(len(sampledData[0]) + len(remainedData[0])):
            for elementIndex in range(len(combineIndex1)):
                if combineIndex1[elementIndex] == seq:
                    seqIndex.append(elementIndex)
        seqIndexAll.append(seqIndex)
    # Recover the labels for the combined sampledData and remainedData.
    finalLabel = []
    for finalIndex in range(len(combinedLables)):
        finallabelone = []
        for index in seqIndexAll[finalIndex]:
            finallabelone.append(combinedLables[finalIndex][index])
        finalLabel.append(finallabelone)  # final clustering result
    return finalLabel
def answer(test_path):
    import warnings
    warnings.filterwarnings("ignore")
    import time
    t0 = time.time()
    from learning import process_test_data, training_data, training_answers
    # Import from the public paths; sklearn.cluster.k_means_ and
    # sklearn.linear_model.logistic are deprecated private modules.
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LogisticRegression
    test_data = process_test_data(test_path)
    km = KMeans()
    km.fit(training_data, training_answers)
    myNum = km.predict(test_data).item()
    numX = [1, 2, 4, 2, 7, 0, 2, 7, 4, 3, 2, 1, 4, 5, 5, 1, 3, 0, 4, 2]
    numbers = [[num] for num in numX]
    letX = ['a', 'a', 'o', 'a', 'o', 'o', 'a', 'a', 'o', 'a',
            'a', 'o', 'a', 'o', 'o', 'o', 'a', 'a', 'o', 'a']
    lr = LogisticRegression()
    lr.fit(numbers, letX)  # y must be 1-D; the original wrapped each letter in a list
    ans = lr.predict([[myNum]]).item()  # predict expects a 2-D array, not a scalar
    t1 = time.time()
    return [ans, t1 - t0]
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))
    log.info('loading data')
    data = np.loadtxt(args.data_points)
    if args.root is not None:
        data = np.sqrt(data)
    (k, initial_points) = get_initial_centers(args.clusters, args.start_points)
    log.info('calculate center points')
    kmeans = KMeans(n_clusters=k, init=initial_points, n_init=1,
                    max_iter=args.max_iter, copy_x=False)
    predict = kmeans.fit_predict(data)
    log.info('storing results')
    if args.model:
        save_object_to_file(kmeans, args.model)
    with utf8_file_open(args.outfile, 'w') as outfile:
        for i in range(predict.shape[0]):  # xrange in the original (Python 2)
            outfile.write(u'%d\n' % predict[i])
    if args.centroids:
        np.savetxt(args.centroids, kmeans.cluster_centers_)
    log.info('finished')
def test_KMeansConstrained_parity_digits():
    iris = datasets.load_iris()
    X = iris.data
    k = 8
    random_state = 1
    # No size restrictions, so the constrained variant should match plain KMeans.
    size_min, size_max = None, None
    clf_constrained = KMeansConstrained(
        n_clusters=k,
        size_min=size_min,
        size_max=size_max,
        random_state=random_state)
    y_constrained = clf_constrained.fit_predict(X)
    clf_kmeans = KMeans(n_clusters=k, random_state=random_state)
    y_kmeans = clf_kmeans.fit_predict(X)
    assert_array_equal(y_constrained, y_kmeans)
    assert_almost_equal(clf_constrained.cluster_centers_,
                        clf_kmeans.cluster_centers_)
    assert_almost_equal(clf_constrained.inertia_, clf_kmeans.inertia_)
def k_way_spectral_clustering():
    x = np.load('q2data.npy')
    A = np.load('AMatrix.npy')
    WeightMatrix = np.zeros((16, 16))
    for i in range(16):
        for j in range(16):
            if A[i][j] == 1:
                WeightMatrix[i][j] = np.exp(-np.linalg.norm(x[i] - x[j]) ** 2)
            else:
                WeightMatrix[i][j] = 0
    DegreeMatrix = np.sum(WeightMatrix, axis=1)
    # Unnormalized Laplacian L = D - W. D must be a diagonal matrix; the
    # original subtracted the degree *vector*, which broadcasts incorrectly.
    L = np.diag(DegreeMatrix) - WeightMatrix
    DSquareRoot = np.diag(1.0 / np.sqrt(DegreeMatrix))
    # Normalized Laplacian L_norm = D^{-1/2} L D^{-1/2}
    Lnorm = DSquareRoot @ L @ DSquareRoot
    eigvals, eigvecs = np.linalg.eig(Lnorm)
    eigvecs = np.array(eigvecs, dtype=np.float64)
    sortedinds = eigvals.argsort()
    # Hand-picked eigenvector columns for this assignment's graph. Note that
    # the k-means fit below uses *all* eigenvectors; the textbook recipe would
    # use the k with the smallest eigenvalues (see the sketch after this function).
    eigvec1, eigvec2, eigvec3, eigvec4 = (eigvecs[:, 10], eigvecs[:, 11],
                                          eigvecs[:, 13], eigvecs[:, 14])
    kmeans = KMeans(n_clusters=3, init='random')
    kmeans.fit(eigvecs)
    components = kmeans.labels_
    return components
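# A minimal sketch of the standard k-way spectral clustering recipe, for
# contrast with the assignment-specific column choices above (an assumption,
# not what the course solution requires): embed each node using the k
# eigenvectors of L_norm with the smallest eigenvalues, then run k-means.
import numpy as np
from sklearn.cluster import KMeans

def spectral_labels(Lnorm, k=3):
    eigvals, eigvecs = np.linalg.eigh(Lnorm)  # eigh: L_norm is symmetric
    U = eigvecs[:, np.argsort(eigvals)[:k]]   # k smallest eigenvectors
    # Row-normalize as in the Ng-Jordan-Weiss variant, then cluster.
    U = U / np.maximum(np.linalg.norm(U, axis=1, keepdims=True), 1e-12)
    return KMeans(n_clusters=k, n_init=10).fit_predict(U)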
def train_kNN_after_kMeans(n_clusters, train_x_array, eps, predict_x_array):
    k_means = KMeans(init="random", n_clusters=n_clusters, n_init=1,
                     n_jobs=-1, tol=eps).fit(train_x_array)
    # Show the cluster centers as 28x28 images so a human can label them.
    iter_i = [k_means.cluster_centers_[j].reshape((28, 28))
              for j in range(n_clusters)]
    picture = np.column_stack(iter_i)
    plt.imshow(picture, cmap="gray")
    plt.show()  # display the centers before prompting
    input_data = input("enter %s digits via space" % n_clusters)  # raw_input in the original (Python 2)
    new_y = [int(i) for i in input_data.split(" ")]
    k_nn = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
    k_nn.fit(k_means.cluster_centers_, new_y)
    # The original opened the file without "w" and so could not write to it.
    with open("result-k-%s.csv" % n_clusters, "w") as result_file:
        result_file.write("ImageId,Label\n")
        prediction = k_nn.predict(predict_x_array)
        for i in range(len(prediction)):
            result_file.write(str(i + 1) + "," + str(prediction[i]) + "\n")
def getClusters(input_data):
    km = KMeans(n_clusters=10, random_state=0).fit(input_data)
    centers = km.cluster_centers_
    # np.insert returns a new array; the original discarded the result.
    centers = np.insert(centers, 0, 1, axis=1)  # add bias column
    print("Centers : ", centers.shape)
    return centers
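# Usage sketch for getClusters (the random data is illustrative only): each
# returned center gains a leading 1, so it can be combined directly with a
# weight vector whose first entry is the bias term.
import numpy as np
X = np.random.RandomState(0).rand(200, 5)
centers = getClusters(X)  # shape (10, 6): bias column + 5 features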
def getClustering(method_name="k-mean", param_map=None):
    # A mutable default argument ({}) is shared across calls; use None instead.
    param_map = param_map or {}
    if method_name == "k-mean":
        # sklearn.cluster.k_means_ is a deprecated private path; use the public one.
        from sklearn.cluster import KMeans
        return KMeans(**param_map)
    elif method_name == "dbscan":
        from sklearn.cluster import DBSCAN
        return DBSCAN(**param_map)
    return None
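# Usage sketch for the factory above (the parameter values are illustrative):
km = getClustering("k-mean", {"n_clusters": 3, "n_init": 10})
db = getClustering("dbscan", {"eps": 0.5, "min_samples": 5})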
def performKmeans(data, n_clusters):
    print("Performing K-Means on data")
    est = KMeans(n_clusters=n_clusters)
    est.fit(data)
    orb_cb_handler.store_estimator(est)
    return est
def K_means_BERT(datasets, pred_vector, labels, opt):
    # datasets: a list; each element is a [3, max_len] array sample
    # pred_vector: a model's function that predicts embeddings
    # labels: used only to infer the number of classes
    num_classes = len(np.unique(labels))
    feature_embeddings = model_pred_BERT(datasets, pred_vector, labels, opt)
    kmeans = KMeans(n_clusters=num_classes, n_init=10).fit(feature_embeddings)
    label_list = kmeans.labels_.tolist()
    return (label_list, create_msg(label_list), kmeans.cluster_centers_,
            feature_embeddings)
def fsrsnn(sampledData, remainedData, sampledIndex, remainedIndex,
           sampledDataFs, k):
    min_clusters, max_clusters = k_range(k)  # cluster-count range derived from the number of true labels
    predicted_labelAll = []
    for i in range(len(sampledData)):
        clusters = random.randint(min_clusters, max_clusters)
        predicted_label = KMeans(n_clusters=clusters).fit_predict(
            sampledDataFs[i])
        predicted_labelAll.append(predicted_label.tolist())  # predicted labels for the sampled datasets
    assinALLNnLabels = []  # all labels assigned via nearest neighbours
    # remainedData and sampledData hold the same number of datasets, so j can
    # range over the length of either.
    for j in range(len(remainedData)):
        assinNnLabels = []  # labels assigned via nearest neighbours
        for m in range(len(remainedData[j])):
            minDist = inf
            minindex = -1
            for n in range(len(sampledData[j])):  # renamed from k, which shadowed the parameter
                distJI = distEclud(remainedData[j][m], sampledData[j][n])  # distance between point and sampled point
                if distJI < minDist:
                    minDist = distJI
                    minindex = n
            assinNnLabels.append(predicted_labelAll[j][minindex])  # label each non-sampled point via its nearest sampled neighbour
        assinALLNnLabels.append(assinNnLabels)
    # Combine the two predicted label sets and their index sequences.
    combineIndex = []
    combinedLables = []
    for column in range(len(predicted_labelAll)):
        combineIndexOne = sampledIndex[column] + remainedIndex[column]
        combinedLablesOne = predicted_labelAll[column] + assinALLNnLabels[column]
        combineIndex.append(combineIndexOne)
        combinedLables.append(combinedLablesOne)
    # Sort the shuffled indices into ascending order.
    seqIndexAll = []
    for combineIndex1 in combineIndex:
        seqIndex = []
        for seq in range(len(sampledData[0]) + len(remainedData[0])):
            for elementIndex in range(len(combineIndex1)):
                if combineIndex1[elementIndex] == seq:
                    seqIndex.append(elementIndex)
        seqIndexAll.append(seqIndex)
    # Recover the labels for the combined sampledData and remainedData.
    finalLabel = []
    for finalIndex in range(len(combinedLables)):
        finallabelone = []
        for index in seqIndexAll[finalIndex]:
            finallabelone.append(combinedLables[finalIndex][index])
        finalLabel.append(finallabelone)  # final clustering result
    return finalLabel
def performKmeans(data, n_clusters):
    print("Performing K-Means on data")
    est = KMeans(n_clusters=n_clusters)
    est.fit(data)
    labels = est.labels_
    return labels, est
def initial_kmeans(k, rand_state, data):
    min_clusters, max_clusters = k_range(k)  # cluster-count range derived from the number of true labels
    kmeans_labels_arr = []  # one partition per cluster count
    for clusters in range(min_clusters, max_clusters):
        kmeans_labels = KMeans(n_clusters=clusters,
                               random_state=rand_state).fit_predict(data)
        ind_kmeans = creator.Individual(kmeans_labels)
        kmeans_labels_arr.append(ind_kmeans)
    return kmeans_labels_arr
def evaluate_kmeans_unsupervised(data, nclusters, k_init=20):
    """
    Clusters data with the k-means algorithm and returns the cluster centroids.

    :param data: points to cluster, as a numpy array
    :param nclusters: total number of clusters
    :param k_init: number of k-means restarts (passed as n_init)
    :return: cluster centers
    """
    kmeans = KMeans(n_clusters=nclusters, n_init=k_init)
    kmeans.fit(data)
    return kmeans.cluster_centers_
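# Usage sketch for evaluate_kmeans_unsupervised (the blob data is illustrative only):
from sklearn.datasets import make_blobs
X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
centroids = evaluate_kmeans_unsupervised(X, nclusters=4)
print(centroids.shape)  # (4, 2)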
def perform_cluster(data, params):
    km = KMeans()
    km.set_params(**params)
    vectorizer = TfidfVectorizer()
    print(data[1][0])
    tfidf = vectorizer.fit_transform(data[1])
    labels = km.fit_predict(tfidf)
    # Group document ids (data[0]) by their assigned cluster label.
    result = {i: [] for i in set(labels)}
    for i, l in enumerate(labels):
        result[l].append(data[0][i])
    return result
def initialMultiRun(data, times, singleName):
    predicted_labelAll = []
    for i in range(times):
        clusters = random.randint(2, 11)
        if singleName == "kmeans":
            predicted_label = KMeans(n_clusters=clusters).fit_predict(data)
        elif singleName in ('ward', 'average', 'complete'):
            predicted_label = AgglomerativeClustering(
                linkage=singleName, n_clusters=clusters).fit_predict(data)
        predicted_labelAll.append(predicted_label.tolist())
    return predicted_labelAll
def __init__(self, tweet_file_path, no_of_clusters):
    """Read the csv file and build the data matrix."""
    self.np_extractor = ConllExtractor()
    self.pos_tagger = NLTKTagger()
    self.tweet_file_path = tweet_file_path
    self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
    self.vectorizer = DictVectorizer(sparse=True)
    self.k_means_estimator = KMeans(init="random", n_clusters=no_of_clusters)
def start_algorithm(self):
    """
    Start clustering the stored tweets.
    :return: list of clusters containing tweets
    """
    vectors = self.vectorize_data()
    kmeans = KMeans(init='k-means++', n_clusters=self.cluster_amount, n_init=10)
    kmeans.fit(vectors)
    return self.cluster_tweet(kmeans.labels_)
def ClusterBalance(self, indexesToPick, stopCount, kmeansFlag=True):
    print("ClusterBalancing...")
    indexesPicked = []
    obs1 = self.observations[indexesToPick]
    obs = normalize(obs1, axis=0)
    if len(indexesToPick) != 0:
        # Cap the cluster count at the number of available observations.
        n_clusters = (len(obs) if len(indexesToPick) < self.numClusters
                      else self.numClusters)
        if kmeansFlag:
            cluster = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
        else:
            # The original called the spectral_clustering *function*, which has
            # no fit method; SpectralClustering is the estimator class.
            cluster = SpectralClustering(n_clusters=n_clusters, n_init=10)
        cluster.fit(obs)
        labels = cluster.labels_
        whenToStop = max(2, stopCount)
        count = 0
        while count != whenToStop:
            cluster_list = list(range(self.numClusters))  # list() needed in Python 3
            index = 0
            for j in labels:
                if j in cluster_list:
                    indexesPicked.append(indexesToPick[index])
                    cluster_list.remove(j)
                    count += 1
                    if count == whenToStop:
                        break
                    labels[index] = -1
                if len(cluster_list) == 0:
                    break
                index += 1
    return indexesPicked
def _centroids(n_clusters: int, points: List[List[float]]) -> List[List[float]]:
    """Return the n_clusters input points closest to the fitted cluster centroids."""
    k_means = KMeans(n_clusters=n_clusters)
    k_means.fit(points)
    closest, _ = pairwise_distances_argmin_min(k_means.cluster_centers_, points)
    return list(map(list, np.array(points)[closest.tolist()]))
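# Usage sketch for _centroids: unlike cluster_centers_, the result is
# guaranteed to be actual input points (the data below is illustrative only).
pts = [[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 4.9]]
print(_centroids(2, pts))  # one representative point per cluster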
def evaluateKMeans(data, labels, nclusters, method_name):
    '''
    Clusters data with the k-means algorithm and returns a string containing
    the method name and metrics, together with the evaluated cluster centers.

    :param data: points to cluster, as a numpy array
    :param labels: true labels for the given points
    :param nclusters: total number of clusters
    :param method_name: name of the method from which the clustering space originates (only used for printing)
    :return: formatted string containing metrics and method name, cluster centers
    '''
    kmeans = KMeans(n_clusters=nclusters, n_init=20)
    kmeans.fit(data)
    return (getClusterMetricString(method_name, labels, kmeans.labels_),
            kmeans.cluster_centers_)
def K_means(datasets, pred_vector, num_classes, opt):
    '''
    Args:
        datasets: a list; each element is a [3, max_len] array sample
        pred_vector: a model's function that predicts embeddings
        num_classes: number of classes

    Returns:
        K-means results as a tuple (label_list, message, cluster_centers, features)
    '''
    feature_embeddings = model_pred(datasets, pred_vector, opt)
    kmeans = KMeans(n_clusters=num_classes, n_init=10).fit(feature_embeddings)
    label_list = kmeans.labels_.tolist()
    return (label_list, create_msg(label_list), kmeans.cluster_centers_,
            feature_embeddings)
def perLabel(label_name, labels, sample_size, n_clusters):
    print(79 * '_')
    print(label_name)
    print('% 9s' % 'feature'
          '    time  inertia   homo  compl  v-meas   ARI   AMI  silhouette')
    # print("number of distinct classes for true labels for", label_name, len(Counter(labels)))
    estimator = KMeans(n_clusters=n_clusters)
    bench_k_means(labels, sample_size, estimator, "RGB", rgb_data)
    bench_k_means(labels, sample_size, estimator, "LAB", lab_data)
    bench_k_means(labels, sample_size, estimator, "HOG", hog_data)
    bench_k_means(labels, sample_size, estimator, "GIST", gist_data)
    bench_k_means(labels, sample_size, estimator, "SURF", surf_data)
    bench_k_means(labels, sample_size, estimator, "SIFT", sift_data)
    bench_k_means(labels, sample_size, estimator, "ORB", orb_data)
def extract_word_clusters(commentList, commentCount):
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    a, corpus, global_synsets = extract_global_bag_of_words(commentList, True)
    similarity_dict = {}
    i = 0
    t = len(global_synsets) ** 2
    for syn_out in global_synsets:
        similarity_dict[syn_out] = {}
        for syn_in in global_synsets:
            if syn_in.pos() == syn_out.pos():
                similarity_dict[syn_out][syn_in] = syn_out.lin_similarity(syn_in, brown_ic)
            else:
                similarity_dict[syn_out][syn_in] = max(
                    wn.path_similarity(syn_out, syn_in),
                    wn.path_similarity(syn_in, syn_out))
            if i % 10000 == 0:
                print(i, 'synsets processed out of', t, '(', float(i) / t, '%)')
            i += 1
    tuples = [(item[0], list(item[1].values())) for item in similarity_dict.items()]
    vectors = [np.array(tup[1]) for tup in tuples]
    # Rule of thumb for the cluster count: n = sqrt(len(global_synsets) / 2).
    # The original printed n without ever assigning it.
    n = int(np.sqrt(len(global_synsets) / 2))
    print("Number of clusters", n)
    km_model = KMeans(n_clusters=n)
    km_model.fit(vectors)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(tuples[idx][0])
    pprint.pprint(dict(clustering), width=1)
    feature_vector = np.zeros([len(corpus), n])
    for i, comment in enumerate(corpus):
        for w in comment:
            for key, clust in clustering.items():
                if w in clust:
                    feature_vector[i][key] += 1
        if i % 1000 == 0:
            print(i, 'comments processed')
    print(feature_vector)