def fit(self, X, y):
    if self.inferStds:
        # compute stds from data
        self.centers, self.stds = km.k_means(X, self.k)
    else:
        # use a fixed std
        self.centers, _ = km.k_means(X, self.k)
        dMax = max([np.abs(c1 - c2) for c1 in self.centers for c2 in self.centers])
        self.stds = np.repeat(dMax / np.sqrt(2 * self.k), self.k)

    # training
    for epoch in range(self.epochs):
        for i in range(X.shape[0]):
            # forward pass
            a = np.array([self.rbf(X[i], c, s) for c, s in zip(self.centers, self.stds)])
            F = a.T.dot(self.w) + self.b

            loss = (y[i] - F).flatten() ** 2
            # print('Loss: {0:.2f}'.format(loss[0]))

            # backward pass
            error = -(y[i] - F).flatten()

            # online update
            self.w = self.w - self.lr * a * error
            self.b = self.b - self.lr * error
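# A minimal sketch of the Gaussian basis function that fit() above assumes is
# available as self.rbf; the name and exact form here are assumptions, not
# taken from the original source.
import numpy as np

def gaussian_rbf(x, c, s):
    # phi(x) = exp(-||x - c||^2 / (2 * s^2))
    return np.exp(-np.sum((x - c) ** 2) / (2 * s ** 2))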
def compare_dist(iters=100):
    pts = gen_data()
    cs = []
    dist = []
    for i in range(iters):
        c, _ = kmeans.k_means(pts, 3)
        cpp, _ = kmeans.k_means(pts, 3, kmeans.k_means_pp_initialization)
        d = kmeans.compute_avg2_distortion(pts, c)
        dpp = kmeans.compute_avg2_distortion(pts, cpp)
        dist.append([d, dpp])
        cs.append(np.array([c, cpp]))
    return np.array(dist), np.array(cs)
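# A minimal sketch of what kmeans.compute_avg2_distortion above is assumed to
# compute: the mean squared distance from each point to its nearest centroid.
# The helper's exact definition is an assumption about that module.
import numpy as np

def compute_avg2_distortion_sketch(pts, centroids):
    dists = np.linalg.norm(pts[:, None, :] - centroids[None, :, :], axis=2)  # shape (n, k)
    return float(np.mean(np.min(dists, axis=1) ** 2))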
def validar_agrupamento(df):
    # Run k-means for K=2, K=3 and K=4
    for i in range(2, 5):
        # Drop the last column, since it is the class label; it will be used later to evaluate the clustering
        data_sem_classe = df.drop(df.columns[-1], axis=1, inplace=False)
        # Run k-means
        data = k_means(data_sem_classe, i)
        # Add the "Classe" column back to the DataFrame
        data['Classe'] = df[df.columns[-1]]
        # Group by 'Classe' and 'Grupo'
        grouped_df = data.groupby(["Classe", "Grupo"])
        # Convert the grouped object back to a DataFrame and count the members of each group
        data_contagem = pd.DataFrame(grouped_df.size().reset_index(name="Contagem"))
        # Collect all class labels, removing duplicates caused by classes appearing in more than one group
        classeList = sorted(set(data_contagem['Classe'].values.tolist()))
        # Build a DataFrame structure for computing the purity
        data_estrutura = montar_estrutura(data_contagem, classeList, i)
        # Compute the purity
        data_pureza = calcula_pureza(data_estrutura, i)
def main():
    clusters = args.clusters
    if clusters not in [4, 6, 8, 10]:
        raise ValueError("Number of clusters must be 4, 6, 8, or 10!")

    data_path = './data/Mall_Customers.csv'
    attr = ['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']
    f1, f2 = 'Age', 'Spending Score (1-100)'

    df = pd.read_csv(data_path)
    data = df[[f1, f2]].to_numpy()
    if f1 == 'Gender':
        gender = data[:, 0]
        gender[gender == 'Male'] = 5.
        gender[gender == 'Female'] = 10.
        print(gender)
        data[:, 0] = gender
    print(f'Shape of the data: {data.shape}')

    km = k_means(num_clusters=clusters, tol=1e-4)
    est_centroid, history_centroids, predict_labels, loss, num_iter = km.get_cluster(data)
    plotting(predict_labels, data, clusters, est_centroid, f1, f2)
def outer_criteria(X, k_range, reference, iterations=200):
    n = len(X)
    RS = dict()  # Rand Statistic
    FM = dict()  # Fowlkes-Mallows
    for k in k_range:
        TP, FN, FP, TN = (0,) * 4
        if k < 2:
            continue
        _, clusters = kmeans.k_means(X, k, iterations)
        # Compute TP, FN, FP, TN over all pairs of points.
        for i in range(n):
            for j in range(i + 1, n):
                if clusters[i] == clusters[j] and reference[i] == reference[j]:
                    TP += 1
                elif clusters[i] != clusters[j] and reference[i] == reference[j]:
                    FN += 1
                elif clusters[i] == clusters[j] and reference[i] != reference[j]:
                    FP += 1
                else:
                    TN += 1
        # Rand statistic: fraction of agreeing pairs (normalized by the number of pairs, not by n)
        RS[k] = (TP + TN) / (TP + TN + FP + FN)
        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        FM[k] = np.sqrt(precision * recall)
    best_rs = max(RS, key=RS.get)
    best_fm = max(FM, key=FM.get)
    return RS, best_rs, FM, best_fm
def k_means_test():
    X = np.concatenate((np.random.normal(5, 30, 100),
                        np.random.normal(150, 44, 100),
                        np.random.normal(300, 20, 100)))
    Y = np.concatenate((np.random.normal(5, 30, 100),
                        np.random.normal(150, 44, 100),
                        np.random.normal(300, 20, 100)))
    data = np.column_stack((X, Y))

    clusters, result, centroids = k_means(data, 3, euclidean_dist)

    cluster_points = []
    for cluster in clusters:
        Xaux = [X[i] for i in cluster]
        Yaux = [Y[i] for i in cluster]
        cluster_points.append((Xaux, Yaux))

    plt.figure()
    colors = ['green', 'red', 'blue', 'yellow', 'gray', 'lightblue', 'pink', 'black']
    i = 0
    for cluster in cluster_points:
        Xc, Yc = cluster
        plt.scatter(Xc, Yc, c=colors[i])
        i = (i + 1) % len(colors)
    plt.show()
def spectral_cluster(W, k):
    L = get_laplacian(W)
    Lambda, v = np.linalg.eig(L)
    length = len(Lambda)
    eigen_pairs = [(Lambda[i], v[:, i]) for i in range(length)]
    # Sort the eigenpairs by eigenvalue, smallest first.
    eigen_pairs = sorted(eigen_pairs, key=lambda pair: pair[0])
    # Stack the k eigenvectors with the smallest eigenvalues as columns.
    temp = np.column_stack([eigen_pairs[i][1] for i in range(k)])
    result = kmeans.k_means(temp.T, 2, 5)
    return result
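# A minimal sketch of the get_laplacian helper assumed by spectral_cluster
# above; here it returns the unnormalized graph Laplacian L = D - W, which is
# an assumption about what the original helper computes.
import numpy as np

def get_laplacian_sketch(W):
    W = np.asarray(W, dtype=float)
    D = np.diag(W.sum(axis=1))  # degree matrix
    return D - W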
def main():
    print "\n>> Since matplotlib has inconsistent behaviour, I am saving all the generated plots in the directory './images'\n"
    if not os.path.exists('./images'):
        os.mkdir('./images')

    print "> Running K-Means for blob_data\n\n"
    blob_data = np.genfromtxt('./hw5_blob.csv', delimiter=',')
    for k in [2, 3, 5]:
        clusters = k_means(blob_data, k, "blob", True)

    print "> Running K-Means for circle_data\n\n"
    circle_data = np.genfromtxt('./hw5_circle.csv', delimiter=',')
    for k in [2, 3, 5]:
        clusters = k_means(circle_data, k, "circle", True)

    print "> Running Kernel K-Means for circle_data\n\n"
    kernel_kmeans.main()

    print "> Running EM Algorithm for blob_data\n\n"
    em_algorithm.main()
def compress_image(im_path, k, init_f=uniform_mode_dist_init):
    """
    Returns the original and compressed versions of an image together with time-profile data.

    Arguments:
        im_path: string
        k: int
        init_f: function of 2d array x 1d array x int x function -> 2d array
    Output:
        image: numpy 2d numerical array
        compressed_image: numpy 2d numerical array
        mse: float
        aid: float
        time_profile: dict of string -> float
    """
    # Read image
    image = cv2.imread(im_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Store original shape
    original_shape = image.shape

    # Reshape into a numpy 2d array of pixels
    image = image.reshape([image.shape[0] * image.shape[1], 3])

    # Run k-means
    t0 = time.time()
    c_means, clusters, mse, time_profile = k_means(image, k, rgb_distance, init_f,
                                                   pixel_to_str, str_to_pixel)
    t1 = time.time()
    time_profile['k_means'] = t1 - t0

    # Create the compressed image by replacing each pixel with its cluster mean
    compressed_image = np.zeros(image.shape, dtype=np.uint8)
    for i in range(compressed_image.shape[0]):
        compressed_image[i] = c_means[clusters[i]].astype(np.uint8)

    # Return to original shape
    image = image.reshape(original_shape).astype(np.uint8)
    compressed_image = compressed_image.reshape(original_shape).astype(np.uint8)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    compressed_image = cv2.cvtColor(compressed_image, cv2.COLOR_RGB2BGR)

    aid = ptp_idm(image, compressed_image)

    return image, compressed_image, mse, aid, time_profile
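# A minimal sketch of the rgb_distance metric passed to k_means above; the
# assumption is that it is a plain Euclidean distance between two RGB pixels.
import numpy as np

def rgb_distance_sketch(p, q):
    p = np.asarray(p, dtype=np.float64)
    q = np.asarray(q, dtype=np.float64)
    return np.sqrt(np.sum((p - q) ** 2))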
def main(argv):
    config_path = args.conf
    num_anchors = args.anchors

    with open(config_path) as config_buffer:
        config = json.loads(config_buffer.read())

    data = {
        'train': {
            'image_folder': config['train']['train_image_folder'],
            'annot_folder': config['train']['train_annot_folder'],
        },
        'valid': {
            'image_folder': config['valid']['valid_image_folder'],
            'annot_folder': config['valid']['valid_annot_folder'],
        }
    }

    video_folder_list, video_annot_list = data_preparation(data['train'], FOR_YOLO=True)

    grid_w = config['model']['input_size'] / 32
    grid_h = config['model']['input_size'] / 32
    cell_w = 1280.0 / grid_w
    cell_h = 720.0 / grid_h

    # run k-means to find the anchors
    annotation_dims = []
    for video_annot in video_annot_list:
        labels = np.loadtxt(video_annot, delimiter=',')
        for label in labels:
            relative_w = label[2] / cell_w
            relative_h = label[3] / cell_h
            if math.isnan(relative_w) or math.isnan(relative_h):
                # print("NaN annotations! {}".format(basename(video_annot)))
                continue
            annotation_dims.append(map(float, (relative_w, relative_h)))

    annotation_dims = np.array(annotation_dims)
    centroids, cluster_assignment = k_means(annotation_dims, num_anchors)

    # write anchors to file
    print '\naverage IOU for', num_anchors, 'anchors:', '%0.2f' % avg_IOU(annotation_dims, centroids)
    print_anchors(centroids)
def main():
    # data, label = datasets.load_iris().data, datasets.load_iris().target
    # data = load_data('F:\machine learning\PythonCode\dataset\cluster\spectral_cluster_data.txt')
    data, label = load_data_with_label('F:\machine learning\PythonCode\dataset\cluster\Spiral.txt')
    data = normal_data(data)

    # Show the original data distribution
    plt.ion()
    # kmeans.mat_plot_sample(data)
    mat_plot_cluster_sample(data, label, 'original sample')

    # k-means clustering, as a baseline for comparison
    k_dist1, k_centers1, cluster_group = kmeans.k_means(data, 3)
    kmeans.mat_plot_k_means_sample(k_dist1, k_centers1)

    # Spectral clustering; the third argument selects the algorithm:
    # 'NJW', 'self-tuning' or 'sl' (default 'sl')
    spectral_cluster(data, 3)
    spectral_cluster(data, 3, 'NJW')
    spectral_cluster(data, 3, 'self-tuning')

    plt.ioff()
    plt.show()
def training_model(number_of_templates, using_kmeans, read_feature_from_file=0):
    """
    Train the model using DTW to recognize the records.

    :param number_of_templates: int, number of templates used to train the model
    :param using_kmeans: 1 if using k-means on the templates to generate states
    :param read_feature_from_file: 1 to read the MFCC features from file, 0 to compute them from the records
    :return: DTW_obj: a DTW object, the trained model
    :return: inputs: list, every element of it is a template used for testing
    """
    templates = []
    inputs = []
    for digit in xrange(0, 10):
        temp = get_templates_and_inputs(digit, number_of_templates, read_feature_from_file)
        # print 'temp', temp[0]
        # print 'k_means(temp[0], number_of_states=5)', kmeans.k_means(temp[0], number_of_states=5)
        templates.extend(
            kmeans.k_means(temp[0], number_of_states=5)[0]
            if using_kmeans and number_of_templates == 5
            else temp[0])
        inputs.extend(temp[1])
    DTW_obj = DTW.DTW(templates)
    return DTW_obj, inputs
def spectral_clustering(data, k, gamma=0.1):
    print("Spectral clustering...")
    # Number of samples
    n = data.shape[0]

    # Adjacency matrix
    W = rbf_kernel_gram_matrix(data, gamma)

    # Build the degree matrix
    D = compute_degrees(W)

    # Graph Laplacian
    L = D - W

    # Normalized cut: L_sym = D^(-1/2) L D^(-1/2)
    D_inv_sqrt = scipy.linalg.fractional_matrix_power(D, -1 / 2)
    L_sym = D_inv_sqrt.dot(L).dot(D_inv_sqrt)

    # Compute the eigenvectors of L_sym
    eig_values, eig_vectors = LA.eig(L_sym)

    # T contains the first k eigenvectors of the normalized Laplacian
    T = np.zeros((n, k))
    for i in range(k):
        T[:, i] = eig_vectors[:, i]

    # Re-substitute matrix H and normalize it by its rows
    H = D_inv_sqrt.dot(T)
    H /= LA.norm(H, axis=1, ord=2)[:, np.newaxis]

    # Cluster the data points in eigenspace
    centroids, cluster_assignment = k_means(H, k)

    # Show the data points at the same point in eigenspace.
    # discrete_h = np.zeros_like(H)
    # c, discrete_h[:, 0] = k_means(H[:, 0].reshape(-1, 1), k)
    # c, discrete_h[:, 1] = k_means(H[:, 1].reshape(-1, 1), k)
    # centroids, cluster_assignment = k_means(discrete_h, k)

    return cluster_assignment
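# Minimal sketches of the two helpers assumed by spectral_clustering above: an
# RBF kernel Gram matrix W_ij = exp(-gamma * ||x_i - x_j||^2) and a diagonal
# degree matrix D_ii = sum_j W_ij. Names and exact behaviour are assumptions.
import numpy as np

def rbf_kernel_gram_matrix_sketch(data, gamma):
    sq_dists = np.sum((data[:, None, :] - data[None, :, :]) ** 2, axis=2)
    return np.exp(-gamma * sq_dists)

def compute_degrees_sketch(W):
    return np.diag(W.sum(axis=1))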
def inner_criteria(X, k_range, iterations=200):
    DB = dict()  # Davies-Bouldin
    CH = dict()  # Calinski-Harabasz
    for k in k_range:
        if k < 2:
            continue
        centroids, clusters = kmeans.k_means(X, k, iterations)
        DB[k] = DaviesBouldin(X, centroids, clusters)
        CH[k] = CHIndex(X, centroids, clusters)

    ch_list = list(CH.items())
    db_best = min(DB, key=DB.get)

    # Pick the k that minimizes the discrete second difference
    # CH(k+1) - 2*CH(k) + CH(k-1) of the Calinski-Harabasz curve.
    ch_best = 0
    delta = sys.maxsize
    for k in range(1, len(CH) - 1):
        temp = ch_list[k + 1][1] - 2 * ch_list[k][1] + ch_list[k - 1][1]
        if temp < delta:
            delta = temp
            ch_best = ch_list[k][0]
    return DB, db_best, CH, ch_best
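# A minimal sketch of a Davies-Bouldin index matching the call above; the
# original DaviesBouldin helper is not shown, so treating `clusters` as a
# per-point label array is an assumption.
import numpy as np

def davies_bouldin_sketch(X, centroids, clusters):
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)
    clusters = np.asarray(clusters)
    k = len(centroids)
    # s[i]: mean distance of the points of cluster i to its centroid
    s = np.array([np.mean(np.linalg.norm(X[clusters == i] - centroids[i], axis=1))
                  for i in range(k)])
    db = 0.0
    for i in range(k):
        db += max((s[i] + s[j]) / np.linalg.norm(centroids[i] - centroids[j])
                  for j in range(k) if j != i)
    return db / k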
f = open('docs.json', 'r')
for doc in f.readlines():
    # Remove the '\n' at the end of doc
    doc = doc.strip('\n')
    doc_dict = json.loads(doc)
    Doc_List.append(doc_dict)
    # Obtain Title_List
    Title_List.append(doc_dict['title'])
f.close()

# Calculate the tf-idf of each word in each doc.
# doc_collection's structure: [doc_vector1, doc_vector2, ...]
# doc_vector's structure: {word1: tf-idf val, word2: tf-idf val, ...}
doc_collection = TF_IDF()

# This block will generate value_collection. Its structure is: [[1, 2, 0, 1, 2, ...], [0, 2, 1, 3, 0, ...], ...]
value_collection = []
for doc_vector in doc_collection:
    doc_value = []
    for word in doc_vector:
        doc_value.append(doc_vector[word])
    value_collection.append(doc_value)
# print value_collection

# K-means is coming!
k = raw_input("Please input a value for k: ")
k = int(k)
KMeans = kmeans.k_means(k)
KMeans.K_MEANS(value_collection, Title_List)
def serial_kmeans(data, k, dist):
    return kmeans.k_means(data, k, dist)
dataSet = []
# Open the data file
fileIn = open('testSet.txt')
# Read each line and split it on the tab in the middle
for line in fileIn.readlines():
    # Split the line
    lineArr = line.strip().split('\t')
    # Append to the (ordered) data list as floats; each line of the data file is
    # split by the tab into two parts, accessed by index
    dataSet.append([float(lineArr[0]), float(lineArr[1])])
# --------------------------

# **************************
# step 2: clustering...
print "step 2: clustering..."
# Convert the list into a matrix with mat()
dataSet = mat(dataSet)
# k is the number of cluster centers
k = 4
# Call k_means from the kmeans module with the matrix and k, unpacking the results
centroids, clusterAssment = kmeans.k_means(dataSet, k)
# **************************

# ++++++++++++++++++++++++++
# step 3: show the result...
print "step 3: show the result..."
# Call the plotting function from the kmeans module
kmeans.showCluster(dataSet, k, centroids, clusterAssment)
# ++++++++++++++++++++++++++
reader = csv.reader(f)  # iterator object
vote_topic = next(reader)  # according to the csv layout
headers = next(reader)     # according to the csv layout
for person, state, district, vote, name, party in reader:
    senator = Senator(name, party, state)
    accumulated_record[senator].append(vote_value[vote])
pprint(accumulated_record, width=500)

record = {senator: tuple(votes)
          for senator, votes in accumulated_record.items()}  # type: Dict[Senator, VoteHistory]
pprint(record, width=500)

# Use k-means to locate the cluster centroids from the patterns of votes, then assign each senator to the nearest cluster
centroids = k_means(data=record.values(), num_centroids=3)
clustered_votes = assign_data(centroids=centroids, data=record.values())
pprint(clustered_votes)

# Build a reverse mapping from a vote history to a list of senators who voted that way
votes_to_senators = defaultdict(list)  # type: DefaultDict[VoteHistory, List[Senator]]
for senator, vote_history in record.items():
    votes_to_senators[vote_history].append(senator)

# Some senators may share the same voting history, so assert that all 100 are
# still present after regrouping
assert sum([len(cluster) for cluster in votes_to_senators.values()]) == NUM_SENATORS

# Display the clusters and the members of each cluster
def cluster_k(db_path, file_name, file_name_p, n_clusters):
    score = kmeans.k_means(dump_path=db_path,
                           file_name=file_name,
                           file_name_p=file_name_p,
                           n_clusters=int(n_clusters))
    print '{0}:{1}'.format(n_clusters, score)
vote_value = {'Nay': -1, 'Not Voting': 0, 'Yea': 1}  # type: Dict[str, VoteValue]
accumulated_record = defaultdict(list)  # type: DefaultDict[Senator, List[VoteValue]]
for filename in glob.glob('congress_data/*.csv'):
    with open(filename) as f:
        reader = csv.reader(f)
        vote_topic = next(reader)
        headers = next(reader)
        for person, state, district, vote, name, party in reader:
            senator = Senator(name, party, state)
            accumulated_record[senator].append(vote_value[vote])

# Transform record into plain dict mapping a senator to a tuple of vote values
record = {senator: tuple(votes)
          for senator, votes in accumulated_record.items()}  # type: Dict[Senator, Tuple[VoteValue, ...]]

# Use k-means to locate the cluster centroids and assign senators to the nearest cluster
centroids = k_means(record.values(), k=3, iterations=50)
clustered_votes = assign_data(centroids, record.values())

# Build a reverse mapping from a pattern of votes to senators who voted that way
votes_to_senators = defaultdict(list)  # type: DefaultDict[Tuple[VoteValue, ...], List[Senator]]
for senator, votes in record.items():
    votes_to_senators[votes].append(senator)

assert sum(map(len, clustered_votes.values())) == 100

# Display the clusters and the members of each cluster
for i, votes_in_cluster in enumerate(clustered_votes.values(), start=1):
    print(f'=========== Voting Cluster #{i} ===========')
    party_totals = Counter()  # type: Counter
    for votes in set(votes_in_cluster):
        for senator in votes_to_senators[votes]:
            party_totals[senator.party] += 1
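# A minimal sketch of two pieces the voting-cluster snippets assume: the
# Senator record and an assign_data helper that groups each vote tuple under
# its nearest centroid. The signatures are inferred from the calls above, not
# copied from the original kmeans module.
from collections import defaultdict
from typing import NamedTuple

class Senator(NamedTuple):
    name: str
    party: str
    state: str

def assign_data_sketch(centroids, data):
    groups = defaultdict(list)
    for point in data:
        closest = min(centroids,
                      key=lambda c: sum((p - q) ** 2 for p, q in zip(point, c)))
        groups[closest].append(point)
    return dict(groups)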
for i in range(0, len(test_dataset_np)):
    if test_dataset_np[i][0] == 1:
        test_count[0] += 1
    if test_dataset_np[i][0] == 4:
        test_count[1] += 1
    if test_dataset_np[i][0] == 8:
        test_count[2] += 1
# print(train_count)

# Sort both datasets by label; the label counts are 1: 1005, 4: 652, 8: 542
train_dataset_np_order = train_dataset_np[train_dataset_np[:, 0].argsort()]
test_dataset_np_order = test_dataset_np[test_dataset_np[:, 0].argsort()]
# print(train_dataset_np_order)

result = k_means(train_dataset_np_order, 3, 1000)
print(result[1])

# A method to assign class labels to each of your clusters:
acc = Cal_Accuracy_testdata(result[1], train_count)
print("The acc of training is :", acc)

# Predict labels on the zip.test data:
labels = testdata_classify(test_dataset_np_order, result[2])
print("Print labels", labels)

# Acc:
acc_test = Cal_Accuracy_testdata(labels, test_count)
print("The acc of test is :", acc_test)

# PCA:
def get_transform_relationship_FSM_table(filename, states_in_each_word=5,
                                          using_continuous_feature=0, **kwargs):
    """
    Obtain, from a finite state machine, the templates, the state-transition list and the start states;
    the emitting states corresponding to each non-emitting state are contiguous in the template.

    :param filename: file name of the finite state machine
    :param states_in_each_word: how many states each word consists of
    :param kwargs:
    :return: template, the template of the states
             transform_list, whose i-th element is a list of the states from which state i can be entered
             begin_list, the states that can be entered at t=0
             word_in_template, whose i-th element is the word represented by the i-th element of the template
    """
    start_states, terminal_states, nonemitting_transform_list = get_info_FSM_table(filename)
    number_of_nonemitting_states = len(nonemitting_transform_list)
    template = []
    digit_template = []  # the i-th element is the template of digit i
    number_of_frames_in_each_state = []  # the i-th element is the number of frames in state i
    number_of_frames_in_each_state_for_digit = []  # the i-th element holds the number of frames in each state of digit i
    covariance_matrix_in_each_state = []
    mean_in_each_state = []
    covariance_matrix_for_each_digit = []
    mean_for_each_digit = []
    word_in_template = []  # the i-th element is the word represented by the i-th element of the template
    if using_continuous_feature:
        digit_template = SR.get_digit_feature_from_continuous_speech()
    for digit in xrange(0, 10):
        temp = SR.get_templates_and_inputs(digit, number_of_templates=10)
        if not using_continuous_feature:
            result = kmeans.k_means(temp[0], number_of_states=5)
            digit_template.extend(result[0])
            covariance_matrix_for_each_digit.append(result[2])
            mean_for_each_digit.append(result[3])
            temp_number_of_frames_in_each_state = [0 for i in xrange(states_in_each_word)]
            for number_of_frames_in_each_state_in_each_template in result[5]:
                for i in xrange(states_in_each_word):
                    temp_number_of_frames_in_each_state[i] += number_of_frames_in_each_state_in_each_template[i]
            number_of_frames_in_each_state_for_digit.append(temp_number_of_frames_in_each_state)
            # print number_of_frames_in_each_state

    # number_of_emitting_states_begin_from_nonemitting_states: the i-th element is the number of
    # states contained in the edges that start from the i-th non-emitting state
    number_of_emitting_states_begin_from_nonemitting_states = []
    for nonemitting_state in nonemitting_transform_list:
        # nonemitting_state looks like {1: [2, 3, 4, 5, 6, 7, 8, 9], 3: []}
        temp = []
        for edge_list in nonemitting_state.values():
            temp.extend(edge_list)
        for i in temp:
            # build the template
            covariance_matrix_in_each_state.extend(covariance_matrix_for_each_digit[i])
            mean_in_each_state.extend(mean_for_each_digit[i])
            number_of_frames_in_each_state.extend(number_of_frames_in_each_state_for_digit[i])
            template.extend(digit_template[i])
            word_in_template.append(i)
        number_of_emitting_states = len(set(temp))
        number_of_emitting_states_begin_from_nonemitting_states.append(number_of_emitting_states)

    # begin_states_index_for_each_nonemitting_state: the i-th element is how many states come before
    # the states contained in the edges that start from the i-th non-emitting state
    begin_states_index_for_each_nonemitting_state = []
    for i in xrange(number_of_nonemitting_states):
        begin_states_index_for_each_nonemitting_state.append(
            sum(number_of_emitting_states_begin_from_nonemitting_states[:i]))

    # emitting_out_list: the i-th element is a list of the states that can be entered from the i-th non-emitting state
    emitting_out_list = [[] for i in xrange(number_of_nonemitting_states)]
    for i, nonemitting_state in enumerate(nonemitting_transform_list):
        # nonemitting_state looks like {1: [2, 3, 4, 5, 6, 7, 8, 9], 3: []}
        begin_index = begin_states_index_for_each_nonemitting_state[i]
        number_of_edges = sum(map(len, nonemitting_state.values()))
        for j in xrange(number_of_edges):
            emitting_out_list[i].append(states_in_each_word * (begin_index + j))
    changed = True
    # Account for unconditional (epsilon) transitions
    while changed:
        new_emitting_out_list = emitting_out_list[:]
        for i, nonemitting_state in enumerate(nonemitting_transform_list):
            for key in nonemitting_state.keys():
                if nonemitting_state[key] == []:
                    new_emitting_out_list[i].extend(new_emitting_out_list[key])
        changed = (new_emitting_out_list != emitting_out_list)
        emitting_out_list = new_emitting_out_list

    # emitting_in_list: the i-th element is a list of the states from which the i-th non-emitting state can be entered
    emitting_in_list = [[] for i in xrange(number_of_nonemitting_states)]
    for i, nonemitting_state in enumerate(nonemitting_transform_list):
        # nonemitting_state looks like {1: [2, 3, 4, 5, 6, 7, 8, 9], 3: []}
        begin_index = begin_states_index_for_each_nonemitting_state[i]
        j = 0
        for key in nonemitting_state.keys():
            for value in nonemitting_state[key]:
                emitting_in_list[key].append(states_in_each_word * (begin_index + j))
                j += 1
    changed = True
    # Account for unconditional (epsilon) transitions
    while changed:
        new_emitting_in_list = emitting_in_list[:]
        for i, nonemitting_state in enumerate(nonemitting_transform_list):
            for key in nonemitting_state.keys():
                if nonemitting_state[key] == []:
                    new_emitting_in_list[key].extend(new_emitting_in_list[i])
        changed = (new_emitting_in_list != emitting_in_list)
        emitting_in_list = new_emitting_in_list

    # transform_list: each element is a list of the states from which that state can be entered
    transform_list = [[] for i in xrange(sum(number_of_emitting_states_begin_from_nonemitting_states) * states_in_each_word)]
    for i in xrange(number_of_nonemitting_states):
        for next_state in emitting_out_list[i]:
            for cur_state in emitting_in_list[i]:
                transform_list[next_state].append(cur_state + 4)
    for i, element in enumerate(transform_list):
        element.append(i)
        if i % states_in_each_word > 0:
            element.append(i - 1)
        if i % states_in_each_word > 1:
            element.append(i - 2)
    begin_states = emitting_out_list[0]
    return template, transform_list, begin_states, word_in_template, number_of_frames_in_each_state, covariance_matrix_in_each_state, mean_in_each_state
vote_value: Dict[str, VoteValue] = {'Nay': -1, 'Not Voting': 0, 'Yea': 1}
accumulated_record: DefaultDict[Senator, List[VoteValue]] = defaultdict(list)
for filename in glob.glob('congress_data/*.csv'):
    with open(filename, encoding='utf-8') as f:
        reader = csv.reader(f)
        vote_topic = next(reader)
        headers = next(reader)
        for person, state, district, vote, name, party in reader:
            senator = Senator(name, party, state)
            accumulated_record[senator].append(vote_value[vote])

# Transform record into plain dict mapping a senator to a tuple of vote values
record: Dict[Senator, Tuple[VoteValue, ...]] = {senator: tuple(votes)
                                                for senator, votes in accumulated_record.items()}

# Use k-means to locate the cluster centroids and assign senators to the nearest cluster
centroids = k_means(record.values(), k=3, iterations=50)
clustered_votes = assign_data(centroids, record.values())

# Build a reverse mapping from a pattern of votes to senators who voted that way
votes_to_senators: DefaultDict[Tuple[VoteValue, ...], List[Senator]] = defaultdict(list)
for senator, votes in record.items():
    votes_to_senators[votes].append(senator)

assert sum(len(cluster) for cluster in clustered_votes.values()) == 100

# Display the clusters and the members of each cluster
for i, votes_in_cluster in enumerate(clustered_votes.values(), start=1):
    print(f'=========== Voting Cluster #{i} ===========')
    party_totals: Counter = Counter()
    for votes in set(votes_in_cluster):
        for senator in votes_to_senators[votes]:
            party_totals[senator.party] += 1
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skimage import io

# import imp
# kmean = imp.load_source('./kmeans.py')
# import kmean
import kmeans as keman

pic = io.imread('./data/bird_small.png') / 255.
# io.imshow(pic)
data = pic.reshape(128 * 128, 3)

# k-means
C, centroids, cost = keman.k_means(pd.DataFrame(data), 16, epoch=10, n_init=3)
# centroids[C] indexes the rows of centroids with C, giving one centroid per pixel
compressed_pic = centroids[C].reshape((128, 128, 3))

# sklearn KMeans
# from sklearn.cluster import KMeans
# model = KMeans(n_clusters=16, n_init=100, n_jobs=-1)
# model.fit(data)
# centroids = model.cluster_centers_
# C = model.predict(data)
# compressed_pic = centroids[C].reshape((128, 128, 3))

fig, ax = plt.subplots(1, 2)
ax[0].imshow(pic)
ax[1].imshow(compressed_pic)
plt.show()
NC = list(range(3, 4))  # numOfModels+1))  # list of numbers of clusters
accuracies = np.zeros((numOfEPS, len(NC)))
clusteringResult = {}
for numOfClusters in NC:
    clusteringResult[numOfClusters] = []

with open("clustering_result.txt", "w") as fp:
    for numOfClusters in NC:
        # clustering into c groups
        print("Clustering: {} clusters".format(numOfClusters))
        # kmeans = KMeans(n_clusters=numOfClusters, random_state=0).fit(predVec)
        # for c in range(numOfClusters):
        #     clusteringResult[numOfClusters].append(np.where(kmeans.labels_ == c)[0])
        #     print(np.where(kmeans.labels_ == c)[0])
        assignments = k_means(predVec, numOfClusters, "L2", "ZerosFarAway")
        fp.write("## number of clusters: " + str(numOfClusters) + "\n")
        for c in range(numOfClusters):
            cluster = np.where(assignments == c)[0]
            clusteringResult[numOfClusters].append(cluster)
            print(cluster)
            fp.write("\t" + str(cluster) + "\n")
        fp.write("\n")


def vote1(participants):
    '''
    Input:
        participants: a list of opinions. Each element in the list is a numpy array, N x 2.
                      N is the number of events. The second dimension contains (opinion/label, confidence).
    Output:
        voteResult: a numpy array, N x 2, representing opinion and confidence across N events
reader = csv.reader(f)
vote_topic = next(reader)
header = next(reader)
for person, state, district, vote, name, party in reader:
    senator = Senator(name, party, state)
    accumulated_record[senator].append(vote_value[vote])

# Transform the record into a plain dict that maps a senator to a tuple of votes.
record = {senator: tuple(votes)
          for senator, votes in accumulated_record.items()}  # type: Dict[Senator, VoteHistory]

# Use k-means to locate the cluster centroids from the pattern of votes, and assign
# each senator to the nearest cluster.
centroids = k_means(record.values(), k=3)
clustered_votes = assign_data(centroids, record.values())

# Build a reverse mapping from a vote history to a list of senators who voted
# that way.
votes_to_senators: DefaultDict[VoteHistory, List[Senator]] = defaultdict(list)
for senator, votehistory in record.items():
    votes_to_senators[votehistory].append(senator)

assert sum(len(cluster) for cluster in votes_to_senators.values()) == NUM_SENATORS

# Display the clusters and the members (senators) of each cluster.
for i, votes_in_cluster in enumerate(clustered_votes.values(), start=1):
    print(f"==================== Voting Cluster #{i} ====================")
    party_totals: Counter[str] = Counter()
    for votes in set(votes_in_cluster):
if __name__ == "__main__": data = sio.loadmat('ExtYaleB10.mat') train_sample = data['train'] test_sample = data['test'] x_train_full = np.column_stack(train_sample[0, i][:, :, j].reshape(192 * 168, 1) for i in range(10) for j in range(50)) x_test_full = np.column_stack(test_sample[0, i][:, :, j].reshape(192 * 168, 1) for i in range(10) for j in range(14)) I = np.identity(10) y_train = np.column_stack(I[:, i] for i in range(10) for j in range(50)) y_test = np.column_stack(I[:, i] for i in range(10) for j in range(14)) print("spectral_clustering computing") v = spectral.spectral_clustering(x_train_full, 10, 1, 10) for i in range(v.shape[1]): v[:, i] = v[:, i] / (la.norm(v[:, i])) final_c, final_z = km.k_means(np.asarray(v.T), 10, 10) y_train = np.zeros(500, ) cost = 0 for i in range(10): for j in range(50): y_train[i * 50 + j] = i for i in range(500): for j in range(10): if final_z[i, j] == 1: if y_train[i] != j: cost += 1 print("Errors of spectral clustering", cost)
with open(filename) as f:
    reader = csv.reader(f)
    vote_topic = next(reader)
    headers = next(reader)
    for person, state, district, vote, name, party in reader:
        senator = Senator(name, party, state)
        accumulated_record[senator].append(vote_value[vote])

# Transform record into a plain dict that maps a senator to a tuple of vote values
record = {senator: tuple(votes)
          for senator, votes in accumulated_record.items()}

# Show off our talent with k-means: advanced machine learning like Skynet, the HAL 9000, or Deep Blue
centroids = kmeans.k_means(list(record.values()), k=2, iterations=50)
clustered_votes = kmeans.assign_data(centroids, list(record.values()))  # type: Dict[Tuple[VoteValue], List[Tuple[VoteValue]]]

# Build a reverse mapping: given a voting record, get the list of senators who voted that way
votes_to_senators = collections.defaultdict(list)
for senator, votes in record.items():
    votes_to_senators[votes].append(senator)

# Display the clusters and the people who voted that way
for i, votes_in_clusters in enumerate(clustered_votes.values(), start=1):
    print(f'======================= Voting Cluster #{i} =========================')
    for votes in set(votes_in_clusters):
        for senator in votes_to_senators[votes]:
if __name__ == '__main__':
    data_dir = 'data'
    results_dir = 'results'
    os.makedirs(results_dir, exist_ok=True)
    clusters_range = range(2, 10)

    # Inner criteria block.
    image_path = os.path.join(data_dir, 'policemen.jpg')
    image = np.array(Image.open(image_path), dtype=np.uint8)
    new_image = image.reshape(image.shape[0] * image.shape[1], image.shape[2])
    db, best_db, ch, best_ch = inner_criteria(new_image, clusters_range)
    best_inner = (best_db + best_ch) // 2

    # Save the clustered image.
    centroids, clusters = kmeans.k_means(new_image, best_inner, iterations=200)
    new_image = np.vstack([centroids[i] for i in clusters]).astype(np.uint8).reshape(image.shape)
    Image.fromarray(new_image).save(
        os.path.join(results_dir, '%d-clusters-policemen.jpg' % best_inner))

    # Outer criteria block.
    outer_criterias_input = os.path.join(data_dir, 'outer_criterias_input.txt')
    data = np.loadtxt(outer_criterias_input, delimiter=' ')
    reference, points = data[:, 0], data[:, 1:]
    rs, best_rs, fm, best_fm = outer_criteria(points, clusters_range, reference)

    # Draw the results.
    fig, ax = plt.subplots(nrows=2, ncols=2)
    ax1, ax2, ax3, ax4 = ax.flatten()
def main():
    r = 5    # number of random initializations
    d = 2    # reduced dimension
    k = 2    # number of clusters
    N = 200  # sample size
    part = input("Input part A/B \n")
    number = int(input("Input number 1~6 \n"))

    if part == 'A':
        data = sio.loadmat('HW3_Data/dataset1.mat')
        Y = data['Y']
        if number == 1:
            plt.close()
            x1 = Y[0, :]
            y1 = Y[1, :]
            plt.scatter(x1, y1)
            plt.show()
            plt.close()
        if number == 2:
            plt.close()
            x1 = Y[0, :]
            y1 = Y[2, :]
            plt.scatter(x1, y1)
            plt.show()
        if number == 3:
            plt.close()
            u, y_reduced = pca.pca(Y, 2)
            x = y_reduced[0, :]
            y = y_reduced[1, :]
            plt.scatter(np.asarray(x), np.asarray(y))
            plt.show()
        if number == 4:
            plt.close()
            result = kmeans.k_means(np.matrix(Y), k, r)
            x1, y1, x2, y2 = [], [], [], []
            U, y_2d = pca.pca(Y, d)
            for i in range(N):
                if result[i] == 0:
                    x1.append(y_2d[0, i])
                    y1.append(y_2d[1, i])
                else:
                    x2.append(y_2d[0, i])
                    y2.append(y_2d[1, i])
            plt.scatter(x1, y1, color='red')
            plt.scatter(x2, y2, color='blue')
            plt.show()
        if number == 5:
            plt.close()
            U, y_2d = pca.pca(Y, d)
            result = kmeans.k_means(y_2d, k, r)
            x1, y1, x2, y2 = [], [], [], []
            for i in range(200):
                if result[i] == 0:
                    x1.append(y_2d[0, i])
                    y1.append(y_2d[1, i])
                else:
                    x2.append(y_2d[0, i])
                    y2.append(y_2d[1, i])
            plt.scatter(x1, y1, color='red')
            plt.scatter(x2, y2, color='blue')
            plt.show()

    if part == 'B':
        data = sio.loadmat('HW3_Data/dataset2.mat')
        Y = data['Y']
        if number == 1:
            x1 = Y[0, :]
            y1 = Y[1, :]
            plt.scatter(x1, y1)
            plt.show()
        if number == 2:
            x1 = Y[0, :]
            y1 = Y[2, :]
            plt.scatter(x1, y1)
            plt.show()
        if number == 3:
            u, y_reduced = pca.pca(Y, 2)
            x = y_reduced[0, :]
            y = y_reduced[1, :]
            plt.scatter(np.asarray(x), np.asarray(y))
            plt.show()
        if number == 4:
            U, y_2d = pca.pca(Y, d)
            result = kmeans.k_means(np.matrix(y_2d), k, r)
            x1, y1, x2, y2 = [], [], [], []
            for i in range(N):
                if result[i] == 0:
                    x1.append(y_2d[0, i])
                    y1.append(y_2d[1, i])
                else:
                    x2.append(y_2d[0, i])
                    y2.append(y_2d[1, i])
            plt.scatter(x1, y1, color='red')
            plt.scatter(x2, y2, color='blue')
            plt.show()
        if number == 5:
            kernel = pca.get_kernel(Y)
            u = pca.kernel_pca(kernel, d)
            y_reduced = np.matrix(kernel * u)
            result = kmeans.k_means(y_reduced.T, k, r)
            x1, y1, x2, y2 = [], [], [], []
            # U, y_2d = pca.pca(Y, d)
            y_2d = y_reduced.T
            for i in range(N):
                if result[i] == 0:
                    x1.append(y_2d[0, i])
                    y1.append(y_2d[1, i])
                else:
                    x2.append(y_2d[0, i])
                    y2.append(y_2d[1, i])
            plt.scatter(x1, y1, color='red')
            plt.scatter(x2, y2, color='blue')
            plt.show()
        if number == 6:
            W = np.matrix(spectral.get_w_matrix(Y, 5, 1))
            result = spectral.spectral_cluster(W, 2)
            x1, y1, x2, y2 = [], [], [], []
            U, x = pca.pca(Y, 2)
            y_2d = x
            for i in range(200):
                if result[i] == 0:
                    x1.append(y_2d[0, i])
                    y1.append(y_2d[1, i])
                else:
                    x2.append(y_2d[0, i])
                    y2.append(y_2d[1, i])
            plt.scatter(x1, y1, color='red')
            plt.scatter(x2, y2, color='blue')
            plt.show()
else:
    idf[j] = math.log(idf[j])

for i in range(len(urls)):
    for j in range(len(taglist)):
        tf[i][j] *= idf[j]

t = np.array(tf)
t = t.transpose()
mat, trans = pca_min.pca(t, 3)
plot([trans[:9], trans[9:17], trans[17:]])
# print mat
# print trans

## m, r, a = kmeans.k_means(np.array(tf), 2)
## print r, a
## c0 = np.array([[trans[i][0], trans[i][1], trans[i][2]] for i in range(len(trans)) if a[i] == 0])
## c1 = np.array([[trans[i][0], trans[i][1], trans[i][2]] for i in range(len(trans)) if a[i] == 1])
## plot([c0, c1])

# Run k-means several times and keep the result with the lowest distortion r
m, r, a = kmeans.k_means(np.array(trans), 3)
for i in range(10):
    m1, r1, a1 = kmeans.k_means(np.array(trans), 3)
    print r1, a1
    if r1 < r:
        m = m1
        r = r1
        a = a1
print r, a

c0 = np.array([[trans[i][0], trans[i][1], trans[i][2]] for i in range(len(trans)) if a[i] == 0])
c1 = np.array([[trans[i][0], trans[i][1], trans[i][2]] for i in range(len(trans)) if a[i] == 1])
c2 = np.array([[trans[i][0], trans[i][1], trans[i][2]] for i in range(len(trans)) if a[i] == 2])
plot([c0, c1, c2])
import itertools
import os
import time

import numpy as np
from scipy import cluster
from sklearn.datasets import load_iris
import kmeans
from matplotlib import pyplot as plt
from os import mkdir

iris = load_iris()
data = iris.data
names = iris.feature_names
np.random.shuffle(data)

k = 3
t = time.time()
centers, center_steps = kmeans.k_means(data, k=k, distance='e')
print(time.time() - t)

for step in range(len(center_steps)):
    print("{0}/{1}".format(step + 1, len(center_steps)))
    for combination in itertools.combinations(range(data.shape[1]), 2):
        step_centers = np.array(center_steps[step])
        assignment, cdist = cluster.vq.vq(data[:, combination], step_centers[:, combination])
        if not os.path.exists(names[combination[0]] + "-" + names[combination[1]]):
            mkdir(names[combination[0]] + "-" + names[combination[1]])
        for i in range(k):
            data = np.append(data, [step_centers[i]], axis=0)
k = 2
edgelist = pd.read_csv(path + file[0], delimiter=' ', skiprows=1, header=None)
G = nx.from_pandas_edgelist(edgelist, source=0, target=1, create_using=nx.Graph())
adj_matr = nx.to_pandas_adjacency(G, dtype=np.float64)
laplacian = sparse.csgraph.laplacian(adj_matr.values, normed=True)
eigenvalues, eigenvectors = np.linalg.eig(laplacian)
eigenvectors = eigenvectors.astype(np.float64)

centroids, clusters = k_means(eigenvectors, k, random_seed=1, num_iters=10, plot=False)
print(centroids)

# Count the edges cut by the partition
cut_edges = 0
for i in range(edgelist.shape[0]):
    if clusters[edgelist[0][i]] != clusters[edgelist[1][i]]:
        cut_edges += 1

# Find the size of the smallest cluster
counter = collections.Counter(clusters)
smallest = 100000
for key, value in counter.items():
    if value < smallest:
        smallest = value
plt.subplot(121)
plt.axis('off')
plt.imshow(img)

plt.subplot(122)
c = np.loadtxt('data/0.txt')
plt.axis('off')
plt.imshow(c)
plt.show()

# Build a (row, col, R, G, B) feature vector for every pixel
pos = np.zeros((img.shape[0], img.shape[1], 2))
for i in range(pos.shape[0]):
    for j in range(pos.shape[1]):
        pos[i, j, :] = [i, j]
pos = pos.reshape((-1, 2))
data = np.hstack((pos, np.reshape(img, (-1, 3))))

# k_means
res = k_means(data, 3, iter_times=20, dist_func=dist)
tag = res[:, -1]
tag = np.reshape(tag, c.shape)
plt.figure(2)
plt.imshow(tag)
plt.axis('off')
plt.show()

# gmm
res = gmm(data, 3, iter_times=20)
tag = res[:, -1]
tag = np.reshape(tag, c.shape)
plt.figure(3)
plt.imshow(tag)
plt.axis('off')
plt.show()

# dbscan
train_data = data[[x_axis, y_axis]].values.reshape((n_data, 2))

# Normalize the data columns so everything lies between -1 and 1
train_data_mean = np.mean(np.abs(train_data), axis=0)
for j in range(train_data.shape[1]):
    train_data[:, j] -= train_data_mean[j]
train_data_scale = np.max(train_data, axis=0)
for j in range(train_data.shape[1]):
    train_data[:, j] /= train_data_scale[j]

# Enter k_means inputs.
clusters = 1     # Number of clusters into which we want to split our training dataset.
iterations = 50  # Maximum number of training iterations.

# Init k_means instance (note: this rebinds the name k_means from the class to the instance).
k_means = k_means(train_data, clusters)

# Train k_means instance.
# (centroids, nearest_centroid) = k_means.train(iterations)
(centroids, nearest_centroid) = k_means.kmeansOpt()

# Denormalize the column vectors
for j in range(train_data.shape[1]):
    train_data[:, j] *= train_data_scale[j]
    centroids[:, j] *= train_data_scale[j]
for j in range(train_data.shape[1]):
    train_data[:, j] += train_data_mean[j]
    centroids[:, j] += train_data_mean[j]

# Plot actual clusters for reference
plt.subplot(2, 2, 3)
print("took: ", time2 - time1) print("constructing the cost-matrix") dist_matrix = get_dist_matrix(x, y, z) time3 = time.time() print("took: ", time3 - time2) #print (dist_matrix) number_of_clusters = 3 show_3d_data(A, cmap="viridis", savename="data_k_" + str(number_of_clusters) + "_b_" + str(A.shape[0])) random_start = torch.randint(A.shape[0], [number_of_clusters], device=device) cluster_assign, data = k_means(A.reshape((A.shape[0], x * y * z)), dist_matrix, number_of_clusters, x, y, z, regularizer) print("assignment to clusters: ", cluster_assign) labels = [ "cluster_start_" + str(random_start[i].item()) for i in range(0, data.shape[0]) ] # time_loc_start = time.time() # wasbary_1 = compute_wasbary(A, verbose=True, method='pot',numItermax=maxiter, reg= reg) # time_loc_end = time.time() # print ("computation of barycenter with pot took ", time_loc_end-time_loc_start) #time_loc_start = time.time() #wasbary_1 = compute_wasbary(A, verbose=True, method='ipot',numItermax=10,reg=reg) #time_loc_end = time.time() #print ("computation of barycenter with ipot took ", time_loc_end-time_loc_start)