def kmedoidsWithScores(filenameData, filenameSilhMean, filenameDBS, filenameCHS, kClusters):
    """Cluster a sample file with k-medoids and write quality scores to text files.

    Reads the sample from ``root``, runs k-medoids with random initial medoids
    under the module-level ``metricResearch`` metric, then records the mean
    silhouette, Davies-Bouldin and Calinski-Harabasz scores via ``witTXT``.
    """
    sample = read_sample(str(root) + '\\' + filenameData)
    seeds = randomCenters(len(sample), kClusters)
    model = kmedoids(sample, seeds, metric=metricResearch)
    model.process()
    found_clusters = model.get_clusters()
    labels = model.predict(sample)
    note_text = 'k: ' + str(kClusters)
    # Silhouette is computed per point; only its mean is persisted.
    per_point_silh = silhouette(sample, found_clusters).process().get_score()
    meanSilhouetteScore = np.mean(per_point_silh)
    witTXT(meanSilhouetteScore, filenameSilhMean, filepath=root, note=note_text)
    witTXT(dbs(sample, labels), filenameDBS, filepath=root, note=note_text)
    witTXT(chs(sample, labels), filenameCHS, filepath=root, note=note_text)
def kmediansWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax):
    """Run k-medians on a sample file and compute clustering-quality metrics.

    Reads the sample from ``root``, clusters it with k-medians seeded by
    k-means++ initialization, then computes mean silhouette, Davies-Bouldin,
    Calinski-Harabasz and elbow statistics.  The write-out calls were
    disabled in the original, so the scores are computed but neither
    persisted nor returned.
    """
    sample = read_sample(str(root) + '\\' + nameData)
    seeds = kppi(sample, k_clusters).initialize()
    model = kmedians(sample, seeds)
    model.process()
    found_clusters = model.get_clusters()
    labels = model.predict(sample)
    # Per-point silhouette, averaged into a single figure of merit.
    per_point_silh = silhouette(sample, found_clusters).process().get_score()
    mean_silh = np.mean(per_point_silh)
    dbs_value = dbs(sample, labels)
    chs_value = chs(sample, labels)
    # Elbow analysis over [kmin, kmax]: most probable cluster count + WCE curve.
    elbow_model = elbow(sample, kmin, kmax)
    elbow_model.process()
    probable_k = elbow_model.get_amount()
    wce_curve = elbow_model.get_wce()
def get_silhouette(args_dict):
    """Average silhouette width over all comparable chunks.

    :param args_dict: dict with keys "cl" (clustering result),
        "distance_metric" (pairwise-distance rows) and "log" (logger).
    :return: (average silhouette width, per-chunk cluster-id list, cl)
    """
    cl = args_dict["cl"]
    distance_metric = args_dict["distance_metric"]
    log = args_dict["log"]
    # Clusters come back as lists of row indexes into distance_metric,
    # e.g. [[0, 4], [1, 2, 3]] -> two clusters.
    clusters = cl.get_clusters()
    # Invert that mapping: one cluster id per chunk row.
    cluster_indicator = [0] * len(distance_metric)
    for cluster_id, members in enumerate(clusters):
        for row in members:
            cluster_indicator[row] = cluster_id
    widths = silhouette(distance_metric, clusters).process().get_score()
    silhouette_width = float(sum(widths)) / len(widths)
    log.info("K={num_clusters}".format(num_clusters=str(len(clusters))))
    log.info("{result}".format(result=str(clusters)))
    log.info("Silhouette width={sil}".format(sil=str(silhouette_width)))
    return silhouette_width, cluster_indicator, cl
def get_silhouette(samples1, samples2):
    """Silhouette score of two samples, each forced into one k-medoids cluster."""
    first_cluster, _, _ = run_kmedoids(samples1, 1)
    second_cluster, _, _ = run_kmedoids(samples2, 1)
    # Shift the second cluster's indexes past the first sample's rows so
    # they index correctly into the concatenated sample array.
    offset = len(samples1)
    second_cluster = np.array([[offset + idx for idx in second_cluster[0]]])
    merged_samples = np.concatenate((samples1, samples2), axis=0)
    merged_clusters = np.concatenate((first_cluster, second_cluster), axis=0)
    per_point = silhouette(merged_samples, merged_clusters).process().get_score()
    return sum(per_point) / len(merged_samples)
def template_correct_scores(self, sample_path, answer_path):
    """Check silhouette output: one score per point, each within [-1, 1]."""
    points = read_sample(sample_path)
    expected_clusters = answer_reader(answer_path).get_clusters()
    per_point = silhouette(points, expected_clusters).process().get_score()
    assertion.eq(len(points), len(per_point))
    for value in per_point:
        assertion.le(-1.0, value)
        assertion.ge(1.0, value)
def correct_scores(sample_path, answer_path, ccore_flag):
    """Check silhouette output (with/without ccore): one score per point in [-1, 1]."""
    points = read_sample(sample_path)
    expected_clusters = answer_reader(answer_path).get_clusters()
    per_point = silhouette(points, expected_clusters, ccore=ccore_flag).process().get_score()
    assertion.eq(len(points), len(per_point))
    for value in per_point:
        assertion.le(-1.0, value)
        assertion.ge(1.0, value)
def correct_scores(sample_path, answer_path, ccore_flag, **kwargs):
    """Verify silhouette scores for a sample/answer pair and return them.

    Supports ``data_type`` of 'points' (default) or 'distance_matrix';
    asserts one score per input row, each within [-1, 1].
    """
    data_type = kwargs.get('data_type', 'points')
    points = read_sample(sample_path)
    if data_type == 'distance_matrix':
        # Silhouette consumes a precomputed squared-Euclidean distance matrix.
        points = calculate_distance_matrix(points, distance_metric(type_metric.EUCLIDEAN_SQUARE))
    expected_clusters = answer_reader(answer_path).get_clusters()
    per_point = silhouette(points, expected_clusters, ccore=ccore_flag, data_type=data_type).process().get_score()
    assertion.eq(len(points), len(per_point))
    for value in per_point:
        assertion.le(-1.0, value)
        assertion.ge(1.0, value)
    return per_point
# Seed medoids with the paper's initialization method, then run k-medoids.
initial_medoids = ut.initial_medoids_paper_method(sample, k, distances)
kmedoids_instance = kmedoids(sample, initial_medoids)
kmedoids_instance.process()
clusters = kmedoids_instance.get_clusters()

# Dump every resulting cluster.
for clusteri in clusters:
    print(clusteri)
print("\n")

# Average silhouette, skipping NaN entries.
dirtyscore = silhouette(sample, clusters).process().get_score()
score = [value for value in dirtyscore if str(value) != 'nan']
print("score promedio de silhoette")
print(np.mean(np.asarray(score)))
print("\n")

# Minimum nonzero distance from each data point to any other point.
minimo = np.asarray([
    np.min(np.asarray([distances[i][j]
                       for j in range(number_of_data_points)
                       if distances[i][j] != 0]))
    for i in range(number_of_data_points)
])
def meanSilh(data, clusters):
    """Return the mean of the per-point silhouette scores for the clustering."""
    return np.mean(silhouette(data, clusters).process().get_score())
# Custom metric backed by the user-supplied `weighted_distance` function.
metric = distance_metric(type_metric.USER_DEFINED, func=weighted_distance);
random_state = check_random_state(None)
print('success')
# Candidate cluster counts to try (currently only k=30).
options = [30]
for i in range(len(options)):
    print("enter round " + str(options[i]))
    # Seed medoids via k-means++-style init over the distance matrix X.
    initial_medoids = kpp_init(np.array(X), options[i], random_state)
    print(initial_medoids)
    kmedoids_instance = kmedoids(X, initial_medoids, data_type='distance_matrix');
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    store_clusters(clusters, data, length, options[i])
    medoids = kmedoids_instance.get_medoids()
    medoids_vectors = store_medoids(medoids, data, options[i])
    # Average silhouette under the custom metric.
    score = silhouette(data, clusters, metric=metric).process().get_score()
    print(sum(score) / len(score))
    visual = cluster_visualizer_multidim()
    visual.append_clusters(clusters, data)
    visual.show(pair_filter=[[0, 10], [1, 10], [3, 10], [4, 10], [5, 10], [6, 10], [7, 10], [9, 10], [0, 1]])
    visual.show()


def append_id_on_vector():
    # Load id rows from CSV and drop empty cells per row.
    # NOTE(review): this definition is truncated at the end of the chunk —
    # the body of the final `if` is not visible here.
    with open('clean_ids.csv', newline='') as csvfile:
        id_d = list(csv.reader(csvfile))
    id_data = []
    length = []
    for line in id_d:
        playlist = list(filter(None, line))
        if len(playlist) > 0:
def main_fun():
    """Compare k-means and k-medoids (k=2) on the global ``sample`` and plot both."""
    # --- k-means, seeded with k-means++ ---
    initial_centers = kmeans_plusplus_initializer(sample, 2).initialize()
    kmeans_instance = kmeans(sample, initial_centers)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    final_centers = kmeans_instance.get_centers()

    # --- k-medoids, seeded with two random point ids ---
    tolerance = 10
    cluster_ids = random.sample(range(0, len(sample)), 2)
    kmedoids_instance = kmedoids(sample, cluster_ids, tolerance=tolerance, ccore=True)
    kmedoids_instance.process()
    clusters1 = kmedoids_instance.get_clusters()
    final_centers1 = kmedoids_instance.get_medoids()

    print('K-Mean Centroids: ' + str(final_centers))
    final_center = [sample[final_centers1[0]], sample[final_centers1[1]]]
    print('K-Medoid Medoid' + str(final_center))

    # Order each pair of clusters by its center's x coordinate so colors line up.
    if sample[final_centers1[0]][0] > sample[final_centers1[1]][0]:
        c3, c4 = clusters1[0], clusters1[1]
    else:
        c4, c3 = clusters1[0], clusters1[1]
    if final_centers[0][0] > final_centers[1][0]:
        c1, c2 = clusters[0], clusters[1]
    else:
        c2, c1 = clusters[0], clusters[1]

    visualizer = cluster_visualizer()
    visualizer.append_cluster(cluster=c1, data=sample, color='red', markersize=8)
    visualizer.append_cluster(cluster=c2, data=sample, color='black', markersize=8)
    visualizer.append_cluster(cluster=c3, data=sample, color='yellow')
    visualizer.append_cluster(cluster=c4, data=sample, color='lime')
    visualizer.append_cluster(cluster=final_centers, marker='*', markersize=10, color='purple')
    visualizer.append_cluster(cluster=final_center, marker='*', markersize=10, color='pink')
    visualizer.set_canvas_title(text='k-mean vs k-medoids')
    visualizer.show(invisible_axis=False)

    # Total silhouette of each solution (sum over all points).
    medoid_scores = silhouette.silhouette(sample, clusters1).process().get_score()
    mean_scores = silhouette.silhouette(sample, clusters).process().get_score()
    print('K-Medoid Score:' + str(sum(medoid_scores)))
    print('K-Means Score:' + str(sum(mean_scores)))
return medoidsToInit def kmedoidsWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax): data = read_sample(str(root)+'\\'+filenameData) kClusters = canoc(data, kmin, kmax) initial_medoids = rci(data, kClusters).initialize() kmedoids_instance = kmedoids(data, initial_medoids) kmedoids_instance.process() clusters = kmedoids_instance.get_clusters() predicted = kmedoids_instance.predict(data) silhouetteScore = silhouette(data, clusters).process().get_score() meanSilhouetteScore = np.mean(silhouetteScore) #wlitCSV(silhouetteScore, filenameSilhouette, '', root) #witCSV(meanSilhouetteScore, nameSilhouetteMean, '', root) dbsScore = dbs(data, predicted) #witCSV(dbsScore, nameDBS, '', root) chsScore = chs(data, predicted) #witCSV(chsScore, nameCHS, '', root) # elbow_instance = elbow(data, kmin, kmax) # elbow_instance.process() # amount_clusters = elbow_instance.get_amount() # most probable amount of clusters # wce = elbow_instance.get_wce()