def main():
    """Read the input points, cluster them with k-means and report the run time.

    The points are loaded from ``FILENAME`` with ``read_points_from_file``
    and handed to ``k_means`` together with ``N_KLUSTERS`` and ``N_THREADS``.
    Only the ``k_means`` call is timed; reading the input is excluded.
    The elapsed time is printed in seconds with three decimals.
    """
    points = read_points_from_file(FILENAME)

    # TODO - remove all files in output folder (can be in makefile)

    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # interval cannot be distorted by system clock adjustments the way a
    # difference of two time.time() readings can.
    start_time = time.perf_counter()
    k_means(N_KLUSTERS, points, N_THREADS)
    elapsed = time.perf_counter() - start_time

    print("Execution time: %.3f" % elapsed)
def model_parameters(data, structure, C_old, U_old, k):
    """
    objective: to find model parameters
    input: data string, path to file (handed to dio.create_X())
           structure list(int, list(floats), list(floats)), number of
               non-hypertime dimensions, list of hypertime radii and list
               of wavelengths
           C_old numpy array kxd, centres from last iteration
           U_old numpy array nxd, weights from last iteration; a value
               without a length makes len() raise TypeError, which is
               treated as "no previous clustering"
           k positive integer, number of clusters
    output: C numpy array kxd, matrix of k d-dimensional cluster centres
            U numpy array kxn, matrix of weights
            COV numpy array kxdxd, matrix of covariance matrices
            densities numpy array kx1, matrix of number of measurements
                belonging to every cluster
    uses: dio.create_X(), cl.k_means(), covariance_matrices()
    """
    feature_matrix = dio.create_X(data, structure)

    # Probe whether weights from a previous clustering are available.
    try:
        len(U_old)
    except TypeError:
        # No previous weights: start from a random initialisation.
        init_method = 'random'
    else:
        # EXPERIMENT: previous weights are available, but random
        # initialisation is used anyway. The disabled alternative here was
        # 'stable_init' (originally 'prev_dim').
        init_method = 'random'
        # END OF EXPERIMENT

    centres, weights, densities = cl.k_means(
        feature_matrix,
        k,
        structure,
        method=init_method,
        version='hard',  # weight calculation
        fuzzyfier=1,  # weighting exponent
        iterations=100,
        C_in=C_old,
        U_in=U_old,
    )

    covariances = covariance_matrices(feature_matrix, centres, weights,
                                      structure)
    return centres, weights, covariances, densities
# Cluster counts to evaluate; range(2, 3) yields the single value 2.
num_cluster_list = range(2, 3)

# One score slot per evaluated cluster count (three independent arrays).
SSE_list, sil_list, ch_list = (
    np.zeros(len(num_cluster_list)) for _ in range(3)
)

# `i` is the slot in the score arrays that belongs to the current cluster
# count. It is advanced by hand so that it ends up equal to the number of
# evaluated cluster counts after the loop.
i = 0
for num_cluster in num_cluster_list:
    # k-means
    (centroid, assignment, cost, sorted_zooms,
     sil_avg_kmeans, SSE_sum_kmeans, ch_score_kmeans) = clustering.k_means(
        zooms_stnd, projct2d, dist_mat, num_cluster, max_iter, randm_init,
        interval + 1, w,
        plot=True, sil=True, SSE=True, sort=True)

    print(
        'K-means clustering evaluation: Silhouettes Index = %.3g; SSE= %.3g, C-H sore =%.3g'
        % (sil_avg_kmeans, SSE_sum_kmeans, ch_score_kmeans))

    # Record the three quality scores for this cluster count.
    SSE_list[i], sil_list[i], ch_list[i] = (
        SSE_sum_kmeans, sil_avg_kmeans, ch_score_kmeans)
    i += 1

#%%
def _pairwise_distance_stats(matches, hero_weight):
    """Return ``(average, maximum)`` distance over all unordered match pairs.

    Distances are computed with ``clustering.get_distance``. When there are
    fewer than two matches there is no pair to measure, and ``(0, 0)`` is
    returned instead of dividing by zero.
    """
    from itertools import combinations

    total_dist = 0
    max_dist = 0
    num_of_disjoint_pairs = 0
    # combinations(..., 2) visits every pair (i, j) with i < j exactly once.
    for first_match, second_match in combinations(matches, 2):
        dist = clustering.get_distance(first_match, second_match,
                                       hero_weight=hero_weight)
        total_dist += dist
        num_of_disjoint_pairs += 1
        if dist > max_dist:
            max_dist = dist

    if num_of_disjoint_pairs == 0:
        return 0, 0
    return total_dist / num_of_disjoint_pairs, max_dist


def _print_clusters(clusters):
    """Print the number of matches and the center of every cluster."""
    for cluster in clusters:
        print('Number of matches in cluster: ' + str(len(cluster.matches)))
        cluster.print_center()


def main(dist_constant: int = None, file_path: str = '../data/matches.pkl', hero_weight: float = 0.05):
    """Cluster Dota matches and compare three clustering runs by run time.

    Steps:
      1. Read the matches from *file_path* and build one initial cluster
         that contains all of them.
      2. Choose the distance constant: *dist_constant* when given, otherwise
         the average pairwise distance between all matches.
      3. Run divisive clustering with that constant (truncated to ``int``).
      4. Run k-means seeded with the clusters found in step 3.
      5. Run k-means on all matches without those clusters (the log message
         calls this "random initialization"), using the same k.

    Each of the runs 3-5 is timed, and its clusters are printed.

    Args:
        dist_constant: Distance constant for divisive clustering. ``None``
            (the default) means "derive it from the data".
        file_path: Path of the file the matches are read from.
        hero_weight: Weight forwarded to the distance and clustering calls.

    Returns:
        None. All results are printed.
    """
    if dist_constant is not None:
        # The message used to end after the colon without the value.
        print('main(): dist_constant is: ' + str(dist_constant))
    print('main(): Data file path: ' + file_path)
    print('main(): hero weight: ' + str(hero_weight))

    collector = data_collector.DataCollector(data_file_path=file_path)
    list_of_dota_matches = collector.read_dota_matches_from_file()
    print('main(): DataCollector found: ' + str(len(list_of_dota_matches)) + ' matches from file')
    hero_names_dict = data_collector.get_hero_names()

    # Create the initial cluster that contains all matches that were read from file
    initial_cluster = clustering.Cluster(matches=list_of_dota_matches, hero_names=hero_names_dict)
    initial_cluster.print_center()

    max_dist = 0
    if dist_constant is None:
        # Average and maximum distance over all disjoint pairs of matches.
        # With fewer than two matches both values are 0.
        avg_dist, max_dist = _pairwise_distance_stats(initial_cluster.matches, hero_weight)
        print('Average distance between Dota matches: ' + str(avg_dist) + ' max dist is: ' + str(max_dist))
    else:
        # No pairwise distances are computed here, so max_dist stays 0.
        avg_dist = dist_constant
        print('Using dist constant of: ' + str(avg_dist) + ' max dist is: ' + str(max_dist))

    # Run divisive clustering with the initial cluster and use the avg_dist as the "user-defined" constant.
    # The result list is passed in empty and read afterwards, so the callee
    # is expected to fill it in place.
    # Timing uses perf_counter(), which is monotonic and therefore immune to
    # system clock adjustments.
    divisive_clustering_clusters = []
    divisive_start_time = time.perf_counter()
    clustering.run_divisive_clustering(initial_cluster, hero_names_dict, int(avg_dist),
                                       divisive_clustering_clusters, hero_weight=hero_weight)
    divisive_elapsed = time.perf_counter() - divisive_start_time

    suggested_value_of_k = len(divisive_clustering_clusters)
    print('Num of clusters found in divisive clustering: ' + str(suggested_value_of_k)
          + ' Time of computation: %s seconds. About to print centers...' % divisive_elapsed)
    _print_clusters(divisive_clustering_clusters)

    # Run K-means using the value of k found by divisive clustering, and using the clusters already found in that step
    reclustering_start_time = time.perf_counter()
    k_means_clusters = clustering.k_means(
        hero_names_dict=hero_names_dict,
        num_of_clusters=suggested_value_of_k,
        final_clusters=divisive_clustering_clusters,
        hero_weight=hero_weight)
    reclustering_elapsed = time.perf_counter() - reclustering_start_time

    print('K-Means with re-clustering found clusters for k=' + str(len(k_means_clusters))
          + ' In %s seconds. About to print centers...' % reclustering_elapsed)
    _print_clusters(k_means_clusters)

    # For comparison of computation time: K-means with the same k, given all
    # matches instead of the clusters from divisive clustering.
    random_start_time = time.perf_counter()
    k_means_random_clusters = clustering.k_means(
        hero_names_dict=hero_names_dict,
        num_of_clusters=suggested_value_of_k,
        matches=list_of_dota_matches,
        hero_weight=hero_weight)
    random_elapsed = time.perf_counter() - random_start_time

    print('K-Means with random initialization found clusters for k=' + str(len(k_means_random_clusters))
          + ' In %s seconds. About to print centers...' % random_elapsed)
    _print_clusters(k_means_random_clusters)
# Keyword arguments common to both generate_batches() calls below; the two
# calls differ only in `time_difference` and `td_method`.
batch_settings = dict(
    filename='../data/power_consumption.csv',
    window=sequence_len,
    stride=stride,
    mode='validation',
    non_train_percentage=.3,
    val_rel_percentage=.8,
    normalize='maxmin01',
)

# Extract clusters information from clean data (no time differencing).
(x_train_tmp, y_train_tmp,
 x_valid_tmp, y_valid_tmp,
 x_test_tmp, y_test_tmp) = utils.generate_batches(
    time_difference=False, td_method=None, **batch_settings)

# Cluster info relative to the signal's value (cluster's means).
clusters_info = clst.k_means(x_train_tmp, n_clusters)

# Extract train and test, this time with time differencing via np.log2.
(x_train, y_train,
 x_valid, y_valid,
 x_test, y_test) = utils.generate_batches(
    time_difference=True, td_method=np.log2, **batch_settings)

# Cluster info relative to the time difference (cluster's means).
clusters_info_td = clst.k_means(x_train, n_clusters_td)
filename = "exp_results/%s_gridsize=%s_neighsize=200" % (lime_version, g), qualities) del image_pool # --------------------------------------------------------------------------------------------- # Testing Lime# Clustering -------------------------------------------------------------------- elif lime_version == 'lime#C': ks_for_each_grid_size = [20, 17, 16, 26, 18] images = clustering.load_images(imgs_clustering) for k, g in zip(ks_for_each_grid_size, grid_sizes): und = clustering.undersample_images(images, g) flattened = clustering.flatten_images(und) inertia, lbls, centers = clustering.k_means(flattened, k) qualities = evaluation_measures.evaluate_explanations( 'lime#C', model, bb_outcomes, imgs, gts, neigh_size=100, segmentation_fun=partial(gridSegmentation, g), clustering_labels=lbls) lime_version = 'lime#C-NEW' filename = "exp_results/%s_gridsize=%s_neighsize=200" % (lime_version, g), qualities)