def find_best_distance_epsilon(dbscan_xs_points_k_distance_method, threshold_majority):
    """Select the distance (epsilon) at which the cumulative frequency hits the majority.

    The distinct distances are sorted in ascending order, the cumulative
    relative frequency over them is accumulated, and the distance located at
    the index returned by ``compute_arg_majority`` is selected.

    Args:
        dbscan_xs_points_k_distance_method: Collection of distance values. Its
            elements are counted with ``collection_counter``, so they must be
            hashable (presumably a 1-D sequence of floats - confirm with caller).
        threshold_majority: Threshold forwarded unchanged to
            ``compute_arg_majority``.

    Returns:
        The entry of the sorted distinct distances at the selected index.
    """
    num_points_dbscan_xs_data = len(dbscan_xs_points_k_distance_method)

    dbscan_xs_points_k_distance_method_unique = array_sort(
        array_unique(dbscan_xs_points_k_distance_method)
    )

    # A counter iterates in first-seen order, which only matches the sorted
    # unique distances when the input is already ascending. Looking each count
    # up by its (sorted) distance keeps probabilities and distances aligned
    # for any input order.
    occurrences_by_distance = collection_counter(dbscan_xs_points_k_distance_method)
    relative_frequencies = array_matrix(
        [occurrences_by_distance[distance]
         for distance in dbscan_xs_points_k_distance_method_unique]
    ) / num_points_dbscan_xs_data

    accumulative_probabilities = array_matrix(list(accumulate_sum(relative_frequencies)))

    majority_index = compute_arg_majority(accumulative_probabilities, threshold_majority)

    return dbscan_xs_points_k_distance_method_unique[majority_index]
def dbscan_final_clustering(xs_features_data, ys_labels_true, best_epsilon_value, num_closest_k_neighbors = 5):
    """Run the final DBSCAN clustering, plot it, score it and write the HTML report.

    Args:
        xs_features_data: Feature data handed to ``dbscan_clustering_method``
            and to the plotting/metrics helpers.
        ys_labels_true: Ground-truth labels forwarded to
            ``compute_clustering_performance_metrics``.
        best_epsilon_value: Epsilon used for the clustering and shown in plots.
        num_closest_k_neighbors: Forwarded to ``dbscan_clustering_method``.

    Returns:
        A 7-tuple ``(silhouette, precision, recall, rand_index, f1,
        adjusted_rand, confusion_matrix_rand_index)``. Every entry is ``None``
        when fewer than two clusters were found, because the metrics are only
        computed for two or more clusters.
    """
    # Only the labels and the core points are needed here; the remaining
    # outputs of the clustering helper are ignored.
    ys_labels_predicted, _, clusters_centroids_points, _, _, _ = dbscan_clustering_method(
        xs_features_data,
        best_epsilon_value,
        num_closest_k_neighbors = num_closest_k_neighbors,
    )

    # Labels are assumed to be 0-based with -1 for noise, so max + 1 is the
    # number of clusters (0 when every point is noise).
    num_clusters_centroids = ( nan_max(ys_labels_predicted) + 1 )

    plot_clusters_centroids_and_radii(
        "DBSCAN", xs_features_data, ys_labels_predicted, clusters_centroids_points,
        num_clusters = num_clusters_centroids, epsilon = best_epsilon_value,
        damping = None, final_clustering = True,
    )

    # Defaults so the return below is well-defined when no metrics are computed
    # (previously this path raised UnboundLocalError).
    dbscan_final_clustering_silhouette_score = None
    dbscan_final_clustering_precision_score = None
    dbscan_final_clustering_recall_score = None
    dbscan_final_clustering_rand_index_score = None
    dbscan_final_clustering_f1_score = None
    dbscan_final_clustering_adjusted_rand_score = None
    dbscan_final_clustering_confusion_matrix_rand_index = None

    if num_clusters_centroids >= 2:

        # The silhouette plot is skipped for more than 26 clusters.
        if num_clusters_centroids <= 26:
            plot_silhouette_analysis(
                "DBSCAN", xs_features_data, ys_labels_predicted, clusters_centroids_points,
                num_clusters_centroids, epsilon = best_epsilon_value,
                damping = None, final_clustering = True,
            )

        # The method label is "DBSCAN" (it used to be "K-Means", a copy/paste
        # slip that mislabelled the metrics of this clustering).
        (dbscan_final_clustering_silhouette_score,
         dbscan_final_clustering_precision_score,
         dbscan_final_clustering_recall_score,
         dbscan_final_clustering_rand_index_score,
         dbscan_final_clustering_f1_score,
         dbscan_final_clustering_adjusted_rand_score,
         dbscan_final_clustering_confusion_matrix_rand_index) = compute_clustering_performance_metrics(
            "DBSCAN", xs_features_data, ys_labels_true, ys_labels_predicted,
            num_clusters_centroids, final_clustering = True,
        )

        plot_confusion_matrix_rand_index_clustering_heatmap(
            "DBSCAN", dbscan_final_clustering_confusion_matrix_rand_index,
            num_clusters_centroids, epsilon = best_epsilon_value,
            damping = None, final_clustering = True,
        )

    # One sequential id per example, in label order.
    xs_ids_examples = list(range(0, len(ys_labels_predicted)))

    html_report_cluster_labels(array_matrix(xs_ids_examples), ys_labels_predicted, "dbscan.html")

    return (
        dbscan_final_clustering_silhouette_score,
        dbscan_final_clustering_precision_score,
        dbscan_final_clustering_recall_score,
        dbscan_final_clustering_rand_index_score,
        dbscan_final_clustering_f1_score,
        dbscan_final_clustering_adjusted_rand_score,
        dbscan_final_clustering_confusion_matrix_rand_index,
    )
def k_means_final_clustering(xs_features_data, ys_labels_true, num_clusters=4):
    """Run the final K-Means clustering, plot it, score it and write the HTML report.

    Args:
        xs_features_data: Feature data handed to ``k_means_clustering_method``
            and to the plotting/metrics helpers.
        ys_labels_true: Ground-truth labels forwarded to
            ``compute_clustering_performance_metrics``.
        num_clusters: Number of clusters requested from K-Means.

    Returns:
        An 8-tuple ``(error, silhouette, precision, recall, rand_index, f1,
        adjusted_rand, confusion_matrix_rand_index)``. ``error`` is the third
        value returned by ``k_means_clustering_method``. The seven metric
        entries are ``None`` when ``num_clusters`` is below two, because the
        metrics are only computed for two or more clusters.
    """
    (ys_labels_predicted,
     k_means_estimator_centroids,
     k_means_final_clustering_error) = k_means_clustering_method(
        xs_features_data, num_clusters=num_clusters, max_iterations=300)

    plot_clusters_centroids_and_radii(
        "K-Means", xs_features_data, ys_labels_predicted, k_means_estimator_centroids,
        num_clusters=num_clusters, epsilon=None, damping=None, final_clustering=True)

    # Defaults so the return below is well-defined when no metrics are computed
    # (previously this path raised UnboundLocalError).
    k_means_final_clustering_silhouette_score = None
    k_means_final_clustering_precision_score = None
    k_means_final_clustering_recall_score = None
    k_means_final_clustering_rand_index_score = None
    k_means_final_clustering_f1_score = None
    k_means_final_clustering_adjusted_rand_score = None
    k_means_final_clustering_confusion_matrix_rand_index = None

    if num_clusters >= 2:

        # The silhouette plot is skipped for more than 26 clusters.
        if num_clusters <= 26:
            plot_silhouette_analysis(
                "K-Means", xs_features_data, ys_labels_predicted, k_means_estimator_centroids,
                num_clusters, epsilon=None, damping=None, final_clustering=True)

        (k_means_final_clustering_silhouette_score,
         k_means_final_clustering_precision_score,
         k_means_final_clustering_recall_score,
         k_means_final_clustering_rand_index_score,
         k_means_final_clustering_f1_score,
         k_means_final_clustering_adjusted_rand_score,
         k_means_final_clustering_confusion_matrix_rand_index) = compute_clustering_performance_metrics(
            "K-Means", xs_features_data, ys_labels_true, ys_labels_predicted,
            num_clusters, final_clustering=True)

        plot_confusion_matrix_rand_index_clustering_heatmap(
            "K-Means", k_means_final_clustering_confusion_matrix_rand_index,
            num_clusters, epsilon=None, damping=None, final_clustering=True)

    # One sequential id per example.
    xs_ids_examples = list(range(0, len(xs_features_data)))

    html_report_cluster_labels(array_matrix(xs_ids_examples), ys_labels_predicted, "k-means.html")

    return (
        k_means_final_clustering_error,
        k_means_final_clustering_silhouette_score,
        k_means_final_clustering_precision_score,
        k_means_final_clustering_recall_score,
        k_means_final_clustering_rand_index_score,
        k_means_final_clustering_f1_score,
        k_means_final_clustering_adjusted_rand_score,
        k_means_final_clustering_confusion_matrix_rand_index,
    )
def affinity_propagation_final_clustering(xs_features_data, ys_labels_true, best_damping_value = 0.5):
    """Run the final Affinity Propagation clustering, score it and write the HTML report.

    Args:
        xs_features_data: Feature data handed to
            ``affinity_propagation_clustering_method`` and the metrics helper.
        ys_labels_true: Ground-truth labels forwarded to
            ``compute_clustering_performance_metrics``.
        best_damping_value: Damping value used for the clustering and shown in
            the confusion-matrix plot.

    Returns:
        A 7-tuple ``(silhouette, precision, recall, rand_index, f1,
        adjusted_rand, confusion_matrix_rand_index)``. Every entry is ``None``
        when fewer than two clusters were produced, because the metrics are
        only computed for two or more clusters.
    """
    # The exemplar indices and points returned by the helper are not used here.
    ys_labels_predicted, _, _ = affinity_propagation_clustering_method(
        xs_features_data, damping_value = best_damping_value, max_iterations = 300)

    # Labels are assumed to be 0-based, so max + 1 is the number of clusters.
    num_clusters_centroids = ( nan_max(ys_labels_predicted) + 1 )

    # Defaults so the return below is well-defined when no metrics are computed
    # (previously this path raised UnboundLocalError).
    affinity_propagation_final_clustering_silhouette_score = None
    affinity_propagation_final_clustering_precision_score = None
    affinity_propagation_final_clustering_recall_score = None
    affinity_propagation_final_clustering_rand_index_score = None
    affinity_propagation_final_clustering_f1_score = None
    affinity_propagation_final_clustering_adjusted_rand_score = None
    affinity_propagation_final_clustering_confusion_matrix_rand_index = None

    if num_clusters_centroids >= 2:

        (affinity_propagation_final_clustering_silhouette_score,
         affinity_propagation_final_clustering_precision_score,
         affinity_propagation_final_clustering_recall_score,
         affinity_propagation_final_clustering_rand_index_score,
         affinity_propagation_final_clustering_f1_score,
         affinity_propagation_final_clustering_adjusted_rand_score,
         affinity_propagation_final_clustering_confusion_matrix_rand_index) = compute_clustering_performance_metrics(
            "Affinity-Propagation", xs_features_data, ys_labels_true, ys_labels_predicted,
            num_clusters_centroids, final_clustering = True)

        plot_confusion_matrix_rand_index_clustering_heatmap(
            "Affinity-Propagation",
            affinity_propagation_final_clustering_confusion_matrix_rand_index,
            num_clusters_centroids, epsilon = None, damping = best_damping_value,
            final_clustering = True)

    # One sequential id per example, in label order.
    xs_ids_examples = list(range(0, len(ys_labels_predicted)))

    html_report_cluster_labels(
        array_matrix(xs_ids_examples), ys_labels_predicted, "affinity-propagation.html")

    return (
        affinity_propagation_final_clustering_silhouette_score,
        affinity_propagation_final_clustering_precision_score,
        affinity_propagation_final_clustering_recall_score,
        affinity_propagation_final_clustering_rand_index_score,
        affinity_propagation_final_clustering_f1_score,
        affinity_propagation_final_clustering_adjusted_rand_score,
        affinity_propagation_final_clustering_confusion_matrix_rand_index,
    )
def dbscan_clustering_method(xs_features_data, current_epsilon, num_closest_k_neighbors = 5):
    """Fit DBSCAN on the data and split the examples into core, border and noise.

    Args:
        xs_features_data: Feature data to cluster; it is indexed with boolean
            masks and with ``[indices, :]``, so a 2-D array is expected.
        current_epsilon: Passed to the estimator as ``eps``.
        num_closest_k_neighbors: Passed to the estimator as ``min_samples``.

    Returns:
        A 6-tuple ``(ys_labels_predicted, clusters_centroids_indices,
        clusters_centroids_points, clusters_border_points,
        xs_features_data_inliers, xs_features_data_outliers)`` where

        * ``ys_labels_predicted`` is the estimator's ``labels_``;
        * ``clusters_centroids_indices`` is its ``core_sample_indices_``;
        * ``clusters_centroids_points`` are the rows at those indices;
        * ``clusters_border_points`` are the rows labelled != -1 that are not
          core samples;
        * ``xs_features_data_inliers`` / ``xs_features_data_outliers`` are the
          rows labelled != -1 / == -1 respectively.
    """
    dbscan_clustering = dbscan(eps = current_epsilon, min_samples = num_closest_k_neighbors)
    dbscan_clustering.fit(xs_features_data)

    ys_labels_predicted = dbscan_clustering.labels_
    clusters_centroids_indices = dbscan_clustering.core_sample_indices_

    xs_features_data_inliers = xs_features_data[ys_labels_predicted != -1]
    xs_features_data_outliers = xs_features_data[ys_labels_predicted == -1]

    clusters_centroids_points = xs_features_data[clusters_centroids_indices, :]

    # Border points are decided by row position rather than by
    # "point not in core_points": on arrays that operator is an any-element
    # comparison, so a point sharing a single coordinate value with any core
    # point was wrongly treated as a core point.
    core_positions = {int(index) for index in clusters_centroids_indices}
    clusters_border_points = array_matrix([
        list(point)
        for position, (point, label) in enumerate(zip(xs_features_data, ys_labels_predicted))
        if label != -1 and position not in core_positions
    ])

    return (
        ys_labels_predicted,
        clusters_centroids_indices,
        clusters_centroids_points,
        clusters_border_points,
        xs_features_data_inliers,
        xs_features_data_outliers,
    )
# Elbow method for K-Means: plot the inertias and obtain the number of
# clusters, then run the final K-Means clustering with it.
(k_means_xs_points_elbow_method,
 k_means_ys_points_elbow_method,
 best_num_clusters) = plot_elbow_method(
    "K-Means", k_means_squared_errors_sums_intertias, best_num_clusters,
    num_max_clusters = NUM_MAX_CLUSTERS)

# NOTE: the call returns the whole result tuple of k_means_final_clustering
# (error first, then the metrics), not just the error.
error_k_means_final_clustering = k_means_final_clustering(
    normalized_data_xs_best_features_priori, ys_labels_true,
    num_clusters = best_num_clusters)

# ---- K-Means Clustering ----

# ---- DBScan Clustering ----

# Sweep of epsilon values, collecting per-epsilon counts and scores.
(dbscan_num_centroids,
 dbscan_num_inliers,
 dbscan_num_outliers,
 dbscan_silhouette_scores,
 dbscan_precision_scores,
 dbscan_recall_scores,
 dbscan_rand_index_scores,
 dbscan_f1_scores,
 dbscan_adjusted_rand_scores) = dbscan_pre_clustering(
    normalized_data_xs_best_features_priori, ys_labels_true,
    start_epsilon = START_EPSILON, end_epsilon = END_EPSILON,
    step_epsilon = STEP_EPSILON)

# k-distance curve: x is the position of each point, y its neighbour distance.
num_data_points_sorted_by_distance, k_neighbors_distances_epsilons = compute_distances_nearest_neighbors(
    normalized_data_xs_best_features_priori,
    num_closest_k_neighbors = NUM_K_NEAREST_NEIGHBORS)

dbscan_xs_points_k_distance_method = array_matrix( range( num_data_points_sorted_by_distance ) )
dbscan_ys_points_k_distance_method = k_neighbors_distances_epsilons

# x position of the knee of the k-distance curve; stays None when the knee
# locator is not in use or reports no knee.
knee_x_value = None

if KNEED_LIB_IN_USE:
    kneed_locator_k_distance = knee_locator(
        dbscan_xs_points_k_distance_method, dbscan_ys_points_k_distance_method,
        S = 1.0, curve = "convex", direction = "increasing")
    knee_x_value = kneed_locator_k_distance.elbow

if knee_x_value is not None:
    # round(x, 0) keeps the input type (a float stays a float), which is not
    # a valid index; convert explicitly to an integer position.
    best_distance_epsilon = dbscan_ys_points_k_distance_method[int(round(knee_x_value))]
else:
    # Fallback used when the knee locator is disabled or found no knee.
    best_distance_epsilon = find_best_distance_epsilon(
        dbscan_ys_points_k_distance_method, THRESHOLD_MAJORITY)

print( "The best Distance ( ε (Epsilon Value) ), for DBScan, found:" )
def bisecting_k_means_clustering(xs_features_data, examples_ids, ys_labels_true, final_max_num_clusters_and_iterations=2):
    """Cluster the data by repeatedly bisecting the largest leaf cluster.

    Starting from a single cluster holding every example, the cluster with the
    most examples is split in two with
    ``bissect_k_means_into_two_sub_clusters`` until
    ``final_max_num_clusters_and_iterations`` leaf clusters exist. The result
    is then plotted, scored and written to an HTML report.

    Args:
        xs_features_data: Feature data of all examples.
        examples_ids: Ids of the examples. The per-example prediction paths
            are indexed by position ``0 .. len(xs_features_data) - 1`` and
            matched against these ids, so they are presumably 0-based
            positions - confirm with caller.
        ys_labels_true: Ground-truth labels forwarded to
            ``compute_clustering_performance_metrics``.
        final_max_num_clusters_and_iterations: Number of leaf clusters at
            which the bisection stops.

    Returns:
        A 6-tuple ``(clusters_ids, clusters_data, ys_final_labels_predicted,
        tree_predictions_lists_without_offset, num_clusters,
        effective_num_clusters)``, where ``effective_num_clusters`` is the
        number of distinct final labels.
    """
    num_examples = len(xs_features_data)

    # Parallel lists: position i of each list describes the same leaf cluster.
    clusters_ids = [0]
    clusters_data = [xs_features_data]
    clusters_examples_ids = [examples_ids]
    clusters_centroids_points = [0]  # placeholder for the root cluster
    num_clusters = 1

    # One (initially empty) list per example, recording its path of labels.
    tree_predictions_lists_with_offset = array_empty((num_examples, 0)).tolist()
    tree_predictions_lists_without_offset = array_empty((num_examples, 0)).tolist()

    while num_clusters < final_max_num_clusters_and_iterations:

        # First leaf with the largest number of examples.
        cluster_index_with_more_examples = max(
            range(num_clusters),
            key=lambda index_cluster: len(clusters_data[index_cluster]))

        # Remove the chosen leaf from ALL parallel lists by position.
        # list.remove() compares by equality, which is element-wise (and thus
        # ambiguous or an error) for arrays, and the ids list used to be left
        # untouched after the first split, desynchronising the lists.
        cluster_id_to_be_divided = clusters_ids.pop(cluster_index_with_more_examples)
        cluster_data_to_be_divided = clusters_data.pop(cluster_index_with_more_examples)
        cluster_examples_ids_to_be_divided = clusters_examples_ids.pop(
            cluster_index_with_more_examples)
        clusters_centroids_points.pop(cluster_index_with_more_examples)

        # For the very first split both offsets are 0 (the root id is 0 and
        # num_clusters - 1 == 0), so a single call covers both cases.
        (two_sub_clusters_ids,
         two_sub_clusters_examples_ids,
         two_sub_clusters_data,
         two_sub_clusters_centroids,
         _ys_labels_predicted_without_offset,
         ys_labels_predicted_with_offset,
         _cluster_squared_error_sum_intertia) = bissect_k_means_into_two_sub_clusters(
            cluster_data_to_be_divided, cluster_examples_ids_to_be_divided,
            left_leaf_cluster_id_offset=cluster_id_to_be_divided,
            right_leaf_cluster_id_offset=(num_clusters - 1))

        ys_labels_predicted_with_offset_unique = array_sort(
            array_unique_values(ys_labels_predicted_with_offset))

        for sub_cluster_id in range(2):
            clusters_ids.append(two_sub_clusters_ids[sub_cluster_id])
            clusters_data.append(two_sub_clusters_data[sub_cluster_id])
            clusters_examples_ids.append(two_sub_clusters_examples_ids[sub_cluster_id])
            clusters_centroids_points.append(two_sub_clusters_centroids[sub_cluster_id])

            sub_cluster_examples_ids = two_sub_clusters_examples_ids[sub_cluster_id]

            for example_index in range(num_examples):
                if example_index in sub_cluster_examples_ids:
                    # The split may yield fewer than two distinct labels.
                    if sub_cluster_id < len(ys_labels_predicted_with_offset_unique):
                        tree_predictions_lists_with_offset[example_index].append(
                            ys_labels_predicted_with_offset_unique[sub_cluster_id])

                    tree_predictions_lists_without_offset[example_index].append(sub_cluster_id)

        num_clusters = (num_clusters + 1)

    # The final label of an example is the last label on its path. Examples
    # with no recorded label keep 0 (the root id) instead of raising
    # IndexError, e.g. when no split was performed at all.
    ys_final_labels_predicted = matrix_array_zeros(num_examples)
    for example_index, example_predictions in enumerate(tree_predictions_lists_with_offset):
        if example_predictions:
            ys_final_labels_predicted[example_index] = example_predictions[-1]

    clusters_centroids_points = array_matrix(clusters_centroids_points)

    effective_num_clusters = len(array_unique_values(ys_final_labels_predicted))

    if effective_num_clusters >= 2:

        # Plots are skipped for more than 26 clusters.
        if effective_num_clusters <= 26:
            plot_clusters_centroids_and_radii(
                "Bisecting-K-Means", xs_features_data, ys_final_labels_predicted,
                clusters_centroids_points, effective_num_clusters,
                epsilon=None, damping=None, final_clustering=True)

            plot_silhouette_analysis(
                "Bisecting-K-Means", xs_features_data, ys_final_labels_predicted,
                clusters_centroids_points, effective_num_clusters,
                epsilon=None, damping=None, final_clustering=True)

        # Only the confusion matrix is used below; the scalar scores are
        # computed by the helper but not returned by this function.
        (_silhouette_score,
         _precision_score,
         _recall_score,
         _rand_index_score,
         _f1_score,
         _adjusted_rand_score,
         bisecting_k_means_final_clustering_confusion_matrix_rand_index) = compute_clustering_performance_metrics(
            "Bisecting-K-Means", xs_features_data, ys_labels_true,
            ys_final_labels_predicted, effective_num_clusters, final_clustering=True)

        plot_confusion_matrix_rand_index_clustering_heatmap(
            "Bisecting-K-Means",
            bisecting_k_means_final_clustering_confusion_matrix_rand_index,
            effective_num_clusters, epsilon=None, damping=None, final_clustering=True)

    # One sequential id per example.
    xs_ids_examples = list(range(0, len(examples_ids)))

    html_report_cluster_labels_hierarchical(
        xs_ids_examples, tree_predictions_lists_without_offset,
        "bisecting-k-means-hierarchical.html")

    return (
        clusters_ids,
        clusters_data,
        ys_final_labels_predicted,
        tree_predictions_lists_without_offset,
        num_clusters,
        effective_num_clusters,
    )