import time

import numpy as np
from sklearn import metrics
from sklearn.cluster import AffinityPropagation, DBSCAN

# Project-local helpers (pairwise distance computation, matrix I/O,
# reporting and visualization) are assumed importable from the surrounding
# package: get_distance_matrix, read_dist_matrix, fit, get_headers_pairs_list,
# get_labels, get_affinity_matrix, print_metrics, write_clusterized_data,
# visualize, visualize_dbscan.


def clustering(headers, min_dist, distance_matrix_filename=None):
    """Cluster headers with the threshold-based fit() and return the
    silhouette score of the resulting labelling."""
    if distance_matrix_filename is None:
        dist_matrix, _ = get_distance_matrix(headers)
    else:
        dist_matrix, _ = read_dist_matrix(distance_matrix_filename)
    n_clusters_, labels = fit(dist_matrix, min_dist)
    return metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                    metric='precomputed')
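# fit() is project-local and not shown in this section. For intuition only,
# a comparable (n_clusters, labels) pair can be approximated with SciPy's
# single-linkage hierarchical clustering cut at min_dist; this is a hedged
# sketch of the idea, not the project's actual implementation.
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform


def fit_sketch(dist_matrix, min_dist):
    condensed = squareform(np.asarray(dist_matrix, dtype=float),
                           checks=False)
    labels = fcluster(linkage(condensed, method="single"),
                      t=min_dist, criterion="distance") - 1  # 0-based ids
    return len(set(labels)), labels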
def clustering(headers, eps, distance_matrix_filename=None):
    """DBSCAN variant: cluster headers at radius eps and return the
    silhouette score of the resulting labelling."""
    if distance_matrix_filename is None:
        dist_matrix, _ = get_distance_matrix(headers)
    else:
        dist_matrix, _ = read_dist_matrix(distance_matrix_filename)
    dbscan = DBSCAN(eps=eps, min_samples=2,
                    metric="precomputed").fit(dist_matrix)
    return metrics.silhouette_score(np.asarray(dist_matrix), dbscan.labels_,
                                    metric='precomputed')
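# Minimal sanity check for the precomputed-metric DBSCAN call above: two
# tight pairs far apart should form two clusters at eps=2. The toy matrix
# is illustrative, not project data.
def _dbscan_toy_check():
    toy = np.array([[0., 1., 9., 9.],
                    [1., 0., 9., 9.],
                    [9., 9., 0., 1.],
                    [9., 9., 1., 0.]])
    labels = DBSCAN(eps=2, min_samples=2,
                    metric="precomputed").fit(toy).labels_
    return labels  # expected: array([0, 0, 1, 1])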
def main(dataset_filename, output_data_filename,
         distance_matrix_filename=None, display=False):
    """Affinity-propagation pipeline: read headers, cluster them and
    report clustering-quality metrics against the true labels."""
    start = time.perf_counter()

    headers_pairs = get_headers_pairs_list(dataset_filename, verbose=True)
    labels_true = get_labels(dataset_filename, verbose=True)

    if distance_matrix_filename is None:
        dist_matrix, max_dist = get_distance_matrix(
            list(map(lambda x: x[1], headers_pairs)), verbose=True)
    else:
        dist_matrix, max_dist = read_dist_matrix(distance_matrix_filename,
                                                 verbose=True)

    # pd.write_dist_matrix(dist_matrix, max_dist,
    #                      r"C:\Users\pavel.zhuk\IdeaProjects\email-"
    #                      r"parser\clustering\data\training_set_dist_matr.txt",
    #                      verbose=True)

    affinity_matr = get_affinity_matrix(dist_matrix, verbose=True,
                                        max_affinity=max_dist)

    print("Clustering...")
    af = AffinityPropagation(affinity="precomputed", verbose=True,
                             copy=True).fit(affinity_matr)
    print("Done.")

    cluster_centers_indices = af.cluster_centers_indices_
    n_clusters_ = len(cluster_centers_indices)
    labels = af.labels_

    metrics_list = [
        n_clusters_,
        metrics.homogeneity_score(labels_true, labels),
        metrics.completeness_score(labels_true, labels),
        metrics.v_measure_score(labels_true, labels),
        metrics.adjusted_rand_score(labels_true, labels),
        metrics.adjusted_mutual_info_score(labels_true, labels),
        metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                 metric='precomputed')
    ]
    print_metrics(metrics_list)

    write_clusterized_data(output_data_filename, headers_pairs, labels,
                           metrics=metrics_list, verbose=True)

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))

    if display:
        visualize(dist_matrix, labels, cluster_centers_indices,
                  show_cluster_sizes=True)
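# Example invocation of the pipeline above; the file names are placeholders,
# not paths from the project. Reusing a precomputed distance matrix skips
# the most expensive step on repeated runs.
if __name__ == "__main__":
    main("data/training_set.txt", "data/af_clusters_out.txt",
         distance_matrix_filename="data/training_set_dist_matr.txt",
         display=False)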
def main(dataset_filename, output_data_filename,
         distance_matrix_filename=None, eps=10, display=False):
    """DBSCAN pipeline: cluster headers at radius eps, fold noise points
    into singleton clusters and report clustering-quality metrics."""
    start = time.perf_counter()

    headers_pairs = get_headers_pairs_list(dataset_filename, verbose=True)
    labels_true = get_labels(dataset_filename, verbose=True)

    if distance_matrix_filename is None:
        dist_matrix, _ = get_distance_matrix(
            list(map(lambda x: x[1], headers_pairs)), verbose=True)
    else:
        dist_matrix, _ = read_dist_matrix(distance_matrix_filename,
                                          verbose=True)

    print("Clustering...")
    dbscan = DBSCAN(eps=eps, min_samples=2,
                    metric="precomputed").fit(dist_matrix)
    print("Done.")

    labels = np.copy(dbscan.labels_)
    # Cluster count excluding the DBSCAN noise label (-1).
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters_without_noise = n_clusters_
    # Give every noise point its own singleton cluster so the external
    # metrics below see a complete labelling.
    for i, l in enumerate(labels):
        if l == -1:
            labels[i] = n_clusters_
            n_clusters_ += 1

    metrics_list = [
        n_clusters_,
        metrics.homogeneity_score(labels_true, labels),
        metrics.completeness_score(labels_true, labels),
        metrics.v_measure_score(labels_true, labels),
        metrics.adjusted_rand_score(labels_true, labels),
        metrics.adjusted_mutual_info_score(labels_true, labels),
        metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                 metric='precomputed')
    ]
    print_metrics(metrics_list)

    write_clusterized_data(output_data_filename, headers_pairs, labels,
                           metrics=metrics_list, verbose=True)

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))

    if display:
        visualize_dbscan(dist_matrix, dbscan.labels_,
                         clusters_without_noise + 1,
                         show_cluster_sizes=True)
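# The relabelling loop above, factored out as an illustrative helper (not
# part of the original module): each DBSCAN noise point (-1) gets a fresh
# singleton cluster id so every sample ends up labelled.
def absorb_noise(dbscan_labels):
    labels = np.copy(dbscan_labels)
    next_id = len(set(labels)) - (1 if -1 in labels else 0)
    for i, l in enumerate(labels):
        if l == -1:
            labels[i] = next_id
            next_id += 1
    return labels, next_id  # relabelled array and total cluster count


# e.g. absorb_noise(np.array([0, 0, -1, 1, -1])) -> (array([0, 0, 2, 1, 3]), 4)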
def clustering(headers, distance_matrix_filename=None):
    """Affinity-propagation variant: cluster headers and return the
    silhouette score of the resulting labelling."""
    if distance_matrix_filename is None:
        dist_matrix, max_dist = get_distance_matrix(headers)
    else:
        dist_matrix, max_dist = read_dist_matrix(distance_matrix_filename)
    affinity_matr = get_affinity_matrix(dist_matrix, max_affinity=max_dist)
    af = AffinityPropagation(affinity="precomputed",
                             copy=True).fit(affinity_matr)
    return metrics.silhouette_score(np.asarray(dist_matrix), af.labels_,
                                    metric='precomputed')
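# get_affinity_matrix() is project-local. One plausible construction,
# consistent with its max_affinity=max_dist argument, flips distances into
# similarities so AffinityPropagation (which maximizes affinity) treats the
# closest pairs as most similar. An assumption-laden sketch:
def affinity_matrix_sketch(dist_matrix, max_affinity):
    return max_affinity - np.asarray(dist_matrix, dtype=float)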
def main(dataset_filename, output_data_filename,
         distance_matrix_filename=None, display=False, min_dist=10):
    """Threshold-based pipeline: cluster headers with fit() at the given
    min_dist and report clustering-quality metrics."""
    start = time.perf_counter()

    headers_pairs = get_headers_pairs_list(dataset_filename, verbose=True)
    labels_true = get_labels(dataset_filename, verbose=True)

    if distance_matrix_filename is None:
        dist_matrix, max_dist = get_distance_matrix(
            list(map(lambda x: x[1], headers_pairs)), verbose=True)
    else:
        dist_matrix, max_dist = read_dist_matrix(distance_matrix_filename,
                                                 verbose=True)

    print("Clustering...")
    n_clusters_, labels = fit(dist_matrix, min_dist)
    print("Done.")
    print("clusters {0}".format(n_clusters_))
    print(labels)

    metrics_list = [
        n_clusters_,
        metrics.homogeneity_score(labels_true, labels),
        metrics.completeness_score(labels_true, labels),
        metrics.v_measure_score(labels_true, labels),
        metrics.adjusted_rand_score(labels_true, labels),
        metrics.adjusted_mutual_info_score(labels_true, labels),
        metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                 metric='precomputed')
    ]
    print_metrics(metrics_list)

    write_clusterized_data(output_data_filename, headers_pairs, labels,
                           metrics=metrics_list, verbose=True)

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))
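# Hedged sketch: pick min_dist by silhouette using the threshold-based
# clustering() helper at the top of this section. Degenerate labellings
# (one cluster, or one point per cluster) make silhouette_score raise, so
# those candidates are skipped; the candidate range is illustrative.
def pick_min_dist(headers, matrix_file, candidates=range(5, 31, 5)):
    scores = {}
    for d in candidates:
        try:
            scores[d] = clustering(headers, d,
                                   distance_matrix_filename=matrix_file)
        except ValueError:  # degenerate labelling for this threshold
            continue
    return max(scores, key=scores.get) if scores else None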