# Code example 1
def clustering(headers, min_dist, distance_matrix_filename=None):
    """Cluster *headers* with the project's ``fit`` routine and score the result.

    Parameters
    ----------
    headers : sequence
        Items to cluster; only used when no precomputed matrix is supplied.
    min_dist : number
        Distance threshold forwarded to ``fit``.
    distance_matrix_filename : str, optional
        Path to a precomputed distance matrix. When given, the matrix is read
        from disk instead of being computed from ``headers``.

    Returns
    -------
    float
        Silhouette score of the resulting labels over the precomputed
        distance matrix.
    """
    if distance_matrix_filename is None:
        dist_matrix, _ = get_distance_matrix(headers)
    else:
        dist_matrix, _ = read_dist_matrix(distance_matrix_filename)

    n_clusters_, labels = fit(dist_matrix, min_dist)

    # np.asarray instead of the deprecated np.asmatrix: np.matrix is
    # deprecated, and silhouette_score accepts any 2-D array-like.
    return metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                    metric='precomputed')
# Code example 2
def clustering(headers, eps, distance_matrix_filename=None):
    """Run DBSCAN over a precomputed distance matrix and score the result.

    Parameters
    ----------
    headers : sequence
        Items to cluster; only used when no precomputed matrix is supplied.
    eps : number
        DBSCAN neighborhood radius (on the precomputed distances).
    distance_matrix_filename : str, optional
        Path to a precomputed distance matrix. When given, the matrix is read
        from disk instead of being computed from ``headers``.

    Returns
    -------
    float
        Silhouette score of the DBSCAN labels over the distance matrix.
        NOTE(review): DBSCAN may label points as noise (-1); silhouette_score
        treats -1 as an ordinary cluster label here — confirm that is intended.
    """
    if distance_matrix_filename is None:
        dist_matrix, _ = get_distance_matrix(headers)
    else:
        dist_matrix, _ = read_dist_matrix(distance_matrix_filename)

    dbscan = DBSCAN(eps=eps, min_samples=2, metric="precomputed").fit(
        dist_matrix)

    # np.asarray instead of the deprecated np.asmatrix: np.matrix is
    # deprecated, and silhouette_score accepts any 2-D array-like.
    return metrics.silhouette_score(np.asarray(dist_matrix), dbscan.labels_,
                                    metric='precomputed')
# Code example 3
def main(dataset_filename, output_data_filename,
         distance_matrix_filename=None, display=False):
    """Cluster header pairs with Affinity Propagation and report metrics.

    Reads the dataset, builds (or loads) the pairwise distance matrix,
    converts it to an affinity matrix, fits AffinityPropagation, prints a
    set of clustering-quality metrics, writes the clusterized data to
    ``output_data_filename`` and optionally visualizes the result.

    Parameters
    ----------
    dataset_filename : str
        Input dataset with header pairs and ground-truth labels.
    output_data_filename : str
        Destination file for the clusterized data.
    distance_matrix_filename : str, optional
        Precomputed distance matrix; computed from the dataset when omitted.
    display : bool, optional
        When True, show a visualization of the clustering.
    """
    start = time.perf_counter()

    headers_pairs = get_headers_pairs_list(dataset_filename, verbose=True)
    labels_true = get_labels(dataset_filename, verbose=True)

    if distance_matrix_filename is None:
        dist_matrix, max_dist = get_distance_matrix(list(map(lambda x: x[1],
                                                             headers_pairs)),
                                                    verbose=True)
    else:
        dist_matrix, max_dist = \
            read_dist_matrix(distance_matrix_filename, verbose=True)

    affinity_matr = get_affinity_matrix(dist_matrix, verbose=True,
                                        max_affinity=max_dist)
    print("Clustering...")
    af = AffinityPropagation(affinity="precomputed", verbose=True,
                             copy=True).fit(affinity_matr)
    print("Done.")

    cluster_centers_indices = af.cluster_centers_indices_
    n_clusters_ = len(cluster_centers_indices)
    labels = af.labels_

    metrics_list = [
        n_clusters_,
        metrics.homogeneity_score(labels_true, labels),
        metrics.completeness_score(labels_true, labels),
        metrics.v_measure_score(labels_true, labels),
        metrics.adjusted_rand_score(labels_true, labels),
        metrics.adjusted_mutual_info_score(labels_true, labels),
        # np.asarray instead of the deprecated np.asmatrix (np.matrix is
        # deprecated; silhouette_score accepts any 2-D array-like).
        metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                 metric='precomputed')
    ]

    print_metrics(metrics_list)

    write_clusterized_data(output_data_filename, headers_pairs, labels,
                           metrics=metrics_list, verbose=True)

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))

    if display:
        visualize(dist_matrix, labels, cluster_centers_indices,
                  show_cluster_sizes=True)
# Code example 4
def main(dataset_filename, output_data_filename,
         distance_matrix_filename=None, eps=10, display=False):
    """Cluster header pairs with DBSCAN and report metrics.

    Reads the dataset, builds (or loads) the pairwise distance matrix, fits
    DBSCAN on the precomputed distances, relabels every noise point (-1) as
    its own singleton cluster, prints clustering-quality metrics, writes the
    clusterized data and optionally visualizes the result.

    Parameters
    ----------
    dataset_filename : str
        Input dataset with header pairs and ground-truth labels.
    output_data_filename : str
        Destination file for the clusterized data.
    distance_matrix_filename : str, optional
        Precomputed distance matrix; computed from the dataset when omitted.
    eps : number, optional
        DBSCAN neighborhood radius on the precomputed distances.
    display : bool, optional
        When True, show a visualization of the clustering.
    """
    start = time.perf_counter()

    headers_pairs = get_headers_pairs_list(dataset_filename, verbose=True)
    labels_true = get_labels(dataset_filename, verbose=True)

    if distance_matrix_filename is None:
        dist_matrix, _ = get_distance_matrix(list(map(lambda x: x[1],
                                                      headers_pairs)),
                                             verbose=True)
    else:
        dist_matrix, _ = \
            read_dist_matrix(distance_matrix_filename, verbose=True)

    print("Clustering...")
    dbscan = DBSCAN(eps=eps, min_samples=2, metric="precomputed").fit(
        dist_matrix)
    print("Done.")

    # Copy so the noise relabeling below does not mutate dbscan.labels_,
    # which is still needed unmodified for the visualization.
    labels = np.copy(dbscan.labels_)

    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters_without_noise = n_clusters_

    # Give every noise point (-1) a unique new cluster id so downstream
    # metrics and the output writer see only non-negative labels.
    for i, l in enumerate(labels):
        if l == -1:
            labels[i] = n_clusters_
            n_clusters_ += 1

    metrics_list = [
        n_clusters_,
        metrics.homogeneity_score(labels_true, labels),
        metrics.completeness_score(labels_true, labels),
        metrics.v_measure_score(labels_true, labels),
        metrics.adjusted_rand_score(labels_true, labels),
        metrics.adjusted_mutual_info_score(labels_true, labels),
        # np.asarray instead of the deprecated np.asmatrix (np.matrix is
        # deprecated; silhouette_score accepts any 2-D array-like).
        metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                 metric='precomputed')
    ]

    print_metrics(metrics_list)

    write_clusterized_data(output_data_filename, headers_pairs, labels,
                           metrics=metrics_list, verbose=True)

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))

    if display:
        visualize_dbscan(dist_matrix, dbscan.labels_,
                         clusters_without_noise + 1, show_cluster_sizes=True)
# Code example 5
def main(dataset_filename, output_data_filename,
         distance_matrix_filename=None, eps=10, display=False):
    """Cluster header pairs with DBSCAN and report metrics.

    Reads the dataset, builds (or loads) the pairwise distance matrix, fits
    DBSCAN on the precomputed distances, relabels every noise point (-1) as
    its own singleton cluster, prints clustering-quality metrics, writes the
    clusterized data and optionally visualizes the result.

    Parameters
    ----------
    dataset_filename : str
        Input dataset with header pairs and ground-truth labels.
    output_data_filename : str
        Destination file for the clusterized data.
    distance_matrix_filename : str, optional
        Precomputed distance matrix; computed from the dataset when omitted.
    eps : number, optional
        DBSCAN neighborhood radius on the precomputed distances.
    display : bool, optional
        When True, show a visualization of the clustering.
    """
    start = time.perf_counter()

    headers_pairs = get_headers_pairs_list(dataset_filename, verbose=True)
    labels_true = get_labels(dataset_filename, verbose=True)

    if distance_matrix_filename is None:
        dist_matrix, _ = get_distance_matrix(list(map(lambda x: x[1],
                                                      headers_pairs)),
                                             verbose=True)
    else:
        dist_matrix, _ = \
            read_dist_matrix(distance_matrix_filename, verbose=True)

    print("Clustering...")
    dbscan = DBSCAN(eps=eps, min_samples=2, metric="precomputed").fit(
        dist_matrix)
    print("Done.")

    # Copy so the noise relabeling below does not mutate dbscan.labels_,
    # which is still needed unmodified for the visualization.
    labels = np.copy(dbscan.labels_)

    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters_without_noise = n_clusters_

    # Give every noise point (-1) a unique new cluster id so downstream
    # metrics and the output writer see only non-negative labels.
    for i, l in enumerate(labels):
        if l == -1:
            labels[i] = n_clusters_
            n_clusters_ += 1

    metrics_list = [
        n_clusters_,
        metrics.homogeneity_score(labels_true, labels),
        metrics.completeness_score(labels_true, labels),
        metrics.v_measure_score(labels_true, labels),
        metrics.adjusted_rand_score(labels_true, labels),
        metrics.adjusted_mutual_info_score(labels_true, labels),
        # np.asarray instead of the deprecated np.asmatrix (np.matrix is
        # deprecated; silhouette_score accepts any 2-D array-like).
        metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                 metric='precomputed')
    ]

    print_metrics(metrics_list)

    write_clusterized_data(output_data_filename, headers_pairs, labels,
                           metrics=metrics_list, verbose=True)

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))

    if display:
        visualize_dbscan(dist_matrix, dbscan.labels_,
                         clusters_without_noise + 1, show_cluster_sizes=True)
# Code example 6
def clustering(headers, distance_matrix_filename=None):
    """Cluster *headers* with Affinity Propagation and score the result.

    Parameters
    ----------
    headers : sequence
        Items to cluster; only used when no precomputed matrix is supplied.
    distance_matrix_filename : str, optional
        Path to a precomputed distance matrix. When given, the matrix is read
        from disk instead of being computed from ``headers``.

    Returns
    -------
    float
        Silhouette score of the Affinity Propagation labels over the
        precomputed distance matrix.
    """
    if distance_matrix_filename is None:
        dist_matrix, max_dist = get_distance_matrix(headers)
    else:
        dist_matrix, max_dist = read_dist_matrix(distance_matrix_filename)

    affinity_matr = get_affinity_matrix(dist_matrix, max_affinity=max_dist)

    af = AffinityPropagation(affinity="precomputed", copy=True).fit(
        affinity_matr)

    # np.asarray instead of the deprecated np.asmatrix: np.matrix is
    # deprecated, and silhouette_score accepts any 2-D array-like.
    return metrics.silhouette_score(np.asarray(dist_matrix), af.labels_,
                                    metric='precomputed')
# Code example 7
def main(dataset_filename, output_data_filename,
         distance_matrix_filename=None, display=False, min_dist=10):
    """Cluster header pairs with the project's ``fit`` routine and report metrics.

    Reads the dataset, builds (or loads) the pairwise distance matrix, runs
    ``fit`` with the given distance threshold, prints clustering-quality
    metrics and writes the clusterized data to ``output_data_filename``.

    Parameters
    ----------
    dataset_filename : str
        Input dataset with header pairs and ground-truth labels.
    output_data_filename : str
        Destination file for the clusterized data.
    distance_matrix_filename : str, optional
        Precomputed distance matrix; computed from the dataset when omitted.
    display : bool, optional
        Accepted for interface consistency with the other ``main`` variants;
        not used in this variant.
    min_dist : number, optional
        Distance threshold forwarded to ``fit``.
    """
    start = time.perf_counter()

    headers_pairs = get_headers_pairs_list(dataset_filename, verbose=True)
    labels_true = get_labels(dataset_filename, verbose=True)

    if distance_matrix_filename is None:
        dist_matrix, max_dist = get_distance_matrix(list(map(lambda x: x[1],
                                                             headers_pairs)),
                                                    verbose=True)
    else:
        dist_matrix, max_dist = \
            read_dist_matrix(distance_matrix_filename, verbose=True)

    print("Clustering...")
    n_clusters_, labels = fit(dist_matrix, min_dist)
    print("Done.")

    print("clusters {0}".format(n_clusters_))
    print(labels)

    metrics_list = [
        n_clusters_,
        metrics.homogeneity_score(labels_true, labels),
        metrics.completeness_score(labels_true, labels),
        metrics.v_measure_score(labels_true, labels),
        metrics.adjusted_rand_score(labels_true, labels),
        metrics.adjusted_mutual_info_score(labels_true, labels),
        # np.asarray instead of the deprecated np.asmatrix (np.matrix is
        # deprecated; silhouette_score accepts any 2-D array-like).
        metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                 metric='precomputed')
    ]

    print_metrics(metrics_list)

    write_clusterized_data(output_data_filename, headers_pairs, labels,
                           metrics=metrics_list, verbose=True)

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))