Example #1
def cluster(
    vector_names,
    vectors,
    reduced_vectors,
    normalized_vectors,
    vectorizer,
    strategy="automatic",
    cluster_method="kmeans",
    n_clusters=2,
    epsilon=0.5,
    min_samples=5,
    metric="euclidean",
):
    """
    Clustering options:

    Manual:
     The user supplies all required information to do the clustering, including the
     clustering algorithm and its hyperparameters

    Assisted:
     The user assists the algorithm by suggesting that some samples should or should not be clustered together

    Automatic:
     Multiple clustering strategies and parameter settings are tried in an attempt to
     find the best clusters

    Viz:
     An interactive Bokeh web interface is served so the user can drive the clustering
     visually in a browser
    """
    if strategy == "manual":
        if cluster_method == "kmeans":
            return cluster_with_kmeans(normalized_vectors,
                                       n_clusters=n_clusters)

        elif cluster_method == "dbscan":
            return cluster_with_dbscan(normalized_vectors,
                                       epsilon=epsilon,
                                       min_samples=min_samples,
                                       metric=metric)

        elif cluster_method == "agglomerative":
            return cluster_with_agglomerative(normalized_vectors,
                                              n_clusters=n_clusters,
                                              metric=metric)

        else:
            raise NotImplementedError(
                "Unknown clustering method: {0}".format(cluster_method))

    elif strategy == "assisted":
        """
        To display a information about a vector to a user, you can use the following:
        display_vector_index_details(vector_index, vectors, vector_names, vectorizer)
        """
        # todo Try with normalized vectors
        return cluster_interactive(reduced_vectors, vectorizer, vectors,
                                   vector_names)
    elif strategy == "viz":
        clusteringnmap.interactive_state._vector_names = vector_names
        clusteringnmap.interactive_state._vectors = vectors
        clusteringnmap.interactive_state._vectorizer = vectorizer
        clusteringnmap.interactive_state._reduced_vectors = reduced_vectors
        clusteringnmap.interactive_state._normalized_vectors = normalized_vectors
        clusteringnmap.interactive_state._labels = [0] * vectors.shape[0]

        print "Go to http://localhost:5006/Interactive_Clustering_bokeh"
        print "Press CTRL-C to exit web interface"
        parser = argparse.ArgumentParser(Serve.args)
        s = Serve(parser)
        s.invoke(
            parser.parse_args(
                ["clusteringnmap/Interactive_Clustering_bokeh.py"]))
        return clusteringnmap.interactive_state._labels
    elif strategy == "automatic":
        results = []
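        # Upper bound on the number of clusters worth trying; it shrinks whenever a
        # method finds an acceptable clustering with fewer clusters.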
        smallest_cluster_count = vectors.shape[0]
        for cluster_method in [
                "kmeans",
                "dbscan",
                "agglomerative",
        ]:
            if cluster_method == "kmeans":
                logging.debug("Starting prospective KMeans clusterings")
                move_to_next_method = False
                for n_clusters in range(2, smallest_cluster_count):
                    logging.debug("Trying {0}".format(
                        "kmeans(n_clusters={0})".format(n_clusters)))
                    labels = cluster_with_kmeans(reduced_vectors,
                                                 n_clusters=n_clusters)
                    overall_score, per_cluster_score = validate_clusters(
                        vectors, labels)
                    mean_distance = get_average_distance_per_cluster(
                        vectors, labels)[0]

                    tsp, msp, msn = get_common_feature_stats(
                        vectors, labels, vectorizer)
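                    # tsp/msp/msn are not defined in this listing; judging by their use
                    # below, they appear to be the total (universal) shared feature count
                    # and the per-cluster minimum shared positive/negative counts.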

                    # If some cluster shares no features beyond the universally shared
                    # ones, the clustering is uninformative, so skip this result
                    if msp <= tsp:
                        logging.debug("Not all clusters are informative")
                        continue
                    if len(set(labels)) > smallest_cluster_count:
                        move_to_next_method = True
                        break
                    if len(set(labels)) < smallest_cluster_count:
                        smallest_cluster_count = len(set(labels))

                    logging.debug(
                        repr((overall_score, min(per_cluster_score.values()),
                              mean_distance, labels, len(set(labels)), tsp,
                              msp, msn,
                              "kmeans(n_clusters={0})".format(n_clusters))))
                    results.append(
                        (overall_score, min(per_cluster_score.values()),
                         mean_distance, labels, len(set(labels)), tsp, msp,
                         msn, "kmeans(n_clusters={0})".format(n_clusters)))
                if move_to_next_method:
                    continue

            if cluster_method == "agglomerative":
                logging.debug("Starting prospective Agglomerative clusterings")
                move_to_next_method = False
                for n_clusters in range(2, smallest_cluster_count):
                    logging.debug("Trying {0}".format(
                        "agglomerative(n_clusters={0})".format(n_clusters)))
                    labels = cluster_with_agglomerative(reduced_vectors,
                                                        n_clusters=n_clusters,
                                                        metric=metric)
                    overall_score, per_cluster_score = validate_clusters(
                        vectors, labels)
                    mean_distance = get_average_distance_per_cluster(
                        vectors, labels)[0]

                    tsp, msp, msn = get_common_feature_stats(
                        vectors, labels, vectorizer)

                    # If some cluster shares no features beyond the universally shared
                    # ones, the clustering is uninformative, so skip this result
                    if msp <= tsp:
                        logging.debug("Not all clusters are informative")
                        continue
                    if len(set(labels)) > smallest_cluster_count:
                        move_to_next_method = True
                        break
                    if len(set(labels)) < smallest_cluster_count:
                        smallest_cluster_count = len(set(labels))

                    logging.debug(
                        repr(
                            (overall_score, min(per_cluster_score.values()),
                             mean_distance, labels, len(set(labels)), tsp, msp,
                             msn,
                             "agglomerative(n_clusters={0})".format(n_clusters)
                             )))
                    results.append(
                        (overall_score, min(per_cluster_score.values()),
                         mean_distance, labels, len(set(labels)), tsp, msp,
                         msn,
                         "agglomerative(n_clusters={0})".format(n_clusters)))
                if move_to_next_method:
                    continue

            if cluster_method == "dbscan":
                logging.debug("Starting prospective DBSCAN clusterings")
                distance_matrix = precompute_distances(vectors, metric=metric)
                min_distance = sorted(set(list(distance_matrix.flatten())))[1]
                max_distance = sorted(set(list(distance_matrix.flatten())))[-1]
                num_steps = 25.0
                step_size = float(max_distance -
                                  min_distance) / float(num_steps)
                epsilon = min_distance
                while True:
                    logging.debug("Trying {0}".format(
                        "dbscan(epsilon={0})".format(epsilon)))
                    labels = cluster_with_dbscan(reduced_vectors,
                                                 epsilon=epsilon,
                                                 min_samples=1,
                                                 distances=distance_matrix)
                    # Stop once everything has collapsed into a single cluster
                    if set(labels) == {0}:
                        break
                    overall_score, per_cluster_score = validate_clusters(
                        vectors, labels)
                    mean_distance = get_average_distance_per_cluster(
                        vectors, labels)[0]

                    tsp, msp, msn = get_common_feature_stats(
                        vectors, labels, vectorizer)

                    # If some cluster shares no features beyond the universally shared
                    # ones, the clustering is uninformative, so skip this result
                    if msp <= tsp:
                        logging.debug("Not all clusters are informative")
                        epsilon += step_size
                        continue

                    logging.debug(
                        repr(
                            (overall_score, min(per_cluster_score.values()),
                             mean_distance, labels, len(set(labels)), tsp, msp,
                             msn, "dbscan(epsilon={0})".format(epsilon))))
                    results.append(
                        (overall_score, min(per_cluster_score.values()),
                         mean_distance, labels, len(set(labels)), tsp, msp,
                         msn, "dbscan(epsilon={0})".format(epsilon)))
                    epsilon += step_size

        # Pick the best result. Ideally we would maximize the silhouette score while
        # minimizing the number of clusters; the silhouette-based keys below are
        # currently disabled, so only the number of clusters is used.
        sorted_results = sort_items_by_multiple_keys(
            results,
            {
                #0: True,  # AVG Silhouette
                #1: True,  # Min Silhouette
                #2: False,  # Average distance
                4: False,  # Number of clusters
                #6: True,   # Min common features per cluster
            },
            {
                #0: 1,
                #1: 1,
                #2: 1,
                4: 1,
                #6: 1
            })
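        # sort_items_by_multiple_keys is not shown in this listing; judging by this
        # call, the first dict maps result-tuple indices to a sort direction and the
        # second assigns each index a weight.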
        logging.debug(sorted_results)
        best_result = results[sorted_results[0][0]]
        logging.debug(best_result)

        best_method = best_result[-1]
        best_silhouette = best_result[0]
        best_labels = best_result[3]

        logging.info(
            "Best clustering method: {0} (adjusted silhouette == {1})".format(
                best_method, best_silhouette))
        return best_labels

    else:
        raise NotImplementedError("Unknown strategy: {0}".format(strategy))
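
The automatic strategy's DBSCAN pass above sweeps epsilon across the observed range of pairwise distances. The project's own helpers (precompute_distances, cluster_with_dbscan) are not reproduced on this page, so the following is a minimal, self-contained sketch of the same sweep using scikit-learn; the function name and the dense input matrix are assumptions for illustration:

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances


def sweep_dbscan_epsilon(vectors, metric="euclidean", num_steps=25):
    """Run DBSCAN over a range of epsilon values and collect each labeling."""
    # Precompute pairwise distances once so every DBSCAN run can reuse them
    distance_matrix = pairwise_distances(vectors, metric=metric)
    unique_distances = np.unique(distance_matrix)  # sorted; index 0 is the zero self-distance
    min_distance = unique_distances[1]
    max_distance = unique_distances[-1]
    step_size = (max_distance - min_distance) / float(num_steps)

    labelings = []
    epsilon = min_distance
    while epsilon <= max_distance:
        labels = DBSCAN(eps=epsilon, min_samples=1,
                        metric="precomputed").fit_predict(distance_matrix)
        # Once everything has collapsed into one cluster, larger epsilons
        # cannot split it again, so stop the sweep
        if len(set(labels)) == 1:
            break
        labelings.append((epsilon, labels))
        epsilon += step_size
    return labelings

Each (epsilon, labels) pair can then be scored the same way the automatic strategy scores its candidates before picking the labeling with the fewest informative clusters.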
Example #2
    logging.info("Vectorizing")
    vector_names, vectors, vectorizer = vectorize(args.path)
    logging.debug("Loaded {0} vectors with {1} features".format(len(vector_names), vectors.shape[1]))
    logging.info("Vectorizing complete")

    logging.info("Reducing vector dimensions with PCA")
    normalized_vectors = normalize(vectors)
    reduced_vectors = pca(vectors)

    # Cluster the vectors
    logging.info("Clustering")
    labels = cluster(vector_names, vectors, reduced_vectors, normalized_vectors, vectorizer, args.strategy, args.method, args.n_clusters, args.epsilon, args.min_samples, args.metric)
    logging.info("Clustering Complete")

    # Test cluster validity
    overall_score, per_cluster_score = validate_clusters(vectors, labels)

    # Analysis relevant to the person reading results
    universal_positive_features, universal_negative_features, shared_features = get_common_features_from_cluster(vectors, labels, vectorizer)

    # Reduce results and relevant information to per cluster data
    cluster_details = {}
    for cluster_id in per_cluster_score.keys():
        cluster_details[cluster_id] = {
            "silhouette": per_cluster_score[cluster_id],
            "shared_positive_features": shared_features[cluster_id]['positive'],
            #"shared_negative_features": shared_features[cluster_id]['negative'],
            "ips": [vector_names[x] for x in xrange(len(vector_names)) if labels[x] == cluster_id]
        }

    print_cluster_details(cluster_details, shared_features)
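
validate_clusters is called throughout these examples but never shown. Based on how its return values are used (an overall score plus a per-cluster score keyed by cluster id, described in the logs as a silhouette), a plausible sketch using scikit-learn's silhouette metrics might look like the following; the name and exact behavior are assumptions, not the project's actual implementation:

import numpy as np
from sklearn.metrics import silhouette_samples, silhouette_score


def validate_clusters_sketch(vectors, labels):
    """Hypothetical stand-in for validate_clusters: returns an overall
    silhouette score and a {cluster_id: mean silhouette} dict."""
    labels = np.asarray(labels)
    # Silhouette metrics require at least 2 clusters and fewer clusters than samples
    overall_score = silhouette_score(vectors, labels)
    sample_scores = silhouette_samples(vectors, labels)
    per_cluster_score = {
        cluster_id: sample_scores[labels == cluster_id].mean()
        for cluster_id in np.unique(labels)
    }
    return overall_score, per_cluster_score

Callers would need to guard against single-cluster labelings, since the silhouette is undefined when every sample shares one label.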