Example #1
0
def fuzzy_c_medoids(data,
                    distance_matrix,
                    components=10,
                    eps=1e-4,
                    max_iter=1000,
                    fuzzifier=2,
                    initialization_method="random_choice",
                    empty_clusters_method="nothing",
                    medoids_idx=None,
                    progress_bar=True):
    """ Performs the fuzzy c-medoids clustering algorithm on a dataset.

    :param data: The dataset into which the clustering will be performed. The dataset must be 2D np.array with rows as
    examples and columns as features.
    :param distance_matrix: The pairwise distance matrix applied across all examples from the data matrix. It can be
    either a square `(n, n)` matrix or a condensed distance vector of length `n * (n - 1) / 2`, where `n` is the
    number of examples (see:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.squareform.html)
    :param components: The number of components (clusters) wanted.
    :param eps: Criterion used to define convergence. If the absolute differences between two consecutive losses is
    lower than `eps`, the clustering stop.
    :param max_iter: Criterion used to stop the clustering if the number of iterations exceeds `max_iter`. No result
    can be built if no iteration is run, so `max_iter=0` fails when the results are assembled.
    :param fuzzifier: Membership fuzzification coefficient. Must be strictly greater than 1.
    :param initialization_method: Method used to initialise the centroids. Can take one of the following values :
    * "random_uniform" or "uniform", samples values between the min and max across each dimension.
    * "random_gaussian" or "gaussian", samples values from a gaussian with the same mean and std as each data's
    dimension.
    * "random_choice" or "choice", samples random examples from the data without replacement.
    * "central_dissimilar_medoids", sample the first medoid as the most central point of the dataset, then sample all
    successive medoids as the most dissimilar to all medoids that have already been picked.
    * "central_dissimilar_random_medoids", same as "central_dissimilar_medoids", but the first medoid is sampled
    randomly.
    :param empty_clusters_method: Method used at each iteration to handle empty clusters. Can take one of the following
    values :
    * "nothing", do absolutely nothing and ignore empty clusters.
    * "random_example", assign a random example to all empty clusters.
    * "furthest_example_from_its_centroid", assign the furthest example from its centroid to each empty cluster.
    :param medoids_idx: Initial medoids indexes to use instead of initializing them with `initialization_method`.
    Must be an array of shape `(components,)` whose values are lower than the number of examples.
    :param progress_bar: If `False`, disable the progress bar.
    :return: A dict describing the iteration with the lowest loss :
    * "memberships", "affectations" (argmax of the memberships along axis 1), "medoids_indexes", "clusters_center"
    (rows of `data` at the medoids indexes), "clusters_id", "clusters_cardinal", "losses" (one loss per iteration)
    and "extended_time" (time elapsed according to the progress bar).
    * Memberships evaluation : "ambiguity", "partition_coefficient", "partition_entropy".
    * Clusters evaluation : "clusters_diameter".
    * Affectations evaluation : "silhouette_samples", "silhouette", "variance_ratio", "davies_bouldin".
    """
    assert len(data.shape) == 2, "The data must be a 2D array"
    assert data.shape[0] > 0, "The data must have at least one example"
    assert data.shape[1] > 0, "The data must have at least one feature"

    n_examples = data.shape[0]

    # Decide the encoding from the number of dimensions: reading `shape[1]` on a condensed (1D) vector would raise
    # an IndexError before the condensed check could even run.
    is_square = len(distance_matrix.shape) == 2
    if is_square:
        valid_distance_matrix = distance_matrix.shape[0] == distance_matrix.shape[1] == n_examples
    else:
        valid_distance_matrix = (
            scipy.spatial.distance.is_valid_y(distance_matrix)
            and distance_matrix.shape[0] == n_examples * (n_examples - 1) // 2
        )
    assert valid_distance_matrix, \
        "The distance matrix is not encoded into a condensed distance vector, nor is a square distance matrix"

    assert 1 <= components <= n_examples, \
        "The number of components wanted must be between 1 and %s" % n_examples
    assert 0 <= max_iter, "The number of max iterations must be positive"
    assert fuzzifier > 1, "The fuzzifier must be greater than 1"
    # `shape` is a tuple: it has to be compared with `(components,)`, not with the bare integer.
    assert (medoids_idx is None) or (medoids_idx.shape == (components,)), \
        "The given medoids indexes do not have a correct shape. Expected shape : {}, given shape : {}".format(
            (components,), medoids_idx.shape
    )
    assert (medoids_idx is None) or np.all(medoids_idx < n_examples), \
        "The provided medoid indexes array contains unreachable indexes"

    # Initialisation
    if medoids_idx is None:
        medoids_idx = cluster_initialization(data,
                                             components,
                                             initialization_method,
                                             need_idx=True)

    if is_square:
        # The distance matrix is a squared distance matrix, apply usual methods
        _compute_memberships = _compute_memberships_square
        _compute_medoids = _compute_medoids_square
        _compute_loss = _compute_loss_square
    else:
        # The distance matrix is a condensed distance matrix.
        # Indexing is different, thus use other methods
        _compute_memberships = _compute_memberships_condensed
        _compute_medoids = _compute_medoids_condensed
        _compute_loss = _compute_loss_condensed

    # The bar gets its own name so that it does not shadow the `progress_bar` flag.
    with tqdm(total=max_iter,
              bar_format=_FORMAT_PROGRESS_BAR,
              disable=not progress_bar) as bar:
        best_memberships = None
        best_medoids_idx = None
        best_loss = np.inf

        losses = []
        current_iter = 0
        # Always run at least two iterations (when allowed by `max_iter`) so that two losses can be compared.
        while (current_iter < max_iter) and \
              ((current_iter < 2) or (abs(losses[-2] - losses[-1]) > eps)):
            # Compute memberships
            memberships = _compute_memberships(distance_matrix=distance_matrix,
                                               medoids_idx=medoids_idx,
                                               fuzzifier=fuzzifier,
                                               n=n_examples)
            handle_empty_clusters(distance_matrix,
                                  medoids_idx,
                                  memberships,
                                  strategy=empty_clusters_method)

            # Compute medoids
            medoids_idx = _compute_medoids(distance_matrix=distance_matrix,
                                           memberships=memberships,
                                           fuzzifier=fuzzifier,
                                           n=n_examples)

            # Compute loss
            loss = _compute_loss(distance_matrix=distance_matrix,
                                 medoids_idx=medoids_idx,
                                 memberships=memberships,
                                 fuzzifier=fuzzifier,
                                 n=n_examples)
            losses.append(loss)
            # Keep the best iteration seen so far, not necessarily the last one.
            if loss < best_loss:
                best_loss = loss
                best_memberships = memberships
                best_medoids_idx = medoids_idx

            # Update the progress bar
            current_iter += 1
            bar.update()
            bar.set_postfix({
                "Loss": "{0:.6f}".format(loss),
                "best_loss": "{0:.6f}".format(best_loss)
            })

    affectations = best_memberships.argmax(axis=1)
    clusters_id, clusters_cardinal = np.unique(affectations,
                                               return_counts=True)
    return {
        # Clustering results
        "memberships": best_memberships,
        "affectations": affectations,
        "medoids_indexes": best_medoids_idx,
        "clusters_center": data[best_medoids_idx, :],
        "clusters_id": clusters_id,
        "losses": np.array(losses),
        "extended_time": bar.last_print_t - bar.start_t,

        # Evaluation : Memberships matrix
        "ambiguity": ambiguity(best_memberships),
        "partition_coefficient": partition_coefficient(best_memberships),
        "partition_entropy": partition_entropy(best_memberships),

        # Evaluation : Clusters center
        "clusters_diameter": clusters_diameter(data, affectations,
                                               clusters_id),
        "clusters_cardinal": clusters_cardinal,

        # Evaluation : Affectations
        "silhouette_samples": silhouette_samples(data, affectations),
        "silhouette": silhouette_score(data, affectations),
        "variance_ratio": calinski_harabasz_score(data, affectations),
        "davies_bouldin": davies_bouldin_score(data, affectations)
    }
Example #2
0
def kmeans(data, components=10, eps=1e-4, max_iter=1000, weights=None,
           initialization_method="random_choice", empty_clusters_method="nothing",
           centroids=None, progress_bar=True):
    """ Runs the k-means clustering algorithm on a dataset and evaluates the best partition found.

    :param data: 2D np.array to cluster, with one example per row and one feature per column.
    :param components: Number of clusters wanted, between 1 and the number of examples.
    :param eps: Convergence threshold. Once at least two iterations have been run, the clustering stops as soon as
    the absolute difference between the last two losses is no longer greater than `eps`.
    :param max_iter: Upper bound on the number of iterations.
    :param weights: Optional weighting of the features. Must have as many items as `data` has columns. When given,
    the data is multiplied by the square root of the weights before the clustering.
    :param initialization_method: Strategy forwarded to the centroids initialisation when `centroids` is not given.
    Can take one of the following values :
    * "random_uniform" or "uniform", samples values between the min and max across each dimension.
    * "random_gaussian" or "gaussian", samples values from a gaussian with the same mean and std as each data's
    dimension.
    * "random_choice" or "choice", samples random examples from the data without replacement.
    * "central_dissimilar_medoids", sample the first medoid as the most central point of the dataset, then sample all
    successive medoids as the most dissimilar to all medoids that have already been picked.
    * "central_dissimilar_random_medoids", same as "central_dissimilar_medoids", but the first medoid is sampled
    randomly.
    :param empty_clusters_method: Strategy applied at each iteration to handle empty clusters. Can take one of the
    following values :
    * "nothing", do absolutely nothing and ignore empty clusters.
    * "random_example", assign a random example to all empty clusters.
    * "furthest_example_from_its_centroid", assign the furthest example from its centroid to each empty cluster.
    :param centroids: Optional initial centroids of shape `(components, number of features)`, used instead of
    initialising them.
    :param progress_bar: If `False`, disable the progress bar.
    :return: A dict describing the iteration with the lowest loss :
    * "memberships", "affectations" (argmax of the memberships along axis 1), "clusters_center", "clusters_id",
    "clusters_cardinal", "losses" (one loss per iteration) and "extended_time" (time elapsed according to the
    progress bar).
    * Memberships evaluation : "ambiguity", "partition_coefficient", "partition_entropy".
    * Clusters evaluation : "clusters_diameter".
    * Affectations evaluation : "silhouette_samples", "silhouette", "variance_ratio", "davies_bouldin".
    """
    # --- Arguments validation ---
    assert len(data.shape) == 2, "The data must be a 2D array"
    assert data.shape[0] > 0, "The data must have at least one example"
    assert data.shape[1] > 0, "The data must have at least one feature"
    assert 1 <= components <= data.shape[0], \
        "The number of components wanted must be between 1 and %s" % data.shape[0]
    assert 0 <= max_iter, "The number of max iterations must be positive"
    assert (weights is None) or (len(weights) == data.shape[1]), \
        "The number of weights given must be the same as the number of features. Expected size : %s, given size : %s" \
        % (data.shape[1], len(weights))
    assert (centroids is None) or (centroids.shape == (components, data.shape[1])), \
        "The given centroids do not have a correct shape. Expected shape : {}, given shape : {}".format(
            (components, data.shape[1]), centroids.shape
        )

    # A weighted euclidean distance is the plain euclidean distance computed on data scaled by the square root of
    # the weights, see [5]
    if weights is not None:
        data = data * np.sqrt(weights)

    # --- Initialisation ---
    if centroids is None:
        centroids = cluster_initialization(data, components, strategy=initialization_method, need_idx=False)

    # --- Optimisation ---
    with tqdm(total=max_iter, bar_format=_FORMAT_PROGRESS_BAR, disable=not progress_bar) as bar:
        losses = []
        best_loss = np.inf
        best_memberships = None
        best_centroids = None

        iteration = 0
        while iteration < max_iter:
            # The convergence test needs two losses, hence the first two iterations always run
            if iteration >= 2 and not (abs(losses[-2] - losses[-1]) > eps):
                break

            # Alternate between the memberships and the centroids optimisation
            memberships = _optim_memberships(data, centroids)
            handle_empty_clusters(data, centroids, memberships, strategy=empty_clusters_method)
            centroids = _optim_centroids(data, memberships)

            loss = _compute_loss(data, memberships, centroids)
            losses.append(loss)

            # Remember the best iteration, which is not necessarily the last one
            if loss < best_loss:
                best_loss, best_memberships, best_centroids = loss, memberships, centroids

            iteration += 1
            bar.update()
            bar.set_postfix({
                "Loss": format(loss, ".6f"),
                "best_loss": format(best_loss, ".6f")
            })

    # --- Results and evaluation ---
    affectations = best_memberships.argmax(axis=1)
    clusters_id, clusters_cardinal = np.unique(affectations, return_counts=True)

    results = {}

    # Clustering results
    results["memberships"] = best_memberships
    results["affectations"] = affectations
    results["clusters_center"] = best_centroids
    results["clusters_id"] = clusters_id
    results["losses"] = np.array(losses)
    results["extended_time"] = bar.last_print_t - bar.start_t

    # Evaluation : Memberships matrix
    results["ambiguity"] = ambiguity(best_memberships)
    results["partition_coefficient"] = partition_coefficient(best_memberships)
    results["partition_entropy"] = partition_entropy(best_memberships)

    # Evaluation : Clusters center
    results["clusters_diameter"] = clusters_diameter(data, affectations, clusters_id)
    results["clusters_cardinal"] = clusters_cardinal

    # Evaluation : Affectations
    results["silhouette_samples"] = silhouette_samples(data, affectations)
    results["silhouette"] = silhouette_score(data, affectations)
    results["variance_ratio"] = calinski_harabasz_score(data, affectations)
    results["davies_bouldin"] = davies_bouldin_score(data, affectations)

    return results
Example #3
0
def linearized_fuzzy_c_medoids(data,
                               distance_matrix,
                               components=10,
                               eps=1e-4,
                               max_iter=1000,
                               fuzzifier=2,
                               membership_subset_size=None,
                               initialization_method="random_choice",
                               empty_clusters_method="nothing",
                               medoids_idx=None):
    """ Performs the linearized fuzzy c-medoids clustering algorithm on a dataset.

    NOTE: the algorithm is not available yet. Once the arguments are validated, the function always raises a
    `NotImplementedError`; the code following the `raise` is an unreachable draft of the implementation.

    :param data: The dataset into which the clustering will be performed. The dataset must be 2D np.array with rows as
    examples and columns as features.
    :param distance_matrix: The pairwise distance matrix applied across all examples from the data matrix. The distance
    matrix must be encoded into a condensed distance vector (see:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.squareform.html)
    :param components: The number of components (clusters) wanted.
    :param eps: Criterion used to define convergence. If the absolute differences between two consecutive losses is
    lower than `eps`, the clustering stop.
    :param max_iter: Criterion used to stop the clustering if the number of iterations exceeds `max_iter`.
    :param fuzzifier: Membership fuzzification coefficient. Must be strictly greater than 1.
    :param membership_subset_size: Size of subset to inspect during the memberships matrix computation. Reduce
    computations length. Must be between 1 and the number of examples.
    :param initialization_method: Method used to initialise the centroids. Can take one of the following values :
    * "random_uniform" or "uniform", samples values between the min and max across each dimension.
    * "random_gaussian" or "gaussian", samples values from a gaussian with the same mean and std as each data's
    dimension.
    * "random_choice" or "choice", samples random examples from the data without replacement.
    * "central_dissimilar_medoids", sample the first medoid as the most central point of the dataset, then sample all
    successive medoids as the most dissimilar to all medoids that have already been picked.
    * "central_dissimilar_random_medoids", same as "central_dissimilar_medoids", but the first medoid is sampled
    randomly.
    :param empty_clusters_method: Method used at each iteration to handle empty clusters. Can take one of the following
    values :
    * "nothing", do absolutely nothing and ignore empty clusters.
    * "random_example", assign a random example to all empty clusters.
    * "furthest_example_from_its_centroid", assign the furthest example from its centroid to each empty cluster.
    :param medoids_idx: Initial medoids indexes to use instead of initializing them with `initialization_method`.
    Must be an array of shape `(components,)` whose values are lower than the number of examples.
    :return: Nothing for now, see the note above. The draft is meant to return a dict with the keys "memberships",
    "medoids_indexes", "clusters_center", "losses", "affectations", "ambiguity", "partition_coefficient",
    "partition_entropy" and "extended_time".
    :raises NotImplementedError: Always, once the arguments have been validated.
    """
    assert len(data.shape) == 2, "The data must be a 2D array"
    assert data.shape[0] > 0, "The data must have at least one example"
    assert data.shape[1] > 0, "The data must have at least one feature"
    assert is_valid_y(
        distance_matrix
    ), "The distance matrix is not encoded into a condensed distance vector"
    assert 1 <= components <= data.shape[0], \
        "The number of components wanted must be between 1 and %s" % data.shape[0]
    assert 0 <= max_iter, "The number of max iterations must be positive"
    assert fuzzifier > 1, "The fuzzifier must be greater than 1"
    assert (membership_subset_size is None) or (1 <= membership_subset_size <= data.shape[0]), \
        "The membership subset size wanted must be between 1 and %s" % data.shape[0]
    # `shape` is a tuple: it has to be compared with `(components,)`, not with the bare integer.
    assert (medoids_idx is None) or (medoids_idx.shape == (components,)), \
        "The given medoids indexes do not have a correct shape. Expected shape : {}, given shape : {}".format(
            (components,), medoids_idx.shape
        )
    assert (medoids_idx is None) or np.all(medoids_idx < data.shape[0]), \
        "The provided medoid indexes array contains unreachable indexes"

    raise NotImplementedError("TODO")

    # ------------------------------------------------------------------------------------------------------------
    # Everything below is unreachable: it is the draft of the implementation, kept until the algorithm is finished.
    # ------------------------------------------------------------------------------------------------------------

    # If no `membership_subset_size` is specified, [1] suggest to use a value much smaller than the average of points
    # in a cluster. The average is computed from the number of examples: the length of the condensed distance vector
    # is the number of pairs of examples, not the number of examples.
    if membership_subset_size is None:
        membership_subset_size = data.shape[0] // components

    # Initialisation, performed on the examples so that the medoids indexes refer to rows of `data`
    if medoids_idx is None:
        medoids_idx = cluster_initialization(data,
                                             components,
                                             initialization_method,
                                             need_idx=True)

    with tqdm(total=max_iter, bar_format=_FORMAT_PROGRESS_BAR) as progress_bar:
        best_memberships = None
        best_medoids_idx = None
        best_loss = np.inf

        memberships = None
        medoids_idx_old = None
        losses = []
        current_iter = 0
        # Stop when the iteration budget is spent, when the medoids no longer change, or when the loss converged
        while (current_iter < max_iter) and \
                ((current_iter < 1) or (not all(medoids_idx == medoids_idx_old))) and \
                ((current_iter < 2) or not (abs(losses[-1] - losses[-2]) <= eps)):

            medoids_idx_old = medoids_idx
            memberships = _compute_memberships(distance_matrix, medoids_idx,
                                               fuzzifier)
            handle_empty_clusters(distance_matrix,
                                  medoids_idx,
                                  memberships,
                                  strategy=empty_clusters_method)

            # Only the examples with the highest memberships are inspected when the medoids are updated
            top_memberships_mask = _compute_top_membership_subset(
                memberships, membership_subset_size)
            medoids_idx = _compute_medoids(distance_matrix, memberships,
                                           fuzzifier, top_memberships_mask)

            loss = _compute_loss(distance_matrix, medoids_idx, memberships,
                                 fuzzifier)
            losses.append(loss)
            # Keep the best iteration seen so far, not necessarily the last one
            if loss < best_loss:
                best_loss = loss
                best_memberships = memberships
                best_medoids_idx = medoids_idx

            # Update the progress bar
            current_iter += 1
            progress_bar.update()
            progress_bar.set_postfix({
                "Loss": "{0:.6f}".format(loss),
                "best_loss": "{0:.6f}".format(best_loss)
            })

    return {
        "memberships": best_memberships,
        "medoids_indexes": best_medoids_idx,
        "clusters_center": data[best_medoids_idx, :],
        "losses": np.array(losses),
        "affectations": best_memberships.argmax(axis=1),
        "ambiguity": ambiguity(best_memberships),
        "partition_coefficient": partition_coefficient(best_memberships),
        "partition_entropy": partition_entropy(best_memberships),
        "extended_time": progress_bar.last_print_t - progress_bar.start_t,
    }