Code example #1
import functools

import numpy as np
from sklearn.metrics import pairwise_distances_chunked
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_X_y
# check_number_of_labels and _silhouette_reduce are private helpers from
# scikit-learn's silhouette implementation; the import path varies across
# versions (sklearn.metrics.cluster._unsupervised in recent ones).
from sklearn.metrics.cluster._unsupervised import (
    _silhouette_reduce, check_number_of_labels)


def intra_cluster_distance(X, labels, *, metric='euclidean', **kwds):
    """Return per-sample mean intra-cluster and nearest-cluster distances."""
    X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])

    # Check for non-zero diagonal entries in precomputed distance matrix
    if metric == 'precomputed':
        atol = np.finfo(X.dtype).eps * 100
        if np.any(np.abs(np.diagonal(X)) > atol):
            raise ValueError(
                'The precomputed distance matrix contains non-zero '
                'elements on the diagonal. Use np.fill_diagonal(X, 0).')

    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples = len(labels)
    label_freqs = np.bincount(labels)
    check_number_of_labels(len(le.classes_), n_samples)

    kwds['metric'] = metric
    reduce_func = functools.partial(_silhouette_reduce,
                                    labels=labels,
                                    label_freqs=label_freqs)
    results = zip(
        *pairwise_distances_chunked(X, reduce_func=reduce_func, **kwds))
    intra_clust_dists, inter_clust_dists = results
    intra_clust_dists = np.concatenate(intra_clust_dists)
    inter_clust_dists = np.concatenate(inter_clust_dists)
    denom = (label_freqs - 1).take(labels, mode='clip')
    with np.errstate(divide="ignore", invalid="ignore"):
        intra_clust_dists /= denom
    return intra_clust_dists, inter_clust_dists
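
A minimal usage sketch: the make_blobs data and KMeans clustering below are
hypothetical, chosen only to produce labels to feed in. The function returns
the two ingredients of the per-sample silhouette coefficient.

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
intra, inter = intra_cluster_distance(X, labels)
# The per-sample silhouette coefficient is then
# (inter - intra) / np.maximum(intra, inter).
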
Code example #2
File: __init__.py  Project: quandb/atc
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_X_y
# check_number_of_labels is a private helper from scikit-learn's clustering
# metrics; the import path varies across versions.
from sklearn.metrics.cluster._unsupervised import check_number_of_labels


def davies_bouldin_index(X, labels, metric='euclidean'):
    """Compute the Davies Bouldin index.
  The index is defined as the ratio of within-cluster
  and between-cluster distances.
  Parameters
  ----------
  X : array-like, shape (``n_samples``, ``n_features``)
      List of ``n_features``-dimensional data points. Each row corresponds
      to a single data point.
  labels : array-like, shape (``n_samples``,)
      Predicted labels for each sample.
  Returns
  -------
  score : float
      The resulting Davies-Bouldin index.
  References
  ----------
  .. [1] `Davies, David L.; Bouldin, Donald W. (1979).
     "A Cluster Separation Measure". IEEE Transactions on
     Pattern Analysis and Machine Intelligence. PAMI-1 (2): 224-227`_
  """
    X, labels = check_X_y(X, labels)
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    n_samples, _ = X.shape
    n_labels = len(le.classes_)

    check_number_of_labels(n_labels, n_samples)
    intra_dists = np.zeros(n_labels)
    centroids = np.zeros((n_labels, X.shape[1]), dtype=np.float32)
    for k in range(n_labels):
        cluster_k = X[labels == k]
        mean_k = np.mean(cluster_k, axis=0)
        centroids[k] = mean_k
        # Within-cluster dispersion: mean distance of the cluster's
        # points to its centroid.
        intra_dists[k] = np.average(
            pairwise_distances(cluster_k, [mean_k], metric=metric))
    centroid_distances = pairwise_distances(centroids, metric=metric)
    with np.errstate(divide='ignore', invalid='ignore'):
        if np.all((intra_dists[:, None] + intra_dists) == 0.0) or \
                np.all(centroid_distances == 0.0):
            return 0.0
        scores = (intra_dists[:, None] + intra_dists) / centroid_distances
        # Division by the zero self-distance on the diagonal yields inf;
        # mask those entries out before taking the per-cluster maximum.
        scores[scores == np.inf] = np.nan
        # The docstring promises a float: average the per-cluster maxima.
        return np.mean(np.nanmax(scores, axis=1))
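
A minimal usage sketch on hypothetical make_blobs data; with the return
value fixed to a float as the docstring promises, lower values indicate
better-separated clusters.

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=150, centers=4, random_state=0)
labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(X)
print(davies_bouldin_index(X, labels))
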
Code example #3
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_X_y
# The `unsupervised` module name matches older scikit-learn releases; recent
# versions expose the same helper from sklearn.metrics.cluster._unsupervised.
from sklearn.metrics.cluster import unsupervised


def new_silhouette_samples(X, labels, metric='precomputed', **kwds):
    """Silhouette samples for a precomputed distance matrix.

    Unlike the stock silhouette_samples, no distance matrix is built here,
    since X is already one, and samples with the noise label (-1) are
    excluded from the computation.
    """
    X = X.astype(np.float32)
    X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    unsupervised.check_number_of_labels(len(le.classes_), X.shape[0])

    unique_labels = le.classes_
    n_samples_per_label = np.bincount(labels, minlength=len(unique_labels))

    # For sample i, store the mean distance of the cluster to which
    # it belongs in intra_clust_dists[i]
    intra_clust_dists = np.zeros(X.shape[0], dtype=X.dtype)

    # For sample i, store the mean distance of the second closest
    # cluster in inter_clust_dists[i]
    inter_clust_dists = np.inf + intra_clust_dists

    for curr_label in range(len(unique_labels)):
        # Skip the noise cluster: with -1 present in `labels`,
        # LabelEncoder maps it to encoded class 0.
        if curr_label != 0:
            # Mean intra-cluster distance for all samples sharing this
            # label.
            mask = labels == curr_label
            current_distances = X[mask]

            # Leave out current sample.
            n_samples_curr_lab = n_samples_per_label[curr_label] - 1
            if n_samples_curr_lab != 0:
                intra_clust_dists[mask] = np.sum(current_distances[:, mask],
                                                 axis=1) / n_samples_curr_lab

            # Now iterate over all other labels, finding the mean
            # cluster distance that is closest to every sample.
            for other_label in range(len(unique_labels)):
                if other_label != curr_label and other_label != 0:
                    other_mask = labels == other_label
                    other_distances = np.mean(current_distances[:, other_mask],
                                              axis=1)
                    inter_clust_dists[mask] = np.minimum(
                        inter_clust_dists[mask], other_distances)

    sil_samples = inter_clust_dists - intra_clust_dists
    sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
    # score 0 for clusters of size 1, according to the paper
    sil_samples[n_samples_per_label.take(labels) == 1] = 0
    return sil_samples
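
A minimal usage sketch on hypothetical data. DBSCAN is used because it
emits the -1 noise label this variant expects, and the distance matrix is
precomputed up front.

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances

X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
labels = DBSCAN(eps=1.0, min_samples=5).fit_predict(X)
D = pairwise_distances(X)
scores = new_silhouette_samples(D, labels)
# Noise samples keep inter_clust_dists at infinity and come back as
# non-finite scores; filter them out before averaging.
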
Code example #4
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import check_X_y
# check_number_of_labels is a private helper from scikit-learn's clustering
# metrics (sklearn.metrics.cluster._unsupervised in recent versions);
# euclidean_distances_sum and euclidean_distances_mean are project-specific
# helpers, sketched after the function below.
from sklearn.metrics.cluster._unsupervised import check_number_of_labels


def silhouette_samples_memory_saving(X, labels, metric='euclidean', **kwds):
    X, labels = check_X_y(X, labels, accept_sparse=['csc', 'csr'])
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    check_number_of_labels(len(le.classes_), X.shape[0])

    unique_labels = le.classes_
    n_samples_per_label = np.bincount(labels, minlength=len(unique_labels))

    # For sample i, store the mean distance of the cluster to which
    # it belongs in intra_clust_dists[i]
    intra_clust_dists = np.zeros(X.shape[0], dtype=X.dtype)

    # For sample i, store the mean distance of the second closest
    # cluster in inter_clust_dists[i]
    inter_clust_dists = np.inf + intra_clust_dists

    for curr_label in range(len(unique_labels)):

        # Mean intra-cluster distance for all samples sharing this label.
        mask = labels == curr_label

        # Leave out current sample.
        n_samples_curr_lab = n_samples_per_label[curr_label] - 1
        if n_samples_curr_lab != 0:
            intra_clust_dists[mask] = euclidean_distances_sum(
                X[mask, :]) / n_samples_curr_lab

        # Now iterate over all other labels, finding the mean
        # cluster distance that is closest to every sample.
        for other_label in range(len(unique_labels)):
            if other_label != curr_label:
                other_mask = labels == other_label
                other_distances = euclidean_distances_mean(
                    X[mask, :], X[other_mask, :])
                inter_clust_dists[mask] = np.minimum(inter_clust_dists[mask],
                                                     other_distances)

    sil_samples = inter_clust_dists - intra_clust_dists
    sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)
    # score 0 for clusters of size 1, according to the paper
    sil_samples[n_samples_per_label.take(labels) == 1] = 0
    return sil_samples
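
The helpers euclidean_distances_sum and euclidean_distances_mean are
project-specific and not shown in this snippet. Below is a naive sketch of
what they appear to compute, inferred from the call sites above; the
originals presumably chunk the distance computation to earn the function's
memory-saving name, which this sketch does not attempt.

from sklearn.metrics import pairwise_distances


def euclidean_distances_sum(X):
    # Per-row sum of euclidean distances to every row of X. The zero
    # self-distance on the diagonal contributes nothing, so dividing by
    # (cluster size - 1) in the caller yields the mean intra-cluster
    # distance per sample.
    return pairwise_distances(X, metric='euclidean').sum(axis=1)


def euclidean_distances_mean(X, Y):
    # For each row of X, the mean euclidean distance to all rows of Y.
    return pairwise_distances(X, Y, metric='euclidean').mean(axis=1)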