Code Example #1
def cluster(m, n_colors=32):
    import numpy as np  # needed for np.zeros and np.reshape below
    from sklearn.utils import shuffle
    from sklearn.cluster import KMeans
    from sklearn.metrics import pairwise_distances_argmin

    def recreate_image(codebook, labels, w, h):
        """Recreate the (compressed) image from the code book & labels"""
        d = codebook.shape[1]
        image = np.zeros((w, h, d))
        label_idx = 0
        for i in range(w):
            for j in range(h):
                image[i][j] = codebook[labels[label_idx]]
                label_idx += 1
        return image

    # Load Image and transform to a 2D numpy array.
    w, h, d = original_shape = tuple(m.shape)
    image_array = np.reshape(m, (w * h, d))
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    # Fitted for reference only; the random codebook below drives the output
    kmeans = KMeans(n_clusters=n_colors).fit(image_array_sample)

    # Use n_colors + 1 random pixels as a codebook and assign each pixel
    # of the full image to its nearest codebook entry
    codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
    labels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)

    return recreate_image(codebook_random, labels_random, w, h)
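A minimal usage sketch for the function above; the sample image, numpy, and matplotlib are assumptions not present in the original snippet:

# Hypothetical driver for cluster(); not part of the original project.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_sample_image

china = load_sample_image("china.jpg")       # (427, 640, 3) uint8 array
m = np.array(china, dtype=np.float64) / 255  # scale pixels to [0, 1]
plt.imshow(cluster(m, n_colors=32))          # random-codebook reconstruction
plt.show()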
Code Example #2
File: sbcv.py  Project: ericflores/stab-cluster-val
 def measure_label_concurrency(self, assignations):
     """Take a dictionary with 'variants', 'labels' and 'centers' and
     return the same dictionary after adding a metric that measures how
     similar the clusters found are across two or more clustering runs.
     This implements the pairwise co-membership metric described in
     Tibshirani's Prediction Strength."""
     test_points = assignations['variants'][1]
     test_labels = assignations['labels'][1]
     train_centers = assignations['centers'][0]
     clustermetric = []
     # This loop goes over each cluster in the test set
     for clusternum in set(test_labels):
         # Prepare a subset holding only the data points of the current cluster
         clusterset = test_points[test_labels == clusternum]
         clustersize = len(clusterset)
         if clustersize > 1:
             # Find the nearest train-set cluster center for each
             # data point of this test cluster
             membership = pairwise_distances_argmin(clusterset, train_centers)
             # Accumulator for the co-membership value
             comembership = 0
             # Compare the nearest center of each data point in the cluster
             # to the nearest center of every other point in the same cluster
             for i in range(len(membership)):
                 for j in range(len(membership)):
                     # If two distinct points share a nearest train center,
                     # increment the metric
                     if i != j and membership[i] == membership[j]:
                         comembership = comembership + 1
             clustermetric.append(comembership / (clustersize * (clustersize - 1)))
     return dict(list(assignations.items()) + [('metric', min(clustermetric))])
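To make the metric concrete, here is a hedged standalone sketch with invented toy data: two test clusters whose points each map to a single train center both score a perfect 1.0.

# Toy illustration of the pairwise co-membership metric above;
# the data and variable names are invented for this example.
import numpy as np
from sklearn.metrics import pairwise_distances_argmin

test_points = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
test_labels = np.array([0, 0, 1, 1])
train_centers = np.array([[0.0, 0.0], [5.0, 5.0]])

for k in set(test_labels):
    members = test_points[test_labels == k]
    n = len(members)
    nearest = pairwise_distances_argmin(members, train_centers)
    shared = sum(1 for i in range(n) for j in range(n)
                 if i != j and nearest[i] == nearest[j])
    print(k, shared / (n * (n - 1)))  # 1.0 for both stable clusters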
Code Example #3
def test_birch_predict():
    """Test the predict method predicts the nearest centroid."""
    rng = np.random.RandomState(0)
    X = generate_clustered_data(n_clusters=3, n_features=3, n_samples_per_cluster=10)

    # n_clusters * n_samples_per_cluster = 30 samples in total
    shuffle_indices = np.arange(30)
    rng.shuffle(shuffle_indices)
    X_shuffle = X[shuffle_indices, :]
    brc = Birch(n_clusters=4, threshold=1.0)
    brc.fit(X_shuffle)
    centroids = brc.subcluster_centers_
    assert_array_equal(brc.labels_, brc.predict(X_shuffle))
    nearest_centroid = pairwise_distances_argmin(X_shuffle, centroids)
    assert_almost_equal(v_measure_score(nearest_centroid, brc.labels_), 1.0)
Code Example #4
    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            New data to predict.
        Returns
        -------
        labels : array, shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self, "cluster_centers_indices_")
        if not hasattr(self, "cluster_centers_"):
            raise ValueError("Predict method is not supported when " "affinity='precomputed'.")

        return pairwise_distances_argmin(X, self.cluster_centers_)
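The method above appears to be scikit-learn's AffinityPropagation.predict; a short usage sketch with invented toy data:

# Hedged usage sketch; the data here is invented for illustration.
import numpy as np
from sklearn.cluster import AffinityPropagation

X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]], dtype=float)
ap = AffinityPropagation(random_state=0).fit(X)
print(ap.predict([[0.0, 0.0], [5.0, 5.0]]))  # cluster index of each new point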
Code Example #5
def quantization(picture):
    picture = np.array(picture, dtype=np.float64) / 255
    # plt.imshow(picture)
    # plt.show()

    w, h, d = tuple(picture.shape)
    assert d == 3
    image_array = np.reshape(picture, (w * h, d))

    # n_colors and recreate_image are assumed to be defined at module level
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)
    labels = kmeans.predict(image_array)

    # Computed for comparison only; the k-means reconstruction is returned
    codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
    labels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)

    picture = recreate_image(kmeans.cluster_centers_, labels, w, h)
    return picture
Code Example #6
def find_clusters(X, n_clusters, rseed=2):
    # 1. Randomly choose center points
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    centers = X[i]

    while True:
        # 2a. Assign labels based on the closest center
        labels = pairwise_distances_argmin(X, centers)

        # 2b. Find new centers from the mean of the points
        new_centers = np.array(
            [X[labels == i].mean(0) for i in range(n_clusters)])

        # 2c. Check for convergence
        if np.all(centers == new_centers):
            break
        centers = new_centers

    return centers, labels
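A usage sketch for find_clusters; make_blobs and matplotlib are assumptions, not part of the original snippet:

# Hypothetical driver for find_clusters().
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=0)
centers, labels = find_clusters(X, n_clusters=4)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=20, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='x', s=100)
plt.show()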
Code Example #7
def find_clusters(X, n_clusters, r_state=2):
    # Randomly choose what points to use as centers
    rnd = np.random.RandomState(r_state)
    i = rnd.permutation(X.shape[0])[:n_clusters]
    center_points = X.iloc[i]

    while True:
        # Assign each point the label of its closest center
        labels = pairwise_distances_argmin(X, center_points)

        # Find new centers from mean of points
        new_centers = np.array(
            [X[labels == a].mean(0) for a in range(n_clusters)])

        # Check for convergence
        if np.all(center_points == new_centers):
            break
        center_points = new_centers

    return center_points, labels
Code Example #8
File: Kmeans_shortcut.py  Project: ANUNAYBAGGA/Kmeans
def find_clusters(X, n_clusters, rseed=2):
    #1 Choose Clusters Randomly
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    centers = X[i]
    while True:
        #2a Assign labels based on closest center
        labels = pairwise_distances_argmin(
            X, centers
        )  # selects the nearest center and assigns its index to each point

        #2b Find new centers from mean
        new_centers = np.array(
            [X[labels == i].mean(0) for i in range(n_clusters)])

        #2c Check whether previous center = new center
        if np.all(centers == new_centers):
            break
        centers = new_centers
    return centers, labels
Code Example #9
        def find_clusters(X, n_clusters, rseed=2):
            # 1. Randomly choose clusters
            rng = np.random.RandomState(rseed)
            i = rng.permutation(X.shape[0])[:n_clusters]
            centers = X[i]

            while True:
                # 2a. Assign labels based on the closest center
                labels = pairwise_distances_argmin(X, centers)

                # 2b. Find new centers from the mean of the points
                new_centers = np.array(
                    [X[labels == i].mean(0) for i in range(n_clusters)])

                # 2c. Check for convergence
                if np.all(centers == new_centers):
                    break
                centers = new_centers

            return centers, labels
Code Example #10
def find_clusters(X, n_clusters, rseed=2):
    # 1. Randomly choose clusters
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    centers = X[i]

    while True:
        # 2a. Assign labels based on closest center
        labels = pairwise_distances_argmin(X, centers)

        # 2b. Find new centers from means of points
        new_centers = np.array(
            [X[labels == i].mean(0) for i in range(n_clusters)])

        # 2c. Check for convergence
        if np.all(centers == new_centers):
            break
        centers = new_centers

    return centers, labels
Code Example #11
def cluster_colors(n_colors, im):
    # First, flatten each WxH color channel; im is assumed to arrive
    # already flattened to (n_pixels, 3), so the variants below stay commented
    # w, h, d = im.shape
    # im_flat = np.reshape(im, (w*h, d))
    # w = im.shape[1]**(1/2)
    # im_flat = im.reshape(im.shape[1], 3)
    im_flat = im

    # Next, fit a model to a small sub-sample of the image
    # (the original sliced with an undefined `w`; use a quarter of the rows)
    im_sample = shuffle(im_flat, random_state=0)[:im_flat.shape[0] // 4]
    kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(im_sample)

    # Using this model, label each point
    labels = kmeans.predict(im_flat)

    # Create a codebook to cluster colors
    cb_random = shuffle(im_flat, random_state=0)[:n_colors]
    labels_random = pairwise_distances_argmin(cb_random, im_flat, axis=0)

    return cb_random, labels_random
Code Example #12
    def predict(self, files, features):
        X = []
        for file in files:
            audio = Audio(file.stream)
            x = _extract_features(audio, features)
            X.append(x)

        X = np.array(X)
        X_2d = X
        if X.shape[1] != 2:
            mds = MDS(n_components=2, random_state=RANDOM_STATE)
            X_2d = mds.fit_transform(X)
        centroid = pairwise_distances_argmin(X, self.centroids)

        return [{
            'name': files[i].filename,
            'x': float(X_2d[i, 0].round(2)),
            'y': float(X_2d[i, 1].round(2)),
            'label': self.labels[centroid[i]],
            'centroid': i,
        } for i in range(len(centroid))]
Code Example #13
    def predict(self, X):
        """
        Predict data using the ``centroids_`` of subclusters.

        Avoid computation of the row norms of X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        labels : ndarray of shape(n_samples,)
            Labelled data.
        """
        X = check_array(X, accept_sparse='csr')
        self._check_fit(X)
        kwargs = {'Y_norm_squared': self._subcluster_norms}
        return self.subcluster_labels_[pairwise_distances_argmin(
            X, self.subcluster_centers_, metric_kwargs=kwargs)]
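The metric_kwargs argument above passes precomputed squared row norms of the centers, so repeated predictions skip that work. A standalone sketch of the same shortcut, with invented arrays:

# Standalone sketch of the Y_norm_squared shortcut; the data is invented.
import numpy as np
from sklearn.metrics import pairwise_distances_argmin

rng = np.random.RandomState(0)
X = rng.rand(6, 3)
centers = rng.rand(4, 3)
norms = (centers ** 2).sum(axis=1)  # squared row norms, computed once
fast = pairwise_distances_argmin(X, centers,
                                 metric_kwargs={'Y_norm_squared': norms})
assert np.array_equal(fast, pairwise_distances_argmin(X, centers))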
Code Example #14
    def train(self, dataMatrix: np.ndarray, n_epochs=20, eta=0.2):

        num_features = dataMatrix.shape[1]

        nodes = np.random.uniform(low=0,
                                  high=1,
                                  size=(self.num_nodes, num_features))

        for ep in range(1, n_epochs + 1):

            nb_size = self.neighbor_size(ep)

            for p in dataMatrix:
                p = p.reshape(1, -1)

                center_arg = pairwise_distances_argmin(p, nodes)

                nodes += eta * (p - nodes) * self.get_neighbors(
                    center_arg, nb_size)

        self.nodes = nodes
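A standalone sketch of one pass of the loop above: pairwise_distances_argmin picks the best-matching unit, and every node is pulled toward the sample in proportion to its neighborhood weight. The Gaussian neighborhood here is an invented stand-in for the class's get_neighbors helper:

# Invented single SOM update step; the Gaussian neighborhood below
# stands in for the class's get_neighbors / neighbor_size helpers.
import numpy as np
from sklearn.metrics import pairwise_distances_argmin

rng = np.random.RandomState(0)
nodes = rng.uniform(size=(10, 4))             # 10 SOM nodes, 4 features each
p = rng.uniform(size=(1, 4))                  # one training sample
eta, nb_size = 0.2, 2.0

bmu = pairwise_distances_argmin(p, nodes)[0]  # best-matching unit index
grid_dist = np.abs(np.arange(len(nodes)) - bmu)          # 1-D grid distance
neighborhood = np.exp(-(grid_dist / nb_size) ** 2)[:, None]
nodes += eta * (p - nodes) * neighborhood     # pull nodes toward the sample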
Code Example #15
File: quantize.py  Project: yxurrbzallkd/-
def quantize(img, n_colors=64):
    # Convert to floats in the range [0, 1]
    img = np.array(img, dtype=np.float64) / 255

    # Load Image and transform to a 2D numpy array.
    w, h, d = tuple(img.shape)
    assert (d == 3)
    image_array = np.reshape(img, (w * h, d))
    print("Fitting model on a small sub-sample of the data")
    image_array_sample = shuffle(image_array)[:min(2000, w * h)]
    kmeans = KMeans(n_clusters=n_colors).fit(image_array_sample)

    print("Predicting color indices on the full image (k-means)")
    labels = kmeans.predict(image_array)

    codebook_random = shuffle(image_array, random_state=0)[:n_colors]
    print("Predicting color indices on the full image (random)")
    labels_random = pairwise_distances_argmin(codebook_random,
                                              image_array,
                                              axis=0)
    return codebook_random, kmeans, labels, labels_random, w, h
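A usage sketch for quantize; the sample image is an assumption, and indexing the cluster centers by label replaces the recreate_image helper seen in other examples here:

# Hypothetical driver for quantize().
import matplotlib.pyplot as plt
from sklearn.datasets import load_sample_image

img = load_sample_image("flower.jpg")
codebook_random, kmeans, labels, labels_random, w, h = quantize(img)
quantized = kmeans.cluster_centers_[labels].reshape(w, h, -1)
plt.imshow(quantized)
plt.show()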
Code Example #16
def find_clusters(X, n_clusters, rseed=2):
    # 1. Randomly choose centers
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    centers = X[i]

    # Start iterating
    while True:
        # 2a. Assign labels by nearest distance
        labels = pairwise_distances_argmin(X, centers)

        # 2b. Compute means to get the new cluster centers
        new_centers = np.array(
            [X[labels == i].mean(0) for i in range(n_clusters)])

        # 2c. Stopping condition for the iteration
        if np.all(centers == new_centers):
            break
        centers = new_centers

    return centers, labels
Code Example #17
def find_clusters(X, n_clusters, rseed=3, max_iters=50, weight_koef=0.000002):
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    centers = X[i]
    # print(X[i])

    for iteration in range(max_iters):
        # print(centers)
        labels = pairwise_distances_argmin(X, centers, metric='manhattan')
        # weights = pairwise_distances(X, centers, metric='manhattan')
        elems_count = Counter(labels)
        lengths = []
        for x_iter in range(X.shape[0]):
            weights = []
            for center_id in range(len(centers)):
                weight = abs(X[x_iter, 0] - centers[center_id][0]) + abs(
                    X[x_iter, 1] - centers[center_id][1])
                # Adjustment for queues at the ATM
                weight += weight_koef * elems_count[center_id]
                weights.append(weight)
            lengths.append(weights)
        labels_res = []
        for x in lengths:
            labels_res.append(np.argmin(x))
        labels = np.array(labels_res)

        new_centers = np.array(
            [X[labels == i].mean(0) for i in range(n_clusters)])
        # Count empty clusters (NaN centers; each contributes two NaN coords)
        length = len(new_centers[np.isnan(new_centers)]) // 2
        # lat_rand = np.array([X[:, 0].min()]*length) + (X[:, 0].max() - X[:, 0].min()) * np.random.random(length)
        # long_rand = np.array([X[:, 1].min()]*length) + (X[:, 1].max() - X[:, 1].min()) * np.random.random(length)
        # arr = np.transpose(np.array([lat_rand, long_rand]))
        # Re-seed empty clusters with randomly chosen data points
        i = rng.permutation(X.shape[0])[:length]
        new_centers[np.isnan(new_centers[:, 0])] = X[i]

        if np.all(centers == new_centers):
            break

        centers = new_centers
    return centers, labels
Code Example #18
    def predict(self, X, sta, end):
        """Predict the closest cluster each sample in X belongs to,
        using only the [sta, end) slice of each sample's features and
        of the cluster centers to compute the distance.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            New data to predict.

        Returns
        -------
        labels : array, shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self, "cluster_centers_indices_")
        if not hasattr(self, "cluster_centers_"):
            raise ValueError("Predict method is not supported when "
                             "affinity='precomputed'.")

        # The original passed sta and end positionally, which scikit-learn's
        # pairwise_distances_argmin does not accept; slicing the feature
        # columns matches the docstring's stated intent.
        return pairwise_distances_argmin(X[:, sta:end],
                                         self.cluster_centers_[:, sta:end])
Code Example #19
def dbscan(data, similarity, encoding, outlier):
    """Generates clusters using DBSCAN.

    Args:
        data: A timeSeries object.
        similarity: The similarity measure used for scaling the data
            before clustering. Must be "proximity" or "correlation".
        encoding: The method used for encoding the labels. Must
            be "none" or "one-hot".
        outlier: Indicates whether outliers are labeled as outliers.

    Returns:
        A list of cluster labels such that the nth element in the list
        represents the cluster the nth element was placed in. Cluster
        labels are integers.
    """
    if similarity == "correlation" and encoding == "none":
        eps_tuned = EPS_CORRELATION_NONE
    elif similarity == "correlation" and encoding == "one-hot":
        eps_tuned = EPS_CORRELATION_ONE_HOT
    elif encoding == "none":
        eps_tuned = EPS_PROXIMITY_NONE
    else:
        eps_tuned = EPS_PROXIMITY_ONE_HOT

    dbscan_result = DBSCAN(eps=eps_tuned, min_samples=2).fit(data)
    cluster_assignment = np.copy(dbscan_result.labels_)
    medians, _ = cluster_medians(data, cluster_assignment)

    outlier_indexes = np.where(cluster_assignment == -1)[0]
    cluster_assignment += 1
    closest = pairwise_distances_argmin(data[outlier_indexes, :], medians)

    for index, index_ts in enumerate(outlier_indexes):
        if outlier == "on":
            cluster_assignment[index_ts] = - (closest[index] + 1)
        else:
            cluster_assignment[index_ts] = closest[index] + 1
    return cluster_assignment
Code Example #20
def find_clusters(X, n_clusters, rseed=2):
    # 1. Randomly select the initial centroid values
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    centers = X[i]
    while True:
        """2a. Assign each point to a cluster by proximity to its centroid.
           pairwise_distances_argmin returns an array of indices, one per
           point, giving the index of that point's closest centroid."""

        labels = pairwise_distances_argmin(X, centers)
        """2b. Compute the new centroids as the mean (along each
           dimension) of the points in each cluster"""
        new_centers = np.array(
            [X[labels == i].mean(0) for i in range(n_clusters)])
        """2c. If the centroids did not change from the previous step, stop"""
        if np.all(centers == new_centers):
            break
        centers = new_centers

    return centers, labels, kmeans_distorsion(X, centers)
Code Example #21
File: kmeans_scratch.py  Project: akshaykatre/kmeans
def find_clusters(X, n_clusters, rseed=2):
    ## Choose random clusters
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    ## This will give you n_clusters number of centers
    ## which will have the dimensions of your dataset
    centers = X[i]

    while True:
        ## Assign labels based on closest center 
        labels = pairwise_distances_argmin(X, centers)

        ## Find new centers from means of points
        new_centers = np.array([X[labels == i].mean(0)
                                for i in range(n_clusters)])
    
        ## Check for convergence 
        if np.all(centers == new_centers):
            break
        centers = new_centers

    return centers, labels 
Code Example #22
def find_clusters(X, n_clusters, rseed=2):
    #1. Randomly choose clusters
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    # permutation gives random indices below X.shape[0]; take the first n_clusters

    centers = X[i]  # X[[a, b, c, d]] selects rows X[a], X[b], X[c], X[d]

    while True:
        # 2a. Assign labels based on closest center
        labels = pairwise_distances_argmin(X, centers)

        # 2b. Find new centers from means of points
        new_centers = np.array(
            [X[labels == i].mean(0) for i in range(n_clusters)])

        # 2c. Check for convergence
        if np.all(centers == new_centers):
            break
        centers = new_centers

    return centers, labels
Code Example #23
    def find_my_clusters(self):
        rgn = np.random.RandomState(randint(0, 2**32 - 1))
        i = rgn.permutation(self.dataset.shape[0])[:self.n_clusters]
        self.centers = self.dataset[i]
        init = False
        while True:
            self.labels = pairwise_distances_argmin(self.dataset, self.centers)
            new_centers = np.array([
                self.dataset[self.labels == j].mean(0)
                for j in range(self.n_clusters)
            ])

            if np.all(self.centers == new_centers):
                break
            self.centers = new_centers
            if not init:
                Plotting.draw_plot(
                    self.dataset,
                    self.labels,
                    False,
                    self.centers,
                )
                init = True
Code Example #24
File: helpers.py  Project: lejeunel/pcml_p2
def kmeans_img(img, n_clusters):
    # Load Image and transform to a 2D numpy array.
    w, h, d = original_shape = tuple(img.shape)
    assert d == 3
    image_array = np.reshape(img, (w * h, d))

    #print("Fitting model on a small sub-sample of the data")
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    kmeans = KMeans(n_clusters=n_clusters,
                    random_state=0).fit(image_array_sample)

    # Get labels for all points
    #print("Predicting color indices on the full image (k-means)")
    labels = kmeans.predict(image_array)
    #print("done in %0.3fs." % (time() - t0))

    codebook_random = shuffle(image_array, random_state=0)[:n_clusters + 1]
    #print("Predicting color indices on the full image (random)")
    labels_random = pairwise_distances_argmin(codebook_random,
                                              image_array,
                                              axis=0)

    return recreate_image(kmeans.cluster_centers_, labels, w, h)
Code Example #25
def count_weight(X, centers, weight_koef=0.000002):
    labels = pairwise_distances_argmin(X, centers, metric='manhattan')
    # Count how many client points land on each ATM
    elems_count = Counter(labels)
    elems_count_list = []
    for i in range(centers.shape[0]):
        if i in elems_count:
            elems_count_list.append(elems_count[i])
        else:
            elems_count_list.append(0)
    elems_count_list = np.array(elems_count_list)
    lengths = []
    weight_new = np.zeros([X.shape[0], centers.shape[0]])
    it = 0
    for x in X:
        # Compute the manhattan distance from this point to every ATM
        weight = np.abs(x[0] - centers[:, 0]) + np.abs(x[1] - centers[:, 1])
        # This time add a correction that depends on how many client points
        # each ATM serves (Poisson expectation: lambda = n*p, where p is the
        # probability that the ATM is needed)
        weight += weight_koef * elems_count_list
        weight_new[it] = weight
        it += 1
    return np.sum(np.min(weight_new, axis=1))
Code Example #26
def color_quantization(n_colors=64, file_path=''):
    # Load the Summer Palace photo
    image = None
    if (len(file_path) > 0) and (os.path.isfile(file_path)):
        image = mpimg.imread(file_path)
    if image is None:
        image = load_sample_image("china.jpg")
    # Convert to floats instead of the default 8-bit integer coding. Dividing
    # by 255 is important so that plt.imshow behaves well on float data (it
    # needs to be in the range [0, 1]).
    image = np.array(image, dtype=np.float64) / 255

    # Load Image and transform to a 2D numpy array.
    w, h, d = original_shape = tuple(image.shape)
    assert d == 3
    image_array = np.reshape(image, (w * h, d))
    print("Fitting model on a small sub-sample of the data")
    t0 = time()
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    kmeans = KMeans(n_clusters=n_colors,
                    random_state=0).fit(image_array_sample)
    print("done in %0.3fs." % (time() - t0))

    # Get labels for all points
    print("Predicting color indices on the full image (k-means)")
    t0 = time()
    labels = kmeans.predict(image_array)
    print("done in %0.3fs." % (time() - t0))

    codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
    print("Predicting color indices on the full image (random)")
    t0 = time()
    labels_random = pairwise_distances_argmin(codebook_random,
                                              image_array,
                                              axis=0)
    print("done in %0.3fs." % (time() - t0))
    return [kmeans, image, labels, codebook_random, labels_random, w, h]
Code Example #27
File: fairrec.py  Project: tnakae/fairrec
    def _greedy_round_robin(self):
        """Assign items to users based on greedy-round-robin rule."""
        while True:
            # Stop when no stock remains
            if self.stocks.sum() == 0:
                break

            # Current target user
            user_idx = self.sigma[self.sigma_idx]

            # Stop if this user has already received the maximum number of items
            if self.assignment[user_idx].nnz >= self.k:
                break

            # Gather in-stock items this user does not yet have
            assigned_items = self.assignment[user_idx].nonzero()[1]
            valid_items = np.argwhere(self.stocks > 0).squeeze()
            valid_items = valid_items[~np.isin(valid_items, assigned_items)]

            # Stop if there is no item stock left that this user can take
            if len(valid_items) == 0:
                break

            # Find the item with the shortest distance
            target_user_emb = self.user_emb[user_idx][None, :]
            valid_items_emb = self.item_emb[valid_items]
            nearest_idx = pairwise_distances_argmin(
                target_user_emb, valid_items_emb,
                metric=self.metric)[0]
            nearest_item = valid_items[nearest_idx]

            # Assign the item
            self.assignment[user_idx, nearest_item] += 1
            self.stocks[nearest_item] -= 1

            self.sigma_idx = (self.sigma_idx + 1) % self.num_user
Code Example #28
def clustering_2D_HT(df, n_clusters=2, rseed=2):
    # First iteration of the cluster code 
    df = df[['WVHT', 'APD']]
    X = df.values
    rng = np.random.RandomState(rseed)
    i = rng.permutation(X.shape[0])[:n_clusters]
    centers = X[i]

    while True:
        labels = pairwise_distances_argmin(X, centers)

        new_centers = np.array([X[labels == j].mean(0) for j in\
                                range(n_clusters)])

        if np.all(centers == new_centers):
            break
        centers = new_centers
        
        
    plt.scatter(X[:,0], X[:,1], c=labels, s=50, cmap='viridis')
    plt.scatter(centers[:,0],centers[:,1], color='white', marker='x')
    plt.xlabel("Significant Wave Height Normalized Data")
    plt.ylabel("Average Wave Period Normalized Data")
    plt.show()
Code Example #29
File: kmeans.py  Project: Krati012/DSA-Project
def find_clusters(X, n_clusters, error, rseed = 2):
        #X is a (300,2) shaped array containing 300 samples distributed across the x-y coordinate system 
        #in 4 clusters. The aim is to identify these clusters and corresponding cluster centroids.
        #n_clusters is the number of clusters we divide the data into. Here n_clusters = 4.
        #rseed is the seed value for a pseudo random number generator.
        
        rng = np.random.RandomState(rseed)
        i = rng.permutation(X.shape[0])[:n_clusters]
        centers = X[i] #We randomly initialize the centroids

        while True: #The loop runs till the centroids converge
            labels = pairwise_distances_argmin(X, centers)
            #labels is the array such that the value of label of i-th data point, labels[i] is equal
            # to the number of the cluster that the i-th data point belongs to.
            new_centers = np.array([X[labels==i].mean(0) for i in range(n_clusters)])
            #New centroid for a given cluster is calculated by finding the arithmetic mean of the points
            #assigned to that cluster.
            
            #if the centroids converge then the algorithm ends;
            #err() is a distance helper assumed to be defined elsewhere
            if err(centers, new_centers) < error:
                break
            centers = new_centers   
    
        return centers,labels   #return final coordinates of centroids of the clusters and label of each data point        
Code Example #30
    def cluster_image(self):
        # Converting into range [0,1] from [0, 255]
        image = np.array(self.image, dtype=np.float64) / 255
        w, h, d = tuple(image.shape)

        image_arr = np.reshape(image, (w * h, d))

        image_subset = shuffle(image_arr, random_state=0)[:1000]

        for index, num_clusters in enumerate(self.colors):
            clf = KMeans(n_clusters=num_clusters, random_state=0)
            clf.fit(image_subset)

            predictions = clf.predict(image_arr)
            codebook_random = shuffle(image_arr, random_state=0)[:num_clusters]

            labels_random = pairwise_distances_argmin(codebook_random,
                                                      image_arr,
                                                      axis=0)
            label = "Compressed image (" + str(num_clusters) + " colors)"
            self.show_image(
                index,
                self.rebuild_image(codebook_random, labels_random, w, h,
                                   index), label, self.image_name)
Code Example #31
    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            New data to predict.
        Returns
        -------
        labels : array, shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        check_is_fitted(self, "cluster_centers_indices_")
        if not hasattr(self, "cluster_centers_"):
            raise ValueError("Predict method is not supported when "
                             "affinity='precomputed'.")

        if self.cluster_centers_.size > 0:
            return pairwise_distances_argmin(X, self.cluster_centers_)
        else:
            warnings.warn(
                "This model does not have any cluster centers "
                "because affinity propagation did not converge. "
                "Labeling every sample as '-1'.", ConvergenceWarning)
            return np.array([-1] * X.shape[0])
Code Example #32
kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)
print('done in %0.3fs.' % (time()-t0))

# Get labels for all points
print('Predicting color indices on the full image (k-means)')

t0 = time()
labels = kmeans.predict(image_array)
print('done in %0.3fs.' % (time()-t0))

codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
print('codebook - random shape: ', codebook_random.shape)
# print(codebook_random[1])
print('Predicting color indices on the full image(random)')
t0 = time()
labels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)
print('done in %0.3fs.' % (time() - t0))


def recreate_image(codebook, labels, w, h):
    '''Recreate the (compressed) image from the code book & labels '''
    d = codebook.shape[1]
    image = np.zeros((w, h, d))
    label_index = 0
    for i in range(w):
        for j in range(h):
            image[i][j] = codebook[labels[label_index]]
            label_index += 1
    return image

# Display all results, alongside the original image
Code Example #33
def assign_centers_dynamic(doc_by_term, chromosome):
    return pairwise_distances_argmin(doc_by_term, doc_by_term[chromosome])
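Here chromosome is evidently an array of row indices selecting which documents act as cluster centers; a toy check with an invented matrix:

# Toy check for assign_centers_dynamic(); the matrix is invented.
import numpy as np
from sklearn.metrics import pairwise_distances_argmin

doc_by_term = np.array([[1.0, 0.0, 0.0],
                        [0.9, 0.1, 0.0],
                        [0.0, 1.0, 0.0],
                        [0.0, 0.9, 0.1]])
chromosome = np.array([0, 2])   # rows 0 and 2 serve as centers
print(assign_centers_dynamic(doc_by_term, chromosome))  # -> [0 0 1 1]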
Code Example #34
kmeans = KMeans(n_clusters=n_colors,
                init='k-means++',
                verbose=1,
                random_state=0).fit(image_array_sample)

print("done in %0.3fs." % (time() - t0))
# Get labels for all points
print("Predicting color indices on the full image (k-means)")
t0 = time()
labels = kmeans.predict(image_array)
print("done in %0.3fs." % (time() - t0))

codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
print("Predicting color indices on the full image (random)")
t0 = time()
labels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)
print("done in %0.3fs." % (time() - t0))
#codebook_random1 = shuffle(image_array1, random_state=0)[:n_colors + 1]


def recreate_image(codebook, labels, w, h):
    """Recreate the (compressed) image from the code book & labels"""
    #d = 3
    image = np.zeros((w, h, 3))
    label_idx = 0
    for i in range(w):
        for j in range(h):
            image[i][j] = codebook[labels[label_idx]]
            label_idx += 1
    return image
Code Example #35
def plot_color_quantization():
    n_colors = 64

    # Load the Summer Palace photo
    china = load_sample_image("china.jpg")

    # Convert to floats instead of the default 8-bit integer coding. Dividing
    # by 255 is important so that plt.imshow behaves well on float data (it
    # needs to be in the range [0, 1]).
    china = np.array(china, dtype=np.float64) / 255

    # Load Image and transform to a 2D numpy array.
    w, h, d = original_shape = tuple(china.shape)
    assert d == 3
    image_array = np.reshape(china, (w * h, d))

    print("Fitting model on a small sub-sample of the data")
    t0 = time()
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    kmeans = KMeans(n_clusters=n_colors,
                    random_state=0).fit(image_array_sample)
    print("done in %0.3fs." % (time() - t0))

    # Get labels for all points
    print("Predicting color indices on the full image (k-means)")
    t0 = time()
    labels = kmeans.predict(image_array)
    print("done in %0.3fs." % (time() - t0))

    codebook_random = shuffle(image_array, random_state=0)[:n_colors]
    print("Predicting color indices on the full image (random)")
    t0 = time()
    labels_random = pairwise_distances_argmin(codebook_random,
                                              image_array,
                                              axis=0)
    print("done in %0.3fs." % (time() - t0))

    def recreate_image(codebook, labels, w, h):
        """Recreate the (compressed) image from the code book & labels"""
        d = codebook.shape[1]
        image = np.zeros((w, h, d))
        label_idx = 0
        for i in range(w):
            for j in range(h):
                image[i][j] = codebook[labels[label_idx]]
                label_idx += 1
        return image

    # Display all results, alongside original image
    plt.figure(1)
    plt.clf()
    plt.axis('off')
    plt.title('Original image (96,615 colors)')
    plt.imshow(china)

    plt.figure(2)
    plt.clf()
    plt.axis('off')
    plt.title('Quantized image (64 colors, K-Means)')
    plt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h))

    plt.figure(3)
    plt.clf()
    plt.axis('off')
    plt.title('Quantized image (64 colors, Random)')
    plt.imshow(recreate_image(codebook_random, labels_random, w, h))
    plt.show()
Code Example #36
def color_quantization(filename='mass1971.jpg', n_colors=8):

    # Load the photo
    img = Image.open(filename)

    # Convert the values to floats instead of the default 8-bit integers.
    # Dividing by 255 is very important so that plt.imshow works well on
    # float data; the data needs to be in the range [0, 1].
    img = np.array(img, dtype=np.float64) / 255

    # Load the image and transform it into a 2D numpy array
    w, h, d = original_shape = tuple(img.shape)
    assert d == 3
    image_array = np.reshape(img, (w * h, d))
    """
    ((   
   (0, 0, 0),     # black background
    (255, 0, 0),   # index 1 is red
    (255, 255, 0), # index 2 is yellow
    (255, 0, 255), # index 3 is orange
    (102, 160, 38),   # index 4 is green
   ( 0, 148, 189),   # index 5 is blue
   ( 207, 3, 124), # index 6 is pink
   ( 69, 0, 68), # index 7 is lila
   ( 117,117,117)
   )) # index 8 is grey
    """
    """
            Now it builds the predictive model
    """
    
    print("Fitting model on a small sub-sample of the data")
    t0 = time()
    #gets 1000 random states
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    
  
    #not it fits the model
    kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)
    #just to check the time
    print("Done in %0.3fs." % (time() - t0))
    
    """
            Now it generalizes to all the pixels of the picture
    """
    
    # Get labels for all points
    print("Predicting color indices on the full image (k-means)")
    t0 = time()
    labels = kmeans.predict(image_array)
    print("Done in %0.3fs." % (time() - t0))
    
    
    """
            Now it predicts the color
    """
    
    codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
    print("Predicting color indices on the full image (random)")
    t0 = time()
    labels_random = pairwise_distances_argmin(codebook_random,
                                              image_array,
                                              axis=0)
    print("Done in %0.3fs." % (time() - t0))
    
    
    def recreate_image(codebook, labels, w, h):
        """Recreate the (compressed) image from the code book & labels"""
        d = codebook.shape[1]
        image = np.zeros((w, h, d))
        label_idx = 0
        for i in range(w):
            for j in range(h):
                image[i][j] = codebook[labels[label_idx]]
                label_idx += 1
        return image
        
        
    # Display the initial image
    plt.figure(1)
    plt.clf()
    ax = plt.axes([0, 0, 1, 1])
    plt.axis('off')
    plt.title('Original image (thousands of colors)')
    plt.imshow(img)
    
    #This plots the image with 8 colors
    plt.figure(2)
    plt.clf()
    ax = plt.axes([0, 0, 1, 1])
    plt.axis('off')
    plt.title('Quantized image (8 colors, K-Means)')
    plt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h))
    plt.show()
    
    
    # This is the modified image in np.ndarray format
    image = recreate_image(kmeans.cluster_centers_, labels, w, h)
    # Now that the image is an np.ndarray, we still have to extract the colors
    plt.imshow(image)

    # toimage comes from scipy.misc (removed in SciPy 1.2; PIL's
    # Image.fromarray is the modern equivalent)
    im = toimage(image)

    # Transform the image to 8-bit integers and extract its colors
    return im.getcolors(), im
Code Example #37
File: SCUBA_core.py  Project: oftensmile/PySCUBA
def initialize_tree(data, cell_stages, rigorous_gap_stats = False):
    """Come up with an initial assessment of the cellular hierarchy, using a
       series of k-means clusterings.
    """

    warnings.formatwarning = custom_formatwarning

    min_split = 15                 # Lower threshold on the number of cells in a cluster 
                                   # for this cluster to be split.
    min_percentage_split = 0.25     # Minimum fraction of cells in the smaller cluster
                                   # during a bifurcation.
    
    N_cells, N_features = data.shape
    
    stages = np.unique(cell_stages)
    N_stages = stages.size
    
    cluster_indices = np.full(N_cells, -1, dtype = int)
    parent_clusters = defaultdict(list)
    cluster_IDs_per_stage = defaultdict(list)
    
    # We're now about to compute a gap statistics estimation of the optimal number
    # of clusters at the initial (pseudo-)time stage:
    
    condition = cell_stages == stages[0]
    
    X = np.compress(condition, data, axis = 0)
    
    if rigorous_gap_stats and (X.shape[0] < min_split):
        N_best_clusters = 1
    elif rigorous_gap_stats:
        N_best_clusters = 1
        k_max = min(2, X.shape[0])

        log_W, E_log_W, s_k = gap_stats(X, 1, k_max)
        if log_W is not None:
            gaps = E_log_W - log_W
            for l in range(gaps.size - 1):
                if gaps[l] >= gaps[l + 1] - s_k[l + 1]:
                    N_best_clusters = l + 1
                    break
    else:
        N_samples = 1000
        k_max = min(2, X.shape[0])
        
        R = np.random.uniform(size = (N_samples, N_features))
        W = np.diagflat(np.std(data, axis = 0))
        R = np.dot(R, W) / sqrt(1 / float(12))
   
        W_k0 = np.empty(k_max, dtype = float)
        for i in range(k_max):
            kmeans = KMeans(i + 1, n_init = 50)
            W_k0[i] = kmeans.fit(R).inertia_ / float(N_samples)

        W_k = np.empty(k_max, dtype = float)
        for i in range(k_max):
            kmeans = KMeans(i + 1, n_init = 50)
            W_k[i] = kmeans.fit(X).inertia_ / float(X.shape[0])
        
        N_best_clusters = np.argmax(W_k0[:k_max] - W_k[:]) + 1
        if X.shape[0] < min_split:
            N_best_clusters = 1
         
    # Done with the gap statistics for the first (pseudo-)time stage
    
    cluster_tally = N_best_clusters
    
    kmeans = KMeans(N_best_clusters, n_init = 50)
    kmeans.fit(X)
    
    cluster_indices[condition] = kmeans.labels_
    centroid_coordinates = kmeans.cluster_centers_
    parent_clusters[0].extend([-1] * N_best_clusters)
    cluster_IDs_per_stage[0].extend([k for k in range(N_best_clusters)])
    
    for stage_idx in range(1, N_stages):
        condition = cell_stages == stages[stage_idx]
    
        X = np.compress(condition, data, axis = 0)
        
        # Map the cells at this stage to the nearest clusters from the previous stage:
        previous_clusters = cluster_IDs_per_stage[stage_idx - 1]
        N_previous_clusters = len(previous_clusters)
     
        previous_centroid_coordinates = centroid_coordinates[-N_previous_clusters:]
        nearest_previous_cluster_indices = pairwise_distances_argmin(X,
                                          previous_centroid_coordinates)
        
        # The following will now test for a bifurcation at this (pseudo-)time stage:
        for j in range(N_previous_clusters):
            idx = np.where(nearest_previous_cluster_indices == j)[0]
            
            if idx.size == 0:
                msg = ' '.join(["Empty cluster mapping encountered", 
                                "at (pseudo-)time {0}.\n".format(stage_idx + 1)])
                warnings.warn(msg)
                continue
                
            XX = X[idx]
            
            if rigorous_gap_stats:
                N_best_clusters = 1
                k_max = min(2, XX.shape[0])
            
                log_W, E_log_W, s_k = gap_stats(XX, 1, k_max)
                if log_W is not None:
                    gaps = E_log_W - log_W
                    if gaps.size > 1 and gaps[0] >= gaps[1] - s_k[1]:
                        N_best_clusters = 2
                        kmeans = KMeans(2, n_init = 50)
                        cluster_idx = kmeans.fit(XX).labels_
            else:
                W_k = []
                for i in (1, 2):
                    if XX.shape[0] >= i:
                        kmeans = KMeans(i, n_init = 50)
                        W_k.append(kmeans.fit(XX).inertia_ / float(XX.shape[0]))
                        
                N_best_clusters = np.argmax(W_k0[:len(W_k)] - W_k[:]) + 1
                kmeans = KMeans(N_best_clusters, n_init = 50)
                cluster_idx = kmeans.fit(XX).labels_
            
            # Do not split if there are not enough cells 
            if (idx.size < min_split) or (N_best_clusters == 1) or (min(np.bincount(cluster_idx)) / float(XX.shape[0]) < min_percentage_split):
                cluster_indices[np.where(condition)[0][idx]] = cluster_tally
                centroid_coordinates = np.vstack((centroid_coordinates, np.mean(XX, 0)))
                parent_clusters[stage_idx].append(cluster_IDs_per_stage[stage_idx-1][j])
                cluster_IDs_per_stage[stage_idx].append(cluster_tally)
                
                cluster_tally += 1
            else:
                cluster_indices[np.where(condition)[0][idx]] = cluster_tally + cluster_idx
                centroid_coordinates = np.vstack((centroid_coordinates,
                                               kmeans.cluster_centers_))
                parent_clusters[stage_idx].extend([cluster_IDs_per_stage[stage_idx-1][j]] * 2)
                cluster_IDs_per_stage[stage_idx].extend([cluster_tally, cluster_tally + 1])
                
                cluster_tally += 2
                
    assert np.all(cluster_indices != -1)
                
    return centroid_coordinates, cluster_indices, parent_clusters