Example #1
def setup_data():
    X, labels_true = make_blobs(n_samples=700,
                                centers=N,  # N: true number of centers, defined elsewhere in the original test module
                                n_features=20,
                                random_state=0)
    xm_clust = XMeans()
    xm_clust.fit(X)
    yield xm_clust, labels_true
Example #2
    def fit(self, data):
        # initialize x-means with the configured bounds and distance function
        xmeans = XMeans(kmin=self.kmin, kmax=self.kmax, distance_function=self.xmeans_df, verbose=self.verbose)
        # fit xmeans on the dataset
        xmeans.fit(data)
        # store the results: labels, clusters and centers
        xmeans_labels = xmeans.labels_
        # create a dict from cluster label to the points in it
        xmeans_clusters = points_labels_to_clusters(data, xmeans_labels)
        # compute the medoids of the clusters
        xmeans_centers = get_clusters_medoids(xmeans_clusters, euclidean_distance)

        # create a dictionary from points to clusters label
        inverse_xmeans_clusters = dict()
        for cluster_id, points_cluster in xmeans_clusters.items():
            for p in points_cluster:
                inverse_xmeans_clusters[(p[0], p[1])] = cluster_id

        # compute the pairwise distances between the centers of xmeans
        data_dist = pdist(xmeans_centers, metric=self.singlelinkage_df)
        # perform single-linkage hierarchical clustering on the condensed distances
        data_link = linkage(data_dist, method='single', metric=self.singlelinkage_df)

        # compute the height at which to cut the dendrogram
        cut_dist = get_cut_distance(data_link, is_outlier=self.is_outlier, min_k=self.kmin, min_dist=self.min_dist)
        self.cut_dist_ = cut_dist
        # forms flat clusters from the hierarchical clustering defined by the linkage matrix
        singlelinkage_labels = fcluster(data_link, cut_dist, 'distance')
        # create a dict from cluster label to the points in it
        singlelinkage_clusters = points_labels_to_clusters(xmeans_centers, singlelinkage_labels)

        # aggregate clusters according to the cut distance
        tosca_clusters = defaultdict(list)
        for sl_cid, sl_pl in singlelinkage_clusters.items():
            # for each point (an x-means center) in the single-linkage cluster
            for p in sl_pl:
                # look up the cluster label assigned by x-means
                xm_cid = inverse_xmeans_clusters[(p[0], p[1])]
                # merge the corresponding x-means cluster into the aggregated one
                tosca_clusters[sl_cid].extend(xmeans_clusters[xm_cid])
        # compute the medoids of the new clusters
        tosca_centers = get_clusters_medoids(tosca_clusters, euclidean_distance)
        # get the list of points and cluster labels
        data, tosca_labels = clusters_to_points_labels(tosca_clusters)
        # final value of k after cluster aggregation
        tosca_k = len(tosca_centers)

        self.cluster_centers_ = tosca_centers
        self.labels_ = tosca_labels
        self.k_ = tosca_k
        self.data_ = data

        return self
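
A minimal usage sketch for the fit method above, assuming it belongs to a TOSCA-style estimator class; the class name and the constructor defaults are assumptions inferred from the attributes the method reads (self.kmin, self.kmax, the distance functions):

# Hypothetical usage; TOSCA and its constructor signature are assumptions.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=6, n_features=2, random_state=0)
model = TOSCA(kmin=2, kmax=20)          # distance functions left at their defaults
model.fit(X)
print(model.k_)                         # final number of clusters after aggregation
print(len(model.cluster_centers_))      # medoids of the aggregated clusters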
Example #3
def XMeans_duplicate_removal(dataframe):
    # Note this method now takes a dataframe as input

    if len(dataframe) < 2:
        # nothing to do
        return dataframe

    Crater_data = dataframe
    # extract the x, y, radius, and probability columns
    x = Crater_data[0].values.tolist()
    y = Crater_data[1].values.tolist()
    r = Crater_data[2].values.tolist()
    p = Crater_data[3].values.tolist()
    Points = []

    X = np.column_stack((x, y))
    xm_clust = XMeans()
    xm_clust.fit(X)
    groups_pred = xm_clust.labels_

    for c in set(groups_pred):
        idx = [i for i, e in enumerate(groups_pred) if e == c]

        Group_x = []
        Group_y = []
        Group_r = []
        Group_p = []
        index = []

        for i in idx:
            # indices from enumerate(groups_pred) are always in range here
            Group_x.append(x[i])
            Group_y.append(y[i])
            Group_r.append(r[i])
            Group_p.append(p[i])
            index.append(i)

        # once the group is assembled, store its column lists
        Points.append([Group_x, Group_y, Group_r, Group_p])

    # now reduce groups
    center_size = []
    for i, (Xs, Ys, Rr, Ps) in enumerate(Points):
        # keep the detection with the highest prediction confidence
        best_index = np.argmax(Ps)
        x_center = Xs[best_index]
        y_center = Ys[best_index]
        radius = Rr[best_index]
        prob = Ps[best_index]
        center_size.append([x_center, y_center, radius, prob])

    return pd.DataFrame(center_size)
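
A hedged usage sketch for the function above; the four-column layout (x, y, radius, probability) with integer column labels 0-3 is inferred from the indexing inside the function:

# Sketch: de-duplicate two groups of near-identical crater detections.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
rows = [[10 + rng.normal(0, 0.2), 20 + rng.normal(0, 0.2), 3.0, rng.uniform(0.7, 1.0)]
        for _ in range(10)]
rows += [[55 + rng.normal(0, 0.2), 60 + rng.normal(0, 0.2), 5.0, rng.uniform(0.7, 1.0)]
         for _ in range(10)]
detections = pd.DataFrame(rows)
deduped = XMeans_duplicate_removal(detections)
print(len(deduped))   # ideally 2: one representative detection per group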
Example #4
def compute_similarity(PCP, bound_idxs, xmeans=False, k=5, N=32, seed=None):
    """Main function to compute the segment similarity of file file_struct."""

    # Get PCP segments
    pcp_segments = get_pcp_segments(PCP, bound_idxs)

    # Get the 2d-FMCs segments
    fmcs = pcp_segments_to_2dfmc_fixed(pcp_segments, N=N)
    # check for None first: comparing an ndarray against [] does not yield a bool
    if fmcs is None or len(fmcs) == 0:
        return np.arange(len(bound_idxs) - 1)

    # Estimate k with x-means if requested, then assign labels with k-means
    if xmeans:
        xm = XMeans(fmcs, plot=False, seed=seed)
        k = xm.estimate_K_knee(th=0.01, maxK=8)
    est_labels = compute_labels_kmeans(fmcs, k=k)

    # Plot results
    #plot_pcp_wgt(PCP, bound_idxs)

    return est_labels
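
A call sketch under assumptions: PCP is taken to be a frames-by-12 pitch-class-profile matrix and bound_idxs a list of segment boundary frame indices; get_pcp_segments, pcp_segments_to_2dfmc_fixed, and compute_labels_kmeans come from the surrounding module:

# Sketch only: the PCP shape and boundary convention are assumptions.
import numpy as np

rng = np.random.default_rng(42)
PCP = rng.random((400, 12))                      # 400 frames x 12 pitch classes
bound_idxs = np.array([0, 100, 200, 300, 400])   # boundaries for 4 segments
est_labels = compute_similarity(PCP, bound_idxs, xmeans=True, k=5, N=32, seed=42)
print(est_labels)   # one label per segment: len(bound_idxs) - 1 entries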
Example #6
def compute_similarity(PCP, bound_idxs, dirichlet=False, xmeans=False, k=5):
    """Main function to compute the segment similarity of file file_struct."""

    # Get PCP segments
    pcp_segments = get_pcp_segments(PCP, bound_idxs)

    # Get the 2d-FMCs segments
    fmcs = pcp_segments_to_2dfmc_max(pcp_segments)
    if len(fmcs) == 0:
        return np.arange(len(bound_idxs) - 1)

    # Compute the labels: Dirichlet process, x-means, or plain k-means
    if dirichlet:
        k_init = np.min([fmcs.shape[0], k])
        # Only compute the dirichlet method if the fmc shape is small enough
        if fmcs.shape[1] > 500:
            labels_est = compute_labels_kmeans(fmcs, k=k)
        else:
            # DPGMM is from older scikit-learn releases; modern versions provide
            # mixture.BayesianGaussianMixture instead
            dpgmm = mixture.DPGMM(n_components=k_init, covariance_type='full')
            #dpgmm = mixture.VBGMM(n_components=k_init, covariance_type='full')
            dpgmm.fit(fmcs)
            k = len(dpgmm.means_)
            labels_est = dpgmm.predict(fmcs)
            #print "Estimated with Dirichlet Process:", k
    # elif: otherwise the k-means fallback below would overwrite the Dirichlet labels
    elif xmeans:
        xm = XMeans(fmcs, plot=False)
        k = xm.estimate_K_knee(th=0.01, maxK=8)
        labels_est = compute_labels_kmeans(fmcs, k=k)
        #print "Estimated with Xmeans:", k
    else:
        labels_est = compute_labels_kmeans(fmcs, k=k)

    # Plot results
    #plot_pcp_wgt(PCP, bound_idxs)

    return labels_est
Example #8
def test_3(visualize=False):
    X, y = make_blobs(n_samples=500,
                      n_features=2,
                      centers=8,
                      cluster_std=1.5,
                      center_box=(-10.0, 10.0),
                      shuffle=True,
                      random_state=1)
    x, y = X[:, 0], X[:, 1]
    st = time.time()
    x_means = XMeans(random_state=1).fit(X)
    et = time.time() - st
    et = time.time() - st

    if visualize:
        print(x_means.labels_)
        print(x_means.cluster_centers_)

        plt.scatter(x, y, c='black', marker='o', s=50)
        plt.title("x-means_sample")
        plt.grid()
        plt.show()

        colors = ["g", "b", "c", "m", "y", "b", "w"]
        for label in range(x_means.labels_.max() + 1):
            plt.scatter(x[x_means.labels_ == label],
                        y[x_means.labels_ == label],
                        c=colors[label],
                        label="sample",
                        s=30)
        plt.scatter(x_means.cluster_centers_[:, 0],
                    x_means.cluster_centers_[:, 1],
                    c="r",
                    marker="+",
                    label="center",
                    s=250)
        plt.title("x-means_test")
        plt.legend()
        plt.grid()
        plt.show()

    return et
Example #9
def test_1(visualize=False):
    x = np.array([
        np.random.normal(loc, 0.1, 20) for loc in np.repeat([1, 2], 2)
    ]).flatten()
    y = np.array([
        np.random.normal(loc, 0.1, 20) for loc in np.tile([1, 2], 2)
    ]).flatten()
    st = time.time()
    x_means = XMeans(random_state=1).fit(np.c_[x, y])
    et = time.time() - st

    if visualize:
        print(x_means.labels_)
        print(x_means.cluster_centers_)

        colors = ["g", "b", "c", "m", "y", "b", "w"]
        for label in range(x_means.labels_.max() + 1):
            plt.scatter(x[x_means.labels_ == label],
                        y[x_means.labels_ == label],
                        c=colors[label],
                        label="sample",
                        s=30)
        plt.scatter(x_means.cluster_centers_[:, 0],
                    x_means.cluster_centers_[:, 1],
                    c="r",
                    marker="+",
                    label="center",
                    s=100)
        plt.xlim(0, 3)
        plt.ylim(0, 3)
        plt.title("x-means_test")
        plt.legend()
        plt.grid()
        plt.show()

    return et
Example #10
def xmeans(D_matrix):
    return XMeans(random_state=1).fit_predict(D_matrix)
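
A minimal sketch of calling this wrapper; treating D_matrix as an (n_samples, n_features) array is an assumption (the name could also suggest a distance matrix):

# Sketch: D_matrix assumed to be a plain feature matrix.
import numpy as np
from sklearn.datasets import make_blobs

D_matrix, _ = make_blobs(n_samples=200, centers=4, n_features=2, random_state=1)
labels = xmeans(D_matrix)
print(np.unique(labels))   # cluster ids chosen by x-means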
Example #11
# The snippet is truncated here: the append calls below belong to a CSV-reading
# loop. A reconstruction of the missing preamble (the input path is hypothetical):
import csv
import numpy as np
from sklearn.cluster import KMeans

X_axis, Y_axis, W_size, probs = [], [], [], []
with open("crater_detections.csv") as f:   # hypothetical input file
    for row in csv.reader(f):
        X_axis.append(float(row[0]))
        Y_axis.append(float(row[1]))
        W_size.append(float(row[2]))
        probs.append(float(row[3]))


xmax = np.max(X_axis)
ymax = np.max(Y_axis)
wmax = np.max(W_size)
X_axis = np.asarray(X_axis, dtype=np.float64) / xmax
Y_axis = np.asarray(Y_axis, dtype=np.float64) / ymax
W_size = np.asarray(W_size, dtype=np.float64) / wmax

# normalized feature matrix shared by both clusterers
datafit = np.c_[X_axis, Y_axis, W_size]
kmeans = KMeans(n_clusters=214, max_iter=1000, tol=0.0001).fit(datafit)  # fitted for comparison only
x_means = XMeans(random_state=1).fit(datafit)

# print(x_means.labels_)
# print(x_means.cluster_centers_)
# print(x_means.cluster_log_likelihoods_)
# print(x_means.cluster_sizes_)

removed_list_cnn = []
for row in x_means.cluster_centers_:
    xc = row[0] * xmax
    yc = row[1] * ymax
    ws = row[2] * wmax
    removed_list_cnn.append([xc, yc, ws])
with open("crater_25_cnn_removed.csv", "w", newline='') as removed_file:
    writer = csv.writer(removed_file, delimiter=',')
    writer.writerows(removed_list_cnn)   # one row per de-duplicated crater