def check_graph_segments_vals():
    """Verify the MST segment coordinates for five 1-D points 0, 1, 4, 9, 16."""
    # A single feature column holding the squares of 0..4.
    points = np.square(np.arange(5)).reshape(-1, 1)
    model = MSTClustering(cutoff=0).fit(points)
    segments = model.get_graph_segments()

    # One coordinate array is returned per feature dimension.
    assert len(segments) == 1
    expected = [[0, 4, 4, 9],
                [1, 1, 9, 16]]
    assert_allclose(segments[0], expected)
def test_precomputed():
    """Clustering raw points must match clustering their precomputed distances."""
    points, _ = make_blobs(100, random_state=42)
    distances = pairwise_distances(points)

    from_points = MSTClustering(cutoff=0.1)
    from_distances = MSTClustering(cutoff=0.1, metric='precomputed')

    labels_points = from_points.fit_predict(points)
    labels_distances = from_distances.fit_predict(distances)
    assert_equal(labels_points, labels_distances)
Example #3
0
    def check_shape(ndim, cutoff, N=10):
        """Check the number and shape of segment arrays for random input.

        Both the trimmed graph and the full graph must yield one array per
        dimension; the trimmed arrays are expected to hold ``cutoff`` fewer
        edges than the ``N - 1`` edges of the full graph.
        """
        data = np.random.rand(N, ndim)
        model = MSTClustering(cutoff=cutoff).fit(data)

        trimmed = model.get_graph_segments()
        print(ndim, cutoff, trimmed[0].shape)
        assert len(trimmed) == ndim
        trimmed_shape = (2, N - 1 - cutoff)
        for seg in trimmed:
            assert seg.shape == trimmed_shape

        full = model.get_graph_segments(full_graph=True)
        print(full[0].shape)
        assert len(full) == ndim
        full_shape = (2, N - 1)
        for seg in full:
            assert seg.shape == full_shape
    def check_shape(ndim, cutoff, N=10):
        """Assert segment counts and shapes for the trimmed and the full graph."""
        sample = np.random.rand(N, ndim)
        fitted = MSTClustering(cutoff=cutoff).fit(sample)

        # (keyword arguments, expected number of edges, whether to print inputs)
        cases = (
            ({}, N - 1 - cutoff, True),
            ({'full_graph': True}, N - 1, False),
        )
        for kwargs, n_edges, show_inputs in cases:
            segs = fitted.get_graph_segments(**kwargs)
            if show_inputs:
                print(ndim, cutoff, segs[0].shape)
            else:
                print(segs[0].shape)
            assert len(segs) == ndim
            assert all(s.shape == (2, n_edges) for s in segs)
Example #5
0
def do_clustering():
    """Cluster the points stored in ``./file16.csv`` and save a scatter plot.

    The first CSV column is dropped, the remaining columns are clustered
    with ``MSTClustering(cutoff_scale=2)``, and the first two feature
    columns are plotted, colored by label, into ``./mst.png``.
    """
    # Synthetic alternative: make_blobs(200, centers=4, random_state=42)
    raw = np.genfromtxt('./file16.csv', delimiter=',')
    print(raw.shape)
    features = raw[:, 1:]

    clusterer = MSTClustering(cutoff_scale=2)
    cluster_ids = clusterer.fit_predict(features)

    xs, ys = features[:, 0], features[:, 1]
    plt.scatter(xs, ys, c=cluster_ids, cmap='rainbow', marker='.')
    plt.savefig('./mst.png')
Example #6
0
def test_precomputed_metric():
    """Labels must agree for raw input and sparse/dense precomputed graphs."""
    n_points, k = 30, 10
    points = np.random.RandomState(42).rand(n_points, 3)

    sparse_graph = kneighbors_graph(points, n_neighbors=k, mode='distance')
    # Dense copy of the graph in which zero entries are replaced by NaN.
    dense_graph = sparse_graph.toarray()
    dense_graph[dense_graph == 0] = np.nan

    labels_points = MSTClustering(
        n_neighbors=k, cutoff=0.1).fit_predict(points)
    labels_sparse = MSTClustering(
        metric='precomputed', cutoff=0.1).fit_predict(sparse_graph)
    labels_dense = MSTClustering(
        metric='precomputed', cutoff=0.1).fit_predict(dense_graph)

    assert_allclose(labels_points, labels_sparse)
    assert_allclose(labels_sparse, labels_dense)
Example #7
0
 def _check(n, min_cluster_size):
     """Assert that every non-negative label covers at least ``min_cluster_size`` points."""
     model = MSTClustering(cutoff=n,
                           n_neighbors=2,
                           min_cluster_size=min_cluster_size,
                           approximate=True)
     predicted = model.fit_predict(X)
     labels, counts = np.unique(predicted, return_counts=True)
     # Only labels >= 0 are checked; negative labels are excluded.
     kept = counts[labels >= 0]
     if len(kept) > 0:
         assert_(kept.min() >= min_cluster_size)
Example #8
0
def test_bad_arguments():
    """Check the ``ValueError`` messages raised for invalid usage.

    Covers: no cutoff given, a negative cutoff, requesting graph segments
    before fitting, and requesting graph segments with a precomputed metric.
    """
    X, y = make_blobs(100, random_state=42)

    # Neither ``cutoff`` nor ``cutoff_frac`` was supplied.
    mst = MSTClustering()
    assert_raises_regex(ValueError,
                        "Must specify either cutoff or cutoff_frac", mst.fit,
                        X, y)

    mst = MSTClustering(cutoff=-1)
    assert_raises_regex(ValueError, "cutoff must be positive", mst.fit, X)

    # Raw string: ``\(`` is a regex escape, not a valid string escape, and a
    # non-raw literal triggers a SyntaxWarning on recent Python versions.
    mst = MSTClustering()
    msg = r"Must call fit\(\) before get_graph_segments()"
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)

    mst = MSTClustering(cutoff=0, metric='precomputed')
    mst.fit(pairwise_distances(X))
    msg = "Cannot use ``get_graph_segments`` with precomputed metric."
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)
Example #9
0
 def __init__(self,
              df,
              cutoff_scale=None,
              min_cluster_size=None,
              n_neighbors=None,
              set_mst=None,
              labels=None,
              segments=None,
              seps=None):
     """Fit an MST clustering on the ``ra``/``dec`` columns of ``df``.

     The ``set_mst``, ``labels``, ``segments`` and ``seps`` arguments are
     not read; the attributes of the same names are always computed here.
     """
     clusterer = MSTClustering(cutoff_scale=cutoff_scale,
                               min_cluster_size=min_cluster_size,
                               n_neighbors=n_neighbors)
     # One [ra, dec] row per entry of the data frame.
     coords = np.array([[ra, dec] for ra, dec in zip(df.ra, df.dec)])

     self.df = df
     self.cutoff_scale = cutoff_scale
     self.min_cluster_size = min_cluster_size
     self.n_neighbors = n_neighbors
     self.set_mst = clusterer
     self.labels = clusterer.fit_predict(coords)
     self.segments = clusterer.get_graph_segments(full_graph=True)
     # get_sep_mst reads self.segments, so it must run last.
     self.seps = self.get_sep_mst()
Example #10
0
def MST_clustering(filename):
    """Cluster words from a text file with an MST and show a t-SNE scatter.

    Parameters
    ----------
    filename : str
        Path of a text file that is read line by line, one word per line.

    Notes
    -----
    Only the first 500 retained words are used. The function displays a
    matplotlib figure and returns nothing.
    """
    with open(filename, 'r') as f:
        # Keep lines longer than 4 characters; the length is measured before
        # stripping, so it includes the trailing newline.
        words = [line.rstrip() for line in f if len(line) > 4]
    words = np.asarray(words)

    sample = words[:500]
    jac_similarity = np.array([[jaccard(w1, w2) for w1 in sample]
                               for w2 in sample])

    # NOTE(review): the pairwise matrix is passed as a feature matrix (no
    # metric='precomputed'); confirm that this is intended.
    mst = MSTClustering(min_cluster_size=10,
                        cutoff_scale=1)  # cut-off scale ??
    mst.fit(jac_similarity)

    # toarray() yields a plain ndarray; todense() would yield np.matrix,
    # which recent scikit-learn estimators reject.
    tree = mst.full_tree_.toarray()
    X_tsne = TSNE(learning_rate=100).fit_transform(tree)
    labels = mst.labels_

    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=labels)
    plt.show()
Example #11
0
def PRIM_algo(X_train, y_train):
    """Run MST clustering for cutoff scales 2..9 and score each result.

    The input is first projected onto two principal components. For every
    cutoff scale the clustering is plotted, a silhouette score is recorded
    (``-1`` when it cannot be computed), ``t_Test`` is called on the labels
    and the labels are printed. Finally the knee of the score curve is
    printed and the scores are passed to ``create_graph``.

    Parameters
    ----------
    X_train : array-like
        Feature matrix; reduced to two PCA components before clustering.
    y_train : array-like
        Forwarded unchanged as the second argument of ``fit_predict``.
    """
    silhouette_score_list = []
    X_train = PCA(2, svd_solver="full").fit_transform(X_train)
    for i in range(2, 10):
        model = MSTClustering(cutoff_scale=i)
        labels = model.fit_predict(X_train, y_train)

        x = [item[0] for item in X_train]
        y = [item[1] for item in X_train]
        print("this is x: ", x)
        print("this is y: ", y)
        plt.scatter(x, y, c=labels, cmap=cm.jet)
        plt.title("PRIM - " + str(i) + " scatter")
        plt.show()

        # Exactly one score is appended per cutoff scale so that list
        # positions stay aligned with the cutoff values (start_point=2).
        score = -1
        if len(set(labels)) > 1:
            try:
                score = metrics.silhouette_score(X_train,
                                                 labels,
                                                 metric='euclidean')
            except Exception:
                # Best effort: keep the -1 sentinel, but do not swallow
                # KeyboardInterrupt/SystemExit as a bare except would.
                print("silhouette_score did not work")
        silhouette_score_list.append(score)

        t_Test(X_train, labels)
        print(labels)

    if silhouette_score_list:
        kn = KneeLocator([i + 1 for i in range(len(silhouette_score_list))],
                         silhouette_score_list,
                         curve='convex',
                         direction='decreasing')
        print(kn.elbow)
    # NOTE(review): the axis text says "SSE" although the values are
    # silhouette scores; left unchanged because it is a runtime string.
    create_graph(silhouette_score_list, y_text="SSE", start_point=2)
Example #12
0
def test_precomputed_metric_with_duplicates():
    """Duplicate points must not change agreement between input forms."""
    n_points = 30
    k = n_points - 1
    points = np.random.RandomState(42).rand(n_points, 3)
    # Overwrite the last five rows with copies of the first five.
    points[-5:] = points[:5]

    sparse_dist = kneighbors_graph(points, n_neighbors=k, mode='distance')
    dense_dist = pairwise_distances(points, points)

    def cluster(data, **extra):
        return MSTClustering(cutoff=0.1, **extra).fit_predict(data)

    from_points = cluster(points, n_neighbors=k)
    from_sparse = cluster(sparse_dist, metric='precomputed')
    from_dense = cluster(dense_dist, metric='precomputed')

    assert_allclose(from_points, from_sparse)
    assert_allclose(from_sparse, from_dense)
Example #13
0
def test_precomputed():
    """Feature input and precomputed distance input must give equal labels."""
    samples, _ = make_blobs(100, random_state=42)
    dist = pairwise_distances(samples)

    direct = MSTClustering(cutoff=0.1)
    precomputed = MSTClustering(cutoff=0.1, metric='precomputed')

    expected = direct.fit_predict(samples)
    actual = precomputed.fit_predict(dist)
    assert_equal(expected, actual)
def test_bad_arguments():
    """Check the ``ValueError`` messages raised for invalid usage.

    Covers: no cutoff given, a negative cutoff, requesting graph segments
    before fitting, and requesting graph segments with a precomputed metric.
    """
    X, y = make_blobs(100, random_state=42)

    # Neither ``cutoff`` nor ``cutoff_frac`` was supplied.
    mst = MSTClustering()
    assert_raises_regex(ValueError,
                        "Must specify either cutoff or cutoff_frac",
                        mst.fit, X, y)

    mst = MSTClustering(cutoff=-1)
    assert_raises_regex(ValueError, "cutoff must be positive", mst.fit, X)

    # Raw string: ``\(`` is a regex escape, not a valid string escape, and a
    # non-raw literal triggers a SyntaxWarning on recent Python versions.
    mst = MSTClustering()
    msg = r"Must call fit\(\) before get_graph_segments()"
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)

    mst = MSTClustering(cutoff=0, metric='precomputed')
    mst.fit(pairwise_distances(X))
    msg = "Cannot use ``get_graph_segments`` with precomputed metric."
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)
Example #15
0
            matrix.append(row)
            row = [float(w)]
        else:
            row.append(float(w))
        old_sample=sample

# Append the row that is still pending after the loop above has finished.
matrix.append(row)
mat=np.array(matrix)
# Embed the matrix, treated as precomputed dissimilarities, in two dimensions.
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=3,
                   dissimilarity="precomputed", n_jobs=1)
pos = mds.fit(mat).embedding_
# Run a two-component PCA on the embedded coordinates.
clf = PCA(n_components=2)
pos = clf.fit_transform(pos)
fig, ax = plt.subplots()

# Cluster the embedded points with the MST algorithm.
model = MSTClustering(cutoff_scale=200, approximate=False)
labels = model.fit_predict(pos)


#### If you want edges:
#X = model.X_fit_
#segments = model.get_graph_segments(full_graph=False)
#ax.plot(segments[0], segments[1], '-k', zorder=1, lw=1)
#ax.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap='rainbow', zorder=2)
#ax.axis('tight')
#####

#### Without edges:
plt.scatter(pos[:, 0], pos[:, 1], c=labels, s=100, lw=0)
####
Example #16
0
 def _check_params(kwds):
     """Assert that the given parameters recover the three true groups.

     ``X`` and ``y`` come from the enclosing scope. The prediction must
     contain exactly three distinct labels, and all points sharing a true
     label in ``y`` must share one predicted label.
     """
     y_pred = MSTClustering(n_neighbors=100, **kwds).fit_predict(X)
     assert_equal(len(np.unique(y_pred)), 3)
     # Compare predictions within each true group. The spread of ``y``
     # within ``y == i`` is always zero, so checking ``y`` itself can
     # never fail.
     assert_allclose([np.std(y_pred[y == i]) for i in range(3)], 0)
Example #17
0
def mst(instance_path, res_folder, strategy = 2):
	"""Cluster the rows of one instance file with MSTClustering and save the best results.

	The data matrix and row names come from ``parse.read``. A sparse distance
	matrix is loaded from ``res_folder`` via ``scipy.io.mmread`` or, if that
	fails, computed with ``pp.strategy`` and written back with
	``scipy.io.mmwrite``. Up to ``n_iterations`` (49) values of ``eps`` from
	``get_eps_list_geom`` are tried as ``cutoff_scale``; after each clustering
	the rows are sorted with ``sort_matrix`` and post-processed with
	``postp.remove_colision_points2``. Two candidates are tracked: the
	iteration with the fewest non-clustered points ("best") and the one
	selected by the adjusted relative standard deviation of cluster sizes
	("second best"). Each candidate found is written with ``dec.write``.

	Parameters:
		instance_path: path of the input file; it is split on the last '/' and
			on the last '.', so it must contain both.
		res_folder: prefix that is concatenated directly with file names for
			the cached distance matrix and the .dec outputs.
		strategy: value passed to ``pp.strategy`` and ``get_eps_list_geom``;
			it is also part of the cache and output file names.

	Returns:
		An 11-tuple: (best_found, best_n_clusters, best_non_clusterd,
		best_distro, best_dec, number of data rows, s_best_found,
		s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec).
	"""
	# Split the path into folder, base name and extension.
	instances = instance_path.rsplit('/', 1)[0] + '/'
	file = instance_path.rsplit('/', 1)[1]
	input_type  = '.' + file.rsplit('.', 1)[1]
	file = file.rsplit('.', 1)[0]
	data, row_names = parse.read(instances + file + input_type)
	print 'Size of data matrix: ', data.shape
	if len(data) <> len(row_names):
		print 'MST error: data and row_names have diff. lens', len(data), len(row_names)	
	#save_matrix_fig(data, res_folder, file+'_in')
	dist_matrix = []
	'''OLD try:
		dist_matrix = np.load(res_folder+file+'_dist'+str(strategy)+'.npy')
		print 'Distance matrix %s found.' %(res_folder+file+'_dist'+str(strategy)+'.npy')
	except:
		print 'Distance matrix %s NOT found!!!!' %(res_folder+file+'_dist'+str(strategy)+'.npy')
		dist_matrix = pp.strategy(data, 'distance',strategy)	
		np.save(res_folder+file+'_dist'+str(strategy), dist_matrix)'''
	# Load the cached distance matrix, or compute and cache it.
	try:
		dist_matrix = scipy.io.mmread(res_folder+file+'_dist'+str(strategy)).tocsr()
		print 'Distance matrix %s found.' %(res_folder+file+'_dist'+str(strategy))
	except:
		# NOTE(review): the bare except treats every failure, not only a
		# missing cache file, as "not found".
		print 'Distance matrix %s NOT found!!!!' %(res_folder+file+'_dist'+str(strategy))
		dist_matrix = pp.strategy(data, 'distance',strategy)	
		scipy.io.mmwrite(res_folder+file+'_dist'+str(strategy),dist_matrix)	

	# NOTE(review): with two ints this is integer division under Python 2
	# unless true division is enabled for the module - TODO confirm.
	# occupancy is not read again in this function.
	occupancy = len(dist_matrix.data) / (dist_matrix.shape[0] * dist_matrix.shape[1]) * 100
	# Use the 10th percentile of the stored distances as the centre of the
	# eps sweep; fall back to the 1st percentile when that value is 0.
	q = 10
	dist_percentile = np.percentile(a=dist_matrix.data, q=q, axis=None)
	if dist_percentile == 0: # or strategy == 6:
		q = 1
		print 'Recalculating dist_percentile..'
		#dist_percentile = np.percentile(a=dist_matrix, q=q)
		dist_percentile = np.percentile(a=dist_matrix.data, q=q, axis=None)

	print 'dist_percentile = ', dist_percentile
	old_n_clusters = 0
	old_non_clustered = 0

	# list to save labels from all iterations, so we can later pick the best clustering
	res_from_diff_params = {}
	nr_clusters_from_diff_params = {}
	non_clustered_from_diff_params = {}
	distribution_from_diff_params = {}
	best_iteration = -1
	sec_best_iteration = -1
	n = dist_matrix.shape[0]
	min_non_clusterd = n
	s_min_non_clusterd = n
	min_std_dev = n
	sec_threshold = 0.0001
	n_iterations = 49  		# must be an odd number 
	eps_list = get_eps_list_geom(mid=dist_percentile, length=n_iterations, strategy=strategy)

	# If the distance graph already has several connected components, their
	# labels are used directly instead of running MSTClustering.
	pure_diagonal = False
	n_components, p_d_labels = connected_components(dist_matrix, directed=False)
	if n_components > 1:
		print 'MST: pure diagonal found...'
		pure_diagonal = True


	# cluster the data with MST ---------------------------------------------
	for iteration in range(n_iterations):
		gc.collect()
		if dist_percentile == 0:
			print 'dist_percentile = %i, -> we cannot use MST for clustering this instance.' %dist_percentile
			break
		# eps is in range: [dist_percentile - 0.5, dist_percentile + 0.5] but with geometric progression
		eps = eps_list[iteration]

		if eps <= 0:
			continue
		if eps >= 1:
			break	
		# for distance strategy 1: 0.054...
		#eps = 0.1 + (iteration / 10) 
		min_samples = 4
		#print 'DEBUG: eps = ', eps
		labels = []
		print '_______________________________________________________'
		print 'Running MST...'
		print 'iteration= ', iteration
		print 'eps = ', eps
		print 'min_samples = ', min_samples
		
		if pure_diagonal:
			labels = p_d_labels
		else:
			# cutoff_scale is min size of edges to cut
			model = MSTClustering(cutoff_scale=eps, min_cluster_size = min_samples, metric = "precomputed")
			labels = model.fit_predict(dist_matrix)
			
		#print 'labels = ', labels
		#raise Exception('Wait....')

		# Label -1 is counted as "non clustered", not as a cluster.
		n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
		
		num_per_cluster = {}
		for i in range(n_clusters):
			num_per_cluster[i] = 0

		for label in labels:
			for i in range(n_clusters):
				if label == i:
					num_per_cluster[i] += 1; 
		non_clustered = 0;			
		for label in labels:
			if label == -1:
				non_clustered += 1	
		
		# criteria for skiping or breaking the loop ---------------------------------------------
		# skip the iteration if the number of clusters is as before
		if iteration == 0:
			old_n_clusters = n_clusters
			old_non_clustered = non_clustered
		if n_clusters == old_n_clusters and non_clustered == old_non_clustered and iteration > 0:
			print 'Same clustering...'
			continue
		old_n_clusters = n_clusters
		old_non_clustered = non_clustered
		if n_clusters == 1 and non_clustered == 0:
			print 'Stopping because bigger EPS will be the same.'
			break
		# ---------------------------------------------------------------------------------------
		# display some information
		print 'Estimated number of clusters: ',  n_clusters		
		print 'Number of points per cluster: ', num_per_cluster
		print 'Number of non clustered points:', non_clustered
		#draw(A=sim_matrix, colors=labels)
		# ---------------------------------------------------------------------------------------
		sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
		#print 'DEBUG:'
		#print 'column_labels = ', column_labels
		#print 'sotred_labels = ', sotred_labels
		#save_matrix_fig(sorted_data, res_folder, file + '_B_dec' +  str(iteration))

		# pull down the points which have non-zero value that colides with points from other clusters
		sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, 
																				sotred_labels, sorted_names, column_labels)

		# Recount clusters and their sizes from the post-processed labels;
		# this time -1 gets its own entry in num_per_cluster.
		num_per_cluster = {}
		n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
		if -1 in sotred_labels2:
			all_clusters_list = range(-1, n_clusters)
		else:
			all_clusters_list = range(n_clusters)

		for i in all_clusters_list:
			num_per_cluster[i] = 0

		for label in sotred_labels2:
			for i in all_clusters_list:
				if label == i:
					num_per_cluster[i] += 1; 
		non_clustered = 0;			
		for label in sotred_labels2:
			if label == -1:
				non_clustered += 1	
		print 'Estimated number of clusters after removal: ',  n_clusters
		print 'Number of points per cluster after removal: ', num_per_cluster
		print 'Number of non clustered points after removal:', non_clustered
		if 0 in num_per_cluster.values():
			print 'TIME TO DEBUG:'
			print 'sotred_labels2 = ', sotred_labels2

		# save picture of end matrix
		#save_matrix_fig(sorted_data2, res_folder, file + '_A_dec' +  str(iteration))
		#if res2_folder <> 'none':
			#save_matrix_fig(sorted_data2, res2_folder, file + '_A_dec' +  str(iteration))
		# find the best iteration, so we only save the best one --------------------------
		label_name_pairs = zip(sotred_labels2, sorted_names2)
		if non_clustered < min_non_clusterd:
			res_from_diff_params[iteration] = label_name_pairs
			nr_clusters_from_diff_params[iteration] = n_clusters
			non_clustered_from_diff_params[iteration] = non_clustered
			distribution_from_diff_params[iteration] = num_per_cluster
			min_non_clusterd = non_clustered
			best_iteration = iteration
			print 'this is best iteration currently'
		# find the best iteration (according variance of cluster sizes), ----------------
		# so we only save the best one 
		temp_num_per_cluster = num_per_cluster.copy()
		if -1 in temp_num_per_cluster.keys():
			del temp_num_per_cluster[-1]
		if len(temp_num_per_cluster.values()) > 1:
			std_dev = np.std(temp_num_per_cluster.values())	
			mean = np.mean(temp_num_per_cluster.values())	
			rel_std_dev = std_dev / mean
			# NOTE(review): non_clustered/n divides two ints; under Python 2
			# without true division this is 0 whenever non_clustered < n,
			# which would zero the score - TODO confirm.
			rel_std_dev *= pow(non_clustered/n, 2)
			print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
			std_dev = rel_std_dev
			# we accept the iteration if adjusted rel_std_dev is smaller, or 
			# if it is within the threshold and number of nonclustered points is smaller
			if (std_dev - min_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
				sec_criteria_fulfiled = True
			else:
				sec_criteria_fulfiled = False
			if std_dev < min_std_dev or sec_criteria_fulfiled:
				res_from_diff_params[iteration] = label_name_pairs
				nr_clusters_from_diff_params[iteration] = n_clusters
				non_clustered_from_diff_params[iteration] = non_clustered
				distribution_from_diff_params[iteration] = num_per_cluster
				min_std_dev = std_dev
				s_min_non_clusterd = non_clustered
				sec_best_iteration = iteration
				print 'this is second best iteration currently'	

		# The connected-component labels do not depend on eps, so one pass is enough.
		if pure_diagonal:
			break
		# ----------------------------------------------------------------------------------
		print '_______________________________________________________'
				

	# Defaults that are returned when no iteration qualified.
	best_found = False	
	best_n_clusters = 0
	best_non_clusterd = data.shape[0]
	best_distro = {-1:data.shape[0]}
	best_dec = ''	# name of dec file for best iteration

	s_best_found = False
	s_best_n_clusters = 0
	s_best_non_clusterd = data.shape[0]
	s_best_distro = {-1:data.shape[0]}
	s_dec = ''		# name of dec file for second best iteration

	# save .dec from best iteration
	print 'best_iteration= ', best_iteration
	print 'sec best iteration = ', sec_best_iteration
	if best_iteration >= 0:
		best_found = True
		best_n_clusters = nr_clusters_from_diff_params[best_iteration]
		best_non_clusterd = non_clustered_from_diff_params[best_iteration]
		best_distro = distribution_from_diff_params[best_iteration]
		best_dec = file + '_mst_' + str(best_n_clusters) + '_' + str(best_non_clusterd) +'_dist'+str(strategy)
		dec.write(path = res_folder, filename = best_dec, label_name_pairs = res_from_diff_params[best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
	if sec_best_iteration >= 0:
		# s_best_found stays False when both criteria picked the same
		# iteration, but the second .dec file is written either way.
		if sec_best_iteration <> best_iteration:
			s_best_found = True
		s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
		s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
		s_best_distro = distribution_from_diff_params[sec_best_iteration]
		s_dec = file + '_mstSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd) +'_dist'+str(strategy)
		dec.write(path = res_folder, filename = s_dec, label_name_pairs = res_from_diff_params[sec_best_iteration])
		print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)	
	print '_______________________________________________________'
	print '_______________________________________________________'	
	gc.collect()
	return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
			s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec
Example #18
0
                                       ['lightblue', model.labels_]):
        segments = model.get_graph_segments(full_graph=full_graph)
        axi.plot(segments[0], segments[1], '-ok', zorder=1, lw=1)
        axi.scatter(X[:, 0], X[:, 1], c=colors, cmap=cmap, zorder=2)
        axi.axis('tight')

    ax[0].set_title('Full Minimum Spanning Tree', size=16)
    ax[1].set_title('Trimmed Minimum Spanning Tree', size=16)


# Create 100 synthetic points around 5 centers. No random_state is given,
# so the data differs from run to run.
X, y = make_blobs(100, centers=5, cluster_std=0.90)
print(X)

# predict the labels with the MST algorithm
model = MSTClustering(cutoff_scale=1.5, approximate=True, n_neighbors=100)
labels = model.fit_predict(X)
# np.bincount needs non-negative integers, so this assumes that no label
# is -1 here - TODO confirm.
counts = np.bincount(labels)
print("No. of clusters: ")
clusters = len(counts)
print(len(counts))
print("No. of elements in each Clusters: ")
print(counts)

# plot the results
plt.scatter(X[0:, 0], X[0:, 1], marker='o', c=labels, cmap='rainbow')
plt.show()
# plot the full and the trimmed spanning tree of the fitted model
plot_mst(model)

wcss = []
Example #19
0
 def _check_n(n):
     """With ``cutoff=n`` the approximate fit must yield ``n + 1`` distinct labels."""
     model = MSTClustering(cutoff=n, n_neighbors=2, approximate=True)
     distinct = np.unique(model.fit_predict(X))
     assert_equal(len(distinct), n + 1)
Example #20
0
 def _check_n(n):
     """With ``cutoff=n`` the prediction must contain ``n + 1`` distinct labels."""
     predicted = MSTClustering(cutoff=n).fit_predict(X)
     n_labels = len(np.unique(predicted))
     assert_equal(n_labels, n + 1)
Example #21
0
def check_graph_segments_vals():
    """Check the exact segment endpoints for the 1-D points 0, 1, 4, 9, 16."""
    squares = (np.arange(5) ** 2)[:, np.newaxis]
    fitted = MSTClustering(cutoff=0).fit(squares)
    segs = fitted.get_graph_segments()

    # A single feature column gives a single coordinate array.
    assert len(segs) == 1
    first_row = [0, 4, 4, 9]
    second_row = [1, 1, 9, 16]
    assert_allclose(segs[0], [first_row, second_row])
Example #22
0
def get_mst(dataframe):
    """Fit ``MSTClustering(cutoff_scale=2)`` on ``dataframe`` and return its labels."""
    clusterer = MSTClustering(cutoff_scale=2)
    clusterer.fit(dataframe)
    fitted_labels = clusterer.labels_
    return fitted_labels
Example #23
0
        #print(k)
        #print(pkl[k[n]])
        #print(list(pkl)[0:5])

        X = pkl[k[n]]
        for i in range(0, len(X)):
            for j in range(0, len(X)):
                if i == j:
                    #print(i,j)
                    X[i, j] = maxa

        #print(X)
        #print(X[0])
        cut = 1.4
        from mst_clustering import MSTClustering
        model = MSTClustering(cutoff_scale=maxa * cut, approximate=False)
        labels = model.fit_predict(X)
        #print(labels)

        # model2 = MSTClustering(cutoff_scale=maxa*0.9, approximate=False)
        # labels2 = model2.fit_predict(X)
        # print(labels2)

        data_src = data + k[n]
        #print(data_src)
        c = 0
        for pic in os.listdir(data_src):

            #print(pic)
            img = cv2.imread(os.path.join(data_src, pic))
            #print(labels[c])
Example #24
0
 def cluster(self):
     """Cluster ``self.positions`` and store the labels in ``self.colors``.

     ``self.classifyer`` is used as the ``cutoff_scale`` of the exact
     (non-approximate) MST clustering.
     """
     clusterer = MSTClustering(approximate=False,
                               cutoff_scale=self.classifyer)
     point_labels = clusterer.fit_predict(self.positions)
     self.colors = point_labels
Example #25
0
def cluster_positions(positions, plots=False, cutoff_scale_min=120,
                      cutoff_scale_max=350, cutoff_scale_resolution=150,
                      n_neighbors_max=5, min_cluster_size=4):
    """
    Find clusters in the positions produced by
    `~shampoo.track2d.locate_from_hologram` using a Minimum Spanning Tree.

    A grid search over ``(cutoff_scale, n_neighbors)`` scores every
    clustering; the parameters with the lowest score are used for the final
    clustering. The chosen ``n_neighbors`` and ``cutoff_scale`` are printed.

    Parameters
    ----------
    positions : `~numpy.ndarray`
        Positions for each detected specimen in each frame, with values
        specified by `~shampoo.track2d.locate_from_hologram`. Only the first
        four columns are used.
    plots : bool (optional)
        Plot the scores for the grid search in ``(cutoff_scale, n_neighbors)``
        space and the best clusters
    cutoff_scale_min, cutoff_scale_max : float (optional)
        Lower and upper end of the ``cutoff_scale`` grid.
    cutoff_scale_resolution : int (optional)
        Number of ``cutoff_scale`` values in the grid.
    n_neighbors_max : int (optional)
        Exclusive upper bound of the ``n_neighbors`` grid, which runs over
        ``1 .. n_neighbors_max - 1``.
    min_cluster_size : int (optional)
        Minimum cluster size, used for the grid search and for the final
        clustering.

    Returns
    -------
    labels : `~numpy.ndarray`
        Cluster labels for each position in ``positions``. `-1` represents
        positions without a cluster.

    Raises
    ------
    ValueError
        If no parameter combination produces a finite score (for example
        when no clustering in the grid contains any cluster).
    """
    # Scale the time and max_intensity dims similarly to the spatial dimensions
    # np.ptp is used because the ndarray.ptp method no longer exists in
    # NumPy 2.0.
    X = positions[:, 0:4].copy()
    spatial_range = np.ptp(positions[:, 1])
    X[:, 0] *= 5 * spatial_range / np.ptp(positions[:, 0])
    X[:, 3] *= spatial_range / np.ptp(positions[:, 3])

    # Grid search in (cutoff_scales, n_neighbors) for the best clustering params
    cutoff_scales = np.linspace(cutoff_scale_min, cutoff_scale_max,
                                cutoff_scale_resolution)
    n_neighbors = np.arange(1, n_neighbors_max)
    scores = np.full((len(cutoff_scales), len(n_neighbors)), np.inf,
                     dtype=np.float64)

    for i, cutoff_scale in enumerate(cutoff_scales):
        for j, k in enumerate(n_neighbors):
            model = MSTClustering(cutoff_scale=cutoff_scale,
                                  approximate=True, n_neighbors=k,
                                  min_cluster_size=min_cluster_size)
            labels = model.fit_predict(X)

            # Per cluster: spread along the second principal axis relative
            # to the extent along the first one.
            distance_stds = []
            for label in set(labels):
                if label == -1:
                    continue
                members = X[labels == label, 0:3]
                pca = PCA(n_components=3)
                pca.fit(members)
                X_prime = pca.transform(members)
                distance_stds.append(X_prime[:, 1].std() /
                                     np.ptp(X_prime[:, 0]))

            n_labeled = np.count_nonzero(labels != -1)
            if distance_stds and n_labeled:
                f_labeled = n_labeled / float(len(labels))
                scores[i, j] = np.mean(distance_stds) / f_labeled
            # Otherwise the score stays at +inf: a clustering without any
            # cluster must never be selected as the best one.

    # NaN (e.g. from a zero-extent cluster) would poison the minimum search.
    scores[~np.isfinite(scores)] = np.inf
    if not np.isfinite(scores).any():
        raise ValueError("No (cutoff_scale, n_neighbors) combination "
                         "produced a finite clustering score.")

    # With the best clustering parameters, label the clusters
    # argmin returns the first minimum in row-major order.
    x_min_ind, y_min_ind = np.unravel_index(np.argmin(scores), scores.shape)
    n_neighbors_min = n_neighbors[y_min_ind]
    best_cutoff_scale = cutoff_scales[x_min_ind]
    print(n_neighbors_min, best_cutoff_scale)

    # Use the caller's min_cluster_size here as well, so the final result
    # matches the grid point that was scored.
    model = MSTClustering(cutoff_scale=best_cutoff_scale, approximate=True,
                          n_neighbors=n_neighbors_min,
                          min_cluster_size=min_cluster_size)
    labels = model.fit_predict(X)

    if plots:
        # Plot the scores in (cutoff_scales, n_neighbors) space
        fig, ax = plt.subplots(figsize=(16, 10))
        ax.imshow(np.log(scores).T, interpolation='nearest', origin='lower',
                  cmap=plt.cm.viridis)
        ax.set_xticks(range(len(cutoff_scales))[::5])
        ax.set_xticklabels(["{0:.2f}".format(cutoff_scale)
                            for cutoff_scale in cutoff_scales[::5]])

        ax.set_yticks(range(len(n_neighbors)))
        ax.set_yticklabels(range(1, len(n_neighbors)+1))

        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_ha('right')

        ax.set_xlabel('cutoff')
        ax.set_ylabel('n_neighbors')
        ax.set_aspect(10)

        # Plot the best clusters
        fig, ax = plt.subplots(1, 3, figsize=(16, 6))

        kwargs = dict(s=100, alpha=0.6, edgecolor='none', cmap=plt.cm.Spectral,
                      c=labels)
        ax[0].scatter(X[:, 0], X[:, 1], **kwargs)
        ax[1].scatter(X[:, 0], X[:, 2], **kwargs)
        ax[2].scatter(X[:, 1], X[:, 2], **kwargs)

        ax[0].set(xlabel='t', ylabel='x')
        ax[1].set(xlabel='t', ylabel='y')
        ax[2].set(xlabel='x', ylabel='y')

        # Overlay the edges of the trimmed spanning tree.
        segments = model.get_graph_segments(full_graph=False)
        ax[0].plot(segments[0], segments[1], '-k')
        ax[1].plot(segments[0], segments[2], '-k')
        ax[2].plot(segments[1], segments[2], '-k')

        fig.tight_layout()

        plt.show()

    return labels
Example #26
0
class MST:
    """ Minimum Spanning Tree Class
    
    Compute MST for a set of input points using the MSTClustering 
    code from jakevdp, calculate branch lengths from the MST and 
    generate plots of the MST and cumulative distribution of branch 
    lengths.
    
    ---- Inputs ----
    data frame "df", which has two columns present:
     - ra: right ascension (deg)
     - dec: declination (deg)
    
    cutoff_scale (float): minimum size of edges, also known as the 
                          critical branch length. All edges larger 
                          than cutoff_scale will be removed.
    
    min_cluster_size (int): min number of galaxies in a cluster.
    
    n_neighbors (int): maximum number of neighbors of each point 
    used for approximate Euclidean MST algorithm.
    
    ---- Attributes ----
    labels: integer specifying the structure to which a given galaxy 
            has been assigned. It will have a -1 if no membership was 
            assigned.
            
    segments: sets of ra, dec coordinates for the MST branch segments
    seps: base-10 log of branch lengths (in degrees)
    
    """
    def __init__(self,
                 df,
                 cutoff_scale=None,
                 min_cluster_size=None,
                 n_neighbors=None,
                 set_mst=None,
                 labels=None,
                 segments=None,
                 seps=None):
        """Fit the MST clustering on the ``ra``/``dec`` columns of ``df``.

        ``set_mst``, ``labels``, ``segments`` and ``seps`` are accepted for
        backward compatibility only; they are never read, and the attributes
        of the same names are always computed here.
        """
        self.df = df
        self.cutoff_scale = cutoff_scale
        self.min_cluster_size = min_cluster_size
        self.n_neighbors = n_neighbors
        self.set_mst = MSTClustering(cutoff_scale=cutoff_scale,
                                     min_cluster_size=min_cluster_size,
                                     n_neighbors=n_neighbors)
        pos = np.array([list(i) for i in zip(df.ra, df.dec)])
        self.labels = self.set_mst.fit_predict(pos)
        self.segments = self.set_mst.get_graph_segments(full_graph=True)
        # get_sep_mst reads self.segments, so it must run last.
        self.seps = self.get_sep_mst()

    def get_sep_mst(self):
        """Calculate branch lengths (in base-10 log(degrees))
        from the MST segments.

        ``self.segments[0]`` holds the ra values and ``self.segments[1]``
        the dec values; row 0 is one end of each branch and row 1 the other.
        """
        mst_coord0_ra = np.asarray(self.segments[0][0])
        mst_coord1_ra = np.asarray(self.segments[0][1])
        mst_coord0_dec = np.asarray(self.segments[1][0])
        mst_coord1_dec = np.asarray(self.segments[1][1])
        c0 = SkyCoord(mst_coord0_ra, mst_coord0_dec, unit=u.deg)
        c1 = SkyCoord(mst_coord1_ra, mst_coord1_dec, unit=u.deg)
        return np.log10(c0.separation(c1).degree)

    def plot_mst(self, model, cmap='rainbow', *args, **kwargs):
        """Plot the MST diagram (left) and the labeled structures
        identified from the MST (right).

        Parameters
        ----------
        model : fitted clustering model providing ``X_fit_``, ``labels_``
            and ``get_graph_segments``.
        cmap : colormap used for the labeled points.
        **kwargs : optional ``xlim``, ``ylim``, ``s`` (marker size, default
            8), ``savefigure`` (default False) and ``figname`` (default
            ``'MST_figure.png'``).
        """
        xlim = kwargs.get('xlim', None)
        ylim = kwargs.get('ylim', None)
        ssize = kwargs.get('s', 8)
        savefigure = kwargs.get('savefigure', False)
        figname = kwargs.get('figname', 'MST_figure.png')
        X = model.X_fit_

        # One little hack to get more clear color differentiation between the
        # points with cluster membership and without. Add 50(?) to the label numbers
        # of those that are cluster members.
        # The offset is applied to a copy: shifting model.labels_ in place
        # would change the model's labels a little more on every call.
        member_colors = np.array(model.labels_, copy=True)
        member_colors[member_colors > -1] += 50

        fig, ax = plt.subplots(1, 2, figsize=(20, 7), sharex=True, sharey=True)
        for axi, full_graph, colors in zip(ax, [True, False],
                                           ['lightblue', member_colors]):
            segments = model.get_graph_segments(full_graph=full_graph)
            axi.plot(segments[0], segments[1], '-k', zorder=1, lw=1)
            # Label the panel being drawn; plt.xlabel/plt.ylabel would only
            # ever reach the current (last created) axes.
            axi.set_xlabel('Right Ascension (deg)', size=14)
            axi.set_ylabel('Declination (deg)', size=14)
            axi.scatter(X[:, 0],
                        X[:, 1],
                        c=colors,
                        cmap=cmap,
                        zorder=2,
                        s=ssize)
            axi.axis('tight')
            if xlim is not None:
                axi.set_xlim(xlim)
            if ylim is not None:
                axi.set_ylim(ylim)

        ax[0].set_title('Full Minimum Spanning Tree', size=16)
        ax[1].set_title('Trimmed Minimum Spanning Tree', size=16)

        # Leave an option to save all the plots to output PNG files.
        if savefigure:
            fig.savefig(figname, bbox_inches='tight', dpi=250)

    def plot_mst_cumul(self, *args, **kwargs):
        """Plot the cumulative distribution of MST branch lengths.

        Optional keyword arguments: ``savefigure`` (default False) and
        ``figname`` (default ``'MST_cumul_dist.png'``).
        """
        savefigure = kwargs.get('savefigure', False)
        figname = kwargs.get('figname', 'MST_cumul_dist.png')
        # NOTE(review): sns.distplot is deprecated in recent seaborn
        # releases; kept because there is no drop-in equivalent.
        sns.distplot(self.seps,
                     hist_kws=dict(cumulative=False),
                     kde_kws=dict(cumulative=True))
        plt.xlabel('log$_{10}$ (MST branch length)', fontsize=15)
        plt.ylabel('Norm. Counts/Cumul. Dist.', fontsize=15)

        # Leave an option to save all the plots to output PNG files.
        if savefigure:
            plt.savefig(figname, bbox_inches='tight')
Example #27
0
    for axi, full_graph, colors in zip(ax, [True, False],
                                       ['lightblue', model.labels_]):
        segments = model.get_graph_segments(full_graph=full_graph)
        axi.plot(segments[0], segments[1], '-k', zorder=1, lw=1)
        axi.scatter(X[:, 0], X[:, 1], c=colors, cmap=cmap, zorder=2)
        axi.axis('tight')

    ax[0].set_title('Full Minimum Spanning Tree', size=16)
    ax[1].set_title('Trimmed Minimum Spanning Tree', size=16)


# Generate 200 points around 4 centers and show the raw data.
X, y = make_blobs(200, centers=4, random_state=42)
plt.scatter(X[:, 0], X[:, 1], c='lightblue')
plt.show()

# Cluster with the exact (non-approximate) MST and color points by label.
model = MSTClustering(cutoff_scale=2, approximate=False)
labels = model.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='rainbow')
plt.show()

plot_minimum_spanning_tree(model)
plt.show()

# Add 200 uniform noise points in [-14, 14) x [-14, 14); the random seed is
# derived from the last blob label.
rng = np.random.RandomState(int(100 * y[-1]))
noise = -14 + 28 * rng.rand(200, 2)

# Noise points get the label -1.
X_noisy = np.vstack([X, noise])
y_noisy = np.concatenate([y, np.full(200, -1, dtype=int)])

# NOTE(review): 'spectral_r' is not a colormap name in current matplotlib
# releases, and c is a single color here so a colormap has no effect -
# TODO confirm and update.
plt.scatter(X_noisy[:, 0], X_noisy[:, 1], c='lightblue', cmap='spectral_r')
plt.xlim(-15, 15)