import math

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection
from nose.tools import assert_equals, assert_true
from sklearn.datasets import make_blobs

# Assumed module layout: the tests below use the project's clustering (cl),
# estimation (est), analysis (ana) and example-data (ex) modules under these
# aliases; adjust the import paths to the actual package structure.
from mcmm import analysis as ana
from mcmm import clustering as cl
from mcmm import estimation as est
from mcmm import example as ex


def test_find_cluster_centers_example1():
    """Test the clustering on the built-in example data.

    In the end we check whether the error (the sum of the distances of all
    points to their nearest cluster center) is smaller than expected.
    """
    n = 5000  # number of data points
    k = 15    # number of cluster centers
    x = ex.generate_test_data(n, 1)
    data = x[0]
    plt.scatter(data[:, 0], data[:, 1])
    clustering = cl.KMeans(data, k, method='kmeans++')
    cluster_centers = clustering.cluster_centers
    # sum up, for every point, the distance to its nearest cluster center
    errorNew = 0
    for i in range(0, n):
        dist = n  # larger than any distance that occurs in the data
        for l in range(0, k):
            distNew = np.linalg.norm(data[i, :] - cluster_centers[l, :])
            if dist > distNew:
                dist = distNew
        errorNew = errorNew + dist
    print(errorNew / n)
    print(4 / k)
    assert_true(errorNew / n < 4 / k)
    plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='r')
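# The double loop above computes the mean nearest-center distance in O(n * k)
# Python iterations.  A minimal vectorized sketch of the same quantity,
# assuming plain NumPy arrays; the helper name is hypothetical:
def _mean_nearest_center_distance(data, centers):
    """Mean Euclidean distance of each point to its nearest center."""
    # pairwise distances, shape (n_points, n_centers)
    dists = np.linalg.norm(data[:, np.newaxis, :] - centers[np.newaxis, :, :],
                           axis=2)
    return dists.min(axis=1).mean()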
def test_find_cluster_centers_R2():
    """Check that the cluster centers are reasonable for clustering in R^2."""
    n = 50        # number of data points in x- and y-direction
    k = 5         # number of cluster centers per direction when generating data
    k2 = k        # number of cluster centers per direction when clustering
    factor = 0.3  # how much the data is perturbed
    data = np.zeros((n * n, 2))
    for i in range(0, n):
        for j in range(0, n):
            data[n * i + j, 0] = i % k + factor * np.random.normal() * math.pow(
                -1, int(2 * np.random.rand()))
            data[n * i + j, 1] = j % k + factor * np.random.normal() * math.pow(
                -1, int(2 * np.random.rand()))
    plt.scatter(data[:, 0], data[:, 1])
    # run the clustering several times and keep the best result
    anzahl = k * 4     # number of clustering restarts
    error = n * n * n  # larger than any error that can occur
    for t in range(0, anzahl):
        errorNew = 0
        clusteringNew = cl.KMeans(data, k2 * k2)
        cluster_centersNew = clusteringNew.cluster_centers
        for i in range(0, n):
            for j in range(0, n):
                dist = n
                for l in range(0, k2 * k2):
                    distNew = np.linalg.norm(data[n * i + j, :]
                                             - cluster_centersNew[l, :])
                    if dist > distNew:
                        dist = distNew
                errorNew = errorNew + dist
        print(errorNew)
        print(error)
        if errorNew < error:
            error = errorNew
            cluster_centers = cluster_centersNew
    plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='r')
    # check that every lattice point (i, j) has a cluster center nearby
    for i in range(0, k):
        for j in range(0, k):
            check = 0
            for l in range(0, k2 * k2):
                if np.linalg.norm([i, j] - cluster_centers[l, :]) < 0.2:
                    check = 1
            assert_equals(check, 1)
def test_clustering_estimation_bigger_markov():
    """Generate randomly perturbed data from a transition matrix, cluster it,
    then check that the estimated matrix behaves as expected.

    Same as test_clustering_estimation_simple_markov(), but with an
    arbitrary, bigger random matrix.
    """
    m = 10  # size of the original transition matrix, free to choose
    A = np.random.rand(m, m) + 0.001
    for i, row in enumerate(A):
        A[i] = row / sum(row)
    n = 100000  # number of random evaluations of the Markov process
    states = np.zeros((n, 1))
    states[0, 0] = 1
    factor = 0.001  # how much the states are perturbed
    for i in range(1, n):
        # draw the next state: accumulate the current row of A until the
        # uniform sample zahl is exceeded
        check = 0
        summe = 0
        zahl = np.random.rand(1)
        while check < m:
            summe = summe + A[int(states[i - 1, 0]), check]
            if zahl < summe:
                states[i, 0] = check + factor * np.random.rand()
                check = m + 1
            check = check + 1
    # do the clustering
    clustering = cl.KMeans(states, m, method='kmeans++')
    cluster_centers = clustering.cluster_centers
    cluster_labels = np.array(clustering.cluster_labels)
    # do the estimation
    estimator = est.Estimator(cluster_labels, 1, 1)
    matrix = estimator.transition_matrix
    # sort the estimated states by their cluster centers so that the
    # estimated matrix becomes comparable with A; Q collects the permutation
    Q = np.identity(m)
    Qt = Q
    for i in range(0, m):
        index = np.argmax(cluster_centers)
        cluster_centers[index] = cluster_centers[index] - 10  # mark as visited
        P = np.identity(m)
        # swap row and column index with m - 1 - i
        if m - 1 - i != index:
            P[m - 1 - i, m - 1 - i] = 0
            P[index, index] = 0
            P[index, m - 1 - i] = 1
            P[m - 1 - i, index] = 1
            z = cluster_centers[m - 1 - i, 0]
            cluster_centers[m - 1 - i] = cluster_centers[index, 0]
            cluster_centers[index, 0] = z
        Q = P.dot(Q)
        Qt = Qt.dot(P)
    matrix = Q.dot(matrix).dot(Qt)
    print(np.linalg.norm(A - matrix))
    np.testing.assert_allclose(matrix, A, atol=0.05, rtol=0.1)
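# The while loop above draws the next state by accumulating the current row
# of A until the uniform sample is exceeded.  A sketch of the same draw,
# assuming plain NumPy; the helper name is hypothetical:
def _draw_next_state(A, current_state):
    """Sample the successor of `current_state` from transition matrix A."""
    # index of the first cumulative row probability exceeding the sample
    return int(np.searchsorted(np.cumsum(A[current_state]), np.random.rand()))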
def test_find_cluster_centers():
    """Check that the cluster centers are reasonable for clustering in R^1."""
    n = 200        # number of data points
    k = 15         # number of cluster centers
    factor = 0.25  # how much the data is perturbed
    data = np.zeros((n, 1))
    for i in range(0, n):
        data[i] = i % k + factor * np.random.rand() * math.pow(
            -1, int(2 * np.random.rand()))
    plt.scatter(data[:, 0], np.zeros((n, 1)))
    plt.scatter(data[:, 0], np.ones((n, 1)))
    plt.scatter(data[:, 0], 2 * np.ones((n, 1)))
    clustering = cl.KMeans(data, k)
    clustering2 = cl.KMeans(data, k, method='kmeans++')
    clustering3 = cl.KMeans(data, k, method='kmeans++')
    cluster_centers = clustering.cluster_centers
    cluster_centers2 = clustering2.cluster_centers
    cluster_centers3 = clustering3.cluster_centers
    plt.scatter(cluster_centers[:], np.zeros((k, 1)), c='r')
    plt.scatter(cluster_centers2[:], np.ones((k, 1)), c='r')
    plt.scatter(cluster_centers3[:], 2 * np.ones((k, 1)), c='r')
    # check that the cluster centers are reasonable: the j-th smallest
    # center should round to j, i.e. one center sits near every integer
    for j in range(0, k):
        index = np.argmin(cluster_centers)
        zahl = int(cluster_centers[index])
        if cluster_centers[index] - zahl > 0.5:
            zahl = zahl + 1
        assert_equals(zahl, j)
        cluster_centers[index] = cluster_centers[index] + k + 1  # mark as visited
def test_clustering_estimation_simple_markov():
    """Generate randomly perturbed data from a transition matrix, cluster it,
    then check that the estimated matrix behaves as expected.
    """
    A = np.array([[0.5, 0.4, 0.1, 0],
                  [0.2, 0.8, 0, 0],
                  [0, 0.05, 0.25, 0.7],
                  [0, 0, 0.75, 0.25]])
    for i, row in enumerate(A):
        A[i] = row / sum(row)
    n = 10000  # number of random evaluations of the Markov process
    states = np.zeros((n, 1))
    states[0, 0] = 1
    factor = 0.001  # how much the states are perturbed
    for i in range(1, n):
        # draw the next state by comparing a uniform sample with the
        # partial sums of the current row of A
        zahl = np.random.rand(1)
        row = A[int(states[i - 1, 0])]
        if zahl < row[0]:
            states[i, 0] = 0 + factor * np.random.rand()
        elif zahl < row[0] + row[1]:
            states[i, 0] = 1 + factor * np.random.rand()
        elif zahl < row[0] + row[1] + row[2]:
            states[i, 0] = 2 + factor * np.random.rand()
        else:
            states[i, 0] = 3 + factor * np.random.rand()
    # do the clustering
    clustering = cl.KMeans(states, 4, method='kmeans++')
    cluster_centers = clustering.cluster_centers
    cluster_labels = np.array(clustering.cluster_labels)
    # do the estimation
    estimator = est.Estimator(cluster_labels, 1, 1)
    matrix = estimator.transition_matrix
    # sort the estimated states by their cluster centers so that the
    # estimated matrix becomes comparable with A
    Q = np.identity(4)
    Qt = Q
    for i in range(0, 4):
        index = np.argmax(cluster_centers)
        cluster_centers[index] = cluster_centers[index] - 10  # mark as visited
        P = np.identity(4)
        # swap row and column index with 3 - i
        if 3 - i != index:
            P[3 - i, 3 - i] = 0
            P[index, index] = 0
            P[index, 3 - i] = 1
            P[3 - i, index] = 1
            z = cluster_centers[3 - i, 0]
            cluster_centers[3 - i] = cluster_centers[index, 0]
            cluster_centers[index, 0] = z
        Q = P.dot(Q)
        Qt = Qt.dot(P)
    matrix = Q.dot(matrix).dot(Qt)
    np.testing.assert_allclose(matrix, A, atol=0.05, rtol=0.1)
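# The permutation loop above relabels the estimated states: a permutation
# matrix Q transforms a transition matrix M into Q.dot(M).dot(Q.T).  A
# minimal sketch of the same alignment via np.argsort, assuming plain NumPy;
# the helper name is hypothetical:
def _sort_states_by_center(matrix, centers):
    """Reorder `matrix` so that its states follow ascending `centers`."""
    order = np.argsort(centers[:, 0])   # new position -> old state index
    Q = np.identity(len(order))[order]  # permutation matrix
    return Q.dot(matrix).dot(Q.T)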
def test_pcca_1():
    """Check PCCA with 4 states on 3 accumulation points in the data.

    We generate data in R^1 that accumulates at the three points 0, 1, 2.
    Then we apply PCCA with 4 pcca_states and check that it works as
    expected.
    """
    n = 1000      # number of data points
    kk = 3        # number of points where the data accumulates
    k = 10        # number of cluster centers
    factor = 0.1  # how much the data is perturbed
    data = np.zeros((n, 1))
    for i in range(0, n):
        data[i] = i % kk + factor * np.random.rand() * math.pow(
            -1, int(2 * np.random.rand()))
    clustering = cl.KMeans(data, k)
    cluster_centers = clustering.cluster_centers
    cluster_labels = clustering.cluster_labels
    estimator = est.Estimator(cluster_labels, 1, 1)
    matrix = estimator.reversible_transition_matrix
    msm = ana.MarkovStateModel(matrix)
    n_pcca_states = 4
    pcca_labels = msm.metastable_set_assignments(n_pcca_states)
    # cluster centers belonging to the same accumulation point must get the
    # same pcca label
    error = 0
    for j in range(0, kk):
        for i in range(0, k):
            if round(cluster_centers[i, 0]) == j:
                test = i
        for i in range(0, k):
            if np.abs(cluster_centers[i, 0] - cluster_centers[test, 0]) < 2 * factor:
                if not pcca_labels[i] == pcca_labels[test]:
                    error = 1
    print(error)
    assert_true(error == 0)
def kmeans_blobs_2d(n_samples, n_clusters, k, method='kmeans++', std=1):
    """Generate a random dataset with sklearn.datasets.make_blobs and
    visualize the project's KMeans clustering (cl.KMeans) via pyplot.

    Args:
        n_samples: number of observations in the dataset
        n_clusters: number of clusters in the dataset
        k: number of cluster centers to be determined by k-means
        method: the KMeans method, i.e. 'forgy' or 'kmeans++'
        std: the within-cluster standard deviation of the generated dataset
    """
    data = make_blobs(n_samples, 2, n_clusters, cluster_std=std)[0]
    kmeans = cl.KMeans(data, k, method=method)
    cluster_centers = kmeans.cluster_centers
    cluster_labels = kmeans.cluster_labels
    plt.scatter(data[:, 0], data[:, 1], c=cluster_labels)
    plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='r', s=50)
    plt.show()
def test_find_cluster_center_multiple_trajectories1():
    """Check that the clustering also works with multiple trajectories.

    Here we cluster several two-dimensional trajectories of the kind used in
    test_find_cluster_centers_example1(); the first two trajectories are
    shifted so that they do not lie on top of the others.
    """
    n = 2000     # number of data points per trajectory
    k = 30       # number of cluster centers
    n_traj = 10  # number of trajectories
    data = ex.generate_test_data(n, n_traj)
    # shift the first trajectory by (6, 6) and the second one by (6, 0)
    for i in range(0, n):
        for r in range(0, 2):
            data[0][i, r] = data[0][i, r] + 6
        data[1][i, 0] = data[1][i, 0] + 6
    for r in range(0, n_traj):
        plt.scatter(data[r][:, 0], data[r][:, 1], c='b')
    clustering = cl.KMeans(data, k, method='kmeans++')
    cluster_centers = clustering.cluster_centers
    plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='r')
def kmeans_blobs_3d(n_samples, n_clusters, k, method='kmeans++', std=1):
    """Generate a random dataset with sklearn.datasets.make_blobs and
    visualize the project's KMeans clustering (cl.KMeans) in 3D via pyplot.

    Args:
        n_samples: number of observations in the dataset
        n_clusters: number of clusters in the dataset
        k: number of cluster centers to be determined by k-means
        method: the KMeans method, i.e. 'forgy' or 'kmeans++'
        std: the within-cluster standard deviation of the generated dataset
    """
    data = make_blobs(n_samples, 3, n_clusters, cluster_std=std)[0]
    kmeans = cl.KMeans(data, k, method=method)
    cluster_centers = kmeans.cluster_centers
    cluster_labels = kmeans.cluster_labels
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=cluster_labels)
    ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], cluster_centers[:, 2],
               c='r', s=150, depthshade=False)
    plt.show()
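# Example invocations of the two visualization helpers above (the parameter
# values are arbitrary illustrations, not values prescribed by the project):
# kmeans_blobs_2d(n_samples=1000, n_clusters=3, k=3, std=0.5)
# kmeans_blobs_3d(n_samples=1000, n_clusters=4, k=4, std=0.8)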