Ejemplo n.º 1
0
def test_find_cluster_centers_example1():
    """Cluster the built-in example data and bound the mean clustering error.

    A single trajectory of ``n`` points is requested from
    ``ex.generate_test_data`` and clustered into ``k`` centers with the
    'kmeans++' method.  The test passes when the mean distance between a
    point and its nearest cluster center is smaller than ``4 / k``.
    """
    n = 5000  # number of data points
    k = 15  # number of cluster centers
    trajectories = ex.generate_test_data(n, 1)
    data = trajectories[0]
    plt.scatter(data[:, 0], data[:, 1])

    clustering = cl.KMeans(data, k, method='kmeans++')
    cluster_centers = clustering.cluster_centers
    cluster_labels = clustering.cluster_labels  # read, but not used by the check

    total_error = 0
    for row in range(n):
        # The running minimum starts at n, so a single point never
        # contributes more than n to the total.
        nearest = n
        for center in range(k):
            candidate = np.linalg.norm(data[row, :] - cluster_centers[center, :])
            if candidate < nearest:
                nearest = candidate
        total_error = total_error + nearest

    mean_error = total_error / n
    threshold = 4 / k
    print(mean_error)
    print(threshold)
    assert_equals(1 if mean_error < threshold else 0, 1)

    plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='r')
Ejemplo n.º 2
0
def test_find_cluster_centers_R2():
    """Check that clustering in R^2 finds the nodes of a perturbed grid.

    The data are ``n * n`` points whose coordinates are ``(i % k, j % k)``
    plus normally distributed noise scaled by ``factor``.  The clustering is
    repeated ``4 * k`` times and the run with the smallest summed distance
    of all points to their nearest center is kept.  Finally every grid node
    ``(i, j)`` with ``0 <= i, j < k`` must have a cluster center closer
    than 0.2.
    """
    n = 50  # number of data points in x- and y-direction
    k = 5  # grid nodes per direction used when generating the data
    k2 = k  # cluster centers per direction requested from the clustering
    factor = 0.3  # how much the data is perturbed
    n_centers = k2 * k2

    data = np.zeros((n * n, 2))
    for i in range(n):
        for j in range(n):
            row = n * i + j
            # Per coordinate the normal draw comes first, then the draw
            # that decides the sign.
            noise = np.random.normal()
            sign = math.pow(-1, int(2 * np.random.rand()))
            data[row, 0] = i % k + factor * noise * sign
            noise = np.random.normal()
            sign = math.pow(-1, int(2 * np.random.rand()))
            data[row, 1] = j % k + factor * noise * sign
    plt.scatter(data[:, 0], data[:, 1])

    # Repeat the clustering and keep the best run.
    repetitions = k * 4
    error = n * n * n  # upper bound: n*n points, each capped at distance n
    for _ in range(repetitions):
        errorNew = 0
        candidate = cl.KMeans(data, n_centers)
        candidate_centers = candidate.cluster_centers
        for row in range(n * n):
            nearest = n
            for c in range(n_centers):
                d = np.linalg.norm(data[row, :] - candidate_centers[c, :])
                if d < nearest:
                    nearest = d
            errorNew = errorNew + nearest
        print(errorNew)
        print(error)
        if errorNew < error:
            error = errorNew
            clustering = candidate
            cluster_centers = candidate_centers

    plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='r')

    # Every grid node needs at least one cluster center nearby.
    for i in range(k):
        for j in range(k):
            found = any(
                np.linalg.norm([i, j] - cluster_centers[c, :]) < 0.2
                for c in range(n_centers))
            assert_equals(1 if found else 0, 1)
Ejemplo n.º 3
0
def test_clustering_estimation_bigger_markov():
    """Recover a random transition matrix from a clustered, perturbed trajectory.

    A random row-stochastic ``m x m`` matrix ``A`` is generated and a Markov
    chain of ``n`` steps is sampled from it; every sampled state gets a small
    random offset added.  The trajectory is clustered with the 'kmeans++'
    method into ``m`` clusters and a transition matrix is estimated from the
    cluster labels.  Because cluster indices are arbitrary, the estimate is
    conjugated with a permutation that orders the clusters by their center
    value before it is compared with ``A`` (``atol=0.05``, ``rtol=0.1``).

    Same procedure as test_clustering_estimation_simple_markov(), but with an
    arbitrary bigger random matrix.
    """
    m = 10  # size of original transition matrix, free to choose
    # rand() is non-negative, so adding 0.001 keeps every entry strictly positive.
    A = np.random.rand(m, m) + 0.001
    # Normalise every row so that it sums to one.
    for i, row in enumerate(A):
        A[i] = row / sum(row)
    n = 100000  # number of random evaluations of the Markov process
    states = np.zeros((n, 1))
    states[0, 0] = 1  # the chain starts in state 1
    factor = 0.001  # scale of the offset added to every sampled state
    for i in range(1, n):
        check = 0  # candidate next state; doubles as the loop sentinel
        summe = 0  # running cumulative probability of the current row
        zahl = np.random.rand(1)  # uniform draw selecting the next state
        while check < m:
            # int() truncates the perturbed previous value back to its state index.
            summe = summe + A[int(states[i - 1, 0]), check]
            if zahl < summe:
                states[i, 0] = check + factor * np.random.rand()
                # Jump past m so that the while loop stops after this hit.
                check = m + 1
            check = check + 1
        # If the draw never falls below the cumulative sum, states[i, 0]
        # keeps its initial value 0.
    # do the clustering
    clustering = cl.KMeans(states, m, method='kmeans++')
    cluster_centers = clustering.cluster_centers
    cluster_labels = clustering.cluster_labels
    cluster_labels = np.array(cluster_labels)
    # do the estimation
    # NOTE(review): the meaning of the two trailing arguments is not visible
    # here -- confirm against est.Estimator.
    estimator = est.Estimator(cluster_labels, 1, 1)
    matrix = estimator.transition_matrix
    # Q accumulates the row permutation, Qt the matching column permutation.
    Q = np.identity(m)
    Qt = Q  # same object for now; both names are rebound, never mutated, below
    # Selection sort on the cluster centers: in pass i the largest remaining
    # center is moved to position m-1-i and the swap is recorded in P.
    for i in range(0, m):
        # argmax returns a flat index; it is used as a row index below
        # (assumes cluster_centers has a single column -- TODO confirm).
        index = np.argmax(cluster_centers)
        # Lower the selected center by 10 so later passes do not pick it again.
        # NOTE(review): this presumably relies on all centers lying within a
        # range narrower than 10 -- verify before choosing a larger m.
        cluster_centers[index] = cluster_centers[index] - 10
        P = np.identity(m)
        # permute row and column index with m-1-i
        if m - 1 - i != index:
            P[m - 1 - i, m - 1 - i] = 0
            P[index, index] = 0
            P[index, m - 1 - i] = 1
            P[m - 1 - i, index] = 1
        # Swap the two center entries so the array mirrors the permutation.
        z = cluster_centers[m - 1 - i, 0]
        cluster_centers[m - 1 - i] = cluster_centers[index, 0]
        cluster_centers[index, 0] = z
        Q = P.dot(Q)
        Qt = Qt.dot(P)
    # Apply the accumulated permutation to rows and columns of the estimate.
    matrix = Q.dot(matrix).dot(Qt)
    print(np.linalg.norm(A - matrix))
    np.testing.assert_allclose(matrix, A, atol=0.05, rtol=0.1)
Ejemplo n.º 4
0
def test_find_cluster_centers():
    """Cluster perturbed integer data in R^1 with three clustering runs.

    The data are ``n`` values ``i % k`` with a random offset of at most
    ``factor`` in either direction.  They are clustered once with the
    default method and twice with 'kmeans++'; data and centers of the three
    runs are plotted at heights 0, 1 and 2.

    NOTE(review): the final loop visits the centers of the first run in
    ascending order and rounds each to the nearest integer, but the rounded
    value is never asserted against anything -- confirm whether a check is
    missing here.
    """
    n = 200  # number of data points
    k = 15  # number of cluster centers
    factor = 0.25  # how much the data is perturbed
    data = np.zeros((n, 1))
    for i in range(n):
        magnitude = np.random.rand()
        sign = math.pow(-1, int(2 * np.random.rand()))
        data[i] = i % k + factor * magnitude * sign

    # One copy of the data per clustering run, stacked vertically.
    for level in range(3):
        plt.scatter(data[:, 0], level * np.ones((n, 1)))

    clustering = cl.KMeans(data, k)
    clustering2 = cl.KMeans(data, k, method='kmeans++')
    clustering3 = cl.KMeans(data, k, method='kmeans++')

    cluster_centers = clustering.cluster_centers
    cluster_labels = clustering.cluster_labels
    cluster_centers2 = clustering2.cluster_centers
    cluster_labels2 = clustering2.cluster_labels
    cluster_centers3 = clustering3.cluster_centers
    cluster_labels3 = clustering3.cluster_labels

    all_centers = (cluster_centers, cluster_centers2, cluster_centers3)
    for level, centers in enumerate(all_centers):
        plt.scatter(centers[:], level * np.ones((k, 1)), c='r')

    # Walk through the centers of the first run from smallest to largest.
    for _ in range(k):
        lowest = np.argmin(cluster_centers)
        rounded = int(cluster_centers[lowest])
        if cluster_centers[lowest] - rounded > 0.5:
            rounded = rounded + 1
        # Lift the visited center above all others so argmin skips it next time.
        cluster_centers[lowest] = cluster_centers[lowest] + k + 1
Ejemplo n.º 5
0
def test_clustering_estimation_simple_markov():
    """Recover a fixed 4x4 transition matrix from a clustered trajectory.

    A Markov chain of ``n`` steps is sampled from the row-normalised matrix
    ``A`` and every sampled state gets a small random offset.  The
    trajectory is clustered into four clusters with 'kmeans++', a transition
    matrix is estimated from the cluster labels, and the estimate is
    permuted so that clusters are ordered by their center value before it is
    compared with ``A`` (``atol=0.05``, ``rtol=0.1``).
    """
    A = np.array([[0.5, 0.4, 0.1, 0], [0.2, 0.8, 0, 0], [0, 0.05, 0.25, 0.7], [0, 0, 0.75, 0.25]])
    for r in range(len(A)):
        A[r] = A[r] / sum(A[r])

    n = 10000
    factor = 0.001  # scale of the offset added to every sampled state
    states = np.zeros((n, 1))
    states[0, 0] = 1
    for step in range(1, n):
        draw = np.random.rand(1)
        # int() truncates the perturbed previous value back to its state index.
        probs = A[int(states[step - 1, 0])]
        upto_0 = probs[0]
        upto_1 = upto_0 + probs[1]
        upto_2 = upto_1 + probs[2]
        if draw < upto_0:
            next_state = 0
        elif draw < upto_1:
            next_state = 1
        elif draw < upto_2:
            next_state = 2
        else:
            next_state = 3
        states[step, 0] = next_state + factor * np.random.rand()

    # do the clustering
    clustering = cl.KMeans(states, 4, method='kmeans++')
    cluster_centers = clustering.cluster_centers
    cluster_labels = clustering.cluster_labels
    cluster_labels = np.array(cluster_labels)

    # do the estimation
    estimator = est.Estimator(cluster_labels, 1, 1)
    matrix = estimator.transition_matrix

    # Selection sort on the centers; Q collects the row swaps and Qt the
    # matching column swaps.
    Q = np.identity(4)
    Qt = Q
    for i in range(4):
        target = 3 - i
        index = np.argmax(cluster_centers)
        # Lower the selected center so later passes do not pick it again.
        cluster_centers[index] = cluster_centers[index] - 10
        # Identity with rows `target` and `index` exchanged (a no-op when
        # both coincide).
        P = np.identity(4)
        P[[target, index]] = P[[index, target]]
        z = cluster_centers[target, 0]
        cluster_centers[target] = cluster_centers[index, 0]
        cluster_centers[index, 0] = z
        Q = P.dot(Q)
        Qt = Qt.dot(P)

    matrix = Q.dot(matrix).dot(Qt)
    np.testing.assert_allclose(matrix, A, atol=0.05, rtol=0.1)
Ejemplo n.º 6
0
def test_pcca_1():
    """Check PCCA with 4 states on data with 3 accumulation points.

    The data live in R^1 and accumulate around the points 0, 1 and 2.  They
    are clustered into ``k`` centers, a reversible transition matrix is
    estimated from the cluster labels, and the resulting Markov state model
    is asked for 4 metastable sets.  For every accumulation point, all
    cluster centers within ``2 * factor`` of a representative center must
    share that representative's metastable-set label.
    """
    n = 1000  # number of data points
    kk = 3  # number of points where the data accumulates
    k = 10  # number of cluster centers
    factor = 0.1  # how much the data is perturbed
    data = np.zeros((n, 1))
    for i in range(0, n):
        data[i] = i % kk + factor * np.random.rand() * math.pow(-1, int(2 * np.random.rand()))

    clustering = cl.KMeans(data, k)
    cluster_centers = clustering.cluster_centers
    cluster_labels = clustering.cluster_labels

    estimator = est.Estimator(cluster_labels, 1, 1)
    matrix = estimator.reversible_transition_matrix
    msm = ana.MarkovStateModel(matrix)

    n_pcca_states = 4
    # Indexed below with one entry per cluster center.
    pcca_labels = msm.metastable_set_assignments(n_pcca_states)

    error = 0
    for j in range(0, kk):
        # Reset per accumulation point: a leftover index from the previous
        # point would silently check the wrong group of centers.
        representative = None
        for i in range(0, k):
            if round(cluster_centers[i, 0]) == j:
                representative = i
        assert_true(representative is not None,
                    "no cluster center rounds to accumulation point %d" % j)
        for i in range(0, k):
            close = np.abs(cluster_centers[i, 0] -
                           cluster_centers[representative, 0]) < 2 * factor
            if close and pcca_labels[i] != pcca_labels[representative]:
                error = 1
    print(error)
    assert_true(error == 0)
Ejemplo n.º 7
0
def kmeans_blobs_2d(n_samples, n_clusters, k, method='kmeans++', std=1):
    """Cluster a random 2-D blob data set and show the result with pyplot.

    The data set is generated with ``make_blobs``; the points are coloured
    by their cluster label and the cluster centers are drawn in red.

    Args:
        n_samples: number of observations in the data set
        n_clusters: number of blobs in the generated data set
        k: number of cluster centers to be determined by k-means
        method: the KMeans method, i.e. 'forgy' or 'kmeans++'
        std: standard deviation within each generated blob
    """
    # `centers` is keyword-only in current scikit-learn releases; passing it
    # by name works with old and new versions alike.
    data = make_blobs(n_samples, n_features=2, centers=n_clusters,
                      cluster_std=std)[0]
    kmeans = cl.KMeans(data, k, method)
    cluster_centers = kmeans.cluster_centers
    cluster_labels = kmeans.cluster_labels

    plt.scatter(data[:, 0], data[:, 1], c=cluster_labels)
    plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='r', s=50)
    plt.show()
Ejemplo n.º 8
0
def test_find_cluster_center_multiple_trajectories1():
    """Run the clustering on several trajectories at once.

    ``ex.generate_test_data`` is asked for ``n_trajectories`` trajectories
    of ``n`` points each.  The first trajectory is shifted by 6 in both
    plotted columns and the second by 6 in its first column; then all
    trajectories are handed to the clustering together.  The test only plots
    data and centers; it makes no assertion.
    """
    n = 2000  # number of data points
    k = 30  # number of cluster centers
    n_trajectories = 10  # number of trajectories
    data = ex.generate_test_data(n, n_trajectories)

    # Shift the first two trajectories away from the rest.
    for row in range(n):
        for column in (0, 1):
            data[0][row, column] += 6
        data[1][row, 0] += 6

    for t in range(n_trajectories):
        plt.scatter(data[t][:, 0], data[t][:, 1], c='b')

    clustering = cl.KMeans(data, k, method='kmeans++')
    cluster_centers = clustering.cluster_centers
    cluster_labels = clustering.cluster_labels

    plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='r')
Ejemplo n.º 9
0
def kmeans_blobs_3d(n_samples, n_clusters, k, method='kmeans++', std=1):
    """Cluster a random 3-D blob data set and show the result with pyplot.

    The data set is generated with ``make_blobs``; the points are coloured
    by their cluster label on a 3-D axis and the cluster centers are drawn
    in red.

    Args:
        n_samples: number of observations in the data set
        n_clusters: number of blobs in the generated data set
        k: number of cluster centers to be determined by k-means
        method: the KMeans method, i.e. 'forgy' or 'kmeans++'
        std: standard deviation within each generated blob
    """
    # `centers` is keyword-only in current scikit-learn releases; passing it
    # by name works with old and new versions alike.
    data = make_blobs(n_samples, n_features=3, centers=n_clusters,
                      cluster_std=std)[0]
    kmeans = cl.KMeans(data, k, method)
    cluster_centers = kmeans.cluster_centers
    cluster_labels = kmeans.cluster_labels

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=cluster_labels)
    # depthshade=False keeps the centers fully opaque regardless of depth.
    ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1],
               cluster_centers[:, 2], c='r', s=150, depthshade=False)
    plt.show()