Esempio n. 1
0
def calculate_pairwise_rmsd(xyzlist, n_real_atoms):
    """Compute the all-against-all RMSD matrix for a set of frames.

    Parameters
    ----------
    xyzlist : sequence of frames
        Coordinates handed to ``sremove_center_of_mass``, ``scalculate_g``
        and ``IRMSD.rmsd_one_to_all``.
        NOTE(review): ``sremove_center_of_mass`` is called without using a
        return value, so it presumably centers ``xyzlist`` in place -- confirm.
    n_real_atoms : int
        Forwarded unchanged to ``sremove_center_of_mass`` and
        ``IRMSD.rmsd_one_to_all``.

    Returns
    -------
    pairwise_distance : np.ndarray, shape (n_frames, n_frames), dtype float32
        Row ``i`` holds the result of ``IRMSD.rmsd_one_to_all`` for frame ``i``
        against every frame in ``xyzlist``.
    """
    n_frames = len(xyzlist)

    # 4 bytes per float32 entry, n_frames**2 entries, reported in MB
    matrix_megabytes = 4*n_frames**2 / (1024.0**2)
    log('pairwise distance matrix will be %.2f MB...', matrix_megabytes)
    pairwise_distance = np.empty((n_frames, n_frames), dtype=np.float32)

    sremove_center_of_mass(xyzlist, n_real_atoms)
    g = scalculate_g(xyzlist)

    for row in range(n_frames):
        # progress report on every 100th frame
        if row % 100 == 0:
            print('%d/%d' % (row, n_frames))
        pairwise_distance[row, :] = IRMSD.rmsd_one_to_all(
            xyzlist, xyzlist, g, g, n_real_atoms, row)

    return pairwise_distance
Esempio n. 2
0
def kmeans_mds(xyzlist, k=10, max_iters=100, max_time=10, threshold=1e-8, nearest_medoid=False):
    """k-means clustering with the RMSD distance metric.

    This is an iterative algorithm. During each iteration we first move each
    cluster center to the empirical average of the conformations currently
    assigned to it, and then we re-assign all of the conformations given the
    new locations of the centers.

    To compute the average conformations, we use a form of classical
    multidimensional scaling / principal coordinate analysis (delegated to
    ``average_structure``).

    Parameters
    ----------
    xyzlist : array-like, shape (n_frames, n_atoms, 3)
        Cartesian coordinates of every conformation. It is passed through
        ``ensure_type`` (float32) and every frame is then centered in place.
        NOTE(review): whether the caller's own array is modified depends on
        whether ``ensure_type`` copies -- confirm.
    k : int
        Number of clusters. Must satisfy ``1 <= k <= n_frames``.
    max_iters : int or None
        Maximum number of rounds to run; ``None`` disables this limit.
    max_time : float or None
        Wall-clock budget in seconds, checked at the end of each round;
        ``None`` disables this limit.
    threshold : float or None
        Stop once the RMS radius decreases by less than this amount between
        two consecutive rounds; ``None`` disables this check.
    nearest_medoid : bool
        If True, each center is replaced by the member conformation that is
        closest (by RMSD) to the MDS average, instead of the average itself.

    Returns
    -------
    centers : np.ndarray, shape (k, n_atoms, 3)
        Coordinates of the cluster centers.
    assignments : np.ndarray of int64, shape (n_frames,)
        ``assignments[i] = j`` means conformation ``i`` belongs to cluster ``j``.
    assignment_dist : np.ndarray, shape (n_frames,)
        RMSD between each conformation and its cluster center.
    scores : np.ndarray
        RMS radius after each round, preceded by an initial ``inf``.
    times : np.ndarray
        ``time.time()`` stamps: one taken before the first round, then one
        per round.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of frames.
    """
    xyzlist = ensure_type(xyzlist, np.float32, 3, name='xyzlist', shape=(None, None, 3), warn_on_cast=False)
    n_frames, n_atoms = xyzlist.shape[0:2]

    # fail early with a clear message, before any data is modified
    if not 1 <= k <= n_frames:
        raise ValueError('k must be between 1 and the number of frames (%d), got %s'
                         % (n_frames, k))

    # center each frame on its centroid (mean taken in float64)
    for x in xyzlist:
        centroid = x.astype('float64').mean(0)
        assert centroid.shape == (3,)
        x -= centroid

    # setup for the rmsd calculation
    xyzlist_irmsd, _ = rmsd.reshape_irmsd(xyzlist)
    xyzlist_G = rmsd.calculate_G(xyzlist)

    # setup for the clustering stuff
    # assignments[i] = j means that the i-th conformation is assigned to the
    # j-th cluster. Initially exactly one randomly placed frame is assigned to
    # each cluster and every other frame is unassigned (-1).
    assignments = np.full(n_frames, -1, dtype=np.int64)
    assignments[:k] = np.arange(k)
    np.random.shuffle(assignments)

    # the j-th cluster has cartesian coordinates centers[j]
    centers = np.zeros((k, n_atoms, 3))
    # all of the clustering scores; the leading inf makes the first
    # "improvement" infinite so the threshold test cannot fire in round 0
    scores = [np.inf]
    times = [time.time()]

    for n in itertools.count():
        # recenter each cluster based on its current members
        for i in range(k):
            members = assignments == i
            if not members.any():
                # if the current state has zero assignments, just randomly
                # select a structure for it
                print('warning: cluster %5d contains zero structures, reseeding...' % i)
                print('(if this error appears once or twice at the beginning and then goes away')
                print('don\'t worry. but if it keeps up repeatedly, something is wrong)')
                new_center = xyzlist[np.random.randint(n_frames)]
            else:
                new_center = average_structure(xyzlist[members])
                new_center -= new_center.mean(0)
                if nearest_medoid:
                    # instead of actually using the raw MDS average structure,
                    # we choose the data point among the cluster members that
                    # is closest, by RMSD, to this MDS structure.

                    # reshape the average into a one-frame trajectory for RMSD
                    average = new_center[np.newaxis, :, :]
                    average_G = rmsd.calculate_G(average)
                    average_irmsd, _ = rmsd.reshape_irmsd(average)

                    # actually compute the RMSDs. The boolean mask indexes the
                    # first axis only, so it works whatever the number of
                    # trailing axes of xyzlist_G is (not visible from here).
                    d = IRMSD.rmsd_one_to_all(average_irmsd, xyzlist_irmsd[members],
                                              average_G, xyzlist_G[members], n_atoms, 0)

                    # choose the structure that was closest to be the medoid
                    new_center = xyzlist[members][np.argmin(d)]

            # a single variable is used for both branches so that a reseeded
            # cluster really receives its freshly drawn structure
            centers[i] = new_center

        # prepare the new centers for RMSD
        centers_G = rmsd.calculate_G(centers)
        centers_irmsd, _ = rmsd.reshape_irmsd(centers)

        # reassign all of the data; keep the integer dtype so that the
        # returned assignments are usable as indices
        assignments = np.full(n_frames, -1, dtype=np.int64)
        assignment_dist = np.full(n_frames, np.inf)
        for i in range(k):
            d = IRMSD.rmsd_one_to_all(centers_irmsd, xyzlist_irmsd, centers_G, xyzlist_G, n_atoms, i)
            closer = d < assignment_dist
            assignments[closer] = i
            assignment_dist[closer] = d[closer]

        # check how much the clustering score improved during the last
        # iteration and break if necessary
        scores.append(np.sqrt(np.mean(np.square(assignment_dist))))
        times.append(time.time())
        print('round %3d, RMS radius %8f, change %.3e' % (n, scores[-1], scores[-1] - scores[-2]))
        if threshold is not None and scores[-2] - scores[-1] < threshold:
            print('score decreased less than threshold (%s). done\n' % threshold)
            break
        # n is zero-based, so n + 1 rounds have been completed at this point
        if max_iters is not None and n + 1 >= max_iters:
            print('reached maximum number of iterations. done\n')
            break
        if max_time is not None and times[-1] >= times[0] + max_time:
            print('reached maximum amount of time. done\n')
            break

    round_durations = np.diff(times)
    print('RMSD KMeans Performance Summary (py)')
    print('------------------------------------')
    print('n frames: %d' % n_frames)
    print('n states: %d' % k)
    print('mean time per round (s)   %.4f' % np.mean(round_durations))
    print('stddev time per round (s) %.4f' % np.std(round_durations))
    print('total time (s)            %.4f' % (times[-1] - times[0]))
    return centers, assignments, assignment_dist, np.array(scores), np.array(times)