def calculate_pairwise_rmsd(xyzlist, n_real_atoms):
    """Build the full frame-by-frame RMSD matrix.

    Row ``i`` of the result holds the values returned by
    ``IRMSD.rmsd_one_to_all`` when frame ``i`` of ``xyzlist`` is compared
    against every frame of ``xyzlist``.

    Parameters
    ----------
    xyzlist : sequence of frames
        Coordinates in whatever layout ``IRMSD.rmsd_one_to_all`` expects
        (NOTE(review): the layout is not visible from this function --
        confirm against the callers). It is handed to
        ``sremove_center_of_mass``, whose return value is discarded, so
        any centering presumably happens in place.
    n_real_atoms : int
        Forwarded unchanged to ``sremove_center_of_mass`` and
        ``IRMSD.rmsd_one_to_all``.

    Returns
    -------
    pairwise_distance : np.ndarray, shape (n_frames, n_frames), float32
    """
    n_frames = len(xyzlist)

    # each float32 entry takes 4 bytes; report the allocation size up front
    matrix_megabytes = 4*n_frames**2 / (1024.0**2)
    log('pairwise distance matrix will be %.2f MB...', matrix_megabytes)
    pairwise_distance = np.empty((n_frames, n_frames), dtype=np.float32)

    sremove_center_of_mass(xyzlist, n_real_atoms)
    g = scalculate_g(xyzlist)

    for row in range(n_frames):
        # progress report every 100 rows
        if not row % 100:
            print('%d/%d' % (row, n_frames))
        pairwise_distance[row, :] = IRMSD.rmsd_one_to_all(
            xyzlist, xyzlist, g, g, n_real_atoms, row)

    return pairwise_distance
def kmeans_mds(xyzlist, k=10, max_iters=100, max_time=10, threshold=1e-8, nearest_medoid=False):
    """k-means clustering with the RMSD distance metric.

    This is an iterative algorithm. During each iteration we first move each
    cluster center to the empirical average of the conformations currently
    assigned to it, and then we re-assign all of the conformations given the
    new locations of the centers.

    To compute the average conformations, we use a form of classical
    multidimensional scaling / principal coordinate analysis (delegated to
    ``average_structure``).

    Parameters
    ----------
    xyzlist : array-like, shape (n_frames, n_atoms, 3)
        Cartesian coordinates. Passed through ``ensure_type`` (float32) and
        then centered frame-by-frame in place.
        NOTE(review): if ``ensure_type`` returns its input without copying,
        the caller's array is modified by the centering -- confirm.
    k : int
        Number of clusters. Must not exceed the number of frames.
    max_iters : int or None
        Maximum number of rounds to run; ``None`` disables the limit.
    max_time : float or None
        Wall-clock budget in seconds, checked at the end of every round;
        ``None`` disables the limit.
    threshold : float or None
        Stop as soon as the score (RMS of the frame-to-center distances)
        decreases by less than this amount between two consecutive rounds;
        ``None`` disables the check.
    nearest_medoid : bool
        If true, each center is replaced by the member of its cluster that
        is closest (by RMSD) to the averaged structure, instead of using the
        averaged structure itself.

    Returns
    -------
    centers : np.ndarray, shape (k, n_atoms, 3)
        Coordinates of the cluster centers.
    assignments : np.ndarray of int64, shape (n_frames,)
        ``assignments[i] = j`` means the i-th conformation belongs to the
        j-th cluster.
    assignment_dist : np.ndarray, shape (n_frames,)
        Distance between each conformation and its cluster center.
    scores : np.ndarray
        Score after every round; the first entry is an ``inf`` placeholder.
    times : np.ndarray
        ``time.time()`` stamps: one taken before the first round and one
        after each round.

    Raises
    ------
    ValueError
        If ``k`` is larger than the number of frames.
    """
    xyzlist = ensure_type(xyzlist, np.float32, 3, name='xyzlist',
                          shape=(None, None, 3), warn_on_cast=False)

    # center every frame on its centroid (the mean is taken in float64)
    for x in xyzlist:
        centroid = x.astype('float64').mean(0)
        assert centroid.shape == (3,)
        x -= centroid

    # setup for the rmsd calculation
    n_frames, n_atoms = xyzlist.shape[0:2]
    if k > n_frames:
        # previously surfaced as an opaque numpy broadcasting ValueError
        raise ValueError('k (%d) must not exceed the number of frames (%d)'
                         % (k, n_frames))
    xyzlist_irmsd, _ = rmsd.reshape_irmsd(xyzlist)
    xyzlist_G = rmsd.calculate_G(xyzlist)

    # setup for the clustering stuff
    # assignments[i] = j means that the i-th conformation is assigned to the
    # j-th cluster. Initially k randomly chosen frames seed one cluster each
    # and every other frame is unassigned (-1).
    assignments = -1 * np.ones(n_frames, dtype=np.int64)
    assignments[0:k] = np.arange(k)
    np.random.shuffle(assignments)

    # the j-th cluster has cartesian coordinates centers[j]
    centers = np.zeros((k, n_atoms, 3))

    # all of the clustering scores; the leading inf makes the first
    # "change" computation well defined
    scores = [np.inf]
    times = [time.time()]

    for n in itertools.count():
        # recenter each cluster based on its current members
        for i in range(k):
            members = assignments == i
            structures = xyzlist[members]

            if len(structures) == 0:
                # if the current state has zero assignments, just randomly
                # select a structure for it
                print('warning: cluster %5d contains zero structures, reseeding...' % i)
                print('(if this error appears once or twice at the beginning and then goes away')
                print('don\'t worry. but if it keeps up repeatedly, something is wrong)')
                # actually store the reseeded center; the randomly drawn
                # structure used to be computed and then thrown away
                centers[i] = xyzlist[np.random.randint(n_frames)]
                continue

            medoid = average_structure(structures)
            medoid -= medoid.mean(0)

            if nearest_medoid:
                # instead of actually using the raw MDS average structure,
                # we choose the data point among the cluster members that is
                # closest, by RMSD, to this MDS structure.

                # reshape the medoid for RMSD
                medoid = medoid[np.newaxis, :, :]
                medoid_g = rmsd.calculate_G(medoid)
                medoid_irmsd, _ = rmsd.reshape_irmsd(medoid)

                # actually compute the RMSDs. A plain boolean mask selects
                # along the first axis whatever the rank of the array, so it
                # does not matter how many dimensions calculate_G returns.
                d = IRMSD.rmsd_one_to_all(medoid_irmsd, xyzlist_irmsd[members],
                                          medoid_g, xyzlist_G[members],
                                          n_atoms, 0)

                # choose the structure that was closest to be the medoid
                medoid = structures[np.argmin(d)]

            centers[i] = medoid

        # prepare the new centers for RMSD
        centers_G = rmsd.calculate_G(centers)
        centers_irmsd, _ = rmsd.reshape_irmsd(centers)

        # reassign all of the data; keep the labels integral so the returned
        # array has the same dtype no matter how many rounds were run
        assignments = -1 * np.ones(n_frames, dtype=np.int64)
        assignment_dist = np.inf * np.ones(n_frames)
        for i in range(k):
            d = IRMSD.rmsd_one_to_all(centers_irmsd, xyzlist_irmsd, centers_G,
                                      xyzlist_G, n_atoms, i)
            where = d < assignment_dist
            assignments[where] = i
            assignment_dist[where] = d[where]

        # record the score for this round and check the stopping criteria
        scores.append(np.sqrt(np.mean(np.square(assignment_dist))))
        times.append(time.time())
        print('round %3d, RMS radius %8f, change %.3e' % (
            n, scores[-1], scores[-1] - scores[-2]))

        if threshold is not None and scores[-2] - scores[-1] < threshold:
            print('score decreased less than threshold (%s). done\n' % threshold)
            break
        # n is zero-based, so n + 1 rounds have been completed at this point
        if max_iters is not None and n + 1 >= max_iters:
            print('reached maximum number of iterations. done\n')
            break
        if max_time is not None and times[-1] >= times[0] + max_time:
            print('reached maximum amount of time. done\n')
            break

    round_durations = np.diff(times)
    print('RMSD KMeans Performance Summary (py)')
    print('------------------------------------')
    print('n frames: %d' % n_frames)
    print('n states: %d' % k)
    print('mean time per round (s) %.4f' % np.mean(round_durations))
    print('stddev time per round (s) %.4f' % np.std(round_durations))
    print('total time (s) %.4f' % (times[-1] - times[0]))

    return centers, assignments, assignment_dist, np.array(scores), np.array(times)