コード例 #1
0
    def calculate_tica_components(self, cluster_method, calculate_strides=False, feats=None):
        '''Compute tICA coordinates for the project features, cluster and plot them.

        Steps performed, in order:
          1. Load the feature arrays and build one concatenated trajectory per clone.
          2. Fit/transform with tICA (lag time and component count are fixed below)
             and save the coordinates under self.tICA_dir.
          3. Cluster the coordinates, save assignments, centers and the normalized
             populations (cluster_centers/populations.dat).
          4. Save one hexbin plot per unique pair of tICA components.
          5. Save a PDB for every frame that coincides with a cluster center.

        Parameters
        ----------
        cluster_method : str
            'kcenters' or 'kmeans'; any other value ends the program via sys.exit.
        calculate_strides : bool
            If True (and feats is None), self.calculate_stride_distances is called
            and the features are read from the stride_dist .npy file; the loaded
            data is then used without further striding or equilibration trimming.
        feats : str or None
            Optional path passed to np.load; when given, the loaded data is used
            instead of self.data and is not strided again.
        '''
        # tICA parameters
        tica_lagtime = 10  # determine from implied timescales
        tica_components = 8  # how many tICs to compute
        n_clusters = 100  # denotes number of microstates
        md_time_step = 0.02  # ns
        subsampled_time_step = 1.  # ns
        stride = int(subsampled_time_step / md_time_step)  # time step stride for sub-sampling
        equil_steps = 1  # time steps to be removed from start
        cluster_percentage_cutoff = 5  # clusters with a relative population (in %) not above
                                       # this number are not labeled on plots; 0 labels all
        verbose = False

        print("\nCalculating tICA components...")

        # Load in feature files THIS WILL NEED TO BE CHANGED
        if feats is None:
            if calculate_strides:
                self.calculate_stride_distances(stride, equil_steps)
                data = np.load('/home/server/git/fah-scripts/DataAnalysisScripts/stride_dist/stride_dist_%d.npy' % self.proj_num)
            else:
                data = self.data
        else:
            data = np.load(feats)

        # Data that was pre-computed (strided file or user-supplied file) is taken
        # as-is; only self.data is sub-sampled here.
        already_strided = calculate_strides or feats is not None
        features = []
        for run in data:
            for clone in run:
                gen_seq = [gen if already_strided else gen[::stride]
                           for gen in clone
                           if gen is not None and gen[0] is not None]
                if not gen_seq:
                    continue
                gen_cat = np.concatenate(gen_seq)
                features.append(gen_cat if calculate_strides else gen_cat[equil_steps:])

        # Kept as a plain list: per-clone trajectories may differ in length, and
        # np.asarray on a ragged list is an error in recent NumPy releases.
        print("Loaded %d trajectories; first has shape %s" % (len(features), features[0].shape))

        tica_coordinates = tICA(lag_time=tica_lagtime,
                                n_components=int(tica_components)).fit_transform(features)
        np.save('%s/lag_%d_coord_%d.npy' % (self.tICA_dir, tica_lagtime, tica_components),
                tica_coordinates)

        # All frames of all trajectories stacked; column k holds tIC k+1.
        # (Replaces the former exec/eval-generated tica_N lists.)
        all_coordinates = np.concatenate(tica_coordinates)

        # Perform clustering based on the cluster_method parameter.
        if cluster_method == 'kcenters':
            print("Clustering via KCenters...")
            clusters = KCenters(n_clusters)
        elif cluster_method == 'kmeans':
            print("Clustering via KMeans...")
            clusters = KMeans(n_clusters)
        else:
            sys.exit("Invalid cluster_method. Use kmeans or kcenters.")

        # Determine cluster assignment for each frame.
        sequences = clusters.fit_transform(tica_coordinates)

        np.save('%s/lag_%d_clusters_%d_sequences.npy' % (self.tICA_dir, tica_lagtime, n_clusters),
                sequences)
        np.save('%s/lag_%d_clusters_%d_center.npy' % (self.tICA_dir, tica_lagtime, n_clusters),
                clusters.cluster_centers_)

        # Determine cluster populations, normalize the counts, and keep percentages for
        # labeling clusters holding more than cluster_percentage_cutoff percent of the data.
        print("\nDetermining cluster populations...")

        centers_dir = '%s/cluster_centers' % self.tICA_dir
        if not os.path.exists(centers_dir):
            os.makedirs(centers_dir)
        all_assignments = np.concatenate(sequences)  # hoisted: was rebuilt once per cluster
        counts = np.array([np.count_nonzero(all_assignments == i) for i in range(n_clusters)])
        normalized_counts = counts / float(counts.sum())
        percentages = [p * 100 for p in normalized_counts]
        is_labeled = np.array([p > cluster_percentage_cutoff for p in percentages], dtype=bool)
        population_labels = [[i, "%.2f" % percentages[i]]
                             for i in range(len(percentages)) if is_labeled[i]]
        np.savetxt('%s/cluster_centers/populations.dat' % self.tICA_dir, normalized_counts)

        # Plot all unique combinations of tICA components
        print("\nPlotting tICA components with cluster centers...")
        centers = np.asarray(clusters.cluster_centers_)
        component_pairs = list(itertools.combinations(range(1, tica_components + 1), 2))
        for tic_x, tic_y in tqdm.tqdm(component_pairs):
            xs = all_coordinates[:, tic_x - 1]
            ys = all_coordinates[:, tic_y - 1]
            x_centers = centers[:, tic_x - 1]
            y_centers = centers[:, tic_y - 1]

            plt.figure(figsize=(20, 16))
            plt.hexbin(xs, ys, bins='log')
            plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
            # Star marks the very first frame of the first trajectory.
            plt.plot(xs[0], ys[0], color='k', marker='*', markersize=24)
            plt.xlabel('tic' + str(tic_x))
            plt.ylabel('tic' + str(tic_y))
            plt.title(self.proj_num)
            # Add labels for high-population cluster centers
            for label, x, y in zip(population_labels, x_centers[is_labeled], y_centers[is_labeled]):
                plt.annotate(
                    label,
                    xy=(x, y), xytext=(-15, 15),
                    textcoords='offset points', ha='right', va='bottom',
                    bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
            plt.savefig('%s/tica_' % (self.tICA_dir) + str(tic_x) + '_' + str(tic_y) + '.png')
            plt.close()

        # Remove PDBs left over from a previous run.
        for filename in os.listdir(self.tICA_dir + '/cluster_centers'):
            if filename.endswith('.pdb'):
                os.remove(self.tICA_dir + '/cluster_centers/' + filename)

        # Write out PDBs for each cluster center
        print("Performing cluster analytics and saving center PDBs...\n")
        clones, gens = data.shape[1], data.shape[2]
        run_idx, clone_idx = 0, 0
        for i in range(len(features)):
            # NOTE(review): the run/clone bookkeeping below is preserved from the
            # original. It resets the clone counter on multiples of `gens` (not
            # `clones`) and assumes every (run, clone) produced a feature
            # trajectory -- confirm this maps index i to the right directory.
            if i % clones == 0 and i != 0:
                run_idx += 1
            if i % gens == 0:
                clone_idx = 0
            current_clone = clone_idx
            clone_idx += 1

            n_snapshots = len(clusters.distances_[i])
            # Determine frames that are cluster centers
            cluster_indices = np.arange(n_snapshots)[(clusters.distances_[i] < 1e-6)]
            if len(cluster_indices) == 0:
                continue
            # Determine number of each cluster, correlates to populations.dat
            cluster_labels = sequences[i][cluster_indices]

            # Load every consecutive results<N> xtc of this clone until one fails to load.
            gen_idx = 0
            print('x: %d, y: %d, z: %d' % (run_idx, current_clone, gen_idx))
            traj_cat = []
            while True:
                traj = base_dir + 'PROJ%s/RUN%s/CLONE%s/results%s/traj_comp.xtc' % (
                    self.proj_num, run_idx, current_clone, gen_idx)
                try:
                    traj_cat.append(md.load(traj, top=self.gro_file))
                except Exception:
                    # Any load failure (normally: file does not exist) ends the
                    # scan; KeyboardInterrupt/SystemExit are no longer swallowed.
                    break
                gen_idx += 1

            if not traj_cat:
                # Previously this fell through to an unbound (or stale)
                # trajectory from an earlier clone.
                print('No trajectory could be loaded for x: %d, y: %d; skipping %d cluster center(s).'
                      % (run_idx, current_clone, len(cluster_indices)))
                continue

            trajectory_file = md.join(traj_cat)
            xtc_len = len(trajectory_file)
            # map the strided frame number back to xtc frame number
            strided_frames = range(xtc_len)[equil_steps:][::stride]
            for j in range(len(cluster_indices)):
                xtc_frame = strided_frames[cluster_indices[j]]
                cluster_traj = trajectory_file[xtc_frame]
                cluster_traj.save_pdb('%s/cluster_centers/state_%d_%.3f.pdb' % (
                    self.tICA_dir, cluster_labels[j], percentages[cluster_labels[j]]))
                if verbose:
                    print('Successfully saved PDB for cluster: %d, (rel.pop: %.3f)' % (
                        cluster_labels[j], percentages[cluster_labels[j]]))
                    print('traj_file: %s (%d/%d)' % (trajectory_file, i, len(features)))
                    print('frame: %d (%d/%d centers from this trajectory)' % (
                        cluster_indices[j], j, len(cluster_indices)))
                    print('strided: npy_frame/npy_len = %d/%d = %f' % (
                        cluster_indices[j], n_snapshots, cluster_indices[j] / n_snapshots))
                    print('re-mapped: orig_frame/xtc_len = %d/%d = %f\n' % (
                        xtc_frame, xtc_len, xtc_frame / xtc_len))
コード例 #2
0
ファイル: tica.py プロジェクト: yabmtm/scripts
def calculate_tica_components():
    """Run the tICA -> clustering -> plotting -> implied-timescale pipeline.

    Reads every ``out*npy`` feature file in the current directory, computes
    tICA coordinates, clusters them, plots the first three components against
    each other, saves a PDB for each cluster-center frame and finally computes
    and plots implied timescales. All settings (``tica_lagtime``,
    ``tica_components``, ``n_clusters``, ``cluster_method``, ``lagtimes``,
    ``n_timescales``, ``time_step``) and the accumulator lists
    ``tica_1`` .. ``tica_4`` are module-level names defined outside this
    function. Output files are written to the current directory.
    """
    print("Calculating tICA components...")
    # Sorted so that feature index i lines up with the sorted traj*xtc list
    # used below (glob order is otherwise arbitrary).
    # NOTE(review): assumes out*npy and traj*xtc sort into matching order -- confirm.
    in_files = sorted(glob.glob("out*npy"))
    loaded_files = [np.load(filename) for filename in in_files]
    tica = tICA(lag_time=tica_lagtime,
                n_components=int(tica_components)).fit_transform(loaded_files)
    np.save('lag_%d_comp_%d.npy' % (tica_lagtime, tica_components), tica)
    tica_data = 'data_lag_%d_comp_%d' % (tica_lagtime, tica_components)
    joblib.dump(tica, tica_data)

    # Extract the four tICA components into the module-level lists. The
    # in-memory result is used directly instead of re-loading the file that
    # was just written.
    all_frames = np.concatenate(tica)
    for column, component_list in enumerate((tica_1, tica_2, tica_3, tica_4)):
        component_list.extend(all_frames[:, column])

    # Clustering
    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)
    else:
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")
    sequences = clusters.fit_transform(tica)
    np.save('lag_%d_clusters_%d_sequences.npy' % (tica_lagtime, n_clusters), sequences)
    np.save('lag_%d_clusters_%d_center.npy' % (tica_lagtime, n_clusters),
            clusters.cluster_centers_)
    cluster_data = 'lag_%d_clusters_%d.pkl' % (tica_lagtime, n_clusters)
    joblib.dump(sequences, cluster_data)

    # Determining cluster populations: how many frames are in each cluster.
    print("Determining cluster populations...")
    all_assignments = np.concatenate(sequences)
    counts = np.array([np.count_nonzero(all_assignments == i) for i in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    percentages = [fraction * 100 for fraction in normalized_counts]
    np.savetxt('populations.dat', normalized_counts)

    # Plotting the tICA components: (1,2), (1,3) and (2,3) on figures 0, 1, 2.
    print("Plotting tICA components with cluster centers...")
    component_data = (tica_1, tica_2, tica_3)
    percentage_labels = ["%.4f" % p for p in percentages]
    for figure_number, (ix, iy) in enumerate(((0, 1), (0, 2), (1, 2))):
        plt.figure(figure_number)
        plt.hexbin(component_data[ix], component_data[iy], bins='log')
        x_centers = [center[ix] for center in clusters.cluster_centers_]
        y_centers = [center[iy] for center in clusters.cluster_centers_]
        plt.plot(x_centers, y_centers, 'wo')
        # adds percentage contribution for each cluster
        for label, x, y in zip(percentage_labels, x_centers, y_centers):
            plt.annotate(
                label,
                xy=(x, y), xytext=(-20, 20),
                textcoords='offset points', ha='right', va='bottom',
                bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
        plt.savefig('tica_%d_%d.png' % (ix + 1, iy + 1))

    # Writing out PDBs for cluster centers
    print("Performing cluster analytics and saving center PDBs...\n")
    for i, xtcfile in enumerate(sorted(glob.glob("traj*xtc"))):
        n_snapshots = len(clusters.distances_[i])
        # frames whose distance to their center is ~0, i.e. the centers themselves
        cluster_indices = np.arange(n_snapshots)[(clusters.distances_[i] < 1e-6)]
        cluster_labels = sequences[i][cluster_indices]  # number of cluster
        for frame, label in zip(cluster_indices, cluster_labels):
            print('Cluster center', label, 'was found in trajectory', str(i) + '.')
            print('It is found on frame', frame, 'and has a relative population of',
                  "%.4f" % percentages[label], '%.')
            cluster_traj = md.load_frame(xtcfile, frame, top='structure.gro')
            # Parenthesized: '%' binds tighter than '+', so the original
            # expression tried to add 1 to the formatted string (TypeError).
            cluster_traj.save_pdb('state_%d.pdb' % (label + 1))

    # Calculating IPTs
    print("\nCalculating Implied Timescales...")
    timescales = implied_timescales(sequences, lagtimes, n_timescales=n_timescales,
                                    msm=MarkovStateModel(verbose=False))

    implied_timescale_data = 'ipt_lag_%d_clusters_%d.pkl' % (tica_lagtime, n_clusters)
    joblib.dump(timescales, implied_timescale_data)
    numpy_timescale_data = 'lag_%d_clusters_%d_timescales.npy' % (tica_lagtime, n_clusters)
    np.savetxt('lagtimes.txt', lagtimes)
    np.save(numpy_timescale_data, timescales)

    # Plotting IPTs (lagtimes and timescales); the figure is saved once, after
    # every timescale has been drawn.
    print("Plotting Implied Timescales...")
    plt.figure(42)
    for i in range(n_timescales):
        plt.plot(lagtimes * time_step, timescales[:, i] * time_step, 'o-')
    plt.yscale('log')
    plt.xlabel('lagtime (ns)')
    plt.ylabel('Implied timescales (ns)')
    plt.savefig('lag_%d_clusters_%d_.png' % (tica_lagtime, n_clusters))
コード例 #3
0
ファイル: tica_test.py プロジェクト: yabmtm/FAHscripts
def compute_tica_components():
    '''Compute tICA coordinates, cluster the unbiased ensemble, plot, and save centers.

    Feature files are loaded per run, a single tICA model is fitted and used to
    transform them, the trajectories of the unbiased run are clustered, and all
    other trajectories are assigned to the nearest of those cluster centers.
    Populations, assignments, centers, one plot per component pair in
    ``all_ticas`` and (when ``save_pdb`` is True) one PDB per cluster-center
    frame are written below ``<project_title>/tica_<n_clusters>``.

    Settings such as ``project_title``, ``runs``, ``unbiased_runs``,
    ``tica_lagtime``, ``n_components``, ``n_clusters``, ``cluster_method``,
    ``cluster_percentage_cutoff``, ``all_ticas``, ``stride``, ``equil_steps``
    and ``structure_file`` are module-level names defined outside this function.

    Returns the value of the fitted tICA model's ``score(features)``.
    Raises ValueError when no run index in ``range(runs)`` is in ``unbiased_runs``.
    '''
    verbose = False
    save_pdb = True
    color_by = 'cluster'

    if verbose:
        print("\nCalculating tICA components...")
    if not os.path.exists(project_title + '/tica_%d' % n_clusters):
        os.mkdir(project_title + '/tica_%d' % n_clusters)

    # load in feature files and determine indices of unbiased ensembles
    # NOTE(review): the original glob argument was syntactically broken
    # ("/features/'" without opening quote or prefix); project_title is assumed
    # as the prefix because the PDB section globs '<project_title>/features/'.
    feature_files, feature_runs = [], []
    unbiased_indices = None
    for run in range(runs):
        run_files = sorted(glob.glob(project_title + '/features/' + "P*R%d_*npy" % run))
        feature_files += run_files
        feature_runs += [run] * len(run_files)  # remembered to build xtc paths later
        if run in unbiased_runs:
            # NOTE(review): with several unbiased runs only the last one is kept,
            # as in the original -- confirm that is intended.
            unbiased_indices = [len(feature_files) - len(run_files), len(feature_files)]
    if unbiased_indices is None:
        raise ValueError('None of the %d runs is listed in unbiased_runs; '
                         'cannot determine the unbiased ensemble.' % runs)
    first_unbiased, end_unbiased = unbiased_indices
    features = [np.load(x) for x in feature_files]

    # perform tICA calculation (one fit serves both the coordinates and the score)
    tica_components = tICA(lag_time=tica_lagtime, n_components=int(n_components))
    tica_coordinates = tica_components.fit_transform(features)
    tica_score = tica_components.score(features)

    np.save('%s/tica_%d/tica_coords-lag_%d-comp_%d.npy' % (
        project_title, n_clusters, tica_lagtime, n_components), tica_coordinates)
    np.save('%s/tica_%d/tica_comps-lag_%d-comp_%d.npy' % (
        project_title, n_clusters, tica_lagtime, n_components), tica_components)

    # Perform clustering based on the cluster_method parameter.
    if verbose:
        print('Clustering via %s' % cluster_method)
    if cluster_method == 'kcenters':
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        clusters = KMeans(n_clusters)
    elif cluster_method == 'kmedoids':
        clusters = KMedoids(n_clusters)
    else:
        sys.exit('Invalid cluster_method. Use kcenters/kmeans/kmedoids.')

    # Cluster unbiased data and fit biased data to these centers.
    # Assignments are built in trajectory order for every trajectory, so they
    # stay aligned with the concatenated coordinates used for plotting even when
    # the unbiased run is not the last one.
    sequences = clusters.fit_transform(tica_coordinates[first_unbiased:end_unbiased])
    new_assignments = []
    for i in tqdm.tqdm_notebook(range(len(tica_coordinates))):
        if first_unbiased <= i < end_unbiased:
            new_assignments.append(sequences[i - first_unbiased])
            continue
        tica_traj = tica_coordinates[i]
        if isinstance(tica_traj, np.ndarray):
            if not (tica_traj.dtype == 'float32' or tica_traj.dtype == 'float64'):
                tica_traj = tica_traj.astype('float64')
        labels, _ = msmbuilder.libdistance.assign_nearest(
            tica_traj, clusters.cluster_centers_, metric='euclidean')
        new_assignments.append(labels)

    np.save('%s/tica_%d/lag_%d_clusters_%d_assignments.npy' % (
        project_title, n_clusters, tica_lagtime, n_clusters), new_assignments)
    np.save('%s/tica_%d/lag_%d_clusters_%d_center.npy' % (
        project_title, n_clusters, tica_lagtime, n_clusters), clusters.cluster_centers_)

    # Determine cluster populations (from the unbiased assignments), normalize the
    # counts, and keep percentages for labeling clusters holding more than
    # cluster_percentage_cutoff percent of the data. Finally, save normalized counts.
    if verbose:
        print("\nDetermining cluster populations...")
    clusters_dir = '%s/tica_%d/%s_clusters' % (project_title, n_clusters, cluster_method)
    plots_dir = '%s/tica_%d/plots' % (project_title, n_clusters)
    if not os.path.exists(clusters_dir):
        os.mkdir(clusters_dir)
    if not os.path.exists(plots_dir):
        os.mkdir(plots_dir)

    unbiased_assignments = np.concatenate(sequences)  # hoisted: was rebuilt per cluster
    counts = np.array([np.count_nonzero(unbiased_assignments == i) for i in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    percentages = [fraction * 100 for fraction in normalized_counts]
    is_labeled = np.array([p > cluster_percentage_cutoff for p in percentages], dtype=bool)
    population_labels = [[i, "%.2f" % percentages[i]]
                         for i in range(len(percentages)) if is_labeled[i]]
    np.savetxt('%s/tica_%d/%s_clusters/populations.dat'
               % (project_title, n_clusters, cluster_method), normalized_counts)

    # Plot all unique combinations of tICA components
    if verbose:
        print("\nPlotting tICA components...")
    all_coordinates = np.concatenate(tica_coordinates)
    flat_assignments = np.concatenate(new_assignments)
    cluster_colors = matplotlib.cm.rainbow(np.linspace(0, 1, n_clusters))
    centers = np.asarray(clusters.cluster_centers_)
    for j in tqdm.tqdm_notebook(range(len(all_ticas)), leave=False):  # For each pair
        if not all_ticas[j][0] < all_ticas[j][1]:
            continue
        tICx, tICy = all_ticas[j][0] - 1, all_ticas[j][1] - 1
        plt.figure(j, figsize=(20, 16))
        plt.hexbin(all_coordinates[:, tICx], all_coordinates[:, tICy], bins='log')
        if color_by == 'cluster':
            # Every (stride*2)-th frame, colored by its cluster; one scatter call
            # replaces one plt.plot call per point.
            sampled = slice(None, None, stride * 2)
            plt.scatter(all_coordinates[sampled, tICx], all_coordinates[sampled, tICy],
                        c=cluster_colors[flat_assignments[sampled]], marker="o")
        x_centers = centers[:, tICx]
        y_centers = centers[:, tICy]
        plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
        # Star marks the first frame of the first trajectory.
        plt.plot(all_coordinates[0, tICx], all_coordinates[0, tICy],
                 color='k', marker='*', markersize=24)
        plt.xlabel('tIC' + str(all_ticas[j][0]))
        plt.ylabel('tIC' + str(all_ticas[j][1]))
        plt.title(project_title)
        # Add labels for high-population cluster centers
        for label, x, y in zip(population_labels, x_centers[is_labeled], y_centers[is_labeled]):
            plt.annotate(
                label,
                xy=(x, y), xytext=(-15, 15),
                textcoords='offset points', ha='right', va='bottom',
                bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
        plt.savefig('%s/tica_%d/plots/tica_%d_%d.png' % (
            project_title, n_clusters, all_ticas[j][0], all_ticas[j][1]))
        plt.close()

    # Write out PDBs for each cluster center
    if verbose:
        print("Performing cluster analytics and saving center PDBs...\n")
    if save_pdb:
        cluster_features = []
        # The report is opened once; re-opening it in 'w' mode for each center
        # kept only the last entry.
        with open('%s/tica_%d/cluster.dat' % (project_title, n_clusters), 'w') as f:
            # clusters.distances_ and sequences only cover the clustered
            # (unbiased) trajectories, so index k is offset into the full lists.
            for k in tqdm.tqdm_notebook(range(len(sequences)), leave=False):
                file_idx = first_unbiased + k
                n_snapshots = len(clusters.distances_[k])

                # Determine frames that are cluster centers
                cluster_indices = np.arange(n_snapshots)[(clusters.distances_[k] < 1e-6)]
                if len(cluster_indices) == 0:
                    continue

                # Determine number of each cluster, correlates to populations.dat
                cluster_labels = sequences[k][cluster_indices]

                # xtc path derived from the feature path, as in the original
                trajectory_file = re.sub('features', 'traj_data/RUN%d' % feature_runs[file_idx],
                                         re.sub('npy', 'xtc', feature_files[file_idx]))
                # load center-containing xtcs to check length
                xtc_len = len(md.load(trajectory_file, top=structure_file))
                # map strided frame number back to xtc frame number
                strided_frames = range(xtc_len)[equil_steps:][::stride]

                for j in range(len(cluster_indices)):
                    xtc_frame = strided_frames[cluster_indices[j]]
                    cluster_traj = md.load_frame(trajectory_file, xtc_frame,
                                                 top=structure_file)
                    cluster_features.append(features[file_idx][cluster_indices[j]])
                    cluster_traj.save_pdb('%s/tica_%d/%s_clusters/state_%d.pdb'
                                          % (project_title, n_clusters, cluster_method,
                                             cluster_labels[j]))

                    # save cluster information (one field per line)
                    f.write('\nSuccessfully saved PDB for cluster: %d, (rel.pop: %.3f)\n' % (
                        cluster_labels[j], percentages[cluster_labels[j]]))
                    f.write('traj_file: %s (%d/%d)\n' % (trajectory_file, file_idx, len(features)))
                    f.write('frame: %d (%d/%d centers from this trajectory)\n' % (
                        cluster_indices[j], j, len(cluster_indices)))
                    f.write('strided: npy_frame/npy_len = %d/%d = %f\n' % (
                        cluster_indices[j], n_snapshots, cluster_indices[j] / n_snapshots))
                    f.write('re-mapped: orig_frame/xtc_len = %d/%d = %f\n' % (
                        xtc_frame, xtc_len, xtc_frame / xtc_len))

        # save features corresponding to each cluster center
        np.save('%s/tica_%d/cluster_features.npy' % (project_title, n_clusters), cluster_features)

    return tica_score