def save_matrix(self, matrix_path):
    """
    Writes the distance matrix held by this handler to disk.

    The actual writing is delegated to pyRMSD_MatrixHandler.save_matrix.

    @param matrix_path: Complete path (with filename) where to save the matrix.
    """
    matrix = self.distance_matrix
    pyRMSD_MatrixHandler.save_matrix(matrix_path, matrix)
def save_statistics(self, matrix_base_path):
    """
    Writes the statistics of the handled distance matrix to disk in JSON format.

    The work is delegated to pyRMSD_MatrixHandler.save_statistics.

    @param matrix_base_path: The folder where to save the 'statistics.json' file.

    @return: Whatever pyRMSD_MatrixHandler.save_statistics returns.
    """
    matrix = self.distance_matrix
    statistics = pyRMSD_MatrixHandler.save_statistics(matrix_base_path, matrix)
    return statistics
def calculate(cls, data_handler, matrix_params):
    """
    Loads an already created matrix from disk.

    :param data_handler: Not used by this loader.

    :param matrix_params: The parameters to build the matrix. In this base
    case the only option is "load".

    Base parameters :

    {
        "method": STRING,
        "parameters":{
            ...
        }
    }

    Options:

    - "load": Load an already created matrix from disk

        "parameters":{
            "path": STRING
        }

        "path": The path from where the matrix is going to be loaded.

    NOTE(review): the code reads matrix_params["path"] directly, so the
    caller presumably passes the inner "parameters" dictionary rather than
    the whole structure shown above -- confirm against the caller.

    :return: A CondensedMatrix.
    """
    matrix_file = matrix_params["path"]
    return pyRMSD_MatrixHandler.load_matrix(matrix_file)
def get_rmsd_matrix(self, align, symmetry):
    """Return the square RMSD matrix for the two sampled RMF input files.

    Coordinates are read from 'SampledA.rmf3' and 'SampledC.rmf3', the
    pairwise RMSDs are computed with the 'cpu_omp' mode, and the condensed
    matrix stored in 'Distances_Matrix.data' is loaded and expanded to its
    full N x N form.

    :param align: Forwarded to rmsd_calculation.get_rmsds_matrix.
    :param symmetry: Forwarded to
        rmsd_calculation.get_rmfs_coordinates_one_rmf.
    :return: The square distance matrix produced by
        scipy.spatial.distance.squareform.
    """
    sample_a = self.get_input_file_name("SampledA.rmf3")
    sample_c = self.get_input_file_name("SampledC.rmf3")
    # Only the conformations and the symmetry groups are needed here.
    (_, _, _, conforms, symm_groups, _, _) = \
        rmsd_calculation.get_rmfs_coordinates_one_rmf(
            "./", sample_a, sample_c, None, symmetry, None, 1)

    # The return value is not needed; the call is kept for its side effects
    # (presumably it writes 'Distances_Matrix.data', loaded below -- confirm).
    rmsd_calculation.get_rmsds_matrix(
        conforms, 'cpu_omp', align, 2, symm_groups)
    del conforms

    handler = MatrixHandler()
    handler.loadMatrix("Distances_Matrix.data")
    condensed = handler.getMatrix().get_data()
    return sp.spatial.distance.squareform(condensed)
def test_write_and_load(self):
    """Check that a matrix saved with saveMatrix is recovered by loadMatrix.

    A condensed matrix built from range(1000) is saved under the name
    "matrix", loaded back through a second handler, and both the original
    and the recovered contents are compared against the source data.
    """
    mh = MatrixHandler(".")
    data = range(1000)
    matrix = CondensedMatrix(data)
    mh.distance_matrix = matrix

    try:
        mh.saveMatrix("matrix")

        mh2 = MatrixHandler(None)
        mh2.loadMatrix("matrix")
        recovered_data = mh2.distance_matrix.get_data()

        numpy.testing.assert_array_equal(mh.distance_matrix.get_data(), data)
        numpy.testing.assert_array_equal(recovered_data, data)
    finally:
        # Clean it! Done in a 'finally' so the file is also removed when an
        # assertion fails, and with os.remove instead of shelling out to
        # 'rm' so the cleanup is portable. The existence check keeps a
        # failure that happened before the file was written from being
        # masked by a FileNotFoundError.
        if os.path.exists("matrix.npy"):
            os.remove("matrix.npy")
def calcDistMatrix(coordsets):
    """Return the condensed pairwise distance data for *coordsets*.

    The matrix is built by pyRMSD's MatrixHandler with the
    'NOSUP_SERIAL_CALCULATOR' calculator; the raw data of the resulting
    matrix object (its ``get_data()``) is returned.
    """
    from pyRMSD.matrixHandler import MatrixHandler

    handler = MatrixHandler()
    condensed = handler.createMatrix(coordsets, 'NOSUP_SERIAL_CALCULATOR')
    return condensed.get_data()
conforms, masses, radii, models_name = get_pdbs_coordinates( args.path, idfile_A, idfile_B) else: args.extension = "rmf3" ps_names, masses, radii, conforms, models_name = get_rmfs_coordinates( args.path, idfile_A, idfile_B, args.subunit) print "Size of conformation matrix", conforms.shape if not args.skip_sampling_precision: inner_data = get_rmsds_matrix(conforms, args.mode, args.align, args.cores) print "Size of RMSD matrix (flattened):", inner_data.shape import pyRMSD.RMSDCalculator from pyRMSD.matrixHandler import MatrixHandler mHandler = MatrixHandler() mHandler.loadMatrix("Distances_Matrix.data") rmsd_matrix = mHandler.getMatrix() distmat = rmsd_matrix.get_data() distmat_full = sp.spatial.distance.squareform(distmat) print "Size of RMSD matrix (unpacked, N x N):", distmat_full.shape # Get model lists sampleA_all_models, sampleB_all_models = get_sample_identity( idfile_A, idfile_B) total_num_models = len(sampleA_all_models) + len(sampleB_all_models) all_models = sampleA_all_models + sampleB_all_models print "Size of Sample A:", len( sampleA_all_models), " ; Size of Sample B: ", len(
def process(self, args=None):
    """Accumulate per-atom-type grids over all models and save them to HDF5.

    Every rank walks the models in ``self.aplist``, extracts each one with
    ``er.PepExtractor`` and adds the grid of every non-hydrogen atom (as
    returned by ``gu.process_atom``) to the grid of its atom type. The
    per-rank grids are then summed on rank 0, which thresholds and
    normalizes them and writes one dataset per atom type, plus 'step',
    'origin' and 'atypes', to the file named by ``args['Sfn']``.

    :param args: Dictionary of options; ``self.args`` is used when it is
        None or empty. It is passed as keyword arguments to
        ``er.PepExtractor`` and must contain 'Sfn' and 'atypes'. The key
        'mpi' is set on it (the dictionary is modified in place).
    """
    if not args:
        args = self.args

    NUCS = self.atypes
    iNUCS = {name: index for index, name in enumerate(NUCS)}
    iNUCS = self.mpi.comm.bcast(iNUCS)
    lnucs = len(NUCS)

    # Builtin float/int are used as dtypes: the np.float / np.int aliases
    # they replace were removed in NumPy 1.24 and meant exactly the same.
    fNUCS = np.zeros((lnucs, ), dtype=float)

    # Init storage for matrices: one accumulation grid per atom type.
    tSf = dict()
    for i in NUCS:
        tSf[i] = np.zeros(self.N, dtype=float)

    args['mpi'] = self.mpi
    extractor = er.PepExtractor(**args)

    lM = len(self.aplist)
    self.mpi.comm.Barrier()

    tota_ = 0   # atoms successfully added to a grid on this rank
    totba_ = 0  # atoms that could not be added on this rank

    for cm in range(lM):
        m = self.aplist[cm]
        print('Rank %d model %d of %d' % (self.mpi.rank, cm, lM))

        try:
            S = extractor.extract_result(m)
        except Exception:
            # Best effort: a model that cannot be extracted is skipped.
            print('ERROR: BAD PEPTIDE: %s' % m)
            continue

        lS = S.numCoordsets()
        tlS = range(lS)

        if self.cluster is True:
            # Keep only the affinity-propagation cluster centres, using the
            # negated squared distance as the similarity.
            resc = S.select('not element H').getCoordsets()
            cl = 'NOSUP_SERIAL_CALCULATOR'
            mHandler = MatrixHandler()
            matrix = mHandler.createMatrix(resc, cl)
            mat = scipy.spatial.distance.squareform(matrix.get_data())
            smatrix = (mat**2) * (-1)
            aff = AffinityPropagation(affinity='precomputed')
            aff_cluster = aff.fit(smatrix)
            tlS = aff_cluster.cluster_centers_indices_

            if tlS is None:
                continue

        for S_ in tlS:
            S.setACSIndex(S_)

            for a in S.iterAtoms():
                # skip hydrogens
                if a.getElement() == 'H':
                    continue

                try:
                    atype = self.rtypes[(a.getResname(), a.getName())]
                except KeyError:
                    print('ATYPE not found', a.getResname(), a.getName())
                    # Skip the atom: falling through would reuse the type
                    # of the previous atom (or hit an unbound name on the
                    # very first atom).
                    continue

                if atype not in self.atypes:
                    continue

                Agrid, AminXYZ = gu.process_atom(a, self.step)

                # Offset of the atom grid inside the global grid, in cells.
                adj = (AminXYZ - self.GminXYZ)
                adj = (adj / self.step).astype(int)
                x, y, z = adj

                try:
                    tSf[atype][x:x + Agrid.shape[0],
                               y:y + Agrid.shape[1],
                               z:z + Agrid.shape[2]] += Agrid
                    fNUCS[iNUCS[atype]] += 1
                    tota_ += 1
                except Exception:
                    # The atom grid could not be added (e.g. it does not
                    # fit in the global grid); count it and move on.
                    totba_ += 1

    self.mpi.comm.Barrier()

    if self.mpi.rank == 0:
        print('Collecting grids')

    fNUCS_ = self.mpi.comm.allreduce(fNUCS)
    nNUCS = np.zeros((lnucs, ), dtype=float)

    # reduce() only delivers the totals on the root rank.
    tota = self.mpi.comm.reduce(tota_)
    totba = self.mpi.comm.reduce(totba_)

    for i in range(lnucs):
        NUC_ = NUCS[i]

        if self.mpi.rank != 0:
            self.mpi.comm.Send(tSf[NUC_], dest=0, tag=i)
        else:
            for j in range(1, self.mpi.NPROCS):
                tG = np.empty(tSf[NUC_].shape, dtype=float)
                self.mpi.comm.Recv(tG, source=j, tag=i)
                tSf[NUC_] += tG

            # Only the root value matters: it is broadcast just below.
            nNUCS[i] = np.max(tSf[NUC_])

    nNUCS_ = self.mpi.comm.bcast(nNUCS)
    self.mpi.comm.Barrier()

    # Allocate results file
    Sfn = args['Sfn']

    if self.mpi.rank == 0:
        print('Saving data')
        nmax = bn.nanmax(np.divide(nNUCS_, fNUCS_))

        # The context manager closes the file even if writing fails.
        with h5py.File(Sfn, 'w') as Sf:
            for i in range(lnucs):
                NUC_ = NUCS[i]
                iNUC_ = iNUCS[NUC_]
                mult = fNUCS_[iNUC_]

                if mult > 0.0:
                    tG = tSf[NUC_]
                    # Zero everything below the median, then scale by the
                    # atom count and by nmax, expressed as a percentage.
                    med = np.median(tG)
                    tG[tG < (med)] = 0
                    tG /= float(mult)
                    tG /= float(nmax)
                    tG *= 100.0
                    tSf[NUC_] = tG
                else:
                    print('Array is empty for: ', NUC_)

                Sf.create_dataset(NUC_, data=tSf[NUC_])

            Gstep = np.array([self.step, self.step, self.step], dtype=float)
            Sf.create_dataset('step', data=Gstep)
            Sf.create_dataset('origin', data=self.GminXYZ)
            Sf.create_dataset('atypes', data=np.array([
                args['atypes'],
            ], dtype='S20'))
            print('Total bad atoms %d of %d' % (totba, tota))

    self.mpi.comm.Barrier()
    self.database.close()
#import numpy as np # #for k in range(self._cool_cycle): # confs = self.confs['cool']['samples'][-1][k] # for c in range(len(confs)): # self.universe.setConfiguration(Configuration(self.universe,confs[c])) # self.universe.normalizeConfiguration() # self.confs['cool']['samples'][-1][k][c] = np.copy(self.universe.configuration().array) import itertools confs = [self.confs['cool']['samples'][-1][k] for k in range(self._cool_cycle)] confs = np.array([conf[self.molecule.heavy_atoms,:] for conf in itertools.chain.from_iterable(confs)]) from pyRMSD.matrixHandler import MatrixHandler rmsd_matrix = MatrixHandler().createMatrix(confs,'QCP_SERIAL_CALCULATOR') # NOSUP_SERIAL_CALCULATOR #GBSA_energy = [(self.cool_Es[-1][k]['LNAMD_GBSA'][:,-1]-self.cool_Es[-1][k]['LNAMD_Gas'][:,-1]) for k in range(self._cool_cycle)] #GBSA_energy = np.array(list(itertools.chain.from_iterable(GBSA_energy))) cum_Nk = np.cumsum([len(self.confs['cool']['samples'][-1][k]) for k in range(self._cool_cycle)]) # # Compute distance matrix with centering # self._write_traj('cool.dcd',confs,moiety='L') # import mdtraj as md # traj = md.load('cool.dcd',top=self._FNs['prmtop']['L']) # dist_matrix = [mdtraj.rmsd(traj,traj,frame=k,atom_indices=traj.topology.select('type!=H')) for k in range(N)] # dist_matrix = np.array(dist_matrix)
def main():
    """Run the sampling-exhaustiveness analysis for two samples of models.

    Steps, in order:

    0. Read the scores of both samples and run the score convergence tests.
    1. Read the model coordinates (PDB or RMF), compute the RMSD matrix and
       load it from 'Distances_Matrix.data'.
    2. Unless ``args.skip_sampling_precision`` is set, cluster at a series
       of cutoffs to determine the sampling precision and write
       '<sysname>.Sampling_Precision_Stats.txt'; otherwise use
       ``args.cluster_threshold``.
    3. Cluster at the final threshold, write
       '<sysname>.Cluster_Population.txt' and
       '<sysname>.Cluster_Precision.txt', and for every retained cluster
       write its member lists, density maps and cluster centre model into
       'cluster.<i>/'.
    4. If ``args.gnuplot`` is set, run every gnuplot script shipped with
       IMP.sampcon.

    All options come from ``parse_args()``.
    """
    args = parse_args()

    import os
    import shutil
    import numpy

    import scipy as sp

    import IMP.sampcon
    from IMP.sampcon import scores_convergence, clustering_rmsd
    from IMP.sampcon import rmsd_calculation, precision_rmsd

    import IMP

    idfile_A = "Identities_A.txt"
    idfile_B = "Identities_B.txt"

    # Step 0: Compute Score convergence
    with open(os.path.join(args.path, args.scoreA), 'r') as f:
        score_A = [float(line.strip("\n")) for line in f]

    with open(os.path.join(args.path, args.scoreB), 'r') as f:
        score_B = [float(line.strip("\n")) for line in f]

    scores = score_A + score_B

    # Get the convergence of the best score
    scores_convergence.get_top_scorings_statistics(scores, 0, args.sysname)

    # Check if the two score distributions are similar
    scores_convergence.get_scores_distributions_KS_Stats(
        score_A, score_B, 100, args.sysname)

    # Step 1: Compute RMSD matrix
    if args.extension == "pdb":
        ps_names = []  # bead names are not stored in PDB files
        symm_groups = None
        conforms, masses, radii, models_name = \
            rmsd_calculation.get_pdbs_coordinates(
                args.path, idfile_A, idfile_B)
    else:
        args.extension = "rmf3"
        # If we have a single RMF file, read conformations from that
        if args.rmf_A is not None:
            (ps_names, masses, radii, conforms, symm_groups, models_name,
             n_models) = rmsd_calculation.get_rmfs_coordinates_one_rmf(
                args.path, args.rmf_A, args.rmf_B, args.subunit,
                args.symmetry_groups)
        # If not, default to the Identities.txt file
        else:
            symm_groups = None
            (ps_names, masses, radii, conforms,
             models_name) = rmsd_calculation.get_rmfs_coordinates(
                args.path, idfile_A, idfile_B, args.subunit)

    print("Size of conformation matrix", conforms.shape)

    if not args.skip_sampling_precision:
        # get_rmsds_matrix modifies conforms, so save it to a file and restore
        # afterwards (so that we retain the original IMP orientation)
        numpy.save("conforms", conforms)
        inner_data = rmsd_calculation.get_rmsds_matrix(
            conforms, args.mode, args.align, args.cores, symm_groups)
        print("Size of RMSD matrix (flattened):", inner_data.shape)
        del conforms
        conforms = numpy.load("conforms.npy")
        os.unlink('conforms.npy')

    from pyRMSD.matrixHandler import MatrixHandler
    mHandler = MatrixHandler()
    mHandler.loadMatrix("Distances_Matrix.data")

    rmsd_matrix = mHandler.getMatrix()
    distmat = rmsd_matrix.get_data()

    distmat_full = sp.spatial.distance.squareform(distmat)
    print("Size of RMSD matrix (unpacked, N x N):", distmat_full.shape)

    # Get model lists
    if args.rmf_A is not None:
        sampleA_all_models = list(range(n_models[0]))
        sampleB_all_models = list(range(n_models[0],
                                        n_models[1] + n_models[0]))
        total_num_models = n_models[1] + n_models[0]
    else:
        (sampleA_all_models,
         sampleB_all_models) = clustering_rmsd.get_sample_identity(
            idfile_A, idfile_B)
        total_num_models = len(sampleA_all_models) + len(sampleB_all_models)
    all_models = list(sampleA_all_models) + list(sampleB_all_models)
    print("Size of Sample A:", len(sampleA_all_models),
          " ; Size of Sample B: ", len(sampleB_all_models),
          "; Total", total_num_models)

    if not args.skip_sampling_precision:
        print("Calculating sampling precision")

        # Step 2: Cluster at intervals of grid size to get the
        # sampling precision
        gridSize = args.gridsize

        # Get cutoffs for clustering
        cutoffs_list = clustering_rmsd.get_cutoffs_list(distmat, gridSize)
        print("Clustering at thresholds:", cutoffs_list)

        # Do clustering at each cutoff
        pvals, cvs, percents = clustering_rmsd.get_clusters(
            cutoffs_list, distmat_full, all_models, total_num_models,
            sampleA_all_models, sampleB_all_models, args.sysname)

        # Now apply the rule for selecting the right precision based
        # on population of contingency table, pvalue and cramersv
        (sampling_precision, pval_converged, cramersv_converged,
         percent_converged) = clustering_rmsd.get_sampling_precision(
            cutoffs_list, pvals, cvs, percents)

        # Output test statistics
        with open("%s.Sampling_Precision_Stats.txt"
                  % args.sysname, 'w+') as fpv:
            print(
                "The sampling precision is defined as the largest allowed "
                "RMSD between the cluster centroid and a ", args.sysname,
                "model within any cluster in the finest clustering for "
                "which each sample contributes models proportionally to "
                "its size (considering both significance and magnitude of "
                "the difference) and for which a sufficient proportion of "
                "all models occur in sufficiently large clusters. The "
                "sampling precision for our ", args.sysname,
                " modeling is %.3f" % (sampling_precision), " A.", file=fpv)

            print(
                "Sampling precision, P-value, Cramer's V and percentage "
                "of clustered models below:", file=fpv)
            print("%.3f\t%.3f\t%.3f\t%.3f"
                  % (sampling_precision, pval_converged, cramersv_converged,
                     percent_converged), file=fpv)
            print("", file=fpv)

        final_clustering_threshold = sampling_precision
    else:
        final_clustering_threshold = args.cluster_threshold

    # Perform final clustering at the required precision
    print("Clustering at threshold %.3f" % final_clustering_threshold)
    (cluster_centers, cluster_members) = clustering_rmsd.precision_cluster(
        distmat_full, total_num_models, final_clustering_threshold)

    (ctable, retained_clusters) = clustering_rmsd.get_contingency_table(
        len(cluster_centers), cluster_members, all_models,
        sampleA_all_models, sampleB_all_models)
    print("Contingency table:", ctable)

    # Output the number of models in each cluster and each sample
    with open("%s.Cluster_Population.txt" % args.sysname, 'w+') as fcp:
        for row_index, row in enumerate(ctable):
            print(row_index, row[0], row[1], file=fcp)

    # Obtain the subunits for which we need to calculate densities
    density_custom_ranges = precision_rmsd.parse_custom_ranges(args.density)

    # Output cluster precisions. The 'with' block guarantees the file is
    # closed even if processing one of the clusters fails.
    with open("%s.Cluster_Precision.txt" % args.sysname, 'w+') as fpc:
        # For each cluster, output the models in the cluster
        # Also output the densities for the cluster models
        for i, clus in enumerate(retained_clusters):
            # The cluster centroid is the first conformation.
            # We use this as to align and compute RMSD/precision
            conform_0 = conforms[all_models[cluster_members[clus][0]]]

            # create a (fresh) directory for the cluster
            if os.path.exists("./cluster.%s" % i):
                shutil.rmtree("./cluster.%s" % i)
            os.mkdir("./cluster.%s" % i)
            os.mkdir("./cluster.%s/Sample_A/" % i)
            os.mkdir("./cluster.%s/Sample_B/" % i)

            # Create densities for all subunits for both sample A and
            # sample B as well as separately.
            gmd1 = precision_rmsd.GetModelDensity(
                custom_ranges=density_custom_ranges,
                resolution=args.density_threshold, voxel=args.voxel,
                bead_names=ps_names)
            gmd2 = precision_rmsd.GetModelDensity(
                custom_ranges=density_custom_ranges,
                resolution=args.density_threshold, voxel=args.voxel,
                bead_names=ps_names)
            gmdt = precision_rmsd.GetModelDensity(
                custom_ranges=density_custom_ranges,
                resolution=args.density_threshold, voxel=args.voxel,
                bead_names=ps_names)

            # Obtain cluster precision by obtaining average RMSD of each
            # model to the cluster center
            cluster_precision = 0.0

            # Also output the identities of cluster members
            with open('cluster.' + str(i) + '.all.txt', 'w') as both_file, \
                    open('cluster.' + str(i) + '.sample_A.txt',
                         'w') as sampleA_file, \
                    open('cluster.' + str(i) + '.sample_B.txt',
                         'w') as sampleB_file:

                # Create a model with just the cluster_member particles
                model = IMP.Model()
                ps = []  # particle list to be updated by each RMF frame
                for pi in range(len(conform_0)):
                    p = IMP.Particle(model, "%s" % str(pi))
                    IMP.core.XYZ.setup_particle(p, (0, 0, 0))
                    IMP.core.XYZR.setup_particle(p, float(radii[pi]))
                    IMP.atom.Mass.setup_particle(p, float(masses[pi]))
                    ps.append(p)

                # transformation from internal pyRMSD orientation
                trans = None

                # for each model in the cluster
                for mem in cluster_members[clus]:
                    model_index = all_models[mem]

                    # get superposition of each model to cluster center
                    # and the RMSD between the two
                    if args.symmetry_groups:
                        rmsd, superposed_ps, trans = \
                            precision_rmsd.get_particles_from_superposed_amb(
                                conforms[model_index], conform_0,
                                args.align, ps, trans, symm_groups)
                    else:
                        rmsd, superposed_ps, trans = \
                            precision_rmsd.get_particles_from_superposed(
                                conforms[model_index], conform_0,
                                args.align, ps, trans)

                    model.update()  # why not?

                    cluster_precision += rmsd

                    # Add the superposed particles to the respective
                    # density maps
                    gmdt.add_subunits_density(superposed_ps)  # total map
                    print(model_index, file=both_file)

                    if model_index in sampleA_all_models:
                        # density map for sample A
                        gmd1.add_subunits_density(superposed_ps)
                        print(model_index, file=sampleA_file)
                    else:
                        # density map for sample B
                        gmd2.add_subunits_density(superposed_ps)
                        print(model_index, file=sampleB_file)

            # The centroid itself is one of the members, hence the
            # division by (cluster size - 1).
            cluster_precision /= float(len(cluster_members[clus]) - 1.0)

            print(
                "Cluster precision (average distance to cluster centroid) "
                "of cluster ", str(i), " is %.3f" % cluster_precision, "A",
                file=fpc)

            # Output density files for the cluster
            density = gmdt.write_mrc(path="./cluster.%s" % i,
                                     file_prefix="LPD")
            gmd1.write_mrc(path="./cluster.%s/Sample_A/" % i,
                           file_prefix="LPD")
            gmd2.write_mrc(path="./cluster.%s/Sample_B/" % i,
                           file_prefix="LPD")

            # Add the cluster center model RMF to the cluster directory
            cluster_center_index = cluster_members[clus][0]
            if args.rmf_A is not None:
                # Frames of sample B follow those of sample A, so indices
                # past the end of A are frames of the B file.
                if cluster_center_index < n_models[0]:
                    center_rmf = args.rmf_A
                    center_frame = cluster_center_index
                else:
                    center_rmf = args.rmf_B
                    center_frame = cluster_center_index - n_models[0]
                make_cluster_centroid(
                    os.path.join(args.path, center_rmf),
                    center_frame,
                    os.path.join("cluster.%d" % i,
                                 "cluster_center_model.rmf3"),
                    i, len(cluster_members[clus]),
                    cluster_precision, density, args.path)
            else:
                # index to Identities file.
                cluster_center_model_id = all_models[cluster_center_index]
                outfname = os.path.join(
                    "cluster.%d" % i,
                    "cluster_center_model." + args.extension)
                if 'rmf' in args.extension:
                    make_cluster_centroid(
                        models_name[cluster_center_model_id], 0, outfname,
                        i, len(cluster_members[clus]),
                        cluster_precision, density, args.path)
                else:
                    shutil.copy(models_name[cluster_center_model_id],
                                outfname)

    # generate plots for the score and structure tests
    if args.gnuplot:
        import subprocess
        import glob

        gnuplotdir = IMP.sampcon.get_data_path("gnuplot_scripts")
        for filename in sorted(glob.glob(os.path.join(gnuplotdir, "*.plt"))):
            cmd = ['gnuplot', '-e', 'sysname="%s"' % args.sysname, filename]
            print(" ".join(cmd))
            subprocess.check_call(cmd)
"chain":ligand_file_description[1], "atoms":ligand_file_description[2:] } ligand_description = "resname %s and name %s"%(ligand["resname"],"".join( a+" " for a in ligand["atoms"])) print "* Ligand parsed: ",ligand_description ####################################################################################################################### # Generate matrix with metrics (so now we are going to cluster based on Energy and spawning ####################################################################################################################### print "* Creating Spawning - totalE matrix" records = [] processFile(traj_pdb, records, True) all_metrics = genMetrics(plots["totale_spawning"], records) matrix_data = scipy.spatial.distance.pdist(normalize_metrics(all_metrics), 'euclidean') m_handler = MatrixHandler() m_handler.distance_matrix = CondensedMatrix(matrix_data) matrix_file = os.path.join(base_dir, TENERGY_SPAWN_MATRIX) m_handler.saveMatrix(matrix_file) ####################################################################################################################### # Cluster by metrics ####################################################################################################################### print "* Spawning - totalE clustering" be_rmsd_clustering_script_path = os.path.join(base_dir, 'scripts', CLUSTERING_SPAWN_TOTE_SCRIPT) working_directory = os.path.join(base_dir, TOTALE_SPAWN_WORKSPACE) params = load_dic_in_json(be_rmsd_clustering_script_path) params['global']['workspace']['base'] = working_directory params['data']['files'] = [os.path.join(os.getcwd(), traj_pdb)] params['data']['matrix']['parameters']['path'] = matrix_file save_dic_in_json(params, be_rmsd_clustering_script_path)