def main(args, metric):
    check_paths(args)

    if args.alg == 'sclarans' and args.stride != 1:
        logger.error("""You don't want to use a stride with sclarans. The
whole point of sclarans is to use a shrink multiple to accomplish the same
purpose, but in parallel with stochastic subsampling. If you can't fit all
your frames into memory at the same time, maybe you could stride a little
at the beginning, but it's not recommended.""")
        sys.exit(1)

    trajs = load_trajectories(args.project, args.stride)
    logger.info('Loaded %d trajs', len(trajs))

    clusterer = cluster(metric, trajs, args)

    if not isinstance(clusterer, clustering.Hierarchical):
        generators = clusterer.get_generators_as_traj()
        logger.info('Saving %s', args.generators)
        generators.save_to_lhdf(args.generators)

        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()

            logger.info('Since stride=1, Saving %s', args.assignments)
            logger.info('Since stride=1, Saving %s', args.distances)
            io.saveh(args.assignments, assignments)
            io.saveh(args.distances, distances)
def run(tProb, observable, init_pops=None, num_vecs=10, output='evec_amps.h5'):
    if init_pops is None:
        init_pops = np.ones(tProb.shape[0]).astype(float) / float(tProb.shape[0])
    else:
        init_pops = init_pops.astype(float)
        init_pops /= init_pops.sum()

    assert observable.shape[0] == init_pops.shape[0]
    assert observable.shape[0] == tProb.shape[0]

    try:
        f = io.loadh('eigs%d.h5' % num_vecs)
        vals = f['vals']
        vecsL = f['vecs']
    except (IOError, KeyError):
        # no cached eigenvectors on disk; compute and cache them
        vals, vecsL = msm_analysis.get_eigenvectors(tProb, num_vecs + 1, right=False)
        io.saveh('eigs%d.h5' % num_vecs, vals=vals, vecs=vecsL)

    equil = vecsL[:, 0] / vecsL[:, 0].sum()
    dyn_vecsL = vecsL[:, 1:]

    # normalize the left and right eigenvectors
    dyn_vecsL /= np.sqrt(np.sum(dyn_vecsL * dyn_vecsL / np.reshape(equil, (-1, 1)), axis=0))
    dyn_vecsR = dyn_vecsL / np.reshape(equil, (-1, 1))

    amps = dyn_vecsL.T.dot(observable) * dyn_vecsR.T.dot(init_pops)

    io.saveh(output, evals=vals[1:], amplitudes=amps)
    logger.info("saved output to %s" % output)
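# A hedged usage sketch for `run` above, assuming a transition matrix stored
# in MatrixMarket format and an observable saved as a plain-text vector with
# one value per state; both filenames are hypothetical.
import numpy as np
from scipy.io import mmread

tProb = mmread('tProb.mtx')                # (n_states, n_states) transition matrix
observable = np.loadtxt('observable.dat')  # one value per state
run(tProb, observable, num_vecs=10, output='evec_amps.h5')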
def save_container(filename, dtype):
    # minus_ones, n_vtrajs, and hashes come from the enclosing scope
    io.saveh(filename,
             arr_0=np.array(minus_ones, dtype=dtype),
             completed_vtrajs=np.zeros((n_vtrajs), dtype=np.bool),
             hashes=hashes)
def main(args, metric):
    if args.alg == "sclarans" and args.stride != 1:
        logger.error(
            """You don't want to use a stride with sclarans. The whole point
of sclarans is to use a shrink multiple to accomplish the same purpose, but
in parallel with stochastic subsampling. If you can't fit all your frames
into memory at the same time, maybe you could stride a little at the
beginning, but it's not recommended."""
        )
        sys.exit(1)

    # if we have a metric that explicitly operates on a subset of indices,
    # then we provide the option to only load those indices into memory
    # WARNING: I also do something a bit dirty, and inject `None` for the
    # RMSD.atomindices to get the metric to not splice
    if isinstance(metric, metrics.RMSD):
        atom_indices = metric.atomindices
        metric.atomindices = None  # probably bad...
        logger.info("RMSD metric - loading only the atom indices required")
    else:
        atom_indices = None

    # In case the clustering algorithm needs extra arguments, use
    # this dictionary
    extra_kwargs = {}

    # Check to be sure we won't overwrite any data
    if args.alg == "hierarchical":
        zmatrix_fn = os.path.join(args.output_dir, "ZMatrix.h5")
        die_if_path_exists(zmatrix_fn)
        extra_kwargs["zmatrix_fn"] = zmatrix_fn
    else:
        generators_fn = os.path.join(args.output_dir, "Gens.lh5")
        die_if_path_exists(generators_fn)
        if args.stride == 1:
            assignments_fn = os.path.join(args.output_dir, "Assignments.h5")
            distances_fn = os.path.join(args.output_dir, "Assignments.h5.distances")
            die_if_path_exists([assignments_fn, distances_fn])

    trajs = load_trajectories(args.project, args.stride, atom_indices)
    logger.info("Loaded %d trajs", len(trajs))

    clusterer = cluster(metric, trajs, args, **extra_kwargs)

    if not isinstance(clusterer, clustering.Hierarchical):
        generators = clusterer.get_generators_as_traj()
        logger.info("Saving %s", generators_fn)
        generators.save_to_lhdf(generators_fn)

        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()
            logger.info("Since stride=1, Saving %s", assignments_fn)
            logger.info("Since stride=1, Saving %s", distances_fn)
            io.saveh(assignments_fn, assignments)
            io.saveh(distances_fn, distances)
def save(self, output):
    """Save the results to file.

    Parameters
    ----------
    output : str
        output filename (.h5)
    """
    io.saveh(output, timelag_corr_mat=self.timelag_corr_mat,
             cov_mat=self.cov_mat, lag=np.array([self.lag]),
             vals=self.vals, vecs=self.vecs)
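# A minimal loading counterpart to `save` above -- a sketch assuming the same
# key names used in the io.saveh call; 'results.h5' is a hypothetical path.
from msmbuilder import io

f = io.loadh('results.h5')
timelag_corr_mat = f['timelag_corr_mat']
cov_mat = f['cov_mat']
lag = int(f['lag'][0])  # the lag was wrapped in a length-1 array on save
vals, vecs = f['vals'], f['vecs']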
def save_to_disk(self, filename):
    """Save this clusterer to disk.

    This is useful because computing the Z-matrix (done in __init__) is the
    most expensive part, and assigning is cheap.

    Parameters
    ----------
    filename : str
        location to save to

    Raises
    ------
    Exception if something already exists at `filename`
    """
    io.saveh(filename, z_matrix=self.Z, traj_lengths=self.traj_lengths)
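# A hedged usage sketch pairing `save_to_disk` with the `load_from_disk` and
# `get_assignments` calls that appear elsewhere in this collection; the
# filename, the `clusterer` instance, and the state count are hypothetical.
from msmbuilder.clustering import Hierarchical

clusterer.save_to_disk('Data/ZMatrix.h5')  # expensive Z-matrix, computed once
hierarchical = Hierarchical.load_from_disk('Data/ZMatrix.h5')
assignments = hierarchical.get_assignments(k=100, cutoff_distance=None)  # cheap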
def save_to_hdf(self, filename):
    """Save a Trajectory instance to an HDF file.

    First, remove the IndexList key, which is not written with the other
    arrays; it is restored afterwards. This file format is roughly
    equivalent to an XTC and should have comparable file sizes but with
    better IO performance.

    Parameters
    ----------
    filename : str
        location to save to
    """
    indexlist = self.pop('IndexList', None)
    io.saveh(filename, **self)
    self['IndexList'] = indexlist
def save_to_lhdf(self, filename, precision=DEFAULT_PRECISION):
    """Save a Trajectory instance to a Lossy HDF file.

    First, remove the XYZList key because it should be written using the
    special CArray operation. This file format is roughly equivalent to an
    XTC and should have comparable file sizes but with better IO performance.

    Parameters
    ----------
    filename : str
        location to save to
    precision : float, optional
        precision to save xyzlist
    """
    self.pop('IndexList')
    xyzlist = self.pop('XYZList')
    rounded = _convert_to_lossy_integers(xyzlist, precision)
    self['XYZList'] = rounded
    io.saveh(filename, **self)
    self['XYZList'] = xyzlist
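# `_convert_to_lossy_integers` is not shown here. A minimal sketch of the
# fixed-point scheme its name and the `precision` argument suggest (an
# assumption, not the verified implementation):
import numpy as np

def _convert_to_lossy_integers_sketch(xyzlist, precision):
    # Store precision * x rounded to the nearest integer; dividing by
    # `precision` on load recovers coordinates to within ~1/precision.
    return np.array(np.rint(xyzlist * precision), dtype=np.int32)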
def main(args, metric):
    check_paths(args)

    if args.alg == 'sclarans' and args.stride != 1:
        logger.error("""You don't want to use a stride with sclarans. The
whole point of sclarans is to use a shrink multiple to accomplish the same
purpose, but in parallel with stochastic subsampling. If you can't fit all
your frames into memory at the same time, maybe you could stride a little
at the beginning, but it's not recommended.""")
        sys.exit(1)

    # if we have a metric that explicitly operates on a subset of indices,
    # then we provide the option to only load those indices into memory
    # WARNING: I also do something a bit dirty, and inject `None` for the
    # RMSD.atomindices to get the metric to not splice
    if isinstance(metric, metrics.RMSD):
        atom_indices = metric.atomindices
        metric.atomindices = None  # probably bad...
        logger.info('RMSD metric - loading only the atom indices required')
    else:
        atom_indices = None

    trajs = load_trajectories(args.project, args.stride, atom_indices)
    logger.info('Loaded %d trajs', len(trajs))

    clusterer = cluster(metric, trajs, args)

    if not isinstance(clusterer, clustering.Hierarchical):
        generators = clusterer.get_generators_as_traj()
        logger.info('Saving %s', args.generators)
        generators.save_to_lhdf(args.generators)

        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()
            logger.info('Since stride=1, Saving %s', args.assignments)
            logger.info('Since stride=1, Saving %s', args.distances)
            io.saveh(args.assignments, assignments)
            io.saveh(args.distances, distances)
def main_extract(args):
    "main method for the extract subcommand"
    project = Project.load_from(args.project_info)
    close = int(args.close)
    stride = int(args.stride)
    if args.far < 0:
        far = None
    else:
        far = args.far

    die_if_path_exists(args.output)

    if args.extract_method == 'rmsd':
        atomindices = np.loadtxt(args.atomindices, dtype=int)
        AtoB, AtoC = triplets.extract_rmsd(project, close, stride,
                                           atomindices, far)
    elif args.extract_method == 'dihedral':
        if 'types' in args:
            AtoB, AtoC = triplets.extract_dihedral(project, close, stride,
                                                   types=args.types, far=far)
        else:
            indices = np.loadtxt(args.indices, dtype=int)
            AtoB, AtoC = triplets.extract_dihedral(project, close, stride,
                                                   indices=indices, far=far)
    elif args.extract_method == 'recipcontact':
        AtoB, AtoC = triplets.extract_recipcontact(project, close, stride,
                                                   far=far)
    elif args.extract_method == 'drmsd':
        indices = np.loadtxt(args.indices, dtype=int)
        AtoB, AtoC, atom_pairs = triplets.extract_drmsd(project, close, stride,
                                                        indices=indices, far=far)
        io.saveh(args.output, atom_pairs=atom_pairs)
    else:
        raise NotImplementedError("Sorry, we don't have that metric")

    #Serializer({'AtoB': AtoB, 'AtoC': AtoC,
    #            'metric': args.extract_method}).SaveToHDF(args.output)
    io.saveh(args.output, AtoB=AtoB, AtoC=AtoC,
             metric=np.array(list(args.extract_method)))
    print 'Saved triplets to {}'.format(args.output)
not_too_short_inds = np.where(traj_lens >= (args.min_length + args.trim_first))[0]

os.mkdir(os.path.join(args.write_dir, 'Trajectories'))

print "Will limit this project to %d trajectories." % len(not_too_short_inds)

for i in xrange(len(not_too_short_inds)):
    print "Copying trajectory %d -> %d (length=%d)" % (
        not_too_short_inds[i], i,
        Proj.traj_lengths[not_too_short_inds[i]] - args.trim_first)
    trj0 = tables.openFile(Proj.traj_filename(not_too_short_inds[i]))
    trj1 = tables.openFile(os.path.abspath(os.path.join(
        args.write_dir, 'Trajectories', '%s%d%s' % ('trj', i, '.lh5'))), 'w')
    #os.system('ln -s %s %s' % (trj0, trj1))
    #os.symlink(trj0, trj1)
    for n0 in trj0.iterNodes('/'):
        if n0.name != 'XYZList':
            trj0.copyNode(where='/', name=n0.name, newparent=trj1.root)
        else:
            temp_ary = n0[args.trim_first:]
            io.saveh(trj1, XYZList=temp_ary)
    trj0.close()
    trj1.close()

new_records = {
    'conf_filename': Proj.conf_filename.split('/')[-1],
    'traj_lengths': Proj.traj_lengths[not_too_short_inds] - args.trim_first,
    # This works because they're named relatively and they are re-numbered
    'traj_paths': Proj._traj_paths[:len(not_too_short_inds)],
    'traj_converted_from': Proj._traj_converted_from[not_too_short_inds],
    'traj_errors': Proj._traj_errors[not_too_short_inds],
}

new_proj_dir = args.write_dir

# Copy the trajectories
New_Proj = Project(new_records, project_dir=new_proj_dir)
New_Proj.save(os.path.join(args.write_dir, 'ProjectInfo.yaml'))
for i, ass in enumerate(ass_chunk):
    if ass == -1:
        continue
    CMs_1d[ass] += ptrj_chunk[i]

# StateAssigns = np.array([np.where(Ass == i)[0].shape[0]
#                          for i in np.unique(Ass[np.where(Ass >= 0)])])
StateAssigns = np.bincount(Ass[np.where(Ass != -1)], minlength=Ass.max() + 1)
StateAssigns = StateAssigns.reshape((len(StateAssigns), 1))
AvgCMs_1d = CMs_1d / StateAssigns

num_donors = len(HB.last_donor_ainds)
num_acceptors = len(HB.last_acceptor_ainds)
CMs = AvgCMs_1d.reshape((-1, num_donors, num_acceptors), order="C")
triples = HB.get_angle_list()

io.saveh(args.out_cm, donor_h_acceptor_ainds=triples, HB_maps=np.array(CMs))

# CMs = [avg_cm.reshape((num_donors, num_acceptors), order='C')
#        for avg_cm in AvgCMs_1d]
num_acceptors = len(np.unique(triples[:, 2]))
num_donors = len(np.unique(triples[:, 0]))

donor_ainds = np.unique(triples[:, 0])
donorH_ainds = np.unique(triples[:, 1])
acceptor_ainds = np.unique(triples[:, 2])

donor_atomnames = pdb["AtomNames"][donor_ainds]
donorH_atomnames = pdb["AtomNames"][donorH_ainds]
acceptor_atomnames = pdb["AtomNames"][acceptor_ainds]
donor_resnames = pdb["ResidueNames"][donor_ainds]
def main(args, metric):
    if args.alg == 'sclarans' and args.stride != 1:
        logger.error("""You don't want to use a stride with sclarans. The
whole point of sclarans is to use a shrink multiple to accomplish the same
purpose, but in parallel with stochastic subsampling. If you can't fit all
your frames into memory at the same time, maybe you could stride a little
at the beginning, but it's not recommended.""")
        sys.exit(1)

    # if we have a metric that explicitly operates on a subset of indices,
    # then we provide the option to only load those indices into memory
    # WARNING: I also do something a bit dirty, and inject `None` for the
    # RMSD.atomindices to get the metric to not splice
    if isinstance(metric, metrics.RMSD):
        atom_indices = metric.atomindices
        metric.atomindices = None  # probably bad...
        logger.info('RMSD metric - loading only the atom indices required')
    else:
        atom_indices = None

    # In case the clustering algorithm needs extra arguments, use
    # this dictionary
    extra_kwargs = {}

    # Check to be sure we won't overwrite any data
    if args.alg == 'hierarchical':
        zmatrix_fn = os.path.join(args.output_dir, 'ZMatrix.h5')
        die_if_path_exists(zmatrix_fn)
        extra_kwargs['zmatrix_fn'] = zmatrix_fn
    else:
        generators_fn = os.path.join(args.output_dir, 'Gens.lh5')
        die_if_path_exists(generators_fn)
        if args.stride == 1:
            assignments_fn = os.path.join(args.output_dir, 'Assignments.h5')
            distances_fn = os.path.join(args.output_dir, 'Assignments.h5.distances')
            die_if_path_exists([assignments_fn, distances_fn])

    project = Project.load_from(args.project)

    if isinstance(metric, metrics.Vectorized) and args.alg != 'hierarchical':
        # if the metric is vectorized then we can load prepared trajectories,
        # which may allow for better memory efficiency
        ptrajs, which = load_prep_trajectories(project, args.stride,
                                               atom_indices, metric)
        trajectories = None
        n_trajs = len(ptrajs)
        num_frames = np.sum([len(p) for p in ptrajs])
        if num_frames != len(which):
            raise Exception("something went wrong in loading step (%d v %d)" %
                            (num_frames, len(which)))
    else:
        trajectories = load_trajectories(project, args.stride, atom_indices)
        ptrajs = None
        which = None
        n_trajs = len(trajectories)

    logger.info('Loaded %d trajs', n_trajs)

    clusterer = cluster(metric, trajectories, ptrajs, args, **extra_kwargs)

    if not isinstance(clusterer, clustering.Hierarchical):
        if isinstance(metric, metrics.Vectorized):
            gen_inds = clusterer.get_generator_indices()
            generators = project.load_frame(which[gen_inds, 0], which[gen_inds, 1])
        else:
            generators = clusterer.get_generators_as_traj()
        logger.info('Saving %s', generators_fn)
        generators.save_to_lhdf(generators_fn)

        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()
            logger.info('Since stride=1, Saving %s', assignments_fn)
            logger.info('Since stride=1, Saving %s', distances_fn)
            io.saveh(assignments_fn, assignments)
            io.saveh(distances_fn, distances)
===============================================================================
This script is deprecated and will be removed in v2.7
Please use CalculateProjectDistance.py
===============================================================================
"""

parser = arglib.ArgumentParser(description="""
Calculate the RMSD between an input PDB and all conformations in your project.
Output as an HDF5 file (load using msmbuilder.io.loadh())
""" + deprecationmessage)
warnings.warn(deprecationmessage, DeprecationWarning)
parser.add_argument('pdb')
parser.add_argument('atom_indices', help='Indices of atoms to compare',
                    default='AtomIndices.dat')
parser.add_argument('output', help='''Output file name. Output is an .h5 file
    with RMSD entries corresponding to the Assignments.h5 file.''',
    default='Data/RMSD.h5')
parser.add_argument('project')
args = parser.parse_args()

arglib.die_if_path_exists(args.output)

project = Project.load_from(args.project)
pdb = Trajectory.load_trajectory_file(args.pdb)
atom_indices = np.loadtxt(args.atom_indices).astype(int)

distances = run(project, pdb, atom_indices)
io.saveh(args.output, distances)
logger.info('Saved to %s', args.output)
args = parser.parse_args()

traj_fns = [os.path.join(os.path.dirname(args.data_list), fn)
            for fn in np.loadtxt(args.data_list, dtype=str)]
traj = np.concatenate([np.load(fn) for fn in traj_fns])

metric = EuclideanMetric()

gen_ids, ass_gen_ids, distances = clustering._kcenters(
    metric, traj, k=args.num_states, seed=np.random.randint(len(traj)))
ass_gen_ids = np.array([ass_gen_ids])

if not os.path.exists(args.output_dir):
    os.mkdir(args.output_dir)

# re-index the assignments so that states are numbered contiguously from zero
ass_contig = np.ones(ass_gen_ids.shape) * -1
ass_contig = ass_contig.astype(int)
for i, j in enumerate(gen_ids):
    ass_contig[np.where(ass_gen_ids == j)] = i
ass_contig = ass_contig.reshape((len(traj_fns), -1))

np.savetxt(os.path.join(args.output_dir, 'gen_ids.dat'), gen_ids)
np.save(os.path.join(args.output_dir, 'gens.npy'), traj[gen_ids])
io.saveh(os.path.join(args.output_dir, 'Assignments.h5'), ass_contig)
io.saveh(os.path.join(args.output_dir, 'Assignments.h5.distances'),
         np.array([distances]))

print "Saved output to %s" % args.output_dir
import numpy as np
from matplotlib import mlab
from msmbuilder import io

ff = "amber96"

# NOTE: only the last of these assignments takes effect; the earlier values
# appear to be leftovers from other datasets
#num_frames = 11250
num_frames = 225001
num_frames = 291622
num_frames = 295189
num_frames = 41250

data = []
for i in xrange(num_frames):
    print(i)
    d = mlab.csv2rec("%s/production/pdbs/frame%d.pdb.cs" % (ff, i))
    data.append(d["shift"])

data = np.array(data)

io.saveh("%s/shifts.h5" % ff, data)
np.savetxt("./%s/shifts_atoms.txt" % ff, d["atomname"], "%s")
def save_container(filename, dtype):
    # -1 marks frames that have not been assigned yet
    io.saveh(filename,
             arr_0=-1 * np.ones((project.n_trajs, np.max(project.traj_lengths)),
                                dtype=dtype),
             completed_trajs=np.zeros((project.n_trajs), dtype=np.bool))
discards (expensive!) data, so should only be used if an optimal clustering
is not available.

Note: Check your cluster sizes with CalculateClusterRadii.py to get a handle
on how big they are before you trim. Recall the radius is the *average*
distance to the generator; here you are enforcing the *maximum* distance.

Output: A trimmed assignments file (Assignments.Trimmed.h5).""")
parser.add_argument('assignments', default='Data/Assignments.Fixed.h5')
parser.add_argument('distances', default='Data/Assignments.h5.distances')
parser.add_argument('rmsd_cutoff', help="""distance value at which to trim.
    Data further than this value from its generator will be discarded. Note:
    this is measured with whatever distance metric you used to cluster""",
    type=float)
parser.add_argument('output', default='Data/Assignments.Trimmed.h5')
args = parser.parse_args()

arglib.die_if_path_exists(args.output)

try:
    assignments = io.loadh(args.assignments, 'arr_0')
    distances = io.loadh(args.distances, 'arr_0')
except KeyError:
    assignments = io.loadh(args.assignments, 'Data')
    distances = io.loadh(args.distances, 'Data')

trimmed = run(assignments, distances, args.rmsd_cutoff)

io.saveh(args.output, trimmed)
logger.info('Saved output to %s', args.output)
num_confs = len(glob.glob("/%s/frame*.pdb" % in_directory))

all_shifts = []
all_errs = []
for i in xrange(num_confs):
    print(i)
    x = np.loadtxt("%s/frame%d_pred.tab" % (in_directory, i), 'str', skiprows=27)
    shifts = x[:, 4].astype('float')
    errs = x[:, -1].astype('float')
    res_id = x[:, 0].astype('int')
    atom_name = x[:, 2]
    all_shifts.append(shifts)
    all_errs.append(errs)

all_shifts = np.array(all_shifts)
all_errs = np.array(all_errs)
mean_errs = all_errs.mean(0)

atom_name[atom_name == "HN"] = "H"
res_id -= 1

io.saveh("%s/shifts.h5" % out_directory, all_shifts)
np.savetxt("%s/shifts_errs.dat" % out_directory, mean_errs)
np.savetxt("%s/shifts_atoms.txt" % out_directory, atom_name, "%s")
np.savetxt("%s/shifts_resid.dat" % out_directory, res_id, "%d")
#!/usr/bin/env python

from msmbuilder import arglib, io, tICA, Project

parser = arglib.ArgumentParser(get_basic_metric=True)
parser.add_argument('project')
parser.add_argument('output')
parser.add_argument('stride', type=int, default=1)

args, metric = parser.parse_args()

project = Project.load_from(args.project)
arglib.die_if_path_exists(args.output)
stride = int(args.stride)

cov_mat = tICA.CovarianceMatrix(0)  # lag of zero

for i in xrange(project.n_trajs):
    print "Working on Trajectory %d" % i
    ptraj = metric.prepare_trajectory(project.load_traj(i, stride=stride))
    if i == 0:
        cov_mat.set_size(ptraj.shape[1])
    cov_mat.train(ptraj)

print "Saving matrix to %s" % args.output
io.saveh(args.output, covariance_matrix=cov_mat.get_current_estimate())
import sys

import numpy as np
import scipy.sparse
import matplotlib.pyplot as pp

from msmbuilder import io, msm_analysis, MSMLib
from bayesmutant import SimpleMutantSampler

wt_tprob = np.loadtxt('cayleytree_tprob_wildtype.dat')
mutant_tprob = np.loadtxt('cayleytree_tprob_mutant.dat')

base_counts = np.zeros_like(wt_tprob)
for i in range(base_counts.shape[0]):
    base_counts[i] = np.random.multinomial(200, wt_tprob[i])

print 'base counts'
print base_counts

ms = SimpleMutantSampler(base_counts, mutant_tprob)
ms.step(5000)
#print ms.eff_counts()
#print 'observed counts'
#print ms.counts

io.saveh('sampling2.h5', base_counts=base_counts, samples=ms.samples,
         observed_counts=ms.counts, scores=ms.scores,
         effective_counts=ms.eff_counts(), transition_matrix=mutant_tprob)
    ptrj_chunk = get_hb(trj_chunk).astype(float)
    # this slicing behaves as you want at the end of the array
    ass_chunk = Ass[traj_ind][chunk_ind * chunk_size:(chunk_ind + 1) * chunk_size]

    for i, ass in enumerate(ass_chunk):
        if ass == -1:
            continue
        CMs_1d[ass] += ptrj_chunk[i]

# StateAssigns = np.array([np.where(Ass == i)[0].shape[0]
#                          for i in np.unique(Ass[np.where(Ass >= 0)])])
StateAssigns = np.bincount(Ass[np.where(Ass != -1)], minlength=Ass.max() + 1)
StateAssigns = StateAssigns.reshape((len(StateAssigns), 1))
AvgCMs_1d = CMs_1d / StateAssigns

io.saveh(args.out_cm, which=which, HB_maps=AvgCMs_1d)

uniq_res = np.unique(pdb["ResidueID"])
n_res = uniq_res.shape[0]

acc_res_ids = pdb["ResidueID"][which[:, 0]]
donor_res_ids = pdb["ResidueID"][which[:, 2]]

CMs = np.zeros((len(AvgCMs_1d), n_res, n_res))
CM_pdb = np.zeros((n_res, n_res))

for i in xrange(n_res):
    for j in xrange(n_res):
        if i == j:
            continue
        inds = np.where((acc_res_ids == uniq_res[i]) &
                        (donor_res_ids == uniq_res[j]))[0]
def main(coarse_val, orig_val, rcut):
    data = dict()
    data['coarse'] = dict()
    data['orig'] = dict()
    dirs = dict()
    dirs['coarse'] = './d%s' % coarse_val
    dirs['orig'] = './d%s' % orig_val
    proj = Project.load_from('ProjectInfo.yaml')
    types = ['ass', 'rmsd', 'dist', 'gens']
    for key in ['coarse', 'orig']:
        for type in types:
            if 'ass' in type:
                ass = io.loadh('%s/Data/Assignments.h5' % dirs[key])
                data[key][type] = ass['arr_0']
            elif 'dist' in type:
                ass = io.loadh('%s/Data/Assignments.h5.distances' % dirs[key])
                data[key][type] = ass['arr_0']
            elif 'rmsd' in type:
                rmsd = numpy.loadtxt('%s/Gens.rmsd.dat' % dirs[key])
                data[key][type] = rmsd
            elif 'gens' in type:
                gens = Trajectory.load_from_lhdf('%s/Gens.lh5' % dirs[key])
                data[key][type] = gens

    unboundmap = dict()
    boundmap = dict()

    # build map dict for orig to coarse unbound states; bound will stay the same
    unboundass = -1 * numpy.ones((data['orig']['ass'].shape[0],
                                  data['orig']['ass'].shape[1]), dtype=int)
    newass = -1 * numpy.ones((data['orig']['ass'].shape[0],
                              data['orig']['ass'].shape[1]), dtype=int)
    newdist = -1 * numpy.ones((data['orig']['ass'].shape[0],
                               data['orig']['ass'].shape[1]))
    for j in range(0, data['orig']['ass'].shape[0]):
        rmsd = numpy.loadtxt('Trajectories-metric/trj%s_lprmsd.dat' % j)
        frames = numpy.where(data['orig']['ass'][j] != -1)[0]
        if len(rmsd) != len(frames):
            print "trajectory mismatch"
            import pdb
            pdb.set_trace()
        for (n, i) in enumerate(data['orig']['ass'][j]):
            # if unbound
            if i != -1:
                #if data['orig']['rmsd'][i] > float(rcut):
                if rmsd[n] > float(rcut):
                    newstate = data['coarse']['ass'][j][n]
                    if data['coarse']['rmsd'][newstate] < float(rcut):
                        newass[j][n] = i
                        newdist[j][n] = data['orig']['dist'][j][n]
                    else:
                        unboundass[j][n] = newstate
                        newdist[j][n] = data['coarse']['dist'][j][n]
                else:
                    newass[j][n] = i
                    newdist[j][n] = data['orig']['dist'][j][n]

    count = 0
    unique = sorted(set(newass.flatten()))
    newass, boundmap, count = remap_ass(newass, newass, unique, count)
    unique = sorted(set(unboundass.flatten()))
    newass, unboundmap, count = remap_ass(unboundass, newass, unique, count)

    io.saveh('%s/Coarsed_r%s_d%s_Assignments.h5' % (dirs['orig'], rcut,
             coarse_val), newass)
    io.saveh('%s/Coarsed_r%s_d%s_Assignments.distances.h5' % (dirs['orig'],
             rcut, coarse_val), newdist)

    subdir = '%s/Coarsed_r%s_gen/' % (dirs['orig'], rcut)
    if not os.path.exists(subdir):
        os.mkdir(subdir)
    ohandle = open('%s/Coarsed%s_r%s_Gens.rmsd.dat' % (subdir, coarse_val, rcut), 'w')
    b = data['orig']['gens']['XYZList'].shape[1]
    c = data['orig']['gens']['XYZList'].shape[2]
    dicts = [boundmap, unboundmap]
    names = ['bound', 'unbound']
    labels = ['orig', 'coarse']
    total = len(boundmap.keys()) + len(unboundmap.keys())
    structure = proj.empty_traj()
    structure['XYZList'] = numpy.zeros((total, b, c), dtype='float32')
    count = 0
    for (name, label, mapdata) in zip(names, labels, dicts):
        print "writing coarse gen %s out of %s pdbs" % (count, len(mapdata.keys()))
        for i in sorted(mapdata.keys()):
            macro = mapdata[i]
            structure['XYZList'][count] = data[label]['gens']['XYZList'][macro]
            ohandle.write('%s\t%s\t%s\n' % (name, count, data[label]['rmsd'][macro]))
            print name, count
            count += 1
    ohandle.close()

    otraj = '%s/Coarsed%s_r%s_Gens.xtc' % (subdir, coarse_val, rcut)
    if os.path.exists(otraj):
        os.remove(otraj)
    structure.save_to_xtc(otraj)
#!/usr/bin/env python

import os
from argparse import ArgumentParser

from scipy.io import mmread

from msmbuilder import io
from msmbuilder import msm_analysis

parser = ArgumentParser()
parser.add_argument('-t', dest='tProb', help='transition matrix',
                    default='./tProb.mtx')
parser.add_argument('-o', dest='output', help='output filename',
                    default='./eigs.h5')
parser.add_argument('-n', dest='num_vecs',
                    help='number of eigenvectors to find.',
                    default=500, type=int)

args = parser.parse_args()

if os.path.exists(args.output):
    raise Exception("path (%s) exists!" % args.output)

tProb = mmread(args.tProb)

eigs = msm_analysis.get_eigenvectors(tProb, args.num_vecs)

io.saveh(args.output, vals=eigs[0], vecs=eigs[1])
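# Reading the saved spectrum back follows the io.loadh pattern used elsewhere
# in this collection; './eigs.h5' matches the script's default output path.
from msmbuilder import io

f = io.loadh('./eigs.h5')
vals, vecs = f['vals'], f['vecs']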
import os
import sys

import numpy as np
import scipy.sparse

from msmbuilder import io, msm_analysis, MSMLib
from bayesmutant import SimpleMutantSampler

P = np.loadtxt('base_transition_matrix.dat')
mutant_transition_matrix = P + 0.2 * scipy.sparse.rand(
    P.shape[0], P.shape[1], density=0.1).todense()
mutant_transition_matrix /= np.sum(mutant_transition_matrix, axis=1)

trajectory = np.array(msm_analysis.sample(P, 0, 5000))
base_counts = MSMLib.get_counts_from_traj(trajectory).todense()

print 'base counts'
print base_counts

ms = SimpleMutantSampler(base_counts, mutant_transition_matrix)
ms.step(5000)

print 'observed counts'
print ms.counts

io.saveh('sampling.h5', base_counts=base_counts, samples=ms.samples,
         observed_counts=ms.counts, scores=ms.scores,
         transition_matrix=mutant_transition_matrix)
if __name__ == '__main__':
    parser = arglib.ArgumentParser("""Calculates the Solvent Accessible Surface
Area of all atoms in a given trajectory, or for all trajectories in the
project. The output is an hdf5 file which contains the SASA for each atom in
each frame in each trajectory (or the single trajectory you passed in).""")
    parser.add_argument('project')
    parser.add_argument('atom_indices', help='Indices of atoms to calculate SASA',
                        default='all')
    parser.add_argument('output', help='''hdf5 file for output. Note this will
        be THREE dimensional: (trajectory, frame, atom), unless you just ask
        for one trajectory, in which case it will be shape (frame, atom).''',
        default='SASA.h5')
    parser.add_argument('traj_fn', help='''Pass a trajectory file if you only
        want to calculate the SASA for a single trajectory''', default='all')
    args = parser.parse_args()

    arglib.die_if_path_exists(args.output)

    if args.atom_indices.lower() == 'all':
        atom_indices = None
    else:
        atom_indices = np.loadtxt(args.atom_indices).astype(int)

    project = Project.load_from(args.project)

    SASA = run(project, atom_indices, args.traj_fn)

    io.saveh(args.output, SASA)
def main(coarse_val, orig_val, rcut):
    data = dict()
    data['coarse'] = dict()
    data['orig'] = dict()
    dirs = dict()
    dirs['coarse'] = './d%s' % coarse_val
    dirs['orig'] = './d%s' % orig_val
    proj = Project.load_from('ProjectInfo.yaml')
    types = ['ass', 'rmsd', 'dist', 'gens']
    for key in ['coarse', 'orig']:
        for type in types:
            if 'ass' in type:
                ass = io.loadh('%s/Data/Assignments.h5' % dirs[key])
                data[key][type] = ass['arr_0']
            elif 'dist' in type:
                ass = io.loadh('%s/Data/Assignments.h5.distances' % dirs[key])
                data[key][type] = ass['arr_0']
            elif 'rmsd' in type:
                rmsd = numpy.loadtxt('%s/Gens.rmsd.dat' % dirs[key])
                data[key][type] = rmsd
            elif 'gens' in type:
                gens = Trajectory.load_from_lhdf('%s/Gens.lh5' % dirs[key])
                data[key][type] = gens

    unboundmap = dict()
    boundmap = dict()
    #unboundstates = dict()
    #unboundrmsd = dict()

    # build map dict for orig to coarse unbound states; bound will stay the same
    newass = -1 * numpy.ones((data['orig']['ass'].shape[0],
                              data['orig']['ass'].shape[1]), dtype=int)
    for j in range(0, data['orig']['ass'].shape[0]):
        for (n, i) in enumerate(data['orig']['ass'][j]):
            # if unbound
            if i != -1:
                if data['orig']['rmsd'][i] > float(rcut):
                    state = data['coarse']['ass'][j][n]
                    # offset coarse states by 10000 so they can't collide
                    # with original state numbers before the remapping below
                    newass[j][n] = state + 10000
                else:
                    newass[j][n] = i

    count = 0
    unique = set(newass.flatten())
    boundmap = dict()
    unboundmap = dict()
    # remap into a fresh array so that freshly assigned labels cannot collide
    # with original labels that have not been processed yet
    remapped = -1 * numpy.ones(newass.shape, dtype=int)
    for x in unique:
        locations = numpy.where(newass == x)
        remapped[locations] = count
        if x >= 10000:
            unboundmap[count] = (x - 10000)
        else:
            boundmap[count] = x
        count += 1
    newass = remapped

    io.saveh('%s/Coarsed_r%s_Assignments.h5' % (dirs['orig'], rcut), newass)

    subdir = '%s/Coarsed_r%s_gen/' % (dirs['orig'], rcut)
    if not os.path.exists(subdir):
        os.mkdir(subdir)
    ohandle = open('%s/Coarsed%s_r%s_Gens.rmsd.dat' % (subdir, coarse_val, rcut), 'w')
    b = data['orig']['gens']['XYZList'].shape[1]
    c = data['orig']['gens']['XYZList'].shape[2]
    dicts = [boundmap, unboundmap]
    names = ['bound', 'unbound']
    labels = ['orig', 'coarse']
    total = len(boundmap.keys()) + len(unboundmap.keys())
    structure = proj.empty_traj()
    structure['XYZList'] = numpy.zeros((total, b, c), dtype='float32')
    count = 0
    for (name, label, mapdata) in zip(names, labels, dicts):
        print "writing coarse gen %s out of %s pdbs" % (count, len(mapdata.keys()))
        for i in sorted(mapdata.keys()):
            macro = mapdata[i]
            structure['XYZList'][count] = data[label]['gens']['XYZList'][macro]
            ohandle.write('%s\t%s\t%s\n' % (name, count, data[label]['rmsd'][macro]))
            print name, count
            count += 1
    ohandle.close()
    structure.save_to_xtc('%s/Coarsed%s_r%s_Gens.xtc' % (subdir, coarse_val, rcut))
def write_msm_output(self, outarray=None, filename="Assignments.h5"):
    from msmbuilder import io
    if outarray is None:
        outarray = self.assignments
    io.saveh(filename, outarray)
c = tICA.CovarianceMatrix(args.lag, tProb=tProb, populations=pops)

for i, fn in enumerate(traj_list):
    print fn
    t = np.load(fn)
    c.train(t, ass[i])
    #c.train(t)

C, Sigma = c.get_current_estimate()

# solve the generalized eigenvalue problem C v = lambda Sigma v
vals, vecs = scipy.linalg.eig(C, b=Sigma)
print vals
print vecs

io.saveh(args.out, vals=vals, vecs=vecs, C=C, Sigma=Sigma)

muller.plot_v()

ref = io.loadh('ref.h5')
ref['vecs'][:, 0] *= -1
vecs[:, 0] *= -1

plot([0, vecs[0, 0]], [0, vecs[1, 0]], color='white', lw=3)
plot([0, vecs[0, 1]], [0, vecs[1, 1]], color='white', ls='dashed', lw=3)
plot([0, ref['vecs'][0, 0]], [0, ref['vecs'][1, 0]], color='red', lw=3)
plot([0, ref['vecs'][0, 1]], [0, ref['vecs'][1, 1]], color='red', ls='dashed', lw=3)
import sys
import logging

from msmbuilder import io
from msmbuilder.clustering import Hierarchical
from msmbuilder import arglib

logger = logging.getLogger('msmbuilder.scripts.AssignHierarchical')

parser = arglib.ArgumentParser(description='Assign data using a hierarchical clustering')
parser.add_argument('hierarchical_clustering_zmatrix', default='./Data/ZMatrix.h5',
                    help='Path to hierarchical clustering zmatrix')
parser.add_argument('num_states', help='Number of States', default='none')
parser.add_argument('cutoff_distance', help='Maximum cophenetic distance',
                    default='none')
parser.add_argument('assignments', type=str)


def main(k, d, zmatrix_fn):
    hierarchical = Hierarchical.load_from_disk(zmatrix_fn)
    assignments = hierarchical.get_assignments(k=k, cutoff_distance=d)
    return assignments


if __name__ == "__main__":
    args = parser.parse_args()
    k = int(args.num_states) if args.num_states != 'none' else None
    d = float(args.cutoff_distance) if args.cutoff_distance != 'none' else None
    arglib.die_if_path_exists(args.assignments)

    if k is None and d is None:
        logger.error('You need to supply either a number of states or a cutoff distance')
        sys.exit(1)

    assignments = main(k, d, args.hierarchical_clustering_zmatrix)
    io.saveh(args.assignments, assignments)
    logger.info('Saved assignments to %s', args.assignments)
def test_save(self):
    """Save HDF5 to disk and load it back up"""
    io.saveh(self.filename2, self.data)
    TestData = io.loadh(self.filename2, 'arr_0')
    npt.assert_array_equal(TestData, self.data)
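# A minimal standalone sketch of the saveh/loadh round trip exercised by the
# test above; the filenames are hypothetical. Positional arrays are stored
# under the default key 'arr_0', keyword arrays under their own names, as the
# loadh calls throughout this collection assume.
import numpy as np
from msmbuilder import io

data = np.random.randn(10, 3)
io.saveh('example.h5', data)            # stored as 'arr_0'
io.saveh('named.h5', coordinates=data)  # stored as 'coordinates'
assert np.all(io.loadh('example.h5', 'arr_0') == data)
assert np.all(io.loadh('named.h5', 'coordinates') == data)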