Example #1
def main(args, metric):
    check_paths(args)
    
    if args.alg == 'sclarans' and args.stride != 1:
        logger.error("""You don't want to use a stride with sclarans. The whole point of
sclarans is to use a shrink multiple to accomplish the same purpose, but in parallel with
stochastic subsampling. If you can't fit all your frames into memory at the same time, maybe you
could stride a little at the beginning, but it's not recommended.""")
        sys.exit(1)
    
    trajs = load_trajectories(args.project, args.stride)
    logger.info('Loaded %d trajs', len(trajs))

    clusterer = cluster(metric, trajs, args)
    
    if not isinstance(clusterer, clustering.Hierarchical):
        generators = clusterer.get_generators_as_traj()
        logger.info('Saving %s', args.generators)
        generators.save_to_lhdf(args.generators)
        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()
            
            logger.info('Since stride=1, Saving %s', args.assignments)
            logger.info('Since stride=1, Saving %s', args.distances)
            io.saveh(args.assignments, assignments)
            io.saveh(args.distances, distances)
Example #2
def main(args, metric):
    check_paths(args)

    if args.alg == 'sclarans' and args.stride != 1:
        logger.error(
            """You don't want to use a stride with sclarans. The whole point of
sclarans is to use a shrink multiple to accomplish the same purpose, but in parallel with
stochastic subsampling. If you can't fit all your frames into memory at the same time, maybe you
could stride a little at the beginning, but it's not recommended.""")
        sys.exit(1)

    trajs = load_trajectories(args.project, args.stride)
    logger.info('Loaded %d trajs', len(trajs))

    clusterer = cluster(metric, trajs, args)

    if not isinstance(clusterer, clustering.Hierarchical):
        generators = clusterer.get_generators_as_traj()
        logger.info('Saving %s', args.generators)
        generators.save_to_lhdf(args.generators)
        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()

            logger.info('Since stride=1, Saving %s', args.assignments)
            logger.info('Since stride=1, Saving %s', args.distances)
            io.saveh(args.assignments, assignments)
            io.saveh(args.distances, distances)
Example #3
def run(tProb, observable, init_pops=None, num_vecs=10, output='evec_amps.h5'):

    if init_pops is None:
        init_pops = np.ones(tProb.shape[0]).astype(float) / float(tProb.shape[0])

    else:
        init_pops = init_pops.astype(float) 
        init_pops /= init_pops.sum()

    assert (observable.shape[0] == init_pops.shape[0])
    assert (observable.shape[0] == tProb.shape[0])
    
    # Reuse cached eigenvectors if they are already on disk; otherwise compute and cache them.
    try:
        f = io.loadh('eigs%d.h5' % num_vecs)
        vals = f['vals']
        vecsL = f['vecs']
    except (IOError, KeyError):
        vals, vecsL = msm_analysis.get_eigenvectors(tProb, num_vecs + 1, right=False)
        io.saveh('eigs%d.h5' % num_vecs, vals=vals, vecs=vecsL)

    equil = vecsL[:,0] / vecsL[:,0].sum()

    dyn_vecsL = vecsL[:, 1:]
    # normalize the left and right eigenvectors

    dyn_vecsL /= np.sqrt(np.sum(dyn_vecsL * dyn_vecsL / np.reshape(equil, (-1, 1)), axis=0))

    dyn_vecsR = dyn_vecsL / np.reshape(equil, (-1, 1))

    amps = dyn_vecsL.T.dot(observable) * dyn_vecsR.T.dot(init_pops)

    io.saveh(output, evals=vals[1:], amplitudes=amps)
    logger.info("saved output to %s" % output)
Example #4
 def save_container(filename, dtype):
     io.saveh(
         filename,
         arr_0=np.array(minus_ones, dtype=dtype),
         completed_vtrajs=np.zeros((n_vtrajs), dtype=np.bool),
         hashes=hashes,
     )
Example #5
def main(args, metric):

    if args.alg == "sclarans" and args.stride != 1:
        logger.error(
            """You don't want to use a stride with sclarans. The whole point of
sclarans is to use a shrink multiple to accomplish the same purpose, but in parallel with
stochastic subsampling. If you can't fit all your frames into memory at the same time, maybe you
could stride a little at the beginning, but it's not recommended."""
        )
        sys.exit(1)

    # if we have a metric that explicitly operates on a subset of indices,
    # then we provide the option to only load those indices into memory
    # WARNING: I also do something a bit dirty, and inject `None` for the
    # RMSD.atomindices to get the metric to not splice
    if isinstance(metric, metrics.RMSD):
        atom_indices = metric.atomindices
        metric.atomindices = None  # probably bad...
        logger.info("RMSD metric - loading only the atom indices required")
    else:
        atom_indices = None

    # In case the clustering / algorithm needs extra arguments, use
    # this dictionary
    extra_kwargs = {}

    # Check to be sure we won't overwrite any data
    if args.alg == "hierarchical":
        zmatrix_fn = os.path.join(args.output_dir, "ZMatrix.h5")
        die_if_path_exists(zmatrix_fn)
        extra_kwargs["zmatrix_fn"] = zmatrix_fn
    else:
        generators_fn = os.path.join(args.output_dir, "Gens.lh5")
        die_if_path_exists(generators_fn)
        if args.stride == 1:
            assignments_fn = os.path.join(args.output_dir, "Assignments.h5")
            distances_fn = os.path.join(args.output_dir, "Assignments.h5.distances")
            die_if_path_exists([assignments_fn, distances_fn])

    trajs = load_trajectories(args.project, args.stride, atom_indices)
    logger.info("Loaded %d trajs", len(trajs))

    clusterer = cluster(metric, trajs, args, **extra_kwargs)

    if not isinstance(clusterer, clustering.Hierarchical):
        generators = clusterer.get_generators_as_traj()
        logger.info("Saving %s", generators_fn)
        generators.save_to_lhdf(generators_fn)
        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()

            logger.info("Since stride=1, Saving %s", assignments_fn)
            logger.info("Since stride=1, Saving %s", distances_fn)
            io.saveh(assignments_fn, assignments)
            io.saveh(distances_fn, distances)
Example #6
 def save(self, output):
     """
     save the results to file
     
     Parameters
     ----------
     output : str
         output filename (.h5)
     """
     
     io.saveh(output, timelag_corr_mat=self.timelag_corr_mat,
         cov_mat=self.cov_mat, lag=np.array([self.lag]), vals=self.vals,
         vecs=self.vecs)
Example #7
    def save_to_disk(self, filename):
        """Save this clusterer to disk.

        This is useful because computing the Z-matrix
        (done in __init__) is the most expensive part, and assigning is cheap

        Parameters
        ----------
        filename : str
            location to save to

        Raises
        ------
        Exception if something already exists at `filename`
        """
        io.saveh(filename, z_matrix=self.Z, traj_lengths=self.traj_lengths)
Example #8
    def save_to_disk(self, filename):
        """Save this clusterer to disk.

        This is useful because computing the Z-matrix
        (done in __init__) is the most expensive part, and assigning is cheap

        Parameters
        ----------
        filename : str
            location to save to

        Raises
        ------
        Exception if something already exists at `filename`
        """
        io.saveh(filename, z_matrix=self.Z, traj_lengths=self.traj_lengths)
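Since the docstring notes that the Z-matrix computed in __init__ is the expensive step, the usual pattern is to persist it once with save_to_disk and then reassign cheaply at different resolutions. A short sketch of that workflow, assembled only from calls that appear elsewhere on this page (Hierarchical.load_from_disk and get_assignments in the AssignHierarchical example); the file names and the helper function here are illustrative, not part of the library:

from msmbuilder import io
from msmbuilder.clustering import Hierarchical

def reassign_at_resolution(zmatrix_fn, k):
    """Reload a saved Z-matrix and regenerate assignments for k states."""
    hierarchical = Hierarchical.load_from_disk(zmatrix_fn)    # cheap compared to __init__
    assignments = hierarchical.get_assignments(k=k, cutoff_distance=None)
    io.saveh('Assignments_k%d.h5' % k, assignments)           # stored under 'arr_0'
    return assignments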
Example #9
    def save_to_hdf(self, filename):
        """Save a Trajectory instance to a HDF File.

        First, temporarily remove the IndexList key, which is not written to
        the file.  This file format is roughly equivalent to an XTC and should
        have comparable file sizes but with better IO performance.

        Parameters
        ----------
        filename : str
            location to save to
        """
        indexlist = self.pop('IndexList', None)
        io.saveh(filename, **self)
        self['IndexList'] = indexlist
Example #10
    def save_to_lhdf(self, filename, precision=DEFAULT_PRECISION):
        """Save a Trajectory instance to a Lossy HDF File.

        First, remove the XYZList key because it should be written using the
        special CArray operation.  This file format is roughly equivalent to
        an XTC and should have comparable file sizes but with better IO performance.

        Parameters
        ----------
        filename : str
            location to save to
        precision : float, optional
            Precision to save xyzlist
        """
        self.pop('IndexList')
        
        xyzlist = self.pop('XYZList')
        rounded = _convert_to_lossy_integers(xyzlist, precision)
        self['XYZList'] = rounded
        io.saveh(filename, **self)
        self['XYZList'] = xyzlist
Example #11
    def save_to_lhdf(self, filename, precision=DEFAULT_PRECISION):
        """Save a Trajectory instance to a Lossy HDF File.

        First, remove the XYZList key because it should be written using the
        special CArray operation.  This file format is roughly equivalent to
        an XTC and should have comparable file sizes but with better IO performance.

        Parameters
        ----------
        filename : str
            location to save to
        precision : float, optional
            Precision to save xyzlist
        """
        self.pop("IndexList")

        xyzlist = self.pop("XYZList")
        rounded = _convert_to_lossy_integers(xyzlist, precision)
        self["XYZList"] = rounded
        io.saveh(filename, **self)
        self["XYZList"] = xyzlist
Example #12
def main(args, metric):
    check_paths(args)
    
    if args.alg == 'sclarans' and args.stride != 1:
        logger.error("""You don't want to use a stride with sclarans. The whole point of
sclarans is to use a shrink multiple to accomplish the same purpose, but in parallel with
stochastic subsampling. If you can't fit all your frames into memory at the same time, maybe you
could stride a little at the beginning, but it's not recommended.""")
        sys.exit(1)
        
    # if we have a metric that explicitly operates on a subset of indices,
    # then we provide the option to only load those indices into memory
    # WARNING: I also do something a bit dirty, and inject `None` for the
    # RMSD.atomindices to get the metric to not splice
    if isinstance(metric, metrics.RMSD):
        atom_indices = metric.atomindices
        metric.atomindices = None # probably bad...
        logger.info('RMSD metric - loading only the atom indices required')
    else:
        atom_indices = None
        
        
    trajs = load_trajectories(args.project, args.stride, atom_indices)
    logger.info('Loaded %d trajs', len(trajs))

    clusterer = cluster(metric, trajs, args)
    
    if not isinstance(clusterer, clustering.Hierarchical):
        generators = clusterer.get_generators_as_traj()
        logger.info('Saving %s', args.generators)
        generators.save_to_lhdf(args.generators)
        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()
            
            logger.info('Since stride=1, Saving %s', args.assignments)
            logger.info('Since stride=1, Saving %s', args.distances)
            io.saveh(args.assignments, assignments)
            io.saveh(args.distances, distances)
Example #13
def main_extract(args):
    "main method for the extract subcommand"
    project = Project.load_from(args.project_info)
    close = int(args.close)
    stride = int(args.stride)
    if args.far < 0:
        far = None
    else:
        far = args.far

    die_if_path_exists(args.output)

    if args.extract_method == 'rmsd':
        atomindices = np.loadtxt(args.atomindices, dtype=int)
        AtoB, AtoC = triplets.extract_rmsd(project, close, stride, atomindices, far)

    elif args.extract_method == 'dihedral':
        if 'types' in args:
            AtoB, AtoC = triplets.extract_dihedral(project, close, stride, types=args.types, far=far)
        else:
            indices = np.loadtxt(args.indices, dtype=int)
            AtoB, AtoC = triplets.extract_dihedral(project, close, stride, indices=indices, far=far)

    elif args.extract_method == 'recipcontact':
        AtoB, AtoC = triplets.extract_recipcontact(project, close, stride, far=far)

    elif args.extract_method == 'drmsd':
        indices = np.loadtxt(args.indices, dtype=int)
        AtoB, AtoC, atom_pairs = triplets.extract_drmsd(project, close, stride, indices=indices, far=far)
        io.saveh(args.output, atom_pairs=atom_pairs)
    else:
        raise NotImplementedError("Sorry, we don't have that metric")

    #Serializer({'AtoB': AtoB, 'AtoC': AtoC, 'metric': args.extract_method}).SaveToHDF(args.output)
    io.saveh(args.output, AtoB=AtoB, AtoC=AtoC, metric=np.array(list(args.extract_method)))
    print 'Saved triplets to {}'.format(args.output)
Example #14
not_too_short_inds = np.where( traj_lens >= ( args.min_length + args.trim_first ) )[0]

os.mkdir( os.path.join( args.write_dir, 'Trajectories' ) )
print "Will limit this project to %d trajectories." % len( not_too_short_inds )
for i in xrange( len( not_too_short_inds ) ):
   print "Copying trajectory %d -> %d (length=%d)" % ( not_too_short_inds[i], i, Proj.traj_lengths[ not_too_short_inds[i] ] - args.trim_first )
   trj0 = tables.openFile( Proj.traj_filename( not_too_short_inds[i] ) )
   trj1 = tables.openFile( os.path.abspath( os.path.join( args.write_dir, 'Trajectories', '%s%d%s'% ('trj',i, '.lh5' ) ) ), 'w' )
   #os.system( 'ln -s %s %s' % ( trj0, trj1 ) )
   #os.symlink( trj0, trj1 )
   
   for n0 in trj0.iterNodes('/'):
      if n0.name != 'XYZList':
         trj0.copyNode( where='/', name=n0.name, newparent=trj1.root )
      else:
         temp_ary = n0[ args.trim_first : ]
         io.saveh( trj1, XYZList=temp_ary )

   trj0.close()
   trj1.close()

new_records = {'conf_filename': Proj.conf_filename.split('/')[-1], 
           'traj_lengths': Proj.traj_lengths[ not_too_short_inds ] - args.trim_first,
           'traj_paths': Proj._traj_paths[ : len( not_too_short_inds ) ], # This works because they're named relatively and they are re-numbered
           'traj_converted_from': Proj._traj_converted_from[ not_too_short_inds ],
           'traj_errors': Proj._traj_errors[ not_too_short_inds ] }
new_proj_dir = args.write_dir
# Copy the trajectories
New_Proj = Project( new_records, project_dir = new_proj_dir )
New_Proj.save( os.path.join( args.write_dir, 'ProjectInfo.yaml' ) )
Example #15
                if ass == -1:
                    continue
                CMs_1d[ass] += ptrj_chunk[i]

    # StateAssigns = np.array([ np.where( Ass == i )[0].shape[0] for i in np.unique( Ass[ np.where( Ass >= 0 ) ] )] )
    StateAssigns = np.bincount(Ass[np.where(Ass != -1)], minlength=Ass.max() + 1)
    StateAssigns = StateAssigns.reshape((len(StateAssigns), 1))
    AvgCMs_1d = CMs_1d / StateAssigns

    num_donors = len(HB.last_donor_ainds)
    num_acceptors = len(HB.last_acceptor_ainds)

    CMs = AvgCMs_1d.reshape((-1, num_donors, num_acceptors), order="C")

    triples = HB.get_angle_list()
    io.saveh(args.out_cm, donor_h_acceptor_ainds=triples, HB_maps=np.array(CMs))
# CMs = [ avg_cm.reshape( (num_donors, num_acceptors ), order='C') for avg_cm in AvgCMs_1d ]

num_acceptors = len(np.unique(triples[:, 2]))
num_donors = len(np.unique(triples[:, 0]))

donor_ainds = np.unique(triples[:, 0])
donorH_ainds = np.unique(triples[:, 1])
acceptor_ainds = np.unique(triples[:, 2])


donor_atomnames = pdb["AtomNames"][donor_ainds]
donorH_atomnames = pdb["AtomNames"][donorH_ainds]
acceptor_atomnames = pdb["AtomNames"][acceptor_ainds]

donor_resnames = pdb["ResidueNames"][donor_ainds]
Example #16
def main(args, metric):
    
    if args.alg == 'sclarans' and args.stride != 1:
        logger.error("""You don't want to use a stride with sclarans. The whole point of
sclarans is to use a shrink multiple to accomplish the same purpose, but in parallel with
stochastic subsampling. If you can't fit all your frames into memory at the same time, maybe you
could stride a little at the beginning, but it's not recommended.""")
        sys.exit(1)
        
    # if we have a metric that explicitly operates on a subset of indices,
    # then we provide the option to only load those indices into memory
    # WARNING: I also do something a bit dirty, and inject `None` for the
    # RMSD.atomindices to get the metric to not splice
    if isinstance(metric, metrics.RMSD):
        atom_indices = metric.atomindices
        metric.atomindices = None # probably bad...
        logger.info('RMSD metric - loading only the atom indices required')
    else:
        atom_indices = None

    # In case the clustering / algorithm needs extra arguments, use
    # this dictionary
    extra_kwargs = {}

    # Check to be sure we won't overwrite any data 
    if args.alg == 'hierarchical':
        zmatrix_fn = os.path.join(args.output_dir, 'ZMatrix.h5')
        die_if_path_exists(zmatrix_fn)
        extra_kwargs['zmatrix_fn'] = zmatrix_fn
    else:
        generators_fn = os.path.join(args.output_dir, 'Gens.lh5') 
        die_if_path_exists(generators_fn)
        if args.stride == 1:
            assignments_fn = os.path.join(args.output_dir, 'Assignments.h5') 
            distances_fn = os.path.join(args.output_dir, 'Assignments.h5.distances')
            die_if_path_exists([assignments_fn, distances_fn])
        
    project = Project.load_from(args.project)

    if isinstance(metric, metrics.Vectorized) and not args.alg == 'hierarchical': 
        # if the metric is vectorized then
        # we can load prepared trajectories 
        # which may allow for better memory
        # efficiency
        ptrajs, which = load_prep_trajectories(project, args.stride, atom_indices, metric)
        trajectories = None
        n_trajs = len(ptrajs)

        num_frames = np.sum([len(p) for p in ptrajs])
        if num_frames != len(which):
            raise Exception("something went wrong in loading step (%d v %d)" % (num_frames, len(which)))
    else:
        trajectories = load_trajectories(project, args.stride, atom_indices)       
        ptrajs = None
        which = None
        n_trajs = len(trajectories)

    logger.info('Loaded %d trajs', n_trajs)

    clusterer = cluster(metric, trajectories, ptrajs, args, **extra_kwargs)

    if not isinstance(clusterer, clustering.Hierarchical):

        if isinstance(metric, metrics.Vectorized):
            gen_inds = clusterer.get_generator_indices()
            generators = project.load_frame(which[gen_inds,0], which[gen_inds,1])
        else:
            generators = clusterer.get_generators_as_traj()
        
        logger.info('Saving %s', generators_fn)
        generators.save_to_lhdf(generators_fn)

        if args.stride == 1:
            assignments = clusterer.get_assignments()
            distances = clusterer.get_distances()
            
            logger.info('Since stride=1, Saving %s', assignments_fn)
            logger.info('Since stride=1, Saving %s', distances_fn)
            io.saveh(assignments_fn, assignments)
            io.saveh(distances_fn, distances)
Example #17
deprecationmessage = """
===============================================================================
This script is deprecated and will be removed in v2.7 
Please use CalculateProjectDistance.py
===============================================================================
"""
    parser = arglib.ArgumentParser(description="""
Calculate the RMSD between an input PDB and all conformations in your project.
Output as an HDF5 file (load using msmbuilder.io.loadh())
""" + deprecationmessage)
    warnings.warn(deprecationmessage, DeprecationWarning)
    
    parser.add_argument('pdb')
    parser.add_argument('atom_indices', help='Indices of atoms to compare',
        default='AtomIndices.dat')
    parser.add_argument('output', help='''Output file name. Output is an
        .h5 file with RMSD entries corresponding to the Assignments.h5 file.''',
        default='Data/RMSD.h5')
    parser.add_argument('project')
    args = parser.parse_args()

    arglib.die_if_path_exists(args.output)

    project = Project.load_from(args.project)
    pdb = Trajectory.load_trajectory_file( args.pdb )
    atom_indices = np.loadtxt( args.atom_indices ).astype(int)

    distances = run(project, pdb, atom_indices)
    
    io.saveh(args.output, distances)
    logger.info('Saved to %s', args.output)
Example #18
args = parser.parse_args()

traj_fns = [os.path.join(os.path.dirname(args.data_list), fn) for fn in np.loadtxt(args.data_list, dtype=str)]

traj = np.concatenate([np.load(fn) for fn in traj_fns])

metric = EuclideanMetric()

gen_ids, ass_gen_ids, distances = clustering._kcenters(metric, traj, k=args.num_states, seed=np.random.randint(len(traj)))

ass_gen_ids = np.array([ass_gen_ids])

if not os.path.exists(args.output_dir):
    os.mkdir(args.output_dir)

ass_contig = np.ones(ass_gen_ids.shape) * -1
ass_contig = ass_contig.astype(int)

for i, j in enumerate(gen_ids):
    ass_contig[np.where(ass_gen_ids == j)] = i


ass_contig = ass_contig.reshape((len(traj_fns), -1))

np.savetxt(os.path.join(args.output_dir, 'gen_ids.dat'), gen_ids)
np.save(os.path.join(args.output_dir, 'gens.npy'), traj[gen_ids])
io.saveh(os.path.join(args.output_dir, 'Assignments.h5'), ass_contig)
io.saveh(os.path.join(args.output_dir, 'Assignments.h5.distances'), np.array([distances]))
print "Saved output to %s" % args.output_dir
Example #19
import numpy as np
from matplotlib import mlab
from msmbuilder import io

ff = "amber96"
#num_frames = 11250
num_frames = 225001
num_frames = 291622
num_frames = 295189
num_frames = 41250
data = []
for i in xrange(num_frames):
    print(i)
    d = mlab.csv2rec("%s/production/pdbs/frame%d.pdb.cs"%(ff,i))
    data.append(d["shift"])
    
data = np.array(data)

io.saveh("%s/shifts.h5"%ff,data)
np.savetxt("./%s/shifts_atoms.txt"%ff,d["atomname"],"%s")
Example #20
 def save_container(filename, dtype):
     io.saveh(
         filename,
         arr_0=-1 * np.ones(
             (project.n_trajs, np.max(project.traj_lengths)), dtype=dtype),
         completed_trajs=np.zeros((project.n_trajs), dtype=np.bool))
Example #21
 def save_container(filename, dtype):
     io.saveh(filename, arr_0=np.array(minus_ones, dtype=dtype),
         completed_vtrajs=np.zeros((n_vtrajs), dtype=np.bool),
         hashes=hashes)
Example #22
discards (expensive!) data, so should only be used if an optimal
clustering is not available.

Note: Check your cluster sizes with CalculateClusterRadii.py to get
a handle on how big they are before you trim. Recall that the radius is the
*average* distance to the generator; here you are enforcing the
*maximum* distance.

Output: A trimmed assignments file (Assignments.Trimmed.h5).""")
    parser.add_argument('assignments', default='Data/Assignments.Fixed.h5')
    parser.add_argument('distances', default='Data/Assignments.h5.distances')
    parser.add_argument('rmsd_cutoff', help="""distance value at which to trim.
        Data further than this value from its generator will be
        discarded. Note: this is measured with whatever distance metric you used to cluster""", type=float)
    parser.add_argument('output', default='Data/Assignments.Trimmed.h5')
    args = parser.parse_args()
    
    arglib.die_if_path_exists(args.output)
    
    try:
        assignments = io.loadh(args.assignments, 'arr_0')
        distances =  io.loadh(args.distances, 'arr_0')
    except KeyError:
        assignments = io.loadh(args.assignments, 'Data')
        distances =  io.loadh(args.distances, 'Data')

    trimmed = run(assignments, distances, args.rmsd_cutoff)
    
    io.saveh(args.output, trimmed)
    logger.info('Saved output to %s', args.output)
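The run function is not included in this snippet; a minimal sketch of the trimming it describes, assuming the MSMBuilder convention (visible throughout these examples) that unassigned frames are marked with -1:

import numpy as np

def trim_assignments(assignments, distances, cutoff):
    # Mark frames farther than `cutoff` from their generator as unassigned (-1).
    trimmed = assignments.copy()
    trimmed[np.where(distances > cutoff)] = -1
    return trimmed

# Tiny illustration with made-up numbers.
ass = np.array([[0, 1, 2], [2, 0, 1]])
dist = np.array([[0.1, 0.4, 0.2], [0.5, 0.05, 0.3]])
print(trim_assignments(ass, dist, cutoff=0.35))
# [[ 0 -1  2]
#  [-1  0  1]]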
Example #23
num_confs = len(glob.glob("/%s/frame*.pdb" % in_directory))

all_shifts = []
all_errs = []

for i in xrange(num_confs):
    print(i)
    x = np.loadtxt("%s/frame%d_pred.tab"% (in_directory ,i),'str',skiprows=27)
    shifts = x[:,4].astype('float')
    errs = x[:,-1].astype('float')
    res_id = x[:,0].astype('int')
    atom_name = x[:,2]
    all_shifts.append(shifts)
    all_errs.append(errs)
    
    
all_shifts = np.array(all_shifts)
all_errs = np.array(all_errs)

mean_errs = all_errs.mean(0)
atom_name[atom_name == "HN"] = "H"

res_id -= 1

io.saveh("%s/shifts.h5" % out_directory,all_shifts)
np.savetxt("%s/shifts_errs.dat" % out_directory,mean_errs)
np.savetxt("%s/shifts_atoms.txt" % out_directory,atom_name,"%s")
np.savetxt("%s/shifts_resid.dat" % out_directory,res_id,"%d")

Example #24
#!/usr/bin/env python
from msmbuilder import arglib, io, tICA, Project

parser = arglib.ArgumentParser( get_basic_metric=True )

parser.add_argument( 'project' )
parser.add_argument( 'output' )
parser.add_argument( 'stride', type=int, default=1 )

args, metric = parser.parse_args()

project = Project.load_from( args.project )

arglib.die_if_path_exists( args.output )

stride = int( args.stride )

cov_mat = tICA.CovarianceMatrix( 0 )

for i in xrange( project.n_trajs ):

    print "Working on Trajectory %d"
    ptraj = metric.prepare_trajectory( project.load_traj( i, stride=stride ) )
    if i == 0:
        cov_mat.set_size( ptraj.shape[1] )

    cov_mat.train( ptraj )

print "Saving matrix to %s" % args.output
io.saveh( args.output, covariance_matrix=cov_mat.get_current_estimate() )
Example #25
import sys
import numpy as np
import scipy.sparse
import matplotlib.pyplot as pp

from msmbuilder import io, msm_analysis, MSMLib
from bayesmutant import SimpleMutantSampler

wt_tprob = np.loadtxt('cayleytree_tprob_wildtype.dat')
mutant_tprob = np.loadtxt('cayleytree_tprob_mutant.dat')

base_counts = np.zeros_like(wt_tprob)
for i in range(base_counts.shape[0]):
    base_counts[i] = np.random.multinomial(200, wt_tprob[i])

print 'base counts'
print base_counts

ms = SimpleMutantSampler(base_counts, mutant_tprob)
ms.step(5000)

#print ms.eff_counts()

#print 'observed counts'
#print ms.counts

io.saveh('sampling2.h5', base_counts=base_counts, samples=ms.samples,
                        observed_counts=ms.counts, scores=ms.scores,
                        effective_counts=ms.eff_counts(),
                        transition_matrix=mutant_tprob)
Example #26
 def save_container(filename, dtype):
     io.saveh(filename, arr_0=-1*np.ones((project.n_trajs, np.max(project.traj_lengths)), dtype=dtype),
              completed_trajs=np.zeros((project.n_trajs), dtype=np.bool))
Example #27
            ptrj_chunk = get_hb(trj_chunk).astype(float)
            ass_chunk = Ass[traj_ind][
                chunk_ind * chunk_size : (chunk_ind + 1) * chunk_size
            ]  # this behaves as you want at the end of the array

            for i, ass in enumerate(ass_chunk):
                if ass == -1:
                    continue
                CMs_1d[ass] += ptrj_chunk[i]

    # StateAssigns = np.array([ np.where( Ass == i )[0].shape[0] for i in np.unique( Ass[ np.where( Ass >= 0 ) ] )] )
    StateAssigns = np.bincount(Ass[np.where(Ass != -1)], minlength=Ass.max() + 1)
    StateAssigns = StateAssigns.reshape((len(StateAssigns), 1))
    AvgCMs_1d = CMs_1d / StateAssigns

    io.saveh(args.out_cm, which=which, HB_maps=AvgCMs_1d)


uniq_res = np.unique(pdb["ResidueID"])
n_res = uniq_res.shape[0]

acc_res_ids = pdb["ResidueID"][which[:, 0]]
donor_res_ids = pdb["ResidueID"][which[:, 2]]

CMs = np.zeros((len(AvgCMs_1d), n_res, n_res))
CM_pdb = np.zeros((n_res, n_res))
for i in xrange(n_res):
    for j in xrange(n_res):
        if i == j:
            continue
        inds = np.where((acc_res_ids == uniq_res[i]) & (donor_res_ids == uniq_res[j]))[0]
Example #28
def main(coarse_val, orig_val, rcut):
    data=dict()
    data['coarse']=dict()
    data['orig']=dict()
    dirs=dict()
    dirs['coarse']='./d%s' % coarse_val
    dirs['orig']='./d%s' % orig_val
    proj=Project.load_from('ProjectInfo.yaml')
    types=['ass', 'rmsd', 'dist', 'gens']
    for key in ['coarse', 'orig']:
        for type in types:
            if 'ass' in type:
                ass=io.loadh('%s/Data/Assignments.h5' % dirs[key])
                data[key][type]=ass['arr_0']
            elif 'dist' in type:
                ass=io.loadh('%s/Data/Assignments.h5.distances' % dirs[key])
                data[key][type]=ass['arr_0']
            elif 'rmsd' in type:
                rmsd=numpy.loadtxt('%s/Gens.rmsd.dat' % dirs[key])
                data[key][type]=rmsd
            elif 'gens' in type:
                gens=Trajectory.load_from_lhdf('%s/Gens.lh5' % dirs[key])
                data[key][type]=gens
    unboundmap=dict()
    boundmap=dict()
    # build map dict for orig to coarse unbound states, bound will stay same
    unboundass=-1*numpy.ones(( data['orig']['ass'].shape[0], data['orig']['ass'].shape[1]), dtype=int)
    newass=-1*numpy.ones(( data['orig']['ass'].shape[0], data['orig']['ass'].shape[1]), dtype=int)
    newdist=-1*numpy.ones(( data['orig']['ass'].shape[0], data['orig']['ass'].shape[1]))
    for j in range(0, data['orig']['ass'].shape[0]):
        rmsd=numpy.loadtxt('Trajectories-metric/trj%s_lprmsd.dat' % j)
        frames=numpy.where(data['orig']['ass'][j]!=-1)[0]
        if len(rmsd)!=len(frames):
            print "trajectory mismatch"
            import pdb
            pdb.set_trace()
        for (n,i) in enumerate(data['orig']['ass'][j]):
            # if unbound
            if i != -1:
                #if data['orig']['rmsd'][i] > float(rcut):
                if rmsd[n] > float(rcut):
                    newstate=data['coarse']['ass'][j][n]
                    if data['coarse']['rmsd'][newstate] < float(rcut):
                        newass[j][n]=i
                        newdist[j][n]=data['orig']['dist'][j][n]
                    else:
                        unboundass[j][n]=newstate
                        newdist[j][n]=data['coarse']['dist'][j][n]
                else:
                    newass[j][n]=i
                    newdist[j][n]=data['orig']['dist'][j][n]
    count=0
    unique=sorted(set(newass.flatten()))
    newass, boundmap, count=remap_ass(newass, newass, unique, count)
    unique=sorted(set(unboundass.flatten()))
    newass, unboundmap, count=remap_ass(unboundass, newass, unique, count)
    io.saveh('%s/Coarsed_r%s_d%s_Assignments.h5' % (dirs['orig'], rcut, coarse_val), newass)
    io.saveh('%s/Coarsed_r%s_d%s_Assignments.distances.h5' % (dirs['orig'], rcut, coarse_val), newdist)
    subdir='%s/Coarsed_r%s_gen/' % (dirs['orig'], rcut)
    if not os.path.exists(subdir):
        os.mkdir(subdir)
    ohandle=open('%s/Coarsed%s_r%s_Gens.rmsd.dat' % (subdir, coarse_val, rcut), 'w')
    b=data['orig']['gens']['XYZList'].shape[1]
    c=data['orig']['gens']['XYZList'].shape[2]
    dicts=[boundmap, unboundmap]
    names=['bound', 'unbound']
    labels=['orig', 'coarse']
    total=len(boundmap.keys()) + len(unboundmap.keys())
    structure=proj.empty_traj()
    structure['XYZList']=numpy.zeros((total, b, c), dtype='float32')
    count=0
    for (name, label, mapdata) in zip( names, labels, dicts):
        print "writing coarse gen %s out of %s pdbs" % (count, len(mapdata.keys()))
        for i in sorted(mapdata.keys()):
            macro=mapdata[i]
            structure['XYZList'][count]=data[label]['gens']['XYZList'][macro]
            ohandle.write('%s\t%s\t%s\n' % (name, count, data[label]['rmsd'][macro]))
            print name, count
            count+=1
    otraj='%s/Coarsed%s_r%s_Gens.xtc' % (subdir, coarse_val, rcut)
    if os.path.exists(otraj):
        os.remove(otraj)
    structure.save_to_xtc('%s/Coarsed%s_r%s_Gens.xtc' % (subdir, coarse_val, rcut))
Example #29
#!/usr/bin/env python

from msmbuilder import io
from msmbuilder import msm_analysis
from scipy.io import mmread
from argparse import ArgumentParser
import os

parser = ArgumentParser()
parser.add_argument('-t', dest='tProb', help='transition matrix', default='./tProb.mtx')
parser.add_argument('-o', dest='output', help='output filename', default='./eigs.h5')
parser.add_argument('-n', dest='num_vecs', help='number of eigenvectors to find.', default=500, type=int)

args = parser.parse_args()

if os.path.exists(args.output):
    raise Exception("path (%s) exists!" % args.output)

tProb = mmread(args.tProb)

eigs = msm_analysis.get_eigenvectors(tProb, args.num_vecs)

io.saveh(args.output, vals=eigs[0], vecs=eigs[1])
Example #30
import os
import sys
import numpy as np
import scipy.sparse

from msmbuilder import io, msm_analysis, MSMLib
from bayesmutant import SimpleMutantSampler

P = np.loadtxt('base_transition_matrix.dat')

mutant_transition_matrix = P + 0.2*scipy.sparse.rand(P.shape[0], P.shape[1], density=0.1).todense()
mutant_transition_matrix /= np.sum(mutant_transition_matrix, axis=1)

trajectory =  np.array(msm_analysis.sample(P, 0, 5000))
base_counts = MSMLib.get_counts_from_traj(trajectory).todense()


print 'base counts'
print base_counts

ms = SimpleMutantSampler(base_counts, mutant_transition_matrix)
ms.step(5000)

print 'observed counts'
print ms.counts

io.saveh('sampling.h5', base_counts=base_counts, samples=ms.samples,
                        observed_counts=ms.counts, scores=ms.scores,
                        transition_matrix=mutant_transition_matrix)
Example #31
deprecationmessage = """
===============================================================================
This script is deprecated and will be removed in v2.7
Please use CalculateProjectDistance.py
===============================================================================
"""
    parser = arglib.ArgumentParser(description="""
Calculate the RMSD between an input PDB and all conformations in your project.
Output as an HDF5 file (load using msmbuilder.io.loadh())
""" + deprecationmessage)
    warnings.warn(deprecationmessage, DeprecationWarning)

    parser.add_argument('pdb')
    parser.add_argument('atom_indices',
                        help='Indices of atoms to compare',
                        default='AtomIndices.dat')
    parser.add_argument('output',
                        help='''Output file name. Output is an
        .h5 file with RMSD entries corresponding to the Assignments.h5 file.''',
                        default='Data/RMSD.h5')
    parser.add_argument('project')
    args = parser.parse_args()

    arglib.die_if_path_exists(args.output)

    project = Project.load_from(args.project)
    pdb = Trajectory.load_trajectory_file(args.pdb)
    atom_indices = np.loadtxt(args.atom_indices).astype(int)

    distances = run(project, pdb, atom_indices)

    io.saveh(args.output, distances)
    logger.info('Saved to %s', args.output)

Example #32
if __name__ == '__main__':
    parser = arglib.ArgumentParser("""Calculates the Solvent Accessible Surface Area
    of all atoms in a given trajectory, or for all trajectories in the project. The
    output is an hdf5 file which contains the SASA for each atom in each frame
    in each trajectory (or the single trajectory you passed in).""" )
    parser.add_argument('project')
    parser.add_argument('atom_indices', help='Indices of atoms to calculate SASA',
        default='all')
    parser.add_argument('output', help='''hdf5 file for output. Note this will
        be THREE dimensional: ( trajectory, frame, atom ), unless you just ask for
        one trajectory, in which case it will be shape (frame, atom).''',
        default='SASA.h5')
    parser.add_argument('traj_fn', help='''Pass a trajectory file if you only
        want to calculate the SASA for a single trajectory''', default='all' )
    args = parser.parse_args()

    arglib.die_if_path_exists(args.output)

    if args.atom_indices.lower() == 'all':
        atom_indices = None
    else:
        atom_indices = np.loadtxt(args.atom_indices).astype(int)

    project = Project.load_from(args.project)

    SASA = run(project, atom_indices, args.traj_fn)

    io.saveh(args.output, SASA)
Example #33
def main(coarse_val, orig_val, rcut):
    data=dict()
    data['coarse']=dict()
    data['orig']=dict()
    dirs=dict()
    dirs['coarse']='./d%s' % coarse_val
    dirs['orig']='./d%s' % orig_val
    proj=Project.load_from('ProjectInfo.yaml')
    types=['ass', 'rmsd', 'dist', 'gens']
    for key in ['coarse', 'orig']:
        for type in types:
            if 'ass' in type:
                ass=io.loadh('%s/Data/Assignments.h5' % dirs[key])
                data[key][type]=ass['arr_0']
            elif 'dist' in type:
                ass=io.loadh('%s/Data/Assignments.h5.distances' % dirs[key])
                data[key][type]=ass['arr_0']
            elif 'rmsd' in type:
                rmsd=numpy.loadtxt('%s/Gens.rmsd.dat' % dirs[key])
                data[key][type]=rmsd
            elif 'gens' in type:
                gens=Trajectory.load_from_lhdf('%s/Gens.lh5' % dirs[key])
                data[key][type]=gens
    unboundmap=dict()
    boundmap=dict()
    #unboundstates=dict()
    #unboundrmsd=dict()
    # build map dict for orig to coarse unbound states, bound will stay same
    newass=-1*numpy.ones(( data['orig']['ass'].shape[0], data['orig']['ass'].shape[1]), dtype=int)
    for j in range(0, data['orig']['ass'].shape[0]):
        for (n,i) in enumerate(data['orig']['ass'][j]):
            # if unbound
            if i != -1:
                if data['orig']['rmsd'][i] > float(rcut):
                    state=data['coarse']['ass'][j][n]
                    newass[j][n]=state+10000
                else:
                    newass[j][n]=i
    count=0
    unique=set(newass.flatten())
    boundmap=dict()
    unboundmap=dict()
    for x in unique:
        locations=numpy.where(newass==x)
        newass[locations]=count
        if x >= 10000:
            unboundmap[count]=(x-10000)
        else:
            boundmap[count]=x
        count+=1
    io.saveh('%s/Coarsed_r%s_Assignments.h5' % (dirs['orig'], rcut), newass)
    subdir='%s/Coarsed_r%s_gen/' % (dirs['orig'], rcut)
    if not os.path.exists(subdir):
        os.mkdir(subdir)
    ohandle=open('%s/Coarsed%s_r%s_Gens.rmsd.dat' % (subdir, coarse_val, rcut), 'w')
    b=data['orig']['gens']['XYZList'].shape[1]
    c=data['orig']['gens']['XYZList'].shape[2]
    dicts=[boundmap, unboundmap]
    names=['bound', 'unbound']
    labels=['orig', 'coarse']
    total=len(boundmap.keys()) + len(unboundmap.keys())
    structure=proj.empty_traj()
    structure['XYZList']=numpy.zeros((total, b, c), dtype='float32')
    count=0
    for (name, label, mapdata) in zip( names, labels, dicts):
        print "writing coarse gen %s out of %s pdbs" % (count, len(mapdata.keys()))
        for i in sorted(mapdata.keys()):
            macro=mapdata[i]
            structure['XYZList'][count]=data[label]['gens']['XYZList'][macro]
            ohandle.write('%s\t%s\t%s\n' % (name, count, data[label]['rmsd'][macro]))
            print name, count
            count+=1
    structure.save_to_xtc('%s/Coarsed%s_r%s_Gens.xtc' % (subdir, coarse_val, rcut))
Example #34
 def write_msm_output(self,outarray=None,filename="Assignments.h5"):
     from msmbuilder import io
     if outarray is None:
         outarray = self.assignments
     io.saveh(filename,outarray)
Example #35
    of all atoms in a given trajectory, or for all trajectories in the project. The
    output is an hdf5 file which contains the SASA for each atom in each frame
    in each trajectory (or the single trajectory you passed in).""")
    parser.add_argument('project')
    parser.add_argument('atom_indices',
                        help='Indices of atoms to calculate SASA',
                        default='all')
    parser.add_argument('output',
                        help='''hdf5 file for output. Note this will
        be THREE dimensional: ( trajectory, frame, atom ), unless you just ask for
        one trajectory, in which case it will be shape (frame, atom).''',
                        default='SASA.h5')
    parser.add_argument('traj_fn',
                        help='''Pass a trajectory file if you only
        want to calculate the SASA for a single trajectory''',
                        default='all')
    args = parser.parse_args()

    arglib.die_if_path_exists(args.output)

    if args.atom_indices.lower() == 'all':
        atom_indices = None
    else:
        atom_indices = np.loadtxt(args.atom_indices).astype(int)

    project = Project.load_from(args.project)

    SASA = run(project, atom_indices, args.traj_fn)

    io.saveh(args.output, SASA)
Example #36
c = tICA.CovarianceMatrix(args.lag, tProb=tProb, populations=pops)

for i, fn in enumerate(traj_list):
    print fn
    t = np.load(fn)
    c.train(t, ass[i])
    #c.train(t)

C, Sigma = c.get_current_estimate()

vals, vecs = scipy.linalg.eig(C, b=Sigma)

print vals
print vecs

io.saveh(args.out, vals=vals, vecs=vecs, C=C, Sigma=Sigma)

muller.plot_v()


ref = io.loadh('ref.h5')
ref['vecs'][:,0] *= -1

vecs[:,0] *= -1

plot([0, vecs[0,0]], [0, vecs[1,0]], color='white', lw=3)
plot([0, vecs[0,1]], [0, vecs[1,1]], color='white', ls='dashed', lw=3)

plot([0, ref['vecs'][0,0]], [0, ref['vecs'][1,0]], color='red', lw=3)
plot([0, ref['vecs'][0,1]], [0, ref['vecs'][1,1]], color='red', ls='dashed', lw=3)
Example #37
import sys
from msmbuilder import io
from msmbuilder.clustering import Hierarchical
from msmbuilder import arglib
import logging
logger = logging.getLogger('msmbuilder.scripts.AssignHierarchical')

parser = arglib.ArgumentParser(description='Assign data using a hierarchical clustering')
parser.add_argument('hierarchical_clustering_zmatrix', default='./Data/ZMatrix.h5',
    help='Path to hierarchical clustering zmatrix' )
parser.add_argument('num_states', help='Number of States', default='none')
parser.add_argument('cutoff_distance', help='Maximum cophenetic distance', default='none')
parser.add_argument('assignments', type=str)

def main(k, d, zmatrix_fn):
    hierarchical = Hierarchical.load_from_disk(zmatrix_fn)
    assignments = hierarchical.get_assignments(k=k, cutoff_distance=d)
    return assignments
    
if __name__ == "__main__":
    args = parser.parse_args()
    k = int(args.num_states) if args.num_states != 'none' else None
    d = float(args.cutoff_distance) if args.cutoff_distance != 'none' else None
    arglib.die_if_path_exists(args.assignments)
    if k is None and d is None:
        logger.error('You need to supply either a number of states or a cutoff distance')
        sys.exit(1)
    
    assignments = main(k, d, args.hierarchical_clustering_zmatrix)
    io.saveh(args.assignments, assignments)
    logger.info('Saved assignments to %s', args.assignments)
Example #38
 def test_save(self):
     """Save HDF5 to disk and load it back up"""
     io.saveh(self.filename2, self.data)
     TestData = io.loadh(self.filename2, 'arr_0')
     npt.assert_array_equal(TestData, self.data)
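A convention worth noting across all of these examples: arrays passed to io.saveh positionally are stored under automatic keys ('arr_0', 'arr_1', ...), while keyword arguments keep their names, and io.loadh can either return a dict-like handle to the whole file or fetch a single named array. A minimal round trip illustrating both (file names are arbitrary):

import numpy as np
from msmbuilder import io

assignments = np.array([[0, 1, 1], [2, 0, -1]])
distances = np.array([[0.1, 0.2, 0.3], [0.4, 0.1, 0.0]])

# Keyword arguments are stored under their own names ...
io.saveh('named.h5', assignments=assignments, distances=distances)
f = io.loadh('named.h5')
assert np.array_equal(f['assignments'], assignments)

# ... while a positional array gets the automatic 'arr_0' key, which is why
# several scripts above try 'arr_0' first when loading.
io.saveh('anonymous.h5', assignments)
assert np.array_equal(io.loadh('anonymous.h5', 'arr_0'), assignments)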