def test_harder_hubscore():
    # depends on tpt.committors and tpt.conditional_committors
    assignments = np.random.randint(10, size=(10, 1000))
    msm = MarkovStateModel(lag_time=1)
    msm.fit(assignments)

    hub_scores = tpt.hub_scores(msm)

    ref_hub_scores = np.zeros(10)
    for A in xrange(10):
        for B in xrange(10):
            committors = tpt.committors(A, B, msm)
            denom = msm.transmat_[A, :].dot(committors)  # + msm.transmat_[A, B]
            for C in xrange(10):
                if A == B or A == C or B == C:
                    continue
                cond_committors = tpt.conditional_committors(A, B, C, msm)

                temp = 0.0
                for i in xrange(10):
                    if i in [A, B]:
                        continue
                    temp += cond_committors[i] * msm.transmat_[A, i]
                temp /= denom

                ref_hub_scores[C] += temp

    ref_hub_scores /= (9 * 8)

    # print(ref_hub_scores, hub_scores)

    npt.assert_array_almost_equal(ref_hub_scores, hub_scores)
def test_load():
    filenames = ["frame0.xtc", "frame0.trr", "frame0.dcd", "frame0.binpos",
                 "traj.h5", "frame0.nc", "traj.h5", "frame0.lammpstrj",
                 "frame0.xyz", "frame0.tng"]
    num_block = 3
    for filename in filenames:
        t0 = md.load(get_fn(filename), top=nat,
                     discard_overlapping_frames=True)
        t1 = md.load(get_fn(filename), top=nat,
                     discard_overlapping_frames=False)
        t2 = md.load([get_fn(filename) for i in xrange(num_block)], top=nat,
                     discard_overlapping_frames=False)
        t3 = md.load([get_fn(filename) for i in xrange(num_block)], top=nat,
                     discard_overlapping_frames=True)

        # these don't actually overlap, so discard_overlapping_frames should
        # have no effect. the overlap is between the last frame of one and the
        # first frame of the next.
        yield lambda: eq(t0.n_frames, t1.n_frames)
        yield lambda: eq(t0.n_frames * num_block, t2.n_frames)
        yield lambda: eq(t3.n_frames, t2.n_frames)
def test_fluxes():
    # depends on tpt.committors
    msm = MarkovStateModel(lag_time=1)
    assignments = np.random.randint(3, size=(10, 1000))
    msm.fit(assignments)

    tprob = msm.transmat_
    pop = msm.populations_
    # forward committors
    qplus = tpt.committors(0, 2, msm)

    ref_fluxes = np.zeros((3, 3))
    ref_net_fluxes = np.zeros((3, 3))
    for i in xrange(3):
        for j in xrange(3):
            if i != j:
                # Eq. 2.24 in Metzner et al. Transition Path Theory.
                # Multiscale Model. Simul. 2009, 7, 1192-1219.
                ref_fluxes[i, j] = (pop[i] * tprob[i, j] *
                                    (1 - qplus[i]) * qplus[j])

    for i in xrange(3):
        for j in xrange(3):
            ref_net_fluxes[i, j] = np.max([0, ref_fluxes[i, j] -
                                           ref_fluxes[j, i]])

    fluxes = tpt.fluxes(0, 2, msm)
    net_fluxes = tpt.net_fluxes(0, 2, msm)

    # print(fluxes)
    # print(ref_fluxes)

    npt.assert_array_almost_equal(ref_fluxes, fluxes)
    npt.assert_array_almost_equal(ref_net_fluxes, net_fluxes)
def test_load():
    filenames = ["frame0.xtc", "frame0.trr", "frame0.dcd", "frame0.binpos",
                 "traj.h5", 'legacy_msmbuilder_trj0.lh5', 'frame0.nc',
                 six.u('traj.h5')]
    num_block = 3
    for filename in filenames:
        t0 = md.load(get_fn(filename), top=nat,
                     discard_overlapping_frames=True)
        t1 = md.load(get_fn(filename), top=nat,
                     discard_overlapping_frames=False)
        t2 = md.load([get_fn(filename) for i in xrange(num_block)], top=nat,
                     discard_overlapping_frames=False)
        t3 = md.load([get_fn(filename) for i in xrange(num_block)], top=nat,
                     discard_overlapping_frames=True)

        # these don't actually overlap, so discard_overlapping_frames should
        # have no effect. the overlap is between the last frame of one and
        # the first frame of the next.
        yield lambda: eq(t0.n_frames, t1.n_frames)
        yield lambda: eq(t0.n_frames * num_block, t2.n_frames)
        yield lambda: eq(t3.n_frames, t2.n_frames)
def get_bond_connectivity(conf):
    """Get a list of all the bonds in a conformation

    Parameters
    ----------
    conf : MDTraj.Trajectory
        An MDTraj trajectory, only the first frame will be used.

    Returns
    -------
    ibonds : np.ndarray, shape=[n_bonds, 2], dtype=int
        n_bonds x 2 array of indices, where each row is the index of two
        atoms that participate in a bond.

    Notes
    -----
    Regular bonds are assigned to all pairs of atoms where the interatomic
    distance is less than or equal to 1.3 times the sum of their respective
    covalent radii.

    References
    ----------
    Bakken and Helgaker, JCP Vol. 117, Num. 20, 22 Nov. 2002
    http://folk.uio.no/helgaker/reprints/2002/JCP117b_GeoOpt.pdf
    """
    from scipy.spatial.distance import squareform, pdist

    xyz = conf.xyz[0, :, :]
    n_atoms = xyz.shape[0]

    elements = np.zeros(n_atoms, dtype='S1')
    atom_names = [a.name for a in conf.top.atoms()]
    for i in xrange(n_atoms):
        # name of the element that is atom[i]
        # take the first character of the AtomNames string,
        # after stripping off any digits
        elements[i] = atom_names[i].strip('123456789 ')[0]
        if elements[i] not in COVALENT_RADII:
            raise ValueError("I don't know about this AtomName: {}".format(
                atom_names[i]))

    distance_mtx = squareform(pdist(xyz))
    connectivity = []

    for i in xrange(n_atoms):
        for j in xrange(i + 1, n_atoms):
            # Regular bonds are assigned to all pairs of atoms where the
            # interatomic distance is less than or equal to 1.3 times the
            # sum of their respective covalent radii.
            d = distance_mtx[i, j]
            if d < 1.3 * (COVALENT_RADII[elements[i]] +
                          COVALENT_RADII[elements[j]]):
                connectivity.append((i, j))

    return np.array(connectivity)
def hub_scores(msm, waypoints=None):
    """Calculate the hub score for one or more waypoints

    The "hub score" is a measure of how well traveled a certain state or
    set of states is in a network. Specifically, it is the fraction of
    times that a walker visits a state en route from some state A to
    another state B, averaged over all combinations of A and B.

    Parameters
    ----------
    msm : msmbuilder.MarkovStateModel
        MSM to analyze
    waypoints : array_like, int, optional
        The index of the intermediate state (or more than one).
        If None, then all states will be used as waypoints.

    Returns
    -------
    hub_scores : np.ndarray
        The hub score for each waypoint

    References
    ----------
    .. [1] Dickson & Brooks (2012), J. Chem. Theory Comput., 8, 3044-3052.
    """
    n_states = msm.n_states_
    if isinstance(waypoints, int):
        waypoints = [waypoints]
    elif waypoints is None:
        waypoints = xrange(n_states)
    elif not (isinstance(waypoints, list) or
              isinstance(waypoints, np.ndarray)):
        raise ValueError("waypoints (%s) must be an int, a list, or None" %
                         str(waypoints))

    hub_scores = []
    for waypoint in waypoints:
        other_states = (i for i in xrange(n_states) if i != waypoint)

        # calculate the hub score for this waypoint
        hub_score = 0.0
        for (source, sink) in itertools.permutations(other_states, 2):
            hub_score += fraction_visited(source, sink, waypoint, msm)

        hub_score /= float((n_states - 1) * (n_states - 2))
        hub_scores.append(hub_score)

    return np.array(hub_scores)
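# Usage sketch for hub_scores (illustrative, not from the original source):
# fit a small MSM on synthetic assignments and score every state as a hub.
# Assumes the same namespace as the tests elsewhere in this file
# (np, MarkovStateModel).
def _example_hub_scores():
    assignments = np.random.randint(5, size=(10, 1000))
    msm = MarkovStateModel(lag_time=1)
    msm.fit(assignments)
    scores = hub_scores(msm)        # one hub score per state
    return int(np.argmax(scores))   # the best-traveled intermediate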
def assign_in_memory(metric, generators, project, atom_indices_to_load=None):
    """Assign every frame to its closest generator

    This code does everything in memory, and does not checkpoint. It also
    does not save any results to disk.

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A distance metric used to define "closest"
    project : msmbuilder.Project
        Used to load the trajectories
    generators : msmbuilder.Trajectory
        A trajectory containing the structures of all of the cluster centers
    atom_indices_to_load : {None, list}
        The indices of the atoms to load for each trajectory chunk. Note
        that this method is responsible for loading up atoms from the
        project, but does NOT load up the generators. Those are passed in
        as a trajectory object (above). So if the generators are already
        subsampled to a restricted set of atom indices, but the
        trajectories on disk are NOT, you'll need to pass in a set of
        indices here to resolve the difference.

    See Also
    --------
    assign_with_checkpoint
    """
    n_trajs, max_traj_length = project.n_trajs, np.max(project.traj_lengths)
    assignments = -1 * np.ones((n_trajs, max_traj_length), dtype='int')
    distances = -1 * np.ones((n_trajs, max_traj_length), dtype='float32')

    pgens = metric.prepare_trajectory(generators)

    for i in xrange(n_trajs):
        traj = project.load_traj(i, atom_indices=atom_indices_to_load)
        if traj['XYZList'].shape[1] != generators['XYZList'].shape[1]:
            raise ValueError('Number of atoms in generators does not match '
                             'traj we\'re trying to assign! Maybe check '
                             'atom indices?')
        ptraj = metric.prepare_trajectory(traj)

        for j in xrange(len(traj)):
            d = metric.one_to_all(ptraj, pgens, j)
            assignments[i, j] = np.argmin(d)
            distances[i, j] = d[assignments[i, j]]

    return assignments, distances
def test_paths():
    net_flux = np.array([[0.0, 0.5, 0.5, 0.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0, 0.3, 0.0, 0.2],
                         [0.0, 0.0, 0.0, 0.0, 0.5, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 0.0, 0.3],
                         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])

    sources = np.array([0])
    sinks = np.array([4, 5])

    ref_paths = [[0, 2, 4],
                 [0, 1, 3, 5],
                 [0, 1, 5]]
    ref_fluxes = np.array([0.5, 0.3, 0.2])

    res_bottle = tpt.paths(sources, sinks, net_flux,
                           remove_path='bottleneck')
    res_subtract = tpt.paths(sources, sinks, net_flux,
                             remove_path='subtract')

    for paths, fluxes in [res_bottle, res_subtract]:
        npt.assert_array_almost_equal(fluxes, ref_fluxes)
        assert len(paths) == len(ref_paths)

        for i in xrange(len(paths)):
            npt.assert_array_equal(paths[i], ref_paths[i])
def read(self, n_frames=None, stride=None, atom_indices=None):
    r"""Read data from a lammpstrj file.

    Parameters
    ----------
    n_frames : int, None
        The number of frames you would like to read from the file.
        If None, all of the remaining frames will be loaded.
    stride : int, optional
        Read only every stride-th frame.
    atom_indices : array_like, optional
        If not None, then read only a subset of the atoms coordinates
        from the file.

    Returns
    -------
    xyz : np.ndarray, shape=(n_frames, n_atoms, 3), dtype=np.float32
    cell_lengths : np.ndarray, None
        The lengths (a, b, c) of the unit cell for each frame, or None if
        the information is not present in the file.
    cell_angles : np.ndarray, None
        The angles (\alpha, \beta, \gamma) defining the unit cell for
        each frame, or None if the information is not present in the file.
    """
    if not self._mode == 'r':
        raise ValueError('read() is only available when file is opened '
                         'in mode="r"')

    if n_frames is None:
        frame_counter = itertools.count()
    else:
        frame_counter = xrange(n_frames)

    if stride is None:
        stride = 1

    all_coords, all_lengths, all_angles = [], [], []
    for _ in frame_counter:
        try:
            frame_coords, frame_lengths, frame_angles = self._read()
            if atom_indices is not None:
                frame_coords = frame_coords[atom_indices, :]
        except _EOF:
            break

        all_coords.append(frame_coords)
        all_lengths.append(frame_lengths)
        all_angles.append(frame_angles)

        for j in range(stride - 1):
            # throw away these frames
            try:
                self._read()
            except _EOF:
                break

    all_coords = np.array(all_coords)
    all_lengths = np.array(all_lengths, dtype=np.float32)
    all_angles = np.array(all_angles, dtype=np.float32)
    return all_coords, all_lengths, all_angles
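# The n_frames/stride plumbing above is a generic pattern: itertools.count()
# reads until EOF when n_frames is None, while a bounded range caps the
# read. A standalone sketch of just that pattern (illustrative; the stream
# and _demo names below are made up stand-ins):
import itertools

def _demo_frame_counter(n_frames=None):
    counter = itertools.count() if n_frames is None else range(n_frames)
    stream = iter([1, 2, 3])  # stand-in for the file being parsed
    frames = []
    for _ in counter:
        try:
            frames.append(next(stream))  # stand-in for self._read()
        except StopIteration:            # stand-in for _EOF
            break
    return frames

assert _demo_frame_counter() == [1, 2, 3]
assert _demo_frame_counter(n_frames=2) == [1, 2]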
def _read(self):
    """Read a single frame. """
    first = self._fh.readline()  # Number of atoms.
    if first == '':
        raise _EOF()
    else:
        self._n_atoms = int(first)
    self._fh.readline()  # Comment line.
    self._line_counter += 2

    xyz = np.empty(shape=(self._n_atoms, 3))
    types = np.empty(shape=self._n_atoms, dtype=str)

    for i in xrange(self._n_atoms):
        line = self._fh.readline()
        if line == '':
            raise _EOF()
        split_line = line.split()
        try:
            types[i] = split_line[0]
            xyz[i] = [float(x) for x in split_line[1:4]]
        except Exception:
            raise IOError('xyz parse error on line {0:d} of "{1:s}". '
                          'This file does not appear to be a valid '
                          'xyz file.'.format(self._line_counter,
                                             self._filename))
        self._line_counter += 1
    # --- end body ---

    self._frame_index += 1
    return xyz
def uneven_zip(*args):
    '''Zip the arguments together like the builtin function, except that
    when one argument runs out (because it's shorter), you keep filling
    it in with its last value

    i.e.

    uneven_zip([1, 2, 3], 'a', [10, 11]) = [[1, 'a', 10], [2, 'a', 11],
                                            [3, 'a', 11]]
    '''
    num_args = len(args)
    args = list(args)
    for i in xrange(num_args):
        if not hasattr(args[i], '__len__'):
            args[i] = (args[i],)
    lengths = [len(arg) for arg in args]

    def get(i):
        result = [None] * num_args
        for j in range(num_args):
            try:
                result[j] = args[j][i]
            except IndexError:
                result[j] = args[j][lengths[j] - 1]
        return result

    zipped = [get(i) for i in range(max(lengths))]
    return zipped
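# A quick check of uneven_zip against its docstring example (illustrative).
# Arguments without __len__ are wrapped as length-1 tuples; shorter
# arguments are padded out with their final value.
assert uneven_zip([1, 2, 3], 'a', [10, 11]) == [[1, 'a', 10],
                                                [2, 'a', 11],
                                                [3, 'a', 11]]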
def save(confs_by_state, states, style, format, outdir):
    "Save the results to disk"
    if style == 'sep':
        for i, trj in enumerate(confs_by_state):
            for j in xrange(len(trj)):
                fn = os.path.join(outdir, 'State%d-%d.%s' %
                                  (states[i], j, format))
                arglib.die_if_path_exists(fn)
                logger.info("Saving file: %s" % fn)
                trj[j].save(fn)
    elif style == 'tps':
        for i, trj in enumerate(confs_by_state):
            fn = os.path.join(outdir, 'State%d.%s' % (states[i], format))
            arglib.die_if_path_exists(fn)
            logger.info("Saving file: %s" % fn)
            concatenate_trajectories(trj).save(fn)
    elif style == 'one':
        fn = os.path.join(outdir, 'Confs.%s' % format)
        arglib.die_if_path_exists(fn)
        logger.info("Saving file: %s" % fn)
        concatenate_trajectories(confs_by_state).save(fn)
    else:
        raise ValueError('Invalid style: %s' % style)
def save(confs_by_state, states, style, format, outdir):
    "Save the results to disk"
    if style == "sep":
        for i, trj in enumerate(confs_by_state):
            for j in xrange(len(trj)):
                fn = os.path.join(outdir, "State%d-%d.%s" %
                                  (states[i], j, format))
                arglib.die_if_path_exists(fn)
                logger.info("Saving file: %s" % fn)
                trj[j].save(fn)
    elif style == "tps":
        for i, trj in enumerate(confs_by_state):
            fn = os.path.join(outdir, "State%d.%s" % (states[i], format))
            arglib.die_if_path_exists(fn)
            logger.info("Saving file: %s" % fn)
            trj.save(fn)
    elif style == "one":
        fn = os.path.join(outdir, "Confs.%s" % format)
        arglib.die_if_path_exists(fn)
        logger.info("Saving file: %s" % fn)
        concatenate_trajectories(confs_by_state).save(fn)
    else:
        raise ValueError("Invalid style: %s" % style)
def write(self, xyz, types=None):
    """Write one or more frames of data to a xyz file.

    Parameters
    ----------
    xyz : np.ndarray, shape=(n_frames, n_atoms, 3)
        The cartesian coordinates of the atoms to write.
    types : np.ndarray, shape=(n_atoms, ), optional
        The type of each particle.
    """
    if not self._mode == 'w':
        raise ValueError('write() is only available when file is opened '
                         'in mode="w"')

    if types is None:
        # Make all particles the same type.
        types = ['X' for _ in xrange(xyz.shape[1])]
    xyz = ensure_type(xyz, np.float32, 3, 'xyz', can_be_none=False,
                      shape=(None, None, 3), warn_on_cast=False,
                      add_newaxis_on_deficient_ndim=True)
    in_units_of(xyz, 'nanometers', self.distance_unit, inplace=True)

    for i in range(xyz.shape[0]):
        self._fh.write('{0}\n'.format(xyz.shape[1]))
        self._fh.write("Created with MDTraj {0}, {1}\n".format(
            version, str(date.today())))

        for j, coord in enumerate(xyz[i]):
            self._fh.write('{0} {1:8.3f} {2:8.3f} {3:8.3f}\n'.format(
                types[j], coord[0], coord[1], coord[2]))
def write(self, xyz, types=None):
    """Write one or more frames of data to a xyz file.

    Parameters
    ----------
    xyz : np.ndarray, shape=(n_frames, n_atoms, 3)
        The cartesian coordinates of the atoms to write. By convention
        for this trajectory format, the lengths should be in units of
        angstroms.
    types : np.ndarray, shape=(n_atoms, ), optional
        The type of each particle.
    """
    if not self._mode == 'w':
        raise ValueError('write() is only available when file is opened '
                         'in mode="w"')

    if types is None:
        # Make all particles the same type.
        types = ['X' for _ in xrange(xyz.shape[1])]
    xyz = ensure_type(xyz, np.float32, 3, 'xyz', can_be_none=False,
                      shape=(None, None, 3), warn_on_cast=False,
                      add_newaxis_on_deficient_ndim=True)

    for i in range(xyz.shape[0]):
        self._fh.write('{0}\n'.format(xyz.shape[1]))
        self._fh.write("Created with MDTraj {0}, {1}\n".format(
            version, str(date.today())))

        for j, coord in enumerate(xyz[i]):
            self._fh.write('{0} {1:8.3f} {2:8.3f} {3:8.3f}\n'.format(
                types[j], coord[0], coord[1], coord[2]))
def get_angle_connectivity(ibonds):
    """Given the bonds, get the indices of the atoms defining all the bond
    angles

    Parameters
    ----------
    ibonds : np.ndarray, shape=[n_bonds, 2], dtype=int
        n_bonds x 2 array of indices, where each row is the index of two
        atoms that participate in a bond.

    Returns
    -------
    iangles : np.ndarray, shape[n_angles, 3], dtype=int
        n_angles x 3 array of indices, where each row is the index of
        three atoms m, n, o such that n is bonded to both m and o.
    """
    nx = import_('networkx')
    graph = nx.from_edgelist(ibonds)
    n_atoms = graph.number_of_nodes()
    iangles = []

    for i in xrange(n_atoms):
        for (m, n) in combinations(graph.neighbors(i), 2):
            # so now there is a bond angle m-i-n
            iangles.append((m, i, n))

    return np.array(iangles)
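# Sanity check for get_angle_connectivity (illustrative; requires networkx,
# like the function itself): a water-like topology where atom 0 is bonded
# to atoms 1 and 2 yields a single angle centered on atom 0.
_ibonds = np.array([[0, 1], [0, 2]])
assert get_angle_connectivity(_ibonds).tolist() == [[1, 0, 2]]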
def get_dihedral_connectivity(ibonds):
    """Given the bonds, get the indices of the atoms defining all the
    dihedral angles

    Parameters
    ----------
    ibonds : np.ndarray, shape=[n_bonds, 2], dtype=int
        n_bonds x 2 array of indices, where each row is the index of two
        atoms that participate in a bond.

    Returns
    -------
    idihedrals : np.ndarray, shape[n_dihedrals, 4], dtype=int
        All sets of 4 atoms A, B, C, D such that A is bonded to B, B is
        bonded to C, and C is bonded to D
    """
    nx = import_('networkx')
    graph = nx.from_edgelist(ibonds)
    n_atoms = graph.number_of_nodes()
    idihedrals = []

    # TODO: CHECK FOR DIHEDRAL ANGLES THAT ARE 180 and recover
    # conf : msmbuilder.Trajectory
    #     An msmbuilder trajectory, only the first frame will be used. This
    #     is used purely to make the check for angle(ABC) != 180.

    for a in xrange(n_atoms):
        for b in graph.neighbors(a):
            for c in filter(lambda c: c not in [a, b], graph.neighbors(b)):
                for d in filter(lambda d: d not in [a, b, c],
                                graph.neighbors(c)):
                    idihedrals.append((a, b, c, d))

    return np.array(idihedrals)
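# Sanity check for get_dihedral_connectivity (illustrative; requires
# networkx): a 4-atom chain 0-1-2-3 has one dihedral, which the search
# above emits in both directions (A-B-C-D and D-C-B-A).
_ibonds = np.array([[0, 1], [1, 2], [2, 3]])
assert get_dihedral_connectivity(_ibonds).tolist() == [[0, 1, 2, 3],
                                                       [3, 2, 1, 0]]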
def save_pdb(self, filename, force_overwrite=True):
    """Save trajectory to RCSB PDB format

    Parameters
    ----------
    filename : str
        filesystem path in which to save the trajectory
    force_overwrite : bool, default=True
        Overwrite anything that exists at filename, if it's already there
    """
    self._check_valid_unitcell()

    with PDBTrajectoryFile(filename, 'w',
                           force_overwrite=force_overwrite) as f:
        for i in xrange(self.n_frames):
            if self._have_unitcell:
                f.write(convert(self._xyz[i], Trajectory._distance_unit,
                                f.distance_unit),
                        self.topology,
                        modelIndex=i,
                        unitcell_lengths=convert(
                            self.unitcell_lengths[i],
                            Trajectory._distance_unit, f.distance_unit),
                        unitcell_angles=self.unitcell_angles[i])
            else:
                f.write(convert(self._xyz[i], Trajectory._distance_unit,
                                f.distance_unit),
                        self.topology,
                        modelIndex=i)
def load_from(cls, filename):
    """Load project from disk

    Parameters
    ----------
    filename : string
        A path to a legacy .h5 or current .yaml project file.

    Returns
    -------
    project : the loaded project object
    """
    rootdir = os.path.abspath(os.path.dirname(filename))

    if filename.endswith('.yaml'):
        with open(filename) as f:
            ondisk = yaml.load(f, Loader=Loader)
            records = {'conf_filename': ondisk['conf_filename'],
                       'traj_lengths': [],
                       'traj_paths': [],
                       'traj_converted_from': [],
                       'traj_errors': []}

            for trj in ondisk['trajs']:
                records['traj_lengths'].append(trj['length'])
                records['traj_paths'].append(trj['path'])
                records['traj_errors'].append(trj['errors'])
                records['traj_converted_from'].append(trj['converted_from'])

    elif filename.endswith('.h5'):
        ondisk = io.loadh(filename, deferred=False)
        n_trajs = len(ondisk['TrajLengths'])
        records = {'conf_filename': str(ondisk['ConfFilename'][0]),
                   'traj_lengths': ondisk['TrajLengths'],
                   'traj_paths': [],
                   'traj_converted_from': [[None]] * n_trajs,
                   'traj_errors': [None] * n_trajs}

        for i in xrange(n_trajs):
            # this is the convention used in the hdf project format to get
            # the traj paths
            path = os.path.join(ondisk['TrajFilePath'][0],
                                ondisk['TrajFileBaseName'][0] + str(i) +
                                ondisk['TrajFileType'][0])
            records['traj_paths'].append(path)

    else:
        raise ValueError('Sorry, I can only open files in .yaml'
                         ' or .h5 format: %s' % filename)

    return cls(records, validate=False, project_dir=rootdir)
def test_prepare(self):
    rmsds = [metrics.RMSD(),  # all atom indices
             metrics.RMSD(range(self.n_atoms)),
             metrics.RMSD(xrange(self.n_atoms)),
             metrics.RMSD(np.arange(self.n_atoms))]

    for metric in rmsds:
        ptraj = metric.prepare_trajectory(self.traj)
def _square_all_pairwise(self, prepared_traj):
    """Reference implementation of all_pairwise"""
    warnings.warn('This is HORRIBLY inefficient. This operation really '
                  'needs to be done directly in C')
    output = np.empty((prepared_traj.n_frames, prepared_traj.n_frames))
    for i in xrange(prepared_traj.n_frames):
        output[i] = self.one_to_all(prepared_traj, prepared_traj, i)
    return output
def _eval_traj_shapes(self):
    lengths = np.zeros(self.n_trajs)
    n_atoms = np.zeros(self.n_trajs)
    for i in xrange(self.n_trajs):
        filename = self.traj_filename(i)
        with md.open(filename) as f:
            lengths[i] = len(f)
        n_atoms[i] = md.load_frame(filename, 0).n_atoms
    return lengths, n_atoms
def propagate_model(transition_matrix, n_steps, initial_populations,
                    observable_vector=None):
    """Propagate the time evolution of a population vector.

    Parameters
    ----------
    transition_matrix : ndarray or sparse matrix
        A transition matrix
    n_steps : int
        How many timesteps to iterate
    initial_populations : ndarray
        The initial population vector
    observable_vector : ndarray
        Vector containing the state-wise averaged property of some
        observable. Can be used to propagate properties such as fraction
        folded, ensemble average RMSD, etc. Default: None

    Returns
    -------
    X : ndarray
        Final population vector, after propagation
    obslist : list
        list of floats of length equal to the number of steps, giving the
        mean value of the observable (dot product of `observable_vector`
        and populations) at each timestep

    See Also
    --------
    sample
    scipy.sparse.linalg.aslinearoperator
    """
    check_transition(transition_matrix)

    if observable_vector is None:
        check_dimensions(transition_matrix, initial_populations)
    else:
        check_dimensions(transition_matrix, initial_populations,
                         observable_vector)

    X = initial_populations.copy()
    obslist = []
    if scipy.sparse.issparse(transition_matrix):
        TC = transition_matrix.tocsr()
    else:
        TC = transition_matrix
    Tl = scipy.sparse.linalg.aslinearoperator(TC)

    for i in xrange(n_steps):
        X = Tl.rmatvec(X)
        if observable_vector is not None:
            obslist.append(sum(observable_vector * X))

    return X, obslist
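# Usage sketch for propagate_model (illustrative values): start entirely
# in state 0 of a two-state chain and track the occupancy of state 1 over
# 10 steps. Relies on the same module namespace as the function above
# (np, check_transition, check_dimensions).
def _example_propagate_model():
    T = np.array([[0.9, 0.1],
                  [0.2, 0.8]])
    p0 = np.array([1.0, 0.0])
    obs = np.array([0.0, 1.0])  # observable = occupancy of state 1
    final_pops, state1_trace = propagate_model(T, 10, p0,
                                               observable_vector=obs)
    return final_pops, state1_trace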
def save(self, filename_or_file):
    if isinstance(filename_or_file, string_types):
        if not filename_or_file.endswith('.yaml'):
            filename_or_file += '.yaml'
        dirname = os.path.abspath(os.path.dirname(filename_or_file))
        if not os.path.exists(dirname):
            logger.info("Creating directory: %s" % dirname)
            os.makedirs(dirname)
        handle = open(filename_or_file, 'w')
        own_fid = True
    elif isinstance(filename_or_file, file):
        dirname = os.path.abspath(os.path.dirname(filename_or_file.name))
        handle = filename_or_file
        own_fid = False

    # somewhat complicated logic if the directory you're saving in is
    # different than the directory this project references its paths from
    #
    # the point is that when the file lists paths, those paths are going
    # to be interpreted as being with respect to the directory that the
    # file is in. So when the Project file is being resaved (but the
    # Trajectorys are not being moved) then the paths need to change to
    # compensate.
    relative = os.path.relpath(self._project_dir, dirname)

    records = {'trajs': []}
    records['conf_filename'] = os.path.join(relative, self._conf_filename)
    traj_paths = [os.path.join(relative, path)
                  for path in self._traj_paths]
    for i in xrange(len(traj_paths)):
        # yaml doesn't like numpy types, so we have to sanitize them
        records['trajs'].append({
            'id': i,
            'path': str(traj_paths[i]),
            'converted_from': list(self._traj_converted_from[i]),
            'length': int(self._traj_lengths[i]),
            'errors': self._traj_errors[i]})

    yaml.dump(records, handle, Dumper=Dumper)

    if own_fid:
        handle.close()

    return filename_or_file
def read(self, n_frames=None, stride=None, atom_indices=None):
    """Read data from a TINKER .arc file.

    Note that only the Cartesian coordinates are read in. The .arc file
    also contains TINKER-specific numeric atom types and some bonding
    information, which we do not read in.

    Parameters
    ----------
    n_frames : int, None
        The number of frames you would like to read from the file.
        If None, all of the remaining frames will be loaded.
    stride : int, optional
        Read only every stride-th frame.
    atom_indices : array_like, optional
        If not None, then read only a subset of the atoms coordinates
        from the file.

    Returns
    -------
    xyz : np.ndarray, shape=(n_frames, n_atoms, 3), dtype=np.float32
        The cartesian coordinates, in angstroms
    """
    if not self._mode == 'r':
        raise ValueError('read() is only available when file is opened '
                         'in mode="r"')

    if n_frames is None:
        frame_counter = itertools.count()
    else:
        frame_counter = xrange(n_frames)

    if stride is None:
        stride = 1

    coords = []
    for i in frame_counter:
        try:
            coord = self._read()
            if atom_indices is not None:
                coord = coord[atom_indices, :]
        except _EOF:
            break

        coords.append(coord)

        for j in range(stride - 1):
            # throw away these frames
            self._read()

    coords = np.array(coords)
    return coords
def test_mfpt_match():
    assignments = np.random.randint(10, size=(10, 2000))
    msm = MarkovStateModel(lag_time=1)
    msm.fit(assignments)

    # these two do different things
    mfpts0 = np.vstack([tpt.mfpts(msm, i) for i in xrange(10)]).T
    mfpts1 = tpt.mfpts(msm)

    # print(mfpts0)
    # print(mfpts1)

    npt.assert_array_almost_equal(mfpts0, mfpts1)
def _read(self):
    """Read a single frame. """
    # --- begin header ---
    first = self._fh.readline()  # ITEM: TIMESTEP
    if first == '':
        raise _EOF()
    self._fh.readline()  # timestep
    self._fh.readline()  # ITEM: NUMBER OF ATOMS
    self._n_atoms = int(self._fh.readline())  # num atoms
    box_header = self._fh.readline().split()  # ITEM: BOX BOUNDS
    self._line_counter += 5

    if len(box_header) == 9:
        lengths, angles = self.parse_box('triclinic')
    elif len(box_header) == 6:
        lengths, angles = self.parse_box('orthogonal')
    else:
        raise IOError('lammpstrj parse error on line {0:d} of "{1:s}". '
                      'This file does not appear to be a valid '
                      'lammpstrj file.'.format(self._line_counter,
                                               self._filename))

    self._fh.readline()  # ITEM: ATOMS ...
    self._line_counter += 4
    # --- end header ---

    xyz = np.empty(shape=(self._n_atoms, 3))
    types = np.empty(shape=self._n_atoms, dtype='int')

    # --- begin body ---
    for _ in xrange(self._n_atoms):
        line = self._fh.readline()
        if line == '':
            raise _EOF()
        temp = line.split()
        try:
            atom_index = int(temp[0])
            types[atom_index - 1] = int(temp[1])
            xyz[atom_index - 1] = [float(x) for x in temp[2:5]]
        except Exception:
            raise IOError('lammpstrj parse error on line {0:d} of '
                          '"{1:s}". This file does not appear to be a '
                          'valid lammpstrj file.'.format(
                              self._line_counter, self._filename))
        self._line_counter += 1
    # --- end body ---

    self._frame_index += 1
    return xyz, lengths, angles
def read(self, n_frames=None, stride=None, atom_indices=None):
    """Read data from a xyz file.

    Parameters
    ----------
    n_frames : int, None
        The number of frames you would like to read from the file.
        If None, all of the remaining frames will be loaded.
    stride : int, optional
        Read only every stride-th frame.
    atom_indices : array_like, optional
        If not None, then read only a subset of the atoms coordinates
        from the file.

    Returns
    -------
    xyz : np.ndarray, shape=(n_frames, n_atoms, 3), dtype=np.float32
    """
    if not self._mode == 'r':
        raise ValueError('read() is only available when file is opened '
                         'in mode="r"')

    if n_frames is None:
        frame_counter = itertools.count()
    else:
        frame_counter = xrange(n_frames)

    if stride is None:
        stride = 1

    all_coords = []
    for i in frame_counter:
        try:
            frame_coords = self._read()
            if atom_indices is not None:
                frame_coords = frame_coords[atom_indices, :]
        except _EOF:
            break

        all_coords.append(frame_coords)

        for j in range(stride - 1):
            # throw away these frames
            try:
                self._read()
            except _EOF:
                break

    all_coords = np.array(all_coords)
    return all_coords
def load_trajectories(project, stride, atom_indices):
    list_of_trajs = []
    for i in xrange(project.n_trajs):
        # note, load_traj is only using the fast strided loading for
        # HDF5 formatted trajs
        traj = project.load_traj(i, stride=stride,
                                 atom_indices=atom_indices)

        if atom_indices is not None:
            assert len(atom_indices) == traj.n_atoms

        list_of_trajs.append(traj)

    return list_of_trajs
def run(prep_metric, project, delta_time, atom_indices=None,
        output='tICAData.h5', min_length=0, stride=1):

    # We will load the trajectories at the stride, so we need to find
    # what dt should be once we've strided by some amount
    lag = delta_time / stride

    if (float(delta_time) / stride) != lag:
        raise Exception("Stride must be a divisor of delta_time.")

    if lag > 0:
        # Then we're doing tICA
        tica_obj = tICA(lag=lag, calc_cov_mat=True,
                        prep_metric=prep_metric)
    else:
        # If lag is zero, this is equivalent to regular PCA
        tica_obj = tICA(lag=lag, calc_cov_mat=False,
                        prep_metric=prep_metric)

    for i in xrange(project.n_trajs):
        logger.info("Working on trajectory %d" % i)

        if project.traj_lengths[i] <= lag:
            logger.info("Trajectory is not long enough for this lag "
                        "(%d vs %d)", project.traj_lengths[i], lag)
            continue

        if project.traj_lengths[i] < min_length:
            logger.info("Trajectory is not longer than min_length "
                        "(%d vs %d)", project.traj_lengths[i], min_length)
            continue

        # it would be more memory efficient if we trained incrementally,
        # at least for long trajectories
        traj_chunk = md.load(project.traj_filename(i), stride=stride,
                             atom_indices=atom_indices)
        tica_obj.train(trajectory=traj_chunk)

    tica_obj.solve()
    tica_obj.save(output)
    logger.info("Saved output to %s", output)

    return tica_obj
def _validate(self):
    "Run some checks to ensure that this project is consistent"
    if not os.path.exists(self.conf_filename):
        raise ValueError('conf does not exist: %s' % self.conf_filename)
    for i in xrange(self.n_trajs):
        if not os.path.exists(self.traj_filename(i)):
            raise ValueError("%s does not exist" % self.traj_filename(i))
    lengths, atoms = self._eval_traj_shapes()
    if not np.all(self.traj_lengths == lengths):
        raise ValueError("Traj lengths don't match what's on disk")

    # make sure all trajs have the same number of atoms
    # note that it is possible that there are no valid trajectories, so
    # atoms could be empty
    if len(atoms) > 0 and not np.all(atoms == atoms[0]):
        raise ValueError('Not all trajs have the same number of atoms')
def normalize_left_eigenvectors(left_eigenvectors):
    """Normalize the left eigenvectors

    Normalization condition is
    <left_eigenvectors[:, i] / populations, left_eigenvectors[:, i]> = 1

    Parameters
    ----------
    left_eigenvectors : ndarray
        The left eigenvectors, as a two-dimensional array where the kth
        eigenvector is left_eigenvectors[:, k]

    Notes
    -----
    Acts inplace. Assumes that left_eigenvectors[:, 0] is the equilibrium
    vector and that detailed balance holds.
    """
    populations = left_eigenvectors[:, 0]
    populations /= populations.sum()

    for k in xrange(1, left_eigenvectors.shape[-1]):
        x = left_eigenvectors[:, k]
        x /= abs(np.dot(x / populations, x)) ** .5
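# A quick check of the normalization condition above (illustrative input):
# after the in-place normalization, each non-equilibrium eigenvector v
# should satisfy |<v / populations, v>| == 1. Note the equilibrium column
# is also rescaled in place to sum to one.
_vecs = np.array([[0.6, 1.0],
                  [0.4, -2.0]])
normalize_left_eigenvectors(_vecs)
_pops = _vecs[:, 0] / _vecs[:, 0].sum()
assert abs(np.dot(_vecs[:, 1] / _pops, _vecs[:, 1]) - 1.0) < 1e-10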
def load_prep_trajectories(project, stride, atom_indices, metric):
    """Load the trajectories, but prepare them during the load.

    This is helpful for metrics that use dimensionality reduction, so you
    can use more frames without a MemoryError
    """
    list_of_ptrajs = []
    which = []
    for i in xrange(project.n_trajs):
        which_frames = np.arange(0, project.traj_lengths[i], stride)
        which.extend(zip([i] * len(which_frames), which_frames))

        ptraj = []
        for chunk in md.iterload(project.traj_filename(i), stride=stride,
                                 atom_indices=atom_indices):
            ptrj_chunk = metric.prepare_trajectory(chunk)
            ptraj.append(ptrj_chunk)

        ptraj = np.concatenate(ptraj)
        list_of_ptrajs.append(ptraj)

    return list_of_ptrajs, np.array(which)
def test_mfpt2():
    tprob = np.array([[0.90, 0.10],
                      [0.22, 0.78]])

    # stationary distribution from detailed balance:
    # pi1 * T[1, 0] = pi0 * T[0, 1]
    pi0 = 1
    pi1 = pi0 * tprob[0, 1] / tprob[1, 0]
    pops = np.array([pi0, pi1]) / (pi0 + pi1)

    msm = MarkovStateModel(lag_time=1)
    msm.transmat_ = tprob
    msm.n_states_ = 2
    msm.populations_ = pops

    mfpts = np.vstack([tpt.mfpts(msm, i) for i in xrange(2)]).T

    # print(1 / (1 - tprob[0, 0]), mfpts[0, 1])
    # print(1 / (1 - tprob[1, 1]), mfpts[1, 0])

    # since it's a 2x2 the mfpt from 0 -> 1 is the
    # same as the escape time of 0
    npt.assert_almost_equal(1 / (1 - tprob[0, 0]), mfpts[0, 1])
    npt.assert_almost_equal(1 / (1 - tprob[1, 1]), mfpts[1, 0])