def fetch_fs_peptide(data_home=None, download_if_missing=True):
    """Loader for the Fs peptide dataset

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all mixtape data is stored in '~/mixtape_data' subfolders.
    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    Bunch
        ``trajectories`` is the list of objects returned by ``md.load`` for
        every ``trajectory*.xtc`` file in the dataset directory (sorted by
        file name); ``DESCR`` is this module's docstring.

    Raises
    ------
    IOError
        If the dataset directory does not exist and ``download_if_missing``
        is False.
    """
    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)

    data_dir = join(data_home, TARGET_DIRECTORY)
    if not exists(data_dir):
        if not download_if_missing:
            # Honour the documented contract instead of silently downloading.
            raise IOError('fs peptide data not found in %s and '
                          'download_if_missing is False' % data_dir)

        print('downloading fs peptide from %s to %s' % (DATA_URL, data_home))
        # try/finally (rather than ``with``) so the handle is closed even on
        # interpreters where the urlopen response is not a context manager.
        fhandle = urlopen(DATA_URL)
        try:
            buf = BytesIO(fhandle.read())
        finally:
            fhandle.close()

        zip_file = ZipFile(buf)
        try:
            makedirs(data_dir)
            zip_file.extractall(path=data_dir)
        finally:
            zip_file.close()

    top = md.load(join(data_dir, 'fs_peptide.pdb'))
    trajectories = []
    for fn in sorted(glob(join(data_dir, 'trajectory*.xtc'))):
        print('loading %s...' % basename(fn))
        trajectories.append(md.load(fn, top=top))

    return Bunch(trajectories=trajectories, DESCR=__doc__)
def test_met_enkephalin_mstep():
    """Smoke-test the MSLDS M-step solver on the met-enkephalin dataset.

    Fits a two-state ``GaussianHMM`` to the first met-enkephalin trajectory,
    collects its sufficient statistics with ``reference_estep`` and feeds
    them to ``MetastableSwitchingLDSSolver.do_mstep``. The test makes no
    assertions of its own; it passes when the pipeline runs without raising.

    On failure the traceback is printed and a post-mortem debugger session is
    opened, after which the exception is re-raised so the test is reported as
    failed rather than silently passing.
    """
    import pdb
    import sys
    import traceback
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    try:
        b = fetch_met_enkephalin()
        trajs = b.trajectories
        # While debugging, restrict to first trajectory only
        trajs = [trajs[0]]
        n_atoms = trajs[0].n_atoms
        n_features = n_atoms * 3
        # Single pre-formatted string: prints the same text whether or not
        # print is a function in the running interpreter.
        print("n_features:  %s" % n_features)

        data_home = get_data_home()
        data_dir = join(data_home, TARGET_DIRECTORY_MET)
        top = md.load(join(data_dir, '1plx.pdb'))
        n_components = 2

        # Superpose each trajectory on the reference structure, then flatten
        # every frame's coordinates into one feature vector.
        data = []
        for traj in trajs:
            traj.superpose(top)
            Z = traj.xyz
            # len(Z) is this trajectory's own frame count, so the reshape
            # stays valid if more than one trajectory is ever used.
            data.append(np.reshape(Z, (len(Z), n_features), order='F'))

        # Fit reference model and initial MSLDS model
        print("Starting Gaussian Model Fit")
        refmodel = GaussianHMM(n_components=n_components,
                               covariance_type='full').fit(data)
        print("Done with Gaussian Model Fit")

        # Obtain sufficient statistics from refmodel
        _, rstats = reference_estep(refmodel, data)

        # Initial parameters handed to the solver: zero transition matrices,
        # and the reference model's covariances / means for Qs / bs.
        As = [np.zeros((n_features, n_features))
              for _ in range(n_components)]
        Qs = refmodel.covars_
        bs = refmodel.means_
        means = refmodel.means_
        covars = refmodel.covars_

        # Test AQB solver for MSLDS
        solver = MetastableSwitchingLDSSolver(n_components, n_features)
        solver.do_mstep(As, Qs, bs, means, covars, rstats, N_iter=100,
                        verbose=True)
    except Exception:
        # Keep the interactive debugging aid, but never swallow the failure.
        _, _, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)
        raise
def load_quadwell(data_home=None, random_state=None):
    """Loader for quad-well dataset

    Parameters
    ----------
    data_home : optional, default: None
        Specify another cache folder for the datasets. By default
        all mixtape data is stored in '~/mixtape_data' subfolders.
    random_state : {int, None}, default: None
        Seed for the pseudorandom number generator used to generate the
        trajectories. With an int seed, the simulated trajectories are
        cached under ``data_home`` and reloaded from there when the same
        seed is requested again. With None, a fresh simulation is run on
        every call and nothing is cached.

    Returns
    -------
    Bunch
        ``trajectories`` holds the simulated (or cached) trajectories and
        ``DESCR`` holds ``QUADWELL_DESCRIPTION``.

    Raises
    ------
    TypeError
        If ``random_state`` is neither None nor an integer.
    """
    # V = 4*(x**8 + 0.8*np.exp(-80*x**2) + 0.2*(-80*(x-0.5)**2) + 0.5*np.exp(-40*(x+0.5)**2))
    # NOTE(review): the third term above has no np.exp, unlike its
    # neighbours -- possibly a typo in this note; confirm against
    # _simulate_quadwell.
    rng = check_random_state(random_state)

    cache_dir = join(get_data_home(data_home=data_home), 'quadwell')
    if not exists(cache_dir):
        makedirs(cache_dir)

    if random_state is None:
        # Unseeded request: simulate and return without touching the cache.
        return Bunch(trajectories=_simulate_quadwell(rng),
                     DESCR=QUADWELL_DESCRIPTION)

    if not isinstance(random_state, numbers.Integral):
        raise TypeError('random_state must be an int')

    cache_file = join(cache_dir,
                      'version-0_random-state-%d.pkl' % random_state)
    if exists(cache_file):
        trajectories = verboseload(cache_file)
    else:
        trajectories = _simulate_quadwell(rng)
        verbosedump(trajectories, cache_file)

    return Bunch(trajectories=trajectories, DESCR=QUADWELL_DESCRIPTION)
def load_doublewell(data_home=None, random_state=None):
    """Loader for double-well dataset

    Parameters
    ----------
    data_home : optional, default: None
        Specify another cache folder for the datasets. By default
        all mixtape data is stored in '~/mixtape_data' subfolders.
    random_state : {int, None}, default: None
        Seed the pseudorandom number generator to generate trajectories.
        If random_state is an int, the simulations will be cached in
        ``data_home``, or loaded from ``data_home`` if simulations with
        that seed have been performed already. With random_state=None,
        new simulations will be performed and the trajectories will not
        be cached.

    Returns
    -------
    Bunch
        ``trajectories`` holds the simulated (or cached) trajectories and
        ``DESCR`` holds ``DOUBLEWELL_DESCRIPTION``.

    Raises
    ------
    TypeError
        If ``random_state`` is neither None nor an integer.
    """
    random = check_random_state(random_state)
    data_home = join(get_data_home(data_home=data_home), 'doublewell')
    if not exists(data_home):
        makedirs(data_home)

    if random_state is None:
        trajectories = _simulate_doublewell(random)
    else:
        # Explicit raise rather than ``assert``: assertions are stripped
        # under ``python -O``, which would let a bad seed reach the '%d'
        # format below. Mirrors the check in load_quadwell.
        if not isinstance(random_state, numbers.Integral):
            raise TypeError('random_state must be an int')
        path = join(data_home,
                    'version-1_random-state-%d.pkl' % random_state)
        if exists(path):
            trajectories = verboseload(path)
        else:
            trajectories = _simulate_doublewell(random)
            verbosedump(trajectories, path)

    return Bunch(trajectories=trajectories, DESCR=DOUBLEWELL_DESCRIPTION)
def test_alanine_dipeptide_stats():
    """Check MSLDS E-step statistics against a reference Gaussian HMM.

    Fits a two-state ``GaussianHMM`` to the first alanine dipeptide
    trajectory, copies its parameters into a ``MetastableSwitchingLDS``
    model whose transition matrices ``As_`` are all zero, and then yields
    one check per sufficient statistic comparing the model's E-step output
    with ``reference_estep`` on the reference model.

    This is a generator test: each yielded item is a zero-argument callable
    that raises if the corresponding statistic differs beyond the stated
    number of decimals.

    If the set-up fails, the traceback is printed, a post-mortem debugger
    session is opened, and the exception is re-raised so the failure is
    reported instead of the generator silently yielding nothing.
    """
    import pdb
    import sys
    import traceback
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    try:
        b = fetch_alanine_dipeptide()
        trajs = b.trajectories
        # While debugging, restrict to first trajectory only
        trajs = [trajs[0]]
        n_atoms = trajs[0].n_atoms
        n_features = n_atoms * 3

        data_home = get_data_home()
        data_dir = join(data_home, TARGET_DIRECTORY_ALANINE)
        top = md.load(join(data_dir, 'ala2.pdb'))
        n_components = 2

        # Superpose each trajectory on the reference structure, then flatten
        # every frame's coordinates into one feature vector.
        data = []
        for traj in trajs:
            traj.superpose(top)
            Z = traj.xyz
            # len(Z) is this trajectory's own frame count.
            data.append(np.reshape(Z, (len(Z), n_features), order='F'))

        n_hotstart = 3

        # Fit reference model and initial MSLDS model
        refmodel = GaussianHMM(n_components=n_components,
                               covariance_type='full').fit(data)
        _, rstats = reference_estep(refmodel, data)

        model = MetastableSwitchingLDS(n_components, n_features,
                                       n_hotstart=n_hotstart)
        model.inferrer._sequences = data
        model.means_ = refmodel.means_
        model.covars_ = refmodel.covars_
        model.transmat_ = refmodel.transmat_
        model.populations_ = refmodel.startprob_
        model.As_ = [np.zeros((n_features, n_features))
                     for _ in range(n_components)]

        # Add a small multiple of the identity to each reference covariance
        # before using it as Q.
        eps = 1e-7
        model.Qs_ = [refmodel.covars_[i] + eps*np.eye(n_features)
                     for i in range(n_components)]
        model.bs_ = refmodel.means_

        logprob, stats = model.inferrer.do_estep()
    except Exception:
        # Keep the interactive debugging aid, but never swallow the failure.
        _, _, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)
        raise

    # (statistic key, decimals of agreement required)
    checks = [
        ('post', 2),
        ('post[1:]', 2),
        ('post[:-1]', 2),
        ('obs', 1),
        ('obs[1:]', 1),
        ('obs[:-1]', 1),
        ('obs*obs.T', 1),
        ('obs*obs[t-1].T', 1),
        ('obs[1:]*obs[1:].T', 1),
        ('obs[:-1]*obs[:-1].T', 1),
        # This test fails consistently. TODO: Figure out why.
        # ('trans', 2),
    ]
    for key, decimal in checks:
        # Bind key/decimal as defaults so each callable keeps its own values
        # (closures would otherwise all see the last loop iteration).
        yield lambda key=key, decimal=decimal: (
            np.testing.assert_array_almost_equal(
                stats[key], rstats[key], decimal=decimal))
def test_alanine_dipeptide():
    """End-to-end run of MSLDS on alanine dipeptide with trajectory synthesis.

    Steps performed:

    1. Load the alanine dipeptide trajectories, superpose them on
       ``ala2.pdb`` and flatten each frame into a feature vector.
    2. Fit a ``MetastableSwitchingLDS`` model and a ``GaussianFusionHMM``
       and print both scores.
    3. Sample ``sim_T`` points from the MSLDS model.
    4. For every sampled point, pick the frame from the original data
       (restricted to frames assigned to the sampled hidden state) with the
       highest log density around that point, and append it to a generated
       trajectory.
    5. Save the generated trajectory and its first frame to a fresh
       temporary directory.

    The test makes no assertions of its own; it passes when the pipeline
    runs without raising. On failure the traceback is printed, a post-mortem
    debugger session is opened, and the exception is re-raised.
    """
    import pdb
    import sys
    import tempfile
    import traceback
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    try:
        b = fetch_alanine_dipeptide()
        trajs = b.trajectories
        n_atoms = trajs[0].n_atoms
        n_features = n_atoms * 3
        sim_T = 1000

        data_home = get_data_home()
        data_dir = join(data_home, TARGET_DIRECTORY_ALANINE)
        top = md.load(join(data_dir, 'ala2.pdb'))
        n_components = 2

        # Superpose each trajectory on the reference structure, then flatten
        # every frame's coordinates into one feature vector.
        data = []
        for traj in trajs:
            traj.superpose(top)
            Z = traj.xyz
            data.append(np.reshape(Z, (len(Z), n_features), order='F'))

        # Fit MSLDS model
        n_experiments = 1
        n_em_iter = 1
        tol = 1e-1
        model = MetastableSwitchingLDS(n_components, n_features,
                                       n_experiments=n_experiments,
                                       n_em_iter=n_em_iter)
        model.fit(data, gamma=.1, tol=tol, verbose=True)
        mslds_score = model.score(data)
        print("MSLDS Log-Likelihood = %f" % mslds_score)

        # Fit Gaussian HMM for comparison
        g = GaussianFusionHMM(n_components, n_features)
        g.fit(data)
        hmm_score = g.score(data)
        print("HMM Log-Likelihood = %f" % hmm_score)
        print()

        # Generate a trajectory from learned model.
        sample_traj, hidden_states = model.sample(sim_T)

        # Presort the data into the metastable wells: states[k][i] holds the
        # frames of trajectory i whose most likely state is k. The state
        # assignment does not depend on k, so compute it once per trajectory.
        states = [[] for _ in range(n_components)]
        for traj in trajs:
            Z = traj.xyz
            Z = np.reshape(Z, (len(Z), n_features), order='F')
            logprob = log_multivariate_normal_density(
                Z, np.array(model.means_), np.array(model.covars_),
                covariance_type='full')
            assignments = np.argmax(logprob, axis=1)
            for k in range(n_components):
                # pick structures that have highest log probability in state
                states[k].append(traj[assignments == k])

        # Pick frame from original trajectories closest to current sample
        gen_traj = None
        for t in range(sim_T):
            h = hidden_states[t]
            mean = sample_traj[t]
            best_logprob = -np.inf
            best_frame = None
            for candidates in states[h]:
                if t > 0:
                    # Align candidates on the previously generated frame.
                    candidates.superpose(gen_traj, t-1)
                Z = candidates.xyz
                Z = np.reshape(Z, (len(Z), n_features), order='F')
                # NOTE(review): a single mean / covariance is passed here,
                # whereas the call above passes arrays of them -- confirm
                # log_multivariate_normal_density accepts this shape.
                logprobs = log_multivariate_normal_density(
                    Z, mean, model.Qs_[h], covariance_type='full')
                ind = np.argmax(logprobs, axis=0)
                logprob = logprobs[ind]
                if logprob > best_logprob:
                    # Remember the best candidate seen so far for this step.
                    best_logprob = logprob
                    best_frame = candidates[ind]
            if t == 0:
                gen_traj = best_frame
            else:
                gen_traj = gen_traj.join(best_frame)

        # There is no ``self`` in a module-level test function; write the
        # output into a fresh temporary directory instead.
        out_prefix = join(tempfile.mkdtemp(), 'alanine_dipeptide_gen')
        gen_traj.save('%s.xtc' % out_prefix)
        gen_traj[0].save('%s.xtc.pdb' % out_prefix)
    except Exception:
        # Keep the interactive debugging aid, but never swallow the failure.
        _, _, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)
        raise