Python get_data_homeの例、mixtape.datasets.base.get_data_home Pythonの例

コード例 #1

0

ファイルを表示

ファイル: fs_peptide.py プロジェクト: jchodera/mixtape

def fetch_fs_peptide(data_home=None, download_if_missing=True):
    """Loader for the Fs peptide dataset

    The archive at ``DATA_URL`` is downloaded and unpacked into
    ``<data_home>/<TARGET_DIRECTORY>`` the first time the function is called;
    later calls reuse the unpacked directory.

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all mixtape data is stored in '~/mixtape_data' subfolders.

    download_if_missing: optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    Bunch
        ``trajectories`` holds every ``trajectory*.xtc`` file found in the
        data directory, loaded in sorted filename order against the
        ``fs_peptide.pdb`` topology. ``DESCR`` is the module docstring.

    Raises
    ------
    IOError
        If the data directory does not exist and ``download_if_missing`` is
        False.
    """
    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)

    data_dir = join(data_home, TARGET_DIRECTORY)
    if not exists(data_dir):
        if not download_if_missing:
            # Honor the documented contract instead of silently downloading.
            raise IOError('Data not found and `download_if_missing` is False')

        print('downloading fs peptide from %s to %s' % (DATA_URL, data_home))
        # try/finally rather than ``with``: the object returned by urlopen is
        # not a context manager on every Python version.
        fhandle = urlopen(DATA_URL)
        try:
            buf = BytesIO(fhandle.read())
        finally:
            fhandle.close()

        zip_file = ZipFile(buf)
        try:
            makedirs(data_dir)
            zip_file.extractall(path=data_dir)
        finally:
            zip_file.close()

    top = md.load(join(data_dir, 'fs_peptide.pdb'))
    trajectories = []
    for fn in sorted(glob(join(data_dir, 'trajectory*.xtc'))):
        print('loading %s...' % basename(fn))
        trajectories.append(md.load(fn, top=top))

    return Bunch(trajectories=trajectories, DESCR=__doc__)

コード例 #2

0

ファイルを表示

ファイル: test_mslds_mstep.py プロジェクト: rbharath/mixtape

def test_met_enkephalin_mstep():
    """Run the MSLDS M-step solver on statistics from a Gaussian HMM.

    The first met-enkephalin trajectory is superposed onto the ``1plx.pdb``
    structure and flattened to one row per frame. A two-state ``GaussianHMM``
    is fitted to it, sufficient statistics are taken from that reference
    model with ``reference_estep``, and
    ``MetastableSwitchingLDSSolver.do_mstep`` is run on them for 100
    iterations. The test makes no assertions; it only checks that the
    pipeline runs without raising.

    On failure the traceback is printed and a post-mortem debugger is opened,
    then the exception is re-raised so the test is still reported as failed.
    """
    import pdb
    import sys
    import traceback
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    try:
        b = fetch_met_enkephalin()
        # While debugging, restrict to first trajectory only
        trajs = [b.trajectories[0]]
        n_frames = trajs[0].n_frames
        n_atoms = trajs[0].n_atoms
        n_features = n_atoms * 3
        # Single-argument print() works on both Python 2 and 3; the two
        # spaces reproduce the output of the old print statement.
        print("n_features:  %s" % n_features)

        data_home = get_data_home()
        data_dir = join(data_home, TARGET_DIRECTORY_MET)
        top = md.load(join(data_dir, '1plx.pdb'))
        n_components = 2

        # Superpose every trajectory onto the reference structure and flatten
        # the coordinates to one row per frame.
        data = []
        for traj in trajs:
            traj.superpose(top)
            Z = np.reshape(traj.xyz, (n_frames, n_features), order='F')
            data.append(Z)

        # Fit reference model
        print("Starting Gaussian Model Fit")
        refmodel = GaussianHMM(n_components=n_components,
                               covariance_type='full').fit(data)
        print("Done with Gaussian Model Fit")

        # Obtain sufficient statistics from refmodel
        _, rstats = reference_estep(refmodel, data)
        means = refmodel.means_
        covars = refmodel.covars_
        # Initial guesses: zero transition matrices, with the reference
        # covariances and means reused as Qs and bs.
        As = [np.zeros((n_features, n_features))
              for _ in range(n_components)]
        Qs = refmodel.covars_
        bs = refmodel.means_

        # Test AQB solver for MSLDS
        solver = MetastableSwitchingLDSSolver(n_components, n_features)
        solver.do_mstep(As, Qs, bs, means, covars, rstats, N_iter=100,
                        verbose=True)
    except Exception:
        # Keep the interactive debugging hook, but do not swallow the failure.
        traceback.print_exc()
        pdb.post_mortem(sys.exc_info()[2])
        raise

コード例 #3

0

ファイルを表示

ファイル: brownian1d.py プロジェクト: jchodera/mixtape

def load_quadwell(data_home=None, random_state=None):
    """Return quad-well trajectories, simulating or reading them from cache.

    Parameters
    ----------
    data_home : optional, default: None
        Alternative cache folder for the datasets. By default all mixtape
        data is stored in '~/mixtape_data' subfolders; this loader uses the
        'quadwell' subfolder, which is created if it does not exist.
    random_state : {int, None}, default: None
        Seed for the pseudorandom number generator used to generate the
        trajectories. With an int, the simulations are cached in
        ``data_home`` and read back from there when simulations with that
        seed have been performed already. With None, new simulations are
        performed on every call and nothing is cached.

    Returns
    -------
    Bunch
        With fields ``trajectories`` and ``DESCR`` (the module-level
        ``QUADWELL_DESCRIPTION``).

    Raises
    ------
    TypeError
        If ``random_state`` is neither None nor an integer.
    """
    # Potential as noted by the original author (kept for reference only,
    # it is not evaluated here):
    # V = 4*(x**8 + 0.8*np.exp(-80*x**2) + 0.2*(-80*(x-0.5)**2) + 0.5*np.exp(-40*(x+0.5)**2))

    rng = check_random_state(random_state)
    cache_dir = join(get_data_home(data_home=data_home), 'quadwell')
    if not exists(cache_dir):
        makedirs(cache_dir)

    if random_state is not None:
        if not isinstance(random_state, numbers.Integral):
            raise TypeError('random_state must be an int')
        cache_file = join(cache_dir,
                          'version-0_random-state-%d.pkl' % random_state)
        if not exists(cache_file):
            # First request for this seed: simulate and store the result.
            trajectories = _simulate_quadwell(rng)
            verbosedump(trajectories, cache_file)
        else:
            trajectories = verboseload(cache_file)
    else:
        # Unseeded runs are never cached.
        trajectories = _simulate_quadwell(rng)

    return Bunch(trajectories=trajectories, DESCR=QUADWELL_DESCRIPTION)

コード例 #4

0

ファイルを表示

ファイル: brownian1d.py プロジェクト: rbharath/mixtape

def load_doublewell(data_home=None, random_state=None):
    """Loader for double-well dataset

    Parameters
    ----------
    data_home : optional, default: None
        Specify another cache folder for the datasets. By default
        all mixtape data is stored in '~/mixtape_data' subfolders.
    random_state : {int, None}, default: None
        Seed the pseudorandom number generator to generate trajectories. If
        random_state is an int, the simulations will be cached in
        ``data_home``, or loaded from ``data_home`` if simulations with that
        seed have been performed already. With random_state=None, new
        simulations will be performed and the trajectories will not be
        cached.

    Returns
    -------
    Bunch
        With fields ``trajectories`` and ``DESCR`` (the module-level
        ``DOUBLEWELL_DESCRIPTION``).

    Raises
    ------
    TypeError
        If ``random_state`` is neither None nor an integer.
    """
    random = check_random_state(random_state)
    data_home = join(get_data_home(data_home=data_home), 'doublewell')
    if not exists(data_home):
        makedirs(data_home)

    if random_state is None:
        trajectories = _simulate_doublewell(random)
    else:
        # Explicit check rather than ``assert``: assertions are stripped
        # under ``python -O``, and the seed is formatted into the cache
        # file name below.
        if not isinstance(random_state, numbers.Integral):
            raise TypeError('random_state must be an int')
        path = join(data_home, 'version-1_random-state-%d.pkl' % random_state)
        if exists(path):
            trajectories = verboseload(path)
        else:
            trajectories = _simulate_doublewell(random)
            verbosedump(trajectories, path)

    return Bunch(trajectories=trajectories, DESCR=DOUBLEWELL_DESCRIPTION)

コード例 #5

0

ファイルを表示

ファイル: test_mslds_estep.py プロジェクト: rbharath/mixtape

def test_alanine_dipeptide_stats():
    """Compare MSLDS E-step statistics with those of a reference Gaussian HMM.

    A two-state ``GaussianHMM`` is fitted to the first alanine dipeptide
    trajectory (superposed onto ``ala2.pdb`` and flattened to one row per
    frame). A ``MetastableSwitchingLDS`` model is then given the HMM's
    parameters, with zero ``As_`` matrices and slightly regularised
    covariances as ``Qs_``, and its E-step is run.

    This is a generator test: each yielded callable asserts that one entry
    of the MSLDS statistics matches the corresponding reference entry to the
    stated number of decimals.

    If the setup raises, the traceback is printed and a post-mortem debugger
    is opened, then the exception is re-raised so the failure is reported.
    """
    import pdb
    import sys
    import traceback
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    try:
        b = fetch_alanine_dipeptide()
        # While debugging, restrict to first trajectory only
        trajs = [b.trajectories[0]]
        n_frames = trajs[0].n_frames
        n_atoms = trajs[0].n_atoms
        n_features = n_atoms * 3

        data_home = get_data_home()
        data_dir = join(data_home, TARGET_DIRECTORY_ALANINE)
        top = md.load(join(data_dir, 'ala2.pdb'))
        n_components = 2

        # Superpose every trajectory onto the reference structure and flatten
        # the coordinates to one row per frame.
        data = []
        for traj in trajs:
            traj.superpose(top)
            Z = np.reshape(traj.xyz, (n_frames, n_features), order='F')
            data.append(Z)

        n_hotstart = 3
        # Fit reference model and initial MSLDS model
        refmodel = GaussianHMM(n_components=n_components,
                               covariance_type='full').fit(data)
        _, rstats = reference_estep(refmodel, data)

        model = MetastableSwitchingLDS(n_components, n_features,
                                       n_hotstart=n_hotstart)
        model.inferrer._sequences = data
        model.means_ = refmodel.means_
        model.covars_ = refmodel.covars_
        model.transmat_ = refmodel.transmat_
        model.populations_ = refmodel.startprob_
        model.As_ = [np.zeros((n_features, n_features))
                     for _ in range(n_components)]
        # A small multiple of the identity is added to each reference
        # covariance before it is used as Q.
        eps = 1e-7
        model.Qs_ = [refmodel.covars_[i] + eps*np.eye(n_features)
                     for i in range(n_components)]
        model.bs_ = refmodel.means_
        _, stats = model.inferrer.do_estep()

        # (statistic name, decimals of agreement required)
        checks = [
            ('post', 2),
            ('post[1:]', 2),
            ('post[:-1]', 2),
            ('obs', 1),
            ('obs[1:]', 1),
            ('obs[:-1]', 1),
            ('obs*obs.T', 1),
            ('obs*obs[t-1].T', 1),
            ('obs[1:]*obs[1:].T', 1),
            ('obs[:-1]*obs[:-1].T', 1),
        ]
        for key, decimal in checks:
            # Default arguments bind the current key/decimal; a plain closure
            # would see only the values from the final loop iteration.
            yield lambda key=key, decimal=decimal: (
                np.testing.assert_array_almost_equal(
                    stats[key], rstats[key], decimal=decimal))
        # TODO: the comparison of stats['trans'] with rstats['trans']
        # (decimal=2) fails consistently and is disabled. Figure out why.

    except Exception:
        # ``except Exception`` (not a bare except) lets GeneratorExit through,
        # so closing the generator early no longer opens the debugger.
        traceback.print_exc()
        pdb.post_mortem(sys.exc_info()[2])
        raise

コード例 #6

0

ファイルを表示

ファイル: test_mslds.py プロジェクト: jchodera/mixtape

def test_alanine_dipeptide():
    """Fit an MSLDS model to alanine dipeptide and build a sampled trajectory.

    Steps:

    1. Superpose all alanine dipeptide trajectories onto ``ala2.pdb`` and
       flatten each to one row per frame.
    2. Fit a ``MetastableSwitchingLDS`` model (one EM iteration) and a
       ``GaussianFusionHMM`` and print both log-likelihoods.
    3. Sample ``sim_T`` steps from the MSLDS model.
    4. Assign every original frame to the state with the highest Gaussian
       log-density, and for each sampled step pick the highest-scoring frame
       from the frames assigned to that step's hidden state.
    5. Save the assembled trajectory as ``.xtc`` and its first frame as
       ``.xtc.pdb`` in a fresh temporary directory.

    The test makes no assertions; it only checks that the pipeline runs. On
    failure the traceback is printed and a post-mortem debugger is opened,
    then the exception is re-raised so the failure is reported.
    """
    import pdb
    import sys
    import tempfile
    import traceback
    warnings.filterwarnings("ignore",
                            category=DeprecationWarning)
    try:
        b = fetch_alanine_dipeptide()
        trajs = b.trajectories
        n_atoms = trajs[0].n_atoms
        n_features = n_atoms * 3
        sim_T = 1000
        data_home = get_data_home()
        data_dir = join(data_home, TARGET_DIRECTORY_ALANINE)
        top = md.load(join(data_dir, 'ala2.pdb'))
        n_components = 2

        # Superpose every trajectory onto the reference structure and flatten
        # the coordinates to one row per frame.
        data = []
        for traj in trajs:
            traj.superpose(top)
            Z = traj.xyz
            Z = np.reshape(Z, (len(Z), n_features), order='F')
            data.append(Z)

        # Fit MSLDS model
        n_experiments = 1
        n_em_iter = 1
        tol = 1e-1
        model = MetastableSwitchingLDS(n_components,
                                       n_features,
                                       n_experiments=n_experiments,
                                       n_em_iter=n_em_iter)
        model.fit(data, gamma=.1, tol=tol, verbose=True)
        mslds_score = model.score(data)
        print("MSLDS Log-Likelihood = %f" % mslds_score)

        # Fit Gaussian HMM for comparison
        g = GaussianFusionHMM(n_components, n_features)
        g.fit(data)
        hmm_score = g.score(data)
        print("HMM Log-Likelihood = %f" % hmm_score)
        print()

        # Generate a trajectory from learned model.
        sample_traj, hidden_states = model.sample(sim_T)

        # Presort the data into the metastable wells. states[k][i] holds the
        # frames of trajectory i whose highest log-density is in state k.
        # The densities do not depend on k, so they are computed once per
        # trajectory.
        states = [[] for _ in range(n_components)]
        for traj in trajs:
            Z = traj.xyz
            Z = np.reshape(Z, (len(Z), n_features), order='F')
            logprob = log_multivariate_normal_density(
                Z, np.array(model.means_),
                np.array(model.covars_), covariance_type='full')
            assignments = np.argmax(logprob, axis=1)
            for k in range(n_components):
                states[k].append(traj[assignments == k])

        # Pick frame from original trajectories closest to current sample
        gen_traj = None
        for t in range(sim_T):
            h = hidden_states[t]
            best_logprob = -np.inf
            best_frame = None
            for candidates in states[h]:
                if t > 0:
                    candidates.superpose(gen_traj, t-1)
                Z = candidates.xyz
                Z = np.reshape(Z, (len(Z), n_features), order='F')
                mean = sample_traj[t]
                logprobs = log_multivariate_normal_density(
                    Z, mean, model.Qs_[h], covariance_type='full')
                ind = np.argmax(logprobs, axis=0)
                logprob = logprobs[ind]
                if logprob > best_logprob:
                    # Record the new best score; the original assignment was
                    # reversed, so the running best was never updated.
                    best_logprob = logprob
                    best_frame = candidates[ind]
            if t == 0:
                gen_traj = best_frame
            else:
                gen_traj = gen_traj.join(best_frame)

        # The original referenced ``self.out``, which does not exist in a
        # plain function. Write into a fresh temporary directory instead.
        out = join(tempfile.mkdtemp(), 'alanine_dipeptide_gen')
        gen_traj.save('%s.xtc' % out)
        gen_traj[0].save('%s.xtc.pdb' % out)
    except Exception:
        # Keep the interactive debugging hook, but do not swallow the failure.
        traceback.print_exc()
        pdb.post_mortem(sys.exc_info()[2])
        raise