Code Example #1
def test_rmsd_khybrid_mpi_subsample():

    TRJFILE = os.path.join(os.path.dirname(__file__), 'data', 'frame0.xtc')
    TOPFILE = os.path.join(os.path.dirname(__file__), 'data', 'native.pdb')
    SELECTION = '(name N or name C or name CA or name H or name O)'
    SUBSAMPLE_FACTOR = 3

    expected_size = (5, (np.ceil(501 / 3),) * 5)

    with tempfile.TemporaryDirectory() as tdname:

        tdname = mpi.comm.bcast(tdname, root=0)

        for i in range(expected_size[0]):
            shutil.copy(TRJFILE, os.path.join(tdname, 'frame%s.xtc' % i))

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            a, d, inds, s = runhelper([
                '--trajectories', os.path.join(tdname, 'frame?.xtc'),
                '--topology', TOPFILE,
                '--cluster-radius', '0.1',
                '--subsample', str(SUBSAMPLE_FACTOR),
                '--atoms', SELECTION,
                '--algorithm', 'khybrid'],
                expected_size=expected_size,
                expect_reassignment=False)

    trj = md.load(TRJFILE, top=TOPFILE)
    expected_s = md.join([trj[i[1]] for i in inds])
    assert_array_equal(expected_s.xyz, md.join(s).xyz)
Code Example #2
def test_rmsd_cluster_mpi_subsample():

    TRJFILE = get_fn('frame0.xtc')
    TOPFILE = get_fn('native.pdb')
    SELECTION = '(name N or name C or name CA or name H or name O)'
    SUBSAMPLE_FACTOR = 3

    expected_size = (5, (np.ceil(501 / 3), ) * 5)

    with tempfile.TemporaryDirectory() as tdname:

        tdname = MPI.COMM_WORLD.bcast(tdname, root=0)

        for i in range(expected_size[0]):
            shutil.copy(TRJFILE, os.path.join(tdname, 'frame%s.xtc' % i))

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            a, d, inds, s = runhelper([
                '--trajectories', os.path.join(tdname, 'frame?.xtc'),
                '--topology', TOPFILE,
                '--cluster-radii', '0.1',
                '--subsample', str(SUBSAMPLE_FACTOR),
                '--selection', SELECTION,
                '--random-state', str(2),
                '--kmedoids-iters', str(1)],
                expected_size=expected_size)

    a = a.flatten()
    d = d.flatten()

    trj = md.load(TRJFILE, top=TOPFILE)
    trj_sele = trj.atom_slice(trj.top.select(SELECTION))

    expected_s = md.join([trj[i[1]] for i in inds])
    assert_array_equal(expected_s.xyz, md.join(s).xyz)

    expect_a, expect_d = assign_to_nearest_center(
        md.join([trj_sele] * expected_size[0]),
        md.join([trj_sele[i[1]] for i in inds]), md.rmsd)

    assert_array_equal(expect_a[::SUBSAMPLE_FACTOR], a)
    assert_allclose(expect_d[::SUBSAMPLE_FACTOR], d, atol=1e-4)
Code Example #3
def test_rmsd_cluster_mpi_basic():

    expected_size = (2, 501)

    TRJFILE = get_fn('frame0.xtc')
    TOPFILE = get_fn('native.pdb')
    SELECTION = '(name N or name C or name CA or name H or name O)'

    with tempfile.TemporaryDirectory() as tdname:

        shutil.copy(TRJFILE, os.path.join(tdname, 'frame0.xtc'))
        shutil.copy(TRJFILE, os.path.join(tdname, 'frame1.xtc'))

        tdname = MPI.COMM_WORLD.bcast(tdname, root=0)

        print('rank', MPI.COMM_WORLD.Get_rank())
        MPI.COMM_WORLD.Barrier()

        a, d, i, s = runhelper([
            '--trajectories', os.path.join(tdname, 'frame?.xtc'),
            '--topology', TOPFILE,
            '--cluster-radii', '0.1',
            '--selection', SELECTION,
            '--kmedoids-iters', '0'],  # argv entries must be strings
            expected_size=expected_size)

    a = a.flatten()
    d = d.flatten()

    trj = md.load(TRJFILE, top=TOPFILE)
    trj_sele = trj.atom_slice(trj.top.select(SELECTION))

    # expected_i = [(1, 194), (1, 40), (0, 430), (1, 420)]
    expected_i = [[0, 0], [0, 42], [0, 430], [0, 319]]
    assert_array_equal(i, expected_i)

    expected_s = md.join([trj[i[1]] for i in expected_i])
    assert_array_equal(expected_s.xyz, md.join(s).xyz)

    expect_a, expect_d = assign_to_nearest_center(
        md.join([trj_sele] * 2), md.join([trj_sele[i[1]] for i in expected_i]),
        md.rmsd)

    assert_array_equal(expect_a, a)
    assert_allclose(expect_d, d, atol=1e-4)
Code Example #4
def test_rmsd_khybrid_mpi_basic():

    expected_size = (2, 501)

    TRJFILE = os.path.join(os.path.dirname(__file__), 'data', 'frame0.xtc')
    TOPFILE = os.path.join(os.path.dirname(__file__), 'data', 'native.pdb')
    SELECTION = '(name N or name C or name CA or name H or name O)'

    with tempfile.TemporaryDirectory() as tdname:

        shutil.copy(TRJFILE, os.path.join(tdname, 'frame0.xtc'))
        shutil.copy(TRJFILE, os.path.join(tdname, 'frame1.xtc'))

        tdname = mpi.comm.bcast(tdname, root=0)

        mpi.comm.Barrier()

        a, d, idx, s = runhelper([
            '--trajectories', os.path.join(tdname, 'frame?.xtc'),
            '--topology', TOPFILE,
            '--cluster-radius', '0.095',
            '--atoms', SELECTION,
            '--algorithm', 'khybrid'],
            expected_size=expected_size)

    a = a.flatten()
    d = d.flatten()

    trj = md.load(TRJFILE, top=TOPFILE)
    trj_sele = trj.atom_slice(trj.top.select(SELECTION))

    expected_i = [[0, 0],
                  [0, 55],
                  [1, 102],
                  [1, 196]]

    assert_array_equal(idx, expected_i)

    expected_s = md.join([trj[i[1]] for i in idx])
    assert_array_equal(
        expected_s.xyz,
        md.join(s).xyz)

    expect_a, expect_d = util.assign_to_nearest_center(
        md.join([trj_sele] * 2),
        md.join([trj_sele[i[1]] for i in idx]), md.rmsd)

    assert_array_equal(expect_a, a)
    assert_allclose(expect_d, d, atol=1e-4)
Code Example #5
    def test_exposons_pipeline_weighting(self):

        repeat_trj = md.join([self.trj[0:3], self.trj[0:3], self.trj[3:6]])
        norepeat_trj = md.join([self.trj[0:3], self.trj[3:6]])

        unweighted_mi, unweighted_exp = exposons(repeat_trj,
                                                 0.9,
                                                 threshold=1.0)
        weighted_mi, weighted_exp = exposons(norepeat_trj,
                                             0.9,
                                             threshold=1.0,
                                             weights=[2, 2, 2, 1, 1, 1])

        assert_allclose(unweighted_mi, weighted_mi, rtol=1e-14)
        assert_array_equal(unweighted_exp, weighted_exp)
Code Example #6
File: traj_utils.py Project: AspirinCode/Scripts
def generate_traj_from_stateinds(inds, meta, atom_selection='all'):
    """
    Concatenate several frames from different trajectories to create a new one.

    Parameters
    ----------
    inds : list of tuples
        Each element must be a 2-tuple of ints, (traj_index, frame_index).
    meta : metadata object
        Table whose rows provide 'traj_fn' and 'top_fn' paths.
    atom_selection : str
        Which atoms to load.

    Returns
    -------
    traj : mdtraj.Trajectory
    """
    frame_list = []
    for traj_i, frame_i in inds:
        top = mdtraj.load_prmtop(meta.loc[traj_i]['top_fn'])
        atoms = top.select(atom_selection)

        frame_list.append(
            mdtraj.load_frame(meta.loc[traj_i]['traj_fn'], atom_indices=atoms,
                              index=frame_i, top=meta.loc[traj_i]['top_fn'])
        )
    traj = mdtraj.join(frame_list, check_topology=False)
    traj.center_coordinates()
    traj.superpose(traj, 0)
    return traj
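A minimal usage sketch (not from the original project): since the function loads topologies with mdtraj.load_prmtop, it assumes an Amber setup, and meta is taken to be a pandas DataFrame whose rows carry 'traj_fn' and 'top_fn' paths. All file names here are hypothetical.

import pandas as pd

# Hypothetical metadata table; the function only needs 'traj_fn' and 'top_fn'.
meta = pd.DataFrame({
    'traj_fn': ['run0.nc', 'run1.nc'],
    'top_fn': ['system.prmtop', 'system.prmtop'],
})

# Frame 10 of trajectory 0 plus frame 25 of trajectory 1, protein atoms only.
inds = [(0, 10), (1, 25)]
traj = generate_traj_from_stateinds(inds, meta, atom_selection='protein')
traj.save_pdb('stateinds.pdb')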
Code Example #7
    def _execute(self, directory, available_resources):

        import mdtraj

        if len(self.input_coordinate_paths) != len(
                self.input_trajectory_paths):

            raise ValueError(
                "There should be the same number of coordinate and trajectory paths."
            )

        if len(self.input_trajectory_paths) == 0:
            raise ValueError("No trajectories were given to concatenate.")

        trajectories = []

        output_coordinate_path = None

        for coordinate_path, trajectory_path in zip(
                self.input_coordinate_paths, self.input_trajectory_paths):

            output_coordinate_path = output_coordinate_path or coordinate_path
            trajectories.append(
                mdtraj.load_dcd(trajectory_path, coordinate_path))

        self.output_coordinate_path = output_coordinate_path
        output_trajectory = (trajectories[0] if len(trajectories) == 1 else
                             mdtraj.join(trajectories, check_topology=False,
                                         discard_overlapping_frames=False))

        self.output_trajectory_path = path.join(directory,
                                                "output_trajectory.dcd")
        output_trajectory.save_dcd(self.output_trajectory_path)
Code Example #8
    def execute(self, directory, available_resources):

        import mdtraj

        if len(self.input_coordinate_paths) != len(self.input_trajectory_paths):

            return PropertyEstimatorException(directory=directory, message='There should be the same number of '
                                                                           'coordinate and trajectory paths.')

        if len(self.input_trajectory_paths) == 0:

            return PropertyEstimatorException(directory=directory, message='No trajectories were '
                                                                           'given to concatenate.')

        trajectories = []

        output_coordinate_path = None

        for coordinate_path, trajectory_path in zip(self.input_coordinate_paths,
                                                    self.input_trajectory_paths):

            output_coordinate_path = output_coordinate_path or coordinate_path
            trajectories.append(mdtraj.load_dcd(trajectory_path, coordinate_path))

        self.output_coordinate_path = output_coordinate_path
        output_trajectory = (trajectories[0] if len(trajectories) == 1 else
                             mdtraj.join(trajectories, check_topology=False,
                                         discard_overlapping_frames=False))

        self.output_trajectory_path = path.join(directory, 'output_trajectory.dcd')
        output_trajectory.save_dcd(self.output_trajectory_path)

        return self._get_output_dictionary()
Code Example #9
def sampling_along_tIC(resultdir, opath, tica_trajs, xtc_traj_folder,
                       traj_list_array, pdb_name, tIC_a):
    transformed = np.concatenate(tica_trajs)
    draw_tica_histogram_core(transformed[:, 0], transformed[:, 1], '1', '2')
    tica_trajs = {i: tica_trajs[i]
                  for i in range(len(tica_trajs))
                  }  #tica_trajs is now a dictionary
    inds = sample_dimension(tica_trajs,
                            dimension=tIC_a - 1,
                            n_frames=200,
                            scheme='random')  #sample 200 conformations
    #make trajectory
    traj = md.join(
        md.load_frame(xtc_traj_folder + traj_list_array[i],
                      index=frame_i,
                      top=xtc_traj_folder + pdb_name) for i, frame_i in inds)
    #save the trajectory
    traj.save("%s/tica-dimension-tIC%s.xtc" % (resultdir, tIC_a - 1))
    #show the samples on tICA projections
    samples_coord = []
    for i, frame_i in inds:
        samples_coord.append(
            [tica_trajs[i][frame_i][0], tica_trajs[i][frame_i][1]])
    samples_coord = np.array(samples_coord)
    print(samples_coord.shape)
    plt.plot(samples_coord[:, 0], samples_coord[:, 1], 'o-')
    plt.legend(['sample'])  # pass a list; a bare string is iterated per character
    plt.savefig(resultdir + '/' + opath)
Code Example #10
    def _analyse_results_using_mdtraj(
        self,
        env: str,
        snapshots: list,
        unitcell: list,
        save_results: bool,
        engine: str,
    ):

        logger.debug(f"Evaluating with {engine}")

        if engine == "openMM":
            lambda_states = list(range(1, self.nr_of_states + 1))
            xyz_array = snapshots

            # Decide if we want to use the multiprocessing library
            r = starmap(
                self._evaluate_e_on_all_snapshots_openMM,
                zip(repeat(xyz_array), repeat(unitcell), lambda_states,
                    repeat(env)),
            )

            u_kn = np.stack([r_i for r_i in r])

        elif engine == "CHARMM":
            confs = []
            # write out traj in self.base_path
            for (dcd, psf) in self.traj_files[env]:
                traj = mdtraj.load(
                    f"{dcd}",
                    top=f"{psf}",
                )
                # return and append thinned trajs
                traj, _, _ = self._thinning(traj)
                confs.append(traj)

            joined_trajs = mdtraj.join(confs, check_topology=True)
            joined_trajs.save_dcd(f"{self.base_path}/traj.dcd")
            u_kn = np.stack([
                self._evaluate_e_on_all_snapshots_CHARMM(
                    joined_trajs, lambda_state, env)
                for lambda_state in range(1, self.nr_of_states + 1)
            ])
            # remove merged traj
            os.remove(f"{self.base_path}/traj.dcd")

        else:
            raise RuntimeError(f"Either openMM or CHARMM engine, not {engine}")

        if save_results:
            file = f"{self.save_results_to_path}/mbar_data_for_{self.structure_name}_in_{env}.pickle"
            logger.info(f"Saving results: {file}")
            results = {"u_kn": u_kn, "N_k": self.N_k}
            pickle.dump(results, open(file, "wb+"))

        return self.calculate_dG_using_mbar(u_kn, self.N_k, env)
Code Example #11
def test_rmsd_kcenters_mpi():

    TRJFILE = os.path.join(os.path.dirname(__file__), 'data', 'frame0.xtc')
    TOPFILE = os.path.join(os.path.dirname(__file__), 'data', 'native.pdb')
    SELECTION = '(name N or name C or name CA or name H or name O)'

    expected_size = (5, 501)

    with tempfile.TemporaryDirectory() as tdname:

        tdname = MPI.COMM_WORLD.bcast(tdname, root=0)

        for i in range(expected_size[0]):
            shutil.copy(TRJFILE, os.path.join(tdname, 'frame%s.xtc' % i))

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            a, d, inds, s = runhelper([
                '--trajectories', os.path.join(tdname, 'frame?.xtc'),
                '--topology', TOPFILE,
                '--cluster-number', '4',
                # '--subsample', str(SUBSAMPLE_FACTOR),
                '--atoms', SELECTION,
                '--algorithm', 'kcenters'],
                expected_size=expected_size,
                expect_reassignment=True)

    trj = md.load(TRJFILE, top=TOPFILE)
    trj_sele = trj.atom_slice(trj.top.select(SELECTION))

    result = kcenters.kcenters(trj_sele, 'rmsd', n_clusters=4, mpi_mode=False)

    assert_array_equal(result.center_indices, inds[:, 1])
    assert_array_equal(result.distances, d[0])
    assert_array_equal(result.assignments, a[0])

    expected_s = md.join([trj[i[1]] for i in inds])

    assert_array_equal(expected_s.xyz, md.join(s).xyz)
Code Example #12
File: test_trajectory.py Project: msultan/mdtraj
def test_md_join():
    t_ref = md.load(get_fn('frame0.h5'))[:20]
    loaded = md.load(fn, top=t_ref, stride=2)
    iterloaded = md.join(md.iterload(fn, top=t_ref, stride=2, chunk=6))
    eq(loaded.xyz, iterloaded.xyz)
    eq(loaded.time, iterloaded.time)
    eq(loaded.unitcell_angles, iterloaded.unitcell_angles)
    eq(loaded.unitcell_lengths, iterloaded.unitcell_lengths)
Code Example #13
    def fit(self, traj_list, y=None):
        all_trajs = join(traj_list)
        indices = baker_hubbard(all_trajs,
                                freq=self.freq,
                                exclude_water=self.exclude_water,
                                periodic=self.periodic,
                                sidechain_only=self.sidechain_only)
        self.indices = indices[:, 1:]
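For context, a self-contained sketch of the hydrogen-bond detection this fit method wraps, calling mdtraj's baker_hubbard directly with its documented defaults; the trajectory and topology file names are placeholders.

import mdtraj as md
from mdtraj import join, baker_hubbard

# Join chunks into one trajectory, then find hydrogen bonds present in at
# least 10% of frames (freq=0.1).
chunks = [md.load('chunk0.xtc', top='native.pdb'),
          md.load('chunk1.xtc', top='native.pdb')]
all_trajs = join(chunks)
indices = baker_hubbard(all_trajs, freq=0.1, exclude_water=True,
                        periodic=True, sidechain_only=False)
# Rows are (donor, hydrogen, acceptor) atom indices; the fit above keeps only
# the hydrogen and acceptor columns via indices[:, 1:].
print(indices[:, 1:])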
Code Example #14
def sampling_representative_structures_for_MSM(resultdir, msm_traj, xtc_traj_folder, traj_list_array, pdb_name):
    transformed_assignments = np.concatenate(msm_traj)
    info_list = []  # elements in info_list: [traj_id, frame_id, class_id]
    for j in range(len(msm_traj)):
        for k in range(len(msm_traj[j])):
            info_list.append([j, k, msm_traj[j][k]])
    info_list = np.array(info_list)
    for j in range(min(transformed_assignments), max(transformed_assignments) + 1):
        samples = random.sample(list(info_list[np.where(info_list[:, 2] == j)[0]]), 50)
        traj = md.join(md.load_frame(xtc_traj_folder + traj_list_array[i], index=frame_i, top=xtc_traj_folder + pdb_name)
                       for i, frame_i, class_id in samples)
        traj.save("%s/representative_structure_in_state%d.xtc" % (resultdir, j))
Code Example #15
def sampling_along_msm_eigenmode(resultdir, msm_eigen_trajs, assignment_list, xtc_traj_folder, traj_list_array, pdb_name, mode_num):
    # mode_num=1: the slowest dynamic mode (2nd eigenvector of the TPM)
    info_list = []  # elements in info_list: [traj_id, frame_id, class_id, eigenvector_value]
    for j in range(len(assignment_list)):
        for k in range(len(assignment_list[j])):
            info_list.append([j, k, assignment_list[j][k], msm_eigen_trajs[j][k][mode_num - 1]])  # index starts from 0
    # randomly sample 200 conformations
    samples = random.sample(info_list, 200)  # can play with the number 200
    sorted_samples = sorted(samples, key=lambda x: x[3])
    # get the xtc file containing all the sampled conformations
    traj = md.join(md.load_frame(xtc_traj_folder + traj_list_array[i], index=frame_i, top=xtc_traj_folder + pdb_name)
                   for i, frame_i, class_id, eigenvector_value in sorted_samples)
    traj.save("%s/msm-%s-dynamic-mode.xtc" % (resultdir, mode_num))
Code Example #16
    def load_pdb(self, stride=4):
        traj = None
        for i, t in enumerate(
                mt.iterload(self.traj_fn, stride=stride, chunk=100)):
            if i == 0:
                traj = t
            else:
                traj = mt.join([traj, t])

        self.traj = traj.atom_slice(traj.topology.select("not water"))
        self.n_frames = traj.n_frames

        return self.traj
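Calling mt.join([traj, t]) on every chunk re-copies all accumulated coordinates, so the loop above is quadratic in the number of chunks (note also the corrected chunk keyword). A sketch of an equivalent single-join pattern, using the same mdtraj API:

import mdtraj as mt

def load_pdb_once(traj_fn, stride=4):
    # Collect the chunks first, then join them a single time.
    chunks = list(mt.iterload(traj_fn, stride=stride, chunk=100))
    traj = mt.join(chunks)
    return traj.atom_slice(traj.topology.select("not water"))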
Code Example #17
def itertrajs(meta, stride=1):
    """Load one mdtraj trajectory at a time and yield it.

    MDTraj strides inefficiently: md.load reads the whole trajectory into
    memory and only then applies the stride, so we join(iterload(...)) to
    conserve memory.
    """

    tops = preload_tops(meta)
    for i, row in meta.iterrows():
        yield i, md.join(md.iterload(row['traj_fn'],
                                     top=tops[row['top_fn']],
                                     stride=stride),
                         discard_overlapping_frames=False,
                         check_topology=False)
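A usage sketch for itertrajs, assuming the project's usual meta table (rows keyed by trajectory, with 'traj_fn' and 'top_fn' columns loadable by preload_tops):

total = 0
for key, traj in itertrajs(meta, stride=10):
    # Each item is one row's full trajectory, strided while reading.
    total += traj.n_frames
print('loaded %d strided frames' % total)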
Code Example #18
def sample_ev(n_ev: int,
              n_cut: int,
              ptrajs_df,
              top_path: str,
              threshold: float = 1e-6) -> md.Trajectory:
    n_ev = str(n_ev)
    df = ptrajs_df.loc[:, [n_ev, 'mixing']].copy(deep=True)
    df['cat'] = pd.qcut(df[n_ev], q=n_cut, duplicates='drop')
    df['min'] = df.groupby('cat')['mixing'].transform('min')
    df = df.loc[np.abs(df['mixing'] - df['min']) < threshold, :]
    sample = df.groupby('cat').sample(n=1)
    sample.sort_values(by='cat', inplace=True)
    sample_ixs = list(sample.index)
    traj = md.join(
        [md.load_frame(x, top=top_path, index=y) for x, y in sample_ixs])
    return traj
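Because the final loop unpacks each index entry as (trajectory file, frame index), ptrajs_df must carry a two-level index; a hypothetical sketch of a compatible frame, with column '1' holding the first eigenvector projection:

import numpy as np
import pandas as pd

# Two projected trajectories of 100 frames each.
idx = pd.MultiIndex.from_product([['traj0.xtc', 'traj1.xtc'], range(100)])
ptrajs_df = pd.DataFrame({'1': np.random.randn(200),
                          'mixing': np.random.rand(200)}, index=idx)

traj = sample_ev(n_ev=1, n_cut=10, ptrajs_df=ptrajs_df, top_path='top.pdb')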
Code Example #19
def main():
    arg = argparse.ArgumentParser(prog='rex.track_traj')
    arg.add_argument('-t', '--top', dest='top_fn', required=True)
    arg.add_argument('-p',
                     '--prefix',
                     dest='prefix_s',
                     required=True,
                     nargs='+')
    arg.add_argument('-o', '--output', dest='output_prefix', default='traj')
    arg.add_argument('--dcd_step', dest='dcd_step', default=25000)
    arg.add_argument('--exchange_rate', dest='exchange_rate', default=5000)
    #
    if len(sys.argv) == 1:
        arg.print_help()
        return
    arg = arg.parse_args()
    #
    history_step = float(arg.dcd_step) / float(arg.exchange_rate)
    #
    history = read_history(arg.prefix_s)
    dcd_frames = history_step * (
        1 + np.arange(int(history.shape[1] / history_step)))
    dcd_frames = np.ceil(dcd_frames).astype(int) - 1
    history = history[:, dcd_frames]

    top = mdtraj.load(arg.top_fn)
    #
    n_replica = history.shape[0]
    n_frame = history.shape[1]
    #
    replica = []
    for k in range(n_replica):
        for prefix in arg.prefix_s:
            replica_fn = 'replica.%d/%s.dcd' % (k, prefix)
            replica.append(mdtraj.load(replica_fn, top=top))
    replica = mdtraj.join(replica, check_topology=False)
    #
    for k in range(n_replica):
        frame = np.array([n_frame * j + i for i, j in enumerate(history[k])])
        traj = replica.slice(frame)
        traj.save("%s.%d.dcd" % (arg.output_prefix, k))
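A small worked check of the frame bookkeeping above, using the script's default dcd_step=25000 and exchange_rate=5000 (one saved DCD frame per five exchange attempts) and an exchange history 20 entries long:

import numpy as np

history_step = 25000 / 5000  # 5.0
dcd_frames = history_step * (1 + np.arange(int(20 / history_step)))
dcd_frames = np.ceil(dcd_frames).astype(int) - 1
print(dcd_frames)  # [ 4  9 14 19]: every 5th history column has a saved frame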
Code Example #20
def main():
    #Part to load coordinate file
    topfile = sys.argv[1]
    alength = len(sys.argv)  #number of dcd files can vary
    #trjfile = sys.argv[2]
    #outfile = sys.argv[3]

    #settings for env variables.
    outstr = input(
        "output dcd file prefix? ex) DES_npt_b (becomes DES_npt_b${i}.dcd) \n")
    nb = int(input("how many blocks? ex) 5 \n"))
    bstep = int(input("frames per block? ex) 2000 \n"))
    neq = int(
        input(
            "How many initial frames do you want to cut as equilibration? ex) 500 \n"
        ))

    #input 1 : load traj. (big file)
    #check if multiple trajectories are put and needed to be merged.
    ntrajfile = alength - 2
    if ntrajfile == 1:
        trajtotal = md.load(sys.argv[2], top=topfile)
        nframetotal = trajtotal.n_frames
        print('nframetotal : {}'.format(nframetotal))
    elif ntrajfile == 2:
        traj1, traj2 = md.load(sys.argv[2], top=topfile), md.load(sys.argv[3],
                                                                  top=topfile)
        nframe1, nframe2 = traj1.n_frames, traj2.n_frames
        print('nframe1, nframe2 : {} {}'.format(nframe1, nframe2))
        trajtotal = md.join([traj1, traj2],
                            check_topology=True,
                            discard_overlapping_frames=True)
        nframetotal = trajtotal.n_frames
        print('nframetotal : {}'.format(nframetotal))
    else:
        # without this guard, trajtotal would be undefined below
        sys.exit('this script handles only 1 or 2 trajectory files')

    #traj splitting:loop
    for i in range(nb):
        outfile = outstr + str(i) + '.dcd'
        iframe, fframe = neq + i * bstep, neq + (i + 1) * bstep
        trajfrag = trajtotal[iframe:fframe]
        trajfrag.save_dcd(outfile)
Code Example #21
def test_iterload_chunk_dcd(get_fn):
    # Makes sure that the actual chunk size yielded by iterload corresponds to the number of
    # frames specified when calling it (for dcd files).
    file = get_fn("alanine-dipeptide-explicit.dcd")
    top = get_fn("alanine-dipeptide-explicit.pdb")

    skip_frames = 3
    frames_chunk = 2

    full = md.load(file, top=top, stride=skip_frames)
    length = len(full)

    chunks = []
    for traj_chunk in md.iterload(file,
                                  top=top,
                                  stride=skip_frames,
                                  chunk=frames_chunk):
        chunks.append(traj_chunk)
    joined = md.join(chunks)
    assert len(full) == len(joined)
    assert eq(full.xyz, joined.xyz)
Code Example #22
def sample_clusters(meta, trajs, df):

    clust_id = df['Trajectory'].unique()

    for i in clust_id:
        print(i)

        df_smp = df.loc[df['Trajectory'] == i, ['Key', 'Frame']].sample(1000)
        inds = zip(df_smp['Key'], df_smp['Frame'])

        # Use loc because sample_dimension is nice
        traj = md.join(
            md.load_frame(meta.loc[traj_i]['traj_fn'],
                          index=int(frame_i),
                          top=meta.loc[traj_i]['top_fn'])
            for traj_i, frame_i in inds)

        # Save
        traj_fn = "/Users/robert_arbon/Code/AADH/Analysis/KR_Comparison/pcca_cluster-{}.dcd".format(
            i)
        backup(traj_fn)
        traj.save(traj_fn)
Code Example #23
def make_traj(prefix, index):
    list_pdb = open(prefix + '-listfiles')
    lines = list_pdb.readlines()
    if len(lines) > 0:
        first = lines.pop(0).replace(' \n', '')
    else:
        return False
    t = mdtraj.load(first, top='input.prmtop')

    table, bonds = t.topology.to_dataframe()

    for i in range(len(lines)):
        filename = lines.pop(0).replace(' \n', '')
        #print(filename)
        tnext = mdtraj.load(filename.replace(' \n', ''), top='input.prmtop')
        t = mdtraj.join([t, tnext],
                        check_topology=True,
                        discard_overlapping_frames=False)
    t.save_mdcrd(prefix + '-traj.mdcrd')
    t = mdtraj.load(prefix + '-traj.mdcrd', top='input.prmtop')
    table, bonds = t.topology.to_dataframe()
    #print(index)
    dih = mdtraj.compute_dihedrals(t, index)
    t.save_netcdf(prefix + '-traj.nc')
    return t.n_frames, dih
Code Example #24
def sample_clusters():

    meta = load_meta()
    tops = preload_tops(meta)
    print('Sampling trajectories')
    ref = md.load('topology.pdb')
    for i in range(int(num_clusters)):
        print(i)
        df_smp = df.loc[df['Trajectory'] == i, ['Key', 'Time_ps']].sample(100)
        inds = zip(df_smp['Key'], df_smp['Time_ps'])

        # Use loc because sample_dimension is nice
        traj = md.join(
            md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=meta.loc[traj_i]['top_fn'])
            for traj_i, frame_i in inds
        )

        # Original trajectories include both BT1 and BT2 so need to superpose
        traj.superpose(reference=ref)

        # Save
        traj_fn = "clusters/rmsd_cluster-{}.dcd".format(i)
        backup(traj_fn)
        traj.save(traj_fn)
Code Example #25
def sample_tica_dim(dim=0, n_frames=200, meta=None, ttrajs=None):

    ## Load
    if meta is not None and ttrajs is not None:

        ## Sample
        # These are apparently ordered according tica value
        inds = sample_dimension(ttrajs,
                                dimension=dim,
                                n_frames=n_frames,
                                scheme='random')

        save_generic(inds, "tica-dimension-{}-inds.pickl".format(dim + 1))

        ## Get tica components
        tica_values = np.array(
            [ttrajs[traj_i][frame_i][dim] for traj_i, frame_i in inds])
        tica_values = (tica_values - tica_values.min()) / (tica_values.max() -
                                                           tica_values.min())
        tica_values *= 10
        ## Make trajectory
        top = preload_top(meta)

        # Use loc because sample_dimension is nice
        traj = md.join(
            md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
            for traj_i, frame_i in inds)

        ## Superpose

        ## Save
        traj_fn = "tica-dimension-{}.dcd".format(dim + 1)
        backup(traj_fn)
        traj.save(traj_fn)
    else:
        raise ValueError('Specify meta data and trajectory objects')
Code Example #26
## Try to limit RAM usage
def guestimate_stride():
    total_data = meta['nframes'].sum()
    want = kmed.n_clusters * 10
    stride = max(1, total_data // want)
    print("Since we have", total_data, "frames, we're going to stride by",
          stride, "during fitting, because this is probably adequate for",
          kmed.n_clusters, "clusters")
    return stride


## Fit
kmed.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
print(kmed.summarize())

## Save
save_generic(kmed, 'clusterer.pickl')


## Save centroids
def frame(traj_i, frame_i):
    # Note: kmedoids does 0-based, contiguous integers so we use .iloc
    row = meta.iloc[traj_i]
    return md.load_frame(row['traj_fn'], frame_i, top=row['top_fn'])


centroids = md.join((frame(ti, fi) for ti, fi in kmed.cluster_ids_),
                    check_topology=False)
centroids_fn = 'centroids.xtc'
backup(centroids_fn)
centroids.save(centroids_fn)
Code Example #27
    def calculate_tica_components(self, cluster_method, calculate_strides=False, feats=None):
        '''Load in the features, calculate a given number of tICA components
        (tica_components) given a lagtime (lag_time), and save tICA coordinates
        and eigenvector data. It then creates and populates a list for each
        desired component, clusters the data, saving normalized populations as
        populations.dat and saving each cluster center as a .pdb. tICA plots
        are created and saved, and implied timescales are calculated, saved,
        and plotted.
        '''

        # tICA parameters
        tica_lagtime = 10  # determine from implied timescales
        tica_components = 8  # how many tICs to compute
        n_clusters = 100  # denotes number of microstates
        n_timescales = tica_components  # plot all eigenvalues --> timescales
        md_time_step = 0.02  # ns
        subsampled_time_step = 1.  # ns; multiplier of timescales and lagtimes in implied timescale plot
        stride = int(subsampled_time_step / md_time_step)  # time step stride for sub-sampling
        equil_time = 1.  # ns
        equil_steps = 1  # int(equil_time / md_time_step); time steps to be removed from start
        lagtimes = np.array([1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024])
        cluster_method = 'kcenters'  # hard-coded override; 'kcenters' or 'kmeans'
        all_ticas = list(itertools.permutations(range(1, tica_components + 1), 2))  # all combinations
        all_ticas = [[1, 2]]  # override: just show analysis for first two components
        cluster_percentage_cutoff = 5  # clusters below this relative population (%) are not labeled; 0 labels all
        verbose = False

        print("\nCalculating tICA components...")

        # Load in feature files. THIS WILL NEED TO BE CHANGED
        if feats is None:
            if calculate_strides:
                self.calculate_stride_distances(stride, equil_steps)
                data = np.load('/home/server/git/fah-scripts/DataAnalysisScripts/stride_dist/stride_dist_%d.npy' % self.proj_num)
            else:
                data = self.data
        else:
            data = np.load(feats)

        features = []
        for run in data:
            for clone in run:
                gen_seq = []
                for gen in clone:
                    if gen is not None and gen[0] is not None:
                        if calculate_strides or feats is not None:
                            gen_seq.append(gen)
                        else:
                            gen_seq.append(gen[::stride])
                if len(gen_seq) > 0:
                    gen_cat = np.concatenate(gen_seq)
                    if calculate_strides:
                        features.append(gen_cat)
                    else:
                        features.append(gen_cat[equil_steps:])
        features = np.asarray(features)
        print(features.shape)
        print(features[0].shape)
        tica_coordinates = tICA(lag_time=tica_lagtime,
                                n_components=int(tica_components)).fit_transform(features)

        np.save('%s/lag_%d_coord_%d.npy' % (self.tICA_dir, tica_lagtime, tica_components),
                tica_coordinates)

        # One list of projected values per tIC (a dict replaces the original
        # exec/eval, which does not work on function locals in Python 3)
        tica_lists = {k: [] for k in range(1, tica_components + 1)}

        for i in tqdm.tqdm(range(len(features))):
            for j in range(len(tica_coordinates[i])):
                for k in range(tica_components):
                    tica_lists[k + 1].append(tica_coordinates[i][j][k])

        # Perform clustering based on the cluster_method parameter.
        if cluster_method == 'kcenters':
            print("Clustering via KCenters...")
            clusters = KCenters(n_clusters)
        elif cluster_method == 'kmeans':
            print("Clustering via KMeans...")
            clusters = KMeans(n_clusters)
        else:
            sys.exit("Invalid cluster_method. Use kmeans or kcenters.")

        # Determine cluster assignment for each frame.
        sequences = clusters.fit_transform(tica_coordinates)

        np.save('%s/lag_%d_clusters_%d_sequences.npy' % (self.tICA_dir, tica_lagtime, n_clusters),
                sequences)
        np.save('%s/lag_%d_clusters_%d_center.npy' % (self.tICA_dir, tica_lagtime, n_clusters),
                clusters.cluster_centers_)

        # Determine cluster populations, normalize the counts, and save as percentages
        # for labeling if a cluster contains more than cluster_percentage_cutoff percent
        # of the data. Finally, save normalized counts.
        print("\nDetermining cluster populations...")

        if not os.path.exists('%s/cluster_centers' % self.tICA_dir):
            os.makedirs('%s/cluster_centers' % self.tICA_dir)
        counts = np.array([len(np.where(np.concatenate(sequences) == i)[0])
                           for i in range(n_clusters)])
        normalized_counts = counts / float(counts.sum())
        percentages = [i * 100 for i in normalized_counts]
        population_labels = [[i, "%.2f" % percentages[i]] for i in range(len(percentages))
                             if percentages[i] > cluster_percentage_cutoff]
        np.savetxt('%s/cluster_centers/populations.dat' % self.tICA_dir, normalized_counts)

        # Plot all unique combinations of tICA components
        print("\nPlotting tICA components with cluster centers...")
        all_ticas = list(itertools.permutations(range(1, tica_components + 1), 2))
        for j in tqdm.tqdm(range(len(all_ticas))):  # for each pair
            if all_ticas[j][0] < all_ticas[j][1]:
                plt.figure(j, figsize=(20, 16))
                plt.hexbin(tica_lists[all_ticas[j][0]], tica_lists[all_ticas[j][1]], bins='log')
                x_centers = [clusters.cluster_centers_[i][all_ticas[j][0] - 1]
                             for i in range(len(clusters.cluster_centers_))]
                y_centers = [clusters.cluster_centers_[i][all_ticas[j][1] - 1]
                             for i in range(len(clusters.cluster_centers_))]
                high_pop_x_centers = [x_centers[i] for i in range(len(x_centers))
                                      if percentages[i] > cluster_percentage_cutoff]
                high_pop_y_centers = [y_centers[i] for i in range(len(y_centers))
                                      if percentages[i] > cluster_percentage_cutoff]
                plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
                plt.plot(tica_lists[all_ticas[j][0]][0], tica_lists[all_ticas[j][1]][0],
                         color='k', marker='*', markersize=24)
                plt.xlabel('tic' + str(all_ticas[j][0]))
                plt.ylabel('tic' + str(all_ticas[j][1]))
                plt.title(self.proj_num)
                # Add labels for high-population cluster centers
                for label, x, y in zip(population_labels, high_pop_x_centers, high_pop_y_centers):
                    plt.annotate(
                        label,
                        xy=(x, y), xytext=(-15, 15),
                        textcoords='offset points', ha='right', va='bottom',
                        bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                        arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
                plt.savefig('%s/tica_%d_%d.png' % (self.tICA_dir, all_ticas[j][0], all_ticas[j][1]))
                plt.close()

        # Remove any stale cluster-center PDBs, then write out PDBs for each cluster center
        for filename in os.listdir(self.tICA_dir + '/cluster_centers'):
            if filename.endswith('.pdb'):
                os.remove(self.tICA_dir + '/cluster_centers/' + filename)
        print("Performing cluster analytics and saving center PDBs...\n")
        runs, clones, gens = data.shape[0], data.shape[1], data.shape[2]
        x, y, z = 0, 0, 0
        for i in range(len(features)):
            if i % clones == 0 and i != 0:
                x += 1
            if i % gens == 0:
                y = 0
            n_snapshots = len(clusters.distances_[i])

            # Determine frames that are cluster centers
            cluster_indices = np.arange(n_snapshots)[clusters.distances_[i] < 1e-6]
            # Determine the number of each cluster; correlates to populations.dat
            cluster_labels = sequences[i][cluster_indices]
            # Save each cluster center as a pdb
            if list(cluster_indices):  # load center-containing xtcs to check length
                traj_cat = []
                print('x: %d, y: %d, z: %d' % (x, y, z))

                while True:
                    try:
                        traj = base_dir + 'PROJ%s/RUN%s/CLONE%s/results%s/traj_comp.xtc' % (self.proj_num, x, y, z)
                        traj_cat.append(md.load(traj, top=self.gro_file))
                        z += 1
                    except Exception:
                        break
                if len(traj_cat) > 0:
                    trajectory_file = md.join(traj_cat)
                xtc_len = len(trajectory_file)
            y += 1
            z = 0
            for j in range(len(cluster_indices)):
                frames = range(xtc_len)  # map the strided frame number back to the xtc frame number
                strided_frames = frames[equil_steps:][::stride]
                xtc_frame = frames.index(strided_frames[cluster_indices[j]])
                cluster_traj = trajectory_file[xtc_frame]
                cluster_traj.save_pdb('%s/cluster_centers/state_%d_%.3f.pdb'
                                      % (self.tICA_dir, cluster_labels[j], percentages[cluster_labels[j]]))
                if verbose:
                    print('Successfully saved PDB for cluster: %d, (rel.pop: %.3f)'
                          % (cluster_labels[j], percentages[cluster_labels[j]]))
                    print('traj_file: %s (%d/%d)' % (trajectory_file, i, len(features)))
                    print('frame: %d (%d/%d centers from this trajectory)'
                          % (cluster_indices[j], j, len(cluster_indices)))
                    print('strided: npy_frame/npy_len = %d/%d = %f'
                          % (cluster_indices[j], n_snapshots, cluster_indices[j] / n_snapshots))
                    print('re-mapped: orig_frame/xtc_len = %d/%d = %f\n'
                          % (xtc_frame, xtc_len, xtc_frame / xtc_len))
Code Example #28
please cite msmbuilder in any publications


"""

import mdtraj as md
import os

from msmbuilder.io.sampling import sample_states
from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic

## Load
meta, ttrajs = load_trajs('ttrajs')
kmeans = load_generic("kmeans.pickl")

## Sample
inds = sample_states(ttrajs, kmeans.cluster_centers_, k=10)

save_generic(inds, "cluster-sample-inds.pickl")

## Make trajectories
top = preload_top(meta)
out_folder = "cluster_samples"
backup(out_folder)
os.mkdir(out_folder)

for state_i, state_inds in enumerate(inds):
    traj = md.join(
        md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
        for traj_i, frame_i in state_inds)
    traj.save("{}/{}.xtc".format(out_folder, state_i))
Code Example #29
def main():

    Vars = KinfoVariables()
    lib_dir = Vars['lib_dir']
    print(
        '\033[34m## need to change the hard-coded library/repository directory ##\033[0m'
    )
    print('\033[34m current lib_dir:\033[0m ' + lib_dir + '\n')

    args = UserInput()

    if args.use_sk not in sk_ml:  # default SK model: RandomForest
        args.use_sk = 'et'

### These are hard-coded test cases with known key residue positions.
### This was used for examining the code by skipping some steps
#  wrk_dir = '.'
#  args.tmpl_file = wrk_dir+'examples/strada_cido.prot.1atp.pdb'
#  args.traj_file = wrk_dir+'examples/strada_cidi.2.200ps.dcd'
#  args.outpref   = 'test'
#  args.b3k = 39
#  args.dfg = 152
#  args.c_glu = 57

## user-defined library path
    if args.lib_dir:
        lib_dir = args.lib_dir

    ## reference structure must start at resid 1. Modified ref is hardcoded here
    if not os.path.isfile('{0}/{1}'.format(lib_dir, Vars['ref_pdb'])):
        sys.exit(
            '\n    \033[31mFATAL: Reference structure \033[0m"{0}"\033[31m not found in database\033[0m'
            .format(Vars['ref_pdb']))
    else:
        ref_file = lib_dir + '/' + Vars['ref_pdb']
        ref_pkl = lib_dir + '/' + Vars['ref_pkl']
        ref_dfg = Vars['ref_dfg']
        ref_b3k = Vars['ref_b3k']
        ref_c_glu = Vars['ref_c_glu']

######################

## get reference PDB structure 1ATP.pdb coordinates dataframe
    print('\033[34m# Reading in reference file:\033[0m ' + ref_file)
    if not ref_pkl or not os.path.isfile(ref_pkl):
        ref = md.load_pdb(ref_file)
        ref_cd = ExtractCoords(dfg=ref_dfg,
                               b3k=ref_b3k,
                               c_glu=ref_c_glu,
                               pkl=ref_pkl)
        ref_df = CalculateMetrics(ref_cd(ref))
    ## skip calculation if data is already stored in pickle
    else:
        print(
            '  \033[34m## INFO: Read structural residue coords from:\033[0m {0}\n'
            .format(ref_pkl))
        with bz2.open(ref_pkl, 'rb') as fi:
            ref = pickle.load(fi)
        ref_df = CalculateMetrics(ref)

######################
## load trajectory file(s) with MDtraj, can be multiple traj files at once
    traj = []
    print('\033[34m# Reading in trajectory file(s)...\033[0m')
    start = time.perf_counter()
    if not args.pkl or not os.path.isfile(args.pkl):
        start2 = time.perf_counter()
        TrjIn = ReadTraj(top=args.tmpl_file)
        if re.search(r'dcd$|nc$|crd$|xtc$', args.traj_file):
            traj = TrjIn(args.traj_file)
        else:
            traj_list = filter(None, (l.rstrip()
                                      for l in open(args.traj_file, 'r')
                                      if not re.search(r'^#', l)))
            mpi = multiprocessing.Pool()
            traj = md.join(mpi.imap(TrjIn, traj_list, 2))
            mpi.close()
            mpi.join()
        end2 = time.perf_counter()
        print(
            '  ## Time to load trajectory: \033[31m{0:.1f}\033[0m ms for \033[34m{1}\033[0m frames\n'
            .format((end2 - start2) * 1000, len(traj)))

        ## superpose all frames to template structure pre-superposed to ref 1ATP.pdb
        if args.superp:
            print(
                '\033[34m# Applying superposition to trajectory with:\033[0m '
                + args.superp)
            tmpl = md.load_pdb(args.tmpl_file)
            sele = tmpl.topology.select(args.superp)
            traj = traj.superpose(tmpl, atom_indices=sele, parallel=True)

        ## get trajectory coordinates dataframe
        print(
            '\033[34m# Extracting structural metrics from trajectory...\033[0m'
        )
        start = time.perf_counter()
        trj_cd = ExtractCoords(dfg=args.dfg,
                               b3k=args.b3k,
                               c_glu=args.c_glu,
                               pkl=args.pkl)
        trj_df = CalculateMetrics(trj_cd(traj))

    ## skip calculation if data is already stored in pickle
    else:
        print(
            '  \033[34m## INFO: Read structural residue coords from:\033[0m {0}\n'
            .format(args.pkl))
        with bz2.open(args.pkl, 'rb') as fi:
            trj_df = CalculateMetrics(pickle.load(fi))

    end = time.perf_counter()
    print('## Total time to get traj descriptors: {0:.1f} ms for {1} frames'.
          format((end - start) * 1000, len(trj_df)))
    del traj  # save memory
    print('\n#########################################\n')

    ######################
    ######################
    ## calculate structural metrics from coordinates, then print out raw output
    print(
        '\033[34m# Calculating structural metrics from coordinates...\033[0m')
    start = time.perf_counter()
    mat_df = CompareMetrics(trj_df, ref_df)

    mat_df.to_csv(args.outpref + '.csv', sep=',')
    end = time.perf_counter()
    print(
        '## Total time to compare descriptors: \033[31m{0:.1f}\033[0m ms for \033[34m{1}\033[0m frames'
        .format((end - start) * 1000, len(mat_df)))
    print('\n#########################################\n')

    #####################
    ## use Kinformation Random Forest Classifier to assign conformation/confidence
    start = time.perf_counter()
    KinfoClassify(mat_df, lib_dir, args.outpref, args.use_sk)
    end = time.perf_counter()
    print(
        '\n## Total time to SK \033[31m{0}\033[0m Classification: \033[31m{1:.3f}\033[0m ms for \033[34m{2}\033[0m frames'
        .format(args.use_sk, (end - start) * 1000, len(mat_df)))

    print('\n#########################################\n')
Code Example #30
    ## JOIN CSVs
    print("\tJoining CSVs")
    df_orig = pd.read_csv(original_data_file)
    last_time = df_orig.iloc[-1]['Time (ps)']
    df_re = pd.read_csv(restarted_data_file)
    df_re['Time (ps)'] = df_re['Time (ps)'].apply(lambda x: x + last_time)
    n_orig_rows = df_orig.shape[0]
    df_orig = pd.concat([df_orig, df_re.iloc[:1000 - n_orig_rows]])
    df_orig.to_csv(file_prefix + file + '.50ns.combined.csv', index=False)

    ### JOIN TRAJECTORIES
    print("\tLoading topology PDB")
    # Load topology PDB
    original_ref = md.load_pdb(original_ref_file)
    print("\tRemoving solvent from topology PDB")
    original_ref_solute = original_ref.remove_solvent()

    # Load original trajectory
    print("\tLoading original trajectory")
    original_traj = md.load_dcd(original_traj_file, top=original_ref_solute)

    # Load restarted trajectory, slicing to keep only the number of additional frames needed
    print("\tLoading restarted trajectory")
    restarted_traj = md.load(restarted_traj_file, top=original_ref_solute)
    restarted_traj = restarted_traj[:1000 - original_traj.n_frames]

    # Join the trajectories
    print("\tJoining trajectories and saving them")
    joined_traj = md.join([original_traj, restarted_traj])
    joined_traj.save(file_prefix + file + ".50ns.combined.solute.dcd")
Code Example #31
  - trajs
"""

import mdtraj as md

from msmbuilder.io import load_trajs, save_generic, preload_top, backup, load_generic
from msmbuilder.io.sampling import sample_msm

## Load
meta, ttrajs = load_trajs('ttrajs')
msm = load_generic('msm.pickl')
kmeans = load_generic('kmeans.pickl')

## Sample
# Warning: make sure ttrajs and kmeans centers have
# the same number of dimensions
inds = sample_msm(ttrajs, kmeans.cluster_centers_, msm, n_steps=200, stride=1)
save_generic(inds, "msm-traj-inds.pickl")

## Make trajectory
top = preload_top(meta)
traj = md.join(
    md.load_frame(meta.loc[traj_i]['traj_fn'], index=frame_i, top=top)
    for traj_i, frame_i in inds
)

## Save
traj_fn = "msm-traj.xtc"
backup(traj_fn)
traj.save(traj_fn)
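The dimension warning above is the usual failure mode here: sample_msm compares ttrajs frames against the kmeans centers. A quick check one could place before the sample_msm call, assuming ttrajs is the dict of projected trajectories returned by load_trajs:

first = next(iter(ttrajs.values()))
assert first.shape[1] == kmeans.cluster_centers_.shape[1], \
    "ttrajs and kmeans centers must have the same number of dimensions"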