Ejemplo n.º 1
0
def test_api_still_works_allframes():
    """Check that the old and new RMSD featurizers give matching output.

    Both featurizers are built from the same positional reference and are
    fit on the same two trajectories. Each pair of per-trajectory outputs
    must agree, and every new output must have shape ``(100, 7)``.
    """
    traj1, traj2, ref = _random_trajs()
    legacy, current = OldRMSDFeaturizer(ref), RMSDFeaturizer(ref)

    expected = legacy.fit_transform([traj1, traj2])
    actual = current.fit_transform([traj1, traj2])

    for want, got in zip(expected, actual):
        np.testing.assert_array_almost_equal(want, got)
        assert got.shape == (100, 7)
Ejemplo n.º 2
0
def test_api_still_works_allframes():
    """Compare ``OldRMSDFeaturizer`` and ``RMSDFeaturizer`` on random data.

    The reference is passed positionally to both featurizers. Their
    ``fit_transform`` results must be almost equal trajectory by trajectory,
    and each new result must have shape ``(100, 7)``.
    """
    traj1, traj2, ref = _random_trajs()
    old_featurizer = OldRMSDFeaturizer(ref)
    new_featurizer = RMSDFeaturizer(ref)

    old_features = old_featurizer.fit_transform([traj1, traj2])
    new_features = new_featurizer.fit_transform([traj1, traj2])

    for old_block, new_block in zip(old_features, new_features):
        np.testing.assert_array_almost_equal(old_block, new_block)
        assert new_block.shape == (100, 7)
Ejemplo n.º 3
0
def test_alanine_dipeptide_basic():
    """RMSD features relative to frame 0 must match ``md.rmsd`` directly.

    The 0th alanine dipeptide trajectory is featurized against its own 0th
    frame; the first feature column is compared (to 4 decimals) with the
    RMSD values computed straight from mdtraj.
    """
    all_trajs = AlanineDipeptide().get_cached().trajectories
    first_traj = all_trajs[0]
    reference_frame = first_traj[0]

    features = RMSDFeaturizer(reference_frame).transform(all_trajs[0:1])
    expected = md.rmsd(first_traj, all_trajs[0][0])

    np.testing.assert_array_almost_equal(
        features[0][:, 0], expected, decimal=4)
Ejemplo n.º 4
0
def test_alanine_dipeptide_basic():
    """Featurizer RMSDs to the 0th frame agree with mdtraj's own RMSDs.

    Uses the 0th set of alanine dipeptide trajectories with its 0th frame
    as the reference, and asserts that column 0 of the featurizer output
    equals ``md.rmsd`` for the same trajectory and reference.
    """
    trajs = AlanineDipeptide().get_cached().trajectories

    rmsd_featurizer = RMSDFeaturizer(trajs[0][0])
    featurized = rmsd_featurizer.transform(trajs[0:1])

    reference_rmsd = md.rmsd(trajs[0], trajs[0][0])

    computed_rmsd = featurized[0][:, 0]
    np.testing.assert_array_almost_equal(
        computed_rmsd, reference_rmsd, decimal=4)
Ejemplo n.º 5
0
def test_api_still_works_names():
    """The deprecated ``trj0`` keyword still works and emits a warning.

    Builds the old and new featurizers with ``trj0=`` and explicit atom
    indices, checks that constructing the new one records a warning whose
    message mentions both "deprecated" and "trj0", and then verifies that
    both featurizers produce matching output of shape ``(100, 7)``.
    """
    traj1, traj2, ref = _random_trajs()
    old = OldRMSDFeaturizer(trj0=ref, atom_indices=np.arange(50))
    with warnings.catch_warnings(record=True) as caught:
        # Force every warning to be recorded. With the default filters a
        # deprecation warning may be suppressed (or shown only once per
        # location), which would leave the list empty.
        warnings.simplefilter("always")
        new = RMSDFeaturizer(trj0=ref, atom_indices=np.arange(50))
        assert caught, "expected a deprecation warning for 'trj0'"
        message = str(caught[-1].message)
        assert "deprecated" in message
        assert "trj0" in message

    data_old = old.fit_transform([traj1, traj2])
    data_new = new.fit_transform([traj1, traj2])

    for do, dn in zip(data_old, data_new):
        np.testing.assert_array_almost_equal(do, dn)
        assert dn.shape == (100, 7)
Ejemplo n.º 6
0
def test_two_refs_omitting_indices():
    """Omitting ``atom_indices`` equals passing every atom index.

    With the first two frames as references, the featurizer built with
    ``np.arange(n_atoms)`` and the one built without atom indices must
    produce the same output (to 4 decimals).
    """
    trajs = AlanineDipeptide().get_cached().trajectories

    every_atom = np.arange(trajs[0].n_atoms)
    with_indices = RMSDFeaturizer(trajs[0][0:2], every_atom)
    expected = with_indices.transform(trajs[0:1])

    without_indices = RMSDFeaturizer(trajs[0][0:2])
    actual = without_indices.transform(trajs[0:1])

    np.testing.assert_array_almost_equal(actual[0], expected[0], decimal=4)
Ejemplo n.º 7
0
def test_api_still_works_names():
    """Passing the reference as ``trj0=`` is deprecated but still supported.

    The new featurizer must record a warning mentioning "deprecated" and
    "trj0" when constructed with the old keyword, and must then produce
    the same features as the old featurizer, each of shape ``(100, 7)``.
    """
    traj1, traj2, ref = _random_trajs()
    old = OldRMSDFeaturizer(trj0=ref, atom_indices=np.arange(50))
    with warnings.catch_warnings(record=True) as recorded:
        # Without an explicit "always" filter the warning can be swallowed
        # by the active warning filters, leaving nothing to inspect.
        warnings.simplefilter("always")
        new = RMSDFeaturizer(trj0=ref, atom_indices=np.arange(50))
        assert recorded, "no warning was raised for the 'trj0' keyword"
        last_message = str(recorded[-1].message)
        assert "deprecated" in last_message
        assert "trj0" in last_message

    data_old = old.fit_transform([traj1, traj2])
    data_new = new.fit_transform([traj1, traj2])

    for do, dn in zip(data_old, data_new):
        np.testing.assert_array_almost_equal(do, dn)
        assert dn.shape == (100, 7)
Ejemplo n.º 8
0
def test_that_all_featurizers_run():
    """Smoke test: several featurizers transform the ADP trajectories.

    Only checks that construction and ``transform`` complete; the
    resulting features are not inspected.
    """
    # TODO: include all featurizers, perhaps with generator tests

    trajectories = AlanineDipeptide().get_cached().trajectories
    reference = trajectories[0][0]
    _, pair_indices = get_atompair_indices(reference)

    AtomPairsFeaturizer(pair_indices).transform(trajectories)

    SuperposeFeaturizer(np.arange(15), reference).transform(trajectories)

    DihedralFeaturizer(["phi", "psi"]).transform(trajectories)

    VonMisesFeaturizer(["phi", "psi"]).transform(trajectories)

    # msmbuilder.featurizer.ContactFeaturizer is deliberately left out:
    # it doesn't work on ALA dipeptide.

    RMSDFeaturizer(reference).transform(trajectories)
Ejemplo n.º 9
0
def test_two_refs_basic():
    """Two-reference RMSD features match per-reference ``md.rmsd`` values.

    Frames 0 and 1 of the 0th ADP trajectory serve as the two references.
    The test checks that frame 0 vs reference 0 equals frame 1 vs
    reference 1, that the two cross terms agree with each other, and that
    the full feature matrix equals the RMSDs computed by mdtraj.
    """
    trajs = AlanineDipeptide().get_cached().trajectories
    features = RMSDFeaturizer(trajs[0][0:2]).transform(trajs[0:1])

    expected = np.zeros((trajs[0].n_frames, 2))
    for ref_index in (0, 1):
        expected[:, ref_index] = md.rmsd(trajs[0], trajs[0][ref_index])

    rmsd_matrix = features[0]
    # Self-distances (diagonal) and cross-distances (off-diagonal).
    np.testing.assert_almost_equal(
        rmsd_matrix[0, 0], rmsd_matrix[1, 1], decimal=3)
    np.testing.assert_almost_equal(
        rmsd_matrix[1, 0], rmsd_matrix[0, 1], decimal=3)

    np.testing.assert_array_almost_equal(features[0], expected, decimal=4)
Ejemplo n.º 10
0
def test_different_indices():
    """Different atom subsets give different RMSDs of the same shape.

    The 0th ADP trajectory is featurized against its 0th frame twice:
    once using the first half of the atoms and once using the second half.
    The two outputs must have identical shapes but different values.
    """
    trajectories = AlanineDipeptide().get_cached().trajectories
    n_atoms = trajectories[0].n_atoms
    halfway_point = n_atoms // 2

    featurizer_first_half = RMSDFeaturizer(trajectories[0][0],
                                           np.arange(halfway_point))
    data_first_half = featurizer_first_half.transform(trajectories[0:1])
    featurizer_second_half = RMSDFeaturizer(trajectories[0][0],
                                            np.arange(halfway_point, n_atoms))
    data_second_half = featurizer_second_half.transform(trajectories[0:1])

    assert data_first_half[0].shape == data_second_half[0].shape
    # Compare the arrays directly. Comparing their sums is only a proxy
    # and would fail if two different arrays happened to sum to the same
    # value.
    assert not np.array_equal(data_first_half[0][:, 0],
                              data_second_half[0][:, 0])
Ejemplo n.º 11
0
def test_two_refs_basic():
    """Featurizing against two reference frames reproduces mdtraj RMSDs.

    The 0th and 1st frames of the 0th ADP trajectory are the references.
    The RMSD of frame 0 to reference 0 must equal that of frame 1 to
    reference 1, the cross terms must agree, and the whole output must
    match ``md.rmsd`` evaluated once per reference.
    """
    adp_trajs = AlanineDipeptide().get_cached().trajectories
    two_ref_featurizer = RMSDFeaturizer(adp_trajs[0][0:2])
    result = two_ref_featurizer.transform(adp_trajs[0:1])

    n_refs = 2
    mdtraj_rmsd = np.zeros((adp_trajs[0].n_frames, n_refs))
    for column in range(n_refs):
        reference_frame = adp_trajs[0][column]
        mdtraj_rmsd[:, column] = md.rmsd(adp_trajs[0], reference_frame)

    np.testing.assert_almost_equal(result[0][0, 0], result[0][1, 1],
                                   decimal=3)
    np.testing.assert_almost_equal(result[0][1, 0], result[0][0, 1],
                                   decimal=3)

    np.testing.assert_array_almost_equal(result[0], mdtraj_rmsd, decimal=4)
Ejemplo n.º 12
0
def featurize_trajectories(coords, featurizer):
    """Featurize trajectories with the msmbuilder featurizer chosen by name.

    Parameters
    ----------
    coords : sequence of trajectories
        Passed unchanged to the featurizer's ``fit_transform``. For
        'RMSDFeaturizer', ``coords[0]`` is also used as the reference.
    featurizer : str
        One of 'RMSDFeaturizer', 'DRIDFeaturizer', 'ContactFeaturizer'
        (CA scheme) or 'DihedralFeaturizer' (phi and psi only).

    Returns
    -------
    The value returned by the selected featurizer's ``fit_transform``.

    Raises
    ------
    ValueError
        If ``featurizer`` is not one of the supported names. Previously
        this case surfaced as an UnboundLocalError on ``feat``.
    """
    # Each featurizer is imported lazily so only the selected one is loaded.
    if featurizer == 'RMSDFeaturizer':
        from msmbuilder.featurizer import RMSDFeaturizer
        feat = RMSDFeaturizer(reference_traj=coords[0])
    elif featurizer == 'DRIDFeaturizer':
        from msmbuilder.featurizer import DRIDFeaturizer
        feat = DRIDFeaturizer()
    elif featurizer == 'ContactFeaturizer':
        from msmbuilder.featurizer import ContactFeaturizer
        feat = ContactFeaturizer(scheme='ca')
    elif featurizer == 'DihedralFeaturizer':
        from msmbuilder.featurizer import DihedralFeaturizer
        feat = DihedralFeaturizer(types=['phi', 'psi'])
    else:
        raise ValueError(
            'Unknown featurizer %r; expected one of: RMSDFeaturizer, '
            'DRIDFeaturizer, ContactFeaturizer, DihedralFeaturizer'
            % (featurizer,))
    return feat.fit_transform(coords)
Ejemplo n.º 13
0
def test_two_refs_omitting_indices():
    """Default atom selection matches an explicit all-atom selection.

    Two featurizers share the same two reference frames; one receives
    ``np.arange(n_atoms)`` as atom indices and the other receives none.
    Their outputs for the 0th trajectory must be almost equal.
    """
    adp = AlanineDipeptide().get_cached().trajectories

    explicit = RMSDFeaturizer(adp[0][0:2], np.arange(adp[0].n_atoms))
    explicit_features = explicit.transform(adp[0:1])

    implicit = RMSDFeaturizer(adp[0][0:2])
    implicit_features = implicit.transform(adp[0:1])

    np.testing.assert_array_almost_equal(
        implicit_features[0], explicit_features[0], decimal=4)
Ejemplo n.º 14
0
def test_omitting_indices():
    """Omitting ``atom_indices`` equals supplying all atom indices.

    With the 0th frame as the single reference, the first feature column
    from a featurizer given ``np.arange(n_atoms)`` must match the one
    from a featurizer given no atom indices (to 4 decimals).
    """
    trajs = fetch_alanine_dipeptide()["trajectories"]

    all_atoms = np.arange(trajs[0].n_atoms)
    explicit = RMSDFeaturizer(trajs[0][0], all_atoms)
    explicit_features = explicit.transform(trajs[0:1])
    implicit = RMSDFeaturizer(trajs[0][0])
    implicit_features = implicit.transform(trajs[0:1])

    np.testing.assert_array_almost_equal(
        implicit_features[0][:, 0], explicit_features[0][:, 0], decimal=4)
Ejemplo n.º 15
0
def test_different_indices():
    """RMSDs from disjoint atom subsets differ but share a shape.

    Featurizes the 0th ADP trajectory against its 0th frame using the
    first half of the atoms, then the second half, and asserts that the
    outputs have the same shape while their values are not equal.
    """
    trajectories = AlanineDipeptide().get_cached().trajectories
    n_atoms = trajectories[0].n_atoms
    halfway_point = n_atoms // 2

    featurizer_first_half = RMSDFeaturizer(trajectories[0][0],
                                           np.arange(halfway_point))
    data_first_half = featurizer_first_half.transform(trajectories[0:1])
    featurizer_second_half = RMSDFeaturizer(trajectories[0][0],
                                            np.arange(halfway_point, n_atoms))
    data_second_half = featurizer_second_half.transform(trajectories[0:1])

    assert data_first_half[0].shape == data_second_half[0].shape
    # Direct inequality check. The earlier sum-based comparison could
    # fail for genuinely different arrays whose sums coincide.
    assert not np.array_equal(data_first_half[0][:, 0],
                              data_second_half[0][:, 0])
def featurize_trajectories(coords, featurizer):
    '''
    Featurize trajectories with the msmbuilder featurizer selected by name.

    Input
    coords : list of 'MDTrajDataset' object
    featurizer : str, one of 'RMSDFeaturizer' (reference is coords[0]),
        'DRIDFeaturizer', 'ContactFeaturizer' (CA scheme) or
        'DihedralFeaturizer' (phi and psi only)

    Output
    features : list of arrays, length n_trajs, each of shape (n_samples, n_features)

    Raises
    ValueError : if `featurizer` is not one of the supported names
        (previously this surfaced as an UnboundLocalError on `feat`)
    '''
    supported = ('RMSDFeaturizer', 'DRIDFeaturizer', 'ContactFeaturizer',
                 'DihedralFeaturizer')
    # Reject unknown names before importing anything from msmbuilder.
    if featurizer not in supported:
        raise ValueError('Unknown featurizer %r; expected one of: %s'
                         % (featurizer, ', '.join(supported)))

    if featurizer == 'RMSDFeaturizer':
        from msmbuilder.featurizer import RMSDFeaturizer
        feat = RMSDFeaturizer(reference_traj=coords[0])
    elif featurizer == 'DRIDFeaturizer':
        from msmbuilder.featurizer import DRIDFeaturizer
        feat = DRIDFeaturizer()
    elif featurizer == 'ContactFeaturizer':
        from msmbuilder.featurizer import ContactFeaturizer
        feat = ContactFeaturizer(scheme='ca')
    else:
        from msmbuilder.featurizer import DihedralFeaturizer
        feat = DihedralFeaturizer(types=['phi', 'psi'])
    return feat.fit_transform(coords)
Ejemplo n.º 17
0
def _build_arg_parser():
    """Create the command-line parser for the FAST driver script."""
    import argparse
    import textwrap

    # NOTE: every line of the dedented help texts is indented with spaces
    # only. A single tab-indented line leaves textwrap.dedent with no
    # common margin, so nothing gets dedented.
    parser = argparse.ArgumentParser(
        usage=textwrap.dedent(
            '''Use "python %(prog)s -h" for more information.'''),
        formatter_class=argparse.RawTextHelpFormatter,
        description=textwrap.dedent('''\
            First, this program employs msmbuilder to featurize given pdb trajectories into vectorizable space.
            Second, the vector space is decompose by tICA or PCA to further reduce the dimension. 
            Third, clustering is performed so that each structure in the trajectories is labeled by an index. 
            Forth, Marcov State Model, albeit may not be well behaved, is built on the labeled trajectories.
            Last, FAST reward scores are calculated based on the transition-count matrix and user-chosen physical traits. 

            Example:
            $ python FAST.py path_to_pdb_trajectories/ --featurizer=DRIDFeaturizer --decomposer=PCA --decomposer-n-components=5 --clusterer=KCenters --n-clusters=5 --msm-prior-counts=0.2 --physical-trait=target-RMSD --target-pdb=/path_to_target_pdb/target.pdb '''
                                    ))
    parser.add_argument(
        'pdbpath',
        help=textwrap.dedent('''[required] Path to pdb trajectories.'''))
    parser.add_argument(
        '--lag-time',
        default=1,
        type=int,
        help=textwrap.dedent('''Lag time of the model. Default value = 1.'''))
    parser.add_argument('--featurizer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            Featurizer at your choice. Available featurizers are (select them by name): 
            (1) RMSDFeaturizer;
            (2) DihedralFeaturizer, only phi and psi angles;
            (3) DRIDFeaturizer (DRID, Distribution of Reciprocal of Interatomic Distances);
            (4) ContactFeaturizer, CA contact. 
            Note: user must choose a featurization method. Choose by name. ''')
                        )
    parser.add_argument('--decomposer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            Decomposer at your choice. Available decomposers are: 
            (1) tICA;
            (2) PCA. 
            Note: selection of decomposer is not necessary but recommended.
            If not provided, program will ignore this step and cluster directly on raw features. '''
                                             ))
    parser.add_argument(
        '--decomposer-n-components',
        default=None,
        type=int,
        help=textwrap.dedent(
            '''Number of components to keep. if n_components is not set all components are kept.'''
        ))
    parser.add_argument('--clusterer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            Clustering method at your choice. Available clusterer are: 
            (1) KMeans;
            (2) KCenters;
            (3) KMedoids;
            (4) MiniBatchKMeans;
            (5) MiniBatchKMedoids.
            Note: user must choose a clusering method. '''))
    parser.add_argument(
        '--n-clusters',
        default=5,
        type=int,
        help=textwrap.dedent(
            '''The number of clusters to form as well as the number of centroids to generate.'''
        ))
    parser.add_argument('--msm-n-timescales',
                        default=None,
                        type=int,
                        help=textwrap.dedent('''\
            The number of dynamical timescales to calculate when diagonalizing the transition matrix. 
            If not specified, it will compute n_states - 1. '''))
    parser.add_argument('--msm-prior-counts',
                        default=0,
                        type=float,
                        help=textwrap.dedent('''\
            Add a number of 'pseudo counts' to each entry in the counts matrix after ergodic trimming. 
            When prior_counts == 0 (default), the assigned transition probability between two states 
            with no observed transitions will be zero, whereas when prior_counts > 0, even this unobserved 
            transitions will be given nonzero probability. '''))
    parser.add_argument('--physical-trait',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            Physical trait used in calculation of FAST reward score. Available choices are: 
            (1) target-RMSD, if chosen, user must supply a target structure; 
            (2) target-native-contact, if chosen, user must supply a target structure; 
            (3) target-tmscore, if chosen, user must supply the data file containing the TM-scores in column;
            (4) potential, target free, if chosen, user must supply the data file containing the potentials in column; 
            Note: user must choose a physical trait. '''))
    parser.add_argument('--target-pdb',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            The target pdb structure. 
            Note: The target pdb should have the same number of atoms in structure with that in pdb trajectories. '''
                                             ))
    parser.add_argument('--initial-pdb',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            The initial pdb structure. 
            Note: The initial pdb should have the same number of atoms in structure with that in pdb trajectories. '''
                                             ))
    parser.add_argument(
        '--potential',
        default=None,
        type=str,
        help=textwrap.dedent(
            '''The potential file corresponding to the pdb trajectories. '''))
    parser.add_argument(
        '--tmscore',
        default=None,
        type=str,
        help=textwrap.dedent(
            '''The TM-score file corresponding to the pdb trajectories. '''))
    parser.add_argument(
        '--fast-n-simulations',
        default=30,
        type=int,
        help=textwrap.dedent(
            '''Number of parallel simulations in each round of FAST algorithm. Default value: 30. '''
        ))
    parser.add_argument(
        '--fast-alpha',
        default=1.,
        type=float,
        help=textwrap.dedent(
            '''The alpha parameter passed to the FAST reward score calculation. Default value: 1.0.'''
        ))
    parser.add_argument('--output',
                        type=str,
                        default='output',
                        help=textwrap.dedent('''Output file name.'''))
    return parser


def _validate_args(parser, args):
    """Exit with a usage error when a mandatory choice or input is missing."""
    if args.featurizer is None:
        parser.error('User must choose a featurization method!')

    # Physical trait -> (option that must be supplied, message if missing).
    required_input = {
        'target-RMSD': (
            'target_pdb', 'User must provide a target structure!'),
        'target-native-contact': (
            'target_pdb', 'User must provide a target structure!'),
        'target-tmscore': (
            'tmscore',
            'User must provide a TM-score file corresponding to the pdb trajectories!'),
        'potential': (
            'potential',
            'User must provide a potential file corresponding to the pdb trajectories!'),
    }
    if args.physical_trait not in required_input:
        parser.error(
            'User must choose a physical trait from: %s (got %r)'
            % (', '.join(sorted(required_input)), args.physical_trait))
    option, message = required_input[args.physical_trait]
    if getattr(args, option) is None:
        parser.error(message)


def main():
    """Run the FAST pipeline on a directory of pdb trajectories.

    Steps: parse and validate the command line, featurize the
    trajectories, optionally decompose the features, cluster them, build
    the transition count matrix from the cluster labels, compute the FAST
    reward scores for the chosen physical trait, then write the result
    tables and plots using ``--output`` as the file-name prefix.

    A missing featurizer, a missing or unknown physical trait, or a
    missing input file for the chosen trait ends the program with an
    argparse usage error. Previously these cases printed a message and
    then crashed with a NameError.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()
    # Validate first so a bad command line fails before any heavy work.
    _validate_args(parser, args)

    from msmbuilder.dataset import dataset
    coords = dataset(os.path.join(args.pdbpath, '*.pdb'))
    print('%i trajectories found. \n' % len(coords))

    ## featurize
    features = featurize_trajectories(coords, args.featurizer)
    print("%s selected" % args.featurizer)
    print("features: (n_samples, n_features) = (%i, %i) for each trajectory \n" % (
        features[0].shape[0], features[0].shape[1]))

    ## decompose
    if args.decomposer is None:
        print("No decomposer is selected! Program will directly cluster the raw features. \n")
    else:
        features = decompose_features(
            features,
            args.decomposer,
            n_components=args.decomposer_n_components,
            lag_time=args.lag_time)
        print("%s selected" % args.decomposer)
        print("features: (n_samples, n_features) = (%i, %i) for each trajectory \n" % (
            features[0].shape[0], features[0].shape[1]))

    ## clustering
    clst = cluster_features(features,
                            args.clusterer,
                            n_clusters=args.n_clusters)
    cci = find_cluster_center_indices(features, clst)
    print("%s selected" % args.clusterer)
    print("Cluster center indices: %s \n" % cci)

    ## construct transition count matrix
    # No MSM object is built here; the counts come straight from the
    # concatenated cluster labels.
    transition_count_mat = calc_transition_count_mat(
        np.concatenate(clst.labels_), args.n_clusters)
    print('Transition count matrix: \n%s' % (transition_count_mat,))

    #### calculate FAST reward score
    output_df = pd.DataFrame()
    output_df['idx'] = cci
    output_df['#cluster'] = transition_count_mat.diagonal()

    # Per-frame observables; each stays None unless its input was given.
    rmsd_to_initial = None
    rmsd_to_target = None
    native_contact_to_target = None
    potential = None
    tmscore = None

    if args.initial_pdb is not None:
        import mdtraj as md
        from msmbuilder.featurizer import RMSDFeaturizer
        initial = md.load(args.initial_pdb)
        rmsd_to_initial = np.concatenate(
            RMSDFeaturizer(initial).fit_transform(coords))[:, 0]

        output_df['iniRMSD'] = rmsd_to_initial[cci]

    if args.target_pdb is not None:
        import mdtraj as md
        from msmbuilder.featurizer import ContactFeaturizer
        from msmbuilder.featurizer import RMSDFeaturizer
        target = md.load(args.target_pdb)
        rmsd_to_target = np.concatenate(
            RMSDFeaturizer(target).fit_transform(coords))[:, 0]

        # CA-CA pairs of the target closer than the cutoff count as native
        # contacts (distance unit presumably nm, mdtraj's length unit).
        contact_cutoff = 0.75
        native_contact_dists, native_contact_pairs = md.compute_contacts(
            target, scheme='ca')
        native_contact_pairs = native_contact_pairs[
            native_contact_dists[0] <= contact_cutoff]
        print("Target structure has %i pairs of CA-CA contact in total. \n" % len(
            native_contact_pairs))

        pair_dists = np.concatenate(
            ContactFeaturizer(
                contacts=native_contact_pairs,
                scheme='ca').fit_transform(coords))  # (n_samples, n_pairs)
        # Number of native contacts formed in each frame.
        native_contact_to_target = np.sum(
            (pair_dists <= contact_cutoff).astype(int), axis=1)

        output_df['tarRMSD'] = rmsd_to_target[cci]
        output_df['#NativeContact'] = native_contact_to_target[cci]

    if args.potential is not None:
        potential = np.loadtxt(args.potential)
        output_df['potential'] = potential[cci]

    if args.tmscore is not None:
        tmscore = np.loadtxt(args.tmscore)
        output_df['tmscore'] = tmscore[cci]

    # choose physical trait
    print("%s is selected in FAST \n" % args.physical_trait)
    # Trait -> (per-frame values, direction handed to the reward score).
    trait_table = {
        'target-RMSD': (rmsd_to_target, 'min'),
        'target-native-contact': (native_contact_to_target, 'max'),
        'target-tmscore': (tmscore, 'max'),
        'potential': (potential, 'min'),
    }
    trait_values, minmax = trait_table[args.physical_trait]
    rewards, sims, c = calc_FAST_reward_score(
        trait_values,
        cci,
        transition_count_mat,
        alpha=args.fast_alpha,
        n_simulations=args.fast_n_simulations,
        minmax=minmax)

    output_df['#Transition'] = c
    output_df['reward'] = rewards
    output_df['#sim'] = sims

    ## output
    with open(args.output + '.CenterIdx_ClusterSize.dat', 'w') as f:
        for i in range(args.n_clusters):
            f.write('%6i %6i\n' % (cci[i], sims[i]))

    if args.initial_pdb is not None:
        with open(args.output + '.iniRMSD.dat', 'w') as f:
            f.writelines('%8.3f\n' % ele for ele in rmsd_to_initial)

    if args.target_pdb is not None:
        with open(args.output + '.tarRMSD.dat', 'w') as f:
            f.writelines('%8.3f\n' % ele for ele in rmsd_to_target)

        with open(args.output + '.tarNativeContact.dat', 'w') as f:
            f.writelines('%8.3f\n' % ele for ele in native_contact_to_target)

    with open(args.output + '.dat', 'w') as f:
        f.write('%s\n' % (output_df,))

    ## plot
    # NOTE(review): ceil is called with a second argument for the RMSD
    # limits but with one argument elsewhere. This relies on the ceil
    # available at file level accepting both forms. Confirm.
    marker_style = dict(c_map='winter', cc_color='red')
    if args.target_pdb is not None:
        plot_cluster(X=rmsd_to_target,
                     Y=native_contact_to_target,
                     cluster_center_indices=cci,
                     figname=args.output + '.tarRMSD_tarNativeContact.png',
                     x_label='RMSD to target / nm',
                     y_label='# native contact',
                     xmin=0,
                     xmax=ceil(rmsd_to_target.max(), 0),
                     ymin=0,
                     ymax=ceil(native_contact_to_target.max()),
                     **marker_style)
        if args.initial_pdb is not None:
            # Each axis limit comes from the data plotted on that axis
            # (the limits were previously swapped between X and Y).
            plot_cluster(X=rmsd_to_initial,
                         Y=rmsd_to_target,
                         cluster_center_indices=cci,
                         figname=args.output + '.tarRMSD_iniRMSD.png',
                         x_label='RMSD to initial / nm',
                         y_label='RMSD to target / nm',
                         xmin=0,
                         xmax=ceil(rmsd_to_initial.max(), 0),
                         ymin=0,
                         ymax=ceil(rmsd_to_target.max(), 0),
                         **marker_style)
        if args.tmscore is not None:
            plot_cluster(X=tmscore,
                         Y=native_contact_to_target,
                         cluster_center_indices=cci,
                         figname=args.output + '.tmscore_tarNativeContact.png',
                         x_label='TM-score to target',
                         y_label='# native contact',
                         xmin=0,
                         xmax=1,
                         ymin=0,
                         ymax=ceil(native_contact_to_target.max()),
                         **marker_style)
            if args.potential is not None:
                plot_cluster(X=tmscore,
                             Y=potential,
                             cluster_center_indices=cci,
                             figname=args.output + '.tmscore_potential.png',
                             x_label='TM-score to target',
                             y_label='potential',
                             xmin=0,
                             xmax=1,
                             ymin=floor(potential.min()),
                             ymax=ceil(potential.max()),
                             **marker_style)
        if args.potential is not None:
            plot_cluster(X=rmsd_to_target,
                         Y=potential,
                         cluster_center_indices=cci,
                         figname=args.output + '.tarRMSD_potential.png',
                         x_label='RMSD to target / nm',
                         y_label='potential',
                         xmin=0,
                         xmax=ceil(rmsd_to_target.max(), 0),
                         ymin=floor(potential.min()),
                         ymax=ceil(potential.max()),
                         **marker_style)

    # First two decomposed components; the axis-label prefix depends on
    # the decomposer.
    component_prefix = {'tICA': 'tIC', 'PCA': 'PC'}
    if args.decomposer in component_prefix:
        prefix = component_prefix[args.decomposer]
        cat_features = np.concatenate(features)
        first, second = cat_features[:, 0], cat_features[:, 1]
        plot_cluster(X=first,
                     Y=second,
                     cluster_center_indices=cci,
                     figname=args.output + '.%s_1st_2nd.png' % args.decomposer,
                     x_label='%s 1' % prefix,
                     y_label='%s 2' % prefix,
                     xmin=floor(first.min()),
                     xmax=ceil(first.max()),
                     ymin=floor(second.min()),
                     ymax=ceil(second.max()),
                     **marker_style)
Ejemplo n.º 18
0
{{header}}

Meta
----
depends:
  - meta.pandas.pickl
  - trajs
  - top.pdb
"""
import mdtraj as md

from msmbuilder.io import load_meta, itertrajs, save_trajs, preload_top

## Load
# Trajectory metadata and the centroid structures used as RMSD references.
meta = load_meta()
centroids = md.load("centroids.xtc", top=preload_top(meta))

## Kernel
SIGMA = 0.3  # nm
from msmbuilder.featurizer import RMSDFeaturizer
import numpy as np

featurizer = RMSDFeaturizer(centroids)
# Denominator of the Gaussian exponent, computed once for all trajectories.
two_sigma_squared = 2 * (SIGMA ** 2)
lfeats = {}
for i, traj in itertrajs(meta):
    # RMSD of each frame to the centroids, mapped through a Gaussian
    # kernel exp(-d^2 / (2 * SIGMA^2)).
    rmsd_to_centroids = featurizer.partial_transform(traj)
    lfeats[i] = np.exp(-np.square(rmsd_to_centroids) / two_sigma_squared)
save_trajs(lfeats, 'ftrajs', meta)
def main():
    """Featurize, optionally decompose, and cluster a directory of pdb trajectories.

    Command-line entry point. Parses the options, then:

    1. loads every ``*.pdb`` file under ``pdbpath`` as a dataset;
    2. featurizes it with ``featurize_trajectories`` and pickles the features
       to ``<output>.features.<featurizer>.pkl``;
    3. if ``--decomposer`` is given, reduces the features with
       ``decompose_features``;
    4. clusters with ``cluster_features`` and looks up the cluster-center
       indices with ``find_cluster_center_indices``;
    5. for each ``--reference-model`` writes the per-frame RMSD to
       ``<output>.ref_RMSD_<name>.dat``; when at least two references are
       given, also plots the first two against each other;
    6. writes ``<output>.labels.pkl``, ``<output>.cluster_center_idx.dat``
       and, when a decomposer was used, ``<output>.components.pkl`` and a
       scatter plot of the first two components (tICA / PCA only).

    All ``print`` calls take a single argument, so the output is the same
    whether this runs under the Python 2 print statement or the Python 3
    print function.
    """
    import argparse
    import textwrap

    # NOTE: the description previously had a TAB in the indentation of its
    # second line, which prevented textwrap.dedent from stripping the common
    # margin; it is now indented with spaces like the other lines.
    parser = argparse.ArgumentParser(
        usage=textwrap.dedent(
            '''Use "python %(prog)s -h" for more information.'''),
        formatter_class=argparse.RawTextHelpFormatter,
        description=textwrap.dedent('''\
            First, this program employs msmbuilder to featurize given pdb trajectories into vectorizable space.
            Second, the vector space is decompose by tICA or PCA to further reduce the dimension. 
            Third, clustering is performed so that each structure in the trajectories is labeled by an index. 
        
            Example:
            $ python Traj-to-cluster.py     \n
                path_to_pdb_trajectories/   \n
                --featurizer=DRIDFeaturizer \n
                --decomposer=PCA            \n
                --decomposer-n-components=5 \n
                --clusterer=KCenters        \n
                --n-clusters=5 '''))
    parser.add_argument(
        'pdbpath',
        help=textwrap.dedent('''[required] Path to pdb trajectories.'''))
    parser.add_argument(
        '--lag-time',
        default=1,
        type=int,
        help=textwrap.dedent('''Lag time of the model. Default value = 1.'''))
    parser.add_argument('--featurizer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            Featurizer at your choice. Available featurizers are (select them by name): 
            (1) RMSDFeaturizer;
            (2) DihedralFeaturizer, only phi and psi angles;
            (3) DRIDFeaturizer (DRID, Distribution of Reciprocal of Interatomic Distances);
            (4) ContactFeaturizer, CA contact. 	
            Note: user must choose a featurization method. Choose by name. ''')
                        )
    parser.add_argument('--decomposer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            Decomposer at your choice. Available decomposers are: 
            (1) tICA;
            (2) PCA. 
            Note: selection of decomposer is not necessary but recommended.
            If not provided, program will ignore this step and cluster directly on raw features. '''
                                             ))
    parser.add_argument(
        '--decomposer-n-components',
        default=None,
        type=int,
        help=textwrap.dedent(
            '''Number of components to keep. if n_components is not set all components are kept.'''
        ))
    parser.add_argument('--clusterer',
                        default=None,
                        type=str,
                        help=textwrap.dedent('''\
            Clustering method at your choice. Available clusterer are: 
            (1) KMeans;
            (2) KCenters;
            (3) KMedoids;
            (4) MiniBatchKMeans;
            (5) MiniBatchKMedoids.
            Note: user must choose a clusering method. '''))
    parser.add_argument(
        '--n-clusters',
        default=5,
        type=int,
        help=textwrap.dedent(
            '''The number of clusters to form as well as the number of centroids to generate.'''
        ))
    parser.add_argument(
        '--reference-model',
        default=[],
        action='append',
        type=str,
        help=textwrap.dedent(''' Reference models used to calculate RMSD. '''))
    parser.add_argument('--output',
                        type=str,
                        default='output',
                        help=textwrap.dedent('''Output file name.'''))
    args = parser.parse_args()

    from msmbuilder.dataset import dataset
    coords = dataset(os.path.join(args.pdbpath,
                                  '*.pdb'))  # coords: 'MDTrajDataset' object
    print('%i trajectories found. \n' % len(coords))

    ## featurize
    features = featurize_trajectories(coords, args.featurizer)
    print("%s selected" % args.featurizer)
    print("features: (n_samples, n_features) = (%i, %i) for each trajectory \n" % (
        features[0].shape[0], features[0].shape[1]))
    # Protocol -1 selects a binary pickle format, so open in binary mode.
    with open(args.output + '.features.%s.pkl' % args.featurizer, 'wb') as f:
        cp.dump(toNumpy32(features), f, -1)
    sys.stdout.flush()

    ## decompose
    # Stays None when no decomposer is selected, so the output stage can
    # skip the components file instead of failing on an unbound name.
    components = None
    if args.decomposer is None:
        print("No decomposer is selected! Program will directly cluster the raw coordinates. \n")
    else:
        features, components = decompose_features(
            features,
            args.decomposer,
            n_components=args.decomposer_n_components,
            lag_time=args.lag_time)
        print("%s selected" % args.decomposer)
        print("features: (n_samples, n_features) = (%i, %i) for each trajectory \n" % (
            features[0].shape[0], features[0].shape[1]))

    ## clustering
    clst = cluster_features(features,
                            args.clusterer,
                            n_clusters=args.n_clusters)
    cci = find_cluster_center_indices(features, clst)
    print("%s selected" % args.clusterer)
    print("Cluster center indices: %s \n" % cci)

    cat_features = np.concatenate(features)
    cat_labels = np.concatenate(clst.labels_)

    ## reference pdb
    # The option defaults to an empty list (never None), so test for
    # emptiness: with no references there is nothing to compute or plot.
    if args.reference_model:
        import mdtraj as md
        from msmbuilder.featurizer import RMSDFeaturizer
        rmsd_to_ref = []
        for ref in args.reference_model:
            print("\nCompute RMSD to the reference models : %s." % ref)
            ref_traj = md.load(ref)
            print("N atoms %i" % ref_traj.n_atoms)
            # Only the first frame of the reference is used; column 0 of the
            # concatenated features is the RMSD to that frame.
            rmsd_to_ref.append(
                np.concatenate(
                    RMSDFeaturizer(ref_traj[0]).fit_transform(coords))[:, 0])
        rmsd_to_ref = np.array(rmsd_to_ref) * 10  # unit : A

        for ref, ref_rmsd in zip(args.reference_model, rmsd_to_ref):
            with open(args.output + '.ref_RMSD_%s.dat' % bsnm(ref),
                      'w') as f:
                for e in ref_rmsd:
                    f.write('%.3f\n' % e)

        # The scatter plot needs two references (one per axis).
        if len(rmsd_to_ref) >= 2:
            rmsd_upper = ceil(rmsd_to_ref.max())
            plot_cluster(X=rmsd_to_ref[0],
                         Y=rmsd_to_ref[1],
                         cluster_center_indices=cci,
                         figname=args.output + '.ref_RMSD.png',
                         x_label='state 1',
                         y_label='state 2',
                         xmin=0,
                         xmax=rmsd_upper,
                         ymin=0,
                         ymax=rmsd_upper,
                         c_map='winter',
                         cc_color='red')
        else:
            print("Only one reference model given; skipping the reference RMSD plot. \n")

    ## output
    with open(args.output + '.labels.pkl', 'wb') as f:
        cp.dump(cat_labels, f, -1)

    if components is not None:
        with open(args.output + '.components.pkl', 'wb') as f:
            cp.dump(toNumpy32(components), f, -1)

    with open(args.output + '.cluster_center_idx.dat', 'w') as f:
        for i in range(args.n_clusters):
            f.write('%6i\n' % cci[i])

    # Only tICA and PCA get a 2D plot of the first two components; the
    # decomposer name doubles as the file-name tag.
    axis_prefix = {'tICA': 'tIC', 'PCA': 'PC'}.get(args.decomposer)
    if axis_prefix is not None:
        first = cat_features[:, 0]
        second = cat_features[:, 1]
        plot_cluster(X=first,
                     Y=second,
                     cluster_center_indices=cci,
                     figname=args.output + '.%s_1st_2nd.png' % args.decomposer,
                     x_label='%s 1' % axis_prefix,
                     y_label='%s 2' % axis_prefix,
                     xmin=floor(first.min()),
                     xmax=ceil(first.max()),
                     ymin=floor(second.min()),
                     ymax=ceil(second.max()),
                     c_map='winter',
                     cc_color='red')
Ejemplo n.º 20
0
"""
Trace Plot
==========
"""
from msmbuilder.example_datasets import FsPeptide
from msmbuilder.featurizer import RMSDFeaturizer

import msmexplorer as msme

# Load Fs Peptide Data (only the first trajectory is used)
trajectories = FsPeptide().get().trajectories
traj = trajectories[0]

# Calculate RMSD of every frame against the trajectory's first frame
featurizer = RMSDFeaturizer(reference_traj=traj[0])
rmsd = featurizer.partial_transform(traj).flatten()

# Plot Trace
msme.plot_trace(
    rmsd,
    label='traj0',
    xlabel='Timestep',
    ylabel='RMSD (nm)',
)
def calculate_rmsd_mat_mdtraj(com_traj, ref_traj):
    """Return the RMSD features of ``com_traj`` measured against ``ref_traj``.

    Builds an msmbuilder ``RMSDFeaturizer`` with ``ref_traj`` as the
    reference and returns the result of its ``partial_transform`` on
    ``com_traj``.
    """
    from msmbuilder.featurizer import RMSDFeaturizer

    rmsd_featurizer = RMSDFeaturizer(ref_traj)
    return rmsd_featurizer.partial_transform(com_traj)
Ejemplo n.º 22
0
{{header}}

Meta
----
depends:
  - meta.pandas.pickl
  - trajs
  - top.pdb
"""
import mdtraj as md

from msmbuilder.io import load_meta, itertrajs, save_trajs, preload_top

## Load
# Metadata table plus the cluster centroids used as RMSD references.
meta = load_meta()
centroids = md.load("centroids.xtc", top=preload_top(meta))

## Kernel
SIGMA = 0.3  # nm
from msmbuilder.featurizer import RMSDFeaturizer
import numpy as np

# The same centroid-referenced featurizer is applied to every trajectory
# yielded by itertrajs(meta).
featurizer = RMSDFeaturizer(centroids)
lfeats = {}
for key, trajectory in itertrajs(meta):
    rmsd = featurizer.partial_transform(trajectory)
    # Gaussian kernel of width SIGMA: exp(-d^2 / (2 * sigma^2)).
    lfeats[key] = np.exp(-rmsd**2 / (2 * (SIGMA**2)))
save_trajs(lfeats, 'ftrajs', meta)
from msmbuilder.featurizer import RMSDFeaturizer
import matplotlib
matplotlib.use('Agg')
from matplotlib.pylab import plt
from multiprocessing import Pool
from utilities import msmb_feat
import numpy as np
import seaborn as sns
import pandas as pd
import sys

# Trajectory metadata and the topologies it refers to.
meta = load_meta()
tops = preload_tops(meta)

# Every trajectory is featurized against the same reference structure.
ref = md.load('topology.pdb')
feat = RMSDFeaturizer(reference_traj=ref)

# One argument tuple per metadata row: (row, featurizer, topologies).
args = zip(meta.iterrows(), [feat] * meta.shape[0], [tops] * meta.shape[0])

with Pool() as pool:
    ftrajs = dict(pool.imap_unordered(msmb_feat, args))

# Squeeze and pad short trajectories with NaN up to the longest one
# MSMBuilder does rmsd in nm so multiply by 10 to get angstroms
# The pad target must be the maximum frame count: taking the first unique
# value instead yields a negative pad length for any longer trajectory.
nframes = int(meta['nframes'].max())
ns_to_ang = 10
rtrajs = {}
for k, v in ftrajs.items():
    v = np.squeeze(v)
    diff = nframes - v.shape[0]
    v = np.append(v, np.zeros(diff) + np.nan) * ns_to_ang