def decompose_features(features, decomposer, n_components=None, lag_time=1):
    '''
    Reduce the dimension of featurized trajectories with PCA or tICA.

    Each component is a vector in the original feature space, of dimension
    (n_features,). A sample of shape (n_features,) is projected onto the new
    space by applying the transform matrix of shape (n_components, n_features),
    which yields a vector of shape (n_components,).

    --------------------------------------------------------------------------------------------------------------------------------------
    Input
    features         : array-like, length n_trajs, each of shape (n_samples, n_features)
    decomposer       : str, either 'PCA' or 'tICA'
    n_components     : passed through to the decomposer; when None the decomposer
                       decides how many components to keep
                       (NOTE(review): confirm the exact default against msmbuilder)
    lag_time         : passed to tICA only; ignored for PCA

    Output
    features_new     : array-like, length n_trajs, each of shape (n_samples, n_components)

    dcmp.components_ : shape (n_components, n_features)
        PCA  : Principal axes in feature space, representing the directions of maximum variance in the data.
        tICA : Components with maximum autocorrelation.

    Raises
    ValueError       : if decomposer is neither 'PCA' nor 'tICA'
    '''
    # Imports stay local so msmbuilder is only required when actually used.
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    else:
        # Without this branch `dcmp` would be unbound and the call below
        # would fail with an unhelpful UnboundLocalError.
        raise ValueError(
            "Unknown decomposer %r; expected 'PCA' or 'tICA'" % (decomposer,))
    features_new = dcmp.fit_transform(features)
    return features_new, dcmp.components_
def fit_protein_pca(yaml_file):
    """Incrementally fit a single PCA model over all proteins and dump it.

    Reads from ``yaml_file`` (a mapping):
      - "mdl_dir": directory that receives ``pca_mdl.pkl``
      - "mdl_params": model parameters; only keys starting with ``pca__``
        are used, with that prefix stripped, as keyword arguments to PCA
      - "protein_list": proteins to iterate over
      - "feature_dir": sub-directory (relative to each protein's data
        directory) that holds the ``*.jl`` featurized trajectories

    Fitting is best-effort: a trajectory whose ``partial_fit`` fails is
    reported and skipped so the remaining trajectories are still used.

    Returns None; the fitted model is written with ``verbosedump``.
    """
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Slice the prefix off instead of split("pca__")[1], which would truncate
    # a parameter name that itself contains "pca__".
    prefix = "pca__"
    current_mdl_params = {
        key[len(prefix):]: value
        for key, value in mdl_params.items()
        if key.startswith(prefix)
    }

    protein_pca_mdl = PCA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_pca_mdl.partial_fit(featurized_path)
                except Exception as exc:
                    # Stay best-effort, but make the skip visible and let
                    # KeyboardInterrupt/SystemExit propagate.
                    print("Skipping %s: partial_fit failed (%r)" % (f, exc))
            print("Done partial fitting to protein %s" % protein)
    # dumping the pca_mdl
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    verbosedump(protein_pca_mdl, pca_mdl_path)
Ejemplo n.º 3
0
def decompose_features(features, decomposer, n_components=None, lag_time=1):
    '''
    Reduce the dimension of featurized trajectories with PCA or tICA.

    Input
    features     : list of arrays, length n_trajs, each of shape (n_samples, n_features)
    decomposer   : str, either 'PCA' or 'tICA'
    n_components : passed through to the decomposer
    lag_time     : passed to tICA only; ignored for PCA

    Output
    features_new : list of arrays, length n_trajs, each of shape (n_samples, n_features_new)

    Raises
    ValueError   : if decomposer is neither 'PCA' nor 'tICA'
    '''
    # Imports stay local so msmbuilder is only required when actually used.
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    else:
        # Without this branch `dcmp` would be unbound and the call below
        # would fail with an unhelpful UnboundLocalError.
        raise ValueError(
            "Unknown decomposer %r; expected 'PCA' or 'tICA'" % (decomposer,))
    return dcmp.fit_transform(features)
Ejemplo n.º 4
0
def test_1():
    # The PCA under test, fit on the list of trajectories, must agree with
    # the reference implementation (PCAr) fit on the concatenated frames.
    reference = PCAr()
    reference.fit(np.concatenate(trajs))

    model = PCA()
    model.fit(trajs)

    # Projection of the first trajectory.
    expected_first = reference.transform(trajs[0])
    actual_first = model.transform(trajs)[0]
    np.testing.assert_array_almost_equal(expected_first, actual_first)

    # Fitted attributes, compared in a fixed order.
    fitted_attributes = (
        "components_",
        "explained_variance_",
        "mean_",
        "n_components_",
        "noise_variance_",
    )
    for attribute in fitted_attributes:
        np.testing.assert_array_almost_equal(getattr(model, attribute),
                                             getattr(reference, attribute))
Ejemplo n.º 5
0
def test_1():
    # The PCA under test, fit on the list of trajectories, must agree with
    # the reference implementation (PCAr) fit on the concatenated frames.
    reference = PCAr()
    reference.fit(np.concatenate(trajs))

    model = PCA()
    model.fit(trajs)

    # Projection of the first trajectory.
    expected_first = reference.transform(trajs[0])
    actual_first = model.transform(trajs)[0]
    np.testing.assert_array_almost_equal(expected_first, actual_first)

    # Fitted attributes, compared in a fixed order.
    fitted_attributes = (
        "components_",
        "explained_variance_",
        "mean_",
        "n_components_",
        "noise_variance_",
    )
    for attribute in fitted_attributes:
        np.testing.assert_array_almost_equal(getattr(model, attribute),
                                             getattr(reference, attribute))
Ejemplo n.º 6
0
def test_generator():
    # Fitting must also work when the trajectories are supplied as a
    # non-list iterable.
    traj_dict = dict(enumerate(trajs))

    reference = PCAr()
    reference.fit(np.concatenate(trajs))

    model = PCA()
    # On Python 3, dict.values() is a view object rather than a list.
    model.fit(traj_dict.values())

    # Projection of the first trajectory.
    expected_first = reference.transform(trajs[0])
    actual_first = model.transform(trajs)[0]
    np.testing.assert_array_almost_equal(expected_first, actual_first)

    # Fitted attributes, compared in a fixed order.
    fitted_attributes = (
        "components_",
        "explained_variance_",
        "mean_",
        "n_components_",
        "noise_variance_",
    )
    for attribute in fitted_attributes:
        np.testing.assert_array_almost_equal(getattr(model, attribute),
                                             getattr(reference, attribute))
Ejemplo n.º 7
0
def test_generator():
    # Fitting must also work when the trajectories are supplied as a
    # non-list iterable.
    traj_dict = dict(enumerate(trajs))

    reference = PCAr()
    reference.fit(np.concatenate(trajs))

    model = PCA()
    # On Python 3, dict.values() is a view object rather than a list.
    model.fit(traj_dict.values())

    # Projection of the first trajectory.
    expected_first = reference.transform(trajs[0])
    actual_first = model.transform(trajs)[0]
    np.testing.assert_array_almost_equal(expected_first, actual_first)

    # Fitted attributes, compared in a fixed order.
    fitted_attributes = (
        "components_",
        "explained_variance_",
        "mean_",
        "n_components_",
        "noise_variance_",
    )
    for attribute in fitted_attributes:
        np.testing.assert_array_almost_equal(getattr(model, attribute),
                                             getattr(reference, attribute))
Ejemplo n.º 8
0
def test_2():
    # Test that PCA works as a step in an msmbuilder pipeline.
    steps = [('pca', PCA()), ('cluster', KCenters())]
    pipeline = Pipeline(steps)
    pipeline.fit(trajs)
Ejemplo n.º 9
0
class SolventShellsAnalysis():
    """Do analysis on solvent shell results.

    The protocol is as follows:
        1. Normalize by shell volume
        2. Flatten to 2d (for compatibility with tICA, et al.)
        3. Remove zero-variance features

    Each stage is computed lazily on first access and then cached.

    :param seqs: Sequences of counts. List of shape
                 (n_frames, n_solute, n_shells) arrays
    :param shell_w: Shell width (nm)

    """

    def __init__(self, seqs, shell_w):
        self._seqs3d_unnormed = seqs
        # Lazily-filled caches for the derived representations.
        self._seqs3d = None
        self._seqs2d_unpruned = None
        self._seqs2d = None
        self._deleted = None
        self.shell_w = shell_w

        # Fitted models (set by fit_tica / fit_pca).
        self.tica = None
        self.pca = None

        # Transformed sequences from the fitted models.
        self.ticax = None
        self.pcax = None

    @property
    def seqs3d_unnormed(self):
        """Unnormalized (input) sequences"""
        return self._seqs3d_unnormed

    @property
    def seqs3d(self):
        """Normalized 3d sequences."""
        if self._seqs3d is None:
            self._seqs3d = [normalize(fp3d, self.shell_w) for fp3d in
                            self.seqs3d_unnormed]
        return self._seqs3d

    @property
    def seqs2d_unpruned(self):
        """Reshaped (2D) sequences."""
        if self._seqs2d_unpruned is None:
            self._seqs2d_unpruned = [reshape(fp3d) for fp3d in self.seqs3d]
        return self._seqs2d_unpruned

    def _prune(self):
        """Run prune_all once and cache both of its results."""
        self._seqs2d, self._deleted = prune_all(self.seqs2d_unpruned)

    @property
    def seqs2d(self):
        """Reshaped with zero-variance features removed.

        Input this to tICA, MSM, etc.
        """
        if self._seqs2d is None:
            self._prune()
        return self._seqs2d

    @property
    def deleted(self):
        """Which features (2d-indexing) we deleted."""
        if self._deleted is None:
            self._prune()
        return self._deleted

    def fit_tica(self, lag_time, n_components=10):
        """Fit tICA on ``seqs2d`` and store the model and its projection.

        :param lag_time: passed to tICA as ``lag_time``
        :param n_components: number of components to keep (default 10)

        Sets ``self.tica`` (the model) and ``self.ticax`` (transformed data).
        """
        self.tica = tICA(n_components=n_components, lag_time=lag_time,
                         weighted_transform=True)
        self.tica.fit(self.seqs2d)
        self.ticax = self.tica.transform(self.seqs2d)

    def fit_pca(self, n_components=10):
        """Fit PCA on ``seqs2d`` and store the model and its projection.

        :param n_components: number of components to keep (default 10)

        Sets ``self.pca`` (the model) and ``self.pcax`` (transformed data).
        """
        self.pca = PCA(n_components=n_components)
        self.pca.fit(self.seqs2d)
        self.pcax = self.pca.transform(self.seqs2d)
Ejemplo n.º 10
0
 def fit_pca(self, n_components=10):
     """Fit PCA on ``self.seqs2d`` and store the model and its projection.

     :param n_components: number of components to keep (default 10)

     Sets ``self.pca`` (the model) and ``self.pcax`` (transformed data).
     """
     self.pca = PCA(n_components=n_components)
     self.pca.fit(self.seqs2d)
     self.pcax = self.pca.transform(self.seqs2d)
Ejemplo n.º 11
0
class SolventShellsAnalysis():
    """Do analysis on solvent shell results.

    The protocol is as follows:
        1. Normalize by shell volume
        2. Flatten to 2d (for compatibility with tICA, et al.)
        3. Remove zero-variance features

    Each stage is computed lazily on first access and then cached.

    :param seqs: Sequences of counts. List of shape
                 (n_frames, n_solute, n_shells) arrays
    :param shell_w: Shell width (nm)

    """
    def __init__(self, seqs, shell_w):
        self._seqs3d_unnormed = seqs
        # Lazily-filled caches for the derived representations.
        self._seqs3d = None
        self._seqs2d_unpruned = None
        self._seqs2d = None
        self._deleted = None
        self.shell_w = shell_w

        # Fitted models (set by fit_tica / fit_pca).
        self.tica = None
        self.pca = None

        # Transformed sequences from the fitted models.
        self.ticax = None
        self.pcax = None

    @property
    def seqs3d_unnormed(self):
        """Unnormalized (input) sequences"""
        return self._seqs3d_unnormed

    @property
    def seqs3d(self):
        """Normalized 3d sequences."""
        if self._seqs3d is None:
            self._seqs3d = [
                normalize(fp3d, self.shell_w) for fp3d in self.seqs3d_unnormed
            ]
        return self._seqs3d

    @property
    def seqs2d_unpruned(self):
        """Reshaped (2D) sequences."""
        if self._seqs2d_unpruned is None:
            self._seqs2d_unpruned = [reshape(fp3d) for fp3d in self.seqs3d]
        return self._seqs2d_unpruned

    def _prune(self):
        """Run prune_all once and cache both of its results."""
        self._seqs2d, self._deleted = prune_all(self.seqs2d_unpruned)

    @property
    def seqs2d(self):
        """Reshaped with zero-variance features removed.

        Input this to tICA, MSM, etc.
        """
        if self._seqs2d is None:
            self._prune()
        return self._seqs2d

    @property
    def deleted(self):
        """Which features (2d-indexing) we deleted."""
        if self._deleted is None:
            self._prune()
        return self._deleted

    def fit_tica(self, lag_time, n_components=10):
        """Fit tICA on ``seqs2d`` and store the model and its projection.

        :param lag_time: passed to tICA as ``lag_time``
        :param n_components: number of components to keep (default 10)

        Sets ``self.tica`` (the model) and ``self.ticax`` (transformed data).
        """
        self.tica = tICA(n_components=n_components,
                         lag_time=lag_time,
                         weighted_transform=True)
        self.tica.fit(self.seqs2d)
        self.ticax = self.tica.transform(self.seqs2d)

    def fit_pca(self, n_components=10):
        """Fit PCA on ``seqs2d`` and store the model and its projection.

        :param n_components: number of components to keep (default 10)

        Sets ``self.pca`` (the model) and ``self.pcax`` (transformed data).
        """
        self.pca = PCA(n_components=n_components)
        self.pca.fit(self.seqs2d)
        self.pcax = self.pca.transform(self.seqs2d)
Ejemplo n.º 12
0
 def fit_pca(self, n_components=10):
     """Fit PCA on ``self.seqs2d`` and store the model and its projection.

     :param n_components: number of components to keep (default 10)

     Sets ``self.pca`` (the model) and ``self.pcax`` (transformed data).
     """
     self.pca = PCA(n_components=n_components)
     self.pca.fit(self.seqs2d)
     self.pcax = self.pca.transform(self.seqs2d)