Example #1
0
    def start(self):
        """Sample frames for every model state and save their indices as CSV.

        Featurizes ``self.filenames``, then for each state ``k`` listed in
        ``self.model`` computes a weight per frame with
        ``discrete_approx_mvn`` and draws ``self.args.n_per_state`` frames by
        inverse-CDF sampling over those weights.  One row per draw
        (``filename``, ``index``, ``state``) is written to ``self.out`` as
        CSV, preceded by a ``# command:`` line recording ``sys.argv``.

        Raises
        ------
        NotSatisfiableError
            Re-raised after ``self.error`` has reported the failure, in case
            ``self.error`` returns instead of terminating.
        """
        print('loading all data...')
        # NOTE(review): xx/ii/ff are presumably per-frame features, frame
        # indices and source filenames aligned element by element -- confirm
        # against mixtape.featurizer.featurize_all.
        xx, ii, ff = mixtape.featurizer.featurize_all(self.filenames, self.featurizer, self.topology)
        print('done loading')

        n_per_state = self.args.n_per_state
        data = {'filename': [], 'index': [], 'state': []}
        for k in range(self.model['n_states']):
            print('computing weights for k=%d...' % k)
            try:
                weights = discrete_approx_mvn(xx, self.model['means'][k],
                    self.model['vars'][k], self.match_vars)
            except NotSatisfiableError:
                self.error('Satisfiability failure. Could not match the means & '
                           'variances w/ discrete distribution. Try removing the '
                           'constraint on the variances with --no-match-vars?')
                # If self.error() returns, do not continue with `weights`
                # unbound (k == 0) or left over from the previous state.
                raise

            cumsum = np.cumsum(weights)
            # Normalise so the last entry is exactly 1.0.  np.random.rand()
            # lies in [0, 1), so without this a weight vector summing to
            # slightly less than 1 could yield index == len(weights).
            cumsum = cumsum / cumsum[-1]
            for _ in range(n_per_state):
                # Inverse-CDF draw: count the cumulative weights below a
                # uniform random number.
                index = np.sum(cumsum < np.random.rand())
                data['filename'].append(ff[index])
                data['index'].append(ii[index])
                data['state'].append(k)

        df = pd.DataFrame(data)
        print('Saving the indices of the sampled states in CSV format to %s' % self.out)
        with open(self.out, 'w') as f:
            # Record the invoking command line as a comment header.
            f.write("# command: %s\n" % ' '.join(sys.argv))
            df.to_csv(f)
Example #2
0
    def draw_samples(self, sequences, n_samples, scheme="even", match_vars=False):
        """Sample conformations from each state.

        Parameters
        ----------
        sequences : list
            List of 2-dimensional array observation sequences, each of which
            has shape (n_samples_i, n_features), where n_samples_i
            is the length of the i_th observation.
        n_samples : int
            How many samples to return from each state
        scheme : str, optional, default='even'
            Must be one of ['even', 'maxent'].
        match_vars : bool, default=False
            Flag for matching variances in maxent discrete approximation

        Returns
        -------
        selected_pairs_by_state : np.array, dtype=int, shape=(n_states, n_samples, 2)
            selected_pairs_by_state[state] gives an array of randomly selected
            (trj, frame) pairs from the specified state.

        Raises
        ------
        ValueError
            If `scheme` is not 'even' or 'maxent', or if scheme='even' and a
            state has no frames assigned to it while n_samples > 0.
        NotSatisfiableError
            With scheme='maxent', re-raised after ``self.error`` has reported
            the failure, in case ``self.error`` returns.

        Notes
        -----
        With scheme='even', this function assigns frames to states crisply then
        samples from the uniform distribution on the frames belonging to each
        state.
        With scheme='maxent', this scheme uses a maximum entropy method to
        determine a discrete distribution on samples whose mean (and possibly
        variance) matches the GHMM means.

        Only the (trj, frame) index pairs are returned; the feature values of
        the selected frames can be looked up from `sequences` with them.

        See Also
        --------
        utils.map_drawn_samples : Extract conformations from MD trajectories by index.
        GaussianFusionHMM.draw_centroids : Draw centers from GHMM

        ToDo
        ----
        This function could be separated into several MixIns for
        models with crisp and fuzzy state assignments.  Then we could have
        an optional argument that specifies which way to do the sampling
        from states--e.g. use either the base class function or a
        different one.
        """

        random = check_random_state(self.random_state)

        if scheme == 'even':
            # Hard-assign every frame to the state with the highest diagonal
            # Gaussian log-density.
            logprob = [sklearn.mixture.log_multivariate_normal_density(x, self.means_, self.vars_, covariance_type='diag') for x in sequences]
            ass = [lp.argmax(1) for lp in logprob]

            selected_pairs_by_state = []
            for state in range(self.n_states):
                all_frames = [np.where(a == state)[0] for a in ass]
                pairs = [(trj, frame) for (trj, frames) in enumerate(all_frames) for frame in frames]
                if not pairs and n_samples > 0:
                    # random.choice(0) would fail with an opaque numpy
                    # message; say which state is empty instead.
                    raise ValueError("no frames were assigned to state %d; "
                                     "cannot draw samples from it" % state)
                n_pairs = len(pairs)
                # Uniform draws (with replacement) over this state's frames.
                selected_pairs_by_state.append([pairs[random.choice(n_pairs)] for _ in range(n_samples)])

        elif scheme == "maxent":
            X_concat = np.concatenate(sequences)
            # Row i of all_pairs is the (trj, frame) origin of X_concat[i].
            all_pairs = np.array([(trj, frame) for trj, X in enumerate(sequences) for frame in range(X.shape[0])])
            selected_pairs_by_state = []
            for k in range(self.n_states):
                print('computing weights for k=%d...' % k)
                try:
                    weights = discrete_approx_mvn(X_concat, self.means_[k], self.vars_[k], match_vars)
                except NotSatisfiableError:
                    # NOTE(review): the message refers to a command-line flag;
                    # confirm that this class actually provides `error`.
                    self.error('Satisfiability failure. Could not match the means & '
                               'variances w/ discrete distribution. Try removing the '
                               'constraint on the variances with --no-match-vars?')
                    # If self.error() returns, do not continue with `weights`
                    # unbound (k == 0) or left over from the previous state.
                    raise

                # Renormalise (without mutating the returned array) so the
                # probabilities passed to choice() sum to one.
                weights = weights / weights.sum()
                frames = random.choice(len(all_pairs), n_samples, p=weights)
                selected_pairs_by_state.append(all_pairs[frames])

        else:
            raise ValueError("scheme must be one of ['even', 'maxent']")

        return np.array(selected_pairs_by_state)