def start(self):
    """Sample frames for every model state and save them as CSV.

    All input files are featurized in one pass. For each state ``k`` of
    ``self.model`` a discrete distribution over the featurized rows is
    computed with ``discrete_approx_mvn`` (targeting the state's mean and,
    when ``self.match_vars`` is set, its variance), and
    ``self.args.n_per_state`` rows are drawn from it by inverse-CDF
    sampling. The filename, index and state of every draw are written to
    ``self.out`` as CSV, preceded by a ``# command:`` header line holding
    the invoking command line.

    Raises
    ------
    NotSatisfiableError
        Re-raised after ``self.error`` has reported it, in case
        ``self.error`` returns instead of terminating the program.
    """
    print('loading all data...')
    # NOTE(review): xx is presumably the stacked feature rows, with ii/ff the
    # per-row frame index and source filename; they are only ever indexed by
    # sampled row number below -- confirm against featurize_all.
    xx, ii, ff = mixtape.featurizer.featurize_all(
        self.filenames, self.featurizer, self.topology)
    print('done loading')

    data = {'filename': [], 'index': [], 'state': []}
    for k in range(self.model['n_states']):
        print('computing weights for k=%d...' % k)
        try:
            weights = discrete_approx_mvn(
                xx, self.model['means'][k], self.model['vars'][k],
                self.match_vars)
        except NotSatisfiableError:
            self.error('Satisfiability failure. Could not match the means & '
                       'variances w/ discrete distribution. Try removing the '
                       'constraint on the variances with --no-match-vars?')
            # Never fall through: `weights` would be unbound for k == 0 or,
            # worse, silently reused from the previous state.
            raise

        # Normalise so the final cumulative value is exactly 1.0. Since
        # np.random.rand() is drawn from [0, 1), the sampled index can then
        # never run past the last row, even if the weights only sum to 1 up
        # to round-off.
        cumsum = np.cumsum(weights)
        cumsum = cumsum / cumsum[-1]
        for _ in range(self.args.n_per_state):
            # Inverse-CDF draw: count the cumulative weights below a uniform
            # random number.
            index = np.sum(cumsum < np.random.rand())
            data['filename'].append(ff[index])
            data['index'].append(ii[index])
            data['state'].append(k)

    df = pd.DataFrame(data)
    print('Saving the indices of the sampled states in CSV format to %s'
          % self.out)
    with open(self.out, 'w') as f:
        # Record how the file was produced ahead of the CSV payload.
        f.write("# command: %s\n" % ' '.join(sys.argv))
        df.to_csv(f)
def draw_samples(self, sequences, n_samples, scheme="even", match_vars=False):
    """Sample conformations from each state.

    Parameters
    ----------
    sequences : list
        List of 2-dimensional array observation sequences, each of which
        has shape (n_samples_i, n_features), where n_samples_i
        is the length of the i_th observation.
    n_samples : int
        How many samples to return from each state
    scheme : str, optional, default='even'
        Must be one of ['even', 'maxent'].
    match_vars : bool, default=False
        Flag for matching variances in maxent discrete approximation.
        Only used when scheme='maxent'.

    Returns
    -------
    selected_pairs_by_state : np.array, dtype=int, shape=(n_states, n_samples, 2)
        selected_pairs_by_state[state] gives an array of randomly selected
        (trj, frame) pairs from the specified state. The features of a
        selected sample are ``sequences[trj][frame]``; they are not
        returned by this method.

    Raises
    ------
    ValueError
        If `scheme` is not one of the supported values, or if
        scheme='even' and some state has no frames assigned to it.
    NotSatisfiableError
        With scheme='maxent', re-raised after ``self.error`` has reported
        it, in case ``self.error`` returns instead of terminating.

    Notes
    -----
    With scheme='even', this function assigns frames to states crisply
    then samples from the uniform distribution on the frames belonging
    to each state. With scheme='maxent', this scheme uses a maximum
    entropy method to determine a discrete distribution on samples
    whose mean (and possibly variance) matches the GHMM means.

    This function could be separated into several MixIns for
    models with crisp and fuzzy state assignments. Then we could have
    an optional argument that specifies which way to do the sampling
    from states--e.g. use either the base class function or a
    different one.

    See Also
    --------
    utils.map_drawn_samples : Extract conformations from MD trajectories by index.
    GaussianFusionHMM.draw_centroids : Draw centers from GHMM
    """
    # Fail fast on a bad scheme, before any work is done.
    if scheme not in ('even', 'maxent'):
        raise ValueError("scheme must be one of ['even', 'maxent']")

    random = check_random_state(self.random_state)
    selected_pairs_by_state = []

    if scheme == 'even':
        # Crisp assignment: each frame goes to the state with the highest
        # log-density under the diagonal-covariance Gaussians.
        logprob = [
            sklearn.mixture.log_multivariate_normal_density(
                x, self.means_, self.vars_, covariance_type='diag')
            for x in sequences
        ]
        assignments = [lp.argmax(1) for lp in logprob]

        for state in range(self.n_states):
            pairs = [
                (trj, frame)
                for trj, assigned in enumerate(assignments)
                for frame in np.where(assigned == state)[0]
            ]
            if not pairs:
                raise ValueError(
                    'No frames are assigned to state %d; cannot draw '
                    "samples from it with scheme='even'." % state)
            # One draw per sample (rather than a single vectorised call)
            # so seeded results stay reproducible.
            selected_pairs_by_state.append(
                [pairs[random.choice(len(pairs))] for _ in range(n_samples)])
    else:  # scheme == 'maxent'
        X_concat = np.concatenate(sequences)
        # Row i of all_pairs is the (trj, frame) origin of X_concat[i].
        all_pairs = np.array([
            (trj, frame)
            for trj, X in enumerate(sequences)
            for frame in range(X.shape[0])
        ])
        for k in range(self.n_states):
            print('computing weights for k=%d...' % k)
            try:
                weights = discrete_approx_mvn(
                    X_concat, self.means_[k], self.vars_[k], match_vars)
            except NotSatisfiableError:
                # NOTE(review): `self.error` is not defined in the visible
                # code -- confirm the enclosing class provides it.
                self.error('Satisfiability failure. Could not match the means & '
                           'variances w/ discrete distribution. Try removing the '
                           'constraint on the variances with --no-match-vars?')
                # Never fall through with unbound or stale `weights`.
                raise
            # Renormalise into a new array: `choice` requires p to sum to 1,
            # and the helper's return value should not be mutated in place.
            weights = weights / weights.sum()
            frames = random.choice(len(all_pairs), n_samples, p=weights)
            selected_pairs_by_state.append(all_pairs[frames])

    return np.array(selected_pairs_by_state)