def _start(self):
    print("model")
    print(self.model_dict)
    n_features = int(self.model_dict['n_features'])
    n_states = int(self.model_dict['n_states'])
    self.model = MetastableSwitchingLDS(n_states, n_features)
    self.model.load_from_json_dict(self.model_dict)
    obs, hidden_states = self.model.sample(self.args.n_samples)
    (n_samples, n_features) = np.shape(obs)
    features, ii, ff = mixtape.featurizer.featurize_all(
        self.filenames, self.featurizer, self.topology, self.stride)
    file_trajectories = []

    states = []
    state_indices = []
    state_files = []
    logprob = log_multivariate_normal_density(
        features, np.array(self.model.means_),
        np.array(self.model.covars_), covariance_type='full')
    assignments = np.argmax(logprob, axis=1)
    probs = np.max(logprob, axis=1)

    # Presort the data into the metastable wells, i.e. separate the
    # original trajectories into k buckets corresponding to the wells.
    for k in range(int(self.model.n_states)):
        # pick the structures that have the highest log probability
        # in the state
        s = features[assignments == k]
        ind = ii[assignments == k]
        f = ff[assignments == k]
        states.append(s)
        state_indices.append(ind)
        state_files.append(f)

    # Loop over the generated feature-space trajectory. At time t, pick
    # the frame from the original trajectories closest to the current
    # sample in feature space. To save a bit of computation, search only
    # in the bucket corresponding to the current metastable well (i.e.,
    # the current hidden state).
    traj = None
    for t in range(n_samples):
        featurized_frame = obs[t]
        h = hidden_states[t]
        logprob = log_multivariate_normal_density(
            states[h], featurized_frame[np.newaxis],
            self.model.Qs_[h][np.newaxis], covariance_type='full')
        best_frame_pos = np.argmax(logprob, axis=0)[0]
        best_file = state_files[h][best_frame_pos]
        best_ind = state_indices[h][best_frame_pos]
        frame = md.load_frame(best_file, best_ind, self.topology)
        if t == 0:
            traj = frame
        else:
            frame.superpose(traj, t - 1)
            traj = traj.join(frame)

    traj.save('%s.xtc' % self.out)
    traj[0].save('%s.xtc.pdb' % self.out)
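# A minimal standalone sketch (names and shapes are illustrative, not taken
# from the class above) of the matching step in _start: the sampled feature
# vector is treated as the mean of a single Gaussian with the state's noise
# covariance, every stored frame in that state's bucket is scored against
# it, and argmax picks the closest stored frame. Note that
# log_multivariate_normal_density was removed from recent scikit-learn
# releases; this assumes an older version where it is importable.
import numpy as np
from sklearn.mixture import log_multivariate_normal_density

n_frames, n_features = 100, 4
bucket = np.random.rand(n_frames, n_features)   # stored frames for one state
sample = np.random.rand(n_features)             # one generated feature vector
Q = np.eye(n_features)                          # state noise covariance

scores = log_multivariate_normal_density(
    bucket, sample[np.newaxis], Q[np.newaxis], covariance_type='full')
closest = np.argmax(scores, axis=0)[0]          # index of the best frame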
def _exact_loglikelihood(self, ob):
    log_transmat = np.zeros((self.n_chains, self.n_states, self.n_states))
    log_startprob = np.zeros((self.n_chains, self.n_states))
    for idx, chain in enumerate(self.chains_):
        log_transmat[idx] = chain._log_transmat
        log_startprob[idx] = chain._log_startprob
    n_state_combinations = self.n_states ** self.n_chains
    state_combinations = [tuple(x) for x in itertools.product(
        np.arange(self.n_states), repeat=self.n_chains)]
    n_observations = ob.shape[0]
    n_features = ob.shape[1]
    fwdlattice = np.zeros((n_observations, n_state_combinations))

    # Calculate means and covariances for all state combinations and
    # calculate the emission probabilities.
    weight = 1.0 / float(self.n_chains)
    weight_squared = weight * weight
    covars = np.zeros((n_state_combinations, n_features))  # TODO: add support for all covariance types
    means = np.zeros((n_state_combinations, n_features))
    for idx, state_combination in enumerate(state_combinations):
        for chain_idx, state in enumerate(state_combination):
            chain = self.chains_[chain_idx]
            covars[idx] += chain._covars_[state]
            means[idx] += chain._means_[state]
        covars[idx] *= weight_squared
        means[idx] *= weight
    framelogprob = log_multivariate_normal_density(
        ob, means, covars, covariance_type='diag')  # TODO: add support for all covariance types

    # Run the forward algorithm
    fhmmc._forward(n_observations, self.n_chains, self.n_states,
                   state_combinations, log_startprob, log_transmat,
                   framelogprob, fwdlattice)
    last_column = fwdlattice[-1]
    assert np.size(last_column) == n_state_combinations
    score = logsumexp(last_column)
    return score
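# Self-contained sketch of the emission-combination step above: in a
# factorial HMM, the joint emission mean for each state combination is the
# weighted sum of the per-chain state means. Shapes and values here are toy
# illustrations, not taken from the class above.
import itertools
import numpy as np

n_chains, n_states, n_features = 2, 3, 4
chain_means = np.random.rand(n_chains, n_states, n_features)
combos = [tuple(x) for x in itertools.product(range(n_states),
                                              repeat=n_chains)]

means = np.zeros((len(combos), n_features))
for idx, combo in enumerate(combos):
    for chain_idx, state in enumerate(combo):
        means[idx] += chain_means[chain_idx, state]
means *= 1.0 / n_chains   # the `weight` factor in _exact_loglikelihood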
def start(self):
    featurizer = mixtape.featurizer.load(self.args.featurizer)
    features, ii, ff = mixtape.featurizer.featurize_all(
        self.filenames, featurizer, self.topology, self.stride)
    logprob = log_multivariate_normal_density(
        features, np.array(self.model['means']),
        np.array(self.model['vars']), covariance_type='diag')
    assignments = np.argmax(logprob, axis=1)
    probs = np.max(logprob, axis=1)

    data = {'filename': [], 'index': [], 'state': []}
    for k in range(self.model['n_states']):
        # pick the structures that have the highest log probability
        # in the state
        p = probs[assignments == k]
        sorted_filenms = ff[assignments == k][p.argsort()]
        sorted_indices = ii[assignments == k][p.argsort()]
        if len(p) > 0:
            selected_indices = sorted_indices[-self.args.n_per_state:]
            selected_filenms = sorted_filenms[-self.args.n_per_state:]
            assert len(selected_indices) == len(selected_filenms)
            data['index'].extend(selected_indices)
            data['filename'].extend(selected_filenms)
            data['state'].extend([k] * len(selected_indices))
        else:
            print('WARNING: NO STRUCTURES ASSIGNED TO STATE=%d' % k)

    df = pd.DataFrame(data)
    print('Saving the indices of the selected frames in CSV format to %s'
          % self.out)
    with open(self.out, 'w') as f:
        f.write('# command: %s\n' % ' '.join(sys.argv))
        df.to_csv(f)
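# Why the slice above selects the best frames: argsort sorts ascending, so
# the last n_per_state entries are the highest-probability frames. A toy
# illustration with made-up values:
import numpy as np

p = np.array([0.1, 0.9, 0.4, 0.7])
order = p.argsort()     # array([0, 2, 3, 1]): ascending by probability
top2 = order[-2:]       # indices of the two most probable entries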
def get_word_ll_for_topics(self):
    """Calculate log P(w|z) for all words (rows), given each topic (columns).

    Returns a matrix that is V x K, where V is the number of words in the
    vocabulary and K is the number of topics.
    """
    word_vectors = self._w2v_model.syn0
    return log_multivariate_normal_density(word_vectors,
                                           self._gmm_model.means_,
                                           self._gmm_model.covars_,
                                           self._gmm_model.covariance_type)
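# Hypothetical usage sketch (the `model` instance and how it is built are
# assumptions; only the method above is from the source): the V x K matrix
# of log densities gives each word's most likely topic via a column argmax.
word_ll = model.get_word_ll_for_topics()       # shape (V, K)
best_topic_per_word = word_ll.argmax(axis=1)   # one topic index per word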
def score(self, X, return_responsibilities=False):
    nc = len(self.weights)
    X = numpy.array(X)
    if X.ndim == 1:
        X = X[:, None]
    if X.shape[1] != self.means.shape[1]:
        raise ValueError('The shape of X is not compatible with self')

    mins = self.lims[:, 0]
    maxes = self.lims[:, 1]
    lpr = numpy.log(self.weights) + \
        log_multivariate_normal_density(X, self.means, self.covs, 'full')
    mask = (X >= mins[None, :]).all(axis=-1)
    mask &= (X <= maxes[None, :]).all(axis=-1)
    logprob = logsumexp(lpr, axis=1)
    logprob[~mask] = -numpy.inf
    if return_responsibilities:
        responsibilities = numpy.exp(lpr - logprob[:, None])
        responsibilities[~mask] = 0
        return logprob, responsibilities
    return logprob
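# Hedged usage sketch for score() above (how the instance is constructed is
# an assumption; only the method's contract is shown): inside the box
# defined by self.lims, the responsibilities are mixture posteriors that
# sum to 1 per row; outside it, logprob is -inf and responsibilities are 0.
logprob, resp = model.score(X, return_responsibilities=True)
assert resp.shape == (len(X), len(model.weights))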
def _compute_log_likelihood(self, seq):
    state_combinations = [tuple(x) for x in itertools.product(
        np.arange(self.n_states), repeat=self.n_chains)]
    n_state_combinations = self.n_states ** self.n_chains
    n_observations, n_features = seq.shape
    covars = np.array([self.covar
                       for _ in xrange(n_state_combinations)])  # TODO: correct?!
    means = np.zeros((n_state_combinations, n_features))
    for idx, state_combination in enumerate(state_combinations):
        for chain_idx, state in enumerate(state_combination):
            means[idx] += self.means[chain_idx, state]
    framelogprob = log_multivariate_normal_density(seq, means, covars,
                                                   covariance_type='full')
    return framelogprob
def test_framelogprob_reshape(self):
    n_states = 3
    n_chains = 2
    n_state_combinations = n_states ** n_chains
    state_combinations = [tuple(x) for x in itertools.product(
        np.arange(n_states), repeat=n_chains)]
    covars = np.random.random((n_state_combinations, 10))
    means = np.random.random((n_state_combinations, 10))
    ob = np.random.random((5, 10))
    framelogprob = log_multivariate_normal_density(ob, means, covars,
                                                   covariance_type='diag')

    # This test ensures that reshaping the framelogprob still yields the
    # correct state variables.
    reshaped_framelogprob = framelogprob.reshape((5, n_states, n_states))
    for ob_idx in xrange(5):
        for idx, state_combination in enumerate(state_combinations):
            self.assertEqual(
                reshaped_framelogprob[ob_idx][state_combination],
                framelogprob[ob_idx][idx])
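# Why the reshape in the test works: itertools.product enumerates state
# combinations in row-major (C) order, which is exactly the layout numpy
# uses when reshaping the flat combination axis into an
# (n_states, n_states) block. Minimal standalone check:
import itertools
import numpy as np

combos = [tuple(x) for x in itertools.product(range(3), repeat=2)]
flat = np.arange(9)
grid = flat.reshape((3, 3))
for idx, combo in enumerate(combos):
    assert grid[combo] == flat[idx]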
def start(self):
    featurizer = mixtape.featurizer.load(self.args.featurizer)
    features, ii, ff = mixtape.featurizer.featurize_all(
        self.filenames, featurizer, self.topology)
    logprob = log_multivariate_normal_density(
        features, np.array(self.model['means']),
        np.array(self.model['vars']), covariance_type='diag')
    assignments = np.argmax(logprob, axis=1)
    probs = np.max(logprob, axis=1)

    data = {'filename': [], 'index': [], 'state': []}
    for k in range(self.model['n_states']):
        # pick the structures that have the highest log probability
        # in the state
        p = probs[assignments == k]
        sorted_filenms = ff[assignments == k][p.argsort()]
        sorted_indices = ii[assignments == k][p.argsort()]
        if len(p) > 0:
            selected_indices = sorted_indices[-self.args.n_per_state:]
            selected_filenms = sorted_filenms[-self.args.n_per_state:]
            data['index'].extend(selected_indices)
            data['filename'].extend(selected_filenms)
            # use the actual selection length, not n_per_state, in case a
            # state has fewer than n_per_state assigned frames
            data['state'].extend([k] * len(selected_indices))
        else:
            print('WARNING: NO STRUCTURES ASSIGNED TO STATE=%d' % k)

    df = pd.DataFrame(data)
    print('Saving the indices of the selected frames in CSV format to %s'
          % self.out)
    with open(self.out, 'w') as f:
        f.write('# command: %s\n' % ' '.join(sys.argv))
        df.to_csv(f)
def _compute_log_likelihood(self, X):
    """Compute the log likelihood of feature matrix X."""
    return gmm.log_multivariate_normal_density(X, self.means_, self.covars_,
                                               self.covariance_type)
def test_alanine_dipeptide():
    import pdb, traceback, sys
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    try:
        b = fetch_alanine_dipeptide()
        trajs = b.trajectories
        n_seq = len(trajs)
        n_frames = trajs[0].n_frames
        n_atoms = trajs[0].n_atoms
        n_features = n_atoms * 3
        sim_T = 1000
        data_home = get_data_home()
        data_dir = join(data_home, TARGET_DIRECTORY_ALANINE)
        top = md.load(join(data_dir, 'ala2.pdb'))
        n_components = 2

        # Superpose each trajectory on the topology and flatten to features
        data = []
        for traj in trajs:
            traj.superpose(top)
            Z = traj.xyz
            Z = np.reshape(Z, (len(Z), n_features), order='F')
            data.append(Z)

        # Fit MSLDS model
        n_experiments = 1
        n_em_iter = 1
        tol = 1e-1
        model = MetastableSwitchingLDS(n_components, n_features,
                                       n_experiments=n_experiments,
                                       n_em_iter=n_em_iter)
        model.fit(data, gamma=.1, tol=tol, verbose=True)
        mslds_score = model.score(data)
        print("MSLDS Log-Likelihood = %f" % mslds_score)

        # Fit Gaussian HMM for comparison
        g = GaussianFusionHMM(n_components, n_features)
        g.fit(data)
        hmm_score = g.score(data)
        print("HMM Log-Likelihood = %f" % hmm_score)
        print()

        # Generate a trajectory from the learned model.
        sample_traj, hidden_states = model.sample(sim_T)
        states = []
        for k in range(n_components):
            states.append([])

        # Presort the data into the metastable wells
        for k in range(n_components):
            for i in range(len(trajs)):
                traj = trajs[i]
                Z = traj.xyz
                Z = np.reshape(Z, (len(Z), n_features), order='F')
                logprob = log_multivariate_normal_density(
                    Z, np.array(model.means_), np.array(model.covars_),
                    covariance_type='full')
                assignments = np.argmax(logprob, axis=1)
                # probs = np.max(logprob, axis=1)
                # pick structures that have highest log probability in state
                s = traj[assignments == k]
                states[k].append(s)

        # Pick the frame from the original trajectories closest to each
        # sample in feature space
        gen_traj = None
        for t in range(sim_T):
            h = hidden_states[t]
            best_logprob = -np.inf
            best_frame = None
            for i in range(len(trajs)):
                if t > 0:
                    states[h][i].superpose(gen_traj, t - 1)
                Z = states[h][i].xyz
                Z = np.reshape(Z, (len(Z), n_features), order='F')
                mean = sample_traj[t]
                logprobs = log_multivariate_normal_density(
                    Z, mean[np.newaxis], model.Qs_[h][np.newaxis],
                    covariance_type='full')
                ind = np.argmax(logprobs, axis=0)[0]
                logprob = logprobs[ind, 0]
                if logprob > best_logprob:
                    best_logprob = logprob
                    best_frame = states[h][i][ind]
            if t == 0:
                gen_traj = best_frame
            else:
                gen_traj = gen_traj.join(best_frame)

        # save the generated trajectory (output filenames chosen here,
        # since this standalone test has no self.out)
        gen_traj.save('gen_traj.xtc')
        gen_traj[0].save('gen_traj.xtc.pdb')
    except:
        type, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)
# Train GMM
print 'Starting GMM training'
words = w2v_model.vocab.keys()
word_vectors = w2v_model.syn0
gmm_model = GMM(n_components=num_topics, n_iter=num_gmm_iterations,
                covariance_type='diag')
gmm_model.fit(word_vectors)
# joblib.dump(gmm_model, gmm_output_file)
print 'Done GMM training'

# Get the likelihood of each word vector under each Gaussian component
scores = gmm_model.score(test_vectors)
print scores
ll = sum(scores)
print "LL: " + str(ll)

# Print topics if desired
if print_topics:
    log_probs = log_multivariate_normal_density(word_vectors,
                                                gmm_model.means_,
                                                gmm_model.covars_,
                                                gmm_model.covariance_type)
    print np.min(log_probs)
    _, num_col = log_probs.shape
    for col in xrange(num_col):
        top_n = 10
        log_component_probs = (log_probs[:, col]).T
        sorted_indexes = np.argsort(log_component_probs)[::-1][:top_n]
        ordered_word_probs = [(w2v_model.index2word[idx],
                               log_component_probs[idx])
                              for idx in sorted_indexes]
        print '---'
        print "Topic {0}".format(col + 1)
        print "Total prob:" + str(sum(log_component_probs))
        print ", ".join(["{w}: {p}".format(w=w, p=p)
                         for w, p in ordered_word_probs])
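# A hedged follow-up sketch (not part of the script above): with uniform
# topic priors, the V x K log-density matrix can be normalized into
# per-word topic posteriors using logsumexp.
from scipy.misc import logsumexp  # scipy.special.logsumexp in newer scipy
import numpy as np

log_posterior = log_probs - logsumexp(log_probs, axis=1)[:, np.newaxis]
posteriors = np.exp(log_posterior)  # each row sums to 1 across topics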