Example #1
    def _start(self):
        print("model")
        print(self.model_dict)
        n_features = int(self.model_dict['n_features'])
        n_states = int(self.model_dict['n_states'])
        self.model = MetastableSwitchingLDS(n_states, n_features)
        self.model.load_from_json_dict(self.model_dict)
        obs, hidden_states = self.model.sample(self.args.n_samples)
        (n_samples, n_features) = np.shape(obs)

        features, ii, ff = mixtape.featurizer.featurize_all(
            self.filenames, self.featurizer, self.topology, self.stride)
        file_trajectories = []

        states = []
        state_indices = []
        state_files = []
        logprob = log_multivariate_normal_density(
            features, np.array(self.model.means_),
            np.array(self.model.covars_), covariance_type='full')
        assignments = np.argmax(logprob, axis=1)
        probs = np.max(logprob, axis=1)
        # Presort the data into the metastable wells
        # i.e.: separate the original trajectories into k
        # buckets corresponding to the metastable wells
        for k in range(int(self.model.n_states)):
            # gather the frames assigned to state k by the argmax above
            s = features[assignments == k]
            ind = ii[assignments == k]
            f = ff[assignments == k]
            states.append(s)
            state_indices.append(ind)
            state_files.append(f)

        # Loop over the generated feature space trajectory.
        # At time t, pick the frame from the original trajectory
        # closest to the current sample in feature space. To save
        # a bit of computation, just search in the bucket corresponding
        # to the current metastable well (i.e., the current hidden state).
        traj = None
        for t in range(n_samples):
            featurized_frame = obs[t]
            h = hidden_states[t]
            logprob = log_multivariate_normal_density(
                states[h], featurized_frame[np.newaxis],
                self.model.Qs_[h][np.newaxis],
                covariance_type='full')
            best_frame_pos = np.argmax(logprob, axis=0)[0]
            best_file = state_files[h][best_frame_pos]
            best_ind = state_indices[h][best_frame_pos]
            frame = md.load_frame(best_file, best_ind, self.topology)
            if t == 0:
                traj = frame
            else:
                frame.superpose(traj, t-1)
                traj = traj.join(frame)
        traj.save('%s.xtc' % self.out)
        traj[0].save('%s.xtc.pdb' % self.out)
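Every example on this page leans on the same shape contract: log_multivariate_normal_density(X, means, covars, covariance_type) takes X of shape (n_samples, n_features) plus per-component parameters, and returns an (n_samples, n_components) matrix of log densities, which np.argmax(..., axis=1) then turns into hard state assignments. As a reference point, here is a minimal pure-numpy sketch of the 'diag' case; it is only the math the examples rely on, not the library implementation.

import numpy as np

def log_mvn_density_diag(X, means, covars):
    # X: (n_samples, n_features); means, covars: (n_components, n_features).
    # Returns (n_samples, n_components) log densities.
    n_features = X.shape[1]
    diff = X[:, np.newaxis, :] - means[np.newaxis, :, :]
    return -0.5 * (n_features * np.log(2 * np.pi)
                   + np.sum(np.log(covars), axis=1)
                   + np.sum(diff ** 2 / covars, axis=2))

X = np.random.randn(5, 2)
means = np.zeros((3, 2))
covars = np.ones((3, 2))                   # diagonal variances, one row per component
logprob = log_mvn_density_diag(X, means, covars)
assignments = np.argmax(logprob, axis=1)   # hard assignment, as in the examples
assert logprob.shape == (5, 3)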
Example #2
    def _exact_loglikelihood(self, ob):
        log_transmat = np.zeros((self.n_chains, self.n_states, self.n_states))
        log_startprob = np.zeros((self.n_chains, self.n_states))
        for idx, chain in enumerate(self.chains_):
            log_transmat[idx] = chain._log_transmat
            log_startprob[idx] = chain._log_startprob

        n_state_combinations = self.n_states ** self.n_chains
        state_combinations = [tuple(x) for x in list(itertools.product(np.arange(self.n_states), repeat=self.n_chains))]
        n_observations = ob.shape[0]
        n_features = ob.shape[1]
        fwdlattice = np.zeros((n_observations, n_state_combinations))

        # Calculate means and covariances for all state combinations and calculate emission probabilities
        weight = (1.0 / float(self.n_chains))
        weight_squared = weight * weight
        covars = np.zeros((n_state_combinations, n_features))  # TODO: add support for all covariance types
        means = np.zeros((n_state_combinations, n_features))
        for idx, state_combination in enumerate(state_combinations):
            for chain_idx, state in enumerate(state_combination):
                chain = self.chains_[chain_idx]
                covars[idx] += chain._covars_[state]
                means[idx] += chain._means_[state]
            covars[idx] *= weight_squared
            means[idx] *= weight
        framelogprob = log_multivariate_normal_density(ob, means, covars, covariance_type='diag')  # TODO: add support for all covariance types

        # Run the forward algorithm
        fhmmc._forward(n_observations, self.n_chains, self.n_states, state_combinations, log_startprob, log_transmat,
                       framelogprob, fwdlattice)

        last_column = fwdlattice[-1]
        assert np.size(last_column) == n_state_combinations
        score = logsumexp(last_column)
        return score
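The forward pass itself is delegated to the Cython routine fhmmc._forward, which this snippet never shows. For readers following along, a rough pure-numpy equivalent over the flattened state-combination space looks like the sketch below; it assumes the chains contribute independently to the start and transition scores (my reading of the factorial-HMM setup above, not the actual compiled implementation).

import itertools
import numpy as np
from scipy.special import logsumexp

def forward_loglikelihood(log_startprob, log_transmat, framelogprob,
                          n_states, n_chains):
    # log_startprob: (n_chains, n_states)
    # log_transmat:  (n_chains, n_states, n_states)
    # framelogprob:  (n_observations, n_states ** n_chains)
    combos = list(itertools.product(range(n_states), repeat=n_chains))
    # Combined start/transition scores: sum over chains, i.e. each chain
    # evolves independently under its own transition matrix.
    log_pi = np.array([sum(log_startprob[c, s] for c, s in enumerate(combo))
                       for combo in combos])
    log_A = np.array([[sum(log_transmat[c, si, sj]
                           for c, (si, sj) in enumerate(zip(ci, cj)))
                       for cj in combos] for ci in combos])
    alpha = log_pi + framelogprob[0]
    for t in range(1, framelogprob.shape[0]):
        alpha = logsumexp(alpha[:, np.newaxis] + log_A, axis=0) + framelogprob[t]
    return logsumexp(alpha)   # same reduction as the last_column logsumexp above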
Example #3
    def start(self):
        featurizer = mixtape.featurizer.load(self.args.featurizer)

        features, ii, ff = mixtape.featurizer.featurize_all(
            self.filenames, featurizer, self.topology, self.stride)
        logprob = log_multivariate_normal_density(
            features, np.array(self.model['means']),
            np.array(self.model['vars']), covariance_type='diag')

        assignments = np.argmax(logprob, axis=1)
        probs = np.max(logprob, axis=1)

        data = {'filename': [], 'index': [], 'state': []}
        for k in range(self.model['n_states']):
            # pick the structures that have the highest log
            # probability in the state
            p = probs[assignments == k]
            order = p.argsort()
            sorted_filenms = ff[assignments == k][order]
            sorted_indices = ii[assignments == k][order]

            if len(p) > 0:
                selected_indices = sorted_indices[-self.args.n_per_state:]
                selected_files = sorted_filenms[-self.args.n_per_state:]
                assert len(selected_indices) == len(selected_files)
                data['index'].extend(selected_indices)
                data['filename'].extend(selected_files)
                data['state'].extend([k] * len(selected_indices))
            else:
                print('WARNING: NO STRUCTURES ASSIGNED TO STATE=%d' % k)

        df = pd.DataFrame(data)
        print('Saving the indices of the selected frames in CSV format to %s' % self.out)
        with open(self.out, 'w') as f:
            f.write("# command: %s\n" % ' '.join(sys.argv))
            df.to_csv(f)
Example #4
	def get_word_ll_for_topics(self):
		"""Calculates P(w|z) for all words (rows), given each topic (columns)

		Returns matrix that is VxK (where V is the number of words in the vocabulary, and K is the number of topics)
		"""
		word_vectors = self._w2v_model.syn0
		return log_multivariate_normal_density(word_vectors, self._gmm_model.means_, self._gmm_model.covars_, self._gmm_model.covariance_type)
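Note the returned values are log densities, not normalized probabilities. If you want posteriors P(z|w) instead, you can normalize across topics with logsumexp; a hedged sketch, assuming the same (hypothetical) model object and uniform component weights (the GMM's fitted weights are ignored here):

import numpy as np
from scipy.special import logsumexp

log_dens = model.get_word_ll_for_topics()             # V x K log densities
log_post = log_dens - logsumexp(log_dens, axis=1, keepdims=True)
top_words = np.argsort(log_post[:, 0])[::-1][:10]     # ten strongest words for topic 0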
Example #5
    def score(self, X, return_responsibilities=False):
        nc = len(self.weights)
        X = numpy.array(X)
        if X.ndim == 1:
            X = X[:, None]

        if X.shape[1] != self.means.shape[1]:
            raise ValueError('The shape of X is not compatible with self')

        mins = self.lims[:, 0]
        maxes = self.lims[:, 1]

        lpr = numpy.log(self.weights) + \
              log_multivariate_normal_density(X,
                    self.means,
                    self.covs, 'full')
        mask = (X >= mins[None, :]).all(axis=-1)
        mask &= (X <= maxes[None, :]).all(axis=-1)
        logprob = logsumexp(lpr, axis=1)
        logprob[~mask] = -numpy.inf
        if return_responsibilities:
            responsibilities = numpy.exp(lpr - logprob[:, None])
            responsibilities[~mask] = 0
            return logprob, responsibilities
        return logprob
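A quick usage sketch for the method above. Here model is a hypothetical fitted instance exposing the weights, means, covs, and lims attributes the snippet assumes; inside the lims box the responsibilities are posterior component probabilities, so each row should sum to one.

import numpy

# `model` is a hypothetical fitted instance (weights/means/covs/lims set).
X = numpy.random.uniform(-1.0, 1.0, size=(100, model.means.shape[1]))
logprob, resp = model.score(X, return_responsibilities=True)

inside = numpy.isfinite(logprob)           # samples within the lims box
assert numpy.allclose(resp[inside].sum(axis=1), 1.0)
assert (resp[~inside] == 0).all()          # out-of-bounds samples are zeroed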
Example #6
    def get_word_ll_for_topics(self):
        """Calculates P(w|z) for all words (rows), given each topic (columns)

		Returns matrix that is VxK (where V is the number of words in the vocabulary, and K is the number of topics)
		"""
        word_vectors = self._w2v_model.syn0
        return log_multivariate_normal_density(word_vectors,
                                               self._gmm_model.means_,
                                               self._gmm_model.covars_,
                                               self._gmm_model.covariance_type)
Example #7
    def _compute_log_likelihood(self, seq):
        state_combinations = [tuple(x) for x in list(itertools.product(np.arange(self.n_states), repeat=self.n_chains))]
        n_state_combinations = self.n_states ** self.n_chains
        n_observations, n_features = seq.shape
        covars = np.array([self.covar for _ in xrange(n_state_combinations)])  # TODO: correct?!

        means = np.zeros((n_state_combinations, n_features))
        for idx, state_combination in enumerate(state_combinations):
            for chain_idx, state in enumerate(state_combination):
                means[idx] += self.means[chain_idx, state]

        framelogprob = log_multivariate_normal_density(seq, means, covars, covariance_type='full')
        return framelogprob
Example #8
    def test_framelogprob_reshape(self):
        n_states = 3
        n_chains = 2
        n_state_combinations = n_states ** n_chains
        state_combinations = [tuple(x) for x in list(itertools.product(np.arange(n_states), repeat=n_chains))]

        covars = np.random.random((n_state_combinations, 10))
        means = np.random.random((n_state_combinations, 10))

        ob = np.random.random((5, 10))
        framelogprob = log_multivariate_normal_density(ob, means, covars, covariance_type='diag')

        # This test ensures that reshaping the framelogprob still yields the correct state variables
        reshaped_framelogprob = framelogprob.reshape((5, n_states, n_states))
        for ob_idx in xrange(5):
            for idx, state_combination in enumerate(state_combinations):
                self.assertEqual(reshaped_framelogprob[ob_idx][state_combination], framelogprob[ob_idx][idx])
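The reshape is valid because itertools.product enumerates state combinations in row-major (odometer) order, which is exactly numpy's default C-order reshape, so the identity holds for any number of chains, not just two. A standalone check of that claim:

import itertools
import numpy as np

n_states, n_chains, n_observations = 3, 4, 5
combos = list(itertools.product(range(n_states), repeat=n_chains))

flat = np.random.random((n_observations, n_states ** n_chains))
cube = flat.reshape((n_observations,) + (n_states,) * n_chains)

for idx, combo in enumerate(combos):
    # cube[t][combo] and flat[t][idx] address the same entry for every t
    assert np.array_equal(cube[(slice(None),) + combo], flat[:, idx])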
Example #9
    def _exact_loglikelihood(self, ob):
        log_transmat = np.zeros((self.n_chains, self.n_states, self.n_states))
        log_startprob = np.zeros((self.n_chains, self.n_states))
        for idx, chain in enumerate(self.chains_):
            log_transmat[idx] = chain._log_transmat
            log_startprob[idx] = chain._log_startprob

        n_state_combinations = self.n_states**self.n_chains
        state_combinations = [
            tuple(x) for x in list(
                itertools.product(np.arange(self.n_states),
                                  repeat=self.n_chains))
        ]
        n_observations = ob.shape[0]
        n_features = ob.shape[1]
        fwdlattice = np.zeros((n_observations, n_state_combinations))

        # Calculate means and covariances for all state combinations and calculate emission probabilities
        weight = (1.0 / float(self.n_chains))
        weight_squared = weight * weight
        covars = np.zeros(
            (n_state_combinations,
             n_features))  # TODO: add support for all covariance types
        means = np.zeros((n_state_combinations, n_features))
        for idx, state_combination in enumerate(state_combinations):
            for chain_idx, state in enumerate(state_combination):
                chain = self.chains_[chain_idx]
                covars[idx] += chain._covars_[state]
                means[idx] += chain._means_[state]
            covars[idx] *= weight_squared
            means[idx] *= weight
        framelogprob = log_multivariate_normal_density(
            ob, means, covars, covariance_type='diag'
        )  # TODO: add support for all covariance types

        # Run the forward algorithm
        fhmmc._forward(n_observations, self.n_chains, self.n_states,
                       state_combinations, log_startprob, log_transmat,
                       framelogprob, fwdlattice)

        last_column = fwdlattice[-1]
        assert np.size(last_column) == n_state_combinations
        score = logsumexp(last_column)
        return score
Example #10
    def _compute_log_likelihood(self, seq):
        state_combinations = [
            tuple(x) for x in list(
                itertools.product(np.arange(self.n_states),
                                  repeat=self.n_chains))
        ]
        n_state_combinations = self.n_states**self.n_chains
        n_observations, n_features = seq.shape
        covars = np.array([self.covar for _ in xrange(n_state_combinations)])  # TODO: correct?!

        means = np.zeros((n_state_combinations, n_features))
        for idx, state_combination in enumerate(state_combinations):
            for chain_idx, state in enumerate(state_combination):
                means[idx] += self.means[chain_idx, state]

        framelogprob = log_multivariate_normal_density(seq,
                                                       means,
                                                       covars,
                                                       covariance_type='full')
        return framelogprob
Example #11
    def start(self):
        featurizer = mixtape.featurizer.load(self.args.featurizer)

        features, ii, ff = mixtape.featurizer.featurize_all(
            self.filenames, featurizer, self.topology)
        logprob = log_multivariate_normal_density(features,
                                                  np.array(
                                                      self.model['means']),
                                                  np.array(self.model['vars']),
                                                  covariance_type='diag')

        assignments = np.argmax(logprob, axis=1)
        probs = np.max(logprob, axis=1)

        data = {'filename': [], 'index': [], 'state': []}
        for k in range(self.model['n_states']):
            # pick the structures that have the highest log
            # probability in the state
            p = probs[assignments == k]
            order = p.argsort()
            sorted_filenms = ff[assignments == k][order]
            sorted_indices = ii[assignments == k][order]

            if len(p) > 0:
                selected_indices = sorted_indices[-self.args.n_per_state:]
                data['index'].extend(selected_indices)
                data['filename'].extend(
                    sorted_filenms[-self.args.n_per_state:])
                # Use the actual number of selected frames: a state may hold
                # fewer than n_per_state structures.
                data['state'].extend([k] * len(selected_indices))
            else:
                print('WARNING: NO STRUCTURES ASSIGNED TO STATE=%d' % k)

        df = pd.DataFrame(data)
        print('Saving the indices of the selected frames in CSV format to %s' %
              self.out)
        with open(self.out, 'w') as f:
            f.write("# command: %s\n" % ' '.join(sys.argv))
            df.to_csv(f)
Example #12
    def _compute_log_likelihood(self, X):
        """Compute the log likelihood of feature matrix X"""
        return gmm.log_multivariate_normal_density(X, self.means_,
                                                   self.covars_,
                                                   self.covariance_type)
Example #13
def test_alanine_dipeptide():
    import pdb, traceback, sys
    warnings.filterwarnings("ignore", 
                    category=DeprecationWarning)
    try:
        b = fetch_alanine_dipeptide()
        trajs = b.trajectories
        n_seq = len(trajs)
        n_frames = trajs[0].n_frames
        n_atoms = trajs[0].n_atoms
        n_features = n_atoms * 3
        sim_T = 1000
        data_home = get_data_home()
        data_dir = join(data_home, TARGET_DIRECTORY_ALANINE)
        top = md.load(join(data_dir, 'ala2.pdb'))
        n_components = 2
        # Superpose each trajectory on the reference structure
        data = []
        for traj in trajs:
            traj.superpose(top)
            Z = traj.xyz
            Z = np.reshape(Z, (len(Z), n_features), order='F')
            data.append(Z)

        # Fit MSLDS model 
        n_experiments = 1
        n_em_iter = 1
        tol = 1e-1
        model = MetastableSwitchingLDS(n_components, 
            n_features, n_experiments=n_experiments, 
            n_em_iter=n_em_iter) 
        model.fit(data, gamma=.1, tol=tol, verbose=True)
        mslds_score = model.score(data)
        print("MSLDS Log-Likelihood = %f" %  mslds_score)

        # Fit Gaussian HMM for comparison
        g = GaussianFusionHMM(n_components, n_features)
        g.fit(data)
        hmm_score = g.score(data)
        print("HMM Log-Likelihood = %f" %  hmm_score)
        print()

        # Generate a trajectory from learned model.
        sample_traj, hidden_states = model.sample(sim_T)
        states = []
        for k in range(n_components):
            states.append([])

        # Presort the data into the metastable wells
        for k in range(n_components):
            for i in range(len(trajs)):
                traj = trajs[i]
                Z = traj.xyz
                Z = np.reshape(Z, (len(Z), n_features), order='F')
                logprob = log_multivariate_normal_density(Z,
                    np.array(model.means_),
                    np.array(model.covars_), covariance_type='full')
                assignments = np.argmax(logprob, axis=1)
                #probs = np.max(logprob, axis=1)
                # pick structures that have highest log probability in state
                s = traj[assignments == k]
                states[k].append(s)

        # Pick frame from original trajectories closest to current sample
        gen_traj = None
        for t in range(sim_T):
            h = hidden_states[t]
            best_logprob = -np.inf
            best_frame = None
            for i in range(len(trajs)):
                if t > 0:
                    states[h][i].superpose(gen_traj, t-1)
                Z = states[h][i].xyz
                Z = np.reshape(Z, (len(Z), n_features), order='F')
                mean = sample_traj[t]
                logprobs = log_multivariate_normal_density(Z,
                    mean[np.newaxis], model.Qs_[h][np.newaxis],
                    covariance_type='full')
                ind = np.argmax(logprobs, axis=0)[0]
                logprob = logprobs[ind, 0]
                if logprob > best_logprob:
                    best_logprob = logprob
                    best_frame = states[h][i][ind]
            if t == 0:
                gen_traj = best_frame
            else:
                gen_traj = gen_traj.join(best_frame)
        # Standalone test: write outputs under a fixed, hypothetical filename.
        gen_traj.save('ala2_sample.xtc')
        gen_traj[0].save('ala2_sample.xtc.pdb')
    except:
        type, value, tb = sys.exc_info()
        traceback.print_exc()
        pdb.post_mortem(tb)
Example #14
				# Train GMM
				print 'Starting GMM training'
				words = w2v_model.vocab.keys()
				word_vectors = w2v_model.syn0
				gmm_model = GMM(n_components=num_topics, n_iter=num_gmm_iterations, covariance_type='diag')
				gmm_model.fit(word_vectors)
				# joblib.dump(gmm_model, gmm_output_file) 
				print 'Done GMM training'

				# Get the likelihood of each word vector under each Gaussian component
				scores = gmm_model.score(test_vectors)
				print scores
				ll = sum(scores)
				print "LL:   "+str(ll)

				# Print topics if desired
				if print_topics:
					log_probs = log_multivariate_normal_density(word_vectors, gmm_model.means_, gmm_model.covars_, gmm_model.covariance_type)
					print np.min(log_probs)
					_, num_col = log_probs.shape
					for col in xrange(num_col):
						top_n = 10
						log_component_probs = (log_probs[:,col]).T
						sorted_indexes = np.argsort(log_component_probs)[::-1][:top_n]
						ordered_word_probs = [(w2v_model.index2word[idx], log_component_probs[idx]) for idx in sorted_indexes]

						print '---'
						print "Topic {0}".format(col+1)
						print "Total prob:" + str(sum(log_component_probs))
						print ", ".join(["{w}: {p}".format(w=w, p=p) for w, p in ordered_word_probs])
Example #15
                gmm_model = GMM(n_components=num_topics,
                                n_iter=num_gmm_iterations,
                                covariance_type='diag')
                gmm_model.fit(word_vectors)
                # joblib.dump(gmm_model, gmm_output_file)
                print 'Done GMM training'

                # Get the likelihood of each word vector under each Gaussian component
                scores = gmm_model.score(test_vectors)
                print scores
                ll = sum(scores)
                print "LL:   " + str(ll)

                # Print topics if desired
                if print_topics:
                    log_probs = log_multivariate_normal_density(
                        word_vectors, gmm_model.means_, gmm_model.covars_,
                        gmm_model.covariance_type)
                    print np.min(log_probs)
                    _, num_col = log_probs.shape
                    for col in xrange(num_col):
                        top_n = 10
                        log_component_probs = (log_probs[:, col]).T
                        sorted_indexes = np.argsort(
                            log_component_probs)[::-1][:top_n]
                        ordered_word_probs = [(w2v_model.index2word[idx],
                                               log_component_probs[idx])
                                              for idx in sorted_indexes]

                        print '---'
                        print "Topic {0}".format(col + 1)
                        print "Total prob:" + str(sum(log_component_probs))