def get_kl_divergence(shape, mu, sigma, prior, sample):
    """
    Single-sample estimate of the KL term between posterior and prior.

    Returns log(q(theta)) - log(p(theta)), evaluated at `sample`, where
    p(theta) = pi*N(0,sigma1) + (1-pi)*N(0,sigma2)

    Note that the mixture is applied to the joint density of the whole
    flattened sample: the log-probabilities of each component are summed
    over all elements first and only then combined with the mixture weights.

    shape = shape of the sample we want to compute the KL of (not used here)
    mu = the mu variable used when sampling
    sigma= the sigma variable used when sampling
    prior = the prior object; `sigma1`, `sigma2` and `pi_mix` are read from it
    sample = the sample from the posterior
    """

    # Work on a rank-1 view of the sampled values
    flat_sample = tf.reshape(sample, [-1])

    # q( theta | mu, sigma )
    variational = Normal(mu, sigma)

    # Each mixture component paired with its mixing weight
    components = (
        (Normal(0.0, prior.sigma1), prior.pi_mix),
        (Normal(0.0, prior.sigma2), 1.0 - prior.pi_mix),
    )

    # sum( log[ q( theta | mu, sigma ) ] )
    log_q = tf.reduce_sum(variational.log_prob(flat_sample))

    # log of each weighted component's joint density
    weighted_joint = [
        tf.reduce_sum(component.log_prob(flat_sample)) + tf.log(weight)
        for component, weight in components
    ]

    # log q - log p, with log p obtained through a log-sum-exp over components
    return log_q - tf.reduce_logsumexp(weighted_joint)
def get_KL_divergence_Sample(shape, mu, sigma, prior, Z):
    """
    Sample-based estimate of the KL divergence between posterior and prior.

    Instead of computing the real KL distance between the prior and the
    variational posterior of the weights, we just evaluate it at the specific
    values of the sampled weights W.

    In this case:
        - Posterior: Multivariate Independent Gaussian.
        - Prior: Mixture model

    The sampled value is:
        KL_sample = log(q(W|theta)) - log(p(W|theta_0)) where
         p(theta) = pi*N(0,sigma1) + (1-pi)*N(0,sigma2)

    The two prior components are evaluated on the whole flattened sample
    (log-probabilities summed over all elements) before being mixed.

    Input:
        - shape: shape of the sample we want to compute the KL of
          (not used by this function)
        - mu, sigma: the variables used when sampling
        - prior: the prior object; `sigma1`, `sigma2` and `pi_mix` are read
        - Z: sampled weight values, the hidden variables
    """

    # Flatten the hidden variables (weights)
    weights = tf.reshape(Z, [-1])

    def total_log_prob(distribution):
        # Log-likelihood of all flattened weights under `distribution`
        return tf.reduce_sum(distribution.log_prob(weights))

    # Variational posterior and the two Gaussian components of the prior
    posterior = Normal(mu, sigma)
    first_component = Normal(0.0, prior.sigma1)
    second_component = Normal(0.0, prior.sigma2)

    # sum( log[ q( theta | mu, sigma ) ] )
    posterior_ll = total_log_prob(posterior)

    # log[ p( theta ) ] for the mixture prior
    branch_lls = [
        total_log_prob(first_component) + tf.log(prior.pi_mix),
        total_log_prob(second_component) + tf.log(1.0 - prior.pi_mix),
    ]
    prior_ll = tf.reduce_logsumexp(branch_lls)

    # The KL sample is the difference of both log-likelihoods
    return posterior_ll - prior_ll
def get_KL_divergence_Sample(shape, mu, sigma, prior, Z):
    """
    Evaluate one sample of the KL term between posterior and prior.

    Rather than computing the exact KL distance between the prior and the
    variational posterior of the weights, the difference of their
    log-densities is evaluated at the sampled weights W.

    In this case:
        - Posterior: Multivariate Independent Gaussian.
        - Prior: Mixture model

    The returned value is:
        KL_sample = log(q(W|theta)) - log(p(W|theta_0)) where
         p(theta) = pi*N(0,sigma1) + (1-pi)*N(0,sigma2)

    Each prior component's log-probability is summed over every element of
    the flattened sample before the mixture weights are applied.

    shape = shape of the sample we want to compute the KL of (unused here)
    mu = the mu variable used when sampling
    sigma= the sigma variable used when sampling
    prior = the prior object (`sigma1`, `sigma2`, `pi_mix` are read)
    Z = the sampled weight values, i.e. the hidden variables
    """

    # Flatten the hidden variables (weights)
    flat_z = tf.reshape(Z, [-1])

    # Log-likelihood of the sample under the variational posterior
    q_dist = Normal(mu, sigma)
    q_total = tf.reduce_sum(q_dist.log_prob(flat_z))

    # Log of each weighted, zero-mean Gaussian component of the prior
    component_scales = (prior.sigma1, prior.sigma2)
    mixing_weights = (prior.pi_mix, 1.0 - prior.pi_mix)
    joint_terms = []
    for scale, weight in zip(component_scales, mixing_weights):
        component = Normal(0.0, scale)
        joint_terms.append(
            tf.reduce_sum(component.log_prob(flat_z)) + tf.log(weight))

    # Log-likelihood of the sample under the mixture prior
    p_total = tf.reduce_logsumexp(joint_terms)

    return q_total - p_total
Example #4
0
def KL_scale_mixture(shape, mu, sigma, prior, w):
    """Compute a sampled KL term for a scale mixture of two Gaussians prior.

    The value returned is ``log q(w) - log p(w)`` evaluated at ``w``, with
    ``q = Normal(mu, sigma)`` and
    ``p = pi * N(0, sigma_1) + (1 - pi) * N(0, sigma_2)``.
    The log-probability of each prior component is summed over all of ``w``
    before the mixture weights are applied.

    shape = (n_unit, n_w); not used by this function
    mu, sigma = parameters of the posterior the sample was drawn from
    prior = object providing ``sigma_1``, ``sigma_2`` and ``pi``
    w = the sampled weights
    """
    posterior = Normal(mu, sigma)
    part_post = posterior.log_prob(tf.reshape(w, [-1]))  # flatten
    prior_1 = Normal(0., prior.sigma_1)
    prior_2 = Normal(0., prior.sigma_2)
    part_1 = tf.reduce_sum(prior_1.log_prob(w)) + tf.log(prior.pi)
    # The second component carries the complementary weight (1 - pi) so the
    # mixture weights sum to one; weighting both components with pi only
    # gives a normalised prior when pi == 0.5.
    part_2 = tf.reduce_sum(prior_2.log_prob(w)) + tf.log(1. - prior.pi)
    prior_mix = tf.stack([part_1, part_2])
    # log p(w) via log-sum-exp over the two weighted components
    log_prior = tf.reduce_sum(tf.reduce_logsumexp(prior_mix, axis=0))
    KL = tf.reduce_sum(part_post) - log_prior
    return KL
Example #5
0
    def __init__(self, policy, rate, train=True):
        """Build the loss (and optionally the update op) for a Gaussian policy.

        policy: object exposing ``setup()`` and, after it has run, the
            attributes ``X``, ``a``, ``a_pred`` and ``var``.
        rate: learning rate handed to the RMSProp optimizer.
        train: when False, only the loss is built and no optimizer or
            update op is created.
        """
        self.rate = rate
        self.policy = policy

        with tf.variable_scope('policy_estimator'):
            # The policy creates its tensors inside this variable scope
            self.policy.setup()

            self.X = policy.X
            self.a = policy.a
            self.a_pred = policy.a_pred
            self.var = policy.var

            self.target = tf.placeholder(
                dtype='float', shape=[None, 1], name='target')

            # Log-probability of the actions `a` under a Normal built from
            # the policy outputs
            action_dist = Normal(self.a_pred, self.var)
            self.log_probs = action_dist.log_prob(self.a)

            # Per-sample terms weighted by the fed target, then summed
            self.losses = self.log_probs * self.target
            self.loss = tf.reduce_sum(self.losses, name='loss')

            if not train:
                return

            self.opt = tf.train.RMSPropOptimizer(
                learning_rate=rate, decay=0.99, momentum=0.0, epsilon=1e-6)

            # Keep only the variables that actually receive a gradient
            raw_pairs = self.opt.compute_gradients(self.loss)
            self.grads_and_vars = [
                (grad, variable)
                for grad, variable in raw_pairs
                if grad is not None
            ]
            self.update = self.opt.apply_gradients(self.grads_and_vars)
Example #6
0
def get_kl_divergence(shape, mu, sigma, prior, sample):
    """
    Compute a sampled KL term between posterior and prior.

    The result is log(q(theta)) - log(p(theta)) at the given sample, where
    p(theta) = pi*N(0,sigma1) + (1-pi)*N(0,sigma2)

    Both prior components are scored on the complete flattened sample
    (log-probabilities summed over every element) and only then mixed.

    shape = shape of the sample we want to compute the KL of (unused)
    mu = the mu variable used when sampling
    sigma= the sigma variable used when sampling
    prior = the prior object with parameters sigma1, sigma2 and pi_mix
    sample = the sample from the posterior
    """

    # Flatten to a vector
    theta = tf.reshape(sample, [-1])

    # sum( log[ q( theta | mu, sigma ) ] )
    log_q = tf.reduce_sum(Normal(mu, sigma).log_prob(theta))

    # Summed log-density of the sample under each zero-mean prior component
    log_joint_1 = tf.reduce_sum(Normal(0.0, prior.sigma1).log_prob(theta))
    log_joint_2 = tf.reduce_sum(Normal(0.0, prior.sigma2).log_prob(theta))

    # log[ p( theta ) ] for the mixture prior
    log_p = tf.reduce_logsumexp([
        log_joint_1 + tf.log(prior.pi_mix),
        log_joint_2 + tf.log(1.0 - prior.pi_mix),
    ])

    return log_q - log_p
Example #7
0
    def __call__(
        self,
        states,
        actions,
        next_states,
        initial_omega=8,
        training_set_size=4000,
        actions_one_hot=None,
        sess=None,
        summary_writer=None,
    ):
        """Build the graph of a Gaussian next-state model and its training op.

        Creates the data/statistics placeholders, a scalar variable ``omega``
        that scales ``actions``, one single-hidden-layer network predicting
        the means and one predicting the variances of a Normal over the next
        state, the negative log-likelihood loss, and an Adam update over the
        network weights (``omega`` is not in the optimised variable list).

        :param states: Nxm matrix
        :param actions: Vector of all possible actions: Nx n_actions; only
            columns 0 and 1 are read here
        :param next_states: Nxm matrix containing the next states
        :param initial_omega: value of the initial omega
        :param training_set_size: stored on ``self``; not otherwise used in
            this method
        :param actions_one_hot: multiplied element-wise with the two
            per-action columns of means/variances and summed, selecting one
            value per row. The default ``None`` is handed to ``tf.multiply``
            as is, so callers presumably always pass a tensor -- TODO confirm
        :param sess: stored on ``self``
        :param summary_writer: stored on ``self``
        :return: tuple ``(self.log_prob, self.prob)``, each the concatenation
            along axis 1 of the rows before and after ``batch_size``
        """
        self.sess = sess
        self.training_set_size = training_set_size
        self.summary_writer = summary_writer
        # Boolean flag fed at run time; it drives the two U.switch calls below
        train_or_test = U.get_placeholder("train_or_test", tf.bool, ())
        # statistics
        self.Xmean_ph = U.get_placeholder(
            name="Xmean", dtype=self.dtype, shape=(1, self.x_dim)
        )
        self.Ymean_ph = U.get_placeholder(
            name="Ymean", dtype=self.dtype, shape=(1, self.state_dim)
        )
        self.Xstd_ph = U.get_placeholder(
            name="Xstd", dtype=self.dtype, shape=(1, self.x_dim)
        )
        self.Ystd_ph = U.get_placeholder(
            name="Ystd", dtype=self.dtype, shape=(1, self.state_dim)
        )
        # Directly fed inputs/targets; they are used without normalisation
        self.X = U.get_placeholder(name="X", dtype=self.dtype, shape=(None, self.x_dim))
        self.Y = U.get_placeholder(
            name="Y", dtype=self.dtype, shape=(None, self.state_dim)
        )
        with tf.variable_scope(self.name):
            # scalar multiplier applied to the actions
            self.omega = tf.get_variable(
                dtype=self.dtype,
                name="omega",
                shape=(),
                initializer=tf.initializers.constant(initial_omega),
            )
            X = self.X  # - Xmean_) / Xstd_
            Y = self.Y  # - YMean_) / Ystd_
            # build the action vector
            forces = self.omega * actions
            # Stack the two force columns on top of each other: rows
            # [0, batch_size) hold column 0, rows [batch_size, 2*batch_size)
            # hold column 1
            forces_full = tf.concat(
                [tf.reshape(forces[:, 0], (-1, 1)), tf.reshape(forces[:, 1], (-1, 1))],
                axis=0,
            )
            batch_size = tf.shape(states)[0]
            # Duplicate the states so each copy is paired with one force column
            x_full = tf.concat([states, states], axis=0)
            x_full = tf.concat([x_full, forces_full], axis=1)
            # Unlike X/Y, the graph-built inputs and targets are normalised
            # with the fed statistics
            x_full = (x_full - self.Xmean_ph) / self.Xstd_ph
            next_states_full = tf.concat([next_states, next_states], axis=0)
            next_states_full = (next_states_full - self.Ymean_ph) / self.Ystd_ph

            # build the network
            hidden_layer_size = 10
            biases = tf.get_variable(
                "b",
                [hidden_layer_size],
                initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
                dtype=self.dtype,
            )
            W = tf.get_variable(
                "W",
                [self.x_dim, hidden_layer_size],
                initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
                dtype=self.dtype,
            )

            # NOTE(review): presumably U.switch picks X when train_or_test is
            # true and x_full otherwise -- confirm against U
            x_input = U.switch(train_or_test, X, x_full)
            h = tf.matmul(x_input, W)
            h = tf.tanh(h + biases)

            # now we need state_dim output neurons, one for each state dimension to predict
            biases_out = tf.get_variable(
                "b_out",
                [self.state_dim],
                initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
                dtype=self.dtype,
            )
            W_out = tf.get_variable(
                "W_out",
                [hidden_layer_size, self.state_dim],
                initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
                dtype=self.dtype,
            )
            means = tf.matmul(h, W_out) + biases_out

            # x_input_first = x_input[:, 0:self.x_dim - 1]
            # forces = tf.reshape(x_input[:, self.x_dim - 1], (-1, 1))
            # x_input = tf.concat([x_input_first, tf.abs(forces)], axis=1)

            # Second network, with its own weights, for the variance
            hidden_var = 10
            biases_var = tf.get_variable(
                "b_var",
                [hidden_var],
                initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
                dtype=self.dtype,
            )
            W_var = tf.get_variable(
                "W_var",
                [self.x_dim, hidden_var],
                initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
                dtype=self.dtype,
            )

            # `h` is reused: from here on it is the sigmoid hidden layer of
            # the variance network, not the tanh layer of the mean network
            h = tf.nn.sigmoid(tf.matmul(x_input, W_var) + biases_var)

            W_out_var = tf.get_variable(
                "W_out_var",
                [hidden_var, self.state_dim],
                initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
                dtype=self.dtype,
            )

            biases_out_var = tf.get_variable(
                "b_out_var",
                [self.state_dim],
                initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
                dtype=self.dtype,
            )

            # exp keeps the predicted variance strictly positive
            var = tf.exp(tf.matmul(h, W_out_var) + biases_out_var)

            std = tf.sqrt(var)

            pdf = Normal(means, std)

            # Target matching x_input: fed Y or the normalised next states
            y_output = U.switch(train_or_test, Y, next_states_full)

            # Sum of log-probabilities / product of probabilities over axis 1
            log_prob = tf.reduce_sum(pdf.log_prob(y_output), axis=1, keepdims=True)
            prob = tf.reduce_prod(pdf.prob(y_output), axis=1, keepdims=True)

            # loss is the negative log likelihood
            self.loss = -tf.reduce_mean(log_prob)
            # Same expression as self.loss; only the summary name differs
            self.valid_loss = -tf.reduce_mean(log_prob)

            # Variables updated by the optimizer; omega is not among them
            self.fitting_vars = [
                biases,
                W,
                biases_out,
                W_out,
                biases_var,
                W_var,
                W_out_var,
                biases_out_var,
            ]
            # create fitting collection
            for v in self.fitting_vars:
                tf.add_to_collection("fitting", v)

            opt = tf.train.AdamOptimizer()
            self.minimize_op = opt.minimize(self.loss, var_list=self.fitting_vars)

            # Split the stacked rows back into two blocks at batch_size and
            # place them side by side, one column per block
            log_prob_a0 = log_prob[0:batch_size, :]
            log_prob_a1 = log_prob[batch_size:, :]
            prob_a0 = prob[0:batch_size, :]
            prob_a1 = prob[batch_size:, :]
            self.log_prob = tf.concat([log_prob_a0, log_prob_a1], axis=1)
            self.prob = tf.concat([prob_a0, prob_a1], axis=1)
            means_list = []
            var_list = []
            # For every state dimension, select one of the two row blocks per
            # sample by weighting them with actions_one_hot and summing
            for i in range(self.state_dim):
                means_a0 = tf.reshape(means[0:batch_size, i], (-1, 1))
                means_a1 = tf.reshape(means[batch_size : 2 * batch_size, i], (-1, 1))
                means_actions = tf.concat([means_a0, means_a1], axis=1)
                means_ = tf.reduce_sum(
                    tf.multiply(means_actions, actions_one_hot), axis=1, keepdims=True
                )
                means_list.append(means_)
                # same for variance
                var_a0 = tf.reshape(var[0:batch_size, i], (-1, 1))
                var_a1 = tf.reshape(var[batch_size : 2 * batch_size, i], (-1, 1))
                var_actions = tf.concat([var_a0, var_a1], axis=1)
                var_ = tf.reduce_sum(
                    tf.multiply(var_actions, actions_one_hot), axis=1, keepdims=True
                )
                var_list.append(var_)

            # One column per state dimension
            self.means = tf.concat(means_list, axis=1)
            self.variances = tf.concat(var_list, axis=1)
            self.train_or_test = train_or_test
            self.loss_summary = tf.summary.scalar("Loss", self.loss)
            self.valid_loss_summary = tf.summary.scalar("ValidLoss", self.valid_loss)
        return self.log_prob, self.prob
Example #8
0
class AIRModel(object):
    """Generic AIR model"""
    def __init__(self,
                 obs,
                 nums,
                 max_steps,
                 glimpse_size,
                 n_appearance,
                 transition,
                 input_encoder,
                 glimpse_encoder,
                 glimpse_decoder,
                 transform_estimator,
                 steps_predictor,
                 output_std=1.,
                 discrete_steps=True,
                 output_multiplier=1.,
                 explore_eps=None,
                 debug=False,
                 **kwargs):
        """Stores the configuration and builds the model graph.

        :param obs: tf.Tensor, images
        :param nums: tf.Tensor, number of objects in images
            Note: it is not used for inference or training; could be removed from here.
        :param max_steps: int, maximum number of steps to take (or objects in the image)
        :param glimpse_size: tuple of ints, size of the attention glimpse
        :param n_appearance: int, number of latent variables describing an object
        :param transition: see :class: AIRCell
        :param input_encoder: see :class: AIRCell
        :param glimpse_encoder: see :class: AIRCell
        :param glimpse_decoder: see :class: AIRCell
        :param transform_estimator: see :class: AIRCell
        :param steps_predictor: see :class: AIRCell
        :param output_std: float, std. dev. of the output Gaussian distribution
        :param discrete_steps: see :class: AIRCell
        :param output_multiplier: float, a factor that multiplies the reconstructed glimpses;
            stored as a non-trainable variable named 'canvas_multiplier'
        :param explore_eps: see :class: AIRCell
        :param debug: see :class: AIRCell
        :param **kwargs: all other parameters are passed to AIRCell
        """

        # Data tensors
        self.obs, self.nums = obs, nums

        # Structural hyper-parameters
        self.max_steps, self.glimpse_size = max_steps, glimpse_size
        self.n_appearance = n_appearance

        # Output distribution and cell options
        self.output_std = output_std
        self.discrete_steps = discrete_steps
        self.explore_eps = explore_eps
        self.debug = debug

        scope_name = self.__class__.__name__
        with tf.variable_scope(scope_name):
            self.output_multiplier = tf.Variable(
                output_multiplier,
                dtype=tf.float32,
                trainable=False,
                name='canvas_multiplier',
            )

            # The leading static dimension of obs is the batch size, the
            # remaining ones are the image size
            obs_dims = self.obs.get_shape().as_list()
            self.batch_size, self.img_size = obs_dims[0], obs_dims[1:]

            self._build(
                transition,
                input_encoder,
                glimpse_encoder,
                glimpse_decoder,
                transform_estimator,
                steps_predictor,
                kwargs,
            )

    def _build(self, transition, input_encoder, glimpse_encoder,
               glimpse_decoder, transform_estimator, steps_predictor, kwargs):
        """Unroll the AIR cell and expose its outputs as attributes.

        See __init__ for argument description; `kwargs` is the dict of extra
        keyword arguments that is expanded into the AIRCell constructor.
        """

        # A configured exploration value becomes a non-trainable variable
        eps_value = self.explore_eps
        if eps_value is not None:
            self.explore_eps = tf.get_variable(
                'explore_eps', initializer=eps_value, trainable=False)

        self.cell = AIRCell(
            self.img_size,
            self.glimpse_size,
            self.n_appearance,
            transition,
            input_encoder,
            glimpse_encoder,
            glimpse_decoder,
            transform_estimator,
            steps_predictor,
            canvas_init=None,
            discrete_steps=self.discrete_steps,
            explore_eps=self.explore_eps,
            debug=self.debug,
            **kwargs)

        initial_state = self.cell.initial_state(self.obs)

        # Leading (time, batch) dimensions shared by the tensors below
        time_batch = (self.max_steps, self.batch_size)

        # The RNN is unrolled over an all-zeros, time-major input sequence
        rnn_inputs = tf.zeros(time_batch + (1,), name='dummy_sequence')
        outputs, state = tf.nn.dynamic_rnn(
            self.cell,
            rnn_inputs,
            initial_state=initial_state,
            time_major=True)

        # Publish every cell output under the name the cell declares for it
        for attr_name, tensor in zip(self.cell.output_names, outputs):
            setattr(self, attr_name, tensor)

        self.final_state = state[-2]

        # Glimpses are squashed with a sigmoid and gated by presence
        gated_glimpse = self.presence * tf.nn.sigmoid(self.glimpse)
        self.glimpse = tf.reshape(
            gated_glimpse, time_batch + tuple(self.glimpse_size))

        # Canvas reshaped to image size per step, then scaled
        shaped_canvas = tf.reshape(
            self.canvas, time_batch + tuple(self.img_size))
        self.canvas = shaped_canvas * self.output_multiplier

        # The canvas of the last step parameterises the output distribution
        self.final_canvas = self.canvas[-1]
        self.output_distrib = Normal(self.final_canvas, self.output_std)

        step_probs = tf.transpose(tf.squeeze(self.presence_prob))
        self.num_steps_distrib = NumStepsDistribution(step_probs)

        # Presence summed over the time axis
        presence_total = tf.squeeze(tf.reduce_sum(self.presence, 0))
        self.num_step_per_sample = tf.to_float(presence_total)
        self.num_step = tf.reduce_mean(self.num_step_per_sample)
        self.gt_num_steps = tf.squeeze(tf.reduce_sum(self.nums, 0))

    @staticmethod
    def _anneal_weight(init_val,
                       final_val,
                       anneal_type,
                       global_step,
                       anneal_steps,
                       hold_for=0.,
                       steps_div=1.,
                       dtype=tf.float64):
        """Anneal a value from `init_val` towards `final_val`.

        All numeric arguments are cast to `dtype`. The first `hold_for` steps
        do not count towards the schedule. `anneal_type` selects the schedule:
        'exp' uses tf.train.exponential_decay, 'linear' interpolates linearly
        over `anneal_steps`. The result is never smaller than `final_val`.

        Raises NotImplementedError for any other `anneal_type`.
        """

        def as_dtype(value):
            return tf.cast(value, dtype)

        start = as_dtype(init_val)
        end = as_dtype(final_val)
        elapsed = as_dtype(global_step)
        hold = as_dtype(hold_for)
        duration = as_dtype(anneal_steps)
        divisor = as_dtype(steps_div)

        # Steps before `hold_for` are clipped to zero
        elapsed = tf.maximum(elapsed - hold, 0.)

        if anneal_type == 'linear':
            remaining_fraction = 1. - elapsed / duration
            current = end + (start - end) * remaining_fraction
        elif anneal_type == 'exp':
            # Rate chosen so that `duration` steps take `start` to `end`
            rate = tf.pow(end / start, divisor / duration)
            current = tf.train.exponential_decay(start, elapsed, divisor, rate)
        else:
            raise NotImplementedError

        # Clamp from below at the final value
        return tf.maximum(end, current)

    def _prior_loss(self, what_prior, where_scale_prior, where_shift_prior,
                    num_steps_prior, global_step):
        """Creates KL-divergence term of the loss.

        Each prior may be None, in which case its KL term is skipped.

        :param what_prior: object with `loc` and `scale`, or None
        :param where_scale_prior: object with `loc` and `scale`, or None
        :param where_shift_prior: object with `scale` and optionally `loc`
            (membership is tested with `in`), or None; the 'where' term is only
            added when both where priors are given
        :param num_steps_prior: object with `anneal`, `init`, `analytic` and,
            when annealing, `final` and `steps`; `hold_init`, `steps_div` and
            `weight` are optional. May be None, in which case the sampled
            presence is used to weight the per-step KL terms.
        :param global_step: step counter used for annealing the steps prior
        :return: the `Loss` object accumulating all KL terms
        """

        with tf.variable_scope('KL_divergence'):
            prior_loss = Loss()
            if num_steps_prior is not None:
                if num_steps_prior.anneal is not None:
                    with tf.variable_scope('num_steps_prior'):
                        nsp = num_steps_prior

                        hold_init = getattr(nsp, 'hold_init', 0.)
                        steps_div = getattr(nsp, 'steps_div', 1.)
                        steps_prior_success_prob = self._anneal_weight(
                            nsp.init, nsp.final, nsp.anneal, global_step,
                            nsp.steps, hold_init, steps_div)
                else:
                    steps_prior_success_prob = num_steps_prior.init
                self.steps_prior_success_prob = steps_prior_success_prob

                with tf.variable_scope('num_steps'):
                    prior = geometric_prior(steps_prior_success_prob,
                                            self.max_steps)
                    num_steps_posterior_prob = self.num_steps_distrib.prob()
                    steps_kl = tabular_kl(num_steps_posterior_prob, prior)
                    self.kl_num_steps_per_sample = tf.squeeze(
                        tf.reduce_sum(steps_kl, 1))

                    self.kl_num_steps = tf.reduce_mean(
                        self.kl_num_steps_per_sample)
                    tf.summary.scalar('kl_num_steps', self.kl_num_steps)

                    weight = getattr(num_steps_prior, 'weight', 1.)
                    prior_loss.add(self.kl_num_steps,
                                   self.kl_num_steps_per_sample,
                                   weight=weight)

            # The analytic weighting needs the posterior over the number of
            # steps, which only exists when a steps prior was supplied; with
            # no steps prior fall back to the sampled presence instead of
            # dereferencing None.
            use_analytic = (num_steps_prior is not None
                            and num_steps_prior.analytic)
            if use_analytic:
                # reverse cumsum of q(n) needed to compute \E_{q(n)} [ KL[ q(z|n) || p(z|n) ]]
                step_weight = num_steps_posterior_prob[..., 1:]
                step_weight = tf.transpose(step_weight, (1, 0))
                step_weight = tf.cumsum(step_weight, axis=0, reverse=True)
            else:
                step_weight = tf.squeeze(self.presence)

            self.prior_step_weight = step_weight

            # # this prevents optimising the expectation with respect to q(n)
            # # it's similar to the maximisation step of EM: we have a pre-computed expectation
            # # from the E step, and now we're maximising with respect to the argument of the expectation.
            # self.prior_step_weight = tf.stop_gradient(self.prior_step_weight)

            conditional_kl_weight = 1.
            if what_prior is not None:
                with tf.variable_scope('what'):

                    prior = Normal(what_prior.loc, what_prior.scale)
                    posterior = Normal(self.what_loc, self.what_scale)

                    # KL summed over the last axis, weighted per step, then
                    # summed over the leading (step) axis
                    what_kl = _kl(posterior, prior)
                    what_kl = tf.reduce_sum(what_kl,
                                            -1) * self.prior_step_weight
                    what_kl_per_sample = tf.reduce_sum(what_kl, 0)

                    self.kl_what = tf.reduce_mean(what_kl_per_sample)
                    tf.summary.scalar('kl_what', self.kl_what)
                    prior_loss.add(self.kl_what,
                                   what_kl_per_sample,
                                   weight=conditional_kl_weight)

            if where_scale_prior is not None and where_shift_prior is not None:
                with tf.variable_scope('where'):
                    # The last axis interleaves scale and shift entries:
                    # (scale x, shift x, scale y, shift y)
                    usx, utx, usy, uty = tf.split(self.where_loc, 4, 2)
                    ssx, stx, ssy, sty = tf.split(self.where_scale, 4, 2)
                    us = tf.concat((usx, usy), -1)
                    ss = tf.concat((ssx, ssy), -1)

                    scale_distrib = Normal(us, ss)
                    scale_prior = Normal(where_scale_prior.loc,
                                         where_scale_prior.scale)
                    scale_kl = _kl(scale_distrib, scale_prior)

                    ut = tf.concat((utx, uty), -1)
                    st = tf.concat((stx, sty), -1)
                    shift_distrib = Normal(ut, st)

                    # Without an explicit prior mean the posterior mean is
                    # reused, so the shift prior and posterior share a mean
                    if 'loc' in where_shift_prior:
                        shift_mean = where_shift_prior.loc
                    else:
                        shift_mean = ut
                    shift_prior = Normal(shift_mean, where_shift_prior.scale)

                    shift_kl = _kl(shift_distrib, shift_prior)
                    where_kl = tf.reduce_sum(scale_kl + shift_kl,
                                             -1) * self.prior_step_weight
                    where_kl_per_sample = tf.reduce_sum(where_kl, 0)
                    self.kl_where = tf.reduce_mean(where_kl_per_sample)
                    tf.summary.scalar('kl_where', self.kl_where)
                    prior_loss.add(self.kl_where,
                                   where_kl_per_sample,
                                   weight=conditional_kl_weight)

        return prior_loss

    def _reinforce(self, importance_weight, decay_rate):
        """Builds the REINFORCE loss for the distribution over the number of steps.

        If `self.baseline` is set, it is subtracted from `importance_weight`;
        a baseline that is not yet a tf.Tensor is first called to build one and
        its trainable variables are collected in `self.baseline_vars`. If
        `decay_rate` is not None, the weight is additionally centred and
        scaled with moving averages of its mean and variance.

        :return: scalar REINFORCE loss, also stored as `self.reinforce_loss`
        """

        steps_log_prob = self.num_steps_distrib.log_prob(
            self.num_step_per_sample)

        if self.baseline is not None:
            baseline_is_built = isinstance(self.baseline, tf.Tensor)
            if not baseline_is_built:
                # Keep the callable around and replace it by its output
                module = self.baseline
                self.baseline_module = module
                self.baseline = module(self.obs, self.what, self.where,
                                       self.presence, self.final_state)
                self.baseline_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    scope=module.variable_scope.name)
            importance_weight -= self.baseline

        if decay_rate is not None:
            all_axes = range(len(importance_weight.get_shape()))
            batch_mean, batch_var = tf.nn.moments(
                tf.squeeze(importance_weight), axes=all_axes)
            self.imp_weight_moving_mean = make_moving_average(
                'imp_weight_moving_mean', batch_mean, 0., decay_rate)
            self.imp_weight_moving_var = make_moving_average(
                'imp_weight_moving_var', batch_var, 1., decay_rate)

            # Divide by the moving std. dev., but never by less than one
            scale = tf.maximum(tf.sqrt(self.imp_weight_moving_var), 1.)
            centred = importance_weight - self.imp_weight_moving_mean
            importance_weight = centred / scale

        self.importance_weight = importance_weight

        # Summaries of the final weight
        summary_axes = range(len(self.importance_weight.get_shape()))
        weight_mean, weight_var = tf.nn.moments(self.importance_weight,
                                                summary_axes)
        tf.summary.scalar('imp_weight_mean', weight_mean)
        tf.summary.scalar('imp_weight_var', weight_var)

        # Gradients flow only through the log-probability, not the weight
        frozen_weight = tf.stop_gradient(self.importance_weight)
        per_sample_loss = frozen_weight * steps_log_prob
        self.reinforce_loss = tf.reduce_mean(per_sample_loss)
        tf.summary.scalar('reinforce_loss', self.reinforce_loss)

        return self.reinforce_loss

    def _make_baseline_train_step(self, opt, loss, baseline, baseline_vars):
        """Creates the op that fits `baseline` to `loss`.

        The loss is half the mean squared difference between `baseline` and
        `loss`, with `loss` treated as a constant target. It is stored in
        `self.baseline_loss` and minimised with `opt` over `baseline_vars`.

        :return: the minimisation op
        """
        # No gradient flows back into the target
        frozen_target = tf.stop_gradient(loss)
        squared_error = tf.square(frozen_target - baseline)

        self.baseline_loss = .5 * tf.reduce_mean(squared_error)
        tf.summary.scalar('baseline_loss', self.baseline_loss)

        return opt.minimize(self.baseline_loss, var_list=baseline_vars)

    def train_step(self,
                   learning_rate,
                   l2_weight=0.,
                   what_prior=None,
                   where_scale_prior=None,
                   where_shift_prior=None,
                   num_steps_prior=None,
                   use_prior=True,
                   use_reinforce=True,
                   baseline=None,
                   decay_rate=None,
                   optimizer=tf.train.RMSPropOptimizer,
                   opt_kwargs=dict(momentum=.9, centered=True)):
        """Creates the train step and the global_step

        :param learning_rate: float or tf.Tensor
        :param l2_weight: float or tf.Tensor, if > 0. then adds l2 regularisation to the model
        :param what_prior: AttrDict or similar, with `loc` and `scale`, both floats
        :param where_scale_prior: AttrDict or similar, with `loc` and `scale`, both floats
        :param where_shift_prior: AttrDict or similar, with `loc` and `scale`, both floats
        :param num_steps_prior: AttrDict or similar, described as an example:

            >>> num_steps_prior = AttrDict(
            >>> anneal='exp',   # type of annealing of the prior; can be 'exp', 'linear' or None
            >>> init=1. - 1e-7, # initial value of the prior
            >>> final=1e-5,     # final value of the prior
            >>> steps_div=1e4,  # relevant for exponential annealing, see :func: tf.exponential_decay
            >>> steps=1e5,      # number of steps for annealing
            >>> analytic=True
            >>> )

        `init` and `final` describe success probability values in a geometric distribution; for example `init=.9` means
        that the probability of taking a single step is .9, two steps is .9**2 etc.

        :param use_prior: boolean, if False sets the KL-divergence loss term to 0
        :param use_reinforce: boolean, if False doesn't compute gradients for the number of steps
        :param baseline: callable or None, baseline for variance reduction of REINFORCE
        :param decay_rate: float, decay rate to use for exp-moving average for NVIL
        :return: train step and global step
        """

        num_steps_prior['analytic'] = getattr(num_steps_prior, 'analytic',
                                              True)

        self.l2_weight = l2_weight
        self.what_prior = what_prior
        self.where_scale_prior = where_scale_prior
        self.where_shift_prior = where_shift_prior
        self.num_steps_prior = num_steps_prior

        if not hasattr(self, 'baseline'):
            self.baseline = baseline

        self.use_prior = use_prior
        if self.use_prior is not None:
            self.use_prior = tf.Variable(self.use_prior,
                                         trainable=False,
                                         name='use_prior')
            self.toggle_prior = self.use_prior.assign(
                tf.logical_not(self.use_prior))

        self.use_reinforce = use_reinforce

        with tf.variable_scope('loss'):
            global_step = tf.train.get_or_create_global_step()
            loss = Loss()
            self._train_step = []
            self.learning_rate = tf.Variable(learning_rate,
                                             name='learning_rate',
                                             trainable=False)
            make_opt = functools.partial(optimizer, **opt_kwargs)

            # Reconstruction Loss, - \E_q [ p(x | z, n) ]
            rec_loss_per_sample = -self.output_distrib.log_prob(self.obs)
            self.rec_loss_per_sample = tf.reduce_sum(rec_loss_per_sample,
                                                     axis=(1, 2))
            self.rec_loss = tf.reduce_mean(self.rec_loss_per_sample)
            tf.summary.scalar('rec', self.rec_loss)
            loss.add(self.rec_loss, self.rec_loss_per_sample)

            # Prior Loss, KL[ q(z, n | x) || p(z, n) ]
            if use_prior is not None:
                self.prior_loss = self._prior_loss(what_prior,
                                                   where_scale_prior,
                                                   where_shift_prior,
                                                   num_steps_prior,
                                                   global_step)
                tf.summary.scalar('prior', self.prior_loss.value)
                self.prior_weight = tf.to_float(tf.equal(self.use_prior, True))
                loss.add(self.prior_loss, weight=self.prior_weight)

            # REINFORCE
            opt_loss = loss.value
            if use_reinforce:

                self.reinforce_imp_weight = self.rec_loss_per_sample
                if not num_steps_prior.analytic:
                    self.reinforce_imp_weight += self.prior_loss.per_sample

                reinforce_loss = self._reinforce(self.reinforce_imp_weight,
                                                 decay_rate)
                opt_loss += reinforce_loss

            baseline_vars = getattr(self, 'baseline_vars', [])
            model_vars = list(
                set(tf.trainable_variables()) - set(baseline_vars))
            # L2 reg
            if l2_weight > 0.:
                # don't penalise biases
                weights = [w for w in model_vars if len(w.get_shape()) == 2]
                self.l2_loss = l2_weight * sum(map(tf.nn.l2_loss, weights))
                opt_loss += self.l2_loss
                tf.summary.scalar('l2', self.l2_loss)

            opt = make_opt(self.learning_rate)
            gvs = opt.compute_gradients(opt_loss, var_list=model_vars)

            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                self._train_step = opt.apply_gradients(gvs,
                                                       global_step=global_step)

            if self.use_reinforce and self.baseline is not None:
                baseline_opt = make_opt(10 * learning_rate)
                self._baseline_tran_step = self._make_baseline_train_step(
                    baseline_opt, self.reinforce_imp_weight, self.baseline,
                    self.baseline_vars)
                self._true_train_step = self._train_step
                self._train_step = tf.group(self._true_train_step,
                                            self._baseline_tran_step)

            tf.summary.scalar('num_step', self.num_step)
        # Metrics
        gradient_summaries(gvs)
        self.num_step_accuracy = tf.reduce_mean(
            tf.to_float(tf.equal(self.gt_num_steps, self.num_step_per_sample)))

        self.loss = loss
        self.opt_loss = opt_loss
        return self._train_step, global_step
Example #9
0
    def _build_ad_nn(self, tensor_io):
        """Build the critic (value) and actor (Gaussian policy) sub-graphs.

        Input tensors 'state', 'agentIdent', 'sequenceLength' and 'resetRNN'
        are read from ``tensor_io``.

        Critic (variable scope 'critic'): a single 256-unit LSTM run through
        ``self._buildRNN``, two 200-unit ``fc_selu`` layers and a dense layer
        squeezed to one value per row.

        Actor (variable scope 'actor'): works on a ``stop_gradient`` copy of the
        critic features, so the actor losses do not back-propagate into the
        critic layers. It produces means (steering: 0.5 * tanh, accel: tanh)
        and sigmas (steering: 0.1 * sigmoid, accel: 0.25 * sigmoid, each plus
        a beta offset), samples an action from ``Normal(mus, sigmas + 0.01)``
        and clips it to ``mus +/- 2 * sigmas``.

        :param tensor_io: ``TensorIO`` instance providing the input tensors
            and receiving the output tensors
        :return: ``None`` when ``nnc.is_training`` is false (the tensors
            ``policy, value, mus, sigmas`` are registered through
            ``tensor_io.setOutputTensors`` instead); otherwise the tuple
            ``(loss_policy, loss_value)``
        """
        from drlutils.dataflow.tensor_io import TensorIO
        assert (isinstance(tensor_io, TensorIO))
        from drlutils.model.base import get_current_nn_context
        from tensorpack.tfutils.common import get_global_step_var
        # NOTE(review): global_step is only referenced by the commented-out
        # exponential_decay lines further down; it is unused in live code.
        global_step = get_global_step_var()
        nnc = get_current_nn_context()
        is_training = nnc.is_training
        i_state = tensor_io.getInputTensor('state')
        i_agentIdent = tensor_io.getInputTensor('agentIdent')
        i_sequenceLength = tensor_io.getInputTensor('sequenceLength')
        i_resetRNN = tensor_io.getInputTensor('resetRNN')
        l = i_state
        # l = tf.Print(l, [i_state, tf.shape(i_state)], 'State = ')
        # l = tf.Print(l, [i_agentIdent, tf.shape(i_agentIdent)], 'agentIdent = ')
        # l = tf.Print(l, [i_sequenceLength, tf.shape(i_sequenceLength)], 'SeqLen = ')
        # l = tf.Print(l, [i_resetRNN, tf.shape(i_resetRNN)], 'resetRNN = ')
        with tf.variable_scope('critic', reuse=nnc.reuse) as vs:

            def _get_cell():
                cell = tf.nn.rnn_cell.BasicLSTMCell(256)
                # if is_training:
                #     cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.9)
                return cell

            cell = tf.nn.rnn_cell.MultiRNNCell([_get_cell() for _ in range(1)])
            rnn_outputs = self._buildRNN(
                l,
                cell,
                tensor_io.batchSize,
                i_agentIdent=i_agentIdent,
                i_sequenceLength=i_sequenceLength,
                i_resetRNN=i_resetRNN,
            )
            # Collapse all leading dimensions so that every row is one feature
            # vector of the RNN output size.
            rnn_outputs = tf.reshape(
                rnn_outputs, [-1, rnn_outputs.get_shape().as_list()[-1]])
            l = rnn_outputs
            from ad_cur.autodrive.model.selu import fc_selu
            for lidx in range(2):
                l = fc_selu(
                    l,
                    200,
                    keep_prob=1.,  # we train on sensor input only, so key information must not be dropped
                    is_training=is_training,
                    name='fc-{}'.format(lidx))
            value = tf.layers.dense(l, 1, name='fc-value')
            value = tf.squeeze(value, [1], name="value")
            # Remember the critic's trainable variables once; they are used
            # below for the L2 regularisation term.
            if not hasattr(self, '_weights_critic'):
                self._weights_critic = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

        with tf.variable_scope('actor', reuse=nnc.reuse) as vs:
            # Block gradients so the actor head does not train the critic layers.
            l = tf.stop_gradient(l)
            l = tf.layers.dense(l,
                                128,
                                activation=tf.nn.relu6,
                                name='fc-actor')
            # Steering mean lies in (-0.5, 0.5), acceleration mean in (-1, 1).
            mu_steering = 0.5 * tf.layers.dense(
                l, 1, activation=tf.nn.tanh, name='fc-mu-steering')
            mu_accel = tf.layers.dense(l,
                                       1,
                                       activation=tf.nn.tanh,
                                       name='fc-mu-accel')
            mus = tf.concat([mu_steering, mu_accel], axis=-1)

            # mus = tf.layers.dense(l, 2, activation=tf.nn.tanh, name='fc-mus')
            # sigmas = tf.layers.dense(l, 2, activation=tf.nn.softplus, name='fc-sigmas')
            # sigmas = tf.clip_by_value(sigmas, -0.001, 0.5)
            # NOTE(review): saturating_sigmoid is defined but never called in
            # this method.
            def saturating_sigmoid(x):
                """Saturating sigmoid: 1.2 * sigmoid(x) - 0.1 cut to [0, 1]."""
                with tf.name_scope("saturating_sigmoid", [x]):
                    y = tf.sigmoid(x)
                    return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1))

            # Raw sigmas before the beta offset: steering in (0, 0.1),
            # acceleration in (0, 0.25).
            sigma_steering_ = 0.1 * tf.layers.dense(
                l, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering')
            sigma_accel_ = 0.25 * tf.layers.dense(
                l, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel')

            if not nnc.is_evaluating:
                # The beta offsets are looked up by name, so tensors called
                # 'actor/sigma_beta_steering:0' and 'actor/sigma_beta_accel:0'
                # must already exist in the default graph at this point.
                sigma_beta_steering = tf.get_default_graph(
                ).get_tensor_by_name('actor/sigma_beta_steering:0')
                sigma_beta_accel = tf.get_default_graph().get_tensor_by_name(
                    'actor/sigma_beta_accel:0')
                # NOTE(review): this overwrites the steering beta fetched just
                # above with a constant, so only the accel beta comes from the
                # graph. Confirm whether this is intentional.
                sigma_beta_steering = tf.constant(1e-4)
                # sigma_beta_steering_exp = tf.train.exponential_decay(0.3, global_step, 1000, 0.5, name='sigma/beta/steering/exp')
                # sigma_beta_accel_exp = tf.train.exponential_decay(0.5, global_step, 5000, 0.5, name='sigma/beta/accel/exp')
            else:
                sigma_beta_steering = tf.constant(1e-4)
                sigma_beta_accel = tf.constant(1e-4)
            sigma_steering = (sigma_steering_ + sigma_beta_steering)
            sigma_accel = (sigma_accel_ + sigma_beta_accel)

            sigmas = tf.concat([sigma_steering, sigma_accel], axis=-1)
            # if is_training:
            #     pass
            #     # Without sigma_beta, convergence is slow and unstable. Guessed reasons:
            #     #   1. exploring as widely as possible early in training keeps the network out of local optima
            #     #   2. a too-small sigma early on makes the normal_dist log_prob too large, causing oversized gradient updates; the network is deformed from the start and hard to recover
            #
            # if is_training:
            #     sigmas += sigma_beta_steering
            # sigma_steering = tf.clip_by_value(sigma_steering, sigma_beta_steering, 0.5)
            # sigma_accel = tf.clip_by_value(sigma_accel, sigma_beta_accel, 0.5)
            # sigmas = tf.clip_by_value(sigmas, 0.1, 0.5)
            # sigmas_orig = sigmas
            # sigmas = sigmas + sigma_beta_steering
            # sigmas = tf.minimum(sigmas + 0.1, 100)
            # sigmas = tf.clip_by_value(sigmas, sigma_beta_steering, 1)
            # sigma_steering += sigma_beta_steering
            # sigma_accel += sigma_beta_accel

            # mus = tf.concat([mu_steering, mu_accel], axis=-1)

            from tensorflow.contrib.distributions import Normal
            # The sampling distribution uses sigmas + 0.01 as its scale.
            dists = Normal(mus, sigmas + 0.01)
            policy = tf.squeeze(dists.sample([1]), [0])
            # Clip the sampled action to within two sigmas of the mean (the
            # clip bounds use sigmas without the extra 0.01).
            policy = tf.clip_by_value(policy, mus - 2 * sigmas,
                                      mus + 2 * sigmas)
            if is_training:
                self._addMovingSummary(
                    tf.reduce_mean(mu_steering, name='mu/steering/mean'),
                    tf.reduce_mean(mu_accel, name='mu/accel/mean'),
                    tf.reduce_mean(sigma_steering, name='sigma/steering/mean'),
                    tf.reduce_max(sigma_steering, name='sigma/steering/max'),
                    tf.reduce_mean(sigma_accel, name='sigma/accel/mean'),
                    tf.reduce_max(sigma_accel, name='sigma/accel/max'),
                    # sigma_beta_accel,
                    # sigma_beta_steering,
                )
            # actions = tf.Print(actions, [mus, sigmas, tf.concat([sigma_steering_, sigma_accel_], -1), actions],
            #                    'mu/sigma/sigma.orig/act=', summarize=4)
            if not hasattr(self, '_weights_actor'):
                self._weights_actor = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
        # Inference graph: expose the outputs and stop; no losses are built.
        if not is_training:
            tensor_io.setOutputTensors(policy, value, mus, sigmas)
            return

        i_actions = tensor_io.getInputTensor("action")
        # i_actions = tf.Print(i_actions, [i_actions], 'actions = ')
        # Merge the first two dimensions of the input into one, keeping the rest.
        i_actions = tf.reshape(i_actions,
                               [-1] + i_actions.get_shape().as_list()[2:])
        log_probs = dists.log_prob(i_actions)
        # exp_v = tf.transpose(
        #     tf.multiply(tf.transpose(log_probs), advantage))
        # exp_v = tf.multiply(log_probs, advantage)
        i_advantage = tensor_io.getInputTensor("advantage")
        i_advantage = tf.reshape(i_advantage,
                                 [-1] + i_advantage.get_shape().as_list()[2:])
        # Advantage-weighted log-probability, broadcast over the action dims.
        exp_v = log_probs * tf.expand_dims(i_advantage, -1)
        entropy = dists.entropy()
        # Non-trainable entropy bonus coefficient, initialised to 0.01.
        entropy_beta = tf.get_variable(
            'entropy_beta',
            shape=[],
            initializer=tf.constant_initializer(0.01),
            trainable=False)
        exp_v = entropy_beta * entropy + exp_v
        # Policy loss: negated objective, summed over action dims and averaged
        # over rows.
        loss_policy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1),
                                     name='loss/policy')

        i_futurereward = tensor_io.getInputTensor("futurereward")
        i_futurereward = tf.reshape(i_futurereward, [-1] +
                                    i_futurereward.get_shape().as_list()[2:])
        # Value loss: half squared error between predicted value and the
        # 'futurereward' input.
        loss_value = tf.reduce_mean(0.5 * tf.square(value - i_futurereward))

        # Only reported through the moving summary below; not added to either
        # returned loss.
        loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1),
                                      name='xentropy_loss')

        from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
        # L2 regularisation (scale 1e-4) over the critic variables is folded
        # into the value loss.
        loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4),
                                                   self._weights_critic)
        loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
        loss_value += loss_l2_regularizer
        loss_value = tf.identity(loss_value, name='loss/value')

        # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer])

        self._addParamSummary([('.*', ['rms', 'absmax'])])
        pred_reward = tf.reduce_mean(value, name='predict_reward')
        import tensorpack.tfutils.symbolic_functions as symbf
        advantage = symbf.rms(i_advantage, name='rms_advantage')
        self._addMovingSummary(
            loss_policy,
            loss_value,
            loss_entropy,
            pred_reward,
            advantage,
            loss_l2_regularizer,
            tf.reduce_mean(policy[:, 0], name='actor/steering/mean'),
            tf.reduce_mean(policy[:, 1], name='actor/accel/mean'),
        )
        return loss_policy, loss_value
Example #10
0
    model_lambda = tf.exp(model_log_lambda)
    model_gamma = tf.exp(model_log_gamma)
    model_w_1 = tf.Variable(tf.zeros([n_feats, n_hidden]))
    model_b_1 = tf.Variable(tf.zeros([n_hidden]))
    model_w_2 = tf.Variable(tf.zeros([n_hidden, 1]))
    model_b_2 = tf.Variable(tf.zeros([]))

    # Compute the prediction from the network.
    with tf.variable_scope("prediction"):
        pred = tf.matmul(
            tf.nn.relu(tf.matmul(model_X, model_w_1) + model_b_1), model_w_2
        ) + model_b_2
    # Likelihood function.
    with tf.variable_scope("likelihood"):
        log_l_dist = Normal(pred, tf.reciprocal(tf.sqrt(model_gamma)))
        log_l = tf.reduce_sum(log_l_dist.log_prob(model_y))
    # Priors.
    with tf.variable_scope("priors"):
        prior_lambda = Gamma(alpha, beta)
        prior_gamma = Gamma(alpha, beta)
        prior_w_1 = Normal(
            tf.zeros([n_feats, n_hidden]),
            tf.reciprocal(tf.sqrt(model_lambda))
        )
        prior_b_1 = Normal(
            tf.zeros([n_hidden]),
            tf.reciprocal(tf.sqrt(model_lambda))
        )
        prior_w_2 = Normal(
            tf.zeros([n_hidden, 1]),
            tf.reciprocal(tf.sqrt(model_lambda))
    def __init__(self, args, d, logdir):
        """Build the TensorFlow graph of the dynamic Bernoulli embedding model.

        ``args``, ``d`` and ``logdir`` are forwarded unchanged to the parent
        constructor. The attributes read here (``alpha_init``,
        ``alpha_trainable``, ``rho_init``, ``T``, ``L``, ``K``, ``sig``,
        ``cs``, ``n_minibatch``, ``unigram``, ``ns``, ``n_epochs``) are
        presumably set by that parent constructor -- TODO confirm.

        Creates:
          * ``self.alpha``: context embedding variable.
          * ``self.rho_t``: dict of target embedding variables keyed by
            ``-1, 0, ..., T - 1``.
          * ``self.log_prior``: sum of a Normal(0, sig) log-density on
            ``alpha`` and on ``rho_t[-1]``, plus a Normal(0, sig / 100)
            log-density on every difference ``rho_t[t] - rho_t[t - 1]``.
          * ``self.placeholders``, ``self.y_pos``, ``self.y_neg``: per-``t``
            input placeholders and Bernoulli likelihoods.
          * ``self.ll_pos`` / ``self.ll_neg``: summed log-likelihoods of the
            positive and negative samples.
          * ``self.loss``: ``-(n_epochs * (ll_pos + ll_neg) + log_prior)``.

        :param args: forwarded to the parent constructor
        :param d: forwarded to the parent constructor
        :param logdir: forwarded to the parent constructor
        """
        super(dynamic_bern_emb_model, self).__init__(args, d, logdir)

        # Number of context positions on each side of the target word.
        half_cs = int(self.cs / 2)

        with tf.name_scope('model'):
            with tf.name_scope('embeddings'):
                self.alpha = tf.Variable(self.alpha_init,
                                         name='alpha',
                                         trainable=self.alpha_trainable)

                # One embedding matrix per t, plus index -1 which only serves
                # as the anchor for the first difference in the prior below.
                self.rho_t = {}
                for t in range(-1, self.T):
                    self.rho_t[t] = tf.Variable(
                        self.rho_init +
                        0.001 * tf.random_normal([self.L, self.K]) / self.K,
                        name='rho_' + str(t))

                with tf.name_scope('priors'):
                    global_prior = Normal(loc=0.0, scale=self.sig)
                    local_prior = Normal(loc=0.0, scale=self.sig / 100.0)

                    self.log_prior = tf.reduce_sum(
                        global_prior.log_prob(self.alpha))
                    # Accumulate (+=) rather than assign: a plain assignment
                    # here would discard the prior on alpha computed above.
                    self.log_prior += tf.reduce_sum(
                        global_prior.log_prob(self.rho_t[-1]))
                    for t in range(self.T):
                        self.log_prior += tf.reduce_sum(
                            local_prior.log_prob(self.rho_t[t] -
                                                 self.rho_t[t - 1]))

            with tf.name_scope('likelihood'):
                self.placeholders = {}
                self.y_pos = {}
                self.y_neg = {}
                self.ll_pos = 0.0
                self.ll_neg = 0.0
                for t in range(self.T):
                    # Index Masks
                    # Target positions inside the placeholder of length
                    # n_minibatch[t] + cs.
                    p_mask = tf.range(half_cs, self.n_minibatch[t] + half_cs)
                    rows = tf.tile(
                        tf.expand_dims(tf.range(0, half_cs), [0]),
                        [self.n_minibatch[t], 1])
                    columns = tf.tile(
                        tf.expand_dims(tf.range(0, self.n_minibatch[t]), [1]),
                        [1, half_cs])

                    # For target i: the half_cs positions before it and the
                    # half_cs positions after it (the target itself is skipped).
                    ctx_mask = tf.concat([
                        rows + columns, rows + columns + half_cs + 1
                    ], 1)

                    # Data Placeholder
                    self.placeholders[t] = tf.placeholder(
                        tf.int32, shape=(self.n_minibatch[t] + self.cs))

                    # Target and Context Indices
                    p_idx = tf.gather(self.placeholders[t], p_mask)
                    ctx_idx = tf.squeeze(
                        tf.gather(self.placeholders[t], ctx_mask))

                    # Negative samples
                    unigram_logits = tf.tile(
                        tf.expand_dims(tf.log(tf.constant(self.unigram)), [0]),
                        [self.n_minibatch[t], 1])
                    n_idx = tf.multinomial(unigram_logits, self.ns)

                    # Context vectors
                    ctx_alphas = tf.gather(self.alpha, ctx_idx)

                    p_rho = tf.squeeze(tf.gather(self.rho_t[t], p_idx))
                    n_rho = tf.gather(self.rho_t[t], n_idx)

                    # Natural parameter
                    ctx_sum = tf.reduce_sum(ctx_alphas, [1])
                    p_eta = tf.expand_dims(
                        tf.reduce_sum(tf.multiply(p_rho, ctx_sum), -1), 1)
                    n_eta = tf.reduce_sum(
                        tf.multiply(
                            n_rho,
                            tf.tile(tf.expand_dims(ctx_sum, 1),
                                    [1, self.ns, 1])), -1)

                    # Conditional likelihood
                    self.y_pos[t] = Bernoulli(logits=p_eta)
                    self.y_neg[t] = Bernoulli(logits=n_eta)

                    self.ll_pos += tf.reduce_sum(self.y_pos[t].log_prob(1.0))
                    self.ll_neg += tf.reduce_sum(self.y_neg[t].log_prob(0.0))

            self.loss = -(self.n_epochs *
                          (self.ll_pos + self.ll_neg) + self.log_prior)
Example #12
0
    model_log_alpha = tf.Variable(tf.zeros([]))
    model_alpha = tf.exp(model_log_alpha)
    # Compute prior.
    with tf.variable_scope("priors"):
        w_prior = Normal(tf.zeros([n_feats, 1]),
                         tf.reciprocal(tf.sqrt(model_alpha)))
        alpha_prior = Gamma(1., 0.01)
    # Compute the likelihood function.
    with tf.variable_scope("likelihood"):
        logits = tf.matmul(model_X, model_w)
        log_l = -tf.reduce_sum(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=model_y,
                                                    logits=logits))
    # Compute the log-posterior of the model.
    log_p = (log_l * (n_train / n_batch) +
             tf.reduce_sum(w_prior.log_prob(model_w)) +
             alpha_prior.log_prob(model_alpha))


def evaluate(sampler, data_feed):
    """Return the test-set accuracy of the particle-averaged prediction.

    ``sampler.function_posterior`` is asked to evaluate the module-level
    ``logits`` with ``data_feed``; the result is averaged over its first axis
    (presumably the particle axis -- TODO confirm) and a positive mean logit
    counts as a positive prediction. The predictions are compared with the
    flattened module-level ``y_test``.

    :param sampler: object exposing ``function_posterior(tensor, feed)``
    :param data_feed: feed passed through to ``function_posterior``
    :return: fraction of predictions equal to ``y_test``
    """
    per_particle_logits = sampler.function_posterior(logits, data_feed)
    # A mean logit above zero corresponds to a predicted probability above 0.5
    # for the mean logit (not for the mean of the per-particle probabilities).
    predicted_positive = per_particle_logits.mean(axis=0) > 0.
    is_correct = predicted_positive == y_test.ravel()
    return np.mean(is_correct)

    def __init__(self, d, K, sig, sess, logdir):
        """Build the Bernoulli embedding graph, its optimizer and summaries.

        The constructor creates the input placeholder, the context index
        masks, the ``rho`` (target) and ``alpha`` (context) embedding
        variables with a Normal(0, sig) log-prior, the Bernoulli likelihoods
        of positive and negative samples, and the loss
        ``-(d.N / d.n_minibatch * log_likelihood + log_prior)``. It then adds
        an Adam training op, runs the global variable initializer in ``sess``,
        registers summaries, and sets up a summary writer, a saver and the
        embedding projector config under ``logdir``.

        :param d: data object; the attributes read here are ``cs``,
            ``n_minibatch``, ``L``, ``unigram``, ``ns`` and ``N``
        :param K: second dimension of the ``[d.L, K]`` embedding matrices
        :param sig: scale of the Normal prior on ``rho`` and ``alpha``
        :param sess: TensorFlow session used for initialization and whose
            graph is handed to the summary writer
        :param logdir: directory passed to ``tf.summary.FileWriter``
        """
        self.K = K
        self.sig = sig
        self.sess = sess
        self.logdir = logdir

        # Integer half-window. ``d.cs / 2`` is a float under Python 3 true
        # division, which is not a valid ``tf.tile`` multiple and cannot be
        # added to an int32 tensor; ``int(...)`` keeps the Python 2 result.
        half_cs = int(d.cs / 2)

        with tf.name_scope('model'):
            # Data Placeholder
            with tf.name_scope('input'):
                self.placeholders = tf.placeholder(tf.int32)
                self.words = self.placeholders

            # Index Masks
            with tf.name_scope('context_mask'):
                self.p_mask = tf.cast(
                    tf.range(half_cs, d.n_minibatch + half_cs), tf.int32)
                rows = tf.cast(
                    tf.tile(tf.expand_dims(tf.range(0, half_cs), [0]),
                            [d.n_minibatch, 1]), tf.int32)
                columns = tf.cast(
                    tf.tile(tf.expand_dims(tf.range(0, d.n_minibatch), [1]),
                            [1, half_cs]), tf.int32)
                # For target i: the half_cs positions before it and the
                # half_cs positions after it (the target itself is skipped).
                self.ctx_mask = tf.concat(
                    [rows + columns, rows + columns + half_cs + 1], 1)

            with tf.name_scope('embeddings'):
                # Embedding vectors
                self.rho = tf.Variable(tf.random_normal([d.L, self.K]) /
                                       self.K,
                                       name='rho')

                # Context vectors
                self.alpha = tf.Variable(tf.random_normal([d.L, self.K]) /
                                         self.K,
                                         name='alpha')

                with tf.name_scope('priors'):
                    prior = Normal(loc=0.0, scale=self.sig)
                    self.log_prior = tf.reduce_sum(
                        prior.log_prob(self.rho) + prior.log_prob(self.alpha))

            with tf.name_scope('natural_param'):
                # Target and Context Indices
                with tf.name_scope('target_word'):
                    self.p_idx = tf.gather(self.words, self.p_mask)
                    self.p_rho = tf.squeeze(tf.gather(self.rho, self.p_idx))

                # Negative samples
                with tf.name_scope('negative_samples'):
                    unigram_logits = tf.tile(
                        tf.expand_dims(tf.log(tf.constant(d.unigram)), [0]),
                        [d.n_minibatch, 1])
                    self.n_idx = tf.multinomial(unigram_logits, d.ns)
                    self.n_rho = tf.gather(self.rho, self.n_idx)

                with tf.name_scope('context'):
                    self.ctx_idx = tf.squeeze(
                        tf.gather(self.words, self.ctx_mask))
                    self.ctx_alphas = tf.gather(self.alpha, self.ctx_idx)

                # Natural parameter
                ctx_sum = tf.reduce_sum(self.ctx_alphas, [1])
                self.p_eta = tf.expand_dims(
                    tf.reduce_sum(tf.multiply(self.p_rho, ctx_sum), -1), 1)
                self.n_eta = tf.reduce_sum(
                    tf.multiply(
                        self.n_rho,
                        tf.tile(tf.expand_dims(ctx_sum, 1), [1, d.ns, 1])), -1)

            # Conditional likelihood
            self.y_pos = Bernoulli(logits=self.p_eta)
            self.y_neg = Bernoulli(logits=self.n_eta)

            self.ll_pos = tf.reduce_sum(self.y_pos.log_prob(1.0))
            self.ll_neg = tf.reduce_sum(self.y_neg.log_prob(0.0))

            self.log_likelihood = self.ll_pos + self.ll_neg

            # Rescale the minibatch log-likelihood by d.N / d.n_minibatch
            # before combining it with the prior.
            scale = 1.0 * d.N / d.n_minibatch
            self.loss = -(scale * self.log_likelihood + self.log_prior)

            # Training
            optimizer = tf.train.AdamOptimizer()
            self.train = optimizer.minimize(self.loss)
            with self.sess.as_default():
                tf.global_variables_initializer().run()
            variable_summaries('rho', self.rho)
            variable_summaries('alpha', self.alpha)
            with tf.name_scope('objective'):
                tf.summary.scalar('loss', self.loss)
                tf.summary.scalar('priors', self.log_prior)
                tf.summary.scalar('ll_pos', self.ll_pos)
                tf.summary.scalar('ll_neg', self.ll_neg)
            self.summaries = tf.summary.merge_all()
            self.train_writer = tf.summary.FileWriter(self.logdir,
                                                      self.sess.graph)
            self.saver = tf.train.Saver()
            config = projector.ProjectorConfig()

            # Register both embedding matrices with the projector; the tensor
            # names follow the name scopes opened above.
            alpha = config.embeddings.add()
            alpha.tensor_name = 'model/embeddings/alpha'
            alpha.metadata_path = '../vocab.tsv'
            rho = config.embeddings.add()
            rho.tensor_name = 'model/embeddings/rho'
            rho.metadata_path = '../vocab.tsv'
            projector.visualize_embeddings(self.train_writer, config)
Example #14
0
def main(_):
    """Fit a sparse Poisson regression over tree balances and score it.

    Every setting is read from the module-level ``FLAGS`` through ``Options``.
    The function then

    1. filters the training table by sample / feature totals, aligns it with
       the metadata, builds the design matrix with ``dmatrix`` and derives
       the balance basis from the tree;
    2. restricts the hold-out table to the training observations and aligns
       it the same way;
    3. builds a TensorFlow graph with Normal priors on the intercept row
       (``qgamma``) and the coefficients (``qbeta``), a Poisson likelihood on
       a minibatch of stored (non-zero) entries and a Poisson likelihood,
       evaluated at a count of zero, on a minibatch of negative entries
       returned by ``get_batch``;
    4. trains with Adam and global-norm gradient clipping, writing summaries
       every step, a full run trace every 1000 steps, and a checkpoint
       whenever ``checkpoint_interval`` seconds have passed;
    5. prints the elapsed time and the two ``cross_validation`` scores.

    Parameters
    ----------
    _ : object
        Ignored.
    """

    opts = Options(save_path=FLAGS.save_path,
                   train_biom=FLAGS.train_biom,
                   test_biom=FLAGS.test_biom,
                   train_metadata=FLAGS.train_metadata,
                   test_metadata=FLAGS.test_metadata,
                   formula=FLAGS.formula,
                   tree=FLAGS.tree,
                   learning_rate=FLAGS.learning_rate,
                   clipping_size=FLAGS.clipping_size,
                   beta_mean=FLAGS.beta_mean,
                   beta_scale=FLAGS.beta_scale,
                   gamma_mean=FLAGS.gamma_mean,
                   gamma_scale=FLAGS.gamma_scale,
                   epochs_to_train=FLAGS.epochs_to_train,
                   num_neg_samples=FLAGS.num_neg_samples,
                   batch_size=FLAGS.batch_size,
                   min_sample_count=FLAGS.min_sample_count,
                   min_feature_count=FLAGS.min_feature_count,
                   statistics_interval=FLAGS.statistics_interval,
                   summary_interval=FLAGS.summary_interval,
                   checkpoint_interval=FLAGS.checkpoint_interval)

    # preprocessing
    train_table, train_metadata = opts.train_table, opts.train_metadata
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]

    # Keep samples that have metadata and a total count above the threshold,
    # and observations whose total count is above the feature threshold.
    sample_filter = lambda val, id_, md: (
        (id_ in train_metadata.index) and np.sum(val) > opts.min_sample_count)
    read_filter = lambda val, id_, md: np.sum(val) > opts.min_feature_count
    metadata_filter = lambda val, id_, md: id_ in train_metadata.index

    train_table = train_table.filter(metadata_filter, axis='sample')
    train_table = train_table.filter(sample_filter, axis='sample')
    train_table = train_table.filter(read_filter, axis='observation')
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]
    sort_f = lambda xs: [xs[train_metadata.index.get_loc(x)] for x in xs]
    train_table = train_table.sort(sort_f=sort_f, axis='sample')
    train_metadata = dmatrix(opts.formula,
                             train_metadata,
                             return_type='dataframe')
    tree = opts.tree
    train_table, tree = match_tips(train_table, tree)
    basis, _ = sparse_balance_basis(tree)
    basis = basis.T

    # hold out data preprocessing
    test_table, test_metadata = opts.test_table, opts.test_metadata
    metadata_filter = lambda val, id_, md: id_ in test_metadata.index
    # Only observations that survived training-side filtering are kept.
    obs_lookup = set(train_table.ids(axis='observation'))
    feat_filter = lambda val, id_, md: id_ in obs_lookup
    test_table = test_table.filter(metadata_filter, axis='sample')
    test_table = test_table.filter(feat_filter, axis='observation')
    test_metadata = test_metadata.loc[test_table.ids(axis='sample')]
    sort_f = lambda xs: [xs[test_metadata.index.get_loc(x)] for x in xs]
    test_table = test_table.sort(sort_f=sort_f, axis='sample')
    test_metadata = dmatrix(opts.formula,
                            test_metadata,
                            return_type='dataframe')
    test_table, tree = match_tips(test_table, tree)

    p = train_metadata.shape[1]  # number of covariates
    G_data = train_metadata.values
    y_data = train_table.matrix_data.tocoo().T
    y_test = np.array(test_table.matrix_data.todense()).T
    N, D = y_data.shape
    save_path = opts.save_path
    learning_rate = opts.learning_rate
    batch_size = opts.batch_size
    gamma_mean, gamma_scale = opts.gamma_mean, opts.gamma_scale
    beta_mean, beta_scale = opts.beta_mean, opts.beta_scale
    num_neg = opts.num_neg_samples
    clipping_size = opts.clipping_size

    # One "epoch" is the number of minibatches needed to visit every stored
    # entry of the sparse training matrix once.
    epoch = y_data.nnz // batch_size
    num_iter = int(opts.epochs_to_train * epoch)
    holdout_size = test_metadata.shape[0]
    checkpoint_interval = opts.checkpoint_interval

    # Model code
    with tf.Graph().as_default(), tf.Session() as session:
        with tf.device("/cpu:0"):
            # Place holder variables to accept input data
            Gpos_ph = tf.placeholder(tf.float32, [batch_size, p], name='G_pos')
            Gneg_ph = tf.placeholder(tf.float32, [num_neg, p], name='G_neg')
            G_holdout = tf.placeholder(tf.float32, [holdout_size, p],
                                       name='G_holdout')
            Y_holdout = tf.placeholder(tf.float32, [holdout_size, D],
                                       name='Y_holdout')
            Y_ph = tf.placeholder(tf.float32, [batch_size], name='Y_ph')

            pos_row = tf.placeholder(tf.int32,
                                     shape=[batch_size],
                                     name='pos_row')
            pos_col = tf.placeholder(tf.int32,
                                     shape=[batch_size],
                                     name='pos_col')
            neg_row = tf.placeholder(tf.int32, shape=[num_neg], name='neg_row')
            neg_col = tf.placeholder(tf.int32, shape=[num_neg], name='neg_col')
            # Negative entries are always scored against a count of zero.
            neg_data = tf.zeros(shape=[num_neg],
                                name='neg_data',
                                dtype=tf.float32)
            # Totals used to rescale the two minibatch likelihood terms.
            total_zero = tf.constant(y_data.shape[0] * y_data.shape[1] -
                                     y_data.nnz,
                                     dtype=tf.float32)
            total_nonzero = tf.constant(y_data.nnz, dtype=tf.float32)

            # Define PointMass Variables first
            qgamma = tf.Variable(tf.random_normal([1, D - 1]), name='qgamma')
            qbeta = tf.Variable(tf.random_normal([p, D - 1]), name='qB')
            theta = tf.Variable(tf.random_normal([N, 1]), name='theta')

            # Distributions species bias
            gamma = Normal(loc=tf.zeros([1, D - 1]) + gamma_mean,
                           scale=tf.ones([1, D - 1]) * gamma_scale,
                           name='gamma')
            # regression coefficents distribution
            beta = Normal(loc=tf.zeros([p, D - 1]) + beta_mean,
                          scale=tf.ones([p, D - 1]) * beta_scale,
                          name='B')
            Bprime = tf.concat([qgamma, qbeta], axis=0)

            # Add bias terms for samples
            Gpos = tf.concat([tf.ones([batch_size, 1]), Gpos_ph], axis=1)
            Gneg = tf.concat([tf.ones([num_neg, 1]), Gneg_ph], axis=1)

            # Convert basis to SparseTensor
            psi = tf.SparseTensor(indices=np.mat([basis.row,
                                                  basis.col]).transpose(),
                                  values=basis.data,
                                  dense_shape=basis.shape)

            V = tf.transpose(
                tf.sparse_tensor_dense_matmul(psi, tf.transpose(Bprime)))

            # sparse matrix multiplication for positive samples
            pos_prime = tf.reduce_sum(tf.multiply(
                Gpos, tf.transpose(tf.gather(V, pos_col, axis=1))),
                                      axis=1)
            pos_phi = tf.reshape(tf.gather(theta, pos_row),
                                 shape=[batch_size]) + pos_prime
            Y = Poisson(log_rate=pos_phi, name='Y')

            # sparse matrix multiplication for negative samples
            neg_prime = tf.reduce_sum(tf.multiply(
                Gneg, tf.transpose(tf.gather(V, neg_col, axis=1))),
                                      axis=1)
            neg_phi = tf.reshape(tf.gather(theta, neg_row),
                                 shape=[num_neg]) + neg_prime
            neg_poisson = Poisson(log_rate=neg_phi, name='neg_counts')

            # Negative log joint: priors plus the two minibatch likelihoods,
            # each scaled up to the number of entries it stands in for.
            loss = -(
              tf.reduce_sum(gamma.log_prob(qgamma)) + \
              tf.reduce_sum(beta.log_prob(qbeta)) + \
              tf.reduce_sum(Y.log_prob(Y_ph)) * (total_nonzero / batch_size) + \
              tf.reduce_sum(neg_poisson.log_prob(neg_data)) * (total_zero / num_neg)
            )

            optimizer = tf.train.AdamOptimizer(learning_rate,
                                               beta1=0.9,
                                               beta2=0.9)
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            gradients, _ = tf.clip_by_global_norm(gradients, clipping_size)
            train = optimizer.apply_gradients(zip(gradients, variables))

            with tf.name_scope('accuracy'):
                holdout_count = tf.reduce_sum(Y_holdout, axis=1)
                spred = tf.nn.softmax(
                    tf.transpose(
                        tf.sparse_tensor_dense_matmul(
                            psi,
                            tf.transpose(
                                (tf.matmul(G_holdout, qbeta) + qgamma)))))

                pred = tf.reshape(holdout_count, [-1, 1]) * spred
                # Mean absolute error on the hold-out counts; the summary tag
                # already says so, the local name now matches.
                mae = tf.reduce_mean(tf.squeeze(tf.abs(pred - Y_holdout)))
                tf.summary.scalar('mean_absolute_error', mae)

            tf.summary.scalar('loss', loss)
            tf.summary.histogram('qbeta', qbeta)
            tf.summary.histogram('qgamma', qgamma)
            tf.summary.histogram('theta', theta)
            merged = tf.summary.merge_all()

            tf.global_variables_initializer().run()

            writer = tf.summary.FileWriter(save_path, session.graph)
            losses = np.zeros(num_iter)
            # Nothing is written to run.log in this function. It is still
            # created/truncated as before, but the handle is closed right
            # away instead of being leaked for the whole training run.
            open(os.path.join(save_path, 'run.log'), 'w').close()
            gen = get_batch(batch_size,
                            N,
                            D,
                            y_data.data,
                            y_data.row,
                            y_data.col,
                            num_neg=num_neg)
            last_checkpoint_time = 0
            start_time = time.time()
            saver = tf.train.Saver()
            fetches = [train, merged, loss, gradients]
            for i in range(num_iter):
                batch = next(gen)
                (positive_row, positive_col, positive_data, negative_row,
                 negative_col, negative_data) = batch
                feed_dict = {
                    Y_ph: positive_data,
                    Y_holdout: y_test.astype(np.float32),
                    G_holdout: test_metadata.values.astype(np.float32),
                    Gpos_ph: G_data[positive_row, :],
                    Gneg_ph: G_data[negative_row, :],
                    pos_row: positive_row,
                    pos_col: positive_col,
                    neg_row: negative_row,
                    neg_col: negative_col
                }
                if i % 1000 == 0:
                    # Every 1000th step also records a full execution trace.
                    # (A former `elif i % 5000 == 0` branch could never run,
                    # since every multiple of 5000 is a multiple of 1000.)
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, train_loss, _ = session.run(
                        fetches,
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    writer.add_run_metadata(run_metadata, 'step%d' % i)
                else:
                    _, summary, train_loss, _ = session.run(
                        fetches, feed_dict=feed_dict)
                writer.add_summary(summary, i)

                # Checkpoints are time-based; with last_checkpoint_time
                # starting at 0 the first iteration always saves one.
                now = time.time()
                if now - last_checkpoint_time > checkpoint_interval:
                    saver.save(session,
                               os.path.join(opts.save_path, "model.ckpt"),
                               global_step=i)
                    last_checkpoint_time = now

                losses[i] = train_loss

            elapsed_time = time.time() - start_time
            print('Elapsed Time: %f seconds' % elapsed_time)

            # Cross validation
            pred_beta = qbeta.eval()
            pred_gamma = qgamma.eval()
            mse, mrc = cross_validation(test_metadata.values,
                                        pred_beta @ basis.T,
                                        pred_gamma @ basis.T, y_test)
            print("MSE: %f, MRC: %f" % (mse, mrc))
Example #15
0
# Join the rho and alpha embedding arrays with np.hstack; the result has
# L rows and K columns.
emb = np.hstack((rho, alpha))
L, K = emb.shape

### Parameters
# One sigmoid-squashed weight per embedding row. It is created here but, as
# the message below announces, it is not referenced by the code visible in
# this section.
relevance = tf.nn.sigmoid(tf.Variable(np.random.randn(L).astype('float32')))
print('NOT USING RELEVANCE')

# Weights of a K -> H0 -> 1 network, each drawn uniformly from
# [-trunc, trunc]; the bound is sqrt(6)/sqrt(K + H0) for the first layer and
# sqrt(6)/sqrt(H0) for the second.
trunc = np.sqrt(6)/np.sqrt(K + H0)
w_1 = tf.Variable(np.random.uniform( -trunc, trunc, [K, H0]).astype('float32'))
trunc = np.sqrt(6)/np.sqrt(H0)
w_2 = tf.Variable(np.random.uniform( -trunc, trunc, [H0, 1]).astype('float32'))

### prior on w
# Zero-mean Normal prior with scale `lam`; log_prior is the summed
# log-density of both weight matrices under it.
prior = Normal(loc = 0.0, scale = lam)
log_prior = tf.reduce_sum(prior.log_prob(w_1)) + tf.reduce_sum(prior.log_prob(w_2))

### placeholders for data minibatches

def extract_features(text):
    """Average the embedding rows selected by *text*.

    *text* is a sequence of row indices into the module-level ``emb``
    matrix. The result is the column-wise mean of those rows, or a zero
    vector of length ``K`` when *text* is empty.
    """
    if len(text) > 0:
        return np.mean(emb[text], axis=0)
    # Nothing to average: fall back to an all-zero feature vector.
    return np.zeros(K)

def next_batch(file_list):
    """Build one minibatch of features from randomly chosen files.

    Up to ``mb`` distinct entries of *file_list* are picked through a random
    permutation, each file is loaded with ``np.load`` and turned into a
    feature vector by ``extract_features``. Returns an ``(mb, K)`` array;
    rows beyond ``len(file_list)`` stay zero when fewer than ``mb`` files
    are available.
    """
    chosen = np.random.permutation(len(file_list))[:mb]
    batch = np.zeros((mb, K))
    for row, file_idx in enumerate(chosen):
        tokens = np.load(file_list[file_idx])
        batch[row] = extract_features(tokens)
    return batch
Example #16
0
    def __init__(self, args, d, logdir):
        """Build the graph of the amortized Bernoulli embedding model.

        After delegating to the base-class initializer, this creates the
        shared ``alpha`` / ``rho`` embeddings and the ``phi`` parameters that
        ``neural_network`` uses to produce a per-state version of ``rho``,
        then assembles the log prior, the per-state assign ops, the per-state
        positive / negative Bernoulli log-likelihoods and ``self.loss``, and
        finally calls ``self.init_eval_model()``.

        Parameters
        ----------
        args, logdir
            Forwarded unchanged to the base-class initializer.
        d
            Data object; also forwarded to the base class. This method reads
            ``d.states``, ``d.T`` and ``d.unigram`` from it.
        """
        super(amortized_bern_emb_model, self).__init__(args, d, logdir)
        # Half of the context window. Floor division keeps it an integer no
        # matter how `/` behaves, which tf.range / tf.tile / tf.gather need
        # for index and shape arguments.
        half_cs = self.cs // 2
        with tf.name_scope('model'):

            with tf.name_scope('embeddings'):
                self.alpha = tf.Variable(self.alpha_init,
                                         name='alpha',
                                         trainable=self.alpha_trainable)
                self.rho = tf.Variable(self.rho_init,
                                       name='rho',
                                       trainable=self.rho_trainable)

                trunc = np.sqrt(6) / np.sqrt(self.K + self.H0)
                phi_init = np.random.uniform(
                    -trunc, trunc,
                    [self.n_states, 2 * self.K * self.H0]).astype('float32')
                self.phi = tf.Variable(phi_init, name='phi')

                # Non-trainable holders that the assign ops below fill with
                # the network output for each state.
                self.geo_rho = {}
                for t, state in enumerate(d.states):
                    self.geo_rho[state] = tf.Variable(tf.random_normal(
                        self.rho_init.shape),
                                                      trainable=False,
                                                      name=state + '_rho')

                with tf.name_scope('priors'):
                    prior = Normal(loc=0.0, scale=self.sig)
                    # Each term is reduced to a scalar before being added.
                    # Summing the alpha / phi scalars *inside* the reduction
                    # over rho would broadcast them onto every element of
                    # rho and count them once per element.
                    self.log_prior = tf.reduce_sum(
                        prior.log_prob(self.rho)) + tf.reduce_sum(
                            prior.log_prob(self.phi))
                    if self.alpha_trainable:
                        self.log_prior += tf.reduce_sum(
                            prior.log_prob(self.alpha))

                    # Tighter prior (scale sig / 100) on the difference
                    # between the shared rho and each state's network output.
                    local_prior = Normal(loc=0.0, scale=self.sig / 100.0)
                    for t, state in enumerate(d.states):
                        self.log_prior += tf.reduce_sum(
                            local_prior.log_prob(
                                self.rho -
                                neural_network(self.rho, self.phi, self.K, t,
                                               self.H0, self.resnet)))

                self.assign_ops = d.T * [0]
                for t, state in enumerate(d.states):
                    self.assign_ops[t] = self.geo_rho[state].assign(
                        neural_network(self.rho, self.phi, self.K, t, self.H0,
                                       self.resnet))

            with tf.name_scope('likelihood'):
                self.placeholders = {}
                self.y_pos = {}
                self.y_neg = {}
                self.ll_pos = 0.0
                self.ll_neg = 0.0
                for t, state in enumerate(self.states):
                    # Index Masks
                    # Targets sit half a window in from the start of the
                    # placeholder; each target's context is the half_cs
                    # positions before it and the half_cs positions after it.
                    p_mask = tf.range(half_cs, self.n_minibatch[t] + half_cs)
                    rows = tf.tile(
                        tf.expand_dims(tf.range(0, half_cs), [0]),
                        [self.n_minibatch[t], 1])
                    columns = tf.tile(
                        tf.expand_dims(tf.range(0, self.n_minibatch[t]), [1]),
                        [1, half_cs])

                    ctx_mask = tf.concat(
                        [rows + columns, rows + columns + half_cs + 1], 1)

                    # Data Placeholder
                    self.placeholders[state] = tf.placeholder(
                        tf.int32, shape=(self.n_minibatch[t] + self.cs))

                    # Target and Context Indices
                    p_idx = tf.gather(self.placeholders[state], p_mask)
                    ctx_idx = tf.squeeze(
                        tf.gather(self.placeholders[state], ctx_mask))

                    # Negative samples
                    unigram_logits = tf.tile(
                        tf.expand_dims(tf.log(tf.constant(d.unigram)), [0]),
                        [self.n_minibatch[t], 1])
                    n_idx = tf.multinomial(unigram_logits, self.ns)

                    # Context vectors
                    ctx_alphas = tf.gather(self.alpha, ctx_idx)

                    rho_state = neural_network(self.rho, self.phi, self.K, t,
                                               self.H0, self.resnet)
                    p_rho = tf.squeeze(tf.gather(rho_state, p_idx))
                    n_rho = tf.gather(rho_state, n_idx)

                    # Natural parameter
                    ctx_sum = tf.reduce_sum(ctx_alphas, [1])
                    p_eta = tf.expand_dims(
                        tf.reduce_sum(tf.multiply(p_rho, ctx_sum), -1), 1)
                    n_eta = tf.reduce_sum(
                        tf.multiply(
                            n_rho,
                            tf.tile(tf.expand_dims(ctx_sum, 1),
                                    [1, self.ns, 1])), -1)

                    # Conditional likelihood
                    self.y_pos[state] = Bernoulli(logits=p_eta)
                    self.y_neg[state] = Bernoulli(logits=n_eta)

                    self.ll_pos += tf.reduce_sum(
                        self.y_pos[state].log_prob(1.0))
                    self.ll_neg += tf.reduce_sum(
                        self.y_neg[state].log_prob(0.0))

            # Negative of (likelihood scaled by n_epochs, plus log prior).
            self.loss = -(self.n_epochs *
                          (self.ll_pos + self.ll_neg) + self.log_prior)
        self.init_eval_model()
Example #17
0
    def __call__(self, session, trainX, trainY, testX, testY):
        """ Initialize the actual graph

        Builds a multinomial regression with a Normal prior on the
        coefficients, a hold-out mean-absolute-error metric, the training op
        and the summaries, then runs the global variable initializer.

        Parameters
        ----------
        session : tf.Session
            Tensorflow session
        trainX : np.array
            Input training design matrix.
        trainY : np.array
            Output training OTU table, where rows are samples and columns are
            observations.
        testX : np.array
            Input testing design matrix.
        testY : np.array
            Output testing OTU table, where rows are samples and columns are
            observations.

        Notes
        -----
        Sets ``self.session``, ``self.N``, ``self.p``, ``self.D``, the data
        constants, ``self.qbeta``, ``self.cv``, ``self.loss``,
        ``self.gradients``, ``self.train``, ``self.merged`` and
        ``self.writer`` (``None`` when ``self.save_path`` is ``None``).
        The initializer is run without an explicit session, so it presumably
        relies on `session` being the default session - confirm at the call
        site.
        """
        self.session = session
        self.N, self.p = trainX.shape
        self.D = trainY.shape[1]
        holdout_size = testX.shape[0]

        # The full data sets are embedded in the graph as constants; despite
        # the `_ph` suffix these are not placeholders.
        self.X_ph = tf.constant(trainX, dtype=tf.float32, name='G_ph')
        self.Y_ph = tf.constant(trainY, dtype=tf.float32, name='Y_ph')
        self.X_holdout = tf.constant(testX, dtype=tf.float32, name='G_holdout')
        self.Y_holdout = tf.constant(testY, dtype=tf.float32, name='Y_holdout')

        # Equal logits: each draw picks a training row uniformly at random
        # (with replacement) every time the graph is run.
        batch_ids = tf.multinomial(tf.ones([1, self.N]), self.batch_size)
        sample_ids = tf.squeeze(batch_ids)

        Y_batch = tf.gather(self.Y_ph, sample_ids, axis=0)
        X_batch = tf.gather(self.X_ph, sample_ids, axis=0)

        total_count = tf.reduce_sum(Y_batch, axis=1)
        holdout_count = tf.reduce_sum(self.Y_holdout, axis=1)

        # Define PointMass Variables first
        self.qbeta = tf.Variable(tf.random_normal([self.p, self.D - 1]),
                                 name='qB')

        # regression coefficents distribution
        beta = Normal(loc=tf.zeros([self.p, self.D - 1]) + self.beta_mean,
                      scale=tf.ones([self.p, self.D - 1]) * self.beta_scale,
                      name='B')

        eta = tf.matmul(X_batch, self.qbeta, name='eta')

        # A zero column is prepended so the first observation acts as the
        # reference category with its logit fixed at 0.
        phi = tf.nn.log_softmax(tf.concat(
            [tf.zeros([self.batch_size, 1]), eta], axis=1),
                                name='phi')

        Y = Multinomial(total_count=total_count, logits=phi, name='Y')

        # cross validation
        with tf.name_scope('accuracy'):
            pred = tf.reshape(holdout_count, [-1, 1]) * tf.nn.softmax(
                tf.concat([
                    tf.zeros([holdout_size, 1]),
                    tf.matmul(self.X_holdout, self.qbeta)
                ],
                          axis=1),
                name='phi')

            self.cv = tf.reduce_mean(tf.squeeze(tf.abs(pred - self.Y_holdout)))
            tf.summary.scalar('mean_absolute_error', self.cv)

        # Negative log joint; the minibatch likelihood is scaled by
        # N / batch_size.
        self.loss = -(tf.reduce_sum(beta.log_prob(self.qbeta)) +
                      tf.reduce_sum(Y.log_prob(Y_batch)) *
                      (self.N / self.batch_size))

        optimizer = tf.train.AdamOptimizer(self.learning_rate,
                                           beta1=self.beta_1,
                                           beta2=self.beta_2)

        gradients, variables = zip(*optimizer.compute_gradients(self.loss))
        self.gradients, _ = tf.clip_by_global_norm(gradients, self.clipnorm)
        # Apply the *clipped* gradients. Passing the raw `gradients` here
        # would make the clipping above a no-op for training.
        self.train = optimizer.apply_gradients(zip(self.gradients, variables))

        tf.summary.scalar('loss', self.loss)
        tf.summary.histogram('qbeta', self.qbeta)
        self.merged = tf.summary.merge_all()
        if self.save_path is not None:
            self.writer = tf.summary.FileWriter(self.save_path,
                                                self.session.graph)
        else:
            self.writer = None
        tf.global_variables_initializer().run()
def F(x):
    """Evaluate the quadratic ``x**2 - 2*x + 1`` at *x* (scalar or array)."""
    squared = x ** 2
    doubled = 2 * x
    return squared - doubled + 1


def get_fitness(value):
    """Turn an objective value into a fitness score by negating it."""
    fitness = -value
    return fitness


# Search distribution: a Normal whose location and scale are both TensorFlow
# variables (initialised to -30 and 1), so the optimizer below can move them.
mean = tf.Variable(tf.constant(-30.), dtype=tf.float32)
sigma = tf.Variable(tf.constant(1.), dtype=tf.float32)
N_dist = Normal(loc=mean, scale=sigma)
# Op that draws POP_SIZE samples from the current distribution.
make_kids = N_dist.sample([POP_SIZE])

# Placeholders for a population and the fitness of each member.
# NOTE(review): make_kids is sampled with shape [POP_SIZE] while tfkids is
# declared as [POP_SIZE, DNA_SIZE]; the feed happens outside the code visible
# here - confirm the shapes agree.
tfkids = tf.placeholder(tf.float32, [POP_SIZE, DNA_SIZE])
tfkids_fit = tf.placeholder(tf.float32, [POP_SIZE])
# Negated mean of log-density times fitness: minimising it raises the
# density of members with positive fitness.
loss = -tf.reduce_mean(N_dist.log_prob(tfkids) * tfkids_fit)
train_op = tf.train.GradientDescentOptimizer(LR).minimize(loss)

# Plot the objective F on [-70, 70] as a backdrop.
x = np.linspace(-70, 70, 100)
plt.plot(x, F(x))
plt.xlim(-70, 70)
plt.ylim(-100, 1000)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# Interactive plotting on; each generation samples a population and scores it
# with the negated objective.
plt.ion()
for g in range(N_GENERATION):
    kids = sess.run(make_kids)
    kids_fit = get_fitness(F(kids))
Example #19
0
    def __init__(self, args, d, logdir):
        """Build the graph of the fully separate Bernoulli embedding model.

        After delegating to the base-class initializer, every state gets its
        own ``alpha`` and ``rho`` variables (the shared initial values plus a
        small random perturbation), a zero-mean Normal prior is placed on all
        of them, and the per-state positive / negative Bernoulli
        log-likelihoods are accumulated into ``self.loss``. Finishes by
        calling ``self.init_eval_model()``.

        Parameters
        ----------
        args, logdir
            Forwarded unchanged to the base-class initializer.
        d
            Data object; also forwarded to the base class. This method reads
            ``d.states``, ``d.L`` and ``d.unigram`` from it.
        """
        super(completely_separate_bern_emb_model, self).__init__(args, d, logdir)
        # Half of the context window. Floor division keeps it an integer no
        # matter how `/` behaves, which tf.range / tf.tile / tf.gather need
        # for index and shape arguments.
        half_cs = self.cs // 2

        with tf.name_scope('model'):
            with tf.name_scope('embeddings'):
                self.geo_alpha = {}
                self.geo_rho = {}
                for t, state in enumerate(d.states):
                    self.geo_alpha[state] = tf.Variable(
                        self.alpha_init
                        + 0.001 * tf.random_normal([d.L, self.K]) / self.K,
                        name=state + '_alpha')
                    self.geo_rho[state] = tf.Variable(
                        self.rho_init
                        + 0.001 * tf.random_normal([d.L, self.K]) / self.K,
                        name=state + '_rho')

                with tf.name_scope('priors'):
                    prior = Normal(loc=0.0, scale=self.sig)
                    self.log_prior = 0.0
                    for state in d.states:
                        self.log_prior += tf.reduce_sum(
                            prior.log_prob(self.geo_rho[state]))
                        self.log_prior += tf.reduce_sum(
                            prior.log_prob(self.geo_alpha[state]))

            with tf.name_scope('likelihood'):
                self.placeholders = {}
                self.y_pos = {}
                self.y_neg = {}
                self.ll_pos = 0.0
                self.ll_neg = 0.0
                for t, state in enumerate(self.states):
                    # Index Masks
                    # Targets sit half a window in from the start of the
                    # placeholder; each target's context is the half_cs
                    # positions before it and the half_cs positions after it.
                    p_mask = tf.range(half_cs, self.n_minibatch[t] + half_cs)
                    rows = tf.tile(
                        tf.expand_dims(tf.range(0, half_cs), [0]),
                        [self.n_minibatch[t], 1])
                    columns = tf.tile(
                        tf.expand_dims(tf.range(0, self.n_minibatch[t]), [1]),
                        [1, half_cs])

                    ctx_mask = tf.concat(
                        [rows + columns, rows + columns + half_cs + 1], 1)

                    # Data Placeholder
                    self.placeholders[state] = tf.placeholder(
                        tf.int32, shape=(self.n_minibatch[t] + self.cs))

                    # Target and Context Indices
                    p_idx = tf.gather(self.placeholders[state], p_mask)
                    ctx_idx = tf.squeeze(
                        tf.gather(self.placeholders[state], ctx_mask))

                    # Negative samples
                    unigram_logits = tf.tile(
                        tf.expand_dims(tf.log(tf.constant(d.unigram)), [0]),
                        [self.n_minibatch[t], 1])
                    n_idx = tf.multinomial(unigram_logits, self.ns)

                    # Context vectors
                    ctx_alphas = tf.gather(self.geo_alpha[state], ctx_idx)

                    p_rho = tf.squeeze(tf.gather(self.geo_rho[state], p_idx))
                    n_rho = tf.gather(self.geo_rho[state], n_idx)

                    # Natural parameter
                    ctx_sum = tf.reduce_sum(ctx_alphas, [1])
                    p_eta = tf.expand_dims(
                        tf.reduce_sum(tf.multiply(p_rho, ctx_sum), -1), 1)
                    n_eta = tf.reduce_sum(
                        tf.multiply(
                            n_rho,
                            tf.tile(tf.expand_dims(ctx_sum, 1),
                                    [1, self.ns, 1])), -1)

                    # Conditional likelihood
                    self.y_pos[state] = Bernoulli(logits=p_eta)
                    self.y_neg[state] = Bernoulli(logits=n_eta)

                    self.ll_pos += tf.reduce_sum(
                        self.y_pos[state].log_prob(1.0))
                    self.ll_neg += tf.reduce_sum(
                        self.y_neg[state].log_prob(0.0))

            # Negative of (likelihood scaled by n_epochs, plus log prior).
            self.loss = - (self.n_epochs * (self.ll_pos + self.ll_neg) + self.log_prior)
        self.init_eval_model()
Example #20
0
    # Mean squared error between the targets y and the predicted mean y_mu.
    RSEcost = tf.reduce_mean(
        tf.square(y - y_mu))  # use square error for cost function

    #    #negative log-likelihood (same as maximum-likelihood)
    #    y_sigma  = tf.sqrt(tfmixedmodel(Xtf, tf.square(std_encoder1), Ztf, tf.square(std_encoder2)))
    #    NLLcost  = - tf.reduce_sum(-0.5 * tf.log(2. * np.pi) - tf.log(y_sigma)
    #                               -0.5 * tf.square((y - y_mu)/y_sigma))

    # Mean-field variational inference: a Monte Carlo estimate built from
    # n_samples draws, keeping one log p and one log q total per draw.
    p_log_prob = [0.0] * n_samples
    q_log_prob = [0.0] * n_samples
    for s in range(n_samples):
        # Sample beta and b from Normals parameterised by the variational
        # means and scales, and accumulate their log-density in q.
        # NOTE(review): the log-density is evaluated under beta_tf / b_tf,
        # which are defined outside this excerpt, rather than under the
        # *_copy distributions built here - confirm they are the same.
        beta_tf_copy = Normal(loc=beta_mu, scale=std_encoder1)
        beta_sample = beta_tf_copy.sample()
        q_log_prob[s] += tf.reduce_sum(beta_tf.log_prob(beta_sample))
        b_tf_copy = Normal(loc=b_mu, scale=std_encoder2)
        b_sample = b_tf_copy.sample()
        q_log_prob[s] += tf.reduce_sum(b_tf.log_prob(b_sample))

        # log p: the same Normal prior on both sampled parameter sets, plus
        # a Normal likelihood of y centred on the sampled linear predictor.
        priormodel = Normal(loc=priormu, scale=priorsigma)
        y_sample = tf.matmul(Xtf, beta_sample) + tf.matmul(Ztf, b_sample)
        p_log_prob[s] += tf.reduce_sum(priormodel.log_prob(beta_sample))
        p_log_prob[s] += tf.reduce_sum(priormodel.log_prob(b_sample))
        modelcopy = Normal(loc=y_sample, scale=priorliksigma)
        p_log_prob[s] += tf.reduce_sum(modelcopy.log_prob(y))

    p_log_prob = tf.stack(p_log_prob)
    q_log_prob = tf.stack(q_log_prob)
    # Note the sign: despite its name this is the *negated* average of
    # (log p - log q) over the draws.
    ELBO = -tf.reduce_mean(p_log_prob - q_log_prob)
Example #21
0
def main(_):
    """Fit a multinomial regression to a count table and score a hold-out set.

    The options are read from the module-level ``FLAGS``.  The training and
    hold-out tables are filtered and aligned with their metadata, a
    TensorFlow graph with point-estimate parameters (``qgamma``, ``qbeta``)
    under Normal priors is built, the negative log joint is minimised with
    gradient-clipped Adam, and the cross-validation scores are printed.

    Parameters
    ----------
    _ : object
        Ignored by this function.
    """
    opts = Options(save_path=FLAGS.save_path,
                   train_biom=FLAGS.train_biom,
                   test_biom=FLAGS.test_biom,
                   train_metadata=FLAGS.train_metadata,
                   test_metadata=FLAGS.test_metadata,
                   formula=FLAGS.formula,
                   learning_rate=FLAGS.learning_rate,
                   clipping_size=FLAGS.clipping_size,
                   beta_mean=FLAGS.beta_mean,
                   beta_scale=FLAGS.beta_scale,
                   gamma_mean=FLAGS.gamma_mean,
                   gamma_scale=FLAGS.gamma_scale,
                   epochs_to_train=FLAGS.epochs_to_train,
                   num_neg_samples=FLAGS.num_neg_samples,
                   batch_size=FLAGS.batch_size,
                   min_sample_count=FLAGS.min_sample_count,
                   min_feature_count=FLAGS.min_feature_count,
                   statistics_interval=FLAGS.statistics_interval,
                   summary_interval=FLAGS.summary_interval,
                   checkpoint_interval=FLAGS.checkpoint_interval)
    # preprocessing
    train_table, train_metadata = opts.train_table, opts.train_metadata
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]

    # The filters close over train_metadata / opts, so they always see the
    # current binding at call time.
    sample_filter = lambda val, id_, md: (
        (id_ in train_metadata.index) and np.sum(val) > opts.min_sample_count)
    read_filter = lambda val, id_, md: np.sum(val) > opts.min_feature_count
    metadata_filter = lambda val, id_, md: id_ in train_metadata.index

    train_table = train_table.filter(metadata_filter, axis='sample')
    train_table = train_table.filter(sample_filter, axis='sample')
    train_table = train_table.filter(read_filter, axis='observation')
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]
    sort_f = lambda xs: [xs[train_metadata.index.get_loc(x)] for x in xs]
    train_table = train_table.sort(sort_f=sort_f, axis='sample')
    # From here on train_metadata is the design matrix built from the formula.
    train_metadata = dmatrix(opts.formula,
                             train_metadata,
                             return_type='dataframe')

    # hold out data preprocessing
    test_table, test_metadata = opts.test_table, opts.test_metadata
    metadata_filter = lambda val, id_, md: id_ in test_metadata.index
    # Keep only the observations that survived the training filters.
    obs_lookup = set(train_table.ids(axis='observation'))
    feat_filter = lambda val, id_, md: id_ in obs_lookup

    test_table = test_table.filter(metadata_filter, axis='sample')
    test_table = test_table.filter(feat_filter, axis='observation')
    test_metadata = test_metadata.loc[test_table.ids(axis='sample')]
    sort_f = lambda xs: [xs[test_metadata.index.get_loc(x)] for x in xs]
    test_table = test_table.sort(sort_f=sort_f, axis='sample')
    test_metadata = dmatrix(opts.formula,
                            test_metadata,
                            return_type='dataframe')

    p = train_metadata.shape[1]  # number of covariates
    G_data = train_metadata.values
    y_data = np.array(train_table.matrix_data.todense()).T
    y_test = np.array(test_table.matrix_data.todense()).T
    N, D = y_data.shape
    save_path = opts.save_path
    learning_rate = opts.learning_rate
    batch_size = opts.batch_size
    gamma_mean, gamma_scale = opts.gamma_mean, opts.gamma_scale
    beta_mean, beta_scale = opts.beta_mean, opts.beta_scale
    num_iter = (N // batch_size) * opts.epochs_to_train
    holdout_size = test_metadata.shape[0]
    checkpoint_interval = opts.checkpoint_interval

    # Model code
    with tf.Graph().as_default(), tf.Session() as session:
        with tf.device("/cpu:0"):
            # Place holder variables to accept input data
            G_ph = tf.placeholder(tf.float32, [batch_size, p], name='G_ph')
            Y_ph = tf.placeholder(tf.float32, [batch_size, D], name='Y_ph')
            G_holdout = tf.placeholder(tf.float32, [holdout_size, p],
                                       name='G_holdout')
            Y_holdout = tf.placeholder(tf.float32, [holdout_size, D],
                                       name='Y_holdout')
            total_count = tf.placeholder(tf.float32, [batch_size],
                                         name='total_count')

            # Define PointMass Variables first
            qgamma = tf.Variable(tf.random_normal([1, D]), name='qgamma')
            qbeta = tf.Variable(tf.random_normal([p, D]), name='qB')

            # Distributions
            # species bias
            gamma = Normal(loc=tf.zeros([1, D]) + gamma_mean,
                           scale=tf.ones([1, D]) * gamma_scale,
                           name='gamma')
            # regression coefficents distribution
            beta = Normal(loc=tf.zeros([p, D]) + beta_mean,
                          scale=tf.ones([p, D]) * beta_scale,
                          name='B')

            Bprime = tf.concat([qgamma, qbeta], axis=0)

            # add bias terms for samples
            Gprime = tf.concat([tf.ones([batch_size, 1]), G_ph], axis=1)

            eta = tf.matmul(Gprime, Bprime)
            phi = tf.nn.log_softmax(eta)
            Y = Multinomial(total_count=total_count, logits=phi, name='Y')

            # Negative log joint: priors on the point estimates plus the batch
            # likelihood rescaled by N / batch_size.
            loss = -(tf.reduce_mean(gamma.log_prob(qgamma)) + \
                     tf.reduce_mean(beta.log_prob(qbeta)) + \
                     tf.reduce_mean(Y.log_prob(Y_ph)) * (N / batch_size))
            loss = tf.Print(loss, [loss])
            optimizer = tf.train.AdamOptimizer(learning_rate)

            gradients, variables = zip(*optimizer.compute_gradients(loss))
            gradients, _ = tf.clip_by_global_norm(gradients,
                                                  opts.clipping_size)
            train = optimizer.apply_gradients(zip(gradients, variables))

            with tf.name_scope('accuracy'):
                holdout_count = tf.reduce_sum(Y_holdout, axis=1)
                pred = tf.reshape(holdout_count, [-1, 1]) * tf.nn.softmax(
                    tf.matmul(G_holdout, qbeta) + qgamma)
                # Mean absolute error on the hold-out counts (the variable is
                # called mse, the summary tag says what it really is).
                mse = tf.reduce_mean(tf.squeeze(tf.abs(pred - Y_holdout)))
                tf.summary.scalar('mean_absolute_error', mse)

            tf.summary.scalar('loss', loss)
            tf.summary.histogram('qbeta', qbeta)
            tf.summary.histogram('qgamma', qgamma)
            merged = tf.summary.merge_all()

            tf.global_variables_initializer().run()

            writer = tf.summary.FileWriter(save_path, session.graph)

            losses = np.array([0.] * num_iter)
            idx = np.arange(train_metadata.shape[0])
            # run.log is still created (and truncated) as before; nothing in
            # this function writes to it.
            log_handle = open(os.path.join(save_path, 'run.log'), 'w')

            # The hold-out feeds never change, so cast them once.
            holdout_counts = y_test.astype(np.float32)
            holdout_design = test_metadata.values.astype(np.float32)
            fetches = [train, merged, loss, gradients]

            last_checkpoint_time = 0
            start_time = time.time()
            saver = tf.train.Saver()
            try:
                for i in range(num_iter):
                    # Mini-batch drawn with replacement.
                    batch_idx = np.random.choice(idx, size=batch_size)
                    batch_counts = y_data[batch_idx]
                    feed_dict = {
                        Y_ph: batch_counts.astype(np.float32),
                        G_ph: G_data[batch_idx].astype(np.float32),
                        Y_holdout: holdout_counts,
                        G_holdout: holdout_design,
                        total_count:
                        batch_counts.sum(axis=1).astype(np.float32)
                    }

                    # The former `elif i % 5000 == 0` branch could never run
                    # (every multiple of 5000 is a multiple of 1000) and
                    # unpacked its fetches in the wrong order, so it is gone.
                    if i % 1000 == 0:
                        # Record a full execution trace every 1000 steps.
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        _, summary, train_loss, grads = session.run(
                            fetches,
                            feed_dict=feed_dict,
                            options=run_options,
                            run_metadata=run_metadata)
                        writer.add_run_metadata(run_metadata, 'step%d' % i)
                    else:
                        _, summary, train_loss, grads = session.run(
                            fetches, feed_dict=feed_dict)
                    writer.add_summary(summary, i)

                    now = time.time()
                    if now - last_checkpoint_time > checkpoint_interval:
                        saver.save(session,
                                   os.path.join(opts.save_path, "model.ckpt"),
                                   global_step=i)
                        last_checkpoint_time = now

                    losses[i] = train_loss
                elapsed_time = time.time() - start_time
            finally:
                # Release the log file and flush pending summaries even when
                # training is interrupted.
                log_handle.close()
                writer.close()
            print('Elapsed Time: %f seconds' % elapsed_time)

            # Cross validation
            pred_beta = qbeta.eval()
            pred_gamma = qgamma.eval()
            mse, mrc = cross_validation(test_metadata.values, pred_beta,
                                        pred_gamma, y_test)
            print("MSE: %f, MRC: %f" % (mse, mrc))
Example #22
0
    def __call__(self, session, trainX, trainY, testX, testY):
        """ Initialize the actual graph

        Builds the sampling ops, the model parameters and their Normal
        priors, the loss (``self.log_loss``), the hold-out error
        (``self.cv``), the summaries and the training op (``self.train``),
        then runs the global variable initializer.

        Parameters
        ----------
        session : tf.Session
            Tensorflow session
        trainX : sparse array in coo format
            Training input OTU table, where rows are samples and columns are
            observations
        trainY : np.array
            Training output metabolite table
        testX : sparse array in coo format
            Test input OTU table, where rows are samples and columns are
            observations.  This is mainly for cross validation.
        testY : np.array
            Test output metabolite table.  This is mainly for cross validation.
        """
        self.session = session
        # Number of stored entries in the training table.
        self.nnz = len(trainX.data)
        self.d1 = trainX.shape[1]
        self.d2 = trainY.shape[1]
        # One cross-validation draw per stored entry of the test table.
        self.cv_size = len(testX.data)

        # keep the multinomial sampling on the cpu
        # https://github.com/tensorflow/tensorflow/issues/18058
        with tf.device('/cpu:0'):
            X_ph = tf.SparseTensor(indices=np.array([trainX.row,
                                                     trainX.col]).T,
                                   values=trainX.data,
                                   dense_shape=trainX.shape)
            Y_ph = tf.constant(trainY, dtype=tf.float32)

            X_holdout = tf.SparseTensor(indices=np.array(
                [testX.row, testX.col]).T,
                                        values=testX.data,
                                        dense_shape=testX.shape)
            Y_holdout = tf.constant(testY, dtype=tf.float32)

            total_count = tf.reduce_sum(Y_ph, axis=1)
            # Draw batch_size positions among the stored entries of X, using
            # the log of the stored values as sampling logits.
            batch_ids = tf.multinomial(
                tf.log(tf.reshape(X_ph.values, [1, -1])), self.batch_size)
            batch_ids = tf.squeeze(batch_ids)
            # Column 0 of the indices is the row (sample) of each entry,
            # column 1 is its column (observation).
            X_samples = tf.gather(X_ph.indices, 0, axis=1)
            X_obs = tf.gather(X_ph.indices, 1, axis=1)
            sample_ids = tf.gather(X_samples, batch_ids)

            Y_batch = tf.gather(Y_ph, sample_ids)
            X_batch = tf.gather(X_obs, batch_ids)

        with tf.device(self.device_name):
            self.qUmain = tf.Variable(tf.random_normal([self.d1, self.p]),
                                      name='qU')
            self.qUbias = tf.Variable(tf.random_normal([self.d1, 1]),
                                      name='qUbias')
            self.qVmain = tf.Variable(tf.random_normal([self.p, self.d2 - 1]),
                                      name='qV')
            self.qVbias = tf.Variable(tf.random_normal([1, self.d2 - 1]),
                                      name='qVbias')

            # qU is [d1, p + 2] and qV is [p + 2, d2 - 1]; the ones column /
            # row are placed so that a row of qU times qV equals
            # qVbias + qUbias + qUmain . qVmain.
            qU = tf.concat([tf.ones([self.d1, 1]), self.qUbias, self.qUmain],
                           axis=1)
            qV = tf.concat(
                [self.qVbias,
                 tf.ones([1, self.d2 - 1]), self.qVmain], axis=0)

            # regression coefficents distribution
            Umain = Normal(loc=tf.zeros([self.d1, self.p]) + self.u_mean,
                           scale=tf.ones([self.d1, self.p]) * self.u_scale,
                           name='U')
            Ubias = Normal(loc=tf.zeros([self.d1, 1]) + self.u_mean,
                           scale=tf.ones([self.d1, 1]) * self.u_scale,
                           name='biasU')

            Vmain = Normal(loc=tf.zeros([self.p, self.d2 - 1]) + self.v_mean,
                           scale=tf.ones([self.p, self.d2 - 1]) * self.v_scale,
                           name='V')
            Vbias = Normal(loc=tf.zeros([1, self.d2 - 1]) + self.v_mean,
                           scale=tf.ones([1, self.d2 - 1]) * self.v_scale,
                           name='biasV')

            # Rows of qU for the sampled observations.
            du = tf.gather(qU, X_batch, axis=0, name='du')
            # A zero column is prepended so the logits have d2 entries.
            dv = tf.concat([tf.zeros([self.batch_size, 1]), du @ qV],
                           axis=1,
                           name='dv')

            tc = tf.gather(total_count, sample_ids)
            Y = Multinomial(total_count=tc, logits=dv, name='Y')
            num_samples = trainX.shape[0]
            # Factor applied to the batch log-likelihood in the loss below.
            norm = num_samples / self.batch_size
            logprob_vmain = tf.reduce_sum(Vmain.log_prob(self.qVmain),
                                          name='logprob_vmain')
            logprob_vbias = tf.reduce_sum(Vbias.log_prob(self.qVbias),
                                          name='logprob_vbias')
            logprob_umain = tf.reduce_sum(Umain.log_prob(self.qUmain),
                                          name='logprob_umain')
            logprob_ubias = tf.reduce_sum(Ubias.log_prob(self.qUbias),
                                          name='logprob_ubias')
            logprob_y = tf.reduce_sum(Y.log_prob(Y_batch), name='logprob_y')
            # Negative of (scaled likelihood + the four prior terms).
            self.log_loss = -(logprob_y * norm + logprob_umain +
                              logprob_ubias + logprob_vmain + logprob_vbias)

        # keep the multinomial sampling on the cpu
        # https://github.com/tensorflow/tensorflow/issues/18058
        with tf.device('/cpu:0'):
            # cross validation
            with tf.name_scope('accuracy'):
                # Same sampling scheme as for training, applied to the
                # hold-out table with cv_size draws.
                cv_batch_ids = tf.multinomial(
                    tf.log(tf.reshape(X_holdout.values, [1, -1])),
                    self.cv_size)
                cv_batch_ids = tf.squeeze(cv_batch_ids)
                X_cv_samples = tf.gather(X_holdout.indices, 0, axis=1)
                X_cv = tf.gather(X_holdout.indices, 1, axis=1)
                cv_sample_ids = tf.gather(X_cv_samples, cv_batch_ids)

                Y_cvbatch = tf.gather(Y_holdout, cv_sample_ids)
                X_cvbatch = tf.gather(X_cv, cv_batch_ids)
                holdout_count = tf.reduce_sum(Y_cvbatch, axis=1)
                cv_du = tf.gather(qU, X_cvbatch, axis=0, name='cv_du')
                # Predicted counts: row totals times the softmax proportions.
                pred = tf.reshape(holdout_count, [-1, 1]) * tf.nn.softmax(
                    tf.concat([tf.zeros([self.cv_size, 1]), cv_du @ qV],
                              axis=1,
                              name='pred'))

                # Mean absolute difference between predicted and observed.
                self.cv = tf.reduce_mean(tf.squeeze(tf.abs(pred - Y_cvbatch)))

        # keep all summaries on the cpu
        with tf.device('/cpu:0'):
            tf.summary.scalar('logloss', self.log_loss)
            # NOTE(review): the tag says "rmse" but self.cv is a mean
            # absolute error (see above).
            tf.summary.scalar('cv_rmse', self.cv)
            tf.summary.histogram('qUmain', self.qUmain)
            tf.summary.histogram('qVmain', self.qVmain)
            tf.summary.histogram('qUbias', self.qUbias)
            tf.summary.histogram('qVbias', self.qVbias)
            self.merged = tf.summary.merge_all()

            self.writer = tf.summary.FileWriter(self.save_path,
                                                self.session.graph)

        with tf.device(self.device_name):
            with tf.name_scope('optimize'):
                optimizer = tf.train.AdamOptimizer(self.learning_rate,
                                                   beta1=self.beta_1,
                                                   beta2=self.beta_2)

                # Clip the gradients by their global norm before applying.
                gradients, self.variables = zip(
                    *optimizer.compute_gradients(self.log_loss))
                self.gradients, _ = tf.clip_by_global_norm(
                    gradients, self.clipnorm)
                self.train = optimizer.apply_gradients(
                    zip(self.gradients, self.variables))

        # NOTE(review): .run() without a session argument presumably relies
        # on `session` being the default session -- confirm with the caller.
        tf.global_variables_initializer().run()
Example #23
0
class AIRModel(object):
    """Recurrent attention model unrolled with an ``AIRCell``.

    The constructor builds the inference/generation graph; ``train_step``
    adds the losses (reconstruction, optional priors, optional REINFORCE
    term, optional L2) and returns the training ops.
    """

    def __init__(self,
                 obs,
                 nums,
                 max_steps,
                 glimpse_size,
                 n_appearance,
                 transition,
                 input_encoder,
                 glimpse_encoder,
                 glimpse_decoder,
                 transform_estimator,
                 steps_predictor,
                 output_std=1.,
                 discrete_steps=True,
                 step_bias=0.,
                 explore_eps=None,
                 debug=False):
        """Store the configuration and build the graph.

        :param obs: observation tensor; its static shape provides the batch
            size (first dimension) and the image size (remaining dimensions).
        :param nums: tensor summed over axis 0 to obtain the ground-truth
            number of steps per sample.
        :param max_steps: number of steps the cell is unrolled for.
        :param glimpse_size: shape of a single glimpse.
        :param n_appearance: forwarded to ``AIRCell``.
        :param transition, input_encoder, glimpse_encoder, glimpse_decoder,
            transform_estimator, steps_predictor: components forwarded to
            ``AIRCell``.
        :param output_std: scale of the Normal output distribution.
        :param discrete_steps: forwarded to ``AIRCell``.
        :param step_bias: stored on the instance; not used in this class.
        :param explore_eps: if not None, initial value of a non-trainable
            ``explore_eps`` variable forwarded to ``AIRCell``.
        :param debug: forwarded to ``AIRCell``.
        """

        self.obs = obs
        self.nums = nums
        self.max_steps = max_steps
        self.glimpse_size = glimpse_size

        self.n_appearance = n_appearance

        self.output_std = output_std
        self.discrete_steps = discrete_steps
        self.step_bias = step_bias
        self.explore_eps = explore_eps
        self.debug = debug

        with tf.variable_scope(self.__class__.__name__):
            shape = self.obs.get_shape().as_list()
            self.batch_size = shape[0]
            self.img_size = shape[1:]
            self._build(transition, input_encoder, glimpse_encoder,
                        glimpse_decoder, transform_estimator, steps_predictor)

    def _build(self, transition, input_encoder, glimpse_encoder,
               glimpse_decoder, transform_estimator, steps_predictor):
        """Unroll the cell for ``max_steps`` and expose its outputs.

        Every output named in ``self.cell.output_names`` becomes an attribute
        of the same name; the canvas/glimpse tensors are reshaped and the
        output and number-of-steps distributions are created.
        """
        if self.explore_eps is not None:
            self.explore_eps = tf.get_variable('explore_eps',
                                               initializer=self.explore_eps,
                                               trainable=False)

        self.cell = AIRCell(self.img_size,
                            self.glimpse_size,
                            self.n_appearance,
                            transition,
                            input_encoder,
                            glimpse_encoder,
                            glimpse_decoder,
                            transform_estimator,
                            steps_predictor,
                            canvas_init=None,
                            discrete_steps=self.discrete_steps,
                            explore_eps=self.explore_eps,
                            debug=self.debug)

        initial_state = self.cell.initial_state(self.obs)

        # The RNN input carries no information; it only fixes the number of
        # unrolled steps and the batch size.
        dummy_sequence = tf.zeros((self.max_steps, self.batch_size, 1),
                                  name='dummy_sequence')
        outputs, state = tf.nn.dynamic_rnn(self.cell,
                                           dummy_sequence,
                                           initial_state=initial_state,
                                           time_major=True)
        for name, output in zip(self.cell.output_names, outputs):
            setattr(self, name, output)
        # canvas, glimpse, what, what_loc, what_scale, where, where_loc, where_scale, presence_prob, presence = outputs

        self.glimpse = tf.reshape(self.presence * tf.nn.sigmoid(self.glimpse),
                                  (
                                      self.max_steps,
                                      self.batch_size,
                                  ) + tuple(self.glimpse_size))
        self.canvas = tf.reshape(self.canvas, (
            self.max_steps,
            self.batch_size,
        ) + tuple(self.img_size))
        # The canvas after the last step is the reconstruction.
        self.final_canvas = self.canvas[-1]

        self.output_distrib = Normal(self.final_canvas, self.output_std)

        posterior_step_probs = tf.transpose(tf.squeeze(self.presence_prob))
        self.num_steps_distrib = NumStepsDistribution(posterior_step_probs)

        self.num_step_per_sample = tf.to_float(
            tf.squeeze(tf.reduce_sum(self.presence, 0)))
        self.num_step = tf.reduce_mean(self.num_step_per_sample)
        self.gt_num_steps = tf.squeeze(tf.reduce_sum(self.nums, 0))

    def _prior_loss(self, appearance_prior, where_scale_prior,
                    where_shift_prior, num_steps_prior, global_step):
        """Build the KL terms against the configured priors.

        A prior passed as ``None`` is skipped.  Returns a ``Loss`` holding
        every term that was added.

        NOTE(review): the "where" KL is only built inside the
        ``appearance_prior`` branch, as in the original nesting -- confirm
        this coupling is intended.
        """

        with tf.variable_scope('prior_loss'):
            prior_loss = Loss()
            if num_steps_prior is not None:
                if num_steps_prior.anneal is not None:
                    with tf.variable_scope('num_steps_prior'):
                        nsp = num_steps_prior
                        val = tf.get_variable('value',
                                              initializer=num_steps_prior.init,
                                              dtype=tf.float32,
                                              trainable=False)

                        if num_steps_prior.anneal == 'exp':
                            decay_rate = (nsp.final /
                                          nsp.init)**(float(nsp.steps_div) /
                                                      nsp.steps)
                            val = tf.train.exponential_decay(
                                val, global_step, nsp.steps_div, decay_rate)

                        elif num_steps_prior.anneal == 'linear':
                            val = nsp.final + (nsp.init - nsp.final) * (
                                1. - tf.to_float(global_step) / nsp.steps)

                        # Never anneal below the final value.
                        num_steps_prior_value = tf.maximum(nsp.final, val)
                else:
                    num_steps_prior_value = num_steps_prior.init

                prior = geometric_prior(num_steps_prior_value, 3)
                steps_kl = tabular_kl(self.num_steps_distrib.prob(), prior)
                num_steps_prior_loss_per_sample = tf.squeeze(
                    tf.reduce_sum(steps_kl, 1))

                self.num_steps_prior_loss = tf.reduce_mean(
                    num_steps_prior_loss_per_sample)
                tf.summary.scalar('num_steps_prior', self.num_steps_prior_loss)
                prior_loss.add(self.num_steps_prior_loss,
                               num_steps_prior_loss_per_sample)

            if appearance_prior is not None:
                prior = Normal(appearance_prior.loc, appearance_prior.scale)
                posterior = Normal(self.what_loc, self.what_scale)

                what_kl = _kl(posterior, prior)
                # Only steps that are present contribute to the KL.
                what_kl = tf.reduce_sum(what_kl, -1,
                                        keep_dims=True) * self.presence
                appearance_prior_loss_per_sample = tf.squeeze(
                    tf.reduce_sum(what_kl, 0))

                #         n_samples_with_encoding = tf.reduce_sum(tf.to_float(tf.greater(num_step_per_sample, 0.)))
                #         div = tf.maximum(n_samples_with_encoding, 1.)
                #         appearance_prior_loss = tf.reduce_sum(latent_code_prior_loss_per_sample) / div
                self.appearance_prior_loss = tf.reduce_mean(
                    appearance_prior_loss_per_sample)
                tf.summary.scalar('latent_code_prior',
                                  self.appearance_prior_loss)
                prior_loss.add(self.appearance_prior_loss,
                               appearance_prior_loss_per_sample)

                # Like the other priors, None disables the "where" term
                # instead of failing on attribute access.
                if (where_scale_prior is not None
                        and where_shift_prior is not None):
                    usx, utx, usy, uty = tf.split(self.where_loc, 4, 2)
                    ssx, stx, ssy, sty = tf.split(self.where_scale, 4, 2)
                    us = tf.concat((usx, usy), -1)
                    ss = tf.concat((ssx, ssy), -1)

                    scale_distrib = Normal(us, ss)
                    scale_prior = Normal(where_scale_prior.loc,
                                         where_scale_prior.scale)
                    scale_kl = _kl(scale_distrib, scale_prior)

                    ut = tf.concat((utx, uty), -1)
                    st = tf.concat((stx, sty), -1)
                    shift_distrib = Normal(ut, st)

                    # Without an explicit mean the shift prior is centred on
                    # the posterior mean itself.
                    if 'loc' in where_shift_prior:
                        shift_mean = where_shift_prior.loc
                    else:
                        shift_mean = ut
                    shift_prior = Normal(shift_mean, where_shift_prior.scale)

                    shift_kl = _kl(shift_distrib, shift_prior)
                    where_kl = tf.reduce_sum(
                        scale_kl + shift_kl, -1,
                        keep_dims=True) * self.presence
                    where_kl_per_sample = tf.reduce_sum(
                        tf.squeeze(where_kl), 0)
                    self.where_kl = tf.reduce_mean(where_kl_per_sample)
                    tf.summary.scalar('where_prior', self.where_kl)
                    prior_loss.add(self.where_kl, where_kl_per_sample)

        return prior_loss

    def _reinforce(self, loss, make_opt, baseline=None):
        """Build the REINFORCE term and, when possible, the baseline update.

        :param loss: ``Loss`` whose per-sample value is the learning signal.
        :param make_opt: callable mapping a learning rate to an optimizer.
        :param baseline: ``None`` (fall back to ``self.baseline`` if set), a
            callable invoked as ``baseline(obs, what, where, presence_prob)``
            whose ``variable_scope`` locates its trainable variables, or an
            already-built baseline value.
        :return: ``(reinforce_loss, baseline_vars, baseline_train_step)``.
            The last two are ``[]`` and ``None`` when there is no baseline or
            when the baseline is not a callable with variables to train.
        """
        if baseline is None:
            baseline = getattr(self, 'baseline', None)

        # Stays None for a non-callable baseline; previously that case hit an
        # unbound `baseline_module` further down.
        baseline_module = None
        if callable(baseline):
            baseline_module = baseline
            self.baseline = baseline(self.obs, self.what, self.where,
                                     self.presence_prob)
        elif baseline is not None:
            # An already-built baseline value is used as given.
            self.baseline = baseline

        log_prob = self.num_steps_distrib.log_prob(self.num_step_per_sample)
        log_prob = tf.clip_by_value(log_prob, -1e38, 1e38)

        #     log_prob *= -1 # cause we're maximising
        self.importance_weight = loss._per_sample
        if baseline is not None:
            self.importance_weight -= self.baseline

        # The learning signal is treated as a constant w.r.t. the parameters.
        reinforce_loss_per_sample = tf.stop_gradient(
            self.importance_weight) * log_prob
        self.reinforce_loss = tf.reduce_mean(reinforce_loss_per_sample)
        tf.summary.scalar('reinforce_loss', self.reinforce_loss)

        # Baseline Optimisation
        baseline_vars, baseline_train_step = [], None
        if baseline is not None:
            baseline_target = tf.stop_gradient(loss.per_sample)
            baseline_loss_per_sample = (baseline_target - self.baseline)**2
            self.baseline_loss = tf.reduce_mean(baseline_loss_per_sample)
            tf.summary.scalar('baseline_loss', self.baseline_loss)

            if baseline_module is not None:
                baseline_vars = tf.get_collection(
                    tf.GraphKeys.TRAINABLE_VARIABLES,
                    scope=baseline_module.variable_scope.name)
                # The baseline is trained with a 10x larger learning rate.
                baseline_opt = make_opt(10 * self.learning_rate)
                baseline_train_step = baseline_opt.minimize(
                    self.baseline_loss, var_list=baseline_vars)

        return self.reinforce_loss, baseline_vars, baseline_train_step

    def train_step(self,
                   learning_rate,
                   l2_weight=0.,
                   appearance_prior=None,
                   where_scale_prior=None,
                   where_shift_prior=None,
                   num_steps_prior=None,
                   use_prior=True,
                   use_reinforce=True,
                   baseline=None):
        """Assemble the total loss and create the training ops.

        :param learning_rate: initial value of the non-trainable
            ``learning_rate`` variable.
        :param l2_weight: if positive, weight of the L2 penalty on all
            rank-2 model variables.
        :param appearance_prior, where_scale_prior, where_shift_prior,
            num_steps_prior: prior configurations forwarded to
            ``_prior_loss``; ``None`` disables a prior.
        :param use_prior: whether to add the prior loss.
        :param use_reinforce: whether to add the REINFORCE term.
        :param baseline: forwarded to ``_reinforce``.
        :return: ``(train_ops, global_step)`` where ``train_ops`` is the list
            of ops to run for one update.
        """

        self.l2_weight = l2_weight
        self.appearance_prior = appearance_prior
        self.where_scale_prior = where_scale_prior
        self.where_shift_prior = where_shift_prior
        self.num_steps_prior = num_steps_prior
        self.use_prior = use_prior
        self.use_reinforce = use_reinforce

        with tf.variable_scope('loss'):
            global_step = tf.train.get_or_create_global_step()
            loss = Loss()
            self._train_step = []
            self.learning_rate = tf.Variable(learning_rate,
                                             name='learning_rate',
                                             trainable=False)
            make_opt = lambda lr: tf.train.RMSPropOptimizer(
                lr, momentum=.9, centered=True)

            # Reconstruction Loss
            rec_loss_per_sample = -self.output_distrib.log_prob(self.obs)
            self.rec_loss_per_sample = tf.reduce_sum(rec_loss_per_sample,
                                                     axis=(1, 2))
            self.rec_loss = tf.reduce_mean(self.rec_loss_per_sample)
            tf.summary.scalar('rec', self.rec_loss)
            loss.add(self.rec_loss, self.rec_loss_per_sample)

            # Prior Loss
            if use_prior:
                self.prior_loss = self._prior_loss(appearance_prior,
                                                   where_scale_prior,
                                                   where_shift_prior,
                                                   num_steps_prior,
                                                   global_step)
                tf.summary.scalar('prior', self.prior_loss.value)
                loss.add(self.prior_loss)

            # REINFORCE
            opt_loss = loss.value
            baseline_vars = []
            if use_reinforce:
                reinforce_loss, baseline_vars, baseline_train_step = self._reinforce(
                    loss, make_opt, baseline)
                if baseline_train_step is not None:
                    self._train_step.append(baseline_train_step)

                opt_loss += reinforce_loss

            # The baseline variables are trained by their own op only.
            model_vars = list(
                set(tf.trainable_variables()) - set(baseline_vars))
            # L2 reg
            if l2_weight > 0.:
                # don't penalise biases
                weights = [w for w in model_vars if len(w.get_shape()) == 2]
                self.l2_loss = l2_weight * sum(map(tf.nn.l2_loss, weights))
                opt_loss += self.l2_loss
                tf.summary.scalar('l2', self.l2_loss)

            opt = make_opt(self.learning_rate)
            gvs = opt.compute_gradients(opt_loss, var_list=model_vars)
            true_train_step = opt.apply_gradients(gvs, global_step=global_step)
            self._train_step.append(true_train_step)

            # Metrics
            gradient_summaries(gvs)
            self.num_step_accuracy = tf.reduce_mean(
                tf.to_float(
                    tf.equal(self.gt_num_steps, self.num_step_per_sample)))

            self.loss = loss
            return self._train_step, global_step
    def __init__(self, args, d, logdir):
        """Build the Bernoulli embedding graph.

        After the parent constructor has run, this creates the input
        placeholder, the index masks selecting targets and their context
        windows, the embedding variables with a Normal prior, the negative
        samples, and finally ``self.loss``: the negative of
        ``n_epochs * log_likelihood + log_prior``.

        Parameters
        ----------
        args, d, logdir
            Forwarded unchanged to the parent constructor.
        """
        super(bern_emb_model, self).__init__(args, d, logdir)
        self.n_minibatch = self.n_minibatch.sum()
        # Number of context positions on each side of a target word.
        half_cs = int(self.cs / 2)

        with tf.name_scope('model'):
            # Data Placeholder
            with tf.name_scope('input'):
                self.placeholders = tf.placeholder(tf.int32)
                self.words = self.placeholders

            # Index Masks
            with tf.name_scope('context_mask'):
                # Positions of the target words inside the fed sequence.
                self.p_mask = tf.cast(
                    tf.range(half_cs, self.n_minibatch + half_cs), tf.int32)
                rows = tf.cast(
                    tf.tile(tf.expand_dims(tf.range(0, half_cs), [0]),
                            [self.n_minibatch, 1]), tf.int32)
                columns = tf.cast(
                    tf.tile(tf.expand_dims(tf.range(0, self.n_minibatch), [1]),
                            [1, half_cs]), tf.int32)
                # For target i: the half_cs positions before it followed by
                # the half_cs positions after it.
                self.ctx_mask = tf.concat(
                    [rows + columns, rows + columns + half_cs + 1], 1)

            with tf.name_scope('embeddings'):
                self.rho = tf.Variable(self.rho_init, name='rho')
                self.alpha = tf.Variable(self.alpha_init,
                                         name='alpha',
                                         trainable=self.alpha_trainable)

                with tf.name_scope('priors'):
                    prior = Normal(loc=0.0, scale=self.sig)
                    # alpha only enters the prior when it is being trained.
                    if self.alpha_trainable:
                        self.log_prior = tf.reduce_sum(
                            prior.log_prob(self.rho) +
                            prior.log_prob(self.alpha))
                    else:
                        self.log_prior = tf.reduce_sum(prior.log_prob(
                            self.rho))

            with tf.name_scope('natural_param'):
                # Taget and Context Indices
                with tf.name_scope('target_word'):
                    self.p_idx = tf.gather(self.words, self.p_mask)
                    self.p_rho = tf.squeeze(tf.gather(self.rho, self.p_idx))

                # Negative samples
                with tf.name_scope('negative_samples'):
                    # ns word indices per target, drawn with the log of the
                    # unigram values as logits.
                    unigram_logits = tf.tile(
                        tf.expand_dims(tf.log(tf.constant(self.unigram)), [0]),
                        [self.n_minibatch, 1])
                    self.n_idx = tf.multinomial(unigram_logits, self.ns)
                    self.n_rho = tf.gather(self.rho, self.n_idx)

                with tf.name_scope('context'):
                    self.ctx_idx = tf.squeeze(
                        tf.gather(self.words, self.ctx_mask))
                    self.ctx_alphas = tf.gather(self.alpha, self.ctx_idx)

                # Natural parameter
                # Inner product of each (target / negative) embedding with
                # the summed context vectors.
                ctx_sum = tf.reduce_sum(self.ctx_alphas, [1])
                self.p_eta = tf.expand_dims(
                    tf.reduce_sum(tf.multiply(self.p_rho, ctx_sum), -1), 1)
                self.n_eta = tf.reduce_sum(
                    tf.multiply(
                        self.n_rho,
                        tf.tile(tf.expand_dims(ctx_sum, 1), [1, self.ns, 1])),
                    -1)

            # Conditional likelihood
            self.y_pos = Bernoulli(logits=self.p_eta)
            self.y_neg = Bernoulli(logits=self.n_eta)

            # Targets are scored as 1, negative samples as 0.
            self.ll_pos = tf.reduce_sum(self.y_pos.log_prob(1.0))
            self.ll_neg = tf.reduce_sum(self.y_neg.log_prob(0.0))

            self.log_likelihood = self.ll_pos + self.ll_neg

            # The previously computed local `scale` (N / n_minibatch) was
            # never used; the likelihood is weighted by n_epochs.
            self.loss = -(self.n_epochs * self.log_likelihood + self.log_prior)
    def loss(self, G_data, y_data, batch):
        """Compute the negative log posterior for one sampled minibatch.

        Parameters
        ----------
        G_data : tf.Tensor
           Design matrix
        y_data : tf.SparseTensor
           Sparse tensor of counts
        batch : tuple of results tf.Tensor
           The output from sample().  The tuple is decomposed as follows

           positive_batch : tf.SparseTensor
              Sparse tensor of positive examples
           negative_batch : tf.SparseTensor
              Sparse tensor of negative examples
           accident_batch : tf.SparseTensor
              Sparse tensor of accidental positive examples.
              These are examples that are claimed to be negative,
              but are actually positive.  This is corrected downstream
              in the `inference` module.  These are added to
              the negative batch to correct the accident.
              Since Poisson(0) + Poisson(k) = Poisson(k), this should
              be equivalent.  Blame Google for this ugly hack.
              In this method their log-probability is added to the
              positive term and subtracted from the negative term.
           num_exp_pos : int
              Number of expected positive hits.  This is useful for
              scaling the minibatches appropriately.  Not used by this
              method.
           num_exp_neg : int
              Number of expected negative hits. This is useful for
              scaling the minibatches appropriately.  Not used by this
              method.

        Returns
        -------
        log_loss : tf.Tensor
           Negated sum of the Normal prior log-probabilities of
           ``self.qgamma`` and ``self.qbeta`` and the rescaled Poisson
           log-probabilities of the positive, negative and accident
           examples.
        """
        with tf.name_scope('loss'):
            opts = self.opts
            # num_exp_pos / num_exp_neg are unpacked only because they are
            # part of the batch tuple; the loss itself does not read them.
            (positive_batch, negative_batch, accident_batch, num_exp_pos,
             num_exp_neg) = batch
            gamma_mean, gamma_scale = opts.gamma_mean, opts.gamma_scale
            beta_mean, beta_scale = opts.beta_mean, opts.beta_scale
            N, D, p = self.N, self.D, self.p
            # tf.size produces an integer count; cast it explicitly instead
            # of requesting a float `out_type`, which the Size op does not
            # offer when the static shape is unknown.
            num_nonzero = tf.cast(tf.size(y_data.values), dtype=tf.float32)

            # unpack sparse tensors
            pos_data = positive_batch.values  # nonzero examples
            pos_row = tf.gather(positive_batch.indices, 0, axis=1)
            pos_col = tf.gather(positive_batch.indices, 1, axis=1)
            neg_data = negative_batch.values  # zero examples
            neg_row = tf.gather(negative_batch.indices, 0, axis=1)
            neg_col = tf.gather(negative_batch.indices, 1, axis=1)
            acc_data = accident_batch.values  # accident examples
            acc_row = tf.gather(accident_batch.indices, 0, axis=1)
            acc_col = tf.gather(accident_batch.indices, 1, axis=1)
            batch_size, num_sampled = opts.batch_size, opts.num_neg_samples

            # obtain prediction to then calculate loss
            Gpos = tf.gather(G_data, pos_row, axis=0)
            y_pred = self.inference(Gpos, pos_col)
            # log of the per-row total counts, used as a sample bias
            theta = tf.log(
                tf.cast(tf.sparse_reduce_sum(y_data, axis=1),
                        dtype=tf.float32))
            qbeta, qgamma = self.qbeta, self.qgamma

            # Actual calculation of loss is below.
            # Adding sample bias
            y_pred += tf.reshape(tf.gather(theta, pos_row), shape=[batch_size])
            # number of zero entries in the full N x D table
            total_zero = tf.constant(N * D, dtype=tf.float32) - num_nonzero
            pos_poisson = Poisson(log_rate=y_pred, name='Y')

            # Distributions species bias
            gamma = Normal(loc=tf.zeros([1, D]) + gamma_mean,
                           scale=tf.ones([1, D]) * gamma_scale,
                           name='gamma')
            # regression coefficients distribution
            beta = Normal(loc=tf.zeros([p, D]) + beta_mean,
                          scale=tf.ones([p, D]) * beta_scale,
                          name='B')

            # sparse matrix multiplication for negative samples
            Gneg = tf.gather(G_data, neg_row, axis=0)
            # prepend a column of ones (presumably pairing with an intercept
            # row of self.V -- confirm where self.V is built)
            Gneg = tf.concat([tf.ones([num_sampled, 1]), Gneg], axis=1)
            neg_prime = tf.reduce_sum(
                tf.multiply(
                    Gneg, tf.transpose(tf.gather(self.V, neg_col, axis=1))),
                axis=1)
            neg_phi = tf.reshape(tf.gather(theta, neg_row),
                                 shape=[num_sampled]) + neg_prime
            neg_poisson = Poisson(log_rate=neg_phi, name='neg_counts')

            # accident samples; their number is only known at run time
            num_acc = tf.shape(accident_batch.indices)[0]
            Gacc = tf.gather(G_data, acc_row, axis=0)
            Gacc = tf.concat([tf.ones([num_acc, 1]), Gacc], axis=1)
            acc_prime = tf.reduce_sum(
                tf.multiply(
                    Gacc, tf.transpose(tf.gather(self.V, acc_col, axis=1))),
                axis=1)
            acc_phi = tf.reshape(tf.gather(theta, acc_row),
                                 shape=[num_acc]) + acc_prime
            acc_poisson = Poisson(log_rate=acc_phi, name='acc_counts')

            pos_data = tf.cast(pos_data, dtype=tf.float32)
            neg_data = tf.cast(neg_data, dtype=tf.float32)
            acc_data = tf.cast(acc_data, dtype=tf.float32)

            # from here on num_acc is a float so it can enter the ratios
            num_acc = tf.cast(tf.size(acc_data), tf.float32)
            num_pos = batch_size + num_acc
            num_neg = num_sampled - num_acc

            pos_prob = pos_poisson.log_prob(pos_data)
            neg_prob = neg_poisson.log_prob(neg_data)
            acc_prob = acc_poisson.log_prob(acc_data)

            total_pos = tf.reduce_sum(pos_prob)
            total_acc = tf.reduce_sum(acc_prob)
            total_neg = tf.reduce_sum(neg_prob)
            total_gamma = tf.reduce_sum(gamma.log_prob(qgamma))
            total_beta = tf.reduce_sum(beta.log_prob(qbeta))

            # Rescale the minibatch sums to the size of the full table:
            # positives (plus accidents) stand in for all nonzero entries,
            # negatives (minus accidents) for all zero entries.
            log_loss = -(
                total_gamma + total_beta +
                (total_pos + total_acc) * (num_nonzero / num_pos) +
                (total_neg - total_acc) * (total_zero / num_neg)
            )
            return log_loss
Example #26
0
 def _log_prob1(mean, std, targets):
     """Return the log-probability of ``targets`` under ``Normal(mean, std)``."""
     return Normal(loc=mean, scale=std).log_prob(targets)
    def loss(self, G_data, y_data, positive_batch, random_batch):
        """Compute the negative log posterior from sampled table entries.

        Creates the coefficient variables ``qgamma`` and ``qbeta`` on every
        call and stores them, together with their concatenation ``V``, on
        ``self``.

        Parameters
        ----------
        G_data : tf.Tensor
           Design matrix
        y_data : tf.SparseTensor
           Sparse tensor of counts
        positive_batch : tf.Tensor
           A Sparse tensor representing a batch of positive examples.
        random_batch : tf.Tensor
           A Sparse tensor representing a batch of random examples.
           Only its indices are read.

        Returns
        -------
        log_loss : tf.Tensor
           Negated sum of the Normal prior log-probabilities of the
           coefficients and the rescaled Poisson term
           ``sum(y * log_rate) - sum(exp(log_rate))`` (the Poisson
           log-likelihood without its log-factorial constant).
        """
        with tf.name_scope('loss'):
            gamma_mean, gamma_scale = self.gamma_mean, self.gamma_scale
            beta_mean, beta_scale = self.beta_mean, self.beta_scale
            N, D, p = self.block_size, self.D, self.p
            num_nonzero = tf.cast(tf.size(y_data.values, out_type=tf.int32),
                                  dtype=tf.float32)

            # unpack sparse tensors
            pos_data = tf.cast(positive_batch.values, dtype=tf.float32)
            pos_row = tf.gather(positive_batch.indices, 0, axis=1)
            pos_col = tf.gather(positive_batch.indices, 1, axis=1)

            rand_row = tf.gather(random_batch.indices, 0, axis=1)
            rand_col = tf.gather(random_batch.indices, 1, axis=1)

            # Integer number of positive examples in this batch; it is cast
            # to float where the scaling ratio is formed, because tf.size
            # yields an integer count rather than a float.
            num_sampled = tf.size(pos_row, out_type=tf.int32)

            theta = tf.log(  # basically log total counts
                tf.cast(tf.sparse_reduce_sum(y_data, axis=1),
                        dtype=tf.float32))

            # Regression coefficients
            qgamma = tf.Variable(tf.random_normal([1, D]), name='qgamma')
            qbeta = tf.Variable(tf.random_normal([p, D]), name='qbeta')
            # Row 0 of V holds the intercepts (qgamma); it pairs with the
            # column of ones prepended to the design matrix below.
            self.V = tf.concat([qgamma, qbeta], axis=0, name='V')
            G = tf.concat([tf.ones([G_data.shape[0], 1]), G_data],
                          axis=1,
                          name='G')

            with tf.name_scope('positive_log_prob'):
                # add bias terms for samples
                Gpos = tf.gather(G, pos_row, axis=0)
                Vpos = tf.transpose(tf.gather(self.V, pos_col, axis=1),
                                    name='Vprime')
                # sparse matrix multiplication for positive samples
                y_pred = tf.reduce_sum(tf.multiply(Gpos, Vpos), axis=1)

                theta_pos = tf.squeeze(tf.gather(theta, pos_row))
                # sum over the batch of count * log-rate
                pos_prob = tf.reduce_sum(
                    tf.multiply(pos_data, y_pred + theta_pos))
                # scale the batch sum up to all nonzero entries
                sparse_scale = num_nonzero / tf.cast(num_sampled,
                                                     dtype=tf.float32)

            with tf.name_scope('coefficient_log_prob'):
                Grand = tf.gather(G, rand_row, axis=0)
                Vrand = tf.transpose(tf.gather(self.V, rand_col, axis=1),
                                     name='Vprime')
                # sparse matrix multiplication for random indices
                y_rand = tf.reduce_sum(tf.multiply(Grand, Vrand), axis=1)
                theta_rand = tf.squeeze(tf.gather(theta, rand_row))
                # sum over the random entries of the Poisson rate
                coef_prob = tf.reduce_sum(tf.exp(y_rand + theta_rand))
                # scale the random-entry sum up to the full N x D table
                coef_scale = N * D / self.num_neg_samples

            total_poisson = pos_prob * sparse_scale - coef_prob * coef_scale

            with tf.name_scope('priors'):
                # Normal priors (a.k.a. L2 regularization)
                # species intercepts
                gamma = Normal(loc=tf.zeros([1, D]) + gamma_mean,
                               scale=tf.ones([1, D]) * gamma_scale,
                               name='gamma')
                # regression coefficients distribution
                beta = Normal(loc=tf.zeros([p, D]) + beta_mean,
                              scale=tf.ones([p, D]) * beta_scale,
                              name='B')

                total_gamma = tf.reduce_sum(gamma.log_prob(qgamma))
                total_beta = tf.reduce_sum(beta.log_prob(qbeta))

            log_loss = -(total_gamma + total_beta + total_poisson)

            # save parameters to model
            self.qbeta = qbeta
            self.qgamma = qgamma

            return log_loss