Ejemplo n.º 1
0
class PosteriorBNNSampling(BanditAlgorithm):
  """Bandit algorithm that acts on the predictions of a (Bayesian) network.

  The object owns a ContextualDataset with the observed interactions and a
  model (`self.bnn`) that is periodically re-trained on that dataset.
  """

  def __init__(self, name, hparams, bnn_model='RMSProp'):
    """Builds the data buffer and the model selected by `bnn_model`.

    Args:
      name: Name of the algorithm; the model is named '<name>-bnn'.
      hparams: Hyper-parameters of the algorithm.
      bnn_model: 'Variational', 'AlphaDiv', 'Variational_BF' or 'GP' select
        the corresponding model class. Any other value (including the
        default 'RMSProp') selects a NeuralBanditModel that uses
        hparams.optimizer.
    """
    self.name = name
    self.hparams = hparams
    self.t = 0
    self.optimizer_n = hparams.optimizer
    self.training_epochs = hparams.training_epochs
    self.training_freq = hparams.training_freq

    self.data_h = ContextualDataset(
        hparams.context_dim, hparams.num_actions, hparams.buffer_s)

    model_name = '{}-bnn'.format(name)

    def build_model():
      # Only the constructor of the requested model is ever evaluated.
      if bnn_model == 'Variational':
        return VariationalNeuralBanditModel(hparams, model_name)
      if bnn_model == 'AlphaDiv':
        return BBAlphaDivergence(hparams, model_name)
      if bnn_model == 'Variational_BF':
        return BfVariationalNeuralBanditModel(hparams, model_name)
      if bnn_model == 'GP':
        return MultitaskGP(hparams)
      return NeuralBanditModel(self.optimizer_n, hparams, model_name)

    self.bnn = build_model()

  def action(self, context):
    """Returns the action with the highest model prediction for `context`.

    During the first num_actions * initial_pulls steps the actions are
    played in round-robin order instead of querying the model.
    """
    warmup_steps = self.hparams.num_actions * self.hparams.initial_pulls

    if self.t >= warmup_steps:
      with self.bnn.graph.as_default():
        features = context.reshape((1, self.hparams.context_dim))
        predictions = self.bnn.sess.run(
            self.bnn.y_pred, feed_dict={self.bnn.x: features})
        return np.argmax(predictions)

    # Still warming up: cycle through the actions.
    return self.t % self.hparams.num_actions

  def update(self, context, action, reward):
    """Stores the observation and re-trains every `training_freq` steps."""
    self.t += 1
    self.data_h.add(context, action, reward)

    if self.t % self.training_freq != 0:
      return

    if self.hparams.reset_lr:
      self.bnn.assign_lr()
    self.bnn.train(self.data_h, self.training_epochs)
Ejemplo n.º 2
0
class PosteriorBNNSampling(BanditAlgorithm):
    """Bandit algorithm that acts on the predictions of a (Bayesian) network.

    The object owns a ContextualDataset with the observed interactions and a
    model (`self.bnn`) that is periodically re-trained on that dataset.
    """
    def __init__(self, name, hparams, bnn_model='RMSProp'):
        """Builds the data buffer and the model selected by `bnn_model`.

        Args:
          name: Name of the algorithm; the model is named '<name>-bnn'.
          hparams: Hyper-parameters of the algorithm.
          bnn_model: 'Variational', 'AlphaDiv', 'Variational_BF' or 'GP'
            select the corresponding model class. Any other value (including
            the default 'RMSProp') selects a NeuralBanditModel that uses
            hparams.optimizer.
        """
        self.name = name
        self.hparams = hparams
        self.t = 0
        self.optimizer_n = hparams.optimizer
        self.training_epochs = hparams.training_epochs
        self.training_freq = hparams.training_freq

        self.data_h = ContextualDataset(
            hparams.context_dim, hparams.num_actions, hparams.buffer_s)

        model_name = '{}-bnn'.format(name)

        def build_model():
            # Only the constructor of the requested model is ever evaluated.
            if bnn_model == 'Variational':
                return VariationalNeuralBanditModel(hparams, model_name)
            if bnn_model == 'AlphaDiv':
                return BBAlphaDivergence(hparams, model_name)
            if bnn_model == 'Variational_BF':
                return BfVariationalNeuralBanditModel(hparams, model_name)
            if bnn_model == 'GP':
                return MultitaskGP(hparams)
            return NeuralBanditModel(self.optimizer_n, hparams, model_name)

        self.bnn = build_model()

    def action(self, context):
        """Returns the action with the highest model prediction for `context`.

        During the first num_actions * initial_pulls steps the actions are
        played in round-robin order instead of querying the model.
        """
        warmup_steps = self.hparams.num_actions * self.hparams.initial_pulls

        if self.t >= warmup_steps:
            with self.bnn.graph.as_default():
                features = context.reshape((1, self.hparams.context_dim))
                predictions = self.bnn.sess.run(
                    self.bnn.y_pred, feed_dict={self.bnn.x: features})
                return np.argmax(predictions)

        # Still warming up: cycle through the actions.
        return self.t % self.hparams.num_actions

    def update(self, context, action, reward):
        """Stores the observation and re-trains every `training_freq` steps."""
        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.training_freq != 0:
            return

        if self.hparams.reset_lr:
            self.bnn.assign_lr()
        self.bnn.train(self.data_h, self.training_epochs)
Ejemplo n.º 3
0
class NeuralLinearPosteriorSampling(BanditAlgorithm):
  """Full Bayesian linear regression on the last layer of a deep neural net.

  A NeuralBanditModel provides a latent representation of each context (its
  `nn` output). On top of that representation one Bayesian linear regression
  per action is maintained, with a Gaussian prior on the weights (beta_i) and
  an Inverse Gamma prior on the noise variance (sigma2_i).
  """

  def __init__(self, name, hparams, optimizer='RMS'):
    """Initialises priors, data buffers and the underlying network.

    Args:
      name: Name of the algorithm; the network is named '<name>-bnn'.
      hparams: Hyper-parameters. The constructor reads layer_sizes,
        lambda_prior, num_actions, a0, b0, training_freq,
        training_freq_network, training_epochs and context_dim; the other
        methods additionally read initial_pulls and reset_lr.
      optimizer: Optimizer identifier forwarded to NeuralBanditModel.
    """

    self.name = name
    self.hparams = hparams
    # The regression operates on the output of the last hidden layer.
    self.latent_dim = self.hparams.layer_sizes[-1]

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior

    self.mu = [
        np.zeros(self.latent_dim)
        for _ in range(self.hparams.num_actions)
    ]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.latent_dim)
                for _ in range(self.hparams.num_actions)]

    self.precision = [
        self.lambda_prior * np.eye(self.latent_dim)
        for _ in range(self.hparams.num_actions)
    ]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0

    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
    self.b = [self._b0 for _ in range(self.hparams.num_actions)]

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
    # data_h stores raw contexts, latent_h their latent representations.
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=False)
    self.latent_h = ContextualDataset(self.latent_dim,
                                      hparams.num_actions,
                                      intercept=False)
    self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly.

    Args:
      context: Array that can be reshaped to (1, hparams.context_dim).

    Returns:
      Index of the selected action. During the first
      num_actions * initial_pulls steps actions are played round-robin.
    """

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    # Sample sigma2, and beta conditional on sigma2
    sigma2_s = [
        self.b[i] * invgamma.rvs(self.a[i])
        for i in range(self.hparams.num_actions)
    ]

    try:
      beta_s = [
          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
          for i in range(self.hparams.num_actions)
      ]
    except np.linalg.LinAlgError as e:
      # Sampling could fail if covariance is not positive definite
      print('Exception when sampling for {}.'.format(self.name))
      # Format the exception itself: `e.message` does not exist on Python 3
      # and would turn this recoverable failure into an AttributeError.
      print('Details: {} | {}.'.format(e, e.args))
      # Fall back to a draw from a standard normal for every action.
      d = self.latent_dim
      beta_s = [
          np.random.multivariate_normal(np.zeros((d)), np.eye(d))
          for _ in range(self.hparams.num_actions)
      ]

    # Compute last-layer representation for the current context
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})

    # Apply Thompson Sampling to last-layer representation
    vals = [
        np.dot(beta_s[i], z_context.T) for i in range(self.hparams.num_actions)
    ]
    return np.argmax(vals)

  def update(self, context, action, reward):
    """Updates the posterior using linear bayesian regression formula.

    Stores the new observation, re-trains the network every
    `update_freq_nn` steps (recomputing all latent representations), and
    refreshes the per-action posteriors every `update_freq_lr` steps.

    Args:
      context: Array that can be reshaped to (1, hparams.context_dim).
      action: Index of the action that was played.
      reward: Reward observed for that action.
    """

    self.t += 1
    self.data_h.add(context, action, reward)
    c = context.reshape((1, self.hparams.context_dim))
    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
    self.latent_h.add(z_context, action, reward)

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:

      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

      # Update the latent representation of every datapoint collected so far
      new_z = self.bnn.sess.run(self.bnn.nn,
                                feed_dict={self.bnn.x: self.data_h.contexts})
      self.latent_h.replace_data(contexts=new_z)

    # Update the Bayesian Linear Regression
    if self.t % self.update_freq_lr == 0:

      # Find all the actions to update
      # NOTE(review): this slice keeps every stored action except the most
      # recent `update_freq_lr` entries; confirm this is the intended set.
      actions_to_update = self.latent_h.actions[:-self.update_freq_lr]

      for action_v in np.unique(actions_to_update):

        # Update action posterior with formulas: \beta | z,y ~ N(mu_q, cov_q)
        z, y = self.latent_h.get_data(action_v)

        # The algorithm could be improved with sequential formulas (cheaper)
        s = np.dot(z.T, z)

        # Some terms are removed as we assume prior mu_0 = 0.
        precision_a = s + self.lambda_prior * np.eye(self.latent_dim)
        cov_a = np.linalg.inv(precision_a)
        mu_a = np.dot(cov_a, np.dot(z.T, y))

        # Inverse Gamma posterior update
        a_post = self.a0 + z.shape[0] / 2.0
        b_upd = 0.5 * np.dot(y.T, y)
        b_upd -= 0.5 * np.dot(mu_a.T, np.dot(precision_a, mu_a))
        b_post = self.b0 + b_upd

        # Store new posterior distributions
        self.mu[action_v] = mu_a
        self.cov[action_v] = cov_a
        self.precision[action_v] = precision_a
        self.a[action_v] = a_post
        self.b[action_v] = b_post

  @property
  def a0(self):
    """Prior `a` parameter of the Inverse Gamma (hparams.a0)."""
    return self._a0

  @property
  def b0(self):
    """Prior `b` parameter of the Inverse Gamma (hparams.b0)."""
    return self._b0

  @property
  def lambda_prior(self):
    """Prior precision scale of the Gaussian on beta (hparams.lambda_prior)."""
    return self._lambda_prior
Ejemplo n.º 4
0
class NeuralUCBSampling(BanditAlgorithm):
    """UCB Sampling algorithm based on a neural network.

    The predicted reward of every action is increased by an exploration bonus
    `gamma * sqrt(g^T Zinv g / m)`, where `g` is the flattened gradient
    returned by `bnn.gradAction` for that action, `m = layer_sizes[0]`, and
    `Zinv` / `detZ` track the inverse and determinant of a matrix that is
    updated with the gradient of each played action.
    """
    def __init__(self, name, hparams, bnn_model='RMSProp', optimizer='RMS'):
        """Creates a NeuralUCBSampling object.

        Args:
          name: Name of the algorithm; the network is named '<name>-ucb'.
          hparams: Hyper-parameters of the algorithm. This class reads
            optimizer, training_freq, training_epochs, num_actions,
            context_dim, buffer_s, layer_sizes, lamb, initial_pulls,
            reset_lr, mu, delta and S.
          bnn_model: Not used by this class.
          optimizer: Not used by this class; hparams.optimizer is used
            instead.
        """

        self.name = name
        self.hparams = hparams
        self.optimizer_n = hparams.optimizer

        self.training_freq = hparams.training_freq
        self.training_epochs = hparams.training_epochs
        self.t = 0
        # Scale of the exploration bonus; recomputed at the end of update().
        self.gamma = 0

        # NOTE(review): self.bonus is never read inside this class.
        self.bonus = np.zeros(hparams.num_actions)
        # Constants multiplying the terms of the confidence radius (update()).
        self.C1 = 0.001
        self.C2 = 0.001
        self.C3 = 0.00001
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions, hparams.buffer_s)

        bnn_name = '{}-ucb'.format(name)
        self.bnn = NeuralBanditModel(self.optimizer_n, hparams, bnn_name)
        # Dimension of the flattened gradient vectors.
        # NOTE(review): presumably the number of network parameters assuming
        # every hidden layer has width layer_sizes[0] -- confirm against
        # NeuralBanditModel.
        self.p = (hparams.context_dim + 1) * (hparams.layer_sizes[0]) + (
            hparams.layer_sizes[0] + 1) * (hparams.layer_sizes[0]) * (
                len(hparams.layer_sizes) - 1) + (hparams.layer_sizes[0] +
                                                 1) * hparams.num_actions
        # Z starts as lamb * I, so its inverse is I / lamb and its
        # determinant is lamb ** p.
        self.Zinv = (1 / hparams.lamb) * np.eye(self.p)
        self.detZ = hparams.lamb**self.p

    def action(self, context):
        """Selects action for context based on UCB using the NN.

        Args:
          context: Array that can be reshaped to (1, hparams.context_dim).

        Returns:
          Index of the action maximising prediction plus bonus. During the
          first num_actions * initial_pulls steps actions are played
          round-robin.
        """

        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            # round robin until each action has been taken "initial_pulls" times
            return self.t % self.hparams.num_actions

        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            output = self.bnn.sess.run(self.bnn.y_pred,
                                       feed_dict={self.bnn.x: c})

            # Add the confidence bonus to the predicted rewards.
            # NOTE(review): gradAction presumably yields, per action, a list
            # of gradient arrays w.r.t. the network parameters -- confirm.
            listTensorGradients = self.bnn.sess.run(self.bnn.gradAction,
                                                    feed_dict={self.bnn.x: c})
            bonus = []
            for act in range(self.hparams.num_actions):
                # Flatten and concatenate all gradient pieces of this action.
                grads = np.array([])
                for el in listTensorGradients[act]:
                    grads = np.concatenate((grads, el.flatten()))
                bonus.append(self.gamma * np.sqrt(
                    grads.dot(self.Zinv.dot(grads)) /
                    self.hparams.layer_sizes[0]))
            output += np.array(bonus)
            print("Bonus of the actions", bonus)
            print("Gamma", self.gamma)

            return np.argmax(output)

    def update(self, context, action, reward):
        """Updates data buffer, Zinv/detZ and gamma; re-trains periodically.

        The network is re-trained every `training_freq` steps. On every call
        the gradient of the played action is used for a rank-one update of
        `Zinv` and `detZ`, and `gamma` is recomputed.

        Args:
          context: Array that can be reshaped to (1, -1) and fed to the
            network.
          action: Index of the action that was played.
          reward: Reward observed for that action.
        """

        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.training_freq == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.training_epochs)

        tensorGradients = self.bnn.sess.run(
            self.bnn.gradAction[action],
            feed_dict={self.bnn.x: context.reshape(1, -1)})
        grads = np.array([])
        for el in tensorGradients:
            grads = np.concatenate((grads, el.flatten()))

        # Rank-one update Z <- Z + g g^T / m with m = layer_sizes[0].
        # detZ is updated first because it needs the *old* Zinv; Zinv is
        # then updated with the Sherman-Morrison formula.
        outer = np.outer(grads, grads) / self.hparams.layer_sizes[0]
        self.detZ *= 1 + grads.dot(
            self.Zinv.dot(grads)) / self.hparams.layer_sizes[0]
        self.Zinv -= self.Zinv.dot(outer.dot(self.Zinv)) / (
            1 +
            (grads.T.dot(self.Zinv.dot(grads)) / self.hparams.layer_sizes[0]))

        # The three terms below are combined into gamma = el1 * el2 + el3.
        # NOTE(review): exponents such as (-1 / 6) or (7 / 6) rely on true
        # division (Python 3 or `from __future__ import division`) -- confirm.
        el1 = np.sqrt(1 + self.C1 * ((self.hparams.layer_sizes[0])**(-1 / 6)) *
                      np.sqrt(np.log(self.hparams.layer_sizes[0])) *
                      (len(self.hparams.layer_sizes)**4) * (self.t**(7 / 6)) *
                      (self.hparams.lamb**(-7 / 6)))
        # NOTE(review): the log-determinant ratio enters with a negative sign
        # here, while the NeuralUCB paper's radius uses
        # +log(det Z / det(lamb * I)) -- confirm the sign is intended.
        el2 = self.hparams.mu * np.sqrt(
            -np.log(self.detZ / (self.hparams.lamb**self.p)) + self.C2 *
            ((self.hparams.layer_sizes[0])**
             (-1 / 6)) * np.sqrt(np.log(self.hparams.layer_sizes[0])) *
            (len(self.hparams.layer_sizes)**4) * (self.t**(5 / 3)) *
            (self.hparams.lamb**
             (-1 / 6)) - 2 * np.log(self.hparams.delta)) + np.sqrt(
                 self.hparams.lamb) * self.hparams.S
        el3 = self.C3 * (
            (1 - self.hparams.mu * self.hparams.layer_sizes[0] *
             self.hparams.lamb)**
            (self.training_epochs) * np.sqrt(self.t / self.hparams.lamb) +
            ((self.hparams.layer_sizes[0])**
             (-1 / 6)) * np.sqrt(np.log(self.hparams.layer_sizes[0])) *
            (len(self.hparams.layer_sizes)**(7 / 2)) * (self.t**(5 / 3)) *
            (self.hparams.lamb**(-5 / 3)) *
            (1 + np.sqrt(self.t / self.hparams.lamb)))
        print("Profile Elements", el1, el2, el3)
        self.gamma = el1 * el2 + el3
class NeuralLinearPosteriorSamplingFiniteMemory(BanditAlgorithm):
    """Full Bayesian linear regression on the last layer of a deep neural net.

    Variant whose data buffers are created with `buffer_s=hparams.mem` and
    whose per-action priors (mean and precision) can be re-estimated each
    time the network is re-trained, controlled by hparams.mu_prior_flag and
    hparams.sigma_prior_flag.
    """

    def __init__(self, name, hparams, optimizer='RMS'):
        """Initialises priors, data buffers and the underlying network.

        Args:
          name: Name of the algorithm; the network is named '<name>-bnn'.
          hparams: Hyper-parameters. The constructor reads layer_sizes,
            lambda_prior, num_actions, mu_prior_flag, sigma_prior_flag, a0,
            b0, training_freq, training_freq_network, training_epochs,
            context_dim and mem; the other methods additionally read
            initial_pulls and reset_lr.
          optimizer: Optimizer identifier forwarded to NeuralBanditModel.
        """

        self.name = name
        self.hparams = hparams
        self.latent_dim = self.hparams.layer_sizes[-1]

        # Gaussian prior for each beta_i
        self._lambda_prior = self.hparams.lambda_prior

        self.mu = [
            np.zeros(self.latent_dim) for _ in range(self.hparams.num_actions)
        ]

        self.cov = [(1.0 / self.lambda_prior) * np.eye(self.latent_dim)
                    for _ in range(self.hparams.num_actions)]

        self.precision = [
            self.lambda_prior * np.eye(self.latent_dim)
            for _ in range(self.hparams.num_actions)
        ]
        self.mu_prior_flag = self.hparams.mu_prior_flag
        self.sigma_prior_flag = self.hparams.sigma_prior_flag

        # Priors used by update(); the list is a shallow copy of
        # self.precision, so both initially share the same matrices.
        self.precision_prior = self.precision[:]
        self.mu_prior = np.zeros((self.latent_dim, self.hparams.num_actions))
        # Inverse Gamma prior for each sigma2_i
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0

        self.a = [self._a0 for _ in range(self.hparams.num_actions)]
        self.b = [self._b0 for _ in range(self.hparams.num_actions)]

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        # data_h stores raw contexts, latent_h their latent representations.
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=False,
                                        buffer_s=hparams.mem)
        self.latent_h = ContextualDataset(self.latent_dim,
                                          hparams.num_actions,
                                          intercept=False,
                                          buffer_s=hparams.mem)
        self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly.

        Args:
          context: Array that can be reshaped to (1, hparams.context_dim).

        Returns:
          Index of the selected action. During the first
          num_actions * initial_pulls steps actions are played round-robin.
        """

        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [
            self.b[i] * invgamma.rvs(self.a[i])
            for i in range(self.hparams.num_actions)
        ]

        try:
            beta_s = [
                np.random.multivariate_normal(self.mu[i],
                                              sigma2_s[i] * self.cov[i])
                for i in range(self.hparams.num_actions)
            ]
        except np.linalg.LinAlgError as e:
            # Sampling could fail if covariance is not positive definite
            print('Exception when sampling for {}.'.format(self.name))
            # Format the exception itself: `e.message` does not exist on
            # Python 3 and would turn this recoverable failure into an
            # AttributeError.
            print('Details: {} | {}.'.format(e, e.args))
            # Fall back to a draw from a standard normal for every action.
            d = self.latent_dim
            beta_s = [
                np.random.multivariate_normal(np.zeros((d)), np.eye(d))
                for _ in range(self.hparams.num_actions)
            ]

        # Compute last-layer representation for the current context
        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(self.bnn.nn,
                                          feed_dict={self.bnn.x: c})

        # Apply Thompson Sampling to last-layer representation
        vals = [
            np.dot(beta_s[i], z_context.T)
            for i in range(self.hparams.num_actions)
        ]
        return np.argmax(vals)

    def calc_precision_prior(self, contexts):
        """Estimates a prior matrix per action for newly computed features.

        For every action the quadratic forms `c cov c^T` of the currently
        stored latent contexts are matched, in the least-squares sense, by
        `trace(X outer(c_new, c_new))` over a PSD matrix X, where `c_new`
        are the corresponding rows of `contexts`.

        Args:
          contexts: 2-D array with the new latent representation of the
            stored datapoints (row i corresponds to self.data_h.actions[i]).

        Returns:
          List with one matrix per action: inv(X + 0.01 * I), or
          inv(0.01 * I) when the action has no data or the solver returns no
          solution. The list is empty if self.cov is None.
        """
        precisions_return = []
        n, m = contexts.shape
        prior = (0.01) * np.eye(self.latent_dim)

        if self.cov is not None:
            for action, cov in enumerate(self.cov):
                ind = np.array(
                    [i for i in range(n) if self.data_h.actions[i] == action])
                if len(ind) > 0:
                    # Compute confidence scores for old data
                    d = []
                    for c in self.latent_h.contexts[ind, :]:
                        d.append(np.dot(np.dot(c, cov), c.T))
                    # Compute new data correlations
                    phi = []
                    for c in contexts[ind, :]:
                        phi.append(np.outer(c, c))

                    X = cvx.Variable((m, m), PSD=True)
                    # Form objective. `d` and `phi` are built from the same
                    # index set, so they are paired one-to-one. (`xrange` was
                    # replaced because it does not exist on Python 3.)
                    obj = cvx.Minimize(
                        sum([(cvx.trace(X * feat) - score)**2
                             for feat, score in zip(phi, d)]))
                    prob = cvx.Problem(obj)
                    prob.solve()
                    if X.value is None:
                        precisions_return.append(np.linalg.inv(prior))
                    else:
                        precisions_return.append(np.linalg.inv(X.value +
                                                               prior))
                else:
                    precisions_return.append(np.linalg.inv(prior))
        return precisions_return

    def update(self, context, action, reward):
        """Updates the posterior using linear bayesian regression formula.

        Stores the new observation, re-trains the network every
        `update_freq_nn` steps (recomputing latent representations and,
        depending on the flags, the priors), and refreshes the per-action
        posteriors every `update_freq_lr` steps.

        Args:
          context: Array that can be reshaped to (1, hparams.context_dim).
          action: Index of the action that was played.
          reward: Reward observed for that action.
        """

        self.t += 1
        self.data_h.add(context, action, reward)
        c = context.reshape((1, self.hparams.context_dim))
        z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
        self.latent_h.add(z_context, action, reward)

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:

            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

            # Update the latent representation of every datapoint collected so far

            new_z = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: self.data_h.contexts})

            # Update the confidence prior using feature uncertainty matching.
            # This must run before replace_data(): calc_precision_prior reads
            # the old latent contexts still stored in latent_h.
            if self.sigma_prior_flag == 1:
                self.precision_prior = self.calc_precision_prior(
                    contexts=new_z)
            self.latent_h.replace_data(contexts=new_z)
            # Update the mean prior using the weights of the NN
            if self.mu_prior_flag == 1:
                self.mu_prior = self.bnn.get_mu_prior()

        # Update the Bayesian Linear Regression
        if self.t % self.update_freq_lr == 0:

            # Find all the actions to update
            # NOTE(review): this slice keeps every stored action except the
            # most recent `update_freq_lr` entries; confirm this is the
            # intended set.
            actions_to_update = self.latent_h.actions[:-self.update_freq_lr]

            for action_v in np.unique(actions_to_update):

                # Update action posterior with formulas: \beta | z,y ~ N(mu_q, cov_q)
                z, y = self.latent_h.get_data(action_v)

                # The algorithm could be improved with sequential formulas (cheaper)
                s = np.dot(z.T, z)

                # Get priors
                sigma0 = self.precision_prior[action_v]
                mu_0 = self.mu_prior[:, action_v]

                # Calc mean and precision using bayesian linear regression
                precision_a = s + sigma0
                cov_a = np.linalg.inv(precision_a)
                mu_a = np.dot(cov_a, (np.dot(z.T, y) + np.dot(sigma0, mu_0)))

                # Inverse Gamma posterior update
                a_post = self.a0 + z.shape[0] / 2.0
                b_upd = 0.5 * np.dot(y.T, y)
                b_upd += 0.5 * np.dot(mu_0.T, np.dot(sigma0, mu_0))
                b_upd -= 0.5 * np.dot(mu_a.T, np.dot(precision_a, mu_a))
                b_post = self.b0 + b_upd

                # Store new posterior distributions.
                # NOTE: self.precision is not refreshed here; that assignment
                # was already disabled in the original code.
                self.mu[action_v] = mu_a
                self.cov[action_v] = cov_a
                self.a[action_v] = a_post
                self.b[action_v] = b_post

    @property
    def a0(self):
        """Prior `a` parameter of the Inverse Gamma (hparams.a0)."""
        return self._a0

    @property
    def b0(self):
        """Prior `b` parameter of the Inverse Gamma (hparams.b0)."""
        return self._b0

    @property
    def lambda_prior(self):
        """Prior precision scale of the Gaussian on beta (hparams.lambda_prior)."""
        return self._lambda_prior
Ejemplo n.º 6
0
class NeuralLinearPosteriorSampling(BanditAlgorithm):
  """Full Bayesian linear regression on the last layer of a deep neural net.

  A NeuralBanditModel provides a latent representation of each context (its
  `nn` output). On top of that representation one Bayesian linear regression
  per action is maintained, with a Gaussian prior on the weights (beta_i) and
  an Inverse Gamma prior on the noise variance (sigma2_i).
  """

  def __init__(self, name, hparams, optimizer='RMS'):
    """Initialises priors, data buffers and the underlying network.

    Args:
      name: Name of the algorithm; the network is named '<name>-bnn'.
      hparams: Hyper-parameters. The constructor reads layer_sizes,
        lambda_prior, num_actions, a0, b0, training_freq,
        training_freq_network, training_epochs and context_dim; the other
        methods additionally read initial_pulls and reset_lr.
      optimizer: Optimizer identifier forwarded to NeuralBanditModel.
    """

    self.name = name
    self.hparams = hparams
    # The regression operates on the output of the last hidden layer.
    self.latent_dim = self.hparams.layer_sizes[-1]

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior

    self.mu = [
        np.zeros(self.latent_dim)
        for _ in range(self.hparams.num_actions)
    ]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.latent_dim)
                for _ in range(self.hparams.num_actions)]

    self.precision = [
        self.lambda_prior * np.eye(self.latent_dim)
        for _ in range(self.hparams.num_actions)
    ]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0

    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
    self.b = [self._b0 for _ in range(self.hparams.num_actions)]

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
    # data_h stores raw contexts, latent_h their latent representations.
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=False)
    self.latent_h = ContextualDataset(self.latent_dim,
                                      hparams.num_actions,
                                      intercept=False)
    self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly.

    Args:
      context: Array that can be reshaped to (1, hparams.context_dim).

    Returns:
      Index of the selected action. During the first
      num_actions * initial_pulls steps actions are played round-robin.
    """

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    # Sample sigma2, and beta conditional on sigma2
    sigma2_s = [
        self.b[i] * invgamma.rvs(self.a[i])
        for i in range(self.hparams.num_actions)
    ]

    try:
      beta_s = [
          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
          for i in range(self.hparams.num_actions)
      ]
    except np.linalg.LinAlgError as e:
      # Sampling could fail if covariance is not positive definite
      print('Exception when sampling for {}.'.format(self.name))
      # Format the exception itself: `e.message` does not exist on Python 3
      # and would turn this recoverable failure into an AttributeError.
      print('Details: {} | {}.'.format(e, e.args))
      # Fall back to a draw from a standard normal for every action.
      d = self.latent_dim
      beta_s = [
          np.random.multivariate_normal(np.zeros((d)), np.eye(d))
          for _ in range(self.hparams.num_actions)
      ]

    # Compute last-layer representation for the current context
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})

    # Apply Thompson Sampling to last-layer representation
    vals = [
        np.dot(beta_s[i], z_context.T) for i in range(self.hparams.num_actions)
    ]
    return np.argmax(vals)

  def update(self, context, action, reward):
    """Updates the posterior using linear bayesian regression formula.

    Stores the new observation, re-trains the network every
    `update_freq_nn` steps (recomputing all latent representations), and
    refreshes the per-action posteriors every `update_freq_lr` steps.

    Args:
      context: Array that can be reshaped to (1, hparams.context_dim).
      action: Index of the action that was played.
      reward: Reward observed for that action.
    """

    self.t += 1
    self.data_h.add(context, action, reward)
    c = context.reshape((1, self.hparams.context_dim))
    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
    self.latent_h.add(z_context, action, reward)

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:

      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

      # Update the latent representation of every datapoint collected so far
      new_z = self.bnn.sess.run(self.bnn.nn,
                                feed_dict={self.bnn.x: self.data_h.contexts})
      self.latent_h.replace_data(contexts=new_z)

    # Update the Bayesian Linear Regression
    if self.t % self.update_freq_lr == 0:

      # Find all the actions to update
      # NOTE(review): this slice keeps every stored action except the most
      # recent `update_freq_lr` entries; confirm this is the intended set.
      actions_to_update = self.latent_h.actions[:-self.update_freq_lr]

      for action_v in np.unique(actions_to_update):

        # Update action posterior with formulas: \beta | z,y ~ N(mu_q, cov_q)
        z, y = self.latent_h.get_data(action_v)

        # The algorithm could be improved with sequential formulas (cheaper)
        s = np.dot(z.T, z)

        # Some terms are removed as we assume prior mu_0 = 0.
        precision_a = s + self.lambda_prior * np.eye(self.latent_dim)
        cov_a = np.linalg.inv(precision_a)
        mu_a = np.dot(cov_a, np.dot(z.T, y))

        # Inverse Gamma posterior update
        a_post = self.a0 + z.shape[0] / 2.0
        b_upd = 0.5 * np.dot(y.T, y)
        b_upd -= 0.5 * np.dot(mu_a.T, np.dot(precision_a, mu_a))
        b_post = self.b0 + b_upd

        # Store new posterior distributions
        self.mu[action_v] = mu_a
        self.cov[action_v] = cov_a
        self.precision[action_v] = precision_a
        self.a[action_v] = a_post
        self.b[action_v] = b_post

  @property
  def a0(self):
    """Prior `a` parameter of the Inverse Gamma (hparams.a0)."""
    return self._a0

  @property
  def b0(self):
    """Prior `b` parameter of the Inverse Gamma (hparams.b0)."""
    return self._b0

  @property
  def lambda_prior(self):
    """Prior precision scale of the Gaussian on beta (hparams.lambda_prior)."""
    return self._lambda_prior
Ejemplo n.º 7
0
class NeuralLinearPosteriorSamplingFiniteMemory(BanditAlgorithm):
    """Thompson sampling with Bayesian linear regression on network features.

    The network ``self.bnn`` maps a context to its last-layer representation,
    and one Normal / Inverse-Gamma linear regression per action is maintained
    on top of that representation. Raw contexts (``data_h``) and their
    representations (``latent_h``) are kept in datasets created with
    ``buffer_s=hparams.mem`` (presumably a cap on the number of stored
    samples; confirm against ``ContextualDataset``). Each time the network is
    retrained the regression priors may be re-estimated (``sigma_prior_flag``
    and ``mu_prior_flag``) before the per-action posteriors are rebuilt.
    """

    def __init__(self, name, hparams, textflag='no', optimizer='RMS'):
        """Builds the datasets, the network and the per-action priors.

        Args:
          name: Name of the algorithm; also used to name the network.
          hparams: Hyper-parameters. Read here: layer_sizes, lambda_prior,
            mu_prior_flag, sigma_prior_flag, a0, b0, training_freq,
            training_freq_network, training_epochs, context_dim, num_actions,
            mem and, for the text model only, batch_size.
          textflag: 'yes' selects TextCNN, anything else NeuralBanditModel.
          optimizer: Optimizer name forwarded to NeuralBanditModel.
        """
        self.name = name
        self.hparams = hparams
        num_actions = hparams.num_actions

        self.latent_dim = hparams.layer_sizes[-1]
        self.intercept = False
        self.param_dim = (1 + self.latent_dim if self.intercept
                          else self.latent_dim)
        self.EPSILON = 0.00001

        # Gaussian prior for each beta_i
        self._lambda_prior = hparams.lambda_prior
        self.before = []
        self.after = []

        self.mu = [np.zeros(self.param_dim) for _ in range(num_actions)]
        self.f = [np.zeros(self.param_dim) for _ in range(num_actions)]
        self.yy = [0 for _ in range(num_actions)]
        self.cov = [(1.0 / self.lambda_prior) * np.eye(self.param_dim)
                    for _ in range(num_actions)]
        self.precision = [self.lambda_prior * np.eye(self.param_dim)
                          for _ in range(num_actions)]

        self.mu_prior_flag = hparams.mu_prior_flag
        self.sigma_prior_flag = hparams.sigma_prior_flag

        # Deep copies: update() accumulates into self.precision[action] in
        # place, which must not leak into the prior matrices.
        self.precision_prior = [p.copy() for p in self.precision]
        self.mu_prior = np.zeros((self.param_dim, num_actions))

        # Inverse Gamma prior for each sigma2_i
        self._a0 = hparams.a0
        self._b0 = hparams.b0
        self.a = [self._a0 for _ in range(num_actions)]
        self.b = [self._b0 for _ in range(num_actions)]

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(hparams.context_dim,
                                        num_actions,
                                        intercept=False,
                                        buffer_s=hparams.mem)
        self.latent_h = ContextualDataset(self.latent_dim,
                                          num_actions,
                                          intercept=self.intercept,
                                          buffer_s=hparams.mem)
        bnn_name = '{}-bnn'.format(name)
        if textflag == 'yes':
            self.bnn = TextCNN('adam', num_actions, hparams.batch_size,
                               bnn_name)
        else:
            self.bnn = NeuralBanditModel(optimizer, hparams, bnn_name)

    def action(self, context):
        """Samples regression weights from the posterior and picks an action.

        Args:
          context: Array that can be reshaped to (1, hparams.context_dim).

        Returns:
          Index of the selected action.
        """
        num_actions = self.hparams.num_actions

        # Round robin until each action has been selected "initial_pulls" times
        if self.t < num_actions * self.hparams.initial_pulls:
            return self.t % num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [self.b[i] * invgamma.rvs(self.a[i])
                    for i in range(num_actions)]

        try:
            beta_s = [
                np.random.multivariate_normal(self.mu[i],
                                              sigma2_s[i] * self.cov[i])
                for i in range(num_actions)
            ]
        except np.linalg.LinAlgError:
            # Sampling could fail if covariance is not positive definite.
            # The fallback draw must have the regression dimension
            # (param_dim), which differs from latent_dim with an intercept.
            d = self.param_dim
            beta_s = [
                np.random.multivariate_normal(np.zeros(d), np.eye(d))
                for _ in range(num_actions)
            ]

        # Compute last-layer representation for the current context
        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(self.bnn.nn,
                                          feed_dict={self.bnn.x: c})
            if self.intercept:
                z_context = np.append(z_context, 1.0).reshape(
                    (1, self.latent_dim + 1))

        # Apply Thompson Sampling to last-layer representation
        vals = [np.dot(beta_s[i], z_context.T) for i in range(num_actions)]
        return np.argmax(vals)

    def calc_precision_prior(self, contexts):
        """Re-estimates every action's prior after the features changed.

        For each action with stored samples a PSD matrix X is fitted with
        cvxpy so that trace(X z z^T), for the rows z of `contexts` belonging
        to the action, matches the scores c cov c^T computed from
        `latent_h.contexts` and the current `self.cov[action]`.
        `self.cov[action]` is overwritten with X + EPSILON * I (or with
        EPSILON * I when the action has no samples or the solver returns no
        value) and the inverse of that matrix is returned as prior precision.

        Args:
          contexts: 2-D array of representations; row i is matched with
            `data_h.actions[i]`.

        Returns:
          List with one prior precision matrix per action (empty when
          `self.cov` is None).
        """
        precisions_return = []
        n, m = contexts.shape
        prior = self.EPSILON * np.eye(self.param_dim)

        if self.cov is None:
            return precisions_return

        for action, cov in enumerate(self.cov):
            ind = np.array(
                [i for i in range(n) if self.data_h.actions[i] == action])
            fitted = None
            if len(ind) > 0:
                # Confidence scores under the current covariance.
                # NOTE(review): update() replaces latent_h's contexts before
                # calling this method, so these scores are computed from the
                # already-updated representations - confirm this is intended.
                d = [np.dot(np.dot(c, cov), c.T)
                     for c in self.latent_h.contexts[ind, :]]
                # Outer products of the new representations.
                phi = [np.outer(c, c) for c in contexts[ind, :]]

                X = cvx.Variable((m, m), PSD=True)
                # Least-squares match between trace(X z z^T) and the scores.
                obj = cvx.Minimize(
                    sum([(cvx.trace(X * outer) - score)**2
                         for outer, score in zip(phi, d)]))
                prob = cvx.Problem(obj)
                prob.solve()
                fitted = X.value

            new_cov = prior.copy() if fitted is None else fitted + prior
            precisions_return.append(np.linalg.inv(new_cov))
            self.cov[action] = new_cov

        return precisions_return

    def _refresh_posterior(self, action):
        """Recomputes cov, mu and b of `action` from its current statistics."""
        prior_term = np.dot(self.precision_prior[action],
                            self.mu_prior[:, action])
        self.cov[action] = np.linalg.inv(self.precision[action])
        self.mu[action] = np.dot(self.cov[action],
                                 self.f[action] + prior_term)

        # Inverse Gamma posterior update
        b_upd = 0.5 * self.yy[action]
        b_upd += 0.5 * np.dot(self.mu_prior[:, action].T, prior_term)
        b_upd -= 0.5 * np.dot(self.mu[action].T,
                              np.dot(self.precision[action], self.mu[action]))
        self.b[action] = self.b0 + b_upd

    def _retrain_and_rebuild(self):
        """Retrains the network and rebuilds precision / f for all actions."""
        if self.hparams.reset_lr:
            self.bnn.assign_lr()
        self.bnn.train(self.data_h, self.num_epochs)

        # Update the latent representation of every datapoint collected so far
        new_z = self.bnn.sess.run(
            self.bnn.nn, feed_dict={self.bnn.x: self.data_h.contexts})
        self.latent_h.replace_data(contexts=new_z)

        # Update the confidence prior using feature uncertainty matching
        if self.sigma_prior_flag == 1:
            features = np.asarray(new_z)
            if self.intercept:
                ones = np.ones((features.shape[0], 1))
                features = np.hstack([features, ones])
            self.precision_prior = self.calc_precision_prior(
                contexts=features)

        # Update the mean prior using the weights of the NN
        if self.mu_prior_flag == 1:
            weights_p, bias_p = self.bnn.get_mu_prior()
            self.mu_prior[:self.latent_dim] = weights_p
            # Only a model with an intercept has a row for the bias; without
            # one the last row is a weight row and must not be overwritten.
            if self.intercept:
                self.mu_prior[-1] = bias_p

        # Update the Bayesian Linear Regression statistics.
        # The algorithm could be improved with sequential formulas (cheaper)
        for action_v in range(self.hparams.num_actions):
            z, y = self.latent_h.get_data(action_v)
            self.precision[action_v] = (np.dot(z.T, z) +
                                        self.precision_prior[action_v])
            self.f[action_v] = np.dot(z.T, y)

    def update(self, context, action, reward):
        """Stores the observation and updates the regression posterior.

        Every `update_freq_nn` steps the network is retrained and the
        posterior of every action is rebuilt in the new feature space, so
        that `action()` never mixes new features with parameters fitted on
        old ones. On all other steps only the pulled action is updated
        incrementally.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)
        c = context.reshape((1, self.hparams.context_dim))
        z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
        self.latent_h.add(z_context, action, reward)

        # Running Inverse Gamma statistics of the pulled action.
        self.yy[action] += reward**2
        self.a[action] += 0.5

        if self.t % self.update_freq_nn == 0:
            self._retrain_and_rebuild()
            stale_actions = range(self.hparams.num_actions)
        else:
            if self.intercept:
                z_context = np.append(z_context, 1.0).reshape(
                    (1, self.latent_dim + 1))
            self.precision[action] += np.dot(z_context.T, z_context)
            self.f[action] += (z_context.T * reward)[:, 0]
            stale_actions = [action]

        # Calc mean and covariance using bayesian linear regression
        for action_v in stale_actions:
            self._refresh_posterior(action_v)

    @property
    def a0(self):
        """Inverse Gamma prior parameter `a0` taken from hparams."""
        return self._a0

    @property
    def b0(self):
        """Inverse Gamma prior parameter `b0` taken from hparams."""
        return self._b0

    @property
    def lambda_prior(self):
        """Scale of the initial identity precision taken from hparams."""
        return self._lambda_prior

    def calc_model_evidence(self):
        """Returns the model evidence averaged over actions.

        For every action the posterior is recomputed from `latent_h` with the
        current `precision_prior` / `mu_prior`, and the resulting evidence
        terms are averaged.
        """
        vval = 0
        for action in range(self.hparams.num_actions):
            sigma0 = self.precision_prior[action]
            mu_0 = self.mu_prior[:, action]
            z, y = self.latent_h.get_data(action)
            n = z.shape[0]
            s = np.dot(z.T, z)
            s_n = (sigma0 + s)
            cov_a = np.linalg.inv(s_n)
            mu_a = np.dot(cov_a, (np.dot(z.T, y) + np.dot(sigma0, mu_0)))

            a_post = (self.a0 + n / 2.0)
            b_upd = 0.5 * np.dot(y.T, y)
            b_upd += 0.5 * np.dot(mu_0.T, np.dot(sigma0, mu_0))
            b_upd -= 0.5 * np.dot(mu_a.T, np.dot(s_n, mu_a))
            b_post = self.b0 + b_upd

            # np.longdouble is the portable spelling of np.float128, which
            # does not exist on every platform.
            val = np.longdouble(1)
            val /= ((np.longdouble(2.0) * math.pi)**(n / 2.0))
            val *= (gamma(a_post) / gamma(self.a0))
            val *= np.sqrt(np.linalg.det(sigma0) / np.linalg.det(s_n))
            val *= ((self.hparams.b0**self.hparams.a0) / (b_post**a_post))
            vval += val
        vval /= self.hparams.num_actions
        return vval
Ejemplo n.º 8
0
class NeuralLinearEpsilonGreedy(BanditAlgorithm):
    """Epsilon-greedy action selection on the predictions of a neural net.

    With probability `hparams.epsilon` a uniformly random action is played;
    otherwise the action with the largest network prediction is played. The
    last-layer representation of every observed context is additionally
    stored in `latent_h`.
    """

    def __init__(self, name, hparams, textflag='yes', optimizer='RMS'):
        """Builds the datasets and the network.

        Args:
          name: Name of the algorithm; also used to name the network.
          hparams: Hyper-parameters. Read here: epsilon, layer_sizes,
            training_freq, training_freq_network, training_epochs,
            context_dim, num_actions and, for the text model only,
            batch_size. a0, b0 and lambda_prior are read when present.
          textflag: 'yes' selects TextCNN, anything else NeuralBanditModel.
          optimizer: Optimizer name forwarded to NeuralBanditModel.
        """
        self.name = name
        self.hparams = hparams
        self.epsilon = hparams.epsilon
        self.latent_dim = hparams.layer_sizes[-1]
        self.intercept = True
        self.param_dim = (1 + self.latent_dim if self.intercept
                          else self.latent_dim)

        # Backing fields of the a0 / b0 / lambda_prior properties. They were
        # never assigned before, so reading any of the properties raised
        # AttributeError. None is stored when hparams lacks the value.
        self._a0 = getattr(hparams, 'a0', None)
        self._b0 = getattr(hparams, 'b0', None)
        self._lambda_prior = getattr(hparams, 'lambda_prior', None)

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=False)
        self.latent_h = ContextualDataset(self.latent_dim,
                                          hparams.num_actions,
                                          intercept=self.intercept)
        bnn_name = '{}-bnn'.format(name)
        if textflag == 'yes':
            self.bnn = TextCNN('adam', hparams.num_actions,
                               hparams.batch_size, bnn_name)
        else:
            self.bnn = NeuralBanditModel(optimizer, hparams, bnn_name)

    def action(self, context):
        """Chooses an action epsilon-greedily from the network predictions.

        Args:
          context: Array that can be reshaped to (1, hparams.context_dim).

        Returns:
          Index of the selected action.
        """
        num_actions = self.hparams.num_actions

        # Round robin until each action has been selected "initial_pulls" times
        if self.t < num_actions * self.hparams.initial_pulls:
            return self.t % num_actions

        # Decide first, so the forward pass is skipped on exploration steps.
        if random.random() <= self.epsilon:
            return random.randrange(num_actions)

        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            y = self.bnn.sess.run(self.bnn.y_pred, feed_dict={self.bnn.x: c})
        return np.argmax(y)

    def update(self, context, action, reward):
        """Stores the observation and periodically retrains the network.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)
        c = context.reshape((1, self.hparams.context_dim))
        z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
        self.latent_h.add(z_context, action, reward)

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

    @property
    def a0(self):
        """Value of hparams.a0, or None when hparams does not define it."""
        return self._a0

    @property
    def b0(self):
        """Value of hparams.b0, or None when hparams does not define it."""
        return self._b0

    @property
    def lambda_prior(self):
        """Value of hparams.lambda_prior, or None when not defined."""
        return self._lambda_prior
Ejemplo n.º 9
0
class NeuralGreedy(BanditAlgorithm):
    """Epsilon-greedy on a neural network with a decaying exploration rate.

    The exploration probability `eps` starts at 0.9 and is multiplied by
    `decay` (0.99) on every call to `update`.
    """

    def __init__(self, name, hparams, optimizer='RMS'):
        """Builds the dataset and the network.

        Args:
          name: Name of the algorithm; also used to name the network.
          hparams: Hyper-parameters. Read here: training_freq,
            training_freq_network, training_epochs, context_dim and
            num_actions. a0, b0 and lambda_prior are read when present.
          optimizer: Optimizer name forwarded to NeuralBanditModel.
        """
        self.name = name
        self.eps = 0.9
        self.decay = 0.99  # computed for 10,000 steps
        self.hparams = hparams

        # Backing fields of the a0 / b0 / lambda_prior properties. They were
        # never assigned before, so reading any of the properties raised
        # AttributeError. None is stored when hparams lacks the value.
        self._a0 = getattr(hparams, 'a0', None)
        self._b0 = getattr(hparams, 'b0', None)
        self._lambda_prior = getattr(hparams, 'lambda_prior', None)

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=False)
        self.bnn = NeuralBanditModel(optimizer, hparams,
                                     '{}-greedy'.format(name))

    def action(self, context):
        """Plays a random action with probability `eps`, else the greedy one.

        No round-robin warm-up is performed.

        Args:
          context: Array that can be reshaped to (1, hparams.context_dim).

        Returns:
          Index of the selected action.
        """
        if np.random.random() < self.eps:
            return np.random.choice(range(self.hparams.num_actions))

        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            output = self.bnn.sess.run(self.bnn.y_pred,
                                       feed_dict={self.bnn.x: c})
        return np.argmax(output)

    def update(self, context, action, reward):
        """Stores the observation, decays `eps`, periodically retrains.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.eps *= self.decay
        self.data_h.add(context, action, reward)

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

    @property
    def a0(self):
        """Value of hparams.a0, or None when hparams does not define it."""
        return self._a0

    @property
    def b0(self):
        """Value of hparams.b0, or None when hparams does not define it."""
        return self._b0

    @property
    def lambda_prior(self):
        """Value of hparams.lambda_prior, or None when not defined."""
        return self._lambda_prior
Ejemplo n.º 10
0
class NeuralLinUCB(BanditAlgorithm):
    """LinUCB on the last-layer representation of a neural network.

    For every action a ridge-regression design matrix `a[action]`, its
    inverse `inv_a[action]`, a response vector `b[action]` and the weights
    `theta[action] = inv_a[action] . b[action]` are maintained over the
    network features. Actions are scored by the predicted reward plus
    `alpha` times the confidence width.
    """

    def __init__(self, name, hparams, optimizer='RMS'):
        """Builds the per-action statistics, the dataset and the network.

        Args:
          name: Name of the algorithm; also used to name the network.
          hparams: Hyper-parameters. Read here: num_actions, layer_sizes,
            alpha, lam, training_freq_network, training_epochs, context_dim
            and, when present, bootstrap.
          optimizer: Optimizer name forwarded to NeuralBanditModel.
        """
        self.name = name
        self.hparams = hparams
        self.n_a = self.hparams.num_actions
        self.n_d = self.hparams.layer_sizes[-1]
        self.alpha = self.hparams.alpha
        self.lam = self.hparams.lam

        # One (n_d, n_d) matrix per action, stacked along the first axis.
        identity = np.eye(self.n_d)
        self.a = np.array([identity * self.lam for _ in range(self.n_a)])
        self.inv_a = np.array([identity / self.lam for _ in range(self.n_a)])

        self.b = np.zeros((self.n_a, self.n_d))
        self.theta = np.zeros((self.n_a, self.n_d))

        # Params for BNN
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(
            hparams.context_dim,
            hparams.num_actions,
            bootstrap=getattr(hparams, 'bootstrap', None),
            intercept=False)

        self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

    def action(self, context):
        """Selects the action with the largest upper confidence bound.

        Args:
          context: Context for which the action need to be chosen.

        Returns:
          action: Selected action for the context.
        """
        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: c}).flatten()

        # z^T inv_a[i] z for every action i. Clamped at zero because
        # round-off can make it slightly negative, which would turn the
        # square root into NaN.
        widths = np.einsum('i,aij,j->a', z_context, self.inv_a, z_context)
        widths = np.sqrt(np.maximum(widths, 0.0))
        vals = np.dot(self.theta, z_context) + self.alpha * widths

        return np.argmax(vals)

    def update(self, context, action, reward):
        """Updates the per-action ridge regression with a new observation.

        Every `update_freq_nn` steps the network is retrained and all
        per-action statistics are rebuilt from the stored data in the new
        feature space; otherwise only the pulled action is updated.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

            contexts = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: self.data_h.contexts})
            actions = np.array(self.data_h.actions)
            # data_h.rewards is of shape (n_samples, n_actions), so select
            # the reward of the action that was actually pulled.
            rewards = self.data_h.rewards[np.arange(actions.shape[0]), actions]

            ridge = self.lam * np.eye(self.n_d)
            grams = []
            targets = []
            for arm in range(self.n_a):
                pulled = actions == arm
                z_arm = contexts[pulled]
                # Only samples on which `arm` was played contribute to its
                # design matrix, matching the incremental branch below.
                grams.append(ridge + np.dot(z_arm.T, z_arm))
                targets.append(np.dot(rewards[pulled], z_arm))

            self.a = np.array(grams)
            self.b = np.array(targets)
            self.inv_a = np.array([np.linalg.inv(g) for g in grams])
            self.theta = np.array(
                [np.dot(inv, tgt) for inv, tgt in zip(self.inv_a, self.b)])

        else:
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: c}).flatten()

            self.a[action] = self.a[action] + np.outer(z_context, z_context)
            self.inv_a[action] = np.linalg.inv(self.a[action])
            self.b[action] = self.b[action] + reward * z_context
            self.theta[action] = np.dot(self.inv_a[action], self.b[action])
class NeuralLinearPosteriorSampling(BanditAlgorithm):
  """Full Bayesian linear regression on the last layer of a deep neural net.

  The network `self.bnn` maps a context to its last-layer representation and
  one Normal / Inverse-Gamma linear regression per action is maintained on
  top of it. Actions are chosen by Thompson sampling from those posteriors.
  """

  def __init__(self, name, hparams, textflag='no', optimizer='RMS'):
    """Builds the datasets, the network and the per-action priors.

    Args:
      name: Name of the algorithm; also used to name the network.
      hparams: Hyper-parameters. Read here: layer_sizes, lambda_prior, a0,
        b0, training_freq, training_freq_network, training_epochs,
        context_dim, num_actions and, for the text model only, batch_size.
      textflag: 'yes' selects TextCNN, anything else NeuralBanditModel.
      optimizer: Optimizer name forwarded to NeuralBanditModel.
    """
    self.name = name
    self.hparams = hparams
    num_actions = hparams.num_actions

    self.latent_dim = hparams.layer_sizes[-1]
    self.intercept = False
    self.param_dim = 1 + self.latent_dim if self.intercept else self.latent_dim

    # Gaussian prior for each beta_i
    self._lambda_prior = hparams.lambda_prior

    self.mu = [np.zeros(self.param_dim) for _ in range(num_actions)]
    self.f = [np.zeros(self.param_dim) for _ in range(num_actions)]
    self.yy = [0 for _ in range(num_actions)]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.param_dim)
                for _ in range(num_actions)]
    self.precision = [self.lambda_prior * np.eye(self.param_dim)
                      for _ in range(num_actions)]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = hparams.a0
    self._b0 = hparams.b0

    self.a = [self._a0 for _ in range(num_actions)]
    self.b = [self._b0 for _ in range(num_actions)]

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
    self.data_h = ContextualDataset(hparams.context_dim,
                                    num_actions,
                                    intercept=False)
    self.latent_h = ContextualDataset(self.latent_dim,
                                      num_actions,
                                      intercept=self.intercept)
    bnn_name = '{}-bnn'.format(name)
    if textflag == 'yes':
      self.bnn = TextCNN('adam', num_actions, hparams.batch_size, bnn_name)
    else:
      self.bnn = NeuralBanditModel(optimizer, hparams, bnn_name)

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly.

    Args:
      context: Array that can be reshaped to (1, hparams.context_dim).

    Returns:
      Index of the selected action.
    """
    num_actions = self.hparams.num_actions

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < num_actions * self.hparams.initial_pulls:
      return self.t % num_actions

    # Sample sigma2, and beta conditional on sigma2
    sigma2_s = [self.b[i] * invgamma.rvs(self.a[i])
                for i in range(num_actions)]

    try:
      beta_s = [
          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
          for i in range(num_actions)
      ]
    except np.linalg.LinAlgError:
      # Sampling could fail if covariance is not positive definite
      d = self.param_dim
      beta_s = [
          np.random.multivariate_normal(np.zeros(d), np.eye(d))
          for _ in range(num_actions)
      ]

    # Compute last-layer representation for the current context
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
      if self.intercept:
        z_context = np.append(z_context, 1.0).reshape((1, self.latent_dim + 1))

    # Apply Thompson Sampling to last-layer representation
    vals = [np.dot(beta_s[i], z_context.T) for i in range(num_actions)]
    return np.argmax(vals)

  def _refresh_posterior(self, action):
    """Recomputes cov, mu and b of `action` from its current statistics."""
    self.cov[action] = np.linalg.inv(self.precision[action])
    self.mu[action] = np.dot(self.cov[action], self.f[action])

    # Inverse Gamma posterior update
    quad = np.dot(self.mu[action].T,
                  np.dot(self.precision[action], self.mu[action]))
    self.b[action] = self.b0 + 0.5 * (self.yy[action] - quad)

  def update(self, context, action, reward):
    """Updates the posterior using linear bayesian regression formula.

    Every `update_freq_nn` steps the network is retrained and the posterior
    of every action is rebuilt in the new feature space, so that `action()`
    never combines new features with a mean / covariance fitted on the old
    ones. On all other steps only the pulled action is updated.

    Args:
      context: Last observed context.
      action: Last observed action.
      reward: Last observed reward.
    """
    self.t += 1
    self.data_h.add(context, action, reward)
    c = context.reshape((1, self.hparams.context_dim))
    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
    self.latent_h.add(z_context, action, reward)

    # Running Inverse Gamma statistics of the pulled action.
    self.yy[action] += reward ** 2
    self.a[action] += 0.5

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:
      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

      # Update the latent representation of every datapoint collected so far
      new_z = self.bnn.sess.run(self.bnn.nn,
                                feed_dict={self.bnn.x: self.data_h.contexts})
      self.latent_h.replace_data(contexts=new_z)

      prior_precision = self.lambda_prior * np.eye(self.param_dim)
      for action_v in range(self.hparams.num_actions):
        # Update action posterior with formulas: \beta | z,y ~ N(mu_q, cov_q)
        # The algorithm could be improved with sequential formulas (cheaper)
        z, y = self.latent_h.get_data(action_v)
        self.precision[action_v] = np.dot(z.T, z) + prior_precision
        self.f[action_v] = np.dot(z.T, y)
      stale_actions = range(self.hparams.num_actions)
    else:
      if self.intercept:
        z_context = np.append(z_context, 1.0).reshape((1, self.latent_dim + 1))
      self.precision[action] += np.dot(z_context.T, z_context)
      self.f[action] += (z_context.T * reward)[:, 0]
      stale_actions = [action]

    for action_v in stale_actions:
      self._refresh_posterior(action_v)

  @property
  def a0(self):
    """Inverse Gamma prior parameter `a0` taken from hparams."""
    return self._a0

  @property
  def b0(self):
    """Inverse Gamma prior parameter `b0` taken from hparams."""
    return self._b0

  @property
  def lambda_prior(self):
    """Scale of the identity prior precision taken from hparams."""
    return self._lambda_prior

  def calc_model_evidence(self):
    """Returns the model evidence averaged over actions.

    Each action's term is accumulated in log space with mpmath (50 decimal
    digits; note that this sets the global `mp.mp.dps`) and exponentiated
    before averaging.
    """
    vval = 0
    mp.mp.dps = 50
    # log det(lambda_prior * I) for the param_dim x param_dim prior
    # precision; the dimension has to match self.precision[action].
    log_det_prior = self.param_dim * mp.log(self.lambda_prior)

    for action in range(self.hparams.num_actions):
      # slogdet avoids the overflow / underflow of log(det(.)).
      _, log_det_post = np.linalg.slogdet(self.precision[action])

      val = mp.mpf(mp.fmul(mp.fneg(mp.log(mp.fmul(2.0, mp.pi))),
                           mp.fsub(self.a[action], self.a0)))
      val += mp.loggamma(self.a[action])
      val -= mp.loggamma(self.a0)
      val += 0.5 * log_det_prior
      val -= 0.5 * float(log_det_post)
      val += mp.fmul(self.a0, mp.log(self.b0))
      val -= mp.fmul(self.a[action], mp.log(self.b[action]))
      vval += mp.exp(val)

    vval /= float(self.hparams.num_actions)

    return vval