"""Contextual bandit algorithms based on (Bayesian) neural networks."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import random

import cvxpy as cvx
import mpmath as mp
import numpy as np
import tensorflow as tf
from scipy.special import gamma, gammaln, logsumexp
from scipy.stats import invgamma

# NOTE: the module paths below are assumptions based on the original
# "deep contextual bandits" library layout; adjust them to your tree.
from bandits.core.bandit_algorithm import BanditAlgorithm
from bandits.core.contextual_dataset import ContextualDataset
from bandits.algorithms.bb_alpha_divergence_model import BBAlphaDivergence
from bandits.algorithms.bf_variational_neural_bandit_model import BfVariationalNeuralBanditModel
from bandits.algorithms.multitask_gp import MultitaskGP
from bandits.algorithms.neural_bandit_model import NeuralBanditModel
from bandits.algorithms.text_cnn import TextCNN
from bandits.algorithms.variational_neural_bandit_model import VariationalNeuralBanditModel


class PosteriorBNNSampling(BanditAlgorithm):
  """Posterior Sampling algorithm based on a Bayesian neural network."""

  def __init__(self, name, hparams, bnn_model='RMSProp'):
    """Creates a PosteriorBNNSampling object based on a specific optimizer.

    The algorithm has two basic tools: an approximate BNN and a contextual
    dataset. The Bayesian network keeps the posterior based on the optimizer
    iterations.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters of the algorithm.
      bnn_model: Type of BNN. By default RMSProp (point estimate).
    """
    self.name = name
    self.hparams = hparams
    self.optimizer_n = hparams.optimizer

    self.training_freq = hparams.training_freq
    self.training_epochs = hparams.training_epochs
    self.t = 0
    self.data_h = ContextualDataset(hparams.context_dim, hparams.num_actions,
                                    hparams.buffer_s)

    # to be extended with more BNNs (BB alpha-div, GPs, SGFS, constSGD...)
    bnn_name = '{}-bnn'.format(name)
    if bnn_model == 'Variational':
      self.bnn = VariationalNeuralBanditModel(hparams, bnn_name)
    elif bnn_model == 'AlphaDiv':
      self.bnn = BBAlphaDivergence(hparams, bnn_name)
    elif bnn_model == 'Variational_BF':
      self.bnn = BfVariationalNeuralBanditModel(hparams, bnn_name)
    elif bnn_model == 'GP':
      self.bnn = MultitaskGP(hparams)
    else:
      self.bnn = NeuralBanditModel(self.optimizer_n, hparams, bnn_name)

  def action(self, context):
    """Selects action for context based on Thompson Sampling using the BNN."""
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      # round robin until each action has been taken "initial_pulls" times
      return self.t % self.hparams.num_actions

    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      output = self.bnn.sess.run(self.bnn.y_pred, feed_dict={self.bnn.x: c})
      return np.argmax(output)

  def update(self, context, action, reward):
    """Updates data buffer, and re-trains the BNN every training_freq steps."""
    self.t += 1
    self.data_h.add(context, action, reward)
    if self.t % self.training_freq == 0:
      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.training_epochs)

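# --- Usage sketch (not part of the library) --------------------------------
# A minimal driver loop under assumed inputs: `contexts` is a hypothetical
# array of shape (num_steps, context_dim) and `reward_fn` a hypothetical
# environment callback. Every algorithm in this file implements the same
# action/update contract this loop exercises.
def run_bandit_loop(algo, contexts, reward_fn, num_steps):
  """Drives the standard contextual bandit interaction loop."""
  rewards = []
  for step in range(num_steps):
    context = contexts[step]
    action = algo.action(context)        # exploration handled inside algo
    reward = reward_fn(context, action)  # environment feedback
    algo.update(context, action, reward)
    rewards.append(reward)
  return np.cumsum(rewards)
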
class NeuralLinearPosteriorSampling(BanditAlgorithm):
  """Full Bayesian linear regression on the last layer of a deep neural net."""

  def __init__(self, name, hparams, optimizer='RMS'):

    self.name = name
    self.hparams = hparams
    self.latent_dim = self.hparams.layer_sizes[-1]

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior

    self.mu = [
        np.zeros(self.latent_dim) for _ in range(self.hparams.num_actions)
    ]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.latent_dim)
                for _ in range(self.hparams.num_actions)]

    self.precision = [
        self.lambda_prior * np.eye(self.latent_dim)
        for _ in range(self.hparams.num_actions)
    ]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0

    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
    self.b = [self._b0 for _ in range(self.hparams.num_actions)]

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=False)
    self.latent_h = ContextualDataset(self.latent_dim,
                                      hparams.num_actions,
                                      intercept=False)
    self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly."""

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    # Sample sigma2, and beta conditional on sigma2
    sigma2_s = [
        self.b[i] * invgamma.rvs(self.a[i])
        for i in range(self.hparams.num_actions)
    ]

    try:
      beta_s = [
          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
          for i in range(self.hparams.num_actions)
      ]
    except np.linalg.LinAlgError as e:
      # Sampling could fail if covariance is not positive definite
      print('Exception when sampling for {}.'.format(self.name))
      print('Details: {} | {}.'.format(e, e.args))
      d = self.latent_dim
      beta_s = [
          np.random.multivariate_normal(np.zeros(d), np.eye(d))
          for i in range(self.hparams.num_actions)
      ]

    # Compute last-layer representation for the current context
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})

    # Apply Thompson Sampling to last-layer representation
    vals = [
        np.dot(beta_s[i], z_context.T)
        for i in range(self.hparams.num_actions)
    ]
    return np.argmax(vals)

  def update(self, context, action, reward):
    """Updates the posterior using the linear Bayesian regression formula."""

    self.t += 1
    self.data_h.add(context, action, reward)
    c = context.reshape((1, self.hparams.context_dim))
    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
    self.latent_h.add(z_context, action, reward)

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:

      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

      # Update the latent representation of every datapoint collected so far
      new_z = self.bnn.sess.run(self.bnn.nn,
                                feed_dict={self.bnn.x: self.data_h.contexts})
      self.latent_h.replace_data(contexts=new_z)

    # Update the Bayesian Linear Regression
    if self.t % self.update_freq_lr == 0:

      # Find all the actions to update
      actions_to_update = self.latent_h.actions[:-self.update_freq_lr]

      for action_v in np.unique(actions_to_update):

        # Update action posterior with formulas:
        #   beta | z, y ~ N(mu_q, cov_q)
        z, y = self.latent_h.get_data(action_v)

        # The algorithm could be improved with sequential formulas (cheaper)
        s = np.dot(z.T, z)

        # Some terms are removed as we assume prior mu_0 = 0.
        precision_a = s + self.lambda_prior * np.eye(self.latent_dim)
        cov_a = np.linalg.inv(precision_a)
        mu_a = np.dot(cov_a, np.dot(z.T, y))

        # Inverse Gamma posterior update
        a_post = self.a0 + z.shape[0] / 2.0
        b_upd = 0.5 * np.dot(y.T, y)
        b_upd -= 0.5 * np.dot(mu_a.T, np.dot(precision_a, mu_a))
        b_post = self.b0 + b_upd

        # Store new posterior distributions
        self.mu[action_v] = mu_a
        self.cov[action_v] = cov_a
        self.precision[action_v] = precision_a
        self.a[action_v] = a_post
        self.b[action_v] = b_post

  @property
  def a0(self):
    return self._a0

  @property
  def b0(self):
    return self._b0

  @property
  def lambda_prior(self):
    return self._lambda_prior

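# --- Reference implementation of the conjugate update above (a sketch) -----
# Standalone Normal-Inverse-Gamma posterior for one action, under the same
# assumptions as NeuralLinearPosteriorSampling.update: prior mean mu_0 = 0,
# prior precision lambda * I, and an Inverse-Gamma(a0, b0) noise prior.
# Handy for checking the closed form on synthetic (z, y) data.
def nig_posterior(z, y, lambda_prior, a0, b0):
  """Returns (mu, cov, a, b) of the posterior given features z and rewards y."""
  d = z.shape[1]
  precision = z.T.dot(z) + lambda_prior * np.eye(d)
  cov = np.linalg.inv(precision)
  mu = cov.dot(z.T.dot(y))
  a = a0 + z.shape[0] / 2.0
  b = b0 + 0.5 * (y.dot(y) - mu.dot(precision.dot(mu)))
  return mu, cov, a, b
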
class NeuralUCBSampling(BanditAlgorithm):
  """UCB Sampling algorithm based on a neural network."""

  def __init__(self, name, hparams, bnn_model='RMSProp', optimizer='RMS'):
    """Creates a NeuralUCBSampling object based on a specific optimizer.

    The algorithm has two basic tools: an approximate BNN and a contextual
    dataset. The network keeps a point estimate updated by the optimizer
    iterations; exploration comes from a gradient-based confidence bound.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters of the algorithm.
      bnn_model: Type of BNN. By default RMSProp (point estimate).
    """
    self.name = name
    self.hparams = hparams
    self.optimizer_n = hparams.optimizer

    self.training_freq = hparams.training_freq
    self.training_epochs = hparams.training_epochs
    self.t = 0
    self.gamma = 0
    self.bonus = np.zeros(hparams.num_actions)
    self.C1 = 0.001
    self.C2 = 0.001
    self.C3 = 0.00001
    self.data_h = ContextualDataset(hparams.context_dim, hparams.num_actions,
                                    hparams.buffer_s)

    bnn_name = '{}-ucb'.format(name)
    self.bnn = NeuralBanditModel(self.optimizer_n, hparams, bnn_name)

    # Total number of network parameters p: input layer, hidden layers and
    # output layer (with biases), assuming equal hidden widths.
    self.p = (hparams.context_dim + 1) * hparams.layer_sizes[0] + (
        hparams.layer_sizes[0] + 1) * hparams.layer_sizes[0] * (
            len(hparams.layer_sizes) - 1) + (
                hparams.layer_sizes[0] + 1) * hparams.num_actions
    self.Zinv = (1 / hparams.lamb) * np.eye(self.p)
    self.detZ = hparams.lamb**self.p

  def action(self, context):
    """Selects action for context based on UCB using the NN."""
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      # round robin until each action has been taken "initial_pulls" times
      return self.t % self.hparams.num_actions

    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      output = self.bnn.sess.run(self.bnn.y_pred, feed_dict={self.bnn.x: c})

      # Add the confidence bound to the output of each action.
      listTensorGradients = self.bnn.sess.run(self.bnn.gradAction,
                                              feed_dict={self.bnn.x: c})
      bonus = []
      for act in range(self.hparams.num_actions):
        grads = np.array([])
        for el in listTensorGradients[act]:
          grads = np.concatenate((grads, el.flatten()))
        bonus.append(self.gamma * np.sqrt(
            grads.dot(self.Zinv.dot(grads)) / self.hparams.layer_sizes[0]))
      output += np.array(bonus)
      print("Bonus of the actions", bonus)
      print("Gamma", self.gamma)
      return np.argmax(output)

  def update(self, context, action, reward):
    """Updates data buffer, and re-trains the BNN every training_freq steps."""
    self.t += 1
    self.data_h.add(context, action, reward)
    if self.t % self.training_freq == 0:
      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.training_epochs)

    # Rank-one update of Z^{-1} (Sherman-Morrison) and of det(Z) (matrix
    # determinant lemma) with the gradient of the selected action.
    tensorGradients = self.bnn.sess.run(
        self.bnn.gradAction[action],
        feed_dict={self.bnn.x: context.reshape(1, -1)})
    grads = np.array([])
    for el in tensorGradients:
      grads = np.concatenate((grads, el.flatten()))
    m = self.hparams.layer_sizes[0]
    outer = np.outer(grads, grads) / m
    self.detZ *= 1 + grads.dot(self.Zinv.dot(grads)) / m
    self.Zinv -= self.Zinv.dot(outer.dot(self.Zinv)) / (
        1 + grads.T.dot(self.Zinv.dot(grads)) / m)

    # Exploration coefficient gamma_t; el1/el2/el3 follow the NeuralUCB
    # confidence-radius terms (width m, depth L, regularizer lambda).
    L = len(self.hparams.layer_sizes)
    el1 = np.sqrt(1 + self.C1 * (m**(-1 / 6)) * np.sqrt(np.log(m)) * (L**4) *
                  (self.t**(7 / 6)) * (self.hparams.lamb**(-7 / 6)))
    # The first term under the square root is the log-determinant ratio
    # log(det Z / det(lambda * I)) of the NeuralUCB confidence radius.
    el2 = self.hparams.mu * np.sqrt(
        np.log(self.detZ / (self.hparams.lamb**self.p)) +
        self.C2 * (m**(-1 / 6)) * np.sqrt(np.log(m)) * (L**4) *
        (self.t**(5 / 3)) * (self.hparams.lamb**(-1 / 6)) -
        2 * np.log(self.hparams.delta)) + np.sqrt(
            self.hparams.lamb) * self.hparams.S
    el3 = self.C3 * (
        (1 - self.hparams.mu * m * self.hparams.lamb)**self.training_epochs *
        np.sqrt(self.t / self.hparams.lamb) +
        (m**(-1 / 6)) * np.sqrt(np.log(m)) * (L**(7 / 2)) *
        (self.t**(5 / 3)) * (self.hparams.lamb**(-5 / 3)) *
        (1 + np.sqrt(self.t / self.hparams.lamb)))
    print("Profile Elements", el1, el2, el3)
    self.gamma = el1 * el2 + el3

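# --- Sanity check for the Zinv / detZ bookkeeping above (a sketch) ---------
# NeuralUCBSampling maintains Z^{-1} via the Sherman-Morrison formula and
# det(Z) via the matrix determinant lemma under the rank-one update
# Z <- Z + g g^T / m. This standalone helper reproduces both updates and can
# be checked against np.linalg.inv / np.linalg.det on small matrices.
def rank_one_update(z_inv, det_z, g, m):
  """Returns the updated (Z^{-1}, det Z) after Z <- Z + g g^T / m."""
  scale = g.dot(z_inv.dot(g)) / m
  det_new = det_z * (1.0 + scale)
  outer = np.outer(g, g) / m
  z_inv_new = z_inv - z_inv.dot(outer).dot(z_inv) / (1.0 + scale)
  return z_inv_new, det_new
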
class NeuralLinearPosteriorSamplingFiniteMemory(BanditAlgorithm):
  """Full Bayesian linear regression on the last layer of a deep neural net,
  with a finite memory buffer."""

  def __init__(self, name, hparams, optimizer='RMS'):

    self.name = name
    self.hparams = hparams
    self.latent_dim = self.hparams.layer_sizes[-1]

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior

    self.mu = [
        np.zeros(self.latent_dim) for _ in range(self.hparams.num_actions)
    ]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.latent_dim)
                for _ in range(self.hparams.num_actions)]

    self.precision = [
        self.lambda_prior * np.eye(self.latent_dim)
        for _ in range(self.hparams.num_actions)
    ]

    self.mu_prior_flag = self.hparams.mu_prior_flag
    self.sigma_prior_flag = self.hparams.sigma_prior_flag

    self.precision_prior = self.precision[:]
    self.mu_prior = np.zeros((self.latent_dim, self.hparams.num_actions))

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0

    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
    self.b = [self._b0 for _ in range(self.hparams.num_actions)]

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=False,
                                    buffer_s=hparams.mem)
    self.latent_h = ContextualDataset(self.latent_dim,
                                      hparams.num_actions,
                                      intercept=False,
                                      buffer_s=hparams.mem)
    self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly."""

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    # Sample sigma2, and beta conditional on sigma2
    sigma2_s = [
        self.b[i] * invgamma.rvs(self.a[i])
        for i in range(self.hparams.num_actions)
    ]

    try:
      beta_s = [
          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
          for i in range(self.hparams.num_actions)
      ]
    except np.linalg.LinAlgError as e:
      # Sampling could fail if covariance is not positive definite
      print('Exception when sampling for {}.'.format(self.name))
      print('Details: {} | {}.'.format(e, e.args))
      d = self.latent_dim
      beta_s = [
          np.random.multivariate_normal(np.zeros(d), np.eye(d))
          for i in range(self.hparams.num_actions)
      ]

    # Compute last-layer representation for the current context
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})

    # Apply Thompson Sampling to last-layer representation
    vals = [
        np.dot(beta_s[i], z_context.T)
        for i in range(self.hparams.num_actions)
    ]
    return np.argmax(vals)

  def calc_precision_prior(self, contexts):
    """Matches precision priors in the new feature space to old confidences."""
    precisions_return = []
    n, m = contexts.shape
    prior = 0.01 * np.eye(self.latent_dim)
    if self.cov is not None:
      for action, precision in enumerate(self.cov):
        ind = np.array(
            [i for i in range(n) if self.data_h.actions[i] == action])
        if len(ind) > 0:
          # compute confidence scores for old data
          d = []
          for c in self.latent_h.contexts[ind, :]:
            d.append(np.dot(np.dot(c, precision), c.T))
          # compute new data correlations
          phi = []
          for c in contexts[ind, :]:
            phi.append(np.outer(c, c))
          X = cvx.Variable((m, m), PSD=True)
          # Form objective.
          obj = cvx.Minimize(
              sum([(cvx.trace(X @ phi[i]) - d[i])**2 for i in range(len(d))]))
          prob = cvx.Problem(obj)
          prob.solve()
          if X.value is None:
            precisions_return.append(np.linalg.inv(prior))
          else:
            precisions_return.append(np.linalg.inv(X.value + prior))
        else:
          precisions_return.append(np.linalg.inv(prior))
    return precisions_return

  def update(self, context, action, reward):
    """Updates the posterior using the linear Bayesian regression formula."""

    self.t += 1
    self.data_h.add(context, action, reward)
    c = context.reshape((1, self.hparams.context_dim))
    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
    self.latent_h.add(z_context, action, reward)

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:

      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

      # Update the latent representation of every datapoint collected so far
      new_z = self.bnn.sess.run(self.bnn.nn,
                                feed_dict={self.bnn.x: self.data_h.contexts})

      # Update the confidence prior using feature uncertainty matching
      if self.sigma_prior_flag == 1:
        self.precision_prior = self.calc_precision_prior(contexts=new_z)

      self.latent_h.replace_data(contexts=new_z)

      # Update the mean prior using the weights of the NN
      if self.mu_prior_flag == 1:
        self.mu_prior = self.bnn.get_mu_prior()

    # Update the Bayesian Linear Regression
    if self.t % self.update_freq_lr == 0:

      # Find all the actions to update
      actions_to_update = self.latent_h.actions[:-self.update_freq_lr]

      for action_v in np.unique(actions_to_update):

        # Update action posterior with formulas:
        #   beta | z, y ~ N(mu_q, cov_q)
        z, y = self.latent_h.get_data(action_v)

        # The algorithm could be improved with sequential formulas (cheaper)
        s = np.dot(z.T, z)

        # Get priors
        sigma0 = self.precision_prior[action_v]
        mu_0 = self.mu_prior[:, action_v]

        # Calc mean and precision using Bayesian linear regression
        precision_a = s + sigma0
        cov_a = np.linalg.inv(precision_a)
        mu_a = np.dot(cov_a, (np.dot(z.T, y) + np.dot(sigma0, mu_0)))

        # Inverse Gamma posterior update
        a_post = self.a0 + z.shape[0] / 2.0
        b_upd = 0.5 * np.dot(y.T, y)
        b_upd += 0.5 * np.dot(mu_0.T, np.dot(sigma0, mu_0))
        b_upd -= 0.5 * np.dot(mu_a.T, np.dot(precision_a, mu_a))
        b_post = self.b0 + b_upd

        # Store new posterior distributions
        self.mu[action_v] = mu_a
        self.cov[action_v] = cov_a
        # self.precision[action_v] = precision_a
        self.a[action_v] = a_post
        self.b[action_v] = b_post

  @property
  def a0(self):
    return self._a0

  @property
  def b0(self):
    return self._b0

  @property
  def lambda_prior(self):
    return self._lambda_prior

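# --- The precision-matching program above, in isolation (a sketch) ---------
# calc_precision_prior solves, per action, the PSD least-squares problem
#   minimize_X  sum_i (trace(X phi_i) - d_i)^2   s.t.  X >= 0,
# which matches the old confidence scores d_i = c_i Sigma c_i^T in the new
# feature space. Standalone version using the same cvxpy calls:
def match_precision_prior(phis, ds, dim):
  """Returns the PSD matrix X matching new features to old confidences."""
  x = cvx.Variable((dim, dim), PSD=True)
  objective = cvx.Minimize(
      sum((cvx.trace(x @ phi) - d)**2 for phi, d in zip(phis, ds)))
  cvx.Problem(objective).solve()
  return x.value  # None if the solver failed
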
class ParameterNoiseSampling(BanditAlgorithm):
  """Parameter Noise Sampling algorithm based on adding noise to net params.

  Described in https://arxiv.org/abs/1706.01905
  """

  def __init__(self, name, hparams):
    """Creates the algorithm, and sets up the adaptive Gaussian noise."""

    self.name = name
    self.hparams = hparams
    self.verbose = getattr(self.hparams, 'verbose', True)
    self.noise_std = getattr(self.hparams, 'noise_std', 0.005)
    self.eps = getattr(self.hparams, 'eps', 0.05)
    self.d_samples = getattr(self.hparams, 'd_samples', 300)
    self.optimizer = getattr(self.hparams, 'optimizer', 'RMS')

    # keep track of noise heuristic statistics
    self.std_h = [self.noise_std]
    self.eps_h = [self.eps]
    self.kl_h = []

    self.t = 0
    self.freq_update = hparams.training_freq
    self.num_epochs = hparams.training_epochs

    self.data_h = ContextualDataset(hparams.context_dim, hparams.num_actions,
                                    hparams.buffer_s)
    self.bnn = NeuralBanditModel(self.optimizer, hparams,
                                 '{}-bnn'.format(name))

    with self.bnn.graph.as_default():
      # noise-injection std placeholder
      self.bnn.noise_std_ph = tf.placeholder(tf.float32, shape=())

      # create noise corruption op; adds noise to all weights
      tvars = tf.trainable_variables()
      self.bnn.noisy_grads = [
          tf.random_normal(v.get_shape(), 0, self.bnn.noise_std_ph)
          for v in tvars
      ]

      # add noise to all params, then compute prediction, then subtract.
      with tf.control_dependencies(self.bnn.noisy_grads):
        self.bnn.noise_add_ops = [
            tvars[i].assign_add(n) for i, n in enumerate(self.bnn.noisy_grads)
        ]
        with tf.control_dependencies(self.bnn.noise_add_ops):
          # we force the prediction for 'y' to be recomputed after adding noise
          self.bnn.noisy_nn, self.bnn.noisy_pred_val = self.bnn.forward_pass()
          self.bnn.noisy_pred = tf.identity(self.bnn.noisy_pred_val)
          with tf.control_dependencies([tf.identity(self.bnn.noisy_pred)]):
            self.bnn.noise_sub_ops = [
                tvars[i].assign_add(-n)
                for i, n in enumerate(self.bnn.noisy_grads)
            ]

  def action(self, context):
    """Selects action based on Thompson Sampling *after* adding noise."""
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      # round robin until each action has been taken "initial_pulls" times
      return self.t % self.hparams.num_actions

    with self.bnn.graph.as_default():
      # run noisy prediction op to choose action, then subtract the noise.
      c = context.reshape((1, self.hparams.context_dim))
      output, _ = self.bnn.sess.run(
          [self.bnn.noisy_pred, self.bnn.noise_sub_ops],
          feed_dict={self.bnn.x: c, self.bnn.noise_std_ph: self.noise_std})
      return np.argmax(output)

  def update(self, context, action, reward):
    """Updates the data buffer, and re-trains the BNN and noise level."""
    self.t += 1
    self.data_h.add(context, action, reward)
    if self.t % self.freq_update == 0:
      self.bnn.train(self.data_h, self.num_epochs)
      self.update_noise()

  def update_noise(self):
    """Increases noise if the original and corrupted distribs are close."""
    kl = self.compute_distance()
    delta = -np.log1p(-self.eps + self.eps / self.hparams.num_actions)

    if kl < delta:
      self.noise_std *= 1.01
    else:
      self.noise_std /= 1.01

    self.eps *= 0.99

    if self.verbose:
      print('Update eps={} | kl={} | std={} | delta={} | increase={}.'.format(
          self.eps, kl, self.noise_std, delta, kl < delta))

    # store noise-injection statistics for inspection: std, KL, eps.
    self.std_h.append(self.noise_std)
    self.kl_h.append(kl)
    self.eps_h.append(self.eps)

  def compute_distance(self):
    """Computes empirical KL for original and corrupted output distributions."""
    random_inputs, _ = self.data_h.get_batch(self.d_samples)
    y_model = self.bnn.sess.run(
        self.bnn.y_pred,
        feed_dict={
            self.bnn.x: random_inputs,
            self.bnn.noise_std_ph: self.noise_std
        })
    y_noisy, _ = self.bnn.sess.run(
        [self.bnn.noisy_pred, self.bnn.noise_sub_ops],
        feed_dict={
            self.bnn.x: random_inputs,
            self.bnn.noise_std_ph: self.noise_std
        })

    if self.verbose:
      # display how often original & perturbed models propose the same action
      s = np.sum([
          np.argmax(y_model[i, :]) == np.argmax(y_noisy[i, :])
          for i in range(y_model.shape[0])
      ])
      print('{} | fraction of agreement btw original / corrupted actions: {}.'
            .format(self.name, s / self.d_samples))

    kl = self.compute_kl_with_logits(y_model, y_noisy)
    return kl

  def compute_kl_with_logits(self, logits1, logits2):
    """Computes the mean KL between softmax distributions given by logits."""

    def exp_times_diff(a, b):
      return np.multiply(np.exp(a), a - b)

    logsumexp1 = logsumexp(logits1, axis=1)
    logsumexp2 = logsumexp(logits2, axis=1)
    logsumexp_diff = logsumexp2 - logsumexp1

    exp_diff = exp_times_diff(logits1, logits2)
    exp_diff = np.sum(exp_diff, axis=1)
    inv_exp_sum = np.sum(np.exp(logits1), axis=1)
    term1 = np.divide(exp_diff, inv_exp_sum)

    kl = term1 + logsumexp_diff
    kl = np.maximum(kl, 0.0)
    kl = np.nan_to_num(kl)
    return np.mean(kl)

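# --- Reference KL computation (a sketch) -----------------------------------
# compute_kl_with_logits above equals the mean KL divergence between the
# row-wise softmax distributions of the two logit matrices; this direct
# version is useful for testing it against random inputs.
def kl_softmax_reference(logits1, logits2):
  """Mean over rows of KL(softmax(logits1) || softmax(logits2))."""
  p = np.exp(logits1 - logsumexp(logits1, axis=1, keepdims=True))
  q = np.exp(logits2 - logsumexp(logits2, axis=1, keepdims=True))
  return np.mean(np.sum(p * (np.log(p) - np.log(q)), axis=1))
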
class NeuralLinearPosteriorSamplingFiniteMemory(BanditAlgorithm):
  """Full Bayesian linear regression on the last layer of a deep neural net,
  with a finite memory buffer. Variant with an optional intercept, sequential
  posterior updates, and priors matched from the network."""

  def __init__(self, name, hparams, textflag='no', optimizer='RMS'):

    self.name = name
    self.hparams = hparams
    self.latent_dim = self.hparams.layer_sizes[-1]
    self.intercept = False
    if self.intercept:
      self.param_dim = 1 + self.latent_dim
    else:
      self.param_dim = self.latent_dim
    self.EPSILON = 0.00001

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior

    self.before = []
    self.after = []

    self.mu = [
        np.zeros(self.param_dim) for _ in range(self.hparams.num_actions)
    ]
    self.f = [
        np.zeros(self.param_dim) for _ in range(self.hparams.num_actions)
    ]
    self.yy = [0 for _ in range(self.hparams.num_actions)]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.param_dim)
                for _ in range(self.hparams.num_actions)]

    self.precision = [
        self.lambda_prior * np.eye(self.param_dim)
        for _ in range(self.hparams.num_actions)
    ]

    self.mu_prior_flag = self.hparams.mu_prior_flag
    self.sigma_prior_flag = self.hparams.sigma_prior_flag

    self.precision_prior = self.precision[:]
    self.mu_prior = np.zeros((self.param_dim, self.hparams.num_actions))

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0

    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
    self.b = [self._b0 for _ in range(self.hparams.num_actions)]

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=False,
                                    buffer_s=hparams.mem)
    self.latent_h = ContextualDataset(self.latent_dim,
                                      hparams.num_actions,
                                      intercept=self.intercept,
                                      buffer_s=hparams.mem)
    if textflag == 'yes':
      self.bnn = TextCNN('adam', self.hparams.num_actions,
                         self.hparams.batch_size, '{}-bnn'.format(name))
    else:
      self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly."""

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    # Sample sigma2, and beta conditional on sigma2
    sigma2_s = [
        self.b[i] * invgamma.rvs(self.a[i])
        for i in range(self.hparams.num_actions)
    ]

    try:
      beta_s = [
          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
          for i in range(self.hparams.num_actions)
      ]
    except np.linalg.LinAlgError:
      # Sampling could fail if covariance is not positive definite
      d = self.latent_dim
      beta_s = [
          np.random.multivariate_normal(np.zeros(d), np.eye(d))
          for i in range(self.hparams.num_actions)
      ]

    # Compute last-layer representation for the current context
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
      if self.intercept:
        z_context = np.append(z_context, 1.0).reshape((1, self.latent_dim + 1))

    # Apply Thompson Sampling to last-layer representation
    vals = [
        np.dot(beta_s[i], z_context.T)
        for i in range(self.hparams.num_actions)
    ]
    return np.argmax(vals)

  def calc_precision_prior(self, contexts):
    """Matches precision priors in the new feature space to old confidences."""
    precisions_return = []
    n, m = contexts.shape
    prior = self.EPSILON * np.eye(self.param_dim)
    if self.cov is not None:
      for action, precision in enumerate(self.cov):
        ind = np.array(
            [i for i in range(n) if self.data_h.actions[i] == action])
        if len(ind) > 0:
          # compute confidence scores for old data
          d = []
          for c in self.latent_h.contexts[ind, :]:
            d.append(np.dot(np.dot(c, precision), c.T))
          # compute new data correlations
          phi = []
          for c in contexts[ind, :]:
            phi.append(np.outer(c, c))
          X = cvx.Variable((m, m), PSD=True)
          # Form objective.
          obj = cvx.Minimize(
              sum([(cvx.trace(X @ phi[i]) - d[i])**2 for i in range(len(d))]))
          prob = cvx.Problem(obj)
          prob.solve()
          if X.value is None:
            precisions_return.append(np.linalg.inv(prior))
            self.cov[action] = prior
          else:
            precisions_return.append(np.linalg.inv(X.value + prior))
            self.cov[action] = X.value + prior
        else:
          precisions_return.append(np.linalg.inv(prior))
          self.cov[action] = prior
    return precisions_return

  def update(self, context, action, reward):
    """Updates the posterior using the linear Bayesian regression formula."""

    self.t += 1
    self.data_h.add(context, action, reward)
    c = context.reshape((1, self.hparams.context_dim))
    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
    self.latent_h.add(z_context, action, reward)

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:

      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

      # Update the latent representation of every datapoint collected so far
      new_z = self.bnn.sess.run(self.bnn.nn,
                                feed_dict={self.bnn.x: self.data_h.contexts})
      self.latent_h.replace_data(contexts=new_z)

      i_contexts = None
      for ctx in new_z:
        c = np.array(ctx[:])
        if self.intercept:
          c = np.append(c, 1.0).reshape((1, self.latent_dim + 1))
        if i_contexts is None:
          i_contexts = c
        else:
          i_contexts = np.vstack((i_contexts, c))

      # Update the confidence prior using feature uncertainty matching
      # self.before.append(self.calc_model_evidence())
      if self.sigma_prior_flag == 1:
        self.precision_prior = self.calc_precision_prior(contexts=i_contexts)

      # Update the mean prior using the weights of the NN
      if self.mu_prior_flag == 1:
        weights_p, bias_p = self.bnn.get_mu_prior()
        self.mu_prior[:self.latent_dim] = weights_p
        self.mu_prior[-1] = bias_p
      # self.after.append(self.calc_model_evidence())

      # Update the Bayesian Linear Regression
      for action_v in range(self.hparams.num_actions):
        # Update action posterior with formulas:
        #   beta | z, y ~ N(mu_q, cov_q)
        z, y = self.latent_h.get_data(action_v)
        # The algorithm could be improved with sequential formulas (cheaper)
        self.precision[action_v] = (np.dot(z.T, z) +
                                    self.precision_prior[action_v])
        self.f[action_v] = np.dot(z.T, y)
    else:
      if self.intercept:
        z_context = np.append(z_context, 1.0).reshape((1, self.latent_dim + 1))
      self.precision[action] += np.dot(z_context.T, z_context)
      self.cov[action] = np.linalg.inv(self.precision[action])
      self.f[action] += (z_context.T * reward)[:, 0]

    # Calc mean and precision using Bayesian linear regression
    self.mu[action] = np.dot(
        self.cov[action],
        (self.f[action] + np.dot(self.precision_prior[action],
                                 self.mu_prior[:, action])))

    # Inverse Gamma posterior update
    self.yy[action] += reward**2
    self.a[action] += 0.5
    b_upd = 0.5 * self.yy[action]
    b_upd += 0.5 * np.dot(
        self.mu_prior[:, action].T,
        np.dot(self.precision_prior[action], self.mu_prior[:, action]))
    b_upd -= 0.5 * np.dot(self.mu[action].T,
                          np.dot(self.precision[action], self.mu[action]))
    self.b[action] = self.b0 + b_upd

  @property
  def a0(self):
    return self._a0

  @property
  def b0(self):
    return self._b0

  @property
  def lambda_prior(self):
    return self._lambda_prior

  def calc_model_evidence(self):
    """Computes the average model evidence across actions."""
    vval = 0
    for action in range(self.hparams.num_actions):
      sigma0 = self.precision_prior[action]
      mu_0 = self.mu_prior[:, action]
      z, y = self.latent_h.get_data(action)
      n = z.shape[0]
      s = np.dot(z.T, z)
      s_n = sigma0 + s
      cov_a = np.linalg.inv(s_n)
      mu_a = np.dot(cov_a, (np.dot(z.T, y) + np.dot(sigma0, mu_0)))
      a_post = self.a0 + n / 2.0
      b_upd = 0.5 * np.dot(y.T, y)
      b_upd += 0.5 * np.dot(mu_0.T, np.dot(sigma0, mu_0))
      b_upd -= 0.5 * np.dot(mu_a.T, np.dot(s_n, mu_a))
      b_post = self.b0 + b_upd

      val = np.float128(1)
      val /= (np.float128(2.0) * math.pi)**(n / 2.0)
      val *= gamma(a_post) / gamma(self.a0)
      val *= np.sqrt(np.linalg.det(sigma0) / np.linalg.det(s_n))
      val *= (self.hparams.b0**self.hparams.a0) / (b_post**a_post)
      vval += val
    vval /= self.hparams.num_actions
    return vval

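# --- Log-space model evidence (a sketch) -----------------------------------
# calc_model_evidence above multiplies gamma ratios directly and can overflow
# even in extended precision; this is the same quantity for one action,
# computed in log space with scipy's gammaln and slogdet.
def log_evidence(z, y, sigma0, mu0, a0, b0):
  """Log marginal likelihood of Bayesian linear regression for one action."""
  n = z.shape[0]
  s_n = sigma0 + z.T.dot(z)
  mu_n = np.linalg.solve(s_n, z.T.dot(y) + sigma0.dot(mu0))
  a_n = a0 + n / 2.0
  b_n = b0 + 0.5 * (y.dot(y) + mu0.dot(sigma0.dot(mu0)) -
                    mu_n.dot(s_n.dot(mu_n)))
  return (-0.5 * n * np.log(2 * np.pi) + gammaln(a_n) - gammaln(a0) +
          0.5 * (np.linalg.slogdet(sigma0)[1] - np.linalg.slogdet(s_n)[1]) +
          a0 * np.log(b0) - a_n * np.log(b_n))
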
class NeuralLinearEpsilonGreedy(BanditAlgorithm):
  """Epsilon-greedy exploration over a deep neural net's predictions."""

  def __init__(self, name, hparams, textflag='yes', optimizer='RMS'):

    self.name = name
    self.hparams = hparams
    self.epsilon = self.hparams.epsilon
    self.latent_dim = self.hparams.layer_sizes[-1]
    self.intercept = True
    if self.intercept:
      self.param_dim = 1 + self.latent_dim
    else:
      self.param_dim = self.latent_dim

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=False)
    self.latent_h = ContextualDataset(self.latent_dim,
                                      hparams.num_actions,
                                      intercept=self.intercept)
    if textflag == 'yes':
      self.bnn = TextCNN('adam', self.hparams.num_actions,
                         self.hparams.batch_size, '{}-bnn'.format(name))
    else:
      self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

  def action(self, context):
    """Returns the greedy action w.p. 1 - epsilon, else a uniform action."""

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      y = self.bnn.sess.run(self.bnn.y_pred, feed_dict={self.bnn.x: c})
      if random.random() > self.epsilon:
        return np.argmax(y)
      else:
        return random.randrange(self.hparams.num_actions)

  def update(self, context, action, reward):
    """Updates the data buffers, and re-trains the network periodically."""
    self.t += 1
    self.data_h.add(context, action, reward)
    c = context.reshape((1, self.hparams.context_dim))
    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
    self.latent_h.add(z_context, action, reward)

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:
      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

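# --- The epsilon-greedy rule above, in isolation (a sketch) -----------------
# Exploits the argmax of the predicted scores with probability 1 - epsilon,
# otherwise picks a uniformly random action.
def epsilon_greedy(scores, epsilon):
  """Returns argmax of scores w.p. 1 - epsilon, else a uniform action."""
  if random.random() > epsilon:
    return int(np.argmax(scores))
  return random.randrange(len(scores))
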
class NeuralGreedy(BanditAlgorithm):
  """Greedy action selection on a deep neural net, with decaying epsilon."""

  def __init__(self, name, hparams, optimizer='RMS'):

    self.name = name
    self.eps = 0.9
    self.decay = 0.99  # computed for 10,000 steps
    self.hparams = hparams

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=False)
    self.bnn = NeuralBanditModel(optimizer, hparams, '{}-greedy'.format(name))

  def action(self, context):
    """Returns a uniform action w.p. eps, else the network's greedy action."""

    # Round robin until each action has been selected "initial_pulls" times
    # if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
    #   return self.t % self.hparams.num_actions  ## No need with greedy

    if np.random.random() < self.eps:
      return np.random.choice(range(self.hparams.num_actions))
    else:
      with self.bnn.graph.as_default():
        c = context.reshape((1, self.hparams.context_dim))
        output = self.bnn.sess.run(self.bnn.y_pred, feed_dict={self.bnn.x: c})
        return np.argmax(output)

  def update(self, context, action, reward):
    """Decays epsilon, updates the buffer, and re-trains periodically."""
    self.t += 1
    self.eps *= self.decay
    self.data_h.add(context, action, reward)

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:
      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

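# --- Exploration schedule of NeuralGreedy (a sketch) ------------------------
# With eps = 0.9 and decay 0.99, exploration drops below 1% after roughly
# log(0.01 / 0.9) / log(0.99) ~ 448 steps; this helper exposes the schedule.
def greedy_eps_schedule(eps0=0.9, decay=0.99, steps=10000):
  """Returns the epsilon value at each step under geometric decay."""
  return eps0 * decay**np.arange(steps)
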
class NeuralLinUCB(BanditAlgorithm):
  """LinUCB on the last-layer representation of a deep neural net."""

  def __init__(self, name, hparams, optimizer='RMS'):
    self.name = name
    self.hparams = hparams
    self.n_a = self.hparams.num_actions
    self.n_d = self.hparams.layer_sizes[-1]
    self.alpha = self.hparams.alpha
    self.lam = self.hparams.lam

    self.a = np.concatenate(
        tuple([np.eye(self.n_d)[np.newaxis, :, :] for i in range(self.n_a)]),
        axis=0) * self.lam
    self.inv_a = np.concatenate(
        tuple([np.eye(self.n_d)[np.newaxis, :, :] for i in range(self.n_a)]),
        axis=0) / self.lam
    self.b = np.zeros((self.n_a, self.n_d))
    self.theta = np.zeros((self.n_a, self.n_d))

    # Params for BNN
    self.update_freq_nn = hparams.training_freq_network
    self.t = 0
    self.optimizer_n = optimizer
    self.num_epochs = hparams.training_epochs

    self.data_h = ContextualDataset(
        hparams.context_dim,
        hparams.num_actions,
        bootstrap=getattr(hparams, 'bootstrap', None),
        intercept=False)
    self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

  def action(self, context):
    """Selects the action with the highest UCB score.

    Args:
      context: Context for which the action needs to be chosen.

    Returns:
      action: Selected action for the context.
    """
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn,
                                    feed_dict={self.bnn.x: c}).flatten()
      vals = np.array([
          np.dot(self.theta[i], z_context) + self.alpha * np.sqrt(
              np.dot(z_context, np.dot(self.inv_a[i], z_context)))
          for i in range(self.n_a)
      ])
    return np.argmax(vals)

  def update(self, context, action, reward):
    """Updates the action posterior using the linear Bayesian regression formula.

    Args:
      context: Last observed context.
      action: Last observed action.
      reward: Last observed reward.
    """
    self.t += 1
    self.data_h.add(context, action, reward)

    if self.t % self.update_freq_nn == 0:
      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

      new_z = self.bnn.sess.run(self.bnn.nn,
                                feed_dict={self.bnn.x: self.data_h.contexts})
      contexts = new_z
      actions = np.array(self.data_h.actions)
      # data_h.rewards has shape (n_samples, n_actions), so select the
      # reward of the action actually pulled at each step.
      rewards = self.data_h.rewards[np.arange(actions.shape[0]), actions]

      # Per-action Gram matrices: each A_i only accumulates the contexts on
      # which action i was pulled, matching the incremental branch below.
      self.a = np.concatenate(
          tuple([
              (np.dot(contexts[actions == action].T,
                      contexts[actions == action]) +
               self.lam * np.eye(self.n_d))[np.newaxis, :, :]
              for action in range(self.n_a)
          ]),
          axis=0)
      self.b = np.concatenate(
          tuple([
              np.dot(rewards[actions == action],
                     contexts[actions == action])[np.newaxis, :]
              for action in range(self.n_a)
          ]),
          axis=0)
      self.inv_a = np.concatenate(
          tuple([
              np.linalg.inv(self.a[action])[np.newaxis, :, :]
              for action in range(self.n_a)
          ]),
          axis=0)
      self.theta = np.concatenate(
          tuple([
              np.dot(self.inv_a[action], self.b[action])[np.newaxis, :]
              for action in range(self.n_a)
          ]),
          axis=0)
    else:
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn,
                                    feed_dict={self.bnn.x: c}).flatten()
      self.a[action] = self.a[action] + np.tensordot(
          z_context, z_context, axes=0)
      self.inv_a[action] = np.linalg.inv(self.a[action])
      self.b[action] = self.b[action] + reward * z_context
      self.theta[action] = np.dot(self.inv_a[action], self.b[action])

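# --- The LinUCB scoring rule above, in isolation (a sketch) -----------------
# NeuralLinUCB.action scores each arm on the last-layer features z via
#   score_i = theta_i^T z + alpha * sqrt(z^T A_i^{-1} z),
# where theta has shape (n_a, n_d) and inv_a has shape (n_a, n_d, n_d).
def linucb_scores(z, theta, inv_a, alpha):
  """Returns the per-action UCB scores for a feature vector z."""
  return np.array([
      theta[i].dot(z) + alpha * np.sqrt(z.dot(inv_a[i]).dot(z))
      for i in range(theta.shape[0])
  ])
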
class NeuralLinearPosteriorSampling(BanditAlgorithm):
  """Full Bayesian linear regression on the last layer of a deep neural net.

  This variant maintains the sufficient statistics (precision, f, yy)
  sequentially between network retrainings.
  """

  def __init__(self, name, hparams, textflag='no', optimizer='RMS'):

    self.name = name
    self.hparams = hparams
    self.latent_dim = self.hparams.layer_sizes[-1]
    self.intercept = False
    if self.intercept:
      self.param_dim = 1 + self.latent_dim
    else:
      self.param_dim = self.latent_dim

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior

    self.mu = [
        np.zeros(self.param_dim) for _ in range(self.hparams.num_actions)
    ]
    self.f = [
        np.zeros(self.param_dim) for _ in range(self.hparams.num_actions)
    ]
    self.yy = [0 for _ in range(self.hparams.num_actions)]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.param_dim)
                for _ in range(self.hparams.num_actions)]

    self.precision = [
        self.lambda_prior * np.eye(self.param_dim)
        for _ in range(self.hparams.num_actions)
    ]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0

    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
    self.b = [self._b0 for _ in range(self.hparams.num_actions)]

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=False)
    self.latent_h = ContextualDataset(self.latent_dim,
                                      hparams.num_actions,
                                      intercept=self.intercept)
    if textflag == 'yes':
      self.bnn = TextCNN('adam', self.hparams.num_actions,
                         self.hparams.batch_size, '{}-bnn'.format(name))
    else:
      self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly."""

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    # Sample sigma2, and beta conditional on sigma2
    sigma2_s = [
        self.b[i] * invgamma.rvs(self.a[i])
        for i in range(self.hparams.num_actions)
    ]

    try:
      beta_s = [
          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
          for i in range(self.hparams.num_actions)
      ]
    except np.linalg.LinAlgError:
      # Sampling could fail if covariance is not positive definite
      d = self.param_dim
      beta_s = [
          np.random.multivariate_normal(np.zeros(d), np.eye(d))
          for i in range(self.hparams.num_actions)
      ]

    # Compute last-layer representation for the current context
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
      if self.intercept:
        z_context = np.append(z_context, 1.0).reshape((1, self.latent_dim + 1))

    # Apply Thompson Sampling to last-layer representation
    vals = [
        np.dot(beta_s[i], z_context.T)
        for i in range(self.hparams.num_actions)
    ]
    return np.argmax(vals)

  def update(self, context, action, reward):
    """Updates the posterior using the linear Bayesian regression formula."""

    self.t += 1
    self.data_h.add(context, action, reward)
    c = context.reshape((1, self.hparams.context_dim))
    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
    self.latent_h.add(z_context, action, reward)

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:

      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

      # Update the latent representation of every datapoint collected so far
      new_z = self.bnn.sess.run(self.bnn.nn,
                                feed_dict={self.bnn.x: self.data_h.contexts})
      self.latent_h.replace_data(contexts=new_z)

      for action_v in range(self.hparams.num_actions):
        # Update action posterior with formulas:
        #   beta | z, y ~ N(mu_q, cov_q)
        z, y = self.latent_h.get_data(action_v)
        # The algorithm could be improved with sequential formulas (cheaper)
        self.precision[action_v] = (np.dot(z.T, z) + self.lambda_prior *
                                    np.eye(self.param_dim))  # the new Phi_0
        self.f[action_v] = np.dot(z.T, y)
    else:
      if self.intercept:
        z_context = np.append(z_context, 1.0).reshape((1, self.latent_dim + 1))
      self.precision[action] += np.dot(z_context.T, z_context)
      self.f[action] += (z_context.T * reward)[:, 0]

    self.yy[action] += reward**2
    self.cov[action] = np.linalg.inv(self.precision[action])
    self.mu[action] = np.dot(self.cov[action], self.f[action])

    # Inverse Gamma posterior update
    self.a[action] += 0.5
    b_upd = 0.5 * (self.yy[action] - np.dot(
        self.mu[action].T, np.dot(self.precision[action], self.mu[action])))
    self.b[action] = self.b0 + b_upd
    # print(self.calc_model_evidence())

  @property
  def a0(self):
    return self._a0

  @property
  def b0(self):
    return self._b0

  @property
  def lambda_prior(self):
    return self._lambda_prior

  def calc_model_evidence(self):
    """Computes the average model evidence across actions, in log domain."""
    vval = 0
    mp.mp.dps = 50
    for action in range(self.hparams.num_actions):
      # Log-domain computation to avoid overflow in the gamma ratios.
      val = mp.mpf(
          mp.fmul(mp.fneg(mp.log(mp.fmul(2.0, mp.pi))),
                  mp.fsub(self.a[action], self.a0)))
      val += mp.loggamma(self.a[action])
      val -= mp.loggamma(self.a0)
      val += 0.5 * mp.log(
          np.linalg.det(self.lambda_prior *
                        np.eye(self.hparams.context_dim + 1)))
      val -= 0.5 * mp.log(np.linalg.det(self.precision[action]))
      val += mp.fmul(self.a0, mp.log(self.b0))
      val -= mp.fmul(self.a[action], mp.log(self.b[action]))
      vval += mp.exp(val)
    vval /= float(self.hparams.num_actions)
    return vval

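# --- Consistency check for the sequential updates above (a sketch) ----------
# The per-step bookkeeping maintains precision = Z^T Z + lambda * I and
# f = Z^T y incrementally; this helper verifies that against the batch
# recomputation performed after each network retraining.
def check_sequential_equals_batch(z_rows, y_vals, lam):
  """Asserts incremental and batch sufficient statistics agree."""
  d = z_rows.shape[1]
  precision = lam * np.eye(d)
  f = np.zeros(d)
  for z, y in zip(z_rows, y_vals):
    precision += np.outer(z, z)
    f += z * y
  assert np.allclose(precision, z_rows.T.dot(z_rows) + lam * np.eye(d))
  assert np.allclose(f, z_rows.T.dot(y_vals))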