class PosteriorBNNSampling(BanditAlgorithm):
    """Posterior Sampling algorithm based on a Bayesian neural network."""

    def __init__(self, name, hparams, bnn_model='RMSProp'):
        """Sets up the data buffer and the neural model used to pick actions.

        Args:
          name: Name of the algorithm; the model is named '<name>-bnn'.
          hparams: Hyper-parameters of the algorithm. The constructor reads
            optimizer, training_freq, training_epochs, context_dim,
            num_actions and buffer_s.
          bnn_model: Type of BNN: 'Variational', 'AlphaDiv', 'Variational_BF'
            or 'GP'. Any other value (including the default 'RMSProp')
            builds a NeuralBanditModel driven by hparams.optimizer.
        """
        self.name = name
        self.hparams = hparams
        self.optimizer_n = hparams.optimizer

        # Training schedule and step counter.
        self.training_freq = hparams.training_freq
        self.training_epochs = hparams.training_epochs
        self.t = 0

        self.data_h = ContextualDataset(
            hparams.context_dim, hparams.num_actions, hparams.buffer_s)

        # to be extended with more BNNs (BB alpha-div, GPs, SGFS, constSGD...)
        self.bnn = self._build_bnn(bnn_model, '{}-bnn'.format(name))

    def _build_bnn(self, bnn_model, bnn_name):
        """Returns the model instance matching the `bnn_model` selector."""
        hparams = self.hparams
        if bnn_model == 'Variational':
            return VariationalNeuralBanditModel(hparams, bnn_name)
        if bnn_model == 'AlphaDiv':
            return BBAlphaDivergence(hparams, bnn_name)
        if bnn_model == 'Variational_BF':
            return BfVariationalNeuralBanditModel(hparams, bnn_name)
        if bnn_model == 'GP':
            return MultitaskGP(hparams)
        return NeuralBanditModel(self.optimizer_n, hparams, bnn_name)

    def action(self, context):
        """Selects action for context based on Thompson Sampling using the BNN.

        Args:
          context: Context array; it is reshaped to (1, hparams.context_dim).

        Returns:
          Index of the chosen action.
        """
        num_actions = self.hparams.num_actions
        if self.t < num_actions * self.hparams.initial_pulls:
            # round robin until each action has been taken "initial_pulls" times
            return self.t % num_actions

        with self.bnn.graph.as_default():
            features = context.reshape((1, self.hparams.context_dim))
            predictions = self.bnn.sess.run(
                self.bnn.y_pred, feed_dict={self.bnn.x: features})
            return np.argmax(predictions)

    def update(self, context, action, reward):
        """Stores the observation and re-trains every `training_freq` steps."""
        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.training_freq != 0:
            return

        if self.hparams.reset_lr:
            self.bnn.assign_lr()
        self.bnn.train(self.data_h, self.training_epochs)
def __init__(self, name, hparams, bnn_model='RMSProp'):
    """Creates a PosteriorBNNSampling object based on a specific optimizer.

    Two pieces of state are built: a ContextualDataset buffer (`data_h`) and
    a neural model (`bnn`) whose concrete class is chosen by `bnn_model`.

    Args:
      name: Name of the algorithm; the model is named '<name>-bnn'.
      hparams: Hyper-parameters of the algorithm.
      bnn_model: Type of BNN. 'Variational', 'AlphaDiv', 'Variational_BF' and
        'GP' select dedicated models; anything else (default 'RMSProp') falls
        back to NeuralBanditModel with hparams.optimizer.
    """
    self.name = name
    self.hparams = hparams
    self.optimizer_n = hparams.optimizer

    self.training_freq = hparams.training_freq
    self.training_epochs = hparams.training_epochs
    self.t = 0
    self.data_h = ContextualDataset(
        hparams.context_dim, hparams.num_actions, hparams.buffer_s)

    # to be extended with more BNNs (BB alpha-div, GPs, SGFS, constSGD...)
    bnn_name = '{}-bnn'.format(name)
    if bnn_model == 'Variational':
        model = VariationalNeuralBanditModel(hparams, bnn_name)
    elif bnn_model == 'AlphaDiv':
        model = BBAlphaDivergence(hparams, bnn_name)
    elif bnn_model == 'Variational_BF':
        model = BfVariationalNeuralBanditModel(hparams, bnn_name)
    elif bnn_model == 'GP':
        model = MultitaskGP(hparams)
    else:
        model = NeuralBanditModel(self.optimizer_n, hparams, bnn_name)
    self.bnn = model
def __init__(self, name, hparams, optimizer='RMS'):
    """Initializes per-action posteriors, data buffers and the network.

    Args:
      name: Name of the algorithm; the network is named '<name>-bnn'.
      hparams: Hyper-parameters. Reads layer_sizes, lambda_prior, a0, b0,
        num_actions, context_dim, training_freq, training_freq_network and
        training_epochs.
      optimizer: Optimizer identifier forwarded to NeuralBanditModel.
    """
    self.name = name
    self.hparams = hparams
    # The regression runs on the output of the last hidden layer.
    self.latent_dim = self.hparams.layer_sizes[-1]

    num_actions = self.hparams.num_actions
    dim = self.latent_dim

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior
    self.mu = [np.zeros(dim) for _ in range(num_actions)]
    self.cov = [
        (1.0 / self.lambda_prior) * np.eye(dim) for _ in range(num_actions)
    ]
    self.precision = [
        self.lambda_prior * np.eye(dim) for _ in range(num_actions)
    ]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0
    self.a = [self._a0] * num_actions
    self.b = [self._b0] * num_actions

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer
    self.num_epochs = hparams.training_epochs

    # data_h stores raw contexts, latent_h their last-layer representations.
    self.data_h = ContextualDataset(
        hparams.context_dim, hparams.num_actions, intercept=False)
    self.latent_h = ContextualDataset(
        self.latent_dim, hparams.num_actions, intercept=False)
    self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))
def __init__(self, name, hparams):
    """Creates the algorithm, and sets up the adaptive Gaussian noise.

    Args:
      name: Name of the algorithm; the network is named '<name>-bnn'.
      hparams: Hyper-parameters. verbose, noise_std, eps, d_samples and
        optimizer are optional (defaults True, 0.005, 0.05, 300, 'RMS').
    """
    self.name = name
    self.hparams = hparams

    # Optional hyper-parameters with their fallbacks.
    self.verbose = getattr(self.hparams, 'verbose', True)
    self.noise_std = getattr(self.hparams, 'noise_std', 0.005)
    self.eps = getattr(self.hparams, 'eps', 0.05)
    self.d_samples = getattr(self.hparams, 'd_samples', 300)
    self.optimizer = getattr(self.hparams, 'optimizer', 'RMS')

    # keep track of noise heuristic statistics
    self.std_h = [self.noise_std]
    self.eps_h = [self.eps]
    self.kl_h = []
    self.t = 0

    self.freq_update = hparams.training_freq
    self.num_epochs = hparams.training_epochs

    self.data_h = ContextualDataset(
        hparams.context_dim, hparams.num_actions, hparams.buffer_s)
    self.bnn = NeuralBanditModel(
        self.optimizer, hparams, '{}-bnn'.format(name))

    bnn = self.bnn
    with bnn.graph.as_default():
        # noise-injection std placeholder
        bnn.noise_std_ph = tf.placeholder(tf.float32, shape=())

        # create noise corruption op; adds noise to all weights
        weights = tf.trainable_variables()
        bnn.noisy_grads = [
            tf.random_normal(w.get_shape(), 0, bnn.noise_std_ph)
            for w in weights
        ]

        # add noise to all params, then compute prediction, then subtract.
        with tf.control_dependencies(bnn.noisy_grads):
            bnn.noise_add_ops = [
                w.assign_add(noise)
                for w, noise in zip(weights, bnn.noisy_grads)
            ]

            with tf.control_dependencies(bnn.noise_add_ops):
                # we force the prediction for 'y' to be recomputed after
                # adding noise
                bnn.noisy_nn, bnn.noisy_pred_val = bnn.forward_pass()
                bnn.noisy_pred = tf.identity(bnn.noisy_pred_val)

                # The subtraction ops are gated on the noisy prediction.
                with tf.control_dependencies([tf.identity(bnn.noisy_pred)]):
                    bnn.noise_sub_ops = [
                        w.assign_add(-noise)
                        for w, noise in zip(weights, bnn.noisy_grads)
                    ]
def __init__(self, name, hparams):
    """Initialize posterior distributions and hyperparameters.

    Assume a linear model for each action i: reward = context^T beta_i + noise
    Each beta_i has a Gaussian prior (lambda parameter), each sigma2_i (noise
    level) has an inverse Gamma prior (a0, b0 parameters). Mean, covariance,
    and precision matrices are initialized, and the ContextualDataset created.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters of the algorithm.
    """
    self.name = name
    self.hparams = hparams

    num_actions = self.hparams.num_actions
    # One extra dimension: the dataset is created with intercept=True.
    dim = self.hparams.context_dim + 1

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior
    self.mu = [np.zeros(dim) for _ in range(num_actions)]
    self.cov = [
        (1.0 / self.lambda_prior) * np.eye(dim) for _ in range(num_actions)
    ]
    self.precision = [
        self.lambda_prior * np.eye(dim) for _ in range(num_actions)
    ]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0
    self.a = [self._a0] * num_actions
    self.b = [self._b0] * num_actions

    self.t = 0
    self.data_h = ContextualDataset(
        hparams.context_dim, hparams.num_actions, intercept=True)
class ParameterNoiseSampling(BanditAlgorithm):
    """Parameter Noise Sampling algorithm based on adding noise to net params.

    Described in https://arxiv.org/abs/1706.01905
    """

    def __init__(self, name, hparams):
        """Creates the algorithm, and sets up the adaptive Gaussian noise.

        Args:
          name: Name of the algorithm; the network is named '<name>-bnn'.
          hparams: Hyper-parameters. verbose, noise_std, eps, d_samples and
            optimizer are optional (defaults True, 0.005, 0.05, 300, 'RMS').
        """
        self.name = name
        self.hparams = hparams
        self.verbose = getattr(self.hparams, 'verbose', True)
        self.noise_std = getattr(self.hparams, 'noise_std', 0.005)
        self.eps = getattr(self.hparams, 'eps', 0.05)
        self.d_samples = getattr(self.hparams, 'd_samples', 300)
        self.optimizer = getattr(self.hparams, 'optimizer', 'RMS')

        # keep track of noise heuristic statistics
        self.std_h = [self.noise_std]
        self.eps_h = [self.eps]
        self.kl_h = []
        self.t = 0

        self.freq_update = hparams.training_freq
        self.num_epochs = hparams.training_epochs

        self.data_h = ContextualDataset(
            hparams.context_dim, hparams.num_actions, hparams.buffer_s)
        self.bnn = NeuralBanditModel(
            self.optimizer, hparams, '{}-bnn'.format(name))

        with self.bnn.graph.as_default():
            # noise-injection std placeholder
            self.bnn.noise_std_ph = tf.placeholder(tf.float32, shape=())

            # create noise corruption op; adds noise to all weights
            tvars = tf.trainable_variables()
            self.bnn.noisy_grads = [
                tf.random_normal(v.get_shape(), 0, self.bnn.noise_std_ph)
                for v in tvars
            ]

            # add noise to all params, then compute prediction, then subtract.
            with tf.control_dependencies(self.bnn.noisy_grads):
                self.bnn.noise_add_ops = [
                    tvars[i].assign_add(n)
                    for i, n in enumerate(self.bnn.noisy_grads)
                ]

                with tf.control_dependencies(self.bnn.noise_add_ops):
                    # we force the prediction for 'y' to be recomputed after
                    # adding noise
                    self.bnn.noisy_nn, self.bnn.noisy_pred_val = (
                        self.bnn.forward_pass())
                    self.bnn.noisy_pred = tf.identity(self.bnn.noisy_pred_val)

                    with tf.control_dependencies(
                            [tf.identity(self.bnn.noisy_pred)]):
                        self.bnn.noise_sub_ops = [
                            tvars[i].assign_add(-n)
                            for i, n in enumerate(self.bnn.noisy_grads)
                        ]

    def action(self, context):
        """Selects action based on Thompson Sampling *after* adding noise.

        Args:
          context: Context array; it is reshaped to (1, hparams.context_dim).

        Returns:
          Index of the chosen action.
        """
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            # round robin until each action has been taken "initial_pulls" times
            return self.t % self.hparams.num_actions

        with self.bnn.graph.as_default():
            # run noise prediction op to choose action, and subtract noise op
            # after.
            c = context.reshape((1, self.hparams.context_dim))
            output, _ = self.bnn.sess.run(
                [self.bnn.noisy_pred, self.bnn.noise_sub_ops],
                feed_dict={
                    self.bnn.x: c,
                    self.bnn.noise_std_ph: self.noise_std
                })
            return np.argmax(output)

    def update(self, context, action, reward):
        """Updates the data buffer, and re-trains the BNN and noise level."""
        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.freq_update == 0:
            self.bnn.train(self.data_h, self.num_epochs)
            self.update_noise()

    def update_noise(self):
        """Increase noise if distance btw original and corrupted distrib small."""
        kl = self.compute_distance()
        delta = -np.log1p(-self.eps + self.eps / self.hparams.num_actions)

        if kl < delta:
            self.noise_std *= 1.01
        else:
            self.noise_std /= 1.01

        self.eps *= 0.99

        if self.verbose:
            print('Update eps={} | kl={} | std={} | delta={} | increase={}.'.
                  format(self.eps, kl, self.noise_std, delta, kl < delta))

        # store noise-injection statistics for inspection: std, KL, eps.
        self.std_h.append(self.noise_std)
        self.kl_h.append(kl)
        self.eps_h.append(self.eps)

    def compute_distance(self):
        """Computes empirical KL for original and corrupted output distributions.

        A batch of `d_samples` stored contexts is evaluated once with the
        current weights and once with noisy weights (the noise is subtracted
        again by `noise_sub_ops`).

        Returns:
          Mean KL between the two sets of outputs, treated as logits.
        """
        random_inputs, _ = self.data_h.get_batch(self.d_samples)
        feed = {
            self.bnn.x: random_inputs,
            self.bnn.noise_std_ph: self.noise_std
        }
        y_model = self.bnn.sess.run(self.bnn.y_pred, feed_dict=feed)
        y_noisy, _ = self.bnn.sess.run(
            [self.bnn.noisy_pred, self.bnn.noise_sub_ops], feed_dict=feed)

        if self.verbose:
            # display how often original & perturbed models propose different
            # actions
            s = np.sum([
                np.argmax(y_model[i, :]) == np.argmax(y_noisy[i, :])
                for i in range(y_model.shape[0])
            ])
            print('{} | % of agreement btw original / corrupted actions: {}.'.
                  format(self.name, s / self.d_samples))

        kl = self.compute_kl_with_logits(y_model, y_noisy)
        return kl

    def compute_kl_with_logits(self, logits1, logits2):
        """Computes KL from logits samples from two distributions.

        For each row, KL(softmax(logits1) || softmax(logits2)) is evaluated as
          sum_j p1_j * (logits1_j - logits2_j) + logsumexp2 - logsumexp1.
        The softmax is obtained by subtracting the row log-sum-exp before
        exponentiating, so large logits cannot overflow `np.exp`.

        Args:
          logits1: 2-D array of logits, one row per sample.
          logits2: 2-D array of logits with the same shape as `logits1`.

        Returns:
          Mean over rows of the KL, with negative values clipped to 0 and
          NaNs mapped to 0.
        """
        logsumexp1 = logsumexp(logits1, axis=1)
        logsumexp2 = logsumexp(logits2, axis=1)

        # Normalized probabilities of the first distribution.
        probs1 = np.exp(logits1 - logsumexp1[:, np.newaxis])
        term1 = np.sum(probs1 * (logits1 - logits2), axis=1)

        kl = term1 + (logsumexp2 - logsumexp1)
        # Rounding can push a tiny KL below zero.
        kl = np.maximum(kl, 0.0)
        kl = np.nan_to_num(kl)
        return np.mean(kl)
class NeuralLinearPosteriorSampling(BanditAlgorithm):
    """Full Bayesian linear regression on the last layer of a deep neural net."""

    def __init__(self, name, hparams, optimizer='RMS'):
        """Initializes per-action posteriors, data buffers and the network.

        Args:
          name: Name of the algorithm; the network is named '<name>-bnn'.
          hparams: Hyper-parameters. Reads layer_sizes, lambda_prior, a0, b0,
            num_actions, context_dim, training_freq, training_freq_network
            and training_epochs.
          optimizer: Optimizer identifier forwarded to NeuralBanditModel.
        """
        self.name = name
        self.hparams = hparams
        self.latent_dim = self.hparams.layer_sizes[-1]

        # Gaussian prior for each beta_i
        self._lambda_prior = self.hparams.lambda_prior

        self.mu = [
            np.zeros(self.latent_dim)
            for _ in range(self.hparams.num_actions)
        ]
        self.cov = [(1.0 / self.lambda_prior) * np.eye(self.latent_dim)
                    for _ in range(self.hparams.num_actions)]
        self.precision = [
            self.lambda_prior * np.eye(self.latent_dim)
            for _ in range(self.hparams.num_actions)
        ]

        # Inverse Gamma prior for each sigma2_i
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0
        self.a = [self._a0 for _ in range(self.hparams.num_actions)]
        self.b = [self._b0 for _ in range(self.hparams.num_actions)]

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer
        self.num_epochs = hparams.training_epochs

        # data_h stores raw contexts, latent_h their last-layer
        # representations.
        self.data_h = ContextualDataset(
            hparams.context_dim, hparams.num_actions, intercept=False)
        self.latent_h = ContextualDataset(
            self.latent_dim, hparams.num_actions, intercept=False)
        self.bnn = NeuralBanditModel(
            optimizer, hparams, '{}-bnn'.format(name))

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly.

        Args:
          context: Context array; it is reshaped to (1, hparams.context_dim).

        Returns:
          Index of the chosen action.
        """
        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [
            self.b[i] * invgamma.rvs(self.a[i])
            for i in range(self.hparams.num_actions)
        ]

        try:
            beta_s = [
                np.random.multivariate_normal(
                    self.mu[i], sigma2_s[i] * self.cov[i])
                for i in range(self.hparams.num_actions)
            ]
        except np.linalg.LinAlgError as e:
            # Sampling could fail if covariance is not positive definite.
            # Exceptions have no `.message` attribute on Python 3, so format
            # the exception itself; otherwise this handler would raise.
            print('Exception when sampling for {}.'.format(self.name))
            print('Details: {} | {}.'.format(e, e.args))
            d = self.latent_dim
            beta_s = [
                np.random.multivariate_normal(np.zeros((d)), np.eye(d))
                for i in range(self.hparams.num_actions)
            ]

        # Compute last-layer representation for the current context
        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: c})

        # Apply Thompson Sampling to last-layer representation
        vals = [
            np.dot(beta_s[i], z_context.T)
            for i in range(self.hparams.num_actions)
        ]
        return np.argmax(vals)

    def update(self, context, action, reward):
        """Updates the posterior using linear bayesian regression formula.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)
        c = context.reshape((1, self.hparams.context_dim))
        z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
        self.latent_h.add(z_context, action, reward)

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

            # Update the latent representation of every datapoint collected
            # so far
            new_z = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: self.data_h.contexts})
            self.latent_h.replace_data(contexts=new_z)

        # Update the Bayesian Linear Regression
        if self.t % self.update_freq_lr == 0:
            # Find all the actions to update
            # NOTE(review): this slice excludes the most recent
            # `update_freq_lr` entries; confirm whether
            # `[-self.update_freq_lr:]` (or every action) was intended.
            actions_to_update = self.latent_h.actions[:-self.update_freq_lr]

            for action_v in np.unique(actions_to_update):
                # Update action posterior with formulas:
                # \beta | z,y ~ N(mu_q, cov_q)
                z, y = self.latent_h.get_data(action_v)

                # The algorithm could be improved with sequential formulas
                # (cheaper)
                s = np.dot(z.T, z)

                # Some terms are removed as we assume prior mu_0 = 0.
                precision_a = s + self.lambda_prior * np.eye(self.latent_dim)
                cov_a = np.linalg.inv(precision_a)
                mu_a = np.dot(cov_a, np.dot(z.T, y))

                # Inverse Gamma posterior update
                a_post = self.a0 + z.shape[0] / 2.0
                b_upd = 0.5 * np.dot(y.T, y)
                b_upd -= 0.5 * np.dot(mu_a.T, np.dot(precision_a, mu_a))
                b_post = self.b0 + b_upd

                # Store new posterior distributions
                self.mu[action_v] = mu_a
                self.cov[action_v] = cov_a
                self.precision[action_v] = precision_a
                self.a[action_v] = a_post
                self.b[action_v] = b_post

    @property
    def a0(self):
        """Inverse Gamma prior parameter a0 taken from hparams."""
        return self._a0

    @property
    def b0(self):
        """Inverse Gamma prior parameter b0 taken from hparams."""
        return self._b0

    @property
    def lambda_prior(self):
        """Gaussian prior parameter lambda taken from hparams."""
        return self._lambda_prior
class NeuralUCBSampling(BanditAlgorithm):
    """UCB Sampling algorithm based on a neural network."""

    def __init__(self, name, hparams, bnn_model='RMSProp', optimizer='RMS'):
        """Creates a NeuralUCBSampling object.

        Builds the data buffer, the network ('<name>-ucb') and the running
        statistics used for the exploration bonus: `Zinv` (initialized to
        identity / lamb), `detZ` (initialized to lamb ** p) and `gamma`
        (initialized to 0).

        Args:
          name: Name of the algorithm.
          hparams: Hyper-parameters of the algorithm. Reads optimizer,
            training_freq, training_epochs, context_dim, num_actions,
            buffer_s, layer_sizes and lamb here.
          bnn_model: Unused; kept for signature compatibility.
          optimizer: Unused; the optimizer is taken from hparams.optimizer.
        """
        self.name = name
        self.hparams = hparams
        self.optimizer_n = hparams.optimizer

        self.training_freq = hparams.training_freq
        self.training_epochs = hparams.training_epochs
        self.t = 0
        self.gamma = 0

        self.bonus = np.zeros(hparams.num_actions)
        # Constants of the confidence-radius formula computed in `update`.
        self.C1 = 0.001
        self.C2 = 0.001
        self.C3 = 0.00001

        self.data_h = ContextualDataset(
            hparams.context_dim, hparams.num_actions, hparams.buffer_s)

        # to be extended with more BNNs (BB alpha-div, GPs, SGFS, constSGD...)
        bnn_name = '{}-ucb'.format(name)
        self.bnn = NeuralBanditModel(self.optimizer_n, hparams, bnn_name)

        # Size of the flattened gradient vector.
        # NOTE(review): the formula uses layer_sizes[0] for every hidden
        # layer — presumably all hidden layers share one width; confirm.
        self.p = (hparams.context_dim + 1) * (hparams.layer_sizes[0]) + (
            hparams.layer_sizes[0] + 1) * (hparams.layer_sizes[0]) * (
                len(hparams.layer_sizes) - 1) + (
                    hparams.layer_sizes[0] + 1) * hparams.num_actions
        self.Zinv = (1 / hparams.lamb) * np.eye(self.p)
        self.detZ = hparams.lamb**self.p

    @staticmethod
    def _flatten_grads(grad_list):
        """Concatenates the flattened arrays of `grad_list` into one vector."""
        # Seeding with an empty float64 array keeps the result float64 and
        # makes an empty `grad_list` yield an empty vector.
        return np.concatenate(
            [np.array([])] + [el.flatten() for el in grad_list])

    def action(self, context):
        """Selects action for context based on UCB using the NN.

        Args:
          context: Context array; it is reshaped to (1, hparams.context_dim).

        Returns:
          Index of the action maximizing prediction plus exploration bonus.
        """
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            # round robin until each action has been taken "initial_pulls" times
            return self.t % self.hparams.num_actions

        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            output = self.bnn.sess.run(
                self.bnn.y_pred, feed_dict={self.bnn.x: c})

            # Add confidence bound to output
            listTensorGradients = self.bnn.sess.run(
                self.bnn.gradAction, feed_dict={self.bnn.x: c})
            bonus = []
            for act in range(self.hparams.num_actions):
                grads = self._flatten_grads(listTensorGradients[act])
                bonus.append(self.gamma * np.sqrt(
                    grads.dot(self.Zinv.dot(grads)) /
                    self.hparams.layer_sizes[0]))
            output += np.array(bonus)
            print("Bonus of the actions", bonus)
            print("Gamma", self.gamma)
            return np.argmax(output)

    def update(self, context, action, reward):
        """Updates data buffer, and re-trains the BNN every training_freq steps.

        After the (optional) training step, the gradient of the played action
        is used for a rank-one update of `Zinv` and `detZ`, and `gamma` is
        recomputed.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.training_freq == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.training_epochs)

        tensorGradients = self.bnn.sess.run(
            self.bnn.gradAction[action],
            feed_dict={self.bnn.x: context.reshape(1, -1)})
        grads = self._flatten_grads(tensorGradients)

        m = self.hparams.layer_sizes[0]
        depth = len(self.hparams.layer_sizes)
        lamb = self.hparams.lamb

        # Rank-one (Sherman-Morrison) update. Zinv (g g^T) Zinv equals
        # outer(Zinv g, g^T Zinv), so only matrix-vector products are needed.
        zinv_g = self.Zinv.dot(grads)
        g_zinv = grads.dot(self.Zinv)
        quad = grads.dot(zinv_g) / m
        self.detZ *= 1 + quad
        self.Zinv -= (np.outer(zinv_g, g_zinv) / m) / (1 + quad)

        el1 = np.sqrt(1 + self.C1 * (m**(-1 / 6)) * np.sqrt(np.log(m)) *
                      (depth**4) * (self.t**(7 / 6)) * (lamb**(-7 / 6)))
        # NOTE(review): the log-determinant ratio enters with a minus sign,
        # and detZ only grows from lamb ** p, so this term is <= 0; confirm
        # the sign against the reference formula.
        el2 = self.hparams.mu * np.sqrt(
            -np.log(self.detZ / (lamb**self.p)) +
            self.C2 * (m**(-1 / 6)) * np.sqrt(np.log(m)) * (depth**4) *
            (self.t**(5 / 3)) * (lamb**(-1 / 6)) -
            2 * np.log(self.hparams.delta)) + np.sqrt(lamb) * self.hparams.S
        el3 = self.C3 * (
            (1 - self.hparams.mu * m * lamb)**(self.training_epochs) *
            np.sqrt(self.t / lamb) +
            (m**(-1 / 6)) * np.sqrt(np.log(m)) * (depth**(7 / 2)) *
            (self.t**(5 / 3)) * (lamb**(-5 / 3)) *
            (1 + np.sqrt(self.t / lamb)))
        print("Profile Elements", el1, el2, el3)
        self.gamma = el1 * el2 + el3
class PosteriorBNNSampling(BanditAlgorithm):
    """Posterior Sampling algorithm based on a Bayesian neural network."""

    def __init__(self, name, hparams, bnn_model='RMSProp'):
        """Sets up the data buffer and the neural model used to pick actions.

        Args:
          name: Name of the algorithm; the model is named '<name>-bnn'.
          hparams: Hyper-parameters of the algorithm. The constructor reads
            optimizer, training_freq, training_epochs, context_dim,
            num_actions and buffer_s.
          bnn_model: Type of BNN: 'Variational', 'AlphaDiv', 'Variational_BF',
            'GP' or 'FBNN'. Any other value (including the default 'RMSProp')
            builds a NeuralBanditModel driven by hparams.optimizer.
        """
        self.name = name
        self.hparams = hparams
        self.optimizer_n = hparams.optimizer

        # Training schedule and step counter.
        self.training_freq = hparams.training_freq
        self.training_epochs = hparams.training_epochs
        self.t = 0

        self.data_h = ContextualDataset(
            hparams.context_dim, hparams.num_actions, hparams.buffer_s)

        # to be extended with more BNNs (BB alpha-div, GPs, SGFS, constSGD...)
        self.bnn = self._build_bnn(bnn_model, '{}-bnn'.format(name))

    def _build_bnn(self, bnn_model, bnn_name):
        """Returns the model instance matching the `bnn_model` selector."""
        hparams = self.hparams
        if bnn_model == 'Variational':
            return VariationalNeuralBanditModel(hparams, bnn_name)
        if bnn_model == 'AlphaDiv':
            return BBAlphaDivergence(hparams, bnn_name)
        if bnn_model == 'Variational_BF':
            return BfVariationalNeuralBanditModel(hparams, bnn_name)
        if bnn_model == 'GP':
            return MultitaskGP(hparams)
        if bnn_model == 'FBNN':
            return FunctionalBNNModel(hparams)
        return NeuralBanditModel(self.optimizer_n, hparams, bnn_name)

    def action(self, context):
        """Selects action for context based on Thompson Sampling using the BNN.

        Args:
          context: Context array; it is reshaped to (1, hparams.context_dim).

        Returns:
          Index of the chosen action.
        """
        num_actions = self.hparams.num_actions
        if self.t < num_actions * self.hparams.initial_pulls:
            # round robin until each action has been taken "initial_pulls" times
            return self.t % num_actions

        with self.bnn.graph.as_default():
            features = context.reshape((1, self.hparams.context_dim))
            predictions = self.bnn.sess.run(
                self.bnn.y_pred, feed_dict={self.bnn.x: features})
            return np.argmax(predictions)

    def update(self, context, action, reward, train=True):
        """Stores the observation and optionally re-trains the BNN.

        Training runs only on steps that are a multiple of `training_freq`
        and only when `train` is truthy.
        """
        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.training_freq != 0 or not train:
            return

        if self.hparams.reset_lr:
            self.bnn.assign_lr()
        self.bnn.train(self.data_h, self.training_epochs)
class LinearFullPosteriorSampling(BanditAlgorithm):
    """Thompson Sampling with independent linear models and unknown noise var."""

    def __init__(self, name, hparams):
        """Initialize posterior distributions and hyperparameters.

        Assume a linear model for each action i:
          reward = context^T beta_i + noise
        Each beta_i has a Gaussian prior (lambda parameter), each sigma2_i
        (noise level) has an inverse Gamma prior (a0, b0 parameters). Mean,
        covariance, and precision matrices are initialized, and the
        ContextualDataset created. `f` and `yy` hold the running sums used
        by the sequential update in `update`.

        Args:
          name: Name of the algorithm.
          hparams: Hyper-parameters of the algorithm.
        """
        self.name = name
        self.hparams = hparams

        # Gaussian prior for each beta_i
        self._lambda_prior = self.hparams.lambda_prior

        self.mu = [
            np.zeros(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]
        # Running sum of context * reward per action.
        self.f = [
            np.zeros(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]
        # Running sum of squared rewards per action.
        self.yy = [0 for _ in range(self.hparams.num_actions)]

        self.cov = [
            (1.0 / self.lambda_prior) * np.eye(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]
        self.precision = [
            self.lambda_prior * np.eye(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]

        # Inverse Gamma prior for each sigma2_i
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0
        self.a = [self._a0 for _ in range(self.hparams.num_actions)]
        self.b = [self._b0 for _ in range(self.hparams.num_actions)]

        self.t = 0
        self.intercept = True
        self.data_h = ContextualDataset(
            hparams.context_dim, hparams.num_actions,
            intercept=self.intercept)

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly.

        Args:
          context: Context for which the action need to be chosen.

        Returns:
          action: Selected action for the context.
        """
        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [
            self.b[i] * invgamma.rvs(self.a[i])
            for i in range(self.hparams.num_actions)
        ]

        try:
            beta_s = [
                np.random.multivariate_normal(
                    self.mu[i], sigma2_s[i] * self.cov[i])
                for i in range(self.hparams.num_actions)
            ]
        except np.linalg.LinAlgError:
            # Sampling could fail if covariance is not positive definite;
            # fall back to standard normal samples.
            d = self.hparams.context_dim + 1
            beta_s = [
                np.random.multivariate_normal(np.zeros((d)), np.eye(d))
                for i in range(self.hparams.num_actions)
            ]

        # Compute sampled expected values, intercept is last component of beta
        vals = [
            np.dot(beta_s[i][:-1], context.T) + beta_s[i][-1]
            for i in range(self.hparams.num_actions)
        ]
        return np.argmax(vals)

    def update(self, context, action, reward):
        """Updates action posterior using the linear Bayesian regression formula.

        Only the posterior of `action` is touched; it is refreshed from the
        running sums rather than from the whole dataset.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)

        if self.intercept:
            # Append the constant 1.0 feature that multiplies the intercept.
            c = np.array(context[:])
            c = np.append(c, 1.0).reshape((1, self.hparams.context_dim + 1))
        else:
            c = np.array(context[:]).reshape((1, self.hparams.context_dim))

        # Update posterior of action with formulas:
        # \beta | x,y ~ N(mu_q, cov_q)
        # Some terms are removed as we assume prior mu_0 = 0.
        self.precision[action] += np.dot(c.T, c)
        self.f[action] += (c.T * reward)[:, 0]
        self.yy[action] += reward**2
        self.cov[action] = np.linalg.inv(self.precision[action])
        self.mu[action] = np.dot(self.cov[action], self.f[action])

        # Inverse Gamma posterior update
        self.a[action] += 0.5
        b_upd = 0.5 * (self.yy[action] - np.dot(
            self.mu[action].T,
            np.dot(self.precision[action], self.mu[action])))
        self.b[action] = self.b0 + b_upd

    @property
    def a0(self):
        """Inverse Gamma prior parameter a0 taken from hparams."""
        return self._a0

    @property
    def b0(self):
        """Inverse Gamma prior parameter b0 taken from hparams."""
        return self._b0

    @property
    def lambda_prior(self):
        """Gaussian prior parameter lambda taken from hparams."""
        return self._lambda_prior

    def calc_model_evidence(self):
        """Returns the per-action evidence term averaged over actions.

        For each action the log-term is accumulated as
          -(a - a0) * log(2 * pi) + loggamma(a) - loggamma(a0)
          + 0.5 * logdet(lambda_prior * I) - 0.5 * logdet(precision)
          + a0 * log(b0) - a * log(b)
        and exponentiated with mpmath at 50 decimal digits. Log-determinants
        come from `np.linalg.slogdet`, so the determinant itself is never
        formed and cannot overflow or underflow in float64.

        Returns:
          An mpmath number: the mean over actions of exp(log-term).
        """
        vval = 0
        mp.mp.dps = 50

        # The prior log-determinant is the same for every action.
        _, logdet_prior = np.linalg.slogdet(
            self.lambda_prior * np.eye(self.hparams.context_dim + 1))

        for action in range(self.hparams.num_actions):
            _, logdet_post = np.linalg.slogdet(self.precision[action])

            val = mp.mpf(
                mp.fmul(
                    mp.fneg(mp.log(mp.fmul(2.0, mp.pi))),
                    mp.fsub(self.a[action], self.a0)))
            val += mp.loggamma(self.a[action])
            val -= mp.loggamma(self.a0)
            val += 0.5 * float(logdet_prior)
            val -= 0.5 * float(logdet_post)
            val += mp.fmul(self.a0, mp.log(self.b0))
            val -= mp.fmul(self.a[action], mp.log(self.b[action]))
            vval += mp.exp(val)

        vval /= float(self.hparams.num_actions)
        return vval
class NeuralLinearPosteriorSampling(BanditAlgorithm):
    """Full Bayesian linear regression on the last layer of a deep neural net."""

    def __init__(self, name, hparams, optimizer='RMS'):
        """Initializes per-action posteriors, data buffers and the network.

        Args:
          name: Name of the algorithm; the network is named '<name>-bnn'.
          hparams: Hyper-parameters. Reads layer_sizes, lambda_prior, a0, b0,
            num_actions, context_dim, training_freq, training_freq_network
            and training_epochs.
          optimizer: Optimizer identifier forwarded to NeuralBanditModel.
        """
        self.name = name
        self.hparams = hparams
        self.latent_dim = self.hparams.layer_sizes[-1]

        # Gaussian prior for each beta_i
        self._lambda_prior = self.hparams.lambda_prior

        self.mu = [
            np.zeros(self.latent_dim)
            for _ in range(self.hparams.num_actions)
        ]
        self.cov = [(1.0 / self.lambda_prior) * np.eye(self.latent_dim)
                    for _ in range(self.hparams.num_actions)]
        self.precision = [
            self.lambda_prior * np.eye(self.latent_dim)
            for _ in range(self.hparams.num_actions)
        ]

        # Inverse Gamma prior for each sigma2_i
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0
        self.a = [self._a0 for _ in range(self.hparams.num_actions)]
        self.b = [self._b0 for _ in range(self.hparams.num_actions)]

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer
        self.num_epochs = hparams.training_epochs

        # data_h stores raw contexts, latent_h their last-layer
        # representations.
        self.data_h = ContextualDataset(
            hparams.context_dim, hparams.num_actions, intercept=False)
        self.latent_h = ContextualDataset(
            self.latent_dim, hparams.num_actions, intercept=False)
        self.bnn = NeuralBanditModel(
            optimizer, hparams, '{}-bnn'.format(name))

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly.

        Args:
          context: Context array; it is reshaped to (1, hparams.context_dim).

        Returns:
          Index of the chosen action.
        """
        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [
            self.b[i] * invgamma.rvs(self.a[i])
            for i in range(self.hparams.num_actions)
        ]

        try:
            beta_s = [
                np.random.multivariate_normal(
                    self.mu[i], sigma2_s[i] * self.cov[i])
                for i in range(self.hparams.num_actions)
            ]
        except np.linalg.LinAlgError as e:
            # Sampling could fail if covariance is not positive definite.
            # Exceptions have no `.message` attribute on Python 3, so format
            # the exception itself; otherwise this handler would raise.
            print('Exception when sampling for {}.'.format(self.name))
            print('Details: {} | {}.'.format(e, e.args))
            d = self.latent_dim
            beta_s = [
                np.random.multivariate_normal(np.zeros((d)), np.eye(d))
                for i in range(self.hparams.num_actions)
            ]

        # Compute last-layer representation for the current context
        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: c})

        # Apply Thompson Sampling to last-layer representation
        vals = [
            np.dot(beta_s[i], z_context.T)
            for i in range(self.hparams.num_actions)
        ]
        return np.argmax(vals)

    def update(self, context, action, reward):
        """Updates the posterior using linear bayesian regression formula.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)
        c = context.reshape((1, self.hparams.context_dim))
        z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
        self.latent_h.add(z_context, action, reward)

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

            # Update the latent representation of every datapoint collected
            # so far
            new_z = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: self.data_h.contexts})
            self.latent_h.replace_data(contexts=new_z)

        # Update the Bayesian Linear Regression
        if self.t % self.update_freq_lr == 0:
            # Find all the actions to update
            # NOTE(review): this slice excludes the most recent
            # `update_freq_lr` entries; confirm whether
            # `[-self.update_freq_lr:]` (or every action) was intended.
            actions_to_update = self.latent_h.actions[:-self.update_freq_lr]

            for action_v in np.unique(actions_to_update):
                # Update action posterior with formulas:
                # \beta | z,y ~ N(mu_q, cov_q)
                z, y = self.latent_h.get_data(action_v)

                # The algorithm could be improved with sequential formulas
                # (cheaper)
                s = np.dot(z.T, z)

                # Some terms are removed as we assume prior mu_0 = 0.
                precision_a = s + self.lambda_prior * np.eye(self.latent_dim)
                cov_a = np.linalg.inv(precision_a)
                mu_a = np.dot(cov_a, np.dot(z.T, y))

                # Inverse Gamma posterior update
                a_post = self.a0 + z.shape[0] / 2.0
                b_upd = 0.5 * np.dot(y.T, y)
                b_upd -= 0.5 * np.dot(mu_a.T, np.dot(precision_a, mu_a))
                b_post = self.b0 + b_upd

                # Store new posterior distributions
                self.mu[action_v] = mu_a
                self.cov[action_v] = cov_a
                self.precision[action_v] = precision_a
                self.a[action_v] = a_post
                self.b[action_v] = b_post

    @property
    def a0(self):
        """Inverse Gamma prior parameter a0 taken from hparams."""
        return self._a0

    @property
    def b0(self):
        """Inverse Gamma prior parameter b0 taken from hparams."""
        return self._b0

    @property
    def lambda_prior(self):
        """Gaussian prior parameter lambda taken from hparams."""
        return self._lambda_prior
class LinearFullPosteriorSampling(BanditAlgorithm):
    """Thompson Sampling with independent linear models and unknown noise var."""

    def __init__(self, name, hparams):
        """Initialize posterior distributions and hyperparameters.

        Assume a linear model for each action i: reward = context^T beta_i + noise
        Each beta_i has a Gaussian prior (lambda parameter), each sigma2_i (noise
        level) has an inverse Gamma prior (a0, b0 parameters). Mean, covariance,
        and precision matrices are initialized, and the ContextualDataset created.

        Args:
          name: Name of the algorithm.
          hparams: Hyper-parameters of the algorithm. Reads context_dim,
            num_actions, lambda_prior, a0 and b0 here; initial_pulls is read
            in action().
        """
        self.name = name
        self.hparams = hparams

        # Gaussian prior for each beta_i
        self._lambda_prior = self.hparams.lambda_prior

        # One extra dimension per action: the intercept is the last component.
        self.mu = [
            np.zeros(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]

        self.cov = [
            (1.0 / self.lambda_prior) * np.eye(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]

        self.precision = [
            self.lambda_prior * np.eye(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]

        # Inverse Gamma prior for each sigma2_i
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0

        self.a = [self._a0 for _ in range(self.hparams.num_actions)]
        self.b = [self._b0 for _ in range(self.hparams.num_actions)]

        # Number of update() calls seen so far.
        self.t = 0
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=True)

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly.

        Args:
          context: Context for which the action need to be chosen.

        Returns:
          action: Selected action for the context.
        """
        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [
            self.b[i] * invgamma.rvs(self.a[i])
            for i in range(self.hparams.num_actions)
        ]

        try:
            beta_s = [
                np.random.multivariate_normal(self.mu[i],
                                              sigma2_s[i] * self.cov[i])
                for i in range(self.hparams.num_actions)
            ]
        except np.linalg.LinAlgError as e:
            # Sampling could fail if covariance is not positive definite.
            # Exceptions have no `.message` attribute on Python 3, so format
            # the exception itself; otherwise this handler would raise
            # AttributeError instead of falling back.
            print('Exception when sampling from {}.'.format(self.name))
            print('Details: {} | {}.'.format(e, e.args))
            # Fall back to standard-normal samples for every action.
            d = self.hparams.context_dim + 1
            beta_s = [
                np.random.multivariate_normal(np.zeros((d)), np.eye(d))
                for _ in range(self.hparams.num_actions)
            ]

        # Compute sampled expected values, intercept is last component of beta
        vals = [
            np.dot(beta_s[i][:-1], context.T) + beta_s[i][-1]
            for i in range(self.hparams.num_actions)
        ]

        return np.argmax(vals)

    def update(self, context, action, reward):
        """Updates action posterior using the linear Bayesian regression formula.

        Only the posterior of the played action is recomputed, from all the
        data stored for that action.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)

        # Update posterior of action with formulas: \beta | x,y ~ N(mu_q, cov_q)
        x, y = self.data_h.get_data(action)

        # The algorithm could be improved with sequential update formulas (cheaper)
        s = np.dot(x.T, x)

        # Some terms are removed as we assume prior mu_0 = 0.
        precision_a = s + self.lambda_prior * np.eye(self.hparams.context_dim + 1)
        cov_a = np.linalg.inv(precision_a)
        mu_a = np.dot(cov_a, np.dot(x.T, y))

        # Inverse Gamma posterior update
        a_post = self.a0 + x.shape[0] / 2.0
        b_upd = 0.5 * (np.dot(y.T, y) - np.dot(mu_a.T, np.dot(precision_a, mu_a)))
        b_post = self.b0 + b_upd

        # Store new posterior distributions
        self.mu[action] = mu_a
        self.cov[action] = cov_a
        self.precision[action] = precision_a
        self.a[action] = a_post
        self.b[action] = b_post

    @property
    def a0(self):
        """Prior shape parameter of the inverse Gamma noise prior."""
        return self._a0

    @property
    def b0(self):
        """Prior scale parameter of the inverse Gamma noise prior."""
        return self._b0

    @property
    def lambda_prior(self):
        """Precision (lambda) of the Gaussian prior on each beta_i."""
        return self._lambda_prior
class NeuralLinearPosteriorSampling(BanditAlgorithm):
    """Full Bayesian linear regression on the last layer of a deep neural net."""

    def __init__(self, name, hparams, textflag='no', optimizer='RMS'):
        """Initializes priors, sufficient statistics, datasets and the network.

        Args:
          name: Name of the algorithm.
          hparams: Hyper-parameters of the algorithm. Reads layer_sizes,
            num_actions, lambda_prior, a0, b0, training_freq,
            training_freq_network, training_epochs and context_dim here
            (plus batch_size when textflag == 'yes').
          textflag: 'yes' builds a TextCNN as the network; any other value
            builds a NeuralBanditModel.
          optimizer: Optimizer name passed to NeuralBanditModel.
        """
        self.name = name
        self.hparams = hparams
        self.latent_dim = self.hparams.layer_sizes[-1]
        self.intercept = False
        if self.intercept:
            self.param_dim = 1 + self.latent_dim
        else:
            self.param_dim = self.latent_dim

        # Gaussian prior for each beta_i
        self._lambda_prior = self.hparams.lambda_prior

        self.mu = [
            np.zeros(self.param_dim)
            for _ in range(self.hparams.num_actions)
        ]
        # Per-action sufficient statistics: f accumulates z^T y and yy the
        # sum of squared rewards (see update()).
        self.f = [
            np.zeros(self.param_dim)
            for _ in range(self.hparams.num_actions)
        ]
        self.yy = [0 for _ in range(self.hparams.num_actions)]

        self.cov = [
            (1.0 / self.lambda_prior) * np.eye(self.param_dim)
            for _ in range(self.hparams.num_actions)
        ]

        self.precision = [
            self.lambda_prior * np.eye(self.param_dim)
            for _ in range(self.hparams.num_actions)
        ]

        # Inverse Gamma prior for each sigma2_i
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0

        self.a = [self._a0 for _ in range(self.hparams.num_actions)]
        self.b = [self._b0 for _ in range(self.hparams.num_actions)]

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        # data_h stores raw contexts, latent_h their last-layer representation.
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=False)
        self.latent_h = ContextualDataset(self.latent_dim,
                                          hparams.num_actions,
                                          intercept=self.intercept)
        if textflag == 'yes':
            self.bnn = TextCNN('adam', self.hparams.num_actions,
                               self.hparams.batch_size, '{}-bnn'.format(name))
        else:
            self.bnn = NeuralBanditModel(optimizer, hparams,
                                         '{}-bnn'.format(name))

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly.

        Args:
          context: Context for which the action needs to be chosen.

        Returns:
          Index of the selected action.
        """
        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [
            self.b[i] * invgamma.rvs(self.a[i])
            for i in range(self.hparams.num_actions)
        ]

        try:
            beta_s = [
                np.random.multivariate_normal(self.mu[i],
                                              sigma2_s[i] * self.cov[i])
                for i in range(self.hparams.num_actions)
            ]
        except np.linalg.LinAlgError:
            # Sampling could fail if covariance is not positive definite;
            # fall back to standard-normal samples for every action.
            d = self.param_dim
            beta_s = [
                np.random.multivariate_normal(np.zeros((d)), np.eye(d))
                for _ in range(self.hparams.num_actions)
            ]

        # Compute last-layer representation for the current context
        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(self.bnn.nn,
                                          feed_dict={self.bnn.x: c})
            if self.intercept:
                z_context = np.append(z_context, 1.0).reshape(
                    (1, self.latent_dim + 1))

        # Apply Thompson Sampling to last-layer representation
        vals = [
            np.dot(beta_s[i], z_context.T)
            for i in range(self.hparams.num_actions)
        ]
        return np.argmax(vals)

    def update(self, context, action, reward):
        """Updates the posterior using linear bayesian regression formula.

        Every `update_freq_nn` steps the network is retrained and the
        precision / f statistics of all actions are rebuilt from the refreshed
        latent representations; otherwise only the played action's statistics
        get a rank-one update.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)
        c = context.reshape((1, self.hparams.context_dim))
        z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
        self.latent_h.add(z_context, action, reward)

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

            # Update the latent representation of every datapoint collected
            # so far
            new_z = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: self.data_h.contexts})
            self.latent_h.replace_data(contexts=new_z)
            for action_v in range(self.hparams.num_actions):
                # Update action posterior with formulas:
                # \beta | z,y ~ N(mu_q, cov_q)
                z, y = self.latent_h.get_data(action_v)
                # The algorithm could be improved with sequential formulas
                # (cheaper)
                self.precision[action_v] = (
                    np.dot(z.T, z)
                    + self.lambda_prior * np.eye(self.param_dim))
                self.f[action_v] = np.dot(z.T, y)
        else:
            if self.intercept:
                z_context = np.append(z_context, 1.0).reshape(
                    (1, self.latent_dim + 1))
            self.precision[action] += np.dot(z_context.T, z_context)
            self.f[action] += (z_context.T * reward)[:, 0]

        # NOTE(review): only the played action's cov / mu / b are refreshed
        # below, even after a retrain changed precision and f of every action.
        self.yy[action] += reward ** 2
        self.cov[action] = np.linalg.inv(self.precision[action])
        self.mu[action] = np.dot(self.cov[action], self.f[action])

        # Inverse Gamma posterior update
        self.a[action] += 0.5
        b_upd = 0.5 * (self.yy[action]
                       - np.dot(self.mu[action].T,
                                np.dot(self.precision[action],
                                       self.mu[action])))
        self.b[action] = self.b0 + b_upd

    @property
    def a0(self):
        """Prior shape parameter of the inverse Gamma noise prior."""
        return self._a0

    @property
    def b0(self):
        """Prior scale parameter of the inverse Gamma noise prior."""
        return self._b0

    @property
    def lambda_prior(self):
        """Precision (lambda) of the Gaussian prior on each beta_i."""
        return self._lambda_prior

    def calc_model_evidence(self):
        """Returns the model evidence averaged over actions.

        For each action the log evidence is accumulated as
          -(a_n - a_0) * log(2 pi) + loggamma(a_n) - loggamma(a_0)
          + 0.5 * log det(prior precision) - 0.5 * log det(posterior precision)
          + a_0 * log(b_0) - a_n * log(b_n),
        exponentiated with mpmath, and the results are averaged.

        Returns:
          An mpmath number: mean over actions of the per-action evidence.
        """
        # Sets the global mpmath working precision (decimal digits).
        mp.mp.dps = 50

        # The prior precision is lambda * I with the same dimension as the
        # posterior precision (param_dim), so log det = param_dim * log(lambda).
        log_det_prior = self.param_dim * mp.log(self.lambda_prior)

        vval = 0
        for action in range(self.hparams.num_actions):
            val = mp.mpf(mp.fmul(mp.fneg(mp.log(mp.fmul(2.0, mp.pi))),
                                 mp.fsub(self.a[action], self.a0)))
            val += mp.loggamma(self.a[action])
            val -= mp.loggamma(self.a0)
            val += 0.5 * log_det_prior
            # slogdet avoids the overflow / underflow of det() followed by log.
            _, log_det_post = np.linalg.slogdet(self.precision[action])
            val -= 0.5 * float(log_det_post)
            val += mp.fmul(self.a0, mp.log(self.b0))
            val -= mp.fmul(self.a[action], mp.log(self.b[action]))
            vval += mp.exp(val)

        vval /= float(self.hparams.num_actions)
        return vval
class LinearFullPosteriorSampling(BanditAlgorithm):
    """Linear Thompson Sampling with one Bayesian linear model per action.

    The noise variance of each action is unknown and gets its own prior.
    """

    def __init__(self, name, hparams):
        """Sets up priors, per-action posteriors and the data buffer.

        Model for action i: reward = context^T beta_i + noise. beta_i has a
        Gaussian prior controlled by lambda_prior; the noise level sigma2_i
        has an inverse Gamma prior with parameters (a0, b0).

        Args:
          name: Name of the algorithm.
          hparams: Hyper-parameters of the algorithm.
        """
        self.name = name
        self.hparams = hparams

        n_actions = self.hparams.num_actions
        # Context features plus one intercept component.
        dim = self.hparams.context_dim + 1

        # Prior hyper-parameters (Gaussian on beta_i, inverse Gamma on sigma2_i).
        self._lambda_prior = self.hparams.lambda_prior
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0

        # Per-action posterior state, initialised at the prior.
        self.mu = [np.zeros(dim) for _ in range(n_actions)]
        self.cov = [
            (1.0 / self.lambda_prior) * np.eye(dim) for _ in range(n_actions)
        ]
        self.precision = [
            self.lambda_prior * np.eye(dim) for _ in range(n_actions)
        ]
        self.a = [self._a0] * n_actions
        self.b = [self._b0] * n_actions

        self.t = 0
        self.data_h = ContextualDataset(
            hparams.context_dim, hparams.num_actions, intercept=True)

    def action(self, context):
        """Draws one posterior sample per action and plays the best one.

        Args:
          context: Context for which the action needs to be chosen.

        Returns:
          action: Selected action for the context.
        """
        n_actions = self.hparams.num_actions

        # Initial exploration: cycle through the actions "initial_pulls" times.
        if self.t < n_actions * self.hparams.initial_pulls:
            return self.t % n_actions

        # First draw every noise variance, then every beta given its variance.
        sigma2_s = []
        for k in range(n_actions):
            sigma2_s.append(self.b[k] * invgamma.rvs(self.a[k]))

        try:
            beta_s = []
            for k in range(n_actions):
                scaled_cov = sigma2_s[k] * self.cov[k]
                beta_s.append(
                    np.random.multivariate_normal(self.mu[k], scaled_cov))
        except np.linalg.LinAlgError as e:
            # A non positive definite covariance can make the draw fail;
            # resample every action from a standard normal instead.
            print('Exception when sampling from {}: {}.'.format(self.name, e))
            dim = self.hparams.context_dim + 1
            beta_s = []
            for _ in range(n_actions):
                beta_s.append(
                    np.random.multivariate_normal(np.zeros(dim), np.eye(dim)))

        # Expected reward under each sample; the last entry of beta is the
        # intercept.
        vals = []
        for k in range(n_actions):
            weights = beta_s[k][:-1]
            bias = beta_s[k][-1]
            vals.append(np.dot(weights, context.T) + bias)

        return np.argmax(vals)

    def update(self, context, action, reward):
        """Recomputes the posterior of the played action from its data.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)

        # All data stored for this action; posterior is N(mu_q, cov_q) with
        # prior mean 0, so the prior-mean terms drop out.
        feats, targets = self.data_h.get_data(action)
        dim = self.hparams.context_dim + 1

        gram = np.dot(feats.T, feats)
        new_precision = gram + self.lambda_prior * np.eye(dim)
        new_cov = np.linalg.inv(new_precision)
        new_mu = np.dot(new_cov, np.dot(feats.T, targets))

        # Inverse Gamma parameters of the noise posterior.
        quad_form = np.dot(new_mu.T, np.dot(new_precision, new_mu))
        new_a = self.a0 + feats.shape[0] / 2.0
        new_b = self.b0 + 0.5 * (np.dot(targets.T, targets) - quad_form)

        self.mu[action] = new_mu
        self.cov[action] = new_cov
        self.precision[action] = new_precision
        self.a[action] = new_a
        self.b[action] = new_b

    @property
    def a0(self):
        """Prior shape parameter of the inverse Gamma noise prior."""
        return self._a0

    @property
    def b0(self):
        """Prior scale parameter of the inverse Gamma noise prior."""
        return self._b0

    @property
    def lambda_prior(self):
        """Precision (lambda) of the Gaussian prior on each beta_i."""
        return self._lambda_prior
class NeuralLinearEpsilonGreedy(BanditAlgorithm):
    """Epsilon-greedy action selection on the predictions of a neural net."""

    def __init__(self, name, hparams, textflag='yes', optimizer='RMS'):
        """Creates the datasets and the network used for prediction.

        Args:
          name: Name of the algorithm.
          hparams: Hyper-parameters of the algorithm. Reads epsilon,
            layer_sizes, training_freq, training_freq_network,
            training_epochs, context_dim and num_actions here (plus
            batch_size when textflag == 'yes').
          textflag: 'yes' builds a TextCNN as the network; any other value
            builds a NeuralBanditModel.
          optimizer: Optimizer name passed to NeuralBanditModel.
        """
        self.name = name
        self.hparams = hparams
        self.epsilon = self.hparams.epsilon
        self.latent_dim = self.hparams.layer_sizes[-1]
        self.intercept = True
        if self.intercept:
            self.param_dim = 1 + self.latent_dim
        else:
            self.param_dim = self.latent_dim

        # The a0 / b0 / lambda_prior properties read these fields. The
        # epsilon-greedy policy itself never uses them, so fall back to None
        # when hparams does not define them instead of letting the properties
        # fail with AttributeError.
        self._lambda_prior = getattr(hparams, 'lambda_prior', None)
        self._a0 = getattr(hparams, 'a0', None)
        self._b0 = getattr(hparams, 'b0', None)

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        # data_h stores raw contexts, latent_h their last-layer representation.
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=False)
        self.latent_h = ContextualDataset(self.latent_dim,
                                          hparams.num_actions,
                                          intercept=self.intercept)
        if textflag == 'yes':
            self.bnn = TextCNN('adam', self.hparams.num_actions,
                               self.hparams.batch_size, '{}-bnn'.format(name))
        else:
            self.bnn = NeuralBanditModel(optimizer, hparams,
                                         '{}-bnn'.format(name))

    def action(self, context):
        """Chooses the network's best action, or a random one.

        The greedy action (argmax of the network prediction) is returned when
        a uniform draw exceeds epsilon; otherwise a uniformly random action is
        returned.

        Args:
          context: Context for which the action needs to be chosen.

        Returns:
          Index of the selected action.
        """
        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            y = self.bnn.sess.run(self.bnn.y_pred, feed_dict={self.bnn.x: c})

        if random.random() > self.epsilon:
            return np.argmax(y)
        return random.randrange(self.hparams.num_actions)

    def update(self, context, action, reward):
        """Stores the observation and periodically retrains the network.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)

        # Also record the current last-layer representation of the context.
        c = context.reshape((1, self.hparams.context_dim))
        z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
        self.latent_h.add(z_context, action, reward)

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

    @property
    def a0(self):
        """hparams.a0 if defined, else None (unused by this policy)."""
        return self._a0

    @property
    def b0(self):
        """hparams.b0 if defined, else None (unused by this policy)."""
        return self._b0

    @property
    def lambda_prior(self):
        """hparams.lambda_prior if defined, else None (unused by this policy)."""
        return self._lambda_prior
class NeuralGreedy(BanditAlgorithm):
    """Epsilon-greedy neural bandit with a multiplicatively decaying epsilon."""

    def __init__(self, name, hparams, optimizer='RMS'):
        """Creates the data buffer and the network used for prediction.

        Args:
          name: Name of the algorithm.
          hparams: Hyper-parameters of the algorithm. Reads training_freq,
            training_freq_network, training_epochs, context_dim and
            num_actions here.
          optimizer: Optimizer name passed to NeuralBanditModel.
        """
        self.name = name
        # Exploration probability; multiplied by `decay` on every update().
        self.eps = 0.9
        self.decay = 0.99  # computed for 10,000 steps
        self.hparams = hparams

        # The a0 / b0 / lambda_prior properties read these fields. The greedy
        # policy itself never uses them, so fall back to None when hparams
        # does not define them instead of letting the properties fail with
        # AttributeError.
        self._lambda_prior = getattr(hparams, 'lambda_prior', None)
        self._a0 = getattr(hparams, 'a0', None)
        self._b0 = getattr(hparams, 'b0', None)

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=False)
        self.bnn = NeuralBanditModel(optimizer, hparams,
                                     '{}-greedy'.format(name))

    def action(self, context):
        """Chooses a random action with probability eps, else the greedy one.

        Args:
          context: Context for which the action needs to be chosen.

        Returns:
          Index of the selected action.
        """
        # No initial round robin here: random exploration is driven by eps.
        if np.random.random() < self.eps:
            return np.random.choice(range(self.hparams.num_actions))

        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            output = self.bnn.sess.run(self.bnn.y_pred,
                                       feed_dict={self.bnn.x: c})
            return np.argmax(output)

    def update(self, context, action, reward):
        """Decays eps, stores the observation and periodically retrains.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.eps *= self.decay
        self.data_h.add(context, action, reward)

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

    @property
    def a0(self):
        """hparams.a0 if defined, else None (unused by this policy)."""
        return self._a0

    @property
    def b0(self):
        """hparams.b0 if defined, else None (unused by this policy)."""
        return self._b0

    @property
    def lambda_prior(self):
        """hparams.lambda_prior if defined, else None (unused by this policy)."""
        return self._lambda_prior
class ParameterNoiseSampling(BanditAlgorithm):
    """Parameter Noise Sampling algorithm based on adding noise to net params.

    Described in https://arxiv.org/abs/1706.01905
    """

    def __init__(self, name, hparams):
        """Creates the algorithm, and sets up the adaptive Gaussian noise.

        Args:
          name: Name of the algorithm.
          hparams: Hyper-parameters of the algorithm. training_freq,
            training_epochs, context_dim, num_actions and buffer_s are
            required; verbose, noise_std, eps, d_samples and optimizer are
            optional and default to True, 0.005, 0.05, 300 and 'RMS'.
        """
        self.name = name
        self.hparams = hparams
        self.verbose = getattr(self.hparams, 'verbose', True)
        self.noise_std = getattr(self.hparams, 'noise_std', 0.005)
        self.eps = getattr(self.hparams, 'eps', 0.05)
        self.d_samples = getattr(self.hparams, 'd_samples', 300)
        self.optimizer = getattr(self.hparams, 'optimizer', 'RMS')

        # keep track of noise heuristic statistics
        self.std_h = [self.noise_std]
        self.eps_h = [self.eps]
        self.kl_h = []
        self.t = 0

        self.freq_update = hparams.training_freq
        self.num_epochs = hparams.training_epochs

        self.data_h = ContextualDataset(hparams.context_dim, hparams.num_actions,
                                        hparams.buffer_s)

        self.bnn = NeuralBanditModel(self.optimizer, hparams, '{}-bnn'.format(name))

        # The ops below are attached to self.bnn as extra attributes and are
        # chained with control dependencies: sample noise -> add it to the
        # variables -> recompute the prediction -> subtract the noise again.
        with self.bnn.graph.as_default():
            # noise-injection std placeholder
            self.bnn.noise_std_ph = tf.placeholder(tf.float32, shape=())

            # create noise corruption op; adds noise to all weights
            tvars = tf.trainable_variables()
            self.bnn.noisy_grads = [
                tf.random_normal(v.get_shape(), 0, self.bnn.noise_std_ph)
                for v in tvars
            ]

            # add noise to all params, then compute prediction, then subtract.
            with tf.control_dependencies(self.bnn.noisy_grads):
                self.bnn.noise_add_ops = [
                    tvars[i].assign_add(n) for i, n in enumerate(self.bnn.noisy_grads)
                ]
                with tf.control_dependencies(self.bnn.noise_add_ops):
                    # we force the prediction for 'y' to be recomputed after adding noise
                    self.bnn.noisy_nn, self.bnn.noisy_pred_val = self.bnn.forward_pass()

                    self.bnn.noisy_pred = tf.identity(self.bnn.noisy_pred_val)
                    with tf.control_dependencies([tf.identity(self.bnn.noisy_pred)]):
                        # undo the perturbation by adding the negated noise tensors
                        self.bnn.noise_sub_ops = [
                            tvars[i].assign_add(-n)
                            for i, n in enumerate(self.bnn.noisy_grads)
                        ]

    def action(self, context):
        """Selects action based on Thompson Sampling *after* adding noise.

        Args:
          context: Context for which the action needs to be chosen.

        Returns:
          Index of the selected action.
        """

        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            # round robin until each action has been taken "initial_pulls" times
            return self.t % self.hparams.num_actions

        with self.bnn.graph.as_default():
            # run noise prediction op to choose action, and subtract noise op after.
            c = context.reshape((1, self.hparams.context_dim))
            output, _ = self.bnn.sess.run(
                [self.bnn.noisy_pred, self.bnn.noise_sub_ops],
                feed_dict={self.bnn.x: c,
                           self.bnn.noise_std_ph: self.noise_std})
            return np.argmax(output)

    def update(self, context, action, reward):
        """Updates the data buffer, and re-trains the BNN and noise level.

        Training and the noise update both happen every `freq_update` steps.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """

        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.freq_update == 0:
            self.bnn.train(self.data_h, self.num_epochs)
            self.update_noise()

    def update_noise(self):
        """Increase noise if distance btw original and corrupted distrib small.

        noise_std is scaled up by 1.01 when the empirical KL is below the
        threshold delta and scaled down by 1.01 otherwise; eps is multiplied
        by 0.99 on every call. The new std, KL and eps are appended to the
        history lists.
        """

        kl = self.compute_distance()
        # threshold derived from the current eps and the number of actions
        delta = -np.log1p(- self.eps + self.eps / self.hparams.num_actions)

        if kl < delta:
            self.noise_std *= 1.01
        else:
            self.noise_std /= 1.01

        self.eps *= 0.99

        if self.verbose:
            print('Update eps={} | kl={} | std={} | delta={} | increase={}.'.format(
                self.eps, kl, self.noise_std, delta, kl < delta))

        # store noise-injection statistics for inspection: std, KL, eps.
        self.std_h.append(self.noise_std)
        self.kl_h.append(kl)
        self.eps_h.append(self.eps)

    def compute_distance(self):
        """Computes empirical KL for original and corrupted output distributions.

        Returns:
          The value of compute_kl_with_logits on the clean and the noisy
          predictions for a batch drawn from the data buffer.
        """

        # presumably a batch of d_samples stored contexts -- verify against
        # ContextualDataset.get_batch
        random_inputs, _ = self.data_h.get_batch(self.d_samples)
        # predictions of the unperturbed network
        y_model = self.bnn.sess.run(
            self.bnn.y_pred,
            feed_dict={
                self.bnn.x: random_inputs,
                self.bnn.noise_std_ph: self.noise_std
            })
        # predictions after noise injection; the subtract ops run in the same call
        y_noisy, _ = self.bnn.sess.run(
            [self.bnn.noisy_pred, self.bnn.noise_sub_ops],
            feed_dict={
                self.bnn.x: random_inputs,
                self.bnn.noise_std_ph: self.noise_std
            })

        if self.verbose:
            # display how often original & perturbed models propose different actions
            s = np.sum([np.argmax(y_model[i, :]) == np.argmax(y_noisy[i, :])
                        for i in range(y_model.shape[0])])
            # NOTE(review): the printed value is a fraction of d_samples, not
            # a percentage, and the result of `/` depends on the Python version.
            print('{} | % of agreement btw original / corrupted actions: {}.'.format(
                self.name, s / self.d_samples))

        kl = self.compute_kl_with_logits(y_model, y_noisy)
        return kl

    def compute_kl_with_logits(self, logits1, logits2):
        """Computes KL from logits samples from two distributions.

        Per row this evaluates
          sum(exp(l1) * (l1 - l2)) / sum(exp(l1)) + logsumexp(l2) - logsumexp(l1),
        i.e. KL(softmax(l1) || softmax(l2)), clips negative values to 0 and
        replaces NaNs via np.nan_to_num.

        Args:
          logits1: 2-D array of logits, reduced along axis 1.
          logits2: 2-D array of logits, reduced along axis 1.

        Returns:
          Mean of the per-row KL values.
        """

        def exp_times_diff(a, b):
            return np.multiply(np.exp(a), a - b)

        logsumexp1 = logsumexp(logits1, axis=1)
        logsumexp2 = logsumexp(logits2, axis=1)
        logsumexp_diff = logsumexp2 - logsumexp1

        exp_diff = exp_times_diff(logits1, logits2)
        exp_diff = np.sum(exp_diff, axis=1)

        inv_exp_sum = np.sum(np.exp(logits1), axis=1)
        term1 = np.divide(exp_diff, inv_exp_sum)

        kl = term1 + logsumexp_diff
        kl = np.maximum(kl, 0.0)
        kl = np.nan_to_num(kl)
        return np.mean(kl)
class NeuralLinUCB(BanditAlgorithm):
    """LinUCB on the last-layer representation of a neural network.

    Keeps, for every action, a regularised design matrix `a`, its inverse
    `inv_a`, a response vector `b` and the ridge estimate
    `theta = inv_a . b`.
    """

    def __init__(self, name, hparams, optimizer='RMS'):
        """Initializes the per-action statistics, data buffer and network.

        Args:
          name: Name of the algorithm.
          hparams: Hyper-parameters of the algorithm. Reads num_actions,
            layer_sizes, alpha, lam, training_freq_network, training_epochs
            and context_dim here; bootstrap is optional (defaults to None).
          optimizer: Optimizer name passed to NeuralBanditModel.
        """
        self.name = name
        self.hparams = hparams

        self.n_a = self.hparams.num_actions
        self.n_d = self.hparams.layer_sizes[-1]
        self.alpha = self.hparams.alpha
        self.lam = self.hparams.lam

        # One (n_d, n_d) identity per action, shape (n_a, n_d, n_d).
        identity_stack = np.tile(np.eye(self.n_d), (self.n_a, 1, 1))
        self.a = identity_stack * self.lam
        self.inv_a = identity_stack / self.lam

        self.b = np.zeros((self.n_a, self.n_d))
        self.theta = np.zeros((self.n_a, self.n_d))

        # Params for BNN
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        bootstrap=getattr(hparams, 'bootstrap',
                                                          None),
                                        intercept=False)
        self.bnn = NeuralBanditModel(optimizer, hparams,
                                     '{}-bnn'.format(name))

    def action(self, context):
        """Selects the action with the highest upper confidence bound.

        The score of action i is
          theta_i . z + alpha * sqrt(z^T inv_a_i z),
        where z is the network's last-layer representation of the context.

        Args:
          context: Context for which the action need to be chosen.

        Returns:
          action: Selected action for the context.
        """
        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: c}).flatten()

        vals = np.array([
            np.dot(self.theta[i], z_context)
            + self.alpha * np.sqrt(
                np.dot(z_context, np.dot(self.inv_a[i], z_context)))
            for i in range(self.n_a)
        ])
        return np.argmax(vals)

    def update(self, context, action, reward):
        """Updates the LinUCB statistics with the last observation.

        Every `update_freq_nn` steps the network is retrained and the
        statistics of all actions are rebuilt from the refreshed
        representations of the whole dataset; otherwise only the played
        action receives a rank-one update.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

            # Latent representation of every datapoint under the new network.
            z_all = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: self.data_h.contexts})
            actions = np.array(self.data_h.actions)
            # data_h.rewards is indexed as (sample, action); keep only the
            # reward of the action pulled for each sample.
            rewards = self.data_h.rewards[np.arange(actions.shape[0]), actions]

            for a_idx in range(self.n_a):
                # Use only the samples where this action was played, so the
                # rebuilt matrices agree with `b` and with the incremental
                # branch below. An action without samples keeps lam * I.
                played = actions == a_idx
                z_a = z_all[played]
                self.a[a_idx] = (np.dot(z_a.T, z_a)
                                 + self.lam * np.eye(self.n_d))
                self.b[a_idx] = np.dot(rewards[played], z_a)
                self.inv_a[a_idx] = np.linalg.inv(self.a[a_idx])
                self.theta[a_idx] = np.dot(self.inv_a[a_idx], self.b[a_idx])
        else:
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: c}).flatten()

            # Rank-one update of the played action only.
            self.a[action] = self.a[action] + np.outer(z_context, z_context)
            self.inv_a[action] = np.linalg.inv(self.a[action])
            self.b[action] = self.b[action] + reward * z_context
            self.theta[action] = np.dot(self.inv_a[action], self.b[action])