def run_experiments(self, iterations=10):
    """Repeat the contextual-bandit experiment and accumulate results.

    Each iteration draws a fresh dataset from ``self.dataset_proto`` and
    fresh algorithm instances from ``self.algo_protos``, runs them through
    ``run_contextual_bandit``, and records cumulative reward and regret.

    Args:
        iterations: number of independent repetitions to run.

    Side effects:
        Stores arrays of shape (nb_contexts, n_algos, iterations) in
        ``self.cum_rew`` and ``self.cum_reg``.
    """
    cum_rew = np.zeros((self.nb_contexts, len(self.algo_protos), iterations))
    cum_reg = np.zeros(cum_rew.shape)
    # `it` instead of `iter`: the original shadowed the builtin.
    for it in range(iterations):
        print(str(it + 1), '/', str(iterations))
        t_init = time.time()
        dataset, opt_linear = self.dataset_proto()
        print('dataset created')
        opt_rewards, opt_actions = opt_linear
        # Fresh algorithm instances so no state leaks between iterations.
        algos = [algo_proto() for algo_proto in self.algo_protos]
        print('algo ready')
        outcome = run_contextual_bandit(self.context_dim, self.num_actions,
                                        dataset, algos)
        h_actions, h_rewards = outcome
        cum_rew[:, :, it] = np.cumsum(h_rewards, axis=0)
        # Regret = cumulative optimal reward minus cumulative achieved reward,
        # broadcast across the algorithm axis.
        cum_reg[:, :, it] = (np.cumsum(opt_rewards)[:, np.newaxis]
                             - cum_rew[:, :, it])
        # Restored timing log; the original commented-out version mixed
        # %-interpolation with {}-placeholders and mislabeled seconds as ms.
        print('Iter {} took {:.2f} s'.format(it, time.time() - t_init))
    self.cum_rew = cum_rew
    self.cum_reg = cum_reg
def main():
    """Run the NeuralLinear agent on the mushroom bandit and save its rewards."""
    data_type = 'mushroom'
    num_contexts = 2000

    # Sample the mushroom problem and its optimal-play baseline.
    # NOTE: `file_name` is resolved from the enclosing module scope.
    dataset, opt_mushroom = sample_mushroom_data(file_name, num_contexts)
    opt_rewards, opt_actions = opt_mushroom
    context_dim = 117
    num_actions = 2

    # Hyperparameters for the NeuralLinear posterior-sampling agent.
    hp_nlinear = HyperParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        layer_sizes=[50],
        batch_size=512,
        activate_decay=True,
        initial_lr=0.005,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=50,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=0.25,
        keep_prob=1.0,
        global_step=50)

    algos = [NeuralLinearPosteriorSampling('NeuralLinear', hp_nlinear)]

    t_init = time.time()
    print(context_dim, num_actions)
    bandit_outcome = run_contextual_bandit(context_dim, num_actions,
                                           dataset, algos)
    _, h_rewards = bandit_outcome

    np.save("mushroom_rewards.npy", h_rewards)
    display_results(algos, opt_rewards, opt_actions, h_rewards, t_init,
                    data_type)
def Run(context_dim, num_actions, dataset, algos, opt_rewards, opt_actions, data_type):
    """Execute one bandit run, display it, and return per-algorithm total rewards.

    Returns:
        A list with one entry per algorithm: the sum of rewards it collected.
    """
    t_init = time.time()
    _, h_rewards = run_contextual_bandit(context_dim, num_actions, dataset, algos)

    # Print the standard summary for this run.
    display_results(algos, opt_rewards, opt_actions, h_rewards, t_init, data_type)

    # Total reward gathered by each algorithm (column j of h_rewards).
    return [np.sum(h_rewards[:, j]) for j in range(len(algos))]
def main():
    """Grid-search NeuralLinear hyperparameters on VAE features of MNIST.

    Bug fixes versus the original:
      * ``for base_lrs in base_lrs:`` rebound the list being iterated;
        the loop variable is now ``base_lr``.
      * ``for mode in modes`` was missing its trailing colon (SyntaxError).
      * The swept ``init_lr`` / ``base_lr`` values are now actually passed
        to HyperParams (the body previously hard-coded ``initial_lr=0.1``).
    """
    data_type = 'mnist'
    vae_data = get_vae_features()
    features, rewards, opt_vals = construct_dataset_from_features(vae_data)
    dataset = np.hstack((features, rewards))
    context_dim = features.shape[1]
    num_actions = 10

    # Hyperparameter grid.
    # NOTE(review): batch_sizes and layer_sizes are declared but not swept
    # below — confirm whether they should join the grid.
    init_lrs = [0.001, 0.0025, 0.005, 0.01]
    base_lrs = [0.0005, 0.001]
    modes = ["triangular", "triangular2", "exp_range"]
    batch_sizes = [32, 128, 512]
    layer_sizes = [[50, 50], [100, 100], [100]]

    for init_lr in init_lrs:
        for base_lr in base_lrs:
            for mode in modes:
                hp_nlinear = HyperParams(num_actions=num_actions,
                                         context_dim=context_dim,
                                         init_scale=0.3,
                                         layer_sizes=[50, 50],
                                         batch_size=32,
                                         activate_decay=True,
                                         initial_lr=init_lr,
                                         base_lr=base_lr,
                                         max_grad_norm=5.0,
                                         show_training=False,
                                         freq_summary=1000,
                                         buffer_s=-1,
                                         initial_pulls=2,
                                         reset_lr=True,
                                         lr_decay_rate=0.5,
                                         training_freq=1,
                                         training_freq_network=50,
                                         training_epochs=100,
                                         a0=6,
                                         b0=6,
                                         lambda_prior=0.25,
                                         keep_prob=1.0,
                                         global_step=1,
                                         mode=mode)
                algos = [NeuralLinearPosteriorSampling('NeuralLinear',
                                                       hp_nlinear)]

                # Run the contextual bandit experiment for this grid point.
                print(context_dim, num_actions)
                results = run_contextual_bandit(context_dim, num_actions,
                                                dataset, algos)
                actions, rewards = results
                # NOTE(review): every grid point overwrites the same file;
                # consider keying the filename on (init_lr, base_lr, mode).
                np.save("results.npy", rewards)
def run_trial(process):
    """Run this worker's share of the hyperparameter combos (round-robin split).

    Combo ``idx`` is handled by the worker whose index equals
    ``idx % num_processes``. Relies on module-level globals: ``combos``,
    ``num_processes``, ``num_actions``, ``context_dim``, ``dataset``, ``mode``.

    Args:
        process: zero-based index of this worker process.
    """
    for idx, combo in enumerate(combos):
        if idx % num_processes != process:
            continue  # this combo belongs to another worker

        # Bug fix: the original passed %-style placeholders as extra print()
        # arguments, printing them literally instead of formatting.
        print('running combo %d: %s' % (idx, combo))

        # Hyperparameters for this grid point.
        hp_nlinear = HyperParams(num_actions=num_actions,
                                 context_dim=context_dim,
                                 init_scale=0.3,
                                 layer_sizes=combo["layer_size"],
                                 batch_size=combo["batch_size"],
                                 activate_decay=True,
                                 initial_lr=combo["init_lr"],
                                 base_lr=combo["base_lr"],
                                 max_grad_norm=5.0,
                                 show_training=False,
                                 freq_summary=1000,
                                 buffer_s=-1,
                                 initial_pulls=2,
                                 reset_lr=True,
                                 lr_decay_rate=0.5,
                                 training_freq=1,
                                 training_freq_network=combo["training_freq"],
                                 training_epochs=100,
                                 a0=6,
                                 b0=6,
                                 lambda_prior=0.25,
                                 keep_prob=1.0,
                                 global_step=1,
                                 mode=mode)
        algos = [NeuralLinearPosteriorSampling('NeuralLinear', hp_nlinear)]

        # Run the contextual bandit experiment for this combo.
        print(context_dim, num_actions)
        results = run_contextual_bandit(context_dim, num_actions, dataset,
                                        algos)
        actions, rewards = results
        np.save(mode + "results" + str(idx) + ".npy", rewards)
def main(_):
    """Benchmark the full suite of bandit algorithms on one sampled dataset.

    Samples a contextual-bandit problem ('mushroom' by default), builds one
    HParams bundle per algorithm family, runs all algorithms jointly through
    run_contextual_bandit, and prints a summary of the results.
    """
    # Problem parameters
    num_contexts = 2000

    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    data_type = 'mushroom'

    # Create dataset
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

    # Define hyperparameters and algorithms.
    # Minimal bundle for model-free baselines (uniform / fixed policies).
    hparams = tf.contrib.training.HParams(num_actions=num_actions)

    # Bayesian linear regression posterior sampling.
    hparams_linear = tf.contrib.training.HParams(num_actions=num_actions,
                                                 context_dim=context_dim,
                                                 a0=6,
                                                 b0=6,
                                                 lambda_prior=0.25,
                                                 initial_pulls=2)

    # RMSProp-trained network; also reused by the bootstrapped sampler.
    hparams_rms = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=[50],
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              reset_lr=True,
                                              lr_decay_rate=0.5,
                                              training_freq=50,
                                              training_epochs=100,
                                              p=0.95,
                                              q=3)

    # Dropout-as-posterior network.
    hparams_dropout = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=[50],
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  optimizer='RMS',
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=50,
                                                  training_epochs=100,
                                                  use_dropout=True,
                                                  keep_prob=0.80)

    # Bayes-by-Backprop variational network.
    hparams_bbb = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=[50],
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              use_sigma_exp_transform=True,
                                              cleared_times_trained=10,
                                              initial_training_steps=100,
                                              noise_sigma=0.1,
                                              reset_lr=False,
                                              training_freq=50,
                                              training_epochs=100)

    # Neural-linear: network features + Bayesian linear head, trained often.
    hparams_nlinear = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=[50],
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=1,
                                                  training_freq_network=50,
                                                  training_epochs=100,
                                                  a0=6,
                                                  b0=6,
                                                  lambda_prior=0.25)

    # Same as above but with a sparser linear-head update (training_freq=10).
    hparams_nlinear2 = tf.contrib.training.HParams(num_actions=num_actions,
                                                   context_dim=context_dim,
                                                   init_scale=0.3,
                                                   activation=tf.nn.relu,
                                                   layer_sizes=[50],
                                                   batch_size=512,
                                                   activate_decay=True,
                                                   initial_lr=0.1,
                                                   max_grad_norm=5.0,
                                                   show_training=False,
                                                   freq_summary=1000,
                                                   buffer_s=-1,
                                                   initial_pulls=2,
                                                   reset_lr=True,
                                                   lr_decay_rate=0.5,
                                                   training_freq=10,
                                                   training_freq_network=50,
                                                   training_epochs=100,
                                                   a0=6,
                                                   b0=6,
                                                   lambda_prior=0.25)

    # Parameter-noise exploration.
    hparams_pnoise = tf.contrib.training.HParams(num_actions=num_actions,
                                                 context_dim=context_dim,
                                                 init_scale=0.3,
                                                 activation=tf.nn.relu,
                                                 layer_sizes=[50],
                                                 batch_size=512,
                                                 activate_decay=True,
                                                 initial_lr=0.1,
                                                 max_grad_norm=5.0,
                                                 show_training=False,
                                                 freq_summary=1000,
                                                 buffer_s=-1,
                                                 initial_pulls=2,
                                                 optimizer='RMS',
                                                 reset_lr=True,
                                                 lr_decay_rate=0.5,
                                                 training_freq=50,
                                                 training_epochs=100,
                                                 noise_std=0.05,
                                                 eps=0.1,
                                                 d_samples=300,
                                                 )

    # Black-box alpha-divergence minimization.
    hparams_alpha_div = tf.contrib.training.HParams(num_actions=num_actions,
                                                    context_dim=context_dim,
                                                    init_scale=0.3,
                                                    activation=tf.nn.relu,
                                                    layer_sizes=[50],
                                                    batch_size=512,
                                                    activate_decay=True,
                                                    initial_lr=0.1,
                                                    max_grad_norm=5.0,
                                                    show_training=False,
                                                    freq_summary=1000,
                                                    buffer_s=-1,
                                                    initial_pulls=2,
                                                    optimizer='RMS',
                                                    use_sigma_exp_transform=True,
                                                    cleared_times_trained=10,
                                                    initial_training_steps=100,
                                                    noise_sigma=0.1,
                                                    reset_lr=False,
                                                    training_freq=50,
                                                    training_epochs=100,
                                                    alpha=1.0,
                                                    k=20,
                                                    prior_variance=0.1)

    # Multitask Gaussian process.
    hparams_gp = tf.contrib.training.HParams(num_actions=num_actions,
                                             num_outputs=num_actions,
                                             context_dim=context_dim,
                                             reset_lr=False,
                                             learn_embeddings=True,
                                             max_num_points=1000,
                                             show_training=False,
                                             freq_summary=1000,
                                             batch_size=512,
                                             keep_fixed_after_max_obs=True,
                                             training_freq=50,
                                             initial_pulls=2,
                                             training_epochs=100,
                                             lr=0.01,
                                             buffer_s=-1,
                                             initial_lr=0.001,
                                             lr_decay_rate=0.0,
                                             optimizer='RMS',
                                             task_latent_dim=5,
                                             activate_decay=False)

    # Instantiate every algorithm to benchmark side by side.
    algos = [
        UniformSampling('Uniform Sampling', hparams),
        UniformSampling('Uniform Sampling 2', hparams),
        FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
        FixedPolicySampling('fixed2', [0.25, 0.75], hparams),
        PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
        PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
        PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
        NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
        NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
        LinearFullPosteriorSampling('LinFullPost', hparams_linear),
        BootstrappedBNNSampling('BootRMS', hparams_rms),
        ParameterNoiseSampling('ParamNoise', hparams_pnoise),
        PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
        PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),
    ]

    # Run contextual bandit problem
    t_init = time.time()
    results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
    _, h_rewards = results

    # Display results
    display_results(algos, opt_rewards, opt_actions, h_rewards, t_init,
                    data_type)
def main(_):
    """Run a FLAGS-configured benchmark featuring the fSVGD sampler.

    Seeds numpy/TF from FLAGS.seed, stamps the log directory with the current
    time, samples the FLAGS-selected bandit, and runs the enabled algorithms.
    """
    # Make runs reproducible for a given FLAGS.seed.
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    # Timestamped subdirectory for this run's logs.
    dt = datetime.datetime.now()
    timestr = '{}-{}-{}-{}'.format(dt.month, dt.day, dt.hour, dt.minute)
    FLAGS.logdir = os.path.join(FLAGS.logdir, timestr)

    # Problem parameters
    num_contexts = FLAGS.num_context

    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    data_type = FLAGS.bandit

    # Create dataset
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

    # Hidden-layer sizes come from a comma-separated flag, e.g. "50,50".
    layer_sizes = [int(i) for i in FLAGS.layers.split(',')]

    # Define hyperparameters and algorithms
    hparams = tf.contrib.training.HParams(num_actions=num_actions)

    # RMSProp-trained network; also reused by the bootstrapped sampler.
    hparams_rms = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=layer_sizes,
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=2000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              reset_lr=True,
                                              lr_decay_rate=0.5,
                                              training_freq=50,
                                              training_epochs=100,
                                              p=0.95,
                                              q=20)

    # Dropout-as-posterior network (currently disabled in `algos`).
    hparams_dropout = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=layer_sizes,
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=2000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  optimizer='RMS',
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=50,
                                                  training_epochs=100,
                                                  use_dropout=True,
                                                  keep_prob=0.80)

    # Bayes-by-Backprop variational network.
    hparams_bbb = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=layer_sizes,
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=True,
                                              freq_summary=2000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              use_sigma_exp_transform=True,
                                              cleared_times_trained=20,
                                              initial_training_steps=2000,
                                              noise_sigma=0.1,
                                              reset_lr=False,
                                              training_freq=50,
                                              training_epochs=100)

    # Multitask Gaussian process.
    hparams_gp = tf.contrib.training.HParams(num_actions=num_actions,
                                             num_outputs=num_actions,
                                             context_dim=context_dim,
                                             reset_lr=False,
                                             learn_embeddings=True,
                                             max_num_points=1000,
                                             show_training=False,
                                             freq_summary=2000,
                                             batch_size=512,
                                             keep_fixed_after_max_obs=True,
                                             training_freq=50,
                                             initial_pulls=2,
                                             training_epochs=100,
                                             lr=0.01,
                                             buffer_s=-1,
                                             initial_lr=0.001,
                                             lr_decay_rate=0.0,
                                             optimizer='RMS',
                                             task_latent_dim=5,
                                             activate_decay=False)

    # Functional SVGD sampler; several knobs come straight from FLAGS.
    hparams_fsvgd = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        # activation=tf.nn.relu,
        layer_sizes=layer_sizes,
        batch_size=512,
        activate_decay=False,
        initial_lr=0.1,
        lr=FLAGS.lr,
        n_mm_sample=4,
        mm_n_particles=40,
        mm_jitter=FLAGS.mm_jitter,
        # max_grad_norm=5.0,
        show_training=True,
        freq_summary=2000,
        buffer_s=-1,
        initial_pulls=2,
        optimizer='Adam',
        use_sigma_exp_transform=True,
        cleared_times_trained=20,
        initial_training_steps=2000,
        noise_sigma=0.1,
        reset_lr=False,
        training_freq=50,
        training_epochs=100,
        n_particles=20,
        interp_batch_size=FLAGS.interp_batch_size,
        prior_variance=FLAGS.prior_variance)

    # Only the uncommented algorithms take part in this comparison.
    algos = [
        UniformSampling('Uniform Sampling', hparams),
        # UniformSampling('Uniform Sampling 2', hparams),
        # FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
        # FixedPolicySampling('fixed2', [0.25, 0.75], hparams),
        PosteriorBNNSampling('fSVGD', hparams_fsvgd, 'SVGD'),
        # PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
        # PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
        PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
        # NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
        # NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
        # LinearFullPosteriorSampling('LinFullPost', hparams_linear),
        BootstrappedBNNSampling('BootRMS', hparams_rms),
        # ParameterNoiseSampling('ParamNoise', hparams_pnoise),
        # PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
        PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),
    ]

    # Run contextual bandit problem.
    # NOTE: this variant of run_contextual_bandit also receives opt_rewards.
    t_init = time.time()
    results = run_contextual_bandit(context_dim, num_actions, dataset, algos,
                                    opt_rewards)
    _, h_rewards = results

    # Display results
    display_results(algos, opt_rewards, opt_actions, h_rewards, t_init,
                    data_type)
def main(_):
    """Compare finite-memory neural-linear variants over repeated simulations.

    Runs `nb_simulations` independent runs on the 'adult' bandit, collecting
    per-algorithm cumulative regret and total reward, then saves a regret
    plot and a rewards dict to disk.
    """
    # Problem parameters
    num_contexts = 20000
    nb_simulations = 2
    l_sizes = [50, 50]
    plt_dir = "plots/"
    dict_dir = "dicts/"

    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    data_type = 'adult'

    # Create dataset
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

    # Define hyperparameters and algorithms
    hparams = tf.contrib.training.HParams(num_actions=num_actions)

    # Bayesian linear regression posterior sampling.
    hparams_linear = tf.contrib.training.HParams(num_actions=num_actions,
                                                 context_dim=context_dim,
                                                 a0=6,
                                                 b0=6,
                                                 lambda_prior=0.25,
                                                 initial_pulls=2)

    # Dropout-as-posterior network.
    hparams_dropout = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=l_sizes,
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  optimizer='RMS',
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=50,
                                                  training_epochs=100,
                                                  use_dropout=True,
                                                  keep_prob=0.80)

    # Bayes-by-Backprop variational network.
    hparams_bbb = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=l_sizes,
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              use_sigma_exp_transform=True,
                                              cleared_times_trained=10,
                                              initial_training_steps=100,
                                              noise_sigma=0.1,
                                              reset_lr=False,
                                              training_freq=50,
                                              training_epochs=100)

    # Standard (unbounded-memory) neural-linear baseline.
    hparams_nlinear = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=l_sizes,
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=1,
                                                  training_freq_network=50,
                                                  training_epochs=100,
                                                  a0=6,
                                                  b0=6,
                                                  lambda_prior=0.25)

    # Same but with sparser linear-head updates (training_freq=10).
    hparams_nlinear2 = tf.contrib.training.HParams(num_actions=num_actions,
                                                   context_dim=context_dim,
                                                   init_scale=0.3,
                                                   activation=tf.nn.relu,
                                                   layer_sizes=l_sizes,
                                                   batch_size=512,
                                                   activate_decay=True,
                                                   initial_lr=0.1,
                                                   max_grad_norm=5.0,
                                                   show_training=False,
                                                   freq_summary=1000,
                                                   buffer_s=-1,
                                                   initial_pulls=2,
                                                   reset_lr=True,
                                                   lr_decay_rate=0.5,
                                                   training_freq=10,
                                                   training_freq_network=50,
                                                   training_epochs=100,
                                                   a0=6,
                                                   b0=6,
                                                   lambda_prior=0.25)

    # Finite-memory neural-linear: replay buffer of mem points, with both
    # mean (mu_prior_flag) and covariance (sigma_prior_flag) priors enabled.
    hparams_nlinear_finite_memory = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=l_sizes,
        batch_size=64,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=50,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=num_actions * 100,
        mu_prior_flag=1,
        sigma_prior_flag=1)

    # Ablation: finite memory with neither prior.
    hparams_nlinear_finite_memory_no_prior = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=l_sizes,
        batch_size=64,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=50,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=num_actions * 100,
        mu_prior_flag=0,
        sigma_prior_flag=0)

    # Ablation: finite memory with mean prior only.
    hparams_nlinear_finite_memory_no_sig_prior = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=l_sizes,
        batch_size=64,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=50,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=num_actions * 100,
        mu_prior_flag=1,
        sigma_prior_flag=0)

    # Neural-UCB variant (currently disabled in `algos`).
    hparams_ucb = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=l_sizes,
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              reset_lr=True,
                                              lr_decay_rate=0.5,
                                              optimizer='RMS',
                                              training_freq=1,
                                              training_freq_network=50,
                                              training_epochs=100,
                                              lambda_prior=0.25,
                                              delta=0.01,
                                              lamb=0.01,
                                              mu=1,
                                              S=1)

    # This first `algos` list is only used to key the result dictionaries
    # below; fresh instances are rebuilt inside the simulation loop.
    algos = [
        #UniformSampling('Uniform Sampling', hparams),
        #FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
        PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
        PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
        NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
        #NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
        LinearFullPosteriorSampling('LinFullPost', hparams_linear),
        NeuralLinearPosteriorSamplingFiniteMemory(
            'NeuralLinearFiniteMemory', hparams_nlinear_finite_memory),
        NeuralLinearPosteriorSamplingFiniteMemory(
            'NeuralLinearFiniteMemory_noP',
            hparams_nlinear_finite_memory_no_prior),
        #NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory_noSigP', hparams_nlinear_finite_memory_no_sig_prior),
        #NeuralUCBSampling('NeuralUCB', hparams_ucb)
    ]

    # Per-algorithm result containers across simulations.
    regrets = {}
    rewards = {}
    for a in algos:
        regrets[a.name] = np.zeros((nb_simulations, num_contexts))
        rewards[a.name] = np.zeros(nb_simulations)
    rewards['opt_reward'] = np.zeros(nb_simulations)

    for k in range(nb_simulations):
        # Rebuild fresh algorithm instances for each simulation so no
        # learned state carries over between runs.
        algos = [
            #UniformSampling('Uniform Sampling', hparams),
            #FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
            PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
            PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
            NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
            #NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
            LinearFullPosteriorSampling('LinFullPost', hparams_linear),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory', hparams_nlinear_finite_memory),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory_noP',
                hparams_nlinear_finite_memory_no_prior),
            #NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory_noSigP', hparams_nlinear_finite_memory_no_sig_prior),
            #NeuralUCBSampling('NeuralUCB', hparams_ucb)
        ]

        # Run contextual bandit problem
        t_init = time.time()
        results = run_contextual_bandit(context_dim, num_actions, dataset,
                                        algos)
        _, h_rewards = results

        # Display results
        display_results(algos, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type)

        # Accumulate cumulative regret and total reward for this simulation.
        for j, a in enumerate(algos):
            regrets[a.name][k, :] = np.cumsum(opt_rewards - h_rewards[:, j])
            rewards[a.name][k] = np.sum(h_rewards[:, j])
        rewards['opt_reward'][k] = np.sum(opt_rewards)

    # Persist the regret curves and the raw reward totals.
    save_plot(algos, regrets, data_type, num_contexts, plt_dir)
    np.save(dict_dir + 'dict_' + data_type + '.npy', rewards)
def run_iter():
    """Run one full comparison (incl. LU-Gaussian / LU-SIVI) and return regret.

    Returns:
        regret_i: array of per-step regret, shape (num_contexts, n_algos).

    NOTE(review): `num_contexts` is read from an enclosing/module scope, not
    defined here — confirm it is set before this is called.
    """
    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    data_type = FLAGS.dataset

    # Create dataset
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

    # Define hyperparameters and algorithms
    hparams = tf.contrib.training.HParams(num_actions=num_actions)

    # Bayesian linear regression posterior sampling.
    hparams_linear = tf.contrib.training.HParams(num_actions=num_actions,
                                                 context_dim=context_dim,
                                                 a0=6,
                                                 b0=6,
                                                 lambda_prior=0.25,
                                                 initial_pulls=2)

    # RMSProp-trained network (not currently in `algos`).
    hparams_rms = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=[50],
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              reset_lr=True,
                                              lr_decay_rate=0.5,
                                              training_freq=50,
                                              training_epochs=100,
                                              p=0.95,
                                              q=3)

    # Bayes-by-Backprop variational network (also reused for DGF).
    hparams_bbb = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=[50],
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              use_sigma_exp_transform=True,
                                              cleared_times_trained=10,
                                              initial_training_steps=100,
                                              noise_sigma=0.1,
                                              reset_lr=False,
                                              training_freq=50,
                                              training_epochs=100)

    # Neural-linear baseline.
    hparams_nlinear = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=[50],
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=1,
                                                  training_freq_network=50,
                                                  training_epochs=100,
                                                  a0=6,
                                                  b0=6,
                                                  lambda_prior=0.25)

    # LU-Gaussian latent-variable sampler.
    hparams_luga = tf.contrib.training.HParams(num_actions=num_actions,
                                               num_contexts=num_contexts,
                                               context_dim=context_dim,
                                               activation=tf.nn.relu,
                                               latent_dim=50,
                                               batch_size=512,
                                               initial_lr=2e-4,
                                               show_training=False,
                                               lr_decay=False,
                                               freq_summary=10000,
                                               buffer_s=-1,
                                               initial_pulls=2,
                                               training_freq=20,
                                               training_epochs=40,
                                               lambda_prior=0.25,
                                               show_loss=False,
                                               kl=1.0,
                                               recon=1.0,
                                               psigma=1.0,
                                               glnoise=False)

    # LU-SIVI (semi-implicit variational inference) sampler.
    hparams_sivi1 = tf.contrib.training.HParams(num_actions=num_actions,
                                                num_contexts=num_contexts,
                                                context_dim=context_dim,
                                                activation=tf.nn.relu,
                                                latent_dim=50,
                                                batch_size=512,
                                                initial_lr=1e-3,
                                                show_training=False,
                                                verbose=False,
                                                lr_decay=False,
                                                freq_summary=10000,
                                                buffer_s=-1,
                                                initial_pulls=2,
                                                training_freq=20,
                                                training_epochs=40,
                                                lambda_prior=0.25,
                                                show_loss=False,
                                                kl=1.0,
                                                recon=1.0,
                                                two_decoder=False,
                                                glnoise=False,
                                                psigma=1.25)

    # Ablation bundle for LU-SIVI (km / onez switches).
    hparams_lusi_abl_km = tf.contrib.training.HParams(num_actions=num_actions,
                                                      num_contexts=num_contexts,
                                                      context_dim=context_dim,
                                                      activation=tf.nn.relu,
                                                      latent_dim=50,
                                                      batch_size=512,
                                                      initial_lr=1e-3,
                                                      show_training=False,
                                                      verbose=False,
                                                      lr_decay=False,
                                                      freq_summary=10000,
                                                      buffer_s=-1,
                                                      initial_pulls=2,
                                                      training_freq=20,
                                                      training_epochs=40,
                                                      lambda_prior=0.25,
                                                      show_loss=False,
                                                      km=1,
                                                      onez=0,
                                                      recon=1.0,
                                                      two_decoder=False,
                                                      glnoise=False,
                                                      psigma=1.25)

    # Ablation bundle for LU-Gaussian.
    hparams_luga_abl_km = tf.contrib.training.HParams(num_actions=num_actions,
                                                      num_contexts=num_contexts,
                                                      context_dim=context_dim,
                                                      activation=tf.nn.relu,
                                                      latent_dim=50,
                                                      batch_size=512,
                                                      initial_lr=2e-4,
                                                      show_training=False,
                                                      lr_decay=False,
                                                      freq_summary=10000,
                                                      buffer_s=-1,
                                                      initial_pulls=2,
                                                      training_freq=20,
                                                      training_epochs=40,
                                                      lambda_prior=0.25,
                                                      show_loss=False,
                                                      km=1,
                                                      onez=0,
                                                      recon=1.0,
                                                      psigma=1.0,
                                                      glnoise=False)

    algos = [
        UniformSampling('Uniform Sampling', hparams),  #1
        PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),  #2
        NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),  #3
        LinearFullPosteriorSampling('LinFullPost', hparams_linear),  #4
        PiposteriorBNNSampling('DGF', hparams_bbb, 'DGF'),  #5
        VariationalSampling_v4('LU_Gaussian', hparams_luga),  #6
        # A smaller learning rate like 3e-4 or 1e-4 will work better on the
        # 'mushroom' dataset for LU_SIVI and LU_Gaussian
        VariationalSamplingSivi_dgf_v7("LU_SIVI", hparams_sivi1),  #7
        # For Ablation Study
        VariationalSampling_abl('LU_Gaussian_Ablation_multi_z_1m',
                                hparams_luga_abl_km),
        VariationalSamplingSivi_dgf_abl("LU_SIVI_Ablation_multi_z_1m",
                                        hparams_lusi_abl_km)
    ]

    # Run the bandit and show the usual summary.
    t_init = time.time()
    results = run_contextual_bandit(context_dim, num_actions, dataset, algos)
    _, h_rewards = results
    display_results(algos, opt_rewards, opt_actions, h_rewards, t_init,
                    data_type)

    # Per-step regret: optimal reward (as a column) minus each algo's reward.
    opt_rewards = opt_rewards.reshape([-1, 1])
    regret_i = opt_rewards - h_rewards
    return regret_i
def main(_):
    """Repeated-runs comparison of finite-memory neural-linear variants.

    Runs the 'financial' bandit Nruns times (serially when par==0, otherwise
    via joblib.Parallel), collecting each algorithm's total reward, then
    prints the aggregate via display_final_results.

    NOTE(review): this function uses `xrange`, so it appears to target
    Python 2 even though print() is called as a function — confirm the
    intended interpreter version.
    """
    # Problem parameters
    num_contexts = 3500  # 2000
    tfn = 200  # training_freq_network shared by all neural-linear variants
    MEMSIZE = 700  # num_contexts/10

    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    data_type = 'financial'

    # Create dataset
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

    # Define hyperparameters and algorithms
    hparams = tf.contrib.training.HParams(num_actions=num_actions)

    # Bayesian linear regression posterior sampling.
    hparams_linear = tf.contrib.training.HParams(num_actions=num_actions,
                                                 context_dim=context_dim,
                                                 a0=6,
                                                 b0=6,
                                                 lambda_prior=0.25,
                                                 initial_pulls=2)

    # RMSProp-trained network (currently disabled in `algos`).
    hparams_rms = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=[50],
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              reset_lr=True,
                                              lr_decay_rate=0.5,
                                              training_freq=50,
                                              training_epochs=100,
                                              p=0.95,
                                              q=3)

    # Dropout-as-posterior network (currently disabled in `algos`).
    hparams_dropout = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=[50],
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  optimizer='RMS',
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=50,
                                                  training_epochs=100,
                                                  use_dropout=True,
                                                  keep_prob=0.80)

    # Bayes-by-Backprop variational network (currently disabled in `algos`).
    hparams_bbb = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=[50],
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              use_sigma_exp_transform=True,
                                              cleared_times_trained=10,
                                              initial_training_steps=100,
                                              noise_sigma=0.1,
                                              reset_lr=False,
                                              training_freq=50,
                                              training_epochs=100)

    # Unbounded-memory neural-linear, linear head updated every step.
    hparams_nlinear = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=[50],
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=1,
                                                  training_freq_network=tfn,
                                                  training_epochs=100,
                                                  a0=6,
                                                  b0=6,
                                                  lambda_prior=0.25)

    # Same but linear head updated every 10 steps.
    hparams_nlinear2 = tf.contrib.training.HParams(num_actions=num_actions,
                                                   context_dim=context_dim,
                                                   init_scale=0.3,
                                                   activation=tf.nn.relu,
                                                   layer_sizes=[50],
                                                   batch_size=512,
                                                   activate_decay=True,
                                                   initial_lr=0.1,
                                                   max_grad_norm=5.0,
                                                   show_training=False,
                                                   freq_summary=1000,
                                                   buffer_s=-1,
                                                   initial_pulls=2,
                                                   reset_lr=True,
                                                   lr_decay_rate=0.5,
                                                   training_freq=10,
                                                   training_freq_network=tfn,
                                                   training_epochs=100,
                                                   a0=6,
                                                   b0=6,
                                                   lambda_prior=0.25)

    # Finite memory (MEMSIZE points), both priors, head updated every step.
    hparams_nlinear_finite_memory = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=[50],
        batch_size=512,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=tfn,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=MEMSIZE,
        mu_prior_flag=1,
        sigma_prior_flag=1,
        )

    # Finite memory, both priors, head updated every 10 steps.
    hparams_nlinear_finite_memory2 = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=[50],
        batch_size=512,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=10,
        training_freq_network=tfn,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=MEMSIZE,
        mu_prior_flag=1,
        sigma_prior_flag=1,
        )

    # Ablation: finite memory with no priors, head updated every step.
    hparams_nlinear_finite_memory_no_prior = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=[50],
        batch_size=512,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=tfn,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=MEMSIZE,
        mu_prior_flag=0,
        sigma_prior_flag=0,
        )

    # Ablation: finite memory with no priors, head updated every 10 steps.
    hparams_nlinear_finite_memory2_no_prior = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=[50],
        batch_size=512,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=10,
        training_freq_network=tfn,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=MEMSIZE,
        mu_prior_flag=0,
        sigma_prior_flag=0,
        )

    # Ablation: finite memory, mean prior only, head updated every step.
    hparams_nlinear_finite_memory_no_sig_prior = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=[50],
        batch_size=512,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=tfn,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=MEMSIZE,
        mu_prior_flag=1,
        sigma_prior_flag=0,
        )

    # Ablation: finite memory, mean prior only, head updated every 10 steps.
    hparams_nlinear_finite_memory2_no_sig_prior = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=[50],
        batch_size=512,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=10,
        training_freq_network=tfn,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=MEMSIZE,
        mu_prior_flag=1,
        sigma_prior_flag=0,
        )

    # Parameter-noise exploration (currently disabled in `algos`).
    hparams_pnoise = tf.contrib.training.HParams(num_actions=num_actions,
                                                 context_dim=context_dim,
                                                 init_scale=0.3,
                                                 activation=tf.nn.relu,
                                                 layer_sizes=[50],
                                                 batch_size=512,
                                                 activate_decay=True,
                                                 initial_lr=0.1,
                                                 max_grad_norm=5.0,
                                                 show_training=False,
                                                 freq_summary=1000,
                                                 buffer_s=-1,
                                                 initial_pulls=2,
                                                 optimizer='RMS',
                                                 reset_lr=True,
                                                 lr_decay_rate=0.5,
                                                 training_freq=50,
                                                 training_epochs=100,
                                                 noise_std=0.05,
                                                 eps=0.1,
                                                 d_samples=300,
                                                 )

    # Alpha-divergence network (currently disabled in `algos`).
    hparams_alpha_div = tf.contrib.training.HParams(num_actions=num_actions,
                                                    context_dim=context_dim,
                                                    init_scale=0.3,
                                                    activation=tf.nn.relu,
                                                    layer_sizes=[50],
                                                    batch_size=512,
                                                    activate_decay=True,
                                                    initial_lr=0.1,
                                                    max_grad_norm=5.0,
                                                    show_training=False,
                                                    freq_summary=1000,
                                                    buffer_s=-1,
                                                    initial_pulls=2,
                                                    optimizer='RMS',
                                                    use_sigma_exp_transform=True,
                                                    cleared_times_trained=10,
                                                    initial_training_steps=100,
                                                    noise_sigma=0.1,
                                                    reset_lr=False,
                                                    training_freq=50,
                                                    training_epochs=100,
                                                    alpha=1.0,
                                                    k=20,
                                                    prior_variance=0.1)

    # Multitask GP (currently disabled in `algos`).
    hparams_gp = tf.contrib.training.HParams(num_actions=num_actions,
                                             num_outputs=num_actions,
                                             context_dim=context_dim,
                                             reset_lr=False,
                                             learn_embeddings=True,
                                             max_num_points=1000,
                                             show_training=False,
                                             freq_summary=1000,
                                             batch_size=512,
                                             keep_fixed_after_max_obs=True,
                                             training_freq=50,
                                             initial_pulls=2,
                                             training_epochs=100,
                                             lr=0.01,
                                             buffer_s=-1,
                                             initial_lr=0.001,
                                             lr_decay_rate=0.0,
                                             optimizer='RMS',
                                             task_latent_dim=5,
                                             activate_decay=False)

    Nruns = 50
    par = 0  # 0 = run serially; otherwise use the joblib.Parallel path
    NAgents = 10
    # res[j] accumulates algorithm j's total reward across runs.
    res = [[] for i in xrange(NAgents)]
    #
    for i in xrange(Nruns):
        print(i)
        # Fresh algorithm instances per run so no learned state carries over.
        algos = [
            UniformSampling('Uniform Sampling', hparams),
            # UniformSampling('Uniform Sampling 2', hparams),
            # FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
            # FixedPolicySampling('fixed2', [0.25, 0.75], hparams),
            # PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
            # PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
            # PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
            NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
            NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory', hparams_nlinear_finite_memory),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory2', hparams_nlinear_finite_memory2),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory_noP',
                hparams_nlinear_finite_memory_no_prior),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory2_noP',
                hparams_nlinear_finite_memory2_no_prior),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory_noSigP',
                hparams_nlinear_finite_memory_no_sig_prior),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory2_noSigP',
                hparams_nlinear_finite_memory2_no_sig_prior),
            LinearFullPosteriorSampling('LinFullPost', hparams_linear),
            # BootstrappedBNNSampling('BootRMS', hparams_rms),
            # ParameterNoiseSampling('ParamNoise', hparams_pnoise),
            # PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
            # PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),
        ]
        if par == 0:
            # Run contextual bandit problem
            t_init = time.time()
            results = run_contextual_bandit(context_dim, num_actions, dataset,
                                            algos)
            _, h_rewards = results

            # Display results
            display_results(algos, opt_rewards, opt_actions, h_rewards, t_init,
                            data_type)

            # Append Results
            for j, a in enumerate(algos):
                res[j].append((np.sum(h_rewards[:, j])))
        else:
            # NOTE(review): `num_cores` comes from module scope, and the
            # generator variable `i` shadows the outer loop index — verify.
            par_res = Parallel(n_jobs=num_cores)(
                delayed(Run)(context_dim, num_actions, dataset, algos,
                             opt_rewards, opt_actions, data_type)
                for i in xrange(Nruns))
            for j, rr in enumerate(par_res):
                res[j].append(rr[j])
        # NOTE(review): algos is dropped on all but the last few runs,
        # presumably to free memory; display_final_results below only needs
        # the final run's algos — confirm intent.
        if i < (Nruns - 3):
            algos = None
    display_final_results(algos, opt_rewards, res, data_type)
def main(argv):
    """Benchmark NeuralLinear variants on the 'statlog' bandit task.

    Samples one statlog dataset, then for ``Nruns`` independent runs builds a
    fresh set of agents (NeuralLinear, three finite-memory NeuralLinear
    variants, and linear Thompson sampling), simulates the contextual bandit,
    and accumulates per-algorithm total rewards and per-step mistake counts.
    Aggregate results are reported via ``display_final_results``.

    NOTE(review): uses ``xrange``, so this function is Python 2 only.

    Args:
        argv: unused; present for the TF app / absl entry-point convention.
    """
    # Problem parameters
    num_contexts = 4000
    tfn = 400       # steps between network retrains (training_freq_network)
    tfe = tfn * 2   # epochs per network training round
    data_type = 'statlog'
    l_sizes = [50]  # hidden-layer sizes shared by the neural agents
    outdir = "./"

    # Create dataset (sampled once; every run replays the same contexts)
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim, vocab_processor = sampled_vals
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Define hyperparameters and algorithms
    hparams = tf.contrib.training.HParams(num_actions=num_actions)
    hparams_linear = tf.contrib.training.HParams(num_actions=num_actions,
                                                 context_dim=context_dim,
                                                 a0=6,
                                                 b0=6,
                                                 lambda_prior=0.25,
                                                 initial_pulls=2)
    # NOTE(review): hparams_txt and hparams_epsilon are built but no agent
    # below uses them; presumably left over from other experiments.
    hparams_txt = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              batch_size=64,
                                              initial_lr=0.1,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              reset_lr=True,
                                              training_freq=1,
                                              training_freq_network=tfn,
                                              training_epochs=tfe,
                                              a0=6,
                                              b0=6,
                                              lambda_prior=0.25)
    hparams_nlinear = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=l_sizes,
                                                  batch_size=64,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=1,
                                                  training_freq_network=tfn,
                                                  training_epochs=tfe,
                                                  a0=6,
                                                  b0=6,
                                                  lambda_prior=0.25)
    hparams_epsilon = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=l_sizes,
                                                  batch_size=64,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=1,
                                                  training_freq_network=tfn,
                                                  training_epochs=tfe,
                                                  epsilon=0.1)
    # Finite-memory variants: `mem` caps the stored observations and the
    # *_flag fields toggle the mean/variance priors (1 = use prior).
    hparams_nlinear_finite_memory = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=l_sizes,
        batch_size=64,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=tfn,
        training_epochs=tfe,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=num_actions * 100,
        mu_prior_flag=1,
        sigma_prior_flag=1)
    # Same as above but with both priors disabled.
    hparams_nlinear_finite_memory_no_prior = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=l_sizes,
        batch_size=64,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=tfn,
        training_epochs=tfe,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=num_actions * 100,
        mu_prior_flag=0,
        sigma_prior_flag=0)
    # Mean prior kept, variance prior disabled.
    hparams_nlinear_finite_memory_no_sig_prior = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=l_sizes,
        batch_size=64,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=tfn,
        training_epochs=tfe,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=num_actions * 100,
        mu_prior_flag=1,
        sigma_prior_flag=0)

    Nruns = 10
    n_algs = 5  # must match len(algos) assembled inside the loop below
    # res[k, t]: number of runs in which algorithm k picked a sub-optimal
    # action at step t.
    res = np.zeros((n_algs, num_contexts))
    # NOTE(review): totalreward is accumulated but never read afterwards.
    totalreward = [0 for i in xrange(n_algs)]
    rewards = [[] for i in xrange(n_algs)]  # per-run total reward per algorithm

    for i_run in xrange(Nruns):
        # Fresh agent instances every run (each owns its own TF state).
        algos = [
            NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory', hparams_nlinear_finite_memory),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory_noP',
                hparams_nlinear_finite_memory_no_prior),
            NeuralLinearPosteriorSamplingFiniteMemory(
                'NeuralLinearFiniteMemory_noSigP',
                hparams_nlinear_finite_memory_no_sig_prior),
            LinearFullPosteriorSampling('LinFullPost', hparams_linear),
        ]
        results = run_contextual_bandit(context_dim, num_actions, dataset,
                                        algos)
        h_actions, h_rewards = results
        for j, a in enumerate(algos):
            print(np.sum(h_rewards[:, j]))
            totalreward[j] += ((np.sum(h_rewards[:, j])) / Nruns)
            rewards[j].append((np.sum(h_rewards[:, j])))
        # Transpose the action history: actions[k] = per-step actions of
        # algorithm k (h_actions is per-step lists, one entry per algorithm).
        actions = [[] for i in xrange(len(h_actions[0]))]
        for aa in h_actions:
            for i, a in enumerate(aa):
                actions[i].append(a)
        # Elementwise comparison against the optimal actions marks mistakes.
        for i_alg in xrange(len(algos)):
            res[i_alg, :] += 1 * ((actions[i_alg] != opt_actions))
        # Drop the agents on all but the final run; the last surviving
        # `algos` is what gets passed to display_final_results below.
        if i_run < (Nruns - 1):
            algos = None
    display_final_results(algos, opt_rewards, opt_actions, rewards, data_type)
def main(_):
    """Compare mixup-style training schemes for NeuralLinear on 'mushroom'.

    For five independent "chances" this resamples a train and a test dataset,
    trains four copies of the same NeuralLinear2 agent with different data
    regimes (mixup, random mixup, contrast mixup, original) plus a uniform
    baseline, then freezes learning (by pushing the update frequencies very
    high) and replays each trained agent on the held-out test contexts.
    Per-scheme rewards and normalized regrets are accumulated and their means
    printed at the end.

    NOTE(review): the accumulation stanzas index slots 0-4 of the *_avg
    lists by scheme, and `len(algos)` is 1, so each inner `for j` loop runs
    exactly once — confirm against run_*_contextual_bandit return shapes.
    """
    # Problem parameters
    num_contexts = 2000
    num_test_contexts = 200
    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    data_type = 'mushroom'

    # Create dataset (this initial sample is re-drawn inside the loop below)
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals
    # dataset = dataset_full[:200, :]
    sampled_vals_t = sample_data(data_type, num_test_contexts)
    dataset_test, opt_rewards_t, _, _, _ = sampled_vals_t

    # Define hyperparameters and algorithms
    hparams = tf.contrib.training.HParams(num_actions=num_actions)
    hparams_linear = tf.contrib.training.HParams(num_actions=num_actions,
                                                 context_dim=context_dim,
                                                 a0=6,
                                                 b0=6,
                                                 lambda_prior=0.25,
                                                 initial_pulls=2)
    hparams_rms = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=[50],
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              reset_lr=True,
                                              lr_decay_rate=0.5,
                                              training_freq=50,
                                              training_epochs=100,
                                              p=0.95,
                                              q=3)
    hparams_dropout = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=[50],
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  optimizer='RMS',
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=50,
                                                  training_epochs=100,
                                                  use_dropout=True,
                                                  keep_prob=0.80)
    hparams_bbb = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=[50],
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              use_sigma_exp_transform=True,
                                              cleared_times_trained=10,
                                              initial_training_steps=100,
                                              noise_sigma=0.1,
                                              reset_lr=False,
                                              training_freq=50,
                                              training_epochs=100)
    hparams_nlinear = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=[50],
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=1,
                                                  training_freq_network=50,
                                                  training_epochs=100,
                                                  a0=6,
                                                  b0=6,
                                                  lambda_prior=0.25)
    # Same as hparams_nlinear but the Bayesian linear layer updates every
    # 10 steps instead of every step; this is the config actually used below.
    hparams_nlinear2 = tf.contrib.training.HParams(num_actions=num_actions,
                                                   context_dim=context_dim,
                                                   init_scale=0.3,
                                                   activation=tf.nn.relu,
                                                   layer_sizes=[50],
                                                   batch_size=512,
                                                   activate_decay=True,
                                                   initial_lr=0.1,
                                                   max_grad_norm=5.0,
                                                   show_training=False,
                                                   freq_summary=1000,
                                                   buffer_s=-1,
                                                   initial_pulls=2,
                                                   reset_lr=True,
                                                   lr_decay_rate=0.5,
                                                   training_freq=10,
                                                   training_freq_network=50,
                                                   training_epochs=100,
                                                   a0=6,
                                                   b0=6,
                                                   lambda_prior=0.25)
    hparams_pnoise = tf.contrib.training.HParams(num_actions=num_actions,
                                                 context_dim=context_dim,
                                                 init_scale=0.3,
                                                 activation=tf.nn.relu,
                                                 layer_sizes=[50],
                                                 batch_size=512,
                                                 activate_decay=True,
                                                 initial_lr=0.1,
                                                 max_grad_norm=5.0,
                                                 show_training=False,
                                                 freq_summary=1000,
                                                 buffer_s=-1,
                                                 initial_pulls=2,
                                                 optimizer='RMS',
                                                 reset_lr=True,
                                                 lr_decay_rate=0.5,
                                                 training_freq=50,
                                                 training_epochs=100,
                                                 noise_std=0.05,
                                                 eps=0.1,
                                                 d_samples=300,
                                                 )
    hparams_alpha_div = tf.contrib.training.HParams(num_actions=num_actions,
                                                    context_dim=context_dim,
                                                    init_scale=0.3,
                                                    activation=tf.nn.relu,
                                                    layer_sizes=[50],
                                                    batch_size=512,
                                                    activate_decay=True,
                                                    initial_lr=0.1,
                                                    max_grad_norm=5.0,
                                                    show_training=False,
                                                    freq_summary=1000,
                                                    buffer_s=-1,
                                                    initial_pulls=2,
                                                    optimizer='RMS',
                                                    use_sigma_exp_transform=True,
                                                    cleared_times_trained=10,
                                                    initial_training_steps=100,
                                                    noise_sigma=0.1,
                                                    reset_lr=False,
                                                    training_freq=50,
                                                    training_epochs=100,
                                                    alpha=1.0,
                                                    k=20,
                                                    prior_variance=0.1)
    hparams_gp = tf.contrib.training.HParams(num_actions=num_actions,
                                             num_outputs=num_actions,
                                             context_dim=context_dim,
                                             reset_lr=False,
                                             learn_embeddings=True,
                                             max_num_points=1000,
                                             show_training=False,
                                             freq_summary=1000,
                                             batch_size=512,
                                             keep_fixed_after_max_obs=True,
                                             training_freq=50,
                                             initial_pulls=2,
                                             training_epochs=100,
                                             lr=0.01,
                                             buffer_s=-1,
                                             initial_lr=0.001,
                                             lr_decay_rate=0.0,
                                             optimizer='RMS',
                                             task_latent_dim=5,
                                             activate_decay=False)

    # Only NeuralLinear2 is active; the commented entries are alternative
    # agents kept for quick toggling. `algos` is only used for its length.
    algos = [
        # UniformSampling('Uniform Sampling', hparams),
        # UniformSampling('Uniform Sampling 2', hparams),
        # FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
        # FixedPolicySampling('fixed2', [0.25, 0.75], hparams),
        # PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
        # PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
        # PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
        # NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
        NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
        # LinearFullPosteriorSampling('LinFullPost', hparams_linear),
        # BootstrappedBNNSampling('BootRMS', hparams_rms),
        # ParameterNoiseSampling('ParamNoise', hparams_pnoise),
        # PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
        # PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),
    ]
    # al1 = [NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2)]
    # al2 = [NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2)]
    # al3 = [NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2)]
    # al4 = [NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2)]

    # Run contextual bandit problem
    t_init = time.time()
    # Accumulators, one slot per scheme:
    # 0=mixup, 1=random mixup, 2=contrast mixup, 3=original, 4=uniform.
    log_algos_my = []                         # (scheme label, total reward) log
    log_algos_avg = [[] for i in range(5)]    # normalized regret, training
    reg_algos_avg = [[] for i in range(5)]    # absolute regret, training
    log_algos_avg_t = [[] for i in range(5)]  # normalized regret, test
    algos_avg = [[] for i in range(5)]        # total reward, training
    algos_avg_t = [[] for i in range(5)]      # total reward, test
    for i in range(5):
        # Resample train and test data for every chance.
        sampled_vals = sample_data(data_type, num_contexts)
        dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals
        # dataset = dataset_full[:200, :]
        sampled_vals_t = sample_data(data_type, num_test_contexts)
        dataset_test, opt_rewards_t, _, _, _ = sampled_vals_t
        print("starting new chance")
        # Four identical agents, one per training scheme.
        al1 = [NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2)]
        al2 = [NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2)]
        al3 = [NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2)]
        al4 = [NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2)]
        # Enable learning: linear layer every 10 steps, network every 50.
        for al_i in [al1, al2, al3, al4]:
            al_i[0].update_freq_lr = 10
            al_i[0].update_freq_nn = 50
        print("al1", al1[0].hparams.training_freq, "training")

        # --- training phase: scheme 0, mixup ---
        results = run_mixup_contextual_bandit(context_dim, num_actions,
                                              dataset, al1)
        _, h_rewards, al1 = results
        # al1 = tmp[0]
        for j in range(len(algos)):
            log_algos_my.append(["old mix", np.sum(h_rewards[:, j])])
            algos_avg[0].append(np.sum(h_rewards[:, j]))
            log_algos_avg[0].append(
                (np.sum(opt_rewards) - np.sum(h_rewards[:, j])) / np.sum(opt_rewards))
            reg_algos_avg[0].append(np.sum(opt_rewards) - np.sum(h_rewards[:, j]))
        # Display results
        display_results(al1, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type, "mix", i)

        # --- training phase: scheme 1, random mixup ---
        results = run_random_mixup_contextual_bandit(context_dim, num_actions,
                                                     dataset, al2)
        _, h_rewards, al2 = results
        # al2 = tmp[0]
        for j in range(len(algos)):
            log_algos_my.append(["random mix", np.sum(h_rewards[:, j])])
            algos_avg[1].append(np.sum(h_rewards[:, j]))
            log_algos_avg[1].append(
                (np.sum(opt_rewards) - np.sum(h_rewards[:, j])) / np.sum(opt_rewards))
            reg_algos_avg[1].append(np.sum(opt_rewards) - np.sum(h_rewards[:, j]))
        # Display results
        display_results(al2, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type, "mix random", i)

        # --- training phase: scheme 2, contrast mixup ---
        results = run_contrast_mixup_contextual_bandit(context_dim,
                                                       num_actions, dataset,
                                                       al3)
        _, h_rewards, al3 = results
        # al3 = tmp[0]
        for j in range(len(algos)):
            log_algos_my.append(["contrast mix", np.sum(h_rewards[:, j])])
            algos_avg[2].append(np.sum(h_rewards[:, j]))
            log_algos_avg[2].append(
                (np.sum(opt_rewards) - np.sum(h_rewards[:, j])) / np.sum(opt_rewards))
            reg_algos_avg[2].append(np.sum(opt_rewards) - np.sum(h_rewards[:, j]))
        # Display results
        display_results(al3, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type, "contrast mix", i)

        # --- training phase: scheme 3, original (no mixup) ---
        results = run_contextual_bandit(context_dim, num_actions, dataset, al4)
        _, h_rewards, al4 = results
        # al4 = tmp[0]
        # Display results
        display_results(al4, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type, "orig", i)
        for j in range(len(algos)):
            log_algos_my.append(["orig", np.sum(h_rewards[:, j])])
            algos_avg[3].append(np.sum(h_rewards[:, j]))
            log_algos_avg[3].append(
                (np.sum(opt_rewards) - np.sum(h_rewards[:, j])) / np.sum(opt_rewards))
            reg_algos_avg[3].append(np.sum(opt_rewards) - np.sum(h_rewards[:, j]))
        print(log_algos_my, "my")

        # --- training phase: scheme 4, uniform baseline ---
        ual = [UniformSampling('Uniform Sampling', hparams)]
        results = run_contextual_bandit(context_dim, num_actions, dataset, ual)
        _, h_rewards, ual = results
        # al4 = tmp[0]
        # Display results
        display_results(ual, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type, "uniform", i)
        for j in range(len(algos)):
            log_algos_my.append(["ual", np.sum(h_rewards[:, j])])
            algos_avg[4].append(np.sum(h_rewards[:, j]))
            log_algos_avg[4].append(
                (np.sum(opt_rewards) - np.sum(h_rewards[:, j])) / np.sum(opt_rewards))
            reg_algos_avg[4].append(np.sum(opt_rewards) - np.sum(h_rewards[:, j]))
        print(log_algos_my, "my")
        # train_results = log_algos_avg.copy()
        # print(log_algos_their, "their")
        # print("algo 1 my vs their", sum(log_algos_my[0]), sum(log_algos_their[0]))
        # print("algo 2 my vs their", sum(log_algos_my[1]), sum(log_algos_their[1]))

        #########
        #########
        ########
        # testing phase
        #######
        ########
        ########
        # Freeze learning by pushing both update frequencies beyond the
        # length of the test set, then replay agents on held-out contexts.
        print("starting test, switching off training")
        for al_i in [al1, al2, al3, al4]:
            al_i[0].update_freq_lr = 10000
            al_i[0].update_freq_nn = 10000
        print("al1", al1[0].hparams.training_freq, "test")

        # --- test phase: scheme 0 ---
        results = run_contextual_bandit(context_dim, num_actions, dataset_test,
                                        al1)
        _, h_rewards, al1 = results
        # al1 = tmp[0]
        for j in range(len(algos)):
            log_algos_my.append(["old mix", np.sum(h_rewards[:, j])])
            algos_avg_t[0].append(np.sum(h_rewards[:, j]))
            log_algos_avg_t[0].append(
                (np.sum(opt_rewards_t) - np.sum(h_rewards[:, j])) / np.sum(opt_rewards_t))
        # Display results
        display_results(al1, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type, "mix", i)

        # --- test phase: scheme 1 ---
        results = run_contextual_bandit(context_dim, num_actions, dataset_test,
                                        al2)
        _, h_rewards, al2 = results
        # al2 = tmp[0]
        for j in range(len(algos)):
            log_algos_my.append(["random mix", np.sum(h_rewards[:, j])])
            algos_avg_t[1].append(np.sum(h_rewards[:, j]))
            log_algos_avg_t[1].append(
                (np.sum(opt_rewards_t) - np.sum(h_rewards[:, j])) / np.sum(opt_rewards_t))
        # Display results
        display_results(al2, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type, "mix random", i)

        # --- test phase: scheme 2 ---
        results = run_contextual_bandit(context_dim, num_actions, dataset_test,
                                        al3)
        _, h_rewards, al3 = results
        # al3 = tmp[0]
        for j in range(len(algos)):
            log_algos_my.append(["contrast mix", np.sum(h_rewards[:, j])])
            algos_avg_t[2].append(np.sum(h_rewards[:, j]))
            log_algos_avg_t[2].append(
                (np.sum(opt_rewards_t) - np.sum(h_rewards[:, j])) / np.sum(opt_rewards_t))
        # Display results
        display_results(al3, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type, "contrast mix", i)

        # --- test phase: scheme 3 (no uniform test run) ---
        results = run_contextual_bandit(context_dim, num_actions, dataset_test,
                                        al4)
        _, h_rewards, al4 = results
        # al4 = tmp[0]
        # Display results
        display_results(al4, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type, "orig", i)
        for j in range(len(algos)):
            log_algos_my.append(["orig", np.sum(h_rewards[:, j])])
            algos_avg_t[3].append(np.sum(h_rewards[:, j]))
            log_algos_avg_t[3].append(
                (np.sum(opt_rewards_t) - np.sum(h_rewards[:, j])) / np.sum(opt_rewards_t))
        print(log_algos_my, "my")

    # Final summary across the five chances (note: reuses loop name `i`).
    for i, ex in enumerate(['orig mix', 'random mix', 'contrast mix', 'orig',
                            'uniform']):
        print("TRAINNN", ex, " ", np.mean(log_algos_avg[i]),
              np.mean(algos_avg[i]), np.mean(reg_algos_avg[i]))
    for i, ex in enumerate(['orig mix', 'random mix', 'contrast mix', 'orig']):
        print("TESTTT", ex, " ", np.mean(log_algos_avg_t[i]),
              np.mean(algos_avg_t[i]))
def main(_):
    """Run LinFullPost and NeuralGreedy on the 'moon' bandit task.

    Samples one moon dataset, then for ``nExperiment`` experiments runs the
    two active agents through the contextual-bandit simulation, dumps each
    agent's per-step reward column to a CSV file, and prints results.

    NOTE(review): `data_type` is first set to 'statlog' and immediately
    overwritten with 'moon' — the first assignment is dead. Also, with the
    current `algos` list column 0 is LinFullPost and column 1 is
    NeuralGreedy, so "resultMoon<i>.csv" actually holds NeuralGreedy's
    rewards; confirm the intended file naming.
    """
    # Problem parameters
    num_contexts = 40000
    # parameters of finite
    tfn = 400      # steps between network retrains (unused by active agents)
    tfe = tfn * 2  # epochs per network training round (unused by active agents)
    data_type = 'statlog'
    l_sizes = [50, 50]
    outdir = "./"
    # Data type in {linear, sparse_linear, mushroom, financial, jester,
    # statlog, adult, covertype, census, wheel}
    data_type = 'moon'
    nExperiment = 2

    # Create dataset (sampled once; both experiments replay the same data)
    sampled_vals = sample_data(data_type, num_contexts)
    dataset, opt_rewards, opt_actions, num_actions, context_dim = sampled_vals

    # Define hyperparameters and algorithms.
    # NOTE(review): only hparams, hparams_linear and hparams_greedy are used
    # by the active agents below; the rest back the commented-out entries.
    hparams = tf.contrib.training.HParams(num_actions=num_actions)
    hparams_linear = tf.contrib.training.HParams(num_actions=num_actions,
                                                 context_dim=context_dim,
                                                 a0=6,
                                                 b0=6,
                                                 lambda_prior=0.25,
                                                 initial_pulls=2)
    hparams_rms = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=[50],
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              reset_lr=True,
                                              lr_decay_rate=0.5,
                                              training_freq=50,
                                              training_epochs=100,
                                              p=0.95,
                                              q=3)
    hparams_dropout = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=[50, 50],
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  optimizer='RMS',
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=50,
                                                  training_epochs=100,
                                                  use_dropout=True,
                                                  keep_prob=0.80)
    hparams_bbb = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=[50, 50],
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              optimizer='RMS',
                                              use_sigma_exp_transform=True,
                                              cleared_times_trained=10,
                                              initial_training_steps=100,
                                              noise_sigma=0.1,
                                              reset_lr=False,
                                              training_freq=50,
                                              training_epochs=100)
    hparams_nlinear = tf.contrib.training.HParams(num_actions=num_actions,
                                                  context_dim=context_dim,
                                                  init_scale=0.3,
                                                  activation=tf.nn.relu,
                                                  layer_sizes=[50, 50],
                                                  batch_size=512,
                                                  activate_decay=True,
                                                  initial_lr=0.1,
                                                  max_grad_norm=5.0,
                                                  show_training=False,
                                                  freq_summary=1000,
                                                  buffer_s=-1,
                                                  initial_pulls=2,
                                                  reset_lr=True,
                                                  lr_decay_rate=0.5,
                                                  training_freq=1,
                                                  training_freq_network=50,
                                                  training_epochs=100,
                                                  a0=6,
                                                  b0=6,
                                                  lambda_prior=0.25)
    hparams_nlinear2 = tf.contrib.training.HParams(num_actions=num_actions,
                                                   context_dim=context_dim,
                                                   init_scale=0.3,
                                                   activation=tf.nn.relu,
                                                   layer_sizes=[50],
                                                   batch_size=512,
                                                   activate_decay=True,
                                                   initial_lr=0.1,
                                                   max_grad_norm=5.0,
                                                   show_training=False,
                                                   freq_summary=1000,
                                                   buffer_s=-1,
                                                   initial_pulls=2,
                                                   reset_lr=True,
                                                   lr_decay_rate=0.5,
                                                   training_freq=10,
                                                   training_freq_network=50,
                                                   training_epochs=100,
                                                   a0=6,
                                                   b0=6,
                                                   lambda_prior=0.25)
    # Finite-memory variants: `mem` caps stored observations; the *_flag
    # fields toggle the mean/variance priors (1 = use prior).
    hparams_nlinear_finite_memory = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=l_sizes,
        batch_size=64,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=50,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=num_actions * 100,
        mu_prior_flag=1,
        sigma_prior_flag=1)
    hparams_nlinear_finite_memory_no_prior = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=l_sizes,
        batch_size=64,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=50,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=num_actions * 100,
        mu_prior_flag=0,
        sigma_prior_flag=0)
    hparams_nlinear_finite_memory_no_sig_prior = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=l_sizes,
        batch_size=64,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=1,
        training_freq_network=50,
        training_epochs=100,
        a0=6,
        b0=6,
        lambda_prior=1,
        mem=num_actions * 100,
        mu_prior_flag=1,
        sigma_prior_flag=0)
    hparams_pnoise = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=[50],
        batch_size=512,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        optimizer='RMS',
        reset_lr=True,
        lr_decay_rate=0.5,
        training_freq=50,
        training_epochs=100,
        noise_std=0.05,
        eps=0.1,
        d_samples=300,
        )
    hparams_alpha_div = tf.contrib.training.HParams(
        num_actions=num_actions,
        context_dim=context_dim,
        init_scale=0.3,
        activation=tf.nn.relu,
        layer_sizes=[50],
        batch_size=512,
        activate_decay=True,
        initial_lr=0.1,
        max_grad_norm=5.0,
        show_training=False,
        freq_summary=1000,
        buffer_s=-1,
        initial_pulls=2,
        optimizer='RMS',
        use_sigma_exp_transform=True,
        cleared_times_trained=10,
        initial_training_steps=100,
        noise_sigma=0.1,
        reset_lr=False,
        training_freq=50,
        training_epochs=100,
        alpha=1.0,
        k=20,
        prior_variance=0.1)
    hparams_gp = tf.contrib.training.HParams(num_actions=num_actions,
                                             num_outputs=num_actions,
                                             context_dim=context_dim,
                                             reset_lr=False,
                                             learn_embeddings=True,
                                             max_num_points=1000,
                                             show_training=False,
                                             freq_summary=1000,
                                             batch_size=512,
                                             keep_fixed_after_max_obs=True,
                                             training_freq=50,
                                             initial_pulls=2,
                                             training_epochs=100,
                                             lr=0.01,
                                             buffer_s=-1,
                                             initial_lr=0.001,
                                             lr_decay_rate=0.0,
                                             optimizer='RMS',
                                             task_latent_dim=5,
                                             activate_decay=False)
    hparams_greedy = tf.contrib.training.HParams(num_actions=num_actions,
                                                 context_dim=context_dim,
                                                 init_scale=0.3,
                                                 activation=tf.nn.relu,
                                                 layer_sizes=[50],
                                                 batch_size=512,
                                                 activate_decay=True,
                                                 initial_lr=0.1,
                                                 max_grad_norm=5.0,
                                                 show_training=False,
                                                 freq_summary=1000,
                                                 buffer_s=-1,
                                                 initial_pulls=2,
                                                 reset_lr=True,
                                                 lr_decay_rate=0.5,
                                                 optimizer='RMS',
                                                 training_freq=50,
                                                 training_freq_network=50,
                                                 training_epochs=100,
                                                 lambda_prior=0.25,
                                                 delta=0.01,
                                                 lamb=0.01,
                                                 mu=1,
                                                 S=1)
    hparams_ucb = tf.contrib.training.HParams(num_actions=num_actions,
                                              context_dim=context_dim,
                                              init_scale=0.3,
                                              activation=tf.nn.relu,
                                              layer_sizes=[50],
                                              batch_size=512,
                                              activate_decay=True,
                                              initial_lr=0.1,
                                              max_grad_norm=5.0,
                                              show_training=False,
                                              freq_summary=1000,
                                              buffer_s=-1,
                                              initial_pulls=2,
                                              reset_lr=True,
                                              lr_decay_rate=0.5,
                                              optimizer='RMS',
                                              training_freq=50,
                                              training_freq_network=50,
                                              training_epochs=100,
                                              lambda_prior=0.25,
                                              delta=0.01,
                                              lamb=0.01,
                                              mu=1,
                                              S=1)

    # Run contextual bandit problem
    t_init = time.time()
    for i in range(nExperiment):
        # Fresh agent instances per experiment; commented entries are the
        # alternative agents kept for quick toggling.
        algos = [
            #UniformSampling('Uniform Sampling', hparams),
            #UniformSampling('Uniform Sampling 2', hparams),
            #FixedPolicySampling('fixed1', [0.75, 0.25], hparams),
            #FixedPolicySampling('fixed2', [0.25, 0.75], hparams),
            #PosteriorBNNSampling('RMS', hparams_rms, 'RMSProp'),
            #PosteriorBNNSampling('Dropout', hparams_dropout, 'RMSProp'),
            #PosteriorBNNSampling('BBB', hparams_bbb, 'Variational'),
            #NeuralLinearPosteriorSampling('NeuralLinear', hparams_nlinear),
            #NeuralLinearPosteriorSampling('NeuralLinear2', hparams_nlinear2),
            LinearFullPosteriorSampling('LinFullPost', hparams_linear),
            #BootstrappedBNNSampling('BootRMS', hparams_rms),
            #NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory', hparams_nlinear_finite_memory),
            #NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory_noP', hparams_nlinear_finite_memory_no_prior),
            #NeuralLinearPosteriorSamplingFiniteMemory('NeuralLinearFiniteMemory_noSigP', hparams_nlinear_finite_memory_no_sig_prior)
            #ParameterNoiseSampling('ParamNoise', hparams_pnoise),
            #PosteriorBNNSampling('BBAlphaDiv', hparams_alpha_div, 'AlphaDiv'),
            #PosteriorBNNSampling('MultitaskGP', hparams_gp, 'GP'),hparams_ucb
            #NeuralUCBSampling('NeuralUCB', hparams_ucb)
            NeuralGreedy('NeuralGreedy', hparams_greedy)
        ]
        results = run_contextual_bandit(context_dim, num_actions, dataset,
                                        algos)
        _, h_rewards = results
        # Column order follows `algos`: 0 = LinFullPost, 1 = NeuralGreedy.
        np.savetxt("resultLin" + str(i) + ".csv", h_rewards[:, 0],
                   delimiter=',')
        np.savetxt("resultMoon" + str(i) + ".csv", h_rewards[:, 1],
                   delimiter=',')
        # Display results
        display_results(algos, opt_rewards, opt_actions, h_rewards, t_init,
                        data_type)