def train_and_draw():
    """ Run softmax action selection for TRAINING_ITERATIONS plays and plot the running mean reward. """
    list_action_values = np.ones(N_ARMS)   # index=action, value=avg reward
    counts = np.zeros(N_ARMS)              # number of times each arm has been pulled
    # Initialise the action-value distribution to be uniform at the outset
    av_softmax = np.zeros(N_ARMS)          # prob. dist. over action values
    av_softmax[:] = 1.0 / N_ARMS
    plt.xlabel('Plays')
    plt.ylabel('Avg Rewards')
    hist_reward = []
    for i in range(TRAINING_ITERATIONS):
        choice = np.random.choice(list(range(N_ARMS)), p=av_softmax)
        counts[choice] += 1
        k = counts[choice]
        rwd = generate_reward(ARM_PROB[choice])
        # Incremental update of the running average reward for the chosen arm
        old_avg = list_action_values[choice]
        new_avg = old_avg + (1 / k) * (rwd - old_avg)
        list_action_values[choice] = new_avg
        av_softmax = softmax(list_action_values)
        hist_reward.append(rwd)
        plt.scatter(i, np.mean(hist_reward))
    plt.show()
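
# The helpers below are not part of the listing above; they are a minimal sketch of what
# generate_reward() and the temperature softmax() used there might look like, together with
# example values for N_ARMS, ARM_PROB, and TRAINING_ITERATIONS. Names, signatures, and the
# Bernoulli-style reward are assumptions inferred from the call sites, not the original code.
import numpy as np

N_ARMS = 10                         # assumed number of arms
ARM_PROB = np.random.rand(N_ARMS)   # assumed per-arm success probability
TRAINING_ITERATIONS = 500           # assumed number of plays

def generate_reward(prob):
    """ Sketch: binary reward, 1 with probability `prob`, else 0 (exact scheme assumed). """
    return 1 if np.random.rand() < prob else 0

def softmax(action_values, tau=1.0):
    """ Sketch: softmax with temperature tau; a higher tau flattens the distribution. """
    exp_vals = np.exp(np.asarray(action_values, dtype=np.float64) / tau)
    return exp_vals / np.sum(exp_vals)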
def train_draw(epochs=5000, learning_rate=1e-2):
    """ Main training loop (TensorFlow 1.x graph mode, using partial_run). """
    env = ContextBandit(NUM_WEBSITES)
    # 2-layer network: input and output sized to the number of arms,
    # with an intermediate hidden layer
    state_input = tf.placeholder(dtype=tf.float32, shape=(None, NUM_WEBSITES))
    dense_output = layers.Dense(HIDDEN_UNITS, activation='relu')(state_input)
    y_pred = layers.Dense(NUM_WEBSITES, activation='relu')(dense_output)
    one_hot_reward = tf.placeholder(dtype=tf.float32, shape=(None, NUM_WEBSITES))
    loss = tf.losses.mean_squared_error(labels=one_hot_reward, predictions=y_pred)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train = optimizer.minimize(loss)
    with tf.control_dependencies([train]):
        # Requirement for partial_run: the train op itself can't be used as a fetch item,
        # so fetch a dummy constant that depends on it instead
        dummy = tf.constant(0)
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    plt.xlabel('Plays')
    plt.ylabel('Avg Rewards')
    hist_reward = []
    for i in range(epochs):
        # Use partial_run so the graph is run only once per step: first to generate the
        # reward predictions, then, after the action and reward lookup, to compute the
        # loss and apply the gradient update
        partial_setup = sess.partial_run_setup([y_pred, dummy],
                                               [state_input, one_hot_reward])
        curr_state = np.expand_dims(one_hot(NUM_WEBSITES, env.get_state()), axis=0)
        curr_y_pred = sess.partial_run(partial_setup, y_pred,
                                       feed_dict={state_input: curr_state})[0]
        # Convert reward predictions to a softmax probability distribution
        av_softmax = softmax(curr_y_pred, tau=2.0)
        # Normalize to make sure it sums to 1
        av_softmax /= av_softmax.sum()
        # Choose an action based on the probability distribution
        choice = np.random.choice(NUM_WEBSITES, p=av_softmax)
        # Take the action, get the reward
        cur_reward = env.choose_arm(choice)
        # Build the training target: the predictions with the chosen arm's entry
        # replaced by the observed reward
        curr_one_hot_reward = curr_y_pred.copy()
        curr_one_hot_reward[choice] = cur_reward
        curr_one_hot_reward = np.expand_dims(curr_one_hot_reward, axis=0)
        sess.partial_run(partial_setup, dummy,
                         feed_dict={one_hot_reward: curr_one_hot_reward})
        hist_reward.append(cur_reward)
        plt.scatter(i, np.mean(hist_reward))
    plt.show()
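
# ContextBandit and one_hot are assumed rather than shown in these listings. The sketch
# below matches how they are called above (get_state(), choose_arm(arm), one_hot(n, index)):
# a contextual bandit whose arm probabilities depend on the current state, plus a one-hot
# encoder for the state index. The binary reward and the example NUM_WEBSITES / HIDDEN_UNITS
# values are assumptions, not the original implementation.
import numpy as np

NUM_WEBSITES = 10   # assumed number of arms/states
HIDDEN_UNITS = 100  # assumed hidden-layer width

def one_hot(n, index):
    """ Sketch: length-n one-hot vector with a 1 at `index`. """
    vec = np.zeros(n, dtype=np.float32)
    vec[index] = 1.0
    return vec

class ContextBandit:
    """ Sketch: each state has its own row of arm probabilities; reward is 0 or 1. """
    def __init__(self, n_arms):
        self.n_arms = n_arms
        self.arm_probs = np.random.rand(n_arms, n_arms)  # one row per state
        self.update_state()

    def update_state(self):
        self.state = np.random.randint(0, self.n_arms)

    def get_state(self):
        return self.state

    def choose_arm(self, arm):
        reward = 1 if np.random.rand() < self.arm_probs[self.state, arm] else 0
        self.update_state()  # move to a new random state after each play
        return reward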
def train(epochs=5000, learning_rate=1e-2):
    """ Main training loop (PyTorch). """
    # 2-layer network: input and output sized to the number of arms,
    # with an intermediate hidden layer
    model = th.nn.Sequential(th.nn.Linear(NUM_WEBSITES, HIDDEN_UNITS),
                             th.nn.ReLU(),
                             th.nn.Linear(HIDDEN_UNITS, NUM_WEBSITES),
                             th.nn.ReLU())
    loss_fn = th.nn.MSELoss()
    env = ContextBandit(NUM_WEBSITES)
    plt.xlabel('Plays')
    plt.ylabel('Avg Rewards')
    hist_reward = []
    # Get the current state and convert it to a PyTorch variable
    cur_state = Variable(th.Tensor(one_hot(NUM_WEBSITES, env.get_state())))
    optimizer = th.optim.Adam(model.parameters(), lr=learning_rate)
    for i in range(epochs):
        # Get reward predictions
        y_pred = model(cur_state)
        # Convert reward predictions to a softmax probability distribution
        av_softmax = softmax(y_pred.data.numpy(), tau=2.0)
        # Normalize to make sure it sums to 1
        av_softmax /= av_softmax.sum()
        # Choose an action based on the probability distribution
        choice = np.random.choice(NUM_WEBSITES, p=av_softmax)
        # Take the action, get the reward
        cur_reward = env.choose_arm(choice)
        # Build the training target: the predictions with the chosen arm's entry
        # replaced by the observed reward
        one_hot_reward = y_pred.data.numpy().copy()
        one_hot_reward[choice] = cur_reward
        reward = Variable(th.Tensor(one_hot_reward))
        loss = loss_fn(y_pred, reward)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Update the current state
        cur_state = Variable(th.Tensor(one_hot(NUM_WEBSITES, env.get_state())))
        hist_reward.append(cur_reward)
        plt.scatter(i, np.mean(hist_reward))
    plt.show()
def train_draw(epochs=5000, learning_rate=1e-2):
    """ Main training loop (TensorFlow eager execution with GradientTape). """
    env = ContextBandit(NUM_WEBSITES)
    ffn_model = build_ffn_model()
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    plt.xlabel('Plays')
    plt.ylabel('Avg Rewards')
    hist_reward = []
    for i in range(epochs):
        # Record the forward pass on a GradientTape so that, after the action and
        # reward lookup, the loss can be differentiated and the update applied
        with tf.GradientTape() as tape:
            curr_state = np.expand_dims(one_hot(NUM_WEBSITES, env.get_state()), axis=0)
            curr_y_pred = ffn_model(curr_state)
            # Convert reward predictions to a softmax probability distribution
            av_softmax = softmax(curr_y_pred[0], tau=2.0)
            # Normalize to make sure it sums to 1
            av_softmax /= av_softmax.sum()
            # Choose an action based on the probability distribution
            choice = np.random.choice(NUM_WEBSITES, p=av_softmax)
            # Take the action, get the reward
            cur_reward = env.choose_arm(choice)
            hist_reward.append(cur_reward)
            plt.scatter(i, np.mean(hist_reward))
            # Build the training target: the predictions with the chosen arm's entry
            # replaced by the observed reward
            curr_y_target = np.copy(curr_y_pred)
            curr_y_target[0, choice] = cur_reward
            loss_value = tf.losses.mean_squared_error(labels=curr_y_target,
                                                      predictions=curr_y_pred)
        grads = tape.gradient(loss_value, ffn_model.trainable_variables)
        optimizer.apply_gradients(zip(grads, ffn_model.trainable_variables),
                                  global_step=tf.train.get_or_create_global_step())
    plt.show()
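
# build_ffn_model() is not shown in these listings. A plausible definition, given the
# 2-layer architecture used in the graph-mode and PyTorch versions above, is the Keras
# model below; the layer sizes, activations, and compile() settings are assumptions
# inferred from those versions, not the original code.
import tensorflow as tf

def build_ffn_model():
    """ Sketch: NUM_WEBSITES -> HIDDEN_UNITS (ReLU) -> NUM_WEBSITES (ReLU). """
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(HIDDEN_UNITS, activation='relu',
                              input_shape=(NUM_WEBSITES,)),
        tf.keras.layers.Dense(NUM_WEBSITES, activation='relu')
    ])
    # The Keras version (below) trains with train_on_batch, so it needs compile();
    # the GradientTape version above only uses the forward pass and trainable_variables.
    model.compile(optimizer='adam', loss='mse')
    return model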
def train(epochs=5000):
    """ Main training loop (Keras, using train_on_batch). """
    model = build_ffn_model()
    env = ContextBandit(NUM_WEBSITES)
    plt.xlabel('Plays')
    plt.ylabel('Avg Rewards')
    hist_reward = []
    # Get the current state as a single-sample batch
    cur_state = one_hot(NUM_WEBSITES, env.get_state())
    cur_batch = np.expand_dims(cur_state, axis=0)
    for i in range(epochs):
        # Get reward predictions
        y_pred = model.predict(cur_batch)[0]
        # Convert reward predictions to a softmax probability distribution
        av_softmax = softmax(y_pred, tau=2.0)
        # Normalize to make sure it sums to 1
        av_softmax /= av_softmax.sum()
        # Choose an action based on the probability distribution
        choice = np.random.choice(NUM_WEBSITES, p=av_softmax)
        # Take the action, get the reward
        cur_reward = env.choose_arm(choice)
        # Build the training target: the predictions with the chosen arm's entry
        # replaced by the observed reward
        one_hot_reward = y_pred.copy()
        one_hot_reward[choice] = cur_reward
        model.train_on_batch(cur_batch, np.expand_dims(one_hot_reward, axis=0))
        # Update the current state
        cur_state = one_hot(NUM_WEBSITES, env.get_state())
        cur_batch = np.expand_dims(cur_state, axis=0)
        hist_reward.append(cur_reward)
        plt.scatter(i, np.mean(hist_reward))
    plt.show()
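
# Example entry point (an assumption, not part of the original listings). Each variant
# above would normally live in its own script, since the graph-mode, PyTorch, eager, and
# Keras versions reuse the same function names.
if __name__ == '__main__':
    train()   # or train_draw() / train_and_draw(), depending on the variant in this file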