Example no. 1
def train_and_draw():
    """
    Run softmax action selection for TRAINING_ITERATIONS plays and plot the running mean reward.
    """
    list_action_values = np.ones(N_ARMS)  # index=action, value=avg reward
    counts = np.zeros(N_ARMS)
    # initialise av distribution to be uniform at outset
    av_softmax = np.zeros(N_ARMS)  # prob. dist of action_values
    av_softmax[:] = 1.0 / N_ARMS

    plt.xlabel('Plays')
    plt.ylabel('Avg Rewards')

    hist_reward = []
    for i in range(TRAINING_ITERATIONS):
        choice = np.random.choice(list(range(N_ARMS)), p=av_softmax)
        counts[choice] += 1
        k = counts[choice]
        rwd = generate_reward(ARM_PROB[choice])
        old_avg = list_action_values[choice]
        new_avg = old_avg + (1 / k) * (rwd - old_avg)
        list_action_values[choice] = new_avg
        av_softmax = softmax(list_action_values)
        hist_reward.append(rwd)
        plt.scatter(i, np.mean(hist_reward))

    plt.show()
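Example no. 1 relies on module-level constants and helper functions that are not shown above. A minimal sketch of what they might look like, assuming a temperature-scaled softmax over the action values and a reward equal to the number of successes in a few Bernoulli trials (the constant values and the tau default are illustrative assumptions, not taken from the original):

import numpy as np
import matplotlib.pyplot as plt

# Assumed module-level constants (values are illustrative).
N_ARMS = 10
ARM_PROB = np.random.rand(N_ARMS)   # hypothetical per-arm success probabilities
TRAINING_ITERATIONS = 500

def softmax(action_values, tau=1.12):
    # Temperature-scaled softmax: larger tau flattens the distribution.
    prefs = np.exp(action_values / tau)
    return prefs / prefs.sum()

def generate_reward(prob, trials=10):
    # Hypothetical reward model: count of successes in `trials` Bernoulli(prob) draws.
    return int(np.sum(np.random.rand(trials) < prob))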
Example no. 2
def train_draw(epochs=5000, learning_rate=1e-2):
    """
    Main training loop: TensorFlow 1.x graph mode using Session.partial_run.
    """
    env = ContextBandit(NUM_WEBSITES)
    # Two-layer network: input and output widths equal the number of arms,
    # with one hidden layer in between.

    state_input = tf.placeholder(dtype=tf.float32, shape=(None, NUM_WEBSITES))
    dense_output = layers.Dense(HIDDEN_UNITS, activation='relu')(state_input)
    y_pred = layers.Dense(NUM_WEBSITES, activation='relu')(dense_output)
    one_hot_reward = tf.placeholder(dtype=tf.float32,
                                    shape=(None, NUM_WEBSITES))
    loss = tf.losses.mean_squared_error(labels=one_hot_reward,
                                        predictions=y_pred)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train = optimizer.minimize(loss)
    # partial_run cannot fetch the train op directly, so run it via a control
    # dependency and fetch a dummy tensor instead.
    with tf.control_dependencies([train]):
        dummy = tf.constant(0)
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)

    plt.xlabel('Plays')
    plt.ylabel('Avg Rewards')
    hist_reward = []

    for i in range(epochs):
        # Use partial_run so the graph is executed only once per play: fetch the
        # reward predictions first, then feed the observed reward and finish the
        # training step on the same run.
        partial_setup = sess.partial_run_setup([y_pred, dummy],
                                               [state_input, one_hot_reward])
        curr_state = np.expand_dims(one_hot(NUM_WEBSITES, env.get_state()),
                                    axis=0)
        curr_y_pred = sess.partial_run(partial_setup,
                                       y_pred,
                                       feed_dict={state_input: curr_state})[0]
        # Convert reward preds to softmax prob. distribution
        av_softmax = softmax(curr_y_pred, tau=2.0)
        # Normalize to make sure it sums to 1.
        av_softmax /= av_softmax.sum()
        # Choose action based on prob. distribution
        choice = np.random.choice(NUM_WEBSITES, p=av_softmax)
        # Take action, get reward
        cur_reward = env.choose_arm(choice)
        # Build the training target: copy the predictions and overwrite the
        # chosen arm's value with the observed reward.
        curr_one_hot_reward = curr_y_pred.copy()
        curr_one_hot_reward[choice] = cur_reward
        curr_one_hot_reward = np.expand_dims(curr_one_hot_reward, axis=0)
        sess.partial_run(partial_setup,
                         dummy,
                         feed_dict={one_hot_reward: curr_one_hot_reward})

        hist_reward.append(cur_reward)
        plt.scatter(i, np.mean(hist_reward))
    plt.show()
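Examples 2 through 5 share a ContextBandit environment and a one_hot helper that are not shown in this section, along with the NUM_WEBSITES and HIDDEN_UNITS constants. A minimal sketch under the assumption that the state is the index of the current website and each (state, arm) pair has its own reward probability (values and internals are illustrative):

import numpy as np

NUM_WEBSITES = 10   # assumed number of arms (and of states)
HIDDEN_UNITS = 100  # assumed hidden-layer width

def one_hot(n, idx):
    # Length-n one-hot vector with a 1 at position idx.
    vec = np.zeros(n, dtype=np.float32)
    vec[idx] = 1.0
    return vec

class ContextBandit:
    # Hypothetical contextual bandit: each state (website) has its own arm probabilities.
    def __init__(self, n_arms):
        self.n_arms = n_arms
        self.arm_probs = np.random.rand(n_arms, n_arms)  # row = state, column = arm
        self._update_state()

    def _update_state(self):
        self.state = np.random.randint(0, self.n_arms)

    def get_state(self):
        return self.state

    def choose_arm(self, arm):
        # Reward: number of successes in n_arms Bernoulli trials, then move to a new state.
        prob = self.arm_probs[self.state, arm]
        reward = int(np.sum(np.random.rand(self.n_arms) < prob))
        self._update_state()
        return reward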
Example no. 3
def train(epochs=5000, learning_rate=1e-2):
    """
    Main training loop: PyTorch.
    """
    # Two-layer network: input and output widths equal the number of arms,
    # with one hidden layer in between.
    model = th.nn.Sequential(th.nn.Linear(NUM_WEBSITES, HIDDEN_UNITS),
                             th.nn.ReLU(),
                             th.nn.Linear(HIDDEN_UNITS, NUM_WEBSITES),
                             th.nn.ReLU())

    loss_fn = th.nn.MSELoss()
    env = ContextBandit(NUM_WEBSITES)

    plt.xlabel('Plays')
    plt.ylabel('Avg Rewards')
    hist_reward = []

    # Get the current state and convert it to a PyTorch tensor (Variable).
    cur_state = Variable(th.Tensor(one_hot(NUM_WEBSITES, env.get_state())))
    optimizer = th.optim.Adam(model.parameters(), lr=learning_rate)
    for i in range(epochs):
        # Get reward predictions
        y_pred = model(cur_state)
        # Convert reward preds to softmax prob. distribution
        av_softmax = softmax(y_pred.data.numpy(), tau=2.0)
        # Normalize to make sure it sums to 1.
        av_softmax /= av_softmax.sum()
        # Choose action based on prob. distribution
        choice = np.random.choice(NUM_WEBSITES, p=av_softmax)
        # Take action, get reward
        cur_reward = env.choose_arm(choice)
        # Build the training target: copy the predictions and overwrite the
        # chosen arm's value with the observed reward.
        one_hot_reward = y_pred.data.numpy().copy()
        one_hot_reward[choice] = cur_reward
        reward = Variable(th.Tensor(one_hot_reward))
        loss = loss_fn(y_pred, reward)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update current state
        cur_state = Variable(th.Tensor(one_hot(NUM_WEBSITES, env.get_state())))

        hist_reward.append(cur_reward)
        plt.scatter(i, np.mean(hist_reward))
    plt.show()
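As a side note, torch.autograd.Variable has been a no-op wrapper around tensors since PyTorch 0.4, so the loop above can be written with plain tensors. A short sketch of the equivalent prediction and selection step, reusing the names defined in the example:

# Equivalent step with plain tensors (PyTorch >= 0.4); model, env, softmax,
# one_hot and NUM_WEBSITES are the names used in the example above.
cur_state = th.tensor(one_hot(NUM_WEBSITES, env.get_state()), dtype=th.float32)
y_pred = model(cur_state)
av_softmax = softmax(y_pred.detach().numpy(), tau=2.0)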
Example no. 4
def train_draw(epochs=5000, learning_rate=1e-2):
    """
    Main training loop: TensorFlow eager execution with GradientTape.
    """
    env = ContextBandit(NUM_WEBSITES)
    ffn_model = build_ffn_model()
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

    plt.xlabel('Plays')
    plt.ylabel('Avg Rewards')
    hist_reward = []

    for i in range(epochs):
        # Record the forward pass and loss under GradientTape so gradients can be
        # computed after the action is chosen and the reward observed.
        with tf.GradientTape() as tape:
            curr_state = np.expand_dims(one_hot(NUM_WEBSITES, env.get_state()), axis=0)
            curr_y_pred = ffn_model(curr_state)
            # Convert reward preds to softmax prob. distribution
            av_softmax = softmax(curr_y_pred[0], tau=2.0)
            # Normalize to make sure it sums to 1.
            av_softmax /= av_softmax.sum()
            # Choose action based on prob. distribution
            choice = np.random.choice(NUM_WEBSITES, p=av_softmax)
            # Take action, get reward
            cur_reward = env.choose_arm(choice)
            hist_reward.append(cur_reward)
            plt.scatter(i, np.mean(hist_reward))
            # Build the training target: copy the predictions and overwrite the
            # chosen arm's value with the observed reward.
            curr_y_target = np.copy(curr_y_pred)
            curr_y_target[0, choice] = cur_reward
            loss_value = tf.losses.mean_squared_error(labels=curr_y_target,
                                                      predictions=curr_y_pred)
        grads = tape.gradient(loss_value,
                              ffn_model.trainable_variables)
        optimizer.apply_gradients(zip(grads, ffn_model.trainable_variables),
                                  global_step=tf.train.get_or_create_global_step())

    plt.show()
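build_ffn_model is referenced in Examples 4 and 5 but not defined in this section. A minimal sketch, assuming the same two-layer ReLU architecture as the earlier examples and a tf.keras backend; the compile call is only needed for Example 5, which trains via model.train_on_batch:

def build_ffn_model():
    # Two-layer ReLU network mapping a one-hot state to per-arm reward estimates.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(HIDDEN_UNITS, activation='relu',
                              input_shape=(NUM_WEBSITES,)),
        tf.keras.layers.Dense(NUM_WEBSITES, activation='relu'),
    ])
    # Needed only for Example 5, which calls model.train_on_batch directly.
    model.compile(optimizer='adam', loss='mse')
    return model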
Example no. 5
def train(epochs=5000):
    """
    Main training loop: Keras model trained with train_on_batch.
    """
    model = build_ffn_model()
    env = ContextBandit(NUM_WEBSITES)

    plt.xlabel('Plays')
    plt.ylabel('Avg Rewards')
    hist_reward = []

    # Get current state
    cur_state = one_hot(NUM_WEBSITES, env.get_state())
    cur_batch = np.expand_dims(cur_state, axis=0)
    for i in range(epochs):
        # Get reward predictions
        y_pred = model.predict(cur_batch)[0]

        # Convert reward preds to softmax prob. distribution
        av_softmax = softmax(y_pred, tau=2.0)
        # Normalize to make sure it sums to 1.
        av_softmax /= av_softmax.sum()
        # Choose action based on prob. distribution
        choice = np.random.choice(NUM_WEBSITES, p=av_softmax)
        # Take action, get reward
        cur_reward = env.choose_arm(choice)
        # Build the training target: copy the predictions and overwrite the
        # chosen arm's value with the observed reward.
        one_hot_reward = y_pred.copy()
        one_hot_reward[choice] = cur_reward
        model.train_on_batch(cur_batch, np.expand_dims(one_hot_reward, axis=0))

        # Update current state
        cur_state = one_hot(NUM_WEBSITES, env.get_state())
        cur_batch = np.expand_dims(cur_state, axis=0)

        hist_reward.append(cur_reward)
        plt.scatter(i, np.mean(hist_reward))
    plt.show()
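All five examples call plt.scatter with np.mean(hist_reward) inside the training loop, which recomputes the mean and adds a separate artist on every play. An equivalent and much faster approach is to compute and plot the running mean once after training, reusing the hist_reward list built in the loops above:

running_mean = np.cumsum(hist_reward) / (np.arange(len(hist_reward)) + 1)
plt.plot(running_mean)
plt.xlabel('Plays')
plt.ylabel('Avg Rewards')
plt.show()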