Example #1
def realistic(steps, eps, alpha):
    """Realistic constant step-size epsilon-greedy method.

    :param steps: Number of timesteps
    :type steps: int
    :param eps: Probability of choosing exploration over exploitation.
    :type eps: float
    :param alpha: Constant step-size
    :type alpha: float
    :return: Whether the chosen action was optimal at each step
    :rtype: list
    """
    q = list(np.random.normal(0, 1, size=10))  # true action values
    q_est = [0] * 10  # estimated action values
    optimals = list()
    optimal = argmax(q)
    for i in range(steps):
        # choose action
        if random.random() < 1 - eps:
            action = argmax(q_est)  # exploitation
        else:
            action = random.randint(0, 9)  # exploration
        optimals.append(action == optimal)  # check if the action had maximum value
        reward = np.random.normal(q[action], 1)  # get a reward
        q_est[action] += (reward - q_est[action]) * alpha  # constant step-size update

    return optimals
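All of the listings on this page use np, random, argmax, log, and exp without showing their imports. A minimal sketch of the shared module header they presumably rely on (whether the original argmax is NumPy's or a custom tie-breaking helper is an assumption):

# Shared imports assumed by the listings on this page (a sketch).
import random
from math import exp, log

import numpy as np
from numpy import argmax  # assumption: NumPy's argmax, which breaks ties by lowest index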
Example #2
def constant_step(steps, eps, alpha):
    """Nonstationary 10-armed bandit problem.
    Exponential recency-weighted average method (constant step-size).

    :param steps: Number of timesteps
    :type steps: int
    :param eps: Probability of choosing exploration over exploitation.
    :type eps: float
    :param alpha: Constant step-size
    :type alpha: float
    :return: Two lists: rewards and if the chosen action was optimal
    :rtype: tuple
    """
    q = np.random.normal(0, 1, size=10)  # true action values (kept as an array so the drift below is element-wise)
    q_est = [0] * 10  # estimated action values
    rewards = list()  # rewards on each step
    optimals = list()  # bool array: 1 - max action value, otherwise 0

    for i in range(steps):
        # choose action
        if random.random() < 1 - eps:
            action = argmax(q_est)  # exploitation
        else:
            action = random.randint(0, 9)  # exploration
        optimals.append(action == argmax(q))  # check if the action had maximum value
        rewards.append(np.random.normal(q[action], 1))  # get a normally distributed reward
        # update estimated values
        q_est[action] += (rewards[-1] - q_est[action]) * alpha

        # introduce true value fluctuations
        q += np.random.normal(0, 0.01, size=10)

    return rewards, optimals
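The "exponential recency-weighted average" named in the docstring is the closed form of this constant step-size update, Q_{n+1} = (1 - alpha)^n * Q_1 + sum_{i=1..n} alpha * (1 - alpha)^(n - i) * R_i. A small standalone check of that identity (the sample rewards and alpha below are arbitrary):

# Numeric check: the incremental constant step-size update equals the
# exponential recency-weighted average of past rewards.
alpha, q1 = 0.1, 0.0
sample_rewards = [1.0, -0.5, 2.0, 0.3]

q = q1
for r in sample_rewards:
    q += (r - q) * alpha  # incremental form used in the listing above

n = len(sample_rewards)
closed_form = (1 - alpha) ** n * q1 + sum(
    alpha * (1 - alpha) ** (n - i) * r  # weight on the i-th reward
    for i, r in enumerate(sample_rewards, start=1)
)
assert abs(q - closed_form) < 1e-12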
Example #3
def const_eps_greedy(steps, eps):
    """Nonstationary 10-armed bandit problem.
    Constant step-size (0.1) epsilon-greedy method.

    :param steps: Number of timesteps
    :type steps: int
    :param eps: Probability of choosing exploration over exploitation.
    :type eps: float
    :return: Rewards
    :rtype: list
    """
    q = np.random.normal(0, 1, size=10)                 # true action values (array form keeps the drift element-wise)
    q_est = [0] * 10                                    # estimated action values
    rewards = list()                                    # rewards on each step

    for i in range(steps):
        # choose an action
        if random.random() < 1 - eps:                   # exploitation
            action = argmax(q_est)
        else:                                           # exploration
            action = random.randint(0, 9)
        rewards.append(np.random.normal(q[action], 1))  # get a normally distributed reward for the action

        # update estimated values
        q_est[action] += (rewards[-1] - q_est[action]) * 0.1

        # introduce true value fluctuations
        q += np.random.normal(0, 0.01, size=10)
    return rewards
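A minimal usage sketch, assuming the functions on this page sit in one module with the imports shown under Example #1; the run count and horizon are arbitrary:

import numpy as np

RUNS, STEPS = 200, 1000
avg_reward = np.mean([const_eps_greedy(STEPS, 0.1) for _ in range(RUNS)], axis=0)
print(f"mean reward over the last 100 steps: {avg_reward[-100:].mean():.3f}")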
Example #4
def ucb(steps, c):
    """Stationary 10-armed bandit problem.
    Upper-Confidence-Bound Action Selection.

    :param steps: Number of time steps
    :type steps: int
    :param c: Degree of exploration
    :type c: float
    :return: Rewards
    :rtype: list
    """
    q = np.random.normal(0, 1, size=10)                 # true action values (array form keeps the drift element-wise)
    q_est = [0] * 10                                    # estimated action values
    action_counts = [0] * 10                            # action counter
    ucb_q_est = [5] * 10                                # ucb estimations
    rewards = list()

    for i in range(steps):
        action = argmax(ucb_q_est)                       # choose greedily w.r.t. the UCB estimates
        rewards.append(np.random.normal(q[action], 1))   # get action reward

        # update ucb estimations
        for j in range(10):
            if action_counts[j] != 0:
                sqrt = (log(i + 1) / action_counts[j]) ** 0.5  # ln(t), with t the number of completed steps
                ucb_q_est[j] = q_est[j] + c * sqrt
        action_counts[action] += 1                       # update action counter

        # update estimated values
        q_est[action] += (rewards[-1] - q_est[action]) / action_counts[action]

        # introduce true value fluctuations
        q += np.random.normal(0, 0.01, size=10)
    return rewards
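The bonus term implements the upper confidence bound Q_t(a) + c * sqrt(ln t / N_t(a)); the initial value of 5 keeps untried arms attractive until each has been pulled once. A small sweep over the degree of exploration c (run count and values are arbitrary):

import numpy as np

RUNS, STEPS = 100, 1000
for c in (0.5, 1.0, 2.0):
    mean_reward = np.mean([ucb(STEPS, c) for _ in range(RUNS)])
    print(f"c={c}: average reward per step = {mean_reward:.3f}")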
Example #5
def eps_greedy(steps, eps):
    """Stationary 10-armed bandit problem.
    Sample-average epsilon-greedy method.

    :param steps: Number of timesteps
    :type steps: int
    :param eps: Probability of choosing exploration over exploitation.
    :type eps: float
    :return: Rewards
    :rtype: list
    """
    q = list(np.random.normal(0, 1, size=10))  # true action values
    q_est = [0] * 10  # estimated action values
    action_counts = [0] * 10  # action counter
    rewards = list()  # rewards on each step

    for i in range(steps):
        # choose an action
        if random.random() < 1 - eps:  # exploitation
            action = argmax(q_est)
        else:  # exploration
            action = random.randint(0, 9)
        action_counts[action] += 1  # update action counter
        rewards.append(np.random.normal(q[action], 1))  # get a normally distributed reward for the action

        # update estimated values
        q_est[action] += (rewards[-1] - q_est[action]) / action_counts[action]

    return rewards
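The sample-average update is just the incremental form of the arithmetic mean, Q_{n+1} = Q_n + (R_n - Q_n) / n. A standalone numeric check (the reward values are arbitrary):

sample_rewards = [0.4, -1.2, 0.9, 2.1, 0.0]

q, n = 0.0, 0
for r in sample_rewards:
    n += 1
    q += (r - q) / n  # incremental sample-average update

assert abs(q - sum(sample_rewards) / len(sample_rewards)) < 1e-12  # equals the plain mean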
Example #6
def optimistic(steps, alpha):
    """Optimistic constant step-size greedy method.

    :param steps: Number of timesteps
    :type steps: int
    :param alpha: Constant step-size
    :type alpha: float
    :return: Whether the chosen action was optimal at each step
    :rtype: list
    """
    q = list(np.random.normal(0, 1, size=10))  # true action values
    q_est = [5] * 10  # optimistic initial estimates, well above the true values
    optimals = list()
    optimal = argmax(q)
    for i in range(steps):
        a = argmax(q_est)  # greedy approach
        optimals.append(a == optimal)  # check if the action had maximum value
        reward = np.random.normal(q[a], 1)  # get a reward
        q_est[a] += (reward - q_est[a]) * alpha  # constant step-size update

    return optimals
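A sketch comparing the optimistic greedy method with the realistic epsilon-greedy method from Example #1 by the fraction of optimal actions chosen; the run count, eps and alpha below are arbitrary choices:

import numpy as np

RUNS, STEPS = 200, 1000
opt = np.mean([optimistic(STEPS, 0.1) for _ in range(RUNS)], axis=0)
real = np.mean([realistic(STEPS, 0.1, 0.1) for _ in range(RUNS)], axis=0)
print(f"optimal-action rate, last 100 steps: "
      f"optimistic={opt[-100:].mean():.2f}, realistic={real[-100:].mean():.2f}")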
Example #7
def sample_average(steps, eps):
    """Nonstationary 10-armed bandit problem.
    Sample-average epsilon-greedy method.

    :param steps: Number of timesteps
    :type steps: int
    :param eps: Probability of choosing exploration over exploitation.
    :type eps: float
    :return: Two lists: rewards and if the chosen action was optimal
    :rtype: tuple
    """

    q = np.random.normal(0, 1, size=10)  # true action values (kept as an array so the drift below is element-wise)
    q_est = [0] * 10  # estimated action values
    action_counts = [0] * 10  # action counter
    rewards = list()  # rewards on each step
    optimals = list()  # bool array: 1 - max action value, otherwise 0

    for i in range(steps):
        # choose action
        if random.random() < 1 - eps:
            action = argmax(q_est)  # exploitation
        else:
            action = random.randint(0, 9)  # exploration
        action_counts[action] += 1  # update action counter
        optimals.append(action == argmax(q))  # check if the action had maximum value
        rewards.append(np.random.normal(q[action], 1))  # get a normally distributed reward
        # update estimated values
        q_est[action] += (rewards[-1] - q_est[action]) / action_counts[action]

        # introduce some random value fluctuations
        q += np.random.normal(0, 0.01, size=10)

    return rewards, optimals
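A sketch of the usual nonstationary comparison between the sample-average and constant step-size methods, looking at late-step reward; the run count, eps and alpha are arbitrary:

import numpy as np

RUNS, STEPS = 100, 2000
sa = np.mean([sample_average(STEPS, 0.1)[0] for _ in range(RUNS)], axis=0)
cs = np.mean([constant_step(STEPS, 0.1, 0.1)[0] for _ in range(RUNS)], axis=0)
print(f"mean reward, last 500 steps: "
      f"sample-average={sa[-500:].mean():.3f}, constant step={cs[-500:].mean():.3f}")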
Example #8
def grad_bline(steps, alpha):
    """Stationary 10-armed bandit problem.
    Gradient Bandit Algorithm with baseline.

    :param steps: Number of timesteps
    :type steps: int
    :param alpha: Constant step-size
    :type alpha: float
    :return: Whether the chosen action was optimal at each step
    :rtype: list
    """
    q = list(np.random.normal(4, 1, size=10))  # true action values
    h = [0] * 10  # action preferences
    p = [0.1] * 10  # probabilities of choosing an action
    mean = 0  # mean reward initialisation
    optimals = list()  # list of bools
    optimal = argmax(q)

    for i in range(steps):

        action = random.choices(range(10), weights=p, k=1)[0]  # sample an action from the softmax probabilities
        optimals.append(action == optimal)  # check if the action had maximum value
        reward = np.random.normal(q[action], 1)  # get action reward

        # update preferences
        mean = mean + (reward - mean) / (i + 1)  # incremental mean of all rewards (baseline)
        h_exps = []
        for j, _ in enumerate(h):
            if j == action:
                h[j] = h[j] + alpha * (reward - mean) * (1 - p[j])  # preference for the chosen action
            else:
                h[j] = h[j] - alpha * (reward - mean) * p[j]  # preferences for the other actions
            h_exps.append(exp(h[j]))  # exponents for each preference

        # update action probabilities
        h_exps_sum = sum(h_exps)
        p = [x / h_exps_sum for x in h_exps]

    return optimals
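A usage sketch estimating how often the gradient bandit ends up picking the optimal arm; the run count and alpha are arbitrary:

import numpy as np

RUNS, STEPS = 200, 1000
opt_rate = np.mean([grad_bline(STEPS, 0.1) for _ in range(RUNS)], axis=0)
print(f"optimal-action rate over the last 100 steps: {opt_rate[-100:].mean():.2f}")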
Example #9
def optimistic_greedy(steps, q0):
    """Stationary 10-armed bandit problem.
    Constant step-size (0.1) greedy method with configurable initial estimates.

    :param steps: Number of timesteps
    :type steps: int
    :param q0: Initial value for q estimation
    :type q0: float
    :return: Rewards
    :rtype: list
    """
    q = list(np.random.normal(0, 1, size=10))                   # true action values
    q_est = [q0] * 10                                           # estimated action values
    rewards = list()                                            # rewards on each step

    for i in range(steps):
        action = argmax(q_est)                                  # choose action
        rewards.append(np.random.normal(q[action], 1))          # get action normally distributed reward
        q_est[action] += (rewards[-1] - q_est[action]) * 0.1    # update estimated values

    return rewards
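A sweep over the initial estimate q0, illustrating how optimistic initial values drive early exploration in the purely greedy method; the values and run count are arbitrary:

import numpy as np

RUNS, STEPS = 200, 1000
for q0 in (0.0, 2.0, 5.0):
    mean_reward = np.mean([optimistic_greedy(STEPS, q0) for _ in range(RUNS)])
    print(f"q0={q0}: average reward per step = {mean_reward:.3f}")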