import random
from math import exp, log

import numpy as np
from numpy import argmax


def realistic(steps, eps, alpha):
    """Stationary 10-armed bandit problem.
    Constant step-size epsilon-greedy method with realistic (zero) initial estimates.

    :param steps: Number of timesteps
    :type steps: int
    :param eps: Probability of exploring instead of exploiting
    :type eps: float
    :param alpha: Constant step-size
    :type alpha: float
    :return: List of booleans: whether the chosen action was optimal
    :rtype: list
    """
    q = list(np.random.normal(0, 1, size=10))  # true action values
    q_est = [0] * 10  # estimated action values (realistic: start at zero)
    optimals = list()
    optimal = argmax(q)
    for i in range(steps):
        # choose an action
        if random.random() < 1 - eps:
            action = argmax(q_est)  # exploitation
        else:
            action = random.randint(0, 9)  # exploration
        optimals.append(action == optimal)  # check if the action had maximum value
        reward = np.random.normal(q[action], 1)  # get a reward
        q_est[action] += (reward - q_est[action]) * alpha  # update estimated values
    return optimals

def constant_step(steps, eps, alpha):
    """Nonstationary 10-armed bandit problem.
    Exponential recency-weighted average method (constant step-size).

    :param steps: Number of timesteps
    :type steps: int
    :param eps: Probability of exploring instead of exploiting
    :type eps: float
    :param alpha: Constant step-size
    :type alpha: float
    :return: Two lists: rewards and whether the chosen action was optimal
    :rtype: tuple
    """
    q = np.random.normal(0, 1, size=10)  # true action values (array, so they can drift)
    q_est = [0] * 10  # estimated action values
    rewards = list()  # rewards on each step
    optimals = list()  # bools: True if the chosen action was optimal
    for i in range(steps):
        # choose an action
        if random.random() < 1 - eps:
            action = argmax(q_est)  # exploitation
        else:
            action = random.randint(0, 9)  # exploration
        optimals.append(action == argmax(q))  # check if the action had maximum value
        rewards.append(np.random.normal(q[action], 1))  # get a normally distributed reward
        # update estimated values
        q_est[action] += (rewards[-1] - q_est[action]) * alpha
        # introduce true value fluctuations (random walk)
        q += np.random.normal(0, 0.01, size=10)
    return rewards, optimals

def const_eps_greedy(steps, eps):
    """Nonstationary 10-armed bandit problem.
    Constant step-size (0.1) epsilon-greedy method.

    :param steps: Number of timesteps
    :type steps: int
    :param eps: Probability of exploring instead of exploiting
    :type eps: float
    :return: Rewards
    :rtype: list
    """
    q = np.random.normal(0, 1, size=10)  # true action values (array, so they can drift)
    q_est = [0] * 10  # estimated action values
    rewards = list()  # rewards on each step
    for i in range(steps):
        # choose an action
        if random.random() < 1 - eps:
            action = argmax(q_est)  # exploitation
        else:
            action = random.randint(0, 9)  # exploration
        rewards.append(np.random.normal(q[action], 1))  # get a normally distributed reward
        # update estimated values
        q_est[action] += (rewards[-1] - q_est[action]) * 0.1
        # introduce true value fluctuations (random walk)
        q += np.random.normal(0, 0.01, size=10)
    return rewards

def ucb(steps, c):
    """Nonstationary 10-armed bandit problem.
    Upper-Confidence-Bound action selection.

    :param steps: Number of time steps
    :type steps: int
    :param c: Degree of exploration
    :type c: float
    :return: Rewards
    :rtype: list
    """
    q = np.random.normal(0, 1, size=10)  # true action values (array, so they can drift)
    q_est = [0] * 10  # estimated action values
    action_counts = [0] * 10  # action counter
    ucb_q_est = [5] * 10  # UCB estimates; untried actions keep a high value
    rewards = list()
    for i in range(steps):
        action = argmax(ucb_q_est)  # choose greedily w.r.t. the UCB estimates
        rewards.append(np.random.normal(q[action], 1))  # get action reward
        # update UCB estimates: Q(a) + c * sqrt(ln(t) / N(a))
        for j in range(10):
            if action_counts[j] != 0:
                bonus = (log(i + 1) / action_counts[j]) ** 0.5
                ucb_q_est[j] = q_est[j] + c * bonus
        action_counts[action] += 1  # update action counter
        # update estimated values (sample average)
        q_est[action] += (rewards[-1] - q_est[action]) / action_counts[action]
        # introduce true value fluctuations (random walk)
        q += np.random.normal(0, 0.01, size=10)
    return rewards

def eps_greedy(steps, eps):
    """Stationary 10-armed bandit problem.
    Sample-average epsilon-greedy method.

    :param steps: Number of timesteps
    :type steps: int
    :param eps: Probability of exploring instead of exploiting
    :type eps: float
    :return: Rewards
    :rtype: list
    """
    q = list(np.random.normal(0, 1, size=10))  # true action values
    q_est = [0] * 10  # estimated action values
    action_counts = [0] * 10  # action counter
    rewards = list()  # rewards on each step
    for i in range(steps):
        # choose an action
        if random.random() < 1 - eps:
            action = argmax(q_est)  # exploitation
        else:
            action = random.randint(0, 9)  # exploration
        action_counts[action] += 1  # update action counter
        rewards.append(np.random.normal(q[action], 1))  # get a normally distributed reward
        # update estimated values (sample average)
        q_est[action] += (rewards[-1] - q_est[action]) / action_counts[action]
    return rewards

def optimistic(steps, alpha):
    """Stationary 10-armed bandit problem.
    Optimistic constant step-size greedy method.

    :param steps: Number of timesteps
    :type steps: int
    :param alpha: Constant step-size
    :type alpha: float
    :return: List of booleans: whether the chosen action was optimal
    :rtype: list
    """
    q = list(np.random.normal(0, 1, size=10))  # true action values
    q_est = [5] * 10  # optimistic initial estimates (all set to +5)
    optimals = list()
    optimal = argmax(q)
    for i in range(steps):
        action = argmax(q_est)  # greedy action selection
        optimals.append(action == optimal)  # check if the action had maximum value
        reward = np.random.normal(q[action], 1)  # get a reward
        q_est[action] += (reward - q_est[action]) * alpha  # update estimated values
    return optimals

def sample_average(steps, eps):
    """Nonstationary 10-armed bandit problem.
    Sample-average epsilon-greedy method.

    :param steps: Number of timesteps
    :type steps: int
    :param eps: Probability of exploring instead of exploiting
    :type eps: float
    :return: Two lists: rewards and whether the chosen action was optimal
    :rtype: tuple
    """
    q = np.random.normal(0, 1, size=10)  # true action values (array, so they can drift)
    q_est = [0] * 10  # estimated action values
    action_counts = [0] * 10  # action counter
    rewards = list()  # rewards on each step
    optimals = list()  # bools: True if the chosen action was optimal
    for i in range(steps):
        # choose an action
        if random.random() < 1 - eps:
            action = argmax(q_est)  # exploitation
        else:
            action = random.randint(0, 9)  # exploration
        action_counts[action] += 1  # update action counter
        optimals.append(action == argmax(q))  # check if the action had maximum value
        rewards.append(np.random.normal(q[action], 1))  # get a normally distributed reward
        # update estimated values (sample average)
        q_est[action] += (rewards[-1] - q_est[action]) / action_counts[action]
        # introduce true value fluctuations (random walk)
        q += np.random.normal(0, 0.01, size=10)
    return rewards, optimals

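# Usage sketch (an addition, not part of the original code): compare the
# sample-average and constant step-size trackers on the nonstationary testbed
# by averaging the fraction of optimal actions over independent runs.  The
# function name and the default run/step counts are illustrative assumptions.
def compare_nonstationary(runs=200, steps=2000, eps=0.1, alpha=0.1):
    """Return the averaged %-optimal-action curves of both trackers."""
    sa_curve = np.zeros(steps)  # sample_average
    cs_curve = np.zeros(steps)  # constant_step
    for _ in range(runs):
        _, optimals = sample_average(steps, eps)
        sa_curve += np.array(optimals, dtype=float)
        _, optimals = constant_step(steps, eps, alpha)
        cs_curve += np.array(optimals, dtype=float)
    return sa_curve / runs, cs_curve / runs
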
def grad_bline(steps, alpha):
    """Stationary 10-armed bandit problem.
    Gradient bandit algorithm with a reward baseline.

    :param steps: Number of timesteps
    :type steps: int
    :param alpha: Constant step-size
    :type alpha: float
    :return: List of booleans: whether the chosen action was optimal
    :rtype: list
    """
    q = list(np.random.normal(4, 1, size=10))  # true action values
    h = [0] * 10  # action preferences
    p = [0.1] * 10  # probabilities of choosing each action (softmax over h)
    mean = 0  # mean reward (baseline)
    optimals = list()  # list of bools
    optimal = argmax(q)
    for i in range(steps):
        action = random.choices(range(10), weights=p, k=1)[0]  # sample an action
        optimals.append(action == optimal)  # check if the action had maximum value
        reward = np.random.normal(q[action], 1)  # get action reward
        # update the baseline: incremental formula for the mean reward
        mean = mean + (reward - mean) / (i + 1)
        # update preferences
        h_exps = []
        for j, _ in enumerate(h):
            if j == action:
                h[j] = h[j] + alpha * (reward - mean) * (1 - p[j])  # chosen action
            else:
                h[j] = h[j] - alpha * (reward - mean) * p[j]  # other actions
            h_exps.append(exp(h[j]))  # exponent of each preference
        # update action probabilities (softmax)
        h_exps_sum = sum(h_exps)
        p = [x / h_exps_sum for x in h_exps]
    return optimals

def optimistic_greedy(steps, q0):
    """Stationary 10-armed bandit problem.
    Constant step-size greedy method.

    :param steps: Number of timesteps
    :type steps: int
    :param q0: Initial value for q estimation
    :type q0: float
    :return: Rewards
    :rtype: list
    """
    q = list(np.random.normal(0, 1, size=10))  # true action values
    q_est = [q0] * 10  # estimated action values
    rewards = list()  # rewards on each step
    for i in range(steps):
        action = argmax(q_est)  # choose action
        rewards.append(np.random.normal(q[action], 1))  # get a normally distributed reward
        q_est[action] += (rewards[-1] - q_est[action]) * 0.1  # update estimated values
    return rewards
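
# Minimal usage sketch (an addition, not part of the original code): average
# the reward curves of a few of the methods above over independent runs and
# print the mean reward over the final steps.  The run and step counts and
# the parameter values are arbitrary illustrative choices.
if __name__ == "__main__":
    runs, n_steps = 200, 1000
    eps_curve = np.zeros(n_steps)
    ucb_curve = np.zeros(n_steps)
    opt_curve = np.zeros(n_steps)
    for _ in range(runs):
        eps_curve += np.array(eps_greedy(n_steps, eps=0.1))
        ucb_curve += np.array(ucb(n_steps, c=2))
        opt_curve += np.array(optimistic_greedy(n_steps, q0=5))
    print("mean reward over the last 100 steps (averaged over %d runs):" % runs)
    print("  eps-greedy (eps=0.1):     %.3f" % np.mean(eps_curve[-100:] / runs))
    print("  UCB (c=2):                %.3f" % np.mean(ucb_curve[-100:] / runs))
    print("  optimistic greedy (q0=5): %.3f" % np.mean(opt_curve[-100:] / runs))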