import random


def solve(self, env, policy):
    """Run the environment by following the given policy; return the list of
    executed actions and the sum of rewards received on entering each state."""
    actions, energy = [], 0
    # Set state to the starting state of the environment.
    state, prevState = env.getStartingState(), None
    isTerminalState = False
    while not isTerminalState:
        # The policy holds the best action for a given state.
        act = policy.get(state)
        if act is None:
            act = random.choice(env.getActions(state))
        # Execute the selected action in the current state.
        state, reward, isTerminalState = env.do(state, act)
        actions.append(act)
        energy += reward
        # Safety cutoff: abort runs whose accumulated reward degenerates.
        if energy < -1000:
            break
    return actions, energy
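# ---------------------------------------------------------------------------
# _alpha and _policy_iteration are referenced by the learners below but are
# not defined in this section. What follows is a minimal sketch,
# reconstructed from the call sites alone; the actual implementations may
# well differ.
# ---------------------------------------------------------------------------

GAMMA = 0.9  # assumed discount factor; not specified in this section


def _alpha(n):
    # Assumed schedule: the convergence threshold passed as th=alpha(itr)
    # decays with the iteration count, demanding tighter convergence later.
    return 60.0 / (59.0 + n)


def _policy_iteration(transs, utils, policy, rewards, th=0.01,
                      R_plus=None, N_e=None):
    # Sketch of a bounded value-iteration sweep with greedy policy
    # extraction over the learned model. transs[s][a][s'] holds outcome
    # counts, utils maps states to utility estimates, policy maps states to
    # the currently recommended action, and rewards maps states to R(s).
    for _ in range(20):
        delta = 0.0
        for s in transs:
            qs = {}
            for a, outcomes in transs[s].items():
                total = sum(outcomes.values())
                if total == 0:
                    continue
                # Expected utility of taking action a, under the maximum-
                # likelihood transition model P(s'|s,a) = count / total.
                q = sum(cnt / float(total) * utils.get(s2, 0.0)
                        for s2, cnt in outcomes.items())
                # Optimistic exploration: value scarcely tried actions at
                # R_plus until they have been tried at least N_e times.
                if R_plus is not None and N_e is not None and total < N_e:
                    q = R_plus
                qs[a] = q
            if not qs:
                continue
            policy[s] = max(qs, key=qs.get)
            newU = rewards.get(s, 0.0) + GAMMA * qs[policy[s]]
            delta = max(delta, abs(newU - utils.get(s, 0.0)))
            utils[s] = newU
        if delta < th:
            break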
def adp_random_exploration(env, transs={}, utils={}, freqs={}, policy={},
                           rewards={}, **kwargs):
    """
    Active ADP (adaptive dynamic programming) learning algorithm that
    returns the best policy for a given environment env.

    The experience dictionaries (transs, utils, freqs, policy, rewards) can
    be empty if the agent has no experience with the environment, but they
    can also carry values from previous trials. The algorithm returns the
    number of iterations needed to reach a terminal state, together with
    the accumulated reward. For reference, see page 834 of the book.

    @param env: Environment.
    @param transs: A transition table (N_s'|sa) with outcome frequencies
        given state-action pairs, initially zero.
    @param utils: Utilities table.
    @param freqs: A table of frequencies (N_sa) for state-action pairs,
        initially zero.
    @param t: A parameter that trades off the best action against a random
        one.
    @param tStep: The step by which parameter t is incremented.
    @param alpha: Step size function.
    @param maxItr: Maximum number of iterations.
    """
    tStep = kwargs.get("tStep", 0.01)
    alpha = kwargs.get("alpha", _alpha)
    maxItr = kwargs.get("maxItr", 50)
    tFac = kwargs.get("tFac", 1.0)
    t = kwargs.get("currItrs", 0) / 5 if kwargs.get("remember", False) else 0
    minRnd = kwargs.get("minRnd", 0.0)

    itr = 0
    isTerminal = False
    state = env.getStartingState()
    rewardSum = 0

    # Get the possible actions for the current state.
    actions = env.getActions(state)
    _policy_iteration(transs, utils, policy, rewards, th=alpha(itr))
    bestAction = policy.get(state, random.choice(actions))

    while not isTerminal:
        if random.random() < max(minRnd, 1.0 / (tFac * (t + 1))) or bestAction is None:
            # Exploration event (roughly 1/t of the iterations) or no
            # recommendation yet: choose a random action.
            bestAction = random.choice(actions)

        # Execute either the policy's best action or the exploratory one.
        newState, reward, isTerminal = env.do(state, bestAction)
        rewards[newState] = reward
        rewardSum += reward

        # Initialize the visit counter when newState is seen for the first
        # time, then count the visit.
        freqs.setdefault(newState, 0)
        freqs[newState] += 1

        # Update the transition table: transs[state][action] maps each
        # outcome state to the number of times it was observed.
        transs.setdefault(state, {}).setdefault(bestAction, {}).setdefault(newState, 0)
        transs[state][bestAction][newState] += 1

        actions = env.getActions(newState)
        for ac in actions:
            transs.setdefault(newState, {}).setdefault(ac, {})

        _policy_iteration(transs, utils, policy, rewards, th=alpha(itr))
        # From the book: having obtained a utility function U that is
        # optimal for the learned model, the agent can extract an optimal
        # action by one-step look-ahead to maximize the expected utility;
        # alternatively, since policy iteration is used, the optimal policy
        # is already available, so it simply executes the action the policy
        # recommends.
        bestAction = policy.get(newState, random.choice(actions))

        state = newState

        # A GLIE scheme must eventually become greedy, so that the agent's
        # actions become optimal with respect to the learned (and hence the
        # true) model; that is why parameter t is incremented.
        t, itr = t + tStep, itr + 1
        if itr >= maxItr:
            break
    return itr, rewardSum
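# A minimal usage sketch for adp_random_exploration, kept as a comment
# because "GridEnvironment" is a hypothetical stand-in for any object
# implementing getStartingState(), getActions(state) and do(state, action);
# the parameter values below are illustrative, not prescribed.
#
#   env = GridEnvironment()
#   transs, utils, freqs, policy, rewards = {}, {}, {}, {}, {}
#   totalItrs = 0
#   for trial in range(100):
#       itrs, rewardSum = adp_random_exploration(
#           env, transs, utils, freqs, policy, rewards,
#           tStep=0.02, maxItr=200, minRnd=0.05,
#           remember=True, currItrs=totalItrs)
#       totalItrs += itrs
#
# The shared dictionaries accumulate experience across trials, so later
# trials start from the model learned in earlier ones.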
def adp_optimistic_rewards(env, transs={}, utils={}, freqs={}, policy={},
                           rewards={}, **kwargs):
    """
    Active ADP (adaptive dynamic programming) with optimistic reward
    estimates.

    @param env: Environment.
    @param transs: A transition table (N_s'|sa) with outcome frequencies
        given state-action pairs, initially zero.
    @param utils: Utilities table.
    @param freqs: A table of frequencies (N_sa) for state-action pairs,
        initially zero.
    @param R_plus: An optimistic estimate of the best possible reward
        obtainable in any state.
    @param N_e: The number of times a state-action pair is tried before the
        optimistic reward is replaced by the learned utility.
    @param alpha: Step size function.
    @param maxItr: Maximum number of iterations.
    """
    R_plus = kwargs.get("R_plus", 5)
    N_e = kwargs.get("N_e", 12)
    alpha = kwargs.get("alpha", _alpha)
    maxItr = kwargs.get("maxItr", 10)

    itr = 0
    isTerminal = False
    state = env.getStartingState()
    rewardSum = 0

    # Get the possible actions for the current state.
    actions = env.getActions(state)
    _policy_iteration(transs, utils, policy, rewards, R_plus=R_plus, N_e=N_e,
                      th=alpha(itr))
    bestAction = policy.get(state, random.choice(actions))

    while not isTerminal:
        if bestAction is None:
            # No recommendation for this state yet: choose a random action.
            bestAction = random.choice(actions)

        # Execute the recommended action.
        newState, reward, isTerminal = env.do(state, bestAction)
        rewards[newState] = reward
        rewardSum += reward

        # Initialize the visit counter when newState is seen for the first
        # time, then count the visit.
        freqs.setdefault(newState, 0)
        freqs[newState] += 1

        # Update the transition table: transs[state][action] maps each
        # outcome state to the number of times it was observed.
        transs.setdefault(state, {}).setdefault(bestAction, {}).setdefault(newState, 0)
        transs[state][bestAction][newState] += 1

        # Get the actions available in the new state.
        actions = env.getActions(newState)
        for ac in actions:
            transs.setdefault(newState, {}).setdefault(ac, {})

        _policy_iteration(transs, utils, policy, rewards, R_plus=R_plus,
                          N_e=N_e, th=alpha(itr))
        bestAction = policy.get(newState, random.choice(actions))

        state = newState
        itr += 1
        if itr >= maxItr:
            break
    return itr, rewardSum
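# The optimistic scheme above relies on the exploration function
#   f(u, n) = R_plus if n < N_e else u,
# i.e. a state-action pair is valued at the optimistic reward R_plus until
# it has been tried N_e times, after which its learned utility estimate u
# is trusted. A minimal sketch; the helper name is illustrative and does
# not appear in the original module:


def _exploration_estimate(u, n, R_plus=5, N_e=12):
    """Return the optimistic utility estimate f(u, n)."""
    return R_plus if n < N_e else u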