def __init__(self, environmentName):
    """
    Class for performing value iteration in the given environment.

    Parameters
    ----------
    environmentName : string
        Name of gym environment to utilize.

    Returns
    -------
    None.
    """
    self.env = gridworld.GridworldEnv()
    self.theta = 0.0001
    self.discount_factor = 0.9
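# A minimal sketch of how a method on the class above might use self.env,
# self.theta, and self.discount_factor to run value iteration. The method name
# run_value_iteration and the numpy import are assumptions, not part of the
# original class.
import numpy as np

def run_value_iteration(self):
    """Sweep all states until no value changes by more than self.theta."""
    V = np.zeros(self.env.nS)
    while True:
        delta = 0
        for s in range(self.env.nS):
            # One-step lookahead: expected return of each action from state s.
            q_values = np.zeros(self.env.nA)
            for a in range(self.env.nA):
                for prob, next_state, reward, done in self.env.P[s][a]:
                    q_values[a] += prob * (reward + self.discount_factor * V[next_state])
            best_value = np.max(q_values)
            delta = max(delta, abs(best_value - V[s]))
            V[s] = best_value
        if delta < self.theta:
            break
    return V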
import gym
import gridworld
import random
import time
from gym_minigrid.wrappers import *
import numpy as np

discount = 0.99
env = gridworld.GridworldEnv()
obs = env.reset()

# Uniform random starting policy and value/policy bookkeeping arrays.
policy = np.ones((env.nS, env.nA)) / env.nA
policy_old = np.zeros((env.nS, env.nA))
policy_delta = np.ones((env.nS, env.nA)) * 0.00001
v = np.zeros(env.nS)
stm = np.ones((env.nS, env.nS))
v_old = np.copy(v)
delta = np.ones(env.nS) * 0.00001


def update_value():
    """Sweep all states, backing up v from the environment dynamics."""
    global v_old, v
    while True:
        for s in range(env.nS):
            vs = 0
            for a in range(env.nA):
                state_transition_prob, s_next, reward, done = env.P[s][a][0]
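                # Sketch of a continuation (an assumption; the original snippet
                # ends at the line above): accumulate the policy-weighted
                # one-step return, then sweep until v stops changing.
                vs += policy[s][a] * state_transition_prob * (reward + discount * v_old[s_next])
            v[s] = vs
        # Stop once every state's value changed by less than the threshold.
        if np.all(np.abs(v - v_old) < delta):
            break
        v_old = np.copy(v)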
import numpy as np
import gridworld as gw

# Create the environment
env = gw.GridworldEnv()

# The V table stores, for each state, the maximum return over the next step
# (first compute the return of each possible move, then take the maximum).
# The Q table evaluates every possible move from every state.


def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.

    Args:
        env: OpenAI environment. env.P represents the transition probabilities
            of the environment.
        theta: Stopping threshold. If the value of all states changes by less
            than theta in one iteration we are done.
        discount_factor: lambda time discount factor.

    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """

    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value of all actions in a given state.

        Args:
            state: The state to consider (int)
            V: The value to use as an estimator, vector of length env.nS

        Returns:
            A vector of length env.nA with the expected value of each action.
        """
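        # The helper body and the main loop below are a sketch of the standard
        # value-iteration pattern (assumptions, not necessarily the original
        # file's continuation).
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                # Expected return of action a: immediate reward plus the
                # discounted value of the successor state.
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A

    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            # Back up the best one-step value for state s.
            best_action_value = np.max(one_step_lookahead(s, V))
            delta = max(delta, np.abs(best_action_value - V[s]))
            V[s] = best_action_value
        # Stop once no state's value changed by more than theta.
        if delta < theta:
            break

    # Extract a deterministic optimal policy from the converged values.
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        policy[s, np.argmax(one_step_lookahead(s, V))] = 1.0
    return policy, V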
step.state), step.action, step.reward, step.done, torch.from_numpy(step.state_prime)


def grid_to_tensor(step):
    # Convert a transition into tensors usable by the linear Q network.
    return torch.tensor(step.state, dtype=torch.int), \
        torch.tensor(step.action, dtype=torch.int64), \
        step.reward, \
        torch.tensor(step.state_prime, dtype=torch.int)


#env = gym.make('CartPole-v0')
#env =
#config = EnvConfig('CartPole-v0', 4, prepro)
config = EnvConfig(gridworld.GridworldEnv(), 16, grid, grid_to_tensor, max_steps=20)
env = config.env

# Linear Q network: one output per action, trained with SGD.
q = nn.Linear(env.nS, env.nA)
#q.load_state_dict(torch.load('mountain_car.wgt'))
optimizor = optim.SGD(q.parameters(), lr=0.001)


def greedyestimate(obs):
    # Pick the action with the highest estimated action value.
    est = q(obs)
    act = torch.argmax(est, dim=1)
    return act
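# A minimal sketch of a single Q-learning update on the linear network above.
# The helper name td_update and the assumption that state, action, reward, and
# state_prime arrive as batched 1-D tensors (e.g. stacked grid_to_tensor
# outputs) are not part of the original file.
import torch
import torch.nn.functional as F

def td_update(q, optimizor, state, action, reward, state_prime, discount=0.99):
    # One-hot encode integer states so they match nn.Linear(env.nS, env.nA).
    s = F.one_hot(state.long(), num_classes=q.in_features).float()
    s_prime = F.one_hot(state_prime.long(), num_classes=q.in_features).float()
    # TD target: r + gamma * max_a' Q(s', a'), with no gradient through the target.
    with torch.no_grad():
        target = reward + discount * q(s_prime).max(dim=1).values
    prediction = q(s).gather(1, action.view(-1, 1)).squeeze(1)
    loss = F.mse_loss(prediction, target)
    optimizor.zero_grad()
    loss.backward()
    optimizor.step()
    return loss.item()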
            # Greedily update the policy
            if chosen_a != best_a:
                policy_stable = False
            policy[s] = np.eye(env.nA)[best_a]

        # If the policy is stable we've found an optimal policy. Return it
        iterations += 1
        if policy_stable:
            print('Policy converged at iteration:', iterations)
            return policy, V


sizes = [5, 10, 20, 30, 50]
for size in sizes:
    print("Running PI Size: ", size)
    env = gridworld.GridworldEnv(shape=[size, size])
    tic = time.time()
    policy, v = policy_improvement(env)
    toc = time.time()
    elapsed_time = (toc - tic) * 1000
    print(f"Time to converge: {elapsed_time:0.3f} ms")

    # print("Policy Probability Distribution:")
    # print(policy)
    # print("")

    # print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
    # print(np.reshape(np.argmax(policy, axis=1), env.shape))
    # print("")
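# The same harness can time the value_iteration function defined earlier for
# comparison. A sketch; it assumes value_iteration is in scope here, which the
# original file does not show.
for size in sizes:
    print("Running VI Size: ", size)
    env = gridworld.GridworldEnv(shape=[size, size])
    tic = time.time()
    vi_policy, vi_v = value_iteration(env)
    toc = time.time()
    print(f"Time to converge: {(toc - tic) * 1000:0.3f} ms")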
import gridworld
import numpy as np

grid = gridworld.GridworldEnv()


def policy_evaluation(policy, env, delta=0.0001, discount_factor=1.0):
    """Evaluate a policy given the full dynamics of the environment.

    Args:
        policy: [S, A] matrix mapping each state to action probabilities.
        env: environment with transition probabilities, where env.P[s][a] is a
            list of (prob, next_state, reward, done) tuples.
        delta: threshold on the change of the value function; iteration stops
            once no state changes by more than this.
        discount_factor: how much we weight future rewards.

    Returns:
        The value of this policy, a vector of length env.nS.
    """
    V = np.zeros(env.nS)  # initialize V(s) to zero for all s
    while True:
        current_delta = 0
        for s in range(env.nS):
            v = 0
            # Expected return under the policy and the environment dynamics.
            for a, prob in enumerate(policy[s]):
                for trans_prob, next_state, reward, done in env.P[s][a]:
                    v += prob * trans_prob * (reward + discount_factor * V[next_state])
            current_delta = max(current_delta, np.abs(v - V[s]))
            V[s] = v
        if current_delta < delta:
            break
    return np.array(V)
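# A short usage sketch: evaluate the uniform random policy on the default
# gridworld and print the value function reshaped onto the grid. The variable
# names random_policy and v_random are assumptions.
random_policy = np.ones([grid.nS, grid.nA]) / grid.nA
v_random = policy_evaluation(random_policy, grid)
print(np.reshape(v_random, grid.shape))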