Code example #1
    def __init__(self, environmentName):
        """
        Class for performing value iteration in the given environment.

        Parameters
        ----------
        environmentName : string
            Name of gym environment to utilize.

        Returns
        -------
        None.
        """
        self.env = gridworld.GridworldEnv()
        self.theta = 0.0001
        self.discount_factor = 0.9
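
The constructor above only stores the environment and the value-iteration hyperparameters. As a rough illustration of how they are typically used, here is a minimal sketch of a sweep method; the method name run_value_iteration and the reliance on self.env.P are assumptions, not code from the original project.

    def run_value_iteration(self):
        """Hypothetical sketch (not the project's code): sweep until the value
        function changes by less than self.theta."""
        import numpy as np  # assumed to be imported at module level in the original file

        V = np.zeros(self.env.nS)
        while True:
            biggest_change = 0
            for s in range(self.env.nS):
                # Bellman optimality backup: best expected return over all actions
                action_values = [
                    sum(p * (r + self.discount_factor * V[ns])
                        for p, ns, r, done in self.env.P[s][a])
                    for a in range(self.env.nA)
                ]
                best = max(action_values)
                biggest_change = max(biggest_change, abs(best - V[s]))
                V[s] = best
            if biggest_change < self.theta:
                return V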
Code example #2
import gym
import gridworld
import random
import time
from gym_minigrid.wrappers import *
import numpy as np

discount = 0.99

env = gridworld.GridworldEnv()
obs = env.reset()

policy = np.ones((env.nS, env.nA)) / env.nA
policy_old = np.zeros((env.nS, env.nA))
policy_delta = np.ones((env.nS, env.nA)) * 0.00001

v = np.zeros(env.nS)
stm = np.ones((env.nS, env.nS))

v_old = np.copy(v)
delta = np.ones(env.nS) * 0.00001


def update_value():
    global v_old, v
    while True:
        for s in range(env.nS):
            vs = 0
            for a in range(env.nA):
                state_transition_prob, s_next, reward, done = env.P[s][a][0]
                # Assumed continuation of the truncated excerpt: expected return under the current policy
                vs += policy[s][a] * state_transition_prob * (reward + discount * v_old[s_next])
            v[s] = vs
        # Stop once every state's value has changed by less than the tolerance
        if np.all(np.abs(v - v_old) < delta):
            return
        v_old = np.copy(v)
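
A brief usage sketch for the snippet above; reshaping to env.shape mirrors example #5 and is an assumption about this project.

update_value()
print(v.reshape(env.shape))  # state values of the uniform random policy, laid out on the grid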
Code example #3
import numpy as np
import gridworld as gw

# Create the environment
env = gw.GridworldEnv()


# The V table holds, for each state, the maximum return of taking the next step
# (first compute the return of each possible direction, then take the maximum)
# The Q table covers every possible move from every state
def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.

    Args:
        env: OpenAI environment. env.P represents the transition probabilities of the environment.
        theta: Stopping threshold. If the value function changes by less than theta
            for every state in one iteration, we are done.
        discount_factor: Gamma discount factor.

    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """

    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value for all actions in a given state.

        Args:
            state: The state to consider (int)
            V: The value to use as an estimator, Vector of length env.nS

        Returns:
            A vector of length env.nA with the expected value of each action.
        """
        # Completion of the truncated excerpt, following the env.P convention
        # used in the other examples on this page.
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A
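
The excerpt stops inside the helper. For context, here is a sketch of how value_iteration typically continues from this point; it is a standard completion, not code taken from this project's file.

    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            # Best achievable value from s under a one-step lookahead
            best_action_value = np.max(one_step_lookahead(s, V))
            delta = max(delta, np.abs(best_action_value - V[s]))
            V[s] = best_action_value
        if delta < theta:
            break

    # Extract the deterministic greedy policy from the converged values
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        best_action = np.argmax(one_step_lookahead(s, V))
        policy[s, best_action] = 1.0

    return policy, V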
Code example #4
import torch
import torch.nn as nn
import torch.optim as optim

import gridworld
# EnvConfig comes from elsewhere in this project and is not shown in the excerpt.


# The excerpt begins mid-function; the header below is reconstructed and presumably
# belongs to the `grid` converter passed to EnvConfig further down (an assumption).
def grid(step):
    return torch.from_numpy(
        step.state), step.action, step.reward, step.done, torch.from_numpy(
            step.state_prime)


def grid_to_tensor(step):
    return torch.tensor(step.state, dtype=torch.int), \
           torch.tensor(step.action, dtype=torch.int64),\
           step.reward, \
           torch.tensor(step.state_prime, dtype=torch.int)


#env = gym.make('CartPole-v0')
#env =

#config = EnvConfig('CartPole-v0', 4, prepro)
config = EnvConfig(gridworld.GridworldEnv(),
                   16,
                   grid,
                   grid_to_tensor,
                   max_steps=20)
env = config.env

q = nn.Linear(env.nS, env.nA)
#q.load_state_dict(torch.load('mountain_car.wgt'))
optimizor = optim.SGD(q.parameters(), lr=0.001)


def greedyestimate(obs):
    est = q(obs)
    act = torch.argmax(est, dim=1)
    return act
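
The greedy policy above expects a batched float input of width env.nS. A minimal usage sketch, assuming states are fed to the linear Q-network as one-hot vectors (the excerpt does not show the actual encoding), is:

state = 0                              # hypothetical state index
obs = torch.zeros(1, env.nS)           # batch of one observation
obs[0, state] = 1.0                    # one-hot encoding of the state (assumed)
action = greedyestimate(obs)           # index of the highest-value action
print(action.item())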
Code example #5
File: policy_iteration.py  Project: ghanley/OMSCS
            # Greedily update the policy
            if chosen_a != best_a:
                policy_stable = False
            policy[s] = np.eye(env.nA)[best_a]

        # If the policy is stable we've found an optimal policy. Return it
        iterations += 1
        if policy_stable:
            print('Value converged at iteration:', iterations)
            return policy, V


sizes = [5, 10, 20, 30, 50]
for size in sizes:
    print("Running PI Size: ", size)
    env = gridworld.GridworldEnv(shape=[size, size])

    tic = time.time()
    policy, v = policy_improvement(env)
    toc = time.time()
    elapsed_time = (toc - tic) * 1000
    print(f"Time to converge: {elapsed_time: 0.3} ms")

# print("Policy Probability Distribution:")
# print(policy)
# print("")

# print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
# print(np.reshape(np.argmax(policy, axis=1), env.shape))
# print("")
Code example #6
import gridworld
import numpy as np

grid = gridworld.GridworldEnv()


def policy_evaluation(policy, env, delta=0.0001, discount_factor=1.0):
    """Evaluate a Policy given the full dynamics of the environment.
    Args: policy: [S, A] matrix, env = environment with transition probabilities where
    where env.P[s][a] = (prob, next_state, reward, done),
    delta = change of value function,
    discount factor = how much we weight future rewards

    Returns: value of this policy
    """
    V = np.zeros(env.nS)  # initialize V(s) to be zero for all s
    while True:
        current_delta = 0
        for s in range(env.nS):
            v = 0
            for a, prob in enumerate(policy[s]):
                for trans_prob, next_state, reward, done in env.P[s][a]:
                    v += prob * trans_prob * (reward +
                                              discount_factor * V[next_state])
            current_delta = max(current_delta, np.abs(v - V[s]))
            V[s] = v

        if current_delta < delta:
            break
    return np.array(V)
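
A short usage sketch for the evaluator above, scoring the uniform random policy on the default grid; reshaping to grid.shape mirrors example #5 and is an assumption here.

uniform_policy = np.ones([grid.nS, grid.nA]) / grid.nA
V = policy_evaluation(uniform_policy, grid, discount_factor=1.0)
print(V.reshape(grid.shape))  # value of each cell under the random policy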