Example #1
import numpy as np

# GridWorldEnv is assumed to come from the local grid_world module used by the
# other examples on this page.
from grid_world import GridWorldEnv


class PolicyIteration:

    def __init__(self,
                 env=GridWorldEnv(),
                 discountingFactor=0.9,
                 convergenceThreshold=1e-4,
                 iterationThresholdValue=1000,
                 iterationThresholdPolicy=100,
                 mode='prod'):
        """Initialize the PolicyIteration Class
    Parameters
    env : (object)
    An instance of environment type
    discountingFactor : float
    The discounting factor for future rewards
    convergenceThreshold : float
    Threshold value for determining convergence
    iterationThresholdValue : int
    The maximum number of iteration to check for convergence of value
    iterationThresholdPolicy:
    The maximum number of iteration to check for convergence of policy
    mode : str
    Mode (prod/debug) indicating the run mode. Effects the information/ verbosity of messages.

    Examples
    --------
    policyIteration = PolicyIteration(env = GridWorldEnv(),mode='debug')
    """
        self.env = env
        self.gamma = discountingFactor
        self.th = convergenceThreshold
        self.maxIterValue = iterationThresholdValue
        self.maxIterPolicy = iterationThresholdPolicy
        self.stateCount = self.env.get_statespace_len()
        self.actionCount = self.env.get_actionspace_len()
        self.uniformActionProbability = 1.0 / self.actionCount
        self.stateDict = env.stateDict
        self.actionDict = env.actionDict
        self.mode = mode
        self.V = np.zeros(self.stateCount)
        self.Q = [np.zeros(self.actionCount) for s in range(self.stateCount)]
        self.Policy = np.zeros(self.stateCount)
        self.totalReward = 0
        self.totalSteps = 0
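
The constructor above only initializes the tables that policy iteration works on (V, Q, Policy). For context, here is a minimal, self-contained sketch of the policy-iteration loop itself; the toy two-state MDP, its transition table P, and all numeric values are assumptions chosen purely for illustration and are not taken from the class above.

import numpy as np

# P[s][a] is a list of (probability, next_state, reward) tuples (made-up toy MDP).
P = {
    0: {0: [(1.0, 0, 0.0)], 1: [(1.0, 1, 1.0)]},
    1: {0: [(1.0, 0, 0.0)], 1: [(1.0, 1, 0.0)]},
}
gamma, theta = 0.9, 1e-4
nS, nA = 2, 2

V = np.zeros(nS)
policy = np.zeros(nS, dtype=int)

stable = False
while not stable:
    # Policy evaluation: sweep until the value function stops changing.
    while True:
        delta = 0.0
        for s in range(nS):
            v = sum(p * (r + gamma * V[ns]) for p, ns, r in P[s][policy[s]])
            delta = max(delta, abs(v - V[s]))
            V[s] = v
        if delta < theta:
            break
    # Policy improvement: act greedily with respect to the current values.
    stable = True
    for s in range(nS):
        q = [sum(p * (r + gamma * V[ns]) for p, ns, r in P[s][a]) for a in range(nA)]
        best = int(np.argmax(q))
        if best != policy[s]:
            policy[s] = best
            stable = False

print(V, policy)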
Example #2
import pickle

# GridWorldEnv and GridWorldEnvConfig are assumed to be importable from the
# local grid_world module used by the other examples on this page.
from grid_world import GridWorldEnv, GridWorldEnvConfig


# The original snippet starts mid-docstring; this signature and the default
# for max_episode_len are reconstructed assumptions.
def run_policy(env, policy, n_episodes, max_episode_len=100):
    """Run a fixed policy in the environment and print each episode's outcome.

    :param env: environment to run the policy in
    :param policy: mapping from observation (state index) to action
    :param n_episodes: number of episodes
    :param max_episode_len: maximum number of time steps in an episode
    :return:
    """
    for episode in range(n_episodes):
        observation = env.reset()
        total_reward = 0

        for step in range(max_episode_len):
            env.render()

            action = policy[observation]
            observation, reward, done, info = env.step(action)
            total_reward += reward

            if done:
                env.render()
                break

        print(f'Episode {episode + 1} finished after {step + 1} steps '
              f'with total reward {total_reward:.3f}')


#env = gym.make('Taxi-v3')
env = GridWorldEnv(config=GridWorldEnvConfig())
#run_environment(env, n_episodes=1)

with open('grid_policy.h5', 'rb') as file:
    policy = pickle.load(file)
run_policy(env, policy, n_episodes=1)
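
For reference, the pickled policy loaded above could be produced by a short snippet along these lines; the array shape, dtype, and contents are assumptions, and the '.h5' filename is kept from the original even though the file is written with pickle rather than HDF5.

import pickle
import numpy as np

# Hypothetical example of writing a policy file like grid_policy.h5:
# one greedy action index per state (shape and values are made up).
policy = np.zeros(16, dtype=int)
with open('grid_policy.h5', 'wb') as file:
    pickle.dump(policy, file)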
Example #3
import os
from itertools import count
import torch
import torch.optim as optim
from actor import Actor
from critic import Critic
from grid_world import GridWorldEnv

state_size = 2
action_size = 4
lr = 1e-4

env = GridWorldEnv(width=4, height=3, start=(3, 0), goal=(0, 2))


def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns
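

# Quick sanity check of compute_returns with made-up numbers: masks are 1 for
# non-terminal steps and 0 where the episode ends, which cuts off bootstrapping.
# With gamma=0.5, next_value=0 and rewards [1, 0, 2], the discounted returns
# come out as [1.5, 1.0, 2.0].
print(compute_returns(next_value=0.0, rewards=[1.0, 0.0, 2.0],
                      masks=[1, 1, 1], gamma=0.5))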


def trainIters(actor, critic, n_iters):
    optimizerA = optim.Adam(actor.parameters())
    optimizerC = optim.Adam(critic.parameters())
    for epoch in range(n_iters):
        state = env.reset()
        log_probs = []
        values = []
Example #4
os.environ['TF_DISABLE_MKL'] = '1'
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.activations import softmax, relu, linear, tanh

from modern_keras import reinforce
from grid_world import GridWorldEnv, A, num_states
import numpy as np

if __name__ == "__main__":

    np.set_printoptions(precision=3, suppress=True)
    env = GridWorldEnv()

    actor_model = Sequential()
    actor_model.add(Dense(64, activation=relu))
    actor_model.add(Dense(64, activation=relu))
    actor_model.add(Dense(len(A), activation=softmax))

    base_line_model = Sequential()
    base_line_model.add(Dense(64, activation=tanh))
    base_line_model.add(Dense(64, activation=tanh))
    base_line_model.add(Dense(1, activation=linear))
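    # actor_model maps a state to a softmax distribution over the actions in A;
    # base_line_model maps a state to a single scalar used as the REINFORCE baseline.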

    reinforce(actor_model,
              base_line_model,
              env.deep_reset,
              env.deep_get_state,