Exemple #1
0
def plot_trial(mdp_data):
    """Simulate one trial with the learned policy and save it as a GIF.

    Starting from the all-zero continuous state, actions are chosen with
    ``choose_action`` and the cart-pole is simulated until the discretized
    state equals ``mdp_data['num_states'] - 1``.  Each step is drawn with
    ``cart_pole.plot_cart``; the images ``frames/frame<t>.png`` are then
    read back and written to ``simulation.gif``.

    Args:
        mdp_data: Learned MDP data.  Must contain the key ``'num_states'``
            plus whatever ``choose_action`` reads from it.

    Raises:
        FileExistsError: If a ``frames`` directory already exists.
    """
    time = 0
    cart_pole = CartPole(Physics())
    state_tuple = (0., 0., 0., 0.)
    state = cart_pole.get_state(state_tuple)
    # NOTE(review): this first plot happens before 'frames' is created and
    # its frame is never added to the GIF -- confirm that is intended.
    cart_pole.plot_cart(state_tuple, time)
    os.mkdir('frames')  # scratch directory that holds the frames
    try:
        files = []
        # simulate a trial
        while True:
            time += 1
            action = choose_action(state, mdp_data)
            state_tuple = cart_pole.simulate(action, state_tuple)
            new_state = cart_pole.get_state(state_tuple)
            # presumably plot_cart saves frames/frame<time>.png; the GIF
            # step below reads exactly that path.
            cart_pole.plot_cart(state_tuple, time)
            files.append(f'frame{time}.png')
            if new_state == mdp_data['num_states'] - 1:
                break
            state = new_state
        # create gif file
        with imageio.get_writer('simulation.gif', mode='I') as writer:
            for filename in files:
                image = imageio.imread(f'frames/{filename}')
                writer.append_data(image)
    finally:
        # Remove the scratch directory even when the simulation or the GIF
        # export fails; otherwise the next call would crash in os.mkdir.
        shutil.rmtree("frames")
Exemple #2
0
 def __init__(self):
     """Create the CartPole environment and a network agent built from its specs.

     The agent is constructed with ``training=False``.
     """
     self.env = CartPole.env()
     env = self.env
     # Specs are queried in the order: time step, observation, action.
     agent_kwargs = {
         'time_step_spec': env.time_step_spec(),
         'observation_spec': env.observation_spec(),
         'action_spec': env.action_spec(),
         'training': False,
     }
     self.agent = network.Network(**agent_kwargs)
Exemple #3
0
def main():
    """Run the cart-pole learning loop and plot the learning curve.

    Every simulated step picks an action with ``choose_action``, advances
    the cart-pole, and records the transition and reward in ``mdp_data``.
    Whenever the failure state (``NUM_STATES - 1``) is reached, the MDP
    model and its value function are recomputed and a new trial is started
    from a random cart position.  The loop stops after
    ``NO_LEARNING_THRESHOLD`` consecutive failures whose value update
    reported convergence in one iteration, or once ``max_failures``
    failures have occurred.  The log of each trial length and a moving
    average of it are then plotted and saved to ``./control.pdf``.
    """
    # Display settings; only the (currently disabled) show_cart calls would
    # make use of pause_time and display_started.
    pause_time = 0.0001
    min_trial_length_to_start_display = 100
    display_started = min_trial_length_to_start_display == 0

    # Learning constants
    NUM_STATES = 163
    GAMMA = 0.995
    TOLERANCE = 0.01
    NO_LEARNING_THRESHOLD = 20
    failure_state = NUM_STATES - 1

    # Convergence should be reached well before this many failures.
    max_failures = 500

    # Bookkeeping: global step counter, step at which the current trial
    # began, and the length of every finished trial.
    elapsed = 0
    trial_start = 0
    time_steps_to_failure = []
    num_failures = 0

    cart_pole = CartPole(Physics())

    # Continuous state is (x, x_dot, theta, theta_dot); start at the origin.
    state_tuple = (0.0, 0.0, 0.0, 0.0)
    # `state` is the discrete index of the continuous state.
    state = cart_pole.get_state(state_tuple)

    mdp_data = initialize_mdp_data(NUM_STATES)

    consecutive_no_learning_trials = 0
    while consecutive_no_learning_trials < NO_LEARNING_THRESHOLD:
        action = choose_action(state, mdp_data)

        # Advance the dynamics by one step and discretize the result.
        state_tuple = cart_pole.simulate(action, state_tuple)
        elapsed += 1
        new_state = cart_pole.get_state(state_tuple)

        # Reward is -1 on entering the failure state and 0 otherwise.
        pole_fell = new_state == failure_state
        R = -1 if pole_fell else 0

        update_mdp_transition_counts_reward_counts(
            mdp_data, state, action, new_state, R)

        if not pole_fell:
            state = new_state
            continue

        # The pole fell: refit the model and recompute the value function.
        update_mdp_transition_probs_reward(mdp_data)
        if update_mdp_value(mdp_data, TOLERANCE, GAMMA):
            consecutive_no_learning_trials += 1
        else:
            consecutive_no_learning_trials = 0

        # Failure bookkeeping and reset for the next trial.
        num_failures += 1
        if num_failures >= max_failures:
            break
        print('[INFO] Failure number {}'.format(num_failures))
        trial_length = elapsed - trial_start
        time_steps_to_failure.append(trial_length)
        trial_start = elapsed

        if trial_length > min_trial_length_to_start_display:
            display_started = 1

        # Restart with the cart at a random x in [-1.1, 1.1) and at rest.
        x = -1.1 + np.random.uniform() * 2.2
        state_tuple = (x, 0.0, 0.0, 0.0)
        state = cart_pole.get_state(state_tuple)

    # Learning curve: log(trial length) against trial index ...
    log_tstf = np.log(np.array(time_steps_to_failure))
    num_trials = len(log_tstf)
    plt.plot(np.arange(len(time_steps_to_failure)), log_tstf, 'k')

    # ... plus a `window`-point moving average, shifted by half a window.
    window = 30
    kernel = np.full(window, 1 / window)
    smoothed = lfilter(kernel, 1, log_tstf)
    half_window = window // 2
    smoothed_x = np.arange(half_window, num_trials - half_window)
    plt.plot(smoothed_x, smoothed[window:num_trials], 'r--')

    plt.xlabel('Num failures')
    plt.ylabel('Log of num steps to failure')
    plt.savefig('./control.pdf')
Exemple #4
0
import tensorflow as tf

from env import CartPole
from agent import network

# Config GPU: make only the first GPU visible to TensorFlow, if there is one.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
if gpus:
    # Guarded because gpus[0] raises IndexError on a machine without a GPU.
    tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

# Environment
env = CartPole.env(gui=True)

# Agent, built from the environment's specs with training disabled
agent = network.Network(time_step_spec=env.time_step_spec(),
                        observation_spec=env.observation_spec(),
                        action_spec=env.action_spec(),
                        training=False)

# Run the agent forever: query an action, step the environment, render.
state = agent.get_initial_state()
while True:
    time_step = env.current_time_step()
    policy_step = agent.action(time_step, state)
    action, state, _ = policy_step
    _, reward, _, _ = env.step(action)
    # `time_step` is the step observed *before* env.step(); when it was the
    # last step of an episode, the agent state is reset.
    if time_step.is_last():
        state = agent.get_initial_state()

    # Debug
    print('Action: {}, Reward: {}'.format(action.numpy(), reward.numpy()))
    env.render()
Exemple #5
0
import time
import tensorflow as tf
from tf_agents.policies import random_tf_policy

from agent import network
from buffer import per
from env import CartPole
from criterion import ExpectedReturn
from helper.utils import parse_experiences

# Training environment, created with gui=False
train_env = CartPole.env(gui=False)

# Agent built from the environment's specs, with training enabled
agent = network.Network(
    time_step_spec=train_env.time_step_spec(),
    observation_spec=train_env.observation_spec(),
    action_spec=train_env.action_spec(),
    training=True,
)

# Metric used for evaluation
ER = ExpectedReturn()

# Prioritized replay buffer, sized by the agent's n-step setting and the
# environment's batch size
initial_collect_steps = 10000
replay_buffer = per.PrioritizedExperienceRelay(
    agent.data_spec,
    n_steps=agent.get_n_steps(),
    batch_size=train_env.batch_size,
)

# Init buffer
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=agent.time_step_spec,
Exemple #6
0
import tensorflow as tf
from tf_agents.policies import random_tf_policy
from tf_agents.utils import common

from env import CartPole
from src.agent import DQN_FC
from src.buffer import ReplayBuffer
from src.eval import ExpectedReturn

# tf_agents requires TF2 behaviour to be switched on first
tf.compat.v1.enable_v2_behavior()

# One environment factory yields separate training and evaluation
# environments, each paired with its display object
cartpole = CartPole.TfEnv()
train_env, train_display = cartpole.gen_env()
eval_env, eval_display = cartpole.gen_env()

# DQN agent, tied to a step counter variable that starts at 0
dqn = DQN_FC(train_env)
train_step_counter = tf.Variable(0)
agent = dqn.gen_agent(train_step_counter)

# Random policy over the training environment's specs
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
)

# Replay buffer matching the agent's collect data spec
replay_buffer = ReplayBuffer(agent.collect_data_spec,
                             batch_size=train_env.batch_size)
Exemple #7
0
from stable_baselines import DQN
from env import CartPole
from stable_baselines.deepq.policies import LnMlpPolicy

if __name__ == '__main__':
    # Build the environment and a DQN model using the LnMlpPolicy.
    env = CartPole()
    model = DQN(policy=LnMlpPolicy, env=env)
    print('start')

    # Train for a fixed budget of 10000 timesteps.
    model.learn(total_timesteps=10000)
Exemple #8
0
from env import CartPole

cartpole = CartPole.env(gui=True)

# Apply action 0 for 1000 steps, rendering after every step.
# `counter` runs from 1 to 1000, as it did with the manual increment.
for counter in range(1, 1001):
    cartpole.step(0)
    cartpole.render()