def _thunk():
    # Factory that returns a fresh ContinuousCartPoleEnv instance
    env = ContinuousCartPoleEnv()
    return env
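Such a zero-argument factory is typically passed to a vectorized-environment wrapper so several copies of the environment can run in parallel. A minimal sketch of that usage, assuming a Gym version that provides gym.vector.SyncVectorEnv (the wrapper choice is an assumption, not part of the example above):

import gym
from continuous_cartpole import ContinuousCartPoleEnv

def _thunk():
    # Factory returning a fresh environment instance
    return ContinuousCartPoleEnv()

# Assumption: gym.vector.SyncVectorEnv is available (Gym >= 0.15)
envs = gym.vector.SyncVectorEnv([_thunk for _ in range(4)])
obs = envs.reset()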
Example #2
import gym
import sys

import numpy as np
import matplotlib.pyplot as plt

from continuous_cartpole import ContinuousCartPoleEnv

# Create the Cart-Pole game environment
env = ContinuousCartPoleEnv()

rewards_list = []
steps_list = []
num_episodes = 5
episodes_list = np.arange(1, num_episodes + 1)

# Number of episodes
for i_episode in range(num_episodes):
    print("")
    print("========= EPISODE %d =========" % (i_episode + 1))
    observation = env.reset()
    total_reward = 0

    # Number of time-steps
    for t in range(100):
        env.render()
        action = env.action_space.sample()  # Take random action
        observation, reward, done, info = env.step(action)
        total_reward += reward
        # Per-time-step debug output
        print("----------- Begin time-step %d ----------" % t)
Example #3
def train_agent(agent,
                desc='Agent1',
                file_name='agent1',
                runs=5,
                episodes=5000,
                time_steps=300,
                test_episodes=10,
                init_state=None,
                init_noise=None,
                model_dir='../save/models',
                data_dir='../save/stats',
                plt_dir='../save/plots',
                show=False):

    print_header(1, desc)

    run_train_stats = []
    run_test_stats = []

    for run in range(runs):
        print_header(2, 'RUN {}'.format(run + 1))
        print_header(3, 'Training')

        # Training
        env = ContinuousCartPoleEnv(reward_function=agent.reward_fun)

        # Clear weights
        agent.reset_parameters()

        # Train agent...
        stats = agent.train(env,
                            episodes,
                            time_steps,
                            initial_state=init_state,
                            initial_noise=init_noise)
        # ... and append statistics to list
        run_train_stats.append(stats)

        # Save agent checkpoint
        exp_model_dir = model_dir + '/' + file_name
        mkdir(exp_model_dir)
        with open(
                '{}/model_{}_run_{}_{}.pkl'.format(exp_model_dir,
                                                   file_name, run + 1,
                                                   timestamp()), 'wb') as f:
            pickle.dump(agent, f)

        # Run (deterministic) tests on the trained agent and save the statistics
        test_stats = test_agent(env,
                                agent,
                                run=run + 1,
                                episodes=test_episodes,
                                time_steps=time_steps,
                                initial_state=init_state,
                                initial_noise=init_noise,
                                render=show)
        run_test_stats.append(test_stats)

    # Concatenate stats for all runs ...
    train_rewards = []
    train_lengths = []
    train_losses = []
    test_rewards = []
    test_lengths = []

    for r in range(runs):
        train_rewards.append(run_train_stats[r].episode_rewards)
        train_lengths.append(run_train_stats[r].episode_lengths)
        train_losses.append(run_train_stats[r].episode_loss)
        test_rewards.append(run_test_stats[r].episode_rewards)
        test_lengths.append(run_test_stats[r].episode_lengths)

    train_rewards = np.array(train_rewards)
    train_lengths = np.array(train_lengths)
    train_losses = np.array(train_losses)
    test_rewards = np.array(test_rewards)
    test_lengths = np.array(test_lengths)

    # ... and collect them in a list of dictionaries (one entry for train, one for test)
    plot_stats = [{
        'run': 'train',
        'stats': {
            'rewards': train_rewards,
            'lengths': train_lengths,
            'losses': train_losses
        }
    }, {
        'run': 'test',
        'stats': {
            'rewards': test_rewards,
            'lengths': test_lengths
        }
    }]

    # ... and print their aggregate values
    print_header(1, 'Aggregate Stats')
    print_agg_stats(plot_stats)

    # Save Statistics
    exp_stats_dir = data_dir + '/' + file_name
    mkdir(exp_stats_dir)
    with open(
            '{}/stats_{}_{}.pkl'.format(exp_stats_dir, file_name, timestamp()),
            'wb') as f:
        pickle.dump(plot_stats, f)

    # Plot Statistics
    plot_run_stats(plot_stats, path=plt_dir, experiment=file_name, show=show)
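A hypothetical call to train_agent, assuming an agent object that exposes the reward_fun attribute and the reset_parameters() / train() methods used above (the agent class and all argument values are assumptions):

# Hypothetical usage; SomeActorCriticAgent is a placeholder, not a class from the project
agent = SomeActorCriticAgent(state_dim=4, action_dim=1)
train_agent(agent,
            desc='Baseline agent',
            file_name='baseline',
            runs=3,
            episodes=2000,
            time_steps=300,
            show=False)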
Example #4
def _thunk():
    # env = gym.make(ENV_NAME)
    env = ContinuousCartPoleEnv()
    return env
Example #5
import contextlib
from arg_parser import parse
from pathlib import Path
from continuous_cartpole import ContinuousCartPoleEnv
from cartpole_TD import TDLambda
from utils import D2C, Visualizer, reward_laplacian

if __name__ == '__main__':
    print('--- running main ---')
    args = parse()
    env = ContinuousCartPoleEnv(reward_function=reward_laplacian)
    state_dim = env.observation_space.shape[0]
    action_dim = args.action_dim
    d2c_converter = D2C(action_dim, env.action_space.low,
                        env.action_space.high)

    # --- choose algorithm and hyperparameters ---
    # dqn = DQN(state_dim, action_dim, gamma=0.99, d2c=d2c_converter)
    # state_dim, action_dim, gamma, trace_decay, alpha, d2c
    td_lam = TDLambda(state_dim,
                      action_dim,
                      gamma=args.gamma,
                      trace_decay=args.trace_decay,
                      alpha=args.alpha,
                      d2c=d2c_converter)

    episodes = args.episode
    time_steps = args.steps
    epsilon = args.epsilon
    render = args.render
Example #6
parser.add_argument('--smw',
                    action='store',
                    default=10,
                    help='Smoothing window.',
                    type=int)

args = parser.parse_args()

initial_state = initial_states[args.inist]
initial_noise = initial_noises[args.inirnd]

with open(args.file, 'rb') as f:
    agent = pickle.load(f)

reward_function = agent.reward_fun
env = ContinuousCartPoleEnv(reward_function=reward_function)

stats = test_agent(env,
                   agent,
                   episodes=args.ep,
                   time_steps=args.ts,
                   initial_state=initial_state,
                   initial_noise=initial_noise,
                   render=True,
                   deterministic=not args.stoc)

plt_stats = [{
    'run': 'test',
    'stats': {
        'rewards': stats.episode_rewards.reshape([1, args.ep]),
        'lengths': stats.episode_lengths.reshape([1, args.ep])
    }
}]
Example #7
######## Hyperparameters #########
max_nb_episodes = 1000
T = 1024  # time-steps collected per update cycle (update_time = N * T)
N = 1
update_time = N * T
K_epochs = 25
batch_size = 32
eps_clip = 0.1  # clipping range that limits how far the policy can move per update
gamma = 0.99
lr = 0.00025
betas = (0.9, 0.99)
action_std = 0.25
max_length_episode = 650
render = False
######## Environment #########
env = ContinuousCartPoleEnv(reward)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# torch.seed()
# env.seed()
# np.random.seed()

######## CUDA ##########
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

######## Initialization ########

running_reward = 0
avg_length = 0
avg_running_reward = 0
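The names eps_clip, K_epochs and action_std follow the usual PPO conventions, and lr / betas are the standard Adam settings. A minimal sketch of how they would typically reach the optimiser, assuming a PyTorch policy module (the policy network below is a placeholder, not the original model):

import torch

state_dim, action_dim = 4, 1          # cart-pole state, single continuous action
lr, betas = 0.00025, (0.9, 0.99)      # hyperparameters from the listing above

# Placeholder policy; in the original this would be the PPO actor-critic network
policy = torch.nn.Linear(state_dim, action_dim)
optimizer = torch.optim.Adam(policy.parameters(), lr=lr, betas=betas)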
Example #8
def train(args):
    env = ContinuousCartPoleEnv()
    STATE_SIZE = 4
    ACTION_SPACE_SIZE = 1

    actor = LunarLanderActor(state_size=STATE_SIZE,
                             num_actions=ACTION_SPACE_SIZE)
    critic = Critic(state_size=STATE_SIZE)
    agent = Agent(env,
                  actor_lr=args["ACTOR_LEARNING_RATE"],
                  critic_lr=args["CRITIC_LEARNING_RATE"],
                  actor_model=actor,
                  critic_model=critic,
                  device=args["DEVICE"],
                  gamma=args["GAMMA"])

    stats = {"episode_reward": deque([]), "del_ts": deque([])}

    if args["LOAD_PREVIOUS"]:
        print("Loading previously trained model")
        agent.load()

    for i in range(args["NUM_EPISODES"]):
        print("Starting episode", i)
        total = 0

        agent.start_episode()
        state = env.reset()

        num_step = 0
        done = False
        oup_noise = np.zeros(ACTION_SPACE_SIZE)
        while not done:
            action = agent.get_action(state)

            # Exploration strategy
            gauss_noise = np.random.normal(0,
                                           args["exploration_stddev"],
                                           size=ACTION_SPACE_SIZE)
            # Autocorrelated (Ornstein-Uhlenbeck-style) exploration noise
            oup_noise = gauss_noise + args["KAPPA"] * oup_noise
            target_action = torch.clamp(action + torch.Tensor(oup_noise),
                                        min=-1,
                                        max=1)

            new_state, reward, done, info = env.step(
                target_action.detach().numpy())
            transition = Transition(reward=reward,
                                    state=state,
                                    action=action,
                                    target_action=target_action,
                                    next_state=new_state)
            agent.step(transition)

            if (num_step % args["PRINT_EVERY"] == 0):
                print("\tStep", num_step, "for episode", i)
                print("\t", action, target_action)
                print("\tReward accumulated:", total)

            assert (type(target_action) == torch.Tensor)
            assert (target_action.requires_grad)
            assert (action.requires_grad)

            total += reward
            state = new_state
            num_step += 1

        # Learn from this episode
        agent.learn()

        if args["RENDER_ENV"]:
            env.render()

        if i % 1 == 0:  # checkpoint after every episode (interval of 1)
            agent.save()
            stats["episode_reward"].append(total)

            transitions, del_ts = agent.get_episode_stats()
            stats["del_ts"].extend(del_ts)

            print("Reward is ", total, "and average reward is",
                  total / num_step)

    return stats
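For reference, a hypothetical configuration dictionary covering every key that train() reads above (all values are assumptions, not taken from the original project):

# All values below are assumed for illustration only
args = {
    "ACTOR_LEARNING_RATE": 1e-4,
    "CRITIC_LEARNING_RATE": 1e-3,
    "DEVICE": "cpu",
    "GAMMA": 0.99,
    "LOAD_PREVIOUS": False,
    "NUM_EPISODES": 100,
    "exploration_stddev": 0.1,
    "KAPPA": 0.9,
    "PRINT_EVERY": 50,
    "RENDER_ENV": False,
}
stats = train(args)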
Example #9
import contextlib
from arg_parser import parse
from pathlib import Path
from continuous_cartpole import ContinuousCartPoleEnv
from actor_critic_discrete import ActorCritic
from utils import D2C, Visualizer
from utils import reward_carrot_stick

if __name__ == '__main__':
    print('--- running main ---')
    args = parse()
    env = ContinuousCartPoleEnv(reward_function=reward_carrot_stick)
    state_dim = env.observation_space.shape[0]
    action_dim = args.action_dim
    d2c_converter = D2C(action_dim, env.action_space.low,
                        env.action_space.high)

    # --- choose algorithm and hyperparameters ---
    actorcritic = ActorCritic(state_dim,
                              action_dim,
                              gamma=args.gamma,
                              d2c=d2c_converter)
    episodes = args.episode
    time_steps = args.steps
    render = args.render
    # --- run algorithm ---
    actorcritic.load_models()
    stats = actorcritic.train(env, episodes, time_steps)
    actorcritic.save_models()

    # --- visualize the results ---