Example #1
import pybullet_envs
import gym
import numpy as np
from sac_torch import Agent

if __name__ == '__main__':
    env = gym.make('InvertedPendulumBulletEnv-v0')
    #print(env.action_space.shape[0])
    agent = Agent(input_dims=env.observation_space.shape[0],
                  env=env,
                  n_actions=env.action_space.shape[0])
    n_games = 200
    best_score = env.reward_range[0]
    score_history = []
    load_checkpoints = False

    if load_checkpoints:
        agent.load_models()
        env.render(mode='human')

    for i in range(n_games):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.remember(observation, action, reward, observation_, done)
            if not load_checkpoints:
                agent.learn()
            observation = observation_
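        # The original snippet ends above, before any per-episode
        # bookkeeping. A minimal sketch of the usual tail, modeled on
        # Example #3 below and assuming the same Agent API:
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        if avg_score > best_score:
            best_score = avg_score
            if not load_checkpoints:
                agent.save_models()
        print(f"episode {i + 1}: score {score:.1f}, trailing 100-game avg {avg_score:.1f}")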

Example #2

# np.random.seed(42)

def one_hot_single_value(cur_val, total_vals):
    """Coverts cur_val into one-hot vector of size total_vals"""
    x = [0] * total_vals
    x[cur_val] = 1
    return x
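
# For reference, the helper maps a discrete state index to a one-hot list
# the Agent can consume, e.g.:
#   >>> one_hot_single_value(2, 4)
#   [0, 0, 1, 0]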


if __name__ == '__main__':
    env = SimpleBandit(num_states=10, num_actions=4, max_trajectory_len=20)
    n_states, n_actions = env.state_space.n, env.action_space.n

    agent = Agent(input_dims=[env.state_space.n],
                  env=env,
                  n_actions=env.action_space.n)
    n_games = 2000
    # uncomment this line and do a mkdir tmp && mkdir video if you want to
    # record video of the agent playing the game.
    # env = wrappers.Monitor(env, 'tmp/video', video_callable=lambda episode_id: True, force=True)
    filename = 'simple_bandit.png'

    figure_file = 'plots/' + filename

    best_score = env.reward_range[0]
    score_history = []
    load_checkpoint = False

    if load_checkpoint:
        agent.load_models()
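
The SimpleBandit snippet stops right after the checkpoint load, so its training loop is not shown. Presumably it mirrors Example #1, with each discrete state converted to a one-hot vector before it reaches the agent; a minimal sketch under that assumption (SimpleBandit and its reset/step API are taken on faith from the snippet above):

    for i in range(n_games):
        observation = one_hot_single_value(env.reset(), n_states)
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            observation_ = one_hot_single_value(observation_, n_states)
            agent.remember(observation, action, reward, observation_, done)
            if not load_checkpoint:
                agent.learn()
            score += reward
            observation = observation_
        score_history.append(score)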
Example #3
def main():
    Hyper.init()
    # See wrapper code for the environment in atari_image.py
    env = make_env(Constants.env_id)
    Hyper.n_actions = env.action_space.n
    shape = env.observation_space.shape
    agent = Agent(input_dims=shape, env=env, n_actions=env.action_space.n)
    filename = f"{Constants.env_id}_games{Hyper.n_games}_alpha{Hyper.alpha}.png"
    figure_file = f'plots/{filename}'

    best_ave_score = env.reward_range[0]
    best_score = 0
    score_history = []
    load_checkpoint = False
    if load_checkpoint:
        agent.load_models()
        env.render(mode='human')
    total_steps = 0
    game_id = 0
    for i in range(Hyper.n_games):
        game_id += 1
        if game_id % 20 == 0:
            Hyper.alpha = Hyper.alpha * 1.2
            Hyper.beta = Hyper.beta * 1.2
        observation = env.reset()
        done = False
        steps = 0
        score = 0
        while not done:
            # Sample action from the policy
            action = agent.choose_action(observation)

            # Sample transition from the environment
            new_observation, reward, done, info = env.step(action)
            steps += 1
            total_steps += 1

            # Store transition in the replay buffer
            agent.remember(observation, action, reward, new_observation, done)
            if not load_checkpoint:
                agent.learn()
            score += reward
            observation = new_observation
        score_history.append(score)
        avg_score = np.mean(score_history[-100:])
        if score > best_score:
            best_score = score

        if avg_score > best_ave_score:
            best_ave_score = avg_score
            if not load_checkpoint:
                agent.save_models()

        episode = i + 1
        print(
            f"episode {episode}: score {score}, best_score {best_score}, best ave score {best_ave_score}, trailing 100 games avg {avg_score}, steps {steps}, total steps {total_steps}"
        )

    print(f"total number of steps taken: {total_steps}")
    if not load_checkpoint:
        x = [i + 1 for i in range(Hyper.n_games)]
        plot_learning_curve(x, score_history, figure_file)
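
Examples #2 and #3 both depend on a plot_learning_curve helper that is not shown. A plausible sketch, assuming it plots the trailing 100-game average per episode (the name and call signature come from the code above; the body is an assumption):

import numpy as np
import matplotlib.pyplot as plt


def plot_learning_curve(x, scores, figure_file):
    # trailing average over (up to) the previous 100 scores, one point per episode
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        running_avg[i] = np.mean(scores[max(0, i - 100):(i + 1)])
    plt.plot(x, running_avg)
    plt.title('Running average of previous 100 scores')
    plt.savefig(figure_file)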
Example #4
# learned reward model: an ensemble of ensemble_size networks
rewardmodel = RewardModel(model_cuda,
                          n_actions,
                          n_spaces,
                          1,
                          512,
                          3,
                          ensemble_size=ensemble_size)
modelloss = []
rewards = []
# soft actor-critic agent used inside the planner
agent_cuda = args[5]
agent = Agent(agent_cuda,
              alpha=0.0003,
              beta=0.0003,
              reward_scale=2,
              env_id=env_id,
              input_dims=env.observation_space.shape,
              tau=0.005,
              env=env,
              batch_size=256,
              layer1_size=256,
              layer2_size=256,
              n_actions=n_actions)
# model-predictive control settings
horizon = int(args[9])
num_control_samples = 100
num_elite = 30
grad_steps = 10
mpc = MPCController(agent_cuda,
                    env,
                    horizon=horizon,
                    num_control_samples=num_control_samples,
                    num_elite=num_elite,
                    agent=agent,
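
The MPCController call above is cut off mid-argument list. The num_control_samples / num_elite pair suggests the controller plans with the cross-entropy method (CEM): sample candidate action sequences, score them with the learned model, then refit the sampling distribution to the top-scoring elites. A generic sketch of that inner loop, with all names illustrative rather than this project's actual API:

import numpy as np


def cem_plan(score_fn, horizon, action_dim,
             num_control_samples=100, num_elite=30, iters=5):
    """Generic CEM planner. score_fn maps a (horizon, action_dim) action
    sequence to a scalar return estimate, e.g. from a learned model."""
    mean = np.zeros((horizon, action_dim))
    std = np.ones((horizon, action_dim))
    for _ in range(iters):
        # sample candidate action sequences around the current distribution
        noise = np.random.randn(num_control_samples, horizon, action_dim)
        samples = mean + std * noise
        scores = np.array([score_fn(seq) for seq in samples])
        # refit mean/std to the elite (top-scoring) sequences
        elite = samples[np.argsort(scores)[-num_elite:]]
        mean, std = elite.mean(axis=0), elite.std(axis=0) + 1e-6
    # MPC: execute only the first planned action, then replan next step
    return mean[0]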
Example #5
import random

import gym
import numpy as np
import pybullet_envs  # registers the Bullet environments with gym
import torch
import matplotlib.pyplot as plt

from sac_torch import Agent

seed = 18095048
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

env_name = 'InvertedPendulumBulletEnv-v0'
env = gym.make(env_name)
if seed is not None:
    env.seed(seed)
state = env.reset()
# initializing a model
model = Agent(input_dims=env.observation_space.shape[0],
              env=env,
              n_actions=env.action_space.shape[0])

mean_rewards = []
for i in range(100):
    print('game ' + str(i))
    rewards = [model.train_on_env(env) for _ in range(100)]
    mean_rewards.append(np.mean(rewards))
    print("mean reward:%.3f" % (np.mean(rewards)))
    plt.figure(figsize=[9, 6])
    plt.title("Mean reward per 100 games")
    plt.plot(mean_rewards)
    plt.grid()
    # plt.show()
    plt.savefig('plots/SAC_learning_curve.png')
    plt.close()
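
Agent.train_on_env is not part of the snippet above. Judging from how it is called (once per game, returning a scalar), it presumably plays one episode, learns online, and returns the total reward. A hypothetical sketch of such a method (the name comes from the call above; the body is an assumption):

    def train_on_env(self, env):
        """Play one episode, learn after every step, return the total reward."""
        observation = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            action = self.choose_action(observation)
            observation_, reward, done, _ = env.step(action)
            self.remember(observation, action, reward, observation_, done)
            self.learn()
            total_reward += reward
            observation = observation_
        return total_reward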