Example #1
def test_train_1_episode():
    env = gym.make('CartPole-v0')
    n_state_dims = env.observation_space.shape[0]
    n_action_dims = env.action_space.n
    run_one_episode = partial(util.run_one_episode, env)
    policy_network = util.PolicyNetwork(n_state_dims,
                                        n_action_dims,
                                        hidden_units=[])
    _policy, _scores = ppo.train(run_one_episode, policy_network, n_episodes=1)
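These tests hand ppo.train an episode-runner built with functools.partial(util.run_one_episode, env). The real util.run_one_episode is not shown in this collection; the sketch below is only an assumption of what such a rollout helper could look like, using the old Gym step API (obs, reward, done, info) and a policy(state) callable that returns an action.

def run_one_episode(env, policy, max_steps=1000):
    """Hypothetical rollout helper: play one episode and return its transitions."""
    states, actions, rewards = [], [], []
    state = env.reset()
    for _ in range(max_steps):
        action = policy(state)  # assumption: the policy maps a state to an action
        next_state, reward, done, _info = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        state = next_state
        if done:
            break
    return states, actions, rewards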
Example #2
def test_train_pong_long(n_episodes=1000,
                         batchnorm=False,
                         device=device,
                         render=False,
                         **kwargs):
    env = gym.make('PongDeterministic-v4')
    run_one_episode = partial(pong.run_one_episode, env, render=render)
    policy_network = pong.PolicyNetwork(batchnorm=batchnorm, device=device)
    _policy, _scores = ppo.train(run_one_episode,
                                 policy_network,
                                 n_episodes=n_episodes,
                                 **kwargs)
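Note that test_train_pong_long defaults device=device, so it expects a module-level device to already exist; Example #6 below defines one the same way. A typical definition, assuming PyTorch:

import torch

# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")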
Example #3
def test_train_3k():
    torch.manual_seed(0)
    env = gym.make('CartPole-v0')
    n_state_dims = env.observation_space.shape[0]
    n_action_dims = env.action_space.n
    run_one_episode = partial(util.run_one_episode, env)
    policy_network = util.PolicyNetwork(n_state_dims,
                                        n_action_dims,
                                        hidden_units=[16])
    _policy, scores = ppo.train(run_one_episode,
                                policy_network,
                                n_episodes=3000,
                                alpha=1e-3,
                                gamma=1.,
                                entropy_beta=0.01,
                                weight_decay=0)
    assert np.mean(scores[-100:]) > 195.
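The final assertion checks Gym's solved criterion for CartPole-v0: an average return of at least 195 over the last 100 episodes. Assuming ppo.train returns per-episode scores as a list or array, the same check can be wrapped into a small early-stopping utility; this helper is only illustrative:

import numpy as np

def is_solved(scores, threshold=195.0, window=100):
    """Hypothetical helper: True once the trailing-window mean return clears the threshold."""
    return len(scores) >= window and np.mean(scores[-window:]) > threshold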
Example #4
def main():
    args = mujoco_arg_parser()
    logger.configure(dir=args.logdir)

    nenv = 1
    envs = []
    for i in range(nenv):
        e = gym.make(args.env)
        e.seed(args.seed + i)  #for repeatability
        e = Monitor(e, logger.get_dir(), allow_early_resets=True)
        envs.append(e)
    envs = DummyVecEnv(envs)
    envs = VecNormalize(envs)

    set_global_seeds(args.seed)  #for repeatability

    agent = MlpAgent(envs.observation_space.shape[0],
                     envs.action_space.shape[0])
    if args.checkpoint:
        agent.load_state_dict(torch.load(args.checkpoint))

    agent = train(agent,
                  envs,
                  N_steps=2048,
                  N_updates=args.updates,
                  batch_size=128,
                  lam=0.95,
                  gamma=0.99,
                  N_train_sample_epochs=10,
                  log_interval=1,
                  ent_coef=0.0,
                  lr=3e-4,
                  cliprange=0.2,
                  save_interval=100)

    if args.play:
        logger.log("Running trained model")
        obs = np.zeros((envs.num_envs, ) + envs.observation_space.shape)
        obs[:] = envs.reset()
        while True:
            actions = agent.step(obs)[0]
            obs[:] = envs.step(actions)[0]
            envs.render()
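Example #4 restores weights with agent.load_state_dict(torch.load(args.checkpoint)), so the training loop presumably writes checkpoints with the standard PyTorch counterpart; the helper and file name below are only illustrative. Note also that VecNormalize keeps running observation statistics, which would have to be persisted separately for a faithful replay of a trained agent.

import os

import torch

def save_checkpoint(agent, directory, name="agent_latest.pth"):
    """Illustrative counterpart to the load above: persist the agent's weights."""
    path = os.path.join(directory, name)
    torch.save(agent.state_dict(), path)
    return path

# e.g. save_checkpoint(agent, logger.get_dir()) inside the training loop,
# once every `save_interval` updates.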
Example #5
def test_train_pong_1_episode():
    env = gym.make('PongDeterministic-v4')
    run_one_episode = partial(pong.run_one_episode, env, render=True)
    policy_network = pong.PolicyNetwork()
    _policy, _scores = ppo.train(run_one_episode, policy_network, n_episodes=1)
Example #6
show_demo = False  #whether or not to make a video of the current progress
max_steps_in_demo_episode = 200  #number of steps to show in demo episode

#environment parameters
starting_floor = 0
total_floors = 1
worker_id = 1
env = create_env(starting_floor, total_floors, worker_id)
policy_actions = unpickle_object(
    'action_map')  #map going from policy actions to env actions
override_threshold = 2000  #score used to determine if agent is stuck

#deep learning setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ActorCritic(len(policy_actions)).to(device)
#model = unpickle_object('policy_model')
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
########################################################################
"""
Train the model.
"""
experiment = Experiment(api_key="47QJ41M89a6zNXZgS9sY6NQfI",
                        project_name="unity",
                        workspace="wbarich")

model = train(env, model, gamma, max_epochs, batch_size,
              epochs_before_printing, mini_batch_size, ppo_epochs,
              policy_actions, device, optimizer, max_steps_in_demo_episode,
              show_demo, override_threshold, experiment)
########################################################################
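The experiment object handed to train above is a Comet ML Experiment; inside the training loop it is presumably used to record progress. An illustrative logging helper using the Comet API (the metric names are assumptions):

from comet_ml import Experiment

def log_epoch(experiment, epoch, score, loss):
    """Illustrative helper: push one epoch's numbers to Comet."""
    experiment.log_metric("episode_score", score, step=epoch)
    experiment.log_metric("loss", loss, step=epoch)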
Example #7
import ppo
if __name__ == "__main__":
    train = True
    name = "LunarLander-v2"
    actor_size = 32
    critic_size = 256
    if train:
        env = ppo.train(
            environment_name=name,
            gamma=0.99,
            clip_ratio=0.1,
            pi_lr=2e-3,
            vf_lr=2e-3,
            k_epochs=4,
            update_every_j_timestep=32,
            max_episode_length=2_000,
            max_steps=500,
            critic_hidden_size=critic_size,
            actor_hidden_size=actor_size,
            render=False,
            random_seed=1,
            solved_reward=249,
            observationNormalization=False,
            actorGradientNormalization=0,
            normalizeAdvantage=False,
            initialization="orthogonal",  # normal = normal distribution, None
            advantageAlgorithm=
            "GAE",  # None = use A2C reward calculation o "GAE"
            # pathForBasePolicyToTrain=f"./model/ppo_{name}_policy_latest.pth",
            # pathForBaseCriticToTrain=f"./model/ppo_{name}_critic_latest.pth",
            coeficient_entropy=0.01,
Example #8
import ppo
if __name__ == "__main__":
    train = True
    observationNormalization = False
    if train:
        env = ppo.train(
            environment_name="CartPole-v0",
            solved_reward=200,
            gamma=0.99,
            clip_ratio=0.25,
            pi_lr=0.000_3,
            vf_lr=0.000_3,
            k_epochs=4,
            update_every_j_timestep=100,
            max_episode_length=500,
            max_steps=210,
            critic_hidden_size=200,
            actor_hidden_size=16,
            render=False,
            random_seed=1,
            observationNormalization=observationNormalization,
            actorGradientNormalization=5,
            advantageAlgorithm="GAE",
        )

    ppo.play_latest("CartPole-v0",
                    16,
                    plot=False,
                    observationNormalization=observationNormalization)
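All of these examples expose a clipping parameter (clip_ratio here, cliprange in Example #4). It refers to PPO's clipped surrogate objective, which limits how far the new policy's probability ratio may move away from the policy that collected the data. A minimal PyTorch sketch of that loss, assuming log-probabilities and advantages have already been gathered:

import torch

def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, clip_ratio=0.2):
    """PPO clipped surrogate: -E[min(r * A, clip(r, 1 - eps, 1 + eps) * A)]."""
    ratio = torch.exp(new_log_probs - old_log_probs)  # r_t(theta)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * advantages
    return -torch.min(unclipped, clipped).mean()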