config.envs = multi_env(config.env_name, config.num_agents)
config.num_episodes = 1000
config.steps = 1000
config.state_size = config.envs.observation_space.shape[0]
config.action_size = config.envs.action_space.shape[0]
config.activ_actor = F.relu
config.lr_actor = 3e-4
config.hidden_actor = (512, 512)
config.optim_actor = Adam
config.grad_clip_actor = 5
config.activ_critic = F.relu
config.lr_critic = 3e-4
config.hidden_critic = (512, 512)
config.optim_critic = Adam
config.grad_clip_critic = 5
config.gamma = 0.99
config.ppo_clip = 0.2
config.ppo_epochs = 10
config.ppo_batch_size = 32
config.ent_weight = 0.01
config.val_loss_weight = 1
config.use_gae = True
config.lamda = 0.95
config.env_solved = 1.0
config.times_solved = 10

#agent = A2CAgent(config)
agent = PPOAgent(config)

agent.train()
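
The update rule itself is left to PPOAgent; as a hedged sketch (an assumption about what ppo_clip, ent_weight and val_loss_weight typically control, not this agent's actual internals), the clipped surrogate objective looks roughly like this:

import torch

def ppo_loss(new_log_probs, old_log_probs, advantages, values, returns,
             entropy, clip_eps=0.2, ent_weight=0.01, val_loss_weight=1.0):
    # Probability ratio between the current policy and the behaviour policy.
    ratio = torch.exp(new_log_probs - old_log_probs)
    # Clipped surrogate objective: take the pessimistic of the two estimates.
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    policy_loss = -torch.min(unclipped, clipped).mean()
    # Critic regression towards the empirical returns, plus an entropy bonus.
    value_loss = (returns - values).pow(2).mean()
    return policy_loss + val_loss_weight * value_loss - ent_weight * entropy.mean()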
Example #2
    actor = PolicyModelConv(width, height,
                            env_wrapper.env.action_space.n).cuda()

    critic = PolicyModel(width, height).cuda()
    icm = IntrinsicCuriosityModule(env_wrapper.env.action_space.n).cuda()

    optimizer = torch.optim.Adam([{
        'params': actor.parameters(),
        'lr': lr_actor
    }, {
        'params': icm.parameters(),
        'lr': lr_icm
    }, {
        'params': critic.parameters(),
        'lr': lr_critic
    }])

    # https://www.aicrowd.com/challenges/neurips-2020-procgen-competition
    # Challenge: generalize across 200 levels within a budget of 8 million time steps.
    # Max batch size is limited by GPU memory: 64x64 observations * 2000 steps * network size.
    # print(get_n_params(actor))
    agent = PPOAgent(env_wrapper,
                     actor,
                     critic,
                     icm,
                     optimizer,
                     name=args.model)
    # SAVE MODEL EVERY (8000000/4) / 2000 / 50
    # print(get_n_params(actor))
    agent.train(2000, int(8000000 / motion_blur_c))
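
The optimizer above groups the actor, ICM and critic parameters so each network trains with its own learning rate. A minimal self-contained sketch of that pattern (the nn.Linear stand-ins and learning rates here are placeholders, not the real networks):

import torch
import torch.nn as nn

# Stand-in modules for illustration only; the real networks are the
# PolicyModelConv / IntrinsicCuriosityModule / PolicyModel instances above.
actor, icm, critic = nn.Linear(8, 4), nn.Linear(8, 8), nn.Linear(8, 1)

optimizer = torch.optim.Adam([
    {'params': actor.parameters(), 'lr': 3e-4},
    {'params': icm.parameters(), 'lr': 1e-4},
    {'params': critic.parameters(), 'lr': 1e-3},
])

# Each parameter group keeps its own settings, so one network's learning rate
# can be decayed or tuned without touching the others.
for group in optimizer.param_groups:
    group['lr'] *= 0.5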
Example #3
def cartpole(to_file=True, episodes=None):
    """Train a PPOAgent on CartPole-v0; run indefinitely when episodes is None."""

    loop_forever = False
    if episodes is None:
        loop_forever = True

    env = gym.make("CartPole-v0")

    results = {
        "loss": [],
        "episode_length": [],
        "entropy": [],
        "learning_rate": [],
    }
    agent = PPOAgent(4, 2)  # CartPole-v0: 4-dimensional observation, 2 discrete actions
    i_episode = 0

    mean_losses = []
    mean_entropies = []
    mean_episode_lengths = []
    learning_rates = []

    while loop_forever or i_episode < episodes:

        observation = env.reset()
        episode_length = 0

        for timestep in range(200):
            prev_obs = observation
            action, action_prob = agent.act(prev_obs)
            observation, reward, done, _ = env.step(action)
            if done:
                break
            agent.store_transition(prev_obs, observation, action, action_prob,
                                   reward)
            episode_length = timestep

        loss_mean, entropy_mean, learning_rate = agent.train()
        mean_losses.append(loss_mean)
        mean_entropies.append(entropy_mean)
        mean_episode_lengths.append(episode_length)
        learning_rates.append(learning_rate)

        results["loss"] = mean_losses
        results["entropy"] = mean_entropies
        results["episode_length"] = mean_episode_lengths
        results["learning_rate"] = learning_rates

        if i_episode % 100 == 0:
            log.info(f"Finished episode {i_episode}")
        if to_file:
            if i_episode % 100 == 0:
                with open("../pickles/ant_no_joints_cost/results.p",
                          "wb") as file:
                    pickle.dump(results, file)
            if i_episode % 1000 == 0:
                agent.save(i_episode)

        i_episode += 1

    env.close()
    return results
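
A possible way to call the function above for a short, bounded run that skips the pickle output (the episode count here is arbitrary):

if __name__ == "__main__":
    results = cartpole(to_file=False, episodes=500)
    # Mean episode length over the last 10 episodes as a quick health check.
    print(sum(results["episode_length"][-10:]) / 10)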