# Environment
config.envs = multi_env(config.env_name, config.num_agents)
config.num_episodes = 1000
config.steps = 1000
config.state_size = config.envs.observation_space.shape[0]
config.action_size = config.envs.action_space.shape[0]

# Actor network
config.activ_actor = F.relu
config.lr_actor = 3e-4
config.hidden_actor = (512, 512)
config.optim_actor = Adam
config.grad_clip_actor = 5

# Critic network
config.activ_critic = F.relu
config.lr_critic = 3e-4
config.hidden_critic = (512, 512)
config.optim_critic = Adam
config.grad_clip_critic = 5

# PPO hyperparameters
config.gamma = 0.99
config.ppo_clip = 0.2
config.ppo_epochs = 10
config.ppo_batch_size = 32
config.ent_weight = 0.01
config.val_loss_weight = 1
config.use_gae = True
config.lamda = 0.95

# Solved criterion
config.env_solved = 1.0
config.times_solved = 10

# agent = A2CAgent(config)
agent = PPOAgent(config)
agent.train()
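# ---------------------------------------------------------------------------
# Hedged sketch (not part of the project code): how config.ppo_clip,
# config.ent_weight and config.val_loss_weight are typically combined into a
# clipped-surrogate PPO loss. The PPOAgent above may implement this
# differently; every tensor name here (log_probs, old_log_probs, advantages,
# returns, values, entropy) is a hypothetical stand-in for rollout data.
# ---------------------------------------------------------------------------
import torch


def ppo_loss(log_probs, old_log_probs, advantages, returns, values, entropy,
             ppo_clip=0.2, ent_weight=0.01, val_loss_weight=1.0):
    # Probability ratio between the current policy and the rollout policy.
    ratio = (log_probs - old_log_probs).exp()
    # Clipped surrogate objective: take the pessimistic (minimum) term.
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - ppo_clip, 1.0 + ppo_clip) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    # Value-function regression towards the (GAE) returns.
    value_loss = (returns - values).pow(2).mean()
    # Entropy bonus encourages exploration.
    return policy_loss + val_loss_weight * value_loss - ent_weight * entropy.mean()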
actor = PolicyModelConv(width, height, env_wrapper.env.action_space.n).cuda()
critic = PolicyModel(width, height).cuda()
icm = IntrinsicCuriosityModule(env_wrapper.env.action_space.n).cuda()

# One optimizer with separate learning rates for actor, ICM and critic.
optimizer = torch.optim.Adam([
    {'params': actor.parameters(), 'lr': lr_actor},
    {'params': icm.parameters(), 'lr': lr_icm},
    {'params': critic.parameters(), 'lr': lr_critic},
])

# https://www.aicrowd.com/challenges/neurips-2020-procgen-competition
# Challenge: generalize across 200 levels within 8 million time steps.
# Max batch size is limited by GPU memory: 64x64 * 2000 * nets_size.
# print(get_n_params(actor))

agent = PPOAgent(env_wrapper, actor, critic, icm, optimizer, name=args.model)

# Save the model every (8000000 / 4) / 2000 / 50.
# print(get_n_params(actor))
agent.train(2000, int(8000000 / motion_blur_c))
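# ---------------------------------------------------------------------------
# Hedged sketch (illustration only): the usual way an intrinsic curiosity
# module turns forward-model prediction error into a bonus that is added to
# the environment reward. The real IntrinsicCuriosityModule above almost
# certainly exposes a different interface; `phi`, `forward_model` and `eta`
# are hypothetical names used only for this sketch.
# ---------------------------------------------------------------------------
import torch


def intrinsic_reward(phi, forward_model, state, next_state, action_onehot, eta=0.01):
    """Curiosity bonus: scaled error of predicting the next-state embedding."""
    with torch.no_grad():
        feat = phi(state)                 # embed current observation
        next_feat = phi(next_state)       # embed next observation
        pred_next_feat = forward_model(torch.cat([feat, action_onehot], dim=-1))
        # Per-sample squared prediction error, scaled by eta.
        return eta * 0.5 * (pred_next_feat - next_feat).pow(2).sum(dim=-1)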
def cartpole(to_file=True, episodes=None):
    # Run until interrupted when no episode budget is given.
    loop_forever = episodes is None

    env = gym.make("CartPole-v0")
    results = {
        "loss": [],
        "episode_length": [],
        "entropy": [],
        "learning_rate": [],
    }
    agent = PPOAgent(4, 2)  # CartPole: 4-dimensional observation, 2 discrete actions

    i_episode = 0
    mean_losses = []
    mean_entropies = []
    mean_episode_lengths = []
    learning_rates = []

    while loop_forever or i_episode < episodes:
        observation = env.reset()
        episode_length = 0

        for timestep in range(200):
            prev_obs = observation
            action, action_prob = agent.act(prev_obs)
            observation, reward, done, _ = env.step(action)
            if done:  # the terminal transition is not stored
                break
            agent.store_transition(prev_obs, observation, action, action_prob, reward)
            episode_length = timestep

        # One PPO update per episode, tracked for later analysis.
        loss_mean, entropy_mean, learning_rate = agent.train()
        mean_losses.append(loss_mean)
        mean_entropies.append(entropy_mean)
        mean_episode_lengths.append(episode_length)
        learning_rates.append(learning_rate)

        results["loss"] = mean_losses
        results["entropy"] = mean_entropies
        results["episode_length"] = mean_episode_lengths
        results["learning_rate"] = learning_rates

        if i_episode % 100 == 0:
            log.info(f"Finished episode {i_episode}")

        if to_file:
            if i_episode % 100 == 0:
                with open("../pickles/ant_no_joints_cost/results.p", "wb") as file:
                    pickle.dump(results, file)
            if i_episode % 1000 == 0:
                agent.save(i_episode)

        i_episode += 1

    env.close()
    return results
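# ---------------------------------------------------------------------------
# Hedged usage sketch: run a finite number of episodes and plot the training
# curves from the returned results dict. The plotting is illustrative; only
# the dict keys used below ("episode_length", "loss") come from cartpole().
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    results = cartpole(to_file=False, episodes=500)

    import matplotlib.pyplot as plt

    plt.plot(results["episode_length"], label="episode length")
    plt.plot(results["loss"], label="mean loss")
    plt.xlabel("episode")
    plt.legend()
    plt.show()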