from statistics import mean
from typing import List


def test_agent_ppo(actor, test_range: List[int]):
    # Train levels for each env: 0..200 // mean over 100 tries
    # Test levels for each env: 200..400 // mean over 100 tries
    all_lvl_rewards = []
    all_lvl_steps_n = []
    for level in range(test_range[0], test_range[1]):
        env = EnvWrapper('procgen:procgen-starpilot-v0', start_level=level, num_levels=1)
        agent = PPOAgent(env, actor)
        lvl_rewards = []
        lvl_steps_n = []
        s1 = env.reset()
        for _ in range(1):  # one evaluation episode per level
            rewards = []
            steps_before_done = 0
            while True:
                s = s1
                steps_before_done += 1
                action, _, _ = agent.act(s)
                s1, r, d, _ = env.step(action)
                rewards.append(r)
                if d:
                    break
            lvl_rewards.append(sum(rewards))
            lvl_steps_n.append(steps_before_done)
        all_lvl_rewards.append(mean(lvl_rewards))
        all_lvl_steps_n.append(mean(lvl_steps_n))
    return all_lvl_rewards, all_lvl_steps_n
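# Usage sketch (not from the source): with a trained `actor`, evaluate on the
# training levels (0..200) and on the held-out levels (200..400) referenced in
# the comments above, then compare mean episode rewards.
train_rewards, train_steps = test_agent_ppo(actor, test_range=[0, 200])
test_rewards, test_steps = test_agent_ppo(actor, test_range=[200, 400])
print(f'train reward: {mean(train_rewards):.2f} | test reward: {mean(test_rewards):.2f}')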
def get_solution_brain_set():
    agent = PPOAgent(
        state_size=STATE_SIZE,
        action_size=ACTION_SIZE,
        seed=SEED,
        actor_critic_factory=lambda: PPO_Actor_Critic(
            actor_model=MLP(
                layer_sizes=(STATE_SIZE, 128, 128, ACTION_SIZE),
                seed=SEED,
                output_function=torch.nn.Tanh(),
                with_batchnorm=BATCHNORM,
                output_layer_initialization_fn=lambda l: init_layer_within_range(l),
                hidden_layer_initialization_fn=lambda l: init_layer_inverse_root_fan_in(l),
                activation_function=torch.nn.LeakyReLU(inplace=True),
                dropout=DROPOUT,
            ),
            critic_model=MLP(
                layer_sizes=(STATE_SIZE, 128, 128, 1),
                seed=SEED,
                output_function=torch.nn.Tanh(),
                with_batchnorm=BATCHNORM,
                output_layer_initialization_fn=lambda l: init_layer_within_range(l),
                hidden_layer_initialization_fn=lambda l: init_layer_inverse_root_fan_in(l),
                activation_function=torch.nn.LeakyReLU(inplace=True),
                dropout=DROPOUT,
            ),
            action_size=ACTION_SIZE,
            continuous_actions=True,
        ),
        optimizer_factory=lambda params: torch.optim.Adam(
            params, lr=LR, weight_decay=WEIGHT_DECAY, eps=EPSILON
        ),
        batch_size=BATCH_SIZE,
    )

    crawler_brain = Brain(
        brain_name=BRAIN_NAME,
        action_size=ACTION_SIZE,
        state_shape=STATE_SIZE,
        observation_type='vector',
        agents=[agent],
    )
    brain_set = BrainSet(brains=[crawler_brain])
    return brain_set
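# The factory above assumes these module-level constants. The values below are
# illustrative placeholders for a Unity Crawler-style setup, not the original
# hyperparameters.
BRAIN_NAME = 'CrawlerBrain'   # assumed brain name
STATE_SIZE = 129              # assumed vector observation size
ACTION_SIZE = 20              # assumed continuous action size
SEED = 0
LR = 3e-4
WEIGHT_DECAY = 0.0
EPSILON = 1e-8
BATCH_SIZE = 1024
BATCHNORM = False
DROPOUT = None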
actor = PolicyModelConv(width, height, env_wrapper.env.action_space.n).cuda()
critic = PolicyModel(width, height).cuda()
icm = IntrinsicCuriosityModule(env_wrapper.env.action_space.n).cuda()

optimizer = torch.optim.Adam([
    {'params': actor.parameters(), 'lr': lr_actor},
    {'params': icm.parameters(), 'lr': lr_icm},
    {'params': critic.parameters(), 'lr': lr_critic},
])

# https://www.aicrowd.com/challenges/neurips-2020-procgen-competition
# Challenge: generalize across 200 levels within 8 million time steps.
# The maximum batch size is bounded by GPU memory: 64x64 observations * 2000 steps * network size.
# print(get_n_params(actor))
agent = PPOAgent(env_wrapper, actor, critic, icm, optimizer, name=args.model)

# Save the model every (8000000 / 4) / 2000 / 50 updates.
agent.train(2000, int(8000000 / motion_blur_c))
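# Assumed definitions that would precede the snippet above (illustrative only):
# a Procgen environment wrapper, per-network learning rates, and the frame-merge
# constant that explains the 8000000 / 4 budget in the save-frequency comment.
env_wrapper = EnvWrapper('procgen:procgen-starpilot-v0', start_level=0, num_levels=200)
width, height = 64, 64                               # Procgen observations are 64x64 frames
lr_actor, lr_critic, lr_icm = 2.5e-4, 2.5e-4, 1e-4   # assumed learning rates
motion_blur_c = 4                                    # assumed frames merged per agent step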
config.num_agents = 5
config.envs = multi_env(config.env_name, config.num_agents)
config.num_episodes = 1000
config.steps = 1000
config.state_size = config.envs.observation_space.shape[0]
config.action_size = config.envs.action_space.shape[0]

config.activ_actor = F.relu
config.lr_actor = 3e-4
config.hidden_actor = (512, 512)
config.optim_actor = Adam
config.grad_clip_actor = 5

config.activ_critic = F.relu
config.lr_critic = 3e-4
config.hidden_critic = (512, 512)
config.optim_critic = Adam
config.grad_clip_critic = 5

config.gamma = 0.99
config.ppo_clip = 0.2
config.ppo_epochs = 10
config.ppo_batch_size = 32
config.ent_weight = 0.01
config.val_loss_weight = 1
config.use_gae = True
config.lamda = 0.95  # GAE lambda ('lamda' spelling avoids the Python keyword)
config.env_solved = 1.0
config.times_solved = 10

# agent = A2CAgent(config)
agent = PPOAgent(config)
agent.train()
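# Minimal sketch (assumption) of the `config` object filled in above: a plain
# attribute container created before those assignments. `multi_env` is assumed to
# build a vectorized copy of a continuous-control Gym environment.
class Config:
    """Attribute bag for agent and environment hyperparameters."""


config = Config()
config.env_name = 'Pendulum-v0'  # placeholder; any env with a Box action space fits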
log_dir = f'./experiments/nolimit_holdem_ppo_result_adv_{evaluate_every}/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:
    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = PPOAgent(
        sess,
        action_num=env.action_num,
        train_every=train_every,
        state_shape=env.state_shape,
        replay_memory_init_size=memory_init_size,
        replay_memory_size=max_buffer_size,
        actor_layers=[64, 64],
        critic_layers=[64, 64],
    )
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents([agent, random_agent])
    eval_env.set_agents([agent, random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Uncomment the next line to verify the graph is not updated on each iteration,
    # which helps identify memory leaks. It is left commented out here because
    # tf.train.Saver() below is a graph operation.
    # sess.graph.finalize()
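# Assumed setup preceding the snippet above. The API matches the RLCard toolkit,
# so the environments and schedule constants would typically be created along
# these lines (environment id, imports, and values are assumptions):
import rlcard
from rlcard.agents import RandomAgent
from rlcard.utils import set_global_seed

env = rlcard.make('no-limit-holdem')
eval_env = rlcard.make('no-limit-holdem')
evaluate_every = 100      # assumed: episodes between evaluations
train_every = 1           # assumed: train the PPO agent every step
memory_init_size = 1000   # assumed replay warm-up size
max_buffer_size = 100000  # assumed replay capacity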
results = {
    "loss": np.zeros(shape=(8,), dtype=object),
    "entropy": np.zeros(shape=(8,), dtype=object),
    "learning_rate": np.zeros(shape=(8,), dtype=object),
    "episode_length": np.zeros(shape=(1,), dtype=object),
    "returns": np.zeros(shape=(1,), dtype=object),
}
results["episode_length"][0] = []
results["returns"][0] = []
for i in range(8):
    results["loss"][i] = []
    results["entropy"][i] = []
    results["learning_rate"][i] = []

# Eight independent PPO heads, each acting on a 28-dimensional observation
# with 5 discrete actions.
action_heads = [PPOAgent(28, 5) for _ in range(8)]

i_episode = 0
print("loaded agent")


def discretize_action(a: int):
    if a == 0:
def cartpole(to_file=True, episodes=None):
    loop_forever = episodes is None
    env = gym.make("CartPole-v0")
    results = {
        "loss": [],
        "episode_length": [],
        "entropy": [],
        "learning_rate": [],
    }
    agent = PPOAgent(4, 2)  # CartPole: 4-dimensional observation, 2 discrete actions
    i_episode = 0
    mean_losses = []
    mean_entropies = []
    mean_episode_lengths = []
    learning_rates = []

    while loop_forever or i_episode < episodes:
        observation = env.reset()
        episode_length = 0
        for timestep in range(200):
            prev_obs = observation
            action, action_prob = agent.act(prev_obs)
            observation, reward, done, _ = env.step(action)
            if done:
                break
            agent.store_transition(prev_obs, observation, action, action_prob, reward)
            episode_length = timestep

        loss_mean, entropy_mean, learning_rate = agent.train()
        mean_losses.append(loss_mean)
        mean_entropies.append(entropy_mean)
        mean_episode_lengths.append(episode_length)
        learning_rates.append(learning_rate)

        results["loss"] = mean_losses
        results["entropy"] = mean_entropies
        results["episode_length"] = mean_episode_lengths
        results["learning_rate"] = learning_rates

        if i_episode % 100 == 0:
            log.info(f"Finished episode {i_episode}")
        if to_file:
            if i_episode % 100 == 0:
                with open("../pickles/ant_no_joints_cost/results.p", "wb") as file:
                    pickle.dump(results, file)
            if i_episode % 1000 == 0:
                agent.save(i_episode)
        i_episode += 1

    env.close()
    return results
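# Usage sketch: run a bounded number of episodes without writing pickles and
# report the last recorded episode length.
if __name__ == "__main__":
    results = cartpole(to_file=False, episodes=500)
    print("last episode length:", results["episode_length"][-1])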