from statistics import mean
from typing import List

# EnvWrapper and PPOAgent are assumed to be provided by the surrounding project.


def test_agent_ppo(actor, test_range: List[int]):
    # Evaluate the actor on one Procgen level per index in [test_range[0], test_range[1]).
    # Levels 0..200 were used for training, levels 200..400 are held out for testing;
    # reward and episode length are averaged over the episodes run per level.

    all_lvl_rewards = []
    all_lvl_steps_n = []
    for i in range(test_range[0], test_range[1]):
        env = EnvWrapper('procgen:procgen-starpilot-v0',
                         start_level=i,
                         num_levels=1)
        agent = PPOAgent(env, actor)

        lvl_rewards = []
        lvl_steps_n = []

        for _ in range(1):  # episodes per level; increase for a better per-level estimate
            s1 = env.reset()
            rewards = []
            steps_before_done = 0

            while True:
                s = s1
                steps_before_done += 1
                action, _, _ = agent.act(s)
                s1, r, d, _ = env.step(action)
                rewards.append(r)

                if d:
                    break

            lvl_rewards.append(sum(rewards))
            lvl_steps_n.append(steps_before_done)

        all_lvl_rewards.append(mean(lvl_rewards))
        all_lvl_steps_n.append(mean(lvl_steps_n))

    return all_lvl_rewards, all_lvl_steps_n
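
A minimal usage sketch of the split described in the comments above; the pre-trained actor and the exact level ranges are assumptions for illustration:

# Hypothetical usage: evaluate a trained actor on the training and held-out level ranges.
train_rewards, train_steps = test_agent_ppo(actor, [0, 200])    # levels seen during training
test_rewards, test_steps = test_agent_ppo(actor, [200, 400])    # held-out levels
print(f"train reward: {mean(train_rewards):.2f}, test reward: {mean(test_rewards):.2f}")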
Example #2
def get_solution_brain_set():
    agent = PPOAgent(
        state_size=STATE_SIZE,
        action_size=ACTION_SIZE,
        seed=SEED,
        actor_critic_factory=lambda: PPO_Actor_Critic(
            actor_model=MLP(layer_sizes=(STATE_SIZE, 128, 128, ACTION_SIZE),
                            seed=SEED,
                            output_function=torch.nn.Tanh(),
                            with_batchnorm=BATCHNORM,
                            output_layer_initialization_fn=init_layer_within_range,
                            hidden_layer_initialization_fn=init_layer_inverse_root_fan_in,
                            activation_function=torch.nn.LeakyReLU(inplace=True),
                            dropout=DROPOUT),
            critic_model=MLP(layer_sizes=(STATE_SIZE, 128, 128, 1),
                             seed=SEED,
                             output_function=torch.nn.Tanh(),
                             with_batchnorm=BATCHNORM,
                             output_layer_initialization_fn=init_layer_within_range,
                             hidden_layer_initialization_fn=init_layer_inverse_root_fan_in,
                             activation_function=torch.nn.LeakyReLU(inplace=True),
                             dropout=DROPOUT),
            action_size=ACTION_SIZE,
            continuous_actions=True,
        ),
        optimizer_factory=lambda params: torch.optim.Adam(
            params, lr=LR, weight_decay=WEIGHT_DECAY, eps=EPSILON),
        batch_size=BATCH_SIZE,
    )

    crawler_brain = Brain(
        brain_name=BRAIN_NAME,
        action_size=ACTION_SIZE,
        state_shape=STATE_SIZE,
        observation_type='vector',
        agents=[agent],
    )
    brain_set = BrainSet(brains=[crawler_brain])
    return brain_set
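
The factory above assumes a set of module-level constants; a minimal sketch of what they might look like, with illustrative placeholder values only (not the repository's actual settings):

STATE_SIZE = 129              # observation dimension of the target environment (placeholder)
ACTION_SIZE = 20              # continuous action dimension (placeholder)
SEED = 0
BATCHNORM = False
DROPOUT = None
LR = 3e-4
WEIGHT_DECAY = 0.0
EPSILON = 1e-8                # Adam epsilon
BATCH_SIZE = 1024
BRAIN_NAME = 'CrawlerBrain'   # placeholder brain name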
Example #3
        actor = PolicyModelConv(width, height,
                                env_wrapper.env.action_space.n).cuda()

    critic = PolicyModel(width, height).cuda()
    icm = IntrinsicCuriosityModule(env_wrapper.env.action_space.n).cuda()

    optimizer = torch.optim.Adam([{
        'params': actor.parameters(),
        'lr': lr_actor
    }, {
        'params': icm.parameters(),
        'lr': lr_icm
    }, {
        'params': critic.parameters(),
        'lr': lr_critic
    }])

    # https://www.aicrowd.com/challenges/neurips-2020-procgen-competition
    # The challenge evaluates generalization across 200 levels within a budget of
    # 8 million time steps; GPU memory caps the batch at roughly
    # 64x64 observations * 2000 steps * the size of the networks.
    # print(get_n_params(actor))
    agent = PPOAgent(env_wrapper,
                     actor,
                     critic,
                     icm,
                     optimizer,
                     name=args.model)
    # Save the model every (8000000 / 4) / 2000 / 50 = 20 (see the note after this example).
    # print(get_n_params(actor))
    agent.train(2000, int(8000000 / motion_blur_c))
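
Reading the budget comments in this example, the schedule works out roughly as below; motion_blur_c = 4 and the meaning of the two agent.train arguments are assumptions made for this sketch:

total_budget = 8_000_000                # competition sample budget
motion_blur_c = 4                       # assumed frame-skip / blur factor
rollout_length = 2_000                  # first argument to agent.train above
iterations = total_budget // motion_blur_c // rollout_length   # 1000 training iterations
save_every = iterations // 50                                  # a checkpoint every 20 iterations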
Example #4
config.num_agents = 5
config.envs = multi_env(config.env_name, config.num_agents)
config.num_episodes = 1000
config.steps = 1000
config.state_size = config.envs.observation_space.shape[0]
config.action_size = config.envs.action_space.shape[0]
config.activ_actor = F.relu
config.lr_actor = 3e-4
config.hidden_actor = (512, 512)
config.optim_actor = Adam
config.grad_clip_actor = 5
config.activ_critic = F.relu
config.lr_critic = 3e-4
config.hidden_critic = (512, 512)
config.optim_critic = Adam
config.grad_clip_critic = 5
config.gamma = 0.99
config.ppo_clip = 0.2
config.ppo_epochs = 10
config.ppo_batch_size = 32
config.ent_weight = 0.01
config.val_loss_weight = 1
config.use_gae = True
config.lamda = 0.95  # GAE lambda (see the sketch after this example)
config.env_solved = 1.0
config.times_solved = 10

#agent = A2CAgent(config)
agent = PPOAgent(config)

agent.train()
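
With config.use_gae = True, config.gamma = 0.99 and config.lamda = 0.95, advantages are presumably computed with generalized advantage estimation; a minimal, self-contained sketch of that computation (not the repository's implementation):

import numpy as np

def gae_advantages(rewards, values, dones, gamma=0.99, lam=0.95):
    # values carries one extra bootstrap entry: len(values) == len(rewards) + 1
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        not_done = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * not_done - values[t]
        gae = delta + gamma * lam * not_done * gae
        advantages[t] = gae
    returns = advantages + np.asarray(values[:-1], dtype=np.float32)
    return advantages, returns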
Example #5
log_dir = f'./experiments/nolimit_holdem_ppo_result_adv_{evaluate_every}/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = PPOAgent(
        sess,
        action_num=env.action_num,
        train_every=train_every,
        state_shape=env.state_shape,
        replay_memory_init_size=memory_init_size,
        replay_memory_size=max_buffer_size,
        actor_layers=[64, 64],
        critic_layers=[64, 64],
    )
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents([agent, random_agent])
    eval_env.set_agents([agent, random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Uncomment the line below to verify that the graph is not modified on each
    # iteration, which helps catch memory leaks. It is left commented out here
    # because tf.train.Saver(), used later, also adds operations to the graph.
    # sess.graph.finalize()
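
The comment above refers to a tf.train.Saver() created later in the script; a minimal sketch of how checkpointing under log_dir might continue inside the session block (the training loop is omitted and the save cadence is an assumption):

    saver = tf.train.Saver()
    # ... run training episodes here, feeding transitions to the agent ...
    saver.save(sess, log_dir + 'model')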
Example #6
    # Object arrays of Python lists: one slot per action head for loss, entropy
    # and learning_rate, and a single slot for episode_length and returns.
    results = {
        "loss": np.zeros(shape=(8, ), dtype=object),
        "entropy": np.zeros(shape=(8, ), dtype=object),
        "learning_rate": np.zeros(shape=(8, ), dtype=object),
        "episode_length": np.zeros(shape=(1, ), dtype=object),
        "returns": np.zeros(shape=(1, ), dtype=object),
    }
    results["episode_length"][0] = []
    results["returns"][0] = []
    for i in range(8):
        results["loss"][i] = []
        results["entropy"][i] = []
        results["learning_rate"][i] = []

    action_heads = [PPOAgent(28, 5) for _ in range(8)]
    i_episode = 0
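
Each PPOAgent(28, 5) head above appears to control one sub-action from a shared 28-dimensional observation; a minimal sketch of querying the heads in a step loop (the observation and environment are assumptions, and act() is taken to return an action and its probability, as in the CartPole example below):

    actions, action_probs = [], []
    for head in action_heads:
        a, p = head.act(observation)
        actions.append(a)
        action_probs.append(p)
    # The composite action (one discrete choice per head) is then sent to the environment.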

print("loaded agent")


def discretize_action(a: int):
    if a == 0:
Example #7
import logging
import pickle

import gym

# PPOAgent is assumed to be imported from the surrounding project;
# the logger below is an assumed stand-in for the module's own logger.
log = logging.getLogger(__name__)


def cartpole(to_file=True, episodes=None):

    loop_forever = False
    if episodes is None:
        loop_forever = True

    env = gym.make("CartPole-v0")

    results = {
        "loss": [],
        "episode_length": [],
        "entropy": [],
        "learning_rate": [],
    }
    agent = PPOAgent(4, 2)
    i_episode = 0

    mean_losses = []
    mean_entropies = []
    mean_episode_lengths = []
    learning_rates = []

    while loop_forever or i_episode < episodes:

        observation = env.reset()
        episode_length = 0

        for timestep in range(200):  # CartPole-v0 caps episodes at 200 steps
            prev_obs = observation
            action, action_prob = agent.act(prev_obs)
            observation, reward, done, _ = env.step(action)
            if done:
                break
            agent.store_transition(prev_obs, observation, action, action_prob,
                                   reward)
            episode_length = timestep

        loss_mean, entropy_mean, learning_rate = agent.train()
        mean_losses.append(loss_mean)
        mean_entropies.append(entropy_mean)
        mean_episode_lengths.append(episode_length)
        learning_rates.append(learning_rate)

        results["loss"] = mean_losses
        results["entropy"] = mean_entropies
        results["episode_length"] = mean_episode_lengths
        results["learning_rate"] = learning_rates

        if i_episode % 100 == 0:
            log.info(f"Finished episode {i_episode}")
        if to_file:
            if i_episode % 100 == 0:
                with open("../pickles/ant_no_joints_cost/results.p",
                          "wb") as file:
                    pickle.dump(results, file)
            if i_episode % 1000 == 0:
                agent.save(i_episode)

        i_episode += 1

    env.close()
    return results
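
A minimal sketch of invoking the function above for a bounded run; writing to the pickle path is skipped so no directory setup is needed:

if __name__ == "__main__":
    results = cartpole(to_file=False, episodes=500)
    # Episode lengths approach 200 (the CartPole-v0 cap) as the policy improves.
    print(results["episode_length"][-10:])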