Example #1
def make_agents(env):
    load_path = "zoo/ppo_masking/final_model"
    model = PPO.load(load_path, env)
    random1 = RandomAgent(env)
    random2 = RandomAgent(env)
    random3 = RandomAgent(env)

    return [model, random1, random2, random3]
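PPO.load and RandomAgent are not defined in the snippet; PPO is presumably the Stable Baselines implementation (or a masked-PPO fork of it) and RandomAgent is project-specific. Purely as a rough sketch, a factory like this would be wired into the environment along these lines, reusing the set_agents call that appears in Example #2 (the constructor arguments are assumptions):

env = LoveLetterMultiAgentEnv(num_players=4)
env.seed(0)
agents = make_agents(env)  # trained PPO model plus three random opponents
env.set_agents(agents)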
Example #2
def train(load_path):
    env = LoveLetterMultiAgentEnv(num_players=4)
    env.seed(SEED)

    # Use the MuJoCo hyperparameters (but with timesteps_per_actorbatch doubled to cover more steps).
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
    #             optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        model = PPO(MlpPolicy, env)

    random_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    agents = [model, *random_agents]
    env.set_agents(agents)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(os.path.join(
        LOGDIR, "final_model"))  # probably never get to this point.

    env.close()
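The examples above do not show the module-level imports or constants they rely on. A minimal header that would make train runnable is sketched below; the Stable Baselines 3 import paths and the concrete constant values are assumptions, and LoveLetterMultiAgentEnv / RandomAgent come from the Love Letter project itself.

import os

from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.ppo import MlpPolicy

# Assumed training constants; the real values are not part of the example.
SEED = 17
LOGDIR = "zoo/ppo_example"
NUM_TIMESTEPS = int(1e7)
EVAL_FREQ = 10_000
EVAL_EPISODES = 100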
Example #3
def make_agents(env):
    human = HumanAgent()
    # load_path = "zoo/ppo_reward_bugfix4/latest/best_model"
    # load_path = "zoo/ppo_logging/2020-12-27T15:51:49/final_model"
    # load_path = "zoo/ppo_kl/2020-12-27T16:28:42/final_model"
    # model = PPO.load(load_path, env)
    random1 = RandomAgent(env)
    # random2 = RandomAgent(env)

    return [human, random1]  # model]  # random1, random2]
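HumanAgent and RandomAgent belong to the Love Letter project and are not defined in these snippets. Purely for orientation, a hypothetical minimal RandomAgent is sketched here; the predict-style interface and the use of env.valid_action_mask() are assumptions, not the project's actual API.

import numpy as np


class RandomAgent:
    """Hypothetical baseline agent that samples uniformly from the valid actions."""

    def __init__(self, env, seed=None):
        self.env = env
        self.rng = np.random.default_rng(seed)

    def predict(self, observation, **kwargs):
        # Assumed interface mirroring Stable Baselines' model.predict().
        mask = self.env.valid_action_mask()
        action = int(self.rng.choice(np.flatnonzero(mask)))
        return action, None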
Example #4
def make_agents(env):
    # load_path = "zoo/ppo_masking/final_model"
    # load_path = "zoo/ppo_logging/2020-12-27T15:51:49/final_model"
    load_path = "zoo/ppo_kl/2020-12-27T16:28:42/final_model"
    model = PPO.load(load_path, env)
    random1 = RandomAgent(env)
    # random2 = RandomAgent(env)
    # random3 = RandomAgent(env)

    return [model, random1]  #, random2, random3]
def train(output_folder, load_path):
    base_output = Path(output_folder)
    full_output = base_output / datetime.datetime.now().isoformat(
        timespec="seconds")
    # latest = base_output / "latest"
    # latest.symlink_to(full_output)

    logger.configure(folder=str(full_output))

    env = LoveLetterMultiAgentEnv(num_players=4,
                                  reward_fn=Rewards.fast_elimination_reward)
    env.seed(SEED)

    # Use the MuJoCo hyperparameters (but with timesteps_per_actorbatch doubled to cover more steps).
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
    #             optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        # def test_fn(env):
        #     return env.valid_action_mask()
        #
        model = PPO(MlpPolicy, env, verbose=1,
                    ent_coef=0.05)  #, action_mask_fn=test_fn)

    other_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    # other_agents = [
    #     PPO.load("zoo/ppo_logging/2020-12-27T15:51:49/final_model", env),
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    # ]
    agents = [model, *other_agents]
    env.set_agents(agents)

    eval_callback = EvalCallback(
        env,
        best_model_save_path=str(full_output),
        log_path=str(full_output),
        eval_freq=EVAL_FREQ,
        n_eval_episodes=EVAL_EPISODES,
    )

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(str(full_output / "final_model"))

    env.close()
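None of the examples shows how train is invoked. A hypothetical command-line entry point, assuming the function lives in a module that is run as a script, might look like this (the argument names are made up for illustration):

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train a PPO agent for Love Letter.")
    parser.add_argument("--output-folder", default="zoo/ppo_example")
    parser.add_argument("--load-path", default=None,
                        help="Optional checkpoint to resume training from.")
    args = parser.parse_args()

    train(args.output_folder, args.load_path)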