def make_agents(env):
    # load_path = "zoo/ppo_masking/final_model"
    # load_path = "zoo/ppo_logging/2020-12-27T15:51:49/final_model"
    load_path = "zoo/ppo_kl/2020-12-27T16:28:42/final_model"
    model = PPO.load(load_path, env)
    random1 = RandomAgent(env)
    # random2 = RandomAgent(env)
    # random3 = RandomAgent(env)
    return [model, random1]  # , random2, random3]
def _load(self, env_cls, env_kwargs, agent_kwargs):
    with open(self.kwargs_path, 'rb') as kwargs_file:
        kwargs = pickle.load(kwargs_file)
    kwargs['env'].update(env_kwargs)
    kwargs['agent'].update(agent_kwargs)
    env = self._build_env(env_cls, kwargs['env'], kwargs['agent']['n_steps'])
    agent = PPO.load(path=self.agent_path,
                     env=env,
                     tensorboard_log=self.tensorboard_path,
                     **kwargs['agent'])
    return agent, env
def run_experiment(args):
    # Again, could have used the SB3 tools here, buuuut...
    vecEnv = []
    for i in range(args.n_envs):
        # Bit of trickery here to avoid all closures referencing the same "i"
        vecEnv.append((lambda idx: lambda: create_env(args, idx))(i))
    vecEnv = DummyVecEnv(vecEnv)

    constraint = AVAILABLE_CONSTRAINTS[args.constraint]
    agent = None
    if constraint == "ClipPPO":
        # Create a vanilla PPO
        agent = PPO("MlpPolicy", vecEnv, verbose=2, device="cpu",
                    n_steps=args.n_steps, clip_range=args.clip_range,
                    learning_rate=args.learning_rate, gamma=args.gamma,
                    ent_coef=args.ent_coef, gae_lambda=1.0,
                    n_epochs=args.n_epochs)
    else:
        constraint = constraint(args)
        agent = SmallStepPPO("MlpPolicy", vecEnv, verbose=2, device="cpu",
                             n_steps=args.n_steps, step_constraint=constraint,
                             learning_rate=args.learning_rate,
                             step_constraint_max_updates=args.max_updates,
                             gamma=args.gamma, ent_coef=args.ent_coef,
                             gae_lambda=1.0)

    output_log_file = None
    if args.output_log:
        output_log_file = open(args.output_log, "w")
        logger.Logger.CURRENT = logger.Logger(
            folder=None,
            output_formats=[logger.HumanOutputFormat(output_log_file)])

    agent.learn(total_timesteps=args.total_timesteps)

    if args.output is not None:
        agent.save(os.path.join(args.output, AGENT_FILE))

    vecEnv.close()
    if output_log_file:
        output_log_file.close()
def make_agents(env):
    # new_load_path = "zoo/ppo_recreate_best/latest/best_model"
    # new_load_path = "zoo/ppo_reward_bugfix2/latest/best_model"
    new_load_path = "zoo/ppo_reward_bugfix4/latest/best_model"
    # new_load_path = "zoo/ppo_masking_fast_elimination3/best_model"
    new_model = PPO.load(new_load_path, env)
    # new_load_path = "zoo/ppo_headsup/latest/best_model"
    # new_model2 = PPO.load(new_load_path, env)

    # old_load_path = "ppo2/final_model"
    # old_load_path = "zoo/ppo_masking_fast_elimination3/best_model"
    old_load_path = "zoo/ppo_reward_bugfix2/latest/best_model"
    old_load_path2 = "zoo/ppo_recreate_best/latest/best_model"
    # old_load_path = "zoo/ppo_headsup/latest/best_model"
    old_model = PPO.load(old_load_path, env)
    old_model2 = PPO.load(old_load_path2, env)

    # random1 = RandomAgent(env)
    # random2 = RandomAgent(env)
    return [old_model, old_model2, new_model]
def train(output_folder, load_path):
    base_output = Path(output_folder)
    full_output = base_output / datetime.datetime.now().isoformat(timespec="seconds")
    # latest = base_output / "latest"
    # latest.symlink_to(full_output)
    logger.configure(folder=str(full_output))

    env = LoveLetterMultiAgentEnv(num_players=4,
                                  reward_fn=Rewards.fast_elimination_reward)
    env.seed(SEED)

    # Take MuJoCo hyperparams (but double timesteps_per_actorbatch to cover more steps).
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
    #             optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        # def test_fn(env):
        #     return env.valid_action_mask()
        model = PPO(MlpPolicy, env, verbose=1, ent_coef=0.05)  # , action_mask_fn=test_fn)

    other_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    # other_agents = [
    #     PPO.load("zoo/ppo_logging/2020-12-27T15:51:49/final_model", env),
    # ]
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    #     PPO.load("zoo/ppo_reward_bugfix2/latest/best_model", env),
    # ]
    agents = [model, *other_agents]
    env.set_agents(agents)

    eval_callback = EvalCallback(
        env,
        best_model_save_path=str(full_output),
        log_path=str(full_output),
        eval_freq=EVAL_FREQ,
        n_eval_episodes=EVAL_EPISODES,
    )
    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)
    model.save(str(full_output / "final_model"))
    env.close()
def train(load_path):
    env = LoveLetterMultiAgentEnv(num_players=4,
                                  reward_fn=Rewards.game_completion_reward)
    env.seed(SEED)

    # Take MuJoCo hyperparams (but double timesteps_per_actorbatch to cover more steps).
    # model = PPO(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
    #             optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    if load_path:
        model = PPO.load(load_path, env)
    else:
        model = PPO(MlpPolicy, env)

    random_agents = [RandomAgent(env, SEED + i) for i in range(3)]
    agents = [model, *random_agents]
    env.set_agents(agents)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)
    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)
    model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point
    env.close()
scenario = os.path.join(code_location, "scenarios", game, "custom_rewards.json")
# state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game, "MarioCircuit1.GP.100cc.1P.DK.Start.state")
state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game,
                     "MarioCircuit1.GP.50cc.1P.Luigi.Start.state")
# state = os.path.join(retro.data.DATA_PATH, "data", "contrib", game, "DonutPlains1.GP.50cc.1P.Koopa.Start.state")
model_name = os.path.join(
    code_location, "models",
    "ppo_SuperMarioKart-Snes_e304080a-dd37-4efa-9140-aecc0079e710_final")

env = get_env(game, state, scenario)
# Record a movie of the output
# moviepath = "testmodel.mp4"
# env = MovieRecordWrapper(env, savedir=moviepath)
env = DummyVecEnv([lambda: env])

model = PPO.load(model_name)
model.set_env(env)

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print("Step reward: {}".format(rewards))
    # cumulative_reward = np.sum(rewards) + cumulative_reward
    env.render()
    if np.any(dones):
        # print("Cumulative reward: {}".format(cumulative_reward))
        time.sleep(1)
        break
print("testing SB3 TD3") test_trainer( 100, 100, SB3OffPolicyTrainer( continious_env_fn, TD3("MlpPolicy", continious_env_fn(), device="cpu"))) print("testing SB3 SAC") test_trainer( 100, 100, SB3OffPolicyTrainer( continious_env_fn, SAC("MlpPolicy", continious_env_fn(), device="cpu"))) print("testing SB3 DDPG") test_trainer( 100, 100, SB3OffPolicyTrainer( continious_env_fn, DDPG("MlpPolicy", continious_env_fn(), device="cpu"))) print("testing SB3 PPO") test_trainer( 100, 100, SB3OnPolicyTrainer( discrete_env_fn, PPO("MlpPolicy", discrete_env_fn(), device="cpu", n_steps=10))) print("testing SB3 PPO with continuous env") test_trainer( 100, 100, SB3OnPolicyTrainer( continious_env_fn, PPO("MlpPolicy", continious_env_fn(), device="cpu", n_steps=10)))
def _create(self, env_cls, env_kwargs, agent_kwargs):
    env = self._build_env(env_cls, env_kwargs, agent_kwargs['n_steps'])
    agent = PPO(env=env,
                tensorboard_log=self.tensorboard_path,
                **agent_kwargs)
    return agent, env
project_name = "miki.pacman/MK2"
google_drive_checkpoints_path = "MK2/saves"
exp_id = "MK-19"
params = get_exp_params(exp_id, project_name)
params.update({"state_versions": [16, 17, 18, 19]})

if __name__ == '__main__':
    with tempfile.TemporaryDirectory(dir="/tmp") as temp:
        checkpointer = GoogleDriveCheckpointer(
            project_experiments_path=google_drive_checkpoints_path,
            exp_id=exp_id)
        checkpoints_list = checkpointer.get_list_of_checkpoints()
        checkpoint = checkpoints_list[len(checkpoints_list) // 2]
        checkpointer.download_checkpoints([checkpoint], temp)

        env1, env2, env3 = params["env_function"](params, train=False)
        model = PPO.load(os.path.join(temp, checkpoint))

        p1 = {"policy": model, "frameskip": params["frameskip"], "env": env2}
        p2 = {"policy": "human", "frameskip": 60, "env": env3}

        for i in range(4):
            PygameInteractiveEnvRecorder(
                fps=60,
                env=env1,
                p1=p1,
                p2=p2,
                render_n_frames_after_done=250,
                record_output_path=f"/tmp/{exp_id}_video_{i}.mp4").run()
    batch_size=128,
    learning_rate=0.001,
    n_epochs=8,
    gamma=0.99,
    ent_coef=0.01,
    vf_coef=0.5,
    gae_lambda=0.95,
    clip_range=0.2,
    clip_range_vf=float('inf'),
    max_grad_norm=0.5  # float('inf')
)

# Create the learning agent according to the chosen algorithm
agent = PPO(MlpPolicy, env, **config,
            tensorboard_log=tensorboard_data_path,
            verbose=True)
# Load an agent if desired
# agent = PPO2.load("cartpole_ppo2_baseline.pkl")

# Run the learning process
agent.learn(total_timesteps=400000, log_interval=5, reset_num_timesteps=False)

# Save the agent if desired
# agent.save("cartpole_ppo2_baseline.pkl")

### Enjoy a trained agent

# duration of the simulations in seconds
# env = DummyVecEnv([lambda: get_env(game, state, scenario)])
# env = VecNormalize(env, norm_obs=True, norm_reward=False)
env = VecCheckNan(env, raise_exception=True)

# Create a callback to save every n timesteps
prefix = "ppo_" + game + "_" + experiment_id
checkpoint_callback = CheckpointCallback(
    save_freq=100000,
    save_path="C:\\Projects\\OpenAI Games\\retro-ai-hacking\\models",
    name_prefix=prefix)

savefile_name = prefix + "_final"
savefile_name = os.path.join(
    "C:\\Projects\\OpenAI Games\\retro-ai-hacking\\models", savefile_name)

model = PPO(
    CnnPolicy,
    env,
    verbose=1,
    n_steps=128,
    n_epochs=3,
    learning_rate=2.5e-4,
    batch_size=32,
    ent_coef=0.01,
    vf_coef=1.0,
    tensorboard_log="C:\\Projects\\OpenAI Games\\retro-ai-hacking\\tb_logs"
)
model.learn(total_timesteps=1000000, callback=checkpoint_callback)
model.save(savefile_name)
agent_cfg['clip_range_vf'] = float('inf')
agent_cfg['max_grad_norm'] = float('inf')
agent_cfg['seed'] = SEED

# ====================== Run the optimization ======================

# Create a multiprocess environment
env_creator = lambda: gym.make(GYM_ENV_NAME, **GYM_ENV_KWARGS)
train_env = SubprocVecEnv([env_creator for _ in range(int(N_THREADS // 2))],
                          start_method='fork')
test_env = DummyVecEnv([env_creator])

# Create the learning agent according to the chosen algorithm
train_agent = PPO(MlpPolicy, train_env, **agent_cfg,
                  tensorboard_log=log_path, verbose=True)
train_agent.eval_env = test_env

# Run the learning process
checkpoint_path = train(train_agent, max_timesteps=100000)

# ===================== Enjoy the trained agent ======================

# Create testing agent
test_agent = train_agent.load(checkpoint_path)
test_agent.eval_env = test_env

# Run the testing process
test(test_agent, max_episodes=1)