import os

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize

# env_name, StartFresh, DoTraining, DoVideo, make_env, record_video and
# SaveOnBestTrainingRewardCallback are assumed to be defined elsewhere in the script.


def main():
    log_dir = 'log'
    # Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=0.001,
                    n_steps=500,
                    # batch_size=0,
                    # n_epochs=1,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()
        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
        model.learn(total_timesteps=25000000,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False)  # , callback=callback (or callback=TensorboardCallback())
        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000, prefix='ppo_' + env_name + videoName)
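# The script above relies on make_env and record_video helpers that are not
# shown (SaveOnBestTrainingRewardCallback follows the callback example in the
# Stable-Baselines3 docs). A minimal sketch of the two helpers, following the
# usual SB3 pattern -- names and defaults here are assumptions:
import gym

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder


def make_env(env_id, rank, seed=0, log_dir=None):
    """Return a thunk that creates a monitored, seeded copy of the env."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)  # old gym seeding API, matching the snippet above
        if log_dir is not None:
            env = Monitor(env, os.path.join(log_dir, str(rank)))
        return env
    return _init


def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
    """Roll the model out for video_length steps and save the recording."""
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    eval_env = VecVideoRecorder(eval_env,
                                video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)
    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)
    eval_env.close()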
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv


# NOTE: this uses a customized PPO: marginal_reg_coef, policy.set_partners,
# policy.num_partners and _init_rollout_buffer() are extensions of that fork,
# not part of stock Stable-Baselines3.
def load_model(model_path, policy_class, policy_kwargs, env, hp, partners, testing, try_load=True):
    load_successful = False
    if try_load:
        try:
            model = PPO.load(model_path)  # , policy_kwargs=policy_kwargs)
            load_successful = True
            print("Model loaded successfully")
        except Exception as e:
            print("Could not load model", e)
    if not load_successful:
        print("Create new model")
        n_steps, batch_size, n_epochs = hp['n_steps'], hp['batch_size'], hp['n_epochs']
        model = PPO(policy_class,
                    env,
                    policy_kwargs=policy_kwargs,
                    n_steps=n_steps,
                    batch_size=batch_size,
                    n_epochs=n_epochs,
                    verbose=0,
                    ent_coef=0.00,
                    marginal_reg_coef=hp['mreg'])
        # Print the trainable parameters of the freshly created policy
        for name, param in model.policy.named_parameters():
            if param.requires_grad:
                print(name, param.data.size())
    vec_env = DummyVecEnv([lambda: env])
    model.set_env(vec_env)
    model.policy.set_partners(partners)
    if testing:
        model.policy.num_partners = 1  # only test 1 partner
        model.marginal_reg_coef = 0
        model.n_epochs = hp['n_epochs_testing']
        model.n_steps = hp['n_steps_testing']
        model._init_rollout_buffer()  # rebuild the rollout buffer for the new n_steps
    return model
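# A hypothetical call, showing only the hp keys that load_model actually reads
# (all values and names here are placeholders, not tuned settings):
hp = {
    'n_steps': 2048,
    'batch_size': 64,
    'n_epochs': 10,
    'mreg': 0.5,            # marginal regularization coefficient
    'n_epochs_testing': 1,
    'n_steps_testing': 200,
}
model = load_model('models/ppo_partners',    # hypothetical path
                   'MlpPolicy',
                   dict(net_arch=[64, 64]),  # hypothetical architecture
                   env, hp, partners, testing=False)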
import os

import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecNormalize

# env_id, tb_log, _log_dir, _stats_path and make_env are assumed to be defined
# elsewhere in the script.


def main():
    # nn = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.Tanh(),
    #                          torch.nn.Linear(64, 2))
    os.makedirs(_log_dir, exist_ok=True)
    DoTraining = True
    StartFresh = True
    num_cpu = 8
    if DoTraining:
        # This doesn't work, but it might have something to do with how the environment is written
        # num_cpu = 1
        # env = make_vec_env(env_id, n_envs=num_cpu, monitor_dir=_log_dir)  # make_vec_env contains Monitor

        # Create the callback: check every 1000 steps
        # callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=_log_dir)
        if StartFresh:
            env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
            env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
            env.reset()
            policy_kwargs = {
                'net_arch': [128, 128, 128],
            }
            model = PPO('MlpPolicy',
                        env,
                        policy_kwargs=policy_kwargs,
                        verbose=2,
                        tensorboard_log=tb_log)
        else:
            env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
            env = VecNormalize.load(_stats_path, env)
            env.reset()
            model = PPO.load(r'log\monitor_simpledriving_vecNormalized_128x3_2\PPO_4243456.mdl',
                             tensorboard_log=tb_log)
            model.set_env(env)

        # NOTE: this evaluates on an unnormalized env even though training uses
        # VecNormalize, which skews the reported rewards (see the sketch below).
        eval_env = gym.make(env_id)
        # print('!!!!Checking Environment!!!!')
        # print(check_env(eval_env))
        mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
        print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')

        for _ in range(50):
            model.learn(total_timesteps=100000,
                        tb_log_name=env_id,
                        reset_num_timesteps=False)  # , callback=callback
            mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
            print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
            model.save(_log_dir + 'PPO_{}'.format(model.num_timesteps) + '.mdl')
            env.save(_log_dir + 'vec_normalize_{}'.format(model.num_timesteps) + '.pkl')

    if not DoTraining:
        # eval_env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        # eval_env = VecVideoRecorder(eval_env, video_folder='videos/',
        #                             record_video_trigger=lambda step: step == 0,
        #                             video_length=500, name_prefix='test')
        # eval_env.training = False
        # eval_env.norm_reward = False
        # eval_env.reset()
        eval_env = DummyVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
        # eval_env = gym.make(env_id)
        eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        model = PPO.load(r'log\monitor_simpledriving_vecNormalized_128x3\PPO_5734400.mdl',
                         tensorboard_log=tb_log)
        model.set_env(eval_env)

        # record_video(env_id, model, video_length=500, prefix='ppo_' + env_id)
        # Start the video at step=0 and record 500 steps
        # eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
        #                             record_video_trigger=lambda step: step == 0,
        #                             video_length=500, name_prefix='')
        obs = eval_env.reset()
        # for i in range(500):
        #     action, _ = model.predict(obs)
        #     obs, _, _, _ = eval_env.step(action)
        # eval_env.close()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _, done, _ = eval_env.step(action)
            # eval_env.render()
            if done.any():
                # obs = eval_env.reset()
                # time.sleep(1/30)
                eval_env.close()
                break
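# Sketch referenced above: an evaluation env that reuses the training
# normalization statistics instead of evaluating on a raw gym.make env.
# The stats_path convention (vec_normalize_*.pkl) follows the saves above.
def make_eval_env(env_id, stats_path):
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    eval_env = VecNormalize.load(stats_path, eval_env)
    eval_env.training = False     # freeze the running obs/reward statistics
    eval_env.norm_reward = False  # report raw, unnormalized rewards
    return eval_env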
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# The head of this snippet is truncated in the source: its first lines are the
# tail of a PPO constructor call. The `programing_type == 0` guard, the env
# construction in that branch (mirrored from the later branches) and
# 'MlpPolicy' are assumptions used to make the fragment readable.
# programing_type and environment_name are assumed to be defined elsewhere.

# Train from scratch
if programing_type == 0:
    myenv = gym.make(environment_name)
    env = DummyVecEnv([lambda: myenv])
    model = PPO("MlpPolicy",
                env,
                learning_rate=0.0001,
                gamma=0.7,
                batch_size=1024,
                verbose=1,
                tensorboard_log="./log/ppo_crossy_road_tensorboard/")
    model.learn(total_timesteps=30000)
    model.save("../model/ppo")
    env.close()
# Continue to train
elif programing_type == 1:
    myenv = gym.make(environment_name)
    env = DummyVecEnv([lambda: myenv])
    model = PPO.load('../model/ppo', env=env)
    model.set_env(env)  # redundant: env was already passed to PPO.load
    model.learn(total_timesteps=20000, callback=None, reset_num_timesteps=False)
    model.save("../model/ppo")
    env.close()
# Test the agent
else:
    myenv = gym.make(environment_name)
    env = DummyVecEnv([lambda: myenv])
    model = PPO.load('../model/ppo', env=env)
    result = {}
    mean_reward = []
    scores = []
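    # The test branch above is cut off after initializing result, mean_reward
    # and scores. A minimal sketch of the evaluation loop those variables
    # suggest (the episode count and result bookkeeping are assumptions):
    import numpy as np

    episodes = 10
    for episode in range(episodes):
        obs = env.reset()
        score = 0.0
        done = [False]
        while not done[0]:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)  # DummyVecEnv returns batched arrays
            score += reward[0]
        scores.append(score)
    mean_reward.append(np.mean(scores))
    result['mean_reward'] = np.mean(scores)
    print(f"mean reward over {episodes} episodes: {result['mean_reward']:.2f}")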
model = PPO("MlpPolicy", meta_env, policy_kwargs=policy_kwargs, n_steps=n_steps, batch_size=batch_size, n_epochs=n_epochs, verbose=0) #model.learn(total_timesteps=100000, callback=eval_callback, meta_learn=False) # no meta learning model.learn(total_timesteps=100000, callback=eval_callback, meta_learn=True) # meta learning opponent_policies = [ np.array([0,1,2,0,1]), np.array([1,2,2,1,0]), np.array([2,1,0,0,0]), #np.array([2,2,1,1,0]), #np.array([0,1,2,2,2]), ] eval_callback_test = EvalCallback(test_meta_env, eval_freq=500, deterministic=True, render=False) for opponent_policy in opponent_policies: meta_env.fixed_opponent_policy = opponent_policy test_meta_env.fixed_opponent_policy = opponent_policy model.set_env(meta_env) model.learn(total_timesteps=2000, callback=eval_callback_test, meta_learn=False) # rewards_fixed_rock = meta_env.run_sim(policies[0], 50, model, 0) # rewards_fixed_paper = meta_env.run_sim(policies[1], 50, model, 1) # rewards_fixed_scissors = meta_env.run_sim(policies[2], 50, model, 2) # rewards_copycat = meta_env.run_sim(policies[3], 50, model, 3) # rewards_random = meta_env.run_sim(policies[4], 50, model, 4) # rewards_aggressive = meta_env.run_sim(policies[5], 50, model, 5) # rewards_passive = meta_env.run_sim(policies[6], 50, model, 6) # avg_rewards = [np.mean(rewards_fixed_rock), np.mean(rewards_fixed_paper), np.mean(rewards_fixed_scissors), np.mean(rewards_copycat), np.mean(rewards_random), np.mean(rewards_aggressive), np.mean(rewards_passive)] # x = [policy.name for policy in policies] # x_pos = [i for i, _ in enumerate(x)] # plt.bar(x_pos, avg_rewards, color='red')