Code example #1
def main():
    # Paths and run configuration. Assumes env_name, StartFresh, DoTraining,
    # DoVideo, make_env and SaveOnBestTrainingRewardCallback are defined elsewhere.
    log_dir = 'log'
    # Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
    num_cpu = 16
    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName

    if StartFresh:
        # Build a fresh vectorized, normalized environment and a new PPO model.
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch': [128, 64, 32],
        }
        model = PPO('MlpPolicy',
                    env,
                    learning_rate=0.001,
                    n_steps=500,
                    # batch_size=0,
                    # n_epochs=1,
                    gamma=0.9,
                    policy_kwargs=policy_kwargs,
                    verbose=1,
                    tensorboard_log=tb_log,
                    device="auto")
    else:
        # Resume: restore the saved VecNormalize statistics and model, then reattach the env.
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize.load(env_stats_path, env)
        env.reset()

        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
        model.learn(total_timesteps=25000000, tb_log_name=tb_log_name,
                    reset_num_timesteps=False)  # , callback=callback

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        # mean_reward, std_reward = evaluate_policy(model, eval_env)
        # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
        record_video(env_name, model, video_length=2000, prefix='ppo_' + env_name + videoName)
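
Code example #1 (and example #3 below) call a make_env factory that none of the snippets define. The following is a minimal sketch of that helper under the standard Stable-Baselines3 pattern of wrapping each worker env in a Monitor; the seed handling and the per-rank monitor file name are assumptions, not taken from the original code.

import os
import gym
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import set_random_seed

def make_env(env_id, rank, log_dir=None, seed=0):
    # Return a thunk that builds one environment instance for SubprocVecEnv/DummyVecEnv.
    # Sketch only: the exact wrapper stack used by the original code is unknown.
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)  # assumes a classic gym API that exposes env.seed()
        if log_dir is not None:
            env = Monitor(env, os.path.join(log_dir, str(rank)))  # per-rank monitor file (assumed naming)
        return env
    set_random_seed(seed)
    return _init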
Code example #2
def load_model(model_path,
               policy_class,
               policy_kwargs,
               env,
               hp,
               partners,
               testing,
               try_load=True):
    """Load a saved PPO model from model_path if possible; otherwise build a new one from hp."""
    load_successful = False

    if try_load:
        try:
            model = PPO.load(model_path)  #, policy_kwargs=policy_kwargs)
            load_successful = True
            print("Model loaded successfully")
        except Exception as e:
            print("Could not load model", e)

    if not load_successful:
        print("Create new model")

        n_steps, batch_size, n_epochs = hp['n_steps'], hp['batch_size'], hp['n_epochs']
        model = PPO(policy_class,
                    env,
                    policy_kwargs=policy_kwargs,
                    n_steps=n_steps,
                    batch_size=batch_size,
                    n_epochs=n_epochs,
                    verbose=0,
                    ent_coef=0.00,
                    marginal_reg_coef=hp['mreg'])

        for name, param in model.policy.named_parameters():
            if param.requires_grad:
                print(name, param.data.size())

    vec_env = DummyVecEnv([lambda: env])
    model.set_env(vec_env)

    model.policy.set_partners(partners)
    if testing:
        model.policy.num_partners = 1  # only test 1 partner
        model.marginal_reg_coef = 0
        model.n_epochs = hp['n_epochs_testing']
        model.n_steps = hp['n_steps_testing']
        model._init_rollout_buffer()

    return model
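
A hedged usage sketch of load_model: the hyper-parameter keys match the ones read inside the function, but the concrete values, the 'MlpPolicy' choice and the path are illustrative placeholders. Note that marginal_reg_coef, set_partners and _init_rollout_buffer come from a custom PPO variant rather than stock Stable-Baselines3.

# Hypothetical call site; every value below is a placeholder, not the original configuration.
hp = {
    'n_steps': 2048,
    'batch_size': 64,
    'n_epochs': 10,
    'mreg': 0.0,
    'n_epochs_testing': 1,
    'n_steps_testing': 512,
}
model = load_model(model_path='models/ppo_partner.zip',  # hypothetical path
                   policy_class='MlpPolicy',             # assumed policy
                   policy_kwargs={'net_arch': [64, 64]},
                   env=env,                              # an existing gym env
                   hp=hp,
                   partners=partners,                    # partner policies defined elsewhere
                   testing=False)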
Code example #3
def main():
    # nn = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.Tanh(),
    #                          torch.nn.Linear(64, 2))

    os.makedirs(_log_dir, exist_ok=True)

    DoTraining = True
    StartFresh = True
    num_cpu = 8
    if DoTraining:

        # This doesn't work but it might have something to do with how the environment is written
        # num_cpu = 1
        # env = make_vec_env(env_id, n_envs=num_cpu, monitor_dir=_log_dir) # make_vec_env contains Monitor

        # Create the callback: check every 1000 steps
        # callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=_log_dir)

        if StartFresh:
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize(env,
                               norm_obs=True,
                               norm_reward=True,
                               clip_obs=10.)
            env.reset()
            policy_kwargs = {
                'net_arch': [128, 128, 128],
            }
            model = PPO('MlpPolicy',
                        env,
                        policy_kwargs=policy_kwargs,
                        verbose=2,
                        tensorboard_log=tb_log)
        else:
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize.load(_stats_path, env)
            env.reset()

            model = PPO.load(
                r'log\monitor_simpledriving_vecNormalized_128x3_2\PPO_4243456.mdl',
                tensorboard_log=tb_log)
            model.set_env(env)

        eval_env = gym.make(env_id)
        # print('!!!!Checking Environment!!!!')
        # print(check_env(eval_env))

        mean_reward, std_reward = evaluate_policy(model,
                                                  eval_env,
                                                  n_eval_episodes=10)
        print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
        for _ in range(50):
            model.learn(total_timesteps=100000,
                        tb_log_name=env_id,
                        reset_num_timesteps=False)  #, callback=callback
            mean_reward, std_reward = evaluate_policy(model,
                                                      eval_env,
                                                      n_eval_episodes=10)
            print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
            model.save(_log_dir + 'PPO_{}.mdl'.format(model.num_timesteps))
            env.save(_log_dir + 'vec_normalize_{}.pkl'.format(model.num_timesteps))

    if not DoTraining:
        # eval_env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        # eval_env = VecVideoRecorder(eval_env, video_folder='videos/',
        #                       record_video_trigger=lambda step: step == 0, video_length=500,
        #                       name_prefix='test')
        # eval_env.training = False
        # eval_env.norm_reward = False
        # eval_env.reset()

        eval_env = DummyVecEnv(
            [make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
        # eval_env = gym.make(env_id)
        eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl',
                                     eval_env)

        model = PPO.load(
            r'log\monitor_simpledriving_vecNormalized_128x3\PPO_5734400.mdl',
            tensorboard_log=tb_log)
        model.set_env(eval_env)
        # record_video(env_id, model, video_length=500, prefix='ppo_'+env_id)
        # Start the video at step=0 and record 500 steps
        # eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
        #                       record_video_trigger=lambda step: step == 0, video_length=500,
        #                       name_prefix='')

        obs = eval_env.reset()
        # for i in range(500):
        #     action, _ = model.predict(obs)
        #     obs, _, _, _ = eval_env.step(action)
        # eval_env.close()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _, done, _ = eval_env.step(action)
            # eval_env.render()
            if done.any():
                # obs = eval_env.reset()
                # time.sleep(1/30)
                eval_env.close()
                break
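
When a model is trained under VecNormalize, evaluation should freeze the loaded statistics so the running mean/std stop updating and rewards are reported unnormalized; the commented-out block above hints at this. A minimal sketch reusing the paths from example #3 (imports follow the same ones the example already relies on):

from stable_baselines3.common.evaluation import evaluate_policy

eval_env = DummyVecEnv([make_env(env_id, 0, log_dir=_log_dir)])
eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
eval_env.training = False      # stop updating the running obs/reward statistics
eval_env.norm_reward = False   # report raw (unnormalized) rewards

mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')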
Code example #4
# Train a new model (opening reconstructed from the continuation below; the original snippet starts mid-call)
if programing_type == 0:
    myenv = gym.make(environment_name)
    env = DummyVecEnv([lambda: myenv])
    model = PPO("MlpPolicy",  # policy class assumed; not shown in the snippet
                env,
                learning_rate=0.0001,
                gamma=0.7,
                batch_size=1024,
                verbose=1,
                tensorboard_log="./log/ppo_crossy_road_tensorboard/")
    model.learn(total_timesteps=30000)
    model.save("../model/ppo")
    env.close()

# Continue to train
elif programing_type == 1:
    myenv = gym.make(environment_name)
    env = DummyVecEnv([lambda: myenv])
    model = PPO.load('../model/ppo', env=env)
    model.set_env(env)
    model.learn(total_timesteps=20000,
                callback=None,
                reset_num_timesteps=False)
    model.save("../model/ppo")
    env.close()

# Test the agent
else:
    myenv = gym.make(environment_name)
    env = DummyVecEnv([lambda: myenv])
    model = PPO.load('../model/ppo', env=env)
    result = {}

    mean_reward = []
    scores = []
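
The test branch above is cut off after the bookkeeping variables. A hedged sketch of how such a loop is often finished with Stable-Baselines3's evaluate_policy; the episode count and the aggregation into result are illustrative, not the original code.

    # Illustrative continuation (not the original code): collect per-episode returns.
    from stable_baselines3.common.evaluation import evaluate_policy

    for episode in range(10):  # hypothetical number of test episodes
        ep_reward, _ = evaluate_policy(model, env, n_eval_episodes=1)
        mean_reward.append(ep_reward)
        scores.append(ep_reward)

    result['mean_reward'] = sum(mean_reward) / len(mean_reward)
    print(result)
    env.close()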
Code example #5
model = PPO("MlpPolicy", meta_env, policy_kwargs=policy_kwargs, n_steps=n_steps, batch_size=batch_size, n_epochs=n_epochs, verbose=0)
#model.learn(total_timesteps=100000, callback=eval_callback, meta_learn=False)        # no meta learning
model.learn(total_timesteps=100000, callback=eval_callback, meta_learn=True)     # meta learning

opponent_policies = [
    np.array([0,1,2,0,1]),
    np.array([1,2,2,1,0]),
    np.array([2,1,0,0,0]),
    #np.array([2,2,1,1,0]),
    #np.array([0,1,2,2,2]),
]
eval_callback_test = EvalCallback(test_meta_env, eval_freq=500, deterministic=True, render=False)
for opponent_policy in opponent_policies:
    meta_env.fixed_opponent_policy = opponent_policy
    test_meta_env.fixed_opponent_policy = opponent_policy
    model.set_env(meta_env)
    model.learn(total_timesteps=2000, callback=eval_callback_test, meta_learn=False)



# rewards_fixed_rock = meta_env.run_sim(policies[0], 50, model, 0)
# rewards_fixed_paper = meta_env.run_sim(policies[1], 50, model, 1)
# rewards_fixed_scissors = meta_env.run_sim(policies[2], 50, model, 2)
# rewards_copycat = meta_env.run_sim(policies[3], 50, model, 3)
# rewards_random = meta_env.run_sim(policies[4], 50, model, 4)
# rewards_aggressive = meta_env.run_sim(policies[5], 50, model, 5)
# rewards_passive = meta_env.run_sim(policies[6], 50, model, 6)
# avg_rewards = [np.mean(rewards_fixed_rock), np.mean(rewards_fixed_paper), np.mean(rewards_fixed_scissors), np.mean(rewards_copycat), np.mean(rewards_random), np.mean(rewards_aggressive), np.mean(rewards_passive)]
# x = [policy.name for policy in policies]
# x_pos = [i for i, _ in enumerate(x)]
# plt.bar(x_pos, avg_rewards, color='red')