# Imports assumed by this test (stable-baselines 2.x); make_env and
# check_vec_norm_equal are helper functions defined elsewhere in the test module.
import numpy as np

from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize


def test_vec_env(tmpdir):
    """Test that VecNormalize clips observations/rewards and round-trips through save/load."""
    clip_obs = 0.5
    clip_reward = 5.0

    orig_venv = DummyVecEnv([make_env])
    norm_venv = VecNormalize(orig_venv, norm_obs=True, norm_reward=True, clip_obs=clip_obs, clip_reward=clip_reward)
    _, done = norm_venv.reset(), [False]
    while not done[0]:
        actions = [norm_venv.action_space.sample()]
        obs, rew, done, _ = norm_venv.step(actions)
        assert np.max(np.abs(obs)) <= clip_obs
        assert np.max(np.abs(rew)) <= clip_reward

    path = str(tmpdir.join("vec_normalize"))
    norm_venv.save(path)
    deserialized = VecNormalize.load(path, venv=orig_venv)
    check_vec_norm_equal(norm_venv, deserialized)
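
# Sketch (not part of the original test): when a saved VecNormalize wrapper is
# reloaded for evaluation, its running statistics are usually frozen so they are
# not updated further -- the same pattern appears in RocketTrainer.evaluate() below:
#
#     venv = VecNormalize.load(path, DummyVecEnv([make_env]))
#     venv.training = False      # stop updating the running mean/std
#     venv.norm_reward = False   # report unnormalized rewards
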
eval_callback = EvalCallback(eval_env, best_model_save_path='Agent007',
                             log_path='./logs/', eval_freq=10000,
                             deterministic=True, render=False, n_eval_episodes=1)

model = PPO2(MlpPolicy, env, n_steps=1000, nminibatches=32, lam=0.98,
             gamma=0.999, learning_rate=1e-4, noptepochs=4, ent_coef=0.01,
             verbose=1, tensorboard_log="./rocket_tensorboard/",
             policy_kwargs=dict(layers=[400, 300]))


#model = PPO2.load("TestHover", env=env, tensorboard_log="./rocket_tensorboard/")
#while True:
#model.learning_rate = 3e-5
model.learn(total_timesteps=5000000, callback=eval_callback)
model.save("TestHover")
env.save("TestHover_env")
del model # remove to demonstrate saving and loading

model = PPO2.load("TestHover", env=eval_env)

# Enjoy trained agent
obs = eval_env.reset()
data = []
time = []
actions = []
alt_reward = []
mix_reward = []
temp_reward = []
valveChange = []
speedPunishes = []
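
# (The listing breaks off here; the "enjoy trained agent" rollout that would fill
# these lists typically loops over model.predict(obs, deterministic=True) and
# eval_env.step(action), as in RocketTrainer.evaluate() further down.)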
Example 3
class RocketTrainer:
    def __init__(self, algorithm="SAC", load=True, agent_name="Agent001"):
        self.agent_name = agent_name

        #self.env = LearningRocket(visualize=False)
        #self.env = NormalizeActionWrapper(self.env)

        #self.eval_env = LearningRocket(visualize=True)
        #self.eval_env = NormalizeActionWrapper(self.eval_env)

        #self.env = SubprocVecEnv([lambda: LearningRocket(visualize=False) for i in range(4)])
        self.env = make_vec_env(
            LearningRocket, n_envs=16
        )  #[lambda: LearningRocket(visualize=False) for i in range(16)]))
        #self.eval_env = VecNormalize(DummyVecEnv([lambda: LearningRocket(visualize=True) for i in range(1)]))
        self.eval_env = make_vec_env(lambda: LearningRocket(visualize=True),
                                     n_envs=1)
        #self.eval_env = VecNormalize(self.eval_env)
        self.eval_callback = EvalCallback(self.eval_env,
                                          best_model_save_path='Agent007',
                                          log_path='./logs/',
                                          eval_freq=10000,
                                          deterministic=True,
                                          render=False,
                                          n_eval_episodes=1)
        kai_policy = dict(act_fun=tf.nn.tanh, net_arch=[400, 300])
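        # Note: kai_policy is defined here but is not passed to any of the model
        # constructors below.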
        #check_env(self.env, warn=True)
        """
        if algorithm == "SAC":
            if load is True:
                self.model = SAC.load(agent_name, env=self.env, tensorboard_log="./rocket_tensorboard/")
                #self.model.ent_coef=0.2
            else:
                self.model = SAC('MlpPolicy', self.env, verbose=1, tensorboard_log="./rocket_tensorboard/",ent_coef=5)
            print("Trainer Set for SAC")
        """
        if algorithm == "TD3":
            n_actions = self.env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            if load is True:
                self.model = TD3.load(agent_name,
                                      env=self.env,
                                      tensorboard_log="./rocket_tensorboard/")
                #file = open('replay_buffer', 'rb')
                #self.model.replay_buffer = pickle.load(file)
                #file.close()
            else:
                self.model = TD3(MlpPolicy,
                                 self.env,
                                 action_noise=action_noise,
                                 batch_size=768,
                                 gamma=0.95,
                                 learning_rate=1e-4,
                                 learning_starts=20000,
                                 verbose=1,
                                 tensorboard_log="./rocket_tensorboard/",
                                 policy_kwargs=dict(layers=[400, 300]))
            print("Trainer Set for TD3")
        elif algorithm == "PPO2":
            if load is True:
                self.model = PPO2.load(agent_name,
                                       env=self.env,
                                       tensorboard_log="./rocket_tensorboard/")
                self.eval_env = VecNormalize.load(self.agent_name + "vEnv",
                                                  self.eval_env)
                #self.eval_env.clip_obs = 500
                #self.env = VecNormalize(self.env)
                self.env = VecNormalize.load(self.agent_name + "vEnv",
                                             self.env)
                #self.env.clip_obs = 500
                #self.env.norm_obs = False
                #self.eval_env.norm_obs = False
            else:
                self.model = PPO2(PPOMlpPolicy,
                                  self.env,
                                  n_steps=1024,
                                  nminibatches=32,
                                  lam=0.98,
                                  gamma=0.999,
                                  noptepochs=4,
                                  ent_coef=0.01,
                                  verbose=1,
                                  tensorboard_log="./rocket_tensorboard/",
                                  policy_kwargs=dict(layers=[400, 300]))
                self.eval_env = VecNormalize(self.eval_env)
                self.env = VecNormalize(self.env)
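                # Note: the PPO2 model above was created with the unwrapped self.env,
                # so this re-wrapping only changes the trainer's reference; the model
                # keeps stepping its original env unless set_env() is called on it.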
                #self.eval_env.clip_obs = 500
                #self.env.clip_obs = 500
                #self.env.norm_obs=False
                #self.eval_env.norm_obs=False

                print("Trainer set for PPO2. I am speed.")

    def train(self, visualize=False, lesson_length=100000, lessons=1):
        print("Today I'm teaching rocket science. How hard can it be?")
        #self.env.render(visualize)
        for i in range(lessons):
            print("*sigh* here we go again.")
            self.model.learn(
                total_timesteps=lesson_length,
                callback=self.eval_callback)  #,callback=self.eval_callback)
            self.model.save(self.agent_name)
            self.env.save(self.agent_name + "vEnv")
            #self.eval_env = VecNormalize.load(self.agent_name + "vEnv",self.eval_env)
            #a_file = open('replay_buffer', 'wb')
            #pickle.dump(self.model.replay_buffer, a_file)
            #a_file.close()
            print("{} Batches Done.".format(i + 1))
            # plt.close()
            mean_reward, std_reward = evaluate_policy(self.model,
                                                      self.eval_env,
                                                      n_eval_episodes=1)
            print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
        self.evaluate()

    def lecture(self):
        teacher = DummyExpert()
        #teacher = NormalizeActionWrapper(teacher)
        print("Let me show you how it's done.")
        generate_expert_traj(teacher.teach,
                             'dummy_expert_rocket',
                             self.env,
                             n_episodes=10)

    def evaluate(self):
        self.eval_env.training = False
        self.eval_env.norm_reward = False

        print("Watch this!")
        obs = self.eval_env.reset()
        #self.eval_env.render(True)

        mean_reward, std_reward = evaluate_policy(self.model,
                                                  self.eval_env,
                                                  n_eval_episodes=1)
        print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

        reward_list = []
        reward_sum: List[float] = []
        action_list = []
        for i in range(3):
            action_list.append([])
        Time = []
        steps = 0
        cumulativeReward = 0
        data = []
        for i in range(obs.size):
            data.append([])

        for j in range(1000):
            action, states = self.model.predict(obs, deterministic=True)
            obs, reward, done, info = self.eval_env.step(action)
            #re_obs = self.eval_env.rescale_observation((obs))
            #obs = self.eval_env.get_original_obs()
            #action = self.eval_env.rescale_action(action)
            reward_list.append(reward[0])
            cumulativeReward += reward[0]
            reward_sum.append(cumulativeReward)
            action_list[0].append(action[0])
            #for i in range(3):
            #    action_list[i].append(action[i])
            for i in range(obs.size):
                data[i].append(obs[0][i])
            steps += 1
            Time.append(steps)

        print("Another happy landing.")

        plt.figure(figsize=(11, 8))
        plt.subplot(3, 2, 3)
        plt.xlabel('Time(s)')
        plt.ylabel('Position (m)')
        plt.plot(Time, data[0], label='X Position')
        plt.plot(Time, data[1], label='Speed')
        #plt.plot(Time, data[2], label='Z Position')
        plt.legend(loc='best')
        plt.subplot(3, 2, 1)
        plt.xlabel('Time(s)')
        plt.ylabel('Reward')
        plt.plot(Time, reward_list, label='Reward')
        plt.plot(Time, reward_sum, label='Total Reward')
        plt.legend(loc='best')
        plt.subplot(3, 2, 2)
        plt.xlabel('Time(s)')
        plt.ylabel('Actions')
        plt.plot(Time, action_list[0], label='Thrust')
        #plt.plot(Time, action_list[1], label='GimbalX')
        #plt.plot(Time, action_list[2], label='GimbalY')
        plt.legend(loc='best')

        plt.subplot(3, 2, 4)
        plt.xlabel('Time(s)')
        plt.ylabel('Attitude')
        #plt.plot(Time, data[4], label='Roll')
        #plt.plot(Time, data[4], label='Pitch')
        #plt.plot(Time, data[5], label='Yaw')
        plt.legend(loc='best')

        plt.subplot(3, 2, 5)
        plt.xlabel('Time(s)')
        plt.ylabel('Velocity')
        #plt.plot(Time, data[2], label='vX')
        #plt.plot(Time, data[3], label='vY')
        #plt.plot(Time, data[5], label='vZ')
        plt.legend(loc='best')

        plt.subplot(3, 2, 6)
        plt.xlabel('Time(s)')
        plt.ylabel('RotVel')
        #plt.plot(Time, data[12], label='Fuel')
        #plt.plot(Time, data[6], label='Rot X')
        #plt.plot(Time, data[7], label='Rot Y')
        plt.legend(loc='best')

        plt.tight_layout()
        plt.show()
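
# Hypothetical usage of the RocketTrainer class above (not in the original
# listing); the argument values mirror the defaults shown in __init__, and
# train() already calls evaluate() after the last lesson:
#
#     trainer = RocketTrainer(algorithm="PPO2", load=False, agent_name="Agent007")
#     trainer.train(visualize=False, lesson_length=100000, lessons=5)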
Example 4
def train(method="SAC"):
    def get_multi_process_env(num_of_envs,
                              subprocess=True,
                              amplitude_scaling=False,
                              frameskip=5,
                              with_goals=False,
                              action_type=ActionType.POSITION,
                              difficulty=1,
                              initializer="random",
                              testing=False):

        if initializer == "random":
            initializer = RandomInitializer(difficulty=difficulty)
        elif initializer == "completely_random":
            initializer = CompletelyRandomInitializer()

        def _make_env(rank):
            def _init():
                obs_type = ObservationType.WITH_GOALS if with_goals else ObservationType.WITHOUT_GOALS
                out_env = CubeEnv(frameskip=frameskip,
                                  visualization=False,
                                  initializer=initializer,
                                  action_type=action_type,
                                  observation_type=obs_type,
                                  testing=testing)
                out_env.seed(seed=54321)
                out_env.action_space.seed(seed=54321)
                if not with_goals:
                    out_env = FlatObservationWrapper(
                        out_env, amplitude_scaling=amplitude_scaling)
                    out_env = TimeFeatureWrapper(out_env,
                                                 max_steps=math.ceil(
                                                     3750 / frameskip))
                else:
                    out_env = GoalObservationWrapper(
                        out_env, amplitude_scaling=amplitude_scaling)
                return out_env

            return _init

        if subprocess:
            return SubprocVecEnv(
                [_make_env(rank=i) for i in range(num_of_envs)])
        else:
            return DummyVecEnv([_make_env(rank=i) for i in range(num_of_envs)])

    date_time_str = datetime.now().strftime("%m_%d_%Y_%H_%M_%S_")
    print(method, date_time_str)
    set_global_seeds(0)

    if method == "HER":
        env = get_multi_process_env(1,
                                    subprocess=False,
                                    amplitude_scaling=True,
                                    frameskip=5,
                                    with_goals=True)
        env.set_attr("reward_range", 1000)
        policy_kwargs = dict(layers=[128, 128], act_fun=tf.tanh)

        n_actions = env.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.2) *
                                                    np.ones(n_actions))

        model = HER("MlpPolicy",
                    env,
                    SAC,
                    policy_kwargs=policy_kwargs,
                    n_sampled_goal=4,
                    goal_selection_strategy='future',
                    verbose=1,
                    tensorboard_log="tblogs",
                    batch_size=512,
                    buffer_size=100000,
                    gamma=0.98,
                    learning_starts=10000,
                    random_exploration=0.15)
        model.learn(int(2e6),
                    log_interval=10,
                    callback=CheckpointCallback(
                        save_freq=int(1e5),
                        save_path='models/checkpoint_saves',
                        name_prefix=method + '_' + date_time_str),
                    tb_log_name=method + '_' + date_time_str)
    if method == "SAC":
        env = VecNormalize(VecFrameStack(
            get_multi_process_env(1,
                                  subprocess=False,
                                  amplitude_scaling=False,
                                  frameskip=5,
                                  action_type=ActionType.POSITION,
                                  difficulty=1,
                                  initializer="completely_random"), 4),
                           norm_reward=False,
                           clip_reward=1500,
                           gamma=0.99)
        policy_kwargs = dict(layers=[256, 256])

        n_actions = env.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.2) *
                                                    np.ones(n_actions))
        model = SAC("LnMlpPolicy",
                    env,
                    policy_kwargs=policy_kwargs,
                    buffer_size=1000000,
                    batch_size=256,
                    gamma=0.99,
                    learning_rate=LinearSchedule(int(2e6),
                                                 5e-5,
                                                 initial_p=3e-4).value,
                    train_freq=64,
                    gradient_steps=4,
                    tau=0.005,
                    learning_starts=10000,
                    tensorboard_log="tblogs",
                    verbose=1,
                    use_emph_exp=True,
                    action_noise=action_noise)
        model.learn(int(2e6),
                    log_interval=10,
                    callback=CheckpointCallback(
                        save_freq=int(5e5),
                        save_path='models/checkpoint_saves',
                        name_prefix=method + '_' + date_time_str),
                    tb_log_name=method + '_' + date_time_str)
        env.save("normalized_env_" + date_time_str)
    if method == "CONTINUE_SAC":
        difficulty = 4
        env = VecNormalize.load(
            "models/normalized_env_frame_stacked_model",
            VecFrameStack(
                get_multi_process_env(1,
                                      subprocess=False,
                                      amplitude_scaling=True,
                                      frameskip=5,
                                      action_type=ActionType.POSITION,
                                      difficulty=difficulty,
                                      initializer="random",
                                      testing=True), 4))

        model = SAC.load(
            "models/checkpoint_saves/SAC_09_18_2020_19_07_42__1000000_steps.zip",
            env=env,
            tensorboard_log="tblogs",
        )
        model.learn(int(1e6),
                    log_interval=10,
                    callback=CheckpointCallback(
                        save_freq=int(5e5),
                        save_path='models/checkpoint_saves',
                        name_prefix=method + '_' + date_time_str),
                    tb_log_name=method + '_' + date_time_str)
        env.save("normalized_env_difficulty_" + str(difficulty))
        model.save(
            os.path.join('models', "model_difficulty_" + str(difficulty)))
    if method == "save_vec_env":
        env = VecNormalize(
            get_multi_process_env(1,
                                  subprocess=False,
                                  amplitude_scaling=True,
                                  frameskip=5,
                                  action_type=ActionType.POSITION,
                                  difficulty=1,
                                  initializer="completely_random"))

        model = SAC.load(
            "models/checkpoint_saves/SAC_09_18_2020_14_27_30__2000000_steps.zip",
            env=env)
        model.learn(int(1e5), log_interval=1)
        env.save("normalized_env_without_framestack")
        return
    else:
        return

    print("save model: ", os.path.join('models', method + '_' + date_time_str))
Example 5
def train(params, model=None, env=None): 
    print("Training Parameters: ", params)

    data_dir, tb_path = get_paths(params)
    os.makedirs(data_dir, exist_ok=True)
    # Save parameters immediately
    params.save(data_dir)

    rank = mpi_rank_or_zero()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create the environment if not given
    if env is None:  
        def make_env(i):
            env = get_env(params)
            print("ENV IN UTIL" ,env)
            # TODO: make monitor work for multiple agent.
            env = Monitor(env, data_dir + '/' + str(i), allow_early_resets=params['early_reset'])
            return env

        # if 'PPO' in params['alg']:
        #     env = DummyVecEnv([(lambda n: lambda: make_env(n))(i) for i in range(params['num_proc'])])
        # else:
        #     env = make_env(0)
        env = make_env(0)

        if params['normalize']:
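            # NOTE: VecNormalize expects a vectorized env; if get_env() returns a
            # plain gym.Env, it would first need a DummyVecEnv wrapper here (see the
            # commented-out DummyVecEnv lines above).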
            env = VecNormalize(env)
    # Set the seeds
    if params['seed']:
        seed = params['seed'] + 100000 * rank
        set_global_seeds(seed)
        params['alg_args']['seed'] = seed

    if 'noise' in params and params['noise']:
        from stable_baselines.ddpg import OrnsteinUhlenbeckActionNoise
        n_actions = env.action_space.shape[-1]
        params['alg_args']['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=float(params['noise']) * np.ones(n_actions))

    print("ENV", env, env.action_space)
    if model is None:
        alg = get_alg(params)
        policy = get_policy(params)
        model = alg(policy, env, verbose=1, tensorboard_log=tb_path,
                    policy_kwargs=params['policy_args'], **params['alg_args'])
    else:
        model.set_env(env)

    print("\n===============================\n")
    print("TENSORBOARD PATH:", tb_path)
    print("\n===============================\n")
    model.learn(total_timesteps=params['timesteps'],
                log_interval=params['log_interval'],
                callback=create_training_callback(data_dir, params, env,
                                                  freq=params['eval_freq'],
                                                  checkpoint_freq=params['checkpoint_freq']))
    
    print("Saving model to", data_dir)
    model.save(data_dir + '/final_model')

    if params['normalize']:
        env.save(data_dir + '/environment.pkl')
        
    env.close()
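
# Hedged sketch of the configuration fields this train() reads from `params`
# (illustrative values only; the real object also provides .save() and is
# consumed by get_paths/get_env/get_alg/get_policy, so a plain dict is just an
# approximation of its interface):
#
#     params = {
#         'alg': 'PPO2', 'policy_args': {}, 'alg_args': {},
#         'timesteps': int(1e6), 'log_interval': 10,
#         'eval_freq': 10000, 'checkpoint_freq': 50000,
#         'seed': 0, 'normalize': True, 'noise': 0.0, 'early_reset': True,
#     }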