Esempio n. 1
0
def test_long_episode(model_class):
    """
    Train HER briefly when one episode outlasts the first rollout.

    The episode length (10 steps) is chosen larger than the rollout length
    (DDPG) or the batch size (DQN, SAC, TD3), so the replay buffer is still
    empty after the first rollout because the episode has not ended yet.
    The test succeeds if ``learn`` completes without raising.
    """
    episode_length = 10
    continuous_actions = model_class in [DDPG, SAC, TD3]
    env = BitFlippingEnv(episode_length,
                         continuous=continuous_actions,
                         max_steps=episode_length)

    if model_class == DDPG:
        # Rollout shorter than one episode.
        extra_args = {'nb_rollout_steps': 9}
    elif model_class in [DQN, SAC, TD3]:
        # Batch smaller than one episode; learning starts immediately.
        extra_args = {'batch_size': 8, 'learning_starts': 0}
    else:
        extra_args = {}

    her_model = HER('MlpPolicy',
                    env,
                    model_class,
                    n_sampled_goal=4,
                    goal_selection_strategy='future',
                    verbose=0,
                    **extra_args)
    her_model.learn(100)
Esempio n. 2
0
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, her_config, total_time_steps,
                 validate_every_timesteps, task_name):
    """
    Train a HER+SAC policy on a CausalWorld task with goal interventions.

    The environment is wrapped for HER and then in a curriculum wrapper that
    applies ``GoalInterventionActorPolicy``. Checkpoints are written to
    ``log_relative_path`` every ``validate_every_timesteps / num_of_envs``
    callback steps (truncated to an int). ``her_config`` is forwarded as
    keyword arguments to the ``HER`` constructor. Returns ``None``.
    """
    reward_weights = np.array([100000, 0, 0, 0])
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=reward_weights,
                         fractional_reward_weight=0)
    world = CausalWorld(task=task,
                        skip_frame=skip_frame,
                        enable_visualization=False,
                        seed=seed_num,
                        max_episode_length=maximum_episode_length)
    goal_env = HERGoalEnvWrapper(world)
    curriculum_env = CurriculumWrapper(
        goal_env,
        intervention_actors=[GoalInterventionActorPolicy()],
        actives=[(0, 1000000000, 1, 0)])

    # Seed after the environment is built, before the model is created.
    set_global_seeds(seed_num)

    save_every = int(validate_every_timesteps / num_of_envs)
    checkpointer = CheckpointCallback(save_freq=save_every,
                                      save_path=log_relative_path,
                                      name_prefix='model')
    agent = HER(MlpPolicy,
                curriculum_env,
                SAC,
                verbose=1,
                policy_kwargs={'layers': [256, 256, 256]},
                **her_config,
                seed=seed_num)
    agent.learn(total_timesteps=total_time_steps,
                tb_log_name="her_sac",
                callback=checkpointer)
Esempio n. 3
0
def test_her(model_class, goal_selection_strategy, discrete_obs_space):
    """
    Smoke-test HER training on the bit-flipping environment.

    Continuous-action model classes (DDPG, SAC, TD3) get a continuous
    environment and take a random action 10% of the time; other model
    classes run with no extra arguments.
    """
    continuous_actions = model_class in [DDPG, SAC, TD3]
    env = BitFlippingEnv(N_BITS,
                         continuous=continuous_actions,
                         max_steps=N_BITS,
                         discrete_obs_space=discrete_obs_space)

    extra_args = {}
    if continuous_actions:
        extra_args['random_exploration'] = 0.1

    her_model = HER('MlpPolicy',
                    env,
                    model_class,
                    n_sampled_goal=4,
                    goal_selection_strategy=goal_selection_strategy,
                    verbose=0,
                    **extra_args)
    her_model.learn(1000)
def launchAgent(model_name: str):
    """
    Train an agent in 1000 learn/save cycles, reloading the checkpoint each cycle.

    :param model_name: kind of model to run. "HER", "DDPG" or "PPO2"; any
                        other value selects DQN. For now it is intended to be
                        set to "PPO2".
    :return: the model after the 1000 cycles have been run
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    # Exactly one branch may run. With independent ``if`` statements the
    # trailing ``else`` also fired for "HER" and "DDPG" and replaced the
    # freshly built model with a DQN one.
    if model_name == "HER":
        model_cls = HER
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    elif model_name == "DDPG":
        model_cls = DDPG
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    elif model_name == "PPO2":
        model_cls = PPO2
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        model_cls = DQN
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",  # policy
            env=env,  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    # Checkpoints are saved and reloaded under the same prefix, so the
    # fallback (DQN) case also works when model_name is not literally "DQN".
    checkpoint_prefix = "detailedmap_" + model_name + "_"

    for i in range(1000):
        if i != 0:
            # Release the previous in-memory model, then continue from the
            # checkpoint written at the end of the previous cycle.
            del model
            model = model_cls.load(checkpoint_prefix + str(i), env)

        model.learn(total_timesteps=12500)  # minimum value once FPS exceeds 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))

        model.save(checkpoint_prefix + str(i + 1))

    # The last trained model is kept alive so it can actually be returned.
    return model
Esempio n. 5
0
def main(env_id):
    """
    Record videos of a saved HER policy acting in a gym environment.

    The model is loaded from a fixed path and four videos of 100 steps each
    are written to a fixed output directory. The environment is reset
    whenever an episode ends and again between videos.

    :param env_id: environment id passed to ``gym.make``
    """
    save_file = "/home/shivanik/fetch_trial.zip"
    video_file = "/home/shivanik/fetch_her_videos/"
    video_len = 100
    num_files = 4

    env = gym.make(env_id)
    try:
        env_recorder = NonVecRecorder(env)
        model = HER.load(save_file, env=env)

        obs = env.reset()
        for i in range(num_files):
            fname = video_file + "%d.mp4" % i
            print(fname)
            env_recorder.init_video_writing(fname=fname)
            try:
                for _step in range(video_len):
                    action, _ = model.predict(obs)
                    obs, _reward, done, _ = env.step(action)
                    env_recorder.viz(True)
                    if done:
                        obs = env.reset()
            finally:
                # Finish the current video file even if a step fails.
                env_recorder.close()
            # Keep the fresh observation: the next video must not start by
            # predicting from the pre-reset observation.
            obs = env.reset()
    finally:
        env.close()
Esempio n. 6
0
def load_model(eval_env):
  """
  Load the saved HER model and report how often it reaches a terminal state.

  Runs 20 episodes of at most 400 steps each on ``eval_env``. An episode
  counts as a success when the environment reports ``done`` within the step
  limit. Prints the number of successes and the average number of steps of
  the successful episodes (``nan`` when there were none).

  :param eval_env: environment to evaluate on. An env must be passed to
    ``HER.load`` (or the environment wrapped with HERGoalEnvWrapper) for the
    predict method to be usable.
  """
  model = HER.load('./her_robot_env', env=eval_env)
  step_num_arr = []
  for _episode in range(20):
    obs = eval_env.reset()
    for number_steps in range(1, 401):
      action, _ = model.predict(obs)
      obs, _reward, done, _ = eval_env.step(action)
      if done:
        step_num_arr.append(number_steps)
        print("----------------It reached terminal state -------------------")
        break

  count = len(step_num_arr)
  # Averaging an empty array emits a RuntimeWarning; report nan directly.
  if step_num_arr:
    average_steps = np.average(np.array(step_num_arr))
  else:
    average_steps = float('nan')
  print(
    "Robot reached the goal position successfully ",
    count,
    " times and the Average step count was ",
    average_steps
  )
Esempio n. 7
0
def main(load_policy=True):
    """
    Train a TD3+HER agent on ``bioEnv`` and save the resulting policy.

    :param load_policy: when true, training continues from the checkpoint
        ``models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl``; otherwise
        a new model is created.

    Uses the module-level ``log_dir`` (Monitor output directory) and
    ``callback`` (passed to ``learn``). Hyperparameters not listed below are
    left at the library defaults.
    """
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    timesteps = 15000000

    env = Monitor(bioEnv(), log_dir, allow_early_resets=True)

    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    # Settings shared by a fresh model and a resumed one.
    her_kwargs = dict(n_sampled_goal=4,
                      goal_selection_strategy='future',
                      tensorboard_log="../pybullet_logs/bioEnv_TD3",
                      buffer_size=1000000,
                      batch_size=256,
                      random_exploration=0.3,
                      action_noise=action_noise)

    # Build only the model that is actually trained; previously a fresh model
    # was always constructed and then thrown away when resuming.
    if load_policy:
        model = HER.load(
            "models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl",
            env=env,
            **her_kwargs)
    else:
        model = HER(CustomTD3Policy, env, model_class, verbose=1,
                    **her_kwargs)

    model.learn(timesteps, log_interval=100, callback=callback)

    model.save("policy_TD3_Discr")
Esempio n. 8
0
def test_HER(env, out_dir, seed=None, **kwargs):
  """
  Load ``final_model.pkl`` from ``out_dir`` and evaluate it for 5000 steps.

  ``seed`` and any extra keyword arguments are accepted but not used.
  The value returned by ``evaluate`` is discarded; nothing is returned.
  """
  model_path = os.path.join(out_dir, 'final_model.pkl')
  trained_model = HER.load(model_path, env=env)

  # Evaluate the trained agent.
  evaluate(env, trained_model, num_steps=5000)
def main(load_policy=False):
    """
    Train a TD3+HER agent on the Panda pushing goal environment (phase 1).

    :param load_policy: when true, training continues from the saved policy
        ``PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl``; otherwise a new
        model is created.

    Uses the module-level ``log_dir`` (Monitor output directory) and
    ``callback`` (passed to ``learn``). Hyperparameters not listed below are
    left at the library defaults. The trained policy is saved to
    ``../policies/TD3_phase1_target_fixed``.
    """
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    timesteps = 8000000
    tensorboard_dir = (
        "../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed"
    )

    env = pandaPushGymGoalEnv(renders=False,
                              use_IK=0,
                              numControlledJoints=action_space,
                              obj_pose_rnd_std=0,
                              includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    # Settings shared by a fresh model and a resumed one.
    her_kwargs = dict(n_sampled_goal=4,
                      goal_selection_strategy='future',
                      tensorboard_log=tensorboard_dir,
                      buffer_size=1000000,
                      batch_size=256,
                      random_exploration=0.3,
                      action_noise=action_noise)

    # Build only the model that is actually trained; previously a fresh model
    # was always constructed and then thrown away when resuming.
    if load_policy:
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            **her_kwargs)
    else:
        model = HER(CustomTD3Policy, env, model_class, verbose=1,
                    **her_kwargs)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/TD3_phase1_target_fixed")
Esempio n. 10
0
    def train_curriculum_fetch(self, env_name="Merging-v0"):
        """
        Fine-tune pre-trained models on a one-lesson curriculum.

        For each ``bs`` key ('RL', 'LR') and each seed (101, 102) a
        pre-trained model is loaded, attached to a new environment whose
        homotopy class is set from the key ('RL' -> 'left', 'LR' -> 'right'),
        and handed to ``train``. Which pre-trained model is used depends on
        ``self.expt_type`` ("ours" or anything else).
        """
        self.curriculum = [env_name]
        pretrained_ours = {'RL': BR_BL0_BL1_BL5, 'LR': BL_BR0}
        pretrained_default = {'RL': BR_s, 'LR': BL_s}
        homotopy_for_bs = {'RL': 'left', 'LR': 'right'}

        for lesson_idx, lesson in enumerate(self.curriculum):
            for bs_key in pretrained_default:
                self.bs = bs_key
                for seed in (101, 102):
                    table = (pretrained_ours
                             if self.expt_type == "ours" else
                             pretrained_default)
                    model_info = table[self.bs]
                    model_dir = os.path.join(model_info[0], model_info[1],
                                             model_info[2])

                    # Start every run from the pre-trained weights.
                    if self.model_type == "PPO":
                        self.model = PPO2.load(model_dir)
                    elif self.model_type == "HER":
                        self.model = HER.load(model_dir)

                    print(f"\ntraining on {lesson}, bs {self.bs}, seed{seed}")
                    self.seed = seed
                    self.experiment_name = f"{self.bs}_{self.expt_type}_{seed}"
                    print("EXPT NAME: ", self.experiment_dir1,
                          self.experiment_name)
                    self.experiment_dir = os.path.join(self.experiment_dir1,
                                                       self.experiment_name)
                    self.create_eval_dir()

                    env = gym.make(lesson)
                    eval_env = gym.make(lesson)

                    homotopy = homotopy_for_bs.get(self.bs)
                    if homotopy is not None:
                        env._set_homotopy_class(homotopy)
                        eval_env._set_homotopy_class(homotopy)

                    if self.model_type != "HER":
                        env = DummyVecEnv([lambda: env])
                    else:
                        env = HERGoalEnvWrapper(env)
                        eval_env = HERGoalEnvWrapper(eval_env)
                        print("hc: ", env.env.homotopy_class)

                    self.model.set_env(env)
                    self.model.seed = self.seed
                    self.model = train(self.model, eval_env, self.timesteps,
                                       self.experiment_dir, self.is_save,
                                       self.eval_save_period, self.rets_path,
                                       lesson_idx)
Esempio n. 11
0
def main(argv):
    """
    Train a DDPG+HER reaching policy on the Kuka environment and save it.

    ``argv`` is accepted but not read. The environment is monitored into the
    module-level ``log_dir`` and the module-level ``callback`` is passed to
    ``learn``. The policy is saved under
    ``../pybullet_logs/kukareach_ddpg_her/reaching_policy``.
    """
    numControlledJoints = 6
    fixed = False
    timesteps = 1000000
    policy_name = "reaching_policy"

    kukaenv = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(),
                                 renders=False,
                                 useIK=0,
                                 isDiscrete=0,
                                 numControlledJoints=numControlledJoints,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)
    kukaenv = Monitor(kukaenv, log_dir, allow_early_resets=True)

    n_actions = kukaenv.action_space.shape[-1]
    noise_sigma = float(0.5) * np.ones(n_actions)
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=noise_sigma)

    model = HER(
        CustomPolicy,
        kukaenv,
        DDPG,
        n_sampled_goal=4,
        goal_selection_strategy='future',
        verbose=1,
        tensorboard_log="../pybullet_logs/kuka_reach_ddpg/reaching_DDPG_HER_PHASE",
        buffer_size=1000000,
        batch_size=64,
        random_exploration=0.3,
        action_noise=action_noise)

    # Run summary, one red line per entry.
    summary = [
        "-----Timesteps:",
        timesteps,
        "-----Number Joints Controlled:",
        numControlledJoints,
        "-----Object Position Fixed:",
        fixed,
        "-----Policy Name:",
        policy_name,
        "------",
        "Launch the script with -h for further info",
    ]
    for entry in summary:
        print(colored(entry, "red"))

    model.learn(total_timesteps=timesteps, log_interval=100, callback=callback)

    print("Saving model to kuka.pkl")
    model.save("../pybullet_logs/kukareach_ddpg_her/" + policy_name)
Esempio n. 12
0
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):
    """
    Train a HER+SAC policy on ``num_of_envs`` CausalWorld subprocess envs.

    Creates ``log_relative_path`` (raising if it already exists), writes the
    configuration to ``config.json`` there, trains in chunks of
    ``validate_every_timesteps`` steps without resetting the timestep
    counter, and finally saves the model as ``saved_model``. ``sac_config``
    is forwarded as keyword arguments to the ``HER`` constructor.
    Returns ``None``.
    """

    def _make_env(rank):
        """Return a factory building the HER-wrapped env for worker ``rank``."""

        def _init():
            world = CausalWorld(task=generate_task(task_generator_id=task_name),
                                skip_frame=skip_frame,
                                enable_visualization=False,
                                seed=seed_num + rank,
                                max_episode_length=maximum_episode_length)
            return HERGoalEnvWrapper(world)

        # Seeding happens when the factory is created, not when it is called.
        set_global_seeds(seed_num)
        return _init

    os.makedirs(log_relative_path)
    factories = [_make_env(rank=worker) for worker in range(num_of_envs)]
    env = SubprocVecEnv(factories)
    model = HER('MlpPolicy',
                env,
                SAC,
                verbose=1,
                policy_kwargs={'layers': [256, 256, 256]},
                **sac_config)

    config_path = os.path.join(log_relative_path, 'config.json')
    save_config_file(sac_config, _make_env(0)(), config_path)

    n_rounds = int(total_time_steps / validate_every_timesteps)
    for _ in range(n_rounds):
        model.learn(total_timesteps=validate_every_timesteps,
                    tb_log_name="sac",
                    reset_num_timesteps=False)
    model.save(os.path.join(log_relative_path, 'saved_model'))
Esempio n. 13
0
class HERSACAgent(Agent):
    """Agent backed by a HER model that uses SAC as its model class.

    On construction the best saved model is loaded when it exists on disk;
    otherwise a new ``MlpPolicy`` model is created.
    """

    name = "her-sac"

    def __init__(self,
                 env: ISettableGoalEnv,
                 verbose=1,
                 rank=0,
                 experiment_name="her-sac"):
        """
        :param env: environment the model is attached to
        :param verbose: verbosity passed to a newly created model (not used
            when an existing model is loaded)
        :param rank: forwarded to ``Dirs``
        :param experiment_name: suffix of the experiment name; the env's
            class name is prepended
        """
        self._env = env
        self._dirs = Dirs(
            experiment_name=f"{type(env).__name__}-{experiment_name}",
            rank=rank)
        options = {
            "env": env,
            "tensorboard_log": self._dirs.tensorboard,
            "model_class": SAC,
            "gamma": 1,
            "learning_rate": 3e-3
        }
        if os.path.isdir(self._dirs.models) and os.path.isfile(
                self._dirs.best_model):
            self._model = HER.load(load_path=self._dirs.best_model, **options)
            print(f"Loaded model {self._dirs.best_model}")
        else:
            self._model = HER(policy="MlpPolicy", verbose=verbose, **options)

    def __call__(self, obs: Observation) -> np.ndarray:
        """Return the model's deterministic action for ``obs``."""
        action, _ = self._model.predict(obs, deterministic=True)
        return action

    def train(self,
              timesteps: int,
              callbacks: Sequence[BaseCallback] = None,
              num_checkpoints=4) -> None:
        """
        Train the model, saving checkpoints along the way.

        :param timesteps: total number of timesteps to train for
        :param callbacks: extra callbacks run after the checkpoint callback
        :param num_checkpoints: number of checkpoints to spread over the run;
            must be positive
        """
        callbacks = [] if callbacks is None else callbacks
        # With fewer timesteps than checkpoints the integer division yields 0,
        # and a zero save frequency breaks the checkpoint callback's modulo
        # test; fall back to saving on every step.
        save_freq = max(1, timesteps // num_checkpoints)
        cb = CheckpointCallback(save_freq=save_freq,
                                save_path=self._dirs.models,
                                name_prefix=self._dirs.prefix)
        self._model.learn(total_timesteps=timesteps,
                          callback=CallbackList([cb, *callbacks]))
Esempio n. 14
0
def main(
  training_env: PSMCartesianHERDDPGEnv,
  eval_env: PSMCartesianHERDDPGEnv = None,
  log_dir='./.logs/results'
):

  os.makedirs(log_dir, exist_ok=True)

  # training_env = Monitor(training_env, log_dir)

  n_actions = training_env.action_space.shape[0]
  noise_std = 0.2
  # Currently using OU noise
  action_noise = OrnsteinUhlenbeckActionNoise(
    mean=np.zeros(n_actions),
    sigma=noise_std * np.ones(n_actions)
  )
  model_class = DDPG  # works also with SAC, DDPG and TD3

  rl_model_kwargs = {
    'actor_lr': 1e-3,
    'critic_lr': 1e-3,
    'action_noise': action_noise,
    'nb_train_steps': 300,
    'nb_rollout_steps': 100,
    'gamma': 0.95,
    'observation_range': (-1.5,
                          1.5),
    'random_exploration': 0.05,
    'normalize_observations': True,
    'critic_l2_reg': 0.01
  }

  # Available strategies (cf paper): future, final, episode, random
  model = HER(
    'MlpPolicy',
    training_env,
    model_class,
    verbose=1,
    n_sampled_goal=4,
    goal_selection_strategy='future',
    buffer_size=int(1e5),
    batch_size=128,
    tensorboard_log="./ddpg_dvrk_tensorboard/",
    **rl_model_kwargs
  )
  # Reset the model
  training_env.reset()
  # Create callbacks
  checkpoint_callback = CheckpointCallback(
    save_freq=100000,
    save_path="./ddpg_dvrk_tensorboard/"
  )  # save_path="./.model/model_checkpoint/") #save_freq=100000
  # eval_callback = EvalCallback(training_env, best_model_save_path='./ddpg_dvrk_tensorboard/best_model',
  #                             log_path=log_dir, eval_freq=500)
  callback = CallbackList([checkpoint_callback])  # , eval_callback])
  # Train the model
  model.learn(4000000, log_interval=100, callback=callback)
  model.save("./her_robot_env")
Esempio n. 15
0
 def __init__(self,
              env: ISettableGoalEnv,
              verbose=1,
              rank=0,
              experiment_name="her-sac"):
     """
     Attach a HER model (SAC model class) to ``env``.

     The best saved model is loaded when both the models directory and the
     best-model file exist; otherwise a new ``MlpPolicy`` model is created
     with the given ``verbose`` level.
     """
     self._env = env
     run_name = f"{type(env).__name__}-{experiment_name}"
     self._dirs = Dirs(experiment_name=run_name, rank=rank)

     # Settings common to loading and to creating a model.
     her_options = dict(env=env,
                        tensorboard_log=self._dirs.tensorboard,
                        model_class=SAC,
                        gamma=1,
                        learning_rate=3e-3)

     checkpoint_available = (os.path.isdir(self._dirs.models)
                             and os.path.isfile(self._dirs.best_model))
     if not checkpoint_available:
         self._model = HER(policy="MlpPolicy", verbose=verbose, **her_options)
         return

     self._model = HER.load(load_path=self._dirs.best_model, **her_options)
     print(f"Loaded model {self._dirs.best_model}")
    def train_curriculum(self, env_name="Merging-v0"):
        """
        Fine-tune pre-trained models on a one-lesson curriculum.

        For each of five seeds a pre-trained model, selected by
        ``self.expt_type`` and ``int(self.bs)``, is loaded, attached to a new
        environment configured with barrier size ``self.bs`` and homotopy
        class 'left', seeded, and handed to ``train``.

        :param env_name: gym id of the single lesson
        :raises ValueError: if ``self.expt_type`` is neither "ours" nor
            "finetune"
        """
        self.curriculum = [env_name]
        bs2model_ours = {1: B1R_B0L, 3: B3R_B0L, 5: B5R_B0L2, 7: B7R_B0L_B4L1}
        bs2model = {1: B1R, 3: B3R, 5: B5R, 7: B7R}
        for l, lesson in enumerate(self.curriculum):
            for seed in [201, 202, 203, 204, 205]:
                if self.expt_type == "ours":
                    model_info = bs2model_ours[int(self.bs)]
                elif self.expt_type == "finetune":
                    model_info = bs2model[int(self.bs)]
                else:
                    # Without this, model_info would be unbound (or stale
                    # from an earlier iteration) a few lines below.
                    raise ValueError(
                        f"Unknown expt_type {self.expt_type!r}; "
                        "expected 'ours' or 'finetune'")
                model_dir = os.path.join(model_info[0], model_info[1],
                                         model_info[2])
                if self.model_type == "PPO":
                    self.model = PPO2.load(
                        model_dir)  # loads pre-trained model
                elif self.model_type == "HER":
                    self.model = HER.load(model_dir)  # loads pre-trained model
                print(f"\ntraining on {lesson}, bs {self.bs}, seed{seed}")
                self.seed = seed
                self.experiment_name = f"{self.bs}_{self.expt_type}_{seed}"
                print("EXPT NAME: ", self.experiment_dir1,
                      self.experiment_name)
                self.experiment_dir = os.path.join(self.experiment_dir1,
                                                   self.experiment_name)
                self.create_eval_dir()
                env = gym.make(lesson)
                eval_env = gym.make(lesson)

                # Training and evaluation envs get the same configuration.
                env._set_barrier_size(self.bs)
                env._set_homotopy_class('left')
                eval_env._set_barrier_size(self.bs)
                eval_env._set_homotopy_class('left')

                if self.model_type == "HER":
                    env = HERGoalEnvWrapper(env)
                    eval_env = HERGoalEnvWrapper(eval_env)
                    print("bs: ", env.env.barrier_size)
                    print("hc: ", env.env.homotopy_class)
                else:
                    env = DummyVecEnv([lambda: env])
                self.model.set_env(env)
                self.model.set_random_seed(self.seed)
                self.model = train(self.model, eval_env, self.timesteps,
                                   self.experiment_dir, self.is_save,
                                   self.eval_save_period, self.rets_path, l)
Esempio n. 17
0
def main():
    """
    Train a DDPG+HER pushing policy on the randomised Panda environment.

    A new model is created (no previous policy is loaded), trained for one
    million timesteps and saved to
    ``../policies/pushing_fixed_HER_Dyn_Rand``.
    """
    model_class = DDPG
    action_space = 7
    fixed = True
    timesteps = 1000000

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                                 renders=False,
                                 useIK=0,
                                 isDiscrete=0,
                                 action_space=action_space,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)

    n_actions = env.action_space.shape[-1]
    noise_sigma = float(0.5) * np.ones(n_actions)
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=noise_sigma)

    # Available strategies (cf paper): future, final, episode, random.
    her_settings = {
        'n_sampled_goal': 4,
        'goal_selection_strategy': 'future',
        'verbose': 1,
        'tensorboard_log':
        "../pybullet_logs/panda_push_ddpg/stable_baselines/DDPG+HER_FIXED_DYN_RAND",
        'buffer_size': 1000000,
        'batch_size': 256,
        'random_exploration': 0.3,
        'action_noise': action_noise,
    }
    model = HER(CustomPolicy, env, model_class, **her_settings)

    model.learn(timesteps)
    print("Saving Policy")
    model.save("../policies/pushing_fixed_HER_Dyn_Rand")
Esempio n. 18
0
    def __init__(self, env_id, exp_id, model_path, trajectory_type,
                 episode_timesteps, noise_parameters):
        """
        Load the environment and HER model and set up trajectory tracking.

        ``trajectory_type`` selects which goal trajectory is prepared:
        "line" stores start and finish points and their difference, "circle"
        stores a centre offset, a radius and a sequence of angles in 5-degree
        steps. Any other value prepares no trajectory attributes.
        """
        self.env_id = env_id
        self.exp_id = exp_id
        self.trajectory_type = trajectory_type

        # Environment (wrapped for HER) and the model acting in it.
        raw_env = gym.make(env_id, noise_parameters=noise_parameters)
        self.env = HERGoalEnvWrapper(raw_env)
        self.model = HER.load(model_path, env=self.env)
        self.episode_timesteps = episode_timesteps

        if self.trajectory_type == "line":
            self.start_p = np.array([20, 0, 100]) / 1000
            self.finish_p = np.array([20, 40, 100]) / 1000
            self.del_p = self.finish_p - self.start_p
            self.current_goal = self.start_p
        elif self.trajectory_type == "circle":
            self.offset = np.array([20, 20, 100]) / 1000
            self.radius = 20.0 / 1000
            self.thetas = np.arange(0, 2 * np.pi, np.deg2rad(5))
            self.thetas_counter = 0
            # The circle starts at its offset point.
            self.start_p = self.offset
            self.current_goal = self.start_p

        # Reference time for the trajectory.
        self.prev_time = rospy.get_time()

        # One row per (episode, timestep) with three xyz triples r1..r3.
        triple_columns = [
            f'r{idx}{axis}' for idx in (1, 2, 3) for axis in 'xyz'
        ]
        self.shape_df = pd.DataFrame(
            columns=['episode', 'timestep'] + triple_columns)
        self.traj_complete = False

        self.achieved_goals = np.array([])
        self.desired_goals = np.array([])
        self.episode_count = 0
def load_model(model_info, model_type="PPO", baseline=None, pkl_file=None):
    """
    Load a pre-trained model, optionally through a transfer-learning baseline.

    :param model_info: indexable whose first three items are joined into the
        model path
    :param model_type: "PPO" or "HER"
    :param baseline: None (plain load), 'L2SP', 'PNN' or 'BSS'; any other
        value falls back to a plain load
    :param pkl_file: accepted but not used
    :return: the loaded model
    :raises NotImplementedError: for the HER + 'BSS' combination, which has
        no loader
    :raises ValueError: if ``model_type`` is neither "PPO" nor "HER"
    """
    model_dir = os.path.join(model_info[0], model_info[1], model_info[2])
    if model_type == "PPO":
        if baseline == 'L2SP':
            from baselines.L2SP.model import PPO2L2SP
            import baselines.L2SP.utils as L2SP_utils
            _, params = L2SP_utils.load_from_file(model_dir)
            model = PPO2L2SP.load(model_dir, original_params=params)
        elif baseline == 'PNN':
            from baselines.PNN.utils import looseload, resave_params_for_PPN
            output_dir = os.path.join(
                "output/updated_gridworld_continuous_PNN", 'resave',
                model_info[2])
            resave_params_for_PPN(model_dir, output_dir)
            model = looseload(PPO2, output_dir)
        elif baseline == 'BSS':
            from baselines.BSS.utils import resave_params_for_BSS
            from baselines.BSS.model import PPO2BSS
            output_dir = os.path.join(
                "output/updated_gridworld_continuous_BSS", 'resave',
                model_info[2])
            resave_params_for_BSS(model_dir, output_dir)
            model = PPO2BSS.load(output_dir, bss_coef=0.001, l2_coef=0.0005)
        else:
            model = PPO2.load(model_dir)

    elif model_type == "HER":
        if baseline == 'L2SP':
            from baselines_fetch.L2SP.model import HER2L2SP
            import baselines_fetch.L2SP.utils as L2SP_utils
            _, params = L2SP_utils.load_from_file(model_dir)
            model = HER2L2SP.load(model_dir, original_params=params)
        elif baseline == "PNN":
            from baselines_fetch.PNN.model import HER2PNN
            from baselines_fetch.PNN.utils import resave_params_for_PNN
            output_dir = os.path.join("output/fetch_PNN", 'resave',
                                      model_info[2])
            resave_params_for_PNN(model_dir, output_dir)
            model = HER2PNN.load(output_dir)
        elif baseline == "BSS":
            # This branch used to fall through and crash on the unbound
            # ``model`` at the return statement.
            raise NotImplementedError(
                "BSS baseline is not implemented for HER models")
        else:
            model = HER.load(model_dir)
    else:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'PPO' or 'HER'")
    return model
def heralgorithm():
    """
    Train a DDPG+HER model on the module-level ``env1`` and save it.

    Uses the 'future' goal selection strategy with four sampled goals,
    trains for 1000 timesteps and saves to ``./her_bit_env``.
    """
    her_settings = {
        'n_sampled_goal': 4,
        'goal_selection_strategy': 'future',
        'verbose': 1,
    }
    her_model = HER('MlpPolicy', env1, DDPG, **her_settings)

    her_model.learn(1000)
    her_model.save("./her_bit_env")
Esempio n. 21
0
def train_HER(env_train, model_name, timesteps=50000):
    """
    Train a HER model (SAC model class) and save it.

    The model is saved as ``model_name`` inside ``config.TRAINED_MODEL_DIR``
    and the time spent in ``learn`` is printed in minutes.

    :return: the trained model
    """
    started_at = time.time()
    her_settings = {'n_sampled_goal': 4, 'goal_selection_strategy': 'future'}
    model = HER('MlpPolicy',
                env_train,
                model_class=SAC,
                verbose=0,
                **her_settings)
    model.learn(total_timesteps=timesteps)
    # Saving is not included in the reported time.
    elapsed_minutes = (time.time() - started_at) / 60

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (HER): ', elapsed_minutes, ' minutes')
    return model
Esempio n. 22
0
def main():
    """
    Evaluate a saved HER model on the Panda grasping environment.

    Runs 50 deterministic evaluation episodes and prints the mean episode
    reward, mean episode length and mean success value.
    """
    grasp_env = PandaGraspGymEnv(urdf_root=object_data.getDataPath(),
                                 is_rendering=True,
                                 use_ik=True,
                                 is_discrete=True,
                                 num_controlled_joints=7,
                                 reward_type="sparse")
    env = HERGoalEnvWrapper(grasp_env)

    model = HER.load("logs/rl_model_1000000_steps.zip")

    results = evaluate_policy(model,
                              env,
                              n_eval_episodes=50,
                              render=False,
                              deterministic=True,
                              return_episode_rewards=True)
    episode_rewards, episode_lengths, episode_success = results

    averages = [
        np.mean(series)
        for series in (episode_rewards, episode_lengths, episode_success)
    ]
    print("Final Reward {}, Episode Length{}, Success Rate {}".format(
        *averages))
def main(load_policy=False):
    """
    Train a TD3+HER pushing policy on the randomised Panda environment.

    :param load_policy: when true, ``log_dir_policy`` is redirected and
        training continues from the saved policy
        ``PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl`` (logging to the
        ``..._FROM_FIXED_PHYSICS`` tensorboard directory); otherwise a new
        model is created.

    Uses the module-level ``log_dir`` (Monitor output directory) and
    ``callback`` (passed to ``learn``). Hyperparameters not listed below are
    left at the library defaults. The trained policy is saved to
    ``../policies/PUSHING_FIXED_TD3_DYN_RAND``.
    """
    global log_dir, log_dir_policy
    if load_policy:
        log_dir_policy = '../policies/PUSHING_TD3+HER_FIXED_POSITION_DYN_RAND_FROM_FIXED_PHYSICS'
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    fixed = True
    timesteps = 1500000

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                                 renders=False,
                                 useIK=0,
                                 isDiscrete=0,
                                 action_space=action_space,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    # Settings shared by a fresh model and a resumed one. Available goal
    # selection strategies (cf paper): future, final, episode, random.
    her_kwargs = dict(n_sampled_goal=4,
                      goal_selection_strategy='future',
                      buffer_size=1000000,
                      batch_size=256,
                      random_exploration=0.3,
                      action_noise=action_noise)

    # Build only the model that is actually trained; previously a fresh model
    # was always constructed and then thrown away when resuming.
    if load_policy:
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND_FROM_FIXED_PHYSICS",
            **her_kwargs)
    else:
        model = HER(
            CustomPolicy,
            env,
            model_class,
            verbose=1,
            tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND",
            **her_kwargs)

    model.learn(timesteps, callback=callback)
    model.save("../policies/PUSHING_FIXED_TD3_DYN_RAND")
    print("Finished train1")
def main(load_policy=False):
    """
    Train a TD3+HER agent on the fixed-position Panda pushing task (phase 1, IK).

    A monitored ``pandaPushGymEnvHER`` environment is created, then a ``HER``
    model is either built from scratch or, when ``load_policy`` is true,
    loaded from the saved fixed-position policy. The model is trained with
    the module-level ``callback`` and saved to
    ``../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK``.

    :param load_policy: (bool) when true, resume from the previously saved
        policy instead of starting from a freshly initialised model.
    """
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    fixed = True
    # 0 completely fixed, 1 slightly random radius, 2 big random radius
    object_position = 1
    timesteps = 5000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=1,
                             isDiscrete=discreteAction, action_space=action_space,
                             fixedPositionObj=fixed, includeVelObs=True,
                             object_position=object_position)
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Settings shared by the fresh and the resumed model (both log to the
    # same tensorboard directory in this phase).
    her_kwargs = dict(n_sampled_goal=4,
                      goal_selection_strategy=goal_selection_strategy,
                      tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                      buffer_size=1000000,
                      batch_size=256,
                      random_exploration=0.3,
                      action_noise=action_noise)

    # Build exactly one model: the fresh model used to be constructed and
    # then thrown away whenever a saved policy was loaded.
    if load_policy:
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env, **her_kwargs)
    else:
        model = HER(CustomTD3Policy, env, model_class, verbose=1, **her_kwargs)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK")
Esempio n. 25
0
def main(env):
    """
    Train a DDPG agent wrapped in HER on ``env`` and save it.

    Exploration uses Ornstein-Uhlenbeck action noise sized from the
    environment's action space. Checkpoints and tensorboard logs go to
    ``./ddpg_dvrk_tensorboard/``; the final model is written to
    ``./her_robot_env``.

    :param env: environment to train on; it must expose ``action_space`` and
        ``reset()`` and be accepted by ``HER``.
    """
    log_dir = "./ddpg_dvrk_tensorboard/"

    # Currently using OU noise, one independent component per action dimension.
    action_dim = env.action_space.shape[0]
    noise_std = 0.2
    ou_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(action_dim),
                                            sigma=noise_std * np.ones(action_dim))

    # Keyword arguments forwarded through HER to the wrapped DDPG model.
    ddpg_kwargs = dict(
        actor_lr=1e-3,
        critic_lr=1e-3,
        action_noise=ou_noise,
        nb_train_steps=300,
        nb_rollout_steps=100,
        gamma=0.95,
        observation_range=(-1.5, 1.5),
        random_exploration=0.05,
        normalize_observations=True,
        critic_l2_reg=0.01,
    )

    # Available strategies (cf paper): future, final, episode, random
    agent = HER('MlpPolicy', env, DDPG,
                verbose=1,
                n_sampled_goal=4,
                goal_selection_strategy='future',
                buffer_size=int(1e5),
                batch_size=128,
                tensorboard_log=log_dir,
                **ddpg_kwargs)

    # Reset the environment before training starts.
    env.reset()

    # Train, checkpointing every 100000 callback steps, then save the result.
    checkpoint_cb = CheckpointCallback(save_freq=100000, save_path=log_dir)
    agent.learn(4000000, log_interval=100, callback=checkpoint_cb)
    agent.save("./her_robot_env")
def launchAgent(model_name="PPO2"):
    """
    Train an agent on the mini-map image environment in checkpointed rounds.

    The function runs 1000 rounds of 3900 timesteps each. After every round
    the model is saved as ``detailedmap_<model_name>_<round>`` and deleted;
    the next round reloads the checkpoint written by the previous one.

    :param model_name: (str) algorithm to use: ``"HER"``, ``"DDPG"`` or
        ``"PPO2"`` (the default). Any other value selects DQN; since that
        branch reloads from ``detailedmap_DQN_<round>``, pass ``"DQN"`` so
        that the saved and reloaded file names agree.
    """
    import Reinforcement_AI.env.d_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    # The branches must be mutually exclusive. With independent ``if``
    # statements the final ``else`` belonged only to the PPO2 test, so a HER
    # or DDPG model was immediately overwritten by a DQN one.
    if model_name == "HER":
        model = HER(
            "CnnPolicy",
            env=image_env.DetailedMiniMapEnv(),
            model_class=DQN
        )
    elif model_name == "DDPG":
        model = DDPG(
            policy="CnnPolicy",
            env=image_env.DDPGImageEnv(),
            normalize_observations=True
        )
    elif model_name == "PPO2":
        # The vectorized env is kept so it can be handed to PPO2.load later.
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(
            policy="CnnPolicy",
            env=env,
            verbose=1
        )
    else:
        model = DQN(
            "CnnPolicy",  # policy
            env=image_env.DetailedMiniMapEnv(),  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    for i in range(1000):
        # Round 0 trains the freshly built model; later rounds resume from
        # the checkpoint saved at the end of the previous round.
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())
            elif model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i))
                model.set_env(image_env.DDPGImageEnv())
            elif model_name == "PPO2":
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())

        model.learn(total_timesteps=3900)

        model.save("detailedmap_" + model_name + "_" + str(i+1))
        # Drop the reference so the model can be freed before the reload.
        del model
Esempio n. 27
0
def test_model_manipulation(model_class, goal_selection_strategy):
    """
    Exercise the save / load / set_env round trip of a HER model.

    The test trains on a vectorized bit-flipping env, saves the model, and
    then checks that: loading with a ``VecNormalize`` env is rejected,
    predicting without an env raises, prediction works on a
    ``HERGoalEnvWrapper`` env, training can resume after ``set_env`` or
    after loading with an env, and ``n_sampled_goal`` survives the reload.

    :param model_class: off-policy algorithm class wrapped by HER
    :param goal_selection_strategy: HER goal selection strategy to use
    """
    save_path = './test_her.zip'
    is_continuous = model_class in [DDPG, SAC, TD3]

    def make_bit_env():
        # Every env in this test shares the same configuration.
        return BitFlippingEnv(N_BITS, continuous=is_continuous, max_steps=N_BITS)

    base_env = make_bit_env()
    env = DummyVecEnv([lambda: base_env])

    model = HER('MlpPolicy',
                env,
                model_class,
                n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy,
                verbose=0)
    model.learn(1000)
    model_predict(model, env, n_steps=100, additional_check=None)

    model.save(save_path)
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load(save_path, env=VecNormalize(env))

    model = HER.load(save_path)

    # Without a wrapped env (or any env passed to the model),
    # prediction is expected to fail.
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = HERGoalEnvWrapper(make_bit_env())
    model_predict(model, env_, n_steps=100, additional_check=None)

    # Re-attach the training env and continue learning.
    model.set_env(env)
    model.learn(1000)
    model_predict(model, env_, n_steps=100, additional_check=None)
    assert model.n_sampled_goal == 3

    del model

    # Load again, this time passing a raw (unwrapped) env at load time.
    env = make_bit_env()
    model = HER.load('./test_her', env=env)
    model.learn(1000)
    model_predict(model, env_, n_steps=100, additional_check=None)
    assert model.n_sampled_goal == 3

    # Remove the checkpoint written by this test.
    if os.path.isfile(save_path):
        os.remove(save_path)
import time

# Goal-based Fetch pick-and-place task.
env = gym.make('FetchPickAndPlace-v1')

# Available strategies: future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# SAC wrapped in HER; every keyword after ``verbose`` is forwarded to SAC.
model = HER(
    'MlpPolicy', env, SAC,
    n_sampled_goal=4,
    goal_selection_strategy=goal_selection_strategy,
    verbose=1,
    buffer_size=1000000,
    learning_rate=1e-3,
    gamma=0.95,
    batch_size=256,
    ent_coef='auto',
    random_exploration=0.3,
    learning_starts=1000,
    train_freq=1,
    policy_kwargs={'layers': [256, 256, 256]},
    tensorboard_log='./OpenAI/',
)

# Train for 8M timesteps, then persist the model.
model.learn(8000000)
model.save('./model2')

# WARNING: pass an env here, or wrap the environment with HERGoalEnvWrapper,
# otherwise the predict method cannot be used on the loaded model.
model = HER.load("./model2", env=env)
Esempio n. 29
0
# Train a HER model on the 'blueprint_construction' environment, reload it
# and roll the learned policy out for 100 steps.
env_name = 'blueprint_construction'
# NOTE: machine-specific absolute path; point it at the local checkout of
# multi-agent-emergence-environments before running.
core_dir = '/Users/abhijithneilabraham/Documents/GitHub/multi-agent-emergence-environments/'
envs_dir = 'mae_envs/envs'
xmls_dir = 'xmls'
env, _ = load_env(env_name,
                  core_dir=core_dir,
                  envs_dir=envs_dir,
                  xmls_dir=xmls_dir,
                  return_args_remaining=True)
# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
# NOTE(review): model_class is not defined in this snippet; it is presumably
# set earlier in the script - confirm before running.
model = HER('MlpPolicy',
            env,
            model_class,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            verbose=1)
# Train the model
model.learn(1000)

model.save("./her_bit_env")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./her_bit_env', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    # Start a new episode once the current one has ended; stepping an
    # environment after ``done`` is not valid under the Gym API.
    if done:
        obs = env.reset()
Esempio n. 30
0
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import HER, SAC

# Set up the REALRobot environment, train a SAC+HER model on it for a short
# run, then rebuild the environment for rendering.
print('setting up environment')
# Alternative construction through the gym registry (disabled):
#env = gym.make("REALRobot2020-R2J3-v0")
env = REALRobotEnv(objects=1)
# Per the original author: this wrapper does not return real goals yet; it
# only supplies placeholder samples so the observations match the format
# HER expects.
env = GoalWrapper(env, crop_obs=True)

print('setting up model')
model = HER('MlpPolicy',
            env,
            SAC,
            n_sampled_goal=4,
            goal_selection_strategy='future',
            verbose=1,
            buffer_size=int(1e6),
            learning_rate=1e-3,
            gamma=0.95,
            batch_size=256)
print('start learning')
model.learn(total_timesteps=256)
print('learning done')

# The environment has to be recreated here to make rendering possible
# (rendering does not work with the wrappers right now).
env = REALRobotEnv(objects=1)
env = GoalWrapper(env, crop_obs=True)
env.render("human")

print('display model')