def test_model_manipulation(model_class, goal_selection_strategy):
    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy', env, model_class, n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy, verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her', env=VecNormalize(env))

    model = HER.load('./test_her')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    env_ = HERGoalEnvWrapper(env_)

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.pkl'):
        os.remove('./test_her.pkl')
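# The test above calls a `model_predict(model, env, n_steps, additional_check)` helper that is
# not shown in this snippet. The sketch below is a hypothetical reconstruction of such a helper,
# not the original test utility: it rolls the loaded policy out for a fixed number of steps,
# resets on episode end, and runs an optional per-step check.
def model_predict(model, env, n_steps, additional_check=None):
    """Roll out `model` in `env` for `n_steps`, resetting whenever an episode ends."""
    obs = env.reset()
    for _ in range(n_steps):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        if additional_check is not None:
            additional_check(obs, action, reward, done)
        if done:
            obs = env.reset()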
def main(load_policy=True):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    gamma = 0.9
    memory_limit = 1000000
    timesteps = 15000000
    discreteAction = 0
    rend = False
    # learning rate

    env = bioEnv()
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomTD3Policy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/bioEnv_TD3",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/bioEnv_TD3",
                         buffer_size=1000000, batch_size=256,
                         random_exploration=0.3, action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    model.save("policy_TD3_Discr")
def main(env_id):
    env_type = 'robotics'
    env = gym.make(env_id)
    save_file = "/home/shivanik/fetch_trial.zip"
    video_file = "/home/shivanik/fetch_her_videos/"
    env_recorder = NonVecRecorder(env)
    video_len = 100

    model = HER.load(save_file, env=env)
    action_spec = env.action_space
    env.reset()

    i = 0
    record = False
    num_files = 4
    obs = env.reset()
    for i in range(num_files):
        fname = video_file + "%d.mp4" % i
        print(fname)
        env_recorder.init_video_writing(fname=fname)
        for j in range(video_len):
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            env_recorder.viz(True)
            if done:
                obs = env.reset()
        env_recorder.close()
        env.reset()
    env.close()
def load_model(eval_env):
    # WARNING: you must pass an env
    # or wrap your environment with HERGoalEnvWrapper to use the predict method
    model = HER.load('./her_robot_env', env=eval_env)
    count = 0
    step_num_arr = []
    for _ in range(20):
        number_steps = 0
        obs = eval_env.reset()
        for _ in range(400):
            action, _ = model.predict(obs)
            obs, reward, done, _ = eval_env.step(action)
            number_steps += 1
            # print(obs['achieved_goal'][0:3], obs['desired_goal'][0:3], reward)
            if done:
                step_num_arr.append(number_steps)
                count += 1
                print("----------------It reached terminal state -------------------")
                break
    print(
        "Robot reached the goal position successfully ", count,
        " times and the Average step count was ",
        np.average(np.array(step_num_arr))
    )
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 8000000
    rend = False
    obj_pose_rnd_std = 0

    env = pandaPushGymGoalEnv(renders=rend, use_IK=0,
                              numControlledJoints=action_space,
                              obj_pose_rnd_std=obj_pose_rnd_std,
                              includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomTD3Policy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
                         buffer_size=1000000, batch_size=256,
                         random_exploration=0.3, action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/TD3_phase1_target_fixed")
def test_HER(env, out_dir, seed=None, **kwargs):
    model = HER.load(os.path.join(out_dir, 'final_model.pkl'), env=env)
    # model.learn(total_timesteps=10000)

    # Evaluate the trained agent
    mean_reward = evaluate(env, model, num_steps=5000)

    return
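# `evaluate` is defined elsewhere in that project; the sketch below is a hypothetical stand-in
# (assumed signature `evaluate(env, model, num_steps)`) that returns the mean per-episode reward
# collected while running the trained agent for a fixed number of environment steps.
import numpy as np

def evaluate(env, model, num_steps=1000):
    """Return the mean episode reward of `model` on `env` over `num_steps` steps."""
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    return np.mean(episode_rewards)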
def launchAgent(model_name: str):
    """
    :param model_name: which model to run; must be HER, DDPG, PPO2, or any other value (treated as DQN).
                       For the current purpose this is expected to be set to PPO2.
    :return: the model after 1000 training cycles
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    if model_name == "HER":
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    elif model_name == "DDPG":
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    elif model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",              # policy
            env=env,                  # environment
            double_q=True,            # enable Double Q-learning
            prioritized_replay=True,  # enable the prioritized replay buffer
            verbose=0                 # log printing
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i), env)
            elif model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i), env)
            elif model_name == "PPO2":
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i), env)

        # print('model learn start')
        model.learn(total_timesteps=12500)  # smallest value for which FPS stays above 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        if i < 999:
            del model  # free memory; the next iteration reloads from the checkpoint just saved
        # print('model save end')

    return model
def train_curriculum_fetch(self, env_name="Merging-v0"):
    """
    Trains reward curriculum
    """
    self.curriculum = [env_name]
    bs2model_ours = {'RL': BR_BL0_BL1_BL5, 'LR': BL_BR0}
    bs2model = {'RL': BR_s, 'LR': BL_s}

    for l, lesson in enumerate(self.curriculum):
        for bs in bs2model.keys():
            self.bs = bs
            for seed in [101, 102]:
                if self.expt_type == "ours":
                    model_info = bs2model_ours[self.bs]
                else:
                    model_info = bs2model[self.bs]
                model_dir = os.path.join(model_info[0], model_info[1], model_info[2])

                if self.model_type == "PPO":
                    self.model = PPO2.load(model_dir)  # loads pre-trained model
                elif self.model_type == "HER":
                    self.model = HER.load(model_dir)  # loads pre-trained model

                print(f"\ntraining on {lesson}, bs {self.bs}, seed{seed}")
                self.seed = seed
                self.experiment_name = f"{self.bs}_{self.expt_type}_{seed}"
                print("EXPT NAME: ", self.experiment_dir1, self.experiment_name)
                self.experiment_dir = os.path.join(self.experiment_dir1,
                                                   self.experiment_name)
                self.create_eval_dir()

                env = gym.make(lesson)
                eval_env = gym.make(lesson)
                if self.bs == 'RL':
                    env._set_homotopy_class('left')
                    eval_env._set_homotopy_class('left')
                elif self.bs == 'LR':
                    env._set_homotopy_class('right')
                    eval_env._set_homotopy_class('right')

                if self.model_type == "HER":
                    env = HERGoalEnvWrapper(env)
                    eval_env = HERGoalEnvWrapper(eval_env)
                    print("hc: ", env.env.homotopy_class)
                else:
                    env = DummyVecEnv([lambda: env])

                self.model.set_env(env)
                self.model.seed = self.seed
                self.model = train(self.model, eval_env, self.timesteps,
                                   self.experiment_dir, self.is_save,
                                   self.eval_save_period, self.rets_path, l)
def train_curriculum(self, env_name="Merging-v0"):
    """
    Trains reward curriculum
    """
    self.curriculum = [env_name]
    bs2model_ours = {1: B1R_B0L, 3: B3R_B0L, 5: B5R_B0L2, 7: B7R_B0L_B4L1}
    bs2model = {1: B1R, 3: B3R, 5: B5R, 7: B7R}

    for l, lesson in enumerate(self.curriculum):
        for seed in [201, 202, 203, 204, 205]:
            if self.expt_type == "ours":
                model_info = bs2model_ours[int(self.bs)]
            elif self.expt_type == "finetune":
                model_info = bs2model[int(self.bs)]
            model_dir = os.path.join(model_info[0], model_info[1], model_info[2])

            if self.model_type == "PPO":
                self.model = PPO2.load(model_dir)  # loads pre-trained model
            elif self.model_type == "HER":
                self.model = HER.load(model_dir)  # loads pre-trained model

            print(f"\ntraining on {lesson}, bs {self.bs}, seed{seed}")
            self.seed = seed
            self.experiment_name = f"{self.bs}_{self.expt_type}_{seed}"
            print("EXPT NAME: ", self.experiment_dir1, self.experiment_name)
            self.experiment_dir = os.path.join(self.experiment_dir1,
                                               self.experiment_name)
            self.create_eval_dir()

            env = gym.make(lesson)
            eval_env = gym.make(lesson)
            env._set_barrier_size(self.bs)
            env._set_homotopy_class('left')
            eval_env._set_barrier_size(self.bs)
            eval_env._set_homotopy_class('left')

            if self.model_type == "HER":
                env = HERGoalEnvWrapper(env)
                eval_env = HERGoalEnvWrapper(eval_env)
                print("bs: ", env.env.barrier_size)
                print("hc: ", env.env.homotopy_class)
            else:
                env = DummyVecEnv([lambda: env])

            self.model.set_env(env)
            self.model.set_random_seed(self.seed)
            ### ENTROPY ###
            # self.model.ent_coef = 0.05
            self.model = train(self.model, eval_env, self.timesteps,
                               self.experiment_dir, self.is_save,
                               self.eval_save_period, self.rets_path, l)
def __init__(self, env_id, exp_id, model_path, trajectory_type, episode_timesteps,
             noise_parameters):
    self.env_id = env_id
    self.exp_id = exp_id
    self.trajectory_type = trajectory_type

    # Load model and environment
    self.env = HERGoalEnvWrapper(
        gym.make(env_id, **{'noise_parameters': noise_parameters}))
    self.model = HER.load(model_path, env=self.env)
    self.episode_timesteps = episode_timesteps

    # Setup subscriber for trajectory generator
    # self.line_trajectory_timer = rospy.Timer(rospy.Duration(0.1), self.line_trajectory_callback)
    # self.circle_trajectory_timer = rospy.Timer(rospy.Duration(0.01), self.circle_trajectory_callback)

    # Line trajectory settings
    if self.trajectory_type == "line":
        self.start_p = np.array([20, 0, 100]) / 1000
        self.finish_p = np.array([20, 40, 100]) / 1000
        self.del_p = self.finish_p - self.start_p
        self.current_goal = self.start_p

    # Circle trajectory settings
    if self.trajectory_type == "circle":
        self.offset = np.array([20, 20, 100]) / 1000
        self.radius = 20.0 / 1000
        self.thetas = np.arange(0, 2 * np.pi, np.deg2rad(5))
        self.thetas_counter = 0
        self.start_p = self.offset
        self.current_goal = self.start_p

    # Start timer
    self.prev_time = rospy.get_time()

    # Complete trajectory check
    self.shape_df = pd.DataFrame(columns=[
        'episode', 'timestep', 'r1x', 'r1y', 'r1z', 'r2x', 'r2y', 'r2z',
        'r3x', 'r3y', 'r3z'
    ])
    # self.goals_df = pd.DataFrame(columns=['ag_x', 'ag_y', 'ag_z', 'dg_x', 'dg_y', 'dg_z'])
    self.traj_complete = False
    self.achieved_goals = np.array([])
    self.desired_goals = np.array([])
    self.episode_count = 0
def load_model(model_info, model_type="PPO", baseline=None, pkl_file=None):
    model_dir = os.path.join(model_info[0], model_info[1], model_info[2])
    if model_type == "PPO":
        if baseline == 'L2SP':
            from baselines.L2SP.model import PPO2L2SP
            import baselines.L2SP.utils as L2SP_utils
            data, params = L2SP_utils.load_from_file(model_dir)
            model = PPO2L2SP.load(model_dir, original_params=params)
        elif baseline == 'PNN':
            from baselines.PNN.utils import looseload, resave_params_for_PPN
            output_dir = os.path.join("output/updated_gridworld_continuous_PNN",
                                      'resave', model_info[2])
            resave_params_for_PPN(model_dir, output_dir)
            model = looseload(PPO2, output_dir)
        elif baseline == 'BSS':
            from baselines.BSS.utils import resave_params_for_BSS
            from baselines.BSS.model import PPO2BSS
            output_dir = os.path.join("output/updated_gridworld_continuous_BSS",
                                      'resave', model_info[2])
            resave_params_for_BSS(model_dir, output_dir)
            model = PPO2BSS.load(output_dir, bss_coef=0.001, l2_coef=0.0005)
        else:
            model = PPO2.load(model_dir)
    elif model_type == "HER":
        if baseline == 'L2SP':
            from baselines_fetch.L2SP.model import HER2L2SP
            import baselines_fetch.L2SP.utils as L2SP_utils
            data, params = L2SP_utils.load_from_file(model_dir)
            model = HER2L2SP.load(model_dir, original_params=params)
        elif baseline == "PNN":
            from baselines_fetch.PNN.model import HER2PNN
            from baselines_fetch.PNN.utils import resave_params_for_PNN
            output_dir = os.path.join("output/fetch_PNN", 'resave', model_info[2])
            resave_params_for_PNN(model_dir, output_dir)
            model = HER2PNN.load(output_dir)
        elif baseline == "BSS":
            pass
        else:
            model = HER.load(model_dir)
    return model
def main(load_policy=False):
    global log_dir, log_dir_policy
    if load_policy:
        log_dir_policy = '../policies/PUSHING_TD3+HER_FIXED_POSITION_DYN_RAND_FROM_FIXED_PHYSICS'
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    fixed = True
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1500000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(), renders=rend,
                                 useIK=0, isDiscrete=discreteAction,
                                 action_space=action_space, fixedPositionObj=fixed,
                                 includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomPolicy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND_FROM_FIXED_PHYSICS",
                         buffer_size=1000000, batch_size=256,
                         random_exploration=0.3, action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps, callback=callback)
    model.save("../policies/PUSHING_FIXED_TD3_DYN_RAND")
    print("Finished train1")
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    fixed = True
    # 0 completely fixed, 1 slightly random radius, 2 big random radius
    object_position = 1
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 5000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(), renders=rend,
                             useIK=1, isDiscrete=discreteAction,
                             action_space=action_space, fixedPositionObj=fixed,
                             includeVelObs=True, object_position=object_position)
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(CustomTD3Policy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                         buffer_size=1000000, batch_size=256,
                         random_exploration=0.3, action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK")
def main():
    panda_env = PandaGraspGymEnv(urdf_root=object_data.getDataPath(),
                                 is_rendering=True,
                                 use_ik=True,
                                 is_discrete=True,
                                 num_controlled_joints=7,
                                 reward_type="sparse")
    env = HERGoalEnvWrapper(panda_env)
    model = HER.load("logs/rl_model_1000000_steps.zip")

    episode_rewards, episode_lengths, episode_success = evaluate_policy(
        model, env, n_eval_episodes=50, render=False, deterministic=True,
        return_episode_rewards=True)

    print("Final Reward {}, Episode Length {}, Success Rate {}".format(
        np.mean(episode_rewards), np.mean(episode_lengths),
        np.mean(episode_success)))
def __init__(self, env: ISettableGoalEnv, verbose=1, rank=0, experiment_name="her-sac"):
    self._env = env
    self._dirs = Dirs(experiment_name=f"{type(env).__name__}-{experiment_name}", rank=rank)
    options = {
        "env": env,
        "tensorboard_log": self._dirs.tensorboard,
        "model_class": SAC,
        "gamma": 1,
        "learning_rate": 3e-3
    }
    if os.path.isdir(self._dirs.models) and os.path.isfile(self._dirs.best_model):
        self._model = HER.load(load_path=self._dirs.best_model, **options)
        print(f"Loaded model {self._dirs.best_model}")
    else:
        self._model = HER(policy="MlpPolicy", verbose=verbose, **options)
#env = SubprocVecEnv([make_env(env_id, log_dir, i+worker_id) for i in range(num_env)])
model_class = DQN

# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

model = HER('MlpPolicy', env, model_class, n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy, verbose=1)
model.learn(total_timesteps=1000)
model.save(log_dir + "model")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load(log_dir + "model", env=env)

# evaluate agent
episodes = 100
ep_r = []
ep_l = []
for e in range(episodes):
    obs = env.reset()
    total_r = 0.
    total_l = 0.
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, infos = env.step(action)
        total_l += 1.
        total_r += rewards
        if dones:
            # record per-episode return and length, then move on to the next episode
            ep_r.append(total_r)
            ep_l.append(total_l)
            break
def launchAgent():
    import Reinforcement_AI.env.d_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    model_name = "PPO2"

    if model_name == "HER":
        model = HER(
            "CnnPolicy",
            env=image_env.DetailedMiniMapEnv(),
            model_class=DQN
        )
    elif model_name == "DDPG":
        model = DDPG(
            policy="CnnPolicy",
            env=image_env.DDPGImageEnv(),
            normalize_observations=True
        )
    elif model_name == "PPO2":
        # env = image_env.DetailedMiniMapEnv()
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(
            policy="CnnPolicy",
            env=env,
            verbose=1
        )
    else:
        model = DQN(
            "CnnPolicy",                          # policy
            env=image_env.DetailedMiniMapEnv(),   # environment
            double_q=True,                        # enable Double Q-learning
            prioritized_replay=True,              # enable the prioritized replay buffer
            verbose=0                             # log printing
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())
            elif model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i))
                model.set_env(image_env.DDPGImageEnv())
            elif model_name == "PPO2":
                # print('set env')
                # ppo2_env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
                # print('get model')
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
                # print('set model env')
                # model.set_env(ppo2_env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())

        # print('model learn start')
        model.learn(total_timesteps=3900)
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        del model
env, _ = load_env(env_name, core_dir=core_dir, envs_dir=envs_dir, xmls_dir=xmls_dir,
                  return_args_remaining=True)

# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER('MlpPolicy', env, model_class, n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy, verbose=1)
# Train the model
model.learn(1000)
model.save("./her_bit_env")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./her_bit_env', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)

    if done:
        obs = env.reset()
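# The WARNING above mentions an alternative to passing `env=` at load time: wrapping the
# goal-based environment in HERGoalEnvWrapper so its dict observations are flattened into the
# layout the saved policy expects. A minimal sketch of that route, assuming the same saved file
# and a gym.GoalEnv-style `env` as above (this mirrors the pattern used in the test snippet
# earlier in this section, not this particular script):
from stable_baselines.her import HERGoalEnvWrapper

model = HER.load('./her_bit_env')       # no env passed at load time
wrapped_env = HERGoalEnvWrapper(env)    # flattens the dict observation space
obs = wrapped_env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = wrapped_env.step(action)
    if done:
        obs = wrapped_env.reset()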
            gamma=0.95, batch_size=256,
            ent_coef='auto', random_exploration=0.3, learning_starts=1000,
            train_freq=1, policy_kwargs=dict(layers=[256, 256, 256]),
            tensorboard_log="./OpenAI/")

# Train the model
model.learn(int(8e6))
model.save("./model2")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./model2', env=env)

obs = env.reset()
episodes = 0
successes = 0
step = 0
while episodes < 50:
    step += 1
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    env.render()
    if done or step > 1000:
        obs = env.reset()
        episodes += 1
        if _['is_success']:
            successes += 1
                                            sigma=float(0.5) * np.ones(n_actions))

model = DDPG(MlpPolicy, env, verbose=1, param_noise=None, action_noise=action_noise)
# Train the model
model.learn(1000)
model.save("./hideandseek")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./hideandseek', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)

    if done:
        obs = env.reset()

# print(main.__doc__)

#if __name__ == '__main__':
#    logging.getLogger('').handlers = []
#    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
def evaluation(env_id, exp_id, model_path, num_episodes, output_path):
    env = HERGoalEnvWrapper(gym.make(env_id))
    model = HER.load(model_path, env=env)
    seed = np.random.randint(0, 10)
    set_global_seeds(seed)

    goal_errors = np.empty((num_episodes), dtype=float)
    B_errors = np.empty((num_episodes), dtype=float)
    alpha_errors = np.empty((num_episodes), dtype=float)
    q_B_achieved = np.empty((num_episodes, 3), dtype=float)
    q_alpha_achieved = np.empty((num_episodes, 3), dtype=float)
    q_B_desired = np.empty((num_episodes, 3), dtype=float)
    q_alpha_desired = np.empty((num_episodes, 3), dtype=float)
    desired_goals = np.empty((num_episodes, 3), dtype=float)
    achieved_goals = np.empty((num_episodes, 3), dtype=float)
    starting_positions = np.empty((num_episodes, 3), dtype=float)
    q_B_starting = np.empty((num_episodes, 3), dtype=float)
    q_alpha_starting = np.empty((num_episodes, 3), dtype=float)
    # TODO: pre-allocate memory

    for episode in range(num_episodes):
        print('episode: ', episode)
        # Run random episodes and save sequence of actions and states to plot in matlab
        episode_reward = 0
        ep_len = 0
        obs = env.reset()
        while True:
            action, _ = model.predict(obs, deterministic=True)
            action = np.clip(action, env.action_space.low, env.action_space.high)
            obs, reward, done, infos = env.step(action)

            episode_reward += reward
            ep_len += 1

            if done or infos.get('is_success', False):
                goal_errors[episode] = infos.get('errors_pos')
                q_B_desired[episode, :] = infos.get('q_desired')[:3]
                q_alpha_desired[episode, :] = infos.get('q_desired')[3:]
                q_B_achieved[episode, :] = infos.get('q_achieved')[:3]
                q_alpha_achieved[episode, :] = infos.get('q_achieved')[3:]
                desired_goals[episode, :] = infos.get('desired_goal')
                achieved_goals[episode, :] = infos.get('achieved_goal')
                starting_positions[episode, :] = infos.get('starting_position')
                q_B_starting[episode, :] = infos.get('q_starting')[:3]
                q_alpha_starting[episode, :] = infos.get('q_starting')[3:]
                break

    print('mean_errors: ', np.mean(goal_errors))

    eval_df = pd.DataFrame(
        data=np.column_stack((desired_goals, achieved_goals, starting_positions,
                              q_B_desired, q_B_achieved, q_B_starting,
                              q_alpha_desired, q_alpha_achieved, q_alpha_starting)),
        columns=[
            'desired_goal_x', 'desired_goal_y', 'desired_goal_z',
            'achieved_goal_x', 'achieved_goal_y', 'achieved_goal_z',
            'starting_position_x', 'starting_position_y', 'starting_position_z',
            'B_desired_1', 'B_desired_2', 'B_desired_3',
            'B_achieved_1', 'B_achieved_2', 'B_achieved_3',
            'B_starting_1', 'B_starting_2', 'B_starting_3',
            'alpha_desired_1', 'alpha_desired_2', 'alpha_desired_3',
            'alpha_achieved_1', 'alpha_achieved_2', 'alpha_achieved_3',
            'alpha_starting_1', 'alpha_starting_2', 'alpha_starting_3',
        ])
    eval_df.to_csv(output_path)
model_class = DDPG

# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER('MlpPolicy', env, model_class, n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy, verbose=1)

# # Train the model
# model.learn(1000)
# model.save("her_fetch_reach_env")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('her_fetch_reach_env', env=env)

obs = env.reset()
for _ in range(500):
    env.render()
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)

    if done:
        obs = env.reset()

# while not done:
#     env.render()
#     action, _ = model.predict(obs)
#     obs, reward, done, _ = env.step(action)
fixed = False
# -g
gamma = 0.9
# -b
batch_size = 256
# -m
memory_limit = 1000000
# -t
timesteps = 2000000
discreteAction = 0
rend = False

env = bioEnv()

goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER.load("models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl", env=env)

obs = env.reset()
i = 0
for _ in range(10000):
    i += 1
    action, _ = model.predict(obs)
    print(action)
    obs, reward, done, _ = env.step(action)
    if done:
        print(str(i) + " " + str(done))
        obs = env.reset()
# -b
batch_size = 16
# -m
memory_limit = 1000000
# -r
normalize_returns = True
# -t
timesteps = 10000000
policy_name = "pushing_policy"
discreteAction = 0
rend = True

env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(), renders=rend,
                             useIK=0, isDiscrete=discreteAction,
                             numControlledJoints=numControlledJoints,
                             fixedPositionObj=fixed, includeVelObs=True)

# Wrap the model
model = HER.load("../policies/pushing_fixed_HER_Dyn_Rand0.pkl", env=env)

obs = env.reset()
for i in range(10000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done or i == 500:
        obs = env.reset()
fixed = False
# -o
normalize_observations = False
# -g
gamma = 0.9
# -b
batch_size = 64
# -m
memory_limit = 1000000
# -r
normalize_returns = False
# -t
timesteps = 1000000
policy_name = "pushing_policy"
discreteAction = 0
rend = False

env = bioEnv()

goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER.load("models/TD3/policy_TD3_new.pkl", env=env)

obs = env.reset()
for _ in range(10000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done:
        obs = env.reset()
import gym
import time

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, DQN, HER, DDPG

import synergyenvs

env = gym.make("GraspBoxPybullet-v0")
env.render()
o = env.reset()

# model = PPO2(MlpPolicy, env, verbose=1)
model = HER('MlpPolicy', env, DDPG, n_sampled_goal=4, verbose=0)
# load() is a classmethod that returns a new model; calling it on the instance without
# reassigning would silently discard the loaded parameters, so reassign here.
model = HER.load("./her_graspbox-1", env=env)

env.camera_adjust()

for _ in range(10000):
    env.render()
    action, _states = model.predict(o)
    # action = env.action_space.sample()
    o, r, done, info = env.step(action)
    print(o, r, done, info)
    if done:
        o = env.reset()
    time.sleep(0.1)

env.close()
memory_limit = 1000000
# -r
normalize_returns = True
# -t
timesteps = 100000
policy_name = "pushing_policy"
discreteAction = 0
rend = True

env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(), renders=rend,
                         useIK=0, isDiscrete=discreteAction,
                         action_space=action_space, fixedPositionObj=fixed,
                         includeVelObs=True, object_position=0, test_phase=True)

goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER.load("../policies/pushing_DDPG_HER_PHASE_1best_model.pkl", env=env)

obs = env.reset()
for _ in range(10000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done:
        obs = env.reset()
# NOTE: it works even without action noise
# n_actions = env.action_space.shape[0]
# noise_std = 0.2
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
# model = HER('MlpPolicy', env, DDPG, n_sampled_goal=n_sampled_goal,
#             goal_selection_strategy='future',
#             verbose=1, buffer_size=int(1e6),
#             actor_lr=1e-3, critic_lr=1e-3, action_noise=action_noise,
#             gamma=0.95, batch_size=256,
#             policy_kwargs=dict(layers=[256, 256, 256]))

model.learn(int(2e5))
model.save('her_sac_highway')

# Load saved model
model = HER.load('her_sac_highway', env=env)
obs = env.reset()

# Evaluate the agent
episode_reward = 0
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    if done or info.get('is_success', False):
        print("Reward:", episode_reward, "Success?", info.get('is_success', False))
        episode_reward = 0.0
        obs = env.reset()
normalize_observations = False
# -g
gamma = 0.9
# -b
batch_size = 16
# -m
memory_limit = 1000000
# -r
normalize_returns = True
# -t
timesteps = 1000000
policy_name = "pushing_policy"
discreteAction = 0
rend = True

env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(), renders=rend,
                             useIK=0, isDiscrete=discreteAction,
                             action_space=action_space, fixedPositionObj=fixed,
                             includeVelObs=True, object_position=0, test_phase=True,
                             alg='td3_normal_policy_to_different_physics',
                             type_physics=2, max_episode_steps=500)

goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                 env=env)

obs = env.reset()
for _ in range(10000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done:
        obs = env.reset()
# -r
normalize_returns = True
# -t
timesteps = 1000000
policy_name = "pushing_policy"
discreteAction = 0
rend = True

env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(), renders=rend,
                         useIK=0, isDiscrete=discreteAction,
                         action_space=action_space, fixedPositionObj=fixed,
                         includeVelObs=True, object_position=1)

goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER.load(
    "../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1best_model.pkl",
    env=env)

obs = env.reset()
for _ in range(10000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done:
        obs = env.reset()