def test_long_episode(model_class):
    """
    Train HER when the first rollout ends before the first episode does.

    The episode lasts ``n_bits`` steps while the rollout length / batch size
    is set below that, so the replay buffer is still empty after the first
    rollout. The test passes if ``learn`` completes without raising.
    """
    n_bits = 10
    continuous = model_class in [DDPG, SAC, TD3]
    env = BitFlippingEnv(n_bits, continuous=continuous, max_steps=n_bits)

    if model_class == DDPG:
        # nb_rollout_steps < n_bits
        kwargs = {'nb_rollout_steps': 9}
    elif model_class in [DQN, SAC, TD3]:
        # batch_size < n_bits
        kwargs = {'batch_size': 8, 'learning_starts': 0}
    else:
        kwargs = {}

    model = HER('MlpPolicy', env, model_class,
                n_sampled_goal=4,
                goal_selection_strategy='future',
                verbose=0,
                **kwargs)
    model.learn(100)
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, her_config, total_time_steps,
                 validate_every_timesteps, task_name):
    """
    Train a HER + SAC model on a CausalWorld task with periodic checkpoints.

    :param num_of_envs: only used to scale the checkpoint frequency
    :param log_relative_path: directory the checkpoints are written to
    :param maximum_episode_length: forwarded to ``CausalWorld``
    :param skip_frame: forwarded to ``CausalWorld``
    :param seed_num: seed for the environment, the global RNGs and the model
    :param her_config: extra keyword arguments forwarded to ``HER``
    :param total_time_steps: number of timesteps passed to ``learn``
    :param validate_every_timesteps: checkpoint period before division by
        ``num_of_envs``
    :param task_name: task generator id passed to ``generate_task``
    """
    task = generate_task(
        task_generator_id=task_name,
        dense_reward_weights=np.array([100000, 0, 0, 0]),
        fractional_reward_weight=0,
    )
    world = CausalWorld(
        task=task,
        skip_frame=skip_frame,
        enable_visualization=False,
        seed=seed_num,
        max_episode_length=maximum_episode_length,
    )
    goal_env = HERGoalEnvWrapper(world)
    env = CurriculumWrapper(
        goal_env,
        intervention_actors=[GoalInterventionActorPolicy()],
        actives=[(0, 1000000000, 1, 0)],
    )
    set_global_seeds(seed_num)

    save_every = int(validate_every_timesteps / num_of_envs)
    checkpoint_callback = CheckpointCallback(save_freq=save_every,
                                             save_path=log_relative_path,
                                             name_prefix='model')
    model = HER(MlpPolicy,
                env,
                SAC,
                verbose=1,
                policy_kwargs=dict(layers=[256, 256, 256]),
                **her_config,
                seed=seed_num)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="her_sac",
                callback=checkpoint_callback)
def test_her(model_class, goal_selection_strategy, discrete_obs_space):
    """
    Smoke-test HER training on the bit-flipping environment.

    Builds a ``BitFlippingEnv`` (continuous actions for DDPG/SAC/TD3) and
    trains for 1000 steps with the given goal selection strategy.
    """
    continuous = model_class in [DDPG, SAC, TD3]
    env = BitFlippingEnv(N_BITS,
                         continuous=continuous,
                         max_steps=N_BITS,
                         discrete_obs_space=discrete_obs_space)

    kwargs = {}
    if model_class in [DDPG, SAC, TD3]:
        # Take random actions 10% of the time
        kwargs = {'random_exploration': 0.1}

    model = HER('MlpPolicy', env, model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=0,
                **kwargs)
    model.learn(1000)
def launchAgent(model_name: str):
    """
    Train an image-based agent for 1000 learn/save/reload cycles.

    :param model_name: kind of model to run. Must be "HER", "DDPG", "PPO2"
        or any other value (which selects DQN). For now it is intended to be
        set to "PPO2".
    :return: the model after 1000 cycles
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    # One exclusive branch per model kind. With independent `if` statements
    # the trailing `else` also ran for "HER" and "DDPG" and silently replaced
    # the freshly built model with a DQN.
    if model_name == "HER":
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    elif model_name == "DDPG":
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    elif model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",  # policy
            env=env,  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    # Class used to reload a checkpoint; anything unknown falls back to DQN,
    # mirroring the construction above.
    loader = {"HER": HER, "DDPG": DDPG, "PPO2": PPO2}.get(model_name, DQN)

    n_cycles = 1000
    for i in range(n_cycles):
        if i != 0:
            # Reload the checkpoint written at the end of the previous cycle.
            # The file name is built exactly like the save name below, so the
            # DQN fallback also finds its checkpoint when model_name is not
            # literally "DQN".
            model = loader.load("detailedmap_" + model_name + "_" + str(i), env)

        # print('model learn start')
        model.learn(total_timesteps=12500)  # minimum value at which FPS exceeds 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        if i != n_cycles - 1:
            # Drop the in-memory model before the next reload, but keep the
            # last one so it can be returned (the original deleted it and
            # then failed on `return model`).
            del model
        # print('model save end')

    return model
def main(env_id):
    """
    Roll out a saved HER policy on ``env_id`` and record it to video files.

    Loads the model from a fixed path, then writes four videos of 100 steps
    each (``0.mp4`` … ``3.mp4``) into a fixed output directory. The
    environment is reset whenever an episode reports ``done``.

    :param env_id: gym environment id passed to ``gym.make``
    """
    env = gym.make(env_id)
    save_file = "/home/shivanik/fetch_trial.zip"
    video_file = "/home/shivanik/fetch_her_videos/"
    env_recorder = NonVecRecorder(env)
    video_len = 100
    num_files = 4

    try:
        model = HER.load(save_file, env=env)
        env.reset()
        obs = env.reset()
        for i in range(num_files):
            fname = video_file + "%d.mp4" % i
            print(fname)
            env_recorder.init_video_writing(fname=fname)
            try:
                for _ in range(video_len):
                    action, _ = model.predict(obs)
                    obs, reward, done, _ = env.step(action)
                    env_recorder.viz(True)
                    if done:
                        obs = env.reset()
            finally:
                # Always finalise the current video, even if a step fails.
                env_recorder.close()
        env.reset()
    finally:
        # Release the environment on both the success and the error path.
        env.close()
def load_model(eval_env):
    """
    Load ``./her_robot_env`` and run it for 20 evaluation episodes.

    Each episode is capped at 400 steps. Episodes that report ``done`` within
    the cap are counted, and the number of steps they took is averaged and
    printed at the end. Nothing is returned.

    :param eval_env: environment used both for loading and for the rollouts
    """
    # WARNING: you must pass an env
    # or wrap your environment with HERGoalEnvWrapper to use the predict method
    model = HER.load('./her_robot_env', env=eval_env)

    n_episodes = 20
    max_steps = 400
    steps_to_done = []  # one entry per episode that terminated

    for _episode in range(n_episodes):
        obs = eval_env.reset()
        for step in range(1, max_steps + 1):
            action, _ = model.predict(obs)
            obs, reward, done, _ = eval_env.step(action)
            # print(obs['achieved_goal'][0:3], obs['desired_goal'][0:3], reward)
            if done:
                steps_to_done.append(step)
                print("----------------It reached terminal state -------------------")
                break

    print(
        "Robot reached the goal position successfully ",
        len(steps_to_done),
        " times and the Average step count was ",
        np.average(np.array(steps_to_done))
    )
def main(load_policy=True):
    """
    Train a TD3 + HER agent on ``bioEnv`` and save the resulting policy.

    :param load_policy: when true, continue from the saved curriculum policy;
        otherwise start from a freshly initialised model. Only the model that
        is actually trained gets built (the original always built a fresh one
        first and then threw it away when loading).
    """
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    timesteps = 15000000
    goal_selection_strategy = 'future'

    env = bioEnv()
    env = Monitor(env, log_dir, allow_early_resets=True)

    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    # Settings shared by the fresh and the loaded model.
    her_kwargs = dict(
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        tensorboard_log="../pybullet_logs/bioEnv_TD3",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise,
    )

    if load_policy:
        model = HER.load(
            "models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl",
            env=env,
            **her_kwargs)
    else:
        # Wrap the model
        model = HER(CustomTD3Policy, env, model_class, verbose=1, **her_kwargs)

    model.learn(timesteps, log_interval=100, callback=callback)
    model.save("policy_TD3_Discr")
def test_HER(env, out_dir, seed=None, **kwargs):
    """
    Load ``final_model.pkl`` from ``out_dir`` and evaluate it for 5000 steps.

    :param env: environment the model is attached to and evaluated on
    :param out_dir: directory containing ``final_model.pkl``
    :param seed: unused
    :param kwargs: unused
    """
    model_path = os.path.join(out_dir, 'final_model.pkl')
    model = HER.load(model_path, env=env)
    # model.learn(total_timesteps=10000)

    # Evaluate the trained agent
    mean_reward = evaluate(env, model, num_steps=5000)
def main(load_policy=False):
    """
    Train a TD3 + HER pushing policy (phase 1, fixed target) and save it.

    :param load_policy: when true, continue from the saved fixed-position
        policy; otherwise start from a freshly initialised model. Only the
        model that is actually trained gets built (the original always built
        a fresh one first and then threw it away when loading).
    """
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    timesteps = 8000000
    rend = False
    obj_pose_rnd_std = 0
    goal_selection_strategy = 'future'

    env = pandaPushGymGoalEnv(renders=rend,
                              use_IK=0,
                              numControlledJoints=action_space,
                              obj_pose_rnd_std=obj_pose_rnd_std,
                              includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    # Settings shared by the fresh and the loaded model.
    her_kwargs = dict(
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        tensorboard_log=
        "../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise,
    )

    if load_policy:
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            **her_kwargs)
    else:
        # Wrap the model
        model = HER(CustomTD3Policy, env, model_class, verbose=1, **her_kwargs)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/TD3_phase1_target_fixed")
def train_curriculum_fetch(self, env_name="Merging-v0"):
    """
    Trains reward curriculum

    For every lesson, barrier setting ('RL', 'LR') and seed (101, 102) a
    pre-trained model is loaded, attached to a fresh environment with the
    matching homotopy class, and handed to ``train``.

    :param env_name: gym id of the single lesson in the curriculum
    """
    self.curriculum = [env_name]
    bs2model_ours = {'RL': BR_BL0_BL1_BL5, 'LR': BL_BR0}
    bs2model = {'RL': BR_s, 'LR': BL_s}

    for l, lesson in enumerate(self.curriculum):
        for bs in bs2model:
            self.bs = bs
            for seed in [101, 102]:
                if self.expt_type == "ours":
                    model_info = bs2model_ours[self.bs]
                else:
                    model_info = bs2model[self.bs]
                model_dir = os.path.join(model_info[0], model_info[1],
                                         model_info[2])

                if self.model_type == "PPO":
                    self.model = PPO2.load(
                        model_dir)  # loads pre-trained model
                elif self.model_type == "HER":
                    self.model = HER.load(
                        model_dir)  # loads pre-trained model

                print(f"\ntraining on {lesson}, bs {self.bs}, seed{seed}")
                self.seed = seed
                self.experiment_name = f"{self.bs}_{self.expt_type}_{seed}"
                print("EXPT NAME: ", self.experiment_dir1,
                      self.experiment_name)
                self.experiment_dir = os.path.join(self.experiment_dir1,
                                                   self.experiment_name)
                self.create_eval_dir()

                env = gym.make(lesson)
                eval_env = gym.make(lesson)
                if self.bs == 'RL':
                    env._set_homotopy_class('left')
                    eval_env._set_homotopy_class('left')
                elif self.bs == 'LR':
                    env._set_homotopy_class('right')
                    eval_env._set_homotopy_class('right')

                if self.model_type == "HER":
                    env = HERGoalEnvWrapper(env)
                    eval_env = HERGoalEnvWrapper(eval_env)
                    print("hc: ", env.env.homotopy_class)
                else:
                    env = DummyVecEnv([lambda: env])

                self.model.set_env(env)
                # Keep the attribute for anything that reads it, and also
                # reseed through set_random_seed the way train_curriculum
                # does: assigning `.seed` on an already-loaded model does
                # not by itself reseed anything.
                self.model.seed = self.seed
                self.model.set_random_seed(self.seed)

                self.model = train(self.model, eval_env, self.timesteps,
                                   self.experiment_dir, self.is_save,
                                   self.eval_save_period, self.rets_path, l)
def main(argv):
    """
    Train a DDPG + HER reaching policy for the Kuka arm and save it.

    :param argv: unused; kept so existing callers keep working
    """
    numControlledJoints = 6
    fixed = False
    timesteps = 1000000
    policy_name = "reaching_policy"
    discreteAction = 0
    rend = False

    kukaenv = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 numControlledJoints=numControlledJoints,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)
    kukaenv = Monitor(kukaenv, log_dir, allow_early_resets=True)

    n_actions = kukaenv.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    model_class = DDPG
    goal_selection_strategy = 'future'
    model = HER(
        CustomPolicy,
        kukaenv,
        model_class,
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        verbose=1,
        tensorboard_log=
        "../pybullet_logs/kuka_reach_ddpg/reaching_DDPG_HER_PHASE",
        buffer_size=1000000,
        batch_size=64,
        random_exploration=0.3,
        action_noise=action_noise)

    # Run summary, printed in red.
    print(colored("-----Timesteps:", "red"))
    print(colored(timesteps, "red"))
    print(colored("-----Number Joints Controlled:", "red"))
    print(colored(numControlledJoints, "red"))
    print(colored("-----Object Position Fixed:", "red"))
    print(colored(fixed, "red"))
    print(colored("-----Policy Name:", "red"))
    print(colored(policy_name, "red"))
    print(colored("------", "red"))
    print(colored("Launch the script with -h for further info", "red"))

    model.learn(total_timesteps=timesteps, log_interval=100, callback=callback)

    # Report the path that is really written (the old message said kuka.pkl).
    save_path = "../pybullet_logs/kukareach_ddpg_her/" + policy_name
    print("Saving model to " + save_path)
    model.save(save_path)
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):
    """
    Train HER + SAC on parallel CausalWorld environments, saving periodically.

    Training runs in chunks of ``validate_every_timesteps``; the model is
    written to ``<log_relative_path>/saved_model`` after every chunk.

    :param num_of_envs: number of sub-process environments
    :param log_relative_path: output directory (created here)
    :param maximum_episode_length: forwarded to ``CausalWorld``
    :param skip_frame: forwarded to ``CausalWorld``
    :param seed_num: base seed; environment ``rank`` uses ``seed_num + rank``
    :param sac_config: extra keyword arguments forwarded to ``HER``
    :param total_time_steps: total training budget
    :param validate_every_timesteps: size of one training chunk
    :param task_name: task generator id passed to ``generate_task``
    """

    def _make_env(rank):
        """Return a thunk that builds the environment for worker ``rank``."""

        def _init():
            task = generate_task(task_generator_id=task_name)
            world = CausalWorld(task=task,
                                skip_frame=skip_frame,
                                enable_visualization=False,
                                seed=seed_num + rank,
                                max_episode_length=maximum_episode_length)
            return HERGoalEnvWrapper(world)

        set_global_seeds(seed_num)
        return _init

    os.makedirs(log_relative_path)
    env_fns = [_make_env(rank=i) for i in range(num_of_envs)]
    env = SubprocVecEnv(env_fns)
    model = HER('MlpPolicy',
                env,
                SAC,
                verbose=1,
                policy_kwargs=dict(layers=[256, 256, 256]),
                **sac_config)
    save_config_file(sac_config,
                     _make_env(0)(),
                     os.path.join(log_relative_path, 'config.json'))

    n_chunks = int(total_time_steps / validate_every_timesteps)
    for _ in range(n_chunks):
        model.learn(total_timesteps=validate_every_timesteps,
                    tb_log_name="sac",
                    reset_num_timesteps=False)
        model.save(os.path.join(log_relative_path, 'saved_model'))
class HERSACAgent(Agent):
    """Agent backed by a HER model that wraps SAC."""

    name = "her-sac"

    def __init__(self,
                 env: ISettableGoalEnv,
                 verbose=1,
                 rank=0,
                 experiment_name="her-sac"):
        """
        Load the best saved model if one exists, otherwise build a new one.

        :param env: environment the model is attached to
        :param verbose: verbosity of a newly built model
        :param rank: forwarded to ``Dirs``
        :param experiment_name: suffix of the experiment directory name; the
            prefix is the class name of ``env``
        """
        self._env = env
        self._dirs = Dirs(
            experiment_name=f"{type(env).__name__}-{experiment_name}",
            rank=rank)
        options = {
            "env": env,
            "tensorboard_log": self._dirs.tensorboard,
            "model_class": SAC,
            "gamma": 1,
            "learning_rate": 3e-3
        }
        if os.path.isdir(self._dirs.models) and os.path.isfile(
                self._dirs.best_model):
            self._model = HER.load(load_path=self._dirs.best_model, **options)
            print(f"Loaded model {self._dirs.best_model}")
        else:
            self._model = HER(policy="MlpPolicy", verbose=verbose, **options)

    def __call__(self, obs: Observation) -> np.ndarray:
        """Return the deterministic action of the model for ``obs``."""
        action, _ = self._model.predict(obs, deterministic=True)
        return action

    def train(self,
              timesteps: int,
              callbacks: Sequence[BaseCallback] = None,
              num_checkpoints=4) -> None:
        """
        Train for ``timesteps`` steps, checkpointing along the way.

        :param timesteps: total number of training timesteps
        :param callbacks: extra callbacks run after the checkpoint callback
        :param num_checkpoints: number of checkpoints spread over the run
        """
        callbacks = [] if callbacks is None else callbacks
        # Never pass a save frequency of 0 (timesteps < num_checkpoints):
        # the checkpoint callback would have nothing valid to divide by.
        save_freq = max(1, timesteps // num_checkpoints)
        cb = CheckpointCallback(save_freq=save_freq,
                                save_path=self._dirs.models,
                                name_prefix=self._dirs.prefix)
        self._model.learn(total_timesteps=timesteps,
                          callback=CallbackList([cb, *callbacks]))
def main(
        training_env: PSMCartesianHERDDPGEnv,
        eval_env: PSMCartesianHERDDPGEnv = None,
        log_dir='./.logs/results'
):
    """
    Train DDPG + HER on ``training_env`` and save it as ``./her_robot_env``.

    :param training_env: environment used for training
    :param eval_env: currently unused (the evaluation callback is disabled)
    :param log_dir: directory created up front for results
    """
    os.makedirs(log_dir, exist_ok=True)

    # training_env = Monitor(training_env, log_dir)

    # Currently using OU noise
    noise_std = 0.2
    n_actions = training_env.action_space.shape[0]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=noise_std * np.ones(n_actions)
    )

    model_class = DDPG  # works also with SAC, DDPG and TD3

    rl_model_kwargs = dict([
        ('actor_lr', 1e-3),
        ('critic_lr', 1e-3),
        ('action_noise', action_noise),
        ('nb_train_steps', 300),
        ('nb_rollout_steps', 100),
        ('gamma', 0.95),
        ('observation_range', (-1.5, 1.5)),
        ('random_exploration', 0.05),
        ('normalize_observations', True),
        ('critic_l2_reg', 0.01),
    ])

    # Available strategies (cf paper): future, final, episode, random
    model = HER(
        'MlpPolicy',
        training_env,
        model_class,
        verbose=1,
        n_sampled_goal=4,
        goal_selection_strategy='future',
        buffer_size=int(1e5),
        batch_size=128,
        tensorboard_log="./ddpg_dvrk_tensorboard/",
        **rl_model_kwargs
    )

    # Reset the model
    training_env.reset()

    # Create callbacks
    checkpoint_callback = CheckpointCallback(
        save_freq=100000,
        save_path="./ddpg_dvrk_tensorboard/"
    )  # save_path="./.model/model_checkpoint/") #save_freq=100000
    # eval_callback = EvalCallback(training_env, best_model_save_path='./ddpg_dvrk_tensorboard/best_model',
    #                             log_path=log_dir, eval_freq=500)
    callback = CallbackList([checkpoint_callback])  # , eval_callback])

    # Train the model
    model.learn(4000000, log_interval=100, callback=callback)
    model.save("./her_robot_env")
def __init__(self,
             env: ISettableGoalEnv,
             verbose=1,
             rank=0,
             experiment_name="her-sac"):
    """
    Load the best saved HER/SAC model if present, otherwise build a new one.

    :param env: environment the model is attached to
    :param verbose: verbosity of a newly built model
    :param rank: forwarded to ``Dirs``
    :param experiment_name: suffix of the experiment directory name; the
        prefix is the class name of ``env``
    """
    self._env = env
    full_name = f"{type(env).__name__}-{experiment_name}"
    self._dirs = Dirs(experiment_name=full_name, rank=rank)

    options = dict(
        env=env,
        tensorboard_log=self._dirs.tensorboard,
        model_class=SAC,
        gamma=1,
        learning_rate=3e-3,
    )

    has_saved_model = (os.path.isdir(self._dirs.models)
                       and os.path.isfile(self._dirs.best_model))
    if not has_saved_model:
        self._model = HER(policy="MlpPolicy", verbose=verbose, **options)
        return

    self._model = HER.load(load_path=self._dirs.best_model, **options)
    print(f"Loaded model {self._dirs.best_model}")
def train_curriculum(self, env_name="Merging-v0"):
    """
    Trains reward curriculum

    For every lesson and seed (201–205) a pre-trained model is loaded,
    attached to a fresh environment configured with ``self.bs`` and the
    'left' homotopy class, reseeded and handed to ``train``.

    :param env_name: gym id of the single lesson in the curriculum
    :raises ValueError: if ``self.expt_type`` is neither "ours" nor
        "finetune" (previously this surfaced later as an unbound
        ``model_info``)
    """
    self.curriculum = [env_name]

    # Fail early and clearly instead of hitting an unbound local below.
    if self.expt_type not in ("ours", "finetune"):
        raise ValueError(
            f"Unknown expt_type {self.expt_type!r}; "
            "expected 'ours' or 'finetune'")

    bs2model_ours = {1: B1R_B0L, 3: B3R_B0L, 5: B5R_B0L2, 7: B7R_B0L_B4L1}
    bs2model = {1: B1R, 3: B3R, 5: B5R, 7: B7R}

    for l, lesson in enumerate(self.curriculum):
        for seed in [201, 202, 203, 204, 205]:
            if self.expt_type == "ours":
                model_info = bs2model_ours[int(self.bs)]
            else:  # "finetune", guaranteed by the check above
                model_info = bs2model[int(self.bs)]
            model_dir = os.path.join(model_info[0], model_info[1],
                                     model_info[2])

            if self.model_type == "PPO":
                self.model = PPO2.load(
                    model_dir)  # loads pre-trained model
            elif self.model_type == "HER":
                self.model = HER.load(model_dir)  # loads pre-trained model

            print(f"\ntraining on {lesson}, bs {self.bs}, seed{seed}")
            self.seed = seed
            self.experiment_name = f"{self.bs}_{self.expt_type}_{seed}"
            print("EXPT NAME: ", self.experiment_dir1, self.experiment_name)
            self.experiment_dir = os.path.join(self.experiment_dir1,
                                               self.experiment_name)
            self.create_eval_dir()

            env = gym.make(lesson)
            eval_env = gym.make(lesson)
            env._set_barrier_size(self.bs)
            env._set_homotopy_class('left')
            eval_env._set_barrier_size(self.bs)
            eval_env._set_homotopy_class('left')

            if self.model_type == "HER":
                env = HERGoalEnvWrapper(env)
                eval_env = HERGoalEnvWrapper(eval_env)
                print("bs: ", env.env.barrier_size)
                print("hc: ", env.env.homotopy_class)
            else:
                env = DummyVecEnv([lambda: env])

            self.model.set_env(env)
            self.model.set_random_seed(self.seed)

            ### ENTROPY###
            #self.model.ent_coef = 0.05

            self.model = train(self.model, eval_env, self.timesteps,
                               self.experiment_dir, self.is_save,
                               self.eval_save_period, self.rets_path, l)
def main():
    """
    Train a DDPG + HER pushing policy from scratch and save it.

    The environment is ``pandaPushGymEnvHERRand`` with a fixed object
    position, joint control (``useIK=0``) and velocity observations.
    """
    model_class = DDPG
    action_space = 7
    fixed = True
    timesteps = 1000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 action_space=action_space,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    # Wrap the model
    model = HER(
        CustomPolicy,
        env,
        model_class,
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        verbose=1,
        tensorboard_log=
        "../pybullet_logs/panda_push_ddpg/stable_baselines/DDPG+HER_FIXED_DYN_RAND",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise)

    # Train the freshly built model (no previous policy is loaded here)
    model.learn(timesteps)
    print("Saving Policy")
    model.save("../policies/pushing_fixed_HER_Dyn_Rand")
def __init__(self, env_id, exp_id, model_path, trajectory_type,
             episode_timesteps, noise_parameters):
    """
    Load the HER model and environment and prepare trajectory bookkeeping.

    :param env_id: gym id of the environment to create
    :param exp_id: experiment identifier, stored as-is
    :param model_path: path of the saved HER model
    :param trajectory_type: "line" or "circle"; any other value leaves the
        trajectory-specific attributes unset
    :param episode_timesteps: stored as-is
    :param noise_parameters: forwarded to ``gym.make`` as a keyword argument
    """
    self.env_id = env_id
    self.exp_id = exp_id
    self.trajectory_type = trajectory_type

    # Load model and environment
    raw_env = gym.make(env_id, noise_parameters=noise_parameters)
    self.env = HERGoalEnvWrapper(raw_env)
    self.model = HER.load(model_path, env=self.env)
    self.episode_timesteps = episode_timesteps

    # Setup subscriber for trajectory generator
    # self.line_trajectory_timer = rospy.Timer(rospy.Duration(0.1), self.line_trajectory_callback)
    # self.circle_trajectory_timer = rospy.Timer(rospy.Duration(0.01), self.circle_trajectory_callback)

    if self.trajectory_type == "line":
        # Line trajectory settings
        self.start_p = np.array([20, 0, 100]) / 1000
        self.finish_p = np.array([20, 40, 100]) / 1000
        self.del_p = self.finish_p - self.start_p
        self.current_goal = self.start_p
    elif self.trajectory_type == "circle":
        # Circle trajectory settings
        self.offset = np.array([20, 20, 100]) / 1000
        self.radius = 20.0 / 1000
        self.thetas = np.arange(0, 2 * np.pi, np.deg2rad(5))
        self.thetas_counter = 0
        self.start_p = self.offset
        self.current_goal = self.start_p

    # Start timer
    self.prev_time = rospy.get_time()

    # Complete trajectory check
    shape_columns = ['episode', 'timestep']
    shape_columns += [
        'r1x', 'r1y', 'r1z', 'r2x', 'r2y', 'r2z', 'r3x', 'r3y', 'r3z'
    ]
    self.shape_df = pd.DataFrame(columns=shape_columns)
    # self.goals_df = pd.DataFrame(columns=['ag_x', 'ag_y', 'ag_z', 'dg_x', 'dg_y', 'dg_z'])
    self.traj_complete = False
    self.achieved_goals = np.array([])
    self.desired_goals = np.array([])
    self.episode_count = 0
def load_model(model_info, model_type="PPO", baseline=None, pkl_file=None):
    """
    Load a pre-trained PPO2 or HER model, optionally as a transfer baseline.

    :param model_info: sequence whose first three items are joined into the
        model path; the third item is also reused in baseline output paths
    :param model_type: "PPO" or "HER"
    :param baseline: None for the plain model, or one of "L2SP", "PNN",
        "BSS" to load the matching baseline variant
    :param pkl_file: unused
    :return: the loaded model
    :raises NotImplementedError: for ``model_type="HER"`` with
        ``baseline="BSS"``, which has no loader
    :raises ValueError: for an unknown ``model_type``
    """
    model_dir = os.path.join(model_info[0], model_info[1], model_info[2])

    if model_type == "PPO":
        if baseline == 'L2SP':
            from baselines.L2SP.model import PPO2L2SP
            import baselines.L2SP.utils as L2SP_utils
            _, params = L2SP_utils.load_from_file(model_dir)
            model = PPO2L2SP.load(model_dir, original_params=params)
        elif baseline == 'PNN':
            from baselines.PNN.utils import looseload, resave_params_for_PPN
            output_dir = os.path.join(
                "output/updated_gridworld_continuous_PNN", 'resave',
                model_info[2])
            resave_params_for_PPN(model_dir, output_dir)
            model = looseload(PPO2, output_dir)
        elif baseline == 'BSS':
            from baselines.BSS.utils import resave_params_for_BSS
            from baselines.BSS.model import PPO2BSS
            output_dir = os.path.join(
                "output/updated_gridworld_continuous_BSS", 'resave',
                model_info[2])
            resave_params_for_BSS(model_dir, output_dir)
            model = PPO2BSS.load(output_dir, bss_coef=0.001, l2_coef=0.0005)
        else:
            model = PPO2.load(model_dir)
    elif model_type == "HER":
        if baseline == 'L2SP':
            from baselines_fetch.L2SP.model import HER2L2SP
            import baselines_fetch.L2SP.utils as L2SP_utils
            _, params = L2SP_utils.load_from_file(model_dir)
            model = HER2L2SP.load(model_dir, original_params=params)
        elif baseline == "PNN":
            from baselines_fetch.PNN.model import HER2PNN
            from baselines_fetch.PNN.utils import resave_params_for_PNN
            output_dir = os.path.join("output/fetch_PNN", 'resave',
                                      model_info[2])
            resave_params_for_PNN(model_dir, output_dir)
            model = HER2PNN.load(output_dir)
        elif baseline == "BSS":
            # This branch used to `pass` and then fail on `return model`
            # with an unbound local; say what is actually wrong instead.
            raise NotImplementedError(
                "The BSS baseline is not implemented for HER models")
        else:
            model = HER.load(model_dir)
    else:
        raise ValueError(
            "Unknown model_type {!r}; expected 'PPO' or 'HER'".format(
                model_type))

    return model
def heralgorithm():
    """Train HER + DDPG on ``env1`` for 1000 steps and save the model."""
    # Wrap the model
    model = HER(
        'MlpPolicy',
        env1,
        DDPG,
        n_sampled_goal=4,
        goal_selection_strategy='future',  # equivalent to GoalSelectionStrategy.FUTURE
        verbose=1,
    )

    # Train the model
    model.learn(1000)
    model.save("./her_bit_env")
def train_HER(env_train, model_name, timesteps=50000):
    """
    Train a HER + SAC model, save it and report the training time.

    :param env_train: training environment
    :param model_name: file name used under ``config.TRAINED_MODEL_DIR``
    :param timesteps: number of training timesteps
    :return: the trained model
    """
    started_at = time.time()

    her_settings = {
        'n_sampled_goal': 4,
        'goal_selection_strategy': 'future',
    }
    model = HER('MlpPolicy', env_train, model_class=SAC, verbose=0,
                **her_settings)
    model.learn(total_timesteps=timesteps)

    finished_at = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (HER): ', (finished_at - started_at) / 60, ' minutes')
    return model
def main():
    """
    Evaluate a saved HER grasping policy for 50 episodes and print averages.

    The policy is loaded from ``logs/rl_model_1000000_steps.zip`` and run
    deterministically on a rendered, sparse-reward ``PandaGraspGymEnv``.
    """
    panda_env = PandaGraspGymEnv(
        urdf_root=object_data.getDataPath(),
        is_rendering=True,
        use_ik=True,
        is_discrete=True,
        num_controlled_joints=7,
        reward_type="sparse",
    )
    env = HERGoalEnvWrapper(panda_env)

    model = HER.load("logs/rl_model_1000000_steps.zip")

    results = evaluate_policy(model,
                              env,
                              n_eval_episodes=50,
                              render=False,
                              deterministic=True,
                              return_episode_rewards=True)
    episode_rewards, episode_lengths, episode_success = results

    summary = "Final Reward {}, Episode Length{}, Success Rate {}".format(
        np.mean(episode_rewards), np.mean(episode_lengths),
        np.mean(episode_success))
    print(summary)
def main(load_policy=False):
    """
    Train a TD3 + HER pushing policy with dynamics randomisation and save it.

    :param load_policy: when true, set ``log_dir_policy`` and continue from
        the saved fixed-position policy; otherwise start from a freshly
        initialised model. Only the model that is actually trained gets
        built (the original always built a fresh one first and then threw it
        away when loading).
    """
    global log_dir, log_dir_policy
    if load_policy:
        log_dir_policy = '../policies/PUSHING_TD3+HER_FIXED_POSITION_DYN_RAND_FROM_FIXED_PHYSICS'

    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    fixed = True
    timesteps = 1500000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 action_space=action_space,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    # Settings shared by the fresh and the loaded model; only the
    # tensorboard directory differs between the two.
    her_kwargs = dict(
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise,
    )

    if load_policy:
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND_FROM_FIXED_PHYSICS",
            **her_kwargs)
    else:
        # Wrap the model
        model = HER(
            CustomPolicy,
            env,
            model_class,
            verbose=1,
            tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND",
            **her_kwargs)

    # Train the model (starting from the previous policy when load_policy is set)
    model.learn(timesteps, callback=callback)

    model.save("../policies/PUSHING_FIXED_TD3_DYN_RAND")
    print("Finished train1")
def main(load_policy=False):
    """
    Train a TD3 + HER pushing policy (phase 1, IK control) and save it.

    :param load_policy: when true, continue from the saved fixed-position
        policy; otherwise start from a freshly initialised model. Only the
        model that is actually trained gets built (the original always built
        a fresh one first and then threw it away when loading).
    """
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    fixed = True
    # 0 completely fixed, 1 slightly random radius, 2 big random radius
    object_position = 1
    timesteps = 5000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(),
                             renders=rend,
                             useIK=1,
                             isDiscrete=discreteAction,
                             action_space=action_space,
                             fixedPositionObj=fixed,
                             includeVelObs=True,
                             object_position=object_position)
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    # Settings shared by the fresh and the loaded model.
    her_kwargs = dict(
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise,
    )

    if load_policy:
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            **her_kwargs)
    else:
        # Wrap the model
        model = HER(CustomTD3Policy, env, model_class, verbose=1, **her_kwargs)

    model.learn(timesteps, log_interval=100, callback=callback)

    print("Saving Policy PHASE_1")
    model.save("../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK")
def main(env):
    """
    Train DDPG + HER on ``env`` and save the model as ``./her_robot_env``.

    A checkpoint is written to ``./ddpg_dvrk_tensorboard/`` every 100000
    callback steps during the 4000000-step run.

    :param env: training environment
    """
    # Currently using OU noise
    noise_std = 0.2
    n_actions = env.action_space.shape[0]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=noise_std * np.ones(n_actions))

    model_class = DDPG  # works also with SAC, DDPG and TD3

    rl_model_kwargs = dict([
        ('actor_lr', 1e-3),
        ('critic_lr', 1e-3),
        ('action_noise', action_noise),
        ('nb_train_steps', 300),
        ('nb_rollout_steps', 100),
        ('gamma', 0.95),
        ('observation_range', (-1.5, 1.5)),
        ('random_exploration', 0.05),
        ('normalize_observations', True),
        ('critic_l2_reg', 0.01),
    ])

    # Available strategies (cf paper): future, final, episode, random
    model = HER('MlpPolicy',
                env,
                model_class,
                verbose=1,
                n_sampled_goal=4,
                goal_selection_strategy='future',
                buffer_size=int(1e5),
                batch_size=128,
                tensorboard_log="./ddpg_dvrk_tensorboard/",
                **rl_model_kwargs)

    # Reset the model
    env.reset()

    # Train the model
    checkpoints = CheckpointCallback(save_freq=100000,
                                     save_path="./ddpg_dvrk_tensorboard/")
    model.learn(4000000, log_interval=100, callback=checkpoints)
    model.save("./her_robot_env")
def launchAgent():
    """
    Train an image-based agent for 1000 learn/save/reload cycles.

    The model kind is fixed by the local ``model_name`` ("PPO2"). The other
    branches are kept so the constant can be switched to "HER", "DDPG" or
    anything else (which selects DQN).
    """
    import Reinforcement_AI.env.d_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    model_name = "PPO2"

    # One exclusive branch per model kind. With independent `if` statements
    # the trailing `else` would also run for "HER" and "DDPG" and replace
    # the freshly built model with a DQN.
    if model_name == "HER":
        model = HER(
            "CnnPolicy",
            env=image_env.DetailedMiniMapEnv(),
            model_class=DQN
        )
    elif model_name == "DDPG":
        model = DDPG(
            policy="CnnPolicy",
            env=image_env.DDPGImageEnv(),
            normalize_observations=True
        )
    elif model_name == "PPO2":
        # env = image_env.DetailedMiniMapEnv()
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(
            policy="CnnPolicy",
            env=env,
            verbose=1
        )
    else:
        model = DQN(
            "CnnPolicy",  # policy
            env=image_env.DetailedMiniMapEnv(),  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    for i in range(1000):
        if i != 0:
            # Reload the checkpoint saved at the end of the previous cycle.
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())
            elif model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i))
                model.set_env(image_env.DDPGImageEnv())
            elif model_name == "PPO2":
                # print('set env')
                # ppo2_env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
                # print('get model')
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
                # print('set model env')
                # model.set_env(ppo2_env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())

        # print('model learn start')
        model.learn(total_timesteps=3900)
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i+1))
        del model
def test_model_manipulation(model_class, goal_selection_strategy):
    """
    Exercise save / load / set_env / predict round-trips of a HER model.

    Trains on a vectorised bit-flipping env, saves, and then checks that:
    loading with a ``VecNormalize`` env is rejected, predicting without an
    env raises ``ValueError``, prediction works on a ``HERGoalEnvWrapper``
    env, and ``n_sampled_goal`` survives reloading.
    """
    continuous = model_class in [DDPG, SAC, TD3]

    env = BitFlippingEnv(N_BITS, continuous=continuous, max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy', env, model_class,
                n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy,
                verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her.zip')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her.zip', env=VecNormalize(env))

    model = HER.load('./test_her.zip')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = HERGoalEnvWrapper(
        BitFlippingEnv(N_BITS, continuous=continuous, max_steps=N_BITS))

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    # Reload once more, this time attaching a plain (unwrapped) env
    env = BitFlippingEnv(N_BITS, continuous=continuous, max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.zip'):
        os.remove('./test_her.zip')
import time

# Train HER + SAC on FetchPickAndPlace-v1, save the model and reload it.
env = gym.make("FetchPickAndPlace-v1")

goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER(
    'MlpPolicy',
    env,
    SAC,
    n_sampled_goal=4,
    goal_selection_strategy='future',
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=0.001,
    gamma=0.95,
    batch_size=256,
    ent_coef='auto',
    random_exploration=0.3,
    learning_starts=1000,
    train_freq=1,
    policy_kwargs=dict(layers=[256, 256, 256]),
    tensorboard_log="./OpenAI/",
)

# Train the model
model.learn(int(8e6))
model.save("./model2")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./model2', env=env)
# Train HER on the blueprint_construction environment, then roll out the
# reloaded policy for 100 steps.
env_name = 'blueprint_construction'
core_dir = '/Users/abhijithneilabraham/Documents/GitHub/multi-agent-emergence-environments/'
envs_dir = 'mae_envs/envs'
xmls_dir = 'xmls'

env, _ = load_env(
    env_name,
    core_dir=core_dir,
    envs_dir=envs_dir,
    xmls_dir=xmls_dir,
    return_args_remaining=True,
)

# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER(
    'MlpPolicy',
    env,
    model_class,
    n_sampled_goal=4,
    goal_selection_strategy=goal_selection_strategy,
    verbose=1,
)

# Train the model
model.learn(1000)
model.save("./her_bit_env")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./her_bit_env', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import HER, SAC

# Short HER + SAC training run on the REAL robot environment, followed by
# re-creating the environment for rendering.
print('setting up environment')
# env = gym.make("REALRobot2020-R2J3-v0")
env = REALRobotEnv(objects=1)
# Currently this wrapper doesn't really return goals but just sample_placeholder
# to match the HER format.
env = GoalWrapper(env, crop_obs=True)

print('setting up model')
model = HER(
    'MlpPolicy',
    env,
    SAC,
    n_sampled_goal=4,
    goal_selection_strategy='future',
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
    batch_size=256,
)

print('start learning')
model.learn(total_timesteps=256)
print('learning done')

# Here we need to restart the environment to make rendering possible
# (doesn't work with the wrappers right now)
env = REALRobotEnv(objects=1)
env = GoalWrapper(env, crop_obs=True)
env.render("human")

print('display model')