import os

import pytest

from stable_baselines import DDPG, HER, SAC
from stable_baselines.common.bit_flipping_env import BitFlippingEnv
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines.her import HERGoalEnvWrapper

N_BITS = 10  # module-level constant in the full test file


def test_model_manipulation(model_class, goal_selection_strategy):
    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy', env, model_class, n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy, verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her', env=VecNormalize(env))

    model = HER.load('./test_her')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    env_ = HERGoalEnvWrapper(env_)

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC], max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.pkl'):
        os.remove('./test_her.pkl')
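# `model_predict` is called above but not defined in this excerpt; below is a
# minimal sketch of what such a helper could look like, inferred from the call
# sites (the behaviour here is an assumption, not the actual test helper):
import numpy as np


def model_predict(model, env, n_steps, additional_check=None):
    obs = env.reset()
    for _ in range(n_steps):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        if additional_check is not None:
            additional_check(obs, action, reward, done)
        # a DummyVecEnv resets itself and returns `done` as an array;
        # a plain HERGoalEnvWrapper env must be reset manually
        if not isinstance(done, np.ndarray) and done:
            obs = env.reset()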
import os
from typing import Sequence

import numpy as np
from stable_baselines import HER, SAC
from stable_baselines.common.callbacks import BaseCallback, CallbackList, CheckpointCallback

# Agent, ISettableGoalEnv, Observation and Dirs are project-local helpers.


class HERSACAgent(Agent):
    name = "her-sac"

    def __init__(self, env: ISettableGoalEnv, verbose=1, rank=0, experiment_name="her-sac"):
        self._env = env
        self._dirs = Dirs(experiment_name=f"{type(env).__name__}-{experiment_name}", rank=rank)
        options = {
            "env": env,
            "tensorboard_log": self._dirs.tensorboard,
            "model_class": SAC,
            "gamma": 1,
            "learning_rate": 3e-3
        }
        if os.path.isdir(self._dirs.models) and os.path.isfile(self._dirs.best_model):
            self._model = HER.load(load_path=self._dirs.best_model, **options)
            print(f"Loaded model {self._dirs.best_model}")
        else:
            self._model = HER(policy="MlpPolicy", verbose=verbose, **options)

    def __call__(self, obs: Observation) -> np.ndarray:
        action, _ = self._model.predict(obs, deterministic=True)
        return action

    def train(self, timesteps: int, callbacks: Sequence[BaseCallback] = None,
              num_checkpoints=4) -> None:
        callbacks = [] if callbacks is None else callbacks
        cb = CheckpointCallback(save_freq=timesteps // num_checkpoints,
                                save_path=self._dirs.models,
                                name_prefix=self._dirs.prefix)
        self._model.learn(total_timesteps=timesteps,
                          callback=CallbackList([cb, *callbacks]))
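# A usage sketch for the agent above, assuming some concrete ISettableGoalEnv
# implementation (`MyGoalEnv` is a hypothetical placeholder):
env = MyGoalEnv()
agent = HERSACAgent(env, verbose=1)
agent.train(timesteps=100000)   # saves 4 checkpoints via CheckpointCallback
obs = env.reset()
action = agent(obs)             # deterministic action from the trained policy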
# The original snippet starts mid-call: the constructor head below is assumed
# (ent_coef is a SAC argument, so SAC is the likely wrapped model class);
# `env` is a goal-based environment created earlier.
model = HER('MlpPolicy', env, SAC,
            ent_coef='auto',
            random_exploration=0.3,
            learning_starts=1000,
            train_freq=1,
            policy_kwargs=dict(layers=[256, 256, 256]),
            tensorboard_log="./OpenAI/")

# Train the model
model.learn(int(8e6))
model.save("./model2")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./model2', env=env)

obs = env.reset()
episodes = 0
successes = 0
step = 0
while episodes < 50:
    step += 1
    action, _ = model.predict(obs)
    obs, reward, done, info = env.step(action)
    env.render()
    if done or step > 1000:
        obs = env.reset()
        step = 0  # start counting the next episode from zero
        episodes += 1
        if info['is_success']:
            successes += 1

print('success_rate = ' + str(successes / episodes))
print('setting up model')
# `env` is the wrapped REALRobot environment created earlier in the script
model = HER('MlpPolicy', env, SAC,
            n_sampled_goal=4,
            goal_selection_strategy='future',
            verbose=1,
            buffer_size=int(1e6),
            learning_rate=1e-3,
            gamma=0.95,
            batch_size=256)

print('start learning')
model.learn(total_timesteps=256)
print('learning done')

# Here we need to restart the environment to make rendering possible
# (doesn't work with the wrappers right now)
env = REALRobotEnv(objects=1)
env = GoalWrapper(env, crop_obs=True)
env.render("human")

print('display model')
observation = env.reset()
action = env.action_space.sample()
reward, done = 0, False
for t in range(100):
    model_action, _ = model.predict(observation)
    observation, reward, done, info = env.step(model_action)
    # print(model_action)
import gym
import time

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, DQN, HER, DDPG
import synergyenvs

env = gym.make("GraspBoxPybullet-v0")
env.render()
o = env.reset()

# model = PPO2(MlpPolicy, env, verbose=1)
model = HER('MlpPolicy', env, DDPG, n_sampled_goal=4, verbose=0)
# HER.load is a classmethod that returns a new model; calling .load()
# on the instance above would not update it in place
model = HER.load("./her_graspbox-1", env=env)

env.camera_adjust()

for _ in range(10000):
    env.render()
    action, _states = model.predict(o)
    # action = env.action_space.sample()
    o, r, done, info = env.step(action)
    print(o, r, done, info)
    if done:
        o = env.reset()
    time.sleep(0.1)

env.close()
import gym

from stable_baselines import DDPG, HER, TD3, SAC

env = gym.make('FetchReach-v1')

# model = DDPG('MlpPolicy', env)
model = HER('MlpPolicy', env, DDPG, goal_selection_strategy='final', n_sampled_goal=4)
model.learn(50000000)
model.save('./her_fetch_reach')
# model = HER.load('./her_fetch_reach', env=env)

for _ in range(100):
    obs = env.reset()
    state = None
    done = False
    _reward = 0
    while not done:
        env.render()
        action, state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        _reward += reward
    print("Reward = {}".format(_reward))

env.close()
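# An evaluation sketch in the spirit of the success-rate loop in the earlier
# snippet: count info['is_success'] over the rollouts instead of only printing
# the summed reward (assumes the trained `model` and `env` from above).
successes = 0
n_episodes = 100
for _ in range(n_episodes):
    obs, done, info = env.reset(), False, {}
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
    successes += int(info.get('is_success', False))
print("success_rate = {}".format(successes / n_episodes))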
### Run the learning process
model.learn(total_timesteps=400000, log_interval=1, reset_num_timesteps=False)

# Save the model if desired
# model.save("acrobot_her_baseline.pkl")

### Enjoy a trained agent

# duration of the simulations in seconds
t_end = 20

# Desired goal
desired_goal = 0.95 * env.env._tipPosZMax  # As difficult as possible

# Run the simulation in real-time
env.reset()
env.env.goal[0] = desired_goal
obs = env.env._get_obs()
episode_reward = 0
for _ in range(int(t_end / env.dt)):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    time.sleep(env.dt)
    episode_reward += reward
    if done or info.get('is_success', False):
        print("Reward:", episode_reward, "Success:", info.get('is_success', False))
        break