import gym
import pprint as pp

from stable_baselines3 import DDPG, HER


def Main():
    # Define arguments for HER
    # 'ur5e_reacher-v1' is a custom UR5e reaching env that must be registered with gym beforehand
    env_id = 'ur5e_reacher-v1'
    model_class = DDPG
    goal_selection_strategy = 'future'
    env = gym.make(env_id)

    # Define kwargs to be passed to HER and the wrapped algorithm
    kwargs = {
        # "n_timesteps": 10000,
        "policy": 'MlpPolicy',
        "model_class": model_class,
        "n_sampled_goal": 4,
        "goal_selection_strategy": goal_selection_strategy,
        "buffer_size": 1000000,
        # "ent_coef": 'auto',
        "batch_size": 256,
        "gamma": 0.95,
        "learning_rate": 0.001,
        "learning_starts": 1000,
        "online_sampling": True,
        # "normalize": True
    }
    # In the future, read hyperparameters from her.yml
    # kwargs = read_hyperparameters(env_id)
    model = HER(env=env, **kwargs)

    total_n_steps = 1e6
    save_freq = total_n_steps // 10
    max_episode_length = 4000
    n_episodes = total_n_steps // max_episode_length

    model.learn(4000)
    model.save("./her_ur5e_model/model_3")
    model = HER.load('./her_ur5e_model/model_3', env=env)

    all_cumulative_rewards = []
    num_episodes = 5
    num_timesteps = 4800
    env.render()  # each simulation timestep lasts 1/240 s

    for episode in range(num_episodes):
        obs = env.reset()
        epi_rewards = []
        for t in range(num_timesteps):
            action, _ = model.predict(obs)
            obs, reward, done, info = env.step(action)
            # time.sleep(1 / 240)
            epi_rewards.append(reward)
            if t == num_timesteps - 1:
                done = True
            if done:
                # pp.pprint(info)
                obs = env.reset()
        cumulative_reward = sum(epi_rewards)
        all_cumulative_rewards.append(cumulative_reward)
        print("episode {} | cumulative reward : {}".format(episode, cumulative_reward))

    print("all_cumulative_rewards: ")
    pp.pprint(all_cumulative_rewards)
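# A possible shape for the read_hyperparameters() helper referenced in the
# commented-out line above -- a minimal sketch, assuming her.yml follows the
# rl-baselines3-zoo convention of one top-level key per env id. The function
# name, file name, and layout are assumptions, not an existing API.
import yaml


def read_hyperparameters(env_id, config_path="her.yml"):
    """Return the kwargs dict stored under `env_id` in a YAML hyperparameter file."""
    with open(config_path) as f:
        all_hyperparams = yaml.safe_load(f)
    hyperparams = all_hyperparams[env_id]
    # Entries that are not plain YAML scalars (e.g. model_class) still have to be
    # resolved in code, for example by mapping the string "DDPG" to the DDPG class.
    return hyperparams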
import os
from copy import deepcopy

import numpy as np
import pytest
import torch as th

from stable_baselines3 import DDPG, DQN, HER, SAC, TD3
from stable_baselines3.common.envs import BitFlippingEnv
from stable_baselines3.common.vec_env.obs_dict_wrapper import ObsDictWrapper


# The parametrization values below are assumed; adjust them to the project's test matrix.
@pytest.mark.parametrize("model_class", [SAC, TD3, DDPG, DQN])
@pytest.mark.parametrize("use_sde", [False, True])
@pytest.mark.parametrize("online_sampling", [False, True])
def test_save_load(tmp_path, model_class, use_sde, online_sampling):
    """
    Test if 'save' and 'load' save and load the model correctly.
    """
    if use_sde and model_class != SAC:
        pytest.skip("Only SAC has gSDE support")

    n_bits = 4
    env = BitFlippingEnv(n_bits=n_bits, continuous=not (model_class == DQN))

    kwargs = dict(use_sde=True) if use_sde else {}

    # Create the model
    model = HER(
        "MlpPolicy",
        env,
        model_class,
        n_sampled_goal=5,
        goal_selection_strategy="future",
        online_sampling=online_sampling,
        verbose=0,
        tau=0.05,
        batch_size=128,
        learning_rate=0.001,
        policy_kwargs=dict(net_arch=[64]),
        buffer_size=int(1e6),
        gamma=0.98,
        gradient_steps=1,
        train_freq=4,
        learning_starts=100,
        max_episode_length=n_bits,
        **kwargs
    )

    model.learn(total_timesteps=300)

    env.reset()
    observations_list = []
    for _ in range(10):
        obs = env.step(env.action_space.sample())[0]
        observation = ObsDictWrapper.convert_dict(obs)
        observations_list.append(observation)
    observations = np.array(observations_list)

    # Get dictionary of current parameters
    params = deepcopy(model.policy.state_dict())

    # Modify all parameters to be random values
    random_params = dict((param_name, th.rand_like(param)) for param_name, param in params.items())

    # Update model parameters with the new random values
    model.policy.load_state_dict(random_params)

    new_params = model.policy.state_dict()
    # Check that all params are different now
    for k in params:
        assert not th.allclose(params[k], new_params[k]), "Parameters did not change as expected."

    params = new_params

    # Get selected actions
    selected_actions, _ = model.predict(observations, deterministic=True)

    # Save and delete the model
    model.save(tmp_path / "test_save.zip")
    del model

    model = HER.load(str(tmp_path / "test_save.zip"), env=env)

    # Check that all params are the same as before the save/load procedure
    new_params = model.policy.state_dict()
    for key in params:
        assert th.allclose(params[key], new_params[key]), "Model parameters not the same after save and load."

    # Check that the model still selects the same actions
    new_selected_actions, _ = model.predict(observations, deterministic=True)
    assert np.allclose(selected_actions, new_selected_actions, 1e-4)

    # Check that learn still works
    model.learn(total_timesteps=300)

    # Test that changing parameters at load time works
    model = HER.load(str(tmp_path / "test_save.zip"), env=env, verbose=3, learning_rate=2.0)
    assert model.model.learning_rate == 2.0
    assert model.verbose == 3

    # Clean up the saved file
    os.remove(tmp_path / "test_save.zip")
import gym
import highway_env  # noqa: F401 -- registers the goal-based parking env

from stable_baselines3 import HER, SAC

# Goal-conditioned parking task; the env id and the leading HER arguments are
# assumed, with SAC inferred from the saved model name "her_sac_highway".
env = gym.make("parking-v0")

# SAC hyperparameters
model = HER(
    "MlpPolicy",
    env,
    SAC,
    n_sampled_goal=4,
    goal_selection_strategy="future",
    # we have to manually specify the max number of steps per episode
    max_episode_length=100,
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
    batch_size=256,
    online_sampling=True,
    policy_kwargs=dict(net_arch=[256, 256, 256]),
)

model.learn(int(2e5))
model.save("her_sac_highway")

# Load the saved model
model = HER.load("her_sac_highway", env=env)

obs = env.reset()

# Evaluate the agent
episode_reward = 0
for _ in range(100):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    if done or info.get("is_success", False):
        print("Reward:", episode_reward, "Success?", info.get("is_success", False))
        episode_reward = 0.0
        obs = env.reset()
import time

from stable_baselines3 import DDPG, HER


class DDPG_HER:
    def __init__(self, env, model_class=DDPG):
        self.model_class = model_class  # works also with SAC and TD3
        self.env = env
        # Available strategies (cf. paper): future, final, episode, random
        self.goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
        self.model = HER(
            'MlpPolicy',
            self.env,
            self.model_class,
            n_sampled_goal=4,
            goal_selection_strategy=self.goal_selection_strategy,
            buffer_size=1000000,
            batch_size=256,
            gamma=0.95,
            learning_rate=1e-3,
            verbose=1,
            max_episode_length=50,
        )

    def run(self, train_epochs=5000, train=False):
        # print("np.array(obs).shape: ", obs.shape)
        print("observation_space: ", self.env.observation_space)

        # Train the model
        if train:
            # 1000 epochs is approximately 50,000 time steps
            self.model.learn(total_timesteps=(50 * train_epochs))
            self.model.save("./her_bit_env")

        # WARNING: you must pass an env
        # or wrap your environment with HERGoalEnvWrapper to use the predict method
        self.model = HER.load('./her_bit_env_new', env=self.env)

        obs = self.env.get_observation_simulated()
        for i in range(1):
            obs = self.env.reset()
            score = 0
            self.env.success_history.append(False)
            start = time.time()
            for j in range(1000):
                # obs needs simulated coords
                action, _ = self.model.predict(obs)
                obs, reward, done, info = self.env.step(action)
                score += reward
                if j != 49:
                    self.env.success_history[-1] = done
                print("Distance history: ", self.env.distance_history[-1])
                print("Success history: ", self.env.success_history[-1])
                if done:
                    end = time.time()
                    self.env.time_history.append(end - start)
                    break
                time.sleep(1)

            print("epoch: ", j)
            if j != 0:
                print("score:", score, "average score:", score / j)
            print("self.env.success_history[-1]: ", self.env.success_history[-1])
            print("success rate: ",
                  self.env.success_history.count(True) / len(self.env.success_history))

        return self.env.success_history, self.env.distance_history, self.env.time_history
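# A minimal usage sketch for the DDPG_HER wrapper above, assuming a custom
# goal-based environment class (Ur5eReacherEnv is a hypothetical name) that
# exposes the project-specific attributes run() relies on: success_history,
# distance_history, time_history and get_observation_simulated(). Note that
# run() loads './her_bit_env_new', so a trained model must already exist at
# that path (or the load path should be changed to match the save path).
from stable_baselines3 import DDPG

from ur5e_env import Ur5eReacherEnv  # hypothetical module and class


if __name__ == "__main__":
    env = Ur5eReacherEnv()
    agent = DDPG_HER(env, model_class=DDPG)
    # Train first (saves to ./her_bit_env), then run the evaluation loop
    success_history, distance_history, time_history = agent.run(train_epochs=5000, train=True)
    print("final success rate:",
          success_history.count(True) / max(len(success_history), 1))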