import torch as th

from stable_baselines3 import SAC


def test_save_load_pytorch_var(tmp_path):
    # Default (auto) entropy coefficient: log_ent_coef is a learnable tensor
    model = SAC("MlpPolicy", "Pendulum-v1", seed=3, policy_kwargs=dict(net_arch=[64], n_critics=1))
    model.learn(200)
    save_path = str(tmp_path / "sac_pendulum")
    model.save(save_path)
    env = model.get_env()
    log_ent_coef_before = model.log_ent_coef

    del model

    model = SAC.load(save_path, env=env)
    assert th.allclose(log_ent_coef_before, model.log_ent_coef)
    model.learn(200)
    log_ent_coef_after = model.log_ent_coef
    # Check that the entropy coefficient is still optimized
    assert not th.allclose(log_ent_coef_before, log_ent_coef_after)

    # With a fixed entropy coef
    model = SAC("MlpPolicy", "Pendulum-v1", seed=3, ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1))
    model.learn(200)
    save_path = str(tmp_path / "sac_pendulum")
    model.save(save_path)
    env = model.get_env()
    assert model.log_ent_coef is None
    ent_coef_before = model.ent_coef_tensor

    del model

    model = SAC.load(save_path, env=env)
    assert th.allclose(ent_coef_before, model.ent_coef_tensor)
    model.learn(200)
    ent_coef_after = model.ent_coef_tensor
    assert model.log_ent_coef is None
    # Check that the entropy coefficient is still the same
    assert th.allclose(ent_coef_before, ent_coef_after)
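# A minimal sketch (separate from the test above) of the two entropy-coefficient modes it
# exercises: with the default ent_coef="auto", SAC keeps a learnable log_ent_coef, whereas a
# fixed float is stored in ent_coef_tensor and is never updated during training.
auto_model = SAC("MlpPolicy", "Pendulum-v1", policy_kwargs=dict(net_arch=[64], n_critics=1))
assert auto_model.log_ent_coef is not None

fixed_model = SAC("MlpPolicy", "Pendulum-v1", ent_coef=0.01, policy_kwargs=dict(net_arch=[64], n_critics=1))
assert fixed_model.log_ent_coef is None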
import numpy as np

from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.sac.policies import MlpPolicy


def get_perf(i):
    # Build a model only to obtain a (wrapped) evaluation environment with matching settings
    model = SAC(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=5e3,
        verbose=1,
        create_eval_env=True,
        buffer_size=1000000,
        ent_coef=0.2,
        action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
        seed=42,
    )
    # Load the i-th saved policy and evaluate it on the model's environment
    saved_policy = MlpPolicy.load("Pendulum-v0#test4SAC#custom#None#{}.zip".format(i))
    mean_reward, std_reward = evaluate_policy(saved_policy, model.get_env(), n_eval_episodes=900)
    return mean_reward, std_reward
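# Hypothetical usage of get_perf (the number of saved checkpoints and the print format are
# assumptions, not defined above): evaluate each saved policy and report its score.
for i in range(5):
    mean_reward, std_reward = get_perf(i)
    print(f"run {i}: mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")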
from stable_baselines3 import SAC


def test_train_freq(tmp_path, train_freq):
    # Check that a model saved with a given train_freq can be reloaded and trained further
    model = SAC(
        "MlpPolicy",
        "Pendulum-v1",
        policy_kwargs=dict(net_arch=[64, 64], n_critics=1),
        learning_starts=100,
        buffer_size=10000,
        verbose=1,
        train_freq=train_freq,
    )
    model.learn(total_timesteps=150)
    model.save(tmp_path / "test_save.zip")
    env = model.get_env()
    model = SAC.load(tmp_path / "test_save.zip", env=env)
    model.learn(total_timesteps=150)
    model = SAC.load(tmp_path / "test_save.zip", train_freq=train_freq, env=env)
    model.learn(total_timesteps=150)
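# The train_freq argument above is presumably supplied by a pytest parametrization; a minimal
# sketch (the specific values are an assumption) that attaches such a parametrization after the
# fact, covering both the int form and the (frequency, unit) tuple form accepted by SAC:
import pytest

test_train_freq = pytest.mark.parametrize(
    "train_freq", [1, 4, (4, "step"), (1, "episode")]
)(test_train_freq)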
from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy


def advanced_saving_and_loading_example():
    # Advanced Saving and Loading.
    from stable_baselines3.sac.policies import MlpPolicy

    # Create the model, the training environment and the test environment (for evaluation)
    model = SAC("MlpPolicy", "Pendulum-v1", verbose=1, learning_rate=1e-3, create_eval_env=True)

    # Evaluate the model every 1000 steps on 5 test episodes
    # and save the evaluation to the "logs/" folder
    model.learn(6000, eval_freq=1000, n_eval_episodes=5, eval_log_path="./logs/")

    # Save the model
    model.save("sac_pendulum")

    # The saved model does not contain the replay buffer
    loaded_model = SAC.load("sac_pendulum")
    print(f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer")

    # Now save the replay buffer too
    model.save_replay_buffer("sac_replay_buffer")

    # Load it into the loaded_model
    loaded_model.load_replay_buffer("sac_replay_buffer")

    # Now the loaded replay buffer is not empty anymore
    print(f"The loaded_model has {loaded_model.replay_buffer.size()} transitions in its buffer")

    # Save the policy independently from the model
    # Note: if you don't save the complete model with `model.save()`,
    # you cannot continue training afterward
    policy = model.policy
    policy.save("sac_policy_pendulum")

    # Retrieve the environment
    env = model.get_env()
    # Evaluate the policy
    mean_reward, std_reward = evaluate_policy(policy, env, n_eval_episodes=10, deterministic=True)
    print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

    # Load the policy independently from the model
    saved_policy = MlpPolicy.load("sac_policy_pendulum")

    # Evaluate the loaded policy
    mean_reward, std_reward = evaluate_policy(saved_policy, env, n_eval_episodes=10, deterministic=True)
    print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")
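# A short follow-up sketch: because the full model and its replay buffer are both saved above,
# training can later be resumed from the loaded model (a bare policy saved on its own cannot be
# trained further). The function name and timestep count here are illustrative, not from the
# original example.
def resume_training_example():
    import gym

    model = SAC.load("sac_pendulum", env=gym.make("Pendulum-v1"))
    model.load_replay_buffer("sac_replay_buffer")
    # Keep the internal timestep counter instead of restarting it from zero
    model.learn(1_000, reset_num_timesteps=False)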
import gym
import numpy as np

from stable_baselines3 import SAC
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.sac.policies import MlpPolicy

if __name__ == '__main__':
    env_id = 'gym_spm:spm-v0'
    num_cpu = 4  # Number of processes to use

    env = gym.make(env_id)

    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=.75 * np.ones(n_actions))

    # model = SAC(MlpPolicy, env, action_noise=action_noise, verbose=1)
    model = SAC(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)
    # model.load('DDPG_test_2_SOC_point5_two_states')

    mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
    print("Mean Reward = ", mean_reward)

    epsi_sp_list = []
    action_list = []
    soc_list = []
    Concentration_list = []
    Concentration_list1 = []

    obs = env.reset()
    for _ in range(3600):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, done, info = env.step(action)
        # Reset at the end of an episode so that stepping can continue safely
        if done:
            obs = env.reset()
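# num_cpu is defined in the script above but never used; if the intent was to collect experience
# from several environment copies in parallel, a sketch along the lines below could replace the
# single gym.make call. The helper names come from stable-baselines3 itself; the function name is
# illustrative, and whether the custom gym_spm environment works in subprocess workers (and the
# installed SB3 version supports multiple envs for off-policy algorithms) are assumptions.
def make_parallel_sac(env_id='gym_spm:spm-v0', n_envs=4):
    from stable_baselines3.common.env_util import make_vec_env
    from stable_baselines3.common.vec_env import SubprocVecEnv

    vec_env = make_vec_env(env_id, n_envs=n_envs, vec_env_cls=SubprocVecEnv)
    return SAC(MlpPolicy, vec_env, verbose=1)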