def main():
    """Train and/or record a PPO agent on ``env_name``.

    Behaviour is driven by the module-level flags ``StartFresh`` (build a new
    model vs. reload saved model and normalisation statistics),
    ``DoTraining`` and ``DoVideo``.
    """
    log_dir = 'log'
    tb_log = 'tb_log'
    videoName = '5M_timesteps_sac'
    tb_log_name = videoName
    num_cpu = 16

    # Best-reward callback (checks every 1000 steps). It is constructed here
    # but is not passed to ``learn`` below.
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

    model_stats_path = os.path.join(log_dir, "sac_" + env_name)
    env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')

    # Both branches start from the same set of subprocess environments.
    env_fns = [make_env(env_name, rank, log_dir=log_dir) for rank in range(num_cpu)]
    env = SubprocVecEnv(env_fns)

    if StartFresh:
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        model = PPO(
            'MlpPolicy',
            env,
            learning_rate=0.001,
            n_steps=500,
            gamma=0.9,
            policy_kwargs={'net_arch': [128, 64, 32]},
            verbose=1,
            tensorboard_log=tb_log,
            device="auto",
        )
    else:
        # Resume: restore the normalisation statistics, then the model.
        env = VecNormalize.load(env_stats_path, env)
        env.reset()
        model = PPO.load(model_stats_path, tensorboard_log=tb_log)
        model.set_env(env)

    if DoTraining:
        # Separate single-env evaluation environment (created and reset only).
        eval_env = make_vec_env(env_name, n_envs=1)
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()

        model.learn(
            total_timesteps=25000000,
            tb_log_name=tb_log_name,
            reset_num_timesteps=False,
        )

        # The VecNormalize statistics are saved together with the agent.
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        video_prefix = 'ppo_' + env_name + videoName
        record_video(env_name, model, video_length=2000, prefix=video_prefix)
def make_env_stack(num_envs, game_path, base_port, game_log_path,
                   opp_fp_and_elo, trainee_elo, elo_match=True,
                   survivor=False, stdout_path=None, level_path=None,
                   image_based=False, time_reward=0., env_p=3):
    """Build ``num_envs`` TankEnv instances, vectorised when ``num_envs >= 1``.

    For ``num_envs >= 1`` a ``SubprocVecEnv`` is returned. Environment ``i``
    uses game port ``base_port + 2 * i`` and gets ``-<i>`` inserted before the
    ``.txt`` suffix of ``game_log_path`` (and of ``stdout_path`` when one is
    given). A single environment is started with ``fork``, several with
    ``forkserver``. For ``num_envs < 1`` one plain ``TankEnv`` is returned.

    The returned environment has already been reset.

    :param stdout_path: optional path; ``None`` (the default) is passed
        through to every environment unchanged.
    """
    if num_envs < 1:
        env = TankEnv(game_path,
                      game_port=base_port,
                      game_log_path=game_log_path,
                      opp_fp_and_elo=opp_fp_and_elo,
                      elo_match=elo_match,
                      center_elo=trainee_elo,
                      survivor=survivor,
                      stdout_path=stdout_path,
                      level_path=level_path,
                      image_based=image_based,
                      time_reward=time_reward,
                      p=env_p)
        env.reset()
        return env

    def _per_env_path(path, idx):
        # ``stdout_path`` defaults to None; calling ``.replace`` on it would
        # raise AttributeError, so None is passed through untouched.
        if path is None:
            return None
        return path.replace(".txt", "-" + str(idx) + ".txt")

    def _make_thunk(idx):
        # Everything that depends on ``idx`` is resolved here, once per env,
        # so each thunk keeps its own port and file names.
        port = base_port + (idx * 2)
        log_path = game_log_path.replace(".txt", "-" + str(idx) + ".txt")
        out_path = _per_env_path(stdout_path, idx)

        def _init():
            return TankEnv(game_path,
                           game_port=port,
                           game_log_path=log_path,
                           opp_fp_and_elo=opp_fp_and_elo,
                           elo_match=elo_match,
                           center_elo=trainee_elo,
                           survivor=survivor,
                           stdout_path=out_path,
                           verbose=True,
                           level_path=level_path,
                           image_based=image_based,
                           time_reward=time_reward,
                           p=env_p)

        return _init

    envs = [_make_thunk(idx) for idx in range(num_envs)]
    start_method = "fork" if num_envs == 1 else "forkserver"
    env_stack = SubprocVecEnv(envs, start_method=start_method)
    env_stack.reset()
    return env_stack
def get_multiproc_env(self, n=10):
    """Return a forked ``SubprocVecEnv`` of ``n`` deep copies of ``self``.

    :param n: number of environment constructors handed to the vec env.
    :return: ``(vec_env, first_observations)`` where the observations come
        from the initial ``reset()``.
    """
    def _clone():
        # Each worker builds its own independent copy of this object.
        return deepcopy(self)

    constructors = [_clone for _ in range(n)]
    vec_env = SubprocVecEnv(constructors, start_method="fork")
    first_obs = vec_env.reset()
    return vec_env, first_obs
def make_ai_matchmaker_stack(all_stats, all_opps, all_elos, game_path,
                             model_dir, base_port=50000, image_based=False,
                             level_path=None, env_p=3, starting_elo=None,
                             K=16, D=5., time_reward=-0.003, num_envs=1,
                             matchmaking_mode=0, win_loss_ratio="0:0"):
    """Create a forked ``SubprocVecEnv`` of ``num_envs`` AIMatchmaker envs.

    Environment ``i`` receives ``base_port + 2 * i`` as its ``base_port`` and
    the next port as ``my_port``. ``win_loss_ratio`` is a ``"a:b"`` string
    that is split on ``:`` and converted to a list of ints for each
    environment. ``K`` and ``D`` are accepted but not used here.

    The returned stack has already been reset.
    """
    def _thunk_for(idx):
        # Per-environment values are computed when the thunk is created.
        first_port = base_port + (idx * 2)
        second_port = base_port + (idx * 2) + 1
        ratio = [int(part) for part in win_loss_ratio.split(':')]

        def _init():
            return AIMatchmaker(all_stats, all_opps, all_elos, game_path,
                                model_dir,
                                base_port=first_port,
                                my_port=second_port,
                                image_based=image_based,
                                level_path=level_path,
                                env_p=env_p,
                                starting_elo=starting_elo,
                                time_reward=time_reward,
                                matchmaking_mode=matchmaking_mode,
                                win_loss_ratio=ratio)

        return _init

    envs = [_thunk_for(idx) for idx in range(num_envs)]
    env_stack = SubprocVecEnv(envs, start_method="fork")
    env_stack.reset()
    return env_stack
def record_video(env_name, train_env, model, videoLength=500, prefix='', videoPath='videos/'):
    """Roll out ``model`` in a fresh environment and record the run as a video.

    A single-process copy of ``env_name`` is created, wrapped in
    ``VecNormalize`` and given the normalisation statistics of ``train_env``
    via ``sync_envs_normalization``, then wrapped in ``VecVideoRecorder``.

    :param env_name: environment id forwarded to ``make_env``.
    :param train_env: environment whose normalisation statistics are copied.
    :param model: object exposing ``predict(obs)``.
    :param videoLength: number of steps to run and to record.
    :param prefix: ``name_prefix`` given to the video recorder.
    :param videoPath: ``video_folder`` given to the video recorder.
    """
    print('record_video function')

    local_eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
    # training=False freezes the running statistics: the recording env must
    # replay with the synced training statistics, not keep updating its own.
    local_eval_env = VecNormalize(local_eval_env, norm_obs=True, norm_reward=True,
                                  clip_obs=10., training=False)
    sync_envs_normalization(train_env, local_eval_env)

    # Recording is triggered at step 0 and lasts ``videoLength`` steps.
    local_eval_env = VecVideoRecorder(local_eval_env,
                                      video_folder=videoPath,
                                      record_video_trigger=lambda step: step == 0,
                                      video_length=videoLength,
                                      name_prefix=prefix)
    try:
        obs = local_eval_env.reset()
        for _ in range(videoLength):
            action, _ = model.predict(obs)
            obs, _, _, _ = local_eval_env.step(action)
    finally:
        # Always close the recorder (and its subprocess), even if the
        # rollout raises part-way through.
        local_eval_env.close()
def multiprocessing_example():
    """Train PPO on CartPole across several subprocess envs, then render it."""

    def make_env(env_id, rank, seed=0):
        """
        Utility function for multiprocessed env.

        :param env_id: (str) the environment ID.
        :param rank: (int) index of the subprocess; added to ``seed``.
        :param seed: (int) the initial seed for RNG.
        :return: a zero-argument callable that builds the seeded env.
        """
        def _build():
            created = gym.make(env_id)
            created.seed(seed + rank)
            return created

        set_random_seed(seed)
        return _build

    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use.

    # One constructor per subprocess, each with its own rank.
    # (make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv)
    # is the library helper for the same setup.)
    constructors = [make_env(env_id, rank) for rank in range(num_cpu)]
    env = SubprocVecEnv(constructors)

    model = PPO("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=25_000)

    # Run the trained policy for 1000 vectorised steps, rendering each one.
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        step_result = env.step(action)
        obs = step_result[0]
        env.render()
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed


def make_env(env_id, rank, seed=0):
    """Return a zero-argument constructor for one seeded environment.

    :param env_id: environment ID passed to ``gym.make``.
    :param rank: index of the subprocess; the env is seeded with
        ``seed + rank``.
    :param seed: base seed, also passed to ``set_random_seed``.
    """
    def _build():
        created = gym.make(env_id)
        created.seed(seed + rank)
        return created

    set_random_seed(seed)
    return _build


if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use

    # One constructor per worker process.
    env = SubprocVecEnv([make_env(env_id, rank) for rank in range(num_cpu)])

    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=25000)

    # Roll the trained policy forward and render every step.
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
def main():
    """Train PPO on DYROSTocabi-v1, export its weights, reload it and run it.

    The policy weights and observation statistics are written as text files
    under ``./result/`` twice: once after training and once after reloading.
    The final playback loop never terminates on its own.
    """

    def _export_to_text(trained_model, normalized_env):
        # Move the policy to CPU, then write every state-dict entry and the
        # observation running mean/variance as plain text.
        trained_model.policy.to("cpu")
        for name, param in trained_model.policy.state_dict().items():
            np.savetxt("./result/" + name + ".txt", param.data)
        np.savetxt("./result/obs_mean.txt", normalized_env.obs_rms.mean)
        np.savetxt("./result/obs_variance.txt", normalized_env.obs_rms.var)

    # Multiprocess training environment with observation normalisation only.
    n_cpu = 8
    constructors = [lambda: gym.make('DYROSTocabi-v1') for _ in range(n_cpu)]
    env = VecNormalize(SubprocVecEnv(constructors),
                       norm_obs=True,
                       clip_obs=2.0,
                       norm_reward=False,
                       training=True)

    model = PPO('MlpPolicy',
                env,
                verbose=1,
                n_steps=int(4096 / n_cpu),
                wandb_use=True)
    model.learn(total_timesteps=40000000)

    # The file name embeds the current timestamp.
    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")
    _export_to_text(model, env)

    # Drop the trained objects to demonstrate saving and loading.
    del model
    del env

    # Rebuild a single-process env and restore the saved statistics with
    # further updates disabled.
    raw_env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: raw_env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)
    _export_to_text(model, env)

    # Enjoy the trained agent, printing the return of every finished episode.
    obs = np.copy(env.reset())
    epi_reward = 0
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards
        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
import retro from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecFrameStack, VecNormalize from stable_baselines3 import PPO, A2C import numpy as np import gym from stable_baselines3.common.callbacks import CheckpointCallback from utils import * if __name__ == "__main__": num_envs = 16 # Must use the save number of envs as trained on but we create a single dummy env for testing. envs = SubprocVecEnv([make_env] * num_envs) envs = VecFrameStack(envs, n_stack=4) model = PPO.load("./subzero_model") model.set_env(envs) obs = envs.reset() print(obs.shape) # Create one env for testing env = DummyVecEnv([make_env]) env = VecFrameStack(env, n_stack=4) obs = env.reset() # model.predict(test_obs) would through an error # because the number of test env is different from the number of training env # so we need to complete the observation with zeroes zero_completed_obs = np.zeros((num_envs,) + envs.observation_space.shape) zero_completed_obs[0, :] = obs obs = zero_completed_obs while True:
def main():
    """Train PPO on ``env_id`` or, when training is disabled, replay a checkpoint.

    ``DoTraining`` and ``StartFresh`` are hard-coded locals. With training
    enabled the model is built (or reloaded), evaluated, and then trained in
    50 chunks of 100000 timesteps; after every chunk it is re-evaluated and
    both the model and the VecNormalize statistics are saved under
    ``_log_dir`` with the current timestep count in the file name. With
    training disabled a saved checkpoint is replayed until the first episode
    ends.
    """
    os.makedirs(_log_dir, exist_ok=True)

    DoTraining = True
    StartFresh = True
    num_cpu = 8

    if DoTraining:
        # NOTE: make_vec_env(env_id, n_envs=num_cpu, monitor_dir=_log_dir)
        # was tried here and did not work, possibly because of how the
        # environment is written.
        if StartFresh:
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize(env,
                               norm_obs=True,
                               norm_reward=True,
                               clip_obs=10.)
            env.reset()
            policy_kwargs = {
                'net_arch': [128, 128, 128],
            }
            model = PPO('MlpPolicy',
                        env,
                        policy_kwargs=policy_kwargs,
                        verbose=2,
                        tensorboard_log=tb_log)
        else:
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize.load(_stats_path, env)
            env.reset()
            # Raw string: the Windows path contains "\m" and "\P", which are
            # invalid escape sequences in a normal string literal.
            model = PPO.load(
                r'log\monitor_simpledriving_vecNormalized_128x3_2\PPO_4243456.mdl',
                tensorboard_log=tb_log)
            model.set_env(env)

        # NOTE(review): this evaluation env is a bare gym env, while the
        # model is trained on VecNormalize observations, so these scores are
        # measured on un-normalised inputs. Confirm this is intended.
        eval_env = gym.make(env_id)
        mean_reward, std_reward = evaluate_policy(model,
                                                  eval_env,
                                                  n_eval_episodes=10)
        print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')

        for _ in range(50):
            model.learn(total_timesteps=100000,
                        tb_log_name=env_id,
                        reset_num_timesteps=False)
            mean_reward, std_reward = evaluate_policy(model,
                                                      eval_env,
                                                      n_eval_episodes=10)
            print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
            # Checkpoint the model together with its normalisation stats.
            model.save(_log_dir + 'PPO_{}'.format(model.num_timesteps) +
                       '.mdl')
            env.save(_log_dir +
                     'vec_normalize_{}'.format(model.num_timesteps) + '.pkl')

    if not DoTraining:
        eval_env = DummyVecEnv(
            [make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
        eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl',
                                     eval_env)
        # Playback must not update the saved running statistics, and the
        # rewards are not used, so reward normalisation is switched off.
        eval_env.training = False
        eval_env.norm_reward = False

        model = PPO.load(
            r'log\monitor_simpledriving_vecNormalized_128x3\PPO_5734400.mdl',
            tensorboard_log=tb_log)
        model.set_env(eval_env)

        # Step deterministically until the first episode finishes.
        obs = eval_env.reset()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _, done, _ = eval_env.step(action)
            if done.any():
                eval_env.close()
                break
def main():
    """Train a PPO agent with checkpoint, evaluation and video callbacks.

    All configuration (flags, paths, frequencies, environment name) is read
    from module-level names. Only the ``StartFresh`` path builds the
    environments and the model; the other path just prints a placeholder.
    """

    def _new_normalized_env(n_envs):
        # Subprocess envs wrapped in a fresh VecNormalize and reset once.
        vec = SubprocVecEnv([make_env(env_name, rank, log_dir=log_dir)
                             for rank in range(n_envs)])
        vec = VecNormalize(vec, norm_obs=True, norm_reward=True, clip_obs=10.)
        vec.reset()
        return vec

    if StartFresh:
        # Training environment first, then a separate single evaluation env.
        env = _new_normalized_env(num_cpu)
        eval_env = _new_normalized_env(1)

        # Separate 256x256 policy and value networks with ReLU activations.
        policy_kwargs = {
            'activation_fn': th.nn.ReLU,
            'net_arch': [{'pi': [256, 256], 'vf': [256, 256]}],
        }
        model = PPO(
            'MlpPolicy',
            env,
            learning_rate=3e-5,
            n_steps=512,
            batch_size=128,
            n_epochs=20,
            gamma=0.99,
            gae_lambda=0.9,
            clip_range=0.4,
            vf_coef=0.5,
            use_sde=True,
            sde_sample_freq=4,
            policy_kwargs=policy_kwargs,
            verbose=1,
            tensorboard_log=tb_log,
            device="auto",
        )
    else:
        print('duh')
        # Disabled resume recipe, kept for reference:
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # model.set_env(env)

    if DoTraining:
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq,
                                                 save_path=checkpoint_path)

        # Evaluation uses deterministic actions.
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path=best_path,
                                     log_path=best_path,
                                     eval_freq=eval_freq,
                                     deterministic=True,
                                     render=False)

        # Every ``vid_freq`` timesteps: record a video and save the
        # environment/model state.
        record_callback = RecordVideo(env_name,
                                      videoName=videoName,
                                      videoPath=video_path,
                                      verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path,
                                           model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq,
                                       callback=nStep_callback_list)

        callbacks = CallbackList(
            [checkpoint_callback, eval_callback, vid_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
                    tb_log_name=tb_log_name,
                    reset_num_timesteps=False,
                    callback=callbacks)

        # The VecNormalize statistics are saved alongside the agent.
        model.save(model_stats_path)
        env.save(env_stats_path)

    if DoVideo:
        record_video(env_name,
                     env,
                     model,
                     videoLength=1000,
                     prefix='best' + videoName,
                     videoPath=video_path)
""" def _init(): env = gym.make(env_id) env.seed(seed + rank) return env set_random_seed(seed) return _init if __name__ == '__main__': env_id = "bandit-v0" num_cpu = 4 # Number of processes to use # # Create the vectorized environment env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)]) # Stable Baselines provides you with make_vec_env() helper # which does exactly the previous steps for you: # env = make_vec_env(env_id, n_envs=num_cpu, seed=0) policy_kwargs = dict(activation_fn=nn.Tanh, net_arch=[10, 5]) # env = gym.make(env_id, total=10, good=3) model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=1) model.learn(total_timesteps=15000) env = gym.make(env_id) #, total=10, good=3) for _ in range(10): obs = env.reset(test=True) action, _states = model.predict(obs) obs, rewards, dones, info = env.step(action) env.render()
import time
import utils
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecEnv

if __name__ == "__main__":  # noqa: C901
    # Comma-separated body ids from the parsed arguments, as ints.
    bodies = list(map(int, utils.args.train_bodies.split(',')))
    print(bodies)

    # One environment constructor per requested body.
    envs = [utils.make_env(robot_body=body, body_info=0) for body in bodies]
    eval_env = SubprocVecEnv(envs)
    eval_env.reset()

    # Call both display methods on every sub-environment, in order.
    for method_name in ("show_body_id", "set_view"):
        eval_env.env_method(method_name)

    # Keep the process (and its sub-environments) alive.
    time.sleep(10000)
'params': vnet.parameters(), 'lr': 7e-4, 'alpha': 0.99, 'eps': 1e-5 }, ]) all_rewards = [] all_losses = [] all_values = [] episode_reward = 0 loss = 0.0 env = EnvWrapper(gym.make('PongDeterministic-v4'), NFRAMES) state = venv.reset() #for nstep in tqdm.tqdm(range(NSTEPS)): for nstep in range(NSTEPS): state_t = torch.tensor(state, dtype=torch.float32).cuda() action = pnet.act(state_t).cpu() next_state, reward, done, _ = venv.step(action) buffer.push(state, action, reward, next_state, done) state = next_state if len(buffer) == BATCH_SIZE: loss = 0.9 * loss + 0.1 * train(buffer, pnet, vnet, optimizer) buffer.reset() # break # loss = train(venv, pnet, vnet, optimizer)
from stable_baselines3.common.vec_env import SubprocVecEnv
from gym.envs.classic_control.mountain_car import MountainCarEnv
from multiprocessing import Queue
from env import Env


def create_thunk():
    """Return a zero-argument constructor for one ``Env``.

    The module-level ``queue`` is looked up when the constructor is called,
    not when this function runs, and is passed as ``failure_buffer``.
    """
    def _build():
        return Env(
            break_on_fail=False,
            attack_prob=0,
            max_lines=10,
            min_lines=1,
            num_initial_buildings=2,
            time_per_line=4,
            tgt_success_rate=0.75,
            world_size=3,
            eval_steps=500,
            failure_buffer=queue,
            random_seed=0,
            rank=0,
        )

    return _build


if __name__ == "__main__":
    # The queue must exist before any constructor is invoked.
    queue = Queue()
    thunks = [create_thunk() for _ in range(2)]
    envs = SubprocVecEnv(env_fns=thunks, start_method="fork")
    print(envs.reset())
def sac(env_fn,
        env_name,
        test_env_fns=[],
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        load_dir=None,
        num_procs=1,
        clean_every=200):
    """
    Soft Actor-Critic (SAC)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API. It is called
            with a ``rank`` keyword argument, once per training subprocess.

        env_name: Value stored under ``'env_name'`` when the state is saved.

        test_env_fns (list): Environment constructors for the vectorised
            test environment; one ``TestEpRet_<i>`` value is logged per entry.

        actor_critic: The constructor method for a PyTorch Module with an ``act``
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of
            observations as inputs, and ``q1`` and ``q2`` should accept a batch
            of observations and a batch of actions as inputs. When called,
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

            ``q1`` and ``q2`` must additionally provide ``predict_feats(o, a)``.

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger. ``"exp_name"`` is
            also read for the tensorboard writer comment.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        load_dir: If not None, the actor-critic is loaded from this directory
            with ``load_pytorch_policy`` instead of being constructed.

        num_procs (int): Number of training environments run in subprocesses;
            the step counter advances by this amount per vectorised step.

        clean_every (int): If positive, the training and test environments
            are closed and recreated at the end of a trajectory whenever
            ``epoch // clean_every`` has caught up with the number of
            recreations done so far.
    """
    from spinup.examples.pytorch.eval_sac import load_pytorch_policy

    print(f"SAC proc_id {proc_id()}")
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    if proc_id() == 0:
        writer = SummaryWriter(log_dir=os.path.join(
            logger.output_dir, str(datetime.datetime.now())),
                               comment=logger_kwargs["exp_name"])

    torch.manual_seed(seed)
    np.random.seed(seed)

    env = SubprocVecEnv([partial(env_fn, rank=i) for i in range(num_procs)],
                        "spawn")
    test_env = SubprocVecEnv(test_env_fns, "spawn")
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    if load_dir is not None:
        _, ac = load_pytorch_policy(load_dir, itr="", deterministic=False)
    else:
        ac = actor_critic(env.observation_space, env.action_space,
                          **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks. This must be a real list: a
    # bare itertools.chain is exhausted by the optimizer constructor, which
    # would turn the freeze/unfreeze loops in ``update`` into no-ops.
    q_params = list(itertools.chain(ac.q1.parameters(), ac.q2.parameters()))

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up function for computing TD feats-losses
    def compute_loss_feats(data):
        o, a, r, o2, d, feats = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done'], data["feats"]
        feats = torch.stack(list(feats.values())).T  # (nbatch, nfeats)

        feats1 = ac.q1.predict_feats(o, a)
        feats2 = ac.q2.predict_feats(o, a)

        # Bellman backup for feature functions
        with torch.no_grad():
            a2, _ = ac.pi(o2)

            # Target feature values
            feats1_targ = ac_targ.q1.predict_feats(o2, a2)
            feats2_targ = ac_targ.q2.predict_feats(o2, a2)
            feats_targ = torch.min(feats1_targ, feats2_targ)
            backup = feats + gamma * (1 - d[:, None]) * feats_targ

        # MSE loss against Bellman backup, one value per feature
        loss_feats1 = ((feats1 - backup)**2).mean(axis=0)
        loss_feats2 = ((feats2 - backup)**2).mean(axis=0)
        loss_feats = loss_feats1 + loss_feats2

        # Useful info for logging
        feats_info = dict(Feats1Vals=feats1.detach().numpy(),
                          Feats2Vals=feats2.detach().numpy())

        return loss_feats, feats_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, feats_keys):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        # NOTE(review): the feature loss is computed for logging only; it is
        # never backpropagated, so it does not train anything. Confirm that
        # this is intended.
        loss_feats, _ = compute_loss_feats(data)
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Feature loss, one logger key per feature. (``dict(key, value)`` is
        # not a valid constructor call and raised TypeError here.)
        keys = [f"LossFeats_{key}" for key in feats_keys]
        for key, val in zip(keys, loss_feats):
            logger.store(**{key: val.item()})

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use an in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic)

    def test_agent(feats_keys):
        num_envs = len(test_env_fns)
        env_ep_rets = np.zeros(num_envs)
        for j in range(num_test_episodes):
            o, d = test_env.reset(), np.zeros(num_envs, dtype=bool)
            ep_len = np.zeros(num_envs)
            while not (np.all(d) or np.all(ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, info = test_env.step(get_action(o, True))
                env_ep_rets += r
                ep_len += 1
        # Log the per-environment return averaged over the test episodes.
        for ti in range(num_envs):
            logger.store(
                **{f"TestEpRet_{ti}": env_ep_rets[ti] / num_test_episodes})

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros(num_procs)

    # Main loop: collect experience in env and update/log each epoch
    epoch = 0
    update_times, clean_times = 0, 0
    t = 0
    while t <= total_steps:
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = np.stack([env.action_space.sample() for _ in range(num_procs)])

        # Step the env
        o2, r, d, info = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        if np.all(ep_len == max_ep_len):
            d.fill(False)

        # Store experience to replay buffer
        replay_buffer.store_vec(o, a, r, o2, d,
                                [inf["features"] for inf in info])

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling, assumes all subenvs end at the same time
        if np.all(d) or np.all(ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            if clean_every > 0 and epoch // clean_every >= clean_times:
                # Periodically close and recreate all subprocess envs.
                env.close()
                test_env.close()
                env = SubprocVecEnv(
                    [partial(env_fn, rank=i) for i in range(num_procs)],
                    "spawn")
                test_env = SubprocVecEnv(test_env_fns, "spawn")
                clean_times += 1

            o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros(
                num_procs)

        # Update handling
        if t >= update_after and t / update_every > update_times:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, feats_keys=replay_buffer.feats_keys)
            update_times += 1

        # End of epoch handling
        if t // steps_per_epoch > epoch:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env_name': env_name}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent(replay_buffer.feats_keys)

            # Update tensorboard
            if proc_id() == 0:
                log_perf_board = ['EpRet', 'EpLen', 'Q1Vals', 'Q2Vals'] + [
                    f"TestEpRet_{ti}" for ti in range(len(test_env_fns))
                ]
                log_loss_board = ['LogPi', 'LossPi', 'LossQ'] + [
                    key for key in logger.epoch_dict.keys()
                    if "LossFeats" in key
                ]
                log_board = {
                    'Performance': log_perf_board,
                    'Loss': log_loss_board
                }
                for key, value in log_board.items():
                    for val in value:
                        mean, std = logger.get_stats(val)
                        if key == 'Performance':
                            writer.add_scalar(key + '/Average' + val, mean,
                                              epoch)
                            writer.add_scalar(key + '/Std' + val, std, epoch)
                        else:
                            writer.add_scalar(key + '/' + val, mean, epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            if proc_id() == 0:
                writer.flush()
                import psutil

                # gives a single float value
                cpu_percent = psutil.cpu_percent()
                # gives an object with many fields
                mem_percent = psutil.virtual_memory().percent
                print(f"Used cpu avg {cpu_percent}% memory {mem_percent}%")
                cpu_separate = psutil.cpu_percent(percpu=True)
                for ci, cval in enumerate(cpu_separate):
                    print(f"\t cpu {ci}: {cval}%")

        # One vectorised step advances every subprocess env by one step.
        t += num_procs

    if proc_id() == 0:
        writer.close()