Example 1
def main():
  # Create the callback: check every 1000 steps
  log_dir = 'log'
  callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
  num_cpu = 16
  model_stats_path = os.path.join(log_dir, "sac_" + env_name)
  env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
  tb_log = 'tb_log'
  videoName = '5M_timesteps_sac'
  tb_log_name = videoName

  if(StartFresh):
        # env = make_vec_env(env_name, n_envs=4)
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        policy_kwargs = {
            'net_arch':[128,64,32],
        }
        model = PPO('MlpPolicy', 
          env, 
          learning_rate = 0.001,
          n_steps=500,
          # batch_size=0,
          # n_epochs=1,
          gamma=0.9,
          policy_kwargs = policy_kwargs, 
          verbose=1, 
          tensorboard_log=tb_log,
          device="auto")
  else:
      env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
      env = VecNormalize.load(env_stats_path, env)
      env.reset()

      
      model = PPO.load(model_stats_path, tensorboard_log=tb_log)
      model.set_env(env)

  if(DoTraining):
    eval_env = make_vec_env(env_name, n_envs=1)
    eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    eval_env.reset()
    # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
    model.learn(total_timesteps=25000000, tb_log_name=tb_log_name, reset_num_timesteps=False)  # optionally pass callback=callback or callback=TensorboardCallback()

    # Don't forget to save the VecNormalize statistics when saving the agent
    model.save(model_stats_path)
    env.save(env_stats_path)
    
  if(DoVideo):
    # mean_reward, std_reward = evaluate_policy(model, eval_env)
    # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
    record_video(env_name, model, video_length=2000, prefix='ppo_'+ env_name + videoName)
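The comment above stresses saving the VecNormalize statistics alongside the agent. A minimal sketch of the matching load path for later evaluation (it reuses the paths and the make_env helper assumed by this example; it is not part of the original code):

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize

eval_env = SubprocVecEnv([make_env(env_name, 0, log_dir=log_dir)])
eval_env = VecNormalize.load(env_stats_path, eval_env)
eval_env.training = False     # freeze the running statistics at test time
eval_env.norm_reward = False  # report raw rewards during evaluation
model = PPO.load(model_stats_path, env=eval_env)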
Example 2
def make_env_stack(num_envs,
                   game_path,
                   base_port,
                   game_log_path,
                   opp_fp_and_elo,
                   trainee_elo,
                   elo_match=True,
                   survivor=False,
                   stdout_path=None,
                   level_path=None,
                   image_based=False,
                   time_reward=0.,
                   env_p=3):
    if num_envs >= 1:
        envs = []
        for i in range(num_envs):
            envs.append(lambda game_path=game_path,
                               port=base_port + (i * 2),
                               log_path=game_log_path.replace(".txt", "-" + str(i) + ".txt"),
                               opp_fp_and_elo=opp_fp_and_elo,
                               elo_match=elo_match,
                               trainee_elo=trainee_elo,
                               survivor=survivor,
                               stdout=stdout_path.replace(".txt", "-" + str(i) + ".txt"),
                               level_path=level_path,
                               image_based=image_based,
                               time_reward=time_reward:
                        TankEnv(game_path,
                                game_port=port,
                                game_log_path=log_path,
                                opp_fp_and_elo=opp_fp_and_elo,
                                elo_match=elo_match,
                                center_elo=trainee_elo,
                                survivor=survivor,
                                stdout_path=stdout,
                                verbose=True,
                                level_path=level_path,
                                image_based=image_based,
                                time_reward=time_reward,
                                p=env_p))
        if num_envs == 1:
            env_stack = SubprocVecEnv(envs, start_method="fork")
        else:
            env_stack = SubprocVecEnv(envs, start_method="forkserver")
        env_stack.reset()
        return env_stack
    else:
        env = TankEnv(game_path,
                      game_port=base_port,
                      game_log_path=game_log_path,
                      opp_fp_and_elo=opp_fp_and_elo,
                      elo_match=elo_match,
                      center_elo=trainee_elo,
                      survivor=survivor,
                      stdout_path=stdout_path,
                      level_path=level_path,
                      image_based=image_based,
                      time_reward=time_reward,
                      p=env_p)
        env.reset()
        return env
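The one-lambda-per-environment pattern above relies on Python's default-argument binding: each port, log path, and flag is captured as a default so that every factory keeps the value it had at creation time instead of the final loop value. A tiny self-contained illustration of that closure pitfall and the fix (not part of the original code):

n = 4
late_bound = [lambda: i for i in range(n)]        # every call returns 3
early_bound = [lambda i=i: i for i in range(n)]   # defaults capture i at creation time
print([f() for f in late_bound])                  # [3, 3, 3, 3]
print([f() for f in early_bound])                 # [0, 1, 2, 3]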
Example 3
    def get_multiproc_env(self, n=10):
        def get_self():
            return deepcopy(self)

        e = SubprocVecEnv([get_self for _ in range(n)], start_method="fork")
        obs = e.reset()
        return e, obs
def make_ai_matchmaker_stack(all_stats,
                             all_opps,
                             all_elos,
                             game_path,
                             model_dir,
                             base_port=50000,
                             image_based=False,
                             level_path=None,
                             env_p=3,
                             starting_elo=None,
                             K=16,
                             D=5.,
                             time_reward=-0.003,
                             num_envs=1,
                             matchmaking_mode=0,
                             win_loss_ratio="0:0"):

    envs = []
    for i in range(num_envs):
        envs.append(
            lambda all_stats=all_stats,
                   all_opps=all_opps,
                   all_elos=all_elos,
                   game_path=game_path,
                   model_dir=model_dir,
                   env_port=base_port + (i * 2),
                   my_port=base_port + (i * 2) + 1,
                   image_based=image_based,
                   level_path=level_path,
                   env_p=env_p,
                   starting_elo=starting_elo,
                   time_reward=time_reward,
                   matchmaking_mode=matchmaking_mode,
                   win_loss_ratio=[int(x) for x in win_loss_ratio.split(':')]:
                AIMatchmaker(all_stats, all_opps, all_elos, game_path, model_dir,
                             base_port=env_port,
                             my_port=my_port,
                             image_based=image_based,
                             level_path=level_path,
                             env_p=env_p,
                             starting_elo=starting_elo,
                             time_reward=time_reward,
                             matchmaking_mode=matchmaking_mode,
                             win_loss_ratio=win_loss_ratio)
        )
    env_stack = SubprocVecEnv(envs, start_method="fork")
    env_stack.reset()
    return env_stack
def record_video(env_name, train_env, model, videoLength=500, prefix='', videoPath='videos/'):
    print('record_video function')
    # Wrap the env in a Vec Video Recorder 
    local_eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
    local_eval_env = VecNormalize(local_eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    sync_envs_normalization(train_env, local_eval_env)
    local_eval_env = VecVideoRecorder(local_eval_env, video_folder=videoPath,
                              record_video_trigger=lambda step: step == 0, video_length=videoLength,
                              name_prefix=prefix)
    obs = local_eval_env.reset()
    for _ in range(videoLength):
        action, _ = model.predict(obs)
        obs, _, _, _ = local_eval_env.step(action)

    # Close the video recorder
    local_eval_env.close()
Example 6
def multiprocessing_example():
    # Multiprocessing: Unleashing the Power of Vectorized Environments

    def make_env(env_id, rank, seed=0):
        """
		Utility function for multiprocessed env.

		:param env_id: (str) the environment ID.
		:param num_env: (int) the number of environments you wish to have in subprocesses.
		:param seed: (int) the inital seed for RNG.
		:param rank: (int) index of the subprocess.
		"""
        def _init():
            env = gym.make(env_id)
            env.seed(seed + rank)
            return env

        set_random_seed(seed)
        return _init

    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use.
    # Create the vectorized environment.
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    # Stable Baselines provides you with make_vec_env() helper which does exactly the previous steps for you.
    # You can choose between 'DummyVecEnv' (usually faster) and 'SubprocVecEnv'.
    #env = make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv)

    model = PPO("MlpPolicy", env, verbose=1)
    model.learn(total_timesteps=25_000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
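As the comment in the example notes, the manual loop over make_env can be replaced by the make_vec_env helper. A one-line sketch of that variant (the seed and vec_env_cls values here are chosen only for illustration):

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

env = make_vec_env("CartPole-v1", n_envs=4, seed=0, vec_env_cls=SubprocVecEnv)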
Example 7
import gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed


def make_env(env_id, rank, seed=0):
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init


if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example 8
def main():
    # multiprocess environment
    n_cpu = 8
    env = SubprocVecEnv(
        [lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    env = VecNormalize(env,
                       norm_obs=True,
                       clip_obs=2.0,
                       norm_reward=False,
                       training=True)

    # n_cpu = 1
    # env = gym.make('DYROSTocabi-v1')
    # env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    model = PPO('MlpPolicy',
                env,
                verbose=1,
                n_steps=int(4096 / n_cpu),
                wandb_use=True)
    model.learn(total_timesteps=40000000)

    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-02-27 02:20:20.015346"

    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)
    #Enjoy trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards

        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
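The example above exports the observation running mean and variance to plain text, presumably so a controller outside Python can normalize observations itself. A hypothetical sketch of applying those exported statistics with the same formula VecNormalize uses internally, clip((obs - mean) / sqrt(var + eps), -clip_obs, clip_obs); the file names match the example, while clip_obs=2.0 and eps=1e-8 mirror the settings above and the VecNormalize default:

import numpy as np

obs_mean = np.loadtxt("./result/obs_mean.txt")
obs_var = np.loadtxt("./result/obs_variance.txt")

def normalize_obs(obs, clip_obs=2.0, eps=1e-8):
    # Same normalization VecNormalize applies to observations at train time
    return np.clip((obs - obs_mean) / np.sqrt(obs_var + eps), -clip_obs, clip_obs)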
import retro
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv, VecFrameStack, VecNormalize
from stable_baselines3 import PPO, A2C
import numpy as np
import gym
from stable_baselines3.common.callbacks import CheckpointCallback
from utils import *

if __name__ == "__main__":
    num_envs = 16  # Must use the same number of envs as trained on, but we create a single dummy env for testing.
    envs = SubprocVecEnv([make_env] * num_envs)
    envs = VecFrameStack(envs, n_stack=4)

    model = PPO.load("./subzero_model")
    model.set_env(envs)
    obs = envs.reset()
    print(obs.shape)

    # Create one env for testing 
    env = DummyVecEnv([make_env])
    env = VecFrameStack(env, n_stack=4)
    obs = env.reset()

    # model.predict(obs) would throw an error here
    # because the number of test envs differs from the number of training envs,
    # so we need to pad the observation batch with zeros.
    zero_completed_obs = np.zeros((num_envs,) + envs.observation_space.shape)
    zero_completed_obs[0, :] = obs
    obs = zero_completed_obs

    while True:
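The snippet is cut off at the loop header. A plausible continuation, written here as a separate sketch rather than the original code: predict on the zero-padded batch, step the single test env with the first action only, and keep slot 0 of the padded batch in sync (DummyVecEnv auto-resets on done, so the returned obs is always valid):

    while True:
        action, _ = model.predict(zero_completed_obs, deterministic=True)
        obs, rewards, dones, infos = env.step(action[:1])
        zero_completed_obs[0, :] = obs
        env.render()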
Example 10
def main():
    # nn = torch.nn.Sequential(torch.nn.Linear(8, 64), torch.nn.Tanh(),
    #                          torch.nn.Linear(64, 2))

    os.makedirs(_log_dir, exist_ok=True)

    DoTraining = True
    StartFresh = True
    num_cpu = 8
    if (DoTraining):

        # This doesn't work but it might have something to do with how the environment is written
        # num_cpu = 1
        # env = make_vec_env(env_id, n_envs=num_cpu, monitor_dir=_log_dir) # make_vec_env contains Monitor

        # Create the callback: check every 1000 steps
        # callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=_log_dir)

        if (StartFresh):
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize(env,
                               norm_obs=True,
                               norm_reward=True,
                               clip_obs=10.)
            env.reset()
            policy_kwargs = {
                'net_arch': [128, 128, 128],
            }
            model = PPO('MlpPolicy',
                        env,
                        policy_kwargs=policy_kwargs,
                        verbose=2,
                        tensorboard_log=tb_log)
        else:
            env = SubprocVecEnv([
                make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)
            ])
            env = VecNormalize.load(_stats_path, env)
            env.reset()

            model = PPO.load(
                r'log\monitor_simpledriving_vecNormalized_128x3_2\PPO_4243456.mdl',
                tensorboard_log=tb_log)
            model.set_env(env)

        eval_env = gym.make(env_id)
        # print('!!!!Checking Environment!!!!')
        # print(check_env(eval_env))

        mean_reward, std_reward = evaluate_policy(model,
                                                  eval_env,
                                                  n_eval_episodes=10)
        print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
        for _ in range(50):
            model.learn(total_timesteps=100000,
                        tb_log_name=env_id,
                        reset_num_timesteps=False)  #, callback=callback
            mean_reward, std_reward = evaluate_policy(model,
                                                      eval_env,
                                                      n_eval_episodes=10)
            print(f'mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}')
            model.save(_log_dir + 'PPO_{}'.format(model.num_timesteps) +
                       '.mdl')
            env.save(_log_dir +
                     'vec_normalize_{}'.format(model.num_timesteps) + '.pkl')

    if (not DoTraining):
        # eval_env = SubprocVecEnv([make_env(env_id, i, log_dir=_log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl', eval_env)
        # eval_env = VecVideoRecorder(eval_env, video_folder='videos/',
        #                       record_video_trigger=lambda step: step == 0, video_length=500,
        #                       name_prefix='test')
        # eval_env.training = False
        # eval_env.norm_reward = False
        # eval_env.reset()

        eval_env = DummyVecEnv(
            [make_env(env_id, i, log_dir=_log_dir) for i in range(1)])
        # eval_env = gym.make(env_id)
        eval_env = VecNormalize.load(_log_dir + 'vec_normalize_5734400.pkl',
                                     eval_env)

        model = PPO.load(
            r'log\monitor_simpledriving_vecNormalized_128x3\PPO_5734400.mdl',
            tensorboard_log=tb_log)
        model.set_env(eval_env)
        # record_video(env_id, model, video_length=500, prefix='ppo_'+env_id)
        # Start the video at step=0 and record 500 steps
        # eval_env = VecVideoRecorder(eval_env, video_folder='tmp',
        #                       record_video_trigger=lambda step: step == 0, video_length=500,
        #                       name_prefix='')

        obs = eval_env.reset()
        # for i in range(500):
        #     action, _ = model.predict(obs)
        #     obs, _, _, _ = eval_env.step(action)
        # eval_env.close()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, _, done, _ = eval_env.step(action)
            # eval_env.render()
            if done.any():
                # obs = eval_env.reset()
                # time.sleep(1/30)
                eval_env.close()
                break
def main():
    if(StartFresh):
        # Create Environment
        env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
        env.reset()
        # Separate evaluation env
        eval_env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(1)])
        eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
        eval_env.reset()
        # Create Model
        # model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=tb_log, device="auto")
        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[dict(pi=[256, 256], vf=[256, 256])])

        model = PPO('MlpPolicy', 
            env, 
            learning_rate = 3e-5,
            n_steps=512,
            batch_size=128,
            n_epochs=20,
            gamma=0.99,
            gae_lambda = 0.9,
            clip_range = 0.4,
            vf_coef = 0.5,
            use_sde = True,
            sde_sample_freq = 4,
            policy_kwargs = policy_kwargs, 
            verbose=1, 
            tensorboard_log=tb_log,
            device="auto")


    else:
        print('duh')
        # tmp_test_name = 'SAC-Continued'
        # tb_log_name = tmp_test_name + '_' + env_name
        # tmp_log_dir = os.path.join('log', tmp_test_name)
        # tmp_model_stats_path = os.path.join(tmp_log_dir, 'Model_' + tb_log_name)
        # tmp_env_stats_path = os.path.join(tmp_log_dir, 'Env_' + tb_log_name)
        # tmp_best_path = os.path.join(tmp_log_dir, 'saved_models')
        # tmp_load_path = os.path.join(tmp_best_path, 'rl_model_3900000_steps')
        # # Load Enironment
        # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # env = VecNormalize.load(tmp_env_stats_path, env)
        # env.reset()
        # # Separate evaluation env
        # eval_env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
        # eval_env = VecNormalize.load(tmp_env_stats_path, eval_env)
        # eval_env.reset()
        # # Load Model
        # # model = SAC.load(model_stats_path, tensorboard_log=tb_log)
        # model = SAC.load(tmp_load_path, tensorboard_log=tb_log, learning_rate=1e-6)
        # # model.learning_rate = 1e-5
        # model.set_env(env)

    if(DoTraining):
        checkpoint_callback = CheckpointCallback(save_freq=eval_freq, save_path=checkpoint_path)
        # Use deterministic actions for evaluation
        eval_callback = EvalCallback(eval_env, best_model_save_path=best_path,
                                    log_path=best_path, eval_freq=eval_freq,
                                    deterministic=True, render=False)
        # Video Update Callback 
        record_callback = RecordVideo(env_name, videoName=videoName, videoPath=video_path, verbose=1)
        envSave_callback = SaveEnvVariable(env, model, env_stats_path, model_stats_path)
        nStep_callback_list = CallbackList([record_callback, envSave_callback])
        # nStep_callback_list = CallbackList([envSave_callback])
        vid_callback = EveryNTimesteps(n_steps=vid_freq, callback=nStep_callback_list)
        
        # Create the callback list
        callbacks = CallbackList([checkpoint_callback, eval_callback, vid_callback])
        # callbacks = CallbackList([checkpoint_callback, eval_callback])

        print(tb_log_name)
        model.learn(total_timesteps=total_timesteps,
            tb_log_name=tb_log_name, 
            reset_num_timesteps=False,
            callback=callbacks)

        # Don't forget to save the VecNormalize statistics when saving the agent
        model.save(model_stats_path)
        env.save(env_stats_path)

    if(DoVideo):
        record_video(env_name, env, model, videoLength=1000, prefix='best' + videoName, videoPath=video_path)
Example 12
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_random_seed(seed)
    return _init


if __name__ == '__main__':
    env_id = "bandit-v0"
    num_cpu = 4  # Number of processes to use
    # # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    policy_kwargs = dict(activation_fn=nn.Tanh, net_arch=[10, 5])
    # env = gym.make(env_id, total=10, good=3)
    model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=1)
    model.learn(total_timesteps=15000)

    env = gym.make(env_id)  #, total=10, good=3)
    for _ in range(10):
        obs = env.reset(test=True)
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example 13
import time
import utils
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecEnv

if __name__ == "__main__":  # noqa: C901
    bodies = [int(x) for x in utils.args.train_bodies.split(',')]
    print(bodies)
    envs = [utils.make_env(robot_body=i, body_info=0) for i in bodies]
    eval_env = SubprocVecEnv(envs)
    eval_env.reset()
    eval_env.env_method("show_body_id")
    eval_env.env_method("set_view")
    time.sleep(10000)
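env_method above calls custom methods (show_body_id, set_view) on every sub-process environment. A small sketch of the same VecEnv introspection API on a standard Gym environment (CartPole-v1 is chosen here only for illustration, it is not part of the original example):

import gym
from stable_baselines3.common.vec_env import SubprocVecEnv

if __name__ == "__main__":
    venv = SubprocVecEnv([lambda: gym.make("CartPole-v1") for _ in range(2)])
    venv.reset()
    print(venv.get_attr("action_space"))  # one entry per worker
    venv.env_method("seed", 0)            # call env.seed(0) in every worker
    venv.close()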
Example 14
            'params': vnet.parameters(),
            'lr': 7e-4,
            'alpha': 0.99,
            'eps': 1e-5
        },
    ])

    all_rewards = []
    all_losses = []
    all_values = []
    episode_reward = 0
    loss = 0.0

    env = EnvWrapper(gym.make('PongDeterministic-v4'), NFRAMES)

    state = venv.reset()

    #for nstep in tqdm.tqdm(range(NSTEPS)):
    for nstep in range(NSTEPS):

        state_t = torch.tensor(state, dtype=torch.float32).cuda()
        action = pnet.act(state_t).cpu()
        next_state, reward, done, _ = venv.step(action)
        buffer.push(state, action, reward, next_state, done)
        state = next_state

        if len(buffer) == BATCH_SIZE:
            loss = 0.9 * loss + 0.1 * train(buffer, pnet, vnet, optimizer)
            buffer.reset()
            # break
            # loss = train(venv, pnet, vnet, optimizer)
Example 15
from stable_baselines3.common.vec_env import SubprocVecEnv
from gym.envs.classic_control.mountain_car import MountainCarEnv
from multiprocessing import Queue
from env import Env


def create_thunk():
    # return lambda: MyEnv(queue)
    return lambda: Env(
        break_on_fail=False,
        attack_prob=0,
        max_lines=10,
        min_lines=1,
        num_initial_buildings=2,
        time_per_line=4,
        tgt_success_rate=0.75,
        world_size=3,
        eval_steps=500,
        failure_buffer=queue,
        random_seed=0,
        rank=0,
    )


if __name__ == "__main__":
    queue = Queue()
    envs = SubprocVecEnv(
        env_fns=[create_thunk() for _ in range(2)], start_method="fork"
    )
    print(envs.reset())
Example 16
def sac(env_fn,
        env_name,
        test_env_fns=[],
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        load_dir=None,
        num_procs=1,
        clean_every=200):
    """
    Soft Actor-Critic (SAC)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act``
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of
            observations as inputs, and ``q1`` and ``q2`` should accept a batch
            of observations and a batch of actions as inputs. When called,
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    from spinup.examples.pytorch.eval_sac import load_pytorch_policy

    print(f"SAC proc_id {proc_id()}")
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    if proc_id() == 0:
        writer = SummaryWriter(log_dir=os.path.join(
            logger.output_dir, str(datetime.datetime.now())),
                               comment=logger_kwargs["exp_name"])

    torch.manual_seed(seed)
    np.random.seed(seed)

    env = SubprocVecEnv([partial(env_fn, rank=i) for i in range(num_procs)],
                        "spawn")
    test_env = SubprocVecEnv(test_env_fns, "spawn")
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    if load_dir is not None:
        _, ac = load_pytorch_policy(load_dir, itr="", deterministic=False)
    else:
        ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up function for computing TD feats-losses
    def compute_loss_feats(data):
        o, a, r, o2, d, feats = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done'], data["feats"]

        feats = torch.stack(list(feats.values())).T  # (nbatch, nfeats)
        feats1 = ac.q1.predict_feats(o, a)
        feats2 = ac.q2.predict_feats(o, a)

        feats_keys = replay_buffer.feats_keys

        # Bellman backup for feature functions
        with torch.no_grad():
            a2, _ = ac.pi(o2)

            # Target feature values
            feats1_targ = ac_targ.q1.predict_feats(o2, a2)
            feats2_targ = ac_targ.q2.predict_feats(o2, a2)
            feats_targ = torch.min(feats1_targ, feats2_targ)
            backup = feats + gamma * (1 - d[:, None]) * feats_targ

        # MSE loss against Bellman backup
        loss_feats1 = ((feats1 - backup)**2).mean(axis=0)
        loss_feats2 = ((feats2 - backup)**2).mean(axis=0)
        loss_feats = loss_feats1 + loss_feats2

        # Useful info for logging
        feats_info = dict(Feats1Vals=feats1.detach().numpy(),
                          Feats2Vals=feats2.detach().numpy())

        return loss_feats, feats_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data, feats_keys):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        loss_feats, feats_info = compute_loss_feats(data)
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Feature loss
        keys = [f"LossFeats_{key}" for key in feats_keys]
        for key, val in zip(keys, loss_feats):
            logger.store(**{key: val.item()})

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize it at next DDPG step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic)

    def test_agent(feats_keys):
        num_envs = len(test_env_fns)
        env_ep_rets = np.zeros(num_envs)
        for j in range(num_test_episodes):
            o, d = test_env.reset(), np.zeros(num_envs, dtype=bool)
            ep_len = np.zeros(num_envs)
            while not (np.all(d) or np.all(ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, info = test_env.step(get_action(o, True))
                env_ep_rets += r
                ep_len += 1
            # logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
        for ti in range(num_envs):
            logger.store(
                **{f"TestEpRet_{ti}": env_ep_rets[ti] / num_test_episodes})

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros(num_procs)

    # Main loop: collect experience in env and update/log each epoch
    epoch = 0
    update_times, clean_times = 0, 0
    t = 0
    while t <= total_steps:
        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = np.stack([env.action_space.sample() for _ in range(num_procs)])

        # Step the env
        o2, r, d, info = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        if np.all(ep_len == max_ep_len):
            d.fill(False)

        # Store experience to replay buffer
        replay_buffer.store_vec(o, a, r, o2, d,
                                [inf["features"] for inf in info])

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling, assumes all subenvs end at the same time
        if np.all(d) or np.all(ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)

            if clean_every > 0 and epoch // clean_every >= clean_times:
                env.close()
                test_env.close()
                env = SubprocVecEnv(
                    [partial(env_fn, rank=i) for i in range(num_procs)],
                    "spawn")
                test_env = SubprocVecEnv(test_env_fns, "spawn")
                clean_times += 1

            o, ep_ret, ep_len = env.reset(), np.zeros(num_procs), np.zeros(
                num_procs)

        # Update handling
        if t >= update_after and t / update_every > update_times:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch, feats_keys=replay_buffer.feats_keys)
            update_times += 1

        # End of epoch handling
        if t // steps_per_epoch > epoch:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                # try:
                logger.save_state({'env_name': env_name}, None)
                # logger.save_state({'env': env}, None)
                #except:
                #logger.save_state({'env_name': env_name}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent(replay_buffer.feats_keys)

            # Update tensorboard
            if proc_id() == 0:
                log_perf_board = ['EpRet', 'EpLen', 'Q1Vals', 'Q2Vals'] + [
                    f"TestEpRet_{ti}" for ti in range(len(test_env_fns))
                ]
                log_loss_board = ['LogPi', 'LossPi', 'LossQ'] + [
                    key
                    for key in logger.epoch_dict.keys() if "LossFeats" in key
                ]
                log_board = {
                    'Performance': log_perf_board,
                    'Loss': log_loss_board
                }
                for key, value in log_board.items():
                    for val in value:
                        mean, std = logger.get_stats(val)
                        if key == 'Performance':
                            writer.add_scalar(key + '/Average' + val, mean,
                                              epoch)
                            writer.add_scalar(key + '/Std' + val, std, epoch)
                        else:
                            writer.add_scalar(key + '/' + val, mean, epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            if proc_id() == 0:
                writer.flush()

                import psutil
                # gives a single float value
                cpu_percent = psutil.cpu_percent()
                # gives an object with many fields
                mem_percent = psutil.virtual_memory().percent
                print(f"Used cpu avg {cpu_percent}% memory {mem_percent}%")
                cpu_separate = psutil.cpu_percent(percpu=True)
                for ci, cval in enumerate(cpu_separate):
                    print(f"\t cpu {ci}: {cval}%")
                # buf_size = replay_buffer.get_size()
                # print(f"Replay buffer size: {buf_size//1e6}MB {buf_size // 1e3} KB {buf_size % 1e3} B")
        t += num_procs

    if proc_id() == 0:
        writer.close()
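A hypothetical call sketch for the sac() trainer above (the supporting modules such as core, ReplayBuffer, and the loggers are not shown in the example, so this is illustrative only). Two requirements are visible in the code: env_fn must accept a rank keyword, because it is wrapped with partial(env_fn, rank=i), and each step's info dict must carry a "features" entry, since replay_buffer.store_vec() reads info["features"]. The wrapper, environment id, and feature name below are assumptions:

import gym

class FeatureInfoWrapper(gym.Wrapper):
    # Toy wrapper adding a "features" entry to info, as store_vec() expects.
    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        info["features"] = {"angular_velocity": float(obs[2])}
        return obs, reward, done, info

def my_env_fn(rank=0):
    return FeatureInfoWrapper(gym.make("Pendulum-v0"))

sac(env_fn=my_env_fn,
    env_name="Pendulum-v0",
    test_env_fns=[my_env_fn],
    num_procs=4,
    epochs=50,
    logger_kwargs=dict(exp_name="sac_demo"))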