import os
import os.path as osp

import joblib

from spinup.utils.run_utils import set_mujoco


def load_policy_and_env(fpath, itr='last', deterministic=False, device=None):
    """
    Load a policy from save, whether it's TF or PyTorch, along with the RL env.

    Not exceptionally future-proof, but it will suffice for basic uses of the
    Spinning Up implementations.

    Checks to see if there's a tf1_save folder. If yes, assumes the model is
    tensorflow and loads it that way. Otherwise, loads as if there's a PyTorch
    save.
    """

    # determine if tf save or pytorch save
    if any(['tf1_save' in x for x in os.listdir(fpath)]):
        backend = 'tf1'
    else:
        backend = 'pytorch'

    # handle which epoch to load from
    if itr == 'last':
        # check filenames for epoch (AKA iteration) numbers, find maximum value
        if backend == 'tf1':
            saves = [int(x[8:]) for x in os.listdir(fpath)
                     if 'tf1_save' in x and len(x) > 8]
        elif backend == 'pytorch':
            pytsave_path = osp.join(fpath, 'pyt_save')
            # Each file in this folder has naming convention 'model_XX.pt',
            # where 'XX' is either an integer or empty string. The empty-string
            # case ('model.pt') corresponds to len(x)==8, hence it is excluded.
            saves = [int(x.split('.')[0][6:]) for x in os.listdir(pytsave_path)
                     if len(x) > 8 and 'model_' in x]

        itr = f'{max(saves):06d}' if len(saves) > 0 else ''
    else:
        assert isinstance(itr, int), \
            "Bad value provided for itr (needs to be int or 'last')."
        itr = f'{itr:06d}'

    # load the get_action function
    if backend == 'tf1':
        get_action, model = load_tf_policy(fpath, itr, deterministic, device=device)
    else:
        get_action, model = load_pytorch_policy(fpath, itr, deterministic)

    # try to load environment from save
    # (sometimes this will fail because the environment could not be pickled)
    try:
        set_mujoco()
        state = joblib.load(osp.join(fpath, 'state', f'vars_{itr}.pkl'))
        env = state['env']
    except Exception:
        # the env could not be unpickled; callers must handle env=None
        env = None

    return env, get_action, model
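# A minimal usage sketch (the save directory below is hypothetical): load a
# saved env and policy, then roll out one episode with the returned get_action.
if __name__ == '__main__':
    env, get_action, model = load_policy_and_env('data/exp/exp_s0', itr='last')
    obs, done, ep_ret = env.reset(), False, 0
    while not done:
        obs, rew, done, _ = env.step(get_action(obs))
        ep_ret += rew
    print('EpRet: %.2f' % ep_ret)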
def thunk_plus():
    # Make 'env_fn' from 'env_name'
    if 'env_name' in kwargs:
        from spinup.utils.run_utils import set_mujoco
        set_mujoco()
        import gym
        env_name = kwargs['env_name']
        kwargs['env_fn'] = lambda: gym.make(env_name)
        del kwargs['env_name']

    # Fork into multiple processes
    mpi_fork(num_cpu)

    # Run thunk
    thunk(**kwargs)
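# Context sketch (hedged): in Spinning Up's run_utils, thunk_plus is defined
# inside call_experiment, which supplies `thunk`, `kwargs`, and `num_cpu` via
# closure. The zero-argument closure is then serialized so it can be executed
# in a fresh subprocess; encode_thunk below is a local sketch name.
import base64
import zlib

import cloudpickle

def encode_thunk(thunk_plus):
    # Pickle the closure, compress it, and base64-encode it so it can be
    # passed as a command-line argument to an entrypoint script.
    pickled = cloudpickle.dumps(thunk_plus)
    return base64.b64encode(zlib.compress(pickled)).decode('utf-8')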
from copy import deepcopy
import itertools
import math
import time

import numpy as np
import torch
import torch.autograd as autograd
import torch.nn.functional as F
from torch.optim import Adam
from tqdm import tqdm

# set_mujoco() is intentionally called before gym is imported
from spinup.utils.run_utils import set_mujoco
set_mujoco()
import gym

import spinup.algos.pytorch.egl.core as core
from spinup.utils.logx import EpochLogger
from .spline_model import SparseDenseAdamOptimizer


class ReplayBuffer:
    """
    A simple FIFO experience replay buffer for SAC agents.
    """

    def __init__(self, obs_dim, act_dim, size, device):
        self.device = device
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
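        # (Excerpt truncated above. Hedged continuation: the standard Spinning
        # Up SAC buffer also allocates reward/done buffers and FIFO pointers,
        # and stores transitions with a wrap-around index, as sketched below.)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Write one transition at the current pointer; once the buffer is
        # full, the oldest entry is overwritten (FIFO via wrap-around index).
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)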
import time

import numpy as np
import tensorflow as tf

from spinup.utils.run_utils import set_mujoco
set_mujoco()
import gym

import spinup.algos.tf1.vpg.core as core
from spinup.utils.logx import EpochLogger
from spinup.utils.mpi_tf import MpiAdamOptimizer, sync_all_params
from spinup.utils.mpi_tools import (mpi_fork, mpi_avg, proc_id,
                                    mpi_statistics_scalar, num_procs)


class VPGBuffer:
    """
    A buffer for storing trajectories experienced by a VPG agent interacting
    with the environment, and using Generalized Advantage Estimation
    (GAE-Lambda) for calculating the advantages of state-action pairs.
    """

    def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        """
        Append one timestep of agent-environment interaction to the buffer.
        """
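        # Hedged continuation: this matches the upstream Spinning Up VPG buffer.
        assert self.ptr < self.max_size    # buffer has to have room so you can store
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    # Hedged sketch of the GAE-Lambda step the class docstring refers to
    # (upstream Spinning Up does this in finish_path via core.discount_cumsum):
    #
    #     deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
    #     self.adv_buf[path_slice] = core.discount_cumsum(deltas, self.gamma * self.lam)
    #     self.ret_buf[path_slice] = core.discount_cumsum(rews, self.gamma)
    #
    # i.e. A_t = sum_l (gamma*lam)^l * (r_{t+l} + gamma*V(s_{t+l+1}) - V(s_{t+l})).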
from copy import deepcopy
import itertools
import math
import time

import numpy as np
import torch
import torch.autograd as autograd
import torch.nn.functional as F
from torch.optim import Adam
from tqdm import tqdm

from spinup.utils.run_utils import set_mujoco
set_mujoco()
import gym

import spinup.algos.pytorch.cegl.core as core
from spinup.utils.logx import EpochLogger
from .spline_model import SparseDenseAdamOptimizer

debug_a1 = None
debug_a2 = None


def sample_ellipsoid(S, z_hat, m_FA, Gamma_Threshold=1.0, min_dist=1e-2):
    bz, nz, _ = S.shape
    z_hat = z_hat.view(bz, nz, 1)

    # Draw m_FA standard-normal direction vectors per batch element.
    X_Cnz = torch.randn(bz, nz, m_FA, device=z_hat.device)
    # Root-sum-square norm of each vector, shape (bz, 1, m_FA) ...
    rss_array = torch.sqrt(torch.sum(torch.square(X_Cnz), dim=1, keepdim=True))
    # ... repeated along the nz dimension so it can normalize X_Cnz elementwise.
    kron_prod = rss_array.repeat_interleave(nz, dim=1)
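# Hedged sketch of the standard uniform-in-ellipsoid sampling technique that
# sample_ellipsoid appears to implement (the excerpt above is truncated):
# normalize Gaussian draws onto the unit sphere, rescale radii by U^(1/nz) for
# uniform coverage of the ball, then map through a Cholesky factor of S and
# recenter at z_hat. All names below are local to this sketch; it relies on
# the torch/math imports at the top of the file.
def sample_ellipsoid_sketch(S, z_hat, m_FA, Gamma_Threshold=1.0):
    bz, nz, _ = S.shape
    z_hat = z_hat.view(bz, nz, 1)
    X = torch.randn(bz, nz, m_FA, device=z_hat.device)
    X = X / X.norm(dim=1, keepdim=True)                 # points on the unit sphere
    radii = torch.rand(bz, 1, m_FA, device=z_hat.device) ** (1.0 / nz)
    ball_pts = math.sqrt(Gamma_Threshold) * radii * X   # uniform in the unit ball
    L = torch.linalg.cholesky(S)                        # S = L @ L^T
    # z = z_hat + L @ x satisfies (z - z_hat)^T S^{-1} (z - z_hat) <= Gamma.
    return z_hat + L @ ball_pts                         # (bz, nz, m_FA)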