Example #1
def load_policy_and_env(fpath, itr='last', deterministic=False, device=None):
    """
    Load a policy from save, whether it's TF or PyTorch, along with RL env.

    Not exceptionally future-proof, but it will suffice for basic uses of the
    Spinning Up implementations.

    Checks to see if there's a tf1_save folder. If yes, assumes the model
    is tensorflow and loads it that way. Otherwise, loads as if there's a
    PyTorch save.
    """

    # determine if tf save or pytorch save
    if any(['tf1_save' in x for x in os.listdir(fpath)]):
        backend = 'tf1'
    else:
        backend = 'pytorch'

    # handle which epoch to load from
    if itr == 'last':
        # check filenames for epoch (AKA iteration) numbers, find maximum value

        if backend == 'tf1':
            saves = [int(x[8:]) for x in os.listdir(fpath) if 'tf1_save' in x and len(x) > 8]

        elif backend == 'pytorch':
            pytsave_path = osp.join(fpath, 'pyt_save')
            # Each file in this folder has naming convention 'model_XX.pt',
            # where 'XX' is the iteration number. Strip the 'model_' prefix
            # and the '.pt' extension to recover that number; the len(x) > 8
            # check excludes a bare 'model.pt' save with no number attached.
            saves = [int(x.split('.')[0][6:]) for x in os.listdir(pytsave_path) if len(x) > 8 and 'model_' in x]

        itr = f'{max(saves):06d}' if len(saves) > 0 else ''

    else:
        assert isinstance(itr, int), \
            "Bad value provided for itr (needs to be int or 'last')."
        itr = f'{itr:06d}'

    # load the get_action function
    if backend == 'tf1':
        get_action, model = load_tf_policy(fpath, itr, deterministic, device=device)
    else:
        get_action, model = load_pytorch_policy(fpath, itr, deterministic)

    # try to load environment from save
    # (sometimes this will fail because the environment could not be pickled)

    set_mujoco()
    state = joblib.load(osp.join(fpath, 'state', f'vars_{itr}.pkl'))
    env = state['env']

    # try:
    #     set_mujoco()
    #     state = joblib.load(osp.join(fpath, 'vars'+itr+'.pkl'))
    #     env = state['env']
    # except:
    #     env = None

    return env, get_action, model
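
A hedged usage sketch of the loader above. In upstream Spinning Up this function lives in spinup.utils.test_policy; the import path, the save directory, and the episode loop here are illustrative assumptions, not part of the excerpt.

# Minimal usage sketch (illustrative only): load the most recent checkpoint
# from an experiment directory and roll out the policy for one episode.
from spinup.utils.test_policy import load_policy_and_env  # assumed import path

env, get_action, model = load_policy_and_env('/path/to/output_dir',  # placeholder path
                                             itr='last', deterministic=True)

obs, done, ep_ret = env.reset(), False, 0.0
while not done:
    act = get_action(obs)              # policy action for the current observation
    obs, rew, done, _ = env.step(act)  # old gym API: (obs, reward, done, info)
    ep_ret += rew
print('Episode return:', ep_ret)
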
Example #2
    def thunk_plus():
        # Make 'env_fn' from 'env_name'
        if 'env_name' in kwargs:
            from spinup.utils.run_utils import set_mujoco
            set_mujoco()
            import gym
            env_name = kwargs['env_name']
            kwargs['env_fn'] = lambda : gym.make(env_name)
            del kwargs['env_name']

        # Fork into multiple processes
        mpi_fork(num_cpu)

        # Run thunk
        thunk(**kwargs)
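
The local binding env_name = kwargs['env_name'] is what lets the lambda outlive the del on the following line: the closure captures the local string rather than looking it back up in kwargs. A standalone sketch of the same pattern; the helper name and 'CartPole-v1' are just for illustration.

import gym

def extract_env_fn(kwargs):
    # Same substitution as in thunk_plus: replace a string 'env_name'
    # entry with a zero-argument 'env_fn' that builds the environment.
    if 'env_name' in kwargs:
        env_name = kwargs['env_name']                  # bind the value to a local
        kwargs['env_fn'] = lambda: gym.make(env_name)  # closure over the local, not kwargs
        del kwargs['env_name']                         # safe: the lambda never reads kwargs
    return kwargs

cfg = extract_env_fn({'env_name': 'CartPole-v1', 'seed': 0})
env = cfg['env_fn']()   # creates a fresh gym environment on demand
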
Example #3
from copy import deepcopy
import itertools
import numpy as np
import torch
from torch.optim import Adam
from spinup.utils.run_utils import set_mujoco
set_mujoco()
import gym
import time
import spinup.algos.pytorch.egl.core as core
from spinup.utils.logx import EpochLogger
from .spline_model import SparseDenseAdamOptimizer
import math
import torch.nn.functional as F
from tqdm import tqdm
import torch.autograd as autograd


class ReplayBuffer:
    """
    A simple FIFO experience replay buffer for SAC agents.
    """
    def __init__(self, obs_dim, act_dim, size, device):

        self.device = device
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim),
                                dtype=np.float32)
        self.obs2_buf = np.zeros(core.combined_shape(size, obs_dim),
                                 dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(size, act_dim),
                                dtype=np.float32)
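
The excerpt above stops partway through the constructor. For context, here is a minimal sketch of how such a FIFO buffer is typically completed in Spinning Up-style SAC code; the remaining fields, store, and sample_batch follow the public Spinning Up implementation, while the device transfer is an assumption based on the constructor argument above. Scalar obs_dim/act_dim are assumed for brevity (the excerpt uses core.combined_shape to handle tuples).

# Sketch (not the excerpted class): remaining buffers plus FIFO store/sample.
import numpy as np
import torch

class ReplayBufferSketch:
    def __init__(self, obs_dim, act_dim, size, device):
        self.device = device
        self.obs_buf  = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.act_buf  = np.zeros((size, act_dim), dtype=np.float32)
        self.rew_buf  = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Overwrite the oldest entry once the buffer is full (FIFO behaviour).
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        batch = dict(obs=self.obs_buf[idxs], obs2=self.obs2_buf[idxs],
                     act=self.act_buf[idxs], rew=self.rew_buf[idxs],
                     done=self.done_buf[idxs])
        # Move the sampled batch to the device captured in the constructor.
        return {k: torch.as_tensor(v, dtype=torch.float32).to(self.device)
                for k, v in batch.items()}
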
Example #4
import numpy as np
import tensorflow as tf
from spinup.utils.run_utils import set_mujoco
set_mujoco()
import gym
import time
import spinup.algos.tf1.vpg.core as core
from spinup.utils.logx import EpochLogger
from spinup.utils.mpi_tf import MpiAdamOptimizer, sync_all_params
from spinup.utils.mpi_tools import mpi_fork, mpi_avg, proc_id, mpi_statistics_scalar, num_procs


class VPGBuffer:
    """
    A buffer for storing trajectories experienced by a VPG agent interacting
    with the environment, and using Generalized Advantage Estimation (GAE-Lambda)
    for calculating the advantages of state-action pairs.
    """

    def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
        self.obs_buf = np.zeros(core.combined_shape(size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros(core.combined_shape(size, act_dim), dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        """
        Append one timestep of agent-environment interaction to the buffer.
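
The class docstring mentions Generalized Advantage Estimation (GAE-Lambda). A self-contained sketch of that computation for a single finished trajectory; discount_cumsum mirrors the helper in Spinning Up's core modules, and the reward/value numbers are made up for illustration.

# Sketch of GAE-Lambda advantages and rewards-to-go for one trajectory.
import numpy as np
import scipy.signal

def discount_cumsum(x, discount):
    # [x0 + d*x1 + d^2*x2, x1 + d*x2, x2] for input [x0, x1, x2]
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

gamma, lam = 0.99, 0.95
rews = np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float32)       # made-up rewards r_t
vals = np.array([0.5, 0.4, 0.6, 0.2, 0.0], dtype=np.float32)  # V(s_t) plus bootstrap value

# TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
deltas = rews + gamma * vals[1:] - vals[:-1]
adv = discount_cumsum(deltas, gamma * lam)                     # GAE-Lambda advantages
ret = discount_cumsum(np.append(rews, vals[-1]), gamma)[:-1]   # rewards-to-go targets for V

print(adv, ret)
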
Example #5
from copy import deepcopy
import itertools
import numpy as np
import torch
from torch.optim import Adam
from spinup.utils.run_utils import set_mujoco
set_mujoco()
import gym
import time
import spinup.algos.pytorch.cegl.core as core
from spinup.utils.logx import EpochLogger
from .spline_model import SparseDenseAdamOptimizer
import torch.autograd as autograd
import math
import torch.nn.functional as F
from tqdm import tqdm

debug_a1 = None
debug_a2 = None


def sample_ellipsoid(S, z_hat, m_FA, Gamma_Threshold=1.0, min_dist=1e-2):
    # S: (bz, nz, nz) batch of ellipsoid matrices, z_hat: batch of centers,
    # m_FA: number of samples to draw per batch element.
    bz, nz, _ = S.shape
    z_hat = z_hat.view(bz, nz, 1)

    # Standard-normal draws, one nz-dimensional column per requested sample.
    X_Cnz = torch.randn(bz, nz, m_FA, device=z_hat.device)

    # Euclidean norm of each column (root sum of squares over the nz axis).
    rss_array = torch.sqrt(torch.sum(torch.square(X_Cnz), dim=1, keepdim=True))

    # Tile the norms along the nz axis so they line up with X_Cnz elementwise.
    kron_prod = rss_array.repeat_interleave(nz, dim=1)