Example #1

class AtariDreamerModel(AgentModel):
    def forward(self,
                observation: torch.Tensor,
                prev_action: torch.Tensor = None,
                prev_state: RSSMState = None):
        lead_dim, T, B, img_shape = infer_leading_dims(observation, 3)
        observation = observation.reshape(T * B, *img_shape).type(
            self.dtype) / 255.0 - 0.5
        prev_action = to_onehot(prev_action.reshape(T * B, ),
                                self.action_size,
                                dtype=self.dtype)
        if prev_state is None:
            prev_state = self.representation.initial_state(
                prev_action.size(0),
                device=prev_action.device,
                dtype=self.dtype)
        state = self.get_state_representation(observation, prev_action,
                                              prev_state)

        action, action_dist = self.policy(state)
        action = from_onehot(action)
        return_spec = ModelReturnSpec(action, state)
        return_spec = buffer_func(return_spec, restore_leading_dims, lead_dim,
                                  T, B)
        return return_spec


ModelReturnSpec = namedarraytuple('ModelReturnSpec', ['action', 'state'])
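The example above packages the model outputs in the ModelReturnSpec namedarraytuple so that buffer_func can apply restore_leading_dims to every field at once. A minimal sketch of that field-wise behavior (a toy tuple, not from the example):

import torch
from rlpyt.utils.collections import namedarraytuple

Point = namedarraytuple("Point", ["x", "y"])
p = Point(x=torch.zeros(5, 3), y=torch.ones(5, 2))
p.x.shape     # torch.Size([5, 3]) -- ordinary named-field access
p[0].y.shape  # torch.Size([2])    -- indexing the tuple indexes every field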
Example #2
import torch
import math

from rlpyt.distributions.base import Distribution
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.tensor import valid_mean

EPS = 1e-8

DistInfo = namedarraytuple("DistInfo", ["mean"])
DistInfoStd = namedarraytuple("DistInfoStd", ["mean", "log_std"])


class Gaussian(Distribution):
    """Multivariate Gaussian with independent variables (diagonal covariance).
    Standard deviation can be provided, as a scalar or as a value per dimension,
    or it will be drawn from the dist_info (possibly learnable), where it is
    expected to have a value per dimension.
    Noise clipping or sample clipping is optional during sampling, but it is not
    accounted for in the formulas (e.g. entropy).
    Clipping of the standard deviation is optional and is accounted for in the formulas.
    Squashing of samples to squash * tanh(sample) is optional and is accounted
    for in the log_likelihood formula but not in entropy.
    """
    def __init__(
            self,
            dim,
            std=None,
            clip=None,
            noise_clip=None,
            min_std=None,
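The docstring above says that when std is None the standard deviation is read from dist_info. A hedged usage sketch, assuming the constructor keeps rlpyt's defaults for the arguments not shown here:

dist = Gaussian(dim=4)  # std=None: per-dimension std comes from DistInfoStd.log_std
info = DistInfoStd(mean=torch.zeros(8, 4), log_std=torch.full((8, 4), -1.0))
action = dist.sample(info)                # shape (8, 4)
logp = dist.log_likelihood(action, info)  # per-sample log-probability
ent = dist.entropy(info)                  # per-sample entropy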
Example #3
import torch
from collections import namedtuple

from rlpyt.algos.base import RlAlgorithm
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.logging import logger
from rlpyt.replays.non_sequence.frame import (UniformReplayFrameBuffer,
    PrioritizedReplayFrameBuffer, AsyncUniformReplayFrameBuffer,
    AsyncPrioritizedReplayFrameBuffer)
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.tensor import select_at_indexes, valid_mean
from rlpyt.algos.utils import valid_from_done

OptInfo = namedtuple("OptInfo", ["loss", "gradNorm", "tdAbsErr"])
SamplesToBuffer = namedarraytuple("SamplesToBuffer",
    ["observation", "action", "reward", "done"])


class DQN(RlAlgorithm):
    """
    DQN algorithm training from a replay buffer, with options for double-DQN,
    n-step returns, and prioritized replay.
    """

    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            discount=0.99,
            batch_size=32,
            min_steps_learn=int(5e4),
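The SamplesToBuffer namedarraytuple defined above is how the algorithm hands sampler output to the replay buffer. A hedged sketch of that packaging step (`samples` and `replay_buffer` are placeholders with the usual rlpyt layout):

samples_to_buffer = SamplesToBuffer(
    observation=samples.env.observation,
    action=samples.agent.action,
    reward=samples.env.reward,
    done=samples.env.done,
)
replay_buffer.append_samples(samples_to_buffer)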
Example #4
from collections import namedtuple

from torch.nn.parallel import DistributedDataParallel as DDP
# from torch.nn.parallel import DistributedDataParallelCPU as DDPC  # Deprecated

from rlpyt.agents.base import BaseAgent, AgentStep
from rlpyt.models.qpg.mlp import QofMuMlpModel, PiMlpModel
from rlpyt.utils.quick_args import save__init__args
from rlpyt.distributions.gaussian import Gaussian, DistInfoStd
from rlpyt.utils.buffer import buffer_to
from rlpyt.utils.logging import logger
from rlpyt.models.utils import update_state_dict
from rlpyt.utils.collections import namedarraytuple

MIN_LOG_STD = -20
MAX_LOG_STD = 2

AgentInfo = namedarraytuple("AgentInfo", ["dist_info"])
Models = namedtuple("Models", ["pi", "q1", "q2", "v"])


class SacAgent(BaseAgent):
    """Agent for SAC algorithm, including action-squashing, using twin Q-values."""
    def __init__(
            self,
            ModelCls=PiMlpModel,  # Pi model.
            QModelCls=QofMuMlpModel,
            model_kwargs=None,  # Pi model.
            q_model_kwargs=None,
            v_model_kwargs=None,
            initial_model_state_dict=None,  # All models.
            pretrain_std=0.75,  # With squash 0.75 is near uniform.
    ):
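The AgentInfo namedarraytuple above carries the policy's DistInfoStd through the sampler. A hedged sketch of how a sampling step typically wraps it (mu, log_std, and action are placeholder tensors):

agent_info = AgentInfo(dist_info=DistInfoStd(mean=mu, log_std=log_std))
agent_step = AgentStep(action=action, agent_info=agent_info)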
Example #5
import torch
import torch.nn.functional as F

from rlpyt.models.mlp import MlpModel
from rlpyt.ul.models.dmlab_conv2d import DmlabConv2dModel
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.logging import logger
from rlpyt.utils.tensor import infer_leading_dims, restore_leading_dims

RnnState = namedarraytuple(
    "RnnState", ["h", "c"])  # For downstream namedarraytuples to work


def weight_init(m):
    if isinstance(m, (torch.nn.Linear, torch.nn.Conv2d)):
        torch.nn.init.kaiming_normal_(m.weight,
                                      mode="fan_in",
                                      nonlinearity="relu")
        torch.nn.init.zeros_(m.bias)


class DmlabPgLstmModel(torch.nn.Module):
    def __init__(
        self,
        image_shape,
        output_size,
        lstm_size,
        skip_connections=True,
        hidden_sizes=None,
        kiaming_init=True,
        stop_conv_grad=False,
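weight_init above has the signature expected by torch.nn.Module.apply. A small usage sketch (the layers are illustrative):

layer = torch.nn.Linear(128, 64)
weight_init(layer)             # Kaiming fan-in init, bias zeroed in place
encoder = torch.nn.Sequential(torch.nn.Conv2d(3, 16, 3), torch.nn.ReLU())
encoder.apply(weight_init)     # applied recursively; non-matching modules are skipped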
Example #6
import torch

from rlpyt.agents.base import (AgentStep, RecurrentAgentMixin, 
    AlternatingRecurrentAgentMixin)
from rlpyt.agents.dqn.dqn_agent import DqnAgent
from rlpyt.utils.buffer import buffer_to, buffer_func, buffer_method
from rlpyt.utils.collections import namedarraytuple


AgentInfo = namedarraytuple("AgentInfo", ["q", "prev_rnn_state"])


class R2d1AgentBase(DqnAgent):
    """Base agent for recurrent DQN (to add recurrent mixin)."""

    def __call__(self, observation, prev_action, prev_reward, init_rnn_state):
        # Assume init_rnn_state already shaped: [N,B,H]
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward,
            init_rnn_state), device=self.device)
        output = self.model(*model_inputs) # q, rnn_state
        return output  # Leave rnn state on device.

    def to_agent_step(self, output):
        """Convert the output of the NN model into step info for the agent.
        """
        q, rnn_state = output
        # q = q.cpu()
        action = self.distribution.sample(q)
        prev_rnn_state = self.prev_rnn_state or buffer_func(rnn_state, torch.zeros_like)
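The method is cut off here; a hedged sketch of how this pattern usually finishes in rlpyt-style R2D1 agents (an assumption, not this file's actual code): move the stored RNN state to CPU for the sampler buffers and package everything into the namedarraytuples defined above.

prev_rnn_state = buffer_to(prev_rnn_state, device="cpu")
agent_info = AgentInfo(q=q, prev_rnn_state=prev_rnn_state)
agent_step = AgentStep(action=action, agent_info=agent_info)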
Example #7
"""
Methods to override on the saved replay buffer, so that it returns
different samples than the replay buffer object that was used to collect
them would return.
"""
import numpy as np

from rlpyt.utils.buffer import torchify_buffer, buffer_func
from rlpyt.utils.misc import extract_sequences
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.logging import logger

SamplesFromReplay = namedarraytuple("SamplesFromReplay",
    ["observation", "action", "reward", "done", "prev_action", "prev_reward"])
SamplesFromReplayPC = namedarraytuple("SamplesFromReplayPC",
    SamplesFromReplay._fields + ("pixctl_return",))


class UlForRlReplayBuffer:

    def __init__(
            self,
            replay_buffer,
            replay_T=1,
            validation_split=0.0,
            pixel_control_buffer=None,
        ):
        self.load_replay(replay_buffer, pixel_control_buffer)
        self.replay_T = replay_T
        self.validation_t = int((self.T - replay_T) * (1 - validation_split))
        if pixel_control_buffer is not None:
Example #8
from abc import ABC

from rlpyt.algos.pg.ppo import PPO
from rlpyt.agents.base import AgentInputs
from rlpyt.utils.tensor import valid_mean
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.buffer import buffer_to, buffer_method
from rlpyt.utils.misc import iterate_mb_idxs
from rlpyt.utils.collections import namedarraytuple

from intrinsic_rl.algos.pg.base import IntrinsicPolicyGradientAlgo

import cv2  ###

LossInputs = namedarraytuple("LossInputs", [
    "agent_inputs", "action", "next_obs", "ext_return", "ext_adv",
    "int_return", "int_adv", "valid", "old_dist_info"
])
OptInfo = namedarraytuple("OptInfo", [
    "loss", "policyLoss", "valueLoss", "entropyLoss", "bonusLoss",
    "extrinsicValue", "intrinsicValue", "intrinsicReward",
    "discountedIntrinsicReturn", "gradNorm", "entropy", "perplexity",
    "meanObsRmsModel", "varObsRmsModel", "meanIntRetRmsModel",
    "varIntRetRmsModel"
])


class IntrinsicPPO(PPO, IntrinsicPolicyGradientAlgo, ABC):
    """
    Abstract base class for PPO using an intrinsic bonus model.
    Must override abstract method ``extract_bonus_inputs`` based on
    specific intrinsic bonus model / algorithm to be used.
Example #9
from collections import namedtuple

from gpytorch.mlls import ExactMarginalLogLikelihood

from rlpyt.algos.base import RlAlgorithm
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.logging import logger
from rlpyt.replays.model_based import ModelBasedBuffer
from rlpyt.utils.collections import namedarraytuple
from rlpyt.agents.base import AgentInputs
from rlpyt.utils.tensor import valid_mean
from rlpyt.utils.visom import VisdomLinePlotter
from rlpyt.algos.utils import valid_from_done

OptInfo = namedtuple("OptInfo",
    ["muLoss", "dLoss", "muGradNorm", "dGradNorm"])

SamplesToBuffer = namedarraytuple("SamplesToBuffer",
    ["observation", "prev_observation", "action", "reward", "done", "timeout"])

class GP_Mlp(RlAlgorithm):
    """Model-based algorithm that uses a Gaussian Process to learn the dynamics
    model and a deep neural network for control."""

    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            discount=0.99,
            batch_size=500,
            buffer_size=int(1e6),
            min_steps_learn=int(1e1), # very efficient
            target_update_tau=0.9,
            target_update_interval=5,
Example #10
File: base.py Project: keirp/glamor
from rlpyt.utils.collections import namedarraytuple

AgentInfo = namedarraytuple("AgentInfo", ["dist_info", "value"])
AgentInfoRnn = namedarraytuple("AgentInfoRnn",
                               ["dist_info", "value", "prev_rnn_state"])
Example #11
"""
This class has been abstracted to be independent of any specific environment (e.g. Atari);
its subclasses may still be tied to a specific environment.
"""
import torch

from rlpyt.agents.base import BaseAgent, AgentStep
from rlpyt.agents.dqn.epsilon_greedy import EpsilonGreedyAgentMixin
from rlpyt.distributions.epsilon_greedy import EpsilonGreedy
from rlpyt.models.utils import strip_ddp_state_dict
from rlpyt.utils.buffer import buffer_to
from rlpyt.utils.collections import namedarraytuple
from rlpyt.models.utils import update_state_dict


AgentInfo = namedarraytuple("AgentInfo", "q")


class DqnAgent(EpsilonGreedyAgentMixin, BaseAgent):

    def __call__(self, observation, prev_action, prev_reward):
        """
        __call__ makes an instance of the class callable like a function: if agent is a
        DqnAgent object, then agent(observation, prev_action, prev_reward) is equivalent
        to calling agent.__call__(observation, prev_action, prev_reward).
        """
        prev_action = self.distribution.to_onehot(prev_action)
        model_inputs = buffer_to((observation, prev_action, prev_reward), device=self.device)
        q = self.model(*model_inputs)  # The model is an instance of a torch.nn.Module subclass; calling it uses torch.nn.Module.__call__, which computes the model output (a Tensor).
        return q.cpu()  # Move the tensor to the CPU (host memory).

    def initialize(self, env_spaces, share_memory=False, global_B=1, env_ranks=None):
        """
Example #12
import math

import numpy as np

from rlpyt.replays.n_step import BaseNStepReturnBuffer
from rlpyt.utils.buffer import buffer_from_example, buffer_func, torchify_buffer
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.misc import extract_sequences

SamplesFromReplay = namedarraytuple(
    "SamplesFromReplay",
    [
        "all_observation",
        "all_action",
        "all_reward",
        "return_",
        "done",
        "done_n",
        "init_rnn_state",
    ],
)

SamplesToBuffer = None


class SequenceNStepReturnBuffer(BaseNStepReturnBuffer):
    """Base n-step return buffer for sequence replays.  Includes storage of
    the agent's recurrent (RNN) state.

    Using ``rnn_state_interval>1`` stores the RNN state only periodically, to
    save memory.  The replay mechanism must account for the
Example #13
from rlpyt.utils.collections import namedarraytuple

AgentInfo = namedarraytuple("AgentInfo", ["dist_info", "value"])
AgentInfoTwin = namedarraytuple("AgentInfoTwin", ["dist_info", "dist_int_info", "value", "int_value"])
AgentInfoRnn = namedarraytuple("AgentInfoRnn", ["dist_info", "value", "prev_rnn_state"])
AgentInfoRnnTwin = namedarraytuple("AgentInfoRnnTwin", [
                                    "dist_info", "dist_int_info", 
                                    "value", "int_value", 
                                    "prev_rnn_state", "prev_int_rnn_state"])
IcmInfo = namedarraytuple("IcmInfo", [])
NdigoInfo = namedarraytuple("NdigoInfo", ["prev_gru_state"])
RndInfo = namedarraytuple("RndInfo", [])
Example #14
import multiprocessing as mp
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.parallel import DistributedDataParallelCPU as DDPC

from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.synchronize import RWLock
from rlpyt.utils.logging import logger
from rlpyt.models.utils import strip_ddp_state_dict

AgentInputs = namedarraytuple("AgentInputs",
    ["observation", "prev_action", "prev_reward"])
AgentStep = namedarraytuple("AgentStep", ["action", "agent_info"])


class BaseAgent:

    recurrent = False
    alternating = False

    def __init__(self, ModelCls=None, model_kwargs=None, initial_model_state_dict=None):
        save__init__args(locals())
        self.model = None  # type: torch.nn.Module
        self.shared_model = None
        self.distribution = None
        self.device = torch.device("cpu")
        self._mode = None
        if self.model_kwargs is None:
            self.model_kwargs = dict()
        # The rest only for async operations:
Example #15
import numpy as np
from collections import namedtuple

from rlpyt.utils.collections import namedarraytuple, AttrDict

Samples = namedarraytuple("Samples", ["agent", "env"])

AgentSamples = namedarraytuple("AgentSamples",
                               ["action", "prev_action", "agent_info"])
AgentSamplesBsv = namedarraytuple(
    "AgentSamplesBsv",
    ["action", "prev_action", "agent_info", "bootstrap_value"])
EnvSamples = namedarraytuple("EnvSamples", [
    "reward", "prev_reward", "observation", "next_observation", "done",
    "env_info"
])


class BatchSpec(namedtuple("BatchSpec", "T B")):
    """
    T: int  Number of time steps, >=1.
    B: int  Number of separate trajectory segments (i.e. # env instances), >=1.
    """
    __slots__ = ()

    @property
    def size(self):
        return self.T * self.B
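
# Quick check of the BatchSpec docstring above (not part of the original file;
# the numbers are illustrative).
spec = BatchSpec(T=5, B=4)   # 5 time steps across 4 environment instances
assert spec.size == 20       # size is simply T * B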


class TrajInfo(AttrDict):
Example #16
    AsyncUniformReplayBuffer)
from rlpyt.replays.non_sequence.time_limit import (TlUniformReplayBuffer,
    AsyncTlUniformReplayBuffer)
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.buffer import buffer_to
from rlpyt.distributions.gaussian import Gaussian
from rlpyt.distributions.gaussian import DistInfo as GaussianDistInfo
from rlpyt.utils.tensor import valid_mean
from rlpyt.algos.utils import valid_from_done


OptInfo = namedtuple("OptInfo",
    ["q1Loss", "q2Loss", "piLoss",
    "q1GradNorm", "q2GradNorm", "piGradNorm",
    "q1", "q2", "piMu", "piLogStd", "qMeanDiff", "alpha"])
SamplesToBuffer = namedarraytuple("SamplesToBuffer",
    ["observation", "action", "reward", "done"])
SamplesToBufferTl = namedarraytuple("SamplesToBufferTl",
    SamplesToBuffer._fields + ("timeout",))


class SAC(RlAlgorithm):
    """Soft actor critic algorithm, training from a replay buffer."""

    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            discount=0.99,
            batch_size=256,
            min_steps_learn=int(1e4),
            replay_size=int(1e6),
Example #17
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
# from torch.nn.parallel import DistributedDataParallelCPU as DDPC  # Deprecated

from rlpyt.agents.base import BaseAgent, AgentStep
from rlpyt.utils.quick_args import save__init__args
from rlpyt.distributions.gaussian import Gaussian, DistInfo
from rlpyt.utils.buffer import buffer_to
from rlpyt.utils.logging import logger
from rlpyt.models.qpg.mlp import MuMlpModel, QofMuMlpModel
from rlpyt.models.utils import update_state_dict
from rlpyt.utils.collections import namedarraytuple


AgentInfo = namedarraytuple("AgentInfo", ["mu"])


class DdpgAgent(BaseAgent):
    """Agent for deep deterministic policy gradient algorithm."""

    shared_mu_model = None

    def __init__(
            self,
            ModelCls=MuMlpModel,  # Mu model.
            QModelCls=QofMuMlpModel,
            model_kwargs=None,  # Mu model.
            q_model_kwargs=None,
            initial_model_state_dict=None,  # Mu model.
            initial_q_model_state_dict=None,
Example #18
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.tensor import infer_leading_dims, restore_leading_dims

from ul_gen.algos.discrete_sac_ae import DiscreteSACAE
from ul_gen.configs.discrete_sac_ae_config import configs
from ul_gen.agents.discrete_sac_ae_agent import DiscreteSacAEAgent

import argparse
import os

import torch
from rlpyt.utils.launching.affinity import (affinity_from_code, encode_affinity,
    prepend_run_slot)
parser = argparse.ArgumentParser()
parser.add_argument("--savepath", type=str, default="./ae_data/")

args = parser.parse_args()

os.makedirs(args.savepath, exist_ok=True)

EmptyAgentInfo = namedarraytuple("EmptyAgentInfo", [])
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

affinity_code = encode_affinity(
    n_cpu_core=4,
    n_gpu=1,
    n_socket=1,
)

affinity = affinity_from_code(prepend_run_slot(0, affinity_code))
# Get Params
config = configs["discrete_sac_ae"]

# Setup the data collection pipeline
# Edit the sampler kwargs to get a larger batch size
config["sampler"]["batch_T"] = 24
Example #19
import numpy as np

from rlpyt.replays.n_step import BaseNStepReturnBuffer
from rlpyt.agents.base import AgentInputs
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.buffer import torchify_buffer

SamplesFromReplay = namedarraytuple(
    "SamplesFromReplay",
    ["agent_inputs", "action", "return_", "done", "done_n", "target_inputs"])


class NStepReturnBuffer(BaseNStepReturnBuffer):
    """Definition of what fields are replayed from basic n-step return buffer."""
    def extract_batch(self, T_idxs, B_idxs):
        """From buffer locations `[T_idxs,B_idxs]`, extract data needed for
        training, including target values at `T_idxs + n_step_return`.  Returns
        namedarraytuple of torch tensors (see file for all fields).  Each tensor
        has leading batch dimension ``len(T_idxs)==len(B_idxs)``, but individual
        samples are drawn, so no leading time dimension."""
        s = self.samples
        target_T_idxs = (T_idxs + self.n_step_return) % self.T
        batch = SamplesFromReplay(
            agent_inputs=AgentInputs(
                observation=self.extract_observation(T_idxs, B_idxs),
                prev_action=s.action[T_idxs - 1, B_idxs],
                prev_reward=s.reward[T_idxs - 1, B_idxs],
            ),
            action=s.action[T_idxs, B_idxs],
            return_=self.samples_return_[T_idxs, B_idxs],
            done=self.samples.done[T_idxs, B_idxs],
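The docstring above notes that target values are read at `T_idxs + n_step_return`; the modulo by `self.T` wraps those indexes around the circular buffer. A standalone sketch of just that index arithmetic (sizes are illustrative):

import numpy as np

T = 100                        # time length of the circular buffer
n_step_return = 5
T_idxs = np.array([3, 97, 99])
target_T_idxs = (T_idxs + n_step_return) % T
print(target_T_idxs)           # [8 2 4] -- indexes past the end wrap to the start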
Example #20
import torch

from rlpyt.algos.base import RlAlgorithm
from rlpyt.utils.collections import namedarraytuple

from dreamer.utils.module import get_parameters, FreezeParameters

torch.autograd.set_detect_anomaly(True)  # used for debugging gradients


loss_info_fields = ['model_loss',
                    'actor_loss',
                    'value_loss',
                    'prior_entropy',
                    'post_entropy',
                    'divergence',
                    'reward_loss',
                    'image_loss',
                    'bisim_loss',
                    'pcont_loss']
LossInfo = namedarraytuple('LossInfo', loss_info_fields)
OptInfo = namedarraytuple("OptInfo",
                          ['loss',
                           'grad_norm_model',
                           'grad_norm_actor',
                           'grad_norm_value'] + loss_info_fields)


class Dreamer(RlAlgorithm):

    def __init__(
            self,  # Hyper-parameters
            batch_size=50,
            batch_length=50,
            train_every=1000,
            train_steps=100,
Example #21
import torch
import torch.distributions as td
import torch.nn as nn
import torch.nn.functional as tf
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.buffer import buffer_method

from dreamer.utils.module import FreezeParameters

RSSMState = namedarraytuple('RSSMState', ['mean', 'std', 'stoch', 'deter'])


def stack_states(rssm_states: list, dim):
    return RSSMState(
        torch.stack([state.mean for state in rssm_states], dim=dim),
        torch.stack([state.std for state in rssm_states], dim=dim),
        torch.stack([state.stoch for state in rssm_states], dim=dim),
        torch.stack([state.deter for state in rssm_states], dim=dim),
    )


def get_feat(rssm_state: RSSMState):
    return torch.cat((rssm_state.stoch, rssm_state.deter), dim=-1)


def get_dist(rssm_state: RSSMState):
    return td.independent.Independent(
        td.Normal(rssm_state.mean, rssm_state.std), 1)
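
# Hedged usage sketch (not from the original file); the feature sizes are
# illustrative, roughly in the range of common Dreamer settings.
example_state = RSSMState(
    mean=torch.zeros(1, 30), std=torch.ones(1, 30),
    stoch=torch.zeros(1, 30), deter=torch.zeros(1, 200),
)
feat = get_feat(example_state)   # concatenates stoch and deter -> shape (1, 230)
dist = get_dist(example_state)   # Independent Normal over the 30 stochastic dims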


class TransitionBase(nn.Module):
Example #22
import torch

from rlpyt.distributions.base import Distribution
from rlpyt.distributions.discrete import DiscreteMixin
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.tensor import valid_mean, select_at_indexes

EPS = 1e-8

DistInfo = namedarraytuple("DistInfo", ["prob"])


class Categorical(DiscreteMixin, Distribution):
    def kl(self, old_dist_info, new_dist_info):
        p = old_dist_info.prob
        q = new_dist_info.prob
        return torch.sum(p * (torch.log(p + EPS) - torch.log(q + EPS)), dim=-1)

    def mean_kl(self, old_dist_info, new_dist_info, valid=None):
        return valid_mean(self.kl(old_dist_info, new_dist_info), valid)

    def sample(self, dist_info):
        p = dist_info.prob
        sample = torch.multinomial(p.view(-1, self.dim), num_samples=1)
        return sample.view(p.shape[:-1]).type(self.dtype)  # Returns indexes.

    def entropy(self, dist_info):
        p = dist_info.prob
        return -torch.sum(p * torch.log(p + EPS), dim=-1)

    def log_likelihood(self, indexes, dist_info):
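The kl and entropy methods above implement the standard categorical formulas, with EPS guarding the logarithms. A quick numeric check (probabilities are illustrative; the constructor takes dim via DiscreteMixin):

dist = Categorical(dim=3)
p = DistInfo(prob=torch.tensor([[0.5, 0.3, 0.2]]))
q = DistInfo(prob=torch.tensor([[0.4, 0.4, 0.2]]))
dist.kl(p, q)      # ~0.025 nats
dist.entropy(p)    # ~1.03 nats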
Example #23
import math

from rlpyt.replays.sequence.n_step import (SequenceNStepReturnBuffer,
                                           SamplesFromReplay)
from rlpyt.replays.async_ import AsyncReplayBufferMixin
from rlpyt.replays.sum_tree import SumTree, AsyncSumTree
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.buffer import torchify_buffer, numpify_buffer

SamplesFromReplayPri = namedarraytuple(
    "SamplesFromReplayPri", SamplesFromReplay._fields + ("is_weights", ))


class PrioritizedSequenceReplay:
    """Prioritized experience replay of sequences using sum-tree prioritization.
    The size of the sum-tree is based on the number of RNN states stored,
    since valid sequences must start with an RNN state.  Hence using periodic
    storage with ``rnn_state_interval>1`` results in a faster tree using less
    memory.  Replay buffer priorities are indexed to the start of the whole sequence
    to be returned, regardless of whether the initial part is used only as RNN warmup.

    Requires ``batch_T`` to be set and fixed at instantiation, so that the
    priority tree has a fixed scheme for which samples are temporarily
    invalid due to the looping cursor (the tree must set and propagate 0-priorities
    for those samples, so dynamic ``batch_T`` could require additional tree
    operations for every sampling event).

    Parameter ``input_priority_shift`` is used to assign input priorities to a
    starting time-step which is shifted from the samples input to
    ``append_samples()``.  For example, in R2D1, using replay sequences of 120
Example #24
    AsyncUniformSequenceReplayFrameBuffer, PrioritizedSequenceReplayFrameBuffer
from rlpyt.utils.buffer import torchify_buffer, numpify_buffer
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.misc import extract_sequences
import traceback

Transition = recordclass('Transition',
                         ('timestep', 'state', 'action', 'reward', 'value',
                          'policy', 'nonterminal'))
blank_trans = Transition(0, torch.zeros(84, 84,
                                        dtype=torch.uint8), 0, 0., 0., 0,
                         False)  # TODO: Set appropriate default policy value
blank_batch_trans = Transition(0, torch.zeros(1, 84, 84, dtype=torch.uint8), 0,
                               0., 0., 0, False)

PrioritizedSamples = namedarraytuple("PrioritizedSamples",
                                     ["samples", "priorities"])
SamplesToBuffer = namedarraytuple(
    "SamplesToBuffer",
    ["observation", "action", "reward", "done", "policy_probs", "value"])
EPS = 1e-6


def samples_to_buffer(observation,
                      action,
                      reward,
                      done,
                      policy_probs,
                      value,
                      priorities=None):
    samples = SamplesToBuffer(observation=observation,
                              action=action,
Example #25
File: ppo.py Project: keirp/glamor
import torch

from rlpyt.algos.pg.base import PolicyGradientAlgo, OptInfo
from rlpyt.agents.base import AgentInputs, AgentInputsRnn
from rlpyt.utils.tensor import valid_mean
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.buffer import buffer_to, buffer_method
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.misc import iterate_mb_idxs

LossInputs = namedarraytuple("LossInputs", [
    "agent_inputs", "action", "return_", "advantage", "valid", "old_dist_info"
])


class PPO(PolicyGradientAlgo):
    """
    Proximal Policy Optimization algorithm.  Trains the agent by taking
    multiple epochs of gradient steps on minibatches of the training data at
    each iteration, with advantages computed by generalized advantage
    estimation.  Uses clipped likelihood ratios in the policy loss.
    """
    def __init__(
        self,
        discount=0.99,
        learning_rate=0.001,
        value_loss_coeff=1.,
        entropy_loss_coeff=0.01,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        clip_grad_norm=1.,
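The docstring above mentions clipped likelihood ratios; a standalone sketch of that piece of the policy loss (ratio_clip plays the role of PPO's epsilon, and the tensors are placeholders):

ratio = torch.exp(new_logp - old_logp)                 # pi_new / pi_old per sample
clipped = torch.clamp(ratio, 1. - ratio_clip, 1. + ratio_clip)
pi_loss = -valid_mean(torch.min(ratio * advantage, clipped * advantage), valid)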
Example #26
import multiprocessing as mp

from rlpyt.agents.base import AgentInputs
from rlpyt.samplers.parallel.base import ParallelSamplerBase
from rlpyt.samplers.parallel.gpu.action_server import ActionServer
from rlpyt.samplers.parallel.gpu.collectors import (GpuResetCollector,
                                                    GpuEvalCollector)
from rlpyt.utils.collections import namedarraytuple, AttrDict
from rlpyt.utils.synchronize import drain_queue
from rlpyt.utils.buffer import buffer_from_example, torchify_buffer

StepBuffer = namedarraytuple(
    "StepBuffer", ["observation", "action", "reward", "done", "agent_info"])


class GpuSamplerBase(ParallelSamplerBase):
    """Base class for parallel samplers which use worker processes to execute
    environment steps on CPU resources but the master process to execute agent
    forward passes for action selection, presumably on GPU.  Use GPU-based
    collector classes.

    In addition to the usual batch buffer for data samples, allocates a step
    buffer over shared memory, which is used for communication with workers.
    The step buffer includes `observations`, which the workers write and the
    master reads, and `actions`, which the master writes and the workers read.
    (The step buffer has leading dimension [`batch_B`], for the number of 
    parallel environments, and each worker gets its own slice along that
    dimension.)  The step buffer object is held in both numpy array and torch
    tensor forms over the same memory; e.g. workers write to the numpy array
    form, and the agent is able to read the torch tensor form.
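The docstring describes holding the step buffer in both numpy and torch forms over the same memory; a minimal sketch of that pattern using the utilities imported above (`example` and `batch_B` are placeholders for one step of data and the number of environments):

step_buffer_np = buffer_from_example(example, (batch_B,), share_memory=True)
step_buffer_pyt = torchify_buffer(step_buffer_np)  # torch view sharing the numpy memory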
Example #27
import torch
import torch.nn as nn

from rlpyt.utils.tensor import infer_leading_dims, restore_leading_dims
from rlpyt.utils.collections import namedarraytuple
from rlpyt.models.conv2d import Conv2dHeadModel
from rlpyt.models.mlp import MlpModel
from rlpyt.models.dqn.dueling import DuelingHeadModel

RnnState = namedarraytuple("RnnState", ["h"])


class GRUModel(torch.nn.Module):
    """2D convolutional neural network (for multiple video frames per
    observation) feeding into a GRU and an MLP output head for Q-values over
    the action set.  Able to track intermediate variables."""
    def __init__(
        self,
        image_shape,
        output_size,
        fc_size=512,  # Between conv and lstm.
        lstm_size=512,
        head_size=512,
        use_recurrence=True,
        dueling=False,
        use_maxpool=False,
        channels=None,  # None uses default.
        kernel_sizes=None,
        strides=None,
        paddings=None,
    ):
Example #28
import torch
from qec.vmpo.v_mpo import VMPO, OptInfo
from rlpyt.agents.base import AgentInputs, AgentInputsRnn
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.buffer import buffer_to, buffer_method
from rlpyt.utils.collections import namedarraytuple, namedtuple
from qec.vmpo.on_policy_replay import AsyncUniformSequenceReplayBuffer

LossInputs = namedarraytuple("LossInputs",
                             ["dist_info", "value", "action", "return_", "advantage", "valid", "old_dist_info"])

SamplesToBuffer = namedarraytuple("SamplesToBuffer",
                                  ['agent_inputs', "action", "reward", "done", "dist_info"])
SamplesToBufferTl = namedarraytuple("SamplesToBufferTl",
                                    SamplesToBuffer._fields + ("timeout",))

SamplesToBufferRnn = namedarraytuple("SamplesToBufferRnn", SamplesToBuffer._fields + ("prev_rnn_state",))

OptInfo = namedarraytuple("OptInfo", OptInfo._fields + ("optim_buffer_wait_time",))


class AsyncVMPO(VMPO):
    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            batch_B=64,
            batch_T=40,
            **kwargs
    ):
        super().__init__(**kwargs)
Example #29
File: r2d1.py Project: zikkat/rlpyt
from collections import namedtuple

from rlpyt.algos.dqn.dqn import DQN, SamplesToBuffer
from rlpyt.agents.base import AgentInputs
from rlpyt.utils.quick_args import save__init__args
from rlpyt.utils.logging import logger
from rlpyt.utils.collections import namedarraytuple
from rlpyt.replays.sequence.frame import (
    UniformSequenceReplayFrameBuffer, PrioritizedSequenceReplayFrameBuffer,
    AsyncUniformSequenceReplayFrameBuffer,
    AsyncPrioritizedSequenceReplayFrameBuffer)
from rlpyt.utils.tensor import select_at_indexes, valid_mean
from rlpyt.algos.utils import valid_from_done, discount_return_n_step
from rlpyt.utils.buffer import buffer_to, buffer_method, torchify_buffer

OptInfo = namedtuple("OptInfo", ["loss", "gradNorm", "tdAbsErr", "priority"])
SamplesToBufferRnn = namedarraytuple(
    "SamplesToBufferRnn", SamplesToBuffer._fields + ("prev_rnn_state", ))
PrioritiesSamplesToBuffer = namedarraytuple("PrioritiesSamplesToBuffer",
                                            ["priorities", "samples"])


class R2D1(DQN):
    """Recurrent-replay DQN with options for: Double-DQN, Dueling Architecture,
    n-step returns, prioritized_replay."""

    opt_info_fields = tuple(f for f in OptInfo._fields)  # copy

    def __init__(
            self,
            discount=0.997,
            batch_T=80,
            batch_B=64,
Example #30
import math

from rlpyt.replays.base import BaseReplayBuffer
from rlpyt.replays.non_sequence.frame import (
    PrioritizedReplayFrameBuffer,
    UniformReplayFrameBuffer,
)
from rlpyt.replays.sum_tree import SumTree
from rlpyt.utils.buffer import (
    buffer_from_example,
    buffer_func,
    get_leading_dims,
    torchify_buffer,
)
from rlpyt.utils.collections import namedarraytuple
from rlpyt.utils.misc import extract_sequences

SamplesFromReplay = namedarraytuple(
    "SamplesFromReplay", ["observation", "action", "reward", "done"]
)


class RlWithUlUniformReplayBuffer(BaseReplayBuffer):
    def __init__(self, example, size, B, replay_T):
        self.T = T = math.ceil(size / B)
        self.B = B
        self.size = T * B
        self.t = 0  # cursor
        self.replay_T = replay_T
        self.samples = buffer_from_example(example, (T, B), share_memory=self.async_)
        self._buffer_full = False

    def append_samples(self, samples):
        T, B = get_leading_dims(samples, n_dim=2)
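For reference, the capacity arithmetic in __init__ above rounds the requested size up to a whole (T, B) block; a quick check with illustrative numbers:

size, B = 1_000_000, 16
T = math.ceil(size / B)    # 62500 time steps per environment slot
assert T * B >= size       # actual capacity is T * B, at least the requested size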