Example #1
    def test_map_structure_up_to(self, map_structure_up_to):
        shallow_nest = [[None], None]
        inp_val = [[1], 2]
        out = map_structure_up_to(shallow_nest, lambda x: 2 * x, inp_val)
        self.assertEqual(out, [[2], 4])

        shallow_nest = [None, None]
        inp_val = [[1], 2]
        out = map_structure_up_to(shallow_nest, lambda x: 2 * x, inp_val)
        self.assertEqual(out, [[1, 1], 4])

        data_list = [[2, 4, 6, 8], [[1, 3, 5, 7, 9], [3, 5, 7]]]
        name_list = ['evens', ['odds', 'primes']]
        out = map_structure_up_to(
            name_list, lambda name, sec: "first_{}_{}".format(len(sec), name),
            name_list, data_list)
        self.assertEqual(out,
                         ['first_4_evens', ['first_5_odds', 'first_3_primes']])

        ab_tuple = namedtuple("ab_tuple", "a, b")
        op_tuple = namedtuple("op_tuple", "add, mul")
        inp_val = ab_tuple(a=2, b=3)
        inp_ops = ab_tuple(a=op_tuple(add=1, mul=2), b=op_tuple(add=2, mul=3))
        out = map_structure_up_to(
            inp_val, lambda val, ops: (val + ops.add) * ops.mul, inp_val,
            inp_ops)
        self.assertEqual(out, ab_tuple(a=6, b=15))
Example #2
    def test_transform_image(self):
        shape = [10]
        observation = tf.zeros(shape, dtype=tf.uint8)
        common.image_scale_transformer(observation)

        T1 = namedtuple('T1', ['x', 'y'])
        T2 = namedtuple('T2', ['a', 'b', 'c'])
        T3 = namedtuple('T3', ['l', 'm'])
        observation = T1(x=T2(a=tf.ones(shape, dtype=tf.uint8) * 255,
                              b=T3(l=tf.zeros(shape, dtype=tf.uint8))))
        transformed_observation = common.image_scale_transformer(
            observation, fields=["x.a", "x.b.l"])

        tf.debugging.assert_equal(transformed_observation.x.a,
                                  tf.ones(shape, dtype=tf.float32))
        tf.debugging.assert_equal(transformed_observation.x.b.l,
                                  tf.ones(shape, dtype=tf.float32) * -1)

        with self.assertRaises(Exception) as _:
            common.image_scale_transformer(observation,
                                           fields=["x.b.m"])  # empty ()

        observation = dict(x=dict(a=observation.x.a))
        common.image_scale_transformer(observation, fields=["x.a"])
Example #3
# limitations under the License.

import gin

import torch

import alf
from alf.algorithms.algorithm import Algorithm
from alf.data_structures import TimeStep, namedtuple, AlgStep, LossInfo
from alf.networks import EncodingNetwork
from alf.nest.utils import NestConcat
from alf.tensor_specs import TensorSpec
from alf.utils import math_ops
from alf.utils.normalizers import ScalarAdaptiveNormalizer, AdaptiveNormalizer

ICMInfo = namedtuple("ICMInfo", ["reward", "loss"])


@gin.configurable
class ICMAlgorithm(Algorithm):
    """Intrinsic Curiosity Module

    This module generates the intrinsic reward based on the prediction error of
    the observation.

    See Pathak et al "Curiosity-driven Exploration by Self-supervised Prediction"
    """

    def __init__(self,
                 action_spec,
                 observation_spec=None,
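
The ICM docstring above describes the intrinsic reward as a prediction error. As a minimal sketch of that idea (the `forward_net` callable and the 0.5 scaling are illustrative assumptions, not necessarily ALF's exact implementation):

import torch

def prediction_error_reward(forward_net, feature, action, next_feature):
    # forward_net is an assumed model that predicts the next feature from the
    # current feature and action; the squared error drives the curiosity reward.
    pred_next = forward_net(torch.cat([feature, action], dim=-1))
    return 0.5 * (pred_next - next_feature).pow(2).mean(dim=-1)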
Example #4
            current_frame (int): not used.
        Returns:
            np.ndarray: The shape is [num_channels, image_size_y, image_size_x],
                where num_channels is 3 for rgb sensor, and 1 for other sensors.
        """
        return self._image


NumpyWaypoint = namedtuple(
    "NumpyWaypoint",
    [
        'id',  # int
        'location',  # [3] (x, y, z)
        'rotation',  # [3] (pitch, yaw, roll)
        'road_id',  # int
        'section_id',  # int
        'lane_id',  # int
        'is_junction',  # bool
        'lane_width',  # float
        'lane_change',  # int (carla.LaneChange) whether lane change is allowed. 0: None, 1: Right, 2: Left, 3: Both
        'lane_type',  # int (carla.LaneType)
        'right_lane_marking',  # int (carla.LaneMarking)
        'left_lane_marking',  # int (carla.LaneMarking)
    ])


def _to_numpy_loc(loc: carla.Location):
    return np.array([loc.x, loc.y, loc.z], dtype=np.float64)


def _to_carla_loc(loc):
    return carla.Location(float(loc[0]), float(loc[1]), float(loc[2]))
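
A quick round trip through the two helpers above, as a usage sketch (requires the carla package; the coordinate values are made up):

loc = carla.Location(1.0, 2.0, 3.0)
arr = _to_numpy_loc(loc)     # -> np.array([1., 2., 3.])
loc2 = _to_carla_loc(arr)    # -> carla.Location(x=1.0, y=2.0, z=3.0)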
Example #5
# limitations under the License.

import gin

import torch

import alf
from alf.algorithms.algorithm import Algorithm
from alf.data_structures import AlgStep, LossInfo, namedtuple, TimeStep, StepType
from alf.networks import EncodingNetwork
from alf.tensor_specs import BoundedTensorSpec, TensorSpec
from alf.utils.tensor_utils import to_tensor
from alf.utils import math_ops
from alf.utils.normalizers import AdaptiveNormalizer, ScalarAdaptiveNormalizer

DIAYNInfo = namedtuple("DIAYNInfo", ["reward", "loss"])


@gin.configurable
def create_discrete_skill_spec(num_of_skills):
    return BoundedTensorSpec((), dtype="int64", maximum=num_of_skills - 1)


@gin.configurable
class DIAYNAlgorithm(Algorithm):
    """Diversity is All You Need Module

    This module learns a set of skill-conditional policies in an unsupervised
    way. See Eysenbach et al "Diversity is All You Need: Learning Diverse Skills
    without a Reward Function" for more details.
    """
Example #6
from alf.data_structures import LossInfo, namedtuple
from alf.networks import EncodingNetwork, StableNormalProjectionNetwork, CategoricalProjectionNetwork
from alf.utils import dist_utils, tensor_utils, summary_utils
from alf.utils.losses import element_wise_squared_loss

ModelOutput = namedtuple(
    'ModelOutput',
    [
        'value',  # [B], value for player 0
        'reward',  # [B], reward for player 0
        'game_over',  # [B], whether the game is over

        # [B, K, ...], candidate actions; () means all available discrete actions
        'actions',

        # [B, K], probabilities of the candidate actions; a probability of 0 indicates an invalid action
        'action_probs',

        # [B, ...], latent state
        'state',

        # used by calc_loss
        'action_distribution',
        # used by calc_loss
        'game_over_logit'
    ])

ModelTarget = namedtuple(
    'ModelTarget',
    [
        # reward for the previously taken action and the next unroll_steps actions
Example #7
from alf.algorithms.config import TrainerConfig
from alf.algorithms.off_policy_algorithm import OffPolicyAlgorithm
from alf.algorithms.one_step_loss import OneStepTDLoss
from alf.algorithms.rl_algorithm import RLAlgorithm
from alf.data_structures import TimeStep, Experience, LossInfo, namedtuple
from alf.data_structures import AlgStep, StepType
from alf.nest import nest
import alf.nest.utils as nest_utils
from alf.networks import ActorDistributionNetwork, CriticNetwork
from alf.networks import QNetwork, QRNNNetwork
from alf.tensor_specs import TensorSpec, BoundedTensorSpec
from alf.utils import losses, common, dist_utils, math_ops

ActionType = Enum('ActionType', ('Discrete', 'Continuous', 'Mixed'))

SacActionState = namedtuple("SacActionState", ["actor_network", "critic"],
                            default_value=())

SacCriticState = namedtuple("SacCriticState", ["critics", "target_critics"])

SacState = namedtuple("SacState", ["action", "actor", "critic"],
                      default_value=())

SacCriticInfo = namedtuple("SacCriticInfo", ["critics", "target_critic"])

SacActorInfo = namedtuple("SacActorInfo", ["actor_loss", "neg_entropy"],
                          default_value=())

SacInfo = namedtuple("SacInfo",
                     ["action_distribution", "actor", "critic", "alpha"],
                     default_value=())
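
The `default_value=()` keyword used above is part of ALF's `namedtuple` wrapper: fields omitted at construction fall back to that value, which allows partially filled state/info tuples. A small sketch, assuming the wrapper behaves as its argument name suggests:

from alf.data_structures import namedtuple

Demo = namedtuple("Demo", ["actor", "critic"], default_value=())
state = Demo(actor=1)
assert state.critic == ()  # unset field falls back to the default ()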
Example #8
from tf_agents.agents.ddpg.critic_rnn_network import CriticRnnNetwork
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.networks.actor_distribution_rnn_network import ActorDistributionRnnNetwork
from tf_agents.trajectories.time_step import StepType
from tf_agents.utils import common as tfa_common

from alf.algorithms.ddpg_algorithm import create_ou_process
from alf.algorithms.on_policy_algorithm import OnPolicyAlgorithm
from alf.data_structures import ActionTimeStep, LossInfo, PolicyStep, TrainingInfo
from alf.data_structures import namedtuple
from alf.utils import common, dist_utils, losses
from alf.utils.summary_utils import safe_mean_hist_summary

SarsaState = namedtuple('SarsaState', [
    'prev_observation', 'prev_step_type', 'actor', 'target_actor', 'critic',
    'target_critic'
],
                        default_value=())
SarsaInfo = namedtuple(
    'SarsaInfo', ['action_distribution', 'actor_loss', 'critic', 'returns'])
SarsaLossInfo = namedtuple('SarsaLossInfo', ['actor', 'critic'],
                           default_value=())


@gin.configurable
class SarsaAlgorithm(OnPolicyAlgorithm):
    """SARSA Algorithm.

    SARSA updates the Q function in an online manner using the following loss:
        ||Q(s_t, a_t) - stop_gradient(r_t + \gamma * Q(s_{t+1}, a_{t+1}))||^2
    See https://en.wikipedia.org/wiki/State-action-reward-state-action
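
A minimal sketch of the TD loss in the docstring, written in PyTorch purely for brevity (the excerpt itself is the TF-based version of ALF; the names are illustrative):

import torch

def sarsa_critic_loss(q, reward, next_q, gamma=0.99):
    # ||Q(s_t, a_t) - stop_gradient(r_t + gamma * Q(s_{t+1}, a_{t+1}))||^2
    td_target = (reward + gamma * next_q).detach()  # detach == stop gradient
    return (q - td_target) ** 2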
Example #9
import torch

from alf.algorithms.actor_critic_algorithm import ActorCriticAlgorithm
from alf.algorithms.algorithm import Algorithm
from alf.algorithms.agent_helpers import AgentHelper
from alf.algorithms.config import TrainerConfig
from alf.algorithms.entropy_target_algorithm import EntropyTargetAlgorithm
from alf.algorithms.icm_algorithm import ICMAlgorithm
from alf.algorithms.on_policy_algorithm import OnPolicyAlgorithm
from alf.algorithms.rl_algorithm import RLAlgorithm
from alf.data_structures import AlgStep, Experience
from alf.data_structures import TimeStep, namedtuple
from alf.utils import math_ops

AgentState = namedtuple("AgentState",
                        ["obs_trans", "rl", "irm", "goal_generator", "repr"],
                        default_value=())

AgentInfo = namedtuple(
    "AgentInfo", ["rl", "irm", "goal_generator", "entropy_target", "repr"],
    default_value=())


@gin.configurable
class Agent(OnPolicyAlgorithm):
    """Agent is a master algorithm that integrates different algorithms together.
    """
    def __init__(self,
                 observation_spec,
                 action_spec,
                 env=None,
Example #10
from tf_agents.agents.ddpg.critic_rnn_network import CriticRnnNetwork
from tf_agents.networks.q_network import QNetwork
from tf_agents.networks.q_rnn_network import QRnnNetwork
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.networks.actor_distribution_rnn_network import ActorDistributionRnnNetwork
from tf_agents.networks.network import Network, DistributionNetwork
from tf_agents.utils import common as tfa_common

from alf.algorithms.off_policy_algorithm import OffPolicyAlgorithm
from alf.algorithms.one_step_loss import OneStepTDLoss
from alf.algorithms.rl_algorithm import RLAlgorithm
from alf.data_structures import ActionTimeStep, Experience, LossInfo, namedtuple
from alf.data_structures import PolicyStep, TrainingInfo
from alf.utils import losses, common, dist_utils

SacShareState = namedtuple("SacShareState", ["actor"])

SacActorState = namedtuple("SacActorState", ["critic1", "critic2"])

SacCriticState = namedtuple(
    "SacCriticState",
    ["critic1", "critic2", "target_critic1", "target_critic2"])

SacState = namedtuple("SacState", ["share", "actor", "critic"])

SacActorInfo = namedtuple("SacActorInfo", ["loss"])

SacCriticInfo = namedtuple("SacCriticInfo",
                           ["critic1", "critic2", "target_critic"])

SacAlphaInfo = namedtuple("SacAlphaInfo", ["loss"])
Example #11
import numpy as np
import torch
import torch.nn as nn

import alf
from alf import data_structures as ds
from alf.data_structures import namedtuple
from alf.nest.utils import convert_device
from alf.utils.common import warning_once
from alf.utils.data_buffer import atomic, RingBuffer
from alf.utils import checkpoint_utils

from .segment_tree import SumSegmentTree, MaxSegmentTree, MinSegmentTree

BatchInfo = namedtuple("BatchInfo",
                       ["env_ids", "positions", "importance_weights"],
                       default_value=())


@gin.configurable
class ReplayBuffer(RingBuffer):
    """Replay buffer with RingBuffer as implementation.

    Terminology: consistent with RingBuffer, we use ``pos`` to refer to the
    always increasing position of an element in the infinitly long buffer,
    and ``idx`` as the actual index of the element in the underlying store
    (``_buffer``).  That means ``idx == pos % _max_length`` is always true,
    and one should use ``_buffer[idx]`` to retrieve the stored data.
    """

    ONE_MINUS = np.float32(1) - np.finfo(np.float32).eps
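
The pos/idx terminology in the docstring can be spelled out with a tiny sketch (the max_length value is an assumption for illustration):

max_length = 4
for pos in range(10):
    idx = pos % max_length  # the docstring's invariant: idx == pos % _max_length
    # _buffer[idx] is where the element written at position `pos` is stored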
Example #12
# limitations under the License.

import gin
import numpy as np
import functools

import torch

import alf
from alf.algorithms.rl_algorithm import RLAlgorithm
from alf.data_structures import (TimeStep, Experience, LossInfo, namedtuple,
                                 AlgStep, StepType)
from alf.tensor_specs import TensorSpec, BoundedTensorSpec
import alf.utils.common as common

GoalState = namedtuple("GoalState", ["goal"], default_value=())
GoalInfo = namedtuple("GoalInfo", ["goal", "loss"], default_value=())


@gin.configurable
class RandomCategoricalGoalGenerator(RLAlgorithm):
    """Random Goal Generation Module.

    This module generates a random categorical goal for the agent
    at the beginning of every episode.
    """

    def __init__(self,
                 observation_spec,
                 num_of_goals,
                 name="RandomCategoricalGoalGenerator"):
Example #13
import numpy as np

import alf
from alf.algorithms.algorithm import Algorithm
from alf.algorithms.off_policy_algorithm import OffPolicyAlgorithm
from alf.algorithms.sac_algorithm import SacAlgorithm
from alf.algorithms.config import TrainerConfig
from alf.data_structures import TimeStep, Experience, namedtuple, AlgStep
from alf.data_structures import make_experience, LossInfo
from alf.tensor_specs import BoundedTensorSpec, TensorSpec
from alf.utils.conditional_ops import conditional_update
from alf.utils import common, summary_utils, tensor_utils

ActionRepeatState = namedtuple(
    "ActionRepeatState", [
        "rl", "action", "steps", "k", "rl_discount", "rl_reward",
        "sample_rewards", "repr"
    ],
    default_value=())


@gin.configurable
class DynamicActionRepeatAgent(OffPolicyAlgorithm):
    """Create an agent which learns a variable action repetition duration.
    At each decision step, the agent outputs both the action to repeat and
    the number of steps to repeat. These two quantities together constitute the
    action of the agent. We use SAC with mixed action type for training.

    The core idea is similar to `Learning to Repeat: Fine Grained Action Repetition for Deep Reinforcement Learning <http://arxiv.org/abs/1702.06054>`_.
    """

    def __init__(self,
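
The docstring says the agent's action is the pair (action to repeat, number of repeat steps). A hypothetical sketch of such a mixed action spec, using the same `BoundedTensorSpec` style seen elsewhere in these examples (the shapes and K below are assumptions):

from alf.tensor_specs import BoundedTensorSpec

K = 5  # maximum repetition length (illustrative)
repeat_spec = BoundedTensorSpec((), dtype="int64", maximum=K - 1)
inner_action_spec = BoundedTensorSpec((3, ), minimum=-1.0, maximum=1.0)
mixed_action_spec = (repeat_spec, inner_action_spec)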
Example #14
"""A generic generator."""

import gin
import numpy as np
import torch

from alf.algorithms.algorithm import Algorithm
from alf.algorithms.mi_estimator import MIEstimator
from alf.data_structures import AlgStep, LossInfo, namedtuple
import alf.nest as nest
from alf.networks import Network, EncodingNetwork
from alf.tensor_specs import TensorSpec
from alf.utils import common, math_ops
from alf.utils.averager import AdaptiveAverager

GeneratorLossInfo = namedtuple("GeneratorLossInfo",
                               ["generator", "mi_estimator"])


@gin.configurable
class Generator(Algorithm):
    """Generator

    Generator generates outputs given `inputs` (which can be None) by transforming
    random noise and the input using `net`:

        outputs = net([noise, input]) if input is not None
                  else net(noise)

    The generator is trained to minimize the following objective:

        :math:`E(loss\_func(net([noise, input]))) - entropy\_regularization \cdot H(P)`
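
A minimal sketch of that objective, with every name passed in as a parameter; `entropy_estimate` stands in for H(P), which in practice needs an estimator such as the MIEstimator imported above (this is an assumption, not ALF's exact implementation):

def generator_objective(net, loss_func, noise, inputs, entropy_reg, entropy_estimate):
    outputs = net([noise, inputs]) if inputs is not None else net(noise)
    # E(loss_func(outputs)) - entropy_regularization * H(P)
    return loss_func(outputs).mean() - entropy_reg * entropy_estimate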
Example #15
"""Unittests for nest.py"""

import torch

from absl.testing import parameterized
import collections

import alf
import alf.nest as nest
import cnest
from alf.data_structures import namedtuple
from alf.tensor_specs import TensorSpec
from alf.nest.utils import NestConcat, NestSum, NestMultiply
from alf.nest import transform_nest

NTuple = namedtuple('NTuple', ['a', 'b'])  # default value will be None


class TestIsNested(parameterized.TestCase, alf.test.TestCase):
    @parameterized.parameters(nest.is_nested, cnest._is_nested)
    def test_is_nested(self, is_nested):
        self.assertFalse(is_nested(1))
        self.assertFalse(is_nested(None))
        self.assertTrue(is_nested(dict(x=1)))
        self.assertTrue(is_nested([1]))
        ntuple = NTuple(a=1, b=NTuple(a=NTuple(a=(2, ), b=[3]), b=dict(x=2)))
        self.assertTrue(is_nested(ntuple))


class TestFlatten(parameterized.TestCase, alf.test.TestCase):
    @parameterized.parameters(nest.py_flatten, cnest.flatten)
Example #16
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Actor critic algorithm."""

import gin

from alf.algorithms.on_policy_algorithm import OnPolicyAlgorithm
from alf.networks import ActorDistributionNetwork, ValueNetwork
from alf.algorithms.actor_critic_loss import ActorCriticLoss
from alf.data_structures import TimeStep, AlgStep, namedtuple
from alf.utils import common, dist_utils
from .config import TrainerConfig

ActorCriticState = namedtuple("ActorCriticState", ["actor", "value"],
                              default_value=())

ActorCriticInfo = namedtuple("ActorCriticInfo",
                             ["action_distribution", "value"])


@gin.configurable
class ActorCriticAlgorithm(OnPolicyAlgorithm):
    """Actor critic algorithm."""
    def __init__(self,
                 observation_spec,
                 action_spec,
                 actor_network_ctor=ActorDistributionNetwork,
                 value_network_ctor=ValueNetwork,
                 env=None,
                 config: TrainerConfig = None,
Example #17
import alf
from alf.algorithms.sac_algorithm import SacAlgorithm
from alf.algorithms.agent_helpers import AgentHelper
from alf.algorithms.config import TrainerConfig
from .skill_generator import SkillGenerator, SubTrajectory
from alf.algorithms.on_policy_algorithm import OnPolicyAlgorithm
from alf.data_structures import AlgStep, Experience
from alf.data_structures import TimeStep, namedtuple
from alf.nest.utils import transform_nest
from alf.utils import math_ops
from alf.data_structures import StepType
from alf.tensor_specs import BoundedTensorSpec, TensorSpec
from alf.networks.preprocessors import EmbeddingPreprocessor
from alf.utils.conditional_ops import conditional_update

AgentState = namedtuple("AgentState", ["rl", "skill_generator"],
                        default_value=())

AgentInfo = namedtuple("AgentInfo",
                       ["rl", "skill_generator", "skill_discount"],
                       default_value=())


@gin.configurable
def get_low_rl_input_spec(observation_spec, action_spec, num_steps_per_skill,
                          skill_spec):
    assert observation_spec.ndim == 1 and action_spec.ndim == 1
    concat_observation_spec = TensorSpec(
        (num_steps_per_skill * observation_spec.shape[0], ))
    concat_action_spec = TensorSpec(
        (num_steps_per_skill * action_spec.shape[0], ))
    traj_spec = SubTrajectory(observation=concat_observation_spec,
Example #18
import alf
from alf.algorithms.config import TrainerConfig
from alf.algorithms.off_policy_algorithm import OffPolicyAlgorithm
from alf.algorithms.one_step_loss import OneStepTDLoss
from alf.algorithms.rl_algorithm import RLAlgorithm
from alf.algorithms.sac_algorithm import _set_target_entropy
from alf.data_structures import TimeStep, Experience, LossInfo, namedtuple
from alf.data_structures import AlgStep
from alf.nest import nest
from alf.networks import MdqCriticNetwork
from alf.tensor_specs import TensorSpec, BoundedTensorSpec
from alf.utils import (losses, common, dist_utils, math_ops, spec_utils,
                       tensor_utils)

MdqCriticState = namedtuple("MdqCriticState", ['critic', 'target_critic'])
MdqCriticInfo = namedtuple("MdqCriticInfo", [
    "critic_free_form", "target_critic_free_form", "critic_adv_form",
    "distill_target", "kl_wrt_prior"
])

MdqState = namedtuple("MdqState", ['critic'])
MdqAlphaInfo = namedtuple("MdqAlphaInfo", ["alpha_loss", "neg_entropy"])
MdqInfo = namedtuple("MdqInfo", ["critic", "alpha"], default_value=())

MdqLossInfo = namedtuple('MdqLossInfo', ['critic', 'distill', 'alpha'])


@gin.configurable
class MdqAlgorithm(OffPolicyAlgorithm):
    """Multi-Dimentional Q-Learning Algorithm.
Example #19
import alf
from alf.algorithms.algorithm import Algorithm
from alf.algorithms.sac_algorithm import SacAlgorithm, SacLossInfo
from alf.algorithms.config import TrainerConfig
from alf.data_structures import TimeStep, Experience, namedtuple, AlgStep
from alf.data_structures import make_experience, LossInfo, StepType
from alf.networks import EncodingNetwork
import alf.nest.utils as nest_utils
from alf.tensor_specs import BoundedTensorSpec, TensorSpec
from alf.utils.conditional_ops import conditional_update
from alf.utils import dist_utils, math_ops, common, losses, tensor_utils
from alf.nest.utils import NestConcat
from alf.networks.preprocessors import EmbeddingPreprocessor

SubTrajectory = namedtuple('SubTrajectory', ["observation", "prev_action"],
                           default_value=())

DiscriminatorTimeStep = namedtuple('DiscTimeStep', [
    "step_type",
    "observation",
    "state",
    "env_id",
    "batch_info",
    "prev_action",
    "reward",
],
                                   default_value=())

DiscriminatorState = namedtuple(
    "DiscriminatorState",
    ["untrans_observation", "subtrajectory", "first_observation"],
Example #20
from alf.algorithms.off_policy_algorithm import OffPolicyAlgorithm
from alf.algorithms.one_step_loss import OneStepTDLoss
from alf.algorithms.rl_algorithm import RLAlgorithm
from alf.data_structures import (AlgStep, Experience, LossInfo, namedtuple,
                                 TimeStep)
from alf.nest import nest
from alf.networks import ActorDistributionNetwork, CriticNetwork
from alf.tensor_specs import TensorSpec, BoundedTensorSpec
from alf.utils import losses, common, dist_utils, tensor_utils
from alf.utils.math_ops import add_ignore_empty

from alf.algorithms.dynamics_learning_algorithm import DynamicsLearningAlgorithm
from alf.algorithms.reward_learning_algorithm import RewardEstimationAlgorithm
from alf.algorithms.planning_algorithm import PlanAlgorithm

MbrlState = namedtuple("MbrlState", ["dynamics", "reward", "planner"])
MbrlInfo = namedtuple("MbrlInfo", ["dynamics", "reward", "planner"],
                      default_value=())


@gin.configurable
class MbrlAlgorithm(OffPolicyAlgorithm):
    """Model-based RL algorithm
    """
    def __init__(self,
                 observation_spec,
                 feature_spec,
                 action_spec,
                 dynamics_module: DynamicsLearningAlgorithm,
                 reward_module: RewardEstimationAlgorithm,
                 planner_module: PlanAlgorithm,
Example #21
from typing import Callable

import alf
from alf.algorithms.config import TrainerConfig
from alf.algorithms.off_policy_algorithm import OffPolicyAlgorithm
from alf.algorithms.one_step_loss import OneStepTDLoss
from alf.algorithms.rl_algorithm import RLAlgorithm
from alf.data_structures import TimeStep, Experience, LossInfo, namedtuple
from alf.data_structures import AlgStep, StepType
from alf.nest import nest
import alf.nest.utils as nest_utils
from alf.networks import ActorNetwork, CriticNetwork
from alf.tensor_specs import TensorSpec, BoundedTensorSpec
from alf.utils import losses, common, dist_utils, math_ops, spec_utils

DdpgCriticState = namedtuple("DdpgCriticState",
                             ['critics', 'target_actor', 'target_critics'])
DdpgCriticInfo = namedtuple("DdpgCriticInfo", ["q_values", "target_q_values"])
DdpgActorState = namedtuple("DdpgActorState", ['actor', 'critics'])
DdpgState = namedtuple("DdpgState", ['actor', 'critics'])
DdpgInfo = namedtuple("DdpgInfo",
                      ["action_distribution", "actor_loss", "critic"],
                      default_value=())
DdpgLossInfo = namedtuple('DdpgLossInfo', ('actor', 'critic'))


@gin.configurable
class DdpgAlgorithm(OffPolicyAlgorithm):
    """Deep Deterministic Policy Gradient (DDPG).

    Reference:
    Lillicrap et al "Continuous control with deep reinforcement learning"
Example #22
import alf
from alf.algorithms.algorithm import Algorithm
from alf.algorithms.sac_algorithm import _set_target_entropy
from alf.algorithms.one_step_loss import OneStepTDLoss
from alf.algorithms.rl_algorithm import RLAlgorithm
from alf.algorithms.on_policy_algorithm import OnPolicyAlgorithm
from alf.data_structures import (AlgStep, Experience, experience_to_time_step,
                                 LossInfo, namedtuple, StepType, TimeStep)
from alf.networks import Network
from alf.utils import common, dist_utils, losses, math_ops, spec_utils, tensor_utils
from alf.utils.summary_utils import safe_mean_hist_summary
import alf.nest.utils as nest_utils

SarsaState = namedtuple('SarsaState', [
    'prev_observation', 'prev_step_type', 'actor', 'critics', 'target_critics',
    'noise'
],
                        default_value=())
SarsaInfo = namedtuple('SarsaInfo', [
    'action_distribution', 'actor_loss', 'critics', 'target_critics',
    'neg_entropy'
],
                       default_value=())
SarsaLossInfo = namedtuple('SarsaLossInfo',
                           ['actor', 'critic', 'alpha', 'neg_entropy'])

nest_map = alf.nest.map_structure


@gin.configurable
class SarsaAlgorithm(OnPolicyAlgorithm):
Example #23
import os
import glob
from scipy.interpolate import interp1d
from scipy.signal import savgol_filter

import matplotlib
import matplotlib.pyplot as plt
# Style gallery: https://tonysyu.github.io/raw_content/matplotlib-style-gallery/gallery.html
plt.style.use('seaborn-colorblind')

import alf.nest as nest
from alf.data_structures import namedtuple

HOME = os.getenv("HOME")

MeanCurve = namedtuple("MeanCurve", ['x', 'y', 'min_y', 'max_y', 'name'],
                       default_value=())


class MeanCurveReader(object):
    """Read and compute a MeanCurve from one or multiple TB event files.
    """
    _SIZE_GUIDANCE = {
        'compressedHistograms': 10,
        'images': 0,
        'scalars': 100,  # sampled points are evenly distributed over the training time
        'histograms': 1
    }

    def _get_metric_name(self):
        raise NotImplementedError()
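
For reference, an illustrative `MeanCurve` instance (the values are made up): `x` is the shared x-axis, `y` the mean curve, and `min_y`/`max_y` the band across runs:

import numpy as np

curve = MeanCurve(x=np.linspace(0., 100., 5),
                  y=np.zeros(5),
                  min_y=-np.ones(5),
                  max_y=np.ones(5),
                  name="demo")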
Example #24
from absl import logging
import gin
import numpy as np
import tensorflow as tf

from tf_agents.trajectories.time_step import StepType

from alf.algorithms.algorithm import Algorithm, AlgorithmStep
from alf.data_structures import namedtuple, LossInfo
from alf.utils import dist_utils
from alf.utils.averager import ScalarWindowAverager
from alf.utils.common import run_if, should_record_summaries
from alf.utils.dist_utils import calc_default_target_entropy
from alf.utils.dist_utils import calc_default_max_entropy

EntropyTargetLossInfo = namedtuple("EntropyTargetLossInfo", ["neg_entropy"])
EntropyTargetInfo = namedtuple("EntropyTargetInfo", ["step_type", "loss"])


@gin.configurable
class EntropyTargetAlgorithm(Algorithm):
    """Algorithm for adjust entropy regularization.

    It tries to adjust the entropy regularization (i.e. alpha) so that the
    entropy is not smaller than `target_entropy`.

    The algorithm has three stages:
    0. init stage. This is an optional stage. If the initial entropy is already
       below `max_entropy`, then this stage is skipped. Otherwise, the alpha will
       be slowly decreased so that the entropy will land at `max_entropy` to
       trigger the next `free stage`. Basically, this stage lets the user choose
Example #25
from alf.experience_replayers.replay_buffer import BatchInfo, ReplayBuffer
from alf.nest import nest
from alf.nest.utils import convert_device
from alf.networks import Network, LSTMEncodingNetwork
from alf.utils import common, dist_utils, spec_utils, tensor_utils
from alf.utils.normalizers import AdaptiveNormalizer
from alf.utils.summary_utils import safe_mean_hist_summary, safe_mean_summary

PredictiveRepresentationLearnerInfo = namedtuple(
    'PredictiveRepresentationLearnerInfo',
    [
        # actual actions taken in the next unroll_steps + 1 steps
        # [B, unroll_steps + 1, ...]
        'action',

        # The flag to indicate whether to include this target into loss
        # [B, unroll_steps + 1]
        'mask',

        # nest for targets
        # [B, unroll_steps + 1, ...]
        'target'
    ])


@gin.configurable
class SimpleDecoder(Algorithm):
    """A simple decoder with elementwise loss between the target and the predicted value.

    It is used to predict the target value from the given representation. Its
    loss can be used to train the representation.
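
A minimal sketch of the decoder idea in the docstring, with the network and the element-wise loss both treated as assumptions rather than ALF's actual SimpleDecoder internals:

import torch

def simple_decoder_loss(decoder_net, representation, target):
    predicted = decoder_net(representation)          # decode the target from the representation
    return 0.5 * (predicted - target).pow(2).mean()  # element-wise (squared) loss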
Example #26
import gin
import numpy as np
import torch
import torch.distributions as td

import alf
from alf.algorithms.actor_critic_algorithm import ActorCriticAlgorithm
from alf.algorithms.on_policy_algorithm import OnPolicyAlgorithm
from alf.data_structures import Experience, namedtuple, StepType, TimeStep
from alf.optimizers.trusted_updater import TrustedUpdater
from alf.utils import common, dist_utils, math_ops

nest_map = alf.nest.map_structure

TracExperience = namedtuple(
    "TracExperience",
    ["observation", "step_type", "state", "action_param", "prev_action"])

TracInfo = namedtuple(
    "TracInfo",
    ["action_distribution", "observation", "state", "ac", "prev_action"])


@gin.configurable
class TracAlgorithm(OnPolicyAlgorithm):
    """Trust-region actor-critic.

    It compares the action distributions after the SGD update with the action
    distributions from the previous model. If the average distance is too big,
    the new parameters are shrunk as:
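
The exact shrink rule is truncated above; as a purely hypothetical illustration of the general trust-region idea (not ALF's actual update), one can interpolate the updated parameters back toward the previous ones when the policy moved too far:

def shrink_toward_old(old, new, max_dist, dist):
    if dist <= max_dist:
        return new
    alpha = max_dist / dist          # interpolation factor in (0, 1)
    return old + alpha * (new - old)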
Example #27
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from absl.testing import parameterized
import torch

import alf
from alf.data_structures import namedtuple, StepType
from alf.experience_replayers.replay_buffer import ReplayBuffer, BatchInfo
from alf.algorithms.data_transformer import FrameStacker, ImageScaleTransformer
from alf.utils import common

DataItem = namedtuple(
    'DataItem', ['step_type', 'observation', 'batch_info', 'replay_buffer'],
    default_value=())


class FrameStackerTest(parameterized.TestCase, alf.test.TestCase):
    @parameterized.parameters(-1, 0)
    def test_frame_stacker(self, stack_axis=0):
        data_spec = DataItem(step_type=alf.TensorSpec((), dtype=torch.int32),
                             observation=dict(scalar=alf.TensorSpec(()),
                                              vector=alf.TensorSpec((7, )),
                                              matrix=alf.TensorSpec((5, 6)),
                                              tensor=alf.TensorSpec(
                                                  (2, 3, 4))))
        replay_buffer = ReplayBuffer(data_spec=data_spec,
                                     num_environments=2,
                                     max_length=1024,
Example #28
import numpy as np
import torch
import torch.nn.functional as F
from typing import Callable

import alf
from alf.algorithms.algorithm import Algorithm
from alf.algorithms.config import TrainerConfig
from alf.data_structures import AlgStep, LossInfo, namedtuple
from alf.algorithms.generator import Generator
from alf.networks import EncodingNetwork, ParamNetwork
from alf.tensor_specs import TensorSpec
from alf.utils import common, math_ops, summary_utils
from alf.utils.summary_utils import record_time

HyperNetworkLossInfo = namedtuple("HyperNetworkLossInfo", ["loss", "extra"])


def classification_loss(output, target):
    pred = output.max(-1)[1]
    acc = pred.eq(target).float().mean(0)
    avg_acc = acc.mean()
    loss = F.cross_entropy(output.transpose(1, 2), target)
    return HyperNetworkLossInfo(loss=loss, extra=avg_acc)


def regression_loss(output, target):
    out_shape = output.shape[-1]
    assert (target.shape[-1] == out_shape), (
        "the feature dimensions of output and target do not match.")
    loss = 0.5 * F.mse_loss(output.reshape(-1, out_shape),
Example #29
from abc import abstractmethod
from absl import logging
import copy
import json
import os

import tensorflow as tf
from tensorflow.python.util.serialization import get_json_type

from tf_agents.utils import eager_utils

from alf.data_structures import namedtuple, LossInfo
import alf.utils

AlgorithmStep = namedtuple("AlgorithmStep", ["outputs", "state", "info"])


def _is_alg(obj):
    """Only return True if the obj in an instance of Algorithm."""
    return isinstance(obj, Algorithm)


def _is_trainable_module(obj):
    """Only return True if the module or var is trainable, to avoid
    possible confusions shown in the optimizer info"""
    return (isinstance(obj, tf.Module) and not isinstance(obj, Algorithm)
            and obj.trainable_variables)


def _is_trainable_var(obj):
Example #30
        self.normalize_base = torch.where(normalize, self.minimum,
                                          self.normalize_base)

    def normalize_value(self, value, batch_index):
        return self.normalize_scale[batch_index] * (
            value - self.normalize_base[batch_index])

    def calc_value(self, nodes):
        return self.value_sum[nodes] / self.visit_count[nodes]


def _nest_slice(nested, i):
    return nest.map_structure(lambda x: x[i], nested)
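
An illustrative use of _nest_slice: pick batch element i out of every tensor in a nest (the shapes below are assumptions for the sketch):

import torch

batch = dict(reward=torch.zeros(8), observation=torch.zeros(8, 4))
first = _nest_slice(batch, 0)  # dict(reward=0-d tensor, observation=tensor of shape [4])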


MCTSState = namedtuple("MCTSState", ["steps"])
MCTSInfo = namedtuple(
    "MCTSInfo", ["candidate_actions", "value", "candidate_action_policy"])


@gin.configurable
class MCTSAlgorithm(OffPolicyAlgorithm):
    r"""Monte-Carlo Tree Search algorithm.

    The code largely follows the pseudocode of
    `Schrittwieser et al. Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model <https://arxiv.org/abs/1911.08265>`_.
    The pseudocode can be downloaded from `<https://arxiv.org/src/1911.08265v2/anc/pseudocode.py>`_

    There are several differences:

    1. In this implementation, all values and rewards are for player 0. It seems