Code example #1
    def __init__(self,
                 train_state_spec,
                 action_spec,
                 feature_spec,
                 hidden_size=256,
                 num_replicas=1,
                 dynamics_network: DynamicsNetwork = None,
                 name="DynamicsLearningAlgorithm"):
        """Create a DynamicsLearningAlgorithm.

        Args:
            hidden_size (int|tuple): size of hidden layer(s)
            dynamics_network (Network): network for predicting the change of
                the next feature based on the previous feature and action.
                It should accept input with spec of the format
                [feature_spec, encoded_action_spec] and output a tensor of the
                shape feature_spec. For discrete action case, encoded_action
                is a one-hot representation of the action. For continuous
                action, encoded action is the original action.
        """
        super().__init__(train_state_spec=train_state_spec, name=name)

        flat_action_spec = nest.flatten(action_spec)
        assert len(flat_action_spec) == 1, "doesn't support nested action_spec"

        flat_feature_spec = nest.flatten(feature_spec)
        assert len(
            flat_feature_spec) == 1, "doesn't support nested feature_spec"

        action_spec = flat_action_spec[0]

        if action_spec.is_discrete:
            self._num_actions = action_spec.maximum - action_spec.minimum + 1
        else:
            self._num_actions = action_spec.shape[-1]

        self._action_spec = action_spec
        self._feature_spec = feature_spec
        self._num_replicas = num_replicas

        if isinstance(hidden_size, int):
            hidden_size = (hidden_size, )

        if dynamics_network is None:
            encoded_action_spec = TensorSpec((self._num_actions, ),
                                             dtype=torch.float32)
            dynamics_network = DynamicsNetwork(
                name="dynamics_net",
                input_tensor_spec=(feature_spec, encoded_action_spec),
                preprocessing_combiner=NestConcat(),
                fc_layer_params=hidden_size,
                output_tensor_spec=flat_feature_spec[0])

        if num_replicas > 1:
            self._dynamics_network = dynamics_network.make_parallel(
                num_replicas)
        else:
            self._dynamics_network = dynamics_network
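
The default DynamicsNetwork above is fed the concatenation of the current feature and an encoded action: a one-hot vector for discrete actions, the raw action for continuous ones. A minimal standalone PyTorch sketch of that encoding step (not ALF code; the spec sizes and batch values are hypothetical):

import torch
import torch.nn.functional as F

# Hypothetical discrete action spec with minimum=0, maximum=3 -> 4 actions.
num_actions = 4
actions = torch.tensor([0, 2, 3])                        # a batch of discrete actions
encoded = F.one_hot(actions, num_actions).float()        # shape [3, 4]

features = torch.randn(3, 8)                             # hypothetical feature batch
# Equivalent to what the NestConcat preprocessing combiner would feed the network.
dynamics_input = torch.cat([features, encoded], dim=-1)  # shape [3, 12]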
Code example #2
    def _actor_train_step(self, exp: Experience, state, action, critics,
                          log_pi, action_distribution):
        neg_entropy = sum(nest.flatten(log_pi))

        if self._act_type == ActionType.Discrete:
            # Pure discrete case doesn't need to learn an actor network
            return (), LossInfo(extra=SacActorInfo(neg_entropy=neg_entropy))

        if self._act_type == ActionType.Continuous:
            critics, critics_state = self._compute_critics(
                self._critic_networks, exp.observation, action, state)
            if critics.ndim == 3:
                # Multidimensional reward: [B, num_critic_replicas, reward_dim]
                if self._reward_weights is None:
                    critics = critics.sum(dim=2)
                else:
                    critics = torch.tensordot(critics,
                                              self._reward_weights,
                                              dims=1)

            target_q_value = critics.min(dim=1)[0]
            continuous_log_pi = log_pi
            cont_alpha = torch.exp(self._log_alpha).detach()
        else:
            # use the critics computed during action prediction for Mixed type
            critics_state = ()
            discrete_act_dist = action_distribution[0]
            discrete_entropy = discrete_act_dist.entropy()
            # critics is already after min over replicas
            weighted_q_value = torch.sum(discrete_act_dist.probs * critics,
                                         dim=-1)
            discrete_alpha = torch.exp(self._log_alpha[0]).detach()
            target_q_value = weighted_q_value + discrete_alpha * discrete_entropy
            action, continuous_log_pi = action[1], log_pi[1]
            cont_alpha = torch.exp(self._log_alpha[1]).detach()

        dqda = nest_utils.grad(action, target_q_value.sum())

        def actor_loss_fn(dqda, action):
            if self._dqda_clipping:
                dqda = torch.clamp(dqda, -self._dqda_clipping,
                                   self._dqda_clipping)
            loss = 0.5 * losses.element_wise_squared_loss(
                (dqda + action).detach(), action)
            return loss.sum(list(range(1, loss.ndim)))

        actor_loss = nest.map_structure(actor_loss_fn, dqda, action)
        actor_loss = math_ops.add_n(nest.flatten(actor_loss))
        actor_info = LossInfo(loss=actor_loss + cont_alpha * continuous_log_pi,
                              extra=SacActorInfo(actor_loss=actor_loss,
                                                 neg_entropy=neg_entropy))
        return critics_state, actor_info
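
The actor loss above relies on the identity that minimizing 0.5 * ((dqda + a).detach() - a)^2 produces a gradient of exactly -dqda with respect to the action, i.e. a step along the critic's gradient. A tiny self-contained check of that identity (illustrative only, not ALF code):

import torch

action = torch.tensor([0.3, -0.7], requires_grad=True)
dqda = torch.tensor([1.0, -2.0])     # pretend dQ/da

loss = 0.5 * ((dqda + action).detach() - action) ** 2
loss.sum().backward()

# d(loss)/d(action) = -(target - action) = -dqda, so a gradient-descent step
# moves the action in the direction that increases Q.
print(action.grad)                   # tensor([-1., 2.])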
Code example #3
    def __init__(self,
                 feature_spec,
                 action_spec,
                 train_state_spec,
                 planning_horizon=25,
                 upper_bound=None,
                 lower_bound=None,
                 name="PlanningAlgorithm"):
        """Create a PlanningAlgorithm.

        Args:
            planning_horizon (int): planning horizon in terms of time steps
            upper_bound (int): upper bound for elements in solution;
                action_spec.maximum will be used if not specified
            lower_bound (int): lower bound for elements in solution;
                action_spec.minimum will be used if not specified
        """
        super().__init__(
            feature_spec,
            action_spec,
            train_state_spec=train_state_spec,
            name=name)

        flat_action_spec = nest.flatten(action_spec)
        assert len(flat_action_spec) == 1, "doesn't support nested action_spec"

        flat_feature_spec = nest.flatten(feature_spec)
        assert len(
            flat_feature_spec) == 1, "doesn't support nested feature_spec"

        action_spec = flat_action_spec[0]

        assert action_spec.is_continuous, "only supports continuous control"

        self._num_actions = action_spec.shape[-1]

        self._action_spec = action_spec
        self._feature_spec = feature_spec
        self._planning_horizon = planning_horizon
        self._upper_bound = action_spec.maximum if upper_bound is None \
                                                else upper_bound
        self._lower_bound = action_spec.minimum if lower_bound is None \
                                                else lower_bound

        self._reward_func = None
        self._dynamics_func = None
        self._step_eval_func = None  # per step evaluation function
Code example #4
    def __init__(self,
                 feature_spec,
                 action_spec,
                 population_size,
                 planning_horizon,
                 upper_bound=None,
                 lower_bound=None,
                 hidden_size=256,
                 name="RandomShootingAlgorithm"):
        """Create a RandomShootingAlgorithm.

        Args:
            population_size (int): the size of the population for random shooting
            planning_horizon (int): planning horizon in terms of time steps
            upper_bound (int): upper bound for elements in solution;
                action_spec.maximum will be used if not specified
            lower_bound (int): lower bound for elements in solution;
                action_spec.minimum will be used if not specified
            hidden_size (int|tuple): size of hidden layer(s)
        """
        super().__init__(
            feature_spec=feature_spec,
            action_spec=action_spec,
            train_state_spec=(),
            planning_horizon=planning_horizon,
            upper_bound=upper_bound,
            lower_bound=lower_bound,
            name=name)

        flat_action_spec = nest.flatten(action_spec)
        assert len(flat_action_spec) == 1, ("RandomShootingAlgorithm doesn't "
                                            "support nested action_spec")

        flat_feature_spec = nest.flatten(feature_spec)
        assert len(flat_feature_spec) == 1, ("RandomShootingAlgorithm doesn't "
                                             "support nested feature_spec")

        self._population_size = population_size
        solution_size = self._planning_horizon * self._num_actions
        self._plan_optimizer = RandomOptimizer(
            solution_size,
            self._population_size,
            upper_bound=action_spec.maximum,
            lower_bound=action_spec.minimum)
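
RandomOptimizer treats the whole plan as one flat solution vector of length planning_horizon * num_actions and scores a random population of candidates. A minimal sketch of that idea with an assumed toy cost function (a real planner would roll out the dynamics model instead):

import torch

planning_horizon, num_actions, population_size = 5, 2, 64
lower, upper = -1.0, 1.0

# Sample a population of flattened action sequences uniformly in [lower, upper].
solutions = lower + (upper - lower) * torch.rand(
    population_size, planning_horizon * num_actions)

def dummy_cost(solutions):
    # Hypothetical per-candidate cost; stands in for the model-based rollout.
    return solutions.pow(2).sum(dim=-1)

best = solutions[dummy_cost(solutions).argmin()]
plan = best.reshape(planning_horizon, num_actions)   # row 0 is the action to execute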
Code example #5
File: sac_algorithm.py Project: soychanq/alf
    def train_step(self, exp: Experience, state: SacState):
        # We detach exp.observation here so that in the case that exp.observation
        # is calculated by some other trainable module, the training of that
        # module will not be affected by the gradient back-propagated from the
        # actor. However, the gradient from the critic will still affect the
        # training of that module.
        (action_distribution, action, critics,
         action_state) = self._predict_action(common.detach(exp.observation),
                                              state=state.action)

        log_pi = nest.map_structure(lambda dist, a: dist.log_prob(a),
                                    action_distribution, action)

        if self._act_type == ActionType.Mixed:
            # For mixed type, add log_pi separately
            log_pi = type(self._action_spec)(
                (sum(nest.flatten(log_pi[0])), sum(nest.flatten(log_pi[1]))))
        else:
            log_pi = sum(nest.flatten(log_pi))

        if self._prior_actor is not None:
            prior_step = self._prior_actor.train_step(exp, ())
            log_prior = dist_utils.compute_log_probability(
                prior_step.output, action)
            log_pi = log_pi - log_prior

        actor_state, actor_loss = self._actor_train_step(
            exp, state.actor, action, critics, log_pi, action_distribution)
        critic_state, critic_info = self._critic_train_step(
            exp, state.critic, action, log_pi, action_distribution)
        alpha_loss = self._alpha_train_step(log_pi)

        state = SacState(action=action_state,
                         actor=actor_state,
                         critic=critic_state)
        info = SacInfo(action_distribution=action_distribution,
                       actor=actor_loss,
                       critic=critic_info,
                       alpha=alpha_loss)
        return AlgStep(action, state, info)
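
The comment about detaching exp.observation reduces to standard autograd behavior: a detached tensor blocks gradients along the actor path, while the critic path, which uses the original tensor, still back-propagates into the upstream module. A toy illustration (not ALF code; the losses are made up):

import torch

# Stand-in for the output of some upstream trainable module.
encoder_out = torch.tensor([1.0, 2.0], requires_grad=True)

actor_loss = (encoder_out.detach() * 3.0).sum()   # actor path sees a detached copy
critic_loss = (encoder_out * 2.0).sum()           # critic path uses the original tensor

(actor_loss + critic_loss).backward()
print(encoder_out.grad)    # tensor([2., 2.]): only the critic path contributed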
Code example #6
File: planning_algorithm.py Project: soychanq/alf
    def __init__(self,
                 feature_spec,
                 action_spec,
                 population_size,
                 planning_horizon=25,
                 upper_bound=None,
                 lower_bound=None,
                 name="RandomShootingAlgorithm"):
        """Create a RandomShootingAlgorithm.

        Args:
            population_size (int): the size of the population for random shooting
            planning_horizon (int): planning horizon in terms of time steps
            upper_bound (int): upper bound for elements in solution;
                action_spec.maximum will be used if not specified
            lower_bound (int): lower bound for elements in solution;
                action_spec.minimum will be used if not specified
        """
        super().__init__(
            feature_spec=feature_spec,
            action_spec=action_spec,
            planning_horizon=planning_horizon,
            upper_bound=upper_bound,
            lower_bound=lower_bound,
            name=name)

        flat_action_spec = nest.flatten(action_spec)
        assert len(flat_action_spec) == 1, ("RandomShootingAlgorithm doesn't "
                                            "support nested action_spec")

        self._population_size = population_size

        solution_size = self._planning_horizon * self._num_actions
        self._solution_size = solution_size

        # expand action bound to solution bound
        solution_upper_bound = self._upper_bound.unsqueeze(0).expand(
            planning_horizon, *self._upper_bound.shape).reshape(-1)
        solution_lower_bound = self._lower_bound.unsqueeze(0).expand(
            planning_horizon, *self._lower_bound.shape).reshape(-1)

        self._plan_optimizer = RandomOptimizer(
            solution_size,
            self._population_size,
            upper_bound=solution_upper_bound,
            lower_bound=solution_lower_bound,
            cost_func=self._calc_cost_for_action_sequence)
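
The unsqueeze/expand/reshape chain above tiles the per-step action bounds across the planning horizon so that every element of the flattened solution gets its own bound. A quick shape check under assumed values:

import torch

planning_horizon = 3
upper_bound = torch.tensor([1.0, 2.0])       # hypothetical per-dimension action maximum

solution_upper_bound = upper_bound.unsqueeze(0).expand(
    planning_horizon, *upper_bound.shape).reshape(-1)
print(solution_upper_bound)                  # tensor([1., 2., 1., 2., 1., 2.])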
Code example #7
File: planning_algorithm.py Project: soychanq/alf
    def __init__(self,
                 feature_spec,
                 action_spec,
                 planning_horizon=25,
                 upper_bound=None,
                 lower_bound=None,
                 name="PlanningAlgorithm"):
        """Create a PlanningAlgorithm.

        Args:
            planning_horizon (int): planning horizon in terms of time steps
            upper_bound (int): upper bound for elements in solution;
                action_spec.maximum will be used if not specified
            lower_bound (int): lower bound for elements in solution;
                action_spec.minimum will be used if not specified
        """
        super().__init__(
            feature_spec,
            action_spec,
            train_state_spec=PlannerState(
                prev_plan=TensorSpec((planning_horizon,
                                      action_spec.shape[-1]))),
            name=name)

        flat_action_spec = nest.flatten(action_spec)
        assert len(flat_action_spec) == 1, "doesn't support nested action_spec"

        action_spec = flat_action_spec[0]

        assert action_spec.is_continuous, "only supports continuous control"

        self._num_actions = action_spec.shape[-1]

        self._action_spec = action_spec
        self._feature_spec = feature_spec
        self._planning_horizon = planning_horizon

        self._upper_bound = torch.Tensor(action_spec.maximum) \
                        if upper_bound is None else upper_bound
        self._lower_bound = torch.Tensor(action_spec.minimum) \
                        if lower_bound is None else lower_bound

        self._action_seq_cost_func = None
Code example #8
File: sac_algorithm.py Project: soychanq/alf
    def _critic_train_step(self, exp: Experience, state: SacCriticState,
                           action, log_pi, action_distribution):
        critics, critics_state = self._compute_critics(self._critic_networks,
                                                       exp.observation,
                                                       exp.action,
                                                       state.critics)

        target_critics, target_critics_state = self._compute_critics(
            self._target_critic_networks, exp.observation, action,
            state.target_critics)
        target_critics = target_critics.min(dim=1)[0]

        if self._act_type == ActionType.Discrete:
            critics = self._select_q_value(exp.action, critics)
            target_critics = self._select_q_value(
                action, target_critics.unsqueeze(dim=1))

        elif self._act_type == ActionType.Mixed:
            critics = self._select_q_value(exp.action[0], critics)
            discrete_act_dist = action_distribution[0]
            target_critics = torch.sum(discrete_act_dist.probs *
                                       target_critics,
                                       dim=-1)

        target_critic = target_critics.reshape(exp.reward.shape)
        if self._use_entropy_reward:
            entropy_reward = nest.map_structure(
                lambda la, lp: -torch.exp(la) * lp, self._log_alpha, log_pi)
            entropy_reward = sum(nest.flatten(entropy_reward))
            target_critic = target_critic + entropy_reward

        target_critic = target_critic.detach()

        state = SacCriticState(critics=critics_state,
                               target_critics=target_critics_state)
        info = SacCriticInfo(critics=critics, target_critic=target_critic)

        return state, info
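
With use_entropy_reward enabled, the TD target combines the minimum over target critic replicas with an entropy bonus -alpha * log_pi, and the whole target is detached. A compact sketch of that target computation with made-up numbers (not ALF code):

import torch

target_critics = torch.tensor([[1.2, 0.9], [2.0, 2.5]])   # [B, num_critic_replicas]
log_pi = torch.tensor([-0.5, -1.0])
log_alpha = torch.tensor(0.0)                              # alpha = 1

target_critic = target_critics.min(dim=1)[0]               # pessimistic value estimate
entropy_reward = -torch.exp(log_alpha) * log_pi            # -alpha * log_pi
target_critic = (target_critic + entropy_reward).detach()  # tensor([1.4000, 3.0000])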
Code example #9
File: ddpg_algorithm.py Project: soychanq/alf
    def _actor_train_step(self, exp: Experience, state: DdpgActorState):
        action, actor_state = self._actor_network(exp.observation,
                                                  state=state.actor)

        q_values, critic_states = self._critic_networks(
            (exp.observation, action), state=state.critics)
        if q_values.ndim == 3:
            # Multidimensional reward: [B, num_critic_replicas, reward_dim]
            if self._reward_weights is None:
                q_values = q_values.sum(dim=2)
            else:
                q_values = torch.tensordot(q_values,
                                           self._reward_weights,
                                           dims=1)

        if self._num_critic_replicas > 1:
            q_value = q_values.min(dim=1)[0]
        else:
            q_value = q_values.squeeze(dim=1)

        dqda = nest_utils.grad(action, q_value.sum())

        def actor_loss_fn(dqda, action):
            if self._dqda_clipping:
                dqda = torch.clamp(dqda, -self._dqda_clipping,
                                   self._dqda_clipping)
            loss = 0.5 * losses.element_wise_squared_loss(
                (dqda + action).detach(), action)
            if self._action_l2 > 0:
                assert action.requires_grad
                loss += self._action_l2 * (action**2)
            loss = loss.sum(list(range(1, loss.ndim)))
            return loss

        actor_loss = nest.map_structure(actor_loss_fn, dqda, action)
        state = DdpgActorState(actor=actor_state, critics=critic_states)
        info = LossInfo(loss=sum(nest.flatten(actor_loss)), extra=actor_loss)
        return AlgStep(output=action, state=state, info=info)
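
Both this DDPG actor step and the SAC actor step above handle a multidimensional reward by contracting the critic output [B, num_critic_replicas, reward_dim] with the reward weights via torch.tensordot, then taking the minimum over replicas. A small standalone shape sketch with assumed sizes:

import torch

q_values = torch.randn(8, 2, 3)                    # [B, num_critic_replicas, reward_dim]
reward_weights = torch.tensor([1.0, 0.5, 0.1])

weighted = torch.tensordot(q_values, reward_weights, dims=1)   # [B, num_critic_replicas]
q_value = weighted.min(dim=1)[0]                   # pessimistic estimate, shape [8]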
Code example #10
File: sac_algorithm.py Project: soychanq/alf
 def _alpha_train_step(self, log_pi):
     alpha_loss = nest.map_structure(
         lambda la, lp, t: la * (-lp - t).detach(), self._log_alpha, log_pi,
         self._target_entropy)
     return sum(nest.flatten(alpha_loss))
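
The alpha loss log_alpha * (-log_pi - target_entropy).detach() raises alpha when the entropy estimate -log_pi falls below the target and lowers it otherwise; only log_alpha receives gradient because the bracketed term is detached. A minimal check of the gradient sign (illustrative numbers only):

import torch

log_alpha = torch.nn.Parameter(torch.tensor(0.0))
log_pi = torch.tensor(-0.2)          # entropy estimate -log_pi = 0.2
target_entropy = 1.0                 # entropy is below target

alpha_loss = log_alpha * (-log_pi - target_entropy).detach()
alpha_loss.backward()
print(log_alpha.grad)                # tensor(-0.8000): descent increases log_alpha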
Code example #11
File: sac_algorithm.py Project: soychanq/alf
 def _check_spec_equal(spec1, spec2):
     assert nest.flatten(spec1) == nest.flatten(spec2), (
         "Unmatched action specs: {} vs. {}".format(spec1, spec2))
Code example #12
File: sac_algorithm.py Project: soychanq/alf
    def _make_networks(self, observation_spec, action_spec,
                       continuous_actor_network_cls, critic_network_cls,
                       q_network_cls):
        def _make_parallel(net):
            if self._use_parallel_network:
                nets = net.make_parallel(self._num_critic_replicas)
            else:
                nets = alf.networks.NaiveParallelNetwork(
                    net, self._num_critic_replicas)
            return nets

        def _check_spec_equal(spec1, spec2):
            assert nest.flatten(spec1) == nest.flatten(spec2), (
                "Unmatched action specs: {} vs. {}".format(spec1, spec2))

        discrete_action_spec = [
            spec for spec in nest.flatten(action_spec) if spec.is_discrete
        ]
        continuous_action_spec = [
            spec for spec in nest.flatten(action_spec) if spec.is_continuous
        ]

        if discrete_action_spec and continuous_action_spec:
            # When there are both continuous and discrete actions, we require
            # that action_spec is a tuple/list ``(discrete, continuous)``.
            assert (isinstance(
                action_spec, (tuple, list)) and len(action_spec) == 2), (
                    "In the mixed case, the action spec must be a tuple/list"
                    " (discrete_action_spec, continuous_action_spec)!")
            _check_spec_equal(action_spec[0], discrete_action_spec)
            _check_spec_equal(action_spec[1], continuous_action_spec)
            discrete_action_spec = action_spec[0]
            continuous_action_spec = action_spec[1]
        elif discrete_action_spec:
            discrete_action_spec = action_spec
        elif continuous_action_spec:
            continuous_action_spec = action_spec

        actor_network = None
        reward_dim = 1
        if continuous_action_spec:
            assert continuous_actor_network_cls is not None, (
                "If there are continuous actions, then a ActorDistributionNetwork "
                "must be provided for sampling continuous actions!")
            actor_network = continuous_actor_network_cls(
                input_tensor_spec=observation_spec,
                action_spec=continuous_action_spec)
            if not discrete_action_spec:
                act_type = ActionType.Continuous
                assert critic_network_cls is not None, (
                    "If only continuous actions exist, then a CriticNetwork must"
                    " be provided!")
                critic_network = critic_network_cls(
                    input_tensor_spec=(observation_spec,
                                       continuous_action_spec))
                reward_dim = critic_network.output_spec.numel
                critic_networks = _make_parallel(critic_network)

        if discrete_action_spec:
            assert reward_dim == 1, (
                "Discrete action is not supported for multidimensional reward")
            act_type = ActionType.Discrete
            assert len(alf.nest.flatten(discrete_action_spec)) == 1, (
                "Only support at most one discrete action currently! "
                "Discrete action spec: {}".format(discrete_action_spec))
            assert q_network_cls is not None, (
                "If there exists a discrete action, then QNetwork must "
                "be provided!")
            if continuous_action_spec:
                act_type = ActionType.Mixed
                q_network = q_network_cls(
                    input_tensor_spec=(observation_spec,
                                       continuous_action_spec),
                    action_spec=discrete_action_spec)
            else:
                q_network = q_network_cls(input_tensor_spec=observation_spec,
                                          action_spec=action_spec)
            critic_networks = _make_parallel(q_network)

        return critic_networks, actor_network, act_type, reward_dim
Code example #13
File: sac_algorithm.py Project: soychanq/alf
    def __init__(self,
                 observation_spec,
                 action_spec: BoundedTensorSpec,
                 actor_network_cls=ActorDistributionNetwork,
                 critic_network_cls=CriticNetwork,
                 q_network_cls=QNetwork,
                 reward_weights=None,
                 use_entropy_reward=True,
                 use_parallel_network=False,
                 num_critic_replicas=2,
                 env=None,
                 config: TrainerConfig = None,
                 critic_loss_ctor=None,
                 target_entropy=None,
                 prior_actor_ctor=None,
                 target_kld_per_dim=3.,
                 initial_log_alpha=0.0,
                 max_log_alpha=None,
                 target_update_tau=0.05,
                 target_update_period=1,
                 dqda_clipping=None,
                 actor_optimizer=None,
                 critic_optimizer=None,
                 alpha_optimizer=None,
                 debug_summaries=False,
                 name="SacAlgorithm"):
        """
        Args:
            observation_spec (nested TensorSpec): representing the observations.
            action_spec (nested BoundedTensorSpec): representing the actions; can
                be a mixture of discrete and continuous actions. The number of
                continuous actions can be arbitrary while only one discrete
                action is allowed currently. If it's a mixture, then it must be
                a tuple/list ``(discrete_action_spec, continuous_action_spec)``.
            actor_network_cls (Callable): is used to construct the actor network.
                The constructed actor network will be called
                to sample continuous actions. All of its output specs must be
                continuous. Note that we don't need a discrete actor network
                because a discrete action can simply be sampled from the Q values.
            critic_network_cls (Callable): is used to construct a critic network
                for estimating ``Q(s,a)`` given that the action is continuous.
            q_network_cls (Callable): is used to construct a QNetwork for
                estimating ``Q(s,a)`` given that the action is discrete. Its output
                spec must be consistent with the discrete action in ``action_spec``.
            reward_weights (None|list[float]): this is only used when the reward is
                multidimensional. In that case, the weighted sum of the q values
                is used for training the actor if reward_weights is not None.
                Otherwise, the sum of the q values is used.
            use_entropy_reward (bool): whether to include entropy as reward
            use_parallel_network (bool): whether to use parallel network for
                calculating critics.
            num_critic_replicas (int): number of critics to be used. Default is 2.
            env (Environment): The environment to interact with. ``env`` is a
                batched environment, which means that it runs multiple simulations
                simultaneously. ``env`` only needs to be provided to the root
                algorithm.
            config (TrainerConfig): config for training. It only needs to be
                provided to the algorithm which performs ``train_iter()`` by
                itself.
            critic_loss_ctor (None|OneStepTDLoss|MultiStepLoss): a critic loss
                constructor. If ``None``, a default ``OneStepTDLoss`` will be used.
            initial_log_alpha (float): initial value for variable ``log_alpha``.
            max_log_alpha (float|None): if not None, ``log_alpha`` will be
                capped at this value.
            target_entropy (float|Callable|None): If a floating value, it's the
                target average policy entropy, for updating ``alpha``. If a
                callable function, then it will be called on the action spec to
                calculate a target entropy. If ``None``, a default entropy will
                be calculated. For the mixed action type, discrete action and
                continuous action will have separate alphas and target entropies,
                so this argument can be a 2-element list/tuple, where the first
                is for discrete action and the second for continuous action.
            prior_actor_ctor (Callable): If provided, it will be called using
                ``prior_actor_ctor(observation_spec, action_spec, debug_summaries=debug_summaries)``
                to construct a prior actor. The output of the prior actor is
                the distribution of the next action. Two prior actors are implemented:
                ``alf.algorithms.prior_actor.SameActionPriorActor`` and
                ``alf.algorithms.prior_actor.UniformPriorActor``.
            target_kld_per_dim (float): ``alpha`` is dynamically adjusted so that
                the KLD is about ``target_kld_per_dim * dim``.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            dqda_clipping (float): when computing the actor loss, clips the
                gradient dqda element-wise between
                ``[-dqda_clipping, dqda_clipping]``. Will not perform clipping if
                ``dqda_clipping == 0``.
            actor_optimizer (torch.optim.optimizer): The optimizer for actor.
            critic_optimizer (torch.optim.optimizer): The optimizer for critic.
            alpha_optimizer (torch.optim.optimizer): The optimizer for alpha.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """
        self._num_critic_replicas = num_critic_replicas
        self._use_parallel_network = use_parallel_network

        critic_networks, actor_network, self._act_type, reward_dim = self._make_networks(
            observation_spec, action_spec, actor_network_cls,
            critic_network_cls, q_network_cls)

        self._use_entropy_reward = use_entropy_reward

        if reward_dim > 1:
            assert not use_entropy_reward, (
                "use_entropy_reward=True is not supported for multidimensional reward"
            )
            assert self._act_type == ActionType.Continuous, (
                "Only continuous action is supported for multidimensional reward"
            )

        self._reward_weights = None
        if reward_weights:
            assert reward_dim > 1, (
                "reward_weights cannot be used for one dimensional reward")
            assert len(reward_weights) == reward_dim, (
                "Mismatch between len(reward_weights)=%s and reward_dim=%s" %
                (len(reward_weights), reward_dim))
            self._reward_weights = torch.tensor(reward_weights,
                                                dtype=torch.float32)

        def _init_log_alpha():
            return nn.Parameter(torch.tensor(float(initial_log_alpha)))

        if self._act_type == ActionType.Mixed:
            # separate alphas for discrete and continuous actions
            log_alpha = type(action_spec)(
                (_init_log_alpha(), _init_log_alpha()))
        else:
            log_alpha = _init_log_alpha()

        action_state_spec = SacActionState(
            actor_network=(() if self._act_type == ActionType.Discrete else
                           actor_network.state_spec),
            critic=(() if self._act_type == ActionType.Continuous else
                    critic_networks.state_spec))
        super().__init__(
            observation_spec,
            action_spec,
            train_state_spec=SacState(
                action=action_state_spec,
                actor=(() if self._act_type != ActionType.Continuous else
                       critic_networks.state_spec),
                critic=SacCriticState(
                    critics=critic_networks.state_spec,
                    target_critics=critic_networks.state_spec)),
            predict_state_spec=SacState(action=action_state_spec),
            env=env,
            config=config,
            debug_summaries=debug_summaries,
            name=name)

        if actor_optimizer is not None:
            self.add_optimizer(actor_optimizer, [actor_network])
        if critic_optimizer is not None:
            self.add_optimizer(critic_optimizer, [critic_networks])
        if alpha_optimizer is not None:
            self.add_optimizer(alpha_optimizer, nest.flatten(log_alpha))

        self._log_alpha = log_alpha
        if self._act_type == ActionType.Mixed:
            self._log_alpha_paralist = nn.ParameterList(
                nest.flatten(log_alpha))

        if max_log_alpha is not None:
            self._max_log_alpha = torch.tensor(float(max_log_alpha))
        else:
            self._max_log_alpha = None

        self._actor_network = actor_network
        self._critic_networks = critic_networks
        self._target_critic_networks = self._critic_networks.copy(
            name='target_critic_networks')

        if critic_loss_ctor is None:
            critic_loss_ctor = OneStepTDLoss
        critic_loss_ctor = functools.partial(critic_loss_ctor,
                                             debug_summaries=debug_summaries)
        # Have different names to separate their summary curves
        self._critic_losses = []
        for i in range(num_critic_replicas):
            self._critic_losses.append(
                critic_loss_ctor(name="critic_loss%d" % (i + 1)))

        self._prior_actor = None
        if prior_actor_ctor is not None:
            assert self._act_type == ActionType.Continuous, (
                "Only continuous action is supported when using prior_actor")
            self._prior_actor = prior_actor_ctor(
                observation_spec=observation_spec,
                action_spec=action_spec,
                debug_summaries=debug_summaries)
            total_action_dims = sum(
                [spec.numel for spec in alf.nest.flatten(action_spec)])
            self._target_entropy = -target_kld_per_dim * total_action_dims
        else:
            if self._act_type == ActionType.Mixed:
                if not isinstance(target_entropy, (tuple, list)):
                    target_entropy = nest.map_structure(
                        lambda _: target_entropy, self._action_spec)
                # separate target entropies for discrete and continuous actions
                self._target_entropy = nest.map_structure(
                    lambda spec, t: _set_target_entropy(self.name, t, [spec]),
                    self._action_spec, target_entropy)
            else:
                self._target_entropy = _set_target_entropy(
                    self.name, target_entropy, nest.flatten(self._action_spec))

        self._dqda_clipping = dqda_clipping

        self._update_target = common.get_target_updater(
            models=[self._critic_networks],
            target_models=[self._target_critic_networks],
            tau=target_update_tau,
            period=target_update_period)
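
The target updater built at the end performs the usual soft (Polyak) update of the target critics every target_update_period training steps. A generic sketch of that update rule under the stated tau, not ALF's get_target_updater implementation:

import torch

def soft_update(model: torch.nn.Module, target: torch.nn.Module, tau: float = 0.05):
    """Polyak-average the source parameters into the target network."""
    with torch.no_grad():
        for p, tp in zip(model.parameters(), target.parameters()):
            tp.mul_(1.0 - tau).add_(tau * p)

critic = torch.nn.Linear(4, 1)
target_critic = torch.nn.Linear(4, 1)
target_critic.load_state_dict(critic.state_dict())   # start from identical weights
soft_update(critic, target_critic, tau=0.05)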
Code example #14
    def __init__(self,
                 observation_spec,
                 feature_spec,
                 action_spec,
                 dynamics_module: DynamicsLearningAlgorithm,
                 reward_module: RewardEstimationAlgorithm,
                 planner_module: PlanAlgorithm,
                 env=None,
                 config: TrainerConfig = None,
                 dynamics_optimizer=None,
                 reward_optimizer=None,
                 planner_optimizer=None,
                 debug_summaries=False,
                 name="MbrlAlgorithm"):
        """Create an MbrlAlgorithm.
        The MbrlAlgorithm takes as input the following set of modules for
        making decisions on actions based on the current observation:
        1) learnable/fixed dynamics module
        2) learnable/fixed reward module
        3) learnable/fixed planner module

        Args:
            action_spec (nested BoundedTensorSpec): representing the actions.
            dynamics_module (DynamicsLearningAlgorithm): module for learning to
                predict the next feature based on the previous feature and action.
                It should accept input with spec [feature_spec,
                encoded_action_spec] and output a tensor of shape
                feature_spec. For discrete action, encoded_action is a one-hot
                representation of the action. For continuous action, encoded
                action is the same as the original action.
            reward_module (RewardEstimationAlgorithm): module for calculating
                the reward, i.e.,  evaluating the reward for a (s, a) pair
            planner_module (PlanAlgorithm): module for generating planned action
                based on specified reward function and dynamics function
            env (Environment): The environment to interact with. env is a batched
                environment, which means that it runs multiple simulations
                simultaneously. env only needs to be provided to the root
                Algorithm.
            config (TrainerConfig): config for training. config only needs to be
                provided to the algorithm which performs `train_iter()` by
                itself.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.

        """
        train_state_spec = MbrlState(dynamics=dynamics_module.train_state_spec,
                                     reward=reward_module.train_state_spec,
                                     planner=planner_module.train_state_spec)

        super().__init__(feature_spec,
                         action_spec,
                         train_state_spec=train_state_spec,
                         env=env,
                         config=config,
                         debug_summaries=debug_summaries,
                         name=name)

        flat_action_spec = nest.flatten(action_spec)
        action_spec = flat_action_spec[0]

        assert action_spec.is_continuous, "only supports continuous control"

        num_actions = action_spec.shape[-1]

        flat_feature_spec = nest.flatten(feature_spec)
        assert len(flat_feature_spec) == 1, (
            "Mbrl doesn't support nested feature_spec")

        self._action_spec = action_spec
        self._num_actions = num_actions

        if dynamics_optimizer is not None:
            self.add_optimizer(dynamics_optimizer, [dynamics_module])

        if planner_optimizer is not None:
            self.add_optimizer(planner_optimizer, [planner_module])

        if reward_optimizer is not None:
            self.add_optimizer(reward_optimizer, [reward_module])

        self._dynamics_module = dynamics_module
        self._reward_module = reward_module
        self._planner_module = planner_module
        self._planner_module.set_reward_func(self._calc_step_reward)
        self._planner_module.set_dynamics_func(self._predict_next_step)
Code example #15
File: mdq_algorithm.py Project: zhuboli/alf
    def __init__(
            self,
            observation_spec,
            action_spec: BoundedTensorSpec,
            critic_network: MdqCriticNetwork,
            env=None,
            config: TrainerConfig = None,
            critic_loss_ctor=None,
            target_entropy=dist_utils.calc_default_target_entropy_quantized,
            initial_log_alpha=0.0,
            target_update_tau=0.05,
            target_update_period=1,
            distill_noise=0.01,
            critic_optimizer=None,
            alpha_optimizer=None,
            debug_summaries=False,
            name="MdqAlgorithm"):
        """
        Args:
            observation_spec (nested TensorSpec): representing the observations.
            action_spec (nested BoundedTensorSpec): representing the actions.
            critic_network (MdqCriticNetwork): an instance of MdqCriticNetwork
            env (Environment): The environment to interact with. ``env`` is a
                batched environment, which means that it runs multiple simulations
                simultaneously. ``env`` only needs to be provided to the root
                algorithm.
            config (TrainerConfig): config for training. It only needs to be
                provided to the algorithm which performs ``train_iter()`` by
                itself.
            critic_loss_ctor (None|OneStepTDLoss|MultiStepLoss): a critic loss
                constructor. If ``None``, a default ``OneStepTDLoss`` will be used.
            initial_log_alpha (float): initial value for variable ``log_alpha``.
            target_entropy (float|Callable): If a floating value, it's the
                target average policy entropy, for updating ``alpha``. If a
                callable function, then it will be called on the action spec to
                calculate a target entropy. Note that in MDQ algorithm, as the
                continuous action is represented by a discrete distribution for
                each action dimension, ``calc_default_target_entropy_quantized``
                is used to compute the target entropy by default.
            target_update_tau (float): Factor for soft update of the target
                networks.
            target_update_period (int): Period for soft update of the target
                networks.
            distill_noise (float): the std of random Gaussian noise added to the
                action used for distillation.
            critic_optimizer (torch.optim.optimizer): The optimizer for critic.
            alpha_optimizer (torch.optim.optimizer): The optimizer for alpha.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.
        """

        critic_networks = critic_network
        target_critic_networks = critic_networks.copy(
            name='target_critic_networks')

        train_state_spec = MdqState(
            critic=MdqCriticState(critic=critic_networks.state_spec,
                                  target_critic=critic_networks.state_spec))

        super().__init__(observation_spec,
                         action_spec,
                         train_state_spec=train_state_spec,
                         env=env,
                         config=config,
                         debug_summaries=debug_summaries,
                         name=name)

        self._critic_networks = critic_networks
        self._target_critic_networks = target_critic_networks

        self.add_optimizer(critic_optimizer, [critic_networks])

        if critic_loss_ctor is None:
            critic_loss_ctor = OneStepTDLoss
        critic_loss_ctor = functools.partial(critic_loss_ctor,
                                             debug_summaries=debug_summaries)

        flat_action_spec = nest.flatten(self._action_spec)
        self._flat_action_spec = flat_action_spec
        self._action_dim = flat_action_spec[0].shape[0]
        self._log_pi_uniform_prior = self._critic_networks.get_uniform_prior_logpi(
        )

        self._num_critic_replicas = self._critic_networks._num_critic_replicas

        self._critic_losses = []

        for i in range(self._num_critic_replicas):
            self._critic_losses.append(
                critic_loss_ctor(name="critic_loss%d" % (i + 1)))

        self._is_continuous = flat_action_spec[0].is_continuous
        self._target_entropy = _set_target_entropy(self.name, target_entropy,
                                                   flat_action_spec)

        log_alpha = nn.Parameter(torch.Tensor([float(initial_log_alpha)]))
        self._log_alpha = log_alpha

        self._update_target = common.get_target_updater(
            models=[self._critic_networks],
            target_models=[self._target_critic_networks],
            tau=target_update_tau,
            period=target_update_period)

        if alpha_optimizer is not None:
            self.add_optimizer(alpha_optimizer, [log_alpha])
        self._distill_noise = distill_noise
Code example #16
File: mbrl_algorithm.py Project: soychanq/alf
    def __init__(self,
                 observation_spec,
                 action_spec,
                 planner_module: PlanAlgorithm,
                 env=None,
                 config: TrainerConfig = None,
                 planner_optimizer=None,
                 debug_summaries=False,
                 name="LatentMbrlAlgorithm"):
        """Create an LatentMbrlAlgorithm.
        The LatentMbrlAlgorithm takes as input a planner module for
        making decisions on actions based on the latent representation of the
        current observation as well as a latent dynamics model.

        The latent representation as well as the latent dynamics is provided by
        a latent predictive representation module, which is an instance of
        ``PredictiveRepresentationLearner``. It is set through the
        ``set_latent_predictive_representation_module()`` function. The latent
        predictive representation module should have a function
        ``predict_multi_step`` for performing multi-step imagined rollout.
        Currently it is assumed that the training of the latent representation
        module is outside of the ``LatentMbrlAlgorithm``, although the
        ``LatentMbrlAlgorithm`` can also contribute to its training by using
        the latent representation in loss calculation.

        Args:
            action_spec (BoundedTensorSpec): representing the actions.
            planner_module (PlanAlgorithm): module for generating planned action
                based on specified reward function and dynamics function
            env (Environment): The environment to interact with. env is a batched
                environment, which means that it runs multiple simulations
                simultaneously. env only needs to be provided to the root
                Algorithm.
            config (TrainerConfig): config for training. config only needs to be
                provided to the algorithm which performs `train_iter()` by
                itself.
            debug_summaries (bool): True if debug summaries should be created.
            name (str): The name of this algorithm.

        """

        super().__init__(observation_spec,
                         feature_spec=observation_spec,
                         action_spec=action_spec,
                         dynamics_module=None,
                         reward_module=None,
                         planner_module=planner_module,
                         planner_optimizer=planner_optimizer,
                         env=env,
                         config=config,
                         debug_summaries=debug_summaries,
                         name=name)

        flat_action_spec = nest.flatten(action_spec)
        action_spec = flat_action_spec[0]

        assert action_spec.is_continuous, "only supports continuous control"

        num_actions = action_spec.shape[-1]

        self._action_spec = action_spec
        self._num_actions = num_actions

        self._latent_pred_rep_module = None  # set it later