Example #1
    def __init__(self, task, approximators, gamma=0.99, lr=0.001, polyak=0.995, delay=2, capacity=10000, num_workers=1):
        """
        Initialize the TD3 off-policy RL algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            approximators (Policy, [Policy, Value], ActorCritic): approximators to optimize
            gamma (float): discount factor (which acts as a bias-variance trade-off). This parameter describes
                how much importance is given to future rewards.
            lr (float): learning rate.
            polyak (float): coefficient (between 0 and 1) used in the polyak averaging when updating the target
                approximators. If 1, it leaves the target parameter(s) unchanged; if 0, it simply copies the
                current parameter(s).
            delay (int): number of update steps to wait between each delayed policy / target network update.
            capacity (int): capacity of the experience replay storage.
            num_workers (int): number of processes / workers to run in parallel
        """

        # check given approximators
        if isinstance(approximators, (tuple, list)):

            # get the policy and Q-value approximator
            policy, q_values = None, []
            for approximator in approximators:
                if isinstance(approximator, Policy):
                    policy = approximator
                elif isinstance(approximator, QValue):
                    q_values.append(approximator)

            # check that a policy and at least one Q-value approximator were given
            if policy is None:
                raise ValueError("No policy approximator was given to the algorithm.")
            if not q_values:
                raise ValueError("No Q-value approximator was given to the algorithm.")

        else:
            raise TypeError("Expecting a list/tuple of a policy and a Q-value functions.")

        # check that there are at least 2 Q-value function approximators (the user can provide more)
        if len(q_values) < 2:
            raise ValueError("Expecting at least 2 Q-value function approximators for the TD3 algorithm.")

        # get states and actions from policy
        states, actions = policy.states, policy.actions

        # check that the actions are continuous
        if not actions.is_continuous():
            raise ValueError("The TD3 assumes that the actions are continuous, however got an action which is not.")

        # create the target policy and Q-value functions by deep-copying the current approximators
        memo = {}
        q_targets = [copy.deepcopy(q_value, memo=memo) for q_value in q_values]
        policy_target = copy.deepcopy(policy, memo=memo)

        # create action exploration strategy
        exploration = ActionExploration(policy=policy, action=actions)

        # create experience replay
        storage = ExperienceReplay(state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create target return estimator
        returns = TDQValueReturn(q_value=q_values, policy=policy_target, target_qvalue=q_targets, gamma=gamma)

        # create Q-value loss and policy loss
        q_loss = MSBELoss(td_return=returns)
        policy_loss = QLoss(q_value=q_values[0], policy=policy)  # only the first q-value is used to train the policy
        losses = [q_loss, policy_loss]

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create policy and q-value updaters
        params_updaters = [PolyakAveraging(current=policy, target=policy_target, rho=polyak)]
        for q_value, q_target in zip(q_values, q_targets):
            params_updaters.append(PolyakAveraging(current=q_value, target=q_target, rho=polyak))

        # create ticks (number of steps to wait before evaluating the loss / parameter updater)
        # this is used to delay the updates
        ticks = {updater: delay for updater in params_updaters}
        ticks.update({policy_loss: delay})

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task, exploration, storage, num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(approximators, sampler, losses, optimizer, evaluators=returns, updaters=params_updaters,
                          ticks=ticks)

        # initialize RL algorithm
        super(TD3, self).__init__(explorer, evaluator, updater)
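
For reference, the roles of the `polyak` and `delay` arguments above can be illustrated with a small standalone
sketch in plain NumPy, independent of the library's `PolyakAveraging` and `Updater` classes (all names below are
made up for illustration). The target parameters follow theta_target <- rho * theta_target + (1 - rho) * theta,
and this soft update, like the policy update, is only performed every `delay` steps.

import numpy as np

def polyak_update(target_params, current_params, rho=0.995):
    """Soft update: rho=1 leaves the targets unchanged, rho=0 copies the current parameters."""
    return [rho * t + (1. - rho) * c for t, c in zip(target_params, current_params)]

# toy parameters standing in for network weights
current = [np.ones(3)]
target = [np.zeros(3)]

delay = 2
for step in range(1, 7):
    # ... a Q-value (critic) update would happen at every step ...
    if step % delay == 0:  # delayed policy and target updates, as in TD3
        target = polyak_update(target, current, rho=0.995)
        print(step, target[0])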
Example #2
    def __init__(self,
                 task,
                 approximators,
                 gamma=0.99,
                 lr=0.001,
                 polyak=0.995,
                 capacity=10000,
                 num_workers=1):
        """
        Initialize the DDPG off-policy RL algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            approximators ([Policy, QValue]): policy and Q-value function approximator to optimize.
            gamma (float): discount factor (which acts as a bias-variance trade-off). This parameter describes
                how much importance is given to future rewards.
            lr (float): learning rate
            polyak (float): coefficient (between 0 and 1) used in the polyak averaging when updating the target
                approximators. If 1, it leaves the target parameter(s) unchanged; if 0, it simply copies the
                current parameter(s).
            capacity (int): capacity of the experience replay storage.
            num_workers (int): number of processes / workers to run in parallel
        """

        # check given approximators
        if isinstance(approximators,
                      (tuple, list)) and len(approximators) == 2:

            # get the policy and Q-value approximator
            policy, q_value = None, None
            for approximator in approximators:
                if isinstance(approximator, Policy):
                    policy = approximator
                elif isinstance(approximator, QValue):
                    q_value = approximator

            # check that the policy and the Q-value approximator are not None
            if policy is None:
                raise ValueError(
                    "No policy approximator was given to the algorithm.")
            if q_value is None:
                raise ValueError(
                    "No Q-value approximator was given to the algorithm.")

        else:
            raise TypeError(
                "Expecting a list/tuple of a policy and a Q-value function.")

        # get states and actions from policy
        states, actions = policy.states, policy.actions

        # check that the actions are continuous
        if not actions.is_continuous():
            raise ValueError(
                "DDPG assumes that the actions are continuous, but the given action space is not."
            )

        # Set target parameters equal to main parameters
        memo = {}
        q_target = copy.deepcopy(q_value, memo=memo)
        policy_target = copy.deepcopy(policy, memo=memo)

        # create action exploration strategy
        exploration = ActionExploration(policy=policy, action=actions)

        # create experience replay
        storage = ExperienceReplay(state_shapes=states.merged_shape,
                                   action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create target return estimator
        # target = QValueTarget(q_values=q_target, policy=policy_target, gamma=gamma)
        returns = TDQValueReturn(q_value=q_value,
                                 policy=policy_target,
                                 target_qvalue=q_target,
                                 gamma=gamma)

        # create Q-value loss and policy loss
        # q_loss = L2Loss(target=target, predictor=q_value)
        # q_loss = ValueLoss(returns=target, value=q_value)
        q_loss = MSBELoss(td_return=returns)
        policy_loss = QLoss(q_value=q_value, policy=policy)
        losses = [q_loss, policy_loss]

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create q value and policy updaters
        q_value_updater = PolyakAveraging(current=q_value,
                                          target=q_target,
                                          rho=polyak)
        policy_updater = PolyakAveraging(current=policy,
                                         target=policy_target,
                                         rho=polyak)

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task,
                            exploration,
                            storage,
                            num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(approximators,
                          sampler,
                          losses,
                          optimizer,
                          evaluators=returns,
                          updaters=[q_value_updater, policy_updater])

        # initialize RL algorithm
        super(DDPG, self).__init__(explorer, evaluator, updater)
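
To make the target presumably computed by `TDQValueReturn` above concrete, the DDPG bootstrap target is
y = r + gamma * (1 - done) * Q_target(s', mu_target(s')). Below is a minimal NumPy sketch with hypothetical
`q_target` / `policy_target` callables (toy stand-ins, not the library's approximator classes).

import numpy as np

def ddpg_td_target(rewards, next_states, dones, q_target, policy_target, gamma=0.99):
    """y = r + gamma * (1 - done) * Q_target(s', mu_target(s'))."""
    next_actions = policy_target(next_states)
    next_q = q_target(next_states, next_actions)
    return rewards + gamma * (1. - dones) * next_q

# toy linear stand-ins for the target networks
policy_target = lambda s: 0.1 * s
q_target = lambda s, a: (s * a).sum(axis=-1)

rewards = np.array([1.0, 0.5])
next_states = np.array([[1.0, 2.0], [0.0, 1.0]])
dones = np.array([0.0, 1.0])
print(ddpg_td_target(rewards, next_states, dones, q_target, policy_target))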
Example #3
    def __init__(self,
                 task,
                 approximators,
                 gamma=0.99,
                 lr=5e-4,
                 polyak=0.995,
                 alpha=0.2,
                 capacity=10000,
                 num_workers=1):
        """
        Initialize the SAC off-policy RL algorithm.

        Args:
            task (RLTask, Env): RL task/env to run
            approximators ([Policy, Value, QValue]): approximators to optimize.
            gamma (float): discount factor (which acts as a bias-variance trade-off). This parameter describes
                how much importance is given to future rewards.
            lr (float): learning rate
            polyak (float): coefficient (between 0 and 1) used in the polyak averaging when updating the target
                approximators. If 1, it leaves the target parameter(s) unchanged; if 0, it simply copies the
                current parameter(s).
            alpha (float): entropy regularization coefficient which controls the tradeoff between exploration and
                exploitation. Higher :attr:`alpha` means more exploration, and lower :attr:`alpha` corresponds to more
                exploitation.
            capacity (int): capacity of the experience replay storage.
            num_workers (int): number of processes / workers to run in parallel
        """

        # check approximators
        if not isinstance(approximators, (list, tuple)):
            raise TypeError(
                "Expecting the approximators to be a list containing a Policy, a Value, and at least 2 "
                "QValues.")
        policy, value, q_values = None, None, []
        for approximator in approximators:
            if isinstance(approximator, Policy):
                policy = approximator
            elif isinstance(approximator, Value):
                value = approximator
            elif isinstance(approximator, ActorCritic):
                policy = approximator.actor
                value = approximator.critic
            elif isinstance(approximator, QValue):
                q_values.append(approximator)

        if policy is None:
            raise TypeError("No policy was given to the algorithm.")
        if value is None:
            raise TypeError(
                "No value function approximator was given to the algorithm.")
        if len(q_values) == 0:
            raise TypeError(
                "No Q-value function approximators were given to the algorithm."
            )

        # set target parameters equal to main parameters for the value function
        value_target = copy.deepcopy(value, memo={})

        # create experience replay
        states, actions = policy.states, policy.actions
        storage = ExperienceReplay(state_shapes=states.merged_shape,
                                   action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create action exploration
        exploration = ActionExploration(policy)

        # create targets
        q_target = ValueTarget(values=value_target, gamma=gamma)
        v_target = EntropyValueTarget(q_values=q_values,
                                      policy=exploration,
                                      alpha=alpha)

        # create losses: the Q-value loss regresses the Q-values towards the bootstrapped value target
        q_loss = MSBELoss(td_return=q_target)
        policy_loss = QLoss(
            q_value=q_values[0], policy=policy
        )  # only the first q-value is used to train the policy
        losses = [q_loss, policy_loss]

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create parameter updater for target value function
        params_updater = PolyakAveraging(current=value,
                                         target=value_target,
                                         rho=polyak)

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task,
                            exploration,
                            storage,
                            num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(approximators,
                          sampler,
                          losses,
                          optimizer,
                          updaters=params_updater)

        # initialize RL algorithm
        super(SAC, self).__init__(explorer, evaluator, updater)
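
The entropy-regularized value target used by SAC (which `EntropyValueTarget` presumably computes above) is
V(s) = min_i Q_i(s, a) - alpha * log pi(a|s), with a sampled from the current policy; a larger `alpha` weights
the entropy bonus more heavily (more exploration). Here is a self-contained NumPy sketch with hypothetical
stand-in callables (not the library API).

import numpy as np

def sac_value_target(states, sample_action, q_functions, alpha=0.2):
    """V(s) = min_i Q_i(s, a) - alpha * log pi(a|s), with a sampled from the current policy."""
    actions, log_probs = sample_action(states)  # sampled actions and their log-likelihoods
    q_min = np.min([q(states, actions) for q in q_functions], axis=0)  # clipped double-Q estimate
    return q_min - alpha * log_probs

# toy stand-ins: a deterministic "sampler" and two linear Q-functions
sample_action = lambda s: (0.5 * s, np.full(len(s), -1.0))
q1 = lambda s, a: (s * a).sum(axis=-1)
q2 = lambda s, a: (s * a).sum(axis=-1) + 0.1

states = np.array([[1.0, 1.0], [2.0, 0.0]])
print(sac_value_target(states, sample_action, [q1, q2], alpha=0.2))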
Example #4
    def __init__(self,
                 task,
                 approximator,
                 gamma=0.99,
                 lr=5e-4,
                 capacity=10000,
                 polyak=0.995,
                 num_workers=1):
        """
        Initialize the DQN reinforcement learning algorithm.

        Args:
            task (RLTask, Env): RL task/env to run.
            approximator (ParametrizedQValueOutput, PolicyFromQValue): approximator to use and update.
            gamma (float): discount factor (which acts as a bias-variance trade-off). This parameter describes
                how much importance is given to future rewards.
            lr (float): learning rate.
            capacity (int): capacity of the experience replay storage.
            polyak (float): coefficient (between 0 and 1) used in the polyak averaging when updating the target
                approximators. If 1, it leaves the target parameter(s) unchanged; if 0, it simply copies the
                current parameter(s).
            num_workers (int): number of processes / workers to run in parallel.
        """
        # check given approximator
        if isinstance(approximator, ParametrizedQValueOutput):
            policy = PolicyFromQValue(approximator)
            q_value = approximator
        elif isinstance(approximator, PolicyFromQValue):
            policy = approximator
            q_value = approximator.value
        else:
            raise TypeError(
                "Expecting the given approximator to be an instance of `PolicyFromQValue`, or "
                "`ParametrizedQValueOutput`, instead got: {}".format(
                    type(approximator)))

        # create the target Q-value function by copying the Q-value function approximator
        q_target = copy.deepcopy(q_value, memo={})

        # get states and actions from policy
        states, actions = policy.states, policy.actions

        # create action exploration strategy
        exploration = EpsilonGreedyActionExploration(policy=policy,
                                                     action=actions)

        # create experience replay and sampler
        storage = ExperienceReplay(state_shapes=states.merged_shape,
                                   action_shapes=actions.merged_shape,
                                   capacity=capacity)
        sampler = BatchRandomSampler(storage)

        # create target return estimator
        # target = QLearningTarget(q_values=q_target, gamma=gamma)
        td_return = TDQLearningReturn(q_value=q_value,
                                      target_qvalue=q_target,
                                      gamma=gamma)

        # create loss
        # loss = HuberLoss(L2Loss(target=target, predictor=q_value))
        loss = HuberLoss(MSBELoss(td_return=td_return), delta=1.)

        # create optimizer
        optimizer = Adam(learning_rate=lr)

        # create target updater
        # target_updater = CopyParameter(current=q_value, target=q_target, sleep_count=100)
        target_updater = PolyakAveraging(current=q_value,
                                         target=q_target,
                                         rho=polyak)

        # define the 3 main steps in RL: explore, evaluate, and update
        explorer = Explorer(task,
                            exploration,
                            storage,
                            num_workers=num_workers)
        evaluator = Evaluator(None)  # off-policy
        updater = Updater(policy,
                          sampler,
                          loss,
                          optimizer,
                          evaluators=[td_return],
                          updaters=[target_updater])

        # initialize RL algorithm
        super(DQN, self).__init__(explorer, evaluator, updater)
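
As a reference for the quantities that `TDQLearningReturn` and the Huber-clipped Bellman error presumably
correspond to above, here is a self-contained NumPy sketch of the Q-learning target
y = r + gamma * (1 - done) * max_a Q_target(s', a) and a Huber loss (the `q_target_net` callable and the other
names are hypothetical stand-ins, not the library's approximators).

import numpy as np

def q_learning_target(rewards, next_states, dones, q_target_net, gamma=0.99):
    """y = r + gamma * (1 - done) * max_a Q_target(s', a)."""
    return rewards + gamma * (1. - dones) * q_target_net(next_states).max(axis=-1)

def huber_loss(td_errors, delta=1.0):
    """Quadratic for |error| <= delta, linear beyond it (less sensitive to outliers than a pure MSE)."""
    abs_err = np.abs(td_errors)
    quadratic = np.minimum(abs_err, delta)
    return np.mean(0.5 * quadratic ** 2 + delta * (abs_err - quadratic))

# toy check: 2 transitions, 3 discrete actions
q_target_net = lambda s: np.stack([s.sum(axis=-1), 2. * s.sum(axis=-1), 0. * s.sum(axis=-1)], axis=-1)
rewards = np.array([1.0, 0.0])
next_states = np.array([[1.0, 0.0], [0.5, 0.5]])
dones = np.array([0.0, 1.0])

y = q_learning_target(rewards, next_states, dones, q_target_net)
q_pred = np.array([2.0, 0.5])  # Q(s, a) predicted by the current network for the taken actions
print(y, huber_loss(q_pred - y, delta=1.0))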