def __init__(self, task, approximators, gamma=0.99, lr=0.001, polyak=0.995, delay=2, capacity=10000,
             num_workers=1):
    """
    Initialize the TD3 off-policy RL algorithm.

    Args:
        task (RLTask, Env): RL task/env to run.
        approximators ([Policy, QValue, QValue, ...]): list/tuple containing the policy and at least two
            Q-value function approximators to optimize.
        gamma (float): discount factor (which is a bias-variance trade-off). This parameter describes how much
            importance the future rewards have.
        lr (float): learning rate.
        polyak (float): coefficient (between 0 and 1) used in the polyak averaging when updating the target
            approximators. If 1, it will leave the target parameter(s) unchanged, if 0 it will just copy the
            current parameter(s).
        delay (int): number of update steps to wait before performing the delayed policy and target updates.
        capacity (int): capacity of the experience replay storage.
        num_workers (int): number of processes / workers to run in parallel.
    """
    # check given approximators
    if isinstance(approximators, (tuple, list)):
        # get the policy and Q-value approximators
        policy, q_values = None, []
        for approximator in approximators:
            if isinstance(approximator, Policy):
                policy = approximator
            elif isinstance(approximator, QValue):
                q_values.append(approximator)

        # check that the policy and Q-value approximators are not None
        if policy is None:
            raise ValueError("No policy approximator was given to the algorithm.")
        if not q_values:
            raise ValueError("No Q-value approximator was given to the algorithm.")
    else:
        raise TypeError("Expecting a list/tuple containing a policy and Q-value function approximators.")

    # check that there are at least 2 Q-value function approximators (the user can have more)
    if len(q_values) < 2:
        raise ValueError("Expecting at least 2 Q-value function approximators for the TD3 algorithm.")

    # get states and actions from policy
    states, actions = policy.states, policy.actions

    # check that the actions are continuous
    if not actions.is_continuous():
        raise ValueError("TD3 assumes that the actions are continuous, however got an action which is not.")

    # create the target Q-value and policy approximators by copying the current ones
    memo = {}
    q_targets = [copy.deepcopy(q_value, memo=memo) for q_value in q_values]
    policy_target = copy.deepcopy(policy, memo=memo)

    # create action exploration strategy
    exploration = ActionExploration(policy=policy, action=actions)

    # create experience replay and sampler
    storage = ExperienceReplay(state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                               capacity=capacity)
    sampler = BatchRandomSampler(storage)

    # create target return estimator
    returns = TDQValueReturn(q_value=q_values, policy=policy_target, target_qvalue=q_targets, gamma=gamma)

    # create Q-value loss and policy loss
    q_loss = MSBELoss(td_return=returns)
    policy_loss = QLoss(q_value=q_values[0], policy=policy)  # only the first Q-value is used to train the policy
    losses = [q_loss, policy_loss]

    # create optimizer
    optimizer = Adam(learning_rate=lr)

    # create policy and Q-value target updaters
    params_updaters = [PolyakAveraging(current=policy, target=policy_target, rho=polyak)]
    for q_value, q_target in zip(q_values, q_targets):
        params_updaters.append(PolyakAveraging(current=q_value, target=q_target, rho=polyak))

    # create ticks (number of steps to wait before evaluating the loss / parameter updater);
    # this is used to implement the delayed policy and target updates of TD3
    ticks = {updater: delay for updater in params_updaters}
    ticks.update({policy_loss: delay})

    # define the 3 main steps in RL: explore, evaluate, and update
    explorer = Explorer(task, exploration, storage, num_workers=num_workers)
    evaluator = Evaluator(None)  # off-policy
    updater = Updater(approximators, sampler, losses, optimizer, evaluators=returns, updaters=params_updaters,
                      ticks=ticks)

    # initialize RL algorithm
    super(TD3, self).__init__(explorer, evaluator, updater)
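

# Illustrative sketch (assumption, not part of the framework above): a minimal, framework-free version of the
# clipped double-Q TD target that `TDQValueReturn` is assumed to compute when given the list of target Q-values
# and the target policy, i.e. y = r + gamma * (1 - done) * min_i Q_i'(s', pi'(s')). Target-policy smoothing noise
# is omitted for brevity. The function below and its arguments are hypothetical and only illustrate the math.
def _td3_target_sketch(reward, next_state, done, target_policy, target_q_values, gamma=0.99):
    """Return the clipped double-Q TD target for a single transition (illustrative only)."""
    next_action = target_policy(next_state)                            # a' = pi'(s')
    q_values = [q(next_state, next_action) for q in target_q_values]   # Q_i'(s', a') for each target critic
    return reward + gamma * (1. - float(done)) * min(q_values)         # y = r + gamma * min_i Q_i'(s', a')
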
def __init__(self, task, approximators, gamma=0.99, lr=0.001, polyak=0.995, capacity=10000, num_workers=1):
    """
    Initialize the DDPG off-policy RL algorithm.

    Args:
        task (RLTask, Env): RL task/env to run.
        approximators ([Policy, QValue]): policy and Q-value function approximator to optimize.
        gamma (float): discount factor (which is a bias-variance trade-off). This parameter describes how much
            importance the future rewards have.
        lr (float): learning rate.
        polyak (float): coefficient (between 0 and 1) used in the polyak averaging when updating the target
            approximators. If 1, it will leave the target parameter(s) unchanged, if 0 it will just copy the
            current parameter(s).
        capacity (int): capacity of the experience replay storage.
        num_workers (int): number of processes / workers to run in parallel.
    """
    # check given approximators
    if isinstance(approximators, (tuple, list)) and len(approximators) == 2:
        # get the policy and Q-value approximator
        policy, q_value = None, None
        for approximator in approximators:
            if isinstance(approximator, Policy):
                policy = approximator
            elif isinstance(approximator, QValue):
                q_value = approximator

        # check that the policy and Q-value approximator are not None
        if policy is None:
            raise ValueError("No policy approximator was given to the algorithm.")
        if q_value is None:
            raise ValueError("No Q-value approximator was given to the algorithm.")
    else:
        raise TypeError("Expecting a list/tuple containing a policy and a Q-value function approximator.")

    # get states and actions from policy
    states, actions = policy.states, policy.actions

    # check that the actions are continuous
    if not actions.is_continuous():
        raise ValueError("DDPG assumes that the actions are continuous, however got an action which is not.")

    # set target parameters equal to main parameters
    memo = {}
    q_target = copy.deepcopy(q_value, memo=memo)
    policy_target = copy.deepcopy(policy, memo=memo)

    # create action exploration strategy
    exploration = ActionExploration(policy=policy, action=actions)

    # create experience replay and sampler
    storage = ExperienceReplay(state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                               capacity=capacity)
    sampler = BatchRandomSampler(storage)

    # create target return estimator
    # target = QValueTarget(q_values=q_target, policy=policy_target, gamma=gamma)
    returns = TDQValueReturn(q_value=q_value, policy=policy_target, target_qvalue=q_target, gamma=gamma)

    # create Q-value loss and policy loss
    # q_loss = L2Loss(target=target, predictor=q_value)
    # q_loss = ValueLoss(returns=target, value=q_value)
    q_loss = MSBELoss(td_return=returns)
    policy_loss = QLoss(q_value=q_value, policy=policy)
    losses = [q_loss, policy_loss]

    # create optimizer
    optimizer = Adam(learning_rate=lr)

    # create Q-value and policy target updaters
    q_value_updater = PolyakAveraging(current=q_value, target=q_target, rho=polyak)
    policy_updater = PolyakAveraging(current=policy, target=policy_target, rho=polyak)

    # define the 3 main steps in RL: explore, evaluate, and update
    explorer = Explorer(task, exploration, storage, num_workers=num_workers)
    evaluator = Evaluator(None)  # off-policy
    updater = Updater(approximators, sampler, losses, optimizer, evaluators=returns,
                      updaters=[q_value_updater, policy_updater])

    # initialize RL algorithm
    super(DDPG, self).__init__(explorer, evaluator, updater)
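

# Illustrative sketch (assumption, not part of the framework above): the soft target update that `PolyakAveraging`
# is assumed to perform, theta_target <- rho * theta_target + (1 - rho) * theta. With rho=1 the target is left
# unchanged, and with rho=0 the current parameters are copied, matching the behaviour described in the docstrings
# above. The function below is hypothetical and only meant to illustrate the update rule.
def _polyak_update_sketch(current_params, target_params, rho=0.995):
    """Return the soft-updated target parameters, one per current parameter (illustrative only)."""
    return [rho * theta_target + (1. - rho) * theta
            for theta, theta_target in zip(current_params, target_params)]
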
def __init__(self, task, approximators, gamma=0.99, lr=5e-4, polyak=0.995, alpha=0.2, capacity=10000,
             num_workers=1):
    """
    Initialize the SAC off-policy RL algorithm.

    Args:
        task (RLTask, Env): RL task/env to run.
        approximators ([Policy, Value, QValue, ...]): list containing the policy, the state-value function, and
            the Q-value function approximators to optimize.
        gamma (float): discount factor (which is a bias-variance trade-off). This parameter describes how much
            importance the future rewards have.
        lr (float): learning rate.
        polyak (float): coefficient (between 0 and 1) used in the polyak averaging when updating the target
            approximators. If 1, it will leave the target parameter(s) unchanged, if 0 it will just copy the
            current parameter(s).
        alpha (float): entropy regularization coefficient which controls the trade-off between exploration and
            exploitation. Higher :attr:`alpha` means more exploration, and lower :attr:`alpha` corresponds to
            more exploitation.
        capacity (int): capacity of the experience replay storage.
        num_workers (int): number of processes / workers to run in parallel.
    """
    # check approximators
    if not isinstance(approximators, collections.abc.Iterable):
        raise TypeError("Expecting the approximators to be a list containing a Policy, a Value, and at least 2 "
                        "QValues.")
    policy, value, q_values = None, None, []
    for approximator in approximators:
        if isinstance(approximator, Policy):
            policy = approximator
        elif isinstance(approximator, Value):
            value = approximator
        elif isinstance(approximator, ActorCritic):
            policy = approximator.actor
            value = approximator.critic
        elif isinstance(approximator, QValue):
            q_values.append(approximator)
    if policy is None:
        raise TypeError("No policy was given to the algorithm.")
    if value is None:
        raise TypeError("No value function approximator was given to the algorithm.")
    if len(q_values) == 0:
        raise TypeError("No Q-value function approximators were given to the algorithm.")

    # set target parameters equal to main parameters for the value function
    value_target = copy.deepcopy(value, memo={})

    # create experience replay and sampler
    states, actions = policy.states, policy.actions
    storage = ExperienceReplay(state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                               capacity=capacity)
    sampler = BatchRandomSampler(storage)

    # create action exploration
    exploration = ActionExploration(policy)

    # create targets: the Q-values are regressed towards the value-based TD target, and the value function
    # towards the entropy-regularized target based on the Q-values and the policy's log-probabilities
    q_target = ValueTarget(values=value_target, gamma=gamma)
    v_target = EntropyValueTarget(q_values=q_values, policy=exploration, alpha=alpha)

    # create losses
    q_loss = MSBELoss(td_return=q_target)
    policy_loss = QLoss(q_value=q_values[0], policy=policy)  # only the first Q-value is used to train the policy
    losses = [q_loss, policy_loss]

    # create optimizer
    optimizer = Adam(learning_rate=lr)

    # create parameter updater for target value function
    params_updater = PolyakAveraging(current=value, target=value_target, rho=polyak)

    # define the 3 main steps in RL: explore, evaluate, and update
    explorer = Explorer(task, exploration, storage, num_workers=num_workers)
    evaluator = Evaluator(None)  # off-policy
    updater = Updater(approximators, sampler, losses, optimizer, updaters=[params_updater])

    # initialize RL algorithm
    super(SAC, self).__init__(explorer, evaluator, updater)
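

# Illustrative sketch (assumption, not part of the framework above): the entropy-regularized state-value target
# that `EntropyValueTarget` is assumed to compute for SAC, V_target(s) = min_i Q_i(s, a) - alpha * log pi(a|s)
# with the action sampled from the current policy. The function below and its arguments (a callable returning an
# action and its log-probability, and a list of Q-value callables) are hypothetical and only illustrate the math.
def _sac_value_target_sketch(state, sample_action_and_log_prob, q_values, alpha=0.2):
    """Return the entropy-regularized value target for one state (illustrative only)."""
    action, log_prob = sample_action_and_log_prob(state)    # a ~ pi(.|s) and log pi(a|s)
    min_q = min(q(state, action) for q in q_values)         # min_i Q_i(s, a)
    return min_q - alpha * log_prob                         # V_target(s)
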
def __init__(self, task, approximator, gamma=0.99, lr=5e-4, capacity=10000, polyak=0.995, num_workers=1):
    """
    Initialize the DQN reinforcement learning algorithm.

    Args:
        task (RLTask, Env): RL task/env to run.
        approximator (ParametrizedQValueOutput, PolicyFromQValue): approximator to use and update.
        gamma (float): discount factor (which is a bias-variance trade-off). This parameter describes how much
            importance the future rewards have.
        lr (float): learning rate.
        capacity (int): capacity of the experience replay storage.
        polyak (float): coefficient (between 0 and 1) used in the polyak averaging when updating the target
            approximators. If 1, it will leave the target parameter(s) unchanged, if 0 it will just copy the
            current parameter(s).
        num_workers (int): number of processes / workers to run in parallel.
    """
    # check given approximator
    if isinstance(approximator, ParametrizedQValueOutput):
        policy = PolicyFromQValue(approximator)
        q_value = approximator
    elif isinstance(approximator, PolicyFromQValue):
        policy = approximator
        q_value = approximator.value
    else:
        raise TypeError("Expecting the given approximator to be an instance of `PolicyFromQValue`, or "
                        "`ParametrizedQValueOutput`, instead got: {}".format(type(approximator)))

    # create the target Q-value function by copying the Q-value function approximator
    q_target = copy.deepcopy(q_value, memo={})

    # get states and actions from policy
    states, actions = policy.states, policy.actions

    # create action exploration strategy
    exploration = EpsilonGreedyActionExploration(policy=policy, action=actions)

    # create experience replay and sampler
    storage = ExperienceReplay(state_shapes=states.merged_shape, action_shapes=actions.merged_shape,
                               capacity=capacity)
    sampler = BatchRandomSampler(storage)

    # create target return estimator
    # target = QLearningTarget(q_values=q_target, gamma=gamma)
    td_return = TDQLearningReturn(q_value=q_value, target_qvalue=q_target, gamma=gamma)

    # create loss
    # loss = HuberLoss(L2Loss(target=target, predictor=q_value))
    loss = HuberLoss(MSBELoss(td_return=td_return), delta=1.)

    # create optimizer
    optimizer = Adam(learning_rate=lr)

    # create target updater
    # target_updater = CopyParameter(current=q_value, target=q_target, sleep_count=100)
    target_updater = PolyakAveraging(current=q_value, target=q_target, rho=polyak)

    # define the 3 main steps in RL: explore, evaluate, and update
    explorer = Explorer(task, exploration, storage, num_workers=num_workers)
    evaluator = Evaluator(None)  # off-policy
    updater = Updater(policy, sampler, loss, optimizer, evaluators=[td_return], updaters=[target_updater])

    # initialize RL algorithm
    super(DQN, self).__init__(explorer, evaluator, updater)
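

# Illustrative sketch (assumption, not part of the framework above): the Q-learning TD target and Huber loss that
# `TDQLearningReturn` and `HuberLoss(MSBELoss(...), delta=1.)` are assumed to implement,
# y = r + gamma * (1 - done) * max_a' Q_target(s', a') with the Huber penalty applied to the TD error y - Q(s, a).
# The functions below and their arguments are hypothetical; `target_q` is assumed to return an iterable of
# per-action values for the given state.
def _dqn_target_sketch(reward, next_state, done, target_q, gamma=0.99):
    """Return the Q-learning TD target for a single transition (illustrative only)."""
    return reward + gamma * (1. - float(done)) * max(target_q(next_state))  # max over next-state action values


def _huber_loss_sketch(td_error, delta=1.):
    """Huber loss on the TD error: quadratic near zero, linear beyond `delta` (illustrative only)."""
    if abs(td_error) <= delta:
        return 0.5 * td_error ** 2
    return delta * (abs(td_error) - 0.5 * delta)
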