Ejemplo n.º 1
0
    def __init__(self, actor_approximator, critic_approximator, policy_class,
                 mdp_info, batch_size, initial_replay_size, max_replay_size,
                 tau, actor_params, critic_params, policy_params,
                 actor_fit_params=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            actor_approximator (object): the approximator to use for the actor;
            critic_approximator (object): the approximator to use for the
                critic;
            policy_class (Policy): class of the policy;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of coefficient for soft updates;
            actor_params (dict): parameters of the actor approximator to
                build;
            critic_params (dict): parameters of the critic approximator to
                build;
            policy_params (dict): parameters of the policy to build;
            actor_fit_params (dict, None): parameters of the fitting algorithm
                of the actor approximator;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator;

        """
        self._actor_fit_params = dict() if actor_fit_params is None else actor_fit_params
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau

        self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(critic_approximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(critic_approximator,
                                                     **target_critic_params)

        if 'loss' not in actor_params:
            actor_params['loss'] = ActorLoss(self._critic_approximator)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(actor_approximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(actor_approximator,
                                                    **target_actor_params)

        self._target_actor_approximator.model.set_weights(
            self._actor_approximator.model.get_weights())
        self._target_critic_approximator.model.set_weights(
            self._critic_approximator.model.get_weights())

        policy = policy_class(self._actor_approximator, **policy_params)
        super().__init__(policy, mdp_info)
Ejemplo n.º 2
0
    def __init__(self, mdp_info, policy_class, policy_params,
                 batch_size, initial_replay_size, max_replay_size,
                 tau, critic_params, actor_params, actor_optimizer,
                 policy_delay=1, critic_fit_params=None):
        """
        Constructor.

        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of coefficient for soft updates;
            actor_params (dict): parameters of the actor approximator to
                build;
            critic_params (dict): parameters of the critic approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            policy_delay (int, 1): the number of updates of the critic after
                which an actor update is implemented;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator;

        """

        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau
        self._policy_delay = policy_delay
        self._fit_count = 0

        self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target()

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()
        super().__init__(policy, mdp_info, actor_optimizer, policy_parameters)
Ejemplo n.º 3
0
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 initial_replay_size, max_replay_size,
                 approximator_params, target_update_frequency,
                 fit_params=None, n_approximators=1, clip_reward=True):
        """
        Constructor.

        Args:
            approximator (object): the approximator to use to fit the
               Q-function;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            approximator_params (dict): parameters of the approximator to
                build;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator;
            n_approximators (int, 1): the number of approximator to use in
                ``AverageDQN``;
            clip_reward (bool, True): whether to clip the reward or not.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency

        self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

        self._n_updates = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_target = deepcopy(approximator_params)
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super().__init__(policy, mdp_info)
Ejemplo n.º 4
0
    def __init__(self, approximator, policy, mdp_info, params):
        alg_params = params['algorithm_params']
        self._batch_size = alg_params.get('batch_size')
        self._n_approximators = alg_params.get('n_approximators', 1)
        self._clip_reward = alg_params.get('clip_reward', True)
        self._train_frequency = alg_params.get('train_frequency')
        self._target_update_frequency = alg_params.get(
            'target_update_frequency')
        self._max_no_op_actions = alg_params.get('max_no_op_actions', 0)
        self._no_op_action_value = alg_params.get('no_op_action_value', 0)

        self._replay_memory = ReplayMemory(
            mdp_info, alg_params.get('initial_replay_size'),
            alg_params.get('max_replay_size'),
            alg_params.get('history_length', 1))
        self._buffer = Buffer(size=alg_params.get('history_length', 1))

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(params['approximator_params'])
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(params['approximator_params'])
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in xrange(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info, params)
Ejemplo n.º 5
0
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 initial_replay_size, max_replay_size,
                 target_update_frequency=2500, fit_params=None,
                 approximator_params=None, n_approximators=1,
                 history_length=1, clip_reward=True, max_no_op_actions=0,
                 no_op_action_value=0, dtype=np.float32):
        """
        Constructor.

        Args:
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator;
            approximator_params (dict, None): parameters of the approximator to
                build;
            n_approximators (int, 1): the number of approximator to use in
                ``AverageDQN``;
            history_length (int, 1): the number of samples composing a state;
            clip_reward (bool, True): whether to clip the reward or not;
            max_no_op_actions (int, 0): maximum number of no-op actions that
                can be sampled;
            no_op_action_value (int, 0): value of the no-op action;
            dtype (object, np.float32): dtype of the state array.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value

        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_target = deepcopy(approximator_params)
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super().__init__(policy, mdp_info)
Ejemplo n.º 6
0
    def __init__(self, mdp_info,
                 batch_size, initial_replay_size, max_replay_size,
                 warmup_transitions, tau, lr_alpha,
                 actor_mu_params, actor_sigma_params,
                 actor_optimizer, critic_params,
                 target_entropy=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            warmup_transitions (int): number of samples to accumulate in the
                replay memory to start the policy fitting;
            tau (float): value of coefficient for soft updates;
            lr_alpha (float): Learning rate for the entropy coefficient;
            actor_mu_params (dict): parameters of the actor mean approximator
                to build;
            actor_sigma_params (dict): parameters of the actor sigma approximator
                to build;
            actor_optimizer (dict): parameters to specify the actor
                optimizer algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            target_entropy (float, None): target entropy for the policy, if None
                a default value is computed ;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._warmup_transitions = warmup_transitions
        self._tau = tau

        if target_entropy is None:
            self._target_entropy = -np.prod(mdp_info.action_space.shape).astype(np.float32)
        else:
            self._target_entropy = target_entropy

        self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

        if 'n_models' in critic_params.keys():
            assert critic_params['n_models'] == 2
        else:
            critic_params['n_models'] = 2

        if 'prediction' in critic_params.keys():
            assert critic_params['prediction'] == 'min'
        else:
            critic_params['prediction'] = 'min'

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        self._log_alpha = torch.tensor(0., requires_grad=True, dtype=torch.float32)
        self._alpha_optim = optim.Adam([self._log_alpha], lr=lr_alpha)

        actor_mu_approximator = Regressor(TorchApproximator,
                                          **actor_mu_params)
        actor_sigma_approximator = Regressor(TorchApproximator,
                                             **actor_sigma_params)

        policy = SACPolicy(actor_mu_approximator,
                           actor_sigma_approximator,
                           mdp_info.action_space.low,
                           mdp_info.action_space.high)

        self._init_target()

        policy_parameters = chain(actor_mu_approximator.model.network.parameters(),
                                  actor_sigma_approximator.model.network.parameters())
        super().__init__(policy, mdp_info, actor_optimizer, policy_parameters)