Beispiel #1
0
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 lr=5.0e-4,
                 alpha=2,
                 ployak=0.995,
                 hidden_units=None,
                 **kwargs):
        """Soft Q-Learning (SQL) agent for discrete action spaces.

        Args:
            s_dim: dimensionality of the vector observation.
            visual_sources: number of visual observation sources.
            visual_resolution: resolution of the visual observations.
            a_dim: number of discrete actions.
            is_continuous: must be False — SQL only supports discrete actions.
            lr: learning rate for the Q-network optimizer.
            alpha: entropy temperature of soft Q-learning.
            ployak: Polyak (soft-update) coefficient for the target network.
            hidden_units: hidden layer sizes of the Q-network; defaults to
                [32, 32] when None.

        Raises:
            AssertionError: if ``is_continuous`` is True.
        """
        assert not is_continuous, 'sql only support discrete action space'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        # None-sentinel instead of a mutable [32, 32] default, which would
        # be shared across every instantiation of this class.
        if hidden_units is None:
            hidden_units = [32, 32]
        self.alpha = alpha
        self.ployak = ployak

        # Factory so online and target networks share one architecture.
        _q_net = lambda: rls.critic_q_all(self.feat_dim, self.a_dim,
                                          hidden_units)

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        # Initialize the target network with the online network's weights.
        self.update_target_net_weights(self.q_target_net.weights,
                                       self.q_net.weights)

        self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
Beispiel #2
0
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 alpha=0.2,
                 beta=0.1,
                 ployak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_episode=100,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 hidden_units=None,
                 **kwargs):
        """MaxSQN agent (soft Q-learning with double Q) for discrete actions.

        Args:
            s_dim: dimensionality of the vector observation.
            visual_sources: number of visual observation sources.
            visual_resolution: resolution of the visual observations.
            a_dim: number of discrete actions.
            is_continuous: must be False — MaxSQN only supports discrete actions.
            alpha: fixed entropy temperature (used when auto_adaption is False).
            beta: scale on log(a_dim) for the target entropy.
            ployak: Polyak (soft-update) coefficient for the target networks.
            eps_init/eps_mid/eps_final: epsilon schedule end-points.
            init2mid_annealing_episode: episodes to anneal eps_init -> eps_mid.
            use_epsilon: whether to use epsilon-greedy exploration.
            q_lr: learning rate for the critic optimizer.
            alpha_lr: learning rate for the temperature optimizer.
            auto_adaption: learn log_alpha automatically when True.
            hidden_units: hidden layer sizes of each Q-network; defaults to
                [32, 32] when None.

        Raises:
            AssertionError: if ``is_continuous`` is True.
        """
        assert not is_continuous, 'maxsqn only support discrete action space'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        # None-sentinel instead of a mutable [32, 32] default, which would
        # be shared across every instantiation of this class.
        if hidden_units is None:
            hidden_units = [32, 32]
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_episode=init2mid_annealing_episode,
            max_episode=self.max_episode)
        self.use_epsilon = use_epsilon
        self.ployak = ployak
        # With auto adaption the temperature is a trainable tf.Variable
        # (log-space for positivity); otherwise a fixed scalar.
        self.log_alpha = alpha if not auto_adaption else tf.Variable(
            initial_value=0.0,
            name='log_alpha',
            dtype=tf.float32,
            trainable=True)
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)

        # Factory so both critics (and their targets) share one architecture.
        _q_net = lambda: rls.critic_q_all(self.feat_dim, self.a_dim,
                                          hidden_units)
        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        # Initialize the target networks with the online networks' weights.
        self.update_target_net_weights(self.critic_target_net.weights,
                                       self.critic_net.weights)
        self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
        self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer, [self.q_lr, self.alpha_lr])

        self.model_recorder(
            dict(critic_net=self.critic_net,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_alpha=self.optimizer_alpha))
Beispiel #3
0
 def _q_net():
     # Build a Q-head that outputs one value per option, using the
     # 'q' entry of the hidden-units configuration.
     q_head = rls.critic_q_all(self.feat_dim, self.options_num,
                               hidden_units['q'])
     return q_head
Beispiel #4
0
    def __init__(
            self,
            s_dim,
            visual_sources,
            visual_resolution,
            a_dim,
            is_continuous,
            q_lr=5.0e-3,
            intra_option_lr=5.0e-4,
            termination_lr=5.0e-4,
            interest_lr=5.0e-4,
            boltzmann_temperature=1.0,
            options_num=4,
            ent_coff=0.01,
            double_q=False,
            use_baseline=True,
            terminal_mask=True,
            termination_regularizer=0.01,
            assign_interval=1000,
            hidden_units=None,
            **kwargs):
        """Interest/option-critic agent with Q, intra-option, termination and
        interest networks.

        Args:
            s_dim: dimensionality of the vector observation.
            visual_sources: number of visual observation sources.
            visual_resolution: resolution of the visual observations.
            a_dim: action dimensionality (discrete count or continuous size).
            is_continuous: whether the action space is continuous.
            q_lr: learning rate for the option-value (Q) network.
            intra_option_lr: learning rate for the intra-option policy.
            termination_lr: learning rate for the termination network.
            interest_lr: learning rate for the interest network.
            boltzmann_temperature: temperature for option selection.
            options_num: number of options.
            ent_coff: entropy bonus coefficient.
            double_q: use double-Q targets when True.
            use_baseline: subtract a baseline in the intra-option update.
            terminal_mask: mask bootstrapping at terminal states.
            termination_regularizer: regularizer on termination probability.
            assign_interval: steps between hard target-network copies.
            hidden_units: dict of hidden layer sizes per sub-network; defaults
                to {'q': [32, 32], 'intra_option': [32, 32],
                'termination': [32, 32], 'interest': [32, 32]} when None.
        """
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        # None-sentinel instead of a mutable dict default, which would be
        # shared across every instantiation of this class.
        if hidden_units is None:
            hidden_units = {
                'q': [32, 32],
                'intra_option': [32, 32],
                'termination': [32, 32],
                'interest': [32, 32]
            }
        self.assign_interval = assign_interval
        self.options_num = options_num
        self.termination_regularizer = termination_regularizer
        self.ent_coff = ent_coff
        self.use_baseline = use_baseline
        self.terminal_mask = terminal_mask
        self.double_q = double_q
        self.boltzmann_temperature = boltzmann_temperature

        def _q_net():
            # Factory so online and target Q-networks share one architecture.
            return rls.critic_q_all(self.feat_dim, self.options_num,
                                    hidden_units['q'])

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.intra_option_net = rls.oc_intra_option(
            self.feat_dim, self.a_dim, self.options_num,
            hidden_units['intra_option'])
        # Sigmoid heads: termination and interest outputs are probabilities.
        self.termination_net = rls.critic_q_all(self.feat_dim,
                                                self.options_num,
                                                hidden_units['termination'],
                                                'sigmoid')
        self.interest_net = rls.critic_q_all(self.feat_dim, self.options_num,
                                             hidden_units['interest'],
                                             'sigmoid')
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        self.actor_tv = self.intra_option_net.trainable_variables
        if self.is_continuous:
            # Per-option log standard deviation for the Gaussian policy.
            self.log_std = tf.Variable(initial_value=-0.5 * np.ones(
                (self.options_num, self.a_dim), dtype=np.float32),
                                       trainable=True)  # [P, A]
            self.actor_tv += [self.log_std]
        # Initialize the target network with the online network's weights.
        self.update_target_net_weights(self.q_target_net.weights,
                                       self.q_net.weights)

        self.q_lr, self.intra_option_lr, self.termination_lr, self.interest_lr = map(
            self.init_lr, [q_lr, intra_option_lr, termination_lr, interest_lr])
        # Gradient value clipping at 5.0 on every optimizer.
        self.q_optimizer = self.init_optimizer(self.q_lr, clipvalue=5.)
        self.intra_option_optimizer = self.init_optimizer(self.intra_option_lr,
                                                          clipvalue=5.)
        self.termination_optimizer = self.init_optimizer(self.termination_lr,
                                                         clipvalue=5.)
        self.interest_optimizer = self.init_optimizer(self.interest_lr,
                                                      clipvalue=5.)

        self.model_recorder(
            dict(q_net=self.q_net,
                 intra_option_net=self.intra_option_net,
                 termination_net=self.termination_net,
                 interest_net=self.interest_net,
                 q_optimizer=self.q_optimizer,
                 intra_option_optimizer=self.intra_option_optimizer,
                 termination_optimizer=self.termination_optimizer,
                 interest_optimizer=self.interest_optimizer))
Beispiel #5
0
        # Factory returning a fresh Q-network over all actions; presumably
        # called once per network copy (online/target) — confirm against the
        # surrounding method, which is not fully visible here.
        def _q_net(): return rls.critic_q_all(self.feat_dim, self.a_dim, hidden_units)

        self.q_net = _q_net()