Example 1
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(self, dict(
         action_pdtype='default',
         action_policy='default',
         explore_var_spec=None,
         entropy_coef_spec=None,
         policy_loss_coef=1.0,
     ))
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         # theoretically, REINFORCE does not need a separate exploration variable; but in this implementation we have such an option
         'explore_var_spec',
         'gamma',  # the discount factor
         'entropy_coef_spec',
         'policy_loss_coef',
         'training_frequency',
     ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
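
All four variants of init_algorithm_params lean on util.set_attr: the first call installs defaults from a plain dict, the second copies the listed keys from self.algorithm_spec so spec values override those defaults. The helper itself is not reproduced on this page, so the following is only a sketch of how such a helper could behave, inferred from the call sites rather than taken from the library's source:

def set_attr(obj, attr_dict, keys=None):
    '''Copy entries of attr_dict onto obj as attributes; if keys is given, copy only those keys (illustrative sketch).'''
    if keys is not None:
        attr_dict = {k: v for k, v in attr_dict.items() if k in keys}
    for attr, val in attr_dict.items():
        setattr(obj, attr, val)
    return obj

Because the spec-based call runs after the defaults call, anything present in self.algorithm_spec wins, while keys missing from the spec simply keep their default.
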
Example 2
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters.'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # explore_var is epsilon, tau, etc., depending on the action policy
             # these control the trade-off between exploration and exploitation
             'explore_var_spec',
             'gamma',  # the discount factor
             'training_frequency',  # how often to train in batch training (once every training_frequency time steps)
         ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(
         self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
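
Here explore_var_spec is wrapped in policy_util.VarScheduler and only its start_val is read at init time; the later examples wrap entropy_coef_spec and clip_eps_spec the same way. The scheduler class is not shown in these excerpts, so the sketch below is an assumption about its interface: start_val matches the attribute used above, while end_val, start_step, end_step and update are hypothetical fields chosen to illustrate a linear-decay schedule.

class VarScheduler:
    '''Linearly anneal a value from start_val to end_val between start_step and end_step (illustrative sketch).'''

    def __init__(self, var_spec=None):
        spec = var_spec or {}
        self.start_val = spec.get('start_val', 1.0)
        self.end_val = spec.get('end_val', self.start_val)
        self.start_step = spec.get('start_step', 0)
        self.end_step = spec.get('end_step', 0)

    def update(self, step):
        '''Return the scheduled value at the given environment step.'''
        if step <= self.start_step or self.end_step <= self.start_step:
            return self.start_val
        if step >= self.end_step:
            return self.end_val
        frac = (step - self.start_step) / (self.end_step - self.start_step)
        return self.start_val + frac * (self.end_val - self.start_val)

A training loop would presumably refresh body.explore_var from something like scheduler.update(step) as steps accumulate; that call site is outside these excerpts.
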
Example 3
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
             entropy_coef_spec=None,
             policy_loss_coef=1.0,
             val_loss_coef=1.0,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # theoretically, AC does not need a separate exploration variable; but in this implementation we have such an option
             'explore_var_spec',
             'gamma',  # the discount factor
             'lam',
             'num_step_returns',
             'entropy_coef_spec',
             'policy_loss_coef',
             'val_loss_coef',
             'training_frequency',
         ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(
         self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(
             self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
     # Select appropriate methods to calculate advs and v_targets for training
     if self.lam is not None:
         self.calc_advs_v_targets = self.calc_gae_advs_v_targets
     elif self.num_step_returns is not None:
         self.calc_advs_v_targets = self.calc_nstep_advs_v_targets
     else:
         self.calc_advs_v_targets = self.calc_ret_advs_v_targets
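
The closing if/elif/else picks the advantage estimator from the spec: a non-null lam selects GAE, otherwise a non-null num_step_returns selects n-step returns, and with both null it falls back to plain Monte Carlo returns. The fragments below use hypothetical values (0.95 and 5 are placeholders, not from the source) purely to show how each branch is reached:

# hypothetical algorithm_spec fragments; only 'lam' and 'num_step_returns' drive the dispatch
gae_spec = {'lam': 0.95, 'num_step_returns': None}    # -> self.calc_gae_advs_v_targets
nstep_spec = {'lam': None, 'num_step_returns': 5}     # -> self.calc_nstep_advs_v_targets
mc_spec = {'lam': None, 'num_step_returns': None}     # -> self.calc_ret_advs_v_targets
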
Example 4
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
             entropy_coef_spec=None,
             minibatch_size=4,
             val_loss_coef=1.0,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # theoretically, PPO does not need a separate exploration variable; but in this implementation we have such an option
             'explore_var_spec',
             'gamma',
             'lam',
             'clip_eps_spec',
             'entropy_coef_spec',
             'val_loss_coef',
             'minibatch_size',
             'training_frequency',  # horizon: time steps collected before each training round
             'training_epoch',
         ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(
         self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     # extra variable decays for PPO
     self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
     self.body.clip_eps = self.clip_eps_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(
             self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
     # PPO uses GAE
     self.calc_advs_v_targets = self.calc_gae_advs_v_targets
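
Taken together, the keys read from self.algorithm_spec in this PPO variant imply a spec shaped roughly like the dict below. Every value is a placeholder chosen for illustration, and the *_spec sub-dicts assume the same scheduler fields (start_val, end_val, start_step, end_step) used in the sketch above; none of these numbers come from the source.

algorithm_spec = {  # illustrative only
    'action_pdtype': 'default',
    'action_policy': 'default',
    'explore_var_spec': None,
    'gamma': 0.99,
    'lam': 0.95,
    'clip_eps_spec': {'start_val': 0.2, 'end_val': 0.05, 'start_step': 0, 'end_step': 100000},
    'entropy_coef_spec': {'start_val': 0.01, 'end_val': 0.001, 'start_step': 0, 'end_step': 100000},
    'val_loss_coef': 0.5,
    'minibatch_size': 64,
    'training_frequency': 2048,  # horizon: steps collected before each update
    'training_epoch': 10,
}
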