def init_algorithm_params(self):
    '''Set algorithm hyperparameters from the spec, applying defaults first so spec values override them.'''
    # defaults are written first; the spec pass below overrides any key it provides
    defaults = dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
        entropy_coef_spec=None,
        policy_loss_coef=1.0,
    )
    util.set_attr(self, defaults)
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',  # theoretically, REINFORCE does not have policy update; but in this implementation we have such option
        'explore_var_spec',
        'gamma',  # the discount factor
        'entropy_coef_spec',
        'policy_loss_coef',
        'training_frequency',
    ])
    self.to_train = 0
    # resolve the configured policy name into the callable it names on policy_util
    self.action_policy = getattr(policy_util, self.action_policy)
    # the exploration variable decays over time according to its scheduler spec
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    # entropy regularization is optional; only build its scheduler when a spec is given
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val
def init_algorithm_params(self):
    '''Set algorithm hyperparameters from the spec, applying defaults first so spec values override them.'''
    # defaults first; the spec pass below overrides any key it provides
    util.set_attr(self, {
        'action_pdtype': 'default',
        'action_policy': 'default',
        'explore_var_spec': None,
    })
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # explore_var is epsilon, tau, etc. depending on the action policy;
        # it controls the trade-off between exploration and exploitation
        'explore_var_spec',
        'gamma',  # the discount factor
        'training_frequency',  # train once every training_frequency time steps (batch training)
    ])
    self.to_train = 0
    # resolve the configured policy name into the callable it names on policy_util
    self.action_policy = getattr(policy_util, self.action_policy)
    # the exploration variable decays over time according to its scheduler spec
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
def init_algorithm_params(self):
    '''Set algorithm hyperparameters from the spec, applying defaults first so spec values override them.'''
    # defaults first; the spec pass below overrides any key it provides
    defaults = dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
        entropy_coef_spec=None,
        policy_loss_coef=1.0,
        val_loss_coef=1.0,
    )
    util.set_attr(self, defaults)
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',  # theoretically, AC does not have policy update; but in this implementation we have such option
        'explore_var_spec',
        'gamma',  # the discount factor
        'lam',
        'num_step_returns',
        'entropy_coef_spec',
        'policy_loss_coef',
        'val_loss_coef',
        'training_frequency',
    ])
    self.to_train = 0
    # resolve the configured policy name into the callable it names on policy_util
    self.action_policy = getattr(policy_util, self.action_policy)
    # the exploration variable decays over time according to its scheduler spec
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    # entropy regularization is optional; only build its scheduler when a spec is given
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val
    # choose the advantage/v-target estimator, in priority order: GAE, then n-step returns, then plain returns
    if self.lam is not None:
        self.calc_advs_v_targets = self.calc_gae_advs_v_targets
    elif self.num_step_returns is not None:
        self.calc_advs_v_targets = self.calc_nstep_advs_v_targets
    else:
        self.calc_advs_v_targets = self.calc_ret_advs_v_targets
def init_algorithm_params(self):
    '''Set algorithm hyperparameters from the spec, applying defaults first so spec values override them.'''
    # defaults first; the spec pass below overrides any key it provides
    defaults = dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
        entropy_coef_spec=None,
        minibatch_size=4,
        val_loss_coef=1.0,
    )
    util.set_attr(self, defaults)
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',  # theoretically, PPO does not have policy update; but in this implementation we have such option
        'explore_var_spec',
        'gamma',
        'lam',
        'clip_eps_spec',
        'entropy_coef_spec',
        'val_loss_coef',
        'minibatch_size',
        'training_frequency',  # horizon
        'training_epoch',
    ])
    self.to_train = 0
    # resolve the configured policy name into the callable it names on policy_util
    self.action_policy = getattr(policy_util, self.action_policy)
    # the exploration variable decays over time according to its scheduler spec
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    # PPO-specific decayed variable: the surrogate-objective clipping epsilon
    self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
    self.body.clip_eps = self.clip_eps_scheduler.start_val
    # entropy regularization is optional; only build its scheduler when a spec is given
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val
    # PPO always estimates advantages with GAE
    self.calc_advs_v_targets = self.calc_gae_advs_v_targets