def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
        entropy_coef_spec=None,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, REINFORCE does not have policy update; but in this implementation we have such option
        'explore_var_spec',
        'gamma',  # the discount factor
        'entropy_coef_spec',
        'training_frequency',
        'normalize_state',
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val

def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        center_return=False,
        explore_var_spec=None,
        entropy_coef_spec=None,
        policy_loss_coef=1.0,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        'center_return',  # center by the mean
        'explore_var_spec',
        'gamma',  # the discount factor
        'entropy_coef_spec',
        'policy_loss_coef',
        'training_frequency',
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val

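# Hypothetical algorithm_spec fragment for the REINFORCE variant above, showing the
# keys that util.set_attr copies onto the algorithm. The key names come from the list
# in the code; the values and the nested entropy_coef_spec schedule shape are
# illustrative assumptions, not taken from any shipped spec file.
example_reinforce_algorithm_spec = {
    'action_pdtype': 'default',
    'action_policy': 'default',
    'center_return': True,  # center returns by the mean before the policy loss
    'explore_var_spec': None,
    'gamma': 0.99,  # the discount factor
    'entropy_coef_spec': {  # assumed VarScheduler-style schedule
        'name': 'linear_decay',
        'start_val': 0.01,
        'end_val': 0.001,
        'start_step': 0,
        'end_step': 20000,
    },
    'policy_loss_coef': 1.0,
    'training_frequency': 1,
}
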
def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, PPO does not have policy update; but in this implementation we have such option
        'explore_var_spec',
        'gamma',
        'lam',
        'clip_eps_spec',
        'entropy_coef_spec',
        'val_loss_coef',
        'minibatch_size',
        'time_horizon',  # training_frequency = actor * horizon
        'training_epoch',
    ])
    self.to_train = 0
    self.training_frequency = self.time_horizon * self.body.env.num_envs
    assert self.memory_spec['name'] == 'OnPolicyBatchReplay', f'PPO only works with OnPolicyBatchReplay, but got {self.memory_spec["name"]}'
    self.action_policy = getattr(policy_util, self.action_policy)
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    # extra variable decays for PPO
    self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
    self.body.clip_eps = self.clip_eps_scheduler.start_val
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val
    # PPO uses GAE
    self.calc_advs_v_targets = self.calc_gae_advs_v_targets

def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
        entropy_coef_spec=None,
        minibatch_size=4,
        val_loss_coef=1.0,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, PPO does not have policy update; but in this implementation we have such option
        'explore_var_spec',
        'gamma',
        'lam',
        'clip_eps_spec',
        'entropy_coef_spec',
        'val_loss_coef',
        'minibatch_size',
        'time_horizon',  # training_frequency = actor * horizon
        'training_epoch',
    ])
    self.to_train = 0
    # guard: minibatch_size and time_horizon must be multiples of num_envs
    num_envs = self.body.env.num_envs
    if self.minibatch_size % num_envs != 0 or self.time_horizon % num_envs != 0:
        self.minibatch_size = math.ceil(self.minibatch_size / num_envs) * num_envs
        self.time_horizon = math.ceil(self.time_horizon / num_envs) * num_envs
        logger.info(f'minibatch_size and time_horizon need to be multiples of num_envs; autocorrected values: minibatch_size: {self.minibatch_size}, time_horizon: {self.time_horizon}')
    self.training_frequency = self.time_horizon  # since all memories store num_envs-sized batches in a list
    assert self.memory_spec['name'] == 'OnPolicyBatchReplay', f'PPO only works with OnPolicyBatchReplay, but got {self.memory_spec["name"]}'
    self.action_policy = getattr(policy_util, self.action_policy)
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    # extra variable decays for PPO
    self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
    self.body.clip_eps = self.clip_eps_scheduler.start_val
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val
    # PPO uses GAE
    self.calc_advs_v_targets = self.calc_gae_advs_v_targets

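# Worked example of the num_envs guard above, with hypothetical values: minibatch_size=4
# is not a multiple of num_envs=8, so both values are rounded up to the next multiple of
# num_envs; time_horizon=32 is already a multiple and is unchanged.
import math

num_envs, minibatch_size, time_horizon = 8, 4, 32  # illustrative values only
assert math.ceil(minibatch_size / num_envs) * num_envs == 8
assert math.ceil(time_horizon / num_envs) * num_envs == 32
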
def init_algorithm_params(self):
    '''Initialize other algorithm parameters.'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # explore_var is epsilon, tau, etc., depending on the action policy
        # these control the trade-off between exploration and exploitation
        'explore_var_spec',
        'gamma',  # the discount factor
        'training_frequency',  # how often to train for batch training (once every training_frequency time steps)
        'normalize_state',
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val

def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
        entropy_coef_spec=None,
        policy_loss_coef=1.0,
        val_loss_coef=1.0,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, AC does not have policy update; but in this implementation we have such option
        'explore_var_spec',
        'gamma',  # the discount factor
        'lam',
        'num_step_returns',
        'entropy_coef_spec',
        'policy_loss_coef',
        'val_loss_coef',
        'training_frequency',
        'training_epoch',
        'normalize_state',
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val
    # Select appropriate methods to calculate adv_targets and v_targets for training
    if self.lam is not None:
        self.calc_advs_v_targets = self.calc_gae_advs_v_targets
    elif self.num_step_returns is not None:
        self.calc_advs_v_targets = self.calc_nstep_advs_v_targets
    else:
        self.calc_advs_v_targets = self.calc_td_advs_v_targets

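# The ActorCritic variant above selects its advantage/v-target estimator from the spec:
# a non-None 'lam' picks GAE, otherwise a non-None 'num_step_returns' picks n-step
# returns, otherwise it falls back to TD. The numeric values below are illustrative only.
example_ac_estimator_choices = [
    {'lam': 0.95, 'num_step_returns': None},  # -> calc_gae_advs_v_targets
    {'lam': None, 'num_step_returns': 5},     # -> calc_nstep_advs_v_targets
    {'lam': None, 'num_step_returns': None},  # -> calc_td_advs_v_targets
]
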
def init_algorithm_params(self):
    '''Initialize other algorithm parameters'''
    # set default
    util.set_attr(self, dict(
        action_pdtype='default',
        action_policy='default',
        explore_var_spec=None,
        entropy_coef_spec=None,
        val_loss_coef=1.0,
    ))
    util.set_attr(self, self.algorithm_spec, [
        'action_pdtype',
        'action_policy',
        # theoretically, PPO does not have policy update; but in this implementation we have such option
        'explore_var_spec',
        'gamma',
        'lam',
        'clip_eps_spec',
        'entropy_coef_spec',
        'val_loss_coef',
        'training_frequency',  # horizon
        'training_epoch',
        'normalize_state',
    ])
    self.to_train = 0
    self.action_policy = getattr(policy_util, self.action_policy)
    self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
    self.body.explore_var = self.explore_var_scheduler.start_val
    # extra variable decays for PPO
    self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
    self.body.clip_eps = self.clip_eps_scheduler.start_val
    if self.entropy_coef_spec is not None:
        self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
        self.body.entropy_coef = self.entropy_coef_scheduler.start_val
    # PPO uses GAE
    self.calc_advs_v_targets = self.calc_gae_advs_v_targets

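# Minimal sketch of the util.set_attr contract assumed throughout the snippets above:
# called with a plain dict it sets defaults; called with a spec dict plus a key list it
# copies only the listed keys that the spec actually provides. This is an illustrative
# stand-in under that assumption, not the library's actual implementation.
def set_attr(obj, attr_dict, keys=None):
    if keys is not None:
        # keep only the requested keys that exist in the spec
        attr_dict = {k: v for k, v in attr_dict.items() if k in keys}
    for attr, val in attr_dict.items():
        setattr(obj, attr, val)
    return obj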