Example #1
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
             entropy_coef_spec=None,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # theoretically, REINFORCE does not need an exploration-variable update, but this implementation provides the option
             'explore_var_spec',
             'gamma',  # the discount factor
             'entropy_coef_spec',
             'training_frequency',
             'normalize_state',
         ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(
         self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(
             self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
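All of these examples follow the same two-step util.set_attr pattern: first apply a dict of defaults, then overwrite from self.algorithm_spec restricted to a whitelist of keys. Below is a minimal sketch of such a helper, assuming it only filters the source dict by the optional key list and calls setattr; the actual SLM-Lab utility may differ in details.

def set_attr(obj, attr_dict, keys=None):
    '''Set attributes on obj from attr_dict, optionally restricted to a list of keys.'''
    if keys is not None:
        # keep only whitelisted keys; keys absent from attr_dict are silently skipped
        attr_dict = {k: v for k, v in attr_dict.items() if k in keys}
    for attr, val in attr_dict.items():
        setattr(obj, attr, val)
    return obj

Under this reading, a key listed in the whitelist but missing from algorithm_spec simply keeps its default from the first call, which is why the defaults dict is applied first.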
Example #2
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(self, dict(
         action_pdtype='default',
         action_policy='default',
         center_return=False,
         explore_var_spec=None,
         entropy_coef_spec=None,
         policy_loss_coef=1.0,
     ))
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         'center_return',  # center the returns by their mean
         'explore_var_spec',
         'gamma',  # the discount factor
         'entropy_coef_spec',
         'policy_loss_coef',
         'training_frequency',
     ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
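For context, the keys read from self.algorithm_spec above come from the experiment spec. A hypothetical fragment matching Example #2 could look like the following; the exact schema of explore_var_spec and entropy_coef_spec (schedule name, start/end values and steps) is an assumption based on how VarScheduler is used, not a verbatim SLM-Lab spec.

# hypothetical algorithm_spec fragment for the REINFORCE variant in Example #2
algorithm_spec = {
    'name': 'Reinforce',
    'action_pdtype': 'default',
    'action_policy': 'default',
    'center_return': True,      # center returns by their mean
    'explore_var_spec': None,   # no exploration-variable schedule
    'gamma': 0.99,              # discount factor
    'entropy_coef_spec': {      # assumed linear-decay schedule for the entropy bonus
        'name': 'linear_decay',
        'start_val': 0.01,
        'end_val': 0.001,
        'start_step': 0,
        'end_step': 20000,
    },
    'policy_loss_coef': 1.0,
    'training_frequency': 1,
}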
Example #3
 def init_algorithm_params(self):
     util.set_attr(self, self.algorithm_spec, [
         'action_pdtype',
         'action_policy',
         # theoretically, PPO does not need an exploration-variable update, but this implementation provides the option
         'explore_var_spec',
         'gamma',
         'lam',
         'clip_eps_spec',
         'entropy_coef_spec',
         'val_loss_coef',
         'minibatch_size',
         'time_horizon',  # training_frequency = num_envs (actors) * time_horizon
         'training_epoch',
     ])
     self.to_train = 0
     self.training_frequency = self.time_horizon * self.body.env.num_envs
     assert self.memory_spec['name'] == 'OnPolicyBatchReplay', f'PPO only works with OnPolicyBatchReplay, but got {self.memory_spec["name"]}'
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     # extra variable decays for PPO
     self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
     self.body.clip_eps = self.clip_eps_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
     # PPO uses GAE
     self.calc_advs_v_targets = self.calc_gae_advs_v_targets
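Every variant wraps its *_spec dicts in policy_util.VarScheduler and reads .start_val. Below is a rough sketch of that interface, assuming a linear-decay spec like the hypothetical one above; the real class supports more schedule types and handles None specs differently.

class VarScheduler:
    '''Minimal linear-decay scheduler sketch: exposes start_val and an update(step) method.'''

    def __init__(self, var_spec):
        # treating a missing spec as a constant value is a simplification for this sketch
        var_spec = var_spec or {'start_val': 1.0}
        self.start_val = var_spec['start_val']
        self.end_val = var_spec.get('end_val', self.start_val)
        self.start_step = var_spec.get('start_step', 0)
        self.end_step = var_spec.get('end_step', 0)

    def update(self, step):
        '''Return the linearly interpolated value at the given time step.'''
        if self.end_step <= self.start_step or step <= self.start_step:
            return self.start_val
        if step >= self.end_step:
            return self.end_val
        frac = (step - self.start_step) / (self.end_step - self.start_step)
        return self.start_val + frac * (self.end_val - self.start_val)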
Example #4
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
             entropy_coef_spec=None,
             minibatch_size=4,
             val_loss_coef=1.0,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # theoretically, PPO does not need an exploration-variable update, but this implementation provides the option
             'explore_var_spec',
             'gamma',
             'lam',
             'clip_eps_spec',
             'entropy_coef_spec',
             'val_loss_coef',
             'minibatch_size',
             'time_horizon',  # training_frequency = actor * horizon
             'training_epoch',
         ])
     self.to_train = 0
     # guard
     num_envs = self.body.env.num_envs
     if self.minibatch_size % num_envs != 0 or self.time_horizon % num_envs != 0:
         self.minibatch_size = math.ceil(
             self.minibatch_size / num_envs) * num_envs
         self.time_horizon = math.ceil(
             self.time_horizon / num_envs) * num_envs
         logger.info(
             f'minibatch_size and time_horizon need to be multiples of num_envs; autocorrected values: minibatch_size: {self.minibatch_size}, time_horizon: {self.time_horizon}'
         )
     self.training_frequency = self.time_horizon  # since the memory stores a batch of num_envs transitions per time step
     assert self.memory_spec['name'] == 'OnPolicyBatchReplay', f'PPO only works with OnPolicyBatchReplay, but got {self.memory_spec["name"]}'
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(
         self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     # extra variable decays for PPO
     self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
     self.body.clip_eps = self.clip_eps_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(
             self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
     # PPO uses GAE
     self.calc_advs_v_targets = self.calc_gae_advs_v_targets
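The guard in Example #4 rounds minibatch_size and time_horizon up to multiples of num_envs, since each stored batch spans all vectorized environments. The same arithmetic pulled out into a standalone helper (names are illustrative):

import math

def round_up_to_multiple(value, base):
    '''Round value up to the nearest multiple of base (base >= 1).'''
    return math.ceil(value / base) * base

# with 8 vectorized envs: 50 -> 56, 100 -> 104, and exact multiples are unchanged
assert round_up_to_multiple(50, 8) == 56
assert round_up_to_multiple(100, 8) == 104
assert round_up_to_multiple(64, 8) == 64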
Example #5
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters.'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # explore_var is epsilon, tau, etc., depending on the action policy
             # these control the trade-off between exploration and exploitation
             'explore_var_spec',
             'gamma',  # the discount factor
             'training_frequency',  # how often to train for batch training (once every training_frequency time steps)
             'normalize_state',
         ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(
         self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
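The line self.action_policy = getattr(policy_util, self.action_policy) turns a policy name from the spec into the corresponding function in policy_util. Below is a self-contained illustration of that string-to-function dispatch; the two toy policies are stand-ins, not the SLM-Lab implementations.

import random
from types import SimpleNamespace

def default(q_values):
    '''Greedy stand-in: index of the largest Q-value.'''
    return max(range(len(q_values)), key=q_values.__getitem__)

def epsilon_greedy(q_values, epsilon=0.1):
    '''Epsilon-greedy stand-in: random action with probability epsilon.'''
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    return default(q_values)

# stand-in for the policy_util module
policy_util = SimpleNamespace(default=default, epsilon_greedy=epsilon_greedy)

action_policy = getattr(policy_util, 'epsilon_greedy')  # same dispatch as in the examples
print(action_policy([0.1, 0.5, 0.2]))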
Example #6
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
             entropy_coef_spec=None,
             policy_loss_coef=1.0,
             val_loss_coef=1.0,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # theoretically, AC does not need an exploration-variable update, but this implementation provides the option
             'explore_var_spec',
             'gamma',  # the discount factor
             'lam',
             'num_step_returns',
             'entropy_coef_spec',
             'policy_loss_coef',
             'val_loss_coef',
             'training_frequency',
             'training_epoch',
             'normalize_state',
         ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(
         self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(
             self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
     # Select appropriate methods to calculate adv_targets and v_targets for training
     if self.lam is not None:
         self.calc_advs_v_targets = self.calc_gae_advs_v_targets
     elif self.num_step_returns is not None:
         self.calc_advs_v_targets = self.calc_nstep_advs_v_targets
     else:
         self.calc_advs_v_targets = self.calc_td_advs_v_targets
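Example #6 picks the advantage estimator by which spec key is present: GAE when lam is set, n-step returns when num_step_returns is set, plain TD otherwise. For reference, here is a minimal sketch of standard GAE, which is what calc_gae_advs_v_targets is named after (not the SLM-Lab implementation itself).

def calc_gae(rewards, v_preds, next_v_pred, dones, gamma=0.99, lam=0.95):
    '''Generalized Advantage Estimation over one trajectory of plain Python floats.
    v_preds[t] is V(s_t); next_v_pred bootstraps V after the final step.'''
    advs = [0.0] * len(rewards)
    future_adv = 0.0
    next_v = next_v_pred
    for t in reversed(range(len(rewards))):
        not_done = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_v * not_done - v_preds[t]
        future_adv = delta + gamma * lam * not_done * future_adv
        advs[t] = future_adv
        next_v = v_preds[t]
    v_targets = [adv + v for adv, v in zip(advs, v_preds)]
    return advs, v_targets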
Example #7
 def init_algorithm_params(self):
     '''Initialize other algorithm parameters'''
     # set default
     util.set_attr(
         self,
         dict(
             action_pdtype='default',
             action_policy='default',
             explore_var_spec=None,
             entropy_coef_spec=None,
             val_loss_coef=1.0,
         ))
     util.set_attr(
         self,
         self.algorithm_spec,
         [
             'action_pdtype',
             'action_policy',
             # theoretically, PPO does not need an exploration-variable update, but this implementation provides the option
             'explore_var_spec',
             'gamma',
             'lam',
             'clip_eps_spec',
             'entropy_coef_spec',
             'val_loss_coef',
             'training_frequency',  # horizon
             'training_epoch',
             'normalize_state',
         ])
     self.to_train = 0
     self.action_policy = getattr(policy_util, self.action_policy)
     self.explore_var_scheduler = policy_util.VarScheduler(
         self.explore_var_spec)
     self.body.explore_var = self.explore_var_scheduler.start_val
     # extra variable decays for PPO
     self.clip_eps_scheduler = policy_util.VarScheduler(self.clip_eps_spec)
     self.body.clip_eps = self.clip_eps_scheduler.start_val
     if self.entropy_coef_spec is not None:
         self.entropy_coef_scheduler = policy_util.VarScheduler(
             self.entropy_coef_spec)
         self.body.entropy_coef = self.entropy_coef_scheduler.start_val
     # PPO uses GAE
     self.calc_advs_v_targets = self.calc_gae_advs_v_targets
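The PPO-specific attributes set above (clip_eps, entropy_coef, val_loss_coef) later weight the pieces of the clipped surrogate loss. Below is a hedged PyTorch sketch of that loss, shown only to connect the coefficients to their role; it is the textbook PPO objective, not the SLM-Lab training code.

import torch

def ppo_loss(log_probs, old_log_probs, advs, v_preds, v_targets, entropies,
             clip_eps=0.2, val_loss_coef=1.0, entropy_coef=0.01):
    '''Clipped-surrogate PPO loss; every argument is a 1-D tensor over a minibatch.'''
    ratios = torch.exp(log_probs - old_log_probs)
    surr1 = ratios * advs
    surr2 = torch.clamp(ratios, 1.0 - clip_eps, 1.0 + clip_eps) * advs
    policy_loss = -torch.min(surr1, surr2).mean()                   # clipped policy objective
    val_loss = val_loss_coef * (v_preds - v_targets).pow(2).mean()  # critic regression term
    entropy_bonus = entropy_coef * entropies.mean()                 # encourages exploration
    return policy_loss + val_loss - entropy_bonus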