def _end_epoch(self):
    # TODO: change IterationData to reflect new stuff better
    del self.prev
    self.prev = copy.deepcopy(self.cur)
    for m in range(self.M):
        self.prev[m].new_traj_distr = self.new_traj_distr[m]

    # NEW IterationData object, and remove new_traj_distr
    self.cur = [IterationData() for _ in range(self.M)]
    for m in range(self.M):
        self.cur[m].traj_info = TrajectoryInfo()
        self.cur[m].traj_info.dynamics = \
            copy.deepcopy(self.prev[m].traj_info.dynamics)
        self.cur[m].step_mult = self.prev[m].step_mult
        self.cur[m].eta = self.prev[m].eta
        self.cur[m].traj_distr = self.new_traj_distr[m]
        self.cur[m].traj_info.last_kl_step = \
            self.prev[m].traj_info.last_kl_step
        # MDGPS
        self.cur[m].pol_info = copy.deepcopy(self.prev[m].pol_info)

    self.new_traj_distr = None

    RLAlgorithm._end_epoch(self)
def _handle_step(
        self,
        observation,
        action,
        reward,
        next_observation,
        terminal,
        agent_info,
        env_info,
):
    """
    Implement anything that needs to happen after every step.
    :return:
    """
    # Add to replay buffer
    self.replay_buffer.add_sample(
        observation=observation,
        action=action,
        reward=reward,
        terminal=terminal,
        next_observation=next_observation,
        agent_info=agent_info,
        env_info=env_info,
    )

    RLAlgorithm._handle_step(
        self,
        observation=observation,
        action=action,
        reward=reward,
        next_observation=next_observation,
        terminal=terminal,
        agent_info=agent_info,
        env_info=env_info,
    )
def evaluate(self, epoch):
    self._update_logging_data()
    RLAlgorithm.evaluate(self, epoch)

    # Reset log_data
    for key in self.log_data.keys():
        self.log_data[key].fill(0)
def _end_rollout(self):
    """
    Implement anything that needs to happen after every rollout.
    """
    self.replay_buffer.terminate_episode()
    RLAlgorithm._end_rollout(self)
def get_epoch_snapshot(self, epoch):
    """
    Data to save in the snapshot file for this epoch.

    Args:
        epoch: Epoch number.

    Returns:
        Snapshot dictionary.
    """
    snapshot = RLAlgorithm.get_epoch_snapshot(self, epoch)

    snapshot.update(
        policy=self.eval_policy,
        trained_policy=self.policy,
        target_policy=self._target_policy,
        exploration_policy=self.explo_policy,
        qf=self._qf,
        target_qf=self._target_qf,
    )

    # Replay Buffer
    if self.save_replay_buffer:
        snapshot.update(
            replay_buffer=self.replay_buffer,
        )

    return snapshot
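# Usage sketch (assumption, not part of the original class): the snapshot is a
# plain dict of torch modules and buffers, so it can be persisted and restored
# with torch.save / torch.load. The helper name, `algo`, and the file path are
# placeholders for illustration; the project's logger may handle saving itself.
def _example_save_and_restore_snapshot(algo, epoch, path):
    snapshot = algo.get_epoch_snapshot(epoch)
    torch.save(snapshot, path)
    restored = torch.load(path, map_location='cpu')
    return restored['policy']  # e.g. the evaluation policy for later rollouts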
def _handle_step(
        self,
        observation,
        action,
        reward,
        next_observation,
        terminal,
        agent_info,
        env_info,
):
    """
    Implement anything that needs to happen after every step.
    :return:
    """
    # Add to replay buffer
    self.replay_buffer.add_sample(
        observation=observation,
        action=action,
        reward=reward,
        terminal=terminal,
        next_observation=next_observation,
        agent_info=agent_info,
        env_info=env_info,
    )

    # Update observation normalizer (if applicable)
    if self._obs_normalizer is not None:
        self._obs_normalizer.update(np.array([observation]))

    RLAlgorithm._handle_step(
        self,
        observation=observation,
        action=action,
        reward=reward,
        next_observation=next_observation,
        terminal=terminal,
        agent_info=agent_info,
        env_info=env_info,
    )
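# Minimal sketch (assumption) of what an observation normalizer such as
# `RunningNormalizer` typically does with the batches passed to update()
# above: accumulate a running mean/variance (Welford's algorithm) and
# standardize observations with them. The real class in this repo may differ.
class _RunningNormalizerSketch(object):
    def __init__(self, shape, eps=1e-8):
        self._count = eps
        self._mean = np.zeros(shape)
        self._m2 = np.zeros(shape)
        self._eps = eps

    def update(self, data):
        # data: array of shape (batch, *shape), e.g. np.array([observation])
        for x in np.asarray(data):
            self._count += 1
            delta = x - self._mean
            self._mean += delta / self._count
            self._m2 += delta * (x - self._mean)

    def normalize(self, x):
        var = self._m2 / max(self._count - 1, 1)
        return (x - self._mean) / np.sqrt(var + self._eps)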
def get_epoch_snapshot(self, epoch):
    """
    Data to save in the snapshot file for this epoch.

    Args:
        epoch: Epoch number.

    Returns:
        Snapshot dictionary.
    """
    if self._epoch_plotter is not None:
        self._epoch_plotter.draw()
        self._epoch_plotter.save_figure(epoch)

    snapshot = RLAlgorithm.get_epoch_snapshot(self, epoch)

    snapshot.update(
        policy=self._policy,
        qf=self._i_qf1,
        qf2=self._i_qf2,
        target_qf=self._i_target_qf1,
        vf=self._i_vf,
        target_vf=self._i_target_vf,
        u_qf=self._u_qf1,
        u_qf2=self._u_qf2,
        u_vf=self._u_vf,
        target_u_qf1=self._u_target_qf1,
        target_u_qf2=self._u_target_qf2,
        target_u_vf=self._u_target_vf,
    )

    if self.explo_env.online_normalization or self.explo_env.normalize_obs:
        snapshot.update(
            obs_mean=self.explo_env.obs_mean,
            obs_var=self.explo_env.obs_var,
        )

    # Observation Normalizer
    snapshot.update(obs_normalizer=self._obs_normalizer)

    # Replay Buffer
    if self.save_replay_buffer:
        snapshot.update(replay_buffer=self.replay_buffer)

    return snapshot
def __init__(
        self,
        explo_env,
        policy,
        qf,
        replay_buffer,
        batch_size=1024,
        normalize_obs=False,
        eval_env=None,
        vf=None,
        qf2=None,
        action_prior='uniform',
        entropy_scale=1.,
        auto_alpha=True,
        tgt_entro=None,
        policy_lr=3e-4,
        qf_lr=3e-4,
        policy_mean_regu_weight=1e-3,
        policy_std_regu_weight=1e-3,
        policy_pre_activation_weight=0.,
        policy_weight_decay=0.,
        q_weight_decay=0.,
        optimizer='adam',
        # optimizer='rmsprop',
        # optimizer='sgd',
        optimizer_kwargs=None,
        soft_target_tau=5e-3,
        target_update_interval=1,
        reward_scale=1.,
        save_replay_buffer=False,
        eval_deterministic=True,
        log_tensorboard=False,
        **kwargs
):
    # ###### #
    # Models #
    # ###### #

    # Exploration Policy
    self._policy = policy

    # Evaluation Policy
    if eval_deterministic:
        eval_policy = MakeDeterministic(self._policy)
    else:
        eval_policy = self._policy

    # Observation Normalizer
    if normalize_obs:
        self._obs_normalizer = RunningNormalizer(shape=explo_env.obs_dim)
    else:
        self._obs_normalizer = None

    RLAlgorithm.__init__(
        self,
        explo_env=explo_env,
        explo_policy=self._policy,
        eval_env=eval_env,
        eval_policy=eval_policy,
        obs_normalizer=self._obs_normalizer,
        **kwargs
    )

    # Q-function(s) and V-function
    self._qf = qf
    self._qf2 = qf2
    if vf is None:
        self._vf = None
        self._target_vf = None
        self._target_qf1 = qf.copy()
        self._target_qf2 = None if qf2 is None else qf2.copy()
    else:
        self._vf = vf
        self._target_vf = vf.copy()
        self._target_qf1 = None
        self._target_qf2 = None

    # Replay Buffer
    self.replay_buffer = replay_buffer
    self.batch_size = batch_size
    self.save_replay_buffer = save_replay_buffer

    # Soft-update rate for target V-function
    self._soft_target_tau = soft_target_tau
    self._target_update_interval = target_update_interval

    # Important algorithm hyperparameters
    self._action_prior = action_prior
    self._entropy_scale = entropy_scale

    # Desired Alpha
    self._auto_alpha = auto_alpha
    if tgt_entro is None:
        tgt_entro = -explo_env.action_dim
    self._tgt_entro = torch.tensor([float(tgt_entro)], device=ptu.device)
    self._log_alpha = torch.zeros(1, device=ptu.device, requires_grad=True)

    # Reward Scale
    self.reward_scale = reward_scale

    # ########## #
    # Optimizers #
    # ########## #
    if optimizer.lower() == 'adam':
        optimizer_class = optim.Adam
        if optimizer_kwargs is None:
            optimizer_kwargs = dict(
                amsgrad=True,
                # amsgrad=False,
            )
    elif optimizer.lower() == 'rmsprop':
        optimizer_class = optim.RMSprop
        if optimizer_kwargs is None:
            optimizer_kwargs = dict()
    else:
        raise ValueError('Wrong optimizer')

    self.qf_lr = qf_lr
    self.policy_lr = policy_lr

    # Q-function(s) optimizer(s)
    self._qf1_optimizer = optimizer_class(
        self._qf.parameters(),
        lr=qf_lr,
        weight_decay=q_weight_decay,
        **optimizer_kwargs
    )
    values_parameters = self._qf.parameters()
    if self._qf2 is None:
        self._qf2_optimizer = None
    else:
        self._qf2_optimizer = optimizer_class(
            self._qf2.parameters(),
            lr=qf_lr,
            weight_decay=q_weight_decay,
            **optimizer_kwargs
        )
        values_parameters = chain(values_parameters, self._qf2.parameters())

    # V-function optimizer
    if self._vf is None:
        self._vf_optimizer = None
    else:
        self._vf_optimizer = optimizer_class(
            self._vf.parameters(),
            lr=qf_lr,
            weight_decay=q_weight_decay,
            **optimizer_kwargs
        )
        values_parameters = chain(values_parameters, self._vf.parameters())

    self._values_optimizer = optimizer_class(
        values_parameters,
        lr=qf_lr,
        weight_decay=q_weight_decay,
        **optimizer_kwargs
    )

    # Policy optimizer
    self._policy_optimizer = optimizer_class(
        self._policy.parameters(),
        lr=policy_lr,
        weight_decay=policy_weight_decay,
        **optimizer_kwargs
    )

    # Alpha optimizer
    self._alpha_optimizer = optimizer_class(
        [self._log_alpha],
        lr=policy_lr,
        **optimizer_kwargs
    )
    # Weights for policy regularization coefficients
    self.pol_mean_regu_weight = policy_mean_regu_weight
    self.pol_std_regu_weight = policy_std_regu_weight
    self.pol_pre_activation_weight = policy_pre_activation_weight

    # Useful Variables for logging
    self.log_data = dict()
    self.log_data['Pol KL Loss'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Qf Loss'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Qf2 Loss'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Vf Loss'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Rewards'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Pol Entropy'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Pol Log Std'] = np.zeros((
        self.num_train_steps_per_epoch,
        self.explo_env.action_dim,
    ))
    self.log_data['Policy Mean'] = np.zeros((
        self.num_train_steps_per_epoch,
        self.explo_env.action_dim,
    ))
    self.log_data['Alphas'] = np.zeros(self.num_train_steps_per_epoch)

    # Tensorboard-like Logging
    self._log_tensorboard = log_tensorboard
    if log_tensorboard:
        self._summary_writer = \
            tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
    else:
        self._summary_writer = None
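# Sketch (assumption) of how `_soft_target_tau` is normally used: a Polyak
# soft update of the target networks, applied every `_target_update_interval`
# training steps. The helper name is hypothetical; the actual update lives in
# the training step of this class.
def _example_soft_update(source_net, target_net, tau):
    for param, target_param in zip(source_net.parameters(),
                                   target_net.parameters()):
        target_param.data.copy_(
            tau * param.data + (1.0 - tau) * target_param.data
        )

# With `auto_alpha`, the entropy temperature is typically trained with the
# standard SAC rule (an assumption about this implementation):
#   alpha_loss = -(log_alpha * (log_pi + tgt_entro).detach()).mean()
# and stepped with `_alpha_optimizer`.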
def __init__(
        self,
        env,
        policy,
        explo_policy,
        u_qf,
        replay_buffer,
        batch_size=1024,
        normalize_obs=False,
        eval_env=None,
        i_qf=None,
        action_prior='uniform',
        policy_lr=3e-4,
        qf_lr=1e-4,
        i_policy_pre_activation_weight=0.,
        i_policy_mixing_coeff_weight=1e-3,
        u_policy_pre_activation_weight=None,
        policy_weight_decay=0.,
        qf_weight_decay=0.,
        optimizer='adam',
        # optimizer='rmsprop',
        # optimizer='sgd',
        optimizer_kwargs=None,
        i_soft_target_tau=1e-2,
        u_soft_target_tau=1e-2,
        i_target_update_interval=1,
        u_target_update_interval=1,
        reward_scale=1.,
        u_reward_scales=None,
        min_q_value=-np.inf,
        max_q_value=np.inf,
        residual_gradient_weight=0,
        eval_with_target_policy=False,
        save_replay_buffer=False,
        log_tensorboard=False,
        **kwargs
):
    # ###### #
    # Models #
    # ###### #

    # Deterministic Policies
    self._policy = policy
    self._target_policy = policy.copy()

    # Exploration Policy
    self._exploration_policy = explo_policy

    # Evaluation Policy
    if eval_with_target_policy:
        eval_policy = self._target_policy
    else:
        eval_policy = self._policy

    # Observation Normalizer
    if normalize_obs:
        self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
    else:
        self._obs_normalizer = None

    RLAlgorithm.__init__(
        self,
        explo_env=env,
        explo_policy=self._exploration_policy,
        eval_env=eval_env,
        eval_policy=eval_policy,
        obs_normalizer=self._obs_normalizer,
        **kwargs
    )

    # Number of Unintentional Tasks (Composable Tasks)
    self._n_unintentional = self._policy.n_heads

    # Evaluation Sampler (one for each unintentional task)
    self.eval_u_samplers = [
        InPlacePathSampler(
            env=env,
            policy=WeightedMultiPolicySelector(eval_policy, idx),
            total_samples=self.num_steps_per_eval,
            max_path_length=self.max_path_length,
            deterministic=None,
        )
        for idx in range(self._n_unintentional)
    ]

    # Important algorithm hyperparameters
    self._action_prior = action_prior

    # Intentional (Main Task) Q-function
    self._i_qf = i_qf
    self._i_target_qf = i_qf.copy()

    # Unintentional (Composable Tasks) Q-functions
    self._u_qf = u_qf
    self._u_target_qf = u_qf.copy()

    self._min_q_value = min_q_value
    self._max_q_value = max_q_value
    self._residual_gradient_weight = residual_gradient_weight

    # Soft-update rate for target V-functions
    self._i_soft_target_tau = i_soft_target_tau
    self._u_soft_target_tau = u_soft_target_tau
    self._i_target_update_interval = i_target_update_interval
    self._u_target_update_interval = u_target_update_interval

    # Reward Scales
    self.reward_scale = reward_scale
    if u_reward_scales is None:
        # Default to the (explicit) main-task reward scale
        u_reward_scales = [
            reward_scale for _ in range(self._n_unintentional)
        ]
    self._u_reward_scales = ptu.FloatTensor(u_reward_scales)

    # Replay Buffer
    self.replay_buffer = replay_buffer
    self.batch_size = batch_size
    self.save_replay_buffer = save_replay_buffer

    # ########## #
    # Optimizers #
    # ########## #
    if optimizer.lower() == 'adam':
        optimizer_class = optim.Adam
        if optimizer_kwargs is None:
            optimizer_kwargs = dict(
                amsgrad=True,
                # amsgrad=False,
            )
    elif optimizer.lower() == 'rmsprop':
        optimizer_class = optim.RMSprop
        if optimizer_kwargs is None:
            optimizer_kwargs = dict()
    else:
        raise ValueError('Wrong optimizer')

    self._qf_lr = qf_lr
    self._policy_lr = policy_lr

    # Q-function and V-function Optimization Criteria
    self._u_qf_criterion = nn.MSELoss()
    self._i_qf_criterion = nn.MSELoss()

    # Q-function(s) optimizer(s)
    self._u_qf_optimizer = optimizer_class(
        self._u_qf.parameters(),
        lr=qf_lr,
        weight_decay=qf_weight_decay,
        **optimizer_kwargs
    )
    self._i_qf_optimizer = optimizer_class(
        self._i_qf.parameters(),
        lr=qf_lr,
        weight_decay=qf_weight_decay,
        **optimizer_kwargs
    )

    # Policy optimizer
    self._policy_optimizer = optimizer_class(
        self._policy.parameters(),
        lr=policy_lr,
        weight_decay=policy_weight_decay,
        **optimizer_kwargs
    )

    # Policy regularization coefficients (weights)
    self._i_pol_pre_activ_weight = i_policy_pre_activation_weight
    self._i_pol_mixing_coeff_weight = i_policy_mixing_coeff_weight
    if u_policy_pre_activation_weight is None:
        u_policy_pre_activation_weight = [
            i_policy_pre_activation_weight
            for _ in range(self._n_unintentional)
        ]
    self._u_policy_pre_activ_weight = \
        ptu.FloatTensor(u_policy_pre_activation_weight)

    # Useful Variables for logging
    self.log_data = dict()
    self.log_data['Raw Pol Loss'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Pol Loss'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Qf Loss'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Rewards'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Policy Action'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
        self.explo_env.action_dim,
    ))
    self.log_data['Mixing Weights'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional,
        self.explo_env.action_dim,
    ))

    # Tensorboard-like Logging
    self._log_tensorboard = log_tensorboard
    if log_tensorboard:
        self._summary_writer = \
            tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
    else:
        self._summary_writer = None
def __init__(self,
             env,
             local_policies,
             global_policy,
             cost_fcn,
             eval_env=None,
             train_cond_idxs=None,
             test_cond_idxs=None,
             num_samples=1,
             test_samples=1,
             noisy_samples=True,
             noise_hyperparams=None,
             seed=10,
             base_kl_step=0.1,
             global_opt_iters=5000,
             global_opt_batch_size=64,
             global_opt_lr=1e-5,
             traj_opt_prev='nn_pol',
             traj_opt_iters=1,
             traj_opt_min_eta=1e-8,
             traj_opt_max_eta=1e16,
             **kwargs):
    # TO DEFINE
    self._fit_dynamics = True
    self._initial_state_var = 1.0e-2

    self._global_opt_batch_size = global_opt_batch_size
    self._global_opt_iters = global_opt_iters
    self._global_opt_ent_reg = 0.0  # For updating pol variance
    self._global_pol_sample_mode = 'add'
    self._global_opt_lr = global_opt_lr
    self._global_samples_counter = 0
    self._first_global_eval = False

    self.base_kl_step = base_kl_step
    self._max_step_mult = 3.0
    self._min_step_mult = 0.5
    self._kl_step_rule = 'laplace'

    self._traj_opt_iters = traj_opt_iters
    self._max_ent_traj = 0.0
    self._traj_opt_prev = traj_opt_prev

    self.T = kwargs['max_path_length']
    self._num_samples = num_samples
    self._test_samples = test_samples

    self._train_cond_idxs = train_cond_idxs
    self._test_cond_idxs = test_cond_idxs

    # Get dimensions from the environment
    self.dU = env.action_dim
    self.dX = env.obs_dim  # TODO: DOING THIS TEMPORARILY
    self.dO = env.obs_dim

    # Number of initial conditions
    self.M = len(local_policies)

    exploration_policy = global_policy

    RLAlgorithm.__init__(
        self,
        env=env,
        exploration_policy=exploration_policy,
        eval_env=eval_env,
        eval_policy=global_policy,
        eval_sampler=self.sample_global_pol,
        **kwargs
    )

    # Rename for GPS
    self.global_policy = self.eval_policy
    self.local_policies = local_policies

    # Noise to be used with trajectory distributions
    self.noise_data = np.zeros(
        (self.num_epochs, self.M, self._num_samples, self.T, self.dU))
    self._noisy_samples = noisy_samples
    if self._noisy_samples:
        for ii in range(self.num_epochs):
            for cond in range(self.M):
                for n in range(self._num_samples):
                    self.noise_data[ii, cond, n, :, :] = \
                        generate_noise(self.T, self.dU, noise_hyperparams)

    # IterationData objects for each condition
    self.cur = [IterationData() for _ in range(self.M)]
    self.prev = [IterationData() for _ in range(self.M)]

    # Trajectory Info
    for m in range(self.M):
        self.cur[m].traj_info = TrajectoryInfo()

        if self._fit_dynamics:
            sigma_regu = 1e-6
            prior = DynamicsPriorGMM(
                min_samples_per_cluster=40,
                max_clusters=20,
                max_samples=20,
                strength=1.,
            )
            self.cur[m].traj_info.dynamics = \
                DynamicsLRPrior(prior=prior, sigma_regu=sigma_regu)

        self.cur[m].traj_distr = local_policies[m]

    # Cost Fcn
    self._cost_fcn = cost_fcn

    # Global Policy Optimization
    self.global_pol_optimizer = torch.optim.Adam(
        self.global_policy.parameters(),
        lr=self._global_opt_lr,
        betas=(0.9, 0.999),
        eps=1e-08,  # Term added to the denominator for numerical stability
        # weight_decay=0.005,
        weight_decay=0.5,
        amsgrad=True,
    )

    # Local Trajectory Information
    self._local_pol_optimizer = TrajOptLQR(
        cons_per_step=False,
        use_prev_distr=False,
        update_in_bwd_pass=True,
        min_eta=traj_opt_min_eta,
        max_eta=traj_opt_max_eta,
    )

    level = logging.INFO
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(level)
    console = logging.StreamHandler()
    self.logger.addHandler(console)
    for handler in self.logger.handlers:
        handler.setLevel(level)

    self.eval_statistics = None
    self._return_fig = None
    self._return_axs = None
    self._return_lines = [None for _ in range(self.n_test_conds)]

    # MDGPS data #
    # ---------- #
    for m in range(self.M):
        # Same policy prior type for all conditions
        self.cur[m].pol_info = PolicyInfo(
            T=self.T,
            dU=self.dU,
            dX=self.dX,
            init_pol_wt=0.01,
        )
        self.cur[m].pol_info.policy_prior = ConstantPolicyPrior()
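# Sketch (assumption) of how the pre-generated noise tensor, with shape
# (num_epochs, M, num_samples, T, dU), is typically consumed by a time-varying
# linear-Gaussian local policy in GPS: at time t the action is the linear
# feedback term plus the Cholesky-scaled noise from
# noise_data[epoch, cond, sample, t]. The attribute names (K, k, chol_covar)
# are placeholders, not necessarily those of `local_policies[m]`.
def _example_local_policy_action(K, k, chol_covar, x_t, noise_t, t):
    # K: (T, dU, dX), k: (T, dU), chol_covar: (T, dU, dU)
    return K[t].dot(x_t) + k[t] + chol_covar[t].dot(noise_t)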
def __init__(
        self,
        env,
        policy,
        qf,
        replay_buffer,
        normalize_obs=False,
        eval_env=None,
        action_prior='uniform',
        entropy_scale=1.,
        policy_lr=1e-4,
        qf_lr=1e-3,
        policy_weight_decay=0,
        qf_weight_decay=0,
        residual_gradient_weight=0,
        epoch_discount_schedule=None,
        policy_mean_regu_weight=1e-3,
        policy_std_regu_weight=1e-3,
        policy_pre_activation_weight=0.,
        optimizer='adam',
        # optimizer='rmsprop',
        # optimizer='sgd',
        optimizer_kwargs=None,
        target_hard_update_period=1000,
        tau=1e-2,
        use_soft_update=False,
        save_replay_buffer=False,
        eval_deterministic=True,
        log_tensorboard=False,
        **kwargs
):
    # ###### #
    # Models #
    # ###### #

    # Exploration Policy
    self._policy = policy

    # Evaluation Policy
    if eval_deterministic:
        eval_policy = MakeDeterministic(self._policy)
    else:
        eval_policy = self._policy

    # Observation Normalizer
    if normalize_obs:
        self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
    else:
        self._obs_normalizer = None

    RLAlgorithm.__init__(
        self,
        env=env,
        exploration_policy=self._policy,
        eval_env=eval_env,
        eval_policy=eval_policy,
        obs_normalizer=self._obs_normalizer,
        **kwargs
    )

    # Important algorithm hyperparameters
    self._action_prior = action_prior
    self._entropy_scale = entropy_scale

    # Q-function
    self._qf = qf

    # ########## #
    # Optimizers #
    # ########## #
    if optimizer.lower() == 'adam':
        optimizer_class = optim.Adam
        if optimizer_kwargs is None:
            optimizer_kwargs = dict(
                amsgrad=True,
                # amsgrad=False,
            )
    elif optimizer.lower() == 'rmsprop':
        optimizer_class = optim.RMSprop
        if optimizer_kwargs is None:
            optimizer_kwargs = dict()
    else:
        raise ValueError('Wrong optimizer')

    # Q-function optimizer
    self._qf_optimizer = optimizer_class(
        self._qf.parameters(),
        lr=qf_lr,
        weight_decay=qf_weight_decay,
        **optimizer_kwargs
    )

    # Policy optimizer
    self._policy_optimizer = optimizer_class(
        self._policy.parameters(),
        lr=policy_lr,
        weight_decay=policy_weight_decay,
        **optimizer_kwargs
    )

    # Policy regularization coefficients (weights)
    self._policy_mean_regu_weight = policy_mean_regu_weight
    self._policy_std_regu_weight = policy_std_regu_weight
    self._policy_pre_activation_weight = policy_pre_activation_weight

    # Useful Variables for logging
    self.logging_pol_kl_loss = np.zeros(self.num_train_steps_per_epoch)
    self.logging_qf_loss = np.zeros(self.num_train_steps_per_epoch)
    self.logging_rewards = np.zeros(self.num_train_steps_per_epoch)
    self.logging_policy_entropy = np.zeros(self.num_train_steps_per_epoch)
    self.logging_policy_log_std = np.zeros(
        (self.num_train_steps_per_epoch, self.explo_env.action_dim))
    self.logging_policy_mean = np.zeros(
        (self.num_train_steps_per_epoch, self.explo_env.action_dim))

    self._log_tensorboard = log_tensorboard
    self._summary_writer = tensorboardX.SummaryWriter(
        log_dir=logger.get_snapshot_dir())
def evaluate(self, epoch):
    RLAlgorithm.evaluate(self, epoch)
def __init__(
        self,
        explo_env,
        qf,
        policy,
        explo_policy,
        replay_buffer,
        batch_size=1024,
        eval_env=None,
        target_hard_update_period=1000,
        tau=1e-2,
        use_soft_update=False,
        qf_criterion=None,
        residual_gradient_weight=0,
        epoch_discount_schedule=None,
        eval_with_target_policy=False,
        policy_pre_activation_weight=0.,
        policy_lr=1e-4,
        qf_lr=1e-3,
        policy_weight_decay=0.,
        qf_weight_decay=0,
        optimizer='adam',
        # optimizer='rmsprop',
        # optimizer='sgd',
        optimizer_kwargs=None,
        obs_normalizer: TorchFixedNormalizer = None,
        action_normalizer: TorchFixedNormalizer = None,
        num_paths_for_normalization=0,
        reward_scale=1.,
        min_q_value=-np.inf,
        max_q_value=np.inf,
        save_replay_buffer=False,
        **kwargs
):
    """
    :param explo_env:
    :param qf:
    :param policy:
    :param explo_policy:
    :param policy_lr:
    :param qf_lr:
    :param qf_weight_decay:
    :param target_hard_update_period:
    :param tau:
    :param use_soft_update:
    :param qf_criterion: Loss function to use for the Q-function. Should be
        a function that takes in two inputs (y_predicted, y_target).
    :param residual_gradient_weight: c, a float between 0 and 1. The
        gradient used for training the Q-function is then
        (1 - c) * (normal TD gradient) + c * (residual gradient).
    :param epoch_discount_schedule: A schedule for the discount factor that
        varies with the epoch.
    :param kwargs:
    """
    self._target_policy = policy.copy()

    if eval_with_target_policy:
        eval_policy = self._target_policy
    else:
        eval_policy = policy

    RLAlgorithm.__init__(
        self,
        explo_env=explo_env,
        explo_policy=explo_policy,
        eval_env=eval_env,
        eval_policy=eval_policy,
        **kwargs
    )

    self.policy = policy
    self.target_hard_update_period = target_hard_update_period
    self.tau = tau
    self.use_soft_update = use_soft_update
    self.residual_gradient_weight = residual_gradient_weight
    self.policy_pre_activation_weight = policy_pre_activation_weight
    self.epoch_discount_schedule = epoch_discount_schedule
    self.obs_normalizer = obs_normalizer
    self.action_normalizer = action_normalizer
    self.num_paths_for_normalization = num_paths_for_normalization
    self.reward_scale = reward_scale

    # Q-function
    self._qf = qf
    self._target_qf = self._qf.copy()
    self.min_q_value = min_q_value
    self.max_q_value = max_q_value
    if qf_criterion is None:
        qf_criterion = nn.MSELoss()
    self.qf_criterion = qf_criterion

    # Replay Buffer
    self.replay_buffer = replay_buffer
    self.batch_size = batch_size
    self.save_replay_buffer = save_replay_buffer

    # ########## #
    # Optimizers #
    # ########## #
    if optimizer.lower() == 'adam':
        optimizer_class = optim.Adam
        if optimizer_kwargs is None:
            optimizer_kwargs = dict(
                amsgrad=True,
                # amsgrad=False,
            )
    elif optimizer.lower() == 'rmsprop':
        optimizer_class = optim.RMSprop
        if optimizer_kwargs is None:
            optimizer_kwargs = dict()
    else:
        raise ValueError('Wrong optimizer')

    self._qf_lr = qf_lr
    self._policy_lr = policy_lr
    self._qf_weight_decay = qf_weight_decay
    self._policy_weight_decay = policy_weight_decay

    # Q-function optimizer
    self._qf_optimizer = optimizer_class(
        self._qf.parameters(),
        lr=qf_lr,
        weight_decay=qf_weight_decay,
        **optimizer_kwargs
    )

    # Policy optimizer
    self._policy_optimizer = optimizer_class(
        self.policy.parameters(),
        lr=policy_lr,
        weight_decay=policy_weight_decay,
        **optimizer_kwargs
    )

    # Useful Variables for logging
    self.log_data = dict()
    self.log_data['Raw Pol Loss'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Pol Loss'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Qf Loss'] = np.zeros(self.num_train_steps_per_epoch)
    self.log_data['Q pred'] = np.zeros(
        (self.num_train_steps_per_epoch, batch_size)
    )
    self.log_data['Q target'] = np.zeros(
        (self.num_train_steps_per_epoch, batch_size)
    )
    self.log_data['Bellman Error'] = np.zeros(
        (self.num_train_steps_per_epoch, batch_size)
    )
    self.log_data['Policy Actions'] = np.zeros(
        (self.num_train_steps_per_epoch, batch_size,
         self.explo_env.action_dim)
    )
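# Sketch (assumption) of the blending described in the docstring for
# `residual_gradient_weight` (c): the usual TD loss treats the bootstrap
# target as a constant, while the residual term also backpropagates through
# the next-state Q-value of the trained network; the two are mixed with
# weights (1 - c) and c. `qf`, `target_qf`, `target_policy`, and the batch
# tensors are placeholders, not this class's exact call signatures.
def _example_residual_qf_loss(qf, target_qf, target_policy, batch, gamma, c):
    obs, actions, rewards, next_obs, terminals = batch
    q_pred = qf(obs, actions)

    next_actions = target_policy(next_obs)

    # Standard TD loss: bootstrap target is detached (treated as constant).
    q_target = rewards + (1. - terminals) * gamma * target_qf(next_obs,
                                                              next_actions)
    td_loss = ((q_pred - q_target.detach()) ** 2).mean()

    # Residual loss: gradient also flows through the next-state Q-value.
    q_next = qf(next_obs, next_actions.detach())
    residual_target = rewards + (1. - terminals) * gamma * q_next
    residual_loss = ((q_pred - residual_target) ** 2).mean()

    return (1. - c) * td_loss + c * residual_loss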
def __init__(
        self,
        env,
        policy,
        u_qf1,
        replay_buffer,
        batch_size=1024,
        normalize_obs=False,
        eval_env=None,
        i_qf1=None,
        u_qf2=None,
        i_qf2=None,
        i_vf=None,
        u_vf=None,
        action_prior='uniform',
        i_entropy_scale=1.,
        u_entropy_scale=None,
        auto_alpha=True,
        i_tgt_entro=None,
        u_tgt_entros=None,
        policy_lr=3e-4,
        qf_lr=3e-4,
        i_policy_mean_regu_weight=1e-3,
        i_policy_std_regu_weight=1e-3,
        i_policy_pre_activation_weight=0.,
        i_policy_mixing_coeff_weight=1e-3,
        u_policy_mean_regu_weight=None,
        u_policy_std_regu_weight=None,
        u_policy_pre_activation_weight=None,
        policy_weight_decay=0.,
        q_weight_decay=0.,
        optimizer='adam',
        # optimizer='rmsprop',
        # optimizer='sgd',
        optimizer_kwargs=None,
        i_soft_target_tau=5e-3,
        u_soft_target_tau=5e-3,
        i_target_update_interval=1,
        u_target_update_interval=1,
        reward_scale=1.,
        u_reward_scales=None,
        save_replay_buffer=False,
        eval_deterministic=True,
        log_tensorboard=False,
        **kwargs
):
    # ###### #
    # Models #
    # ###### #

    # Exploration Policy
    self._policy = policy

    # Evaluation Policy
    if eval_deterministic:
        eval_policy = MakeDeterministic(self._policy)
    else:
        eval_policy = self._policy

    # Observation Normalizer
    if normalize_obs:
        self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
    else:
        self._obs_normalizer = None

    RLAlgorithm.__init__(
        self,
        explo_env=env,
        explo_policy=self._policy,
        eval_env=eval_env,
        eval_policy=eval_policy,
        obs_normalizer=self._obs_normalizer,
        **kwargs
    )

    # Number of Unintentional Tasks (Composable Tasks)
    self._n_unintentional = self._policy.n_heads

    # Evaluation Sampler (one for each unintentional task)
    self.eval_u_samplers = [
        InPlacePathSampler(
            env=env,
            policy=WeightedMultiPolicySelector(self._policy, idx),
            total_samples=self.num_steps_per_eval,
            max_path_length=self.max_path_length,
            deterministic=True,
        )
        for idx in range(self._n_unintentional)
    ]

    # Intentional (Main Task) Q-functions
    self._i_qf1 = i_qf1
    self._i_qf2 = i_qf2
    if i_vf is None:
        self._i_vf = None
        self._i_target_vf = None
        self._i_target_qf1 = self._i_qf1.copy()
        self._i_target_qf2 = \
            None if self._i_qf2 is None else self._i_qf2.copy()
    else:
        self._i_vf = i_vf
        self._i_target_vf = self._i_vf.copy()
        self._i_target_qf1 = None
        self._i_target_qf2 = None

    # Unintentional (Composable Tasks) Q-functions
    self._u_qf1 = u_qf1
    self._u_qf2 = u_qf2
    if u_vf is None:
        self._u_vf = None
        self._u_target_vf = None
        self._u_target_qf1 = self._u_qf1.copy()
        self._u_target_qf2 = \
            None if self._u_qf2 is None else self._u_qf2.copy()
    else:
        self._u_vf = u_vf
        self._u_target_vf = self._u_vf.copy()
        self._u_target_qf1 = None
        self._u_target_qf2 = None

    # Replay Buffer
    self.replay_buffer = replay_buffer
    self.batch_size = batch_size
    self.save_replay_buffer = save_replay_buffer

    # Soft-update rate for target V-functions
    self._i_soft_target_tau = i_soft_target_tau
    self._u_soft_target_tau = u_soft_target_tau
    self._i_target_update_interval = i_target_update_interval
    self._u_target_update_interval = u_target_update_interval

    # Important algorithm hyperparameters
    self._action_prior = action_prior
    self._i_entropy_scale = i_entropy_scale
    if u_entropy_scale is None:
        u_entropy_scale = [
            i_entropy_scale for _ in range(self._n_unintentional)
        ]
    self._u_entropy_scale = torch.tensor(u_entropy_scale,
                                         dtype=torch.float32,
                                         device=ptu.device)

    # Desired Alphas
    self._auto_alphas = auto_alpha
    if i_tgt_entro is None:
        i_tgt_entro = -env.action_dim
    self._i_tgt_entro = torch.tensor([i_tgt_entro],
                                     dtype=torch.float32,
                                     device=ptu.device)
    if u_tgt_entros is None:
        u_tgt_entros = [i_tgt_entro for _ in range(self._n_unintentional)]
    self._u_tgt_entros = torch.tensor(u_tgt_entros,
                                      dtype=torch.float32,
                                      device=ptu.device)
    self._u_log_alphas = torch.zeros(self._n_unintentional,
                                     device=ptu.device,
                                     requires_grad=True)
    self._i_log_alpha = torch.zeros(1,
                                    device=ptu.device,
                                    requires_grad=True)

    # Reward Scales
    self.reward_scale = reward_scale
    if u_reward_scales is None:
        # Default to the (explicit) main-task reward scale
        u_reward_scales = [
            reward_scale for _ in range(self._n_unintentional)
        ]
    self._u_reward_scales = torch.tensor(u_reward_scales,
                                         dtype=torch.float32,
                                         device=ptu.device)

    # ########## #
    # Optimizers #
    # ########## #
    if optimizer.lower() == 'adam':
        optimizer_class = optim.Adam
        if optimizer_kwargs is None:
            optimizer_kwargs = dict(
                amsgrad=True,
                # amsgrad=False,
            )
    elif optimizer.lower() == 'rmsprop':
        optimizer_class = optim.RMSprop
        if optimizer_kwargs is None:
            optimizer_kwargs = dict()
    else:
        raise ValueError('Wrong optimizer')

    # Values optimizer
    vals_params_list = [self._u_qf1.parameters(), self._i_qf1.parameters()]
    if self._u_qf2 is not None:
        vals_params_list.append(self._u_qf2.parameters())
    if self._i_qf2 is not None:
        vals_params_list.append(self._i_qf2.parameters())
    if self._u_vf is not None:
        vals_params_list.append(self._u_vf.parameters())
    if self._i_vf is not None:
        vals_params_list.append(self._i_vf.parameters())
    vals_params = chain(*vals_params_list)
    self._values_optimizer = optimizer_class(
        vals_params,
        lr=qf_lr,
        weight_decay=q_weight_decay,
        **optimizer_kwargs
    )

    # Policy optimizer
    self._policy_optimizer = optimizer_class(
        self._policy.parameters(),
        lr=policy_lr,
        weight_decay=policy_weight_decay,
        **optimizer_kwargs
    )

    # Alpha optimizers
    self._alphas_optimizer = optimizer_class(
        [self._u_log_alphas, self._i_log_alpha],
        lr=policy_lr,
        **optimizer_kwargs
    )

    # Weights for policy regularization coefficients
    self._i_pol_mean_regu_weight = i_policy_mean_regu_weight
    self._i_pol_std_regu_weight = i_policy_std_regu_weight
    self._i_pol_pre_activ_weight = i_policy_pre_activation_weight
    self._i_pol_mixing_coeff_weight = i_policy_mixing_coeff_weight

    if u_policy_mean_regu_weight is None:
        u_policy_mean_regu_weight = [
            i_policy_mean_regu_weight for _ in range(self._n_unintentional)
        ]
    self._u_policy_mean_regu_weight = \
        torch.tensor(u_policy_mean_regu_weight,
                     dtype=torch.float32,
                     device=ptu.device)
    if u_policy_std_regu_weight is None:
        u_policy_std_regu_weight = [
            i_policy_std_regu_weight for _ in range(self._n_unintentional)
        ]
    self._u_policy_std_regu_weight = \
        torch.tensor(u_policy_std_regu_weight,
                     dtype=torch.float32,
                     device=ptu.device)
    if u_policy_pre_activation_weight is None:
        u_policy_pre_activation_weight = [
            i_policy_pre_activation_weight
            for _ in range(self._n_unintentional)
        ]
    self._u_policy_pre_activ_weight = \
        torch.tensor(u_policy_pre_activation_weight,
                     dtype=torch.float32,
                     device=ptu.device)

    # Useful Variables for logging
    self.log_data = dict()
    self.log_data['Pol KL Loss'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Qf Loss'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Qf2 Loss'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Vf Loss'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Rewards'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Policy Entropy'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))
    self.log_data['Policy Mean'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
        self.explo_env.action_dim,
    ))
    self.log_data['Pol Log Std'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
        self.explo_env.action_dim,
    ))
    self.log_data['Mixing Weights'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional,
        self.explo_env.action_dim,
    ))
    self.log_data['Alphas'] = np.zeros((
        self.num_train_steps_per_epoch,
        self._n_unintentional + 1,
    ))

    # Tensorboard-like Logging
    self._log_tensorboard = log_tensorboard
    if log_tensorboard:
        self._summary_writer = \
            tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
    else:
        self._summary_writer = None
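# Sketch (assumption) of how the learned temperatures are usually updated when
# `auto_alpha` is enabled: each log-alpha (one per unintentional task plus the
# intentional one) is pushed so the policy entropy matches its target entropy,
# following the standard SAC temperature rule. `log_pi_u` (one column per
# task) and `log_pi_i` are placeholders for log-probabilities computed during
# a training step; the combined loss would be stepped with `_alphas_optimizer`.
def _example_alpha_loss(u_log_alphas, i_log_alpha, u_tgt_entros, i_tgt_entro,
                        log_pi_u, log_pi_i):
    u_loss = -(u_log_alphas * (log_pi_u + u_tgt_entros).detach()).mean()
    i_loss = -(i_log_alpha * (log_pi_i + i_tgt_entro).detach()).mean()
    return u_loss + i_loss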