Example 1: _end_epoch() archives the current per-condition IterationData into self.prev and resets self.cur for the next epoch (MDGPS-style bookkeeping).
    def _end_epoch(self):
        # TODO: change IterationData to reflect new stuff better

        del self.prev
        self.prev = copy.deepcopy(self.cur)

        for m in range(self.M):
            self.prev[m].new_traj_distr = self.new_traj_distr[m]

        # Create new IterationData objects and clear new_traj_distr
        self.cur = [IterationData() for _ in range(self.M)]
        for m in range(self.M):
            self.cur[m].traj_info = TrajectoryInfo()
            self.cur[m].traj_info.dynamics = \
                copy.deepcopy(self.prev[m].traj_info.dynamics)
            self.cur[m].step_mult = self.prev[m].step_mult
            self.cur[m].eta = self.prev[m].eta
            self.cur[m].traj_distr = self.new_traj_distr[m]
            self.cur[m].traj_info.last_kl_step = \
                self.prev[m].traj_info.last_kl_step
            # MDGPS
            self.cur[m].pol_info = copy.deepcopy(self.prev[m].pol_info)
        self.new_traj_distr = None

        RLAlgorithm._end_epoch(self)
Example 2: _handle_step() adds each transition to the replay buffer and then delegates to the base class.
    def _handle_step(
            self,
            observation,
            action,
            reward,
            next_observation,
            terminal,
            agent_info,
            env_info,
    ):
        """
        Implement anything that needs to happen after every step
        :return:
        """
        # Add to replay buffer
        self.replay_buffer.add_sample(
            observation=observation,
            action=action,
            reward=reward,
            terminal=terminal,
            next_observation=next_observation,
            agent_info=agent_info,
            env_info=env_info,
        )

        RLAlgorithm._handle_step(
            self,
            observation=observation,
            action=action,
            reward=reward,
            next_observation=next_observation,
            terminal=terminal,
            agent_info=agent_info,
            env_info=env_info,
        )
Example 3: evaluate() updates the logging data, delegates to the base class, and then zeroes the per-epoch log arrays.
    def evaluate(self, epoch):
        self._update_logging_data()
        RLAlgorithm.evaluate(self, epoch)

        # Reset log_data
        for key in self.log_data.keys():
            self.log_data[key].fill(0)
Example 4: _end_rollout() closes the current episode in the replay buffer.
    def _end_rollout(self):
        """
        Implement anything that needs to happen after every rollout.
        """

        self.replay_buffer.terminate_episode()

        RLAlgorithm._end_rollout(self)
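Examples 2 and 4 only assume that replay_buffer exposes add_sample(...) and terminate_episode(). A hypothetical minimal stand-in for that interface (a sketch, not the project's actual buffer class; the field layout and the random_batch() helper are assumptions) could look like:

import numpy as np

class SimpleReplayBuffer(object):
    """Hypothetical ring buffer matching the calls in Examples 2 and 4."""

    def __init__(self, max_size, obs_dim, action_dim):
        self._max_size = max_size
        self._obs = np.zeros((max_size, obs_dim))
        self._next_obs = np.zeros((max_size, obs_dim))
        self._actions = np.zeros((max_size, action_dim))
        self._rewards = np.zeros((max_size, 1))
        self._terminals = np.zeros((max_size, 1))
        self._top = 0    # next write index
        self._size = 0   # number of valid samples

    def add_sample(self, observation, action, reward, terminal,
                   next_observation, agent_info=None, env_info=None):
        # Overwrite the oldest sample once the buffer is full
        self._obs[self._top] = observation
        self._actions[self._top] = action
        self._rewards[self._top] = reward
        self._terminals[self._top] = terminal
        self._next_obs[self._top] = next_observation
        self._top = (self._top + 1) % self._max_size
        self._size = min(self._size + 1, self._max_size)

    def terminate_episode(self):
        # Nothing to do for a flat buffer; an episodic buffer would close
        # the current trajectory here.
        pass

    def random_batch(self, batch_size):
        idxs = np.random.randint(0, self._size, batch_size)
        return dict(
            observations=self._obs[idxs],
            actions=self._actions[idxs],
            rewards=self._rewards[idxs],
            terminals=self._terminals[idxs],
            next_observations=self._next_obs[idxs],
        )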
Example 5: get_epoch_snapshot() extends the base snapshot with the policies, Q-functions and, optionally, the replay buffer.
    def get_epoch_snapshot(self, epoch):
        """
        Stuff to save in file.
        Args:
            epoch:

        Returns:

        """
        snapshot = RLAlgorithm.get_epoch_snapshot(self, epoch)

        snapshot.update(
            policy=self.eval_policy,
            trained_policy=self.policy,
            target_policy=self._target_policy,
            exploration_policy=self.explo_policy,
            qf=self._qf,
            target_qf=self._target_qf,
        )

        # Replay Buffer
        if self.save_replay_buffer:
            snapshot.update(
                replay_buffer=self.replay_buffer,
            )

        return snapshot
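The dictionary returned by get_epoch_snapshot() is whatever the training loop decides to persist. A minimal sketch of serializing it with torch.save (illustrative only; snapshot_dir and the file name are assumptions, and the base RLAlgorithm's own logger may handle persistence differently):

import os
import torch

def save_epoch_snapshot(algo, epoch, snapshot_dir):
    """Persist the per-epoch snapshot returned by the algorithm (sketch)."""
    snapshot = algo.get_epoch_snapshot(epoch)
    path = os.path.join(snapshot_dir, 'itr_%d.pkl' % epoch)
    torch.save(snapshot, path)
    return path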
Example 6: _handle_step() as in Example 2, plus an update of the observation normalizer.
    def _handle_step(
        self,
        observation,
        action,
        reward,
        next_observation,
        terminal,
        agent_info,
        env_info,
    ):
        """
        Implement anything that needs to happen after every step
        :return:
        """
        # Add to replay buffer
        self.replay_buffer.add_sample(
            observation=observation,
            action=action,
            reward=reward,
            terminal=terminal,
            next_observation=next_observation,
            agent_info=agent_info,
            env_info=env_info,
        )

        # Update observation normalizer (if applicable)
        if self._obs_normalizer is not None:
            self._obs_normalizer.update(np.array([observation]))

        RLAlgorithm._handle_step(
            self,
            observation=observation,
            action=action,
            reward=reward,
            next_observation=next_observation,
            terminal=terminal,
            agent_info=agent_info,
            env_info=env_info,
        )
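Example 6 additionally feeds each observation into self._obs_normalizer.update(...). The RunningNormalizer used here belongs to the surrounding project; a hypothetical stand-in that keeps running mean/variance statistics (a sketch, not the project's implementation) might be:

import numpy as np

class RunningNormalizer(object):
    """Hypothetical running mean/std normalizer with an update() method."""

    def __init__(self, shape, eps=1e-8):
        self._mean = np.zeros(shape)
        self._var = np.ones(shape)
        self._count = 0
        self._eps = eps

    def update(self, data):
        # data: array of shape (batch, *shape)
        for x in np.asarray(data):
            self._count += 1
            delta = x - self._mean
            self._mean += delta / self._count
            # Incremental (biased) variance, Welford-style
            self._var += (delta * (x - self._mean) - self._var) / self._count

    def normalize(self, x):
        return (x - self._mean) / np.sqrt(self._var + self._eps)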
Example 7: get_epoch_snapshot() for an algorithm with intentional and unintentional Q- and V-functions, including optional observation statistics.
    def get_epoch_snapshot(self, epoch):
        """
        Stuff to save in file.
        Args:
            epoch:

        Returns:

        """
        if self._epoch_plotter is not None:
            self._epoch_plotter.draw()
            self._epoch_plotter.save_figure(epoch)

        snapshot = RLAlgorithm.get_epoch_snapshot(self, epoch)

        snapshot.update(
            policy=self._policy,
            qf=self._i_qf1,
            qf2=self._i_qf2,
            target_qf=self._i_target_qf1,
            vf=self._i_vf,
            target_vf=self._i_target_vf,
            u_qf=self._u_qf1,
            u_qf2=self._u_qf2,
            u_vf=self._u_vf,
            target_u_qf1=self._u_target_qf1,
            target_u_qf2=self._u_target_qf2,
            target_u_vf=self._u_target_vf,
        )

        if self.explo_env.online_normalization or self.explo_env.normalize_obs:
            snapshot.update(
                obs_mean=self.explo_env.obs_mean,
                obs_var=self.explo_env.obs_var,
            )

        # Observation Normalizer
        snapshot.update(obs_normalizer=self._obs_normalizer, )

        # Replay Buffer
        if self.save_replay_buffer:
            snapshot.update(replay_buffer=self.replay_buffer, )

        return snapshot
Example 8: constructor of a soft actor-critic style algorithm: models, target networks, replay buffer, optimizers, entropy temperature and logging buffers.
    def __init__(
            self,
            explo_env,
            policy,
            qf,
            replay_buffer,
            batch_size=1024,
            normalize_obs=False,
            eval_env=None,
            vf=None,
            qf2=None,
            action_prior='uniform',
            entropy_scale=1.,
            auto_alpha=True,
            tgt_entro=None,
            policy_lr=3e-4,
            qf_lr=3e-4,
            policy_mean_regu_weight=1e-3,
            policy_std_regu_weight=1e-3,
            policy_pre_activation_weight=0.,
            policy_weight_decay=0.,
            q_weight_decay=0.,
            optimizer='adam',
            # optimizer='rmsprop',
            # optimizer='sgd',
            optimizer_kwargs=None,
            soft_target_tau=5e-3,
            target_update_interval=1,
            reward_scale=1.,
            save_replay_buffer=False,
            eval_deterministic=True,
            log_tensorboard=False,
            **kwargs):

        # ###### #
        # Models #
        # ###### #

        # Exploration Policy
        self._policy = policy

        # Evaluation Policy
        if eval_deterministic:
            eval_policy = MakeDeterministic(self._policy)
        else:
            eval_policy = self._policy

        # Observation Normalizer
        if normalize_obs:
            self._obs_normalizer = RunningNormalizer(shape=explo_env.obs_dim)
        else:
            self._obs_normalizer = None

        RLAlgorithm.__init__(self,
                             explo_env=explo_env,
                             explo_policy=self._policy,
                             eval_env=eval_env,
                             eval_policy=eval_policy,
                             obs_normalizer=self._obs_normalizer,
                             **kwargs)

        # Q-function(s) and V-function
        self._qf = qf
        self._qf2 = qf2

        if vf is None:
            self._vf = None
            self._target_vf = None
            self._target_qf1 = qf.copy()
            self._target_qf2 = None if qf2 is None else qf2.copy()
        else:
            self._vf = vf
            self._target_vf = vf.copy()
            self._target_qf1 = None
            self._target_qf2 = None

        # Replay Buffer
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.save_replay_buffer = save_replay_buffer

        # Soft-update rate for target V-function
        self._soft_target_tau = soft_target_tau
        self._target_update_interval = target_update_interval

        # Important algorithm hyperparameters
        self._action_prior = action_prior
        self._entropy_scale = entropy_scale

        # Desired Alpha
        self._auto_alpha = auto_alpha
        if tgt_entro is None:
            tgt_entro = -explo_env.action_dim
        self._tgt_entro = torch.tensor([float(tgt_entro)], device=ptu.device)
        self._log_alpha = torch.zeros(1, device=ptu.device, requires_grad=True)

        # Reward Scale
        self.reward_scale = reward_scale

        # ########## #
        # Optimizers #
        # ########## #
        if optimizer.lower() == 'adam':
            optimizer_class = optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True,
                                        # amsgrad=False,
                                        )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Wrong optimizer')
        self.qf_lr = qf_lr
        self.policy_lr = policy_lr

        # Q-function(s) optimizer(s)
        self._qf1_optimizer = optimizer_class(self._qf.parameters(),
                                              lr=qf_lr,
                                              weight_decay=q_weight_decay,
                                              **optimizer_kwargs)
        values_parameters = self._qf.parameters()
        if self._qf2 is None:
            self._qf2_optimizer = None
        else:
            self._qf2_optimizer = optimizer_class(self._qf2.parameters(),
                                                  lr=qf_lr,
                                                  weight_decay=q_weight_decay,
                                                  **optimizer_kwargs)
            values_parameters = chain(values_parameters,
                                      self._qf2.parameters())

        # V-function optimizer
        if self._vf is None:
            self._vf_optimizer = None
        else:
            self._vf_optimizer = optimizer_class(self._vf.parameters(),
                                                 lr=qf_lr,
                                                 weight_decay=q_weight_decay,
                                                 **optimizer_kwargs)
            values_parameters = chain(values_parameters, self._vf.parameters())
        self._values_optimizer = optimizer_class(values_parameters,
                                                 lr=qf_lr,
                                                 weight_decay=q_weight_decay,
                                                 **optimizer_kwargs)

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self._policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs)

        # Alpha optimizer
        self._alpha_optimizer = optimizer_class([self._log_alpha],
                                                lr=policy_lr,
                                                **optimizer_kwargs)

        # Weights for policy regularization coefficients
        self.pol_mean_regu_weight = policy_mean_regu_weight
        self.pol_std_regu_weight = policy_std_regu_weight
        self.pol_pre_activation_weight = policy_pre_activation_weight

        # Useful Variables for logging
        self.log_data = dict()
        self.log_data['Pol KL Loss'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Qf Loss'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Qf2 Loss'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Vf Loss'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Rewards'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Pol Entropy'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Pol Log Std'] = np.zeros((
            self.num_train_steps_per_epoch,
            self.explo_env.action_dim,
        ))
        self.log_data['Policy Mean'] = np.zeros((
            self.num_train_steps_per_epoch,
            self.explo_env.action_dim,
        ))
        self.log_data['Alphas'] = np.zeros(self.num_train_steps_per_epoch)

        # Tensorboard-like Logging
        self._log_tensorboard = log_tensorboard
        if log_tensorboard:
            self._summary_writer = \
                tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
        else:
            self._summary_writer = None
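Example 8 stores self._log_alpha, self._tgt_entro and an _alpha_optimizer over [self._log_alpha], which points to an automatic entropy-temperature adjustment. A common formulation of that update (a sketch of the standard SAC temperature loss; the class's actual training step may differ) is:

import torch

def alpha_update_step(log_alpha, log_pi, target_entropy, alpha_optimizer):
    """One gradient step on the entropy temperature (illustrative sketch).

    log_alpha: scalar tensor with requires_grad=True
    log_pi: log-probabilities of the sampled actions, shape (batch, 1)
    target_entropy: desired policy entropy (e.g. -action_dim)
    alpha_optimizer: optimizer built over [log_alpha]
    """
    # Push alpha up when the policy entropy (-log_pi) falls below the
    # target, and down otherwise.
    alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()

    alpha_optimizer.zero_grad()
    alpha_loss.backward()
    alpha_optimizer.step()

    return log_alpha.exp().detach()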
Example 9: constructor of a multi-head (intentional plus unintentional tasks) algorithm with deterministic policies and per-task Q-functions.
    def __init__(
            self,
            env,
            policy,
            explo_policy,
            u_qf,
            replay_buffer,
            batch_size=1024,
            normalize_obs=False,
            eval_env=None,
            i_qf=None,
            action_prior='uniform',
            policy_lr=3e-4,
            qf_lr=1e-4,
            i_policy_pre_activation_weight=0.,
            i_policy_mixing_coeff_weight=1e-3,
            u_policy_pre_activation_weight=None,
            policy_weight_decay=0.,
            qf_weight_decay=0.,
            optimizer='adam',
            # optimizer='rmsprop',
            # optimizer='sgd',
            optimizer_kwargs=None,
            i_soft_target_tau=1e-2,
            u_soft_target_tau=1e-2,
            i_target_update_interval=1,
            u_target_update_interval=1,
            reward_scale=1.,
            u_reward_scales=None,
            min_q_value=-np.inf,
            max_q_value=np.inf,
            residual_gradient_weight=0,
            eval_with_target_policy=False,
            save_replay_buffer=False,
            log_tensorboard=False,
            **kwargs):

        # ###### #
        # Models #
        # ###### #

        # Deterministic Policies
        self._policy = policy
        self._target_policy = policy.copy()

        # Exploration Policy
        self._exploration_policy = explo_policy

        # Evaluation Policy
        if eval_with_target_policy:
            eval_policy = self._target_policy
        else:
            eval_policy = self._policy

        # Observation Normalizer
        if normalize_obs:
            self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
        else:
            self._obs_normalizer = None

        RLAlgorithm.__init__(self,
                             explo_env=env,
                             explo_policy=self._exploration_policy,
                             eval_env=eval_env,
                             eval_policy=eval_policy,
                             obs_normalizer=self._obs_normalizer,
                             **kwargs)

        # Number of Unintentional Tasks (Composable Tasks)
        self._n_unintentional = self._policy.n_heads

        # Evaluation Sampler (One for each unintentional)
        self.eval_u_samplers = [
            InPlacePathSampler(
                env=env,
                policy=WeightedMultiPolicySelector(eval_policy, idx),
                total_samples=self.num_steps_per_eval,
                max_path_length=self.max_path_length,
                deterministic=None,
            ) for idx in range(self._n_unintentional)
        ]

        # Important algorithm hyperparameters
        self._action_prior = action_prior

        # Intentional (Main Task) Q-function
        self._i_qf = i_qf
        self._i_target_qf = i_qf.copy()

        # Unintentional (Composable Tasks) Q-functions
        self._u_qf = u_qf
        self._u_target_qf = u_qf.copy()

        self._min_q_value = min_q_value
        self._max_q_value = max_q_value
        self._residual_gradient_weight = residual_gradient_weight

        # Soft-update rate for target V-functions
        self._i_soft_target_tau = i_soft_target_tau
        self._u_soft_target_tau = u_soft_target_tau
        self._i_target_update_interval = i_target_update_interval
        self._u_target_update_interval = u_target_update_interval

        # Reward Scales
        self.reward_scale = reward_scale
        if u_reward_scales is None:
            # Default the unintentional reward scales to the main reward scale
            u_reward_scales = [
                reward_scale for _ in range(self._n_unintentional)
            ]
        self._u_reward_scales = ptu.FloatTensor(u_reward_scales)

        # Replay Buffer
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.save_replay_buffer = save_replay_buffer

        # ########## #
        # Optimizers #
        # ########## #
        if optimizer.lower() == 'adam':
            optimizer_class = optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True,
                                        # amsgrad=False,
                                        )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Wrong optimizer')
        self._qf_lr = qf_lr
        self._policy_lr = policy_lr

        # Q-function and V-function Optimization Criteria
        self._u_qf_criterion = nn.MSELoss()
        self._i_qf_criterion = nn.MSELoss()

        # Q-function(s) optimizers(s)
        self._u_qf_optimizer = optimizer_class(self._u_qf.parameters(),
                                               lr=qf_lr,
                                               weight_decay=qf_weight_decay,
                                               **optimizer_kwargs)
        self._i_qf_optimizer = optimizer_class(self._i_qf.parameters(),
                                               lr=qf_lr,
                                               weight_decay=qf_weight_decay,
                                               **optimizer_kwargs)

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self._policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs)

        # Policy regularization coefficients (weights)
        self._i_pol_pre_activ_weight = i_policy_pre_activation_weight
        self._i_pol_mixing_coeff_weight = i_policy_mixing_coeff_weight

        if u_policy_pre_activation_weight is None:
            u_policy_pre_activation_weight = [
                i_policy_pre_activation_weight
                for _ in range(self._n_unintentional)
            ]
        self._u_policy_pre_activ_weight = \
            ptu.FloatTensor(u_policy_pre_activation_weight)

        # Useful Variables for logging
        self.log_data = dict()
        self.log_data['Raw Pol Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Pol Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Qf Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Rewards'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Policy Action'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
            self.explo_env.action_dim,
        ))
        self.log_data['Mixing Weights'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional,
            self.explo_env.action_dim,
        ))

        # Tensorboard-like Logging
        self._log_tensorboard = log_tensorboard
        if log_tensorboard:
            self._summary_writer = \
                tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
        else:
            self._summary_writer = None
Example 10: constructor of an MDGPS-style guided policy search algorithm with local trajectory optimizers and a global policy.
    def __init__(self,
                 env,
                 local_policies,
                 global_policy,
                 cost_fcn,
                 eval_env=None,
                 train_cond_idxs=None,
                 test_cond_idxs=None,
                 num_samples=1,
                 test_samples=1,
                 noisy_samples=True,
                 noise_hyperparams=None,
                 seed=10,
                 base_kl_step=0.1,
                 global_opt_iters=5000,
                 global_opt_batch_size=64,
                 global_opt_lr=1e-5,
                 traj_opt_prev='nn_pol',
                 traj_opt_iters=1,
                 traj_opt_min_eta=1e-8,
                 traj_opt_max_eta=1e16,
                 **kwargs):

        # Fixed algorithm settings (defined here rather than as arguments)
        self._fit_dynamics = True
        self._initial_state_var = 1.0e-2

        self._global_opt_batch_size = global_opt_batch_size
        self._global_opt_iters = global_opt_iters
        self._global_opt_ent_reg = 0.0  # Entropy regu. for policy variance update
        self._global_pol_sample_mode = 'add'
        self._global_opt_lr = global_opt_lr
        self._global_samples_counter = 0
        self._first_global_eval = False

        self.base_kl_step = base_kl_step
        self._max_step_mult = 3.0
        self._min_step_mult = 0.5
        self._kl_step_rule = 'laplace'

        self._traj_opt_iters = traj_opt_iters
        self._max_ent_traj = 0.0
        self._traj_opt_prev = traj_opt_prev

        self.T = kwargs['max_path_length']
        self._num_samples = num_samples
        self._test_samples = test_samples

        self._train_cond_idxs = train_cond_idxs
        self._test_cond_idxs = test_cond_idxs

        # Get dimensions from the environment
        self.dU = env.action_dim
        self.dX = env.obs_dim  # TODO: temporarily using obs_dim as state dim
        self.dO = env.obs_dim

        # Number of initial conditions
        self.M = len(local_policies)

        exploration_policy = global_policy

        RLAlgorithm.__init__(self,
                             env=env,
                             exploration_policy=exploration_policy,
                             eval_env=eval_env,
                             eval_policy=global_policy,
                             eval_sampler=self.sample_global_pol,
                             **kwargs)

        # Rename for GPS
        self.global_policy = self.eval_policy
        self.local_policies = local_policies

        # Noise to be used with trajectory distributions
        self.noise_data = np.zeros(
            (self.num_epochs, self.M, self._num_samples, self.T, self.dU))
        self._noisy_samples = noisy_samples
        if self._noisy_samples:
            for ii in range(self.num_epochs):
                for cond in range(self.M):
                    for n in range(self._num_samples):
                        self.noise_data[ii, cond, n, :, :] = \
                            generate_noise(self.T, self.dU, noise_hyperparams)

        # IterationData objects for each condition.
        self.cur = [IterationData() for _ in range(self.M)]
        self.prev = [IterationData() for _ in range(self.M)]

        # Trajectory Info
        for m in range(self.M):
            self.cur[m].traj_info = TrajectoryInfo()

            if self._fit_dynamics:
                sigma_regu = 1e-6
                prior = DynamicsPriorGMM(
                    min_samples_per_cluster=40,
                    max_clusters=20,
                    max_samples=20,
                    strength=1.,
                )

                self.cur[m].traj_info.dynamics = \
                    DynamicsLRPrior(prior=prior, sigma_regu=sigma_regu)

                self.cur[m].traj_distr = local_policies[m]

        # Cost Fcn
        self._cost_fcn = cost_fcn

        # Global Policy Optimization
        self.global_pol_optimizer = torch.optim.Adam(
            self.global_policy.parameters(),
            lr=self._global_opt_lr,
            betas=(0.9, 0.999),
            eps=1e-08,  # Term added to the denominator for numerical stability
            # weight_decay=0.005,
            weight_decay=0.5,
            amsgrad=True,
        )

        # Local Trajectory Information
        self._local_pol_optimizer = TrajOptLQR(
            cons_per_step=False,
            use_prev_distr=False,
            update_in_bwd_pass=True,
            min_eta=traj_opt_min_eta,
            max_eta=traj_opt_max_eta,
        )

        level = logging.INFO
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(level)
        console = logging.StreamHandler()
        self.logger.addHandler(console)
        for handler in self.logger.handlers:
            handler.setLevel(level)

        self.eval_statistics = None

        self._return_fig = None
        self._return_axs = None
        self._return_lines = [None for _ in range(self.n_test_conds)]

        # MDGPS data #
        # ---------- #
        for m in range(self.M):
            # Same policy prior type for all conditions
            self.cur[m].pol_info = PolicyInfo(
                T=self.T,
                dU=self.dU,
                dX=self.dX,
                init_pol_wt=0.01,
            )
            self.cur[m].pol_info.policy_prior = ConstantPolicyPrior()
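The per-epoch exploration noise above is pre-generated with generate_noise(self.T, self.dU, noise_hyperparams). A hypothetical sketch of such a generator (smoothed Gaussian noise of shape (T, dU); the project's actual implementation and its hyperparameter names are assumptions here):

import numpy as np
from scipy.ndimage import gaussian_filter1d

def generate_noise(T, dU, hyperparams=None):
    """Return a (T, dU) array of exploration noise (illustrative sketch)."""
    hyperparams = hyperparams or {}
    var = hyperparams.get('noise_var_scale', 1.0)    # assumed key name
    smooth = hyperparams.get('smooth_noise', False)  # assumed key name

    noise = np.random.randn(T, dU) * np.sqrt(var)
    if smooth:
        sigma = hyperparams.get('smooth_noise_var', 2.0)  # assumed key name
        noise = gaussian_filter1d(noise, sigma=sigma, axis=0)
        # Re-normalize so the smoothed noise keeps the requested variance
        noise *= np.sqrt(var) / (noise.std(axis=0, keepdims=True) + 1e-8)
    return noise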
Example 11: constructor of a single-Q-function, entropy-regularized actor-critic algorithm.
    def __init__(
            self,
            env,
            policy,
            qf,
            replay_buffer,
            normalize_obs=False,
            eval_env=None,
            action_prior='uniform',
            entropy_scale=1.,
            policy_lr=1e-4,
            qf_lr=1e-3,
            policy_weight_decay=0,
            qf_weight_decay=0,
            residual_gradient_weight=0,
            epoch_discount_schedule=None,
            policy_mean_regu_weight=1e-3,
            policy_std_regu_weight=1e-3,
            policy_pre_activation_weight=0.,
            optimizer='adam',
            # optimizer='rmsprop',
            # optimizer='sgd',
            optimizer_kwargs=None,
            target_hard_update_period=1000,
            tau=1e-2,
            use_soft_update=False,
            save_replay_buffer=False,
            eval_deterministic=True,
            log_tensorboard=False,
            **kwargs):

        # ###### #
        # Models #
        # ###### #

        # Exploration Policy
        self._policy = policy

        # Evaluation Policy
        if eval_deterministic:
            eval_policy = MakeDeterministic(self._policy)
        else:
            eval_policy = self._policy

        # Observation Normalizer
        if normalize_obs:
            self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
        else:
            self._obs_normalizer = None

        RLAlgorithm.__init__(self,
                             env=env,
                             exploration_policy=self._policy,
                             eval_env=eval_env,
                             eval_policy=eval_policy,
                             obs_normalizer=self._obs_normalizer,
                             **kwargs)

        # Important algorithm hyperparameters
        self._action_prior = action_prior
        self._entropy_scale = entropy_scale

        # Q-function
        self._qf = qf

        # ########## #
        # Optimizers #
        # ########## #
        if optimizer.lower() == 'adam':
            optimizer_class = optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True,
                                        # amsgrad=False,
                                        )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Wrong optimizer')

        # Q-function optimizer
        self._qf_optimizer = optimizer_class(
            self._qf.parameters(),
            lr=qf_lr,
            weight_decay=qf_weight_decay,
            **optimizer_kwargs)

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self._policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs)

        # Policy regularization coefficients (weights)
        self._policy_mean_regu_weight = policy_mean_regu_weight
        self._policy_std_regu_weight = policy_std_regu_weight
        self._policy_pre_activation_weight = policy_pre_activation_weight

        # Useful Variables for logging
        self.logging_pol_kl_loss = np.zeros(self.num_train_steps_per_epoch)
        self.logging_qf_loss = np.zeros(self.num_train_steps_per_epoch)
        self.logging_rewards = np.zeros(self.num_train_steps_per_epoch)
        self.logging_policy_entropy = np.zeros(self.num_train_steps_per_epoch)
        self.logging_policy_log_std = np.zeros(
            (self.num_train_steps_per_epoch, self.explo_env.action_dim))
        self.logging_policy_mean = np.zeros(
            (self.num_train_steps_per_epoch, self.explo_env.action_dim))

        # Tensorboard-like Logging
        self._log_tensorboard = log_tensorboard
        if log_tensorboard:
            self._summary_writer = \
                tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
        else:
            self._summary_writer = None
Example 12: evaluate() delegates directly to the base class.
    def evaluate(self, epoch):
        RLAlgorithm.evaluate(self, epoch)
Example 13: constructor of a deterministic-policy actor-critic algorithm with target networks, replay buffer and optimizers.
    def __init__(
            self,
            explo_env,
            qf,
            policy,
            explo_policy,

            replay_buffer,
            batch_size=1024,
            eval_env=None,

            target_hard_update_period=1000,
            tau=1e-2,
            use_soft_update=False,
            qf_criterion=None,
            residual_gradient_weight=0,
            epoch_discount_schedule=None,
            eval_with_target_policy=False,

            policy_pre_activation_weight=0.,

            policy_lr=1e-4,
            qf_lr=1e-3,

            policy_weight_decay=0.,
            qf_weight_decay=0,

            optimizer='adam',
            # optimizer='rmsprop',
            # optimizer='sgd',
            optimizer_kwargs=None,

            obs_normalizer: TorchFixedNormalizer=None,
            action_normalizer: TorchFixedNormalizer=None,
            num_paths_for_normalization=0,

            reward_scale=1.,

            min_q_value=-np.inf,
            max_q_value=np.inf,

            save_replay_buffer=False,
            **kwargs
    ):
        """

        :param explo_env:
        :param qf:
        :param policy:
        :param explo_policy:
        :param policy_lr:
        :param qf_lr:
        :param qf_weight_decay:
        :param target_hard_update_period:
        :param tau:
        :param use_soft_update:
        :param qf_criterion: Loss function to use for the q function. Should
        be a function that takes in two inputs (y_predicted, y_target).
        :param residual_gradient_weight: c, float between 0 and 1. The gradient
        used for training the Q function is then
            (1-c) * normal td gradient + c * residual gradient
        :param epoch_discount_schedule: A schedule for the discount factor
        that varies with the epoch.
        :param kwargs:
        """
        self._target_policy = policy.copy()
        if eval_with_target_policy:
            eval_policy = self._target_policy
        else:
            eval_policy = policy
        RLAlgorithm.__init__(
            self,
            explo_env=explo_env,
            explo_policy=explo_policy,
            eval_env=eval_env,
            eval_policy=eval_policy,
            **kwargs
        )
        self.policy = policy
        self.target_hard_update_period = target_hard_update_period
        self.tau = tau
        self.use_soft_update = use_soft_update
        self.residual_gradient_weight = residual_gradient_weight
        self.policy_pre_activation_weight = policy_pre_activation_weight
        self.epoch_discount_schedule = epoch_discount_schedule
        self.obs_normalizer = obs_normalizer
        self.action_normalizer = action_normalizer
        self.num_paths_for_normalization = num_paths_for_normalization
        self.reward_scale = reward_scale

        # Q-function
        self._qf = qf
        self._target_qf = self._qf.copy()
        self.min_q_value = min_q_value
        self.max_q_value = max_q_value
        if qf_criterion is None:
            qf_criterion = nn.MSELoss()
        self.qf_criterion = qf_criterion

        # Replay Buffer
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.save_replay_buffer = save_replay_buffer

        # ########## #
        # Optimizers #
        # ########## #
        if optimizer.lower() == 'adam':
            optimizer_class = optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(
                    amsgrad=True,
                    # amsgrad=False,
                )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Wrong optimizer')
        self._qf_lr = qf_lr
        self._policy_lr = policy_lr
        self._qf_weight_decay = qf_weight_decay
        self._policy_weight_decay = policy_weight_decay

        # Q-function optimizer
        self._qf_optimizer = optimizer_class(
            self._qf.parameters(),
            lr=qf_lr,
            weight_decay=qf_weight_decay,
            **optimizer_kwargs
        )

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self.policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs
        )

        # Useful Variables for logging
        self.log_data = dict()
        self.log_data['Raw Pol Loss'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Pol Loss'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Qf Loss'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Q pred'] = np.zeros(
            (self.num_train_steps_per_epoch, batch_size)
        )
        self.log_data['Q target'] = np.zeros(
            (self.num_train_steps_per_epoch, batch_size)
        )
        self.log_data['Bellman Error'] = np.zeros(
            (self.num_train_steps_per_epoch, batch_size)
        )
        self.log_data['Policy Actions'] = np.zeros(
            (self.num_train_steps_per_epoch, batch_size, self.explo_env.action_dim)
        )
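Example 13 keeps both tau/use_soft_update and target_hard_update_period, i.e. the target networks can be updated either by Polyak averaging every step or by a hard copy every N training steps. A sketch of that pattern in generic PyTorch (the base class may implement it differently):

import torch

def update_target_network(net, target_net, use_soft_update, tau,
                          n_train_steps, hard_update_period):
    if use_soft_update:
        # Polyak averaging: target <- (1 - tau) * target + tau * net
        with torch.no_grad():
            for p, tp in zip(net.parameters(), target_net.parameters()):
                tp.data.mul_(1.0 - tau).add_(p.data, alpha=tau)
    elif n_train_steps % hard_update_period == 0:
        # Hard update: copy the weights every `hard_update_period` steps
        target_net.load_state_dict(net.state_dict())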
Example 14: constructor of a multi-head soft actor-critic variant with per-task entropy scales, temperatures and reward scales.
    def __init__(
            self,
            env,
            policy,
            u_qf1,
            replay_buffer,
            batch_size=1024,
            normalize_obs=False,
            eval_env=None,
            i_qf1=None,
            u_qf2=None,
            i_qf2=None,
            i_vf=None,
            u_vf=None,
            action_prior='uniform',
            i_entropy_scale=1.,
            u_entropy_scale=None,
            auto_alpha=True,
            i_tgt_entro=None,
            u_tgt_entros=None,
            policy_lr=3e-4,
            qf_lr=3e-4,
            i_policy_mean_regu_weight=1e-3,
            i_policy_std_regu_weight=1e-3,
            i_policy_pre_activation_weight=0.,
            i_policy_mixing_coeff_weight=1e-3,
            u_policy_mean_regu_weight=None,
            u_policy_std_regu_weight=None,
            u_policy_pre_activation_weight=None,
            policy_weight_decay=0.,
            q_weight_decay=0.,
            optimizer='adam',
            # optimizer='rmsprop',
            # optimizer='sgd',
            optimizer_kwargs=None,
            i_soft_target_tau=5e-3,
            u_soft_target_tau=5e-3,
            i_target_update_interval=1,
            u_target_update_interval=1,
            reward_scale=1.,
            u_reward_scales=None,
            save_replay_buffer=False,
            eval_deterministic=True,
            log_tensorboard=False,
            **kwargs):

        # ###### #
        # Models #
        # ###### #

        # Exploration Policy
        self._policy = policy

        # Evaluation Policy
        if eval_deterministic:
            eval_policy = MakeDeterministic(self._policy)
        else:
            eval_policy = self._policy

        # Observation Normalizer
        if normalize_obs:
            self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
        else:
            self._obs_normalizer = None

        RLAlgorithm.__init__(self,
                             explo_env=env,
                             explo_policy=self._policy,
                             eval_env=eval_env,
                             eval_policy=eval_policy,
                             obs_normalizer=self._obs_normalizer,
                             **kwargs)

        # Number of Unintentional Tasks (Composable Tasks)
        self._n_unintentional = self._policy.n_heads

        # Evaluation Sampler (One for each unintentional)
        self.eval_u_samplers = [
            InPlacePathSampler(
                env=env,
                policy=WeightedMultiPolicySelector(self._policy, idx),
                total_samples=self.num_steps_per_eval,
                max_path_length=self.max_path_length,
                deterministic=True,
            ) for idx in range(self._n_unintentional)
        ]

        # Intentional (Main Task) Q-functions
        self._i_qf1 = i_qf1
        self._i_qf2 = i_qf2
        if i_vf is None:
            self._i_vf = None
            self._i_target_vf = None
            self._i_target_qf1 = self._i_qf1.copy()
            self._i_target_qf2 = \
                None if self._i_qf2 is None else self._i_qf2.copy()
        else:
            self._i_vf = i_vf
            self._i_target_vf = self._i_vf.copy()
            self._i_target_qf1 = None
            self._i_target_qf2 = None

        # Unintentional (Composable Tasks) Q-functions
        self._u_qf1 = u_qf1
        self._u_qf2 = u_qf2
        if u_vf is None:
            self._u_vf = None
            self._u_target_vf = None
            self._u_target_qf1 = self._u_qf1.copy()
            self._u_target_qf2 = \
                None if self._u_qf2 is None else self._u_qf2.copy()
        else:
            self._u_vf = u_vf
            self._u_target_vf = self._u_vf.copy()
            self._u_target_qf1 = None
            self._u_target_qf2 = None

        # Replay Buffer
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.save_replay_buffer = save_replay_buffer

        # Soft-update rate for target V-functions
        self._i_soft_target_tau = i_soft_target_tau
        self._u_soft_target_tau = u_soft_target_tau
        self._i_target_update_interval = i_target_update_interval
        self._u_target_update_interval = u_target_update_interval

        # Important algorithm hyperparameters
        self._action_prior = action_prior
        self._i_entropy_scale = i_entropy_scale
        if u_entropy_scale is None:
            u_entropy_scale = [
                i_entropy_scale for _ in range(self._n_unintentional)
            ]
        self._u_entropy_scale = torch.tensor(u_entropy_scale,
                                             dtype=torch.float32,
                                             device=ptu.device)

        # Desired Alphas
        self._auto_alphas = auto_alpha
        if i_tgt_entro is None:
            i_tgt_entro = -env.action_dim
        self._i_tgt_entro = torch.tensor([i_tgt_entro],
                                         dtype=torch.float32,
                                         device=ptu.device)
        if u_tgt_entros is None:
            u_tgt_entros = [i_tgt_entro for _ in range(self._n_unintentional)]
        self._u_tgt_entros = torch.tensor(u_tgt_entros,
                                          dtype=torch.float32,
                                          device=ptu.device)
        self._u_log_alphas = torch.zeros(self._n_unintentional,
                                         device=ptu.device,
                                         requires_grad=True)
        self._i_log_alpha = torch.zeros(1,
                                        device=ptu.device,
                                        requires_grad=True)

        # Reward Scales
        self.reward_scale = reward_scale
        if u_reward_scales is None:
            # Default the unintentional reward scales to the main reward scale
            u_reward_scales = [
                reward_scale for _ in range(self._n_unintentional)
            ]
        self._u_reward_scales = torch.tensor(u_reward_scales,
                                             dtype=torch.float32,
                                             device=ptu.device)

        # ########## #
        # Optimizers #
        # ########## #
        if optimizer.lower() == 'adam':
            optimizer_class = optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True,
                                        # amsgrad=False,
                                        )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Wrong optimizer')

        # Values optimizer
        vals_params_list = [self._u_qf1.parameters(), self._i_qf1.parameters()]
        if self._u_qf2 is not None:
            vals_params_list.append(self._u_qf2.parameters())
        if self._i_qf2 is not None:
            vals_params_list.append(self._i_qf2.parameters())
        if self._u_vf is not None:
            vals_params_list.append(self._u_vf.parameters())
        if self._i_vf is not None:
            vals_params_list.append(self._i_vf.parameters())
        vals_params = chain(*vals_params_list)

        self._values_optimizer = optimizer_class(vals_params,
                                                 lr=qf_lr,
                                                 weight_decay=q_weight_decay,
                                                 **optimizer_kwargs)

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self._policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs)

        # Alpha optimizers
        self._alphas_optimizer = optimizer_class(
            [self._u_log_alphas, self._i_log_alpha],
            lr=policy_lr,
            **optimizer_kwargs)

        # Weights for policy regularization coefficients
        self._i_pol_mean_regu_weight = i_policy_mean_regu_weight
        self._i_pol_std_regu_weight = i_policy_std_regu_weight
        self._i_pol_pre_activ_weight = i_policy_pre_activation_weight
        self._i_pol_mixing_coeff_weight = i_policy_mixing_coeff_weight

        if u_policy_mean_regu_weight is None:
            u_policy_mean_regu_weight = [
                i_policy_mean_regu_weight for _ in range(self._n_unintentional)
            ]
        self._u_policy_mean_regu_weight = \
            torch.tensor(u_policy_mean_regu_weight, dtype=torch.float32,
                         device=ptu.device)
        if u_policy_std_regu_weight is None:
            u_policy_std_regu_weight = [
                i_policy_std_regu_weight for _ in range(self._n_unintentional)
            ]
        self._u_policy_std_regu_weight = \
            torch.tensor(u_policy_std_regu_weight, dtype=torch.float32,
                         device=ptu.device)
        if u_policy_pre_activation_weight is None:
            u_policy_pre_activation_weight = [
                i_policy_pre_activation_weight
                for _ in range(self._n_unintentional)
            ]
        self._u_policy_pre_activ_weight = \
            torch.tensor(u_policy_pre_activation_weight, dtype=torch.float32,
                         device=ptu.device)

        # Useful Variables for logging
        self.log_data = dict()
        self.log_data['Pol KL Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Qf Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Qf2 Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Vf Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Rewards'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Policy Entropy'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Policy Mean'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
            self.explo_env.action_dim,
        ))
        self.log_data['Pol Log Std'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
            self.explo_env.action_dim,
        ))
        self.log_data['Mixing Weights'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional,
            self.explo_env.action_dim,
        ))
        self.log_data['Alphas'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))

        # Tensorboard-like Logging
        self._log_tensorboard = log_tensorboard
        if log_tensorboard:
            self._summary_writer = \
                tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
        else:
            self._summary_writer = None
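In Examples 8 and 14 the value-function parameters are combined with itertools.chain and handed to a single optimizer. Note that chain() returns a one-shot iterator, so it has to be built right before constructing the optimizer, which materializes it into parameter groups. A minimal self-contained sketch (the toy nn.Linear modules are placeholders for the actual Q- and V-functions):

from itertools import chain

import torch.nn as nn
import torch.optim as optim

qf1 = nn.Linear(4, 1)
qf2 = nn.Linear(4, 1)
vf = nn.Linear(4, 1)

# The chain iterator is consumed once, here, by the optimizer constructor
values_optimizer = optim.Adam(
    chain(qf1.parameters(), qf2.parameters(), vf.parameters()),
    lr=3e-4,
    weight_decay=0.0,
    amsgrad=True,
)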