Example #1
    def __init__(self,
                 env,
                 policy,
                 policy_lr=1e-3,
                 optimizer_class=optim.Adam,
                 causality=True,
                 discounted=False,
                 plotter=None,
                 eval_deterministic=True,
                 **kwargs):
        """

        Args:
            env:
            qf (`robolearn.PyTorchModule`): Q-function approximator.
            policy (`robolearn.PyTorchModule`):
            policy_lr (`float`): Learning rate used for the Policy approximator.
            plotter (`MultiQFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.
            eval_deterministic: Evaluate with deterministic version of current
                _i_policy.
            **kwargs:
        """
        if eval_deterministic:
            eval_policy = MakeDeterministic(policy)
        else:
            eval_policy = policy
        super(Reinforce, self).__init__(env=env,
                                        exploration_policy=policy,
                                        eval_policy=eval_policy,
                                        **kwargs)
        self.policy = policy

        self.plotter = plotter

        # Env data
        self._action_dim = self.explo_env.action_space.low.size
        self._obs_dim = self.explo_env.observation_space.low.size

        # Optimize Policy
        self.policy_optimizer = optimizer_class(
            self.policy.parameters(),
            lr=policy_lr,
        )

        # Return computation
        self._causality = causality
        self.discounted = discounted
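
The `causality` and `discounted` flags stored at the end of the constructor control how the return targets for the REINFORCE gradient are built. A minimal sketch of that computation, assuming the usual reward-to-go interpretation (the function name and the `gamma` argument are illustrative, not taken from the snippet):

import numpy as np

def compute_return_weights(rewards, causality=True, discounted=False, gamma=0.99):
    # Per-timestep weights for the log-prob terms in the REINFORCE objective.
    rewards = np.asarray(rewards, dtype=np.float64)
    factors = gamma ** np.arange(len(rewards)) if discounted else np.ones(len(rewards))
    weighted = rewards * factors
    if causality:
        # Reward-to-go: each action only gets credit for rewards that follow it.
        return np.cumsum(weighted[::-1])[::-1]
    # Otherwise every timestep is weighted by the full trajectory return.
    return np.full(len(rewards), weighted.sum())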
Example #2
def simulate_policy(args):

    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional policy '
                  '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un)
                    )
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])

    env_params.pop('goal', None)
    env_params['is_render'] = True

    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('pusher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        # plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        if args.record:
            break
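
For context, a hypothetical command-line entry point that would feed simulate_policy(); the flag names mirror the attributes the function reads (file, deterministic, un, subtask, gpu, record, H), but the repository's actual argument parser is not shown in this snippet:

import argparse

if __name__ == '__main__':
    # Hypothetical parser; defaults chosen to match how the attributes are
    # tested inside simulate_policy() (un == -1 and H == -1 mean "not set").
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='Path to the joblib snapshot.')
    parser.add_argument('--deterministic', action='store_true')
    parser.add_argument('--un', type=int, default=-1,
                        help='Index of the unintentional (sub-task) policy.')
    parser.add_argument('--subtask', action='store_true')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--record', action='store_true')
    parser.add_argument('--H', type=int, default=-1,
                        help='Override the horizon read from variant.json.')
    simulate_policy(parser.parse_args())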
Example #3
    def __init__(
            self,
            explo_env,
            policy,
            qf,
            replay_buffer,
            batch_size=1024,
            normalize_obs=False,
            eval_env=None,
            vf=None,
            qf2=None,
            action_prior='uniform',
            entropy_scale=1.,
            auto_alpha=True,
            tgt_entro=None,
            policy_lr=3e-4,
            qf_lr=3e-4,
            policy_mean_regu_weight=1e-3,
            policy_std_regu_weight=1e-3,
            policy_pre_activation_weight=0.,
            policy_weight_decay=0.,
            q_weight_decay=0.,
            optimizer='adam',
            # optimizer='rmsprop',
            # optimizer='sgd',
            optimizer_kwargs=None,
            soft_target_tau=5e-3,
            target_update_interval=1,
            reward_scale=1.,
            save_replay_buffer=False,
            eval_deterministic=True,
            log_tensorboard=False,
            **kwargs):

        # ###### #
        # Models #
        # ###### #

        # Exploration Policy
        self._policy = policy

        # Evaluation Policy
        if eval_deterministic:
            eval_policy = MakeDeterministic(self._policy)
        else:
            eval_policy = self._policy

        # Observation Normalizer
        if normalize_obs:
            self._obs_normalizer = RunningNormalizer(shape=explo_env.obs_dim)
        else:
            self._obs_normalizer = None

        RLAlgorithm.__init__(self,
                             explo_env=explo_env,
                             explo_policy=self._policy,
                             eval_env=eval_env,
                             eval_policy=eval_policy,
                             obs_normalizer=self._obs_normalizer,
                             **kwargs)

        # Q-function(s) and V-function
        self._qf = qf
        self._qf2 = qf2

        if vf is None:
            self._vf = None
            self._target_vf = None
            self._target_qf1 = qf.copy()
            self._target_qf2 = None if qf2 is None else qf2.copy()
        else:
            self._vf = vf
            self._target_vf = vf.copy()
            self._target_qf1 = None
            self._target_qf2 = None

        # Replay Buffer
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.save_replay_buffer = save_replay_buffer

        # Soft-update rate for target V-function
        self._soft_target_tau = soft_target_tau
        self._target_update_interval = target_update_interval

        # Important algorithm hyperparameters
        self._action_prior = action_prior
        self._entropy_scale = entropy_scale

        # Desired Alpha
        self._auto_alpha = auto_alpha
        if tgt_entro is None:
            tgt_entro = -explo_env.action_dim
        self._tgt_entro = torch.tensor([float(tgt_entro)], device=ptu.device)
        self._log_alpha = torch.zeros(1, device=ptu.device, requires_grad=True)

        # Reward Scale
        self.reward_scale = reward_scale

        # ########## #
        # Optimizers #
        # ########## #
        if optimizer.lower() == 'adam':
            optimizer_class = optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True,
                                        # amsgrad=False,
                                        )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Wrong optimizer')
        self.qf_lr = qf_lr
        self.policy_lr = policy_lr

        # Q-function(s) optimizer(s)
        self._qf1_optimizer = optimizer_class(self._qf.parameters(),
                                              lr=qf_lr,
                                              weight_decay=q_weight_decay,
                                              **optimizer_kwargs)
        values_parameters = self._qf.parameters()
        if self._qf2 is None:
            self._qf2_optimizer = None
        else:
            self._qf2_optimizer = optimizer_class(self._qf2.parameters(),
                                                  lr=qf_lr,
                                                  weight_decay=q_weight_decay,
                                                  **optimizer_kwargs)
            values_parameters = chain(values_parameters,
                                      self._qf2.parameters())

        # V-function optimizer
        if self._vf is None:
            self._vf_optimizer = None
        else:
            self._vf_optimizer = optimizer_class(self._vf.parameters(),
                                                 lr=qf_lr,
                                                 weight_decay=q_weight_decay,
                                                 **optimizer_kwargs)
            values_parameters = chain(values_parameters, self._vf.parameters())
        self._values_optimizer = optimizer_class(values_parameters,
                                                 lr=qf_lr,
                                                 weight_decay=q_weight_decay,
                                                 **optimizer_kwargs)

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self._policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs)

        # Alpha optimizer
        self._alpha_optimizer = optimizer_class([self._log_alpha],
                                                lr=policy_lr,
                                                **optimizer_kwargs)

        # Weights for policy regularization coefficients
        self.pol_mean_regu_weight = policy_mean_regu_weight
        self.pol_std_regu_weight = policy_std_regu_weight
        self.pol_pre_activation_weight = policy_pre_activation_weight

        # Useful Variables for logging
        self.log_data = dict()
        self.log_data['Pol KL Loss'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Qf Loss'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Qf2 Loss'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Vf Loss'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Rewards'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Pol Entropy'] = np.zeros(self.num_train_steps_per_epoch)
        self.log_data['Pol Log Std'] = np.zeros((
            self.num_train_steps_per_epoch,
            self.explo_env.action_dim,
        ))
        self.log_data['Policy Mean'] = np.zeros((
            self.num_train_steps_per_epoch,
            self.explo_env.action_dim,
        ))
        self.log_data['Alphas'] = np.zeros(self.num_train_steps_per_epoch)

        # Tensorboard-like Logging
        self._log_tensorboard = log_tensorboard
        if log_tensorboard:
            self._summary_writer = \
                tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
        else:
            self._summary_writer = None
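
The constructor only creates the trainable `_log_alpha` and its optimizer; a minimal sketch of how such a temperature parameter is typically updated in soft actor-critic style training (the helper function is illustrative and not part of the class shown above):

import torch

def alpha_update_step(log_alpha, alpha_optimizer, log_pi, tgt_entro):
    # Standard SAC temperature loss: push alpha up when the policy entropy
    # (-log_pi on average) drops below the target entropy, and down otherwise.
    alpha_loss = -(log_alpha * (log_pi + tgt_entro).detach()).mean()
    alpha_optimizer.zero_grad()
    alpha_loss.backward()
    alpha_optimizer.step()
    return log_alpha.exp().detach()  # current alpha used to scale entropy terms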
Example #4
def simulate_policy(args):

    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print(
                'Using the deterministic version of the UNintentional policy '
                '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])
    env_params['is_render'] = True

    if 'obs_mean' in data.keys():
        obs_mean = data['obs_mean']
        print('OBS_MEAN')
        print(repr(obs_mean))
    else:
        obs_mean = None
        # obs_mean = np.array([ 0.07010766,  0.37585765,  0.21402615,  0.24426296,  0.5789634 ,
        #                       0.88510203,  1.6878743 ,  0.02656335,  0.03794186, -1.0241051 ,
        #                       -0.5226027 ,  0.6198239 ,  0.49062446,  0.01197532,  0.7888951 ,
        #                       -0.4857273 ,  0.69160587, -0.00617676,  0.08966777, -0.14694819,
        #                       0.9559917 ,  1.0450271 , -0.40958315,  0.86435956,  0.00609685,
        #                       -0.01115279, -0.21607827,  0.9762933 ,  0.80748135, -0.48661205,
        #                       0.7473679 ,  0.01649722,  0.15451911, -0.17285274,  0.89978695])

    if 'obs_var' in data.keys():
        obs_var = data['obs_var']
        print('OBS_VAR')
        print(repr(obs_var))
    else:
        obs_var = None
        # obs_var = np.array([0.10795759, 0.12807205, 0.9586606 , 0.46407   , 0.8994803 ,
        #                     0.35167143, 0.30286264, 0.34667444, 0.35105848, 1.9919134 ,
        #                     0.9462659 , 2.245269  , 0.84190637, 1.5407104 , 0.1       ,
        #                     0.10330457, 0.1       , 0.1       , 0.1       , 0.1528581 ,
        #                     0.1       , 0.1       , 0.1       , 0.1       , 0.1       ,
        #                     0.1       , 0.1       , 0.1       , 0.1       , 0.12320185,
        #                     0.1       , 0.18369523, 0.200373  , 0.11895574, 0.15118493])
    print(env_params)

    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un
    # else:
    #     env_params['subtask'] = None

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('centauro_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        if args.record:
            break
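
plot_rollout_reward() is not included in this snippet; a plausible minimal version, assuming the path dictionary returned by rollout() carries a per-step 'rewards' array:

import matplotlib.pyplot as plt
import numpy as np

def plot_rollout_reward(path):
    # Plot the per-step reward of one rollout (assumes path['rewards'] exists).
    rewards = np.asarray(path['rewards']).squeeze()
    plt.figure()
    plt.plot(rewards)
    plt.xlabel('Time step')
    plt.ylabel('Reward')
    plt.title('Rollout reward (return: %.3f)' % rewards.sum())
    plt.show()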
Example #5
def simulate_policy(args):

    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        print('Using the deterministic version of the policy.')
        if isinstance(data['policy'], ExplorationPolicy):
            policy = MakeDeterministic(data['policy'])
        else:
            policy = data['policy']
    else:
        print('Using the stochastic policy.')
        policy = data['exploration_policy']

    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']

    env_params['is_render'] = True
    env = NormalizedBoxEnv(
        Reacher2D3DofBulletEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('reacher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        if args.record:
            break
Example #6
    def __init__(
            self,
            env,
            policy,
            qf,
            replay_buffer,
            normalize_obs=False,
            eval_env=None,
            action_prior='uniform',
            entropy_scale=1.,
            policy_lr=1e-4,
            qf_lr=1e-3,
            policy_weight_decay=0,
            qf_weight_decay=0,
            residual_gradient_weight=0,
            epoch_discount_schedule=None,
            policy_mean_regu_weight=1e-3,
            policy_std_regu_weight=1e-3,
            policy_pre_activation_weight=0.,
            optimizer='adam',
            # optimizer='rmsprop',
            # optimizer='sgd',
            optimizer_kwargs=None,
            target_hard_update_period=1000,
            tau=1e-2,
            use_soft_update=False,
            save_replay_buffer=False,
            eval_deterministic=True,
            log_tensorboard=False,
            **kwargs):

        # ###### #
        # Models #
        # ###### #

        # Exploration Policy
        self._policy = policy

        # Evaluation Policy
        if eval_deterministic:
            eval_policy = MakeDeterministic(self._policy)
        else:
            eval_policy = self._policy

        # Observation Normalizer
        if normalize_obs:
            self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
        else:
            self._obs_normalizer = None

        RLAlgorithm.__init__(self,
                             env=env,
                             exploration_policy=self._policy,
                             eval_env=eval_env,
                             eval_policy=eval_policy,
                             obs_normalizer=self._obs_normalizer,
                             **kwargs)

        # Important algorithm hyperparameters
        self._action_prior = action_prior
        self._entropy_scale = entropy_scale

        # Q-function
        self._qf = qf

        # ########## #
        # Optimizers #
        # ########## #
        if optimizer.lower() == 'adam':
            optimizer_class = optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True,
                                        # amsgrad=False,
                                        )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Wrong optimizer')

        # Q-function(s) optimizer(s)
        self._qf_optimizer = optimizer_class(self._qf.parameters(),
                                             lr=qf_lr,
                                             weight_decay=0,
                                             **optimizer_kwargs)

        # Policy optimizer
        self._policy_optimizer = optimizer_class(self._policy.parameters(),
                                                 lr=policy_lr,
                                                 weight_decay=0,
                                                 **optimizer_kwargs)

        # Policy regularization coefficients (weights)
        self._policy_mean_regu_weight = policy_mean_regu_weight
        self._policy_std_regu_weight = policy_std_regu_weight
        self._policy_pre_activation_weight = policy_pre_activation_weight

        # Useful Variables for logging
        self.logging_pol_kl_loss = np.zeros(self.num_train_steps_per_epoch)
        self.logging_qf_loss = np.zeros(self.num_train_steps_per_epoch)
        self.logging_rewards = np.zeros(self.num_train_steps_per_epoch)
        self.logging_policy_entropy = np.zeros(self.num_train_steps_per_epoch)
        self.logging_policy_log_std = np.zeros(
            (self.num_train_steps_per_epoch, self.explo_env.action_dim))
        self.logging_policy_mean = np.zeros(
            (self.num_train_steps_per_epoch, self.explo_env.action_dim))

        self._log_tensorboard = log_tensorboard
        self._summary_writer = tensorboardX.SummaryWriter(
            log_dir=logger.get_snapshot_dir())
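
The use_soft_update, tau and target_hard_update_period arguments suggest the usual choice between Polyak and hard target-network updates; a generic sketch of that step (the helper and its wiring into the training loop are assumptions, not shown in the snippet):

import torch

def update_target_network(net, target_net, use_soft_update, tau,
                          hard_update_period, n_train_steps):
    # Polyak-average into the target when soft updates are enabled, otherwise
    # copy the weights every hard_update_period gradient steps.
    if use_soft_update:
        with torch.no_grad():
            for p, p_tgt in zip(net.parameters(), target_net.parameters()):
                p_tgt.mul_(1.0 - tau).add_(tau * p)
    elif n_train_steps % hard_update_period == 0:
        target_net.load_state_dict(net.state_dict())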
Example #7
    def __init__(self,
                 env,
                 qf,
                 policy,
                 qf_lr=1e-3,
                 policy_lr=1e-3,
                 optimizer_class=optim.Adam,
                 use_hard_updates=False,
                 hard_update_period=1000,
                 soft_target_tau=0.001,
                 value_n_particles=16,
                 kernel_fn=adaptive_isotropic_gaussian_kernel,
                 kernel_n_particles=16,
                 kernel_update_ratio=0.5,
                 epoch_plotter=None,
                 eval_deterministic=True,
                 **kwargs):
        """

        Args:
            env:
            qf (`robolearn.PyTorchModule`): Q-function approximator.
            policy (`robolearn.PyTorchModule`):
            qf_lr (`float`): Learning rate used for the Q-function approximator.
            use_hard_updates (`bool`): Use a hard rather than soft update.
            hard_update_period (`int`): How many gradient steps before copying
                the parameters over. Used if `use_hard_updates` is True.
            soft_target_tau (`float`): Soft target tau to update target QF.
                Used if `use_hard_updates` is False.
            value_n_particles (`int`): The number of action samples used for
                estimating the value of next state.
            kernel_fn (function object): A function object that represents
                a kernel function.
            kernel_n_particles (`int`): Total number of particles per state
                used in SVGD updates.
            epoch_plotter (`MultiQFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.
            eval_deterministic: Evaluate with the deterministic version of the
                current policy.
            **kwargs:
        """
        if eval_deterministic:
            eval_policy = MakeDeterministic(policy)
        else:
            eval_policy = policy
        super(SQL, self).__init__(env=env,
                                  exploration_policy=policy,
                                  eval_policy=eval_policy,
                                  **kwargs)
        self.policy = policy
        self.qf = qf
        self.target_qf = self.qf.copy()

        self._epoch_plotter = epoch_plotter

        # Env data
        self._action_dim = self.explo_env.action_space.low.size
        self._obs_dim = self.explo_env.observation_space.low.size

        # Optimize Q-fcn
        self.qf_optimizer = optimizer_class(
            self.qf.parameters(),
            lr=qf_lr,
        )
        self._value_n_particles = value_n_particles

        # Optimize Policy
        self.policy_optimizer = optimizer_class(
            self.policy.parameters(),
            lr=policy_lr,
        )

        self._kernel_n_particles = kernel_n_particles
        self._kernel_update_ratio = kernel_update_ratio
        self._kernel_fn = kernel_fn

        # Optimize target Q-fcn
        self.use_hard_updates = use_hard_updates
        self.hard_update_period = hard_update_period
        self.soft_target_tau = soft_target_tau
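
As a note on kernel_n_particles and kernel_update_ratio: in SVGD-based soft Q-learning the sampled action particles are typically split into a "fixed" set that defines the kernel and an "updated" set that receives the gradient. A small sketch of that split (illustrative, not copied from the class):

def split_svgd_particles(kernel_n_particles=16, kernel_update_ratio=0.5):
    # Number of particles moved by the SVGD step versus the number kept
    # fixed to evaluate the kernel against.
    n_updated = int(kernel_n_particles * kernel_update_ratio)
    n_fixed = kernel_n_particles - n_updated
    return n_fixed, n_updated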
Example #8
    def __init__(
            self,
            env,
            policy,
            u_qf1,
            replay_buffer,
            batch_size=1024,
            normalize_obs=False,
            eval_env=None,
            i_qf1=None,
            u_qf2=None,
            i_qf2=None,
            i_vf=None,
            u_vf=None,
            action_prior='uniform',
            i_entropy_scale=1.,
            u_entropy_scale=None,
            auto_alpha=True,
            i_tgt_entro=None,
            u_tgt_entros=None,
            policy_lr=3e-4,
            qf_lr=3e-4,
            i_policy_mean_regu_weight=1e-3,
            i_policy_std_regu_weight=1e-3,
            i_policy_pre_activation_weight=0.,
            i_policy_mixing_coeff_weight=1e-3,
            u_policy_mean_regu_weight=None,
            u_policy_std_regu_weight=None,
            u_policy_pre_activation_weight=None,
            policy_weight_decay=0.,
            q_weight_decay=0.,
            optimizer='adam',
            # optimizer='rmsprop',
            # optimizer='sgd',
            optimizer_kwargs=None,
            i_soft_target_tau=5e-3,
            u_soft_target_tau=5e-3,
            i_target_update_interval=1,
            u_target_update_interval=1,
            reward_scale=1.,
            u_reward_scales=None,
            save_replay_buffer=False,
            eval_deterministic=True,
            log_tensorboard=False,
            **kwargs):

        # ###### #
        # Models #
        # ###### #

        # Exploration Policy
        self._policy = policy

        # Evaluation Policy
        if eval_deterministic:
            eval_policy = MakeDeterministic(self._policy)
        else:
            eval_policy = self._policy

        # Observation Normalizer
        if normalize_obs:
            self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
        else:
            self._obs_normalizer = None

        RLAlgorithm.__init__(self,
                             explo_env=env,
                             explo_policy=self._policy,
                             eval_env=eval_env,
                             eval_policy=eval_policy,
                             obs_normalizer=self._obs_normalizer,
                             **kwargs)

        # Number of Unintentional Tasks (Composable Tasks)
        self._n_unintentional = self._policy.n_heads

        # Evaluation Sampler (One for each unintentional)
        self.eval_u_samplers = [
            InPlacePathSampler(
                env=env,
                policy=WeightedMultiPolicySelector(self._policy, idx),
                total_samples=self.num_steps_per_eval,
                max_path_length=self.max_path_length,
                deterministic=True,
            ) for idx in range(self._n_unintentional)
        ]

        # Intentional (Main Task) Q-functions
        self._i_qf1 = i_qf1
        self._i_qf2 = i_qf2
        if i_vf is None:
            self._i_vf = None
            self._i_target_vf = None
            self._i_target_qf1 = self._i_qf1.copy()
            self._i_target_qf2 = \
                None if self._i_qf2 is None else self._i_qf2.copy()
        else:
            self._i_vf = i_vf
            self._i_target_vf = self._i_vf.copy()
            self._i_target_qf1 = None
            self._i_target_qf2 = None

        # Unintentional (Composable Tasks) Q-functions
        self._u_qf1 = u_qf1
        self._u_qf2 = u_qf2
        if u_vf is None:
            self._u_vf = None
            self._u_target_vf = None
            self._u_target_qf1 = self._u_qf1.copy()
            self._u_target_qf2 = self._u_qf2.copy()
        else:
            self._u_vf = u_vf
            self._u_target_vf = self._u_vf.copy()
            self._u_target_qf1 = None
            self._u_target_qf2 = None

        # Replay Buffer
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.save_replay_buffer = save_replay_buffer

        # Soft-update rate for target V-functions
        self._i_soft_target_tau = i_soft_target_tau
        self._u_soft_target_tau = u_soft_target_tau
        self._i_target_update_interval = i_target_update_interval
        self._u_target_update_interval = u_target_update_interval

        # Important algorithm hyperparameters
        self._action_prior = action_prior
        self._i_entropy_scale = i_entropy_scale
        if u_entropy_scale is None:
            u_entropy_scale = [
                i_entropy_scale for _ in range(self._n_unintentional)
            ]
        self._u_entropy_scale = torch.tensor(u_entropy_scale,
                                             dtype=torch.float32,
                                             device=ptu.device)

        # Desired Alphas
        self._auto_alphas = auto_alpha
        if i_tgt_entro is None:
            i_tgt_entro = -env.action_dim
        self._i_tgt_entro = torch.tensor([i_tgt_entro],
                                         dtype=torch.float32,
                                         device=ptu.device)
        if u_tgt_entros is None:
            u_tgt_entros = [i_tgt_entro for _ in range(self._n_unintentional)]
        self._u_tgt_entros = torch.tensor(u_tgt_entros,
                                          dtype=torch.float32,
                                          device=ptu.device)
        self._u_log_alphas = torch.zeros(self._n_unintentional,
                                         device=ptu.device,
                                         requires_grad=True)
        self._i_log_alpha = torch.zeros(1,
                                        device=ptu.device,
                                        requires_grad=True)

        # Reward Scales
        self.reward_scale = reward_scale
        if u_reward_scales is None:
            # reward_scale is a named parameter, so it never appears in
            # **kwargs; reuse it directly as the per-task default.
            u_reward_scales = [
                reward_scale for _ in range(self._n_unintentional)
            ]
        self._u_reward_scales = torch.tensor(u_reward_scales,
                                             dtype=torch.float32,
                                             device=ptu.device)

        # ########## #
        # Optimizers #
        # ########## #
        if optimizer.lower() == 'adam':
            optimizer_class = optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True,
                                        # amsgrad=False,
                                        )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Wrong optimizer')

        # Values optimizer
        vals_params_list = [self._u_qf1.parameters(), self._i_qf1.parameters()]
        if self._u_qf2 is not None:
            vals_params_list.append(self._u_qf2.parameters())
        if self._i_qf2 is not None:
            vals_params_list.append(self._i_qf2.parameters())
        if self._u_vf is not None:
            vals_params_list.append(self._u_vf.parameters())
        if self._i_vf is not None:
            vals_params_list.append(self._i_vf.parameters())
        vals_params = chain(*vals_params_list)

        self._values_optimizer = optimizer_class(vals_params,
                                                 lr=qf_lr,
                                                 weight_decay=q_weight_decay,
                                                 **optimizer_kwargs)

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self._policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs)

        # Alpha optimizers
        self._alphas_optimizer = optimizer_class(
            [self._u_log_alphas, self._i_log_alpha],
            lr=policy_lr,
            **optimizer_kwargs)

        # Weights for policy regularization coefficients
        self._i_pol_mean_regu_weight = i_policy_mean_regu_weight
        self._i_pol_std_regu_weight = i_policy_std_regu_weight
        self._i_pol_pre_activ_weight = i_policy_pre_activation_weight
        self._i_pol_mixing_coeff_weight = i_policy_mixing_coeff_weight

        if u_policy_mean_regu_weight is None:
            u_policy_mean_regu_weight = [
                i_policy_mean_regu_weight for _ in range(self._n_unintentional)
            ]
        self._u_policy_mean_regu_weight = \
            torch.tensor(u_policy_mean_regu_weight, dtype=torch.float32,
                         device=ptu.device)
        if u_policy_std_regu_weight is None:
            u_policy_std_regu_weight = [
                i_policy_std_regu_weight for _ in range(self._n_unintentional)
            ]
        self._u_policy_std_regu_weight = \
            torch.tensor(u_policy_std_regu_weight, dtype=torch.float32,
                         device=ptu.device)
        if u_policy_pre_activation_weight is None:
            u_policy_pre_activation_weight = [
                i_policy_pre_activation_weight
                for _ in range(self._n_unintentional)
            ]
        self._u_policy_pre_activ_weight = \
            torch.tensor(u_policy_pre_activation_weight, dtype=torch.float32,
                         device=ptu.device)

        # Useful Variables for logging
        self.log_data = dict()
        self.log_data['Pol KL Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Qf Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Qf2 Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Vf Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Rewards'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Policy Entropy'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Policy Mean'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
            self.explo_env.action_dim,
        ))
        self.log_data['Pol Log Std'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
            self.explo_env.action_dim,
        ))
        self.log_data['Mixing Weights'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional,
            self.explo_env.action_dim,
        ))
        self.log_data['Alphas'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))

        # Tensorboard-like Logging
        self._log_tensorboard = log_tensorboard
        if log_tensorboard:
            self._summary_writer = \
                tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
        else:
            self._summary_writer = None
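
The intentional and unintentional temperatures above share one optimizer; a minimal sketch of a joint auto-alpha step in the same spirit as standard SAC (names and shapes follow the attributes created above, but the function itself is an assumption):

import torch

def hiu_alpha_step(i_log_alpha, u_log_alphas, alphas_optimizer,
                   i_log_pi, u_log_pis, i_tgt_entro, u_tgt_entros):
    # i_log_pi: (batch, 1) log-probs of the intentional policy.
    # u_log_pis: (batch, n_unintentional) log-probs of the sub-task heads.
    i_loss = -(i_log_alpha * (i_log_pi + i_tgt_entro).detach()).mean()
    u_loss = -(u_log_alphas * (u_log_pis + u_tgt_entros).detach()).mean()
    alphas_optimizer.zero_grad()
    (i_loss + u_loss).backward()
    alphas_optimizer.step()
    return i_log_alpha.exp().detach(), u_log_alphas.exp().detach()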
Example #9
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print(
                'Using the deterministic version of the UNintentional policy '
                '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    # MultiPolicySelector(data['u_policy'], args.un))
                    WeightedMultiPolicySelector(data['policy'], args.un))
            else:
                policy = MakeDeterministic(
                    WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            policy = MakeDeterministic(data['policy'])
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
            else:
                # policy = data['u_policies'][args.un]
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']
    env = NormalizedBoxEnv(Navigation2dGoalCompoEnv(**env_params))
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()