Code example #1
def get_config(
    variant,
    expl_env,
    eval_env,
    obs_dim,
    action_dim,
    replay_buffer,
):
    """
    Policy construction
    """

    M = variant['policy_kwargs']['layer_size']

    qf1, qf2, target_qf1, target_qf2 = ppp.group_init(
        4,
        FlattenMlp,
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )

    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'],
    )
    """
    Create config dict
    """

    config = dict()
    config.update(
        dict(
            trainer=trainer,
            exploration_policy=policy,
            evaluation_policy=MakeDeterministic(policy),
            exploration_env=expl_env,
            evaluation_env=eval_env,
            replay_buffer=replay_buffer,
        ))
    config['algorithm_kwargs'] = variant['algorithm_kwargs']

    return config
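All five examples follow the same shape: construct networks and trainers, then return a config dict for the project's runner to consume. A minimal consumer sketch for the dict built above (the key names are exactly the ones set in this function; the surrounding script is an illustrative assumption):

config = get_config(variant, expl_env, eval_env, obs_dim, action_dim, replay_buffer)
trainer = config['trainer']                # the SACTrainer built above
eval_policy = config['evaluation_policy']  # MakeDeterministic wrapper for evaluation
algo_kwargs = config['algorithm_kwargs']   # passed through from variant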
Code example #2
def get_config(
    variant,
    expl_env,
    eval_env,
    obs_dim,
    action_dim,
    replay_buffer,
):
    """
    Policy construction
    """

    M = variant['policy_kwargs']['layer_size']
    latent_dim = variant['policy_kwargs']['latent_dim']
    restrict_dim = variant['discriminator_kwargs']['restrict_input_size']

    control_policy = TanhGaussianPolicy(
        obs_dim=obs_dim + latent_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        restrict_obs_dim=restrict_dim,
    )

    prior = torch.distributions.uniform.Uniform(
        -ptu.ones(latent_dim),
        ptu.ones(latent_dim),
    )

    policy = PriorLatentPolicy(
        policy=control_policy,
        prior=prior,
        unconditional=True,
    )

    qf1, qf2, target_qf1, target_qf2 = ppp.group_init(
        4,
        FlattenMlp,
        input_size=obs_dim + latent_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    """
    Learned skill-practice distribution
    """

    skill_practice_dist = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=latent_dim,
        hidden_sizes=[M, M],
    )

    prior_qf1, prior_qf2, prior_target_qf1, prior_target_qf2 = ppp.group_init(
        4,
        FlattenMlp,
        input_size=obs_dim + latent_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )

    skill_practice_trainer = SACTrainer(
        env=expl_env,
        policy=skill_practice_dist,
        qf1=prior_qf1,
        qf2=prior_qf2,
        target_qf1=prior_target_qf1,
        target_qf2=prior_target_qf2,
        **variant['skill_practice_trainer_kwargs'],
    )
    """
    Discriminator
    """

    discrim_kwargs = variant['discriminator_kwargs']
    discriminator = SkillDynamics(
        observation_size=obs_dim if restrict_dim == 0 else restrict_dim,
        action_size=action_dim,
        latent_size=latent_dim,
        normalize_observations=True,
        fix_variance=True,
        fc_layer_params=[discrim_kwargs['layer_size']] *
        discrim_kwargs['num_layers'],
        # restrict_observation=0,  # we handle this outside of skill-dynamics
        # use_latents_as_delta=variant.get('use_latents_as_delta', False),
    )
    """
    Policy trainer
    """

    policy_trainer = SACTrainer(
        env=expl_env,
        policy=control_policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['policy_trainer_kwargs'],
    )
    """
    Model-based reinforcement learning (MBRL) dynamics models
    """

    M = variant['mbrl_kwargs']['layer_size']

    dynamics_model = ProbabilisticEnsemble(
        ensemble_size=variant['mbrl_kwargs']['ensemble_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M, M, M],
    )
    model_trainer = MBRLTrainer(
        ensemble=dynamics_model,
        **variant['mbrl_kwargs'],
    )

    rollout_len_schedule = variant['rollout_len_schedule']

    def rollout_len(train_steps):
        """
        rollout_len_schedule: [a, b, len_a, len_b]
        linearly increase length from len_a -> len_b over epochs a -> b
        """
        epoch = train_steps // 1000
        if epoch < rollout_len_schedule[0]:
            return 1
        elif epoch >= rollout_len_schedule[1]:
            return rollout_len_schedule[3]
        else:
            return int((epoch - rollout_len_schedule[0]) /
                       (rollout_len_schedule[1] - rollout_len_schedule[0]) *
                       (rollout_len_schedule[3] -
                        rollout_len_schedule[2])) + rollout_len_schedule[2]
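    # For intuition, with a hypothetical schedule [a, b, len_a, len_b] = [10, 50, 1, 15]
    # and the 1000 train steps per epoch assumed above:
    #   rollout_len(5_000)  -> 1   (epoch 5 is before epoch a=10)
    #   rollout_len(30_000) -> 8   (epoch 30 is halfway from a=10 to b=50)
    #   rollout_len(60_000) -> 15  (epoch 60 is past b=50, so capped at len_b)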

    """
    Setup of intrinsic control
    """

    trainer = LiSPTrainer(
        skill_practice_dist=skill_practice_dist,
        skill_practice_trainer=skill_practice_trainer,
        dynamics_model=dynamics_model,
        rollout_len_func=rollout_len,
        control_policy=control_policy,
        discriminator=discriminator,
        replay_buffer=replay_buffer,
        replay_size=variant['generated_replay_buffer_size'],
        policy_trainer=policy_trainer,
        restrict_input_size=restrict_dim,
        **variant['trainer_kwargs'],
    )
    """
    Create config dict
    """

    config = dict()
    config.update(
        dict(
            trainer=trainer,
            model_trainer=model_trainer,
            exploration_policy=policy,
            evaluation_policy=policy,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            replay_buffer=replay_buffer,
            dynamics_model=dynamics_model,
            prior=prior,
            learned_prior=skill_practice_dist,
            skill_practice_trainer=skill_practice_trainer,
            control_policy=control_policy,
            latent_dim=latent_dim,
            policy_trainer=policy_trainer,
            rollout_len_func=rollout_len,
        ))
    config['algorithm_kwargs'] = variant.get('algorithm_kwargs', dict())
    config['offline_kwargs'] = variant.get('offline_kwargs', dict())

    return config
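Relative to the plain DADS config (example #5 below), LiSP additionally learns a skill-practice distribution that proposes which skill to practice from a given state. As a sketch (the rlkit-style get_action interface is an assumption):

# The learned skill-practice distribution maps a state to a skill proposal.
# tanh squashing keeps the proposal inside (-1, 1)^latent_dim, matching the prior's support.
z_proposed, _ = skill_practice_dist.get_action(obs)  # assumed rlkit-style policy call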
Code example #3
def get_config(
    variant,
    expl_env,
    eval_env,
    obs_dim,
    action_dim,
    replay_buffer,
):
    """
    Setup of soft actor-critic (SAC), used as the policy optimization procedure of MBPO
    """

    M = variant['policy_kwargs']['layer_size']

    qf1, qf2, target_qf1, target_qf2 = ppp.group_init(
        4,
        FlattenMlp,
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )

    policy_trainer = SACTrainer(env=eval_env,
                                policy=policy,
                                qf1=qf1,
                                qf2=qf2,
                                target_qf1=target_qf1,
                                target_qf2=target_qf2,
                                **variant['trainer_kwargs']['policy_kwargs'])
    """
    Model-based reinforcement learning (MBRL) dynamics models
    """

    dynamics_model = ProbabilisticEnsemble(
        ensemble_size=variant['mbrl_kwargs']['ensemble_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=variant['mbrl_kwargs']['hidden_sizes'],
    )
    model_trainer = MBRLTrainer(
        ensemble=dynamics_model,
        **variant['mbrl_kwargs'],
    )
    """
    Setup of model-based policy optimization (MBPO)
    """

    generated_replay_buffer = EnvReplayBuffer(
        variant['trainer_kwargs']['generated_buffer_size'],
        expl_env,
    )

    rollout_len_schedule = variant['trainer_kwargs']['rollout_len_schedule']

    def rollout_len(train_steps):
        """
        rollout_len_schedule: [a, b, len_a, len_b]
        Linearly increase length from len_a -> len_b over epochs a -> b
        """
        if 'algorithm_kwargs' in variant:
            epoch = train_steps // variant['algorithm_kwargs'][
                'num_trains_per_train_loop']
        else:
            epoch = 1  # no epoch length available; treat as the first epoch
        if epoch < rollout_len_schedule[0]:
            return 1
        elif epoch >= rollout_len_schedule[1]:
            return rollout_len_schedule[3]
        else:
            # interpolate from len_a at epoch a to len_b at epoch b,
            # as the docstring describes (cf. the identical helper in example #2)
            return int(
                (epoch - rollout_len_schedule[0]) /
                (rollout_len_schedule[1] - rollout_len_schedule[0]) *
                (rollout_len_schedule[3] - rollout_len_schedule[2])
            ) + rollout_len_schedule[2]
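    # rollout_len is passed to MBPOTrainer below as rollout_len_func; presumably it is
    # queried during training to set the length of the model-generated branched rollouts.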

    trainer = MBPOTrainer(policy_trainer=policy_trainer,
                          dynamics_model=dynamics_model,
                          replay_buffer=replay_buffer,
                          generated_data_buffer=generated_replay_buffer,
                          rollout_len_func=rollout_len,
                          **variant['trainer_kwargs'])
    """
    Create config dict
    """

    config = dict()
    config.update(
        dict(
            trainer=trainer,
            model_trainer=model_trainer,
            exploration_policy=policy,
            evaluation_policy=MakeDeterministic(policy),
            exploration_env=expl_env,
            evaluation_env=eval_env,
            replay_buffer=replay_buffer,
        ))

    return config
Code example #4
def get_config(
        variant,
        expl_env,
        eval_env,
        obs_dim,
        action_dim,
        replay_buffer,
):
    """
    Set up terminal value function
    """

    M = variant['policy_kwargs']['layer_size']

    critic_policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )

    qf1, qf2, target_qf1, target_qf2 = ppp.group_init(
        4,
        FlattenMlp,
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )

    critic_policy_trainer = SACTrainer(
        env=expl_env,
        policy=critic_policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['policy_trainer_kwargs'],
    )

    """
    Set up dynamics model
    """

    M = variant['mbrl_kwargs']['layer_size']

    dynamics_model = ProbabilisticEnsemble(
        ensemble_size=variant['mbrl_kwargs']['ensemble_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M, M, M],
    )
    model_trainer = MBRLTrainer(
        ensemble=dynamics_model,
        **variant['mbrl_kwargs'],
    )

    """
    Set up MPC
    """

    policy = MPCPolicy(
        env=expl_env,
        dynamics_model=dynamics_model,
        plan_dim=action_dim,
        value_func=value_func,  # assumed to be imported/defined at module scope
        value_func_kwargs=dict(
            critic_policy=critic_policy,
            qf1=qf1,
            qf2=qf2,
        ),
        **variant['mpc_kwargs'],
    )
    mpc_trainer = MPPITrainer(
        policy=policy,
    )

    trainer = MultiTrainer(
        trainers=[mpc_trainer, critic_policy_trainer],
        trainer_steps=[1, 1],
        trainer_names=['mpc_trainer', 'sac_trainer'],
    )

    config = dict()
    config.update(dict(
        trainer=trainer,
        model_trainer=model_trainer,
        exploration_policy=policy,
        evaluation_policy=critic_policy,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        replay_buffer=replay_buffer,
        dynamics_model=dynamics_model,
    ))
    config['algorithm_kwargs'] = variant.get('algorithm_kwargs', dict())

    return config
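MultiTrainer interleaves the MPC trainer with the SAC critic trainer. A rough sketch of the alternation that trainer_steps=[1, 1] suggests (hypothetical; the per-trainer train(batch) interface is assumed from the rlkit-style trainers used above):

# Hypothetical sketch of the interleaving implied by trainer_steps=[1, 1].
for batch in batches:                     # batches: assumed training-batch iterator
    for sub_trainer, steps in zip([mpc_trainer, critic_policy_trainer], [1, 1]):
        for _ in range(steps):
            sub_trainer.train(batch)      # one update step per sub-trainer, in order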
Code example #5
File: dads_config.py  Project: zhan0903/lifelong_rl
def get_config(
    variant,
    expl_env,
    eval_env,
    obs_dim,
    action_dim,
    replay_buffer,
):
    """
    Policy construction
    """

    M = variant['policy_kwargs']['layer_size']
    latent_dim = variant['policy_kwargs']['latent_dim']
    restrict_dim = variant['discriminator_kwargs']['restrict_input_size']

    control_policy = TanhGaussianPolicy(
        obs_dim=obs_dim + latent_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        restrict_obs_dim=restrict_dim,
    )

    prior = torch.distributions.uniform.Uniform(
        -ptu.ones(latent_dim),
        ptu.ones(latent_dim),
    )

    policy = PriorLatentPolicy(
        policy=control_policy,
        prior=prior,
        unconditional=True,
    )

    qf1, qf2, target_qf1, target_qf2 = ppp.group_init(
        4,
        FlattenMlp,
        input_size=obs_dim + latent_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    """
    Discriminator
    """

    discrim_kwargs = variant['discriminator_kwargs']
    discriminator = SkillDynamics(
        observation_size=obs_dim if restrict_dim == 0 else restrict_dim,
        action_size=action_dim,
        latent_size=latent_dim,
        normalize_observations=discrim_kwargs.get('normalize_observations',
                                                  True),
        fix_variance=discrim_kwargs.get('fix_variance', True),
        fc_layer_params=[discrim_kwargs['layer_size']] *
        discrim_kwargs['num_layers'],
    )
    """
    Policy trainer
    """

    policy_trainer = SACTrainer(
        env=expl_env,
        policy=control_policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['policy_trainer_kwargs'],
    )
    """
    Setup of intrinsic control
    """

    dads_type = variant.get('dads_type', 'onpolicy')
    if dads_type == 'onpolicy':
        trainer_class = DADSTrainer
    else:
        raise NotImplementedError('dads_type not recognized')

    trainer = trainer_class(
        control_policy=control_policy,
        discriminator=discriminator,
        replay_buffer=replay_buffer,
        replay_size=variant['generated_replay_buffer_size'],
        policy_trainer=policy_trainer,
        restrict_input_size=restrict_dim,
        **variant['trainer_kwargs'],
    )
    """
    Create config dict
    """

    config = dict()
    config.update(
        dict(
            trainer=trainer,
            exploration_policy=policy,
            evaluation_policy=policy,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            replay_buffer=replay_buffer,
            prior=prior,
            control_policy=control_policy,
            latent_dim=latent_dim,
            policy_trainer=policy_trainer,
        ))
    config['algorithm_kwargs'] = variant.get('algorithm_kwargs', dict())
    config['offline_kwargs'] = variant.get('offline_kwargs', dict())

    return config
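In both the DADS and LiSP configs, a skill is a latent vector drawn from the uniform prior and concatenated to the observation before it reaches control_policy (PriorLatentPolicy automates this during rollouts). A minimal sketch (.sample() is standard torch.distributions API; get_action, ptu.get_numpy, and numpy as np are assumed from the rlkit-style utilities these configs already rely on):

# Sketch: sample a skill z ~ Uniform(-1, 1)^latent_dim and condition the policy on it.
z = prior.sample()                               # torch tensor of shape (latent_dim,)
obs_z = np.concatenate([obs, ptu.get_numpy(z)])  # [observation, skill] policy input
action, _ = control_policy.get_action(obs_z)     # assumed rlkit-style interface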