Example 1
def setup_mixins(policy, obs_space, action_space, config):
    """Copied from PPO"""
    KLCoeffMixin.__init__(policy, config)
    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
                                  config["entropy_coeff_schedule"])
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


def central_vf_stats(policy, train_batch, grads):
    """Report the explained variance of the central value function"""
    return {
        "vf_explained_var":
        explained_variance(train_batch[Postprocessing.VALUE_TARGETS],
                           policy.central_value_out),
    }


CCPPO = PPOTFPolicy.with_updates(
    name="CCPPO",
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_loss_init=setup_mixins,
    grad_stats_fn=central_vf_stats,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        CentralizedValueMixin
    ])

CCTrainer = PPOTrainer.with_updates(name="CCPPOTrainer", default_policy=CCPPO)
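
# Launch sketch following the tune.run call in Example 6 below, which runs this
# same centralized-critic trainer on TwoStepGame; the stop criterion here is
# illustrative, and the "cc_model" registration shown in Example 6 is assumed.
from ray import tune

tune.run(
    CCTrainer,
    stop={"timesteps_total": 100000},
    config={
        "env": TwoStepGame,
        "batch_mode": "complete_episodes",
        "num_workers": 0,
    },
)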
Example 2
        values = values[:len(values) - max(upper, 0)]
        values = np.pad(
            values,
            pad_width=[
                (-min(lower, 0), -min(0, upper)),
                *[(0, 0) for k in range(values.ndim - 1)],
            ],
            mode="constant",
        )
        return values


CCPPOPolicy = PPOTFPolicy.with_updates(
    name="CCPPOPolicy",
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_loss_init=setup_mixins,
    grad_stats_fn=central_vf_stats,
    mixins=[
        LearningRateSchedule,
        EntropyCoeffSchedule,
        KLCoeffMixin,
        CentralizedValueMixin,
    ],
)
register_trainable(
    "CcTransformer",
    PPOTrainer.with_updates(name="CCPPOTrainer",
                            get_policy_class=lambda c: CCPPOPolicy),
)
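
# Usage sketch: once registered, the trainable can be launched through tune.run
# by its string name instead of the class object. The stop criterion below is
# illustrative; the config would carry the same multi-agent setup the
# surrounding script builds.
from ray import tune

tune.run("CcTransformer", stop={"timesteps_total": 100000})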
Example 3
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_init=setup_mixins,
    mixins=[
        TorchLR, TorchEntropyCoeffSchedule, TorchKLCoeffMixin,
        CentralizedValueMixin
    ])


def get_policy_class(config):
    return CCPPOTorchPolicy if config["use_pytorch"] else CCPPOTFPolicy


CCTrainer = PPOTrainer.with_updates(
    name="CCPPOTrainer",
    default_policy=CCPPOTFPolicy,
    get_policy_class=get_policy_class,
)

if __name__ == "__main__":
    ray.init(local_mode=True)
    args = parser.parse_args()

    ModelCatalog.register_custom_model(
        "cc_model",
        TorchCentralizedCriticModel if args.torch else CentralizedCriticModel)

    config = {
        "env": TwoStepGame,
        "batch_mode": "complete_episodes",
        "eager": False,
Example 4
    # Only update the policies pool if DELAY_UPDATE is used; otherwise the
    # policies_pool in each policy is simply not used, so we don't need to
    # update it.
    if trainer.config[DELAY_UPDATE]:
        if trainer.workers.remote_workers():
            weights = ray.put(trainer.workers.local_worker().get_weights())
            for e in trainer.workers.remote_workers():
                e.set_weights.remote(weights)

            def _delay_update_for_worker(worker, worker_index):
                worker.foreach_policy(lambda p, _: p.update_target())

            trainer.workers.foreach_worker_with_index(_delay_update_for_worker)


def get_policy_class(config):
    return DiCEPolicy


DiCETrainer = PPOTrainer.with_updates(
    name="DiCETrainer",
    default_config=dice_default_config,
    default_policy=DiCEPolicy,
    get_policy_class=get_policy_class,
    validate_config=validate_config,
    make_policy_optimizer=make_policy_optimizer_tnbes,
    after_init=setup_policies_pool,
    after_optimizer_step=after_optimizer_iteration,
)
Example 5
from toolbox import train
from toolbox.evolution import GaussianESTrainer
from toolbox.evolution_plugin.evolution_plugin import choose_optimzier, \
    merge_dicts, DEFAULT_CONFIG
from toolbox.train import get_train_parser

ppo_sgd_config = merge_dicts(DEFAULT_CONFIG, dict(master_optimizer_type="sgd"))

PPOSGDPolicy = PPOTFPolicy.with_updates(
    name="PPOSGDPolicy",
    get_default_config=lambda: ppo_sgd_config,
    optimizer_fn=choose_optimzier)

PPOSGDTrainer = PPOTrainer.with_updates(
    name="PPOSGD",
    default_config=ppo_sgd_config,
    default_policy=PPOSGDPolicy,
    get_policy_class=lambda _: PPOSGDPolicy)

if __name__ == '__main__':
    parser = get_train_parser()
    parser.add_argument("--ppo", action="store_true")
    parser.add_argument("--es", action="store_true")
    # parser.add_argument("--optimizer", type=str, default="sgd")  # [adam, sgd]
    parser.add_argument("--stop", type=float, default=1e7)
    parser.add_argument("--local-mode", "-lm", action="store_true")
    args = parser.parse_args()
    print(args)
    local_mode = args.local_mode
    now = time.time()
    assert int(args.ppo) + int(args.es) == 1
Example 6
            policy.central_value_out),
    }


CCPPO = PPOTFPolicy.with_updates(
    name="CCPPO",
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_loss_init=setup_mixins,
    grad_stats_fn=central_vf_stats,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        CentralizedValueMixin
    ])

CCTrainer = PPOTrainer.with_updates(
    name="CCPPOTrainer", default_policy=CCPPO, get_policy_class=None)

if __name__ == "__main__":
    args = parser.parse_args()
    ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
    tune.run(
        CCTrainer,
        stop={
            "timesteps_total": args.stop,
            "episode_reward_mean": 7.99,
        },
        config={
            "env": TwoStepGame,
            "batch_mode": "complete_episodes",
            "eager": False,
            "num_workers": 0,
Example 7
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.model.value_function()
        )
    }


def setup_mixins_without_kl(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    # EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
    #                               config["entropy_coeff_schedule"])
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])


PPOTFPolicyWithoutKL = PPOTFPolicy.with_updates(
    name="PPOTFPolicyWithoutKL",
    loss_fn=ppo_surrogate_loss_without_kl,
    stats_fn=loss_stats,
    before_loss_init=setup_mixins_without_kl,
    mixins=[
        LearningRateSchedule,
        # EntropyCoeffSchedule,
        ValueNetworkMixin
    ]
)

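# Note: after_optimizer_step=None drops PPO's default KL-coefficient update
# (update_kl), which is consistent with omitting KLCoeffMixin from the policy
# mixins above.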
PPOTrainerWithoutKL = PPOTrainer.with_updates(
    name="PPOWithoutKL",
    default_policy=PPOTFPolicyWithoutKL,
    after_optimizer_step=None
)
Example 8
    "redo_invalid_games": False,
    "wandb": {},
    "ed": None,
    "policy_catalog": None,
    "eq_iters": None,
    "adaptive_pval_test": False,
    "br_thres": None,
    "eq_thres": None,
    "br_eval_against_policy": None,
    "thres_is_pval": None,
    "adaptive_pval": None
}

PPO_CUSTOM_EVAL_TRAINER_DEFAULT_CONFIG = with_base_config(
    base_config=DEFAULT_CONFIG,
    extra_config=ppo_custom_eval_trainer_added_config_items)

ppo_custom_eval_trainer_mixins = [
    CustomEvaluationsTrainerMixin, WeightsUtilsTrainerMixin
]

# Add custom evaluation logic to PPOTrainer
PPOCustomEvalTrainer = PPOTrainer.with_updates(
    name="PPOCustomEvalTrainer",
    default_config=PPO_CUSTOM_EVAL_TRAINER_DEFAULT_CONFIG,
    before_init=ppo_custom_eval_trainer_before_init,
    after_init=ppo_custom_eval_trainer_after_init,
    validate_config=ppo_custom_eval_trainer_validate_config,
    after_optimizer_step=after_optimizer_step,
    collect_metrics_fn=collect_metrics,
    mixins=ppo_custom_eval_trainer_mixins)
Example 9
        tf.shape(policy.get_placeholder(SampleBatch.CUR_OBS))[0])


def grad_stats(policy, train_batch, grads):
    return {
        "grad_gnorm": tf.global_norm(grads),
        "vf_explained_var": explained_variance(
            train_batch[Postprocessing.VALUE_TARGETS],
            policy.central_value_function),
    }


ImitationCentralizedPolicy = PPOTFPolicy.with_updates(
    name="ImitationCentralizedPolicy",
    before_loss_init=setup_mixins,
    postprocess_fn=centralized_critic_postprocessing,
    stats_fn=loss_stats,
    grad_stats_fn=grad_stats,
    loss_fn=new_ppo_surrogate_loss,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        CentralizedValueMixin, ImitationLearningRateSchedule
    ])

ImitationCentralizedTrainer = PPOTrainer.with_updates(name="ImitationCentralizedPPOTrainer",
                                                      default_policy=ImitationCentralizedPolicy,
                                                      after_optimizer_step=update_kl)


CCImitationTrainer = PPOTrainer.with_updates(
    name="CCImitationPPOTrainer", default_policy=ImitationCentralizedPolicy)
Example 10
        train_batch_size=config["train_batch_size"],
        standardize_fields=["advantages"],
        shuffle_sequences=config["shuffle_sequences"])


def setup_mixins_modified(policy, obs_space, action_space, config):
    AddLossMixin.__init__(policy, config)
    setup_mixins(policy, obs_space, action_space, config)


ExtraLossPPOTFPolicy = PPOTFPolicy.with_updates(
    name="ExtraLossPPOTFPolicy",
    get_default_config=lambda: extra_loss_ppo_default_config,
    postprocess_fn=postprocess_ppo_gae_modified,
    stats_fn=kl_and_loss_stats_modified,
    loss_fn=extra_loss_ppo_loss,
    before_loss_init=setup_mixins_modified,
    mixins=mixin_list + [AddLossMixin])

ExtraLossPPOTrainer = PPOTrainer.with_updates(
    name="ExtraLossPPO",
    default_config=extra_loss_ppo_default_config,
    validate_config=validate_config_modified,
    default_policy=ExtraLossPPOTFPolicy,
    make_policy_optimizer=choose_policy_optimizer)

if __name__ == '__main__':
    from toolbox.marl.test_extra_loss import test_extra_loss_ppo_trainer1

    test_extra_loss_ppo_trainer1(True)
Example 11
    ImitationLearningRateSchedule.__init__(
        policy, config["model"]["custom_options"]["num_imitation_iters"],
        config["model"]["custom_options"]["imitation_weight"], config)


def grad_stats(policy, train_batch, grads):
    return {
        "grad_gnorm":
        tf.global_norm(grads),
        "vf_explained_var":
        explained_variance(train_batch[Postprocessing.VALUE_TARGETS],
                           policy.model.value_function()),
    }


ImitationPolicy = PPOTFPolicy.with_updates(name="ImitationPolicy",
                                           before_loss_init=setup_mixins,
                                           stats_fn=loss_stats,
                                           grad_stats_fn=grad_stats,
                                           loss_fn=new_ppo_surrogate_loss,
                                           mixins=[
                                               LearningRateSchedule,
                                               EntropyCoeffSchedule,
                                               KLCoeffMixin, ValueNetworkMixin,
                                               ImitationLearningRateSchedule
                                           ])

ImitationTrainer = PPOTrainer.with_updates(name="ImitationPPOTrainer",
                                           default_policy=ImitationPolicy,
                                           after_optimizer_step=update_kl)
Example 12
    # Which observation filter to apply to the observation.
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This is
    # usually slower, but you might want to try it if you run into issues with
    # the default optimizer.
    "simple_optimizer": False,
    # Whether to fake GPUs (using CPUs).
    # Set this to True for debugging on non-GPU machines (set `num_gpus` > 0).
    "_fake_gpus": False,
    # Use PyTorch as framework?
    "use_pytorch": False
})
# __sphinx_doc_end__
# yapf: enable


def get_policy_class(config):
    if config["use_pytorch"]:
        from algorithms.custom_ppo.custom_ppo_torch_policy import CustomPPOTorchPolicy
        return CustomPPOTorchPolicy
    else:
        return PPOTFPolicy


CustomPPOTrainer = PPOTrainer.with_updates(
    name="CustomPPO",
    default_config=DEFAULT_CONFIG,
    default_policy=PPOTFPolicy,
    get_policy_class=get_policy_class,
)
Example 13
                                              EntropyCoeffSchedule,
                                              KLCoeffMixin, ValueNetworkMixin,
                                              FIMEmbeddingMixin
                                          ])


def get_policy_class(config):
    if config.get("use_pytorch") is True:
        raise NotImplementedError()
    else:
        return PPOFIMTFPolicy


PPOFIMTrainer = PPOTrainer.with_updates(
    name="PPOFIM",
    default_policy=PPOFIMTFPolicy,
    get_policy_class=get_policy_class,
)


def agent_to_vector(target_agent, probe_agent):
    # Step 1: sample a dataset from the given target_agent
    dataset = []
    for i in range(20):
        dataset.append(target_agent.workers.local_worker().sample())
    dataset = dataset[0].concat_samples(dataset)
    dataset.shuffle()
    # TODO: not sure the samples are uniformly spread, since each batch
    #  comes from a single episode.

    # Step 2: compute the embedding for target_agent via probe_agent
Example 14
def validate_config(config):
    tmp_env = MultiAgentEnvWrapper(config["env_config"])
    config["multiagent"]["policies"] = {
        "agent{}".format(i):
        (None, tmp_env.observation_space, tmp_env.action_space, {})
        for i in range(num_agents)
    }
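    # Identity mapping: assumes the env's agent IDs match the "agent{i}"
    # policy names defined above.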
    config["multiagent"]["policy_mapping_fn"] = lambda x: x

    original_validate(config)


PPOESTrainer = PPOTrainer.with_updates(
    name="PPOES",
    default_config=ppo_es_default_config,
    after_train_result=run_evolution_strategies,
    validate_config=validate_config)

if __name__ == '__main__':
    env_name = "CartPole-v0"
    num_agents = 3
    config = {
        "num_sgd_iter": 2,
        "train_batch_size": 400,
        "update_steps": 1000,
        **get_marl_env_config(env_name, num_agents)
    }
    initialize_ray(test_mode=True, local_mode=True)
    train(PPOESTrainer,
          config,
Example 15
def before_train_step(trainer):
    policy = trainer.get_policy()
    if not policy.initialized_policies_pool:
        # function to call for each worker (including remote and local workers)
        def init_novelty(worker):
            # function for each policy within one worker.
            def _init_novelty_policy(policy, _):
                policy._lazy_initialize()

            worker.foreach_policy(_init_novelty_policy)

        trainer.workers.foreach_worker(init_novelty)


def validate_config(config):
    validate_config_original(config)
    assert config['model']['custom_model'] == "ActorDoubleCriticNetwork"
    config['model']['custom_options'] = {
        "use_novelty_value_network": config['use_novelty_value_network']
    }


TNBTrainer = PPOTrainer.with_updates(
    name="TNBPPO",
    validate_config=validate_config,
    make_policy_optimizer=choose_policy_optimizer,
    default_config=tnb_default_config,
    before_train_step=before_train_step,
    default_policy=TNBPolicy,
    get_policy_class=lambda _: TNBPolicy)
Example 16
    trainer.workers.foreach_worker(_init_pool)


IPDPolicy = PPOTFPolicy.with_updates(
    name="IPDPolicy",
    get_default_config=lambda: ipd_default_config,
    before_loss_init=setup_mixins_tnb,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin, AgentPoolMixin
    ]
)

IPDTrainer = PPOTrainer.with_updates(
    name="IPD",
    default_config=ipd_default_config,
    after_init=after_init,
    default_policy=IPDPolicy
)

if __name__ == '__main__':
    from ray import tune

    from toolbox import initialize_ray

    initialize_ray(test_mode=True, local_mode=False)
    env_name = "CartPole-v0"
    config = {
        "num_sgd_iter": 2,
        "env": IPDEnv,
        "env_config": {
            "env_name": env_name,