Example #1
                                              EntropyCoeffSchedule,
                                              KLCoeffMixin, ValueNetworkMixin,
                                              FIMEmbeddingMixin
                                          ])


def get_policy_class(config):
    # Only the TF policy is implemented; a PyTorch variant is not supported.
    if config.get("use_pytorch"):
        raise NotImplementedError()
    else:
        return PPOFIMTFPolicy


PPOFIMTrainer = PPOTrainer.with_updates(
    name="PPOFIM",
    default_policy=PPOFIMTFPolicy,
    get_policy_class=get_policy_class,
)


def agent_to_vector(target_agent, probe_agent):
    # Step 1: sample a dataset from the given target_agent.
    dataset = []
    for _ in range(20):
        dataset.append(target_agent.workers.local_worker().sample())
    dataset = dataset[0].concat_samples(dataset)
    dataset.shuffle()
    # TODO: the samples may not be uniformly spread, since each batch comes
    #  from a single episode.

    # Step 2: compute the embedding for the target_agent via the probe_agent.
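The listing above is cut off before the embedding step, but the PPOFIMTrainer it defines can be launched like any other RLlib trainer. A minimal, hypothetical launch sketch follows; the config keys and stopping criterion are illustrative and not taken from the original experiment.

from ray import tune

if __name__ == '__main__':
    # Illustrative only: the real FIM experiments presumably need extra keys
    # (e.g. for the probe agent) that this excerpt does not show.
    tune.run(
        PPOFIMTrainer,
        config={"env": "CartPole-v0", "num_sgd_iter": 2, "train_batch_size": 400},
        stop={"timesteps_total": 10000},
    )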
Example #2
def validate_config(config):
    # Build one policy per agent. "num_agents" is resolved from the module
    # scope (it is set in the __main__ block below).
    tmp_env = MultiAgentEnvWrapper(config["env_config"])
    config["multiagent"]["policies"] = {
        "agent{}".format(i):
        (None, tmp_env.observation_space, tmp_env.action_space, {})
        for i in range(num_agents)
    }
    # Agent ids equal policy names, so the mapping is the identity.
    config["multiagent"]["policy_mapping_fn"] = lambda x: x

    original_validate(config)


PPOESTrainer = PPOTrainer.with_updates(
    name="PPOES",
    default_config=ppo_es_default_config,
    after_train_result=run_evolution_strategies,
    validate_config=validate_config)

if __name__ == '__main__':
    env_name = "CartPole-v0"
    num_agents = 3
    config = {
        "num_sgd_iter": 2,
        "train_batch_size": 400,
        "update_steps": 1000,
        **get_marl_env_config(env_name, num_agents)
    }
    initialize_ray(test_mode=True, local_mode=True)
    train(PPOESTrainer,
          config,
Example #3
class AlternateTraining(Trainable):
    def _setup(self, config):
        self.config = config
        self.env = config['env']
        # Give each trainer its own copy of the config so that only one of the
        # two policies is marked trainable in each.
        agent_config = deepcopy(self.config)
        adv_config = deepcopy(self.config)
        agent_config['multiagent']['policies_to_train'] = ['agent']
        adv_config['multiagent']['policies_to_train'] = ['adversary']

        self.agent_trainer = PPOTrainer(env=self.env, config=agent_config)
        self.adv_trainer = PPOTrainer(env=self.env, config=adv_config)

    def _train(self):
        # improve the Adversary policy
        print("-- Adversary Training --")
        original_weight = self.adv_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]
        print(pretty_print(self.adv_trainer.train()))
        first_weight = self.adv_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]

        # Check that the adversary weights actually updated during training.
        assert original_weight != first_weight, \
            "The adversary weights did not change after training."

        # Copy the freshly trained adversary weights into the agent trainer so
        # both trainers hold the same adversary policy.
        self.agent_trainer.set_weights(
            self.adv_trainer.get_weights(["adversary"]))

        # improve the Agent policy
        print("-- Agent Training --")
        output = self.agent_trainer.train()

        # The agent trainer holds a frozen copy of the adversary policy, so its
        # adversary weights must be unchanged after agent training.
        new_weight = self.agent_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]

        # Check that the adversary was not trained while the agent trainer ran.
        assert first_weight == new_weight, \
            "The adversary weights changed, but they should not be updated " \
            "during agent training!"

        # Copy the freshly trained agent weights back into the adversary
        # trainer so the two trainers stay synchronized.
        self.adv_trainer.set_weights(self.agent_trainer.get_weights(["agent"]))

        return output

    def _save(self, tmp_checkpoint_dir):
        return self.agent_trainer._save(tmp_checkpoint_dir)
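AlternateTraining follows the old Tune Trainable API (_setup/_train/_save), so it can be driven by tune.run. The sketch below is hypothetical: the environment name and observation/action spaces are placeholders, and only the 'agent'/'adversary' policy ids match what _setup above expects.

from gym.spaces import Box, Discrete
from ray import tune

# Placeholder spaces and env name; only the 'agent'/'adversary' policy ids are
# required by AlternateTraining._setup above.
obs_space = Box(-1.0, 1.0, shape=(4,))
act_space = Discrete(2)
tune.run(
    AlternateTraining,
    config={
        "env": "my_adversarial_env",  # hypothetical registered env name
        "multiagent": {
            "policies": {
                "agent": (None, obs_space, act_space, {}),
                "adversary": (None, obs_space, act_space, {}),
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
    },
    stop={"training_iteration": 100},
)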
Example #4
        # Drop `upper` trailing entries when upper > 0, then zero-pad along the
        # first axis: max(-lower, 0) entries at the front and max(-upper, 0) at
        # the back.
        values = values[:len(values) - max(upper, 0)]
        values = np.pad(
            values,
            pad_width=[
                (-min(lower, 0), -min(0, upper)),
                *[(0, 0) for _ in range(values.ndim - 1)],
            ],
            mode="constant",
        )
        return values


CCPPOPolicy = PPOTFPolicy.with_updates(
    name="CCPPOPolicy",
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_loss_init=setup_mixins,
    grad_stats_fn=central_vf_stats,
    mixins=[
        LearningRateSchedule,
        EntropyCoeffSchedule,
        KLCoeffMixin,
        CentralizedValueMixin,
    ],
)
register_trainable(
    "CcConcatenate",
    PPOTrainer.with_updates(name="CCPPOTrainer",
                            get_policy_class=lambda c: CCPPOPolicy),
)
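Because the trainer is registered under the name "CcConcatenate", it can also be launched by that string through tune.run. A minimal sketch with an arbitrary environment and stopping criterion; the real centralized-critic experiments supply the multi-agent config that centralized_critic_postprocessing expects.

from ray import tune

# Sketch only: launching the registered trainable by name.
tune.run(
    "CcConcatenate",
    config={"env": "CartPole-v0", "num_workers": 0},
    stop={"training_iteration": 10},
)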
Example #5
def before_train_step(trainer):
    policy = trainer.get_policy()
    if not policy.initialized_policies_pool:
        # Called once on every worker (both the local and all remote workers).
        def init_novelty(worker):
            # Called for each policy held by this worker.
            def _init_novelty_policy(policy, _):
                policy._lazy_initialize()

            worker.foreach_policy(_init_novelty_policy)

        trainer.workers.foreach_worker(init_novelty)


def validate_config(config):
    validate_config_original(config)
    # Require the custom double-critic model and forward the novelty flag to
    # it through the custom model options.
    assert config['model']['custom_model'] == "ActorDoubleCriticNetwork"
    config['model']['custom_options'] = {
        "use_novelty_value_network": config['use_novelty_value_network']
    }


TNBTrainer = PPOTrainer.with_updates(
    name="TNBPPO",
    validate_config=validate_config,
    make_policy_optimizer=choose_policy_optimizer,
    default_config=tnb_default_config,
    before_train_step=before_train_step,
    default_policy=TNBPolicy,
    get_policy_class=lambda _: TNBPolicy)
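Given the checks in validate_config above, any config handed to TNBTrainer must name the "ActorDoubleCriticNetwork" custom model and carry the use_novelty_value_network flag. The sketch below shows only that contract; the environment and stopping criterion are arbitrary, and the custom model is assumed to be registered with ModelCatalog elsewhere in the package.

from ray import tune

tune.run(
    TNBTrainer,
    config={
        "env": "CartPole-v0",
        # Required by validate_config above; the model class itself must be
        # registered with ModelCatalog in the surrounding package.
        "model": {"custom_model": "ActorDoubleCriticNetwork"},
        "use_novelty_value_network": True,
    },
    stop={"timesteps_total": 10000},
)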
Example #6
            "policies": {
                "policy_1": (None, obs_space, act_space, {}),
                "policy_2": (None, obs_space, act_space, {}),
                "policy_3": (None, obs_space, act_space, {}),
                "policy_4": (None, obs_space, act_space, {}),
            },
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["policy_1"],
        },
        "model": {
            "custom_model": "yaniv_mask",
            "fcnet_hiddens": [512, 512],
        },
    }

    ray.init(include_dashboard=False, local_mode=True)

    ppo = PPOTrainer(env="yaniv", config=config)
    ppo.restore(args.ppo_checkpoint)

    a3c = A3CTrainer(env="yaniv", config=config)
    a3c.restore(args.a3c_checkpoint)

    tourney = YanivTournament(env_config,
                              trainers=[a3c],
                              opponent="intermediate")
    # tourney.run_episode(True)
    # tourney.print_stats()
    tourney.run(args.eval_num)
    print("\n\nRESULTS:\n")
    tourney.print_stats()
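The policy_mapping_fn referenced in the config above is not part of this excerpt. A plausible sketch for a four-seat setup like this one is shown below; the "player_N" agent-id format is an assumption, not something the snippet confirms.

def policy_mapping_fn(agent_id):
    # Assumed agent-id format "player_0" .. "player_3"; each seat gets its own
    # policy slot, and only policy_1 is listed in policies_to_train.
    seat = int(agent_id.split("_")[-1])
    return "policy_{}".format(seat + 1)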
Example #7
def gen_trainer_from_params(params):
    # All ray environment set-up
    if not ray.is_initialized():
        ray.init(ignore_reinit_error=True, include_webui=False,
                 temp_dir=params['ray_params']['temp_dir'])
    register_env("overcooked_multi_agent", params['ray_params']['env_creator'])
    ModelCatalog.register_custom_model(
        params['ray_params']['custom_model_id'],
        params['ray_params']['custom_model_cls'])

    # Parse params
    training_params = params['training_params']
    environment_params = params['environment_params']
    evaluation_params = params['evaluation_params']
    multi_agent_params = params['environment_params']['multi_agent_params']
    agent_params = params["agent_params"] # only ml based agents

    env = OvercookedMultiAgent.from_config(environment_params)

    # Returns a properly formatted policy tuple to be passed into ppotrainer config
    def gen_policy(policy_type="ppo"):
        return (
            agent_params[policy_type].get("policy_cls"),
            env.observation_spaces[policy_type],
            env.action_space,
            agent_params[policy_type]["config"]
            )

    # Rllib compatible way of setting the directory we store agent checkpoints in
    logdir_prefix = "{0}_{1}_{2}".format(params["experiment_name"], params['training_params']['seed'], timestr)
    def custom_logger_creator(config):
        """Creates a Unified logger that stores results in <params['results_dir']>/<params["experiment_name"]>_<seed>_<timestamp>
        """
        results_dir = params['results_dir']
        if not os.path.exists(results_dir):
            try:
                os.makedirs(results_dir)
            except Exception as e:
                print("error creating custom logging dir. Falling back to default logdir {}".format(DEFAULT_RESULTS_DIR))
                results_dir = DEFAULT_RESULTS_DIR
        logdir = tempfile.mkdtemp(
            prefix=logdir_prefix, dir=results_dir)
        logger = UnifiedLogger(config, logdir, loggers=None)
        return logger

    if "outer_shape" not in environment_params:
        environment_params["outer_shape"] = None

    if "mdp_params" in environment_params:
        environment_params["eval_mdp_params"] = environment_params["mdp_params"]
    
    # Create an rllib-compatible multi-agent config based on params
    multi_agent_config = {}

    if multi_agent_params.get('bc_schedule'):
        agents_schedule = OvercookedMultiAgent.bc_schedule_to_agents_schedule(
            multi_agent_params['bc_schedule'])
    else:
        agents_schedule = multi_agent_params['agents_schedule']
    all_policies = OvercookedMultiAgent.agents_from_schedule(agents_schedule)
    ml_policies = [p for p in all_policies
                   if OvercookedMultiAgent.is_ml_agent(p)]

    multi_agent_config['policies'] = {
        policy: gen_policy(policy) for policy in ml_policies
    }

    def select_policy(agent_id):
        return OvercookedMultiAgent.agent_id_to_agent_name(agent_id)

    multi_agent_config['policy_mapping_fn'] = select_policy
    # Only the PPO policy is optimized; any BC policies stay frozen.
    multi_agent_config['policies_to_train'] = ['ppo']

    eval_function = get_rllib_eval_function(
        evaluation_params,
        environment_params['eval_mdp_params'],
        environment_params['env_params'],
        environment_params["outer_shape"],
        multi_agent_params["featurize_fns"],
        shuffle=multi_agent_params["shuffle_agents"],
    )

    trainer = PPOTrainer(env="overcooked_multi_agent", config={
        "multiagent": multi_agent_config,
        "callbacks": TrainingCallbacks,
        "custom_eval_function": eval_function,
        "env_config": environment_params,
        "eager": False,
        **training_params
    }, logger_creator=custom_logger_creator)
    return trainer
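For reference, this is the shape of the params dictionary that gen_trainer_from_params reads. Every key below corresponds to an access in the function above; the values are placeholders, not the project's real defaults.

# Placeholder values; only the key structure mirrors gen_trainer_from_params.
example_params = {
    "ray_params": {
        "temp_dir": "/tmp/ray_tmp",
        "env_creator": lambda env_config: OvercookedMultiAgent.from_config(env_config),
        "custom_model_id": "my_custom_model",  # hypothetical model id
        "custom_model_cls": None,              # an RLlib ModelV2 subclass in practice
    },
    "training_params": {"seed": 0, "num_workers": 2, "train_batch_size": 12000},
    "environment_params": {
        "mdp_params": {},                      # layout / MDP settings
        "env_params": {},
        "multi_agent_params": {
            "agents_schedule": [],             # or supply "bc_schedule"
            "featurize_fns": {},
            "shuffle_agents": True,
        },
    },
    "evaluation_params": {},
    "agent_params": {"ppo": {"policy_cls": None, "config": {}}},
    "experiment_name": "ppo_overcooked",
    "results_dir": "results",
}
trainer = gen_trainer_from_params(example_params)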
Example #8
    trainer.workers.foreach_worker(_init_pool)


IPDPolicy = PPOTFPolicy.with_updates(
    name="IPDPolicy",
    get_default_config=lambda: ipd_default_config,
    before_loss_init=setup_mixins_tnb,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin, AgentPoolMixin
    ]
)

IPDTrainer = PPOTrainer.with_updates(
    name="IPD",
    default_config=ipd_default_config,
    after_init=after_init,
    default_policy=IPDPolicy
)

if __name__ == '__main__':
    from ray import tune

    from toolbox import initialize_ray

    initialize_ray(test_mode=True, local_mode=False)
    env_name = "CartPole-v0"
    config = {
        "num_sgd_iter": 2,
        "env": IPDEnv,
        "env_config": {
            "env_name": env_name,