        EntropyCoeffSchedule, KLCoeffMixin, ValueNetworkMixin,
        FIMEmbeddingMixin
    ])


def get_policy_class(config):
    if config.get("use_pytorch") is True:
        raise NotImplementedError()
    else:
        return PPOFIMTFPolicy


PPOFIMTrainer = PPOTrainer.with_updates(
    name="PPOFIM",
    default_policy=PPOFIMTFPolicy,
    get_policy_class=get_policy_class,
)


def agent_to_vector(target_agent, probe_agent):
    # Step 1: sample a dataset from the given target_agent.
    dataset = []
    for i in range(20):
        dataset.append(target_agent.workers.local_worker().sample())
    # concat_samples is a static method of SampleBatch, so calling it on the
    # first batch simply merges the whole list into one batch.
    dataset = dataset[0].concat_samples(dataset)
    dataset.shuffle()
    # TODO: the samples may not be uniformly spread, since each batch comes
    #  from a single episode.

    # Step 2: compute the embedding for target_agent via probe_agent.
def validate_config(config):
    tmp_env = MultiAgentEnvWrapper(config["env_config"])
    # NOTE: num_agents is read from module scope (set in __main__ below).
    config["multiagent"]["policies"] = {
        "agent{}".format(i):
        (None, tmp_env.observation_space, tmp_env.action_space, {})
        for i in range(num_agents)
    }
    # Each agent id maps directly to the policy of the same name.
    config["multiagent"]["policy_mapping_fn"] = lambda x: x
    original_validate(config)


PPOESTrainer = PPOTrainer.with_updates(
    name="PPOES",
    default_config=ppo_es_default_config,
    after_train_result=run_evolution_strategies,
    validate_config=validate_config)

if __name__ == '__main__':
    env_name = "CartPole-v0"
    num_agents = 3
    config = {
        "num_sgd_iter": 2,
        "train_batch_size": 400,
        "update_steps": 1000,
        **get_marl_env_config(env_name, num_agents)
    }
    initialize_ray(test_mode=True, local_mode=True)
    train(PPOESTrainer, config,
class AlternateTraining(Trainable):
    def _setup(self, config):
        self.config = config
        self.env = config['env']
        agent_config = self.config
        adv_config = deepcopy(self.config)
        agent_config['multiagent']['policies_to_train'] = ['agent']
        adv_config['multiagent']['policies_to_train'] = ['adversary']

        self.agent_trainer = PPOTrainer(env=self.env, config=agent_config)
        self.adv_trainer = PPOTrainer(env=self.env, config=adv_config)

    def _train(self):
        # Improve the adversary policy.
        print("-- Adversary Training --")
        original_weight = self.adv_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]
        print(pretty_print(self.adv_trainer.train()))
        first_weight = self.adv_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]

        # Check that the adversary's weights were updated by training.
        assert original_weight != first_weight, \
            "The adversary's weights did not change after training."

        # Swap weights to synchronize the adversary policy across trainers.
        self.agent_trainer.set_weights(
            self.adv_trainer.get_weights(["adversary"]))

        # Improve the agent policy.
        print("-- Agent Training --")
        output = self.agent_trainer.train()

        # Check that the adversary was NOT trained while the agent trainer
        # was training.
        new_weight = self.agent_trainer.get_weights(
            ["adversary"])['adversary']['adversary/fc_1/kernel'][0, 0]
        assert first_weight == new_weight, \
            "The adversary's weights changed, but they should not have " \
            "been updated during agent training."

        # Swap weights to synchronize the agent policy across trainers.
        self.adv_trainer.set_weights(self.agent_trainer.get_weights(["agent"]))

        return output

    def _save(self, tmp_checkpoint_dir):
        return self.agent_trainer._save(tmp_checkpoint_dir)
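# A minimal launch sketch for the alternating-training Trainable above. The
# env name, policy dict, and stopping rule are illustrative assumptions, not
# part of the original script.
if __name__ == "__main__":
    import ray
    from ray import tune

    ray.init()
    tune.run(
        AlternateTraining,
        stop={"training_iteration": 10},  # hypothetical stopping rule
        config={
            "env": "adversarial_env",  # assumed: an env registered elsewhere
            "multiagent": {
                # assumed: "agent"/"adversary" policy specs defined elsewhere
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                # _setup() overrides policies_to_train per trainer
                "policies_to_train": ["agent", "adversary"],
            },
        },
    )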
    values = values[:len(values) - max(upper, 0)]
    values = np.pad(
        values,
        pad_width=[
            (-min(lower, 0), -min(0, upper)),
            *[(0, 0) for k in range(values.ndim - 1)],
        ],
        mode="constant",
    )
    return values


CCPPOPolicy = PPOTFPolicy.with_updates(
    name="CCPPOPolicy",
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_loss_init=setup_mixins,
    grad_stats_fn=central_vf_stats,
    mixins=[
        LearningRateSchedule,
        EntropyCoeffSchedule,
        KLCoeffMixin,
        CentralizedValueMixin,
    ],
)

register_trainable(
    "CcConcatenate",
    PPOTrainer.with_updates(
        name="CCPPOTrainer", get_policy_class=lambda c: CCPPOPolicy),
)
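# A self-contained illustration of the trim-and-pad idiom in the helper
# above: positive `lower`/`upper` trim entries from the front/back, negative
# ones zero-pad, so with opposite signs (e.g. lower=1, upper=-1) the result
# is a shifted, zero-filled copy of the same length. `shift_example` and the
# leading slice are assumptions for illustration; only the trailing trim and
# the pad appear in the fragment above.
import numpy as np

def shift_example(values, lower, upper):
    values = values[max(lower, 0):len(values) - max(upper, 0)]
    return np.pad(
        values,
        pad_width=[(-min(lower, 0), -min(0, upper)),
                   *[(0, 0) for _ in range(values.ndim - 1)]],
        mode="constant",
    )

print(shift_example(np.arange(1, 6), lower=1, upper=-1))
# -> [2 3 4 5 0]: drop one leading value, zero-pad one trailing slot.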
def before_train_step(trainer):
    policy = trainer.get_policy()
    if not policy.initialized_policies_pool:
        # Function to call on each worker (both remote and local workers).
        def init_novelty(worker):
            # Function to call on each policy within one worker.
            def _init_novelty_policy(policy, _):
                policy._lazy_initialize()

            worker.foreach_policy(_init_novelty_policy)

        trainer.workers.foreach_worker(init_novelty)


def validate_config(config):
    validate_config_original(config)
    assert config['model']['custom_model'] == "ActorDoubleCriticNetwork"
    config['model']['custom_options'] = {
        "use_novelty_value_network": config['use_novelty_value_network']
    }


TNBTrainer = PPOTrainer.with_updates(
    name="TNBPPO",
    validate_config=validate_config,
    make_policy_optimizer=choose_policy_optimizer,
    default_config=tnb_default_config,
    before_train_step=before_train_step,
    default_policy=TNBPolicy,
    get_policy_class=lambda _: TNBPolicy)
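# The before_train_step hook above relies on a standard RLlib broadcast
# pattern: WorkerSet.foreach_worker applies a function to the local and all
# remote rollout workers, and RolloutWorker.foreach_policy applies one to
# every policy a worker holds. A minimal sketch of the same pattern, setting
# a hypothetical attribute (not an RLlib API) on every policy:
def broadcast_to_policies(trainer, flag_value):
    def per_worker(worker):
        def per_policy(policy, policy_id):
            policy.custom_flag = flag_value  # illustrative attribute only

        worker.foreach_policy(per_policy)

    trainer.workers.foreach_worker(per_worker)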
"policies": { "policy_1": (None, obs_space, act_space, {}), "policy_2": (None, obs_space, act_space, {}), "policy_3": (None, obs_space, act_space, {}), "policy_4": (None, obs_space, act_space, {}), }, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["policy_1"], }, "model": { "custom_model": "yaniv_mask", "fcnet_hiddens": [512, 512], }, } ray.init(include_dashboard=False, local_mode=True) ppo = PPOTrainer(env="yaniv", config=config) ppo.restore(args.ppo_checkpoint) a3c = A3CTrainer(env="yaniv", config=config) a3c.restore(args.a3c_checkpoint) tourney = YanivTournament(env_config, trainers=[a3c], opponent="intermediate") # tourney.run_episode(True) # tourney.print_stats() tourney.run(args.eval_num) print("\n\nRESULTS:\n") tourney.print_stats()
def gen_trainer_from_params(params):
    # All Ray environment set-up.
    if not ray.is_initialized():
        ray.init(ignore_reinit_error=True,
                 include_webui=False,
                 temp_dir=params['ray_params']['temp_dir'])
    register_env("overcooked_multi_agent",
                 params['ray_params']['env_creator'])
    ModelCatalog.register_custom_model(
        params['ray_params']['custom_model_id'],
        params['ray_params']['custom_model_cls'])

    # Parse params.
    training_params = params['training_params']
    environment_params = params['environment_params']
    evaluation_params = params['evaluation_params']
    multi_agent_params = params['environment_params']['multi_agent_params']
    agent_params = params["agent_params"]  # ML-based agents only

    env = OvercookedMultiAgent.from_config(environment_params)

    # Returns a properly formatted policy tuple to be passed into the
    # PPOTrainer config.
    def gen_policy(policy_type="ppo"):
        return (
            agent_params[policy_type].get("policy_cls"),
            env.observation_spaces[policy_type],
            env.action_space,
            agent_params[policy_type]["config"]
        )

    # RLlib-compatible way of setting the directory we store agent
    # checkpoints in.
    logdir_prefix = "{0}_{1}_{2}".format(
        params["experiment_name"], params['training_params']['seed'], timestr)

    def custom_logger_creator(config):
        """Creates a UnifiedLogger that stores results in
        <params['results_dir']>/<params["experiment_name"]>_<seed>_<timestamp>
        """
        results_dir = params['results_dir']
        if not os.path.exists(results_dir):
            try:
                os.makedirs(results_dir)
            except Exception:
                print("Error creating custom logging dir. Falling back to "
                      "default logdir {}".format(DEFAULT_RESULTS_DIR))
                results_dir = DEFAULT_RESULTS_DIR
        logdir = tempfile.mkdtemp(prefix=logdir_prefix, dir=results_dir)
        logger = UnifiedLogger(config, logdir, loggers=None)
        return logger

    if "outer_shape" not in environment_params:
        environment_params["outer_shape"] = None

    if "mdp_params" in environment_params:
        environment_params["eval_mdp_params"] = \
            environment_params["mdp_params"]

    # Create an RLlib-compatible multi-agent config based on params.
    multi_agent_config = {}
    if multi_agent_params.get('bc_schedule'):
        agents_schedule = OvercookedMultiAgent.bc_schedule_to_agents_schedule(
            multi_agent_params['bc_schedule'])
    else:
        agents_schedule = multi_agent_params['agents_schedule']
    all_policies = OvercookedMultiAgent.agents_from_schedule(agents_schedule)
    ml_policies = [
        p for p in all_policies if OvercookedMultiAgent.is_ml_agent(p)
    ]

    multi_agent_config['policies'] = {
        policy: gen_policy(policy) for policy in ml_policies
    }

    def select_policy(agent_id):
        return OvercookedMultiAgent.agent_id_to_agent_name(agent_id)

    multi_agent_config['policy_mapping_fn'] = select_policy
    multi_agent_config['policies_to_train'] = 'ppo'

    eval_function = get_rllib_eval_function(
        evaluation_params,
        environment_params['eval_mdp_params'],
        environment_params['env_params'],
        environment_params["outer_shape"],
        multi_agent_params["featurize_fns"],
        shuffle=multi_agent_params["shuffle_agents"],
    )

    trainer = PPOTrainer(
        env="overcooked_multi_agent",
        config={
            "multiagent": multi_agent_config,
            "callbacks": TrainingCallbacks,
            "custom_eval_function": eval_function,
            "env_config": environment_params,
            "eager": False,
            **training_params
        },
        logger_creator=custom_logger_creator)
    return trainer
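# A minimal driver sketch for gen_trainer_from_params, assuming `params`
# has been assembled elsewhere by the experiment configuration code; the
# iteration count is an arbitrary illustrative value.
trainer = gen_trainer_from_params(params)
for i in range(10):
    result = trainer.train()
    print("iter {}: mean reward {}".format(
        i, result.get("episode_reward_mean")))
checkpoint_path = trainer.save()  # standard RLlib/Tune checkpointing
print("saved checkpoint to", checkpoint_path)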
    trainer.workers.foreach_worker(_init_pool)


IPDPolicy = PPOTFPolicy.with_updates(
    name="IPDPolicy",
    get_default_config=lambda: ipd_default_config,
    before_loss_init=setup_mixins_tnb,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin, AgentPoolMixin
    ]
)

IPDTrainer = PPOTrainer.with_updates(
    name="IPD",
    default_config=ipd_default_config,
    after_init=after_init,
    default_policy=IPDPolicy
)

if __name__ == '__main__':
    from ray import tune
    from toolbox import initialize_ray

    initialize_ray(test_mode=True, local_mode=False)
    env_name = "CartPole-v0"
    config = {
        "num_sgd_iter": 2,
        "env": IPDEnv,
        "env_config": {
            "env_name": env_name,