def get_agent_pair_trajs(self, a0, a1=None, num_games=100, game_length=None,
                         start_state_fn=None, display=False, info=True):
    """Evaluate agent pair on both indices, and return trajectories by index"""
    if a1 is None:
        ap = AgentPair(a0, a0, allow_duplicate_agents=True)
        trajs_0 = trajs_1 = self.evaluate_agent_pair(
            ap, num_games=num_games, game_length=game_length,
            start_state_fn=start_state_fn, display=display, info=info)
    else:
        trajs_0 = self.evaluate_agent_pair(
            AgentPair(a0, a1), num_games=num_games, game_length=game_length,
            start_state_fn=start_state_fn, display=display, info=info)
        trajs_1 = self.evaluate_agent_pair(
            AgentPair(a1, a0), num_games=num_games, game_length=game_length,
            start_state_fn=start_state_fn, display=display, info=info)
    return trajs_0, trajs_1
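# A minimal usage sketch for get_agent_pair_trajs (not part of the source).
# It assumes the current overcooked_ai API, in particular that
# AgentEvaluator.from_layout_name exists; the layout and agents are placeholders.
import numpy as np
from overcooked_ai_py.agents.agent import RandomAgent
from overcooked_ai_py.agents.benchmarking import AgentEvaluator

ae = AgentEvaluator.from_layout_name({"layout_name": "cramped_room"}, {"horizon": 400})
# Passing two distinct agents returns one trajectory set per index assignment
trajs_0, trajs_1 = ae.get_agent_pair_trajs(
    RandomAgent(all_actions=True), RandomAgent(all_actions=True), num_games=2)
print("index 0 mean return:", np.mean(trajs_0["ep_returns"]))
print("index 1 mean return:", np.mean(trajs_1["ep_returns"]))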
def evaluate_ppo_hm_and_bc(layout, ppo_hm_path, bc_test_path, num_rounds,
                           seeds, best=False, display=False):
    ppo_hm_performance = defaultdict(lambda: defaultdict(list))

    agent_bc_test, bc_params = get_bc_agent_from_saved(bc_test_path)
    del bc_params["data_params"]
    del bc_params["mdp_fn_params"]
    evaluator = AgentEvaluator(**bc_params)

    for seed in seeds:
        agent_ppo, _ = get_ppo_agent(ppo_hm_path, seed, best=best)

        # PPO_HM at index 0, BC_test at index 1
        ppo_and_bc = evaluator.evaluate_agent_pair(
            AgentPair(agent_ppo, agent_bc_test),
            num_games=num_rounds, display=display)
        avg_ppo_and_bc = np.mean(ppo_and_bc['ep_returns'])
        ppo_hm_performance[layout]["PPO_HM+BC_test_0"].append(avg_ppo_and_bc)

        # Same pairing with the agent indices swapped
        bc_and_ppo = evaluator.evaluate_agent_pair(
            AgentPair(agent_bc_test, agent_ppo),
            num_games=num_rounds, display=display)
        avg_bc_and_ppo = np.mean(bc_and_ppo['ep_returns'])
        ppo_hm_performance[layout]["PPO_HM+BC_test_1"].append(avg_bc_and_ppo)

    return ppo_hm_performance
def test_mdp_serialization(self):
    # Where to store serialized states -- will be overwritten each timestep
    dummy_path = os.path.join(TESTING_DATA_DIR, 'test_mdp_serialization', 'dummy.json')

    # Get starting seed and random agent pair
    seed = 47
    random_pair = AgentPair(RandomAgent(all_actions=True), RandomAgent(all_actions=True))

    # Run rollouts with different seeds until sparse reward is achieved
    sparse_reward = 0
    while sparse_reward <= 0:
        np.random.seed(seed)
        state = self.base_mdp.get_standard_start_state()
        for _ in range(1500):
            # Ensure serialization and deserialization are inverses
            reconstructed_state = OvercookedState.from_dict(
                load_from_json(save_as_json(state.to_dict(), dummy_path)))
            self.assertEqual(
                state, reconstructed_state,
                "\nState: \t\t\t{}\nReconstructed State: \t{}".format(
                    state, reconstructed_state))

            # Advance state
            joint_action, _ = zip(*random_pair.joint_action(state))
            state, infos = self.base_mdp.get_state_transition(state, joint_action)
            sparse_reward += sum(infos['sparse_reward_by_agent'])
        seed += 1
def test_scenario_1_s(self):
    # Smaller version of the corridor collisions scenario above,
    # to facilitate DRL training
    scenario_1_mdp = OvercookedGridworld.from_layout_name(
        'scenario1_s', start_order_list=['any'], cook_time=5)
    mlp = MediumLevelPlanner.from_pickle_or_compute(
        scenario_1_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    a0 = GreedyHumanModel(mlp)
    a1 = CoupledPlanningAgent(mlp)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((2, 1), s, Obj('onion', (2, 1))), P((4, 2), s)],
        {}, order_list=['onion'])
    env = OvercookedEnv.from_mdp(scenario_1_mdp, start_state_fn=lambda: start_state)
    trajectory, time_taken_hr, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)
    env.reset()

    print("\n" * 5)
    print("-" * 50)
    a0 = CoupledPlanningAgent(mlp)
    a1 = CoupledPlanningAgent(mlp)
    agent_pair = AgentPair(a0, a1)
    trajectory, time_taken_rr, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)

    print("H+R time taken: ", time_taken_hr)
    print("R+R time taken: ", time_taken_rr)
    self.assertGreater(time_taken_hr, time_taken_rr)
def evaluate_bc_models(bc_model_paths, num_rounds):
    """
    Evaluate BC models passed in over `num_rounds` rounds
    """
    best_bc_models_performance = {}

    # Evaluate the best BC models in self-play and cross-play on each layout
    for layout_name in bc_model_paths['train'].keys():
        print(layout_name)
        best_bc_models_performance[layout_name] = {}

        eval_trajs = eval_with_benchmarking_from_saved(
            num_rounds, bc_model_paths['train'][layout_name])
        best_bc_models_performance[layout_name]["BC_train+BC_train"] = \
            mean_and_std_err(eval_trajs['ep_returns'])

        eval_trajs = eval_with_benchmarking_from_saved(
            num_rounds, bc_model_paths['test'][layout_name])
        best_bc_models_performance[layout_name]["BC_test+BC_test"] = \
            mean_and_std_err(eval_trajs['ep_returns'])

        bc_train, bc_params_train = get_bc_agent_from_saved(bc_model_paths['train'][layout_name])
        bc_test, bc_params_test = get_bc_agent_from_saved(bc_model_paths['test'][layout_name])
        del bc_params_train["data_params"]
        del bc_params_test["data_params"]
        assert common_keys_equal(bc_params_train, bc_params_test)
        ae = AgentEvaluator(mdp_params=bc_params_train["mdp_params"],
                            env_params=bc_params_train["env_params"])

        train_and_test = ae.evaluate_agent_pair(AgentPair(bc_train, bc_test), num_games=num_rounds)
        best_bc_models_performance[layout_name]["BC_train+BC_test_0"] = \
            mean_and_std_err(train_and_test['ep_returns'])

        test_and_train = ae.evaluate_agent_pair(AgentPair(bc_test, bc_train), num_games=num_rounds)
        best_bc_models_performance[layout_name]["BC_train+BC_test_1"] = \
            mean_and_std_err(test_and_train['ep_returns'])

    return best_bc_models_performance
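# Hedged usage sketch for evaluate_bc_models (not from the source): the model
# directories below are hypothetical placeholders for saved BC models, keyed by
# layout name to mirror the bc_model_paths structure the function expects.
bc_model_paths = {
    "train": {"cramped_room": "data/bc_runs/train/cramped_room"},
    "test": {"cramped_room": "data/bc_runs/test/cramped_room"},
}
performance = evaluate_bc_models(bc_model_paths, num_rounds=20)
# Each entry holds (mean, standard error) of episode returns, e.g.:
# performance["cramped_room"]["BC_train+BC_test_0"]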
def match_ppo_with_other_agent(save_dir, other_agent, n=1, display=False):
    agent, agent_eval = get_ppo_agent(save_dir)
    ap0 = AgentPair(agent, other_agent)
    agent_eval.evaluate_agent_pair(ap0, display=display, num_games=n)

    # Repeat with the agent indices switched
    ap1 = AgentPair(other_agent, agent)
    agent_eval.evaluate_agent_pair(ap1, display=display, num_games=n)
def evaluate_one_optimal_one_greedy_human(self, num_games, h_idx=0, display=True):
    h = GreedyHumanModel(self.mlp)
    r = CoupledPlanningAgent(self.mlp)
    agent_pair = AgentPair(h, r) if h_idx == 0 else AgentPair(r, h)
    return self.evaluate_agent_pair(agent_pair, num_games=num_games, display=display)
def get_agent_pair_trajs(self, a0, a1=None, num_games=100, display=False):
    """Evaluate agent pair on both indices, and return trajectories by index"""
    if a1 is None:
        ap = AgentPair(a0, a0, allow_duplicate_agents=True)
        trajs_0 = trajs_1 = self.evaluate_agent_pair(ap, num_games=num_games, display=display)
    else:
        trajs_0 = self.evaluate_agent_pair(AgentPair(a0, a1), num_games=num_games, display=display)
        trajs_1 = self.evaluate_agent_pair(AgentPair(a1, a0), num_games=num_games, display=display)
    return trajs_0, trajs_1
def evaluate_optimal_pair(self, display=True, delivery_horizon=2):
    a0 = CoupledPlanningAgent(self.mlp, delivery_horizon=delivery_horizon)
    a1 = CoupledPlanningAgent(self.mlp, delivery_horizon=delivery_horizon)
    a0.mlp.env = self.env
    a1.mlp.env = self.env
    agent_pair = AgentPair(a0, a1)
    return self.evaluate_agent_pair(agent_pair, display=display)
def evaluate_human_model_pair(self, display=True, num_games=1):
    a0 = GreedyHumanModel(self.mlp)
    a1 = GreedyHumanModel(self.mlp)
    agent_pair = AgentPair(a0, a1)
    return self.evaluate_agent_pair(agent_pair, display=display, num_games=num_games)
def test_slowed_down_agent(self):
    def should_stop(step_num, stop_every_n_steps):
        # currently SlowedDownAgent always stops at its 2nd step
        return not bool((step_num - 1) % stop_every_n_steps)

    horizon = 100
    # NOTE: if stop_every_n_steps is 3 this would not work because of rounding
    # error (ok for practical purposes -- the agent would just skip a turn
    # later -- but it would fail the test below)
    for stop_every_n_steps in [2, 4]:
        slowdown_rate = 1 - 1 / stop_every_n_steps
        agent_pair = AgentPair(
            SlowedDownAgent(RandomAgent(), slowdown_rate),
            SlowedDownAgent(RandomAgent(), slowdown_rate)
        )
        skip_action_probs = SlowedDownAgent(RandomAgent()).skip_action[1]["action_probs"].tolist()

        env = OvercookedEnv.from_mdp(large_mdp, horizon=horizon)
        trajectory, time_taken, _, _ = env.run_agents(
            agent_pair, include_final_state=True, display=DISPLAY)

        for i, traj_step in enumerate(trajectory):
            s_t, a_t, r_t, done, info = traj_step
            if not done:
                agent_0_probs = info["agent_infos"][0]["action_probs"]
                agent_1_probs = info["agent_infos"][1]["action_probs"]
                if should_stop(i, stop_every_n_steps):
                    self.assertEqual(agent_0_probs.tolist(), skip_action_probs)
                    self.assertEqual(agent_1_probs.tolist(), skip_action_probs)
                else:
                    self.assertNotEqual(agent_0_probs.tolist(), skip_action_probs)
                    self.assertNotEqual(agent_1_probs.tolist(), skip_action_probs)
def test_scenario_1(self):
    # Myopic corridor collision
    #
    # X X X X X O X D X X X X X
    # X   ↓Ho             X   X
    # X     X X X X X X X ↓R  X
    # X                       X
    # X S X X X X X X X X P P X
    #
    # H on left with onion, further away from the tunnel entrance than R.
    # The optimal planner tells R to go first and that H will wait
    # for R to pass. H, however, starts going through the tunnel
    # and they get stuck. The H plan is a bit extreme (it would probably
    # realize that it should retrace its steps at some point)
    scenario_1_mdp = OvercookedGridworld.from_layout_name(
        'small_corridor', start_order_list=['any'], cook_time=5)
    mlp = MediumLevelPlanner.from_pickle_or_compute(
        scenario_1_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    a0 = GreedyHumanModel(mlp)
    a1 = CoupledPlanningAgent(mlp)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((2, 1), s, Obj('onion', (2, 1))), P((10, 2), s)],
        {}, order_list=['onion'])
    env = OvercookedEnv.from_mdp(scenario_1_mdp, start_state_fn=lambda: start_state)
    env.run_agents(agent_pair, include_final_state=True, display=DISPLAY)
def evaluate_one_optimal_one_random(self, num_games, display=True):
    a0 = CoupledPlanningAgent(self.mlp)
    a1 = RandomAgent()
    agent_pair = AgentPair(a0, a1)
    return self.evaluate_agent_pair(agent_pair, num_games=num_games, display=display)
def test_rollouts(self):
    ap = AgentPair(RandomAgent(), RandomAgent())
    trajs = self.agent_eval.evaluate_agent_pair(ap, num_games=5)
    try:
        AgentEvaluator.check_trajectories(trajs)
    except AssertionError as e:
        self.fail("Trajectories were not returned in standard format:\n{}".format(e))
def setUp(self):
    self.base_mdp = OvercookedGridworld.from_layout_name("cramped_room")
    self.mlp = MediumLevelPlanner.from_pickle_or_compute(
        self.base_mdp, NO_COUNTERS_PARAMS, force_compute=True)
    self.env = OvercookedEnv(self.base_mdp, **DEFAULT_ENV_PARAMS)
    self.rnd_agent_pair = AgentPair(GreedyHumanModel(self.mlp), GreedyHumanModel(self.mlp))
    np.random.seed(0)
def evaluate_random_pair(self, num_games=1, all_actions=True, display=False):
    agent_pair = AgentPair(RandomAgent(all_actions=all_actions),
                           RandomAgent(all_actions=all_actions))
    return self.evaluate_agent_pair(agent_pair, num_games=num_games, display=display)
def eval_with_benchmarking_from_model(n_games, model, bc_params, no_waits, display=False):
    bc_params = copy.deepcopy(bc_params)
    a0 = get_bc_agent_from_model(model, bc_params, no_waits)
    a1 = get_bc_agent_from_model(model, bc_params, no_waits)
    del bc_params["data_params"], bc_params["mdp_fn_params"]
    a_eval = AgentEvaluator(**bc_params)
    ap = AgentPair(a0, a1)
    trajectories = a_eval.evaluate_agent_pair(ap, num_games=n_games, display=display)
    return trajectories
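# Hedged usage sketch (not from the source): `model` and `bc_params` are
# assumed to come from a behavioral cloning training run elsewhere in
# human_aware_rl; they are stand-ins, not defined here.
trajectories = eval_with_benchmarking_from_model(
    n_games=10, model=model, bc_params=bc_params, no_waits=False)
print("mean BC self-play return:", np.mean(trajectories["ep_returns"]))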
def test_fixed_plan_agents(self):
    a0 = FixedPlanAgent([s, e, n, w])
    a1 = FixedPlanAgent([s, w, n, e])
    agent_pair = AgentPair(a0, a1)
    env = OvercookedEnv.from_mdp(large_mdp, horizon=10)
    trajectory, time_taken, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)
    end_state = trajectory[-1][0]
    self.assertEqual(time_taken, 10)
    self.assertEqual(env.mdp.get_standard_start_state().player_positions,
                     end_state.player_positions)
def test_one_coupled_one_fixed(self):
    a0 = CoupledPlanningAgent(self.mlp_large)
    a1 = FixedPlanAgent([s, e, n, w])
    agent_pair = AgentPair(a0, a1)
    env = OvercookedEnv.from_mdp(large_mdp, horizon=10)
    trajectory, time_taken, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)
    self.assertEqual(time_taken, 10)
def test_agents_on_open_map(self):
    scenario_2_mdp = OvercookedGridworld.from_layout_name('scenario2')
    mlam = MediumLevelActionManager.from_pickle_or_compute(
        scenario_2_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    agent_pairs = [
        AgentPair(GreedyHumanModel(mlam), GreedyHumanModel(mlam)),
        AgentPair(SimpleGreedyHumanModel(mlam), SimpleGreedyHumanModel(mlam)),
        AgentPair(RandomAgent(all_actions=True), RandomAgent(all_actions=True)),
        AgentPair(RandomAgent(all_actions=False), RandomAgent(all_actions=False))
    ]
    start_state = OvercookedState(
        [P((8, 1), s), P((1, 1), s)],
        {},
        all_orders=scenario_2_mdp.start_all_orders
    )
    for agent_pair in agent_pairs:
        env = OvercookedEnv.from_mdp(
            scenario_2_mdp, start_state_fn=lambda: start_state, horizon=100)
        trajectory, time_taken, _, _ = env.run_agents(
            agent_pair, include_final_state=True, display=DISPLAY)
def test_embedded_planning_agent(self):
    agent_evaluator = AgentEvaluator({"layout_name": "cramped_room"}, {"horizon": 100})
    other_agent = GreedyHumanModel(agent_evaluator.mlp)
    epa = EmbeddedPlanningAgent(other_agent, agent_evaluator.mlp,
                                agent_evaluator.env, delivery_horizon=1)
    ap = AgentPair(epa, other_agent)
    agent_evaluator.evaluate_agent_pair(ap, num_games=1, display=DISPLAY)
def repetative_runs(self, evaluator, num_games=10):
    trajectory_0 = evaluator.evaluate_human_model_pair(num_games=num_games, native_eval=True)
    trajectory_1 = evaluator.evaluate_human_model_pair(num_games=num_games, native_eval=True)

    h0 = GreedyHumanModel(evaluator.env.mlam)
    h1 = GreedyHumanModel(evaluator.env.mlam)
    ap_hh_2 = AgentPair(h0, h1)
    trajectory_2 = evaluator.evaluate_agent_pair(
        agent_pair=ap_hh_2, num_games=num_games, native_eval=True)

    h3 = GreedyHumanModel(evaluator.env.mlam)
    h4 = GreedyHumanModel(evaluator.env.mlam)
    ap_hh_3 = AgentPair(h3, h4)
    trajectory_3 = evaluator.evaluate_agent_pair(
        agent_pair=ap_hh_3, num_games=num_games, native_eval=True)
def evaluate_human_model_pair(self, num_games=1, display=False, native_eval=False):
    a0 = GreedyHumanModel(self.env.mlam)
    a1 = GreedyHumanModel(self.env.mlam)
    agent_pair = AgentPair(a0, a1)
    return self.evaluate_agent_pair(agent_pair, num_games=num_games,
                                    display=display, native_eval=native_eval)
def eval_pbt_over_seeds(pbt_agents, bc_agent, layout_name, num_rounds,
                        pbt_performance, agent_evaluator):
    ae = agent_evaluator
    for i in range(len(pbt_agents)):
        # PBT agent in self-play
        pbt_and_pbt = ae.evaluate_agent_pair(
            AgentPair(pbt_agents[i], pbt_agents[i], allow_duplicate_agents=True),
            num_games=num_rounds)
        avg_pbt_and_pbt = np.mean(pbt_and_pbt['ep_returns'])
        pbt_performance[layout_name]["PBT+PBT"].append(avg_pbt_and_pbt)

        # PBT at index 0, BC at index 1
        pbt_and_bc = ae.evaluate_agent_pair(AgentPair(pbt_agents[i], bc_agent),
                                            num_games=num_rounds)
        avg_pbt_and_bc = np.mean(pbt_and_bc['ep_returns'])
        pbt_performance[layout_name]["PBT+BC_0"].append(avg_pbt_and_bc)

        # Same pairing with the agent indices swapped
        bc_and_pbt = ae.evaluate_agent_pair(AgentPair(bc_agent, pbt_agents[i]),
                                            num_games=num_rounds)
        avg_bc_and_pbt = np.mean(bc_and_pbt['ep_returns'])
        pbt_performance[layout_name]["PBT+BC_1"].append(avg_bc_and_pbt)
    return pbt_performance
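# Sketch of how eval_pbt_over_seeds might be driven (an assumption, not from
# the source): `pbt_agents`, `bc_agent`, and `ae` stand in for agents and an
# AgentEvaluator produced elsewhere in the PBT experiment code. The defaultdict
# mirrors the append-based bookkeeping the function relies on.
from collections import defaultdict

pbt_performance = defaultdict(lambda: defaultdict(list))
pbt_performance = eval_pbt_over_seeds(
    pbt_agents, bc_agent, layout_name="simple", num_rounds=20,
    pbt_performance=pbt_performance, agent_evaluator=ae)
print(pbt_performance["simple"]["PBT+PBT"])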
def evaluate(eval_params, mdp_params, outer_shape, agent_0_policy, agent_1_policy,
             agent_0_featurize_fn=None, agent_1_featurize_fn=None, verbose=False):
    """
    Used to visualize rollouts of trained policies

    eval_params (dict): Contains configurations such as the rollout length,
        number of games, and whether to display rollouts
    mdp_params (dict): OvercookedMDP compatible configuration used to create
        the environment used for evaluation
    outer_shape (list): a list of 2 items specifying the outer shape of the
        evaluation layout
    agent_0_policy (rllib.Policy): Policy instance used to map states to
        action logits for agent 0
    agent_1_policy (rllib.Policy): Policy instance used to map states to
        action logits for agent 1
    agent_0_featurize_fn (func): Used to preprocess states for agent 0;
        defaults to lossless_state_encoding if None
    agent_1_featurize_fn (func): Used to preprocess states for agent 1;
        defaults to lossless_state_encoding if None
    """
    if verbose:
        print("eval mdp params", mdp_params)
    evaluator = get_base_ae(
        mdp_params,
        {"horizon": eval_params['ep_length'], "num_mdp": 1},
        outer_shape)

    # Override pre-processing functions with defaults if necessary
    agent_0_featurize_fn = agent_0_featurize_fn if agent_0_featurize_fn else evaluator.env.lossless_state_encoding_mdp
    agent_1_featurize_fn = agent_1_featurize_fn if agent_1_featurize_fn else evaluator.env.lossless_state_encoding_mdp

    # Wrap rllib policies in Overcooked agents to be compatible with the Evaluator code
    agent0 = RlLibAgent(agent_0_policy, agent_index=0, featurize_fn=agent_0_featurize_fn)
    agent1 = RlLibAgent(agent_1_policy, agent_index=1, featurize_fn=agent_1_featurize_fn)

    # Compute rollouts
    if 'store_dir' not in eval_params:
        eval_params['store_dir'] = None
    if 'display_phi' not in eval_params:
        eval_params['display_phi'] = False
    results = evaluator.evaluate_agent_pair(
        AgentPair(agent0, agent1),
        num_games=eval_params['num_games'],
        display=eval_params['display'],
        dir=eval_params['store_dir'],
        display_phi=eval_params['display_phi'],
        info=verbose)

    return results
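# Illustrative eval_params for the evaluate() above. Every key is read in the
# function body; the values are assumptions for the sketch.
eval_params = {
    "ep_length": 400,   # horizon of each evaluation rollout
    "num_games": 5,     # number of games per evaluation
    "display": False,   # whether to render rollouts
    # "store_dir" and "display_phi" are optional; evaluate() defaults them
    # to None and False respectively.
}
# With two trained rllib policies (hypothetical names):
# results = evaluate(eval_params, mdp_params, outer_shape, ppo_policy_0, ppo_policy_1)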
def test_two_coupled_agents(self):
    a0 = CoupledPlanningAgent(self.mlp_large)
    a1 = CoupledPlanningAgent(self.mlp_large)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((2, 2), n), P((2, 1), n)],
        {}, order_list=['any'])
    env = OvercookedEnv(large_mdp, start_state_fn=lambda: start_state)
    trajectory, time_taken, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)
    end_state = trajectory[-1][0]
    self.assertEqual(end_state.order_list, [])
def test_two_greedy_human_open_map(self):
    scenario_2_mdp = OvercookedGridworld.from_layout_name('scenario2')
    mlam = MediumLevelActionManager.from_pickle_or_compute(
        scenario_2_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    a0 = GreedyHumanModel(mlam)
    a1 = GreedyHumanModel(mlam)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((8, 1), s), P((1, 1), s)],
        {}, all_orders=scenario_2_mdp.start_all_orders)
    env = OvercookedEnv.from_mdp(
        scenario_2_mdp, start_state_fn=lambda: start_state, horizon=100)
    trajectory, time_taken, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)
def test_one_coupled_one_greedy_human(self):
    # Even though in the first ~10 timesteps it seems like agent 1 is wasting time,
    # it turns out that this is actually not suboptimal as the true bottleneck is
    # going to be agent 0 later on (when it goes to get the 3rd onion)
    a0 = GreedyHumanModel(self.mlp_large)
    a1 = CoupledPlanningAgent(self.mlp_large)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((2, 1), s), P((1, 1), s)],
        {}, order_list=['onion'])
    env = OvercookedEnv(large_mdp, start_state_fn=lambda: start_state)
    trajectory, time_taken, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)
    end_state = trajectory[-1][0]
    self.assertEqual(end_state.order_list, [])
def test_two_greedy_human_open_map(self):
    scenario_2_mdp = OvercookedGridworld.from_layout_name(
        'scenario2', start_order_list=['any'], cook_time=5)
    mlp = MediumLevelPlanner.from_pickle_or_compute(
        scenario_2_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    a0 = GreedyHumanModel(mlp)
    a1 = GreedyHumanModel(mlp)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((8, 1), s), P((1, 1), s)],
        {}, order_list=['onion'])
    env = OvercookedEnv.from_mdp(
        scenario_2_mdp, start_state_fn=lambda: start_state, horizon=100)
    trajectory, time_taken, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)
    end_state = trajectory[-1][0]
    self.assertEqual(len(end_state.order_list), 0)
def evaluate(eval_params, mdp_params, outer_shape, policies, featurize_fns):
    """
    Used to visualize rollouts of trained policies

    eval_params (dict): Contains configurations such as the rollout length,
        number of games, and whether to display rollouts
    mdp_params (dict): OvercookedMDP compatible configuration used to create
        the environment used for evaluation
    outer_shape (list): a list of 2 items specifying the outer shape of the
        evaluation layout
    policies (list(rllib.Policy or str(non_ml_agent_name))): Policy instances
        used to map states to action logits for the agents, or non-ML agent names
    featurize_fns (list(func)): Used to preprocess states for the agents;
        defaults to lossless_state_encoding if None. Only used for entries of
        `policies` that are rllib policies.
    """
    assert len(policies) == len(featurize_fns), \
        "featurize_fns needs to have the same length as policies"
    evaluator = get_base_ae(
        mdp_params,
        {"horizon": eval_params['ep_length'],
         "num_mdp": 1,
         "mlam_params": eval_params.get("mlam_params")},
        outer_shape)

    agents = []
    # Wrap rllib policies in Overcooked agents to be compatible with the Evaluator code;
    # non-ML agents are created by name instead
    for i, (policy, featurize_fn) in enumerate(zip(policies, featurize_fns)):
        if isinstance(policy, RllibPolicy):
            agent = RlLibAgent(policy, agent_index=i,
                               featurize_fn=featurize_fn or evaluator.env.lossless_state_encoding_mdp)
        else:
            agent = OvercookedMultiAgent.create_non_ml_agent(
                policy, eval_params["non_ml_agents_params"], evaluator.env)
            agent.set_agent_index(i)
        agents.append(agent)

    # Compute rollouts
    if 'store_dir' not in eval_params:
        eval_params['store_dir'] = None
    if 'display_phi' not in eval_params:
        eval_params['display_phi'] = False
    results = evaluator.evaluate_agent_pair(
        AgentPair(*agents),
        num_games=eval_params['num_games'],
        display=eval_params['display'],
        dir=eval_params['store_dir'],
        display_phi=eval_params['display_phi'],
        native_eval=True)

    return results
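# Illustrative inputs for the list-based evaluate() above (keys taken from the
# function body; the values and the "GreedyHumanModel" policy name are
# assumptions for the sketch). A non-ML agent is requested by name, so its
# featurize_fns entry can be None.
eval_params = {
    "ep_length": 400,
    "num_games": 3,
    "display": False,
    "mlam_params": None,        # forwarded to the base AgentEvaluator
    "non_ml_agents_params": {}, # required when a policy is given by name
    # "store_dir" and "display_phi" are optional; evaluate() defaults them.
}
# One trained rllib policy at index 0, one scripted agent by name at index 1:
# results = evaluate(eval_params, mdp_params, outer_shape,
#                    policies=[trained_policy, "GreedyHumanModel"],
#                    featurize_fns=[None, None])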