def get_rollouts(self, agent_pair, num_games, display=False, dir=None, final_state=False, display_phi=False,
                     display_until=np.Inf, metadata_fn=None, metadata_info_fn=None, info=True):
        """
        Simulate `num_games` rollouts with the current agent_pair and return the processed
        trajectories.

        Extra information is returned so that trajectories can be converted to any required
        format (baselines, stable_baselines, etc.)

        metadata_fn returns some metadata information computed at the end of each trajectory based on
        some of the trajectory data.

        NOTE: this is the standard trajectories format used throughout the codebase
        """
        trajectories = { k:[] for k in self.DEFAULT_TRAJ_KEYS }
        metadata_fn = (lambda x: {}) if metadata_fn is None else metadata_fn
        metadata_info_fn = (lambda x: "") if metadata_info_fn is None else metadata_info_fn
        range_iterator = tqdm.trange(num_games, desc="", leave=True) if info else range(num_games)
        for i in range_iterator:
            agent_pair.set_mdp(self.mdp)

            rollout_info = self.run_agents(agent_pair, display=display, dir=dir, include_final_state=final_state,
                                           display_phi=display_phi, display_until=display_until)
            trajectory, time_taken, tot_rews_sparse, _tot_rews_shaped = rollout_info
            obs, actions, rews, dones, infos = trajectory.T[0], trajectory.T[1], trajectory.T[2], trajectory.T[3], trajectory.T[4]
            trajectories["ep_states"].append(obs)
            trajectories["ep_actions"].append(actions)
            trajectories["ep_rewards"].append(rews)
            trajectories["ep_dones"].append(dones)
            trajectories["ep_infos"].append(infos)
            trajectories["ep_returns"].append(tot_rews_sparse)
            trajectories["ep_lengths"].append(time_taken)
            trajectories["mdp_params"].append(self.mdp.mdp_params)
            trajectories["env_params"].append(self.env_params)
            trajectories["metadatas"].append(metadata_fn(rollout_info))

            # No need to regenerate the MDP when generating a series of rollouts on the same MDP:
            # regen_mdp=False keeps the same layout and starting positions
            # (with regen_mdp=True, resetting would call mdp_gen_fn to generate another layout & starting positions)
            self.reset(regen_mdp=False)
            agent_pair.reset()

            if info:
                mu, se = mean_and_std_err(trajectories["ep_returns"])
                description = "Avg rew: {:.2f} (std: {:.2f}, se: {:.2f}); avg len: {:.2f}; ".format(
                    mu, np.std(trajectories["ep_returns"]), se, np.mean(trajectories["ep_lengths"]))
                description += metadata_info_fn(trajectories["metadatas"])
                range_iterator.set_description(description)
                range_iterator.refresh()

        # Converting to numpy arrays
        trajectories = {k: np.array(v) for k, v in trajectories.items()}

        # Merging all metadata dictionaries; assumes the same keys are present in all of them
        trajectories["metadatas"] = append_dictionaries(trajectories["metadatas"])

        # TODO: should probably transfer check methods over to Env class
        from overcooked_ai_py.agents.benchmarking import AgentEvaluator
        AgentEvaluator.check_trajectories(trajectories)
        return trajectories
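
A quick usage sketch of the trajectory format documented above. It assumes the standard overcooked_ai_py import paths and an already-constructed OvercookedEnv instance named `env`; both are assumptions for illustration, not part of the snippet above.

import numpy as np
from overcooked_ai_py.agents.agent import AgentPair, RandomAgent

# `env` is a hypothetical OvercookedEnv exposing get_rollouts()
agent_pair = AgentPair(RandomAgent(all_actions=True), RandomAgent(all_actions=True))
trajectories = env.get_rollouts(agent_pair, num_games=3, info=False)

# Every value is a numpy array with one entry per game
print(np.mean(trajectories["ep_returns"]), np.mean(trajectories["ep_lengths"]))
print(trajectories["metadatas"])  # per-game metadata dicts merged by key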
    def test_human_model_pair(self):
        trajs = self.agent_eval.evaluate_human_model_pair()
        try:
            AgentEvaluator.check_trajectories(trajs)
        except AssertionError as e:
            self.fail("Trajectories were not returned in standard format:\n{}".
                      format(e))
Example #3
    def test_rollouts(self):
        ap = AgentPair(RandomAgent(), RandomAgent())
        trajs = self.agent_eval.evaluate_agent_pair(ap, num_games=5)
        try:
            AgentEvaluator.check_trajectories(trajs)
        except AssertionError as e:
            self.fail("Trajectories were not returned in standard format:\n{}".format(e))
Example #4
def evaluate_ppo_hm_and_bc(layout,
                           ppo_hm_path,
                           bc_test_path,
                           num_rounds,
                           seeds,
                           best=False,
                           display=False):
    ppo_hm_performance = defaultdict(lambda: defaultdict(list))

    agent_bc_test, bc_params = get_bc_agent_from_saved(bc_test_path)
    del bc_params["data_params"]
    del bc_params["mdp_fn_params"]
    evaluator = AgentEvaluator(**bc_params)

    for seed in seeds:
        agent_ppo, _ = get_ppo_agent(ppo_hm_path, seed, best=best)

        ppo_and_bc = evaluator.evaluate_agent_pair(AgentPair(
            agent_ppo, agent_bc_test),
                                                   num_games=num_rounds,
                                                   display=display)
        avg_ppo_and_bc = np.mean(ppo_and_bc['ep_returns'])
        ppo_hm_performance[layout]["PPO_HM+BC_test_0"].append(avg_ppo_and_bc)

        bc_and_ppo = evaluator.evaluate_agent_pair(AgentPair(
            agent_bc_test, agent_ppo),
                                                   num_games=num_rounds,
                                                   display=display)
        avg_bc_and_ppo = np.mean(bc_and_ppo['ep_returns'])
        ppo_hm_performance[layout]["PPO_HM+BC_test_1"].append(avg_bc_and_ppo)

    return ppo_hm_performance
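
A minimal usage sketch for the helper above; the layout name, model paths, and seeds are placeholders rather than values taken from this repository.

# Hypothetical arguments, for illustration only
performance = evaluate_ppo_hm_and_bc(layout="simple",
                                     ppo_hm_path="ppo_hm_simple",
                                     bc_test_path="bc_test_simple",
                                     num_rounds=20,
                                     seeds=[0, 10, 20],
                                     best=True)
print(performance["simple"]["PPO_HM+BC_test_0"])  # one average return per seed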
    def test_schelling_s(self):
        # Schelling failure scenario
        #
        # X X S P-D X X
        # X     ↓R    X
        # X     X     X
        # O           O
        # X     X     X
        # X     ↓H    X
        # X X D P-S X X
        #
        # The layout is completely symmetric. Both pots need 2 more onions,
        # and only one delivery is left. The best thing to do would be to split up
        # towards the different pots, but the agents must somehow coordinate on the
        # first step. In the H+R case, this doesn't work, but in the R+R it does.
        #
        eva = AgentEvaluator(
            {
                "layout_name": "schelling_s",
                "start_order_list": ["any", "any"],
                "cook_time": 5
            },
            force_compute=force_compute)
        start_state = eva.env.mdp.get_standard_start_state()
        start_state.objects = {
            (2, 0): Obj('soup', (2, 0), ('onion', 2, 5)),
            (2, 4): Obj('soup', (2, 4), ('onion', 2, 5))
        }
        eva.start_state = start_state
        self.compare_times(eva, h_idx=1)
Example #6
def evaluate_bc_models(bc_model_paths, num_rounds):
    """
    Evaluate BC models passed in over `num_rounds` rounds
    """
    best_bc_models_performance = {}

    # Evaluate best
    for layout_name in bc_model_paths['train'].keys():
        print(layout_name)
        best_bc_models_performance[layout_name] = {}
        
        eval_trajs = eval_with_benchmarking_from_saved(num_rounds, bc_model_paths['train'][layout_name])
        best_bc_models_performance[layout_name]["BC_train+BC_train"] = mean_and_std_err(eval_trajs['ep_returns'])
        
        eval_trajs = eval_with_benchmarking_from_saved(num_rounds, bc_model_paths['test'][layout_name])
        best_bc_models_performance[layout_name]["BC_test+BC_test"] = mean_and_std_err(eval_trajs['ep_returns'])

        bc_train, bc_params_train = get_bc_agent_from_saved(bc_model_paths['train'][layout_name])
        bc_test, bc_params_test = get_bc_agent_from_saved(bc_model_paths['test'][layout_name])
        del bc_params_train["data_params"]
        del bc_params_test["data_params"]
        assert common_keys_equal(bc_params_train, bc_params_test)
        ae = AgentEvaluator(mdp_params=bc_params_train["mdp_params"], env_params=bc_params_train["env_params"])
        
        train_and_test = ae.evaluate_agent_pair(AgentPair(bc_train, bc_test), num_games=num_rounds)
        best_bc_models_performance[layout_name]["BC_train+BC_test_0"] = mean_and_std_err(train_and_test['ep_returns'])

        test_and_train = ae.evaluate_agent_pair(AgentPair(bc_test, bc_train), num_games=num_rounds)
        best_bc_models_performance[layout_name]["BC_train+BC_test_1"] = mean_and_std_err(test_and_train['ep_returns'])
    
    return best_bc_models_performance
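
A hedged sketch of how evaluate_bc_models might be called. The nested dict shape follows the lookups above ('train'/'test' keyed by layout name), but the layout and saved-model names are placeholders.

# Hypothetical saved-model names keyed by layout
bc_model_paths = {
    "train": {"simple": "bc_train_simple"},
    "test": {"simple": "bc_test_simple"},
}
results = evaluate_bc_models(bc_model_paths, num_rounds=20)
# results["simple"]["BC_train+BC_test_0"] is a (mean, standard error) pair from mean_and_std_err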
class TestAgentEvaluator(unittest.TestCase):
    def setUp(self):
        self.agent_eval = AgentEvaluator({"layout_name": "cramped_room"},
                                         {"horizon": 100})

    def test_human_model_pair(self):
        trajs = self.agent_eval.evaluate_human_model_pair()
        try:
            AgentEvaluator.check_trajectories(trajs)
        except AssertionError as e:
            self.fail("Trajectories were not returned in standard format:\n{}".
                      format(e))

    def test_rollouts(self):
        ap = AgentPair(RandomAgent(), RandomAgent())
        trajs = self.agent_eval.evaluate_agent_pair(ap, num_games=5)
        try:
            AgentEvaluator.check_trajectories(trajs)
        except AssertionError as e:
            self.fail("Trajectories were not returned in standard format:\n{}".
                      format(e))

    def test_mlp_computation(self):
        try:
            self.agent_eval.mlp
        except Exception as e:
            self.fail("Failed to compute MediumLevelPlanner:\n{}".format(e))
Example #8
def get_base_ae(mdp_params,
                env_params,
                outer_shape=None,
                mdp_params_schedule_fn=None):
    """
    mdp_params: one set of fixed mdp parameters used by the environment
    env_params: env parameters (horizon, etc)
    outer_shape: outer shape of the environment
    mdp_params_schedule_fn: the schedule for varying mdp params

    return: the base agent evaluator
    """
    assert mdp_params is None or mdp_params_schedule_fn is None, "at least one of mdp_params and mdp_params_schedule_fn must be None"
    if type(mdp_params) == dict and "layout_name" in mdp_params:
        ae = AgentEvaluator.from_layout_name(mdp_params=mdp_params,
                                             env_params=env_params)
    elif 'num_mdp' in env_params:
        if np.isinf(env_params['num_mdp']):
            ae = AgentEvaluator.from_mdp_params_infinite(
                mdp_params=mdp_params,
                env_params=env_params,
                outer_shape=outer_shape,
                mdp_params_schedule_fn=mdp_params_schedule_fn)
        else:
            ae = AgentEvaluator.from_mdp_params_finite(
                mdp_params=mdp_params,
                env_params=env_params,
                outer_shape=outer_shape,
                mdp_params_schedule_fn=mdp_params_schedule_fn)
    else:
        # should not reach this case
        raise NotImplementedError()
    return ae
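
Two illustrative calls covering the branches above, one for a fixed layout and one for generated layouts. The generation-parameter dict and outer shape in the second call are hypothetical placeholders, not values from this repository.

# Fixed layout: mdp_params carries a layout_name, so from_layout_name is used
ae_fixed = get_base_ae(mdp_params={"layout_name": "cramped_room"},
                       env_params={"horizon": 400})

# Generated layouts: mdp_params holds generation parameters (no layout_name) and
# num_mdp selects finite vs. infinite generation
mdp_gen_params = {}  # hypothetical placeholder; fill with real layout-generation params
ae_generated = get_base_ae(mdp_params=mdp_gen_params,
                           env_params={"horizon": 400, "num_mdp": np.inf},
                           outer_shape=(5, 4))  # hypothetical outer shape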
Example #9
    def get_data(self):
        if self.save_trajectory:
            file_path = os.path.join(TRAJECTORIES_DIR,
                                     self._create_trajectory_filename())
            traj_dict = self._get_trajectory_dict()
            AgentEvaluator.save_traj_as_json(traj_dict, file_path)
            self.trajectory = []
        return super(OvercookedGame, self).get_data()
Example #10
    def test_mdp_dynamics(self):
        traj_path = os.path.join(TESTING_DATA_DIR, 'test_mdp_dynamics', 'expected.json')

        # NOTE: uncomment the following line to recompute trajectories if MDP dynamics were deliberately updated
        # generate_serialized_trajectory(self.base_mdp, traj_path)

        test_trajectory = AgentEvaluator.load_traj_from_json(traj_path)
        AgentEvaluator.check_trajectories(test_trajectory, from_json=True)
def eval_with_benchmarking_from_model(n_games, model, bc_params, no_waits, display=False):
    bc_params = copy.deepcopy(bc_params)
    a0 = get_bc_agent_from_model(model, bc_params, no_waits)
    a1 = get_bc_agent_from_model(model, bc_params, no_waits)
    del bc_params["data_params"], bc_params["mdp_fn_params"]
    a_eval = AgentEvaluator(**bc_params)
    ap = AgentPair(a0, a1)
    trajectories = a_eval.evaluate_agent_pair(ap, num_games=n_games, display=display)
    return trajectories
    def test_embedded_planning_agent(self):
        agent_evaluator = AgentEvaluator({"layout_name": "cramped_room"},
                                         {"horizon": 100})
        other_agent = GreedyHumanModel(agent_evaluator.mlp)
        epa = EmbeddedPlanningAgent(other_agent,
                                    agent_evaluator.mlp,
                                    agent_evaluator.env,
                                    delivery_horizon=1)
        ap = AgentPair(epa, other_agent)
        agent_evaluator.evaluate_agent_pair(ap, num_games=1, display=DISPLAY)
Example #13
    def test_from_mdp_params_variable_across(self):
        for mdp_gen_params in self.mdp_gen_params_lst:
            ae0 = AgentEvaluator.from_mdp_params_infinite(
                mdp_params=mdp_gen_params,
                env_params={"horizon": 400, "num_mdp": np.inf},
                outer_shape=self.outer_shape)
            ae1 = AgentEvaluator.from_mdp_params_infinite(
                mdp_params=mdp_gen_params,
                env_params={"horizon": 400, "num_mdp": np.inf},
                outer_shape=self.outer_shape)
            self.assertFalse(
                ae0.env.mdp == ae1.env.mdp,
                "2 randomly generated layouts across 2 evaluators are the same, which is wrong")
Example #14
    def test_common_mdp_jsons(self):
        traj_test_json_paths = iterate_over_files_in_dir(
            "../common_tests/trajectory_tests/")
        for test_json_path in traj_test_json_paths:
            test_trajectory = AgentEvaluator.load_traj_from_json(
                test_json_path)
            try:
                AgentEvaluator.check_trajectories(test_trajectory)
            except AssertionError as e:
                self.fail("File {} failed with error:\n{}".format(
                    test_json_path, e))
def generate_serialized_trajectory(mdp, save_path):
    # Saving trajectory for dynamics consistency test
    seed = 0
    sparse_reward = 0
    while sparse_reward <= 0:
        np.random.seed(seed)
        ae = AgentEvaluator.from_mdp(mdp, env_params={"horizon": 1500})
        test_trajs = ae.evaluate_random_pair(all_actions=True, num_games=1)
        sparse_reward = np.mean(test_trajs["ep_returns"])
        seed += 1

    AgentEvaluator.save_traj_as_json(test_trajs, save_path)
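
For reference, a short sketch of how the serialized trajectory could be regenerated when the MDP dynamics are deliberately changed, mirroring the path used in test_mdp_dynamics above; the layout name is an assumption.

# Hypothetical layout under test
mdp = OvercookedGridworld.from_layout_name("cramped_room")
traj_path = os.path.join(TESTING_DATA_DIR, 'test_mdp_dynamics', 'expected.json')
generate_serialized_trajectory(mdp, traj_path)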
    def test_from_mdp_params_variable_finite(self):
        for mdp_gen_params in self.mdp_gen_params_lst:
            ae = AgentEvaluator.from_mdp_params_finite(
                mdp_params=mdp_gen_params,
                env_params={
                    "horizon": 400,
                    "num_mdp": 2
                },
                outer_shape=self.outer_shape)
            mdp_0 = ae.env.mdp.copy()
            seen = [mdp_0]
            for _ in range(20):
                ae.env.reset(regen_mdp=True)
                mdp_i = ae.env.mdp
                if len(seen) == 1:
                    if mdp_i != seen[0]:
                        seen.append(mdp_i.copy())
                elif len(seen) == 2:
                    mdp_0, mdp_1 = seen
                    self.assertTrue(
                        mdp_i == mdp_0 or mdp_i == mdp_1,
                        "more than 2 MDPs were created; the finite generator failed to stay within its set")
                else:
                    self.assertTrue(False,
                                    "theoretically unreachable statement")
    def test_from_mdp_lst_biased(self):
        mdp_lst = [
            OvercookedGridworld.from_layout_name(name)
            for name in self.layout_name_short_lst
        ]
        ae = AgentEvaluator.from_mdp_lst(mdp_lst=mdp_lst,
                                         env_params={"horizon": 400},
                                         sampling_freq=self.biased)
        counts = {}

        for _ in range(self.num_reset):
            ae.env.reset(regen_mdp=True)
            if ae.env.mdp.layout_name in counts:
                counts[ae.env.mdp.layout_name] += 1
            else:
                counts[ae.env.mdp.layout_name] = 1

        # construct the ground truth
        gt = {
            self.layout_name_short_lst[i]: self.biased[i]
            for i in range(len(self.layout_name_short_lst))
        }

        for k, v in counts.items():
            self.assertAlmostEqual(gt[k], v / self.num_reset, 2,
                                   "empirical frequency off by more than 2 decimal places for " + k)
    def test_scenario_2(self):
        # Simple asymmetric advantages scenario
        #
        # X X X X X O X X X X
        # S                 O
        # D         ↑H  ↑R  X
        # X X X X X X P=X X X
        #
        # Worse version of scenario 3 (probably to be deleted)
        #
        # The optimal thing to do for the human is to go and get a dish
        # so that by the time it gets back to the pot, the soup will be ready.
        # However, H goes to get the onion, and so does R initially, as it
        # assumes H will go and get the dish. Once H has picked up the onion,
        # R realizes that it should go and get the dish itself. This leads to
        # a couple of lost timesteps (the difference could be made bigger with a
        # more carefully designed map)

        start_state = OvercookedState(
            [P((5, 2), n), P((7, 2), n)],
            {(6, 3): Obj('soup', (6, 3), ('onion', 2, 0))},
            order_list=['onion'])

        mdp_params = {"layout_name": "scenario2", "cook_time": 5}
        env_params = {"start_state_fn": lambda: start_state}
        eva = AgentEvaluator(mdp_params, env_params)
        self.compare_times(eva)
Example #19
def evaluate_pbt_for_layout(layout_name,
                            num_rounds,
                            pbt_performance,
                            pbt_model_paths,
                            best_test_bc_models,
                            seeds,
                            best=False):
    bc_agent, bc_params = get_bc_agent_from_saved(
        model_name=best_test_bc_models[layout_name])
    ae = AgentEvaluator(mdp_params=bc_params["mdp_params"],
                        env_params=bc_params["env_params"])

    pbt_save_dir = PBT_DATA_DIR + pbt_model_paths[layout_name] + "/"
    pbt_config = load_dict_from_txt(pbt_save_dir + "config")
    assert common_keys_equal(
        bc_params["mdp_params"], pbt_config["mdp_params"]
    ), "MDP params differed between PBT and BC model training"
    assert common_keys_equal(
        bc_params["env_params"], pbt_config["env_params"]
    ), "Env params differed between PBT and BC model training"

    pbt_agents = [
        get_pbt_agent_from_config(pbt_save_dir,
                                  pbt_config["sim_threads"],
                                  seed=seed,
                                  agent_idx=0,
                                  best=best) for seed in seeds
    ]
    eval_pbt_over_seeds(pbt_agents, bc_agent, layout_name, num_rounds,
                        pbt_performance, ae)
    return pbt_performance
Example #20
    def test_scenario_3_yes_counter(self):
        # Asymmetric advantage scenario
        #
        # X X X X X O X X X X
        # S           X X P X
        # X         ↑H      X
        # D   X X X X!X X   X
        # X           →R    O
        # X X X X X X X X X X
        #
        # This test allows only (5, 3) to be used as a counter

        mdp_params = {"layout_name": "scenario3"}
        mdp = OvercookedGridworld.from_layout_name(**mdp_params)
        start_state = mdp.get_standard_start_state()

        valid_counters = [(5, 3)]
        one_counter_params = {
            'start_orientations': False,
            'wait_allowed': False,
            'counter_goals': valid_counters,
            'counter_drop': valid_counters,
            'counter_pickup': [],
            'same_motion_goals': True
        }

        env_params = {"start_state_fn": lambda: start_state, "horizon": 1000}
        eva = AgentEvaluator.from_layout_name(mdp_params,
                                              env_params,
                                              mlam_params=one_counter_params,
                                              force_compute=force_compute)

        self.repetative_runs(eva)
    def test_scenario_4(self):
        # Yet another asymmetric advantage scenario
        #
        # X X X X X O X X X X
        # S             X P=X
        # D         ↑H      X
        # X X X X X X X X   X
        # X X X X X X →R    O
        # X X X X X X X X X X
        #
        # Similar to scenario 3, just keeping for reference for now.
        # In this case we only have human suboptimality, and R
        # assuming H optimality does not end up being a problem
        mdp_params = {"layout_name": "scenario4", "cook_time": 5}
        mdp = OvercookedGridworld.from_layout_name(**mdp_params)

        start_state = mdp.get_standard_start_state()
        start_state.objects = {(8, 1): Obj('soup', (8, 1), ('onion', 2, 5))}
        start_state.order_list = ['onion']

        env_params = {"start_state_fn": lambda: start_state, "horizon": 1000}
        eva = AgentEvaluator(mdp_params,
                             env_params,
                             force_compute=force_compute)
        self.compare_times(eva)
Example #22
    def test_trajectory_visualization(self):
        # We don't have a good way to check the slider automatically, so this mostly tests
        # basic things like the number of output images and whether the method raises errors.
        traj_path = os.path.join(TESTING_DATA_DIR, 'test_state_visualizer',
                                 'test_trajectory.json')
        test_trajectory = AgentEvaluator.load_traj_from_json(traj_path)
        expected_images_num = len(test_trajectory["ep_states"][0])
        assert expected_images_num == 10
        action_probs = [
            [RandomAgent(all_actions=True).action(state)[1]["action_probs"]] *
            2 for state in test_trajectory["ep_states"][0]
        ]

        result_img_directory_path = StateVisualizer(
        ).display_rendered_trajectory(test_trajectory,
                                      action_probs=action_probs,
                                      ipython_display=False)
        self.assertEqual(get_file_count(result_img_directory_path),
                         expected_images_num)

        custom_img_directory_path = generate_temporary_file_path(
            prefix="overcooked_visualized_trajectory", extension="")
        self.assertNotEqual(custom_img_directory_path,
                            result_img_directory_path)
        result_img_directory_path = StateVisualizer(
        ).display_rendered_trajectory(
            test_trajectory,
            img_directory_path=custom_img_directory_path,
            ipython_display=False)
        self.assertEqual(custom_img_directory_path, result_img_directory_path)
        self.assertEqual(get_file_count(result_img_directory_path),
                         expected_images_num)
Example #23
    def setUp(self):
        Recipe.configure({})
        trajectory_path = os.path.join(TESTING_DATA_DIR, "test_visualizations",
                                       "trajectory.json")
        events_path = os.path.join(TESTING_DATA_DIR, "test_visualizations",
                                   "expected_extracted_events.json")
        self.trajectory1 = AgentEvaluator.load_traj_from_json(trajectory_path)
        self.extracted_events1 = load_from_json(events_path)
    def test_unidentifiable_s(self):
        # Same as above, but with a smaller layout to facilitate DRL training

        eva = AgentEvaluator(
            {
                "layout_name": "asymmetric_advantages",
                "start_order_list": ["any", "any"],
                "cook_time": 5
            },
            force_compute=force_compute)
        start_state = eva.env.mdp.get_standard_start_state()
        start_state.objects = {
            (4, 2): Obj('soup', (4, 2), ('onion', 2, 0)),
            (4, 3): Obj('soup', (4, 3), ('onion', 3, 5))
        }
        eva.start_state = start_state
        self.compare_times(eva, h_idx=0)
    def test_from_mdp(self):
        for layout_name in self.layout_name_lst:
            original_mdp = OvercookedGridworld.from_layout_name(layout_name)
            ae = AgentEvaluator.from_mdp(mdp=original_mdp,
                                         env_params={"horizon": 400})
            ae_mdp = ae.env.mdp
            self.assertEqual(
                original_mdp, ae_mdp, "mdp with name " + layout_name +
                " experienced an inconsistency")
Example #26
def df_traj_to_python_joint_traj(traj_df, complete_traj=True):
    if len(traj_df) == 0:
        return None

    datapoint = traj_df.iloc[0]
    python_layout_name = JS_LAYOUT_NAME_TO_PYTHON_NAME[datapoint['layout_name']]
    # python_layout_name = datapoint['layout_name']
    agent_evaluator = AgentEvaluator(
        mdp_params={"layout_name": python_layout_name}, 
        env_params={"horizon": 1250}
    )
    mdp = agent_evaluator.env.mdp
    env = agent_evaluator.env

    overcooked_states = [json_state_to_python_state(mdp, s) for s in traj_df.state]
    overcooked_actions = [json_joint_action_to_python_action(joint_action) for joint_action in traj_df.joint_action]
    overcooked_rewards = list(traj_df.reward_norm)

    assert sum(overcooked_rewards) == datapoint.reward_norm_total, "Rewards didn't sum to the cumulative reward; the trajectory df is probably corrupted or incomplete"

    trajectories = {
        "ep_observations": [overcooked_states],
        "ep_actions": [overcooked_actions],
        "ep_rewards": [overcooked_rewards], # Individual (dense) reward values

        "ep_dones": [[False] * len(overcooked_states)], # Individual done values

        "ep_returns": [sum(overcooked_rewards)], # Sum of dense rewards across each episode
        "ep_returns_sparse": [sum(overcooked_rewards)], # Sum of sparse rewards across each episode
        "ep_lengths": [len(overcooked_states)], # Lengths of each episode
        "mdp_params": [mdp.mdp_params],
        "env_params": [env.env_params]
    }
    trajectories = {k: np.array(v) if k != "ep_actions" else v for k, v in trajectories.items() }

    if complete_traj:
        agent_evaluator.check_trajectories(trajectories)

    traj_metadata = {
        'worker_id': datapoint['workerid_num'],
        'round_num': datapoint['round_num'],
        'mdp': agent_evaluator.env.mdp
    }
    return trajectories, traj_metadata
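
A hedged usage sketch. The pickle path and the assumption that the dataframe holds exactly one logged game with the columns referenced above ('layout_name', 'state', 'joint_action', 'reward_norm', ...) are illustrative only.

import pandas as pd

traj_df = pd.read_pickle("one_human_game.pickle")  # hypothetical single-game dataframe
result = df_traj_to_python_joint_traj(traj_df, complete_traj=False)
if result is not None:
    trajectories, traj_metadata = result
    print(trajectories["ep_returns"][0], traj_metadata["worker_id"])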
Example #27
    def test_from_mdp_params_variable_infinite_specified(self):
        for mdp_gen_params in self.mdp_gen_params_lst:
            ae = AgentEvaluator.from_mdp_params_infinite(
                mdp_params=mdp_gen_params,
                env_params={"horizon": 400, "num_mdp": np.inf},
                outer_shape=self.outer_shape)
            mdp_0 = ae.env.mdp.copy()
            for _ in range(5):
                ae.env.reset(regen_mdp=True)
                mdp_1 = ae.env.mdp
                self.assertFalse(
                    mdp_0 == mdp_1,
                    "with infinite layout generator and regen_mdp=True, the 2 layouts should not be the same")
Example #28
    def _get_trajectory_dict(self):
        trajectories = {k: [] for k in self.env.DEFAULT_TRAJ_KEYS}
        trajectory = np.array(self.trajectory)
        obs, actions, rews, dones, infos = trajectory.T[0], trajectory.T[1], trajectory.T[2], trajectory.T[3], trajectory.T[4]
        infos[-1] = self.env._add_episode_info(infos[-1])
        trajectories["ep_states"].append(obs)
        trajectories["ep_actions"].append(actions)
        trajectories["ep_rewards"].append(rews)
        trajectories["ep_dones"].append(dones)
        trajectories["ep_infos"].append(infos)
        trajectories["ep_returns"].append(self.score)
        trajectories["ep_lengths"].append(self.env.state.timestep)
        trajectories["mdp_params"].append(self.env.mdp.mdp_params)
        trajectories["env_params"].append(self.env.env_params)
        trajectories["metadatas"].append({})
        trajectories = {k: np.array(v) for k, v in trajectories.items()}

        AgentEvaluator.check_trajectories(trajectories)
        return trajectories
Example #29
    def test_save_load(self):
        # Train a quick self play agent for 2 iterations
        ex.run(
            config_updates={
                # Please feel free to modify the parameters below
                "results_dir": self.temp_results_dir,
                "experiment_name": "save_load_test",
                "layout_name": "cramped_room",
                "num_workers": 1,
                "train_batch_size": 800,
                "sgd_minibatch_size": 800,
                "num_training_iters": 2,
                "evaluation_interval": 10,
                "entropy_coeff_start": 0.0,
                "entropy_coeff_end": 0.0,
                "use_phi": False,
                "evaluation_display": False,
                "verbose": False
            },
            options={'--loglevel': 'ERROR'})

        # Kill all ray processes to ensure loading works in a vacuum
        ray.shutdown()

        # Where the agent is stored (this is somewhat hardcoded; it would be nice if it were more easily obtainable)
        load_path = os.path.join(
            glob.glob(os.path.join(self.temp_results_dir,
                                   "save_load_test*"))[0], 'checkpoint_2',
            'checkpoint-2')

        # Load a dummy state
        mdp = OvercookedGridworld.from_layout_name("cramped_room")
        state = mdp.get_standard_start_state()

        # Ensure simple single-agent loading works
        agent_0 = load_agent(load_path)
        agent_0.reset()

        agent_1 = load_agent(load_path)
        agent_1.reset()

        # Ensure forward pass of policy network still works
        _, _ = agent_0.action(state)
        _, _ = agent_1.action(state)

        # Now let's load an agent pair and evaluate it
        agent_pair = load_agent_pair(load_path)
        ae = AgentEvaluator.from_layout_name(
            mdp_params={"layout_name": "cramped_room"},
            env_params={"horizon": 400})

        # We assume no runtime errors => success, no performance consistency check for now
        ae.evaluate_agent_pair(agent_pair, 1, info=False)
def df_traj_to_python_joint_traj(traj_df, complete_traj=True):
    if len(traj_df) == 0:
        return None

    datapoint = traj_df.iloc[0]
    layout_name = datapoint['layout_name']
    agent_evaluator = AgentEvaluator.from_layout_name(
        mdp_params={"layout_name": layout_name},
        env_params={
            "horizon": 1250
        }  # Defining the horizon of the mdp of origin of the trajectories
    )
    mdp = agent_evaluator.env.mdp
    env = agent_evaluator.env

    overcooked_states = [json_state_to_python_state(s) for s in traj_df.state]
    overcooked_actions = [
        json_joint_action_to_python_action(joint_action)
        for joint_action in traj_df.joint_action
    ]
    overcooked_rewards = list(traj_df.reward)

    assert sum(
        overcooked_rewards
    ) == datapoint.score_total, "Rewards didn't sum to the cumulative score; the trajectory df is probably corrupted or incomplete"

    trajectories = {
        "ep_observations": [overcooked_states],
        "ep_actions": [overcooked_actions],
        "ep_rewards": [overcooked_rewards],  # Individual (dense) reward values
        "ep_dones":
        [[False] * len(overcooked_states)],  # Individual done values
        "ep_infos": [{}] * len(overcooked_states),
        "ep_returns":
        [sum(overcooked_rewards)],  # Sum of dense rewards across each episode
        "ep_lengths": [len(overcooked_states)],  # Lengths of each episode
        "mdp_params": [mdp.mdp_params],
        "env_params": [env.env_params],
        "metadatas": {
            'player_0_id': [datapoint['player_0_id']],
            'player_1_id': [datapoint['player_1_id']],
            'env': [agent_evaluator.env]
        }
    }
    trajectories = {
        k: np.array(v) if k not in ["ep_actions", "metadatas"] else v
        for k, v in trajectories.items()
    }

    if complete_traj:
        agent_evaluator.check_trajectories(trajectories)
    return trajectories