def test_multiple_mdp_env(self):
    mdp0 = OvercookedGridworld.from_layout_name("cramped_room")
    mdp1 = OvercookedGridworld.from_layout_name("counter_circuit")
    mdp_fn = lambda: np.random.choice([mdp0, mdp1])

    # Default env
    env = OvercookedEnv(mdp_fn, horizon=100)
    env.get_rollouts(self.rnd_agent_pair, 5)
def mdp_gen_fn_from_dict(mdp_params={},
                         mdp_choices=None,
                         size_bounds=((4, 7), (4, 7)),
                         prop_empty=(0.6, 0.8),
                         prop_feats=(0.1, 0.2),
                         display=False):
    """
    Returns an MDP generator with the passed-in properties.

    mdp_choices: selects an MDP randomly among the choices
    OR (if mdp_choices is None)
    size_bounds: (min_layout_size, max_layout_size)
    prop_empty: (min, max) proportion of empty space in the generated layout
    prop_feats: (min, max) proportion of counters with features on them
    """
    if "layout_name" in mdp_params.keys() and mdp_params["layout_name"] is not None:
        mdp = OvercookedGridworld.from_layout_name(**mdp_params)
        mdp_generator_fn = lambda: mdp
    elif mdp_choices is not None:
        assert type(mdp_choices) is list
        # If list of MDPs, randomly choose one at each reset
        mdp_sizes = []
        for mdp_name in mdp_choices:
            mdp = OvercookedGridworld.from_layout_name(mdp_name, **mdp_params)
            mdp_sizes.append([mdp.width, mdp.height])
        widths, heights = np.array(mdp_sizes).T
        min_padding = max(widths), max(heights)

        def mdp_generator_fn():
            chosen_mdp = np.random.choice(mdp_choices)
            mdp = OvercookedGridworld.from_layout_name(chosen_mdp, **mdp_params)
            lg = LayoutGenerator(min_padding, mdp_params)
            mdp_padded = lg.padded_mdp(mdp)
            return mdp_padded
    else:
        min_padding = (size_bounds[0][1], size_bounds[1][1])
        layout_generator = LayoutGenerator(min_padding, mdp_params)
        mdp_generator_fn = lambda: layout_generator.make_disjoint_sets_layout(
            inner_shape=[rnd_int_uniform(*dim) for dim in size_bounds],
            prop_empty=rnd_uniform(*prop_empty),
            prop_features=rnd_uniform(*prop_feats),
            display=display)
    return mdp_generator_fn
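# A minimal usage sketch for the generator above, covering its three modes.
# The layout names and bounds below are illustrative assumptions, not
# fixtures from this codebase.
def example_mdp_gen_fn_usage():
    # Mode 1: fixed layout -- every call returns the same MDP.
    fixed_fn = mdp_gen_fn_from_dict(mdp_params={"layout_name": "cramped_room"})

    # Mode 2: random choice among named layouts, padded to a common shape.
    choice_fn = mdp_gen_fn_from_dict(
        mdp_choices=["cramped_room", "asymmetric_advantages"])

    # Mode 3: fully procedural layouts within the given bounds.
    random_fn = mdp_gen_fn_from_dict(size_bounds=((4, 6), (4, 6)),
                                     prop_empty=(0.6, 0.8))

    for fn in (fixed_fn, choice_fn, random_fn):
        mdp = fn()  # a fresh (possibly different) MDP on each call
        print(mdp.width, mdp.height)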
def get_overcooked_obj_attr(attr, env=None, mdp=None, env_params=None, mdp_params=None):
    """
    Returns an Overcooked object attribute based on its name; used mostly to
    get state-processing (encoding) functions and gym spaces.

    When it receives a string, it parses it to get the attribute; the format
    is "env"/"mdp" + "." + attribute name, e.g. "env.lossless_state_encoding_mdp".
    It also supports dicts, where strings in the values are replaced with
    object attributes. When it receives a method/function, it returns the
    original method; this obviously only applies when attr is not a str/dict.
    """
    attr_type = type(attr)
    if attr_type is str:
        name = attr
        [obj_name, attr_name] = name.split(".")
        if obj_name == "mdp":
            if not mdp:
                if env:
                    mdp = env.mdp
                else:
                    mdp = OvercookedGridworld(**mdp_params)
            attr = getattr(mdp, attr_name)
        elif obj_name == "env":
            if not env:
                if not mdp:
                    mdp = OvercookedGridworld(**mdp_params)
                env_params = only_valid_named_args(env_params, OvercookedEnv.from_mdp)
                env = OvercookedEnv.from_mdp(mdp, **env_params)
            attr = getattr(env, attr_name)
        # not tested or used anywhere yet
        # elif obj_name in kwargs:
        #     attr = getattr(kwargs[obj_name], attr_name)
        else:
            raise ValueError("Unsupported obj attr string " + name)
    elif attr_type is dict:
        attr = {
            k: get_overcooked_obj_attr(v, env=env, mdp=mdp,
                                       env_params=env_params,
                                       mdp_params=mdp_params)
            for k, v in attr.items()
        }
    # not tested or used anywhere yet
    # elif attr_type in [list, tuple]:
    #     attr = attr_type(get_overcooked_obj_attr(elem, env=env, mdp=mdp,
    #                                              env_params=env_params,
    #                                              mdp_params=mdp_params)
    #                      for elem in attr)
    return attr
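# Usage sketch for get_overcooked_obj_attr. The layout name and attribute
# specs below are illustrative; they mirror the ones exercised in the tests.
def example_get_overcooked_obj_attr():
    mdp = OvercookedGridworld.from_layout_name("cramped_room")
    env = OvercookedEnv.from_mdp(mdp, horizon=100)

    # An "env.<attr>"/"mdp.<attr>" string resolves to the bound attribute.
    encode_fn = get_overcooked_obj_attr("env.lossless_state_encoding_mdp", env=env)
    assert encode_fn == env.lossless_state_encoding_mdp

    # Dict values are resolved recursively; keys are preserved.
    fns = get_overcooked_obj_attr(
        {"orders": "mdp.multi_hot_orders_encoding",
         "dense": "env.featurize_state_mdp"},
        env=env)
    return fns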
def __init__(self,
             mdp_params,
             env_params={},
             mdp_fn_params=None,
             force_compute=False,
             mlp_params=NO_COUNTERS_PARAMS,
             debug=False):
    """
    mdp_params (dict): params for creation of an OvercookedGridworld instance
                       through the `from_layout_name` method
    env_params (dict): params for creation of an OvercookedEnv
    mdp_fn_params (dict): params to set up random MDP generation
    force_compute (bool): whether to re-compute the MediumLevelPlanner even
                          if a matching file is found
    mlp_params (dict): params for the MediumLevelPlanner
    """
    assert type(mdp_params) is dict, "mdp_params must be a dictionary"

    if mdp_fn_params is None:
        self.variable_mdp = False
        self.mdp_fn = lambda: OvercookedGridworld.from_layout_name(**mdp_params)
    else:
        self.variable_mdp = True
        self.mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_params, **mdp_fn_params)

    self.env = OvercookedEnv(self.mdp_fn, **env_params)
    self.force_compute = force_compute
    self.debug = debug
    self.mlp_params = mlp_params
    self._mlp = None
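# A sketch of the two construction modes handled by __init__ above: a fixed
# named layout versus randomized MDP generation. Parameter values are
# illustrative assumptions.
def example_agent_evaluator_modes():
    # Fixed MDP: every episode uses the same named layout.
    fixed_eva = AgentEvaluator(mdp_params={"layout_name": "cramped_room"},
                               env_params={"horizon": 400})
    assert not fixed_eva.variable_mdp

    # Variable MDP: mdp_fn_params is forwarded to
    # LayoutGenerator.mdp_gen_fn_from_dict, so resets can regenerate layouts.
    variable_eva = AgentEvaluator(mdp_params={},
                                  env_params={"horizon": 400},
                                  mdp_fn_params={"size_bounds": ((4, 6), (4, 6)),
                                                 "prop_empty": (0.6, 0.8)})
    assert variable_eva.variable_mdp
    return fixed_eva, variable_eva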
def evaluate_layout_loss_for_pbt_models(pbt_model_paths,
                                        layout_name,
                                        trajs,
                                        eps,
                                        seeds,
                                        best=True):
    layout_losses = defaultdict(dict)

    pbt_save_dir = PBT_DATA_DIR + pbt_model_paths[layout_name] + "/"
    pbt_config = load_dict_from_txt(pbt_save_dir + "config")

    for seed in seeds:
        reset_tf()
        agent_pbt = get_pbt_agent_from_config(pbt_save_dir,
                                              pbt_config["sim_threads"],
                                              seed=seed,
                                              agent_idx=0,
                                              best=best)
        agent_pbt.action_probs = True
        agent_pbt.set_mdp(
            OvercookedGridworld.from_layout_name(**pbt_config["mdp_params"]))

        losses, accuracies = get_trajs_losses_for_model(trajs, agent_pbt, eps)
        layout_losses["{}_seed{}".format(layout_name, seed)]['losses'] = losses
        layout_losses["{}_seed{}".format(layout_name, seed)]['accuracies'] = accuracies
    return layout_losses
def get_bc_agent_from_model(model, bc_params, no_waits=False):
    mdp = OvercookedGridworld.from_layout_name(**bc_params["mdp_params"])
    mlp = MediumLevelPlanner.from_pickle_or_compute(mdp,
                                                    NO_COUNTERS_PARAMS,
                                                    force_compute=False)

    def encoded_state_policy(observations, include_waits=True, stochastic=False):
        action_probs_n = model.action_probability(observations)

        if not include_waits:
            # Re-assign so the renormalized probabilities are actually used
            # downstream (previously they were stored in a variable that was
            # never returned)
            action_probs_n = ImitationAgentFromPolicy.remove_indices_and_renormalize(
                action_probs_n, [Action.ACTION_TO_INDEX[Direction.STAY]])

        if stochastic:
            return [np.random.choice(len(action_probs_n[i]), p=action_probs_n[i])
                    for i in range(len(action_probs_n))]
        return action_probs_n

    def state_policy(mdp_states, agent_indices, include_waits, stochastic=False):
        # encode_fn = lambda s: mdp.preprocess_observation(s)
        encode_fn = lambda s: mdp.featurize_state(s, mlp)

        obs = []
        for agent_idx, s in zip(agent_indices, mdp_states):
            ob = encode_fn(s)[agent_idx]
            obs.append(ob)
        obs = np.array(obs)
        action_probs = encoded_state_policy(obs, include_waits, stochastic)
        return action_probs

    return ImitationAgentFromPolicy(state_policy,
                                    encoded_state_policy,
                                    no_waits=no_waits,
                                    mlp=mlp)
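# Rollout sketch for the imitation agent returned above. Assumes `model` and
# `bc_params` come from the usual BC training pipeline, and relies on the
# AgentEvaluator/AgentPair API shown elsewhere in this file.
def example_bc_agent_rollout(model, bc_params, num_games=1):
    a0 = get_bc_agent_from_model(model, bc_params)
    a1 = get_bc_agent_from_model(model, bc_params)
    eva = AgentEvaluator(bc_params["mdp_params"], bc_params["env_params"])
    # The agents need the concrete MDP before they can featurize states
    a0.set_mdp(eva.env.mdp)
    a1.set_mdp(eva.env.mdp)
    return eva.evaluate_agent_pair(AgentPair(a0, a1), num_games)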
def _check_trajectories_dynamics(trajectories):
    for idx in range(len(trajectories["ep_observations"])):
        states, actions, rewards = (trajectories["ep_observations"][idx],
                                    trajectories["ep_actions"][idx],
                                    trajectories["ep_rewards"][idx])
        mdp_params, env_params = (trajectories["mdp_params"][idx],
                                  trajectories["env_params"][idx])

        assert len(states) == len(actions) == len(rewards), \
            "# states {}\t# actions {}\t# rewards {}".format(
                len(states), len(actions), len(rewards))

        # Checking that actions would give rise to same behaviour in current MDP
        simulation_env = OvercookedEnv(
            OvercookedGridworld.from_layout_name(**mdp_params), **env_params)

        for i in range(len(states) - 1):
            curr_state = states[i]
            simulation_env.state = curr_state

            next_state, reward, done, info = simulation_env.step(actions[i])

            assert states[i + 1] == next_state, \
                "States differed (expected vs actual): {}".format(
                    simulation_env.display_states(states[i + 1], next_state))
            assert rewards[i] == reward, "{} \t {}".format(rewards[i], reward)
def activate(self):
    super(OvercookedGame, self).activate()

    # Sanity check at start of each game
    if not self.npc_players.union(self.human_players) == set(self.players):
        raise ValueError("Inconsistent State")

    self.curr_layout = self.layouts.pop()
    # Store the mdp on self so the potential-function code below can see it
    self.mdp = OvercookedGridworld.from_layout_name(self.curr_layout,
                                                    **self.mdp_params)
    self.env = OvercookedEnv.from_mdp(self.mdp)
    if self.show_potential:
        self.mp = MotionPlanner.from_pickle_or_compute(
            self.mdp, counter_goals=NO_COUNTERS_PARAMS)
        self.phi = self.mdp.potential_function(self.state, self.mp, gamma=0.99)
    self.start_time = time()
    self.curr_tick = 0
    self.score = 0
    self.threads = []
    for npc_policy in self.npc_policies:
        self.npc_policies[npc_policy].reset()
        self.npc_state_queues[npc_policy].put(self.state)
        t = Thread(target=self.npc_policy_consumer, args=(npc_policy,))
        self.threads.append(t)
        t.start()
def test_scenario_4(self):
    # Yet another asymmetric advantage scenario
    #
    # X X X X X O X X X X
    # S         X     P=X
    # D     ↑H  X
    # X X X X X X X X X
    # X X X X X X →R   O
    # X X X X X X X X X X
    #
    # Similar to scenario 3, just keeping it for reference for now.
    # In this case we only have human suboptimality, and R assuming
    # H optimality does not end up being a problem
    mdp_params = {"layout_name": "scenario4", "cook_time": 5}
    mdp = OvercookedGridworld.from_layout_name(**mdp_params)
    start_state = mdp.get_standard_start_state()
    start_state.objects = {(8, 1): Obj('soup', (8, 1), ('onion', 2, 5))}
    start_state.order_list = ['onion']
    env_params = {"start_state_fn": lambda: start_state, "horizon": 1000}
    eva = AgentEvaluator(mdp_params, env_params, force_compute=force_compute)
    self.compare_times(eva)
def test_one_player_env(self):
    mdp = OvercookedGridworld.from_layout_name("cramped_room_single")
    env = OvercookedEnv(mdp, horizon=12)
    a0 = FixedPlanAgent([stay, w, w, e, e, n, e, interact, w, n, interact])
    ag = AgentGroup(a0)
    env.run_agents(ag, display=False)
    self.assertEqual(env.state.players_pos_and_or, (((2, 1), (0, -1)),))
def test_get_encoding_function(self):
    mdp = OvercookedGridworld.from_layout_name("cramped_room")
    mdp_params = mdp.mdp_params
    env_params = {"horizon": 100}
    env = OvercookedEnv.from_mdp(mdp, **env_params)
    state = mdp.get_standard_start_state()

    example_encoding_fns_names = ["mdp.multi_hot_orders_encoding",
                                  "env.featurize_state_mdp",
                                  "env.lossless_state_encoding_mdp"]
    example_encoding_fns = [mdp.multi_hot_orders_encoding,
                            env.featurize_state_mdp,
                            env.lossless_state_encoding_mdp]

    for encoding_fn_name, encoding_fn in zip(example_encoding_fns_names,
                                             example_encoding_fns):
        encoding_fn_from_name = get_encoding_function(encoding_fn_name, env=env)
        self.assertEqual(encoding_fn_from_name, encoding_fn)
        if encoding_fn_name.split(".")[0] == "mdp":
            encoding_fn_from_name = get_encoding_function(encoding_fn_name, mdp=mdp)
            self.assertEqual(encoding_fn_from_name, encoding_fn)
            encoding_fn_from_name = get_encoding_function(encoding_fn_name,
                                                          mdp_params=mdp_params)
            # compare names as a new instance of the mdp is created
            self.assertEqual(encoding_fn_from_name.__name__, encoding_fn.__name__)
        else:
            encoding_fn_from_name = get_encoding_function(encoding_fn_name,
                                                          env_params=env_params,
                                                          mdp_params=mdp_params)
            # compare names as a new instance of the env is created
            self.assertEqual(encoding_fn_from_name.__name__, encoding_fn.__name__)

    expected_encoded_state_dict = {str(i): fn(state)
                                   for i, fn in enumerate(example_encoding_fns)}
    actual_encoded_state_dict = get_encoding_function(
        {str(i): fn_name for i, fn_name in enumerate(example_encoding_fns_names)},
        env=env)(state)
    self.assertEqual(expected_encoded_state_dict.keys(),
                     actual_encoded_state_dict.keys())
    for k in expected_encoded_state_dict.keys():
        self.assertTrue(np.array_equal(expected_encoded_state_dict[k],
                                       actual_encoded_state_dict[k]))
def test_file_constructor(self):
    mdp = OvercookedGridworld.from_layout_name('corridor')
    expected_start_state = OvercookedState(
        [PlayerState((3, 1), Direction.NORTH),
         PlayerState((10, 1), Direction.NORTH)], {},
        all_orders=[{"ingredients": ["onion", "onion", "onion"]}])
    actual_start_state = mdp.get_standard_start_state()
    self.assertEqual(actual_start_state, expected_start_state,
                     '\n' + str(actual_start_state) + '\n' + str(expected_start_state))
def test_scenario_3_yes_counter(self):
    # Asymmetric advantage scenario
    #
    # X X X X X O X X X X
    # S           X X P X
    # X     ↑H          X
    # D   X X X X!X X   X
    # X     →R          O
    # X X X X X X X X X X
    #
    # Unlike the no-counter variant, this test allows (5, 3) as the
    # only usable counter
    mdp_params = {"layout_name": "scenario3"}
    mdp = OvercookedGridworld.from_layout_name(**mdp_params)
    start_state = mdp.get_standard_start_state()

    valid_counters = [(5, 3)]
    one_counter_params = {
        'start_orientations': False,
        'wait_allowed': False,
        'counter_goals': valid_counters,
        'counter_drop': valid_counters,
        'counter_pickup': [],
        'same_motion_goals': True
    }

    env_params = {"start_state_fn": lambda: start_state, "horizon": 1000}
    eva = AgentEvaluator.from_layout_name(mdp_params, env_params,
                                          mlam_params=one_counter_params,
                                          force_compute=force_compute)
    self.repetative_runs(eva)
def test_from_mdp_lst_biased(self):
    mdp_lst = [OvercookedGridworld.from_layout_name(name)
               for name in self.layout_name_short_lst]
    ae = AgentEvaluator.from_mdp_lst(mdp_lst=mdp_lst,
                                     env_params={"horizon": 400},
                                     sampling_freq=self.biased)
    counts = {}
    for _ in range(self.num_reset):
        ae.env.reset(regen_mdp=True)
        if ae.env.mdp.layout_name in counts:
            counts[ae.env.mdp.layout_name] += 1
        else:
            counts[ae.env.mdp.layout_name] = 1

    # construct the ground truth
    gt = {self.layout_name_short_lst[i]: self.biased[i]
          for i in range(len(self.layout_name_short_lst))}

    for k, v in counts.items():
        self.assertAlmostEqual(gt[k], v / self.num_reset, 2,
                               "more than 2 places off for " + k)
def test_scenario_1(self):
    # Myopic corridor collision
    #
    # X X X X X O X D X X X X X
    # X   ↓Ho     X           X
    # X   X X X X X X X ↓R    X
    # X                       X
    # X S X X X X X X X X P P X
    #
    # H on the left with an onion, further away from the tunnel entrance
    # than R. The optimal planner tells R to go first and that H will wait
    # for R to pass. H however starts going through the tunnel and they get
    # stuck. The H plan is a bit extreme (it would probably realize that it
    # should retrace its steps at some point)
    scenario_1_mdp = OvercookedGridworld.from_layout_name(
        'small_corridor', start_order_list=['any'], cook_time=5)
    mlp = MediumLevelPlanner.from_pickle_or_compute(
        scenario_1_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    a0 = GreedyHumanModel(mlp)
    a1 = CoupledPlanningAgent(mlp)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((2, 1), s, Obj('onion', (2, 1))),
         P((10, 2), s)], {},
        order_list=['onion'])
    env = OvercookedEnv.from_mdp(scenario_1_mdp,
                                 start_state_fn=lambda: start_state)
    env.run_agents(agent_pair, include_final_state=True, display=DISPLAY)
def test_scenario_1_s(self):
    # Smaller version of the corridor collision scenario above,
    # to facilitate DRL training
    scenario_1_mdp = OvercookedGridworld.from_layout_name(
        'scenario1_s', start_order_list=['any'], cook_time=5)
    mlp = MediumLevelPlanner.from_pickle_or_compute(
        scenario_1_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    a0 = GreedyHumanModel(mlp)
    a1 = CoupledPlanningAgent(mlp)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((2, 1), s, Obj('onion', (2, 1))),
         P((4, 2), s)], {},
        order_list=['onion'])
    env = OvercookedEnv.from_mdp(scenario_1_mdp,
                                 start_state_fn=lambda: start_state)
    trajectory, time_taken_hr, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)
    env.reset()

    print("\n" * 5)
    print("-" * 50)

    a0 = CoupledPlanningAgent(mlp)
    a1 = CoupledPlanningAgent(mlp)
    agent_pair = AgentPair(a0, a1)
    trajectory, time_taken_rr, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)

    print("H+R time taken: ", time_taken_hr)
    print("R+R time taken: ", time_taken_rr)
    self.assertGreater(time_taken_hr, time_taken_rr)
def mdp_gen_fn_from_dict(mdp_params, outer_shape=None, mdp_params_schedule_fn=None):
    """
    mdp_params: one set of fixed mdp parameters used by the environment
    outer_shape: outer shape of the environment
    mdp_params_schedule_fn: the schedule for varying mdp params
    """
    # If outer_shape is not defined, we have to be using one of the default
    # layouts from the names bank
    if outer_shape is None:
        assert type(mdp_params) is dict and "layout_name" in mdp_params
        mdp = OvercookedGridworld.from_layout_name(**mdp_params)
        mdp_fn = lambda _ignored: mdp
    else:
        # There is no schedule, so we use the same set of mdp_params all the time
        if mdp_params_schedule_fn is None:
            assert mdp_params is not None
            mdp_pg = MDPParamsGenerator.from_fixed_param(
                mdp_params_always=mdp_params)
        else:
            assert mdp_params is None, \
                "please remove mdp_params from the arguments, because " \
                "mdp_params_schedule_fn exists and we will always use the " \
                "schedule_fn if it exists"
            mdp_pg = MDPParamsGenerator(params_schedule_fn=mdp_params_schedule_fn)
        lg = LayoutGenerator(mdp_pg, outer_shape)
        mdp_fn = lg.generate_padded_mdp
    return mdp_fn
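# Sketch of the three configurations the generator above accepts. The
# generation parameter keys (inner_shape, prop_empty, prop_feats) are
# illustrative assumptions, not values taken from this repo.
def example_mdp_gen_fn_configs():
    # Named layout, no outer shape: the same MDP every episode.
    fixed_fn = mdp_gen_fn_from_dict({"layout_name": "cramped_room"})
    mdp = fixed_fn(None)  # the positional arg (outside info) is ignored here

    # Outer shape with fixed generation params: procedural layouts, padded.
    gen_params = {"inner_shape": (5, 4), "prop_empty": 0.95, "prop_feats": 0.1}
    padded_fn = mdp_gen_fn_from_dict(gen_params, outer_shape=(5, 4))

    # Outer shape with a schedule: mdp_params must be None, and the schedule
    # supplies a (possibly varying) params dict for every generated MDP.
    scheduled_fn = mdp_gen_fn_from_dict(
        None, outer_shape=(5, 4),
        mdp_params_schedule_fn=lambda outside_info: gen_params)
    return mdp, padded_fn, scheduled_fn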
def setUp(self):
    self.base_mdp = OvercookedGridworld.from_layout_name("cramped_room")
    self.mlp = MediumLevelPlanner.from_pickle_or_compute(
        self.base_mdp, NO_COUNTERS_PARAMS, force_compute=True)
    self.env = OvercookedEnv(self.base_mdp, **DEFAULT_ENV_PARAMS)
    self.rnd_agent_pair = AgentPair(GreedyHumanModel(self.mlp),
                                    GreedyHumanModel(self.mlp))
    np.random.seed(0)
def test_display_phi(self):
    mdp0 = OvercookedGridworld.from_layout_name("cramped_room")
    mdp_fn = lambda _ignored: mdp0
    env = OvercookedEnv(mdp_fn, horizon=20)
    env.get_rollouts(self.rnd_agent_pair, 1, display=True, display_phi=True)
def test_constructor_invalid_inputs(self):
    # Height and width must be at least 3.
    with self.assertRaises(AssertionError):
        mdp = OvercookedGridworld.from_grid(['X', 'X', 'X'])
    with self.assertRaises(AssertionError):
        mdp = OvercookedGridworld.from_grid([['X', 'X', 'X']])
    with self.assertRaises(AssertionError):
        # Borders must be present.
        mdp = OvercookedGridworld.from_grid(['XOSX',
                                             'P  D',
                                             ' 21 '])
    with self.assertRaises(AssertionError):
        # The grid can't be ragged.
        mdp = OvercookedGridworld.from_grid(['XXPXX',
                                             'O 2XX',
                                             'X1 3 X',
                                             'XDXSXX'])
    with self.assertRaises(AssertionError):
        # The agents must be numbered 1 and 2.
        mdp = OvercookedGridworld.from_grid(['XXPXX',
                                             'O  3O',
                                             'X1  X',
                                             'XDXSX'])
    with self.assertRaises(AssertionError):
        # The agents must be numbered 1 and 2.
        mdp = OvercookedGridworld.from_grid(['XXPXX',
                                             'O  1O',
                                             'X1  X',
                                             'XDXSX'])
    with self.assertRaises(AssertionError):
        # B is not a valid element.
        mdp = OvercookedGridworld.from_grid(['XBPXX',
                                             'O  2O',
                                             'X1  X',
                                             'XDXSX'])
def mdps_and_envs_from_trajectories(trajectories):
    mdps, envs = [], []
    for idx in range(len(trajectories["ep_lengths"])):
        mdp_params, env_params = (trajectories["mdp_params"][idx],
                                  trajectories["env_params"][idx])
        mdp = OvercookedGridworld.from_layout_name(**mdp_params)
        env = OvercookedEnv(mdp, **env_params)
        mdps.append(mdp)
        envs.append(env)
    return mdps, envs
def test_from_mdp(self):
    for layout_name in self.layout_name_lst:
        original_mdp = OvercookedGridworld.from_layout_name(layout_name)
        ae = AgentEvaluator.from_mdp(mdp=original_mdp,
                                     env_params={"horizon": 400})
        ae_mdp = ae.env.mdp
        self.assertEqual(original_mdp, ae_mdp,
                         "mdp with name " + layout_name +
                         " experienced an inconsistency")
def test_starting_obj_randomization(self):
    self.base_mdp = OvercookedGridworld.from_layout_name("cramped_room")
    start_state_fn = self.base_mdp.get_random_start_state_fn(
        random_start_pos=False, rnd_obj_prob_thresh=0.8)
    env = OvercookedEnv.from_mdp(self.base_mdp, start_state_fn=start_state_fn)
    start_objs = env.state.all_objects_list
    for _ in range(3):
        env.reset()
        curr_objs = env.state.all_objects_list
        self.assertFalse(np.array_equal(start_objs, curr_objs))
def padded_mdp(self, mdp, display=False):
    """Returns a padded MDP from an MDP"""
    grid = Grid.from_mdp(mdp)
    padded_grid = self.embed_grid(grid)

    start_positions = self.get_random_starting_positions(padded_grid)
    mdp_grid = self.padded_grid_to_layout_grid(padded_grid, start_positions,
                                               display=display)
    return OvercookedGridworld.from_grid(mdp_grid)
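# Padding sketch: embed a small named layout into a larger shared footprint
# via padded_mdp above. The layout names and the LayoutGenerator construction
# (mirroring its use in mdp_gen_fn_from_dict) are illustrative assumptions.
def example_padded_mdp():
    small = OvercookedGridworld.from_layout_name("cramped_room")
    big = OvercookedGridworld.from_layout_name("counter_circuit")
    # Pad everything to the larger of the two footprints
    min_padding = (max(small.width, big.width), max(small.height, big.height))
    lg = LayoutGenerator(min_padding, {})
    padded = lg.padded_mdp(small)
    return padded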
def init_gym_env(bc_params):
    env_setup_params = copy.deepcopy(bc_params)
    del env_setup_params["data_params"]  # Not necessary for setting up env
    mdp = OvercookedGridworld.from_layout_name(**bc_params["mdp_params"])
    env = OvercookedEnv(mdp, **bc_params["env_params"])
    gym_env = gym.make("Overcooked-v0")

    mlp = MediumLevelPlanner.from_pickle_or_compute(mdp, NO_COUNTERS_PARAMS,
                                                    force_compute=False)
    gym_env.custom_init(env, featurize_fn=lambda x: mdp.featurize_state(x, mlp))
    return gym_env
def test_file_constructor(self):
    mdp = OvercookedGridworld.from_layout_name('corridor')
    expected_start_state = OvercookedState(
        [PlayerState((3, 1), Direction.NORTH),
         PlayerState((10, 1), Direction.NORTH)], {},
        order_list=None)
    actual_start_state = mdp.get_standard_start_state()
    self.assertEqual(actual_start_state, expected_start_state,
                     '\n' + str(actual_start_state) + '\n' + str(expected_start_state))
def test_starting_position_randomization(self):
    self.base_mdp = OvercookedGridworld.from_layout_name("cramped_room")
    start_state_fn = self.base_mdp.get_random_start_state_fn(
        random_start_pos=True, rnd_obj_prob_thresh=0.0)
    env = OvercookedEnv(self.base_mdp, start_state_fn)
    start_positions = env.state.players_pos_and_or
    for _ in range(3):
        env.reset()
        print(env)
        curr_positions = env.state.players_pos_and_or
        self.assertFalse(np.array_equal(start_positions, curr_positions))
def test_save_load(self):
    # Train a quick self-play agent for 2 iterations
    ex.run(
        config_updates={
            # Please feel free to modify the parameters below
            "results_dir": self.temp_results_dir,
            "experiment_name": "save_load_test",
            "layout_name": "cramped_room",
            "num_workers": 1,
            "train_batch_size": 800,
            "sgd_minibatch_size": 800,
            "num_training_iters": 2,
            "evaluation_interval": 10,
            "entropy_coeff_start": 0.0,
            "entropy_coeff_end": 0.0,
            "use_phi": False,
            "evaluation_display": False,
            "verbose": False
        },
        options={'--loglevel': 'ERROR'})

    # Kill all ray processes to ensure loading works in a vacuum
    ray.shutdown()

    # Where the agent is stored (this is kind of hardcoded, would like for it
    # to be more easily obtainable)
    load_path = os.path.join(
        glob.glob(os.path.join(self.temp_results_dir, "save_load_test*"))[0],
        'checkpoint_2', 'checkpoint-2')

    # Load a dummy state
    mdp = OvercookedGridworld.from_layout_name("cramped_room")
    state = mdp.get_standard_start_state()

    # Ensure simple single-agent loading works
    agent_0 = load_agent(load_path)
    agent_0.reset()
    agent_1 = load_agent(load_path)
    agent_1.reset()

    # Ensure forward pass of policy network still works
    _, _ = agent_0.action(state)
    _, _ = agent_1.action(state)

    # Now let's load an agent pair and evaluate it
    agent_pair = load_agent_pair(load_path)
    ae = AgentEvaluator.from_layout_name(
        mdp_params={"layout_name": "cramped_room"},
        env_params={"horizon": 400})

    # We assume no runtime errors => success; no performance consistency check for now
    ae.evaluate_agent_pair(agent_pair, 1, info=False)
def ppo_run(params):
    create_dir_if_not_exists(params["SAVE_DIR"])
    save_pickle(params, params["SAVE_DIR"] + "config")

    #############
    # PPO SETUP #
    #############

    train_infos = []

    for seed in params["SEEDS"]:
        reset_tf()
        set_global_seed(seed)

        curr_seed_dir = params["SAVE_DIR"] + "seed" + str(seed) + "/"
        create_dir_if_not_exists(curr_seed_dir)

        save_pickle(params, curr_seed_dir + "config")

        print("Creating env with params", params)
        # Configure mdp
        mdp = OvercookedGridworld.from_layout_name(**params["mdp_params"])
        env = OvercookedEnv(mdp, **params["env_params"])
        mlp = MediumLevelPlanner.from_pickle_or_compute(mdp, NO_COUNTERS_PARAMS,
                                                        force_compute=True)

        # Configure gym env
        gym_env = get_vectorized_gym_env(
            env, 'Overcooked-v0',
            featurize_fn=lambda x: mdp.lossless_state_encoding(x), **params)
        gym_env.self_play_randomization = 0 if params["SELF_PLAY_HORIZON"] is None else 1
        gym_env.trajectory_sp = params["TRAJECTORY_SELF_PLAY"]
        gym_env.update_reward_shaping_param(
            1 if params["mdp_params"]["rew_shaping_params"] != 0 else 0)

        configure_other_agent(params, gym_env, mlp, mdp)

        # Create model
        with tf.device('/device:GPU:{}'.format(params["GPU_ID"])):
            model = create_model(gym_env, "ppo_agent", **params)

        # Train model
        params["CURR_SEED"] = seed
        train_info = update_model(gym_env, model, **params)

        # Save model
        save_ppo_model(model, curr_seed_dir + model.agent_name)
        print("Saved training info at", curr_seed_dir + "training_info")
        save_pickle(train_info, curr_seed_dir + "training_info")
        train_infos.append(train_info)

    return train_infos