def test_random_layout(self): mdp_gen_params = {"prop_feats": (1, 1)} mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(**mdp_gen_params) env = OvercookedEnv(mdp=mdp_fn, **DEFAULT_ENV_PARAMS) start_terrain = env.mdp.terrain_mtx for _ in range(3): env.reset() print(env) curr_terrain = env.mdp.terrain_mtx self.assertFalse(np.array_equal(start_terrain, curr_terrain)) mdp_gen_params = { "mdp_choices": ['cramped_room', 'asymmetric_advantages'] } mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(**mdp_gen_params) env = OvercookedEnv(mdp=mdp_fn, **DEFAULT_ENV_PARAMS) layouts_seen = [] for _ in range(10): layouts_seen.append(env.mdp.terrain_mtx) env.reset() all_same_layout = all([ np.array_equal(env.mdp.terrain_mtx, terrain) for terrain in layouts_seen ]) self.assertFalse(all_same_layout)
def test_scenario_1_s(self):
    # Smaller version of the corridor collisions scenario above
    # to facilitate DRL training
    scenario_1_mdp = OvercookedGridworld.from_layout_name(
        'scenario1_s', start_order_list=['any'], cook_time=5)
    mlp = MediumLevelPlanner.from_pickle_or_compute(
        scenario_1_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
    a0 = GreedyHumanModel(mlp)
    a1 = CoupledPlanningAgent(mlp)
    agent_pair = AgentPair(a0, a1)
    start_state = OvercookedState(
        [P((2, 1), s, Obj('onion', (2, 1))), P((4, 2), s)],
        {},
        order_list=['onion'])
    env = OvercookedEnv(scenario_1_mdp, start_state_fn=lambda: start_state)
    trajectory, time_taken_hr, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)
    env.reset()

    print("\n" * 5)
    print("-" * 50)
    a0 = CoupledPlanningAgent(mlp)
    a1 = CoupledPlanningAgent(mlp)
    agent_pair = AgentPair(a0, a1)
    trajectory, time_taken_rr, _, _ = env.run_agents(
        agent_pair, include_final_state=True, display=DISPLAY)

    print("H+R time taken: ", time_taken_hr)
    print("R+R time taken: ", time_taken_rr)
    self.assertGreater(time_taken_hr, time_taken_rr)
def test_random_layout_feature_types(self):
    mandatory_features = {POT, DISH_DISPENSER, SERVING_LOC}
    optional_features = {ONION_DISPENSER, TOMATO_DISPENSER}
    optional_features_combinations = [{ONION_DISPENSER, TOMATO_DISPENSER},
                                      {ONION_DISPENSER},
                                      {TOMATO_DISPENSER}]

    for optional_features_combo in optional_features_combinations:
        left_out_optional_features = optional_features - optional_features_combo
        used_features = list(optional_features_combo | mandatory_features)
        mdp_gen_params = {
            "prop_feats": 0.9,
            "feature_types": used_features,
            "prop_empty": 0.1,
            "inner_shape": (6, 5),
            "display": False,
            "start_all_orders": [{"ingredients": ["onion", "onion", "onion"]}]
        }
        mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params, outer_shape=(6, 5))
        env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS)

        for _ in range(10):
            env.reset()
            curr_terrain = env.mdp.terrain_mtx
            terrain_features = set.union(*(set(line) for line in curr_terrain))
            # All used_features are actually used
            self.assertTrue(all(elem in terrain_features for elem in used_features))
            if left_out_optional_features:
                # All left-out optional features are not used
                self.assertFalse(any(elem in terrain_features for elem in left_out_optional_features))
def test_starting_obj_randomization(self):
    self.base_mdp = OvercookedGridworld.from_layout_name("cramped_room")
    start_state_fn = self.base_mdp.get_random_start_state_fn(
        random_start_pos=False, rnd_obj_prob_thresh=0.8)
    env = OvercookedEnv(self.base_mdp, start_state_fn)
    start_objs = env.state.all_objects_list

    for _ in range(3):
        env.reset()
        print(env)
        curr_objs = env.state.all_objects_list
        self.assertFalse(np.array_equal(start_objs, curr_objs))
def test_starting_position_randomization(self):
    self.base_mdp = OvercookedGridworld.from_layout_name("simple")
    start_state_fn = self.base_mdp.get_random_start_state_fn(
        random_start_pos=True, rnd_obj_prob_thresh=0.0)
    env = OvercookedEnv(self.base_mdp, start_state_fn)
    start_players = env.state.players_pos_and_or

    for _ in range(3):
        env.reset()
        print(env)
        curr_players = env.state.players_pos_and_or
        self.assertFalse(np.array_equal(start_players, curr_players))
def test_random_layout(self): mdp_gen_params = { "inner_shape": (5, 4), "prop_empty": 0.8, "prop_feats": 0.2, "start_all_orders": [{ "ingredients": ["onion", "onion", "onion"] }], "recipe_values": [20], "recipe_times": [20], "display": False } mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params, outer_shape=(5, 4)) env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS) start_terrain = env.mdp.terrain_mtx for _ in range(3): env.reset() curr_terrain = env.mdp.terrain_mtx self.assertFalse(np.array_equal(start_terrain, curr_terrain)) mdp_gen_params = {"layout_name": 'cramped_room'} mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params) env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS) layouts_seen = [] for _ in range(5): layouts_seen.append(env.mdp.terrain_mtx) env.reset() all_same_layout = all([ np.array_equal(env.mdp.terrain_mtx, terrain) for terrain in layouts_seen ]) self.assertTrue(all_same_layout) mdp_gen_params = {"layout_name": 'asymmetric_advantages'} mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params) env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS) for _ in range(5): layouts_seen.append(env.mdp.terrain_mtx) env.reset() all_same_layout = all([ np.array_equal(env.mdp.terrain_mtx, terrain) for terrain in layouts_seen ]) self.assertFalse(all_same_layout)
def test_random_layout_generated_recipes(self):
    only_onions_recipes = [Recipe(["onion", "onion"]),
                           Recipe(["onion", "onion", "onion"])]
    only_onions_dict_recipes = [r.to_dict() for r in only_onions_recipes]

    # Check that recipes are generated from mdp_params
    mdp_gen_params = {
        "generate_all_orders": {"n": 2, "ingredients": ["onion"], "min_size": 2, "max_size": 3},
        "prop_feats": 0.9,
        "prop_empty": 0.1,
        "inner_shape": (6, 5),
        "display": False
    }
    mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params, outer_shape=(6, 5))
    env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS)
    for _ in range(10):
        env.reset()
        self.assertCountEqual(env.mdp.start_all_orders, only_onions_dict_recipes)
        self.assertEqual(len(env.mdp.start_bonus_orders), 0)

    # Check that bonus_orders is a subset of all_orders even if not specified
    mdp_gen_params = {
        "generate_all_orders": {"n": 2, "ingredients": ["onion"], "min_size": 2, "max_size": 3},
        "generate_bonus_orders": {"n": 1, "min_size": 2, "max_size": 3},
        "prop_feats": 0.9,
        "prop_empty": 0.1,
        "inner_shape": (6, 5),
        "display": False
    }
    mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params, outer_shape=(6, 5))
    env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS)
    for _ in range(10):
        env.reset()
        self.assertCountEqual(env.mdp.start_all_orders, only_onions_dict_recipes)
        self.assertEqual(len(env.mdp.start_bonus_orders), 1)
        self.assertTrue(env.mdp.start_bonus_orders[0] in only_onions_dict_recipes)

    # Check that new recipes are generated after reset
    mdp_gen_params = {
        "generate_all_orders": {"n": 3, "min_size": 2, "max_size": 3},
        "prop_feats": 0.9,
        "prop_empty": 0.1,
        "inner_shape": (6, 5),
        "display": False,
        "feature_types": [POT, DISH_DISPENSER, SERVING_LOC, ONION_DISPENSER, TOMATO_DISPENSER]
    }
    mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_gen_params, outer_shape=(6, 5))
    env = OvercookedEnv(mdp_fn, **DEFAULT_ENV_PARAMS)
    generated_recipes_strings = set()
    for _ in range(20):
        env.reset()
        generated_recipes_strings |= {json.dumps(o, sort_keys=True) for o in env.mdp.start_all_orders}
    self.assertTrue(len(generated_recipes_strings) > 3)
def learn(*, network, env, total_timesteps, early_stopping=False, eval_env=None,
          seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5,
          max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10,
          nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0,
          load_path=None, model_fn=None, scope='', **network_kwargs):
    '''
    Learn policy using the PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn,
        cnn_small, conv_only - see baselines.common/models.py for the full list) specifying a
        standard network architecture, or a function that takes a tensorflow tensor as input
        and returns a tuple (output_tensor, extra_feed), where output_tensor is the last
        network layer output and extra_feed is None for feed-forward nets or a dictionary
        describing how to feed state into the network for recurrent nets. See
        common/models.py/lstm for more details on using recurrent nets in policies.

    env: baselines.common.vec_env.VecEnv
        environment. Needs to be vectorized for parallel environment simulation.
        Environments produced by gym.make can be wrapped using the
        baselines.common.vec_env.DummyVecEnv class.

    nsteps: int
        number of steps of the vectorized environment per update (i.e. batch size is
        nsteps * nenv, where nenv is the number of environment copies simulated in parallel)

    total_timesteps: int
        number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float
        policy entropy coefficient in the optimization objective

    lr: float or function
        learning rate, constant or a schedule function [0,1] -> R+ where 1 is the beginning
        of training and 0 is the end of training

    vf_coef: float
        value function loss coefficient in the optimization objective

    max_grad_norm: float or None
        gradient norm clipping coefficient

    gamma: float
        discounting factor

    lam: float
        advantage estimation discounting factor (lambda in the paper)

    log_interval: int
        number of updates between logging events

    nminibatches: int
        number of training minibatches per update. For recurrent policies, should be smaller
        than or equal to the number of environments run in parallel.

    noptepochs: int
        number of training epochs per update

    cliprange: float or function
        clipping range, constant or schedule function [0,1] -> R+ where 1 is the beginning
        of training and 0 is the end of training

    save_interval: int
        number of updates between saving events

    load_path: str
        path to load the model from

    **network_kwargs:
        keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and the arguments to a particular type
        of network. For instance, the 'mlp' network architecture has arguments num_hidden
        and num_layers.
    '''
    additional_params = network_kwargs["network_kwargs"]

    from baselines import logger

    # set_global_seeds(seed)  # We deal with seeds upstream

    if "LR_ANNEALING" in additional_params.keys():
        lr_reduction_factor = additional_params["LR_ANNEALING"]
        start_lr = lr
        # Anneals linearly from lr to lr / lr_reduction_factor
        lr = lambda prop: (start_lr / lr_reduction_factor) + (
            start_lr - (start_lr / lr_reduction_factor)) * prop

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    bestrew = 0

    # Get the number of environments
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,
                     ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm, scope=scope)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps,
                             gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.perf_counter()

    best_rew_per_step = 0
    run_info = defaultdict(list)
    nupdates = total_timesteps // nbatch
    print("TOT NUM UPDATES", nupdates)

    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0, \
            "Have {} total batch size and want {} minibatches, can't split evenly".format(
                nbatch, nminibatches)
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  #pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  #pylint: disable=E0632

        eplenmean = safemean([epinfo['ep_length'] for epinfo in epinfos])
        eprewmean = safemean([epinfo['r'] for epinfo in epinfos])
        rew_per_step = eprewmean / eplenmean
        print("Curr learning rate {} \t Curr reward per step {}".format(lrnow, rew_per_step))

        if rew_per_step > best_rew_per_step and early_stopping:
            # Avoid updating the best model at the first iteration, because the means
            # might be a bit off due to how the multithreaded batch simulation works
            best_rew_per_step = eprewmean / eplenmean
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            model.save(checkdir + ".temp_best_model")
            print("Saved model as best", best_rew_per_step, "avg rew/step")

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, calculate the loss and append it
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in tqdm.trange(0, nbatch, nbatch_train,
                                         desc="{}/{}".format(_, noptepochs)):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in
                              (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in
                              (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns
            # (ev close to 1) or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))

            eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf])
            ep_dense_rew_mean = safemean([epinfo['ep_shaped_r'] for epinfo in epinfobuf])
            ep_sparse_rew_mean = safemean([epinfo['ep_sparse_r'] for epinfo in epinfobuf])
            eplenmean = safemean([epinfo['ep_length'] for epinfo in epinfobuf])

            run_info['eprewmean'].append(eprewmean)
            run_info['ep_dense_rew_mean'].append(ep_dense_rew_mean)
            run_info['ep_sparse_rew_mean'].append(ep_sparse_rew_mean)
            run_info['eplenmean'].append(eplenmean)
            run_info['explained_variance'].append(float(ev))

            logger.logkv('true_eprew', safemean([epinfo['ep_sparse_r'] for epinfo in epinfobuf]))
            logger.logkv('eprewmean', eprewmean)
            logger.logkv('eplenmean', eplenmean)

            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))

            time_elapsed = tnow - tfirststart
            logger.logkv('time_elapsed', time_elapsed)
            time_per_update = time_elapsed / update
            time_remaining = (nupdates - update) * time_per_update
            logger.logkv('time_remaining', time_remaining / 60)

            for (lossval, lossname) in zip(lossvals, model.loss_names):
                run_info[lossname].append(lossval)
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()

            # Update current logs
            if additional_params["RUN_TYPE"] in ["ppo", "joint_ppo"]:
                from overcooked_ai_py.utils import save_dict_to_file
                save_dict_to_file(run_info, additional_params["SAVE_DIR"] + "logs")

            # Linear annealing of reward shaping
            if additional_params["REW_SHAPING_HORIZON"] != 0:
                # Piecewise linear annealing schedule
                # annealing_thresh: timestep until which we keep doing 100% reward shaping
                # annealing_horizon: timestep by which we should reach 0% reward shaping
                annealing_horizon = additional_params["REW_SHAPING_HORIZON"]
                annealing_thresh = 0

                def fn(x):
                    if annealing_thresh != 0 and annealing_thresh - (
                            annealing_horizon / annealing_thresh) * x > 1:
                        return 1
                    else:
                        linear_decay = lambda x: -1 * (x - annealing_thresh) * 1 / (
                            annealing_horizon - annealing_thresh) + 1
                        return max(linear_decay(x), 0)

                curr_timestep = update * nbatch
                curr_reward_shaping = fn(curr_timestep)
                env.update_reward_shaping_param(curr_reward_shaping)
                print("Current reward shaping", curr_reward_shaping)

            sp_horizon = additional_params["SELF_PLAY_HORIZON"]

            # Save/overwrite best model if past a certain threshold
            if ep_sparse_rew_mean > bestrew and ep_sparse_rew_mean > additional_params["SAVE_BEST_THRESH"]:
                if additional_params["OTHER_AGENT_TYPE"][:2] == "bc" and sp_horizon != 0 \
                        and env.self_play_randomization > 0:
                    # Don't save best model if still doing some self play
                    # and it's supposed to be a BC model
                    pass
                else:
                    from human_aware_rl.ppo.ppo import save_ppo_model
                    print("BEST REW", ep_sparse_rew_mean, "overwriting previous model with", bestrew)
                    save_ppo_model(model, "{}seed{}/best".format(
                        additional_params["SAVE_DIR"], additional_params["CURR_SEED"]))
                    bestrew = max(ep_sparse_rew_mean, bestrew)

            # If not a self-play run, and the horizon is not None,
            # vary the amount of self play over time, either with a sigmoidal feedback loop
            # or with a fixed piecewise linear schedule.
            if additional_params["OTHER_AGENT_TYPE"] != "sp" and sp_horizon is not None:
                if type(sp_horizon) is not list:
                    # Sigmoid self-play schedule based on current performance (not recommended)
                    curr_reward = ep_sparse_rew_mean
                    rew_target = sp_horizon
                    shift = rew_target / 2
                    t = (1 / rew_target) * 10
                    fn = lambda x: -1 * (np.exp(t * (x - shift)) /
                                         (1 + np.exp(t * (x - shift)))) + 1
                    env.self_play_randomization = fn(curr_reward)
                    print("Current self-play randomization", env.self_play_randomization)
                else:
                    assert len(sp_horizon) == 2
                    # Piecewise linear self-play schedule
                    # self_play_thresh: when we should stop doing 100% self-play
                    # self_play_timeline: when we should reach doing 0% self-play
                    self_play_thresh, self_play_timeline = sp_horizon

                    def fn(x):
                        if self_play_thresh != 0 and self_play_timeline - (
                                self_play_timeline / self_play_thresh) * x > 1:
                            return 1
                        else:
                            linear_decay = lambda x: -1 * (x - self_play_thresh) * 1 / (
                                self_play_timeline - self_play_thresh) + 1
                            return max(linear_decay(x), 0)

                    curr_timestep = update * nbatch
                    env.self_play_randomization = fn(curr_timestep)
                    print("Current self-play randomization", env.self_play_randomization)

        if save_interval and (update % save_interval == 0 or update == 1) \
                and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

        # Visualization of rollouts with the actual other agent
        run_type = additional_params["RUN_TYPE"]
        if run_type in ["ppo", "joint_ppo"] and update % additional_params["VIZ_FREQUENCY"] == 0:
            from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv
            from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
            from overcooked_ai_py.agents.agent import AgentPair
            from overcooked_ai_py.agents.benchmarking import AgentEvaluator
            from human_aware_rl.baselines_utils import get_agent_from_model

            print(additional_params["SAVE_DIR"])
            mdp = OvercookedGridworld.from_layout_name(**additional_params["mdp_params"])
            overcooked_env = OvercookedEnv(mdp, **additional_params["env_params"])
            agent = get_agent_from_model(model, additional_params["sim_threads"],
                                         is_joint_action=(run_type == "joint_ppo"))
            agent.set_mdp(mdp)

            if run_type == "ppo":
                if additional_params["OTHER_AGENT_TYPE"] == 'sp':
                    agent_pair = AgentPair(agent, agent, allow_duplicate_agents=True)
                else:
                    print("PPO agent on index 0:")
                    env.other_agent.set_mdp(mdp)
                    agent_pair = AgentPair(agent, env.other_agent)

                    trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents(
                        agent_pair, display=True, display_until=100)
                    overcooked_env.reset()
                    agent_pair.reset()
                    print("tot rew", tot_rewards, "tot rew shaped", tot_shaped_rewards)

                    print("PPO agent on index 1:")
                    agent_pair = AgentPair(env.other_agent, agent)
            else:
                agent_pair = AgentPair(agent)

            trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents(
                agent_pair, display=True, display_until=100)
            overcooked_env.reset()
            agent_pair.reset()
            print("tot rew", tot_rewards, "tot rew shaped", tot_shaped_rewards)
            print(additional_params["SAVE_DIR"])

    if nupdates > 0 and early_stopping:
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        print("Loaded best model", best_rew_per_step)
        model.load(checkdir + ".temp_best_model")

    return model, run_info
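

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the shape of the
# dictionary this modified `learn` expects under network_kwargs["network_kwargs"].
# The keys below are taken from the lookups in the function body above; the
# values and the network name are placeholders/assumptions, not authoritative
# defaults -- consult the actual experiment configs for real values.
# ---------------------------------------------------------------------------
EXAMPLE_ADDITIONAL_PARAMS = {
    "RUN_TYPE": "ppo",                        # controls log saving / visualization branches
    "SAVE_DIR": "data/ppo_runs/example/",     # hypothetical output directory
    "CURR_SEED": 0,
    "LR_ANNEALING": 1.5,                      # optional: anneal lr to lr / 1.5
    "REW_SHAPING_HORIZON": 5e6,               # 0 disables reward-shaping annealing
    "SELF_PLAY_HORIZON": None,                # None disables the self-play schedule
    "OTHER_AGENT_TYPE": "sp",
    "SAVE_BEST_THRESH": 50,
    "VIZ_FREQUENCY": 50,
    "sim_threads": 30,
    "mdp_params": {"layout_name": "cramped_room"},
    "env_params": {"horizon": 400},
}

# A call would then look roughly like the following, assuming `vec_env` is a
# vectorized Overcooked gym environment built elsewhere in this codebase and
# "my_network" names a registered policy network:
#
# model, run_info = learn(network="my_network", env=vec_env,
#                         total_timesteps=int(1e6),
#                         network_kwargs=EXAMPLE_ADDITIONAL_PARAMS)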
def pbt_one_run(params, seed):
    # Iterating noptepochs over same batch data but shuffled differently
    # dividing each batch in `nminibatches` and doing a gradient step for each one
    create_dir_if_not_exists(params["SAVE_DIR"])
    save_dict_to_file(params, params["SAVE_DIR"] + "config")

    #######
    # pbt #
    #######

    mdp = OvercookedGridworld.from_layout_name(**params["mdp_params"])
    overcooked_env = OvercookedEnv(mdp, **params["env_params"])

    print("Sample training environments:")
    for _ in range(5):
        overcooked_env.reset()
        print(overcooked_env)

    gym_env = get_vectorized_gym_env(
        overcooked_env, 'Overcooked-v0',
        featurize_fn=lambda x: mdp.lossless_state_encoding(x), **params)
    gym_env.update_reward_shaping_param(1.0)  # Start reward shaping from 1

    annealer = LinearAnnealer(horizon=params["REW_SHAPING_HORIZON"])

    # ppo_expert_model = load_model("data/expert_agent/", "agent0", actual_agent_name="agent0")
    # pbt_expert_model = load_model("data/expert_agent/", "agent2", actual_agent_name="agent2")

    # AGENT POPULATION INITIALIZATION
    population_size = params["POPULATION_SIZE"]
    pbt_population = []
    pbt_agent_names = ['agent' + str(i) for i in range(population_size)]
    for agent_name in pbt_agent_names:
        agent = PBTAgent(agent_name, params, gym_env=gym_env)
        # overwrite_model(ppo_expert_model, model)
        pbt_population.append(agent)

    print("Initialized agent models")

    all_pairs = []
    for i in range(population_size):
        for j in range(i + 1, population_size):
            all_pairs.append((i, j))

    # MAIN LOOP
    def pbt_training():
        best_sparse_rew_avg = [-np.Inf] * population_size

        print(params['NUM_PBT_ITER'])
        for pbt_iter in range(1, params["NUM_PBT_ITER"] + 1):
            print("\n\n\nPBT ITERATION NUM {}".format(pbt_iter))

            # TRAINING PHASE
            assert params["ITER_PER_SELECTION"] == population_size**2
            pairs_to_train = list(itertools.product(range(population_size),
                                                    range(population_size)))

            for sel_iter in range(params["ITER_PER_SELECTION"]):
                # Randomly select agents to be trained
                pair_idx = np.random.choice(len(pairs_to_train))
                idx0, idx1 = pairs_to_train.pop(pair_idx)
                pbt_agent0, pbt_agent1 = pbt_population[idx0], pbt_population[idx1]

                # Training agent 1, leaving agent 0 fixed
                print("Training agent {} ({}) with agent {} ({}) fixed (pbt #{}/{}, sel #{}/{})"
                      .format(idx1, pbt_agent1.num_ppo_runs,
                              idx0, pbt_agent0.num_ppo_runs,
                              pbt_iter, params["NUM_PBT_ITER"],
                              sel_iter, params["ITER_PER_SELECTION"]))

                agent_env_steps = pbt_agent1.num_ppo_runs * params["PPO_RUN_TOT_TIMESTEPS"]
                reward_shaping_param = annealer.param_value(agent_env_steps)
                print("Current reward shaping:", reward_shaping_param,
                      "\t Save_dir", params["SAVE_DIR"])
                pbt_agent1.logs["reward_shaping"].append(reward_shaping_param)
                gym_env.update_reward_shaping_param(reward_shaping_param)

                gym_env.other_agent = pbt_agent0.get_agent()
                pbt_agent1.update(gym_env)

                save_folder = params["SAVE_DIR"] + pbt_agent1.agent_name + '/'
                pbt_agent1.save(save_folder)

                agent_pair = AgentPair(pbt_agent0.get_agent(), pbt_agent1.get_agent())
                overcooked_env.get_rollouts(agent_pair, num_games=1, final_state=True,
                                            reward_shaping=reward_shaping_param)

            assert len(pairs_to_train) == 0

            # SELECTION PHASE
            # Overwrite the worst agent with the best agent's model, according to
            # a proxy for generalization performance (avg dense reward across the population)
            print("\nSELECTION PHASE\n")

            # Dictionary with average returns for each agent when matched with each other agent
            avg_ep_returns_dict = defaultdict(list)
            avg_ep_returns_sparse_dict = defaultdict(list)

            for i, pbt_agent in enumerate(pbt_population):
                # Saving each agent model at the end of the pbt iteration
                pbt_agent.update_pbt_iter_logs()

                if pbt_iter == params["NUM_PBT_ITER"]:
                    save_folder = params["SAVE_DIR"] + pbt_agent.agent_name + '/'
                    pbt_agent.save_predictor(save_folder + "pbt_iter{}/".format(pbt_iter))
                    pbt_agent.save(save_folder + "pbt_iter{}/".format(pbt_iter))

                for j in range(i, population_size):
                    # Pair each agent with all other agents (including itself)
                    # when assessing generalization performance
                    print("Evaluating agent {} and {}".format(i, j))
                    pbt_agent_other = pbt_population[j]

                    agent_pair = AgentPair(pbt_agent.get_agent(), pbt_agent_other.get_agent())
                    trajs = overcooked_env.get_rollouts(
                        agent_pair, params["NUM_SELECTION_GAMES"],
                        reward_shaping=reward_shaping_param)
                    dense_rews, sparse_rews, lens = (trajs["ep_returns"],
                                                     trajs["ep_returns_sparse"],
                                                     trajs["ep_lengths"])
                    rew_per_step = np.sum(dense_rews) / np.sum(lens)
                    avg_ep_returns_dict[i].append(rew_per_step)
                    avg_ep_returns_sparse_dict[i].append(sparse_rews)
                    if j != i:
                        avg_ep_returns_dict[j].append(rew_per_step)
                        avg_ep_returns_sparse_dict[j].append(sparse_rews)

            print("AVG ep rewards dict", avg_ep_returns_dict)

            for i, pbt_agent in enumerate(pbt_population):
                pbt_agent.update_avg_rew_per_step_logs(avg_ep_returns_dict[i])

                avg_sparse_rew = np.mean(avg_ep_returns_sparse_dict[i])
                if avg_sparse_rew > best_sparse_rew_avg[i]:
                    best_sparse_rew_avg[i] = avg_sparse_rew
                    agent_name = pbt_agent.agent_name
                    print("New best avg sparse rews {} for agent {}, saving...".format(
                        best_sparse_rew_avg, agent_name))
                    best_save_folder = params["SAVE_DIR"] + agent_name + '/best/'
                    delete_dir_if_exists(best_save_folder, verbose=True)
                    pbt_agent.save_predictor(best_save_folder)
                    pbt_agent.save(best_save_folder)

            # Get best and worst agents when averaging rew per step across all agents
            best_agent_idx = max(avg_ep_returns_dict,
                                 key=lambda key: np.mean(avg_ep_returns_dict[key]))
            worst_agent_idx = min(avg_ep_returns_dict,
                                  key=lambda key: np.mean(avg_ep_returns_dict[key]))

            # MUTATION PHASE
            pbt_population[worst_agent_idx].explore_from(pbt_population[best_agent_idx])
            print("Overwrote worst model {} ({} rew) with best model {} ({} rew)".format(
                worst_agent_idx, avg_ep_returns_dict[worst_agent_idx],
                best_agent_idx, avg_ep_returns_dict[best_agent_idx]))

            best_agent = pbt_population[best_agent_idx].get_agent()
            best_agent_copy = pbt_population[best_agent_idx].get_agent()
            agent_pair = AgentPair(best_agent, best_agent_copy)
            overcooked_env.get_rollouts(agent_pair, num_games=1, final_state=True,
                                        display=True, reward_shaping=reward_shaping_param)

    pbt_training()
    reset_tf()
    print(params["SAVE_DIR"])
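

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the subset of `params`
# keys that pbt_one_run reads directly. Values are placeholders; the real
# experiment configs contain many more keys (PPO hyperparameters, sim_threads,
# etc.) that are forwarded via **params to the gym env and PBTAgent. Note that
# the assert in pbt_training requires ITER_PER_SELECTION == POPULATION_SIZE ** 2.
# ---------------------------------------------------------------------------
EXAMPLE_PBT_PARAMS = {
    "SAVE_DIR": "data/pbt_runs/example/",          # hypothetical output directory
    "mdp_params": {"layout_name": "cramped_room"},
    "env_params": {"horizon": 400},
    "REW_SHAPING_HORIZON": 3e6,                    # LinearAnnealer horizon
    "POPULATION_SIZE": 3,
    "ITER_PER_SELECTION": 9,                       # must equal POPULATION_SIZE ** 2
    "NUM_PBT_ITER": 50,
    "PPO_RUN_TOT_TIMESTEPS": 40000,
    "NUM_SELECTION_GAMES": 2,
}

# Hypothetical invocation (a complete config is required in practice):
# pbt_one_run(EXAMPLE_PBT_PARAMS, seed=0)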
class AgentEvaluator(object):
    """
    Class used to get rollouts and evaluate performance of various types of agents.
    """

    def __init__(self, mdp_params, env_params={}, mdp_fn_params=None,
                 force_compute=False, mlp_params=NO_COUNTERS_PARAMS, debug=False):
        """
        mdp_params (dict): params for creation of an OvercookedGridworld instance through the `from_layout_name` method
        env_params (dict): params for creation of an OvercookedEnv
        mdp_fn_params (dict): params to setup random MDP generation
        force_compute (bool): whether to re-compute MediumLevelPlanner although a matching file is found
        mlp_params (dict): params for MediumLevelPlanner
        """
        assert type(mdp_params) is dict, "mdp_params must be a dictionary"

        if mdp_fn_params is None:
            self.variable_mdp = False
            self.mdp_fn = lambda: OvercookedGridworld.from_layout_name(**mdp_params)
        else:
            self.variable_mdp = True
            self.mdp_fn = LayoutGenerator.mdp_gen_fn_from_dict(mdp_params, **mdp_fn_params)

        self.env = OvercookedEnv(self.mdp_fn, **env_params)
        self.force_compute = force_compute
        self.debug = debug
        self.mlp_params = mlp_params
        self._mlp = None

    @property
    def mlp(self):
        assert not self.variable_mdp, "Variable mdp is not currently supported for planning"
        if self._mlp is None:
            if self.debug:
                print("Computing Planner")
            self._mlp = MediumLevelPlanner.from_pickle_or_compute(
                self.env.mdp, self.mlp_params, force_compute=self.force_compute)
        return self._mlp

    def evaluate_random_pair(self, interact=True, display=False):
        agent_pair = AgentPair(RandomAgent(interact=interact),
                               RandomAgent(interact=interact))
        return self.evaluate_agent_pair(agent_pair, display=display)

    def evaluate_human_model_pair(self, display=True, num_games=1):
        a0 = GreedyHumanModel(self.mlp)
        a1 = GreedyHumanModel(self.mlp)
        agent_pair = AgentPair(a0, a1)
        return self.evaluate_agent_pair(agent_pair, display=display, num_games=num_games)

    def evaluate_optimal_pair(self, display=True, delivery_horizon=2):
        a0 = CoupledPlanningAgent(self.mlp, delivery_horizon=delivery_horizon)
        a1 = CoupledPlanningAgent(self.mlp, delivery_horizon=delivery_horizon)
        a0.mlp.env = self.env
        a1.mlp.env = self.env
        agent_pair = AgentPair(a0, a1)
        return self.evaluate_agent_pair(agent_pair, display=display)

    def evaluate_one_optimal_one_random(self, display=True):
        a0 = CoupledPlanningAgent(self.mlp)
        a1 = RandomAgent()
        agent_pair = AgentPair(a0, a1)
        return self.evaluate_agent_pair(agent_pair, display=display)

    def evaluate_one_optimal_one_greedy_human(self, h_idx=0, display=True):
        h = GreedyHumanModel(self.mlp)
        r = CoupledPlanningAgent(self.mlp)
        agent_pair = AgentPair(h, r) if h_idx == 0 else AgentPair(r, h)
        return self.evaluate_agent_pair(agent_pair, display=display)

    def evaluate_agent_pair(self, agent_pair, num_games=1, display=False, info=True):
        self.env.reset()
        return self.env.get_rollouts(agent_pair, num_games, display=display, info=info)

    def get_agent_pair_trajs(self, a0, a1=None, num_games=100, display=False):
        """Evaluate agent pair on both indices, and return trajectories by index"""
        if a1 is None:
            ap = AgentPair(a0, a0, allow_duplicate_agents=True)
            trajs_0 = trajs_1 = self.evaluate_agent_pair(ap, num_games=num_games, display=display)
        else:
            trajs_0 = self.evaluate_agent_pair(AgentPair(a0, a1), num_games=num_games, display=display)
            trajs_1 = self.evaluate_agent_pair(AgentPair(a1, a0), num_games=num_games, display=display)
        return trajs_0, trajs_1

    @staticmethod
    def check_trajectories(trajectories):
        """
        Checks that the trajectories are in standard format and are consistent
        with the dynamics of the mdp.
        """
        AgentEvaluator._check_standard_traj_keys(set(trajectories.keys()))
        AgentEvaluator._check_right_types(trajectories)
        AgentEvaluator._check_trajectories_dynamics(trajectories)
        # TODO: Check shapes?

    @staticmethod
    def _check_standard_traj_keys(traj_keys_set):
        assert traj_keys_set == set(DEFAULT_TRAJ_KEYS), \
            "Keys of traj dict did not match standard form.\nMissing keys: {}\nAdditional keys: {}".format(
                [k for k in DEFAULT_TRAJ_KEYS if k not in traj_keys_set],
                [k for k in traj_keys_set if k not in DEFAULT_TRAJ_KEYS])

    @staticmethod
    def _check_right_types(trajectories):
        for idx in range(len(trajectories["ep_observations"])):
            states, actions, rewards = (trajectories["ep_observations"][idx],
                                        trajectories["ep_actions"][idx],
                                        trajectories["ep_rewards"][idx])
            mdp_params, env_params = (trajectories["mdp_params"][idx],
                                      trajectories["env_params"][idx])
            assert all(type(j_a) is tuple for j_a in actions)
            assert all(type(s) is OvercookedState for s in states)
            assert type(mdp_params) is dict
            assert type(env_params) is dict
            # TODO: check that they are all lists

    @staticmethod
    def _check_trajectories_dynamics(trajectories):
        _, envs = AgentEvaluator.mdps_and_envs_from_trajectories(trajectories)

        for idx in range(len(trajectories["ep_observations"])):
            states, actions, rewards = (trajectories["ep_observations"][idx],
                                        trajectories["ep_actions"][idx],
                                        trajectories["ep_rewards"][idx])
            simulation_env = envs[idx]

            assert len(states) == len(actions) == len(rewards), \
                "# states {}\t# actions {}\t# rewards {}".format(
                    len(states), len(actions), len(rewards))

            # Checking that actions would give rise to same behaviour in current MDP
            for i in range(len(states) - 1):
                curr_state = states[i]
                simulation_env.state = curr_state

                next_state, reward, done, info = simulation_env.step(actions[i])

                assert states[i + 1] == next_state, \
                    "States differed (expected vs actual): {}".format(
                        simulation_env.display_states(states[i + 1], next_state))
                assert rewards[i] == reward, "{} \t {}".format(rewards[i], reward)

    @staticmethod
    def mdps_and_envs_from_trajectories(trajectories):
        mdps, envs = [], []
        for idx in range(len(trajectories["ep_lengths"])):
            mdp_params, env_params = (trajectories["mdp_params"][idx],
                                      trajectories["env_params"][idx])
            mdp = OvercookedGridworld.from_layout_name(**mdp_params)
            env = OvercookedEnv(mdp, **env_params)
            mdps.append(mdp)
            envs.append(env)
        return mdps, envs

    ### I/O METHODS ###

    @staticmethod
    def save_trajectory(trajectory, filename):
        AgentEvaluator.check_trajectories(trajectory)
        save_pickle(trajectory, filename)

    @staticmethod
    def load_trajectory(filename):
        traj = load_pickle(filename)
        AgentEvaluator.check_trajectories(traj)
        return traj

    @staticmethod
    def save_traj_in_stable_baselines_format(rollout_trajs, filename):
        # Converting episode dones to episode starts
        eps_starts = [np.zeros(len(traj)) for traj in rollout_trajs["ep_dones"]]
        for ep_starts in eps_starts:
            ep_starts[0] = 1
        eps_starts = [ep_starts.astype(np.bool) for ep_starts in eps_starts]

        stable_baselines_trajs_dict = {
            'actions': np.concatenate(rollout_trajs["ep_actions"]),
            'obs': np.concatenate(rollout_trajs["ep_observations"]),
            'rewards': np.concatenate(rollout_trajs["ep_rewards"]),
            'episode_starts': np.concatenate(eps_starts),
            'episode_returns': rollout_trajs["ep_returns"]
        }
        stable_baselines_trajs_dict = {
            k: np.array(v) for k, v in stable_baselines_trajs_dict.items()
        }
        np.savez(filename, **stable_baselines_trajs_dict)

    @staticmethod
    def save_traj_as_json(trajectory, filename):
        """Saves the `idx`th trajectory as a list of state action pairs"""
        assert set(DEFAULT_TRAJ_KEYS) == set(trajectory.keys()), \
            "{} vs\n{}".format(DEFAULT_TRAJ_KEYS, trajectory.keys())
        AgentEvaluator.check_trajectories(trajectory)
        trajectory = AgentEvaluator.make_trajectories_json_serializable(trajectory)
        save_as_json(trajectory, filename)

    @staticmethod
    def make_trajectories_json_serializable(trajectories):
        """
        Cannot convert np.arrays or special types of ints to JSON.
        This method converts all components of a trajectory to standard types.
        """
        dict_traj = copy.deepcopy(trajectories)
        dict_traj["ep_observations"] = [
            [ob.to_dict() for ob in one_ep_obs]
            for one_ep_obs in trajectories["ep_observations"]
        ]
        for k in dict_traj.keys():
            dict_traj[k] = list(dict_traj[k])
        dict_traj['ep_actions'] = [list(lst) for lst in dict_traj['ep_actions']]
        dict_traj['ep_rewards'] = [list(lst) for lst in dict_traj['ep_rewards']]
        dict_traj['ep_dones'] = [int(lst) for lst in dict_traj['ep_dones']]
        dict_traj['ep_returns'] = [int(val) for val in dict_traj['ep_returns']]
        dict_traj['ep_lengths'] = [int(val) for val in dict_traj['ep_lengths']]
        return dict_traj

    @staticmethod
    def load_traj_from_json(filename):
        traj_dict = load_from_json(filename)
        traj_dict["ep_observations"] = [
            [OvercookedState.from_dict(ob) for ob in curr_ep_obs]
            for curr_ep_obs in traj_dict["ep_observations"]
        ]
        traj_dict["ep_actions"] = [
            [tuple(tuple(a) if type(a) is list else a for a in j_a) for j_a in ep_acts]
            for ep_acts in traj_dict["ep_actions"]
        ]
        return traj_dict

    ### VISUALIZATION METHODS ###

    @staticmethod
    def interactive_from_traj(trajectories, traj_idx=0):
        """
        Displays the ith trajectory of trajectories (in standard format)
        interactively in a Jupyter notebook.
        """
        from ipywidgets import widgets, interactive_output

        states = trajectories["ep_observations"][traj_idx]
        joint_actions = trajectories["ep_actions"][traj_idx]
        cumulative_rewards = cumulative_rewards_from_rew_list(
            trajectories["ep_rewards"][traj_idx])
        mdp_params = trajectories["mdp_params"][traj_idx]
        env_params = trajectories["env_params"][traj_idx]
        env = AgentEvaluator(mdp_params, env_params=env_params).env

        def update(t=1.0):
            env.state = states[int(t)]
            joint_action = joint_actions[int(t - 1)] if t > 0 else (Action.STAY, Action.STAY)
            print(env)
            print("Joint Action: {} \t Score: {}".format(
                Action.joint_action_to_char(joint_action), cumulative_rewards[t]))

        t = widgets.IntSlider(min=0, max=len(states) - 1, step=1, value=0)
        out = interactive_output(update, {'t': t})
        display(out, t)
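

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): evaluating a
# pair of greedy human models on a fixed layout and saving the rollout.
# The layout name, horizon, and output filename below are placeholders chosen
# for the example, not values mandated by the class.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    evaluator = AgentEvaluator(mdp_params={"layout_name": "cramped_room"},
                               env_params={"horizon": 400})
    # Roll out two GreedyHumanModel agents against each other
    trajs = evaluator.evaluate_human_model_pair(display=False, num_games=1)
    # Sanity-check the trajectory format/dynamics, then persist it
    AgentEvaluator.check_trajectories(trajs)
    AgentEvaluator.save_traj_as_json(trajs, "example_traj")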