Example 1
    def _test_bc_schedule(self, bc_schedule, expected_agents_probas):
        self.params['multi_agent_params']['bc_schedule'] = bc_schedule

        env = OvercookedMultiAgent.from_config(self.params)
        actual_agents_probas = []

        for t in self.timesteps:
            env.anneal_agents_schedule(t)
            actual_agents_probas.append(env.agents_probas)
        self._assert_lists_almost_equal(expected_agents_probas, actual_agents_probas)
Example 2
    def test_env_creation(self):
        # Valid creation
        env = OvercookedMultiAgent.from_config(self.params)
        for param, expected in self.params['multi_agent_params'].items():
            self.assertEqual(expected, getattr(env, param))

        # Invalid bc_schedules
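        # Each schedule below violates a constraint of a valid bc_schedule (points are
        # (timestep, bc_factor) pairs): negative timesteps, timesteps that are not
        # strictly increasing, or a bc_factor outside [0, 1].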
        invalid_schedules = [[(-1, 0.0), (1.0, 1e5)], [(0.0, 0.0), (10, 1),  (5, 0.5)], [(0, 0), (5, 1), (10, 1.5)]]
        for sched in invalid_schedules:
            self.params['multi_agent_params']['bc_schedule'] = sched
            self.assertRaises(AssertionError, OvercookedMultiAgent.from_config, self.params)
Example 3
    def test_agent_creation(self):
        env = OvercookedMultiAgent.from_config(self.params)
        obs = env.reset()

        # Check that we have the right number of agents with valid names
        self.assertEqual(len(env.current_agents_ids), 2)
        self.assertListEqual(list(obs.keys()), env.current_agents_ids)

        # Ensure that bc agents are created a 'factor' fraction of the time
        bc_factors = [0.0, 0.1, 0.5, 0.9, 1.0]
        for factor in bc_factors:
            self._test_bc_creation_proportion(env, factor)
Example 4
    def test_reward_shaping_annealing(self):
        self.params['multi_agent_params']['reward_shaping_factor'] = 1
        self.params['multi_agent_params']['reward_shaping_horizon'] = 1e3

        expected_rew_factors = [1, 990/1e3, 900/1e3, 500/1e3, 0.0, 0.0, 0.0, 0.0]
        actual_rew_factors = []

        env = OvercookedMultiAgent.from_config(self.params)

        for t in self.timesteps:
            env.anneal_reward_shaping_factor(t)
            actual_rew_factors.append(env.reward_shaping_factor)

        self._assert_lists_almost_equal(expected_rew_factors, actual_rew_factors)
def my_config():
    ### PPO model params ###

    # Base model params
    NUM_HIDDEN_LAYERS = 3
    SIZE_HIDDEN_LAYERS = 64
    NUM_FILTERS = 25
    NUM_CONV_LAYERS = 3
    # whether to use recurrence in ppo model
    use_lstm = False
    D2RL = False
    # LSTM memory cell size (only used if use_lstm=True)
    CELL_SIZE = 256



    ### Other agents params ###
    # path to pickled policy model for behavior cloning
    bc_model_dir = os.path.join(BC_SAVE_DIR, "default")

    # Whether bc agents should return action logit argmax or sample
    bc_stochastic = True
    
    # config for non machine learning based agents (rule based)
    # agent_init_kwargs_variables is used by fill_init_kwargs inside OvercookedMultiAgent.create_non_ml_agent to fill selected fields with env/mdp attributes
    # if you want to change these, the easiest way is to supply a .txt file containing the whole variable, which can be evaluated into this dict
    non_ml_agents_params = {
        "StayAgent": {
            "config": {
                "agent_cls": StayAgent
            }
        },
        "RandomAgentInteracting": {
             "config": {
                "agent_cls": RandomAgent,
                "agent_init_kwargs": {
                    "all_actions": True
                }
             }
        },
        "RandomAgent": {
             "config": {
                "agent_cls": RandomAgent,
                "agent_init_kwargs": {
                    "all_actions": False
                }
             }
        },
        "GreedyHumanModel": {
            "config": {
                "agent_cls": GreedyHumanModel,
                "agent_init_kwargs": {},
                "agent_init_kwargs_variables": {
                    "mlam": "env.mlam"
                }
            }
        }
    }
    non_ml_agents_params_file = None
    if non_ml_agents_params_file:
        with open(non_ml_agents_params_file, "r") as f:
            non_ml_agents_params = eval(f.read())
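    # A minimal sketch of such a file (hypothetical contents, not a repo default): the .txt file holds a
    # Python literal that the eval() above turns back into a dict with the same shape as non_ml_agents_params, e.g.
    # {
    #     "StayAgent": {"config": {"agent_cls": StayAgent}},
    #     "RandomAgent": {"config": {"agent_cls": RandomAgent, "agent_init_kwargs": {"all_actions": False}}}
    # }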



    ### Training Params ###

    num_workers = 20 if not LOCAL_TESTING else 2

    # list of all random seeds to use for experiments, used to reproduce results
    seeds = [0]

    # Placeholder for the random seed of the current trial
    seed = None

    # Number of gpus the central driver should use
    num_gpus = 0 if LOCAL_TESTING else 1

    # How many environment timesteps will be simulated (across all environments)
    # for one set of gradient updates. This is divided equally across environments
    # train_batch_size = 40000 if not LOCAL_TESTING else 800
    train_batch_size = 100000 if not LOCAL_TESTING else 800

    # size of minibatches we divide up each batch into before
    # performing gradient steps
    # sgd_minibatch_size = 10000 if not LOCAL_TESTING else 800
    sgd_minibatch_size = 25000 if not LOCAL_TESTING else 800

    # Rollout length
    rollout_fragment_length = 400

    # Stepsize of SGD.
    lr = 5e-3

    # Learning rate schedule.
    lr_schedule = None
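    # A sketch of a non-trivial schedule (assumed to follow RLlib's piecewise [[timestep, value], ...] convention;
    # the numbers are illustrative only), e.g. decaying the stepsize over the first 3e6 timesteps:
    # lr_schedule = [[0, 5e-3], [3e6, 1e-4]]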

    # If specified, clip the global norm of gradients by this amount
    grad_clip = 0.1

    # Discount factor
    gamma = 0.99

    # Exponential decay factor for GAE (how much weight to put on monte carlo samples)
    # Reference: https://arxiv.org/pdf/1506.02438.pdf
    lmbda = 0.98

    # Whether the value function shares layers with the policy model
    vf_share_layers = True

    # How much the loss of the value network is weighted in overall loss
    vf_loss_coeff = 1e-4

    # Entropy bonus coefficient, will anneal linearly from _start to _end over _horizon steps
    entropy_coeff_start = 0.02
    entropy_coeff_end = 0.00005
    entropy_coeff_horizon = 3e5
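    # With a linear anneal, halfway through the horizon (t = 1.5e5) the coefficient would be roughly
    # (0.02 + 0.00005) / 2 ~= 0.010025; the (timestep, value) pairs are assembled into
    # entropy_coeff_schedule further down in training_params.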

    # Initial coefficient for KL divergence.
    kl_coeff = 0.2

    # PPO clipping factor
    clip_param = 0.05

    # Number of SGD iterations in each outer loop (i.e., number of epochs to
    # execute per train batch).
    num_sgd_iter = 8 if not LOCAL_TESTING else 1

    # Whether tensorflow should execute eagerly or not
    eager = False

    # Number of training iterations to run
    num_training_iters = 400 if not LOCAL_TESTING else 2
    
    # How many training iterations (calls to trainer.train()) to run before saving a model checkpoint
    save_freq = 250

    # Whether all PPO agents should share the same policy network
    shared_policy = True



    ### Evaluation params ###
    
    # Agents used in evaluation
    evaluation_agents = ["ppo", "ppo"]

    # How many timesteps should be in an evaluation episode
    evaluation_ep_length = 400

    # Number of games to simulate per evaluation
    evaluation_num_games = 2

    # Whether to display rollouts in evaluation
    evaluation_display = True

    # Where to store replay txt files
    evaluation_replay_store_dir = None

    # How many training iterations to run between each evaluation
    evaluation_interval = 50 if not LOCAL_TESTING else 1

    ### Environment Params ###

    outer_shape = (5, 4)
    
    # The number of MDPs in env.mdp_lst
    num_mdp = 1
    # num_mdp = np.inf  # for infinite mdp

    # Max episode length
    horizon = 400

    # used when one of the agents uses mlam
    # NOTE: since layouts are generated, there is no point in modifying counter_goals/drop/pickup
    mlam_params = {
        'start_orientations': False,
        'wait_allowed': False,
        'counter_goals': [],
        'counter_drop': [],
        'counter_pickup': [],
        'same_motion_goals': True
    }

    # Whether dense reward should come from potential function or not
    use_phi = True

    # Constant by which shaped rewards are multiplied when calculating total reward
    reward_shaping_factor = 1.0

    # Linearly anneal the reward shaping factor such that it reaches zero after this number of timesteps
    reward_shaping_horizon = 1e6
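    # Consistent with the annealing test above (horizon 1e3: t=10 -> 0.99, t=100 -> 0.9, t=500 -> 0.5),
    # the annealed factor presumably follows the linear form
    # reward_shaping_factor * max(0, 1 - t / reward_shaping_horizon), reaching 0 at the horizon.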

    # bc_factor represents the probability that the ppo agent gets paired with a bc agent for any given episode
    # the schedule for bc_factor is represented by a list of points (t_i, v_i) where v_i represents the
    # value of bc_factor at timestep t_i. Values are linearly interpolated between points
    # The default listed below represents bc_factor=0 for all timesteps
    bc_schedule = None
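    # An illustrative (non-default) schedule that anneals bc_factor from 0 to 1 over the first 1e5 timesteps:
    # bc_schedule = [(0, 0.0), (1e5, 1.0)]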

    # agents_schedule is a list of dicts where the key "agents" maps to a list of dicts giving the probability of having an agent
    #   of a given type for every player at a given timestep (key "timestep"), with linear interpolation between timesteps
    #   example dict: {"timestep": 10, "agents": [{"ppo":1}, {"ppo":0.3, "bc": 0.7}]}
    # you cannot change agents_schedule directly by running this file from the command line (python ppo_rllib_client.py with agents_schedule=...);
    #   use agents_schedule_file instead
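    # an illustrative (non-default) schedule: pure self-play at t=0, shifting player 1 towards bc by t=2e6
    # agents_schedule = [
    #     {"timestep": 0,   "agents": [{"ppo": 1}, {"ppo": 1}]},
    #     {"timestep": 2e6, "agents": [{"ppo": 1}, {"ppo": 0.3, "bc": 0.7}]},
    # ]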
    agents_schedule_file = None
    if bc_schedule is not None:
        agents_schedule = OvercookedMultiAgent.bc_schedule_to_agents_schedule(bc_schedule)
    elif agents_schedule_file:
        agents_schedule = load_dict_from_unknown_filetype(agents_schedule_file)
    else:
        agents_schedule = OvercookedMultiAgent.self_play_schedule
    agents_schedule_file = None

    # whether agents' starting positions should be shuffled in training and evaluation
    shuffle_agents = True

    # map type of ml based agent (ppo or bc) to state encoding functions and observation spaces
    # state encoding functions and observation spaces can be given as strings if they are attributes/properties of the mdp, using the
    # format "mdp.property_name", e.g.
    # {"ppo": "mdp.lossless_state_encoding_gym_space",
    # "bc": "mdp.lossless_state_encoding_gym_space"}
    # use dicts for dict state spaces (the only supported keys are "observations" (encoded observation, used by default)
    # and "auxillary_info" (a 1d vector containing info about anything else, e.g. orders)), e.g.
    # {"ppo": {"observations": "mdp.lossless_state_encoding_gym_space",
    #          "auxillary_info": "mdp.multi_hot_orders_encoding_gym_space"},
    #  "bc": "mdp.lossless_state_encoding_gym_space"}
    featurize_fns = copy.deepcopy(OvercookedMultiAgent.default_featurize_fns)
    featurize_fns_file = None
    if featurize_fns_file:
        featurize_fns = load_dict_from_unknown_filetype(featurize_fns_file)

    observation_spaces = copy.deepcopy(OvercookedMultiAgent.default_observation_spaces)
    observation_spaces_file = None
    if observation_spaces_file:
        observation_spaces = load_dict_from_unknown_filetype(observation_spaces_file)



    # Where to log the ray dashboard stats
    temp_dir = os.path.join(os.path.abspath(os.sep), "tmp", "ray_tmp") if not LOCAL_TESTING else None

    # Where to store model checkpoints and training stats
    results_dir = os.path.join(os.path.abspath('.'), 'results_client_temp')
    
    params_str = "nw=%d_vf=%f_es=%f_en=%f_kl=%f_outer_shape=%d_%d--inner_shape=%d_%d--prop_empty=%f--prop_feats=%f" % (
        num_workers,
        vf_loss_coeff,
        entropy_coeff_start,
        entropy_coeff_end,
        kl_coeff,
        outer_shape[0],
        outer_shape[1],
        INNER_SHAPE[0],
        INNER_SHAPE[1],
        PROP_EMPTY,
        PROP_FEATS
    )

    # Name of directory to store training results in (stored in ~/ray_results/<experiment_name>)
    experiment_name = "{0}_{1}".format("PPO_fp_", params_str)


    ppo_agent_params = {
            "policy_cls": DictObsSpacePPOTFPolicy,
            "config": {
                "model": {
                    # To be passed into rl-lib model/custom_options config
                    "custom_options": {
                        "NUM_HIDDEN_LAYERS": NUM_HIDDEN_LAYERS,
                        "SIZE_HIDDEN_LAYERS": SIZE_HIDDEN_LAYERS,
                        "NUM_FILTERS": NUM_FILTERS,
                        "NUM_CONV_LAYERS": NUM_CONV_LAYERS,      
                        "use_lstm": use_lstm,
                        "CELL_SIZE": CELL_SIZE,
                        "D2RL": D2RL
                    },
                    "custom_model": CUSTOM_MODEL_ID
                }
            }
    }

    bc_agent_params = {
        "policy_cls": BehaviorCloningPolicy,
        "config": {
            "model_dir": bc_model_dir,
            "stochastic": bc_stochastic,
            "eager": eager
        }
    }

    ml_agent_params = {
        "ppo": ppo_agent_params,
        "bc": bc_agent_params
    }

    # to be passed into the rllib.PPOTrainer class
    training_params = {
        "num_workers": num_workers,
        "seed": seed,
        "num_gpus": num_gpus,
        "train_batch_size": train_batch_size,
        "sgd_minibatch_size": sgd_minibatch_size,
        "rollout_fragment_length": rollout_fragment_length,
        "lr": lr,
        "lr_schedule": lr_schedule,
        "grad_clip": grad_clip,
        "gamma": gamma,
        "lambda": lmbda,
        "vf_share_layers": vf_share_layers,
        "vf_loss_coeff": vf_loss_coeff,
        "entropy_coeff_schedule": [(0, entropy_coeff_start), (entropy_coeff_horizon, entropy_coeff_end)],
        "kl_coeff": kl_coeff,
        "clip_param": clip_param,
        "num_sgd_iter": num_sgd_iter,
        "evaluation_interval": evaluation_interval,
        "eager": eager
    }

    # To be passed into AgentEvaluator constructor and _evaluate function
    evaluation_params = {
        "agents": evaluation_agents,
        "ep_length": evaluation_ep_length,
        "num_games": evaluation_num_games,
        "display": evaluation_display,
        "store_dir": evaluation_replay_store_dir,
        "non_ml_agents_params": non_ml_agents_params,
        "display_phi": True
    }

    environment_params = {
        # To be passed into OvercookedGridWorld constructor
        "outer_shape": outer_shape,
        "mdp_params_schedule_fn": naive_params_schedule_fn,
        # To be passed into OvercookedEnv constructor
        "env_params": {
            "horizon": horizon,
            "num_mdp": num_mdp,
            "initial_info": {}
        },

        # evaluation mdp params
        "eval_mdp_params": get_mdp_default_gen_params(rew_shaping=False),

        #"eval_mdp_params" :{
        #    "layout_name": "cramped_room"
        #},
        # To be passed into OvercookedMultiAgent constructor
        "multi_agent_params": {
            "use_phi": use_phi,
            "reward_shaping_factor": reward_shaping_factor,
            "reward_shaping_horizon": reward_shaping_horizon,
            "agents_schedule": agents_schedule,
            "shuffle_agents": shuffle_agents,
            "featurize_fns": featurize_fns,
            "observation_spaces": observation_spaces,
            "non_ml_agents_params": non_ml_agents_params
        }
    }

    ray_params = {
        "custom_model_id": CUSTOM_MODEL_ID,
        "custom_model_cls": RllibLSTMPPOModel if use_lstm else RllibPPOModel,
        "temp_dir": temp_dir,
        "env_creator": _env_creator
    }

    params = {
        "agent_params": ml_agent_params,
        "training_params": training_params,
        "evaluation_params": evaluation_params,
        "environment_params": environment_params,
        "ray_params": ray_params,
        "shared_policy": shared_policy,
        "num_training_iters": num_training_iters,
        "experiment_name": experiment_name,
        "save_every": save_freq,
        "seeds": seeds,
        "results_dir": results_dir,
    }
def _env_creator(env_config):
    # Re-import required here to work with serialization
    from human_aware_rl.rllib.rllib import OvercookedMultiAgent 
    return OvercookedMultiAgent.from_config(env_config)
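# A minimal usage sketch (the registration name below is hypothetical): the creator is passed to the
# training code via ray_params["env_creator"] above and would typically be registered along the lines of
#   from ray.tune.registry import register_env
#   register_env("overcooked_multi_agent", _env_creator)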