def testPyTorchModel(self):
    ModelCatalog.register_custom_model("composite", TorchSpyModel)
    register_env("nested", lambda _: NestedDictEnv())
    a2c = A2CAgent(
        env="nested",
        config={
            "num_workers": 0,
            "use_pytorch": True,
            "sample_batch_size": 5,
            "train_batch_size": 5,
            "model": {
                "custom_model": "composite",
            },
        })

    a2c.train()

    # Check that the model sees the correct reconstructed observations
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "torch_spy_in_{}".format(i)))
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = one_hot(
            DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)
def doTestNestedDict(self, make_env, test_lstm=False):
    ModelCatalog.register_custom_model("composite", DictSpyModel)
    register_env("nested", make_env)
    pg = PGAgent(
        env="nested",
        config={
            "num_workers": 0,
            "sample_batch_size": 5,
            "train_batch_size": 5,
            "model": {
                "custom_model": "composite",
                "use_lstm": test_lstm,
            },
        })
    pg.train()

    # Check that the model sees the correct reconstructed observations
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "d_spy_in_{}".format(i)))
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = one_hot(
            DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)
def doTestNestedTuple(self, make_env):
    ModelCatalog.register_custom_model("composite2", TupleSpyModel)
    register_env("nested2", make_env)
    pg = PGAgent(
        env="nested2",
        config={
            "num_workers": 0,
            "sample_batch_size": 5,
            "train_batch_size": 5,
            "model": {
                "custom_model": "composite2",
            },
        })
    pg.train()

    # Check that the model sees the correct reconstructed observations
    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "t_spy_in_{}".format(i)))
        pos_i = TUPLE_SAMPLES[i][0].tolist()
        cam_i = TUPLE_SAMPLES[i][1][0].tolist()
        task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)
def __init__(self, sess, action_space, obs_space, preprocessor,
             observation_filter, model_config, action_noise_std=0.0):
    self.sess = sess
    self.action_space = action_space
    self.action_noise_std = action_noise_std
    self.preprocessor = preprocessor
    self.observation_filter = get_filter(observation_filter,
                                         self.preprocessor.shape)
    self.inputs = tf.placeholder(tf.float32,
                                 [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        action_space, model_config, dist_type="deterministic")
    model = ModelCatalog.get_model({
        "obs": self.inputs
    }, obs_space, action_space, dist_dim, model_config)
    dist = dist_class(model.outputs)
    self.sampler = dist.sample()

    self.variables = ray.experimental.tf_utils.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
    self.sess.run(tf.global_variables_initializer())
def testCustomModel(self):
    ray.init()
    ModelCatalog.register_custom_model("foo", CustomModel)
    p1 = ModelCatalog.get_model({
        "obs": tf.constant([1, 2, 3])
    }, Box(0, 1, shape=(3, ), dtype=np.float32), Discrete(5), 5,
                                {"custom_model": "foo"})
    self.assertEqual(str(type(p1)), str(CustomModel))
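# A minimal sketch (hypothetical, not from the original test file) of the
# kind of `CustomModel` the test above registers, using the legacy RLlib
# Model API where `_build_layers_v2` returns (outputs, feature_layer).
from ray.rllib.models import Model


class CustomModel(Model):
    def _build_layers_v2(self, input_dict, num_outputs, options):
        # A single linear layer mapping the flattened obs to the logits.
        out = tf.layers.dense(input_dict["obs"], num_outputs, activation=None)
        return out, out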
def testGymPreprocessors(self):
    p1 = ModelCatalog.get_preprocessor(
        get_registry(), gym.make("CartPole-v0"))
    self.assertEqual(type(p1), NoPreprocessor)

    p2 = ModelCatalog.get_preprocessor(
        get_registry(), gym.make("FrozenLake-v0"))
    self.assertEqual(type(p2), OneHotPreprocessor)
def testInvalidModel(self):
    ModelCatalog.register_custom_model("invalid", InvalidModel)
    self.assertRaises(ValueError, lambda: PGAgent(
        env="CartPole-v0", config={
            "model": {
                "custom_model": "invalid",
            },
        }))
def testMinibatchSequencing(self):
    ModelCatalog.register_custom_model("rnn", RNNSpyModel)
    register_env("counter", lambda _: DebugCounterEnv())
    ppo = PPOAgent(
        env="counter",
        config={
            "num_workers": 0,
            "sample_batch_size": 20,
            "train_batch_size": 20,
            "sgd_minibatch_size": 10,
            "vf_share_layers": True,
            "simple_optimizer": False,
            "num_sgd_iter": 1,
            "model": {
                "custom_model": "rnn",
                "max_seq_len": 4,
            },
        })
    ppo.train()
    ppo.train()

    # first epoch: 20 observations get split into 2 minibatches of 8
    # four observations are discarded
    batch0 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
    batch1 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
    if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
        batch0, batch1 = batch1, batch0  # sort minibatches
    self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
    self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
    self.assertEqual(batch0["sequences"].tolist(), [
        [[0], [1], [2], [3]],
        [[4], [5], [6], [7]],
    ])
    self.assertEqual(batch1["sequences"].tolist(), [
        [[8], [9], [10], [11]],
        [[12], [13], [14], [0]],
    ])

    # second epoch: 20 observations get split into 2 minibatches of 8
    # four observations are discarded
    batch2 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
    batch3 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
    if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
        batch2, batch3 = batch3, batch2
    self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
    self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
    self.assertEqual(batch2["sequences"].tolist(), [
        [[5], [6], [7], [8]],
        [[9], [10], [11], [12]],
    ])
    self.assertEqual(batch3["sequences"].tolist(), [
        [[13], [14], [0], [0]],
        [[0], [1], [2], [3]],
    ])
def testInvalidModel2(self):
    ModelCatalog.register_custom_model("invalid2", InvalidModel2)
    self.assertRaisesRegexp(
        ValueError, "Expected output.*",
        lambda: PGAgent(
            env="CartPole-v0", config={
                "model": {
                    "custom_model": "invalid2",
                },
            }))
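# A hedged sketch (class body is an assumption, not from the source) of what
# `InvalidModel2` above could look like: it emits a wrong number of outputs,
# which is what trips the catalog's "Expected output..." shape check.
class InvalidModel2(Model):
    def _build_layers_v2(self, input_dict, num_outputs, options):
        # Deliberately produce num_outputs + 1 logits so the output-shape
        # validation fails.
        bad = tf.layers.dense(input_dict["obs"], num_outputs + 1)
        return bad, bad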
def testSimpleOptimizerSequencing(self):
    ModelCatalog.register_custom_model("rnn", RNNSpyModel)
    register_env("counter", lambda _: DebugCounterEnv())
    ppo = PPOAgent(
        env="counter",
        config={
            "num_workers": 0,
            "sample_batch_size": 10,
            "train_batch_size": 10,
            "sgd_minibatch_size": 10,
            "vf_share_layers": True,
            "simple_optimizer": True,
            "num_sgd_iter": 1,
            "model": {
                "custom_model": "rnn",
                "max_seq_len": 4,
            },
        })
    ppo.train()
    ppo.train()

    batch0 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
    self.assertEqual(
        batch0["sequences"].tolist(),
        [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]])
    self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2])
    self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
    self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
    self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
    self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
    self.assertTrue(
        np.allclose(batch0["state_in"][0].tolist()[1:],
                    batch0["state_out"][0].tolist()[:-1]))
    self.assertTrue(
        np.allclose(batch0["state_in"][1].tolist()[1:],
                    batch0["state_out"][1].tolist()[:-1]))

    batch1 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
    self.assertEqual(batch1["sequences"].tolist(), [
        [[10], [11], [12], [13]],
        [[14], [0], [0], [0]],
        [[0], [1], [2], [3]],
        [[4], [0], [0], [0]],
    ])
    self.assertEqual(batch1["seq_lens"].tolist(), [4, 1, 4, 1])
    self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
    self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
    self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
def testDefaultModels(self):
    ray.init()

    with tf.variable_scope("test1"):
        p1 = ModelCatalog.get_model(
            get_registry(), np.zeros((10, 3), dtype=np.float32), 5)
        self.assertEqual(type(p1), FullyConnectedNetwork)

    with tf.variable_scope("test2"):
        p2 = ModelCatalog.get_model(
            get_registry(), np.zeros((10, 80, 80, 3), dtype=np.float32), 5)
        self.assertEqual(type(p2), VisionNetwork)
def testDefaultModels(self):
    ray.init()

    with tf.variable_scope("test1"):
        p1 = ModelCatalog.get_model({
            "obs": tf.zeros((10, 3), dtype=tf.float32)
        }, Box(0, 1, shape=(3, ), dtype=np.float32), 5, {})
        self.assertEqual(type(p1), FullyConnectedNetwork)

    with tf.variable_scope("test2"):
        p2 = ModelCatalog.get_model({
            "obs": tf.zeros((10, 84, 84, 3), dtype=tf.float32)
        }, Box(0, 1, shape=(84, 84, 3), dtype=np.float32), 5, {})
        self.assertEqual(type(p2), VisionNetwork)
def testMultiAgentComplexSpaces(self):
    ModelCatalog.register_custom_model("dict_spy", DictSpyModel)
    ModelCatalog.register_custom_model("tuple_spy", TupleSpyModel)
    register_env("nested_ma", lambda _: NestedMultiAgentEnv())
    act_space = spaces.Discrete(2)
    pg = PGAgent(
        env="nested_ma",
        config={
            "num_workers": 0,
            "sample_batch_size": 5,
            "train_batch_size": 5,
            "multiagent": {
                "policy_graphs": {
                    "tuple_policy": (
                        PGPolicyGraph, TUPLE_SPACE, act_space,
                        {"model": {"custom_model": "tuple_spy"}}),
                    "dict_policy": (
                        PGPolicyGraph, DICT_SPACE, act_space,
                        {"model": {"custom_model": "dict_spy"}}),
                },
                "policy_mapping_fn": lambda a: {
                    "tuple_agent": "tuple_policy",
                    "dict_agent": "dict_policy"}[a],
            },
        })
    pg.train()

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "d_spy_in_{}".format(i)))
        pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
        cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
        task_i = one_hot(
            DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)

    for i in range(4):
        seen = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get(
                "t_spy_in_{}".format(i)))
        pos_i = TUPLE_SAMPLES[i][0].tolist()
        cam_i = TUPLE_SAMPLES[i][1][0].tolist()
        task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
        self.assertEqual(seen[0][0].tolist(), pos_i)
        self.assertEqual(seen[1][0].tolist(), cam_i)
        self.assertEqual(seen[2][0].tolist(), task_i)
def __init__(self, state_values, cumulative_rewards, logits, actions,
             action_space, beta):
    ma_adv_norm = tf.get_variable(
        name="moving_average_of_advantage_norm",
        dtype=tf.float32,
        initializer=100.0,
        trainable=False)
    # advantage estimation
    adv = cumulative_rewards - state_values
    # update averaged advantage norm
    update_adv_norm = tf.assign_add(
        ref=ma_adv_norm,
        value=1e-6 * (tf.reduce_mean(tf.square(adv)) - ma_adv_norm))

    # exponentially weighted advantages
    with tf.control_dependencies([update_adv_norm]):
        exp_advs = tf.exp(beta * tf.divide(adv, 1e-8 + tf.sqrt(ma_adv_norm)))

    # log\pi_\theta(a|s)
    dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
    action_dist = dist_cls(logits)
    logprobs = action_dist.logp(actions)

    self.loss = -1.0 * tf.reduce_mean(tf.stop_gradient(exp_advs) * logprobs)
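# A small, self-contained NumPy illustration (not from the source) of the
# exponential advantage weighting used above: advantages are divided by the
# running RMS of the advantage and exponentiated, so positive advantages are
# amplified and negative ones are pushed toward zero before weighting logp.
import numpy as np

beta = 1.0
ma_adv_norm = 4.0  # running mean of squared advantages
adv = np.array([-2.0, 0.0, 2.0])
exp_advs = np.exp(beta * adv / (1e-8 + np.sqrt(ma_adv_norm)))
print(exp_advs)  # ~[0.37, 1.0, 2.72]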
def _build_q_network(registry, inputs, num_actions, config):
    dueling = config["dueling"]
    hiddens = config["hiddens"]
    frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"])
    frontend_out = frontend.last_layer

    with tf.variable_scope("action_value"):
        action_out = frontend_out
        for hidden in hiddens:
            action_out = layers.fully_connected(
                action_out, num_outputs=hidden, activation_fn=tf.nn.relu)
        action_scores = layers.fully_connected(
            action_out, num_outputs=num_actions, activation_fn=None)

    if dueling:
        with tf.variable_scope("state_value"):
            state_out = frontend_out
            for hidden in hiddens:
                state_out = layers.fully_connected(
                    state_out, num_outputs=hidden, activation_fn=tf.nn.relu)
            state_score = layers.fully_connected(
                state_out, num_outputs=1, activation_fn=None)
        action_scores_mean = tf.reduce_mean(action_scores, 1)
        action_scores_centered = action_scores - tf.expand_dims(
            action_scores_mean, 1)
        return state_score + action_scores_centered
    else:
        return action_scores
def _build_q_network(self, obs, obs_space, action_space, actions):
    q_net = QNetwork(
        ModelCatalog.get_model({
            "obs": obs,
            "is_training": self._get_is_training_placeholder(),
        }, obs_space, action_space, 1, self.config["model"]), actions,
        self.config["critic_hiddens"],
        self.config["critic_hidden_activation"])
    return q_net.value, q_net.model
def _build_p_network(self, obs, obs_space):
    policy_net = PNetwork(
        ModelCatalog.get_model({
            "obs": obs,
            "is_training": self._get_is_training_placeholder(),
        }, obs_space, 1, self.config["model"]), self.dim_actions,
        self.config["actor_hiddens"],
        self.config["actor_hidden_activation"])
    return policy_net.action_scores, policy_net.model
def __init__(self, registry, env_creator, config, logdir):
    env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, env_creator(config["env_config"]), config["model"])
    self.dataset = ExperienceDataset(config["dataset_path"])
    # TODO(rliaw): should change this to be just env.observation_space
    self.policy = BCPolicy(registry, env.observation_space.shape,
                           env.action_space, config)
    self.config = config
    self.logdir = logdir
    self.metrics_queue = queue.Queue()
def _build_q_network(self, obs, space):
    qnet = QNetwork(
        ModelCatalog.get_model({
            "obs": obs,
            "is_training": self._get_is_training_placeholder(),
        }, space, self.num_actions, self.config["model"]),
        self.num_actions, self.config["dueling"], self.config["hiddens"],
        self.config["noisy"], self.config["num_atoms"],
        self.config["v_min"], self.config["v_max"], self.config["sigma0"])
    return qnet.value, qnet.logits, qnet.dist, qnet.model
def run(args, parser):
    def create_environment(env_config):
        # This import must happen inside the method so that worker processes
        # import this code.
        import roboschool
        return gym.make(args.env)

    if not args.config:
        # Load configuration from file.
        config_dir = os.path.dirname(args.checkpoint)
        # params.json is saved in the model directory during ray training
        # by default.
        config_path = os.path.join(config_dir, "params.json")
        with open(config_path) as f:
            args.config = json.load(f)

    if not args.env:
        if not args.config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = args.config.get("env")

    ray.init()
    register_env(args.env, create_environment)

    cls = get_agent_class(args.algorithm)
    config = args.config
    config["monitor"] = False
    config["num_workers"] = 1
    config["num_gpus"] = 0
    agent = cls(env=args.env, config=config)
    agent.restore(args.checkpoint)
    num_episodes = int(args.evaluate_episodes)

    if args.algorithm == "DQN":
        env = gym.make(args.env)
        env = wrap_dqn(env, args.config.get("model", {}))
    else:
        env = ModelCatalog.get_preprocessor_as_wrapper(gym.make(args.env))
    env = wrappers.Monitor(
        env, OUTPUT_DIR, force=True,
        video_callable=lambda episode_id: True)

    all_rewards = []
    for episode in range(num_episodes):
        steps = 0
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done:
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            steps += 1
            state = next_state
        all_rewards.append(reward_total)
        print("Episode reward: %s. Episode steps: %s" % (reward_total, steps))
    print("Mean Reward:", np.mean(all_rewards))
    print("Max Reward:", np.max(all_rewards))
    print("Min Reward:", np.min(all_rewards))
def testTuplePreprocessor(self):
    ray.init()

    class TupleEnv(object):
        def __init__(self):
            self.observation_space = Tuple(
                [Discrete(5),
                 Box(0, 1, shape=(3, ), dtype=np.float32)])

    p1 = ModelCatalog.get_preprocessor(get_registry(), TupleEnv())
    self.assertEqual(p1.shape, (8, ))
    self.assertEqual(
        list(p1.transform((0, [1, 2, 3]))),
        [float(x) for x in [1, 0, 0, 0, 0, 1, 2, 3]])
def testCustomPreprocessor(self):
    ray.init()
    ModelCatalog.register_custom_preprocessor("foo", CustomPreprocessor)
    ModelCatalog.register_custom_preprocessor("bar", CustomPreprocessor2)
    env = gym.make("CartPole-v0")
    p1 = ModelCatalog.get_preprocessor(env, {"custom_preprocessor": "foo"})
    self.assertEqual(str(type(p1)), str(CustomPreprocessor))
    p2 = ModelCatalog.get_preprocessor(env, {"custom_preprocessor": "bar"})
    self.assertEqual(str(type(p2)), str(CustomPreprocessor2))
    p3 = ModelCatalog.get_preprocessor(env)
    self.assertEqual(type(p3), NoPreprocessor)
def _build_policy_map(self, policy_dict, policy_config):
    policy_map = {}
    preprocessors = {}
    for name, (cls, obs_space, act_space,
               conf) in sorted(policy_dict.items()):
        merged_conf = merge_dicts(policy_config, conf)
        if self.preprocessing_enabled:
            preprocessor = ModelCatalog.get_preprocessor_for_space(
                obs_space, merged_conf.get("model"))
            preprocessors[name] = preprocessor
            obs_space = preprocessor.observation_space
        else:
            preprocessors[name] = NoPreprocessor(obs_space)
        if isinstance(obs_space, gym.spaces.Dict) or \
                isinstance(obs_space, gym.spaces.Tuple):
            raise ValueError(
                "Found raw Tuple|Dict space as input to policy graph. "
                "Please preprocess these observations with a "
                "Tuple|DictFlatteningPreprocessor.")
        with tf.variable_scope(name):
            policy_map[name] = cls(obs_space, act_space, merged_conf)
    return policy_map, preprocessors
def __init__(self, registry, env_creator, config, logdir,
             start_sampler=True):
    env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, env_creator(config["env_config"]), config["model"])
    self.env = env
    policy_cls = get_policy_cls(config)
    # TODO(rliaw): should change this to be just env.observation_space
    self.policy = policy_cls(registry, env.observation_space.shape,
                             env.action_space, config)
    self.config = config

    # Technically not needed when not remote.
    self.obs_filter = get_filter(config["observation_filter"],
                                 env.observation_space.shape)
    self.rew_filter = get_filter(config["reward_filter"], ())
    self.filters = {
        "obs_filter": self.obs_filter,
        "rew_filter": self.rew_filter
    }
    # NOTE: `async` became a reserved word in Python 3.7+; this attribute
    # name comes from the original (pre-3.7) code.
    self.sampler = AsyncSampler(env, self.policy, self.obs_filter,
                                config["batch_size"])
    if start_sampler and self.sampler.async:
        self.sampler.start()
    self.logdir = logdir
        os.path.dirname(os.path.abspath(__file__)),
        "../tests/data/cartpole_small"))

if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()

    # Bazel makes it hard to find files specified in `args` (and `data`).
    # Look for them here.
    if not os.path.exists(args.input_files):
        # This script runs in the ray/rllib/examples dir.
        rllib_dir = Path(__file__).parent.parent
        input_dir = rllib_dir.absolute().joinpath(args.input_files)
        args.input_files = str(input_dir)

    ModelCatalog.register_custom_model(
        "custom_loss", TorchCustomLossModel if args.torch else CustomLossModel)

    config = {
        "env": "CartPole-v0",
        "num_workers": 0,
        "model": {
            "custom_model": "custom_loss",
            "custom_model_config": {
                "input_files": args.input_files,
            },
        },
        "framework": "torch" if args.torch else "tf",
    }

    stop = {
        "training_iteration": args.stop_iters,
def __init__(self, observation_space, action_space, observations,
             value_targets, advantages, actions, prev_logits, prev_vf_preds,
             logit_dim, kl_coeff, distribution_class, config, sess, registry):
    self.prev_dist = distribution_class(prev_logits)

    # Saved so that we can compute actions given different observations.
    self.observations = observations

    self.curr_logits = ModelCatalog.get_model(
        registry, observations, logit_dim, config["model"]).outputs
    self.curr_dist = distribution_class(self.curr_logits)
    self.sampler = self.curr_dist.sample()

    if config["use_gae"]:
        vf_config = config["model"].copy()
        # Do not split the last layer of the value function into
        # mean parameters and standard deviation parameters and
        # do not make the standard deviations free variables.
        vf_config["free_log_std"] = False
        with tf.variable_scope("value_function"):
            self.value_function = ModelCatalog.get_model(
                registry, observations, 1, vf_config).outputs
        self.value_function = tf.reshape(self.value_function, [-1])

    # Make loss functions.
    self.ratio = tf.exp(
        self.curr_dist.logp(actions) - self.prev_dist.logp(actions))
    self.kl = self.prev_dist.kl(self.curr_dist)
    self.mean_kl = tf.reduce_mean(self.kl)
    self.entropy = self.curr_dist.entropy()
    self.mean_entropy = tf.reduce_mean(self.entropy)
    self.surr1 = self.ratio * advantages
    self.surr2 = tf.clip_by_value(self.ratio, 1 - config["clip_param"],
                                  1 + config["clip_param"]) * advantages
    self.surr = tf.minimum(self.surr1, self.surr2)
    self.mean_policy_loss = tf.reduce_mean(-self.surr)

    if config["use_gae"]:
        # We use a huber loss here to be more robust against outliers,
        # which seem to occur when the rollouts get longer (the variance
        # scales superlinearly with the length of the rollout).
        self.vf_loss1 = tf.square(self.value_function - value_targets)
        vf_clipped = prev_vf_preds + tf.clip_by_value(
            self.value_function - prev_vf_preds, -config["clip_param"],
            config["clip_param"])
        self.vf_loss2 = tf.square(vf_clipped - value_targets)
        self.vf_loss = tf.minimum(self.vf_loss1, self.vf_loss2)
        self.mean_vf_loss = tf.reduce_mean(self.vf_loss)
        self.loss = tf.reduce_mean(
            -self.surr + kl_coeff * self.kl +
            config["vf_loss_coeff"] * self.vf_loss -
            config["entropy_coeff"] * self.entropy)
    else:
        self.mean_vf_loss = tf.constant(0.0)
        self.loss = tf.reduce_mean(
            -self.surr + kl_coeff * self.kl -
            config["entropy_coeff"] * self.entropy)

    self.sess = sess

    if config["use_gae"]:
        self.policy_results = [
            self.sampler, self.curr_logits, self.value_function
        ]
    else:
        self.policy_results = [
            self.sampler, self.curr_logits, tf.constant("NA")
        ]
def reward_adapter(env_obs, env_reward):
    return env_reward


def action_adapter(model_action):
    throttle, brake, steering = model_action
    return np.array([throttle, brake, steering])


class TrainingModel(FullyConnectedNetwork):
    NAME = "FullyConnectedNetwork"


ModelCatalog.register_custom_model(TrainingModel.NAME, TrainingModel)


class ModelPolicy(AgentPolicy):
    def __init__(self, path_to_model, observation_space):
        self._prep = ModelCatalog.get_preprocessor_for_space(
            observation_space)
        self._path_to_model = path_to_model

    def setup(self):
        self._sess = tf.Session(graph=tf.Graph())
        self._sess.__enter__()
        tf.saved_model.load(
            self._sess, export_dir=self._path_to_model, tags=["serve"])

    def teardown(self):
if __name__ == '__main__':
    args = parser.parse_args()

    sep = os.pathsep
    os.environ['PYTHONPATH'] = sep.join(sys.path)

    ray.init(include_dashboard=False, num_gpus=1, num_cpus=args.num_cpus)

    env_name = "ray-griddly-env"

    def _create_env(env_config):
        env = RLlibEnv(env_config)
        return FlatActionWrapper(env)

    register_env(env_name, _create_env)

    ModelCatalog.register_custom_model("SimpleConv", SimpleConvFlatAgent)

    wandbLoggerCallback = WandbLoggerCallback(
        project='conditional_action_trees_reproduce',
        api_key_file='~/.wandb_rc',
        dir=args.root_directory)

    max_training_steps = args.max_training_steps

    gdy_file = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), args.yaml_file)

    config = {
        'framework': 'torch',
        'seed': args.seed,
        'num_workers': args.num_workers,
        'num_envs_per_worker': args.num_envs_per_worker,
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env

from .model import ReallocationModel, Dirichlet
from .env import create_env

register_env("TradingEnv", create_env)
ModelCatalog.register_custom_action_dist("dirichlet", Dirichlet)
ModelCatalog.register_custom_model("reallocate", ReallocationModel)
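# A hedged sketch (the surrounding trainer setup is an assumption, not from
# this module) of how the registrations above are typically consumed: the
# registered names are referenced through RLlib's standard model-config keys.
config = {
    "env": "TradingEnv",
    "model": {
        "custom_model": "reallocate",
        "custom_action_dist": "dirichlet",
    },
}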
# <class 'ray.rllib.policy.tf_policy_template.MyTFPolicy'>
MyTFPolicy = build_tf_policy(
    name="MyTFPolicy",
    loss_fn=policy_gradient_loss,
)

# <class 'ray.rllib.agents.trainer_template.MyCustomTrainer'>
MyTrainer = build_trainer(
    name="MyCustomTrainer",
    default_policy=MyTFPolicy,
)

if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    ModelCatalog.register_custom_model("eager_model", EagerModel)

    config = {
        "env": "CartPole-v0",
        "num_workers": 0,
        "model": {
            "custom_model": "eager_model"
        },
        "framework": "tfe",
    }
    stop = {
        "timesteps_total": args.stop_timesteps,
        "training_iteration": args.stop_iters,
        "episode_reward_mean": args.stop_reward,
    }
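    # A plausible continuation (a sketch mirroring the usual RLlib example
    # pattern, not the original file's exact code): run the custom trainer
    # built above until one of the stop conditions is met.
    results = tune.run(MyTrainer, stop=stop, config=config)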
from MultiAgentSimEnv import MultiAgentSimEnv
from hunter_dqn.dqn import DQNTrainer
from hunter_dqn.dqn_model import DQNModel
from hunter_dqn.hunter_policy import HunterPolicy
from prey_dqn.MultiPreyEnv import MultiPreyEnv
from prey_dqn.prey_model import DQNModelPrey
from prey_dqn.prey_policy import PreyPolicy


def env_creator(env_config):
    return MultiAgentSimEnv(env_config)


if __name__ == "__main__":
    ray.init()
    ModelCatalog.register_custom_model("DQNModel", DQNModel)
    config = {
        "num_hunters": 20,
        "num_preys": 100,
    }
    # register_env returns None; the registered name is what matters.
    register_env("MultiAgentSimEnv-v0", env_creator)
    singleAgentEnv = MultiAgentSimEnv(config)
    policies = {
        "hunter": (HunterPolicy, singleAgentEnv.observation_space_hunter,
                   singleAgentEnv.action_space_hunter, config),
        "prey": (PreyPolicy, singleAgentEnv.observation_space_prey,
                 singleAgentEnv.action_space_prey, config)
    }
def build_rnnsac_model(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> ModelV2:
    """Constructs the necessary ModelV2 for the Policy and returns it.

    Args:
        policy (Policy): The TFPolicy that will use the models.
        obs_space (gym.spaces.Space): The observation space.
        action_space (gym.spaces.Space): The action space.
        config (TrainerConfigDict): The SAC trainer's config dict.

    Returns:
        ModelV2: The ModelV2 to be used by the Policy. Note: An additional
            target model will be created in this function and assigned to
            `policy.target_model`.
    """
    # With separate state-preprocessor (before obs+action concat).
    num_outputs = int(np.product(obs_space.shape))

    # Force-ignore any additionally provided hidden layer sizes.
    # Everything should be configured using SAC's `q_model_config` and
    # `policy_model_config` config settings.
    policy_model_config = MODEL_DEFAULTS.copy()
    policy_model_config.update(config["policy_model_config"])
    q_model_config = MODEL_DEFAULTS.copy()
    q_model_config.update(config["q_model_config"])

    default_model_cls = RNNSACTorchModel

    model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        default_model=default_model_cls,
        name="sac_model",
        policy_model_config=policy_model_config,
        q_model_config=q_model_config,
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"],
    )
    assert isinstance(model, default_model_cls)

    # Create an exact copy of the model and store it in `policy.target_model`.
    # This will be used for tau-synched Q-target models that run behind the
    # actual Q-networks and are used for target q-value calculations in the
    # loss terms.
    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        default_model=default_model_cls,
        name="target_sac_model",
        policy_model_config=policy_model_config,
        q_model_config=q_model_config,
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"],
    )
    assert isinstance(policy.target_model, default_model_cls)

    return model
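# A tiny NumPy sketch (illustrative only, not the RLlib implementation) of
# the tau-synched target updates mentioned above: the target weights trail
# the live Q-network weights by an exponential moving average.
import numpy as np

tau = 0.005
model_w = np.array([1.0, 2.0])   # live Q-network weights
target_w = np.array([0.0, 0.0])  # lagging target-network weights
target_w = tau * model_w + (1.0 - tau) * target_w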
def wrap(env):
    return ModelCatalog.get_preprocessor_as_wrapper(env, model_config)
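# A hedged usage sketch (the env name and these imports are assumptions for
# illustration): helpers like `wrap` are typically applied inside an env
# creator so that every rollout worker sees preprocessed observations.
import gym
from ray.tune.registry import register_env

register_env("wrapped_cartpole", lambda _: wrap(gym.make("CartPole-v0")))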
def build_ddpg_models(policy, observation_space, action_space, config):
    if config["model"]["custom_model"]:
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True

    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(action_space))
    elif len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    if policy.config["use_state_preprocessor"]:
        default_model = None  # catalog decides
        num_outputs = 256  # arbitrary
        config["model"]["no_final_linear"] = True
    else:
        default_model = NoopModel
        num_outputs = int(np.product(observation_space.shape))

    policy.model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DDPGModel,
        default_model=default_model,
        name="ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DDPGModel,
        default_model=default_model,
        name="target_ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    return policy.model
def forward(self, input_dict, state, seq_lens):
    model_out, self._value_out = self.model(
        [input_dict["obs"]["a"], input_dict["obs"]["b"]])
    return model_out, state


def value_function(self):
    return tf.reshape(self._value_out, [-1])


if __name__ == "__main__":
    # Can also register the env creator function explicitly with:
    # register_env("corridor", lambda config: SimpleCorridor(config))
    ray.init()
    ModelCatalog.register_custom_model("my_model", CustomModel)
    tune.run(
        "PPO",
        stop={"timesteps_total": 10000},
        config={
            # "log_level": "ERROR",
            "eager": False,
            "env": SimpleCorridor,  # or "corridor" if registered above
            "model": {"custom_model": "my_model"},
            "vf_share_layers": True,
            # "lr": grid_search([1e-2, 1e-4, 1e-6]),  # try different lrs
            "num_workers": 1,  # parallelism
            "env_config": {"corridor_length": 5},
        },
    )
def build_sac_model(policy, obs_space, action_space, config):
    if config["model"].get("custom_model"):
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True
    if not isinstance(action_space, (Box, Discrete)):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for SAC.".format(action_space))
    if isinstance(action_space, Box) and len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    # 2 cases:
    # 1) with separate state-preprocessor (before obs+action concat).
    # 2) no separate state-preprocessor: concat obs+actions right away.
    if config["use_state_preprocessor"]:
        num_outputs = 256  # Flatten last Conv2D to this many nodes.
    else:
        config["model"]["fcnet_hiddens"] = []
        num_outputs = 0

    # Force-ignore any additionally provided hidden layer sizes.
    # Everything should be configured using SAC's "Q_model" and
    # "policy_model" settings.
    policy.model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch" if config["use_pytorch"] else "tf",
        model_interface=SACTorchModel
        if config["use_pytorch"] else SACTFModel,
        name="sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"])

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch" if config["use_pytorch"] else "tf",
        model_interface=SACTorchModel
        if config["use_pytorch"] else SACTFModel,
        name="target_sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"])

    return policy.model
"--ppo-checkpoint", type=str, default= "/home/jippo/ray_results/YanivTrainer_2021-04-03_21-40-03/YanivTrainer_yaniv_c49f4_00000_0_2021-04-03_21-40-03/checkpoint_001580/checkpoint-225" ) parser.add_argument( "--a3c-checkpoint", type=str, default= "/home/jippo/ray_results/YanivTrainer_2021-04-11_23-01-13/YanivTrainer_yaniv_6e345_00000_0_2021-04-11_23-01-13/checkpoint_021605/checkpoint-13385" ) parser.add_argument("--obs-scheme", type=int, default=0) args = parser.parse_args() register_env("yaniv", lambda config: YanivEnv(config)) ModelCatalog.register_custom_model("yaniv_mask", YanivActionMaskModel) env_config = { "end_after_n_deck_replacements": 0, "end_after_n_steps": 130, "early_end_reward": 0, "use_scaled_negative_reward": True, "use_scaled_positive_reward": True, "max_negative_reward": -1, "negative_score_cutoff": 30, "single_step": False, "step_reward": 0, "use_unkown_cards_in_state": False, "use_dead_cards_in_state": True, "observation_scheme": args.obs_scheme, "n_players": 2,
input_dict["obs"], 64, activation_fn=tf.nn.relu, scope="fc1") last_layer = slim.fully_connected( last_layer, 64, activation_fn=tf.nn.relu, scope="fc2") output = slim.fully_connected( last_layer, num_outputs, activation_fn=None, scope="fc_out") return output, last_layer ''' if __name__ == "__main__": args = parser.parse_args() ray.init() # Simple environment with `num_agents` independent entities register_env("collision_avoidance", lambda _: Collision_Avoidance_Env(args.num_agents)) ModelCatalog.register_custom_model("model1", CustomModel1) # ModelCatalog.register_custom_model("model2", CustomModel2) single_env = gym.make('collision_avoidance-v0') obs_space = single_env.observation_space act_space = single_env.action_space # Each policy can have a different configuration (including custom model) def gen_policy(i): config = { "model": { # "custom_model": ["model1", "model2"][i % 2], "custom_model": "model1" }, # "gamma": random.choice([0.95, 0.99]), "gamma": 0.95, }
    return CCPPOTorchPolicy if config["framework"] == "torch" \
        else CCPPOTFPolicy


CCTrainer = PPOTrainer.with_updates(
    name="CCPPOTrainer",
    default_policy=CCPPOTFPolicy,
    get_policy_class=get_policy_class,
)

if __name__ == "__main__":
    ray.init(local_mode=True)
    args = parser.parse_args()

    ModelCatalog.register_custom_model(
        "cc_model", TorchCentralizedCriticModel
        if args.torch else CentralizedCriticModel)

    config = {
        "env": TwoStepGame,
        "batch_mode": "complete_episodes",
        "num_workers": 0,
        "multiagent": {
            "policies": {
                "pol1": (None, Discrete(6), TwoStepGame.action_space, {
                    "framework": "torch" if args.torch else "tf",
                }),
                "pol2": (None, Discrete(6), TwoStepGame.action_space, {
                    "framework": "torch" if args.torch else "tf",
                }),
            },
def test_gym_preprocessors(self):
    p1 = ModelCatalog.get_preprocessor(gym.make("CartPole-v0"))
    self.assertEqual(type(p1), NoPreprocessor)

    p2 = ModelCatalog.get_preprocessor(gym.make("FrozenLake-v0"))
    self.assertEqual(type(p2), OneHotPreprocessor)
tf = try_import_tf()

cnn_shape = (4, 4, 3)
# The torch version of MobileNetV2 does channels first.
cnn_shape_torch = (3, 224, 224)

parser = argparse.ArgumentParser()
parser.add_argument("--torch", action="store_true")

if __name__ == "__main__":
    args = parser.parse_args()

    # Register our custom model.
    ModelCatalog.register_custom_model(
        "my_model",
        TorchMobileV2PlusRNNModel if args.torch else MobileV2PlusRNNModel)

    # Configure our Trainer.
    config = {
        "use_pytorch": args.torch,
        "model": {
            "custom_model": "my_model",
            # Extra config passed to the custom model's c'tor as kwargs.
            "custom_options": {
                "cnn_shape": cnn_shape_torch if args.torch else cnn_shape,
            },
            "max_seq_len": 20,
        },
        "vf_share_layers": True,
        "num_workers": 0,  # no parallelism
def gen_trainer_from_params(params):
    # All ray environment set-up
    if not ray.is_initialized():
        ray.init(ignore_reinit_error=True,
                 include_webui=False,
                 temp_dir=params['ray_params']['temp_dir'])
    register_env("overcooked_multi_agent",
                 params['ray_params']['env_creator'])
    ModelCatalog.register_custom_model(
        params['ray_params']['custom_model_id'],
        params['ray_params']['custom_model_cls'])

    # Parse params
    training_params = params['training_params']
    environment_params = params['environment_params']
    evaluation_params = params['evaluation_params']
    multi_agent_params = params['environment_params']['multi_agent_params']
    agent_params = params["agent_params"]  # only ml based agents

    env = OvercookedMultiAgent.from_config(environment_params)

    # Returns a properly formatted policy tuple to be passed into
    # ppotrainer config
    def gen_policy(policy_type="ppo"):
        return (
            agent_params[policy_type].get("policy_cls"),
            env.observation_spaces[policy_type],
            env.action_space,
            agent_params[policy_type]["config"]
        )

    # Rllib compatible way of setting the directory we store agent
    # checkpoints in
    logdir_prefix = "{0}_{1}_{2}".format(params["experiment_name"],
                                         params['training_params']['seed'],
                                         timestr)

    def custom_logger_creator(config):
        """Creates a Unified logger that stores results in
        <params['results_dir']>/<params["experiment_name"]>_<seed>_<timestamp>
        """
        results_dir = params['results_dir']
        if not os.path.exists(results_dir):
            try:
                os.makedirs(results_dir)
            except Exception as e:
                print("error creating custom logging dir. Falling back to "
                      "default logdir {}".format(DEFAULT_RESULTS_DIR))
                results_dir = DEFAULT_RESULTS_DIR
        logdir = tempfile.mkdtemp(prefix=logdir_prefix, dir=results_dir)
        logger = UnifiedLogger(config, logdir, loggers=None)
        return logger

    if "outer_shape" not in environment_params:
        environment_params["outer_shape"] = None

    if "mdp_params" in environment_params:
        environment_params["eval_mdp_params"] = \
            environment_params["mdp_params"]

    # Create rllib compatible multi-agent config based on params
    multi_agent_config = {}
    if multi_agent_params.get('bc_schedule'):
        agents_schedule = OvercookedMultiAgent.bc_schedule_to_agents_schedule(
            multi_agent_params['bc_schedule'])
    else:
        agents_schedule = multi_agent_params['agents_schedule']
    all_policies = OvercookedMultiAgent.agents_from_schedule(agents_schedule)
    ml_policies = [p for p in all_policies
                   if OvercookedMultiAgent.is_ml_agent(p)]

    multi_agent_config['policies'] = {
        policy: gen_policy(policy) for policy in ml_policies
    }

    def select_policy(agent_id):
        return OvercookedMultiAgent.agent_id_to_agent_name(agent_id)

    multi_agent_config['policy_mapping_fn'] = select_policy
    multi_agent_config['policies_to_train'] = 'ppo'

    eval_function = get_rllib_eval_function(
        evaluation_params,
        environment_params['eval_mdp_params'],
        environment_params['env_params'],
        environment_params["outer_shape"],
        multi_agent_params["featurize_fns"],
        shuffle=multi_agent_params["shuffle_agents"],
    )

    trainer = PPOTrainer(
        env="overcooked_multi_agent",
        config={
            "multiagent": multi_agent_config,
            "callbacks": TrainingCallbacks,
            "custom_eval_function": eval_function,
            "env_config": environment_params,
            "eager": False,
            **training_params
        },
        logger_creator=custom_logger_creator)
    return trainer
# Training Deep RL agents with custom models using Ray Tune
# Chapter 8, TensorFlow 2 Reinforcement Learning Cookbook | Praveen Palanisamy
import sys

import ray
import ray.rllib.agents.impala as impala
from ray.tune.logger import pretty_print
from ray.rllib.models import ModelCatalog

if "." not in sys.path:
    sys.path.insert(0, ".")
from custom_model import CustomModel

# Register custom-model in ModelCatalog
ModelCatalog.register_custom_model("CustomCNN", CustomModel)

ray.init()
config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 1
config["model"]["custom_model"] = "CustomCNN"
config["log_level"] = "INFO"
config["framework"] = "tf2"
trainer = impala.ImpalaTrainer(config=config,
                               env="procgen:procgen-coinrun-v0")

for step in range(1000):  # Custom training loop
    result = trainer.train()
    print(pretty_print(result))
        if self.prev_action_mode == "concat":
            policy_input = torch.cat((policy_input, a), dim=-1)
        logits = self.policy_fc2(policy_input)

        # Generate the value output.
        value_input = self.value_fc1(x)
        value_input = nn.functional.relu(value_input)
        value_input = self.dropout_fc(value_input)
        if self.prev_action_mode == "concat":
            value_input = torch.cat((value_input, a), dim=-1)
        value = self.value_fc2(value_input)
        self._cur_value = value.squeeze(-1)

        return logits, [h]

    def set_norm_layer_mode(self, mode):
        if mode == "train":
            self.dropout_fc.train()
        else:
            self.dropout_fc.eval()
        for conv_seq in self.conv_seqs:
            conv_seq.set_norm_layer_mode(mode)

    def _in_rollout(self, x):
        # A single timestep indicates a rollout.
        return x.shape[1] == 1


ModelCatalog.register_custom_model("custom_impala_cnn_rnn_torch",
                                   CustomImpalaCNNRNN)
help="Reward at which we stop training.") if __name__ == "__main__": args = parser.parse_args() ray.init(num_cpus=args.num_cpus or None) # Register the models to use. if args.framework == "torch": mod1 = mod2 = TorchSharedWeightsModel elif args.framework in ["tfe", "tf2"]: mod1 = mod2 = TF2SharedWeightsModel else: mod1 = SharedWeightsModel1 mod2 = SharedWeightsModel2 ModelCatalog.register_custom_model("model1", mod1) ModelCatalog.register_custom_model("model2", mod2) # Get obs- and action Spaces. single_env = gym.make("CartPole-v0") obs_space = single_env.observation_space act_space = single_env.action_space # Each policy can have a different configuration (including custom model). def gen_policy(i): config = { "model": { "custom_model": ["model1", "model2"][i % 2], }, "gamma": random.choice([0.95, 0.99]), }
my_id = info["agent_id"] other_id = 1 if my_id == 0 else 0 action_encoder = ModelCatalog.get_preprocessor_for_space(Discrete(2)) # set the opponent actions into the observation _, opponent_batch = info["all_pre_batches"][other_id] opponent_actions = np.array([ action_encoder.transform(a) for a in opponent_batch[SampleBatch.ACTIONS] ]) to_update[:, -2:] = opponent_actions if __name__ == "__main__": args = parser.parse_args() ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel) tune.run( "PPO", stop={ "timesteps_total": args.stop, "episode_reward_mean": 7.99, }, config={ "env": GlobalObsTwoStepGame, "batch_mode": "complete_episodes", "callbacks": { "on_postprocess_traj": fill_in_actions, }, "num_workers": 0, "multiagent": { "policies": {
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
    self.config = config

    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # Action inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards_ph = tf.placeholder(tf.float32, [None], name="prev_reward")

    with tf.variable_scope(P_SCOPE) as scope:
        self.model = ModelCatalog.get_model({
            "obs": self.obs_t,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph,
            "is_training": self._get_is_training_placeholder(),
        }, observation_space, action_space, logit_dim,
                                            self.config["model"])
        logits = self.model.outputs
        self.p_func_vars = _scope_vars(scope.name)

    # Action outputs
    action_dist = dist_cls(logits)
    self.output_actions = action_dist.sample()

    # Training inputs
    self.act_t = tf.placeholder(tf.int32, [None], name="action")
    self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

    # v network evaluation
    with tf.variable_scope(V_SCOPE) as scope:
        state_values = self.model.value_function()
        self.v_func_vars = _scope_vars(scope.name)
    self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
    self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
                                          logits, self.act_t, action_space)

    # which kind of objective to optimize
    objective = (
        self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss)
    self.explained_variance = tf.reduce_mean(
        explained_variance(self.cum_rew_t, state_values))

    # initialize TFPolicyGraph
    self.sess = tf.get_default_session()
    self.loss_inputs = [
        ("obs", self.obs_t),
        ("actions", self.act_t),
        ("advantages", self.cum_rew_t),
    ]
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.obs_t,
        action_sampler=self.output_actions,
        action_prob=action_dist.sampled_action_prob(),
        loss=objective,
        model=self.model,
        loss_inputs=self.loss_inputs,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph)
    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        "total_loss": objective,
        "vf_explained_var": self.explained_variance,
        "policy_loss": self.p_loss.loss,
        "vf_loss": self.v_loss.loss
    }
def visualizer_rllib(args):
    result_dir = args.result_dir if args.result_dir[-1] != '/' \
        else args.result_dir[:-1]

    # config = get_rllib_config(result_dir + '/..')
    # pkl = get_rllib_pkl(result_dir + '/..')
    config = get_rllib_config(result_dir)
    # TODO(ev) backwards compatibility hack
    try:
        pkl = get_rllib_pkl(result_dir)
    except Exception:
        pass

    # check if we have a multiagent scenario but in a
    # backwards compatible way
    if config.get('multiagent', {}).get('policy_graphs', {}):
        multiagent = True
        config['multiagent'] = pkl['multiagent']
    else:
        multiagent = False

    # Run on only one cpu for rendering purposes
    config['num_workers'] = 0

    flow_params = get_flow_params(config)

    # hack for old pkl files  TODO(ev) remove eventually
    sim_params = flow_params['sim']
    setattr(sim_params, 'num_clients', 1)

    # Create and register a gym+rllib env
    create_env, env_name = make_create_env(
        params=flow_params, version=0, render=False)
    register_env(env_name, create_env)

    # Determine agent and checkpoint
    config_run = config['env_config']['run'] if 'run' in config['env_config'] \
        else None
    if args.run and config_run:
        if args.run != config_run:
            print('visualizer_rllib.py: error: run argument '
                  + '\'{}\' passed in '.format(args.run)
                  + 'differs from the one stored in params.json '
                  + '\'{}\''.format(config_run))
            sys.exit(1)
    if args.run:
        agent_cls = get_agent_class(args.run)
    elif config_run:
        agent_cls = get_agent_class(config_run)
    else:
        print('visualizer_rllib.py: error: could not find flow parameter '
              '\'run\' in params.json, '
              'add argument --run to provide the algorithm or model used '
              'to train the results\n e.g. '
              'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO')
        sys.exit(1)

    sim_params.restart_instance = False
    sim_params.emission_path = './test_time_rollout/'

    # prepare for rendering
    if args.render_mode == 'sumo_web3d':
        sim_params.num_clients = 2
        sim_params.render = False
    elif args.render_mode == 'drgb':
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
    elif args.render_mode == 'sumo_gui':
        sim_params.render = True
    elif args.render_mode == 'no_render':
        sim_params.render = False
    if args.save_render:
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
        sim_params.save_render = True

    # Recreate the scenario from the pickled parameters
    exp_tag = flow_params['exp_tag']
    net_params = flow_params['net']
    vehicles = flow_params['veh']
    initial_config = flow_params['initial']
    module = __import__('flow.scenarios', fromlist=[flow_params['scenario']])
    scenario_class = getattr(module, flow_params['scenario'])

    scenario = scenario_class(name=exp_tag,
                              vehicles=vehicles,
                              net_params=net_params,
                              initial_config=initial_config)

    # check if the environment is a single or multiagent environment, and
    # get the right address accordingly
    single_agent_envs = [
        env for env in dir(flow.envs) if not env.startswith('__')
    ]

    if flow_params['env_name'] in single_agent_envs:
        env_loc = 'flow.envs'
    else:
        env_loc = 'flow.multiagent_envs'

    # Start the environment with the gui turned on and a path for the
    # emission file
    module = __import__(env_loc, fromlist=[flow_params['env_name']])
    env_class = getattr(module, flow_params['env_name'])
    env_params = flow_params['env']
    env_params.restart_instance = False
    if args.evaluate:
        env_params.evaluate = True

    # lower the horizon if testing
    if args.horizon:
        config['horizon'] = args.horizon
        env_params.horizon = args.horizon

    # create the agent that will be used to compute the actions
    agent = agent_cls(env=env_name, config=config)
    checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num
    agent.restore(checkpoint)

    env = ModelCatalog.get_preprocessor_as_wrapper(
        env_class(env_params=env_params,
                  sim_params=sim_params,
                  scenario=scenario))

    if multiagent:
        rets = {}
        # map the agent id to its policy
        policy_map_fn = config['multiagent']['policy_mapping_fn'].func
        for key in config['multiagent']['policy_graphs'].keys():
            rets[key] = []
    else:
        rets = []
    final_outflows = []
    mean_speed = []
    for i in range(args.num_rollouts):
        vel = []
        state = env.reset()
        if multiagent:
            ret = {key: [0] for key in rets.keys()}
        else:
            ret = 0
        for _ in range(env_params.horizon):
            vehicles = env.unwrapped.vehicles
            vel.append(np.mean(vehicles.get_speed(vehicles.get_ids())))
            if multiagent:
                action = {}
                for agent_id in state.keys():
                    action[agent_id] = agent.compute_action(
                        state[agent_id], policy_id=policy_map_fn(agent_id))
            else:
                action = agent.compute_action(state)
            state, reward, done, _ = env.step(action)
            if multiagent:
                for actor, rew in reward.items():
                    ret[policy_map_fn(actor)][0] += rew
            else:
                ret += reward
            if multiagent and done['__all__']:
                break
            if not multiagent and done:
                break

        if multiagent:
            for key in rets.keys():
                rets[key].append(ret[key])
        else:
            rets.append(ret)
        outflow = vehicles.get_outflow_rate(500)
        final_outflows.append(outflow)
        mean_speed.append(np.mean(vel))
        if multiagent:
            for agent_id, rew in rets.items():
                print('Round {}, Return: {} for agent {}'.format(
                    i, ret, agent_id))
        else:
            print('Round {}, Return: {}'.format(i, ret))

    if multiagent:
        for agent_id, rew in rets.items():
            print('Average, std return: {}, {} for agent {}'.format(
                np.mean(rew), np.std(rew), agent_id))
    else:
        print('Average, std return: {}, {}'.format(
            np.mean(rets), np.std(rets)))
    print('Average, std speed: {}, {}'.format(
        np.mean(mean_speed), np.std(mean_speed)))
    print('Average, std outflow: {}, {}'.format(
        np.mean(final_outflows), np.std(final_outflows)))

    # terminate the environment
    env.unwrapped.terminate()

    # if prompted, convert the emission file into a csv file
    if args.emission_to_csv:
        dir_path = os.path.dirname(os.path.realpath(__file__))
        emission_filename = '{0}-emission.xml'.format(scenario.name)
        emission_path = \
            '{0}/test_time_rollout/{1}'.format(dir_path, emission_filename)
        emission_to_csv(emission_path)

    # if we wanted to save the render, here we create the movie
    if args.save_render:
        dirs = os.listdir(os.path.expanduser('~') + '/flow_rendering')
        dirs.sort(key=lambda date: datetime.strptime(date, "%Y-%m-%d-%H%M%S"))
        recent_dir = dirs[-1]
        # create the movie
        movie_dir = os.path.expanduser('~') + '/flow_rendering/' + recent_dir
        save_dir = os.path.expanduser('~') + '/flow_movies'
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        os_cmd = "cd " + movie_dir + " && ffmpeg -i frame_%06d.png"
        os_cmd += " -pix_fmt yuv420p " + dirs[-1] + ".mp4"
        os_cmd += " && cp " + dirs[-1] + ".mp4 " + save_dir + "/"
        os.system(os_cmd)
def __init__(self, path_to_model, observation_space):
    self._prep = ModelCatalog.get_preprocessor_for_space(observation_space)
    self._path_to_model = path_to_model
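# A hedged usage sketch (the space here is an example, not from the source):
# `get_preprocessor_for_space` returns a preprocessor whose `transform`
# flattens one raw observation into the 1-D vector a saved model expects.
from gym.spaces import Discrete

prep = ModelCatalog.get_preprocessor_for_space(Discrete(3))
flat_obs = prep.transform(1)  # one-hot encoded: [0., 1., 0.]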
def main(args):
    # ====================================
    # init env config
    # ====================================
    if args.no_debug:
        ray.init(webui_host="127.0.0.1")
    else:
        ray.init(local_mode=True, webui_host="127.0.0.1")

    # use ray cluster for training
    # ray.init(
    #     address="auto" if args.address is None else args.address,
    #     redis_password="******",
    # )
    #
    # print(
    #     "--------------- Ray startup ------------\n{}".format(
    #         ray.state.cluster_resources()
    #     )
    # )

    agent_specs = {"AGENT-007": agent_spec}

    env_config = {
        "seed": 42,
        "scenarios": [scenario_paths],
        "headless": args.headless,
        "agent_specs": agent_specs,
    }

    # ====================================
    # init tune config
    # ====================================
    class MultiEnv(RLlibHiWayEnv):
        def __init__(self, env_config):
            env_config["scenarios"] = [
                scenario_paths[(env_config.worker_index - 1) %
                               len(scenario_paths)]
            ]
            super(MultiEnv, self).__init__(config=env_config)

    ModelCatalog.register_custom_model("my_fc", FullyConnectedNetwork)

    tune_config = {
        "env": MultiEnv,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "default_policy": (
                    None,
                    OBSERVATION_SPACE,
                    ACTION_SPACE,
                    {},
                )
            },
            "policy_mapping_fn": lambda agent_id: "default_policy",
        },
        "model": {
            "custom_model": "my_fc",
        },
        "framework": "torch",
        "callbacks": {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
        },
        "lr": 1e-4,
        "log_level": "WARN",
        "num_workers": args.num_workers,
        "horizon": args.horizon,
        "train_batch_size": 10240 * 3,
        # "observation_filter": "MeanStdFilter",
        # "batch_mode": "complete_episodes",
        # "grad_clip": 0.5,
        # "model": {
        #     "use_lstm": True,
        # },
    }

    tune_config.update({
        "lambda": 0.95,
        "clip_param": 0.2,
        "num_sgd_iter": 10,
        "sgd_minibatch_size": 1024,
        "gamma": 0.995,
        # "l2_coeff": 5e-4,
    })

    # ====================================
    # init log and checkpoint dir_info
    # ====================================
    experiment_name = EXPERIMENT_NAME.format(
        scenario="multi_scenarios",
        algorithm="PPO",
        n_agent=1,
    )

    log_dir = Path(args.log_dir).expanduser().absolute() / RUN_NAME
    log_dir.mkdir(parents=True, exist_ok=True)
    print(f"Checkpointing at {log_dir}")

    if args.restore:
        restore_path = Path(args.restore).expanduser()
        print(f"Loading model from {restore_path}")
    else:
        restore_path = None

    # run experiments
    analysis = tune.run(
        PPOTrainer,  # "PPO",
        name=experiment_name,
        stop={"time_total_s": 24 * 60 * 60},
        checkpoint_freq=2,
        checkpoint_at_end=True,
        local_dir=str(log_dir),
        resume=args.resume,
        restore=restore_path,
        max_failures=1000,
        export_formats=["model", "checkpoint"],
        config=tune_config,
    )

    print(analysis.dataframe().head())
def setup(env, hparams, num_cpus, num_gpus, num_agents,
          use_gpus_for_workers=False, use_gpu_for_driver=False,
          num_workers_per_device=1):
    if env == 'harvest':
        def env_creator(_):
            return HarvestEnv(num_agents=num_agents)
        single_env = HarvestEnv()
    else:
        def env_creator(_):
            return CleanupEnv(num_agents=num_agents)
        single_env = CleanupEnv()

    env_name = env + "_env"
    register_env(env_name, env_creator)

    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy():
        return (PPOPolicyGraph, obs_space, act_space, {})

    # Setup PPO with an ensemble of `num_policies` different policy graphs
    policy_graphs = {}
    for i in range(num_agents):
        policy_graphs['agent-' + str(i)] = gen_policy()

    def policy_mapping_fn(agent_id):
        return agent_id

    # register the custom model
    model_name = "conv_to_fc_net"
    ModelCatalog.register_custom_model(model_name, ConvToFCNet)

    algorithm = 'A3C'
    agent_cls = get_agent_class(algorithm)
    config = agent_cls._default_config.copy()

    # information for replay
    config['env_config']['func_create'] = tune.function(env_creator)
    config['env_config']['env_name'] = env_name
    config['env_config']['run'] = algorithm

    # Calculate device configurations
    gpus_for_driver = int(use_gpu_for_driver)
    cpus_for_driver = 1 - gpus_for_driver
    if use_gpus_for_workers:
        spare_gpus = (num_gpus - gpus_for_driver)
        num_workers = int(spare_gpus * num_workers_per_device)
        num_gpus_per_worker = spare_gpus / num_workers
        num_cpus_per_worker = 0
    else:
        spare_cpus = (num_cpus - cpus_for_driver)
        num_workers = int(spare_cpus * num_workers_per_device)
        num_gpus_per_worker = 0
        num_cpus_per_worker = spare_cpus / num_workers

    # hyperparams
    config.update({
        "train_batch_size": 128,
        "horizon": 1000,
        "lr_schedule": [[0, hparams['lr_init']],
                        [20000000, hparams['lr_final']]],
        "num_workers": num_workers,
        "num_gpus": gpus_for_driver,  # The number of GPUs for the driver
        "num_cpus_for_driver": cpus_for_driver,
        "num_gpus_per_worker": num_gpus_per_worker,  # Can be a fraction
        "num_cpus_per_worker": num_cpus_per_worker,  # Can be a fraction
        "entropy_coeff": hparams['entropy_coeff'],
        "multiagent": {
            "policy_graphs": policy_graphs,
            "policy_mapping_fn": tune.function(policy_mapping_fn),
        },
        "model": {
            "custom_model": "conv_to_fc_net",
            "use_lstm": True,
            "lstm_cell_size": 128
        }
    })
    return algorithm, env_name, config
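# A worked numeric check (illustrative values, not from the source) of the
# CPU/worker arithmetic in `setup` above, for the CPU-only branch.
num_cpus, use_gpu_for_driver, num_workers_per_device = 9, False, 1
cpus_for_driver = 1 - int(use_gpu_for_driver)           # 1
spare_cpus = num_cpus - cpus_for_driver                 # 8
num_workers = int(spare_cpus * num_workers_per_device)  # 8
num_cpus_per_worker = spare_cpus / num_workers          # 1.0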
        last_layer = slim.fully_connected(
            input_dict["obs"], 64, activation_fn=tf.nn.relu, scope="fc1")
        last_layer = slim.fully_connected(
            last_layer, 64, activation_fn=tf.nn.relu, scope="fc2")
        output = slim.fully_connected(
            last_layer, num_outputs, activation_fn=None, scope="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    # Simple environment with `num_agents` independent cartpole entities
    register_env("multi_cartpole", lambda _: MultiCartpole(args.num_agents))
    ModelCatalog.register_custom_model("model1", CustomModel1)
    ModelCatalog.register_custom_model("model2", CustomModel2)
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

    # Each policy can have a different configuration (including custom model)
    def gen_policy(i):
        config = {
            "model": {
                "custom_model": ["model1", "model2"][i % 2],
            },
            "gamma": random.choice([0.95, 0.99]),
        }
        return (PPOPolicyGraph, obs_space, act_space, config)
        last_layer = tf.layers.batch_normalization(
            last_layer, training=input_dict["is_training"])
        output = slim.fully_connected(
            last_layer,
            num_outputs,
            weights_initializer=normc_initializer(0.01),
            activation_fn=None,
            scope="fc_out")
        return output, last_layer


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("bn_model", BatchNormModel)
    run_experiments({
        "batch_norm_demo": {
            "run": args.run,
            "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
            "stop": {
                "training_iteration": args.num_iters
            },
            "config": {
                "model": {
                    "custom_model": "bn_model",
                },
                "num_workers": 0,
            },
        },
    })
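# Note: tf.layers.batch_normalization only updates its moving averages when
# the ops in tf.GraphKeys.UPDATE_OPS are actually run. A minimal sketch,
# assuming the same Model/override imports used above, of one way to force
# those updates by tying them to the loss via a custom_loss hook:
class BatchNormModel(Model):
    @override(Model)
    def custom_loss(self, policy_loss, loss_inputs):
        # Run the BN moving-average updates whenever the loss is evaluated.
        self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(self.update_ops):
            return tf.identity(policy_loss)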
        # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
        action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)

        # Mask out invalid actions (use tf.float32.min for stability)
        inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
        return action_logits + inf_mask, state

    def value_function(self):
        return self.action_embed_model.value_function()


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env("pa_cartpole", lambda _: ParametricActionCartpole(10))
    if args.run == "DQN":
        cfg = {
            # TODO(ekl) we need to set these to prevent the masked values
            # from being further processed in DistributionalQModel, which
            # would mess up the masking. It is possible to support these if
            # we defined a custom DistributionalQModel that is aware of
            # masking.
            "hiddens": [],
            "dueling": False,
        }
    else:
        cfg = {}
    tune.run(
        args.run,
        stop={
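# A standalone numpy sketch of the masking trick above: log(0) -> -inf is
# clamped to float32.min, so invalid actions end up with ~0 probability
# after the softmax.
import numpy as np

logits = np.array([1.0, 2.0, 3.0], dtype=np.float32)
mask = np.array([1.0, 0.0, 1.0], dtype=np.float32)  # action 1 is invalid
with np.errstate(divide="ignore"):
    inf_mask = np.maximum(np.log(mask), np.finfo(np.float32).min)
masked = logits + inf_mask
probs = np.exp(masked - masked.max())
probs /= probs.sum()
print(probs.round(3))  # the masked action gets probability ~0.0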
import importlib

config = importlib.import_module(args.config_file, package=None)
print(config.num_seeds)

# Did not use ConfigParser or JSON because the config needs to be given
# programmatically:
# config = configparser.ConfigParser()
# config.read(args.config_file)
# print(config.sections(), [i for i in config['ConfigSpace']])
# import json
# print([json.loads(config['ConfigSpace'][i]) for i in config['ConfigSpace']])

from ray.rllib.models.preprocessors import OneHotPreprocessor
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_preprocessor("ohe", OneHotPreprocessor)

ray.init(local_mode=True)  # , object_id_seed=0

# num_seeds = 10
# state_space_sizes = [8]  # alternatives: 10, 12, 14, or [2**i for i in range(1, 6)]
# action_space_sizes = [8]  # alternatives: 2, 4, 8, 16, or [2**i for i in range(1, 6)]
# delays = [0] + [2**i for i in range(4)]
# sequence_lengths = [1, 2, 3, 4]
# reward_densities = [0.25]  # np.linspace(0.0, 1.0, num=5)
# make_reward_dense = [True, False]
# terminal_state_densities = [0.25]  # np.linspace(0.1, 1.0, num=5)
# algorithms = ["DQN"]
# seeds = [i for i in range(num_seeds)]
# Others, keep the rest fixed for these: learning_starts,
# target_network_update_freq, double_dqn, fcnet_hiddens, fcnet_activation,
# use_lstm, lstm_seq_len, sample_batch_size/train_batch_size, learning rate
# More others: adam_epsilon, exploration_final_eps/exploration_fraction,
# buffer_size
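# For illustration, a hypothetical config module of the kind loaded above
# (the names are examples only, lifted from the commented-out search space):
# --- my_experiment_config.py, passed as args.config_file ---
num_seeds = 10
algorithms = ["DQN"]
state_space_sizes = [8]
delays = [0] + [2**i for i in range(4)]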
def register_mm_ray_policy(name: str, policy_model: Type[tf.keras.Model],
                           networks: Dict[str, tf.keras.Model]):
    """Constructs a Ray policy with multiple models stored in TF collections.

    This allows for distributed training with multiple parameter sets
    (for example, when using auxiliary losses).

    Arguments:
        policy_model {Type[tf.keras.Model]} -- The policy model which is
            called to make predictions
        networks {Dict[str, tf.keras.Model]} -- A dictionary of additional
            networks

    Returns:
        ray.rllib.models.Model -- A model which can be used with Ray
    """

    class MMRayPolicy(ray.rllib.models.Model):
        @override(ray.rllib.models.Model)
        def _build_layers_v2(self, input_dict, num_outputs, options):
            # Setup the policy model
            if tf.get_collection('_rk_policy_model'):
                self.model = tf.get_collection('_rk_policy_model')[0]
            else:
                self.model = policy_model(num_outputs, **options)
                tf.add_to_collection('_rk_policy_model', self.model)

            # Add any other models to the collection
            self.networks = {}
            if networks:
                for key in networks.keys():
                    if tf.get_collection('_rk_networks_{}'.format(key)):
                        # Each collection holds a single
                        # [model, state_init, state_in] triple.
                        self.networks[key] = tf.get_collection(
                            '_rk_networks_{}'.format(key))[0]
                    else:
                        self.networks[key] = [
                            networks[key](**options), None, None
                        ]
                        tf.add_to_collection('_rk_networks_{}'.format(key),
                                             self.networks[key])

            if self.model.recurrent:
                self.state_init = [
                    np.zeros([state_size])
                    for state_size in self.model.state_size
                ]
                if not self.state_in:
                    self.state_in = [
                        tf.placeholder(tf.float32, [None, state_size])
                        for state_size in self.model.state_size
                    ]
                output = self.model(
                    input_dict,
                    seqlens=self.seq_lens,
                    initial_state=self.state_in)
                self.state_out = list(output['state_out'])
            else:
                output = self.model(input_dict)
            self.policy_output = output

            # Update the input dict with the model outputs
            input_dict['model_outputs'] = output

            # Compute the outputs for each of the networks
            self.network_outputs = {}
            for key, net in self.networks.items():
                if net[0].recurrent:
                    net[1] = [
                        np.zeros([state_size])
                        for state_size in net[0].state_size
                    ]
                    if not net[2]:
                        net[2] = [
                            tf.placeholder(tf.float32, [None, state_size])
                            for state_size in net[0].state_size
                        ]
                    self.network_outputs[key] = net[0](
                        input_dict,
                        seqlens=self.seq_lens,
                        initial_state=net[2])
                else:
                    self.network_outputs[key] = net[0](input_dict)

            return output['logits'], output['latent']

        @override(ray.rllib.models.Model)
        def custom_loss(self, policy_loss, loss_inputs):
            # Update the loss_inputs with all of the model outputs
            if self.networks:
                loss_inputs['network_outputs'] = {
                    k: self.network_outputs[k]
                    for k in self.networks.keys()
                }
                loss_inputs['network_outputs'][
                    'policy_model'] = self.policy_output

            total_loss = policy_loss
            if hasattr(self.model, 'custom_loss'):
                total_loss = self.model.custom_loss(policy_loss, loss_inputs)
            if self.networks:
                for _, net in self.networks.items():
                    if hasattr(net[0], 'custom_loss'):
                        total_loss = net[0].custom_loss(
                            total_loss,
                            loss_inputs,
                        )

            return total_loss

    MMRayPolicy.__name__ = name
    MMRayPolicy.__doc__ = "Wrapped multi-network Ray policy"

    ModelCatalog.register_custom_model(name, MMRayPolicy)

    return MMRayPolicy
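# A hypothetical usage sketch: PolicyNet and DecoderNet stand in for real
# tf.keras.Model subclasses; the auxiliary decoder trains through its own
# custom_loss hook alongside the policy loss.
MMPolicy = register_mm_ray_policy(
    "mm_policy", PolicyNet, {"decoder": DecoderNet})
config = {"model": {"custom_model": "mm_policy"}}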
    def testCustomModel(self):
        ray.init()
        ModelCatalog.register_custom_model("foo", CustomModel)
        p1 = ModelCatalog.get_model(
            get_registry(), 1, 5, {"custom_model": "foo"})
        self.assertEqual(str(type(p1)), str(CustomModel))
        return np.array(transform_module(observation))
    elif flag == 2:
        return np.flip(observation, 1)
    elif flag == 3:
        h1 = np.random.randint(10, 20)
        w1 = np.random.randint(10, 20)
        observation[h1:h1 + h1, w1:w1 + w1, :] = 0
        return observation
    elif flag == 4:
        h1 = np.random.randint(10, 20)
        w1 = np.random.randint(10, 20)
        rand_color = np.random.randint(0, 255, size=(3, )) / 255.
        observation[h1:h1 + h1, w1:w1 + w1, :] = np.tile(
            rand_color.reshape(1, 1, -1),
            observation[h1:h1 + h1, w1:w1 + w1, :].shape[:2] + (1, ))
        return observation
    elif flag == 5:
        # Convert to grayscale using the standard luminance weights.
        observation = (observation[:, :, 0] * 0.2989 +
                       observation[:, :, 1] * 0.587 +
                       observation[:, :, 2] * 0.114)
        observation = np.expand_dims(observation, axis=2)
        return observation


ModelCatalog.register_custom_preprocessor("my_prep", MyPreprocessorClass)
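# Once registered, the preprocessor is selected by name through the
# `custom_preprocessor` model option; a minimal sketch with a hypothetical
# agent setup:
agent = PGAgent(
    env="CartPole-v0",
    config={"model": {"custom_preprocessor": "my_prep"}})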
    if not args.config.get("env"):
        parser.error("the following arguments are required: --env")
    args.env = args.config.get("env")

    ray.init()

    cls = get_agent_class(args.run)
    agent = cls(env=args.env, config=args.config)
    agent.restore(args.checkpoint)
    num_steps = int(args.steps)

    if args.run == "DQN":
        env = gym.make(args.env)
        env = wrap_dqn(get_registry(), env, args.config.get("model", {}))
    else:
        env = ModelCatalog.get_preprocessor_as_wrapper(
            get_registry(), gym.make(args.env))
    if args.out is not None:
        rollouts = []
    steps = 0
    while steps < (num_steps or steps + 1):
        if args.out is not None:
            rollout = []
        state = env.reset()
        done = False
        reward_total = 0.0
        while not done and steps < (num_steps or steps + 1):
            action = agent.compute_action(state)
            next_state, reward, done, _ = env.step(action)
            reward_total += reward
            if not args.no_render:
                env.render()
            last_layer = slim.fully_connected(
                last_layer,
                size,
                weights_initializer=normc_initializer(1.0),
                activation_fn=activation,
                scope=label)
            i += 1
        label = "fc_out"
        output = slim.fully_connected(
            last_layer,
            num_outputs,
            weights_initializer=normc_initializer(0.01),
            activation_fn=None,
            scope=label)
        return output, last_layer


ModelCatalog.register_custom_model("my_model", MyModelClass)

# ray.init(num_gpus=2)
ray.init()
print("hello!")


def my_train_fn(config, reporter):
    agent = a3c.A3CAgent(config=config)
    policy_graph = agent.local_evaluator.policy_map["default"].sess.graph
    writer = tf.summary.FileWriter(agent._result_logger.logdir, policy_graph)
    writer.close()
    for _ in range(10):
        result = agent.train()
        reporter(**result)
    agent.stop()
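# A hypothetical way to launch the custom train function above through Tune;
# tune.run also accepts a plain trainable function:
tune.run(my_train_fn, config={"num_workers": 0})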