def check_learned(self):
    """Check the learned agent by rolling out one episode."""
    ray.init(local_mode=True)
    if self.algorithm == 'PPO':
        agent = ppo.PPOTrainer(config=self.ray_config, env=self.env.__class__)
    elif self.algorithm == 'A3C':
        agent = a3c.A3CTrainer(config=self.ray_config, env=self.env.__class__)
    elif self.algorithm == 'PG':
        agent = pg.PGTrainer(config=self.ray_config, env=self.env.__class__)
    agent.restore(self.checkpoint_path)

    # Run until the episode ends.
    episode_reward = 0
    done = False
    obs = self.env.reset()
    while not done:
        self.env.render()
        action = agent.compute_action(obs)
        obs, reward, done, info = self.env.step(action)
        # print(f"obs:\n{obs}")
        print(f"reward:\n{reward}")
        print(f"info:\n{info}")
        episode_reward += reward
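# Hedged sketch (an assumption, not from the original source): one way the
# checkpoint that `check_learned` restores could have been produced, using the
# same old-style `ray.rllib.agents` API as above. `self.ray_config`,
# `self.env`, and `self.checkpoint_path` are the attributes `check_learned`
# already references; `num_iterations` is a hypothetical parameter.
def train_and_checkpoint(self, num_iterations=10):
    """Train the configured PG agent briefly and save a restorable checkpoint."""
    ray.init(ignore_reinit_error=True)
    agent = pg.PGTrainer(config=self.ray_config, env=self.env.__class__)
    for _ in range(num_iterations):
        agent.train()
    # `Trainer.save()` returns the checkpoint path that `restore()` accepts.
    self.checkpoint_path = agent.save()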
def test_add_delete_policy(self):
    env = gym.make("CartPole-v0")
    config = pg.DEFAULT_CONFIG.copy()
    config.update({
        "env": MultiAgentCartPole,
        "env_config": {
            "config": {
                "num_agents": 4,
            },
        },
        "multiagent": {
            # Start with a single policy.
            "policies": {
                "p0": (None, env.observation_space, env.action_space, {}),
            },
            "policy_mapping_fn": lambda aid, episode, **kwargs: "p0",
            "policy_map_capacity": 2,
        },
    })

    # TODO: (sven) this will work for tf, once we have the DynamicTFPolicy
    #  refactor PR merged.
    for _ in framework_iterator(config, frameworks=("tf2", "torch")):
        trainer = pg.PGTrainer(config=config)
        r = trainer.train()
        self.assertTrue("p0" in r["policy_reward_min"])
        for i in range(1, 4):

            def new_mapping_fn(agent_id, episode, **kwargs):
                return f"p{choice([i, i - 1])}"

            # Add a new policy.
            new_pol = trainer.add_policy(
                f"p{i}",
                trainer._policy_class,
                observation_space=env.observation_space,
                action_space=env.action_space,
                config={},
                # Test changing the mapping fn.
                policy_mapping_fn=new_mapping_fn,
                # Change the list of policies to train.
                policies_to_train=[f"p{i}", f"p{i-1}"],
            )
            pol_map = trainer.workers.local_worker().policy_map
            self.assertTrue(new_pol is not trainer.get_policy("p0"))
            for j in range(i):
                self.assertTrue(f"p{j}" in pol_map)
            self.assertTrue(len(pol_map) == i + 1)
            r = trainer.train()
            self.assertTrue("p1" in r["policy_reward_min"])

        # Delete all added policies again from trainer.
        for i in range(3, 0, -1):
            trainer.remove_policy(
                f"p{i}",
                policy_mapping_fn=lambda aid, eps, **kwargs: f"p{i - 1}",
                policies_to_train=[f"p{i - 1}"])

        trainer.stop()
def test_pg_fake_multi_gpu_learning(self):
    """Test whether PGTrainer can learn CartPole w/ faked multi-GPU."""
    config = copy.deepcopy(pg.DEFAULT_CONFIG)
    # Fake GPU setup.
    config["num_gpus"] = 2
    config["_fake_gpus"] = True
    config["framework"] = "tf"
    # Mimic tuned_example for PG CartPole.
    config["model"]["fcnet_hiddens"] = [64]
    config["model"]["fcnet_activation"] = "linear"

    trainer = pg.PGTrainer(config=config, env="CartPole-v0")
    num_iterations = 200
    learnt = False
    for i in range(num_iterations):
        results = trainer.train()
        print("reward={}".format(results["episode_reward_mean"]))
        # Make this test quite short (75.0).
        if results["episode_reward_mean"] > 75.0:
            learnt = True
            break
    assert learnt, "PG multi-GPU (with fake-GPUs) did not learn CartPole!"
    trainer.stop()
def test_timesteps(self):
    """Test whether the policy's global_timestep is tracked correctly."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["model"]["fcnet_hiddens"] = [1]
    config["model"]["fcnet_activation"] = None

    obs = np.array(1)
    obs_batch = np.array([1])

    for _ in framework_iterator(config):
        trainer = pg.PGTrainer(config=config, env=RandomEnv)
        policy = trainer.get_policy()

        for i in range(1, 21):
            trainer.compute_single_action(obs)
            self.assertEqual(policy.global_timestep, i)
        for i in range(1, 21):
            policy.compute_actions(obs_batch)
            self.assertEqual(policy.global_timestep, i + 20)

        # Artificially set the timestep to 100 billion (1e11), then keep
        # computing actions and train.
        crazy_timesteps = int(1e11)
        policy.global_timestep = crazy_timesteps
        # Run for 10 more ts.
        for i in range(1, 11):
            policy.compute_actions(obs_batch)
            self.assertEqual(policy.global_timestep, i + crazy_timesteps)
        trainer.train()
def test_pg_exec_impl(ray_start_regular):
    trainer = pg.PGTrainer(
        env="CartPole-v0",
        config={
            "min_iter_time_s": 0,
            "use_exec_api": True,
        })
    assert isinstance(trainer.train(), dict)
def test_pg_compilation(self):
    """Test whether a PGTrainer can be built with both frameworks."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    num_iterations = 2

    # tf.
    trainer = pg.PGTrainer(config=config, env="CartPole-v0")
    for i in range(num_iterations):
        trainer.train()

    # Torch.
    config["use_pytorch"] = True
    trainer = pg.PGTrainer(config=config, env="CartPole-v0")
    for i in range(num_iterations):
        trainer.train()
def test_add_delete_policy(self):
    env = gym.make("CartPole-v0")
    config = pg.DEFAULT_CONFIG.copy()
    config.update({
        "env": MultiAgentCartPole,
        "env_config": {
            "config": {
                "num_agents": 4,
            },
        },
        "multiagent": {
            # Start with a single policy.
            "policies": {
                "p0": (None, env.observation_space, env.action_space, {}),
            },
            "policy_mapping_fn": lambda aid, episode, **kwargs: "p0",
        },
    })

    # TODO: (sven): Fix TrainTFMultiGPU to be flexible wrt adding policies
    #  on-the-fly.
    for _ in framework_iterator(config, frameworks=("tf2", "torch")):
        trainer = pg.PGTrainer(config=config)
        r0 = trainer.train()
        self.assertTrue("p0" in r0["policy_reward_min"])
        for i in range(1, 4):
            # Add a new policy.
            new_pol = trainer.add_policy(
                f"p{i}",
                trainer._policy_class,
                observation_space=env.observation_space,
                action_space=env.action_space,
                config={},
                # Test changing the mapping fn.
                policy_mapping_fn=lambda aid, eps, **kwargs: f"p{i}",
                # Change the list of policies to train.
                policies_to_train=[f"p{i}"],
            )
            pol_map = trainer.workers.local_worker().policy_map
            self.assertTrue(new_pol is not trainer.get_policy("p0"))
            self.assertTrue("p0" in pol_map)
            self.assertTrue("p1" in pol_map)
            self.assertTrue(len(pol_map) == i + 1)
            r = trainer.train()
            self.assertTrue("p1" in r["policy_reward_min"])

        # Delete all added policies again from trainer.
        for i in range(3, 0, -1):
            trainer.remove_policy(
                f"p{i}",
                policy_mapping_fn=lambda aid, eps, **kwargs: f"p{i - 1}",
                policies_to_train=[f"p{i - 1}"])

        trainer.stop()
def test_pg_compilation(self):
    """Test whether a PGTrainer can be built with all frameworks."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["rollout_fragment_length"] = 500
    # Test with filter to see whether they work w/o preprocessing.
    config["observation_filter"] = "MeanStdFilter"
    num_iterations = 1

    image_space = Box(-1.0, 1.0, shape=(84, 84, 3))
    simple_space = Box(-1.0, 1.0, shape=(3,))

    tune.register_env(
        "random_dict_env",
        lambda _: RandomEnv({
            "observation_space": Dict({
                "a": simple_space,
                "b": Discrete(2),
                "c": image_space,
            }),
            "action_space": Box(-1.0, 1.0, shape=(1,)),
        }),
    )
    tune.register_env(
        "random_tuple_env",
        lambda _: RandomEnv({
            "observation_space": Tuple(
                [simple_space, Discrete(2), image_space]),
            "action_space": Box(-1.0, 1.0, shape=(1,)),
        }),
    )

    for _ in framework_iterator(config, with_eager_tracing=True):
        # Test for different env types (discrete w/ and w/o image, + cont).
        for env in [
            "random_dict_env",
            "random_tuple_env",
            "MsPacmanNoFrameskip-v4",
            "CartPole-v0",
            "FrozenLake-v1",
        ]:
            print(f"env={env}")
            trainer = pg.PGTrainer(config=config, env=env)
            for i in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)

            check_compute_single_action(
                trainer, include_prev_action_reward=True)
def test_pg_compilation(self):
    """Test whether a PGTrainer can be built with both frameworks."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    num_iterations = 2

    for _ in framework_iterator(config):
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        for i in range(num_iterations):
            trainer.train()
        check_compute_action(trainer, include_prev_action_reward=True)
def test_bad_envs(self):
    """Tests different "bad env" errors."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0

    # Non-existing/non-registered gym env string.
    env = "Alien-Attack-v42"
    for _ in framework_iterator(config):
        self.assertRaisesRegex(
            EnvError,
            f"The env string you provided \\('{env}'\\) is",
            lambda: pg.PGTrainer(config=config, env=env),
        )

    # Malformed gym env string (must have v\d at end).
    env = "Alien-Attack-part-42"
    for _ in framework_iterator(config):
        self.assertRaisesRegex(
            EnvError,
            f"The env string you provided \\('{env}'\\) is",
            lambda: pg.PGTrainer(config=config, env=env),
        )

    # Non-existing class in a full-class-path.
    env = "ray.rllib.examples.env.random_env.RandomEnvThatDoesntExist"
    for _ in framework_iterator(config):
        self.assertRaisesRegex(
            EnvError,
            f"The env string you provided \\('{env}'\\) is",
            lambda: pg.PGTrainer(config=config, env=env),
        )

    # Non-existing module inside a full-class-path.
    env = "ray.rllib.examples.env.module_that_doesnt_exist.SomeEnv"
    for _ in framework_iterator(config):
        self.assertRaisesRegex(
            EnvError,
            f"The env string you provided \\('{env}'\\) is",
            lambda: pg.PGTrainer(config=config, env=env),
        )
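# Hedged sketch (not part of the test above): a custom env string only
# resolves once it has been registered, which avoids the EnvError paths
# exercised by `test_bad_envs`. `RandomEnv` is the example env class from the
# module path the test references.
from ray import tune
import ray.rllib.agents.pg as pg
from ray.rllib.examples.env.random_env import RandomEnv

tune.register_env("my_random_env-v0", lambda env_config: RandomEnv(env_config))
trainer = pg.PGTrainer(config={"num_workers": 0}, env="my_random_env-v0")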
def test_space_inference_from_remote_workers(self):
    # Expect to not do space inference if the learner has an env.
    env = gym.make("CartPole-v0")

    config = pg.DEFAULT_CONFIG.copy()
    config["env"] = "CartPole-v0"
    config["num_workers"] = 1

    # No env on driver -> expect longer build time due to space
    # lookup from remote worker.
    t0 = time.time()
    trainer = pg.PGTrainer(config=config)
    w_lookup = time.time() - t0
    print(f"No env on learner: {w_lookup}sec")
    trainer.stop()

    # Env on driver -> expect shorter build time due to no space
    # lookup required from remote worker.
    config["create_env_on_driver"] = True
    t0 = time.time()
    trainer = pg.PGTrainer(config=config)
    wo_lookup = time.time() - t0
    print(f"Env on learner: {wo_lookup}sec")
    self.assertLess(wo_lookup, w_lookup)
    trainer.stop()

    # Spaces given -> expect shorter build time due to no space
    # lookup required from remote worker.
    config["create_env_on_driver"] = False
    config["observation_space"] = env.observation_space
    config["action_space"] = env.action_space
    t0 = time.time()
    trainer = pg.PGTrainer(config=config)
    wo_lookup = time.time() - t0
    print(f"Spaces given manually in config: {wo_lookup}sec")
    self.assertLess(wo_lookup, w_lookup)
    trainer.stop()
def test_pg_compilation(self):
    """Test whether a PGTrainer can be built with both frameworks."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["rollout_fragment_length"] = 500
    num_iterations = 1

    for _ in framework_iterator(config):
        for env in ["FrozenLake-v0", "CartPole-v0"]:
            trainer = pg.PGTrainer(config=config, env=env)
            for i in range(num_iterations):
                print(trainer.train())
            check_compute_single_action(
                trainer, include_prev_action_reward=True)
def test_pg_compilation(self):
    """Test whether a PGTrainer can be built with both frameworks."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0
    num_iterations = 2

    for fw in framework_iterator(config):
        # For tf, build with fake-GPUs.
        config["_fake_gpus"] = fw == "tf"
        config["num_gpus"] = 2 if fw == "tf" else 0
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        for i in range(num_iterations):
            print(trainer.train())
        check_compute_single_action(
            trainer, include_prev_action_reward=True)
def get_rl_agent(agent_name, config, env_to_agent):
    if agent_name == A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        raise ValueError(f"Invalid agent name: {agent_name}")
    return agent
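# Hedged usage sketch for `get_rl_agent` above. `PG` is one of the agent-name
# constants the function compares against; its value is defined elsewhere in
# this codebase, and the config keys shown are standard RLlib settings.
config = {"num_workers": 0}
agent = get_rl_agent(PG, config, "CartPole-v0")
result = agent.train()
print(result["episode_reward_mean"])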
def get_rllib_agent(agent_name, env_name, env, env_to_agent):
    config = get_config(env_name, env, 1) if is_rllib_agent(agent_name) else {}
    if agent_name == RLLIB_A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        # Guard against returning an unbound `agent` for unknown names.
        raise ValueError(f"Invalid agent name: {agent_name}")
    return agent
def test_validate_config_idempotent(self):
    """
    Asserts that validate_config run multiple times on COMMON_CONFIG will be
    idempotent.
    """
    # Given:
    standard_config = copy.deepcopy(COMMON_CONFIG)
    trainer = pg.PGTrainer(env="CartPole-v0", config=standard_config)

    # When (we validate config 2 times).
    # Try deprecated `Trainer._validate_config()` method (static).
    trainer._validate_config(standard_config, trainer)
    config_v1 = copy.deepcopy(standard_config)

    # Try new method: `Trainer.validate_config()` (non-static).
    trainer.validate_config(standard_config)
    config_v2 = copy.deepcopy(standard_config)

    # Make sure nothing changed.
    self.assertEqual(config_v1, config_v2)

    trainer.stop()
#         array[offset + observation] = 1
#
# Can optionally call trainer.restore(path) to load a checkpoint.

# class MyPreprocessor(Preprocessor):
#     def _init_shape(self, obs_space, options):
#         return (4, 4, 1)
#
#     def transform(self, observation):
#         arr = np.zeros(16, )
#         arr[observation] = 1
#         return arr.reshape(4, 4, 1)

# ModelCatalog.register_custom_preprocessor("my_prep", OneHotPreprocessor)
# config["model"]["custom_preprocessor"] = "my_prep"
# ModelCatalog.register_custom_model("my_model", Dense)
# config["model"]["custom_model"] = "my_model"

# trainer = ppo.PPOTrainer(config=config, env=env_test)
# trainer = dqn.DQNTrainer(config=config, env=env_test)
trainer = pg.PGTrainer(config=config, env=env_test)

for i in range(200):
    # Perform one iteration of training the policy with PG.
    result = trainer.train()
    print(pretty_print(result))

    if i % 100 == 0:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)
def test_add_delete_policy(self):
    config = pg.DEFAULT_CONFIG.copy()
    config.update({
        "env": MultiAgentCartPole,
        "env_config": {
            "config": {
                "num_agents": 4,
            },
        },
        "num_workers": 2,  # Test on remote workers as well.
        "num_cpus_per_worker": 0.1,
        "model": {
            "fcnet_hiddens": [5],
            "fcnet_activation": "linear",
        },
        "train_batch_size": 100,
        "rollout_fragment_length": 50,
        "multiagent": {
            # Start with a single policy.
            "policies": {"p0"},
            "policy_mapping_fn": lambda aid, eps, worker, **kwargs: "p0",
            # And only two policies that can be stored in memory at a
            # time.
            "policy_map_capacity": 2,
        },
        "evaluation_num_workers": 1,
        "evaluation_config": {
            "num_cpus_per_worker": 0.1,
        },
    })

    for _ in framework_iterator(config):
        trainer = pg.PGTrainer(config=config)
        pol0 = trainer.get_policy("p0")
        r = trainer.train()
        self.assertTrue("p0" in r["info"][LEARNER_INFO])
        for i in range(1, 3):

            def new_mapping_fn(agent_id, episode, worker, **kwargs):
                return f"p{choice([i, i - 1])}"

            # Add a new policy.
            pid = f"p{i}"
            new_pol = trainer.add_policy(
                pid,
                trainer.get_default_policy_class(config),
                # Test changing the mapping fn.
                policy_mapping_fn=new_mapping_fn,
                # Change the list of policies to train.
                policies_to_train=[f"p{i}", f"p{i-1}"],
            )
            pol_map = trainer.workers.local_worker().policy_map
            self.assertTrue(new_pol is not pol0)
            for j in range(i + 1):
                self.assertTrue(f"p{j}" in pol_map)
            self.assertTrue(len(pol_map) == i + 1)
            trainer.train()
            checkpoint = trainer.save()

            # Test restoring from the checkpoint (which has more policies
            # than what's defined in the config dict).
            test = pg.PGTrainer(config=config)
            test.restore(checkpoint)

            # Make sure evaluation worker also gets the restored policy.
            def _has_policy(w):
                return w.get_policy("p0") is not None

            self.assertTrue(
                all(test.evaluation_workers.foreach_worker(_has_policy)))

            # Make sure trainer can continue training the restored policy.
            pol0 = test.get_policy("p0")
            test.train()
            # Test creating an action with the added (and restored) policy.
            a = test.compute_single_action(
                np.zeros_like(pol0.observation_space.sample()), policy_id=pid)
            self.assertTrue(pol0.action_space.contains(a))
            test.stop()

        # Delete all added policies again from trainer.
        for i in range(2, 0, -1):
            trainer.remove_policy(
                f"p{i}",
                # Note that the complete signature of a policy_mapping_fn
                # is: `agent_id, episode, worker, **kwargs`.
                policy_mapping_fn=lambda aid, eps, **kwargs: f"p{i - 1}",
                policies_to_train=[f"p{i - 1}"],
            )

        trainer.stop()
def test_pg_loss_functions(self):
    """Tests the PG loss function math."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["eager"] = True
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    # Fake CartPole episode of n time steps.
    train_batch = {
        SampleBatch.CUR_OBS: np.array([[0.1, 0.2, 0.3, 0.4],
                                       [0.5, 0.6, 0.7, 0.8],
                                       [0.9, 1.0, 1.1, 1.2]]),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]),
        SampleBatch.DONES: np.array([False, False, True])
    }

    # tf.
    trainer = pg.PGTrainer(config=config, env="CartPole-v0")
    policy = trainer.get_policy()
    vars = policy.model.trainable_variables()

    # Post-process (calculate simple (non-GAE) advantages) and attach to
    # train_batch dict.
    # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
    # [2.9701, 1.99, 1.0]
    train_batch = pg.post_process_advantages(policy, train_batch)
    # Check Advantage values.
    check(train_batch[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

    # Actual loss results.
    results = pg.pg_tf_loss(
        policy, policy.model, dist_class=Categorical,
        train_batch=train_batch)

    # Calculate expected results.
    expected_logits = fc(
        fc(train_batch[SampleBatch.CUR_OBS], vars[0].numpy(),
           vars[1].numpy()),
        vars[2].numpy(), vars[3].numpy())
    expected_logp = Categorical(expected_logits, policy.model).logp(
        train_batch[SampleBatch.ACTIONS])
    expected_loss = -np.mean(
        expected_logp * train_batch[Postprocessing.ADVANTAGES])
    check(results.numpy(), expected_loss, decimals=4)

    # Torch.
    config["use_pytorch"] = True
    trainer = pg.PGTrainer(config=config, env="CartPole-v0")
    policy = trainer.get_policy()
    train_batch = policy._lazy_tensor_dict(train_batch)
    results = pg.pg_torch_loss(
        policy, policy.model, dist_class=TorchCategorical,
        train_batch=train_batch)
    expected_logits = policy.model.last_output()
    expected_logp = TorchCategorical(expected_logits, policy.model).logp(
        train_batch[SampleBatch.ACTIONS])
    expected_loss = -np.mean(
        expected_logp.detach().numpy() *
        train_batch[Postprocessing.ADVANTAGES].numpy())
    check(results.detach().numpy(), expected_loss, decimals=4)
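# Hedged sketch of the scalar both loss tests compare against: the vanilla PG
# objective L = -mean(log pi(a|s) * A), computed here from raw logits via an
# explicit log-softmax. The logits below are made up for illustration; only
# the actions and advantages come from the fake episode above.
import numpy as np

def pg_loss(logits, actions, advantages):
    """Plain policy-gradient loss for a categorical policy."""
    # Log-softmax over the action dimension.
    logp_all = logits - np.log(np.sum(np.exp(logits), axis=-1, keepdims=True))
    # Log-prob of the action actually taken at each step.
    logp = logp_all[np.arange(len(actions)), actions]
    return -np.mean(logp * advantages)

logits = np.array([[0.5, -0.5], [0.1, 0.2], [-1.0, 1.0]])
print(pg_loss(logits, np.array([0, 1, 1]), np.array([2.9701, 1.99, 1.0])))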
def test_pg_loss_functions(self):
    """Tests the PG loss function math."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    # Fake CartPole episode of n time steps.
    train_batch = SampleBatch({
        SampleBatch.OBS: np.array([[0.1, 0.2, 0.3, 0.4],
                                   [0.5, 0.6, 0.7, 0.8],
                                   [0.9, 1.0, 1.1, 1.2]]),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.EPS_ID: np.array([1234, 1234, 1234]),
        SampleBatch.AGENT_INDEX: np.array([0, 0, 0]),
    })

    for fw, sess in framework_iterator(config, session=True):
        dist_cls = (Categorical if fw != "torch" else TorchCategorical)
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        vars = policy.model.trainable_variables()
        if sess:
            vars = policy.get_session().run(vars)

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
        # [2.9701, 1.99, 1.0]
        train_batch_ = pg.post_process_advantages(policy, train_batch.copy())
        if fw == "torch":
            train_batch_ = policy._lazy_tensor_dict(train_batch_)

        # Check Advantage values.
        check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

        # Actual loss results.
        if sess:
            results = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch_, shuffle=False))
        else:
            results = (pg.pg_tf_loss if fw in ["tf2", "tfe"] else
                       pg.pg_torch_loss)(
                           policy, policy.model, dist_class=dist_cls,
                           train_batch=train_batch_)

        # Calculate expected results.
        if fw != "torch":
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS], vars[0], vars[1],
                   framework=fw),
                vars[2], vars[3], framework=fw)
        else:
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS], vars[2], vars[3],
                   framework=fw),
                vars[0], vars[1], framework=fw)
        expected_logp = dist_cls(expected_logits, policy.model).logp(
            train_batch_[SampleBatch.ACTIONS])
        adv = train_batch_[Postprocessing.ADVANTAGES]
        if sess:
            expected_logp = sess.run(expected_logp)
        elif fw == "torch":
            expected_logp = expected_logp.detach().cpu().numpy()
            adv = adv.detach().cpu().numpy()
        else:
            expected_logp = expected_logp.numpy()
        expected_loss = -np.mean(expected_logp * adv)
        check(results, expected_loss, decimals=4)
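# Hedged sketch verifying the advantage values asserted in both loss tests:
# with plain PG (no GAE, no value baseline), the advantage at step t is just
# the discounted return G_t = sum_k gamma^k * r_{t+k}.
import numpy as np

def discounted_returns(rewards, gamma):
    """Compute G_t for every step of a single terminated episode."""
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

print(discounted_returns(np.array([1.0, 1.0, 1.0]), gamma=0.99))
# -> [2.9701 1.99   1.    ], matching the ADVANTAGES check above.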
    (dqn.dqn_policy.DQNTFPolicy, env.observation_space, env.action_space, {})
}

"""
trainer = dqn.DQNTrainer(
    env="multi_air-v0",
    config={
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["dqn_policy"],
        },
        "gamma": 0.99,
        "n_step": 3,
        # "num_gpus": 1,
        # "num_workers": 16
    })
"""

# trainer = dqn.DQNTrainer(env="multi_air-v0")
# trainer = ppo.PPOTrainer(env="multi_air-v0")
# trainer = a3c.A3CTrainer(env="multi_air-v0")
trainer = pg.PGTrainer(env="multi_air-v0")

for i in range(num_train_itr):
    x = trainer.train()
    if i % 100 == 0:
        print("**************************** Iteration:", i,
              "****************************")
        print(pretty_print(x))
        trainer.save()
def render(checkpoint, home_path):
    """
    Renders pybullet and mujoco environments.
    """
    alg = re.match('.+?(?=_)', os.path.basename(
        os.path.normpath(home_path))).group(0)
    current_env = re.search("(?<=_).*?(?=_)", os.path.basename(
        os.path.normpath(home_path))).group(0)
    checkpoint_path = (home_path + "checkpoint_" + str(checkpoint) +
                       "/checkpoint-" + str(checkpoint))
    config = json.load(open(home_path + "params.json"))
    config_bin = pickle.load(open(home_path + "params.pkl", "rb"))

    ray.shutdown()
    import pybullet_envs
    ray.init()

    ModelCatalog.register_custom_model("RBF", RBFModel)
    ModelCatalog.register_custom_model("MLP_2_64", MLP)
    ModelCatalog.register_custom_model("linear", Linear)

    if alg == "PPO":
        trainer = ppo.PPOTrainer(config_bin)
    if alg == "SAC":
        trainer = sac.SACTrainer(config)
    if alg == "DDPG":
        trainer = ddpg.DDPGTrainer(config)
    if alg == "PG":
        trainer = pg.PGTrainer(config)
    if alg == "A3C":
        trainer = a3c.A3CTrainer(config)
    if alg == "TD3":
        trainer = td3.TD3Trainer(config)
    if alg == "ES":
        trainer = es.ESTrainer(config)
    if alg == "ARS":
        trainer = ars.ARSTrainer(config)
    # "normalize_actions": true,

    trainer.restore(checkpoint_path)

    if "Bullet" in current_env:
        env = gym.make(current_env, render=True)
    else:
        env = gym.make(current_env)
    # env.unwrapped.reset_model = det_reset_model
    env._max_episode_steps = 10000
    obs = env.reset()

    action_hist = []
    m_act_hist = []
    state_hist = []
    obs_hist = []
    reward_hist = []

    done = False
    step = 0

    for t in range(10000):
        # For some algorithms you can get the sample mean out; you need to
        # change the value of the index below to match your env for now.
        # mean_actions = out_dict['behaviour_logits'][:17]
        # actions = trainer.compute_action(obs.flatten())
        # sampled_actions, _, out_dict = trainer.compute_action(
        #     obs.flatten(), full_fetch=True)
        sampled_actions = trainer.compute_action(obs.flatten())
        actions = sampled_actions

        obs, reward, done, _ = env.step(np.asarray(actions))
        # env.camera_adjust()
        env.render(mode='human')
        time.sleep(0.01)
        # env.render()
        # env.render(mode='rgb_array', close=True)
        # p.computeViewMatrix(cameraEyePosition=[0, 10, 5],
        #                     cameraTargetPosition=[0, 0, 0],
        #                     cameraUpVector=[0, 0, 0])
        # if step % 1000 == 0:
        #     env.reset()
        # step += 1

        action_hist.append(np.copy(actions))
        obs_hist.append(np.copy(obs))
        reward_hist.append(np.copy(reward))
        if done:
            obs = env.reset()

    # print(sum(reward_hist))
    # print(obs_hist)
    # plt.plot(action_hist)
    # plt.figure()
    # plt.plot(obs_hist)
    # plt.figure()

    # Reminder: the behavior logits that come out are the mean and logstd
    # (not log mean, despite the name "logit").
    # trainer.compute_action(obs, full_fetch=True)
    trainer.compute_action(obs)
    'env_config': {
        'simu_len': args.simu_len,
        'num_ex': args.num_ex
    }
}
for key, value in my_params.items():
    my_config[key] = value

# Initialize the Ray backend.
ray.init(address=args.default_ray_address)

# Create the RLlib trainer object.
trainer = pg.PGTrainer(config=my_config)

# Get a reference to the Azure ML Run object, used to log training metrics.
run = Run.get_context()

# Execute the RLlib training loop.
for i in range(args.num_iterations):
    start_time = time.time()
    result = trainer.train()
    end_time = time.time()
    print(
        'Iteration: {0} - Mean Score: {1} - Min Score: {2} - Max Score: {3}'
        ' - Elapsed time: {4} s.'.format(
            result['training_iteration'],
            result['episode_reward_mean'],
            result['episode_reward_min'],
            result['episode_reward_max'],
            end_time - start_time))
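# Hedged follow-up (an assumption, not in the original source): inside the
# training loop above, the `run` handle from `Run.get_context()` would
# typically also push each metric to Azure ML via the SDK's `run.log()`:
#     run.log('episode_reward_mean', result['episode_reward_mean'])
#     run.log('episode_reward_min', result['episode_reward_min'])
#     run.log('episode_reward_max', result['episode_reward_max'])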