def test_ddpg_exploration_and_with_random_prerun(self):
    """Tests DDPG's Exploration (w/ random actions for n timesteps)."""
    core_config = ddpg.DEFAULT_CONFIG.copy()
    core_config["num_workers"] = 0  # Run locally.
    obs = np.array([0.0, 0.1, -0.1])

    # Test against all frameworks.
    for _ in framework_iterator(core_config):
        config = core_config.copy()

        # Default OUNoise setup.
        trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
        # Setting explore=False should always return the same action.
        a_ = trainer.compute_action(obs, explore=False)
        self.assertEqual(trainer.get_policy().global_timestep, 1)
        for i in range(50):
            a = trainer.compute_action(obs, explore=False)
            self.assertEqual(trainer.get_policy().global_timestep, i + 2)
            check(a, a_)
        # explore=None (default: explore) should return different actions.
        actions = []
        for i in range(50):
            actions.append(trainer.compute_action(obs))
            self.assertEqual(trainer.get_policy().global_timestep, i + 52)
        check(np.std(actions), 0.0, false=True)
        trainer.stop()

        # Check randomness at beginning.
        config["exploration_config"] = {
            # Act randomly at beginning ...
            "random_timesteps": 50,
            # Then act very closely to deterministic actions thereafter.
            "ou_base_scale": 0.001,
            "initial_scale": 0.001,
            "final_scale": 0.001,
        }
        trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
        # ts=0 (get a deterministic action as per explore=False).
        deterministic_action = trainer.compute_action(obs, explore=False)
        self.assertEqual(trainer.get_policy().global_timestep, 1)
        # ts=1-49 (in random window).
        random_a = []
        for i in range(1, 50):
            random_a.append(trainer.compute_action(obs, explore=True))
            self.assertEqual(trainer.get_policy().global_timestep, i + 1)
            check(random_a[-1], deterministic_action, false=True)
        self.assertTrue(np.std(random_a) > 0.5)

        # ts > 50 (a=deterministic_action + scale * N[0,1]).
        for i in range(50):
            a = trainer.compute_action(obs, explore=True)
            self.assertEqual(trainer.get_policy().global_timestep, i + 51)
            check(a, deterministic_action, rtol=0.1)

        # ts >> 50 (BUT: explore=False -> expect deterministic action).
        for i in range(50):
            a = trainer.compute_action(obs, explore=False)
            self.assertEqual(trainer.get_policy().global_timestep, i + 101)
            check(a, deterministic_action)
        trainer.stop()
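# The RLlib test methods in this collection (this one and the other
# test_ddpg_* snippets below) assume imports along these lines. This is a
# sketch based on the ray 1.x module layout these tests were written against;
# adjust the paths to the ray version actually in use.
import numpy as np
from tempfile import TemporaryDirectory  # used by the checkpoint test below

import ray.rllib.agents.ddpg as ddpg
from ray.rllib.utils.test_utils import (
    check,
    check_compute_single_action,
    framework_iterator,
)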
def __init__(self, agent_name, env, config, logger_creator):
    assert agent_name in [
        "td3",
        "ddpg",
        "ppo",
    ], "Some policies are not currently supported (dqn, sac)"
    self.agent_name = agent_name

    if self.agent_name == "ppo":
        self.trainer = ppo.PPOTrainer(
            env=env,
            config=config,
            logger_creator=logger_creator,
        )
    elif self.agent_name == "ddpg":
        self.trainer = ddpg.DDPGTrainer(
            env=env,
            config=config,
            logger_creator=logger_creator,
        )
    elif self.agent_name == "td3":
        self.trainer = ddpg.TD3Trainer(
            env=env,
            config=config,
            logger_creator=logger_creator,
        )
def test_ddpg_compilation(self):
    """Test whether a DDPGTrainer can be built with both frameworks."""
    config = ddpg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["num_envs_per_worker"] = 2
    config["learning_starts"] = 0
    config["exploration_config"]["random_timesteps"] = 100

    num_iterations = 1

    # Test against all frameworks.
    for _ in framework_iterator(config):
        trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
        check_compute_single_action(trainer)
        # Ensure apply_gradient_fn is being called and updating global_step.
        if config["framework"] == "tf":
            a = trainer.get_policy().global_step.eval(
                trainer.get_policy().get_session())
        else:
            a = trainer.get_policy().global_step
        check(a, 500)
        trainer.stop()
def test(self, algo, path, lr, fc_hid, fc_act):
    """Test trained agent for a single episode. Return the episode reward."""
    # Instantiate env class.
    unused_shared = []
    unused_own = []
    unsatisfied_shared = []
    unsatisfied_own = []
    episode_reward = 0

    # self.config["num_workers"] = 0
    self.config["lr"] = lr
    self.config["model"]["fcnet_hiddens"] = fc_hid
    self.config["model"]["fcnet_activation"] = fc_act

    if algo == "ppo":
        self.agent = ppo.PPOTrainer(config=self.config)
    elif algo == "ddpg":
        self.agent = ddpg.DDPGTrainer(config=self.config)
    elif algo == "a3c":
        self.agent = a3c.A3CTrainer(config=self.config)
    elif algo == "impala":
        self.agent = impala.ImpalaTrainer(config=self.config)
    elif algo == "appo":
        self.agent = ppo.APPOTrainer(config=self.config)
    elif algo == "td3":
        self.agent = ddpg.TD3Trainer(config=self.config)

    self.agent.restore(path)
    env = caching_vM(config=self.config)

    obs = env.reset()
    done = False
    action = {}
    for agent_id, agent_obs in obs.items():
        policy_id = self.config["multiagent"]["policy_mapping_fn"](agent_id)
        action[agent_id] = self.agent.compute_action(
            agent_obs, policy_id=policy_id)
    obs, reward, done, info = env.step(action)
    done = done["__all__"]

    for x in range(len(info)):
        res = ast.literal_eval(info[x])
        unused_shared.append(res[0])
        unused_own.append(res[1])
        unsatisfied_shared.append(res[2])
        unsatisfied_own.append(res[3])

    print("reward == ", reward)
    # Sum up reward for all agents.
    episode_reward += sum(reward.values())

    return episode_reward, unused_shared, unused_own, \
        unsatisfied_shared, unsatisfied_own
def test(self, algo, path, lr, fc_hid, fc_act):
    """Test trained agent for a single episode. Return the episode reward."""
    # Instantiate env class.
    unused_shared = []
    unused_own = []
    unsatisfied_shared = []
    unsatisfied_own = []
    episode_reward = 0

    self.config_test["num_workers"] = 0
    self.config_test["lr"] = lr
    self.config_test["model"]["fcnet_hiddens"] = fc_hid
    self.config_test["model"]["fcnet_activation"] = fc_act

    if algo == "ppo":
        self.agent = ppo.PPOTrainer(config=self.config_test)
    elif algo == "ddpg":
        self.agent = ddpg.DDPGTrainer(config=self.config_test)
    elif algo == "a3c":
        self.agent = a3c.A3CTrainer(config=self.config_test)
    elif algo == "impala":
        self.agent = impala.ImpalaTrainer(config=self.config_test)
    elif algo == "appo":
        self.agent = ppo.APPOTrainer(config=self.config_test)
    elif algo == "td3":
        self.agent = ddpg.TD3Trainer(config=self.config_test)

    self.agent.restore(path)

    # Reset the test environment. The original code experimented with several
    # ways of obtaining the env (e.g. ContentCaching, the worker's local env);
    # here we use the instance stored on self.env, which the step loop below
    # already relies on.
    obs = self.env.reset()
    done = False

    while not done:
        action = self.agent.compute_action(obs)
        obs, reward, done, info = self.env.step(action)
        episode_reward += reward
        unused_shared.append(info["unused_shared"])
        unused_own.append(info["unused_own"])
        unsatisfied_shared.append(info["unsatisfied_shared"])
        unsatisfied_own.append(info["unsatisfied_own"])

    return episode_reward, unused_shared, unused_own, \
        unsatisfied_shared, unsatisfied_own
def test_ddpg_compilation(self):
    """Test whether a DDPGTrainer can be built (tf only here)."""
    config = ddpg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.

    # Only test against the tf framework.
    for _ in framework_iterator(config, "tf"):
        trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
        num_iterations = 2
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
def test_ddpg_checkpoint_save_and_restore(self):
    """Test whether a DDPGTrainer can save and load checkpoints."""
    config = ddpg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["num_envs_per_worker"] = 2
    config["learning_starts"] = 0
    config["exploration_config"]["random_timesteps"] = 100

    # Test against all frameworks.
    for _ in framework_iterator(config):
        trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
        trainer.train()
        with TemporaryDirectory() as temp_dir:
            checkpoint = trainer.save(temp_dir)
            trainer.restore(checkpoint)
        trainer.stop()
def test_ddpg_compilation(self):
    """Test whether a DDPGTrainer can be built with both frameworks."""
    config = ddpg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.

    # Test against all frameworks.
    for fw in ["tf", "eager", "torch"]:
        if fw != "tf":
            continue
        config["eager"] = fw == "eager"
        config["use_pytorch"] = fw == "torch"

        trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
        num_iterations = 2
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
def test_ddpg_fake_multi_gpu_learning(self):
    """Test whether DDPGTrainer can run SimpleEnv w/ faked multi-GPU."""
    config = ddpg.DEFAULT_CONFIG.copy()
    # Fake GPU setup.
    config["num_gpus"] = 2
    config["_fake_gpus"] = True

    env = "ray.rllib.agents.sac.tests.test_sac.SimpleEnv"
    config["env_config"] = {"config": {"repeat_delay": 0}}

    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = ddpg.DDPGTrainer(config=config, env=env)
        num_iterations = 2
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
        trainer.stop()
def test_ddpg_compilation(self):
    """Test whether a DDPGTrainer can be built with both frameworks."""
    config = ddpg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["num_envs_per_worker"] = 2
    config["learning_starts"] = 0
    config["exploration_config"]["random_timesteps"] = 100

    num_iterations = 2

    # Test against all frameworks.
    for _ in framework_iterator(config, ("tf", "torch")):
        trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0")
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
        check_compute_single_action(trainer)
def test_ddpg_fake_multi_gpu_learning(self):
    """Test whether DDPGTrainer can learn SimpleEnv w/ faked multi-GPU."""
    config = ddpg.DEFAULT_CONFIG.copy()
    # Fake GPU setup.
    config["num_gpus"] = 2
    config["_fake_gpus"] = True

    env = "ray.rllib.agents.sac.tests.test_sac.SimpleEnv"
    config["env_config"] = {"config": {"repeat_delay": 0}}

    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = ddpg.DDPGTrainer(config=config, env=env)
        num_iterations = 50
        learnt = False
        for i in range(num_iterations):
            results = trainer.train()
            print(f"R={results['episode_reward_mean']}")
            if results["episode_reward_mean"] > 70.0:
                learnt = True
                break
        assert learnt, \
            f"DDPG multi-GPU (with fake-GPUs) did not learn {env}!"
        trainer.stop()
def _continuous_run(self):
    import ray
    from ray import tune
    from ray.rllib.agents import ppo, ddpg

    ray.init(num_cpus=4, num_gpus=1, local_mode=True)

    configs = {
        "num_gpus": 1,
        "num_workers": 4,
        # "num_gpus_per_worker": 1,
        "framework": "torch",
        "simple_optimizer": True,
    }
    AGENT_CONFIG = {
        "ddpg": ddpg.DDPGTrainer(
            config=configs, env="MountainCarContinuous-v0"),
        "ppo": ppo.PPOTrainer(
            config=configs, env="MountainCarContinuous-v0"),
    }
    trainer = AGENT_CONFIG[self.configs["algorithm"]]
    # tune.run(agent, config={"env": "MountainCarContinuous-v0",
    #                         "framework": "torch", "num_gpus": 0})

    for i in range(2000):  # 2000 epochs.
        result = trainer.train()  # One training iteration per call.
        print(result)
    return
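# A note on the snippet above: instantiating both a DDPGTrainer and a
# PPOTrainer eagerly in the dict allocates two sets of rollout workers even
# though only one trainer is used. A minimal sketch of a lazier variant,
# assuming the same `configs` dict and the same "algorithm" key as in
# `_continuous_run` above (names here are illustrative, not from the source):
from ray.rllib.agents import ddpg, ppo

# Map algorithm names to trainer classes instead of instances, so only the
# selected algorithm actually builds workers.
TRAINER_CLS = {
    "ddpg": ddpg.DDPGTrainer,
    "ppo": ppo.PPOTrainer,
}

def build_trainer(algorithm, configs, env="MountainCarContinuous-v0"):
    # `algorithm` corresponds to self.configs["algorithm"] in the method above.
    return TRAINER_CLS[algorithm](config=configs, env=env)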
def test_ddpg_loss_function(self):
    """Tests DDPG loss function results across all frameworks."""
    config = ddpg.DEFAULT_CONFIG.copy()
    # Run locally.
    config["num_workers"] = 0
    config["learning_starts"] = 0
    config["twin_q"] = True
    config["use_huber"] = True
    config["huber_threshold"] = 1.0
    config["gamma"] = 0.99
    # Make this small (seems to introduce errors).
    config["l2_reg"] = 1e-10
    config["prioritized_replay"] = False
    # Use very simple nets.
    config["actor_hiddens"] = [10]
    config["critic_hiddens"] = [10]
    # Make sure, timing differences do not affect trainer.train().
    config["min_iter_time_s"] = 0
    config["timesteps_per_iteration"] = 100

    map_ = {
        # Normal net.
        "default_policy/actor_hidden_0/kernel": "policy_model.action_0._model.0.weight",
        "default_policy/actor_hidden_0/bias": "policy_model.action_0._model.0.bias",
        "default_policy/actor_out/kernel": "policy_model.action_out._model.0.weight",
        "default_policy/actor_out/bias": "policy_model.action_out._model.0.bias",
        "default_policy/sequential/q_hidden_0/kernel": "q_model.q_hidden_0._model.0.weight",
        "default_policy/sequential/q_hidden_0/bias": "q_model.q_hidden_0._model.0.bias",
        "default_policy/sequential/q_out/kernel": "q_model.q_out._model.0.weight",
        "default_policy/sequential/q_out/bias": "q_model.q_out._model.0.bias",
        # -- twin.
        "default_policy/sequential_1/twin_q_hidden_0/kernel": "twin_q_model.twin_q_hidden_0._model.0.weight",
        "default_policy/sequential_1/twin_q_hidden_0/bias": "twin_q_model.twin_q_hidden_0._model.0.bias",
        "default_policy/sequential_1/twin_q_out/kernel": "twin_q_model.twin_q_out._model.0.weight",
        "default_policy/sequential_1/twin_q_out/bias": "twin_q_model.twin_q_out._model.0.bias",
        # Target net.
        "default_policy/actor_hidden_0_1/kernel": "policy_model.action_0._model.0.weight",
        "default_policy/actor_hidden_0_1/bias": "policy_model.action_0._model.0.bias",
        "default_policy/actor_out_1/kernel": "policy_model.action_out._model.0.weight",
        "default_policy/actor_out_1/bias": "policy_model.action_out._model.0.bias",
        "default_policy/sequential_2/q_hidden_0/kernel": "q_model.q_hidden_0._model.0.weight",
        "default_policy/sequential_2/q_hidden_0/bias": "q_model.q_hidden_0._model.0.bias",
        "default_policy/sequential_2/q_out/kernel": "q_model.q_out._model.0.weight",
        "default_policy/sequential_2/q_out/bias": "q_model.q_out._model.0.bias",
        # -- twin.
        "default_policy/sequential_3/twin_q_hidden_0/kernel": "twin_q_model.twin_q_hidden_0._model.0.weight",
        "default_policy/sequential_3/twin_q_hidden_0/bias": "twin_q_model.twin_q_hidden_0._model.0.bias",
        "default_policy/sequential_3/twin_q_out/kernel": "twin_q_model.twin_q_out._model.0.weight",
        "default_policy/sequential_3/twin_q_out/bias": "twin_q_model.twin_q_out._model.0.bias",
    }

    env = SimpleEnv
    batch_size = 100
    if env is SimpleEnv:
        obs_size = (batch_size, 1)
        actions = np.random.random(size=(batch_size, 1))
    elif env == "CartPole-v0":
        obs_size = (batch_size, 4)
        actions = np.random.randint(0, 2, size=(batch_size, ))
    else:
        obs_size = (batch_size, 3)
        actions = np.random.random(size=(batch_size, 1))

    # Batch of size=n.
    input_ = self._get_batch_helper(obs_size, actions, batch_size)

    # Simply compare loss values AND grads of all frameworks with each
    # other.
    prev_fw_loss = weights_dict = None
    expect_c, expect_a, expect_t = None, None, None

    # History of tf-updated NN-weights over n training steps.
    tf_updated_weights = []
    # History of input batches used.
    tf_inputs = []

    for fw, sess in framework_iterator(
            config, frameworks=("tf", "torch"), session=True):
        # Generate Trainer and get its default Policy object.
        trainer = ddpg.DDPGTrainer(config=config, env=env)
        policy = trainer.get_policy()
        p_sess = None
        if sess:
            p_sess = policy.get_session()

        # Set all weights (of all nets) to fixed values.
        if weights_dict is None:
            assert fw == "tf"  # Start with the tf vars-dict.
            weights_dict = policy.get_weights()
        else:
            assert fw == "torch"  # Then transfer that to torch Model.
            model_dict = self._translate_weights_to_torch(
                weights_dict, map_)
            policy.model.load_state_dict(model_dict)
            policy.target_model.load_state_dict(model_dict)

        if fw == "torch":
            # Actually convert to torch tensors.
            input_ = policy._lazy_tensor_dict(input_)
            input_ = {k: input_[k] for k in input_.keys()}

        # Only run the expectation once, should be the same anyways
        # for all frameworks.
        if expect_c is None:
            expect_c, expect_a, expect_t = \
                self._ddpg_loss_helper(
                    input_,
                    weights_dict,
                    sorted(weights_dict.keys()),
                    fw,
                    gamma=config["gamma"],
                    huber_threshold=config["huber_threshold"],
                    l2_reg=config["l2_reg"],
                    sess=sess)

        # Get actual outs and compare to expectation AND previous
        # framework. c=critic, a=actor, e=entropy, t=td-error.
        if fw == "tf":
            c, a, t, tf_c_grads, tf_a_grads = p_sess.run(
                [
                    policy.critic_loss,
                    policy.actor_loss,
                    policy.td_error,
                    policy._critic_optimizer.compute_gradients(
                        policy.critic_loss,
                        policy.model.q_variables()),
                    policy._actor_optimizer.compute_gradients(
                        policy.actor_loss,
                        policy.model.policy_variables()),
                ],
                feed_dict=policy._get_loss_inputs_dict(
                    input_, shuffle=False))
            # Check pure loss values.
            check(c, expect_c)
            check(a, expect_a)
            check(t, expect_t)

            tf_c_grads = [g for g, v in tf_c_grads]
            tf_a_grads = [g for g, v in tf_a_grads]

        elif fw == "torch":
            loss_torch(policy, policy.model, None, input_)
            c, a, t = policy.critic_loss, policy.actor_loss, \
                policy.td_error
            # Check pure loss values.
            check(c, expect_c)
            check(a, expect_a)
            check(t, expect_t)

            # Test actor gradients.
            policy._actor_optimizer.zero_grad()
            assert all(
                v.grad is None for v in policy.model.q_variables())
            assert all(
                v.grad is None for v in policy.model.policy_variables())
            a.backward()
            # `actor_loss` depends on Q-net vars
            # (but not twin-Q-net vars!).
            assert not any(
                v.grad is None for v in policy.model.q_variables()[:4])
            assert all(
                v.grad is None for v in policy.model.q_variables()[4:])
            assert not all(
                torch.mean(v.grad) == 0
                for v in policy.model.policy_variables())
            assert not all(
                torch.min(v.grad) == 0
                for v in policy.model.policy_variables())
            # Compare with tf ones.
            torch_a_grads = [
                v.grad for v in policy.model.policy_variables()
            ]
            for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g))
                else:
                    check(tf_g, torch_g)

            # Test critic gradients.
            policy._critic_optimizer.zero_grad()
            assert all(
                v.grad is None or torch.mean(v.grad) == 0.0
                for v in policy.model.q_variables())
            assert all(
                v.grad is None or torch.min(v.grad) == 0.0
                for v in policy.model.q_variables())
            c.backward()
            assert not all(
                torch.mean(v.grad) == 0
                for v in policy.model.q_variables())
            assert not all(
                torch.min(v.grad) == 0
                for v in policy.model.q_variables())
            # Compare with tf ones.
            torch_c_grads = [v.grad for v in policy.model.q_variables()]
            for tf_g, torch_g in zip(tf_c_grads, torch_c_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g))
                else:
                    check(tf_g, torch_g)
            # Compare (unchanged(!) actor grads) with tf ones.
            torch_a_grads = [
                v.grad for v in policy.model.policy_variables()
            ]
            for tf_g, torch_g in zip(tf_a_grads, torch_a_grads):
                if tf_g.shape != torch_g.shape:
                    check(tf_g, np.transpose(torch_g))
                else:
                    check(tf_g, torch_g)

        # Store this framework's losses in prev_fw_loss to compare with
        # next framework's outputs.
        if prev_fw_loss is not None:
            check(c, prev_fw_loss[0])
            check(a, prev_fw_loss[1])
            check(t, prev_fw_loss[2])

        prev_fw_loss = (c, a, t)

        # Update weights from our batch (n times).
        for update_iteration in range(10):
            print("train iteration {}".format(update_iteration))
            if fw == "tf":
                in_ = self._get_batch_helper(
                    obs_size, actions, batch_size)
                tf_inputs.append(in_)
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                buf = LocalReplayBuffer.get_instance_for_testing()
                buf._fake_batch = in_
                trainer.train()
                updated_weights = policy.get_weights()
                # Net must have changed.
                if tf_updated_weights:
                    check(
                        updated_weights[
                            "default_policy/actor_hidden_0/kernel"],
                        tf_updated_weights[-1][
                            "default_policy/actor_hidden_0/kernel"],
                        false=True)
                tf_updated_weights.append(updated_weights)

            # Compare with updated tf-weights. Must all be the same.
            else:
                tf_weights = tf_updated_weights[update_iteration]
                in_ = tf_inputs[update_iteration]
                # Set a fake-batch to use
                # (instead of sampling from replay buffer).
                buf = LocalReplayBuffer.get_instance_for_testing()
                buf._fake_batch = in_
                trainer.train()
                # Compare updated model and target weights.
                for tf_key in tf_weights.keys():
                    tf_var = tf_weights[tf_key]
                    # Target-net keys (see `map_` above) -> compare
                    # against the torch target model.
                    if re.search(
                            "actor_out_1|actor_hidden_0_1|sequential_[23]",
                            tf_key):
                        torch_var = policy.target_model.state_dict()[
                            map_[tf_key]]
                    # All other keys -> compare against the main model.
                    else:
                        torch_var = policy.model.state_dict()[map_[tf_key]]
                    if tf_var.shape != torch_var.shape:
                        check(tf_var, np.transpose(torch_var), rtol=0.07)
                    else:
                        check(tf_var, torch_var, rtol=0.07)
config["num_gpus"] = 0 if algorithm == 'A2C': RLAgent = a2c.A2CTrainer(env=env_name, config=config) elif algorithm == 'ADQN': RLAgent = adqn.ApexTrainer(env=env_name, config=config) elif algorithm == 'DQN': RLAgent = dqn.DQNTrainer(env=env_name, config=config) elif algorithm == 'IMPALA': RLAgent = impala.ImpalaTrainer(env=env_name, config=config) elif algorithm == 'PPO': RLAgent = ppo.PPOTrainer(env=env_name, config=config) elif algorithm == 'RDQN': RLAgent = dqn.DQNTrainer(env=env_name, config=config) elif algorithm == "DDPG": RLAgent = ddpg.DDPGTrainer(env=env_name, config=config) print(checkpoint_path, flush=True) #RLAgent.restore(checkpoint_path) num_runs = 50 totalRewards = np.empty((num_runs, )) policy = RLAgent.get_policy("policy_0") for j in range(num_runs): observations = env.reset() rewards, action_dict = {}, {} for agent_id in env.agents: rewards[agent_id] = 0 totalReward = 0
def render(checkpoint, home_path):
    """Renders pybullet and mujoco environments."""
    alg = re.match(
        ".+?(?=_)", os.path.basename(os.path.normpath(home_path))).group(0)
    current_env = re.search(
        "(?<=_).*?(?=_)",
        os.path.basename(os.path.normpath(home_path))).group(0)
    checkpoint_path = (home_path + "checkpoint_" + str(checkpoint) +
                       "/checkpoint-" + str(checkpoint))
    config = json.load(open(home_path + "params.json"))
    config_bin = pickle.load(open(home_path + "params.pkl", "rb"))

    ray.shutdown()
    import pybullet_envs
    ray.init()

    ModelCatalog.register_custom_model("RBF", RBFModel)
    ModelCatalog.register_custom_model("MLP_2_64", MLP)
    ModelCatalog.register_custom_model("linear", Linear)

    if alg == "PPO":
        trainer = ppo.PPOTrainer(config_bin)
    if alg == "SAC":
        trainer = sac.SACTrainer(config)
    if alg == "DDPG":
        trainer = ddpg.DDPGTrainer(config)
    if alg == "PG":
        trainer = pg.PGTrainer(config)
    if alg == "A3C":
        trainer = a3c.A3CTrainer(config)
    if alg == "TD3":
        trainer = td3.TD3Trainer(config)
    if alg == "ES":
        trainer = es.ESTrainer(config)
    if alg == "ARS":
        trainer = ars.ARSTrainer(config)
    # "normalize_actions": true,

    trainer.restore(checkpoint_path)

    if "Bullet" in current_env:
        env = gym.make(current_env, render=True)
    else:
        env = gym.make(current_env)
    # env.unwrapped.reset_model = det_reset_model
    env._max_episode_steps = 10000
    obs = env.reset()

    action_hist = []
    m_act_hist = []
    state_hist = []
    obs_hist = []
    reward_hist = []

    done = False
    step = 0

    for t in range(10000):
        # For some algorithms you can get the sample mean out; change the
        # slice index to match your env, e.g.:
        # sampled_actions, _, out_dict = trainer.compute_action(
        #     obs.flatten(), full_fetch=True)
        # mean_actions = out_dict["behaviour_logits"][:17]
        sampled_actions = trainer.compute_action(obs.flatten())
        actions = sampled_actions

        obs, reward, done, _ = env.step(np.asarray(actions))
        # env.camera_adjust()
        env.render(mode="human")
        time.sleep(0.01)
        # env.render(mode="rgb_array", close=True)

        # if step % 1000 == 0:
        #     env.reset()
        # step += 1

        action_hist.append(np.copy(actions))
        obs_hist.append(np.copy(obs))
        reward_hist.append(np.copy(reward))
        if done:
            obs = env.reset()

    # print(sum(reward_hist))
    # plt.plot(action_hist)
    # plt.plot(obs_hist)

    # Reminder: the behaviour logits returned by full_fetch are the mean and
    # log-std (not the log-mean, despite the name "logit").
    # trainer.compute_action(obs, full_fetch=True)
    trainer.compute_action(obs)
"pol0": (None, temp_env.observation_space[0], temp_env.action_space[0], { "agent_id": 0, }), "pol1": (None, temp_env.observation_space[1], temp_env.action_space[1], { "agent_id": 1, }), }, "policy_mapping_fn": lambda x: "pol0" if x == 0 else "pol1", # # Function mapping agent ids to policy ids # "observation_fn": central_critic_observer, # See rllib/evaluation/observation_function.py for more info } #### Restore agent ######################################### agent = ddpg.DDPGTrainer(config=config) # with open(ARGS.exp+'/checkpoint.txt', 'r+') as f: # checkpoint = f.read() checkpoint = "/home/mahendra/git/gym-pybullet-drones/experiments/learning/results/save-payloadcoop-2-cc-payload_one_sensor-xyz_yaw-03.25.2021_20.20.48/DDPG_2021-03-25_20-20-51/DDPG_this-aviary-v0_ebf05_00000_0_2021-03-25_20-20-52/checkpoint_40/checkpoint-40" agent.restore(checkpoint) #### Extract and print policies ############################ policy0 = agent.get_policy("pol0") # print("action model 0", policy0.model.action_model) # print("value model 0", policy0.model.value_model) policy1 = agent.get_policy("pol1") # print("action model 1", policy1.model.action_model) # print("value model 1", policy1.model.value_model) #### Create test environment ############################### test_env = PayloadCoop(num_drones=NUM_DRONES,