def test_impala_compilation(self):
    """Test whether an ImpalaTrainer can be built with both frameworks."""
    config = impala.DEFAULT_CONFIG.copy()
    num_iterations = 1

    for _ in framework_iterator(config, frameworks=("torch", "tf")):
        local_cfg = config.copy()
        for env in ["Pendulum-v0", "CartPole-v0"]:
            print("Env={}".format(env))
            # Test w/o LSTM.
            print("w/o LSTM")
            trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
            for i in range(num_iterations):
                print(trainer.train())
            check_compute_action(trainer)
            trainer.stop()

            # Test w/ LSTM.
            print("w/ LSTM")
            local_cfg["model"]["use_lstm"] = True
            trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
            for i in range(num_iterations):
                print(trainer.train())
            check_compute_action(trainer)
            trainer.stop()
def test_impala_compilation(self):
    """Test whether an ImpalaTrainer can be built with both frameworks."""
    config = impala.DEFAULT_CONFIG.copy()
    num_iterations = 1

    for _ in framework_iterator(config):
        local_cfg = config.copy()
        for env in ["Pendulum-v0", "CartPole-v0"]:
            print("Env={}".format(env))
            # Test w/o LSTM.
            print("w/o LSTM")
            local_cfg["model"]["use_lstm"] = False
            local_cfg["num_aggregation_workers"] = 0
            trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
            for i in range(num_iterations):
                print(trainer.train())
            check_compute_single_action(trainer)
            trainer.stop()

            # Test w/ LSTM.
            print("w/ LSTM")
            local_cfg["model"]["use_lstm"] = True
            local_cfg["model"]["lstm_use_prev_action"] = True
            local_cfg["model"]["lstm_use_prev_reward"] = True
            local_cfg["num_aggregation_workers"] = 1
            trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
            for i in range(num_iterations):
                print(trainer.train())
            check_compute_single_action(
                trainer, include_state=True, include_prev_action_reward=True)
            trainer.stop()
def test_impala_fake_multi_gpu_learning(self):
    """Test whether IMPALATrainer can learn CartPole w/ faked multi-GPU."""
    config = copy.deepcopy(impala.DEFAULT_CONFIG)
    # Fake GPU setup.
    config["_fake_gpus"] = True
    config["num_gpus"] = 2
    config["train_batch_size"] *= 2
    # Test w/ LSTMs.
    config["model"]["use_lstm"] = True

    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = impala.ImpalaTrainer(config=config, env="CartPole-v0")
        num_iterations = 200
        learnt = False
        for i in range(num_iterations):
            results = trainer.train()
            print(results)
            if results["episode_reward_mean"] > 55.0:
                learnt = True
                break
        assert learnt, \
            "IMPALA multi-GPU (with fake-GPUs) did not learn CartPole!"
        trainer.stop()
def test_impala_compilation(self):
    """Test whether an ImpalaTrainer can be built with both frameworks."""
    config = impala.DEFAULT_CONFIG.copy()
    config["num_gpus"] = 0
    config["model"]["lstm_use_prev_action"] = True
    config["model"]["lstm_use_prev_reward"] = True
    num_iterations = 1
    env = "CartPole-v0"

    for _ in framework_iterator(config, with_eager_tracing=True):
        local_cfg = config.copy()
        # Test with and w/o aggregation workers (this has nothing
        # to do with LSTMs, though).
        for lstm in [False, True]:
            local_cfg["num_aggregation_workers"] = 0 if not lstm else 1
            local_cfg["model"]["use_lstm"] = lstm
            print("lstm={} aggregation-workers={}".format(
                lstm, local_cfg["num_aggregation_workers"]))
            trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
            for i in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)
            check_compute_single_action(
                trainer,
                include_state=lstm,
                include_prev_action_reward=lstm,
            )
            trainer.stop()
def load(self, path):
    """Load a trained RLlib agent from the specified path.

    Call this before testing a trained agent.

    :param path: Path pointing to the agent's saved checkpoint (only used
        for RLlib agents).
    """
    self.agent = impala.ImpalaTrainer(config=self.config)
    self.agent.restore(path)
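A usage sketch for the `load()` method above, assuming it lives on a wrapper class that also holds `self.config`. The `Runner` name, checkpoint path, and env are hypothetical placeholders; only `ImpalaTrainer.restore()` and `compute_action()` come from the snippet and the RLlib Trainer API.

import gym

runner = Runner()  # hypothetical wrapper class that defines load() above
runner.load("checkpoints/checkpoint-100")  # hypothetical checkpoint path

env = gym.make("CartPole-v0")
obs = env.reset()
done = False
while not done:
    # Roll out greedily with the restored IMPALA policy.
    action = runner.agent.compute_action(obs)
    obs, reward, done, info = env.step(action)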
def test_impala_lr_schedule(self):
    config = impala.DEFAULT_CONFIG.copy()
    config["num_gpus"] = 0  # Do not use any (fake) GPUs.
    # Test whether the lr_schedule correctly overrides (ignores) the
    # "lr" setting: the first lr should be 0.0005.
    config["lr"] = 0.1
    config["lr_schedule"] = [
        [0, 0.0005],
        [10000, 0.000001],
    ]
    config["env"] = "CartPole-v0"

    def get_lr(result):
        return result["info"]["learner"][DEFAULT_POLICY_ID]["cur_lr"]

    for fw in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = impala.ImpalaTrainer(config=config)
        policy = trainer.get_policy()
        try:
            if fw == "tf":
                check(policy.get_session().run(policy.cur_lr), 0.0005)
            else:
                check(policy.cur_lr, 0.0005)
            r1 = trainer.train()
            r2 = trainer.train()
            assert get_lr(r2) < get_lr(r1), (r1, r2)
        finally:
            trainer.stop()
def test(self, algo, path, lr, fc_hid, fc_act):
    """Test trained agent for a single episode. Return the episode reward."""
    unused_shared = []
    unused_own = []
    unsatisfied_shared = []
    unsatisfied_own = []
    episode_reward = 0

    # self.config["num_workers"] = 0
    self.config["lr"] = lr
    self.config['model']["fcnet_hiddens"] = fc_hid
    self.config['model']["fcnet_activation"] = fc_act

    if algo == "ppo":
        self.agent = ppo.PPOTrainer(config=self.config)
    elif algo == "ddpg":
        self.agent = ddpg.DDPGTrainer(config=self.config)
    elif algo == "a3c":
        self.agent = a3c.A3CTrainer(config=self.config)
    elif algo == "impala":
        self.agent = impala.ImpalaTrainer(config=self.config)
    elif algo == "appo":
        self.agent = ppo.APPOTrainer(config=self.config)
    elif algo == "td3":
        self.agent = ddpg.TD3Trainer(config=self.config)
    self.agent.restore(path)

    # Instantiate env class.
    env = caching_vM(config=self.config)
    obs = env.reset()
    done = False
    action = {}
    for agent_id, agent_obs in obs.items():
        policy_id = self.config['multiagent']['policy_mapping_fn'](agent_id)
        action[agent_id] = self.agent.compute_action(
            agent_obs, policy_id=policy_id)
    obs, reward, done, info = env.step(action)
    done = done['__all__']

    for x in range(len(info)):
        res = ast.literal_eval(info[x])
        unused_shared.append(res[0])
        unused_own.append(res[1])
        unsatisfied_shared.append(res[2])
        unsatisfied_own.append(res[3])
    print("reward == ", reward)
    # Sum up reward for all agents.
    episode_reward += sum(reward.values())
    return (episode_reward, unused_shared, unused_own,
            unsatisfied_shared, unsatisfied_own)
def test(self, algo, path, lr, fc_hid, fc_act):
    """Test trained agent for a single episode. Return the episode reward."""
    unused_shared = []
    unused_own = []
    unsatisfied_shared = []
    unsatisfied_own = []
    episode_reward = 0

    self.config_test["num_workers"] = 0
    self.config_test["lr"] = lr
    self.config_test['model']["fcnet_hiddens"] = fc_hid
    self.config_test['model']["fcnet_activation"] = fc_act

    if algo == "ppo":
        self.agent = ppo.PPOTrainer(config=self.config_test)
    elif algo == "ddpg":
        self.agent = ddpg.DDPGTrainer(config=self.config_test)
    elif algo == "a3c":
        self.agent = a3c.A3CTrainer(config=self.config_test)
    elif algo == "impala":
        self.agent = impala.ImpalaTrainer(config=self.config_test)
    elif algo == "appo":
        self.agent = ppo.APPOTrainer(config=self.config_test)
    elif algo == "td3":
        self.agent = ddpg.TD3Trainer(config=self.config_test)
    self.agent.restore(path)

    # Roll out one episode on the env instance (calling reset() on the
    # ContentCaching class itself would fail; use the same env that
    # step() is called on below).
    obs = self.env.reset()
    done = False
    while not done:
        action = self.agent.compute_action(obs)
        obs, reward, done, info = self.env.step(action)
        episode_reward += reward
        unused_shared.append(info["unused_shared"])
        unused_own.append(info["unused_own"])
        unsatisfied_shared.append(info["unsatisfied_shared"])
        unsatisfied_own.append(info["unsatisfied_own"])
    return (episode_reward, unused_shared, unused_own,
            unsatisfied_shared, unsatisfied_own)
def test_no_gpus_error(self):
    """Tests errors related to no-GPU/too-few-GPUs/etc.

    This test will only work ok on a CPU-only machine.
    """
    config = impala.DEFAULT_CONFIG.copy()
    env = "CartPole-v0"

    for _ in framework_iterator(config):
        self.assertRaisesRegex(
            RuntimeError,
            # (?s): "dot matches all" (also newlines).
            "(?s)Found 0 GPUs on your machine.+To change the config",
            lambda: impala.ImpalaTrainer(config=config, env=env),
        )
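For reference, the fix the asserted error message points to: IMPALA's default config requests one learner GPU, so on a CPU-only machine `num_gpus` must be set to 0 before building the trainer (a minimal sketch, mirroring what several other snippets in this section do).

import ray.rllib.agents.impala as impala

config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0  # run the learner on CPU; avoids the "Found 0 GPUs" error
trainer = impala.ImpalaTrainer(config=config, env="CartPole-v0")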
def test_impala_lr_schedule(self):
    config = impala.DEFAULT_CONFIG.copy()
    config["lr_schedule"] = [
        [0, 0.0005],
        [10000, 0.000001],
    ]
    local_cfg = config.copy()
    trainer = impala.ImpalaTrainer(config=local_cfg, env="CartPole-v0")

    def get_lr(result):
        return result["info"]["learner"][DEFAULT_POLICY_ID]["cur_lr"]

    try:
        r1 = trainer.train()
        r2 = trainer.train()
        assert get_lr(r2) < get_lr(r1), (r1, r2)
    finally:
        trainer.stop()
def get_rl_agent(agent_name, config, env_to_agent):
    if agent_name == A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        raise ValueError("Invalid agent name: {}".format(agent_name))
    return agent
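A brief usage sketch for `get_rl_agent()`; the agent-name constants (e.g. `IMPALA`) are assumed to be defined elsewhere in the same module, and the config values follow RLlib's Trainer signature.

# Build a CPU-only IMPALA agent and run a single training iteration.
agent = get_rl_agent(IMPALA, config={"num_gpus": 0}, env_to_agent="CartPole-v0")
result = agent.train()
print(result["episode_reward_mean"])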
def get_rllib_agent(agent_name, env_name, env, env_to_agent):
    config = get_config(env_name, env, 1) if is_rllib_agent(agent_name) else {}
    if agent_name == RLLIB_A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        # Without this branch, an unknown agent_name would fall through
        # and `return agent` would raise an UnboundLocalError.
        raise ValueError("Invalid agent name: {}".format(agent_name))
    return agent
def test_impala_lr_schedule(self):
    config = impala.DEFAULT_CONFIG.copy()
    config["num_gpus"] = 0  # Do not use any (fake) GPUs.
    # Test whether the lr_schedule correctly overrides (ignores) the
    # "lr" setting: the first lr should be 0.05.
    config["lr"] = 0.1
    config["lr_schedule"] = [
        [0, 0.05],
        [10000, 0.000001],
    ]
    config["env"] = "CartPole-v0"

    def get_lr(result):
        return result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][
            LEARNER_STATS_KEY]["cur_lr"]

    for fw in framework_iterator(config):
        trainer = impala.ImpalaTrainer(config=config)
        policy = trainer.get_policy()
        try:
            if fw == "tf":
                check(policy.get_session().run(policy.cur_lr), 0.05)
            else:
                check(policy.cur_lr, 0.05)
            # Due to the async'ness of IMPALA, learner-stats metrics
            # could be delayed by one iteration. Do 3 train() calls here
            # and measure a guaranteed decrease in lr between the 1st
            # and the 3rd.
            r1 = trainer.train()
            r2 = trainer.train()
            r3 = trainer.train()
            lr1 = get_lr(r1)
            lr2 = get_lr(r2)
            lr3 = get_lr(r3)
            assert lr2 <= lr1, (lr1, lr2)
            assert lr3 <= lr2, (lr2, lr3)
            assert lr3 < lr1, (lr1, lr3)
        finally:
            trainer.stop()
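To make the expected decay concrete, here is a pure-Python illustration of how a two-point `lr_schedule` behaves, assuming RLlib's default piecewise-linear interpolation between schedule points (the last value is held once the final timestep is passed). The `lr_at()` helper is hypothetical and only mirrors the schedule used in the test above.

def lr_at(t, schedule=((0, 0.05), (10000, 0.000001))):
    # Linearly interpolate between the two schedule points; hold the
    # final value once t passes the last timestep.
    (t0, v0), (t1, v1) = schedule
    if t >= t1:
        return v1
    frac = (t - t0) / (t1 - t0)
    return v0 + frac * (v1 - v0)

print(lr_at(0))      # 0.05 -- matches the check() assertions above
print(lr_at(5000))   # ~0.025, halfway through the linear decay
print(lr_at(20000))  # 1e-06, held after the schedule's last point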
wandb.init(
    project='pacman',
    notes='rllib',
    tags=['impala', 'rllib', 'dev run'],
    config=log_config,
)


if __name__ == "__main__":
    args = parser.parse_args()
    config = build_training_config(args)
    log_config = build_log_config(args, config)
    setup_wandb(args, log_config)

    # Start ray and load a training instance.
    ray.init()
    trainer = impala.ImpalaTrainer(config=config, env=args.env)

    # Find the new folder and make sure we can upload videos.
    base_dir = '/home/ubuntu/ray_results/'
    expdir = max([base_dir + d for d in os.listdir(base_dir)],
                 key=os.path.getmtime)
    print("Exp dir detected: {}".format(expdir))

    # Begin training.
    timesteps = 0
    for i in range(args.iterations):
        start_time = time.time()
        result = trainer.train()
        print("Finished iter {}".format(i), result)
        elapsed_time = time.time() - start_time
elif agent == "IMPALA":
    trainer_config = impala.DEFAULT_CONFIG.copy()
    trainer_config['log_level'] = "WARN"
    trainer_config['clip_rewards'] = True
    trainer_config["num_gpus"] = 1
    trainer_config['output'] = './checkpoints/'
    trainer_config['rollout_fragment_length'] = 50
    trainer_config['train_batch_size'] = 500
    trainer_config["remote_worker_envs"] = True
    trainer_config['num_workers'] = 8
    trainer_config['num_envs_per_worker'] = 4
    trainer_config['lr_schedule'] = [
        [0, 0.0005],
        [20000000, 0.000000000001],
    ]
    trainer_config['framework'] = 'tf' if framework == "tf" else 'torch'
    agent = impala.ImpalaTrainer(config=trainer_config, env=game)

if training:
    trainer = train(agent, checkpoint=checkpoint)
else:
    test(agent, game, state, scenario, wrapper, checkpoint=checkpoint,
         render=True, record=record, episode_count=episode_count)
import os
# os.environ["TUNE_RESULT_DIR"] = "/media/drake/BlackPassport/ray_results/"

import ray
import ray.rllib.agents.impala as impala
from ray.tune.logger import pretty_print

ray.init()
config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 5
config["num_envs_per_worker"] = 5
trainer = impala.ImpalaTrainer(config=config, env="LunarLander-v2")

# Can optionally call trainer.restore(path) to load a checkpoint.
for i in range(1000):
    # Perform one iteration of training the policy with IMPALA.
    result = trainer.train()
    print(pretty_print(result))
if "." not in sys.path:
    sys.path.insert(0, ".")
from custom_model import CustomModel

# Register custom model in the ModelCatalog.
ModelCatalog.register_custom_model("CustomCNN", CustomModel)

ray.init()
config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 1
config["model"]["custom_model"] = "CustomCNN"
config["log_level"] = "INFO"
config["framework"] = "tf2"
trainer = impala.ImpalaTrainer(config=config, env="procgen:procgen-coinrun-v0")

# Custom training loop.
for step in range(1000):
    result = trainer.train()
    print(pretty_print(result))
    if step % 100 == 0:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)

# Restore the agent from a checkpoint and start a new training run with a
# different config, e.g.:
# config["lr"] = ray.tune.grid_search([0.01, 0.001])
# ray.tune.run(trainer, config=config, restore=checkpoint)

ray.shutdown()
        self.base_model = tf.keras.models.Sequential([input, output])
        self.register_variables(self.base_model.variables)

    def forward(self, input_dict, state, seq_lens):
        return self.base_model(input_dict["obs"]), []


ModelCatalog.register_custom_model("MLPModel", MLPModel)
ModelCatalog.register_custom_model("MLPModelV2", MLPModelV2)

if algorithm == 'A2C':
    RLAgent = a2c.A2CTrainer(env=env_name, config=config)
elif algorithm == 'ADQN':
    RLAgent = adqn.ApexTrainer(env=env_name, config=config)
elif algorithm == 'DQN':
    RLAgent = dqn.DQNTrainer(env=env_name, config=config)
elif algorithm == 'IMPALA':
    RLAgent = impala.ImpalaTrainer(env=env_name, config=config)
elif algorithm == 'PPO':
    RLAgent = ppo.PPOTrainer(env=env_name, config=config)
elif algorithm == 'RDQN':
    RLAgent = dqn.DQNTrainer(env=env_name, config=config)
RLAgent.restore(checkpoint_path)

num_runs = 50
totalRewards = np.empty((num_runs,))

for j in range(num_runs):
    observations = env.reset()
    rewards, action_dict = {}, {}
    for agent_id in env.agent_ids:
        assert isinstance(agent_id, int), "Error: agent_ids are not ints."
        rewards[agent_id] = 0
config["num_envs_per_worker"] = num_envs_per_worker
trainer = ppo.PPOTrainer(config=config, env=env)
run_policy(trainer)

config = a3c.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = num_workers
config["num_envs_per_worker"] = num_envs_per_worker
trainer = a3c.A3CTrainer(config=config, env=env)
run_policy(trainer)

config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = num_workers
config["num_envs_per_worker"] = num_envs_per_worker
trainer = impala.ImpalaTrainer(config=config, env=env)
run_policy(trainer)

config = asp.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = num_workers
config["num_envs_per_worker"] = num_envs_per_worker
config["lr_schedule"] = [
    [0, 0.0007],
    [20000000, 0.000000000001],
]
config["significance_threshold"] = 0.1
trainer = asp.ASPTrainer(config=config, env=env)
run_policy(trainer)

config = easgd.DEFAULT_CONFIG.copy()