Example 1
    def test_impala_compilation(self):
        """Test whether an ImpalaTrainer can be built with both frameworks."""
        config = impala.DEFAULT_CONFIG.copy()
        num_iterations = 1

        for _ in framework_iterator(config, frameworks=("torch", "tf")):
            local_cfg = config.copy()
            for env in ["Pendulum-v0", "CartPole-v0"]:
                print("Env={}".format(env))
                print("w/ LSTM")
                # Test w/o LSTM.
                trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
                for i in range(num_iterations):
                    print(trainer.train())
                check_compute_action(trainer)
                trainer.stop()

                # Test w/ LSTM.
                print("w/o LSTM")
                local_cfg["model"]["use_lstm"] = True
                trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
                for i in range(num_iterations):
                    print(trainer.train())
                check_compute_action(trainer)
                trainer.stop()
Example 2
    def test_impala_compilation(self):
        """Test whether an ImpalaTrainer can be built with both frameworks."""
        config = impala.DEFAULT_CONFIG.copy()
        num_iterations = 1

        for _ in framework_iterator(config):
            local_cfg = config.copy()
            for env in ["Pendulum-v0", "CartPole-v0"]:
                print("Env={}".format(env))
                print("w/o LSTM")
                # Test w/o LSTM.
                local_cfg["model"]["use_lstm"] = False
                local_cfg["num_aggregation_workers"] = 0
                trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
                for i in range(num_iterations):
                    print(trainer.train())
                check_compute_single_action(trainer)
                trainer.stop()

                # Test w/ LSTM.
                print("w/ LSTM")
                local_cfg["model"]["use_lstm"] = True
                local_cfg["model"]["lstm_use_prev_action"] = True
                local_cfg["model"]["lstm_use_prev_reward"] = True
                local_cfg["num_aggregation_workers"] = 1
                trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
                for i in range(num_iterations):
                    print(trainer.train())
                check_compute_single_action(trainer,
                                            include_state=True,
                                            include_prev_action_reward=True)
                trainer.stop()
Example 3
    def test_impala_fake_multi_gpu_learning(self):
        """Test whether IMPALATrainer can learn CartPole w/ faked multi-GPU."""
        config = copy.deepcopy(impala.DEFAULT_CONFIG)
        # Fake GPU setup.
        config["_fake_gpus"] = True
        config["num_gpus"] = 2

        config["train_batch_size"] *= 2

        # Test w/ LSTMs.
        config["model"]["use_lstm"] = True

        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            trainer = impala.ImpalaTrainer(config=config, env="CartPole-v0")
            num_iterations = 200
            learnt = False
            for i in range(num_iterations):
                results = trainer.train()
                print(results)
                if results["episode_reward_mean"] > 55.0:
                    learnt = True
                    break
            assert learnt, \
                "IMPALA multi-GPU (with fake-GPUs) did not learn CartPole!"
            trainer.stop()
Example 4
    def test_impala_compilation(self):
        """Test whether an ImpalaTrainer can be built with both frameworks."""
        config = impala.DEFAULT_CONFIG.copy()
        config["num_gpus"] = 0
        config["model"]["lstm_use_prev_action"] = True
        config["model"]["lstm_use_prev_reward"] = True
        num_iterations = 1
        env = "CartPole-v0"

        for _ in framework_iterator(config, with_eager_tracing=True):
            local_cfg = config.copy()
            for lstm in [False, True]:
                local_cfg["num_aggregation_workers"] = 0 if not lstm else 1
                local_cfg["model"]["use_lstm"] = lstm
                print("lstm={} aggregation-workers={}".format(
                    lstm, local_cfg["num_aggregation_workers"]))
                # Test with and w/o aggregation workers (this has nothing
                # to do with LSTMs, though).
                trainer = impala.ImpalaTrainer(config=local_cfg, env=env)
                for i in range(num_iterations):
                    results = trainer.train()
                    check_train_results(results)
                    print(results)

                check_compute_single_action(
                    trainer,
                    include_state=lstm,
                    include_prev_action_reward=lstm,
                )
                trainer.stop()
Example 5
    def load(self, path):
        """
        Load a trained RLlib agent from the specified path. Call this before testing a trained agent.
        :param path: Path pointing to the agent's saved checkpoint (only used for RLlib agents)
        """
        self.agent = impala.ImpalaTrainer(config=self.config)
        self.agent.restore(path)
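For reference, a minimal, self-contained sketch of the same restore flow outside the class, assuming a placeholder config, env name, and checkpoint path (the method above passes its own self.config instead):

import ray
import ray.rllib.agents.impala as impala

ray.init()

# Placeholder config and env; adapt these to whatever the agent was trained on.
config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
trainer = impala.ImpalaTrainer(config=config, env="CartPole-v0")

# Restore weights from a previously saved checkpoint (path is a placeholder).
trainer.restore("/path/to/checkpoints/checkpoint-100")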
Example 6
    def test_impala_lr_schedule(self):
        config = impala.DEFAULT_CONFIG.copy()
        config["num_gpus"] = 0
        # Test whether we correctly ignore the "lr" setting.
        # The first lr should be 0.0005.
        config["lr"] = 0.1
        config["lr_schedule"] = [
            [0, 0.0005],
            [10000, 0.000001],
        ]
        config["num_gpus"] = 0  # Do not use any (fake) GPUs.
        config["env"] = "CartPole-v0"

        def get_lr(result):
            return result["info"]["learner"][DEFAULT_POLICY_ID]["cur_lr"]

        for fw in framework_iterator(config, frameworks=("tf", "torch")):
            trainer = impala.ImpalaTrainer(config=config)
            policy = trainer.get_policy()

            try:
                if fw == "tf":
                    check(policy.get_session().run(policy.cur_lr), 0.0005)
                else:
                    check(policy.cur_lr, 0.0005)
                r1 = trainer.train()
                r2 = trainer.train()
                assert get_lr(r2) < get_lr(r1), (r1, r2)
            finally:
                trainer.stop()
Example 7
    def test(self, algo, path, lr, fc_hid, fc_act):
        """Test trained agent for a single episode. Return the episode reward"""
        # instantiate env class
        unused_shared = []
        unused_own = []
        unsatisfied_shared = []
        unsatisfied_own = []

        episode_reward = 0

        #self.config["num_workers"] = 0
        self.config["lr"] = lr
        self.config['model']["fcnet_hiddens"] = fc_hid
        self.config['model']["fcnet_activation"] = fc_act

        if algo == "ppo":
            self.agent = ppo.PPOTrainer(config=self.config)
        if algo == "ddpg":
            self.agent = ddpg.DDPGTrainer(config=self.config)
        if algo == "a3c":
            self.agent = a3c.A3CTrainer(config=self.config)
        if algo == "impala":
            self.agent = impala.ImpalaTrainer(config=self.config)
        if algo == "appo":
            self.agent = ppo.APPOTrainer(config=self.config)
        if algo == "td3":
            self.agent = ddpg.TD3Trainer(config=self.config)

        self.agent.restore(path)

        env = caching_vM(config=self.config)

        obs = env.reset()
        done = False

        action = {}
        for agent_id, agent_obs in obs.items():
            policy_id = self.config['multiagent']['policy_mapping_fn'](
                agent_id)
            action[agent_id] = self.agent.compute_action(agent_obs,
                                                         policy_id=policy_id)
        obs, reward, done, info = env.step(action)
        done = done['__all__']

        for x in range(len(info)):
            res = ast.literal_eval(info[x])
            unused_shared.append(res[0])
            unused_own.append(res[1])
            unsatisfied_shared.append(res[2])
            unsatisfied_own.append(res[3])

        print("reward == ", reward)
        # sum up reward for all agents
        episode_reward += sum(reward.values())

        return episode_reward, unused_shared, unused_own, unsatisfied_shared, unsatisfied_own
Example 8
    def test(self, algo, path, lr, fc_hid, fc_act):
        """Test trained agent for a single episode. Return the episode reward"""
        # instantiate env class
        unused_shared = []
        unused_own = []
        unsatisfied_shared = []
        unsatisfied_own = []

        episode_reward = 0
        self.config_test["num_workers"] = 0
        self.config_test["lr"] = lr
        self.config_test['model']["fcnet_hiddens"] = fc_hid
        self.config_test['model']["fcnet_activation"] = fc_act

        if algo == "ppo":
            self.agent = ppo.PPOTrainer(config=self.config_test)
        if algo == "ddpg":
            self.agent = ddpg.DDPGTrainer(config=self.config_test)
        if algo == "a3c":
            self.agent = a3c.A3CTrainer(config=self.config_test)
        if algo == "impala":
            self.agent = impala.ImpalaTrainer(config=self.config_test)
        if algo == "appo":
            self.agent = ppo.APPOTrainer(config=self.config_test)
        if algo == "td3":
            self.agent = ddpg.TD3Trainer(config=self.config_test)

        self.agent.restore(path)

        #env = self.agent.workers.local_worker().env
        #env = self.env_class(self.env_config)
        #env = ContentCaching(*self.config_train)
        #env = self.config_train["env"]#env_config)
        #env = self.env_class(3)
        #env = ContentCaching
        #env = self.env
        #self.env = ContentCaching
        #env = self.config_train["env"]
        
     
        obs = self.env.reset()
        done = False

        while not done:
            action = self.agent.compute_action(obs)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward

            unused_shared.append(info["unused_shared"])
            unused_own.append(info["unused_own"])
            unsatisfied_shared.append(info["unsatisfied_shared"])
            unsatisfied_own.append(info["unsatisfied_own"])
        
        return episode_reward, unused_shared, unused_own, unsatisfied_shared, unsatisfied_own
Example 9
    def test_no_gpus_error(self):
        """Tests errors related to no-GPU/too-few GPUs/etc.

        This test will only work ok on a CPU-only machine.
        """

        config = impala.DEFAULT_CONFIG.copy()
        env = "CartPole-v0"

        for _ in framework_iterator(config):
            self.assertRaisesRegex(
                RuntimeError,
                # (?s): "dot matches all" (also newlines).
                "(?s)Found 0 GPUs on your machine.+To change the config",
                lambda: impala.ImpalaTrainer(config=config, env=env),
            )
Example 10
    def test_impala_lr_schedule(self):
        config = impala.DEFAULT_CONFIG.copy()
        config["lr_schedule"] = [
            [0, 0.0005],
            [10000, 0.000001],
        ]
        local_cfg = config.copy()
        trainer = impala.ImpalaTrainer(config=local_cfg, env="CartPole-v0")

        def get_lr(result):
            return result["info"]["learner"][DEFAULT_POLICY_ID]["cur_lr"]

        try:
            r1 = trainer.train()
            r2 = trainer.train()
            assert get_lr(r2) < get_lr(r1), (r1, r2)
        finally:
            trainer.stop()
Example 11
def get_rl_agent(agent_name, config, env_to_agent):
    if agent_name == A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        raise Exception("Not valid agent name")
    return agent
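A hedged usage sketch for get_rl_agent(); the IMPALA name constant and the config values below are illustrative assumptions standing in for whatever the original module defines:

import ray

ray.init()

# IMPALA is assumed to be the module-level constant checked inside
# get_rl_agent(); the config values are illustrative only.
config = {"num_gpus": 0, "num_workers": 1}
agent = get_rl_agent(IMPALA, config, "CartPole-v0")

result = agent.train()
print(result["episode_reward_mean"])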
Example 12
def get_rllib_agent(agent_name, env_name, env, env_to_agent):
    config = get_config(env_name, env, 1) if is_rllib_agent(agent_name) else {}
    if agent_name == RLLIB_A2C:
        import ray.rllib.agents.a3c as a2c
        agent = a2c.A2CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_A3C:
        import ray.rllib.agents.a3c as a3c
        agent = a3c.A3CTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_BC:
        import ray.rllib.agents.marwil as bc
        agent = bc.BCTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.DQNTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APEX_DQN:
        import ray.rllib.agents.dqn as dqn
        agent = dqn.ApexTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_IMPALA:
        import ray.rllib.agents.impala as impala
        agent = impala.ImpalaTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_MARWIL:
        import ray.rllib.agents.marwil as marwil
        agent = marwil.MARWILTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PG:
        import ray.rllib.agents.pg as pg
        agent = pg.PGTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_PPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.PPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_APPO:
        import ray.rllib.agents.ppo as ppo
        agent = ppo.APPOTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_SAC:
        import ray.rllib.agents.sac as sac
        agent = sac.SACTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_UCB:
        import ray.rllib.contrib.bandits.agents.lin_ucb as lin_ucb
        agent = lin_ucb.LinUCBTrainer(config=config, env=env_to_agent)
    elif agent_name == RLLIB_LIN_TS:
        import ray.rllib.contrib.bandits.agents.lin_ts as lin_ts
        agent = lin_ts.LinTSTrainer(config=config, env=env_to_agent)
    else:
        raise Exception("Invalid agent name: {}".format(agent_name))
    return agent
Example 13
    def test_impala_lr_schedule(self):
        config = impala.DEFAULT_CONFIG.copy()
        config["num_gpus"] = 0
        # Test whether we correctly ignore the "lr" setting.
        # The first lr should be 0.05.
        config["lr"] = 0.1
        config["lr_schedule"] = [
            [0, 0.05],
            [10000, 0.000001],
        ]
        config["num_gpus"] = 0  # Do not use any (fake) GPUs.
        config["env"] = "CartPole-v0"

        def get_lr(result):
            return result["info"][LEARNER_INFO][DEFAULT_POLICY_ID][LEARNER_STATS_KEY][
                "cur_lr"
            ]

        for fw in framework_iterator(config):
            trainer = impala.ImpalaTrainer(config=config)
            policy = trainer.get_policy()

            try:
                if fw == "tf":
                    check(policy.get_session().run(policy.cur_lr), 0.05)
                else:
                    check(policy.cur_lr, 0.05)
                r1 = trainer.train()
                r2 = trainer.train()
                r3 = trainer.train()
                # Due to the asynchronous nature of IMPALA, learner-stats
                # metrics may be delayed by one iteration. Do 3 train() calls
                # and check for a guaranteed lr decrease between the 1st and
                # 3rd results.
                lr1 = get_lr(r1)
                lr2 = get_lr(r2)
                lr3 = get_lr(r3)
                assert lr2 <= lr1, (lr1, lr2)
                assert lr3 <= lr2, (lr2, lr3)
                assert lr3 < lr1, (lr1, lr3)
            finally:
                trainer.stop()
Example 14
    wandb.init(
        project='pacman', notes='rllib',
        tags=['impala', 'rllib', 'dev run'],
        config=log_config
    )
    
    
if __name__ == "__main__":
    args = parser.parse_args()
    config = build_training_config(args)
    log_config = build_log_config(args, config)
    setup_wandb(args, log_config)

    # Start ray and load a training instance.
    ray.init()
    trainer = impala.ImpalaTrainer(config=config, env=args.env)

    # Find the new folder and make sure we can upload videos
    base_dir = '/home/ubuntu/ray_results/'
    expdir = max([base_dir + d for d in os.listdir(base_dir)], key=os.path.getmtime)
    print("Exp dir detected: {}".format(expdir))
    
    # Begin training
    timesteps = 0 
    for i in range(args.iterations):
        start_time = time.time()

        result = trainer.train()
        print("Finished iter {}".format(i), result)

        elapsed_time = time.time() - start_time
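        # Illustrative continuation only (not in the original snippet): one way
        # to report per-iteration metrics to the wandb run created above.
        # "episode_reward_mean" and "timesteps_total" are standard RLlib
        # result keys; wandb.log() accepts a flat dict of metrics.
        wandb.log({
            "iteration": i,
            "episode_reward_mean": result["episode_reward_mean"],
            "timesteps_total": result["timesteps_total"],
            "iter_seconds": elapsed_time,
        })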
Example 15
    elif agent == "IMPALA":
        trainer_config = impala.DEFAULT_CONFIG.copy()
        trainer_config['log_level'] = "WARN"
        trainer_config['clip_rewards'] = True
        trainer_config["num_gpus"] = 1
        trainer_config['output'] = './checkpoints/'
        trainer_config['rollout_fragment_length'] = 50
        trainer_config['train_batch_size'] = 500
        trainer_config["remote_worker_envs"] = True
        trainer_config['num_workers'] = 8
        trainer_config['num_envs_per_worker'] = 4
        trainer_config['lr_schedule'] = [
            [0, 0.0005],
            [20000000, 0.000000000001],
        ]
        trainer_config['framework'] = 'tf' if framework == "tf" else 'torch'

        agent = impala.ImpalaTrainer(config=trainer_config, env=game)

    if training:
        trainer = train(agent, checkpoint=checkpoint)
    else:
        test(agent,
             game,
             state,
             scenario,
             wrapper,
             checkpoint=checkpoint,
             render=True,
             record=record,
             episode_count=episode_count)
Example 16
import os

# os.environ["TUNE_RESULT_DIR"] = "/media/drake/BlackPassport/ray_results/"

import ray
import ray.rllib.agents.impala as impala
from ray.tune.logger import pretty_print

ray.init()

config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 5
config["num_envs_per_worker"] = 5
trainer = impala.ImpalaTrainer(config=config, env="LunarLander-v2")

# Can optionally call trainer.restore(path) to load a checkpoint.

for i in range(1000):
    # Perform one iteration of training the policy with IMPALA
    result = trainer.train()
    print(pretty_print(result))
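The comment above mentions trainer.restore(path); a minimal optional extension (not part of the original script) that produces such a checkpoint after the loop:

# Optional: write a final checkpoint so it can later be loaded with
# trainer.restore(checkpoint_path). trainer.save() returns the path.
checkpoint_path = trainer.save()
print("Final checkpoint written to", checkpoint_path)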
Example 17
if not "." in sys.path:
    sys.path.insert(0, ".")
from custom_model import CustomModel

# Register custom-model in ModelCatalog
ModelCatalog.register_custom_model("CustomCNN", CustomModel)

ray.init()
config = impala.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 1
config["model"]["custom_model"] = "CustomCNN"
config["log_level"] = "INFO"
config["framework"] = "tf2"
trainer = impala.ImpalaTrainer(config=config, env="procgen:procgen-coinrun-v0")

for step in range(1000):
    # Custom training loop
    result = trainer.train()
    print(pretty_print(result))

    if step % 100 == 0:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)

# Restore agent from a checkpoint and start a new training run with a different config
# config["lr"] =  ray.tune.grid_search([0.01, 0.001])"]
# ray.tune.run(trainer, config=config, restore=checkpoint)

ray.shutdown()
Example 18
        self.base_model = tf.keras.models.Sequential([input, output])
        self.register_variables(self.base_model.variables)
    def forward(self, input_dict, state, seq_lens):
        return self.base_model(input_dict["obs"]), []

ModelCatalog.register_custom_model("MLPModel", MLPModel)
ModelCatalog.register_custom_model("MLPModelV2", MLPModelV2)

if algorithm == 'A2C':
    RLAgent = a2c.A2CTrainer(env=env_name, config=config)
elif algorithm == 'ADQN':
    RLAgent = adqn.ApexTrainer(env=env_name, config=config)
elif algorithm == 'DQN':
    RLAgent = dqn.DQNTrainer(env=env_name, config=config)
elif algorithm == 'IMPALA':
    RLAgent = impala.ImpalaTrainer(env=env_name, config=config)
elif algorithm == 'PPO':
    RLAgent = ppo.PPOTrainer(env=env_name, config=config)
elif algorithm == 'RDQN':
    RLAgent = dqn.DQNTrainer(env=env_name, config=config)
RLAgent.restore(checkpoint_path)

num_runs = 50
totalRewards = np.empty((num_runs,))

for j in range(num_runs):
    observations = env.reset()
    rewards, action_dict = {}, {}
    for agent_id in env.agent_ids:
        assert isinstance(agent_id, int), "Error: agent_ids are not ints."
        rewards[agent_id] = 0
Example 19
        config["num_envs_per_worker"] = num_envs_per_worker
        trainer = ppo.PPOTrainer(config=config, env=env)
        run_policy(trainer)

        config = a3c.DEFAULT_CONFIG.copy()
        config["num_gpus"] = 0
        config["num_workers"] = num_workers
        config["num_envs_per_worker"] = num_envs_per_worker
        trainer = a3c.A3CTrainer(config=config, env=env)
        run_policy(trainer)

        config = impala.DEFAULT_CONFIG.copy()
        config["num_gpus"] = 0
        config["num_workers"] = num_workers
        config["num_envs_per_worker"] = num_envs_per_worker
        trainer = impala.ImpalaTrainer(config=config, env=env)
        run_policy(trainer)

        config = asp.DEFAULT_CONFIG.copy()
        config["num_gpus"] = 0
        config["num_workers"] = num_workers
        config["num_envs_per_worker"] = num_envs_per_worker
        config["lr_schedule"] = [
            [0, 0.0007],
            [20000000, 0.000000000001],
        ]
        config["significance_threshold"] = 0.1
        trainer = asp.ASPTrainer(config=config, env=env)
        run_policy(trainer)

        config = easgd.DEFAULT_CONFIG.copy()