Example #1
    def test_ppo_sample_waste(self):
        # Check we at least collect the initial wave of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "rollout_fragment_length": 200,
                             "train_batch_size": 128,
                             "num_workers": 3,
                         })
        result = ppo.train()
        self.assertEqual(result["info"]["num_steps_sampled"], 600)
        ppo.stop()

        # Check we collect at least the specified amount of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "rollout_fragment_length": 200,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        result = ppo.train()
        self.assertEqual(result["info"]["num_steps_sampled"], 1200)
        ppo.stop()

        # Check in vectorized mode
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "rollout_fragment_length": 200,
                             "num_envs_per_worker": 2,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        result = ppo.train()
        self.assertEqual(result["info"]["num_steps_sampled"], 1200)
        ppo.stop()
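(The expected counts above follow from the sampling arithmetic: each collection wave presumably gathers num_workers × rollout_fragment_length = 3 × 200 = 600 steps, so at least 600 steps are sampled even though train_batch_size is only 128. With train_batch_size=900 a single wave is not enough, so a second wave brings the total to 1200; in the vectorized case each worker runs two envs, so one wave already yields 3 × 2 × 200 = 1200 steps.)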
Example #2
    def testPPOSampleWaste(self):
        ray.init(num_cpus=4)

        # Check we at least collect the initial wave of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "train_batch_size": 128,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
        ppo.stop()

        # Check we collect at least the specified amount of samples
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
        ppo.stop()

        # Check in vectorized mode
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "num_envs_per_worker": 2,
                             "train_batch_size": 900,
                             "num_workers": 3,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
        ppo.stop()

        # Check legacy mode
        ppo = PPOTrainer(env="CartPole-v0",
                         config={
                             "sample_batch_size": 200,
                             "train_batch_size": 128,
                             "num_workers": 3,
                             "straggler_mitigation": True,
                         })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 200)
        ppo.stop()
Example #3
    def test_minibatch_sequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOTrainer(
            env="counter",
            config={
                "shuffle_sequences": False,  # for deterministic testing
                "num_workers": 0,
                "rollout_fragment_length": 20,
                "train_batch_size": 20,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": False,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                },
                "framework": "tf",
            })
        ppo.train()
        ppo.train()

        # first epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
            batch0, batch1 = batch1, batch0  # sort minibatches
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
        self.assertEqual(batch0["sequences"].tolist(), [
            [[0], [1], [2], [3]],
            [[4], [5], [6], [7]],
        ])
        self.assertEqual(batch1["sequences"].tolist(), [
            [[8], [9], [10], [11]],
            [[12], [13], [14], [0]],
        ])

        # second epoch: 20 observations get split into 2 minibatches of 8
        # four observations are discarded
        batch2 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
        batch3 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
        if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
            batch2, batch3 = batch3, batch2
        self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
        self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
        self.assertEqual(batch2["sequences"].tolist(), [
            [[5], [6], [7], [8]],
            [[9], [10], [11], [12]],
        ])
        self.assertEqual(batch3["sequences"].tolist(), [
            [[13], [14], [0], [0]],
            [[0], [1], [2], [3]],
        ])
Example #4
def run(n_agents=3, episode_length=40000, config=None):
    ray.init()
    tf.compat.v1.enable_v2_behavior()
    # initialize trainer
    env = ASMEnv(n_agents=n_agents)
    register_env(
        "asm",
        lambda _: ASMEnv(n_agents=n_agents, episode_length=episode_length))
    policies = {
        "govt_policy":
        (PPOTFPolicy, env.observation_space, env.govt_action_space, {}),
    }
    for idx in range(n_agents):
        policies[f"citizen_policy_{idx}"] = (PPOTFPolicy,
                                             env.observation_space,
                                             env.citizen_action_space, {})
    if config is None:
        ppo_config = {
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": list(policies.keys()),
            },
            "simple_optimizer": True,
            "observation_filter": "NoFilter",
            "framework": "tf",
        }
    else:
        ppo_config = config
    ppo_trainer = PPOTrainer(env="asm", config=ppo_config)
    print(ppo_trainer.train())
    print("DONE!")
    ray.shutdown()
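Example #4 passes a policy_mapping_fn that is defined outside the snippet. A minimal sketch of a mapping function consistent with the policy names built above, assuming a "govt" agent id and integer-indexed citizen agent ids (the actual ASMEnv id scheme may differ):

def policy_mapping_fn(agent_id):
    # Route the government agent to its policy and each citizen to its own policy.
    if agent_id == "govt":
        return "govt_policy"
    return f"citizen_policy_{agent_id}"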
Example #5
    def testBasic(self):
        ray.init(num_cpus=2)
        ppo = PPOTrainer(env="CartPole-v0",
                         config={"lr_schedule": [[0, 1e-5], [1000, 0.0]]})
        for _ in range(10):
            result = ppo.train()
        assert result["episode_reward_mean"] < 100, "should not have learned"
Example #6
    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        init_w = trainer.get_policy('policy_01').get_weights()
        trainer.restore(
            'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-25_08-56-43eo23nmho\\checkpoint_002360\\checkpoint-2360'
        )
        trainer.workers.foreach_worker(
            lambda w: w.get_policy('policy_01').set_weights(init_w))
        trainer.restore('.\\kill-policy-0\\checkpoint')
        trainer.import_model()  # NOTE: import_model() expects an import_file path; as written this call is incomplete
        iter = 0

        #def update_phase(ev):
        #    ev.foreach_env(lambda e: e.set_phase(phase))

        while True:
            iter += 1
            result = trainer.train()
            if iter % 200 == 0:
                if not os.path.exists(f'./model-{iter}'):
                    #trainer.get_policy('policy_01').export_model(f'./model-{iter}')
                    trainer.export_policy_model(f'./model-{iter}/main',
                                                'policy_01')
                    trainer.export_policy_model(f'./model-{iter}/collect',
                                                'policy_collect')
                    trainer.export_policy_model(f'./model-{iter}/destroy',
                                                'policy_destroy')
                    trainer.export_policy_model(f'./model-{iter}/kill',
                                                'policy_kill')

                else:
                    print("model already saved")
Example #7
def test_rllib_ppo_smoke():
    ray.shutdown()
    seed = 123
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    ray.init(local_mode=True)  # Runs PPO training in the same process
    register_env(
        "mlir_rl_env-v0",
        lambda env_config: make_mlir_rl_wrapper_env(env=gym.make("mlir-v0")),
    )
    config = {
        "env": "mlir_rl_env-v0",
        "framework": "torch",
        "model": {
            "fcnet_hiddens": [2, 2],
            "fcnet_activation": "relu",
        },
        "num_workers": 0,  # local worker only
        "train_batch_size": 2,
        "sgd_minibatch_size": 1,
        "num_sgd_iter": 1,
        "rollout_fragment_length": 2,
    }
    trainer = PPOTrainer(config=config)
    trainer.train()
    ray.shutdown()
Example #8
def getTrainner(args):

    config = getConfig(args)

    if args.agent == "PPO":
        trainner = PPOTrainer(config=config, env="custom-explorer")
    else:
        # Guard against returning an unbound name for unsupported agents
        raise ValueError(f"Unsupported agent: {args.agent}")

    return trainner
Example #9
async def train_agent():
    ray.init()
    trainer = PPOTrainer(config={
        "num_gpus": 0,
        "num_workers": 1
    },
                         env="CartPole-v0")
    result = trainer.train()
    return str(result)
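Because train_agent is a coroutine (presumably backing an async web endpoint), it has to be awaited. A minimal way to run it standalone, assuming no event loop is already running in the process:

import asyncio

# Drive the coroutine to completion and print the stringified training result.
print(asyncio.run(train_agent()))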
Example #10
    def test_simple_optimizer_sequencing(self):
        ModelCatalog.register_custom_model("rnn", RNNSpyModel)
        register_env("counter", lambda _: DebugCounterEnv())
        ppo = PPOTrainer(
            env="counter",
            config={
                "num_workers": 0,
                "rollout_fragment_length": 10,
                "train_batch_size": 10,
                "sgd_minibatch_size": 10,
                "vf_share_layers": True,
                "simple_optimizer": True,
                "num_sgd_iter": 1,
                "model": {
                    "custom_model": "rnn",
                    "max_seq_len": 4,
                    "state_shape": [3, 3],
                },
                "framework": "tf",
            })
        ppo.train()
        ppo.train()

        batch0 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
        self.assertEqual(
            batch0["sequences"].tolist(),
            [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]])
        self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2])
        self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
        self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
        self.assertTrue(
            np.allclose(batch0["state_in"][0].tolist()[1:],
                        batch0["state_out"][0].tolist()[:-1]))
        self.assertTrue(
            np.allclose(batch0["state_in"][1].tolist()[1:],
                        batch0["state_out"][1].tolist()[:-1]))

        batch1 = pickle.loads(
            ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
        self.assertEqual(batch1["sequences"].tolist(), [
            [[10], [11], [12], [13]],
            [[14], [0], [0], [0]],
            [[0], [1], [2], [3]],
            [[4], [0], [0], [0]],
        ])
        self.assertEqual(batch1["seq_lens"].tolist(), [4, 1, 4, 1])
        self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
        self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
        self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
        self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
Example #11
    def test_old_configs(self):
        """Tests creating various Trainers (Algorithms) using 1.10 config dicts."""
        from ray.rllib.tests.backward_compat.old_ppo import DEFAULT_CONFIG
        from ray.rllib.agents.ppo import PPOTrainer

        config = DEFAULT_CONFIG.copy()
        trainer = PPOTrainer(config=config, env="CartPole-v0")
        trainer.train()
        trainer.stop()
Example #12
def Hunter_trainer(config, reporter):
    multi_hunter_trainer = PPOTrainer(MultiHunterEnv, config)
    for _ in range(100):
        environment.simulate()
        result = multi_hunter_trainer.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = multi_hunter_trainer.save()
    multi_hunter_trainer.stop()
Example #13
    def _build_model(self):

        trainer_config = DEFAULT_CONFIG.copy()
        trainer_config['num_workers'] = 0
        trainer_config["train_batch_size"] = 640
        trainer_config["sgd_minibatch_size"] = 64
        trainer_config["num_sgd_iter"] = 10
        trainer = PPOTrainer(trainer_config, self.env_class)

        return trainer
Example #14
    def load_agent(self,
                   rllib_dir=None,
                   rand_seed=None,
                   fixed_action=1,
                   explore=False):
        """
        Load a trained RLlib agent from the specified rllib_dir. Call this before testing a trained agent.

        :param rllib_dir: Path pointing to the agent's training dir (only used for RLlib agents)
        :param rand_seed: RNG seed used by the random agent (ignored by other agents)
        :param fixed_action: Fixed action performed by the fixed agent (ignored by the others)
        :param explore: Whether to keep exploration enabled. Set to False when testing a trained RLlib agent;
        keep it True when continuing training.
        """
        checkpoint_path = None
        if self.agent_name == 'ppo':
            # turn off exploration for testing the loaded agent
            self.config['explore'] = explore
            self.agent = PPOTrainer(config=self.config, env=self.env_class)
            self.agent_path = self.get_best_checkpoint_path(rllib_dir)
            # self.agent_path = self.get_last_checkpoint_path(rllib_dir)
            self.log.info('Loading PPO agent', checkpoint=self.agent_path)
            try:
                self.agent.restore(self.agent_path)
            except (AssertionError, ValueError) as e:
                self.log.error(
                    f"Error loading agent. Mismatch of neural network size and number of UEs or env size"
                    f" when using a pretrained central DeepCoMP agent? Error: '{str(e)}'"
                )
                sys.exit()
        if self.agent_name == '3gpp':
            self.agent = Heuristic3GPP()
        if self.agent_name == 'fullcomp':
            self.agent = FullCoMP()
        if self.agent_name == 'dynamic':
            self.agent = DynamicSelection(epsilon=0.8)
        if self.agent_name == 'brute-force':
            self.agent = BruteForceAgent(self.num_workers)
        if self.agent_name == 'random':
            # instantiate the environment to get the action space
            env = self.env_class(self.env_config)
            self.agent = RandomAgent(env.action_space, seed=rand_seed)
        if self.agent_name == 'fixed':
            self.agent = FixedAgent(action=fixed_action, noop_interval=100)

        self.log.info('Agent loaded',
                      agent=type(self.agent).__name__,
                      rllib_dir=rllib_dir,
                      checkpoint=checkpoint_path)

        # set a suitable filename for saving testing videos and results later
        self.set_result_filename()

        # read the number of training steps
        self.agent_train_steps = self.get_training_steps()
Example #15
def my_train_fn(config, reporter):
    # Train for n iterations with high LR
    agent1 = PPOTrainer(env="CartPole-v0", config=config)
    for _ in range(10):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR
    config["lr"] = 0.0001
    agent2 = PPOTrainer(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(10):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
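A two-phase function trainable like my_train_fn is typically launched through Tune. A minimal sketch, with illustrative config values:

import ray
from ray import tune

if __name__ == "__main__":
    ray.init()
    # Tune invokes my_train_fn(config, reporter) and tracks the reported metrics.
    tune.run(my_train_fn, config={"lr": 0.01, "num_workers": 0})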
Example #16
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)

    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)

    chk_freq = 10

    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".
            format(experiment_name, experiment_id, checkpoint_num,
                   checkpoint_num))
        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        chk_freq = 1  # Only needed once at the start to save the imported model

    while True:
        result = trainer.train()
        tune.report(**result)
        if (trainer._iteration % chk_freq == 0):
            with tune.checkpoint_dir(
                    step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
Example #17
def train(num_iters):
    trainer = PPOTrainer(
        env='SUMOEnv-v0',
        config={
            'model': {
                "conv_filters": [
                    [32, [4, 4], 8],
                    [64, [2, 2], 4],
                ],
            },
            'multiagent': {
                'policy_graphs': {
                    'cluster_648538736_648538737':
                    (PPOPolicyGraph, Box(low=0., high=1., shape=(32, 32, 1)),
                     Discrete(n=5), {}),
                    '49228579': (PPOPolicyGraph,
                                 Box(low=0., high=1.,
                                     shape=(32, 32, 1)), Discrete(n=4), {}),
                    'cluster_2511020106_49297289':
                    (PPOPolicyGraph, Box(low=0., high=1., shape=(32, 32, 1)),
                     Discrete(n=4), {}),
                    'cluster_298135838_49135231': (PPOPolicyGraph,
                                                   Box(low=0.,
                                                       high=1.,
                                                       shape=(32, 32, 1)),
                                                   Discrete(n=3), {}),
                    'cluster_290051904_49145925': (PPOPolicyGraph,
                                                   Box(low=0.,
                                                       high=1.,
                                                       shape=(32, 32, 1)),
                                                   Discrete(n=5), {}),
                    'cluster_290051912_298136030_648538909':
                    (PPOPolicyGraph, Box(low=0., high=1., shape=(32, 32, 1)),
                     Discrete(n=3), {}),
                    'cluster_2511020102_2511020103_290051922_298135886':
                    (PPOPolicyGraph, Box(low=0., high=1., shape=(32, 32, 1)),
                     Discrete(n=4), {}),
                },
                'policy_mapping_fn': function(lambda agent_id: agent_id),
            },
            'callbacks': {
                'on_episode_end': function(on_episode_end),
            },
            # 'num_workers': 4,
            # 'num_gpus_per_worker': 0.25,  # All workers on a single GPU
            # 'timesteps_per_iteration': 16000,
        })

    for i in range(num_iters):
        print(f'== Iteration {i}==')
        print(pretty_print(trainer.train()))
Example #18
def load_agent():

    # Initialize training environment

    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRaySinglePlayerEnvironment(board_size=13,
                                              num_players=4,
                                              agent=agent)

    env = environment_creater()
    tune.register_env("tron_single_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    #config["timesteps_per_iteration"] = 1024
    #config['target_network_update_freq'] = 256
    #config['buffer_size'] = 100_000
    #config['schedule_max_timesteps'] = 200_000
    #config['exploration_fraction'] = 0.02
    #config['compress_observations'] = False
    #config['n_step'] = 2
    #config['seed'] = SEED

    #Configure for PPO
    #config["sample_batch_size"]= 100
    #config["train_batch_size"]=200
    #config["sgd_minibatch_size"]=60
    #Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # Begin training or evaluation
    #trainer = DDPGTrainer(config, "tron_single_player")
    #trainer = A3CTrainer(config, "tron_single_player")
    #trainer = DQNTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_single_player")

    trainer.restore("./ppo_checkpoint_201/checkpoint-201")

    return trainer  #.get_policy("trainer")
Example #19
def train_ppo(config, reporter):
    agent = PPOTrainer(config)
    # agent.restore("/path/checkpoint_41/checkpoint-41")  # continue training
    i = 0
    while True:
        result = agent.train()
        if reporter is None:
            continue
        else:
            reporter(**result)
        if i % 10 == 0:  # save every 10th training iteration
            checkpoint_path = agent.save()
            print(checkpoint_path)
        i += 1
Example #20
    def test_ppo_sample_waste(self):
        # Check we at least collect the initial wave of samples
        ppo = PPOTrainer(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "train_batch_size": 128,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
        ppo.stop()

        # Check we collect at least the specified amount of samples
        ppo = PPOTrainer(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "train_batch_size": 900,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
        ppo.stop()

        # Check in vectorized mode
        ppo = PPOTrainer(
            env="CartPole-v0",
            config={
                "sample_batch_size": 200,
                "num_envs_per_worker": 2,
                "train_batch_size": 900,
                "num_workers": 3,
            })
        ppo.train()
        self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
        ppo.stop()
Example #21
    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        #trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-16_09-20-44984tj3ip\\checkpoint_002770\\checkpoint-2770')
        iter = 0

        #def update_phase(ev):
        #    ev.foreach_env(lambda e: e.set_phase(phase))

        while True:
            iter += 1
            result = trainer.train()
            if iter % 200 == 1:
                if not os.path.exists(f'./model-{iter}'):
                    trainer.get_policy('policy_01').export_model(f'./model-{iter}')
                else:
                    print("model already saved")
Example #22
    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        trainer.restore(
            'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-26_20-15-082mjvde9i\\checkpoint_008980\\checkpoint-8980'
        )
        iter = 0

        while True:
            iter += 1
            result = trainer.train()
            if iter % 200 == 0:
                if not os.path.exists(f'./model-{iter}'):
                    trainer.get_policy('policy_01').export_model(
                        f'./model-{iter}')
                else:
                    print("model already saved")
Example #23
def main():
    ray.init()

    # Hyperparameters of PPO are not well tuned; most of them follow https://github.com/xtma/pytorch_car_caring/blob/master/train.py
    trainer = PPOTrainer(env=MyEnv,
                         config={
                             "use_pytorch": True,
                             "model": {
                                 "custom_model": "mymodel",
                                 "custom_options": {
                                     'encoder_path': args.encoder_path,
                                     'train_encoder': args.train_encoder
                                 },
                                 "custom_action_dist": "mydist",
                             },
                             "env_config": {
                                 'game': 'CarRacing'
                             },
                             "num_workers": args.num_workers,
                             "num_envs_per_worker": args.num_envs_per_worker,
                             "num_gpus": args.num_gpus,
                             "use_gae": args.use_gae,
                             "batch_mode": args.batch_mode,
                             "vf_loss_coeff": args.vf_loss_coeff,
                             "vf_clip_param": args.vf_clip_param,
                             "lr": args.lr,
                             "kl_coeff": args.kl_coeff,
                             "num_sgd_iter": args.num_sgd_iter,
                             "grad_clip": args.grad_clip,
                             "clip_param": args.clip_param,
                             "rollout_fragment_length":
                             args.rollout_fragment_length,
                             "train_batch_size": args.train_batch_size,
                             "sgd_minibatch_size": args.sgd_minibatch_size
                         })

    for i in range(args.train_epochs):
        trainer.train()
        print("%d Train Done" % (i), "Save Freq: %d" % (args.model_save_freq))
        if (i + 1) % args.model_save_freq == 0:
            print("%d Episodes Done" % (i))
            weights = trainer.get_policy().get_weights()
            torch.save(weights, args.model_save_path + "%d-mode.pt" % (i + 1))
    trainer.save(args.trainer_save_path)
    print("Done All!")
    trainer.stop()
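The weights checkpointed with torch.save above can later be pushed back into a policy. A hedged sketch, assuming a trainer has been rebuilt with the same config and that args is still in scope (the iteration index 100 is only illustrative):

import torch

# Load weights saved by the loop above and restore them into the trainer's policy.
weights = torch.load(args.model_save_path + "100-mode.pt")
trainer.get_policy().set_weights(weights)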
Example #24
    def train(config, checkpoint_dir=None):
        trainer = PPOTrainer(config=config, env='BomberMan-v0')
        #trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-22_10-57-05mz9533ge\\checkpoint_000140\\checkpoint-140')
        iter = 0

        #def update_phase(ev):
        #    ev.foreach_env(lambda e: e.set_phase(phase))

        while True:
            iter += 1
            result = trainer.train()
            if iter % 250 == 1:
                if not os.path.exists(f'./model-{iter}-ckpt'):
                    #trainer.export_policy_model(f'./model-{iter}/kill', 'policy_kill')
                    trainer.export_model('h5', f'./model-{iter}')
                else:
                    trainer.import_model(f'./model-{iter}')
                    print("model already saved")
Example #25
def train_model(args):
    # We are using custom model and environment, which need to be registered in ray/rllib
    # Names can be anything.
    register_env("DuckieTown-MultiMap", lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define trainer. Apart from env, config/framework and config/model, which are common among trainers.
    trainer = PPOTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ppo",
            },
            "sgd_minibatch_size": 64,
            "output": None,
            "compress_observations": True,
            "num_workers": 0,
        }
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('ppo_agent')
    for i in range(args.epochs):  # Number of episodes (basically epochs)
        print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() trains only a single episode
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('PPO DuckieTown-MultiMap')
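The "image-ppo" custom model referenced in the config must be registered with the ModelCatalog before the trainer is built. A minimal sketch, where ImagePPOModel stands in for the project's actual model class:

from ray.rllib.models import ModelCatalog

# Register the custom model under the name used in the trainer config above.
ModelCatalog.register_custom_model("image-ppo", ImagePPOModel)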
Example #26
def build_bot():
    ray.init(local_mode=True)
    trainer = PPOTrainer(env=ExternalAtari, config=dict(**CONFIG_PPO))
    model_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'ckpts')
    last_iter = 0
    for name in os.listdir(model_dir):
        print(name)
        it = int(name.split('_')[1])
        if it > last_iter:
            last_iter = it
    print(
        os.listdir(
            os.path.join(os.path.abspath(os.path.dirname(__file__)),
                         'ckpts/checkpoint_{}'.format(last_iter))))
    trainer.restore(
        os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            'ckpts/checkpoint_{}/checkpoint-{}'.format(last_iter, last_iter)))
    return trainer
Example #27
def get_trainer(checkpoint_path=None, extra_config=None, num_workers=10):
    config = dict(
        num_gpus=0,
        num_workers=num_workers,
        num_cpus_per_worker=1,
        horizon=1000,
        lr=0.0,
        batch_mode="complete_episodes",
        callbacks=DrivingCallbacks,
        # explore=False,  # Add this line to only use mean for action.

        # Setup the correct environment
        env=GeneralizationRacing,
        env_config=dict(environment_num=10000))
    if extra_config:
        config.update(extra_config)
    trainer = PPOTrainer(config=config)
    if checkpoint_path is not None:
        trainer.restore(os.path.expanduser(checkpoint_path))
    return trainer
Example #28
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # Simulator env uses a single map, so better for evaluation/testing.
        # DiscreteWrapper just converts wheel velocities to high level discrete actions.
        return DiscreteWrapper(simulator.Simulator(
            map_name=args.map,
            max_steps=2000,
        ))

    # Rather than reuse the env, another one is created later because I can't
    # figure out how to provide register_env with an existing env object.
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = PPOTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ppo",
            },
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
Example #29
    def build_model(self):

        trainer_config = DEFAULT_CONFIG.copy()

        trainer_config["num_workers"] = 0
        # trainer_config["train_batch_size"] = 640
        # trainer_config["sgd_minibatch_size"] = 160
        # trainer_config["num_sgd_iter"] = 100

        trainer_config["exploration_config"] = {
            "type": "Random",
        }
        # EpsilonGreedy(Exploration):
        # trainer_config["exploration_config"] = {
        #         "type": "Curiosity",
        #         "eta": 0.2,
        #         "lr": 0.001,
        #         "feature_dim": 128,
        #         "feature_net_config": {
        #             "fcnet_hiddens": [],
        #             "fcnet_activation": "relu",
        #         },
        #         "sub_exploration": {
        #             "type": "StochasticSampling",
        #         }
        #     }

        # trainer_config["log_level"] = "DEBUG"
        """
        if env_config is not None:
            for x in env_config.keys():
                trainer_config[x] = env_config[x]
        """

        # trainer_config["env_config"] = copy.deepcopy(env_config)  #  {"rules": "qiyang_role"}

        trainer_config.update(self.agent_config)

        self.trainer = PPOTrainer(trainer_config, self.agent_config["env"])
        # self.config["trainer"] = self.trainer
        return self.trainer
Example #30
def ray_server(run='PPO', address=ADDRESS, port=PORT):
    print(ray.init(log_to_driver=False))

    connector_config = {
        "input": (lambda ioctx: PolicyServerInput(ioctx, address, port)),
        "num_workers": 0,
        "input_evaluation": [],
        "create_env_on_driver": False,
        "num_gpus": FLAGS.num_gpus,
    }

    if run == "DQN":
        trainer = DQNTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_DQN))
    elif run == "PPO":
        trainer = PPOTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_PPO))
    else:
        raise ValueError("--run must be DQN or PPO")

    i = 0
    while i < FLAGS.iter:
        i += 1
        print(pretty_print(trainer.train()))
    ray.shutdown()

    checkpoint = trainer.save("{}/ckpts".format(FLAGS.train_url.rstrip('/')))
    print("checkpoint saved at", checkpoint)
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "config.json"),
        os.path.join(FLAGS.train_url, "config.json"))
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "customize_service.py"),
        os.path.join(FLAGS.train_url, "customize_service.py"))
    mox.file.copy(os.path.join(FLAGS.data_url, "rl_config.py"),
                  os.path.join(FLAGS.train_url, "rl_config.py"))

    del trainer
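Example #30 only covers the server side; an external environment would connect through RLlib's PolicyClient. A minimal hedged client loop (the env object and its gym-style API are assumptions, not part of the original snippet):

from ray.rllib.env.policy_client import PolicyClient

# Connect to the PolicyServerInput started by ray_server() above.
client = PolicyClient(f"http://{ADDRESS}:{PORT}", inference_mode="remote")

obs = env.reset()  # `env` is whatever external environment is being driven
episode_id = client.start_episode()
done = False
while not done:
    action = client.get_action(episode_id, obs)
    obs, reward, done, info = env.step(action)
    client.log_returns(episode_id, reward)
client.end_episode(episode_id, obs)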