def test_old_configs(self):
    """Tests creating various Trainers (Algorithms) using 1.10 config dicts."""
    from ray.rllib.tests.backward_compat.old_ppo import DEFAULT_CONFIG
    from ray.rllib.agents.ppo import PPOTrainer

    config = DEFAULT_CONFIG.copy()
    trainer = PPOTrainer(config=config, env="CartPole-v0")
    trainer.train()
    trainer.stop()
def Hunter_trainer(config, reporter):
    # Trainer's first positional argument is the config, not the env, so pass
    # both explicitly as keyword arguments.
    multi_hunter_trainer = PPOTrainer(config=config, env=MultiHunterEnv)
    for _ in range(100):
        environment.simulate()  # assumes a globally defined `environment`
        result = multi_hunter_trainer.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = multi_hunter_trainer.save()
    multi_hunter_trainer.stop()
def main():
    ray.init()
    # PPO hyperparameters are not well tuned; most of them follow
    # https://github.com/xtma/pytorch_car_caring/blob/master/train.py
    trainer = PPOTrainer(
        env=MyEnv,
        config={
            "use_pytorch": True,
            "model": {
                "custom_model": "mymodel",
                "custom_options": {
                    "encoder_path": args.encoder_path,
                    "train_encoder": args.train_encoder,
                },
                "custom_action_dist": "mydist",
            },
            "env_config": {"game": "CarRacing"},
            "num_workers": args.num_workers,
            "num_envs_per_worker": args.num_envs_per_worker,
            "num_gpus": args.num_gpus,
            "use_gae": args.use_gae,
            "batch_mode": args.batch_mode,
            "vf_loss_coeff": args.vf_loss_coeff,
            "vf_clip_param": args.vf_clip_param,
            "lr": args.lr,
            "kl_coeff": args.kl_coeff,
            "num_sgd_iter": args.num_sgd_iter,
            "grad_clip": args.grad_clip,
            "clip_param": args.clip_param,
            "rollout_fragment_length": args.rollout_fragment_length,
            "train_batch_size": args.train_batch_size,
            "sgd_minibatch_size": args.sgd_minibatch_size,
        },
    )
    for i in range(args.train_epochs):
        trainer.train()
        print("%d Train Done" % i, "Save Freq: %d" % args.model_save_freq)
        if (i + 1) % args.model_save_freq == 0:
            print("%d Episodes Done" % i)
            weights = trainer.get_policy().get_weights()
            torch.save(weights, args.model_save_path + "%d-mode.pt" % (i + 1))
            trainer.save(args.trainer_save_path)
    print("Done All!")
    trainer.stop()
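# The "mymodel" and "mydist" strings used in the config above must be
# registered with RLlib's ModelCatalog before the trainer is built. A minimal
# sketch, assuming `MyModel` and `MyDist` are the custom model / action
# distribution classes defined elsewhere in this project:
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_model("mymodel", MyModel)
ModelCatalog.register_custom_action_dist("mydist", MyDist)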
def my_train_fn(config, reporter):
    # Train for n iterations with high LR.
    agent1 = PPOTrainer(env="CartPole-v0", config=config)
    for _ in range(10):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR.
    config["lr"] = 0.0001
    agent2 = PPOTrainer(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(10):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
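# A possible driver for the two-phase function above, running it as a Tune
# trainable. This is a sketch: the starting "lr" and "num_workers" values are
# assumptions, and resources_per_trial may need to be set to match the PPO
# worker count in a real run.
import ray
from ray import tune

if __name__ == "__main__":
    ray.init()
    config = {
        "lr": 0.01,
        "num_workers": 0,
    }
    tune.run(my_train_fn, config=config)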
def test_ppo_sample_waste(self):
    # Check we at least collect the initial wave of samples.
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "rollout_fragment_length": 200,
            "train_batch_size": 128,
            "num_workers": 3,
        })
    result = ppo.train()
    self.assertEqual(result["info"]["num_steps_sampled"], 600)
    ppo.stop()

    # Check we collect at least the specified amount of samples.
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "rollout_fragment_length": 200,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    result = ppo.train()
    self.assertEqual(result["info"]["num_steps_sampled"], 1200)
    ppo.stop()

    # Check in vectorized mode.
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "rollout_fragment_length": 200,
            "num_envs_per_worker": 2,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    result = ppo.train()
    self.assertEqual(result["info"]["num_steps_sampled"], 1200)
    ppo.stop()
def testPPOSampleWaste(self):
    ray.init(num_cpus=4)

    # Check we at least collect the initial wave of samples.
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 128,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
    ppo.stop()

    # Check we collect at least the specified amount of samples.
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
    ppo.stop()

    # Check in vectorized mode.
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "num_envs_per_worker": 2,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
    ppo.stop()

    # Check legacy mode.
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 128,
            "num_workers": 3,
            "straggler_mitigation": True,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 200)
    ppo.stop()
def test_ppo_sample_waste(self):
    # Check we at least collect the initial wave of samples.
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 128,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
    ppo.stop()

    # Check we collect at least the specified amount of samples.
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
    ppo.stop()

    # Check in vectorized mode.
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "num_envs_per_worker": 2,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
    ppo.stop()
agent_cfg["shuffle_sequences"] = True  # Whether to shuffle sequences in the batch when training
agent_cfg["grad_clip"] = None  # Clamp the norm of the gradient during optimization (None to disable)

# ====================== Run the optimization ======================
agent_cfg["lr"] = 1.0e-4
agent_cfg["lr_schedule"] = None
train_agent = Trainer(agent_cfg, "env", logger_creator)
checkpoint_path = train(train_agent, max_timesteps=100000)

# ===================== Enjoy the trained agent ======================
test_agent = Trainer(agent_cfg, "env", logger_creator)
test_agent.restore(checkpoint_path)
test(test_agent, explore=False)

# =================== Terminate the Ray backend ====================
train_agent.stop()
test_agent.stop()
ray.shutdown()
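# The `train` and `test` helpers called above are not shown in this snippet.
# A minimal sketch of what they might look like (names and signatures are
# assumptions, not RLlib APIs): train() loops until a timestep budget is
# reached and returns the last checkpoint path; test() rolls out one episode
# on the local worker's env, optionally without exploration.
def train(agent, max_timesteps):
    checkpoint_path = None
    timesteps = 0
    while timesteps < max_timesteps:
        result = agent.train()
        timesteps = result["timesteps_total"]
        checkpoint_path = agent.save()
    return checkpoint_path


def test(agent, explore=True):
    env = agent.workers.local_worker().env
    obs = env.reset()
    done, total_reward = False, 0.0
    while not done:
        action = agent.compute_action(obs, explore=explore)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    print("Test episode reward:", total_reward)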
def test_local(self):
    cf = DEFAULT_CONFIG.copy()
    for _ in framework_iterator(cf):
        agent = PPOTrainer(cf, "CartPole-v0")
        print(agent.train())
        agent.stop()
# Create a new dummy Trainer to "fix" our checkpoint.
new_trainer = PPOTrainer(config=config)
# Get untrained weights for all policies.
untrained_weights = new_trainer.get_weights()
# Restore all policies from checkpoint.
new_trainer.restore(best_checkpoint)
# Set back all weights (except for 1st agent) to original
# untrained weights.
new_trainer.set_weights(
    {pid: w for pid, w in untrained_weights.items() if pid != "policy_0"})
# Create the checkpoint from which tune can pick up the experiment.
new_checkpoint = new_trainer.save()
new_trainer.stop()
print(".. checkpoint to restore from (all policies reset, "
      f"except policy_0): {new_checkpoint}")

print("Starting new tune.run")

# Start our actual experiment.
stop = {
    "episode_reward_mean": args.stop_reward,
    "timesteps_total": args.stop_timesteps,
    "training_iteration": args.stop_iters,
}

# Make sure the non-1st policies are not updated anymore.
config["multiagent"]["policies_to_train"] = [
    pid for pid in policy_ids if pid != "policy_0"
]
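# One way the `best_checkpoint` used above could have been obtained; this is
# a sketch that assumes `results` is the ExperimentAnalysis object returned
# by an earlier tune.run() over the same experiment.
best_trial = results.get_best_trial("episode_reward_mean", mode="max")
best_checkpoint = results.get_best_checkpoint(
    best_trial, metric="episode_reward_mean", mode="max")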
trainer = PPOTrainer(
    env="rcrs_env",
    config={
        "env": "rcrs_env",
        "num_workers": 1,
        "multiagent": {
            "policies": {
                "fb_1": (None, obs_space, act_space, {}),
                "fb_2": (None, obs_space, act_space, {}),
            },
            "policy_mapping_fn": lambda agent_id: "fb_1"
            if agent_id.startswith("fb_1_")
            else random.choice(["fb_1", "fb_2"]),
        },
    },
)

for i in range(2):
    result = trainer.train()
    print(pretty_print(result))
    if i % 1 == 0:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)

state = trainer.save()
trainer.stop()
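# A hedged follow-up to the loop above: restore the last saved checkpoint into
# a fresh trainer for evaluation. `rcrs_config` stands for the same config
# dict used above and `obs` for a valid observation of an "fb_1_*" agent;
# both are assumptions for illustration only.
eval_trainer = PPOTrainer(env="rcrs_env", config=rcrs_config)
eval_trainer.restore(checkpoint)
action = eval_trainer.compute_action(obs, policy_id="fb_1")
print("fb_1 action:", action)
eval_trainer.stop()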