def test_ppo_sample_waste(self):
    # Check we at least collect the initial wave of samples
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "rollout_fragment_length": 200,
            "train_batch_size": 128,
            "num_workers": 3,
        })
    result = ppo.train()
    self.assertEqual(result["info"]["num_steps_sampled"], 600)
    ppo.stop()

    # Check we collect at least the specified amount of samples
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "rollout_fragment_length": 200,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    result = ppo.train()
    self.assertEqual(result["info"]["num_steps_sampled"], 1200)
    ppo.stop()

    # Check in vectorized mode
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "rollout_fragment_length": 200,
            "num_envs_per_worker": 2,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    result = ppo.train()
    self.assertEqual(result["info"]["num_steps_sampled"], 1200)
    ppo.stop()
def testPPOSampleWaste(self):
    ray.init(num_cpus=4)
    # Check we at least collect the initial wave of samples
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 128,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
    ppo.stop()

    # Check we collect at least the specified amount of samples
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
    ppo.stop()

    # Check in vectorized mode
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "num_envs_per_worker": 2,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
    ppo.stop()

    # Check legacy mode
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 128,
            "num_workers": 3,
            "straggler_mitigation": True,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 200)
    ppo.stop()
def test_minibatch_sequencing(self):
    ModelCatalog.register_custom_model("rnn", RNNSpyModel)
    register_env("counter", lambda _: DebugCounterEnv())
    ppo = PPOTrainer(
        env="counter",
        config={
            "shuffle_sequences": False,  # for deterministic testing
            "num_workers": 0,
            "rollout_fragment_length": 20,
            "train_batch_size": 20,
            "sgd_minibatch_size": 10,
            "vf_share_layers": True,
            "simple_optimizer": False,
            "num_sgd_iter": 1,
            "model": {
                "custom_model": "rnn",
                "max_seq_len": 4,
            },
            "framework": "tf",
        })
    ppo.train()
    ppo.train()

    # first epoch: 20 observations get split into 2 minibatches of 8
    # four observations are discarded
    batch0 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
    batch1 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
    if batch0["sequences"][0][0][0] > batch1["sequences"][0][0][0]:
        batch0, batch1 = batch1, batch0  # sort minibatches
    self.assertEqual(batch0["seq_lens"].tolist(), [4, 4])
    self.assertEqual(batch1["seq_lens"].tolist(), [4, 3])
    self.assertEqual(batch0["sequences"].tolist(), [
        [[0], [1], [2], [3]],
        [[4], [5], [6], [7]],
    ])
    self.assertEqual(batch1["sequences"].tolist(), [
        [[8], [9], [10], [11]],
        [[12], [13], [14], [0]],
    ])

    # second epoch: 20 observations get split into 2 minibatches of 8
    # four observations are discarded
    batch2 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_2"))
    batch3 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_3"))
    if batch2["sequences"][0][0][0] > batch3["sequences"][0][0][0]:
        batch2, batch3 = batch3, batch2
    self.assertEqual(batch2["seq_lens"].tolist(), [4, 4])
    self.assertEqual(batch3["seq_lens"].tolist(), [2, 4])
    self.assertEqual(batch2["sequences"].tolist(), [
        [[5], [6], [7], [8]],
        [[9], [10], [11], [12]],
    ])
    self.assertEqual(batch3["sequences"].tolist(), [
        [[13], [14], [0], [0]],
        [[0], [1], [2], [3]],
    ])
def run(n_agents=3, episode_length=40000, config=None):
    ray.init()
    tf.compat.v1.enable_v2_behavior()

    # initialize trainer
    env = ASMEnv(n_agents=n_agents)
    register_env(
        "asm",
        lambda _: ASMEnv(n_agents=n_agents, episode_length=episode_length))
    policies = {
        "govt_policy": (PPOTFPolicy, env.observation_space,
                        env.govt_action_space, {}),
    }
    for idx in range(n_agents):
        policies[f"citizen_policy_{idx}"] = (PPOTFPolicy,
                                             env.observation_space,
                                             env.citizen_action_space, {})
    if config is None:
        ppo_config = {
            "multiagent": {
                "policies": policies,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": list(policies.keys()),
            },
            "simple_optimizer": True,
            "observation_filter": "NoFilter",
            "framework": "tf",
        }
    else:
        ppo_config = config
    ppo_trainer = PPOTrainer(env="asm", config=ppo_config)
    print(ppo_trainer.train())
    print("DONE!")
    ray.shutdown()
def testBasic(self):
    ray.init(num_cpus=2)
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={"lr_schedule": [[0, 1e-5], [1000, 0.0]]})
    for _ in range(10):
        result = ppo.train()
    assert result["episode_reward_mean"] < 100, "should not have learned"
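# --- Editor's note (hedged illustration, not part of the original test) ---
# `lr_schedule` above is interpreted by RLlib as a piecewise schedule over
# timesteps with linear interpolation between points. A minimal standalone
# sketch of how [[0, 1e-5], [1000, 0.0]] behaves, assuming only two points:
def scheduled_lr(t, schedule=((0, 1e-5), (1000, 0.0))):
    (t0, v0), (t1, v1) = schedule
    if t >= t1:
        return v1  # schedule stays flat after the last point
    # linear interpolation between the two schedule points
    return v0 + (v1 - v0) * (t - t0) / (t1 - t0)

assert scheduled_lr(0) == 1e-5
assert scheduled_lr(2000) == 0.0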
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config, env='BomberMan-v0')
    init_w = trainer.get_policy('policy_01').get_weights()
    trainer.restore(
        'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-25_08-56-43eo23nmho\\checkpoint_002360\\checkpoint-2360'
    )
    trainer.workers.foreach_worker(
        lambda w: w.get_policy('policy_01').set_weights(init_w))
    trainer.restore('.\\kill-policy-0\\checkpoint')
    trainer.import_model()
    iter = 0

    #def update_phase(ev):
    #    ev.foreach_env(lambda e: e.set_phase(phase))

    while True:
        iter += 1
        result = trainer.train()
        if iter % 200 == 0:
            if not os.path.exists(f'./model-{iter}'):
                #trainer.get_policy('policy_01').export_model(f'./model-{iter}')
                trainer.export_policy_model(f'./model-{iter}/main', 'policy_01')
                trainer.export_policy_model(f'./model-{iter}/collect', 'policy_collect')
                trainer.export_policy_model(f'./model-{iter}/destroy', 'policy_destroy')
                trainer.export_policy_model(f'./model-{iter}/kill', 'policy_kill')
            else:
                print("model already saved")
def test_rllib_ppo_smoke():
    ray.shutdown()
    seed = 123
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    ray.init(local_mode=True)  # Runs PPO training in the same process
    register_env(
        "mlir_rl_env-v0",
        lambda env_config: make_mlir_rl_wrapper_env(env=gym.make("mlir-v0")),
    )
    config = {
        "env": "mlir_rl_env-v0",
        "framework": "torch",
        "model": {
            "fcnet_hiddens": [2, 2],
            "fcnet_activation": "relu",
        },
        "num_workers": 0,  # local worker only
        "train_batch_size": 2,
        "sgd_minibatch_size": 1,
        "num_sgd_iter": 1,
        "rollout_fragment_length": 2,
    }
    trainer = PPOTrainer(config=config)
    trainer.train()
    ray.shutdown()
def getTrainner(args):
    config = getConfig(args)
    if args.agent == "PPO":
        trainner = PPOTrainer(config=config, env="custom-explorer")
    return trainner
async def train_agent():
    ray.init()
    trainer = PPOTrainer(
        config={
            "num_gpus": 0,
            "num_workers": 1
        },
        env="CartPole-v0")
    result = trainer.train()
    return str(result)
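# --- Editor's note (hedged usage sketch, not from the original source) ---
# One way to drive the coroutine above from synchronous code; assumes Python 3.7+
# and that `train_agent` is defined in the current module as shown above.
import asyncio

if __name__ == "__main__":
    result_str = asyncio.run(train_agent())
    print(result_str)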
def test_simple_optimizer_sequencing(self):
    ModelCatalog.register_custom_model("rnn", RNNSpyModel)
    register_env("counter", lambda _: DebugCounterEnv())
    ppo = PPOTrainer(
        env="counter",
        config={
            "num_workers": 0,
            "rollout_fragment_length": 10,
            "train_batch_size": 10,
            "sgd_minibatch_size": 10,
            "vf_share_layers": True,
            "simple_optimizer": True,
            "num_sgd_iter": 1,
            "model": {
                "custom_model": "rnn",
                "max_seq_len": 4,
                "state_shape": [3, 3],
            },
            "framework": "tf",
        })
    ppo.train()
    ppo.train()

    batch0 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_0"))
    self.assertEqual(
        batch0["sequences"].tolist(),
        [[[0], [1], [2], [3]], [[4], [5], [6], [7]], [[8], [9], [0], [0]]])
    self.assertEqual(batch0["seq_lens"].tolist(), [4, 4, 2])
    self.assertEqual(batch0["state_in"][0][0].tolist(), [0, 0, 0])
    self.assertEqual(batch0["state_in"][1][0].tolist(), [0, 0, 0])
    self.assertGreater(abs(np.sum(batch0["state_in"][0][1])), 0)
    self.assertGreater(abs(np.sum(batch0["state_in"][1][1])), 0)
    self.assertTrue(
        np.allclose(batch0["state_in"][0].tolist()[1:],
                    batch0["state_out"][0].tolist()[:-1]))
    self.assertTrue(
        np.allclose(batch0["state_in"][1].tolist()[1:],
                    batch0["state_out"][1].tolist()[:-1]))

    batch1 = pickle.loads(
        ray.experimental.internal_kv._internal_kv_get("rnn_spy_in_1"))
    self.assertEqual(batch1["sequences"].tolist(), [
        [[10], [11], [12], [13]],
        [[14], [0], [0], [0]],
        [[0], [1], [2], [3]],
        [[4], [0], [0], [0]],
    ])
    self.assertEqual(batch1["seq_lens"].tolist(), [4, 1, 4, 1])
    self.assertEqual(batch1["state_in"][0][2].tolist(), [0, 0, 0])
    self.assertEqual(batch1["state_in"][1][2].tolist(), [0, 0, 0])
    self.assertGreater(abs(np.sum(batch1["state_in"][0][0])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][0])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][0][1])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][1])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][0][3])), 0)
    self.assertGreater(abs(np.sum(batch1["state_in"][1][3])), 0)
def test_old_configs(self):
    """Tests creating various Trainers (Algorithms) using 1.10 config dicts."""
    from ray.rllib.tests.backward_compat.old_ppo import DEFAULT_CONFIG
    from ray.rllib.agents.ppo import PPOTrainer

    config = DEFAULT_CONFIG.copy()
    trainer = PPOTrainer(config=config, env="CartPole-v0")
    trainer.train()
    trainer.stop()
def Hunter_trainer(config, reporter):
    multi_hunter_trainer = PPOTrainer(MultiHunterEnv, config)
    for _ in range(100):
        environment.simulate()
        result = multi_hunter_trainer.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = multi_hunter_trainer.save()
    multi_hunter_trainer.stop()
def _build_model(self):
    trainer_config = DEFAULT_CONFIG.copy()
    trainer_config['num_workers'] = 0
    trainer_config["train_batch_size"] = 640
    trainer_config["sgd_minibatch_size"] = 64
    trainer_config["num_sgd_iter"] = 10
    trainer = PPOTrainer(trainer_config, self.env_class)
    return trainer
def load_agent(self, rllib_dir=None, rand_seed=None, fixed_action=1, explore=False):
    """
    Load a trained RLlib agent from the specified rllib_path. Call this before testing a trained agent.

    :param rllib_dir: Path pointing to the agent's training dir (only used for RLlib agents)
    :param rand_seed: RNG seed used by the random agent (ignored by other agents)
    :param fixed_action: Fixed action performed by the fixed agent (ignored by the others)
    :param explore: Whether to keep exploration enabled. Set to False when testing an RLlib agent.
        True for continuing training.
    """
    checkpoint_path = None
    if self.agent_name == 'ppo':
        # turn off exploration for testing the loaded agent
        self.config['explore'] = explore
        self.agent = PPOTrainer(config=self.config, env=self.env_class)
        self.agent_path = self.get_best_checkpoint_path(rllib_dir)
        # self.agent_path = self.get_last_checkpoint_path(rllib_dir)
        self.log.info('Loading PPO agent', checkpoint=self.agent_path)
        try:
            self.agent.restore(self.agent_path)
        except (AssertionError, ValueError) as e:
            self.log.error(
                f"Error loading agent. Mismatch of neural network size and number of UEs or env size"
                f" when using a pretrained central DeepCoMP agent? Error: '{str(e)}'"
            )
            sys.exit()
    if self.agent_name == '3gpp':
        self.agent = Heuristic3GPP()
    if self.agent_name == 'fullcomp':
        self.agent = FullCoMP()
    if self.agent_name == 'dynamic':
        self.agent = DynamicSelection(epsilon=0.8)
    if self.agent_name == 'brute-force':
        self.agent = BruteForceAgent(self.num_workers)
    if self.agent_name == 'random':
        # instantiate the environment to get the action space
        env = self.env_class(self.env_config)
        self.agent = RandomAgent(env.action_space, seed=rand_seed)
    if self.agent_name == 'fixed':
        self.agent = FixedAgent(action=fixed_action, noop_interval=100)

    self.log.info('Agent loaded', agent=type(self.agent).__name__,
                  rllib_dir=rllib_dir, checkpoint=checkpoint_path)
    # set a suitable filename for saving testing videos and results later
    self.set_result_filename()
    # read the number of training steps
    self.agent_train_steps = self.get_training_steps()
def my_train_fn(config, reporter):
    # Train for n iterations with high LR
    agent1 = PPOTrainer(env="CartPole-v0", config=config)
    for _ in range(10):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR
    config["lr"] = 0.0001
    agent2 = PPOTrainer(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(10):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
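# --- Editor's note (hedged usage sketch, not from the original source) ---
# A function trainable like `my_train_fn` is typically launched through Tune.
# The base config below is an assumption for illustration, not taken from the
# snippet above.
from ray import tune

if __name__ == "__main__":
    config = {"lr": 0.01, "num_workers": 0}
    tune.run(my_train_fn, config=config)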
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config)
    if checkpoint_dir:
        trainer.load_checkpoint(checkpoint_dir)
    chk_freq = 10

    if useModelFromLowLevelTrain:
        config_low["num_workers"] = 0
        config_low["num_envs_per_worker"] = 1
        config_low["num_gpus"] = 1
        agentLow = PPOTrainer(config_low)
        agentLow.restore(
            "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".
            format(experiment_name, experiment_id, checkpoint_num,
                   checkpoint_num))
        lowWeight = agentLow.get_policy().get_weights()
        highWeight = trainer.get_policy("low_level_policy").get_weights()
        lowState = agentLow.get_policy().get_state()
        importedOptState = OrderedDict([
            (k.replace("default_policy", "low_level_policy"), v)
            for k, v in lowState["_optimizer_variables"].items()
        ])
        importedPolicy = {
            hw: lowWeight[lw]
            for hw, lw in zip(highWeight.keys(), lowWeight.keys())
        }
        importedPolicy["_optimizer_variables"] = importedOptState
        trainer.get_policy("low_level_policy").set_state(importedPolicy)
        chk_freq = 1  # Only needs to run once at the start to save the imported model

    while True:
        result = trainer.train()
        tune.report(**result)
        if (trainer._iteration % chk_freq == 0):
            with tune.checkpoint_dir(step=trainer._iteration) as checkpoint_dir:
                trainer.save(checkpoint_dir)
def train(num_iters):
    trainer = PPOTrainer(
        env='SUMOEnv-v0',
        config={
            'model': {
                "conv_filters": [
                    [32, [4, 4], 8],
                    [64, [2, 2], 4],
                ],
            },
            'multiagent': {
                'policy_graphs': {
                    'cluster_648538736_648538737': (
                        PPOPolicyGraph, Box(low=0., high=1., shape=(32, 32, 1)),
                        Discrete(n=5), {}),
                    '49228579': (
                        PPOPolicyGraph, Box(low=0., high=1., shape=(32, 32, 1)),
                        Discrete(n=4), {}),
                    'cluster_2511020106_49297289': (
                        PPOPolicyGraph, Box(low=0., high=1., shape=(32, 32, 1)),
                        Discrete(n=4), {}),
                    'cluster_298135838_49135231': (
                        PPOPolicyGraph, Box(low=0., high=1., shape=(32, 32, 1)),
                        Discrete(n=3), {}),
                    'cluster_290051904_49145925': (
                        PPOPolicyGraph, Box(low=0., high=1., shape=(32, 32, 1)),
                        Discrete(n=5), {}),
                    'cluster_290051912_298136030_648538909': (
                        PPOPolicyGraph, Box(low=0., high=1., shape=(32, 32, 1)),
                        Discrete(n=3), {}),
                    'cluster_2511020102_2511020103_290051922_298135886': (
                        PPOPolicyGraph, Box(low=0., high=1., shape=(32, 32, 1)),
                        Discrete(n=4), {}),
                },
                'policy_mapping_fn': function(lambda agent_id: agent_id),
            },
            'callbacks': {
                'on_episode_end': function(on_episode_end),
            },
            # 'num_workers': 4,
            # 'num_gpus_per_worker': 0.25,  # All workers on a single GPU
            # 'timesteps_per_iteration': 16000,
        })
    for i in range(num_iters):
        print(f'== Iteration {i} ==')
        print(pretty_print(trainer.train()))
def load_agent():
    # Initialize training environment
    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRaySinglePlayerEnvironment(board_size=13, num_players=4, agent=agent)

    env = environment_creater()
    tune.register_env("tron_single_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    # config["timesteps_per_iteration"] = 1024
    # config['target_network_update_freq'] = 256
    # config['buffer_size'] = 100_000
    # config['schedule_max_timesteps'] = 200_000
    # config['exploration_fraction'] = 0.02
    # config['compress_observations'] = False
    # config['n_step'] = 2
    # config['seed'] = SEED

    # Configure for PPO
    # config["sample_batch_size"] = 100
    # config["train_batch_size"] = 200
    # config["sgd_minibatch_size"] = 60

    # Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # Begin training or evaluation
    # trainer = DDPGTrainer(config, "tron_single_player")
    # trainer = A3CTrainer(config, "tron_single_player")
    # trainer = DQNTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_single_player")

    trainer.restore("./ppo_checkpoint_201/checkpoint-201")

    return trainer  # .get_policy("trainer")
def train_ppo(config, reporter):
    agent = PPOTrainer(config)
    # agent.restore("/path/checkpoint_41/checkpoint-41")  # continue training
    i = 0
    while True:
        result = agent.train()
        if reporter is None:
            continue
        else:
            reporter(**result)
        if i % 10 == 0:  # save every 10th training iteration
            checkpoint_path = agent.save()
            print(checkpoint_path)
        i += 1
def test_ppo_sample_waste(self):
    # Check we at least collect the initial wave of samples
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 128,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 600)
    ppo.stop()

    # Check we collect at least the specified amount of samples
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1000)
    ppo.stop()

    # Check in vectorized mode
    ppo = PPOTrainer(
        env="CartPole-v0",
        config={
            "sample_batch_size": 200,
            "num_envs_per_worker": 2,
            "train_batch_size": 900,
            "num_workers": 3,
        })
    ppo.train()
    self.assertEqual(ppo.optimizer.num_steps_sampled, 1200)
    ppo.stop()
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config, env='BomberMan-v0')
    #trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-16_09-20-44984tj3ip\\checkpoint_002770\\checkpoint-2770')
    iter = 0

    #def update_phase(ev):
    #    ev.foreach_env(lambda e: e.set_phase(phase))

    while True:
        iter += 1
        result = trainer.train()
        if iter % 200 == 1:
            if not os.path.exists(f'./model-{iter}'):
                trainer.get_policy('policy_01').export_model(f'./model-{iter}')
            else:
                print("model already saved")
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config, env='BomberMan-v0')
    trainer.restore(
        'C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-26_20-15-082mjvde9i\\checkpoint_008980\\checkpoint-8980'
    )
    iter = 0
    while True:
        iter += 1
        result = trainer.train()
        if iter % 200 == 0:
            if not os.path.exists(f'./model-{iter}'):
                trainer.get_policy('policy_01').export_model(f'./model-{iter}')
            else:
                print("model already saved")
def main():
    ray.init()
    # Hyperparameters of PPO are not well tuned. Most of them refer to
    # https://github.com/xtma/pytorch_car_caring/blob/master/train.py
    trainer = PPOTrainer(
        env=MyEnv,
        config={
            "use_pytorch": True,
            "model": {
                "custom_model": "mymodel",
                "custom_options": {
                    'encoder_path': args.encoder_path,
                    'train_encoder': args.train_encoder
                },
                "custom_action_dist": "mydist",
            },
            "env_config": {
                'game': 'CarRacing'
            },
            "num_workers": args.num_workers,
            "num_envs_per_worker": args.num_envs_per_worker,
            "num_gpus": args.num_gpus,
            "use_gae": args.use_gae,
            "batch_mode": args.batch_mode,
            "vf_loss_coeff": args.vf_loss_coeff,
            "vf_clip_param": args.vf_clip_param,
            "lr": args.lr,
            "kl_coeff": args.kl_coeff,
            "num_sgd_iter": args.num_sgd_iter,
            "grad_clip": args.grad_clip,
            "clip_param": args.clip_param,
            "rollout_fragment_length": args.rollout_fragment_length,
            "train_batch_size": args.train_batch_size,
            "sgd_minibatch_size": args.sgd_minibatch_size
        })

    for i in range(args.train_epochs):
        trainer.train()
        print("%d Train Done" % (i), "Save Freq: %d" % (args.model_save_freq))
        if (i + 1) % args.model_save_freq == 0:
            print("%d Episodes Done" % (i))
            weights = trainer.get_policy().get_weights()
            torch.save(weights, args.model_save_path + "%d-mode.pt" % (i + 1))
    trainer.save(args.trainer_save_path)
    print("Done All!")
    trainer.stop()
def train(config, checkpoint_dir=None):
    trainer = PPOTrainer(config=config, env='BomberMan-v0')
    #trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-22_10-57-05mz9533ge\\checkpoint_000140\\checkpoint-140')
    iter = 0

    #def update_phase(ev):
    #    ev.foreach_env(lambda e: e.set_phase(phase))

    while True:
        iter += 1
        result = trainer.train()
        if iter % 250 == 1:
            if not os.path.exists(f'./model-{iter}-ckpt'):
                #trainer.export_policy_model(f'./model-{iter}/kill', 'policy_kill')
                trainer.export_model('h5', f'./model-{iter}')
            else:
                trainer.import_model(f'./model-{iter}')
                print("model already saved")
def train_model(args):
    # We are using custom model and environment, which need to be registered in ray/rllib
    # Names can be anything.
    register_env("DuckieTown-MultiMap", lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define trainer. Apart from env, config/framework and config/model, which are common among trainers.
    trainer = PPOTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ppo",
            },
            "sgd_minibatch_size": 64,
            "output": None,
            "compress_observations": True,
            "num_workers": 0,
        }
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('ppo_agent')
    for i in range(args.epochs):  # Number of episodes (basically epochs)
        print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() trains only a single episode
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('PPO DuckieTown-MultiMap')
def build_bot():
    ray.init(local_mode=True)
    trainer = PPOTrainer(env=ExternalAtari, config=dict(**CONFIG_PPO))
    model_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'ckpts')
    last_iter = 0
    for name in os.listdir(model_dir):
        print(name)
        it = int(name.split('_')[1])
        if it > last_iter:
            last_iter = it
    print(
        os.listdir(
            os.path.join(os.path.abspath(os.path.dirname(__file__)),
                         'ckpts/checkpoint_{}'.format(last_iter))))
    trainer.restore(
        os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            'ckpts/checkpoint_{}/checkpoint-{}'.format(last_iter, last_iter)))
    return trainer
def get_trainer(checkpoint_path=None, extra_config=None, num_workers=10):
    config = dict(
        num_gpus=0,
        num_workers=num_workers,
        num_cpus_per_worker=1,
        horizon=1000,
        lr=0.0,
        batch_mode="complete_episodes",
        callbacks=DrivingCallbacks,
        # explore=False,  # Add this line to only use mean for action.

        # Setup the correct environment
        env=GeneralizationRacing,
        env_config=dict(environment_num=10000))
    if extra_config:
        config.update(extra_config)
    trainer = PPOTrainer(config=config)
    if checkpoint_path is not None:
        trainer.restore(os.path.expanduser(checkpoint_path))
    return trainer
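# --- Editor's note (hedged usage sketch, not from the original source) ---
# A possible single-episode rollout with the restored trainer; the checkpoint
# path below is hypothetical, and `GeneralizationRacing` plus its config are
# assumed to be importable exactly as referenced in `get_trainer` above.
trainer = get_trainer(
    checkpoint_path="~/ray_results/PPO/checkpoint_100/checkpoint-100",
    num_workers=0)
env = GeneralizationRacing(dict(environment_num=10000))
obs = env.reset()
done, episode_reward = False, 0.0
while not done:
    action = trainer.compute_action(obs, explore=False)
    obs, reward, done, info = env.step(action)
    episode_reward += reward
print("episode reward:", episode_reward)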
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # Simulator env uses a single map, so better for evaluation/testing.
        # DiscreteWrapper just converts wheel velocities to high level discrete actions.
        return DiscreteWrapper(simulator.Simulator(
            map_name=args.map,
            max_steps=2000,
        ))

    # Rather than reuse the env, another one is created later because I can't
    # figure out how to provide register_env with an object.
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = PPOTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ppo",
            },
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
def build_model(self):
    trainer_config = DEFAULT_CONFIG.copy()
    trainer_config["num_workers"] = 0
    # trainer_config["train_batch_size"] = 640
    # trainer_config["sgd_minibatch_size"] = 160
    # trainer_config["num_sgd_iter"] = 100

    trainer_config["exploration_config"] = {
        "type": "Random",
    }  # EpsilonGreedy(Exploration):
    # trainer_config["exploration_config"] = {
    #     "type": "Curiosity",
    #     "eta": 0.2,
    #     "lr": 0.001,
    #     "feature_dim": 128,
    #     "feature_net_config": {
    #         "fcnet_hiddens": [],
    #         "fcnet_activation": "relu",
    #     },
    #     "sub_exploration": {
    #         "type": "StochasticSampling",
    #     }
    # }

    # trainer_config["log_level"] = "DEBUG"
    """
    if env_config is not None:
        for x in env_config.keys():
            trainer_config[x] = env_config[x]
    """
    # trainer_config["env_config"] = copy.deepcopy(env_config)  # {"rules": "qiyang_role"}
    trainer_config.update(self.agent_config)

    self.trainer = PPOTrainer(trainer_config, self.agent_config["env"])
    # self.config["trainer"] = self.trainer
    return self.trainer
def ray_server(run='PPO', address=ADDRESS, port=PORT):
    print(ray.init(log_to_driver=False))
    connector_config = {
        "input": (lambda ioctx: PolicyServerInput(ioctx, address, port)),
        "num_workers": 0,
        "input_evaluation": [],
        "create_env_on_driver": False,
        "num_gpus": FLAGS.num_gpus,
    }
    if run == "DQN":
        trainer = DQNTrainer(
            env=ExternalAtari, config=dict(connector_config, **CONFIG_DQN))
    elif run == "PPO":
        trainer = PPOTrainer(
            env=ExternalAtari, config=dict(connector_config, **CONFIG_PPO))
    else:
        raise ValueError("--run must be DQN or PPO")

    i = 0
    while i < FLAGS.iter:
        i += 1
        print(pretty_print(trainer.train()))

    ray.shutdown()
    checkpoint = trainer.save("{}/ckpts".format(FLAGS.train_url.rstrip('/')))
    print("checkpoint saved at", checkpoint)
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.json"),
        os.path.join(FLAGS.train_url, "config.json"))
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "customize_service.py"),
        os.path.join(FLAGS.train_url, "customize_service.py"))
    mox.file.copy(os.path.join(FLAGS.data_url, "rl_config.py"),
                  os.path.join(FLAGS.train_url, "rl_config.py"))
    del trainer