def my_train_fn(config, reporter):
    iterations = config.pop("train-iterations", 10)

    # Train for n iterations with high LR.
    agent1 = PPO(env="CartPole-v0", config=config)
    for _ in range(iterations):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Train for n iterations with low LR.
    config["lr"] = 0.0001
    agent2 = PPO(env="CartPole-v0", config=config)
    agent2.restore(state)
    for _ in range(iterations):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
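# A minimal sketch (not part of the original excerpt) of how such a
# function trainable is typically launched with Tune. It assumes
# `import ray`, `from ray import tune`, and the `PPO` import used above;
# `PPO.default_resource_request()` is the Algorithm classmethod that
# reports the resources one PPO trial needs.
if __name__ == "__main__":
    ray.init()
    config = {
        # Consumed by `my_train_fn` via `config.pop("train-iterations")`.
        "train-iterations": 2,
        "lr": 0.01,
        "num_workers": 0,
    }
    tune.run(
        my_train_fn,
        resources_per_trial=PPO.default_resource_request(config),
        config=config,
    )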
}

results = tune.run(
    args.run, config=config, stop=stop, verbose=2, checkpoint_at_end=True
)

if args.as_test:
    check_learning_achieved(results, args.stop_reward)

checkpoints = results.get_trial_checkpoints_paths(
    trial=results.get_best_trial("episode_reward_mean", mode="max"),
    metric="episode_reward_mean",
)
checkpoint_path = checkpoints[0][0]
algo = PPO(config)
algo.restore(checkpoint_path)

# Inference loop.
env = StatelessCartPole()

# Run manual inference loop for n episodes.
for _ in range(10):
    episode_reward = 0.0
    reward = 0.0
    action = 0
    done = False
    obs = env.reset()
    while not done:
        # Create a dummy action using the same observation n times,
        # as well as dummy prev-n-actions and prev-n-rewards.
        action, state, logits = algo.compute_single_action(
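            # (The argument list is elided in the original excerpt.)
            ...
        )
        # A minimal sketch (assumed, not in the original excerpt) of how
        # such a manual inference loop typically continues: step the env
        # with the computed action and accumulate the reward until the
        # episode terminates.
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
    print(f"Episode reward: {episode_reward}")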
results = tune.run(
    "PPO",
    config=config,
    stop=stop,
    checkpoint_at_end=True,
    checkpoint_freq=10,
    verbose=3,
)

# Restore trained trainer (set to non-explore behavior) and play against
# human on command line.
if args.num_episodes_human_play > 0:
    num_episodes = 0
    trainer = PPO(config=dict(config, **{"explore": False}))
    if args.from_checkpoint:
        trainer.restore(args.from_checkpoint)
    else:
        checkpoint = results.get_last_checkpoint()
        if not checkpoint:
            raise ValueError("No last checkpoint found in results!")
        trainer.restore(checkpoint)

    # Play from the command line against the trained agent
    # in an actual (non-RLlib-wrapped) open-spiel env.
    human_player = 1
    env = Environment(args.env)

    while num_episodes < args.num_episodes_human_play:
        print("You play as {}".format("o" if human_player else "x"))
        time_step = env.reset()
        while not time_step.last():
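            # A minimal sketch (assumed; the original excerpt breaks off
            # here) of the inner loop: alternate between a human-entered
            # move and the trained agent's move. `ask_user_for_action` is
            # a hypothetical helper that reads a legal move from stdin,
            # and `policy_id="main"` is an assumption about the
            # multi-agent config above.
            player_id = time_step.observations["current_player"]
            if player_id == human_player:
                action = ask_user_for_action(time_step)
            else:
                obs = np.array(
                    time_step.observations["info_state"][player_id])
                action = trainer.compute_single_action(
                    obs, policy_id="main")
            time_step = env.step([action])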
    verbose=1,
    checkpoint_freq=1,
    checkpoint_at_end=True,
)
print("Pre-training done.")

best_checkpoint = results.get_best_checkpoint(results.trials[0], mode="max")
print(f".. best checkpoint was: {best_checkpoint}")

# Create a new dummy Algorithm to "fix" our checkpoint.
new_algo = PPO(config=config)
# Get untrained weights for all policies.
untrained_weights = new_algo.get_weights()
# Restore all policies from checkpoint.
new_algo.restore(best_checkpoint)
# Set back all weights (except for 1st agent) to original
# untrained weights.
new_algo.set_weights(
    {pid: w for pid, w in untrained_weights.items() if pid != "policy_0"}
)
# Create the checkpoint from which tune can pick up the
# experiment.
new_checkpoint = new_algo.save()
new_algo.stop()
print(
    ".. checkpoint to restore from (all policies reset, "
    f"except policy_0): {new_checkpoint}"
)

print("Starting new tune.run")

# Start our actual experiment.
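# A minimal sketch (assumed, not part of the original excerpt) of how the
# follow-up experiment can pick up from the fixed checkpoint, using
# tune.run's `restore` argument:
results = tune.run(
    "PPO",
    config=config,
    stop=stop,
    restore=new_checkpoint,
    checkpoint_at_end=True,
    verbose=1,
)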