def train_model(args):
    # The custom model and environment need to be registered with Ray/RLlib.
    # Names can be anything.
    register_env("DuckieTown-MultiMap", lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define the trainer. Apart from env, config/framework, and config/model,
    # which are common among trainers, here is a list of default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            "learning_starts": 500,
            # Setting this allows us to record images from the DuckieTown gym. Might be useful for the report.
            # "record_env": True,
            "train_batch_size": 16,
            # Use a very small buffer to reduce memory usage; default: 50_000.
            "buffer_size": 1000,
            # Dueling off.
            "dueling": False,
            # No hidden layers.
            "hiddens": [],
            # Don't save experiences.
            # "output": None,
            # "compress_observations": True,
            "num_workers": 0,
            "num_gpus": 0.5,
            "rollout_fragment_length": 50,
        })

    # Resume training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('dqn_agent')
    for i in range(args.epochs):  # Number of training iterations (called epochs here).
        print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs a single training iteration.
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save the model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Clean up CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('DQN DuckieTown-MultiMap')
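# The config above assumes a custom model registered under the name
# "image-dqn", but the registration is not shown in this snippet. Below is a
# minimal sketch of what it could look like; the class name ImageDQNModel and
# its architecture are illustrative assumptions, not the project's actual model.
import torch
import torch.nn as nn
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2


class ImageDQNModel(TorchModelV2, nn.Module):
    """Hypothetical CNN that maps image observations to action values."""

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        h, w, c = obs_space.shape
        self.conv = nn.Sequential(
            nn.Conv2d(c, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Flatten(),
        )
        with torch.no_grad():
            n_flat = self.conv(torch.zeros(1, c, h, w)).shape[1]
        self.head = nn.Linear(n_flat, num_outputs)

    def forward(self, input_dict, state, seq_lens):
        x = input_dict["obs"].float().permute(0, 3, 1, 2)  # NHWC -> NCHW
        return self.head(self.conv(x)), state


ModelCatalog.register_custom_model("image-dqn", ImageDQNModel)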
def load_agent():
    # Initialize the training environment.
    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRaySinglePlayerEnvironment(board_size=13, num_players=4, agent=agent)

    env = environment_creater()
    tune.register_env("tron_single_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure deep Q-learning with reasonable values.
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    # config["timesteps_per_iteration"] = 1024
    # config['target_network_update_freq'] = 256
    # config['buffer_size'] = 100_000
    # config['schedule_max_timesteps'] = 200_000
    # config['exploration_fraction'] = 0.02
    # config['compress_observations'] = False
    # config['n_step'] = 2
    # config['seed'] = SEED

    # Configure for PPO:
    # config["sample_batch_size"] = 100
    # config["train_batch_size"] = 200
    # config["sgd_minibatch_size"] = 60

    # Configure A3C with reasonable values.

    # We use a simple convolutional network with 3 layers as our feature extractor.
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # Begin training or evaluation.
    # trainer = DDPGTrainer(config, "tron_single_player")
    # trainer = A3CTrainer(config, "tron_single_player")
    trainer = DQNTrainer(config, "tron_single_player")
    # trainer = PPOTrainer(config, "tron_single_player")

    trainer.restore("./dqn_checkpoint_3800/checkpoint-3800")

    return trainer  # .get_policy("trainer")
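# Example use of load_agent(), as a sketch: restore the trained DQN and roll
# out one greedy episode against the scripted opponent. The env construction
# mirrors environment_creater above; the rollout loop itself is an assumption.
if __name__ == "__main__":
    trainer = load_agent()
    env = TronRaySinglePlayerEnvironment(
        board_size=13, num_players=4, agent=SimpleAvoidAgent(noise=0.05))
    obs, done, total_reward = env.reset(), False, 0
    while not done:
        action = trainer.compute_action(obs)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    print("Episode reward:", total_reward)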
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # The simulator env uses a single map, so it is better for evaluation/testing.
        # DiscreteWrapper just converts wheel velocities to high-level discrete actions.
        return DiscreteWrapper(
            simulator.Simulator(
                map_name=args.map,
                max_steps=2000,
            ))

    # Rather than reuse the env, another one is created later, because I can't
    # figure out how to provide register_env with an object, only a factory.
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = DQNTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            # Dueling off.
            "dueling": False,
            # No hidden layers.
            "hiddens": [],
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
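# A single rollout is noisy; averaging over several episodes gives a more
# stable estimate. A small sketch building on evaluate_model's loop
# (evaluate_n_episodes and its default of 10 episodes are arbitrary choices):
def evaluate_n_episodes(trainer, env, num_episodes=10):
    rewards = []
    for _ in range(num_episodes):
        obs, done, total = env.reset(), False, 0.0
        while not done:
            obs, reward, done, _ = env.step(trainer.compute_action(obs))
            total += reward
        rewards.append(total)
    print(f'Mean reward over {num_episodes} episodes: {sum(rewards) / len(rewards):.2f}')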
def train_model(args, config):
    # Define the trainer. Apart from env, config/framework, and config/model,
    # which are common among trainers, here is a list of default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config=config,
    )

    # Resume training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    best_mean_reward = -np.inf
    epoch_of_best_mean_reward = 0
    path_of_best_mean_reward = None

    for i in trange(args.epochs, desc="Epochs", leave=False):  # Number of training iterations (called epochs here).
        # print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs a single training iteration.
        result = trainer.train()
        # print(result)

        # Save the model so far.
        checkpoint_path = trainer.save()
        # print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        if result["episode_reward_mean"] > best_mean_reward:
            best_mean_reward = result["episode_reward_mean"]
            epoch_of_best_mean_reward = i
            path_of_best_mean_reward = checkpoint_path

        # Clean up CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        # print(torch.cuda.memory_summary(device=None, abbreviated=False))

    return best_mean_reward, epoch_of_best_mean_reward, path_of_best_mean_reward
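# A sketch of how train_model(args, config) might be driven; the config values
# below mirror the standalone trainer earlier in this file and are assumptions:
# config = {
#     "framework": "torch",
#     "model": {"custom_model": "image-dqn"},
#     "dueling": False,
#     "hiddens": [],
#     "num_workers": 0,
# }
# best_reward, best_epoch, best_path = train_model(args, config)
# print(f"Best mean reward {best_reward:.2f} at epoch {best_epoch}: {best_path}")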
"log_level": "INFO", "framework": args.framework, })) elif args.run == "PPO": # Example of using PPO (does NOT support off-policy actions). trainer = PPOTrainer(env=env, config=dict( connector_config, **{ "sample_batch_size": 1000, "train_batch_size": 4000, "framework": args.framework, })) else: raise ValueError("--run must be DQN or PPO") checkpoint_path = CHECKPOINT_FILE.format(args.run) # Attempt to restore from checkpoint if possible. if os.path.exists(checkpoint_path): checkpoint_path = open(checkpoint_path).read() print("Restoring from checkpoint path", checkpoint_path) trainer.restore(checkpoint_path) # Serving and training loop while True: print(pretty_print(trainer.train())) checkpoint = trainer.save() print("Last checkpoint", checkpoint) with open(checkpoint_path, "w") as f: f.write(checkpoint)
if __name__ == "__main__": ray.init() register_env("ECglass-v2", lambda _: ECglassServing()) # We use DQN since it supports off-policy actions, but you can choose and # configure any agent. dqn = DQNTrainer( env="ECglass-v2", config={ # Use a single process to avoid needing to set up a load balancer "num_workers": 0, # Configure the agent to run short iterations for debugging "exploration_fraction": 0.01, "learning_starts": 100, "timesteps_per_iteration": 200, }) # Attempt to restore from checkpoint if possible. if os.path.exists(CHECKPOINT_FILE): checkpoint_path = open(CHECKPOINT_FILE).read() print("Restoring from checkpoint path", checkpoint_path) dqn.restore(checkpoint_path) # Serving and training loop while True: print(pretty_print(dqn.train())) checkpoint_path = dqn.save() print("Last checkpoint", checkpoint_path) with open(CHECKPOINT_FILE, "w") as f: f.write(checkpoint_path)
# env = VectorEnv.wrap(existing_envs=[warehouse_env_creator(env_config) for _ in range(NUM_ENVS)],
#                      num_envs=NUM_ENVS)
# config = {"env": "warehouse_env",
#           "framework": "torch",
#           "num_gpus": 0.1,
#           "num_gpus_per_worker": 0.1,
#           'num_envs_per_worker': 6,
#           "evaluation_interval": 5, }

with open(params_path, "rb") as f:
    config = cloudpickle.load(f)
config["explore"] = False
config['num_envs_per_worker'] = 1
print("Trained on map: \n", config["env_config"]["maps"])
config["env_config"]["maps"] = MAP_WITH_EXCEPTION

trainer = DQNTrainer(config=config)
trainer.restore(path.format(checkpoint, checkpoint))
policy = trainer.get_policy()
trainer._evaluate()  # private API in older Ray; newer releases expose Trainer.evaluate()

samples = (trainer.evaluation_workers.local_worker().sample()
           for _ in range(NUM_EPISODES))

# Flatten each sampled batch into one row of features per timestep.
rows = map(
    lambda x: np.concatenate([
        x["unroll_id"][:, None],
        np.arange(0, x.count)[:, None],
        x["obs"],
        x["actions"][:, None],
        x["q_values"],
        x["rewards"][:, None],
        x["dones"][:, None],
        x["new_obs"],
        process_info(x["infos"]),
    ], -1),
    samples)
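# process_info() is defined elsewhere; for the concatenation above to work it
# must return one row of features per timestep. A hedged sketch, assuming each
# info is a flat dict of scalars:
def process_info(infos):
    keys = sorted(infos[0].keys())
    return np.array([[info[k] for k in keys] for info in infos], dtype=np.float32)

# The rows generator can then be materialized and dumped for analysis, e.g.:
# np.savetxt("evaluation_rollouts.csv", np.concatenate(list(rows)), delimiter=",")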
if __name__ == "__main__": args = parser.parse_args() ray.init(num_gpus=1) env_config = {"board_shape": [8, 8], "length": 3} config = { "env": SnakeEnv, "env_config": env_config, "num_gpus": 1, "lr": 1e-4, "hiddens": [32, 64, 512] } agent = DQNTrainer(config=config) snake_env = SnakeEnv(config=env_config) if args.test: assert args.restore is not None agent.restore(args.restore) while True: score = simulate_one_game(render=True) print("Score: {}".format(score)) else: if args.restore is not None: agent.restore(args.restore) i = agent.iteration else: i = 0 while True: train_one_step() if i % 10 == 0: save_ckpt() # avg_score = 0 # for _ in range(100):
register_env("leduc_holdem", lambda config: PettingZooEnv(env_creator())) env = (env_creator()) # obs_space = env.observation_space # print(obs_space) # act_space = test_env.action_space with open(params_path, "rb") as f: config = pickle.load(f) # num_workers not needed since we are not training del config['num_workers'] del config['num_gpus'] ray.init(num_cpus=8, num_gpus=0) DQNAgent = DQNTrainer(env="leduc_holdem", config=config) DQNAgent.restore(checkpoint_path) reward_sums = {a: 0 for a in env.possible_agents} i = 0 env.reset() for agent in env.agent_iter(): observation, reward, done, info = env.last() obs = observation['observation'] reward_sums[agent] += reward if done: action = None else: print(DQNAgent.get_policy(agent)) policy = DQNAgent.get_policy(agent) batch_obs = {
config['framework'] = "torch" config['gamma'] = args.gamma # NN vision config['model']['dim'] = 21 config['model']['conv_filters'] = [[8, [3, 3], 2], [16, [2, 2], 2], [512, [6, 6], 1]] # DQN config config['v_min'] = -400 config['v_max'] = 400 config['noisy'] = False trainner = DQNTrainer(config=config, env="mars_explorer:explorer-v01") if PATH != "": print(f"Loading model {PATH}") trainner.restore(PATH) else: print(f"Starting without any a priori knowledge") N_start = 0 N_finish = args.steps results = [] episode_data = [] episode_json = [] writer = SummaryWriter(comment="SAC-GEP") for batch in range(N_start, N_finish): initial_time = time.time() result = trainner.train()
    # For eval afterward.
    config_copy = config.copy()
    config_copy['explore'] = False
    trainer = DDPGTrainer(config=config_copy, env='Bertrand')

    analysis = tune.run(
        trainer_choice,
        # num_samples=4,
        config=config,
        local_dir='./log',
        stop={'training_iteration': sessions},
        mode='max',
        metric='episode_reward_mean',
        checkpoint_at_end=True)

    trainer.restore(checkpoint_path=analysis.best_checkpoint)
    # (An evaluation sketch for this restored trainer follows after the
    # else-branch stub below.)

    # analysis = tune.run(
    #     trainer_choice,
    #     # num_samples=4,
    #     config=config_copy,
    #     local_dir='./log',
    #     stop={'training_iteration': sessions},
    #     mode='max',
    #     metric='episode_reward_mean',
    #     restore=analysis.best_checkpoint,
    #     checkpoint_at_end=True
    # )
else:
    # Dual algorithm training.
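    # Evaluation sketch referenced above: with exploration disabled, the
    # restored trainer can be rolled out for one deterministic episode. How
    # the 'Bertrand' env is constructed here is an assumption.
    # eval_env = Bertrand(config['env_config'])
    # obs, done, total = eval_env.reset(), False, 0.0
    # while not done:
    #     obs, r, done, _ = eval_env.step(trainer.compute_action(obs))
    #     total += r
    # print('Greedy episode reward:', total)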
"timesteps_per_iteration": 200, "env_config": { "observation_size": args.observation_size, "action_size": args.action_size, }, }) elif args.run == "PG": trainer = PGTrainer( env="srv", config={ "num_workers": 0, "env_config": { "observation_size": args.observation_size, "action_size": args.action_size, }, }) # Attempt to restore from checkpoint if possible. if os.path.exists(args.checkpoint_file): checkpoint_file = open(args.checkpoint_file).read() print("Restoring from checkpoint path", checkpoint_file) trainer.restore(checkpoint_file) # Serving and training loop while True: print(pretty_print(trainer.train())) checkpoint_file = trainer.save() print("Last checkpoint", checkpoint_file) with open(args.checkpoint_file, "w") as f: f.write(checkpoint_file)