def train(env_name):
    ModelCatalog.register_custom_model("masked_actions_model", MaskedActionsCNN)
    model_config = {
        "custom_model": "masked_actions_model",
        "conv_filters": [[16, [2, 2], 1], [32, [2, 2], 1]],
        "conv_activation": "elu",
        "fcnet_hiddens": [128],
        "fcnet_activation": "elu",
    }
    tune_config = {
        "num_workers": 24,
        "num_gpus": 1,
        "batch_mode": "complete_episodes",
        "model": model_config,
        "env": env_name,
        "lr": 0.001,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping,
        },
        "framework": "tf",
    }

    trainer = DQNTrainer(env=env_name, config=tune_config)
    for i in range(1000):
        print("== Iteration {} ==".format(i))
        results = trainer.train()
        print(pretty_print(results))
        checkpoint = trainer.save()
        print("\nCheckpoint saved at {}\n".format(checkpoint))
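
# The snippet above references module-level `policies` and `policy_mapping`
# that are not shown. A minimal sketch of what they could look like for a
# two-player env with masked actions; the spaces, agent ids, and action count
# below are illustrative assumptions, not taken from the original source.
import numpy as np
from gym import spaces

n_actions = 9
obs_space = spaces.Dict({
    "board": spaces.Box(low=0, high=1, shape=(3, 3, 2), dtype=np.float32),
    "action_mask": spaces.Box(low=0, high=1, shape=(n_actions,), dtype=np.float32),
})
act_space = spaces.Discrete(n_actions)

# Passing None as the policy class lets RLlib use the trainer's default
# policy (DQN here); both players share the same spaces.
policies = {
    "player_1": (None, obs_space, act_space, {}),
    "player_2": (None, obs_space, act_space, {}),
}

def policy_mapping(agent_id):
    # Each agent id maps directly onto the policy of the same name.
    return agent_id
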
def train_model(args):
    # The custom model and environment need to be registered with Ray/RLlib.
    # The registered names can be anything.
    register_env("DuckieTown-MultiMap", lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define the trainer. Apart from env, config/framework and config/model,
    # which are common among trainers, the default config keys/values are listed here:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            "learning_starts": 500,
            # Enabling this records images from the DuckieTown Gym; might be useful for the report.
            # "record_env": True,
            "train_batch_size": 16,
            # Use a very small buffer to reduce memory usage (default: 50_000).
            "buffer_size": 1000,
            # Dueling off.
            "dueling": False,
            # No hidden layers.
            "hiddens": [],
            # Don't save experiences.
            # "output": None,
            # "compress_observations": True,
            "num_workers": 0,
            "num_gpus": 0.5,
            "rollout_fragment_length": 50,
        })

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('dqn_agent')
    for i in range(args.epochs):  # Number of training iterations (basically epochs).
        print(f'----------------------- Starting epoch {i} -----------------------')
        # train() runs a single training iteration.
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save the model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Clean up CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('DQN DuckieTown-MultiMap')
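
# The config above refers to a custom model registered elsewhere under the
# name "image-dqn"; that registration is not shown. A minimal sketch of how
# such a model could be registered, assuming a simple flatten-and-MLP network
# over the image observation (the architecture is illustrative, not the
# original one).
import numpy as np
import torch.nn as nn
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2

class ImageDQNModel(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)
        in_size = int(np.prod(obs_space.shape))
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_size, 256),
            nn.ReLU(),
            nn.Linear(256, num_outputs),
        )

    def forward(self, input_dict, state, seq_lens):
        # With "dueling": False and "hiddens": [], the model output is used
        # directly as the Q-values.
        return self.net(input_dict["obs"].float()), state

ModelCatalog.register_custom_model("image-dqn", ImageDQNModel)
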
def train(num_iters, checkpoint_freq):
    obs_space = spaces.Dict({
        'obs': spaces.Box(low=-0.5, high=1.5, shape=(32, 32, 3), dtype=np.float32),
        'action_mask': spaces.Box(low=0, high=1, shape=(5,), dtype=np.int32),
    })
    act_space = spaces.Discrete(n=5)

    trainer = DQNTrainer(
        env='SUMOEnv-v0',
        config={
            'model': {
                'custom_model': 'adaptive-trafficlight',
                'custom_options': {},
            },
            'multiagent': {
                'policy_graphs': {
                    'default_policy_graph': (
                        DQNPolicyGraph,
                        obs_space,
                        act_space,
                        {},
                    ),
                },
                'policy_mapping_fn': function(lambda _: 'default_policy_graph'),
            },
            'hiddens': [],  # Don't postprocess the action scores.
            'callbacks': {
                'on_episode_end': function(on_episode_end),
            },
            # 'num_workers': 4,
            # 'num_gpus_per_worker': 0.25,  # All workers on a single GPU.
            'timesteps_per_iteration': 20000,
        })

    for i in range(num_iters):
        print(f'== Iteration {i} ==')
        print(pretty_print(trainer.train()))

        if i % checkpoint_freq == 0:
            checkpoint = trainer.save()
            print(f'\nCheckpoint saved at {checkpoint}\n')
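
# The config above wires an `on_episode_end` callback through the old
# dict-style callbacks API, but its body is not shown. A minimal sketch of
# such a callback; recording the episode length is just an illustrative
# example of a custom metric.
def on_episode_end(info):
    episode = info["episode"]
    # Custom metrics are averaged and reported under "custom_metrics"
    # in the training results.
    episode.custom_metrics["episode_length"] = episode.length
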
def train_model(args, config):
    # Define the trainer. Apart from env, config/framework and config/model,
    # which are common among trainers, the default config keys/values are listed here:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are additionally these keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config=config,
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    best_mean_reward = -np.inf
    epoch_of_best_mean_reward = 0
    path_of_best_mean_reward = None

    for i in trange(args.epochs, desc="Epochs", leave=False):
        # print(f'----------------------- Starting epoch {i} -----------------------')
        # train() runs a single training iteration ("epoch").
        result = trainer.train()
        # print(result)

        # Save the model so far.
        checkpoint_path = trainer.save()
        # print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Track the checkpoint with the best mean episode reward.
        if result["episode_reward_mean"] > best_mean_reward:
            best_mean_reward = result["episode_reward_mean"]
            epoch_of_best_mean_reward = i
            path_of_best_mean_reward = checkpoint_path

        # Clean up CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        # print(torch.cuda.memory_summary(device=None, abbreviated=False))

    return best_mean_reward, epoch_of_best_mean_reward, path_of_best_mean_reward
def ray_server(run='PPO', address=ADDRESS, port=PORT):
    print(ray.init(log_to_driver=False))

    connector_config = {
        "input": (lambda ioctx: PolicyServerInput(ioctx, address, port)),
        "num_workers": 0,
        "input_evaluation": [],
        "create_env_on_driver": False,
        "num_gpus": FLAGS.num_gpus,
    }

    if run == "DQN":
        trainer = DQNTrainer(
            env=ExternalAtari, config=dict(connector_config, **CONFIG_DQN))
    elif run == "PPO":
        trainer = PPOTrainer(
            env=ExternalAtari, config=dict(connector_config, **CONFIG_PPO))
    else:
        raise ValueError("--run must be DQN or PPO")

    i = 0
    while i < FLAGS.iter:
        i += 1
        print(pretty_print(trainer.train()))

    checkpoint = trainer.save("{}/ckpts".format(FLAGS.train_url.rstrip('/')))
    print("checkpoint saved at", checkpoint)

    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.json"),
        os.path.join(FLAGS.train_url, "config.json"))
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)),
                     "customize_service.py"),
        os.path.join(FLAGS.train_url, "customize_service.py"))
    mox.file.copy(os.path.join(FLAGS.data_url, "rl_config.py"),
                  os.path.join(FLAGS.train_url, "rl_config.py"))

    del trainer
    # Shut down Ray only after the checkpoint and config files have been saved.
    ray.shutdown()
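
# CONFIG_DQN and CONFIG_PPO above are imported from a separate config module
# (e.g. the rl_config.py copied above) and are not shown. A minimal sketch of
# what they might contain, using only standard trainer keys; all values are
# illustrative assumptions.
CONFIG_DQN = {
    "gamma": 0.99,
    "lr": 1e-4,
    "learning_starts": 1000,
    "train_batch_size": 32,
    "timesteps_per_iteration": 1000,
}
CONFIG_PPO = {
    "gamma": 0.99,
    "lr": 5e-5,
    "rollout_fragment_length": 200,
    "train_batch_size": 4000,
    "sgd_minibatch_size": 128,
    "num_sgd_iter": 30,
}
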
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')

    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo', type=str, default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str,
                        help='checkpoint to restore for inference')
    parser.add_argument('--epoch', type=int, default=100,
                        help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10 ** 3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size')
    parser.add_argument('--state_time_span', type=int, default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30,
                        help='time interval to collect data')
    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config: CityflowGymEnv(config_env))
    config_agent = agent_config(config_env)

    trainer = DQNTrainer(
        env=CityflowGymEnv,
        config=config_agent)

    for i in range(1000):
        # Perform one iteration of training the policy with DQN.
        result = trainer.train()
        print(pretty_print(result))

        if (i + 1) % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
"log_level": "INFO", "framework": args.framework, })) elif args.run == "PPO": # Example of using PPO (does NOT support off-policy actions). trainer = PPOTrainer(env=env, config=dict( connector_config, **{ "sample_batch_size": 1000, "train_batch_size": 4000, "framework": args.framework, })) else: raise ValueError("--run must be DQN or PPO") checkpoint_path = CHECKPOINT_FILE.format(args.run) # Attempt to restore from checkpoint if possible. if os.path.exists(checkpoint_path): checkpoint_path = open(checkpoint_path).read() print("Restoring from checkpoint path", checkpoint_path) trainer.restore(checkpoint_path) # Serving and training loop while True: print(pretty_print(trainer.train())) checkpoint = trainer.save() print("Last checkpoint", checkpoint) with open(checkpoint_path, "w") as f: f.write(checkpoint)
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')

    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo', type=str, default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str,
                        help='checkpoint to restore for inference')
    parser.add_argument('--epoch', type=int, default=10,
                        help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10**3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size')
    parser.add_argument('--state_time_span', type=int, default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30,
                        help='time interval to collect data')
    # parser.add_argument("--num-agents", type=int, default=6)
    args = parser.parse_args()

    model_dir = "model/{}_{}".format(args.algo, date)
    result_dir = "result/{}_{}".format(args.algo, date)

    config_env = env_config(args)
    num_agents = len(config_env["intersection_id"])

    ### dw ###
    obs_space = CityFlowEnvRay.observation_space
    act_space = CityFlowEnvRay.action_space

    ray.tune.register_env('gym_cityflow',
                          lambda env_config: CityFlowEnvRay(env_config))

    # One DQN policy per intersection agent.
    policies = {
        "policy_{}".format(i): (DQNTFPolicy, obs_space, act_space, {})
        for i in range(num_agents)
    }
    policy_ids = list(policies.keys())

    config_agent = agent_config(config_env, policies, policy_ids)
    trainer = DQNTrainer(env='gym_cityflow', config=config_agent)

    for i in range(1000):
        # Perform one iteration of training the policy with DQN.
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
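
# `agent_config(config_env, policies, policy_ids)` above is defined elsewhere
# and not shown. A minimal sketch of the multiagent section it would have to
# return so that each CityFlow intersection trains its own DQN policy; the
# agent-id-to-policy mapping is an illustrative assumption.
def agent_config(config_env, policies, policy_ids):
    return {
        "env_config": config_env,
        "multiagent": {
            "policies": policies,
            # Assumes the env uses integer agent ids 0..num_agents-1.
            "policy_mapping_fn": lambda agent_id: "policy_{}".format(agent_id),
        },
        "num_workers": 1,
    }
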
if __name__ == "__main__":
    ray.init()
    register_env("ECglass-v2", lambda _: ECglassServing())

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
    dqn = DQNTrainer(
        env="ECglass-v2",
        config={
            # Use a single process to avoid needing to set up a load balancer.
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging.
            "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
        })

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop.
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
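
# ECglassServing above is a custom serving env whose definition is not shown.
# A minimal sketch of the shape such a class could take, based on RLlib's
# ExternalEnv interface; the spaces, episode length, and placeholder
# observations below are illustrative assumptions, not the original code.
import numpy as np
from gym import spaces
from ray.rllib.env.external_env import ExternalEnv

class ECglassServing(ExternalEnv):
    def __init__(self):
        ExternalEnv.__init__(
            self,
            action_space=spaces.Discrete(4),
            observation_space=spaces.Box(low=0.0, high=1.0, shape=(8,),
                                         dtype=np.float32))

    def run(self):
        # In a real deployment this loop would be driven by the external
        # system that owns the observations and applies the actions.
        while True:
            eid = self.start_episode()
            obs = np.zeros(8, dtype=np.float32)  # placeholder observation
            for _ in range(200):
                action = self.get_action(eid, obs)
                # ... hand `action` to the external system, read back the
                # resulting reward and next observation ...
                self.log_returns(eid, reward=0.0)
            self.end_episode(eid, obs)
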
            connector_config, **{
                "sample_batch_size": 1000,
                "train_batch_size": 4000,
                "framework": args.framework,
            }))
else:
    raise ValueError("--run must be DQN or PPO")

# This file contains the checkpoint file path.
checkpoint_path_file = CHECKPOINT_FILE

# Attempt to restore from checkpoint if possible.
if os.path.exists(checkpoint_path_file):
    checkpoint_path = open(checkpoint_path_file).read()
    if os.path.exists(checkpoint_path):
        print("Restoring from checkpoint path", checkpoint_path)
        trainer.restore(checkpoint_path)
    else:
        print("checkpoint file does not exist")
else:
    print("file containing checkpoint file path does not exist")

# Serving and training loop.
while True:
    print(pretty_print(trainer.train()))
    checkpoint = trainer.save(parser.modDir)
    print("Last checkpoint", checkpoint)
    # Write the new checkpoint path into the path file, not into the
    # checkpoint itself.
    with open(checkpoint_path_file, "w") as f:
        f.write(checkpoint)
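
# The server-side fragments above (PolicyServerInput) only do something useful
# together with a client process that streams observations in and actions out.
# A minimal client-side sketch using RLlib's PolicyClient; the CartPole env and
# the address/port are illustrative assumptions, not from the original source.
import gym
from ray.rllib.env.policy_client import PolicyClient

env = gym.make("CartPole-v0")
# inference_mode="remote" asks the server for every action (newer RLlib versions).
client = PolicyClient("http://localhost:9900", inference_mode="remote")

obs = env.reset()
eid = client.start_episode(training_enabled=True)
done = False
while not done:
    # Query the server for an action, step the local env, and report rewards
    # back so the server can learn from them.
    action = client.get_action(eid, obs)
    obs, reward, done, info = env.step(action)
    client.log_returns(eid, reward)
client.end_episode(eid, obs)
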
"timesteps_per_iteration": 200, "env_config": { "observation_size": args.observation_size, "action_size": args.action_size, }, }) elif args.run == "PG": trainer = PGTrainer( env="srv", config={ "num_workers": 0, "env_config": { "observation_size": args.observation_size, "action_size": args.action_size, }, }) # Attempt to restore from checkpoint if possible. if os.path.exists(args.checkpoint_file): checkpoint_file = open(args.checkpoint_file).read() print("Restoring from checkpoint path", checkpoint_file) trainer.restore(checkpoint_file) # Serving and training loop while True: print(pretty_print(trainer.train())) checkpoint_file = trainer.save() print("Last checkpoint", checkpoint_file) with open(args.checkpoint_file, "w") as f: f.write(checkpoint_file)
"buffer_size": 50000, "sample_batch_size": 4, "train_batch_size": 320, "schedule_max_timesteps": 2000000, "exploration_final_eps": 0.01, "exploration_fraction": 0.1, "model": { "dim": 64 } }) def env_creator(env_config): return PodWorldEnv(max_steps=100, reward_factor=1.0) register_env("podworld_env", env_creator) agent = DQNTrainer(config=config, env="podworld_env") agent_save_path = None for i in range(50): stats = agent.train() # print(pretty_print(stats)) if i % 10 == 0 and i > 0: path = agent.save() if agent_save_path is None: agent_save_path = path print('Saved agent at', agent_save_path) logger.write((i, stats['episode_reward_min'])) print('episode_reward_mean', stats['episode_reward_min'])