def train(env_name):
    ModelCatalog.register_custom_model("masked_actions_model", MaskedActionsCNN)
    model_config = {
        "custom_model": "masked_actions_model",
        "conv_filters": [[16, [2, 2], 1], [32, [2, 2], 1]],
        "conv_activation": "elu",
        "fcnet_hiddens": [128],
        "fcnet_activation": "elu",
    }
    tune_config = {
        "num_workers": 24,
        "num_gpus": 1,
        "batch_mode": "complete_episodes",
        "model": model_config,
        "env": env_name,
        "lr": 0.001,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping,
        },
        "framework": "tf",
    }
    trainer = DQNTrainer(env=env_name, config=tune_config)
    for i in range(1000):
        print("== Iteration {} ==".format(i))
        results = trainer.train()
        print(pretty_print(results))
        checkpoint = trainer.save()
        print("\nCheckpoint saved at {}\n".format(checkpoint))
def test_model(self) -> Tuple[List[float], list]:
    ray.init(logging_level=logging.INFO, ignore_reinit_error=True)
    agent = DQNTrainer(self.config, env=custom_env_name)
    weights = torch.load(
        self.params.model_dir / "trained_model.pt",
        map_location=lambda storage, loc: storage,
    )
    agent.set_weights({"default_policy": weights})

    rewards = []
    longest_screens = []
    for i in range(self.params.num_testing_episodes):
        screens = []
        try:
            logger.info("Iteration: {}", i)
            state = self.env.reset()
            done = False
            cumulative_reward = 0
            while not done:
                action = agent.compute_action(state)
                state, reward, done, _ = self.env.step(action)
                screen = self.env.render(mode="rgb_array")
                screens.append(screen)
                cumulative_reward += reward
                time.sleep(0.01)
            logger.info("Iteration: {}, Reward: {}", i, cumulative_reward)
            rewards.append(cumulative_reward)
        except KeyboardInterrupt:
            logger.info("Testing was interrupted")
            break
        if len(screens) > len(longest_screens):
            longest_screens = screens
    self.env.close()
    ray.shutdown()
    return rewards, longest_screens
def dqn_train(config, reporter):
    # Instantiate a trainer
    cfg = {
        # Max num timesteps for annealing schedules. Exploration is annealed from
        # 1.0 to exploration_fraction over this number of timesteps scaled by
        # exploration_fraction
        "schedule_max_timesteps": 1000000,
        # Minimum env steps to optimize for per train call. This value does
        # not affect learning, only the length of iterations.
        "timesteps_per_iteration": 1000,
        # Fraction of entire training period over which the exploration rate is
        # annealed
        "exploration_fraction": 0.1,
        # Final value of random action probability
        "exploration_final_eps": 0.02,
        "n_step": 3,
        "buffer_size": 500000,
        # "sample_batch_size": 32,
        # "train_batch_size": 128,
        # "learning_starts": 5000,
        # "target_network_update_freq": 5000,
        # "num_workers": NUM_WORKERS,
        # "per_worker_exploration": True,
        # "worker_side_prioritization": True,
        # "min_iter_time_s": 1,
    }
    trainer = DQNTrainer(config={**config, **cfg})

    while True:
        result = trainer.train()  # Executes one training step
        print(pretty_print(result))
        reporter(**result)  # notifies TrialRunner
def test_reproducing_trajectory(self):
    class PickLargest(gym.Env):
        def __init__(self):
            self.observation_space = gym.spaces.Box(
                low=float("-inf"), high=float("inf"), shape=(4,))
            self.action_space = gym.spaces.Discrete(4)

        def reset(self, **kwargs):
            self.obs = np.random.randn(4)
            return self.obs

        def step(self, action):
            reward = self.obs[action]
            return self.obs, reward, True, {}

    def env_creator(env_config):
        return PickLargest()

    for fw in framework_iterator(frameworks=("tf", "torch")):
        trajs = list()
        for trial in range(3):
            ray.init()
            register_env("PickLargest", env_creator)
            config = {
                "seed": 666 if trial in [0, 1] else 999,
                "min_time_s_per_reporting": 0,
                "timesteps_per_iteration": 100,
                "framework": fw,
            }
            agent = DQNTrainer(config=config, env="PickLargest")

            trajectory = list()
            for _ in range(8):
                r = agent.train()
                trajectory.append(r["episode_reward_max"])
                trajectory.append(r["episode_reward_min"])
            trajs.append(trajectory)

            ray.shutdown()

        # trial0 and trial1 use same seed and thus
        # expect identical trajectories.
        all_same = True
        for v0, v1 in zip(trajs[0], trajs[1]):
            if v0 != v1:
                all_same = False
        self.assertTrue(all_same)

        # trial1 and trial2 use different seeds and thus
        # most rewards tend to be different.
        diff_cnt = 0
        for v1, v2 in zip(trajs[1], trajs[2]):
            if v1 != v2:
                diff_cnt += 1
        self.assertTrue(diff_cnt > 8)
def testTrainCartpoleOffPolicy(self):
    register_env(
        "test3",
        lambda _: PartOffPolicyServing(gym.make("CartPole-v0"), off_pol_frac=0.2))
    dqn = DQNTrainer(env="test3", config={"exploration_fraction": 0.001})
    for i in range(100):
        result = dqn.train()
        print("Iteration {}, reward {}, timesteps {}".format(
            i, result["episode_reward_mean"], result["timesteps_total"]))
        if result["episode_reward_mean"] >= 100:
            return
    raise Exception("failed to improve reward")
def load_agent():
    # Initialize training environment
    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRaySinglePlayerEnvironment(board_size=13, num_players=4, agent=agent)

    env = environment_creater()
    tune.register_env("tron_single_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q-Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    # config["timesteps_per_iteration"] = 1024
    # config['target_network_update_freq'] = 256
    # config['buffer_size'] = 100_000
    # config['schedule_max_timesteps'] = 200_000
    # config['exploration_fraction'] = 0.02
    # config['compress_observations'] = False
    # config['n_step'] = 2
    # config['seed'] = SEED

    # Configure for PPO
    # config["sample_batch_size"] = 100
    # config["train_batch_size"] = 200
    # config["sgd_minibatch_size"] = 60

    # Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # Begin training or evaluation
    # trainer = DDPGTrainer(config, "tron_single_player")
    # trainer = A3CTrainer(config, "tron_single_player")
    trainer = DQNTrainer(config, "tron_single_player")
    # trainer = PPOTrainer(config, "tron_single_player")

    trainer.restore("./dqn_checkpoint_3800/checkpoint-3800")

    return trainer  # .get_policy("trainer")
def testEvaluationOption(self):
    ray.init()
    agent = DQNTrainer(env="CartPole-v0", config={"evaluation_interval": 2})
    r0 = agent.train()
    r1 = agent.train()
    r2 = agent.train()
    r3 = agent.train()
    r4 = agent.train()
    self.assertTrue("evaluation" in r0)
    self.assertTrue("episode_reward_mean" in r0["evaluation"])
    self.assertEqual(r0["evaluation"], r1["evaluation"])
    self.assertNotEqual(r1["evaluation"], r2["evaluation"])
    self.assertEqual(r2["evaluation"], r3["evaluation"])
    self.assertNotEqual(r3["evaluation"], r4["evaluation"])
def save_best_response_checkpoint(trainer: DQNTrainer,
                                  player: int,
                                  save_dir: str,
                                  timesteps_training_br: int,
                                  episodes_training_br: int,
                                  active_policy_num: int = None):
    policy_name = active_policy_num if active_policy_num is not None else "unclaimed"
    date_time = datetime_str()
    checkpoint_name = f"policy_{policy_name}_{date_time}.h5"
    checkpoint_path = os.path.join(save_dir, checkpoint_name)
    br_weights = trainer.get_weights(["best_response"])["best_response"]
    br_weights = {k.replace(".", "_dot_"): v
                  for k, v in br_weights.items()}  # periods cause HDF5 NaturalNaming warnings
    ensure_dir(file_path=checkpoint_path)
    num_save_attempts = 5
    for attempt in range(num_save_attempts):
        try:
            deepdish.io.save(path=checkpoint_path,
                             data={
                                 "weights": br_weights,
                                 "player": player,
                                 "policy_num": active_policy_num,
                                 "date_time_str": date_time,
                                 "seconds_since_epoch": time.time(),
                                 "timesteps_training_br": timesteps_training_br,
                                 "episodes_training_br": episodes_training_br,
                             })
            break
        except HDF5ExtError:
            if attempt + 1 == num_save_attempts:
                raise
            time.sleep(1.0)
    return checkpoint_path
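# Sketch of the reverse operation (an assumption, not part of the original source):
# loading a checkpoint written by save_best_response_checkpoint() back into a trainer.
# Keys were saved with "." replaced by "_dot_", so the mapping is reversed here.
def load_best_response_checkpoint(trainer: DQNTrainer, checkpoint_path: str):
    data = deepdish.io.load(checkpoint_path)
    weights = {k.replace("_dot_", "."): v for k, v in data["weights"].items()}
    trainer.set_weights({"best_response": weights})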
def dqn_trainer_wt_amtft_policies_in_ipd(
    logger_creator,
):
    train_n_replicates = 1
    debug = True
    hparams = get_hyperparameters(
        debug,
        train_n_replicates,
        filter_utilitarian=False,
        env="IteratedPrisonersDilemma",
    )

    _, _, rllib_config = get_rllib_config(
        hparams, welfare_fn=postprocessing.WELFARE_UTILITARIAN
    )

    rllib_config["env"] = IteratedPrisonersDilemma
    rllib_config["seed"] = int(time.time())

    policies = rllib_config["multiagent"]["policies"]
    for policy_id, policy_tuple in policies.items():
        policy_list = list(policy_tuple)
        policy_list[0] = amTFT.AmTFTRolloutsTorchPolicy
        policies[policy_id] = policy_list

    dqn_trainer = DQNTrainer(rllib_config, logger_creator=logger_creator)
    return dqn_trainer
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # Simulator env uses a single map, so better for evaluation/testing.
        # DiscreteWrapper just converts wheel velocities to high level discrete actions.
        return DiscreteWrapper(
            simulator.Simulator(
                map_name=args.map,
                max_steps=2000,
            ))

    # Rather than reuse the env, another one is created later because I can't
    # figure out how to provide register_env with an object, th
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = DQNTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            # Dueling off
            "dueling": False,
            # No hidden layers
            "hiddens": [],
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
def train(config, checkpoint_dir=None):
    trainer = DQNTrainer(config=config, env='BomberMan-v0')
    # trainer.restore('C:\\Users\\Florian\\ray_results\\PPO_BomberMan-v0_2021-03-16_09-20-44984tj3ip\\checkpoint_002770\\checkpoint-2770')
    iter = 0

    # def update_phase(ev):
    #     ev.foreach_env(lambda e: e.set_phase(phase))

    while True:
        iter += 1
        result = trainer.train()
        if iter % 250 == 0:
            if not os.path.exists(f'./model-{iter}'):
                trainer.get_policy('policy_01').export_model(f'./model-{iter}')
            else:
                print("model already saved")
def train(num_iters, checkpoint_freq):
    obs_space = spaces.Dict({
        'obs': spaces.Box(low=-0.5, high=1.5, shape=(32, 32, 3), dtype=np.float32),
        'action_mask': spaces.Box(low=0, high=1, shape=(5,), dtype=np.int32),
    })
    act_space = spaces.Discrete(n=5)

    trainer = DQNTrainer(
        env='SUMOEnv-v0',
        config={
            'model': {
                'custom_model': 'adaptive-trafficlight',
                'custom_options': {},
            },
            'multiagent': {
                'policy_graphs': {
                    'default_policy_graph': (
                        DQNPolicyGraph,
                        obs_space,
                        act_space,
                        {},
                    ),
                },
                'policy_mapping_fn': function(lambda _: 'default_policy_graph'),
            },
            'hiddens': [],  # Don't postprocess the action scores
            'callbacks': {
                'on_episode_end': function(on_episode_end),
            },
            # 'num_workers': 4,
            # 'num_gpus_per_worker': 0.25,  # All workers on a single GPU
            'timesteps_per_iteration': 20000,
        })

    for i in range(num_iters):
        print(f'== Iteration {i}==')
        print(pretty_print(trainer.train()))

        if i % checkpoint_freq == 0:
            checkpoint = trainer.save()
            print(f'\nCheckpoint saved at {checkpoint}\n')
def test_policy_save_restore(self):
    config = DEFAULT_CONFIG.copy()
    for _ in framework_iterator(config):
        trainer = DQNTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        state1 = policy.get_state()
        trainer.train()
        state2 = policy.get_state()
        check(state1["_exploration_state"]["last_timestep"],
              state2["_exploration_state"]["last_timestep"],
              false=True)
        check(state1["global_timestep"],
              state2["global_timestep"],
              false=True)
        # Reset policy to its original state and compare.
        policy.set_state(state1)
        state3 = policy.get_state()
        # Make sure everything is the same.
        check(state1, state3)
def train_model(args):
    # We are using a custom model and environment, which need to be registered in ray/rllib.
    # Names can be anything.
    register_env("DuckieTown-MultiMap", lambda _: DiscreteWrapper(MultiMapEnv()))

    # Define the trainer. Apart from env, config/framework and config/model are common among trainers.
    # Here is a list of default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are also these additional keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-dqn",
            },
            "learning_starts": 500,
            # Doing this allows us to record images from the DuckieTown Gym! Might be useful for report.
            # "record_env": True,
            "train_batch_size": 16,
            # Use a very small buffer to reduce memory usage, default: 50_000.
            "buffer_size": 1000,
            # Dueling off
            "dueling": False,
            # No hidden layers
            "hiddens": [],
            # Don't save experiences.
            # "output": None,
            # "compress_observations": True,
            "num_workers": 0,
            "num_gpus": 0.5,
            "rollout_fragment_length": 50,
        })

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('dqn_agent')
    for i in range(args.epochs):  # Number of episodes (basically epochs)
        print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs a single training iteration
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Clean up CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('DQN DuckieTown-MultiMap')
def ray_server(run='PPO', address=ADDRESS, port=PORT):
    print(ray.init(log_to_driver=False))

    connector_config = {
        "input": (lambda ioctx: PolicyServerInput(ioctx, address, port)),
        "num_workers": 0,
        "input_evaluation": [],
        "create_env_on_driver": False,
        "num_gpus": FLAGS.num_gpus,
    }

    if run == "DQN":
        trainer = DQNTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_DQN))
    elif run == "PPO":
        trainer = PPOTrainer(env=ExternalAtari,
                             config=dict(connector_config, **CONFIG_PPO))
    else:
        raise ValueError("--run must be DQN or PPO")

    i = 0
    while i < FLAGS.iter:
        i += 1
        print(pretty_print(trainer.train()))

    ray.shutdown()

    checkpoint = trainer.save("{}/ckpts".format(FLAGS.train_url.rstrip('/')))
    print("checkpoint saved at", checkpoint)

    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)), "config.json"),
        os.path.join(FLAGS.train_url, "config.json"))
    mox.file.copy(
        os.path.join(os.path.abspath(os.path.dirname(__file__)), "customize_service.py"),
        os.path.join(FLAGS.train_url, "customize_service.py"))
    mox.file.copy(os.path.join(FLAGS.data_url, "rl_config.py"),
                  os.path.join(FLAGS.train_url, "rl_config.py"))

    del trainer
def test_train_cartpole_off_policy(self):
    register_env(
        "test3",
        lambda _: PartOffPolicyServing(gym.make("CartPole-v0"), off_pol_frac=0.2))
    config = {
        "num_workers": 0,
        "exploration_config": {
            "epsilon_timesteps": 100
        },
    }
    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        dqn = DQNTrainer(env="test3", config=config)
        reached = False
        for i in range(50):
            result = dqn.train()
            print("Iteration {}, reward {}, timesteps {}".format(
                i, result["episode_reward_mean"], result["timesteps_total"]))
            if result["episode_reward_mean"] >= 80:
                reached = True
                break
        if not reached:
            raise Exception("failed to improve reward")
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')

    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo', type=str, default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str,
                        help='checkpoint to restore for inference')
    parser.add_argument('--epoch', type=int, default=100,
                        help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10 ** 3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size')
    parser.add_argument('--state_time_span', type=int, default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30,
                        help='time interval to collect data')
    args = parser.parse_args()

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config: CityflowGymEnv(config_env))
    config_agent = agent_config(config_env)

    trainer = DQNTrainer(env=CityflowGymEnv, config=config_agent)
    for i in range(1000):
        # Perform one iteration of training the policy with DQN
        result = trainer.train()
        print(pretty_print(result))

        if (i + 1) % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
class DQNrl(object):
    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = DQNTrainer(config=self.config, env=env)

    def fit(self, checkpoint=None):
        if checkpoint is None:
            checkpoint = os.path.join(os.getcwd(), 'data/checkpoint_rl.pkl')
        for idx in trange(5):
            result = self.agent.train()
            LOGGER.warning('result: %s', result)
            if (idx + 1) % 5 == 0:
                LOGGER.warning('Save checkpoint at: {}'.format(idx + 1))
                state = self.agent.save_to_object()
                with open(checkpoint, 'wb') as fp:
                    pickle.dump(state, fp, protocol=pickle.HIGHEST_PROTOCOL)
        return result

    def predict(self, checkpoint=None):
        if checkpoint is not None:
            with open(checkpoint, 'rb') as fp:
                state = pickle.load(fp)
            self.agent.restore_from_object(state)

        done = False
        episode_reward = 0
        obs = self.env.reset()
        actions = []
        while not done:
            action = self.agent.compute_action(obs)
            actions.append(action)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward
        results = {'action': actions, 'reward': episode_reward}
        return results
def train_model(args, config):
    # Define the trainer. Apart from env, config/framework and config/model are common among trainers.
    # Here is a list of default config keys/values:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # For DQN specifically there are also these additional keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(
        env="DuckieTown-MultiMap",
        config=config,
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    best_mean_reward = -np.inf
    epoch_of_best_mean_reward = 0
    path_of_best_mean_reward = None

    for i in trange(args.epochs, desc="Epochs", leave=False):
        # print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs a single training iteration
        result = trainer.train()
        # print(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        # print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        if result["episode_reward_mean"] > best_mean_reward:
            best_mean_reward = result["episode_reward_mean"]
            epoch_of_best_mean_reward = i
            path_of_best_mean_reward = checkpoint_path

        # Clean up CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        # print(torch.cuda.memory_summary(device=None, abbreviated=False))

    return best_mean_reward, epoch_of_best_mean_reward, path_of_best_mean_reward
"policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["ppo_policy"], }, "explore": False, # disable filters, otherwise we would need to synchronize those # as well to the DQN agent "observation_filter": "NoFilter", "framework": "torch" if args.torch else "tf", }) dqn_trainer = DQNTrainer( env="multi_agent_cartpole", config={ "multiagent": { "policies": policies, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["dqn_policy"], }, "gamma": 0.95, "n_step": 3, "framework": "torch" if args.torch or args.mixed_torch_tf else "tf" }) # You should see both the printed X and Y approach 200 as this trains: # info: # policy_reward_mean: # dqn_policy: X # ppo_policy: Y for i in range(args.stop_iters): print("== Iteration", i, "==") # improve the DQN policy
    # Use a single worker process to run the server.
    "num_workers": 0,
    # Disable OPE, since the rollouts are coming from online clients.
    "input_evaluation": [],
}

if args.run == "DQN":
    # Example of using DQN (supports off-policy actions).
    trainer = DQNTrainer(
        env=env,
        config=dict(
            connector_config, **{
                "exploration_config": {
                    "type": "EpsilonGreedy",
                    "initial_epsilon": 1.0,
                    "final_epsilon": 0.02,
                    "epsilon_timesteps": 1000,
                },
                "learning_starts": 100,
                "timesteps_per_iteration": 200,
                "log_level": "INFO",
                "framework": args.framework,
            }))
elif args.run == "PPO":
    # Example of using PPO (does NOT support off-policy actions).
    trainer = PPOTrainer(
        env=env,
        config=dict(
            connector_config, **{
                "sample_batch_size": 1000,
                "train_batch_size": 4000,
                "framework": args.framework,
            }))
def train(config, reporter):
    trainer = DQNTrainer(config=config, env=Coach)
    for _ in range(11):
        print(_)
        trainer.train()
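# Sketch (an assumption, not part of the original source): launching the
# train() trainable above through Ray Tune. The config contents are
# placeholders; train() runs its own 11 iterations and then returns.
from ray import tune

tune.run(
    train,                      # custom trainable defined above
    config={"num_workers": 0},  # assumed minimal DQN config for the Coach env
)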
_____________________________________________________________________
value_out (Dense)            (None, 1)          257      fc_value_2[0][0]
=====================================================================
Total params: 134,915
Trainable params: 134,915
Non-trainable params: 0
_____________________________________________________________________
"""
# __query_action_dist_end__


# __get_q_values_dqn_start__
# Get a reference to the model through the policy
import numpy as np
from ray.rllib.agents.dqn import DQNTrainer

trainer = DQNTrainer(env="CartPole-v0", config={"framework": "tf2"})
model = trainer.get_policy().model
# <ray.rllib.models.catalog.FullyConnectedNetwork_as_DistributionalQModel ...>

# List of all model variables
model.variables()

# Run a forward pass to get base model output. Note that complex observations
# must be preprocessed. An example of preprocessing is examples/saving_experiences.py
model_out = model({"obs": np.array([[0.1, 0.2, 0.3, 0.4]])})
# (<tf.Tensor: id=832, shape=(1, 256), dtype=float32, numpy=...)

# Access the base Keras models (all default models have a base)
model.base_model.summary()
"""
Model: "model"
        # as well to the DQN agent
        "observation_filter": "MeanStdFilter",
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": args.framework,
    })

dqn_trainer = DQNTrainer(
    env="multi_agent_cartpole",
    config={
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["dqn_policy"],
        },
        "model": {
            "vf_share_layers": True,
        },
        "gamma": 0.95,
        "n_step": 3,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": args.framework,
    })

# You should see both the printed X and Y approach 200 as this trains:
# info:
#   policy_reward_mean:
#     dqn_policy: X
#     ppo_policy: Y
for i in range(args.stop_iters):
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')

    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config', type=str, default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo', type=str, default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str,
                        help='checkpoint to restore for inference')
    parser.add_argument('--epoch', type=int, default=10,
                        help='number of training epochs')
    parser.add_argument('--num_step', type=int, default=10**3,
                        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size')
    parser.add_argument('--state_time_span', type=int, default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30,
                        help='time interval to collect data')
    args = parser.parse_args()

    ### dw ###
    # parser.add_argument("--num-agents", type=int, default=6)

    model_dir = "model/{}_{}".format(args.algo, date)
    result_dir = "result/{}_{}".format(args.algo, date)

    config_env = env_config(args)
    num_agents = len(config_env["intersection_id"])
    '''
    obs_space = Tuple([
        CityFlowEnvRay.observation_space for _ in range(num_agents)
    ])
    act_space = Tuple([
        CityFlowEnvRay.action_space for _ in range(num_agents)
    ])
    '''
    ### dw ###
    obs_space = CityFlowEnvRay.observation_space
    act_space = CityFlowEnvRay.action_space

    ray.tune.register_env('gym_cityflow',
                          lambda env_config: CityFlowEnvRay(env_config))

    # config_agent = agent_config(config_env)
    # build cityflow environment
    '''
    trainer = DQNTrainer(
        env=CityFlowEnvRay,
        config=config_agent)
    '''
    policies = {
        # "dqn_policy": (None, obs_space, act_space, config_env)
        # "policy_{}".format(i): (None, obs_space, act_space, config_env)
        "policy_{}".format(i): (DQNTFPolicy, obs_space, act_space, {})
        for i in range(num_agents)
    }
    policy_ids = list(policies.keys())

    config_agent = agent_config(config_env, policies, policy_ids)
    trainer = DQNTrainer(env='gym_cityflow', config=config_agent)

    for i in range(1000):
        # Perform one iteration of training the policy with DQN
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
                                                      SERVER_PORT))
        server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
        server.serve_forever()


if __name__ == "__main__":
    ray.init()
    register_env("ECglass-v2", lambda _: ECglassServing())

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
    dqn = DQNTrainer(
        env="ECglass-v2",
        config={
            # Use a single process to avoid needing to set up a load balancer
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging
            "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
        })

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
def run_dqn(self, config):
    # RAY tmp
    temp_dir_full_path_obj = Path(self.ray_temp_dir).resolve()
    temp_dir_full_path_obj.mkdir(parents=True, exist_ok=True)
    temp_dir_full_path = str(temp_dir_full_path_obj)
    # Result paths
    result_dir_path_root = Path(self.run_result_dir).resolve()
    # Separate MDDE output and Ray output
    result_dir_path_ray_obj = result_dir_path_root.joinpath("ray")
    result_dir_path_ray_obj.mkdir(parents=True, exist_ok=True)
    result_dir_path_ray = str(result_dir_path_ray_obj)
    result_dir_path_mdde_obj = result_dir_path_root.joinpath("mdde")
    result_dir_path_mdde_obj.mkdir(parents=True, exist_ok=True)
    result_dir_path_mdde = str(result_dir_path_mdde_obj)
    # Config
    config_file_full_path = str(Path(self.mdde_registry_config).resolve())
    # MDDE tmp
    temp_env_dir = self.env_temp_dir
    os.makedirs(os.path.abspath(temp_env_dir), exist_ok=True)

    ray.init(
        num_gpus=0,
        num_cpus=4,
        # temp_dir=temp_dir_full_path
    )

    mdde_config = ConfigEnvironment(tmp_dir=temp_env_dir,
                                    result_dir=result_dir_path_mdde)

    def make_env(host: str,
                 port: int,
                 reg_config: str,
                 env_config: ConfigEnvironment,
                 write_stats: bool,
                 initial_benchmark: bool = False,
                 do_nothing: bool = True) -> Environment:
        """
        Configure MDDE environment to run default.
        :param host: MDDE registry host or IP.
        :param port: MDDE registry control port.
        :param reg_config: Path to MDDE registry config.
        :param env_config: Environment configuration object.
        :param write_stats: True to write additional analytics info.
        :param initial_benchmark: Execute benchmark immediately upon execution.
        :param do_nothing: Enable or disable the agents' "do_nothing" action.
        :return: MDDE Environment.
        """
        # Ray is peculiar in the way it handles environments; passing a pre-configured environment might cause
        # unexpected behavior. Customize the code of this extension if more complex environments are needed.

        # Create Registry client
        tcp_client = RegistryClientTCP(host, port)
        read_client: PRegistryReadClient = tcp_client
        write_client: PRegistryWriteClient = tcp_client
        ctrl_client: PRegistryControlClient = tcp_client

        # Registry configuration
        config_container = ConfigRegistry()
        config_container.read(reg_config)

        # Create agents
        agents = list()
        idx = 0
        for node in config_container.get_nodes():
            agents.append(
                SingleNodeDefaultAgent(agent_name=node.id,
                                       agent_id=idx,
                                       data_node_id=node.id,
                                       write_stats=write_stats,
                                       allow_do_nothing=do_nothing))
            idx += 1

        # Create scenario
        scenario = DefaultScenario(
            num_fragments=20,
            num_steps_before_bench=config.bench_psteps,
            agents=agents,
            benchmark_clients=config.bench_clients,  # Number of YCSB threads
            write_stats=write_stats)

        # Create environment
        environment = Environment(config=env_config,
                                  scenario=scenario,
                                  registry_ctrl=ctrl_client,
                                  registry_write=write_client,
                                  registry_read=read_client,
                                  write_stats=write_stats)
        # Re-generate data
        environment.initialize_registry(with_benchmark=initial_benchmark)

        return environment

    def obs_shaper_2d_box(obs):
        """Reshapes the environment into a form suitable for a 2D box. Example 1.
        Note: Guaranteed to work only with the Default agent - Default scenario combination."""
        # Resulting shape (example for default scenario and default single-node agent: 2 agents, 5 fragments):
        # a_1: [0-4(allocation) 5-9(popularity) 10-14(ownership binary flag)]
        # a_2: [0-4(allocation) 5-9(popularity) 10-14(ownership binary flag)]
        # Hint: 2D array where rows are agents, and attributes in columns are as shown above.
        return obs.reshape((obs.shape[0], obs.shape[1] * obs.shape[2]), order='F')

    def obs_shaper_flat_box(obs):
        """Reshapes the environment into a form suitable for a 2D 'flat' box. Example 2.
        Note: Guaranteed to work only with the Default agent - Default scenario combination."""
        # Resulting shape (example for default scenario and default single-node agent: 2 agents, 5 fragments):
        # [0-4(a_1: allocation) 5-9(a_1: popularity) 10-14(a_1: ownership binary flag)
        #  15-19(a_2: allocation) 20-24(a_2: popularity) 25-29(a_2: ownership binary flag)]
        return obs.reshape((obs.shape[0], obs.shape[1] * obs.shape[2]), order='F') \
                  .reshape((obs.shape[0] * obs.shape[1] * obs.shape[2]), order='C')

    sample_selected_shaper = obs_shaper_flat_box
    """Observation shaper selected. Set None if you want to use the default one in the wrapper."""

    # Create and initialize the environment before passing it to Ray.
    # This makes it impossible to run multiple instances of the environment; however, that's intentional due to
    # the nature of the environment, which is represented as a distributed infrastructure of services and can't
    # be easily created and destroyed like a simple local game-like environment.
    env_instance = MddeMultiAgentEnv(
        env=make_env(host=self.mdde_registry_host,
                     port=self.mdde_registry_port,
                     reg_config=config_file_full_path,
                     env_config=mdde_config,
                     write_stats=False,
                     initial_benchmark=False,
                     do_nothing=config.do_nothing),
        observation_shaper=sample_selected_shaper)

    def env_creator(kvargs):
        env = make_env(**kvargs)
        return MddeMultiAgentEnv(env=env,
                                 observation_shaper=sample_selected_shaper)

    register_env("mdde", env_creator)

    # Generate policies based on the created environment instance
    def gen_policy(i):
        return (None, env_instance.observation_space_dict[i],
                env_instance.action_space_dict[i], {
                    "agent_id": i,
                    "obs_space_dict": env_instance.observation_space_dict[i],
                    "act_space_dict": env_instance.action_space_dict[i],
                })

    policies = {
        "policy_%d" % i: gen_policy(i)
        for i in env_instance.action_space_dict.keys()
    }
    policy_ids = list(policies.keys())

    def policy_mapping_fn(agent_id):
        return policy_ids[agent_id]

    exp_name = "DQN_MDDE_DEBUG"
    exp_config = {
        # === Log ===
        "log_level": "ERROR",

        # === Environment ===
        "env_config": {
            "host": self.mdde_registry_host,
            "port": self.mdde_registry_port,
            "reg_config": config_file_full_path,
            "env_config": mdde_config,
            "write_stats": True,
            "do_nothing": config.do_nothing
        },
        "num_envs_per_worker": 1,
        "horizon": config.ep_len,

        # === Policy Config ===
        # --- Model ---
        "n_step": 1,
        # "gamma": config.gamma,

        # --- Replay buffer ---
        "buffer_size": config.buffer_size,

        # --- Optimization ---
        "lr": config.lr,
        "learning_starts": config.learning_starts,
        "train_batch_size": self.TRAIN_BATCH_SIZE,
        "batch_mode": "truncate_episodes",

        # --- Parallelism ---
        "num_workers": 0,
        "num_gpus": 0,
        "num_gpus_per_worker": 0,

        # === Multi-agent setting ===
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": ray.tune.function(policy_mapping_fn)
        },
    }

    if config.debug:
        # Run DQN within the same process (useful for debugging)
        dqn_trainer = DQNTrainer(env="mdde", config=exp_config)
        for step in range(0, config.num_episodes * config.ep_len):
            dqn_trainer.train()
    else:
        trainer = DQNTrainer
        run_experiments(
            {
                exp_name: {
                    "run": trainer,
                    "env": "mdde",
                    "stop": {
                        "episodes_total": config.num_episodes,
                    },
                    "checkpoint_freq": 0,
                    "local_dir": result_dir_path_ray,
                    "restore": False,
                    "config": exp_config
                },
            },
            verbose=0,
            reuse_actors=False)  # reuse_actors=True - messes up the results
    # Use the connector server to generate experiences.
    "input": (
        lambda ioctx: PolicyServerInput(ioctx, SERVER_ADDRESS, SERVER_PORT)
    ),
    # Use a single worker process to run the server.
    "num_workers": 0,
    # Disable OPE, since the rollouts are coming from online clients.
    "input_evaluation": [],
}

if args.run == "DQN":
    # Example of using DQN (supports off-policy actions).
    trainer = DQNTrainer(
        env=env,
        config=dict(
            connector_config, **{
                "learning_starts": 100,
                "timesteps_per_iteration": 200,
                "framework": args.framework,
            }))
elif args.run == "PPO":
    # Example of using PPO (does NOT support off-policy actions).
    trainer = PPOTrainer(
        env=env,
        config=dict(
            connector_config, **{
                "rollout_fragment_length": 1000,
                "train_batch_size": 4000,
                "framework": args.framework,
            }))
else:
    raise ValueError("--run must be DQN or PPO")
    policy.q_loss.stats.update({"q_loss": policy.q_loss.loss})
    loss = policy.q_model.extra_loss(policy.q_loss.loss, train_batch,
                                     policy.q_loss.stats)
    return loss


def _compute_q_values(policy, model, obs, obs_space, action_space):
    model({
        "obs": obs,
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)
    q_out = model.get_q_out()
    return q_out["value"], q_out["logits"], q_out["dist"]


LegalActionDQNPolicy = DQNTFPolicy.with_updates(
    name="LegalActionDQNPolicy",
    action_sampler_fn=build_q_networks,
    loss_fn=build_q_losses)

LegalActionDQNTrainer = DQNTrainer.with_updates(
    name="LegalActionDQN", default_policy=LegalActionDQNPolicy)

LegalActionApexTrainer = LegalActionDQNTrainer.with_updates(
    name="LegalActionAPEX",
    default_config=APEX_DEFAULT_CONFIG,
    **APEX_TRAINER_PROPERTIES)
"explore": False, # disable filters, otherwise we would need to synchronize those # as well to the DQN agent "observation_filter": "NoFilter", # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), "framework": "torch" if args.torch else "tf", }) dqn_trainer = DQNTrainer( env="multi_agent_cartpole", config={ "multiagent": { "policies": policies, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["dqn_policy"], }, "gamma": 0.95, "n_step": 3, # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), "framework": "torch" if args.torch or args.mixed_torch_tf else "tf" }) # You should see both the printed X and Y approach 200 as this trains: # info: # policy_reward_mean: # dqn_policy: X # ppo_policy: Y for i in range(args.stop_iters): print("== Iteration", i, "==")