def testEvaluationOption(self):
    """Check how often the "evaluation" entry of the train result changes.

    With ``evaluation_interval`` set to 2, five consecutive ``train()``
    results are compared pairwise: iterations 0/1 and 2/3 must report the
    same evaluation entry, while 1/2 and 3/4 must differ.
    """
    ray.init()
    trainer = DQNTrainer(
        env="CartPole-v0", config={"evaluation_interval": 2})

    # Run all five iterations up front; the checks below only read results.
    first, second, third, fourth, fifth = [
        trainer.train() for _ in range(5)
    ]

    self.assertTrue("evaluation" in first)
    self.assertTrue("episode_reward_mean" in first["evaluation"])

    self.assertEqual(first["evaluation"], second["evaluation"])
    self.assertNotEqual(second["evaluation"], third["evaluation"])
    self.assertEqual(third["evaluation"], fourth["evaluation"])
    self.assertNotEqual(fourth["evaluation"], fifth["evaluation"])
def train(env_name):
    """Train a multi-agent DQN with the masked-actions CNN for 1000 iterations.

    Registers ``MaskedActionsCNN`` under the name "masked_actions_model",
    builds a ``DQNTrainer`` for ``env_name`` and runs the training loop,
    printing the result and saving a checkpoint on each iteration.

    Args:
        env_name: Name of the environment, used both as the trainer's
            ``env`` argument and as the "env" config entry.

    Relies on the module-level names ``policies`` and ``policy_mapping``.
    """
    ModelCatalog.register_custom_model("masked_actions_model",
                                       MaskedActionsCNN)
    model_config = {
        "custom_model": "masked_actions_model",
        "conv_filters": [[16, [2, 2], 1], [32, [2, 2], 1]],
        "conv_activation": "elu",
        "fcnet_hiddens": [128],
        "fcnet_activation": "elu",
    }
    tune_config = {
        "num_workers": 24,
        "num_gpus": 1,
        "batch_mode": "complete_episodes",
        "model": model_config,
        "env": env_name,
        "lr": 0.001,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping,
        },
        "framework": "tf",
    }
    trainer = DQNTrainer(env=env_name, config=tune_config)
    for i in range(1000):
        print("== Iteration {}==".format(i))
        results = trainer.train()
        # pretty_print() only returns a formatted string; it must be printed
        # explicitly, otherwise the iteration result is silently dropped.
        print(pretty_print(results))
        # NOTE(review): the original indentation was lost; checkpointing is
        # assumed to happen on every iteration - confirm.
        checkpoint = trainer.save()
        print("\nCheckpoint saved at {}\n".format(checkpoint))
def dqn_train(config, reporter):
    """Train a DQN trainer forever, reporting every result.

    Args:
        config: Base trainer config; the overrides below take precedence
            on key collisions.
        reporter: Callable invoked as ``reporter(**result)`` after each
            training iteration.
    """
    overrides = {
        # Upper bound on timesteps used by the annealing schedules.
        "schedule_max_timesteps": 1000000,
        # Minimum env steps per train() call (affects iteration length).
        "timesteps_per_iteration": 1000,
        # Fraction of the training period over which exploration anneals.
        "exploration_fraction": 0.1,
        # Final probability of taking a random action.
        "exploration_final_eps": 0.02,
        "n_step": 3,
        "buffer_size": 500000,
    }
    merged = dict(config)
    merged.update(overrides)
    trainer = DQNTrainer(config=merged)

    while True:
        # One training step per pass through the loop.
        result = trainer.train()
        print(pretty_print(result))
        # Hand the metrics over to the caller-supplied reporter.
        reporter(**result)
def train_model(args):
    """Train a torch DQN on "DuckieTown-MultiMap" and plot the results.

    Args:
        args: Object providing ``model_path`` (checkpoint to restore from,
            skipped when falsy) and ``epochs`` (number of ``train()`` calls).
    """

    def _make_env(_):
        return DiscreteWrapper(MultiMapEnv())

    # The environment has to be registered with RLlib under a name.
    register_env("DuckieTown-MultiMap", _make_env)

    # Common config keys:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # DQN-specific keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer_config = {
        "framework": "torch",
        "model": {
            "custom_model": "image-dqn",
        },
        "learning_starts": 500,
        "train_batch_size": 16,
        # Very small replay buffer to reduce memory usage.
        "buffer_size": 1000,
        # Dueling off.
        "dueling": False,
        # No hidden layers.
        "hiddens": [],
        "num_workers": 0,
        "num_gpus": 0.5,
        "rollout_fragment_length": 50,
    }
    trainer = DQNTrainer(env="DuckieTown-MultiMap", config=trainer_config)

    # Resume from a checkpoint when one was supplied.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('dqn_agent')
    for i in range(args.epochs):
        print(
            f'----------------------- Starting epoch {i} ----------------------- '
        )
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Persist the model after every epoch.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Release cached CUDA memory, then log the memory state.
        torch.cuda.empty_cache()
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('DQN DuckieTown-MultiMap')
def test_reproducing_trajectory(self):
    """Check that the seed determines the reported reward trajectory.

    For each framework three trials are run with seeds 666, 666 and 999.
    The first two must yield identical max/min reward sequences; the
    second and third must differ in more than 8 entries.
    """

    class PickLargest(gym.Env):
        """Single-step env: the reward is the observation entry selected."""

        def __init__(self):
            self.observation_space = gym.spaces.Box(
                low=float("-inf"), high=float("inf"), shape=(4, ))
            self.action_space = gym.spaces.Discrete(4)

        def reset(self, **kwargs):
            self.obs = np.random.randn(4)
            return self.obs

        def step(self, action):
            # The episode always ends after this one step.
            return self.obs, self.obs[action], True, {}

    def env_creator(env_config):
        return PickLargest()

    # Trials 0 and 1 share a seed; trial 2 uses a different one.
    trial_seeds = (666, 666, 999)

    for fw in framework_iterator(frameworks=("tf", "torch")):
        trajs = []
        for seed in trial_seeds:
            ray.init()
            register_env("PickLargest", env_creator)
            agent = DQNTrainer(
                config={
                    "seed": seed,
                    "min_time_s_per_reporting": 0,
                    "timesteps_per_iteration": 100,
                    "framework": fw,
                },
                env="PickLargest")

            trajectory = []
            for _ in range(8):
                r = agent.train()
                trajectory.append(r["episode_reward_max"])
                trajectory.append(r["episode_reward_min"])
            trajs.append(trajectory)
            ray.shutdown()

        # Same seed -> no position may differ.
        any_mismatch = any(a != b for a, b in zip(trajs[0], trajs[1]))
        self.assertTrue(not any_mismatch)

        # Different seeds -> most positions are expected to differ.
        diff_cnt = sum(1 for a, b in zip(trajs[1], trajs[2]) if a != b)
        self.assertTrue(diff_cnt > 8)
def test_policy_save_restore(self):
    """Check that a policy state snapshot can be restored via set_state().

    One training iteration must change both the exploration timestep and
    the global timestep; restoring the earlier snapshot must bring the
    policy state back to it.
    """
    config = DEFAULT_CONFIG.copy()
    for _ in framework_iterator(config):
        trainer = DQNTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()

        before = policy.get_state()
        trainer.train()
        after = policy.get_state()

        # Training must have advanced both counters (false=True: not equal).
        check(
            before["_exploration_state"]["last_timestep"],
            after["_exploration_state"]["last_timestep"],
            false=True)
        check(before["global_timestep"], after["global_timestep"], false=True)

        # Roll back to the snapshot and verify nothing differs from it.
        policy.set_state(before)
        restored = policy.get_state()
        check(before, restored)
def testTrainCartpoleOffPolicy(self):
    """Train DQN on the partially off-policy CartPole serving env.

    Passes as soon as the mean episode reward reaches 100; raises after
    100 iterations without getting there.
    """

    def _make_env(_):
        return PartOffPolicyServing(
            gym.make("CartPole-v0"), off_pol_frac=0.2)

    register_env("test3", _make_env)
    dqn = DQNTrainer(env="test3", config={"exploration_fraction": 0.001})

    for i in range(100):
        result = dqn.train()
        mean_reward = result["episode_reward_mean"]
        print("Iteration {}, reward {}, timesteps {}".format(
            i, mean_reward, result["timesteps_total"]))
        if mean_reward >= 100:
            break
    else:
        # The loop ran out without ever hitting the reward threshold.
        raise Exception("failed to improve reward")
def train(config, checkpoint_dir=None):
    """Train DQN on 'BomberMan-v0' forever, exporting a model periodically.

    Every 250th iteration the 'policy_01' model is exported to
    ``./model-<iteration>`` unless that path already exists.

    Args:
        config: Trainer configuration passed to ``DQNTrainer``.
        checkpoint_dir: Not used by this function.
    """
    trainer = DQNTrainer(config=config, env='BomberMan-v0')
    iteration = 0
    while True:
        iteration += 1
        result = trainer.train()

        if iteration % 250 != 0:
            continue

        if os.path.exists(f'./model-{iteration}'):
            print("model already saved")
        else:
            trainer.get_policy('policy_01').export_model(
                f'./model-{iteration}')
def train(num_iters, checkpoint_freq):
    """Train DQN on 'SUMOEnv-v0' and checkpoint periodically.

    Args:
        num_iters: Number of ``trainer.train()`` iterations to run.
        checkpoint_freq: A checkpoint is saved whenever the iteration
            index is a multiple of this value (including iteration 0).
    """
    obs_space = spaces.Dict({
        'obs': spaces.Box(
            low=-0.5, high=1.5, shape=(32, 32, 3), dtype=np.float32),
        'action_mask': spaces.Box(
            low=0, high=1, shape=(5, ), dtype=np.int32),
    })
    act_space = spaces.Discrete(n=5)

    # All agents are mapped onto the single policy graph defined here.
    multiagent_config = {
        'policy_graphs': {
            'default_policy_graph': (DQNPolicyGraph, obs_space, act_space,
                                     {}),
        },
        'policy_mapping_fn': function(lambda _: 'default_policy_graph'),
    }
    trainer_config = {
        'model': {
            'custom_model': 'adaptive-trafficlight',
            'custom_options': {},
        },
        'multiagent': multiagent_config,
        # Don't postprocess the action scores.
        'hiddens': [],
        'callbacks': {
            'on_episode_end': function(on_episode_end),
        },
        'timesteps_per_iteration': 20000,
    }
    trainer = DQNTrainer(env='SUMOEnv-v0', config=trainer_config)

    for i in range(num_iters):
        print(f'== Iteration {i}==')
        print(pretty_print(trainer.train()))

        if i % checkpoint_freq != 0:
            continue
        checkpoint = trainer.save()
        print(f'\nCheckpoint saved at {checkpoint}\n')
def train_model(args, config):
    """Train DQN on "DuckieTown-MultiMap" and track the best epoch.

    Args:
        args: Object providing ``model_path`` (checkpoint to restore from,
            skipped when falsy) and ``epochs`` (number of ``train()`` calls).
        config: Trainer configuration passed to ``DQNTrainer``.

    Returns:
        Tuple ``(best_mean_reward, epoch_of_best_mean_reward,
        path_of_best_mean_reward)``; ``(-inf, 0, None)`` when no epoch
        produced a mean reward above ``-inf``.
    """
    # Common config keys:
    # https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # DQN-specific keys:
    # https://docs.ray.io/en/master/rllib-algorithms.html#dqn
    trainer = DQNTrainer(env="DuckieTown-MultiMap", config=config)

    # Resume from a checkpoint when one was supplied.
    if args.model_path:
        trainer.restore(args.model_path)

    # (mean reward, epoch index, checkpoint path) of the best epoch so far.
    best = (-np.inf, 0, None)

    for i in trange(args.epochs, desc="Epochs", leave=False):
        result = trainer.train()
        # A checkpoint is written after every epoch.
        checkpoint_path = trainer.save()

        mean_reward = result["episode_reward_mean"]
        if mean_reward > best[0]:
            best = (mean_reward, i, checkpoint_path)

        # Release cached CUDA memory to keep usage down.
        torch.cuda.empty_cache()

    return best
def ray_server(run='PPO', address=ADDRESS, port=PORT):
    """Serve a policy over ``PolicyServerInput``, train it and export files.

    Args:
        run: Algorithm to use, "DQN" or "PPO".
        address: Address handed to ``PolicyServerInput``.
        port: Port handed to ``PolicyServerInput``.

    Raises:
        ValueError: If ``run`` is neither "DQN" nor "PPO".
    """
    print(ray.init(log_to_driver=False))

    def _server_input(ioctx):
        return PolicyServerInput(ioctx, address, port)

    connector_config = {
        "input": _server_input,
        "num_workers": 0,
        "input_evaluation": [],
        "create_env_on_driver": False,
        "num_gpus": FLAGS.num_gpus,
    }

    if run == "DQN":
        trainer = DQNTrainer(
            env=ExternalAtari, config=dict(connector_config, **CONFIG_DQN))
    elif run == "PPO":
        trainer = PPOTrainer(
            env=ExternalAtari, config=dict(connector_config, **CONFIG_PPO))
    else:
        raise ValueError("--run must be DQN or PPO")

    completed = 0
    while completed < FLAGS.iter:
        completed += 1
        print(pretty_print(trainer.train()))

    ray.shutdown()
    checkpoint = trainer.save("{}/ckpts".format(FLAGS.train_url.rstrip('/')))
    print("checkpoint saved at", checkpoint)

    # Ship the service files that live next to this module.
    here = os.path.abspath(os.path.dirname(__file__))
    for filename in ("config.json", "customize_service.py"):
        mox.file.copy(
            os.path.join(here, filename),
            os.path.join(FLAGS.train_url, filename))

    # rl_config.py comes from the data location instead.
    mox.file.copy(
        os.path.join(FLAGS.data_url, "rl_config.py"),
        os.path.join(FLAGS.train_url, "rl_config.py"))
    del trainer
def test_train_cartpole_off_policy(self):
    """Train DQN on the partially off-policy CartPole env per framework.

    For each of "tf" and "torch" the trainer gets up to 50 iterations to
    reach a mean episode reward of 80; otherwise an exception is raised.
    """

    def _make_env(_):
        return PartOffPolicyServing(
            gym.make("CartPole-v0"), off_pol_frac=0.2)

    register_env("test3", _make_env)

    config = {
        "num_workers": 0,
        "exploration_config": {
            "epsilon_timesteps": 100
        },
    }
    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        dqn = DQNTrainer(env="test3", config=config)
        for i in range(50):
            result = dqn.train()
            mean_reward = result["episode_reward_mean"]
            print("Iteration {}, reward {}, timesteps {}".format(
                i, mean_reward, result["timesteps_total"]))
            if mean_reward >= 80:
                break
        else:
            # All 50 iterations finished below the reward threshold.
            raise Exception("failed to improve reward")
def main():
    """Parse the command line and train DQN on ``CityflowGymEnv``.

    Runs 1000 training iterations, printing each result and saving a
    checkpoint after every 100th iteration.

    NOTE(review): ``--epoch`` and ``--save_freq`` are parsed but the loop
    below uses fixed values (1000 / 100); only ``env_config(args)``
    consumes the parsed arguments here.
    """
    ray.init()
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str,
                        default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo', type=str, default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true",
                        help='inference or training')
    # The help texts of --ckpt and --batch_size were copy-pasted from
    # neighbouring options and described the wrong thing.
    parser.add_argument('--ckpt', type=str, help='checkpoint path')
    parser.add_argument('--epoch', type=int, default=100,
                        help='number of training epochs')
    parser.add_argument(
        '--num_step', type=int, default=10 ** 3,
        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size')
    parser.add_argument('--state_time_span', type=int, default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30,
                        help='time interval to collect data')
    args = parser.parse_args()

    config_env = env_config(args)
    config_agent = agent_config(config_env)

    trainer = DQNTrainer(env=CityflowGymEnv, config=config_agent)
    for i in range(1000):
        # Perform one iteration of training the policy with DQN.
        result = trainer.train()
        print(pretty_print(result))

        if (i + 1) % 100 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
class DQNrl(object):
    """Thin wrapper pairing an environment instance with a ``DQNTrainer``."""

    def __init__(self, env, env_config, config):
        """Create the environment and the trainer.

        Args:
            env: Environment class; instantiated as ``env(env_config)`` and
                also passed to ``DQNTrainer`` as ``env``.
            env_config: Configuration handed to the environment.
            config: Trainer config. Its 'env_config' entry is overwritten
                in place with ``env_config``.
        """
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = DQNTrainer(config=self.config, env=env)

    def fit(self, checkpoint=None):
        """Run five training iterations and pickle the trainer state.

        Args:
            checkpoint: File path for the pickled state. Defaults to
                ``data/checkpoint_rl.pkl`` under the current directory.

        Returns:
            The result of the last ``train()`` call.
        """
        if checkpoint is None:
            checkpoint = os.path.join(os.getcwd(), 'data/checkpoint_rl.pkl')
        for idx in trange(5):
            result = self.agent.train()
            # The argument needs a %s placeholder; without it the logging
            # module reports a formatting error instead of the result.
            LOGGER.warning('result: %s', result)
            if (idx + 1) % 5 == 0:
                LOGGER.warning('Save checkpoint at: {}'.format(idx + 1))
                state = self.agent.save_to_object()
                with open(checkpoint, 'wb') as fp:
                    pickle.dump(state, fp, protocol=pickle.HIGHEST_PROTOCOL)
        return result

    def predict(self, checkpoint=None):
        """Roll out one episode with the agent's actions.

        Args:
            checkpoint: Optional path to a pickled trainer state that is
                restored before the rollout.

        Returns:
            Dict with 'action' (list of actions taken) and 'reward'
            (sum of rewards over the episode).
        """
        if checkpoint is not None:
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only load checkpoints from trusted locations.
            with open(checkpoint, 'rb') as fp:
                state = pickle.load(fp)
            self.agent.restore_from_object(state)
        done = False
        episode_reward = 0
        obs = self.env.reset()
        actions = []
        while not done:
            action = self.agent.compute_action(obs)
            actions.append(action)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward
        results = {'action': actions, 'reward': episode_reward}
        return results
def train(config, reporter):
    """Run eleven DQN training iterations on ``Coach``.

    Args:
        config: Trainer configuration passed to ``DQNTrainer``.
        reporter: Not used by this function.
    """
    trainer = DQNTrainer(config=config, env=Coach)
    iteration = 0
    while iteration < 11:
        # Print the zero-based iteration index before each step.
        print(iteration)
        trainer.train()
        iteration += 1
"buffer_size": 50000, "sample_batch_size": 4, "train_batch_size": 320, "schedule_max_timesteps": 2000000, "exploration_final_eps": 0.01, "exploration_fraction": 0.1, "model": { "dim": 64 } }) def env_creator(env_config): return PodWorldEnv(max_steps=100, reward_factor=1.0) register_env("podworld_env", env_creator) agent = DQNTrainer(config=config, env="podworld_env") agent_save_path = None for i in range(50): stats = agent.train() # print(pretty_print(stats)) if i % 10 == 0 and i > 0: path = agent.save() if agent_save_path is None: agent_save_path = path print('Saved agent at', agent_save_path) logger.write((i, stats['episode_reward_min'])) print('episode_reward_mean', stats['episode_reward_min'])
def main():
    """Parse the command line and train one DQN policy per intersection.

    Registers ``CityFlowEnvRay`` as 'gym_cityflow', creates a
    ``DQNTFPolicy`` entry for every id in ``config_env["intersection_id"]``
    and runs 1000 training iterations, saving a checkpoint every 30th
    iteration (starting with iteration 0).

    NOTE(review): ``--epoch`` and ``--save_freq`` are parsed but the loop
    below uses fixed values (1000 / 30).
    """
    ray.init()
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str,
                        default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo', type=str, default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference', action="store_true",
                        help='inference or training')
    # The help texts of --ckpt and --batch_size were copy-pasted from
    # neighbouring options and described the wrong thing.
    parser.add_argument('--ckpt', type=str, help='checkpoint path')
    parser.add_argument('--epoch', type=int, default=10,
                        help='number of training epochs')
    parser.add_argument(
        '--num_step', type=int, default=10**3,
        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq', type=int, default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='batch size')
    parser.add_argument('--state_time_span', type=int, default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span', type=int, default=30,
                        help='time interval to collect data')
    args = parser.parse_args()

    config_env = env_config(args)
    num_agents = len(config_env["intersection_id"])

    # Every agent uses the spaces declared on the environment class.
    obs_space = CityFlowEnvRay.observation_space
    act_space = CityFlowEnvRay.action_space

    ray.tune.register_env('gym_cityflow',
                          lambda env_config: CityFlowEnvRay(env_config))

    # One independent DQN policy per agent.
    policies = {
        "policy_{}".format(i): (DQNTFPolicy, obs_space, act_space, {})
        for i in range(num_agents)
    }
    policy_ids = list(policies.keys())

    config_agent = agent_config(config_env, policies, policy_ids)

    trainer = DQNTrainer(env='gym_cityflow', config=config_agent)

    for i in range(1000):
        # Perform one iteration of training the policy with DQN.
        result = trainer.train()
        print(pretty_print(result))

        if i % 30 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
if __name__ == "__main__":
    # Serve the "ECglass-v2" environment and train DQN on it indefinitely,
    # recording the latest checkpoint path in CHECKPOINT_FILE.
    ray.init()
    register_env("ECglass-v2", lambda _: ECglassServing())

    # We use DQN since it supports off-policy actions, but you can choose and
    # configure any agent.
    dqn = DQNTrainer(
        env="ECglass-v2",
        config={
            # Use a single process to avoid needing to set up a load balancer
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging
            "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
        })

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        # Read through a context manager so the file handle is closed
        # promptly instead of being left to the garbage collector.
        with open(CHECKPOINT_FILE) as f:
            checkpoint_path = f.read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
# DQN settings for the "LAIMKTEngine" environment.
dqn_config = {
    "v_min": -1.0,
    "v_max": 5.0,
    "hiddens": [128],
    "exploration_config": {
        "epsilon_timesteps": 4000,
    },
    'lr': 5e-5,
    "num_atoms": 2,
    "learning_starts": 100,
    "timesteps_per_iteration": 1200
}

if __name__ == "__main__":
    ray.init()
    register_env("LAIMKTEngine",
                 lambda _: LAIMKTEngine(MKTWorld(env_config), episodes=10000))
    dqn = DQNTrainer(env="LAIMKTEngine", config=dqn_config)

    i = 1
    try:
        # Train until interrupted, printing a one-line summary per iteration.
        while True:
            result = dqn.train()
            print(
                "Iteration {}, Episodes {}, Mean Reward {}, Mean Length {}".
                format(i, result['episodes_this_iter'],
                       result['episode_reward_mean'],
                       result['episode_len_mean']))
            i += 1
    finally:
        # The loop never exits normally, so a shutdown placed after it was
        # unreachable; run it when the loop is interrupted or fails.
        ray.shutdown()
# Manual training loop (no Ray tune). if args.no_tune: if args.run == "DQN": trainer = DQNTrainer(config=config) else: trainer = PPOTrainer(config=config) if checkpoint_path: print("Restoring from checkpoint path", checkpoint_path) trainer.restore(checkpoint_path) # Serving and training loop. ts = 0 for _ in range(args.stop_iters): results = trainer.train() print(pretty_print(results)) checkpoint = trainer.save() print("Last checkpoint", checkpoint) with open(checkpoint_path, "w") as f: f.write(checkpoint) if results["episode_reward_mean"] >= args.stop_reward or \ ts >= args.stop_timesteps: break ts += results["timesteps_total"] # Run with Tune for auto env and trainer creation and TensorBoard. else: stop = { "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps,
from ray.rllib.agents.dqn import DQNTrainer, DQNTorchPolicy
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
from ray.rllib.agents.dqn.dqn_torch_model import DQNTorchModel
from ray.rllib.agents.dqn.simple_q_tf_policy import SimpleQTFPolicy
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.execution.concurrency_ops import Concurrently
from ray.rllib.execution.metric_ops import StandardMetricsReporting
from ray.rllib.execution.replay_buffer import LocalReplayBuffer
from ray.rllib.execution.replay_ops import Replay, StoreToReplayBuffer
from ray.rllib.execution.rollout_ops import ParallelRollouts
from ray.rllib.execution.train_ops import TrainOneStep, UpdateTargetNetwork
from ray.rllib.policy.policy import LEARNER_STATS_KEY, Policy
from ray.rllib.utils.typing import TrainerConfigDict
from ray.util.iter import LocalIterator

config = {
    'gamma': 0.9,
    'lr': 1e-2,
    'num_workers': 4,
    'train_batch_size': 1000,
    'model': {
        'fcnet_hiddens': [128, 128]
    }
}

# with_updates() returns a new trainer *class*; it has to be applied to
# DQNTrainer itself and the resulting class instantiated. Calling it on an
# instance left `trainer` bound to a class, so trainer.train() failed.
# NOTE(review): `execution_plan` is not defined in the visible code - it
# must be defined before this statement runs.
CustomDQNTrainer = DQNTrainer.with_updates(execution_plan=execution_plan)
trainer = CustomDQNTrainer(env="LunarLander-v3", config=config)

# Once enough data is collected the model is updated and the results are
# returned.
results = trainer.train()
'grayscale': False, 'zero_mean': False, 'custom_preprocessor': None, 'custom_model': None, 'custom_action_dist': None, 'custom_options': {} } # episode_len_ls = [] # for i in range(11): # trainer = DQNTrainer(config=config, env=Coach) # episode_states, episode_actions = simulate_episode(trainer, np.array([0, 0 ,0])) # episode_len_ls.append(len(episode_actions)) # print(Counter(episode_actions)) # print('mean len:', np.mean(episode_len_ls)) #### training ##### trainer = DQNTrainer(config=config, env=Coach) for i in range(6666): print('train iteration', i) trainer.train() ################### episode_len_ls = [] for i in range(33): episode_states, episode_actions = simulate_episode(trainer, np.array([0, 0, 0])) episode_len_ls.append(len(episode_actions)) print(Counter(episode_actions)) print('mean len:', np.mean(episode_len_ls))
# --- Baseline: one episode from each of 20 freshly built trainers ---
episode_len_ls = []
episode_actions_ls = []
for i in range(20):
    trainer = DQNTrainer(config=config, env=Coach)
    episode_states, episode_actions = simulate_episode(
        trainer, np.array([0]))
    episode_len_ls.append(len(episode_actions))
    episode_actions_ls.extend(episode_actions)

print(Counter(episode_actions_ls))
print('mean len:', np.mean(episode_len_ls))
print('median len:', np.median(episode_len_ls))

# --- Train a single trainer for 22 iterations ---
trainer = DQNTrainer(config=config, env=Coach)
for train_iter in range(22):
    print('train_iter', train_iter)
    result = trainer.train()

# --- Evaluation: 66 episodes from the trained trainer ---
episode_len_ls = []
episode_actions_ls = []
for i in range(66):
    episode_states, episode_actions = simulate_episode(
        trainer, np.array([0]))
    episode_len_ls.append(len(episode_actions))
    episode_actions_ls.extend(episode_actions)

print(Counter(episode_actions_ls))
print('mean len:', np.mean(episode_len_ls))
print('median len:', np.median(episode_len_ls))
# Two-policy setup on LOREnv1: a fixed "LORHeuristic" policy and a trainable
# "learned" policy with an LSTM model. Only "learned" is trained.
# `env` must already exist here; its spaces are shared by both policies.
config = {
    "env": LOREnv1,
    "gamma": 0.9,
    "num_workers": 0,
    "num_envs_per_worker": 4,
    "rollout_fragment_length": 10,
    "train_batch_size": 500,
    "multiagent": {
        "policies_to_train": ["learned"],
        "policies": {
            "LORHeuristic": (
                LORHeuristic,
                env.observation_space,
                env.action_space,
                {},
            ),
            "learned": (
                None,
                env.observation_space,
                env.action_space,
                {"model": {"use_lstm": True}},
            ),
        },
        "policy_mapping_fn": select_policy,
    },
}

trainer_obj = DQNTrainer(config=config)
# From here on `env` refers to the local worker's environment instance.
env = trainer_obj.workers.local_worker().env

for _ in range(100):
    results = trainer_obj.train()
    # NOTE(review): the original indentation was lost; the score print is
    # assumed to run once per training iteration - confirm.
    print(env.player1_score, env.player2_score)
"log_level": "INFO", "framework": args.framework, })) elif args.run == "PPO": # Example of using PPO (does NOT support off-policy actions). trainer = PPOTrainer(env=env, config=dict( connector_config, **{ "sample_batch_size": 1000, "train_batch_size": 4000, "framework": args.framework, })) else: raise ValueError("--run must be DQN or PPO") checkpoint_path = CHECKPOINT_FILE.format(args.run) # Attempt to restore from checkpoint if possible. if os.path.exists(checkpoint_path): checkpoint_path = open(checkpoint_path).read() print("Restoring from checkpoint path", checkpoint_path) trainer.restore(checkpoint_path) # Serving and training loop while True: print(pretty_print(trainer.train())) checkpoint = trainer.save() print("Last checkpoint", checkpoint) with open(checkpoint_path, "w") as f: f.write(checkpoint)
def run_dqn(self, config):
    """Run DQN on the MDDE multi-agent environment.

    Prepares the temp/result directories, initialises Ray, builds one
    environment instance to derive per-agent policies from, and then either
    trains in-process (``config.debug``) or launches ``run_experiments``.

    :param config: Run settings. This method reads ``bench_psteps``,
        ``bench_clients``, ``do_nothing``, ``ep_len``, ``buffer_size``,
        ``lr``, ``learning_starts``, ``debug`` and ``num_episodes`` from it.
    """
    # RAY tmp
    temp_dir_full_path_obj = Path(self.ray_temp_dir).resolve()
    temp_dir_full_path_obj.mkdir(parents=True, exist_ok=True)
    # Only referenced by the commented-out temp_dir argument of ray.init().
    temp_dir_full_path = str(temp_dir_full_path_obj)
    # Result paths
    result_dir_path_root = Path(self.run_result_dir).resolve()
    # Separate MDDE output and Ray output
    result_dir_path_ray_obj = result_dir_path_root.joinpath("ray")
    result_dir_path_ray_obj.mkdir(parents=True, exist_ok=True)
    result_dir_path_ray = str(result_dir_path_ray_obj)
    result_dir_path_mdde_obj = result_dir_path_root.joinpath("mdde")
    result_dir_path_mdde_obj.mkdir(parents=True, exist_ok=True)
    result_dir_path_mdde = str(result_dir_path_mdde_obj)
    # Config
    config_file_full_path = str(Path(self.mdde_registry_config).resolve())
    # MDDE tmp
    temp_env_dir = self.env_temp_dir
    os.makedirs(os.path.abspath(temp_env_dir), exist_ok=True)

    ray.init(
        num_gpus=0,
        num_cpus=4,
        #temp_dir=temp_dir_full_path
    )

    mdde_config = ConfigEnvironment(tmp_dir=temp_env_dir,
                                    result_dir=result_dir_path_mdde)

    def make_env(host: str,
                 port: int,
                 reg_config: str,
                 env_config: ConfigEnvironment,
                 write_stats: bool,
                 initial_benchmark: bool = False,
                 do_nothing: bool = True) -> Environment:
        """
        Configure MDDE environment to run default.
        :param host: MDDE registry host or IP.
        :param port: MDDE registry control port.
        :param reg_config: Path to MDDE registry config.
        :param env_config: Environment configuration object.
        :param write_stats: True to write additional analytics info.
        :param initial_benchmark: Execute benchmark immediately upon execution.
        :param do_nothing: Enable or disable the agents' "do_nothing" action.
        :return: MDDE Environment.
        """
        # Ray is peculiar in the way it handles environments, passing a pre-configured environment might cause
        # unexpected behavior. Customize the code of this extension if more complex environment are needed

        # Create Registry client
        # The same TCP client object serves as read, write and control client.
        tcp_client = RegistryClientTCP(host, port)
        read_client: PRegistryReadClient = tcp_client
        write_client: PRegistryWriteClient = tcp_client
        ctrl_client: PRegistryControlClient = tcp_client

        # Registry configuration
        config_container = ConfigRegistry()
        config_container.read(reg_config)

        # Create agents
        # One agent per registry node; agent ids are consecutive from 0.
        agents = list()
        idx = 0
        for node in config_container.get_nodes():
            agents.append(
                SingleNodeDefaultAgent(agent_name=node.id,
                                       agent_id=idx,
                                       data_node_id=node.id,
                                       write_stats=write_stats,
                                       allow_do_nothing=do_nothing))
            idx += 1

        # Create scenario
        scenario = DefaultScenario(
            num_fragments=20,
            num_steps_before_bench=config.bench_psteps,
            agents=agents,
            benchmark_clients=config.bench_clients,
            write_stats=write_stats)  # Number of YCSB threads

        # Create environment
        environment = Environment(config=env_config,
                                  scenario=scenario,
                                  registry_ctrl=ctrl_client,
                                  registry_write=write_client,
                                  registry_read=read_client,
                                  write_stats=write_stats)
        # Re-generate data
        environment.initialize_registry(with_benchmark=initial_benchmark)

        return environment

    # Not referenced elsewhere in this method; kept as an alternative to
    # obs_shaper_flat_box.
    def obs_shaper_2d_box(obs):
        """Reshapes the environment into a form suitable for 2D box. Example 1.
        Note: Guaranteed to work only with the Default agent - Default scenario combination."""
        # Resulted shape (Example for default scenario and default single-node agent: 2 agents, 5 fragments):
        # a_1: [0-4(allocation) 5-9(popularity) 10-14(ownership binary flag)]
        # a_2: [0-4(allocation) 5-9(popularity) 10-14(ownership binary flag)]
        # Hint: 2D array where rows are agents, and attributes in columns are as shown above.
        return obs.reshape((obs.shape[0], obs.shape[1] * obs.shape[2]),
                           order='F')

    def obs_shaper_flat_box(obs):
        """Reshapes the environment into a form suitable for 2D 'flat' box. Example 2.
        Note: Guaranteed to work only with the Default agent - Default scenario combination."""
        # Resulted shape (Example for default scenario and default single-node agent: 2 agents, 5 fragments):
        # [0-4(a_1: allocation) 5-9(a_1: popularity) 10-14(a_1: ownership binary flag)
        #  15-19(a_2: allocation) 20-24(a_2: popularity) 25-29(a_2: ownership binary flag)]
        return obs.reshape((obs.shape[0], obs.shape[1] * obs.shape[2]), order='F') \
            .reshape((obs.shape[0] * obs.shape[1] * obs.shape[2]), order='C')

    sample_selected_shaper = obs_shaper_flat_box
    """Observation shaper selected. Set None if you want to use the default one in the wrapper."""

    # Create and initialize environment before passing it to Ray
    # This makes it impossible to run multiple instances of the environment, however it's intentional due to the
    # the nature of the environment that's represented as a distributed infrastructure of services, it can't be
    # easily created and destroyed as a simple local game-like environment
    env_instance = MddeMultiAgentEnv(
        env=make_env(host=self.mdde_registry_host,
                     port=self.mdde_registry_port,
                     reg_config=config_file_full_path,
                     env_config=mdde_config,
                     write_stats=False,
                     initial_benchmark=False,
                     do_nothing=config.do_nothing),
        observation_shaper=sample_selected_shaper)

    def env_creator(kvargs):
        # kvargs is expanded into make_env() keyword arguments; see the
        # "env_config" entry of exp_config below for the keys supplied.
        env = make_env(**kvargs)
        return MddeMultiAgentEnv(env=env,
                                 observation_shaper=sample_selected_shaper)

    register_env("mdde", env_creator)

    # generate policies based on the created environment instance
    def gen_policy(i):
        # Policy class None; spaces are taken from env_instance for agent i.
        return (None, env_instance.observation_space_dict[i],
                env_instance.action_space_dict[i], {
                    "agent_id": i,
                    "obs_space_dict": env_instance.observation_space_dict[i],
                    "act_space_dict": env_instance.action_space_dict[i],
                })

    policies = {
        "policy_%d" % i: gen_policy(i)
        for i in env_instance.action_space_dict.keys()
    }
    policy_ids = list(policies.keys())

    def policy_mapping_fn(agent_id):
        # agent_id is used as a list index into policy_ids.
        return policy_ids[agent_id]

    exp_name = "DQN_MDDE_DEBUG"
    exp_config = {
        # === Log ===
        "log_level": "ERROR",

        # === Environment ===
        "env_config": {
            "host": self.mdde_registry_host,
            "port": self.mdde_registry_port,
            "reg_config": config_file_full_path,
            "env_config": mdde_config,
            "write_stats": True,
            "do_nothing": config.do_nothing
        },
        "num_envs_per_worker": 1,
        "horizon": config.ep_len,

        # === Policy Config ===
        # --- Model ---
        "n_step": 1,
        #"gamma": config.gamma,

        # --- Replay buffer ---
        "buffer_size": config.buffer_size,

        # --- Optimization ---
        "lr": config.lr,
        "learning_starts": config.learning_starts,
        "train_batch_size": self.TRAIN_BATCH_SIZE,
        "batch_mode": "truncate_episodes",

        # --- Parallelism ---
        "num_workers": 0,
        "num_gpus": 0,
        "num_gpus_per_worker": 0,

        # === Multi-agent setting ===
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": ray.tune.function(policy_mapping_fn)
        },
    }

    if config.debug:  # Run DQN within the same process (useful for debugging)
        dqn_trainer = DQNTrainer(env="mdde", config=exp_config)

        # train() is called num_episodes * ep_len times.
        for step in range(0, config.num_episodes * config.ep_len):
            dqn_trainer.train()
    else:
        trainer = DQNTrainer
        run_experiments(
            {
                exp_name: {
                    "run": trainer,
                    "env": "mdde",
                    "stop": {
                        "episodes_total": config.num_episodes,
                    },
                    "checkpoint_freq": 0,
                    "local_dir": result_dir_path_ray,
                    "restore": False,
                    "config": exp_config
                },
            },
            verbose=0,
            reuse_actors=False
        )  # reuse_actors=True - messes up the results
"gamma": 0.95, "n_step": 3, "framework": "torch" if args.torch or args.mixed_torch_tf else "tf" }) # You should see both the printed X and Y approach 200 as this trains: # info: # policy_reward_mean: # dqn_policy: X # ppo_policy: Y for i in range(args.stop_iters): print("== Iteration", i, "==") # improve the DQN policy print("-- DQN --") result_dqn = dqn_trainer.train() print(pretty_print(result_dqn)) # improve the PPO policy print("-- PPO --") result_ppo = ppo_trainer.train() print(pretty_print(result_ppo)) # Test passed gracefully. if args.as_test and \ result_dqn["episode_reward_mean"] > args.stop_reward and \ result_ppo["episode_reward_mean"] > args.stop_reward: print("test passed (both agents above requested reward)") quit(0) # swap weights to synchronize
def dqn_train(config, reporter):
    """Train DQN on "House" forever with a patched training step.

    Installs ``CustomSyncReplayOptimizer`` as the optimizer class, replaces
    the trainer's ``_train`` with ``_custom_train`` (which additionally
    gathers per-episode metrics from the sampled batches) and reports every
    result.

    Args:
        config: Trainer config. Mutated: "optimizer_class" is overwritten.
        reporter: Callable invoked as ``reporter(**result)`` after each
            training iteration.
    """
    # Modify default optimizer to return the batch after each step
    config["optimizer_class"] = "CustomSyncReplayOptimizer"
    # Expose the class on the optimizers module under the configured name.
    setattr(optimizers, "CustomSyncReplayOptimizer", CustomSyncReplayOptimizer)

    # Instantiate a trainer
    cfg = {
        # "n_step"                    : 3,
        # "buffer_size"               : 100000,
        # "sample_batch_size"         : 32,
        # "train_batch_size"          : 128,
        # "learning_starts"           : 5000,
        # "target_network_update_freq": 5000,
        "timesteps_per_iteration": 1000,
        # "num_workers"               : cpu_count(),
        # "per_worker_exploration"    : True,
        # "worker_side_prioritization": True,
        # "min_iter_time_s"           : 1,
    }
    # Entries of cfg take precedence over config on key collisions.
    trainer = DQNTrainer(config={**config, **cfg}, env="House")

    # Modify training loop to receive batches from the optimizer
    # and return custom info in the training result dict
    def _custom_train(self):
        """One training iteration; returns the result dict incl. 'extra_metrics'."""
        start_timestep = self.global_timestep

        # Update worker explorations
        exp_vals = [self.exploration0.value(self.global_timestep)]
        self.local_evaluator.foreach_trainable_policy(
            lambda p, _: p.set_epsilon(exp_vals[0]))
        for i, e in enumerate(self.remote_evaluators):
            exp_val = self.explorations[i].value(self.global_timestep)
            e.foreach_trainable_policy.remote(
                lambda p, _: p.set_epsilon(exp_val))
            exp_vals.append(exp_val)

        # Do optimization steps
        start = time.time()
        # Layout: extra_metrics[metric name][episode id as str] -> values.
        extra_metrics = defaultdict(lambda: defaultdict(list))
        metrics = ['comfort_penalty', 'cost']
        metrics += [f'{r}_temperature' for r in self.local_evaluator.env.rooms]
        # Keep stepping until both the timestep budget and the minimum
        # iteration time have been reached.
        while (self.global_timestep - start_timestep <
               self.config["timesteps_per_iteration"]
               ) or time.time() - start < self.config["min_iter_time_s"]:
            info_dict = self.optimizer.step()
            # step() is expected to return the sampled batch here (see the
            # optimizer override above); only 'default_policy' is read.
            info_dict = info_dict.policy_batches['default_policy'].data
            for metric in metrics:
                for episode_id, info in zip(info_dict['eps_id'],
                                            info_dict['infos']):
                    extra_metrics[metric][str(episode_id)].append(info[metric])
            self.update_target_if_needed()

        if self.config["per_worker_exploration"]:
            # Only collect metrics from the third of workers with lowest eps
            result = self.collect_metrics(
                selected_evaluators=self.
                remote_evaluators[-len(self.remote_evaluators) // 3:])
        else:
            result = self.collect_metrics()

        result.update(timesteps_this_iter=self.global_timestep -
                      start_timestep,
                      info=dict(
                          {
                              "min_exploration": min(exp_vals),
                              "max_exploration": max(exp_vals),
                              "num_target_updates": self.num_target_updates,
                          }, **self.optimizer.stats()))
        result['extra_metrics'] = extra_metrics
        return result

    # Bind the trainer as `self` so the instance attribute acts like a method.
    trainer._train = partial(_custom_train, trainer)

    while True:
        result = trainer.train()  # Executes one training step
        # print(pretty_print(result))
        reporter(**result)  # notifies TrialRunner