def try_ray_ppo_planning(cls, trial_count):
    with mlflow.start_run():
        ray.init()
        config = {
            "env": cls,
            "num_workers": 3,
            "num_gpus": 0,
            "horizon": 5,
            "train_batch_size": 12000,  # val of 128 leads to ~1s per training iteration.
        }
        full_config = DEFAULT_CONFIG.copy()
        for k, v in config.items():
            full_config[k] = v
        pprint(full_config)

        agent = PPOTrainerWithReset(full_config)
        strategy_fn = get_strategy_function(cls, agent)
        strategy_fn.info = "Ray PPO Planning strategy"

        trial_result = do_trials(cls, trial_count, strategy_fn,
                                 max_steps_per_episode=10000, always_print=True)

        checkpoint = agent.save()
        print(f"checkpoint saved at {checkpoint}")
        mlflow.log_metrics(trial_result)
def tune_run(n_agents=3, episode_length=4000, config=None):
    ray.init()
    tf.compat.v1.enable_v2_behavior()

    # initialize trainer
    env = ASMEnv(n_agents=n_agents)
    register_env(
        "asm",
        lambda _: ASMEnv(n_agents=n_agents, episode_length=episode_length))

    policies = {
        "govt_policy": (PPOTFPolicy, env.observation_space,
                        env.govt_action_space, {}),
    }
    for idx in range(n_agents):
        policies[f"citizen_policy_{idx}"] = (PPOTFPolicy, env.observation_space,
                                             env.citizen_action_space, {})

    if config is None:
        ppo_config = DEFAULT_CONFIG.copy()
    else:
        ppo_config = config
    ppo_config["env"] = "asm"
    ppo_config["train_batch_size"] = 400
    ppo_config["timesteps_per_iteration"] = episode_length
    ppo_config["multiagent"] = {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
        "policies_to_train": list(policies.keys()),
    }

    tune.run("PPO", stop={"training_iteration": 100}, config=ppo_config)
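# The multiagent config above references a `policy_mapping_fn` that is not
# defined in this snippet. Below is a minimal sketch of what such a mapping
# could look like, assuming ASMEnv uses agent IDs of the form "govt" and
# "citizen_<idx>"; these ID formats are assumptions for illustration, not
# taken from the original code.
def policy_mapping_fn(agent_id):
    if agent_id == "govt":
        return "govt_policy"
    # e.g. "citizen_2" -> "citizen_policy_2"
    idx = agent_id.split("_")[-1]
    return f"citizen_policy_{idx}"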
def _build_model(self):
    trainer_config = DEFAULT_CONFIG.copy()
    trainer_config['num_workers'] = 0
    trainer_config["train_batch_size"] = 640
    trainer_config["sgd_minibatch_size"] = 64
    trainer_config["num_sgd_iter"] = 10
    trainer = PPOTrainer(trainer_config, self.env_class)
    return trainer
def load_agent():
    # Initialize training environment
    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRaySinglePlayerEnvironment(board_size=13, num_players=4, agent=agent)

    env = environment_creater()
    tune.register_env("tron_single_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    # config["timesteps_per_iteration"] = 1024
    # config['target_network_update_freq'] = 256
    # config['buffer_size'] = 100_000
    # config['schedule_max_timesteps'] = 200_000
    # config['exploration_fraction'] = 0.02
    # config['compress_observations'] = False
    # config['n_step'] = 2
    # config['seed'] = SEED

    # Configure for PPO
    # config["sample_batch_size"] = 100
    # config["train_batch_size"] = 200
    # config["sgd_minibatch_size"] = 60

    # Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # Begin training or evaluation
    # trainer = DDPGTrainer(config, "tron_single_player")
    # trainer = A3CTrainer(config, "tron_single_player")
    # trainer = DQNTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_single_player")

    trainer.restore("./ppo_checkpoint_201/checkpoint-201")
    return trainer  # .get_policy("trainer")
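# A minimal usage sketch for the restored single-player trainer above
# (assumption: a greedy rollout in the same Tron environment; this loop is
# illustrative and not part of the original code).
trainer = load_agent()
env = TronRaySinglePlayerEnvironment(board_size=13, num_players=4,
                                     agent=SimpleAvoidAgent(noise=0.05))
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    action = trainer.compute_action(obs)  # old RLlib Trainer inference API
    obs, reward, done, _ = env.step(action)
    total_reward += reward
print(f"episode reward: {total_reward}")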
def defaultConfig(args):
    if args.agent == "PPO":
        config = PPO_CONFIG.copy()
        config['num_workers'] = args.workers
        config['num_gpus'] = 1
        config['framework'] = "torch"
        config['gamma'] = args.gamma
        # config['model']['dim'] = 21
        # config['model']['conv_filters'] = [[8, [3, 3], 2],
        #                                    [16, [2, 2], 2],
        #                                    [512, [6, 6], 1]]
        return config
def build_model(self):
    trainer_config = DEFAULT_CONFIG.copy()
    trainer_config["num_workers"] = 0
    # trainer_config["train_batch_size"] = 640
    # trainer_config["sgd_minibatch_size"] = 160
    # trainer_config["num_sgd_iter"] = 100

    trainer_config["exploration_config"] = {
        "type": "Random",
    }
    # EpsilonGreedy(Exploration):
    # trainer_config["exploration_config"] = {
    #     "type": "Curiosity",
    #     "eta": 0.2,
    #     "lr": 0.001,
    #     "feature_dim": 128,
    #     "feature_net_config": {
    #         "fcnet_hiddens": [],
    #         "fcnet_activation": "relu",
    #     },
    #     "sub_exploration": {
    #         "type": "StochasticSampling",
    #     }
    # }

    # trainer_config["log_level"] = "DEBUG"
    """
    if env_config is not None:
        for x in env_config.keys():
            trainer_config[x] = env_config[x]
    """
    # trainer_config["env_config"] = copy.deepcopy(env_config)  # {"rules": "qiyang_role"}
    trainer_config.update(self.agent_config)
    self.trainer = PPOTrainer(trainer_config, self.agent_config["env"])
    # self.config["trainer"] = self.trainer
    return self.trainer
def main():
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 1
    config['num_gpus'] = 0
    config['num_cpus_per_worker'] = 0
    config["entropy_coeff"] = 0

    register_env("HiveEnv", lambda c: GymHive(c))
    agent = PPOTrainer(config, env='HiveEnv')

    results = []
    episode_data = []
    episode_json = []
    n_epochs = 1000
    for epoch in range(n_epochs):
        # for _ in range(1000):
        #     agent.render()
        #     agent.step(agent.compute_action())
        result = agent.train()
        results.append(result)
        episode = {
            'epoch': epoch,
            'episode_reward_min': result['episode_reward_min'],
            'episode_reward_mean': result['episode_reward_mean'],
            'episode_reward_max': result['episode_reward_max'],
            'episode_len_mean': result['episode_len_mean'],
        }
        episode_data.append(episode)
        episode_json.append(json.dumps(episode))
        print(
            f'{epoch:3d}: Min/Mean/Max reward: '
            f'{result["episode_reward_min"]:8.4f}/'
            f'{result["episode_reward_mean"]:8.4f}/'
            f'{result["episode_reward_max"]:8.4f}'
        )
import gym
import ray
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print

ray.init(num_gpus=1)

config = DEFAULT_CONFIG.copy()
config['num_gpus'] = 1
config['num_workers'] = 1
config['num_sgd_iter'] = 30
config['sgd_minibatch_size'] = 128
config['model']['fcnet_hiddens'] = [100, 100]
# This avoids running out of resources in the notebook environment when this cell is re-executed
config['num_cpus_per_worker'] = 0

agent = PPOTrainer(config, 'CartPole-v0')

for i in range(5):
    result = agent.train()
    print(pretty_print(result))
import gym
import ray.utils
from ray.tune.logger import pretty_print
from envs import env_config
from envs.ssa_tasker_simple_2 import SSA_Tasker_Env
import datetime
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG as PPO_CONFIG
import numpy as np
import pickle

ray.init()
env_config['rso_count'] = 20

config = PPO_CONFIG.copy()
config['num_gpus'] = 1
config['num_workers'] = 4
# !---- found that the network design from Jones's work had little effect in training
# config['model']['fcnet_hiddens'] = [180, 95, 50]    # 10 RSOs
# config['model']['fcnet_hiddens'] = [360, 180, 100]  # 20 RSOs
# config['model']['fcnet_hiddens'] = [720, 380, 200]  # 40 RSOs
config['gamma'] = 0.99  # gamma (float) Discount factor
config['rollout_fragment_length'] = 32
if env_config['rso_count'] == 40:
    # config['model']['fcnet_hiddens'] = [512, 512]  # 40 RSOs
    # n_steps (int) The number of steps to run for each environment per update
    # (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
    config['rollout_fragment_length'] = 128
config['entropy_coeff'] = 0.01  # ent_coef (float) Entropy coefficient for the loss calculation
config[
        self.x = x_next
        reward = -1 * self.cost(x_next, action_vector)
        done = False
        if self.x[1] < 1:
            done = True
        return self.x, reward, done, {}

    def reset(self):
        self.f, self.x, self.cost = make_SIR_Treatement_model(self.S_0, self.I_0,
                                                              alpha=self.alpha,
                                                              beta=self.beta,
                                                              f=0.5, B=1)
        return self.x


trainer_config = DEFAULT_CONFIG.copy()
trainer_config['num_workers'] = 1
trainer_config["train_batch_size"] = 400
trainer_config["sgd_minibatch_size"] = 64
trainer_config["num_sgd_iter"] = 10

trainer = PPOTrainer(trainer_config, SIR)

for i in range(200):
    print("Training iteration {}...".format(i))
    trainer.train()
rllib_cfg["log_level"] = "INFO"  # Set the ray.rllib.* log level for the agent process and its workers [DEBUG, INFO, WARN, or ERROR]
rllib_cfg["log_sys_usage"] = True  # Monitor system resource metrics (requires `psutil` and `gputil`)
rllib_cfg["metrics_smoothing_episodes"] = 100  # Smooth metrics over this many episodes
rllib_cfg["collect_metrics_timeout"] = 180  # Wait this long for metric batches; if they do not arrive in time, collect them in the next train iteration.
rllib_cfg["timesteps_per_iteration"] = 0  # Minimum env steps to optimize for per train call. Does not affect learning, only monitoring.

# ================== Configure learning algorithm ==================

# Copy the default learning algorithm configuration, including PPO-specific parameters,
# then overwrite ONLY the common parameters that have been updated.
agent_cfg = AGENT_DEFAULT_CONFIG.copy()
for key, value in rllib_cfg.items():
    if COMMON_CONFIG[key] != value:
        agent_cfg[key] = value

# Estimator settings
agent_cfg["use_gae"] = True  # Use the Generalized Advantage Estimator (GAE) with a value function (https://arxiv.org/pdf/1506.02438.pdf)
agent_cfg["use_critic"] = True  # Use a critic as a value baseline (otherwise don't use any; required for using GAE).
agent_cfg["lambda"] = 0.95  # The GAE(lambda) parameter.

# Learning settings
agent_cfg["kl_coeff"] = 0.0  # Initial coefficient for KL divergence (0.0 for L^CLIP).
agent_cfg["kl_target"] = 0.01  # Target value for KL divergence.
def testLocal(self):
    ray.init(local_mode=True)
    cf = DEFAULT_CONFIG.copy()
    agent = PPOAgent(cf, "CartPole-v0")
    print(agent.train())
def test_local(self):
    cf = DEFAULT_CONFIG.copy()
    for _ in framework_iterator(cf):
        agent = PPOTrainer(cf, "CartPole-v0")
        print(agent.train())
        agent.stop()
def test_local(self):
    cf = DEFAULT_CONFIG.copy()
    agent = PPOTrainer(cf, "CartPole-v0")
    print(agent.train())
def load_agent():
    # Initialize training environment
    ray.init()

    def environment_creater(params=None):
        agent = SimpleAvoidAgent(noise=0.05)
        return TronRayEnvironment(board_size=13, num_players=4)

    env = environment_creater()
    tune.register_env("tron_multi_player", environment_creater)
    ModelCatalog.register_custom_preprocessor("tron_prep", TronExtractBoard)

    # Configure Deep Q Learning with reasonable values
    config = DEFAULT_CONFIG.copy()
    config['num_workers'] = 4
    ## config['num_gpus'] = 1
    # config["timesteps_per_iteration"] = 1024
    # config['target_network_update_freq'] = 256
    # config['buffer_size'] = 100_000
    # config['schedule_max_timesteps'] = 200_000
    # config['exploration_fraction'] = 0.02
    # config['compress_observations'] = False
    # config['n_step'] = 2
    # config['seed'] = SEED

    # Configure for PPO
    # config["sample_batch_size"] = 100
    # config["train_batch_size"] = 200
    # config["sgd_minibatch_size"] = 60

    # Configure A3C with reasonable values

    # We will use a simple convolution network with 3 layers as our feature extractor
    config['model']['vf_share_layers'] = True
    config['model']['conv_filters'] = [(512, 5, 1), (256, 3, 2), (128, 3, 2)]
    config['model']['fcnet_hiddens'] = [256]
    config['model']['custom_preprocessor'] = 'tron_prep'

    # All of the models will use the same network as before
    agent_config = {
        "model": {
            "vf_share_layers": True,
            "conv_filters": [(512, 5, 1), (256, 3, 2), (128, 3, 2)],
            "fcnet_hiddens": [256],
            "custom_preprocessor": 'tron_prep'
        }
    }

    def policy_mapping_function(x):
        if x == '0':
            return "trainer"
        return "opponent"

    config['multiagent'] = {
        "policy_mapping_fn": policy_mapping_function,
        "policies": {
            "trainer": (None, env.observation_space, env.action_space, agent_config),
            "opponent": (None, env.observation_space, env.action_space, agent_config)
        },
        "policies_to_train": ["trainer"]
    }

    # Begin training or evaluation
    # trainer = DDPGTrainer(config, "tron_single_player")
    # trainer = A3CTrainer(config, "tron_single_player")
    # trainer = MARWILTrainer(config, "tron_single_player")
    trainer = PPOTrainer(config, "tron_multi_player")

    trainer.restore("./sp_checkpoint_1802/checkpoint-1802")
    return trainer.get_policy("trainer")
def create_env_config(cli_args):
    """
    Create environment and RLlib config based on passed CLI args. Return config.

    :param cli_args: Parsed CLI args
    :return: The complete config for an RLlib agent, including the env & env_config
    """
    env_class = get_env_class(cli_args.agent)
    map, ue_list, bs_list = get_env(cli_args.env, cli_args.bs_dist, cli_args.static_ues, cli_args.slow_ues,
                                    cli_args.fast_ues, cli_args.sharing, cli_args.num_bs)

    # this is for DrEnv and step utility
    # env_config = {
    #     'episode_length': eps_length, 'seed': seed,
    #     'map': map, 'bs_list': bs_list, 'ue_list': ue_list, 'dr_cutoff': 'auto', 'sub_req_dr': True,
    #     'curr_dr_obs': False, 'ues_at_bs_obs': False, 'dist_obs': False, 'next_dist_obs': False
    # }

    # this is for the custom NormEnv and log utility
    env_config = {
        'episode_length': cli_args.eps_length, 'seed': cli_args.seed,
        'map': map, 'bs_list': bs_list, 'ue_list': ue_list, 'rand_episodes': cli_args.rand_train,
        'new_ue_interval': cli_args.new_ue_interval, 'reward': cli_args.reward, 'max_ues': cli_args.max_ues,
        'ue_arrival': get_ue_arrival(cli_args.ue_arrival),
        # if log_metrics is enabled: log metrics even during training --> visible on tensorboard
        # if disabled: log just during testing --> probably slightly faster training with less memory
        'log_metrics': True,
        # custom animation rendering
        'dashboard': cli_args.dashboard,
        'ue_details': cli_args.ue_details,
    }
    # convert ue_arrival sequence to str keys as required by RLlib: https://github.com/ray-project/ray/issues/16215
    if env_config['ue_arrival'] is not None:
        env_config['ue_arrival'] = {str(k): v for k, v in env_config['ue_arrival'].items()}

    # create and return the config
    config = DEFAULT_CONFIG.copy()
    # discount factor (default 0.99)
    # config['gamma'] = 0.5
    # 0 = no workers/actors at all --> low overhead for short debugging; 2+ workers to accelerate long training
    config['num_workers'] = cli_args.workers
    config['seed'] = cli_args.seed
    # write training stats to file under ~/ray_results (default: False)
    config['monitor'] = True
    config['train_batch_size'] = cli_args.batch_size  # default: 4000; default in stable_baselines: 128
    # auto normalize observations by subtracting mean and dividing by std (default: "NoFilter")
    # config['observation_filter'] = "MeanStdFilter"
    # NN settings: https://docs.ray.io/en/latest/rllib-models.html#built-in-model-parameters
    # configure the size of the neural network's hidden layers; default: [256, 256]
    # config['model']['fcnet_hiddens'] = [512, 512, 512]
    # LSTM settings
    config['model']['use_lstm'] = cli_args.lstm
    # config['model']['lstm_use_prev_action_reward'] = True
    # config['log_level'] = 'INFO'  # ray logging default: warning
    # reset the env whenever the horizon/eps_length is reached
    config['horizon'] = cli_args.eps_length
    config['env'] = env_class
    config['env_config'] = env_config
    # callback for monitoring custom metrics
    config['callbacks'] = CustomMetricCallbacks
    config['log_level'] = 'ERROR'

    # for multi-agent env: https://docs.ray.io/en/latest/rllib-env.html#multi-agent-and-hierarchical
    if MultiAgentEnv in env_class.__mro__:
        # instantiate env to access obs and action space and num diff UEs
        env = env_class(env_config)

        # use separate policies (and NNs) for each agent
        if cli_args.separate_agent_nns:
            num_diff_ues = env.get_num_diff_ues()
            # create policies also for all future UEs
            if num_diff_ues > env.num_ue:
                log.warning("Varying num. UEs. Creating policy for all (future) UEs.", curr_num_ue=env.num_ue,
                            num_diff_ues=num_diff_ues, new_ue_interval=env.new_ue_interval,
                            ue_arrival=env.ue_arrival)
                ue_ids = [str(i + 1) for i in range(num_diff_ues)]
            else:
                ue_ids = [ue.id for ue in ue_list]
            config['multiagent'] = {
                # attention: ue.id needs to be a string! just casting it to str() here doesn't work;
                # needs to be consistent with obs keys --> easier, just use string IDs
                'policies': {ue_id: (None, env.observation_space, env.action_space, {}) for ue_id in ue_ids},
                'policy_mapping_fn': lambda agent_id: agent_id,
            }
        # or: all UEs use the same policy and NN
        else:
            config['multiagent'] = {
                'policies': {'ue': (None, env.observation_space, env.action_space, {})},
                'policy_mapping_fn': lambda agent_id: 'ue',
            }

    return config
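# The config above plugs in a `CustomMetricCallbacks` class that is not shown
# in this snippet. Below is a minimal sketch of what such a callbacks class
# could look like with the old RLlib callbacks API; the metric name
# "custom_metric" and the way it is read from the episode info dict are
# assumptions for illustration only, not taken from the original code.
from ray.rllib.agents.callbacks import DefaultCallbacks


class CustomMetricCallbacks(DefaultCallbacks):
    def on_episode_end(self, *, worker, base_env, policies, episode, env_index, **kwargs):
        # Pull a value out of the last info dict and expose it as a custom
        # metric so it shows up in training results and on TensorBoard.
        info = episode.last_info_for()
        if info is not None and "custom_metric" in info:
            episode.custom_metrics["custom_metric"] = info["custom_metric"]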