def _validate_config(config: PartialTrainerConfigDict,
                     trainer_obj_or_none: Optional["Trainer"] = None):
    # Call super (Trainer) validation method first.
    Trainer._validate_config(config, trainer_obj_or_none)
    # Then call user defined one, if any.
    if validate_config is not None:
        validate_config(config)
def __init__(self,
             config: TrainerConfigDict = None,
             env: Union[str, EnvType, None] = None,
             logger_creator: Callable[[], Logger] = None,
             remote_checkpoint_dir: Optional[str] = None,
             sync_function_tpl: Optional[str] = None):
    Trainer.__init__(self, config, env, logger_creator,
                     remote_checkpoint_dir, sync_function_tpl)
def setup(self, config: PartialTrainerConfigDict):
    if allow_unknown_subkeys is not None:
        self._allow_unknown_subkeys += allow_unknown_subkeys
    self._allow_unknown_configs = allow_unknown_configs
    if override_all_subkeys_if_type_changes is not None:
        self._override_all_subkeys_if_type_changes += \
            override_all_subkeys_if_type_changes
    Trainer.setup(self, config)
def default_resource_request(cls, config):
    cf = dict(cls._default_config, **config)
    Trainer._validate_config(cf)
    return Resources(
        cpu=cf["num_cpus_for_driver"],
        gpu=cf["num_gpus"],
        extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
        cf["num_aggregation_workers"],
        extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
def test_validate_config_idempotent(self):
    """
    Asserts that running `validate_config` multiple times on COMMON_CONFIG
    is idempotent.
    """
    # Given:
    standard_config = copy.deepcopy(COMMON_CONFIG)

    # When (we validate the config 2 times), ...
    Trainer._validate_config(standard_config)
    config_v1 = copy.deepcopy(standard_config)
    Trainer._validate_config(standard_config)
    config_v2 = copy.deepcopy(standard_config)

    # ... then ...
    self.assertEqual(config_v1, config_v2)
def __getstate__(self):
    state = Trainer.__getstate__(self)
    state["trainer_state"] = self.state.copy()
    if self.train_exec_impl:
        state["train_exec_impl"] = (
            self.train_exec_impl.shared_metrics.get().save())
    return state
def __getstate__(self):
    state = Trainer.__getstate__(self)
    state.update({
        "num_target_updates": self.num_target_updates,
        "last_target_update_ts": self.last_target_update_ts,
    })
    return state
def __init__(self, obs_space, action_space, config):
    model = ModelCatalog.get_model_v2(obs_space, action_space,
                                      action_space.n, config["model"],
                                      "torch")
    _, env_creator = Trainer._get_env_id_and_creator(config["env"], config)
    if config["ranked_rewards"]["enable"]:
        # If r2 is enabled, the env is wrapped to include a rewards buffer
        # used to normalize rewards.
        env_cls = get_r2_env_wrapper(env_creator, config["ranked_rewards"])

        # The wrapped env is used only in the MCTS, not in the
        # rollout workers.
        def _env_creator():
            return env_cls(config["env_config"])
    else:

        def _env_creator():
            return env_creator(config["env_config"])

    def mcts_creator():
        return MCTS(model, config["mcts_config"])

    super().__init__(
        obs_space,
        action_space,
        config,
        model,
        alpha_zero_loss,
        TorchCategorical,
        mcts_creator,
        _env_creator,
    )
def on_train_result(self, trainer: Trainer, result: dict, **kwargs):
    iteration = result["training_iteration"]
    logger.info(f"Iteration {iteration}")
    if iteration % 10 == 0:
        logger.info(f"Model checkpoint at iteration {iteration}")
        torch.save(trainer.get_weights()["default_policy"],
                   self.model_path)
def get_default_config(cls) -> TrainerConfigDict:
    return Trainer.merge_trainer_configs(
        DEFAULT_CONFIG,
        {
            # Use UpperConfidenceBound exploration.
            "exploration_config": {"type": "UpperConfidenceBound"}
        },
    )
def test_validate_config_idempotent(self):
    """
    Asserts that running `validate_config` multiple times on COMMON_CONFIG
    is idempotent.
    """
    # Given:
    standard_config = copy.deepcopy(COMMON_CONFIG)
    standard_config["_use_trajectory_view_api"] = False

    # When (we validate the config 2 times), ...
    Trainer._validate_config(standard_config)
    config_v1 = copy.deepcopy(standard_config)
    Trainer._validate_config(standard_config)
    config_v2 = copy.deepcopy(standard_config)

    # ... then ...
    self.assertEqual(config_v1, config_v2)
def run_offline_inference(agent: Trainer, env: CrossAdaptiveEnv):
    # NOTE: Something is wrong here. For some reason, all the action values
    # are too close to the bound.
    done = False
    obs = env.reset()
    while not done:
        action = agent.compute_action(obs)
        # TODO: Standardize the action. It might be difficult to standardize
        # the action in live mode, but for offline inference it should
        # essentially work.
        obs, _, done, _ = env.step(action)
def get_default_config(cls) -> TrainerConfigDict:
    config = Trainer.merge_trainer_configs(
        DEFAULT_CONFIG,
        {
            # Use ThompsonSampling exploration.
            "exploration_config": {"type": "ThompsonSampling"}
        },
    )
    return config
def execution_plan(workers, config, **kwargs):
    # If `execution_plan` is provided, use it inside
    # `self.execution_plan()`.
    if execution_plan is not None:
        return execution_plan(workers, config, **kwargs)
    # If `execution_plan` is not provided (None), the Trainer will use
    # its already existing default `execution_plan()` static method
    # instead.
    else:
        return Trainer.execution_plan(workers, config, **kwargs)
def execution_plan(trainer: Trainer, workers: WorkerSet,
                   config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Execution plan of the Simple Q algorithm. Defines the distributed
    dataflow.

    Args:
        trainer (Trainer): The Trainer object creating the execution plan.
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"])
    # Assign to Trainer, so we can store the LocalReplayBuffer's
    # data when we save checkpoints.
    trainer.local_replay_buffer = local_replay_buffer

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Generate rollouts and store them in our local replay buffer.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(train_step_op) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
def default_resource_request(cls, config):
    cf = dict(cls._default_config, **config)
    Trainer._validate_config(cf)

    eval_config = cf["evaluation_config"]

    # Return PlacementGroupFactory containing all needed resources
    # (already properly defined as device bundles).
    return PlacementGroupFactory(
        bundles=[{
            # Driver + Aggregation Workers:
            # Force to be on same node to maximize data bandwidth
            # between aggregation workers and the learner (driver).
            # Aggregation workers tree-aggregate experiences collected
            # from RolloutWorkers (n rollout workers map to m
            # aggregation workers, where m < n) and always use 1 CPU
            # each.
            "CPU": cf["num_cpus_for_driver"] +
            cf["num_aggregation_workers"],
            "GPU": cf["num_gpus"]
        }] + [
            {
                # RolloutWorkers.
                "CPU": cf["num_cpus_per_worker"],
                "GPU": cf["num_gpus_per_worker"],
            } for _ in range(cf["num_workers"])
        ] + ([
            {
                # Evaluation (remote) workers.
                # Note: The local eval worker is located on the driver
                # CPU.
                "CPU": eval_config.get("num_cpus_per_worker",
                                       cf["num_cpus_per_worker"]),
                "GPU": eval_config.get("num_gpus_per_worker",
                                       cf["num_gpus_per_worker"]),
            } for _ in range(cf["evaluation_num_workers"])
        ] if cf["evaluation_interval"] else []),
        strategy=config.get("placement_strategy", "PACK"))
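# Hedged worked example (not part of the original code; config values are
# hypothetical): with num_cpus_for_driver=1, num_gpus=1,
# num_aggregation_workers=2, num_workers=2, num_cpus_per_worker=1,
# num_gpus_per_worker=0 and evaluation disabled, the factory above would
# request bundles roughly like this:
example_bundles = [
    {"CPU": 1 + 2, "GPU": 1},  # driver + 2 aggregation workers, co-located
    {"CPU": 1, "GPU": 0},      # rollout worker 1
    {"CPU": 1, "GPU": 0},      # rollout worker 2
]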
def default_resource_request(cls, config):
    cf = dict(cls._default_config, **config)
    Trainer._validate_config(cf)

    eval_config = cf["evaluation_config"]

    # Return PlacementGroupFactory containing all needed resources
    # (already properly defined as device bundles).
    return PlacementGroupFactory(
        bundles=[{
            # Local worker + replay buffer actors.
            # Force replay buffers to be on same node to maximize
            # data bandwidth between buffers and the learner (driver).
            # Replay buffer actors each contain one shard of the total
            # replay buffer and use 1 CPU each.
            "CPU": cf["num_cpus_for_driver"] +
            cf["optimizer"]["num_replay_buffer_shards"],
            "GPU": cf["num_gpus"]
        }] + [
            {
                # RolloutWorkers.
                "CPU": cf["num_cpus_per_worker"],
                "GPU": cf["num_gpus_per_worker"],
            } for _ in range(cf["num_workers"])
        ] + ([
            {
                # Evaluation workers.
                # Note: The local eval worker is located on the driver
                # CPU.
                "CPU": eval_config.get("num_cpus_per_worker",
                                       cf["num_cpus_per_worker"]),
                "GPU": eval_config.get("num_gpus_per_worker",
                                       cf["num_gpus_per_worker"]),
            } for _ in range(cf["evaluation_num_workers"])
        ] if cf["evaluation_interval"] else []),
        strategy=config.get("placement_strategy", "PACK"))
def add_policy(self, policy_id: PolicyID, policy_spec: PolicySpec):
    # Merge the policy's config overrides with the main config.
    # Also, adjust `num_gpus` (to indicate an individual policy's
    # num_gpus, not the total number of GPUs).
    cfg = Trainer.merge_trainer_configs(
        self.config,
        dict(policy_spec.config, **{"num_gpus": self.num_gpus_per_policy}),
    )

    # Need to create the replay actor first. Then add the first policy.
    if self.replay_actor is None:
        return self._add_replay_buffer_and_policy(policy_id, policy_spec,
                                                  cfg)

    # Replay actor already exists -> Just add a new policy here.
    assert len(self.policy_actors) < self.max_num_policies

    actual_policy_class = get_tf_eager_cls_if_necessary(
        policy_spec.policy_class, cfg)

    colocated = create_colocated_actors(
        actor_specs=[(
            ray.remote(
                num_cpus=1,
                num_gpus=self.num_gpus_per_policy
                if not cfg["_fake_gpus"] else 0,
            )(actual_policy_class),
            # Policy c'tor args.
            (policy_spec.observation_space, policy_spec.action_space, cfg),
            # Policy c'tor kwargs={}.
            {},
            # Count=1,
            1,
        )],
        # Force co-locate on the already existing replay actor's node.
        node=ray.get(self.replay_actor.get_host.remote()),
    )

    self.policy_actors[policy_id] = colocated[0][0]

    return self.policy_actors[policy_id]
def run_live_inference(
    agent: Trainer,
    env: CrossAdaptiveEnv,
):
    mediator = Mediator()
    episode_index = 0
    while episode_index < 1500:
        source_features, target_features = mediator.get_features()
        if source_features is None or target_features is None:
            continue
        else:
            # Trim off the timestamp.
            source_features = source_features[1:]
            target_features = target_features[1:]

        standardized_source = np.array([
            env.standardizer.get_standardized_value(
                env.analyser.analysis_features[i], feature_value)
            for i, feature_value in enumerate(source_features)
        ])
        standardized_target = np.array([
            env.standardizer.get_standardized_value(
                env.analyser.analysis_features[i], feature_value)
            for i, feature_value in enumerate(target_features)
        ])
        obs = np.concatenate((standardized_source, standardized_target))
        print(np.round(obs, decimals=2))
        action = agent.compute_action(obs)
        # action = env.action_space.sample()
        mapping = env.action_to_mapping(action)
        # print(mapping)
        mediator.send_effect_mapping(mapping)
        episode_index += 1
    mediator.terminate()
    print("\n\n\tDONE\n\n")
DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    SIMPLEQ_DEFAULT_CONFIG,
    {
        # === Model ===
        # Number of atoms for representing the distribution of return. When
        # this is greater than 1, distributional Q-learning is used.
        # The discrete supports are bounded by v_min and v_max.
        "num_atoms": 1,
        "v_min": -10.0,
        "v_max": 10.0,
        # Whether to use a noisy network.
        "noisy": False,
        # Controls the initial value of noisy nets.
        "sigma0": 0.5,
        # Whether to use dueling DQN.
        "dueling": True,
        # Dense-layer setup for each of the advantage branch and the value
        # branch in a dueling architecture.
        "hiddens": [256],
        # Whether to use double DQN.
        "double_q": True,
        # N-step Q learning.
        "n_step": 1,

        # === Prioritized replay buffer ===
        # If True, a prioritized replay buffer will be used.
        "prioritized_replay": True,
        # Alpha parameter for prioritized replay buffer.
        "prioritized_replay_alpha": 0.6,
        # Beta parameter for sampling from prioritized replay buffer.
        "prioritized_replay_beta": 0.4,
        # Final value of beta (by default, we use constant beta=0.4).
        "final_prioritized_replay_beta": 0.4,
        # Time steps over which the beta parameter is annealed.
        "prioritized_replay_beta_annealing_timesteps": 20000,
        # Epsilon to add to the TD errors when updating priorities.
        "prioritized_replay_eps": 1e-6,

        # Callback to run before learning on a multi-agent batch of
        # experiences.
        "before_learn_on_batch": None,

        # The intensity with which to update the model (vs collecting samples
        # from the env). If None, uses the "natural" value of:
        # `train_batch_size` / (`rollout_fragment_length` x `num_workers` x
        # `num_envs_per_worker`).
        # If provided, will make sure that the ratio between ts inserted into
        # and sampled from the buffer matches the given value.
        # Example:
        #   training_intensity=1000.0
        #   train_batch_size=250 rollout_fragment_length=1
        #   num_workers=1 (or 0) num_envs_per_worker=1
        #   -> natural value = 250 / 1 = 250.0
        #   -> will make sure that replay+train op will be executed 4x as
        #      often as rollout+insert op (4 * 250 = 1000).
        # See: rllib/agents/dqn/dqn.py::calculate_rr_weights for further
        # details.
        "training_intensity": None,

        # === Parallelism ===
        # Whether to compute priorities on workers.
        "worker_side_prioritization": False,
    },
    _allow_unknown_configs=True,
)
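# Hedged sketch (illustration only; it mirrors the `training_intensity`
# comment above rather than RLlib's actual calculate_rr_weights): how the
# "natural" training intensity and the resulting replay/rollout ratio could
# be derived from a config dict.
def natural_training_intensity(cfg):
    # Timesteps collected per sampling round across all workers and envs.
    per_round = (cfg["rollout_fragment_length"]
                 * max(cfg["num_workers"], 1)
                 * cfg["num_envs_per_worker"])
    return cfg["train_batch_size"] / per_round

# With train_batch_size=250, rollout_fragment_length=1, num_workers=1,
# num_envs_per_worker=1 the natural value is 250.0; setting
# training_intensity=1000.0 then requests 1000 / 250 = 4x more
# replay+train steps than rollout+insert steps.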
R2D2_DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    DQN_DEFAULT_CONFIG,  # See keys in impala.py, which are also supported.
    {
        # Learning rate for adam optimizer.
        "lr": 1e-4,
        # Discount factor.
        "gamma": 0.997,
        # Train batch size (in number of single timesteps).
        "train_batch_size": 64 * 20,
        # Adam epsilon hyper parameter.
        "adam_epsilon": 1e-3,
        # Run in parallel by default.
        "num_workers": 2,
        # Batch mode must be complete_episodes.
        "batch_mode": "complete_episodes",

        # === Replay buffer ===
        "replay_buffer_config": {
            # For now we don't use the new ReplayBuffer API here.
            "_enable_replay_buffer_api": False,
            "type": "MultiAgentReplayBuffer",
            "prioritized_replay": False,
            "prioritized_replay_alpha": 0.6,
            # Beta parameter for sampling from prioritized replay buffer.
            "prioritized_replay_beta": 0.4,
            # Epsilon to add to the TD errors when updating priorities.
            "prioritized_replay_eps": 1e-6,
            # Size of the replay buffer (in sequences, not timesteps).
            "capacity": 100000,
            # Set automatically: The number of contiguous environment steps
            # to replay at once. Will be calculated via
            # model->max_seq_len + burn_in.
            # Do not set this to any valid value!
            "replay_sequence_length": -1,
        },

        # If True, assume a zero-initialized state input (no matter where in
        # the episode the sequence is located).
        # If False, store the initial states along with each SampleBatch, use
        # it (as initial state when running through the network for training),
        # and update that initial state during training (from the internal
        # state outputs of the immediately preceding sequence).
        "zero_init_states": True,
        # If > 0, use the `burn_in` first steps of each replay-sampled
        # sequence (starting either from all 0.0-values if
        # `zero_init_state=True` or from the already stored values) to
        # calculate even more accurate initial states for the actual sequence
        # (starting after this burn-in window). In the burn-in case, the
        # actual length of the sequence used for loss calculation is
        # `n - burn_in` time steps (n=LSTM's/attention net's max_seq_len).
        "burn_in": 0,
        # Whether to use the h-function from the paper [1] to scale target
        # values in the R2D2 loss function:
        # h(x) = sign(x)(sqrt(|x| + 1) - 1) + eps * x
        "use_h_function": True,
        # The epsilon parameter from the R2D2 loss function (only used
        # if `use_h_function`=True).
        "h_function_epsilon": 1e-3,

        # Update the target network every `target_network_update_freq` steps.
        "target_network_update_freq": 2500,

        # Experimental flag.
        # If True, the execution plan API will not be used. Instead,
        # a Trainer's `training_iteration` method will be called as-is each
        # training iteration.
        "_disable_execution_plan_api": False,
    },
    _allow_unknown_configs=True,
)
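# Hedged sketch (for illustration only; RLlib's R2D2 loss has its own
# implementation): the value-rescaling h-function referenced in the config
# comment above, h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x.
import numpy as np

def h(x, epsilon=1e-3):
    # Squashes large target values while staying roughly linear near zero.
    return np.sign(x) * (np.sqrt(np.abs(x) + 1.0) - 1.0) + epsilon * x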
DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    PPO_DEFAULT_CONFIG,
    {
        # During the sampling phase, each rollout worker will collect a batch
        # `rollout_fragment_length * num_envs_per_worker` steps in size.
        "rollout_fragment_length": 100,
        # Vectorize the env (should enable by default since each worker has
        # a GPU).
        "num_envs_per_worker": 5,
        # During the SGD phase, workers iterate over minibatches of this size.
        # The effective minibatch size will be:
        # `sgd_minibatch_size * num_workers`.
        "sgd_minibatch_size": 50,
        # Number of SGD epochs per optimization round.
        "num_sgd_iter": 10,
        # Download weights between each training step. This adds a bit of
        # overhead but allows the user to access the weights from the trainer.
        "keep_local_weights_in_sync": True,

        # *** WARNING: configs below are DDPPO overrides over PPO; you
        # shouldn't need to adjust them. ***
        # DDPPO requires PyTorch distributed.
        "framework": "torch",
        # The communication backend for PyTorch distributed.
        "torch_distributed_backend": "gloo",
        # Learning is no longer done on the driver process, so
        # giving GPUs to the driver does not make sense!
        "num_gpus": 0,
        # Each rollout worker gets a GPU.
        "num_gpus_per_worker": 1,
        # Require evenly sized batches. Otherwise,
        # collective allreduce could fail.
        "truncate_episodes": True,
        # This is auto set based on sample batch size.
        "train_batch_size": -1,
        # The KL divergence penalty should be fixed to 0 in DDPPO because,
        # in order for it to be used as a penalty, we would have to
        # un-decentralize DDPPO.
        "kl_coeff": 0.0,
        "kl_target": 0.0,
    },
    _allow_unknown_configs=True,
)
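# Hedged worked example (not from the original source; num_workers=4 is an
# assumed value for illustration): with the DDPPO defaults above, each
# rollout worker collects rollout_fragment_length * num_envs_per_worker
# timesteps per sampling phase, and the effective SGD minibatch size across
# all workers is sgd_minibatch_size * num_workers.
per_worker_batch = 100 * 5     # rollout_fragment_length * num_envs_per_worker = 500
effective_minibatch = 50 * 4   # sgd_minibatch_size * num_workers (assumed 4) = 200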
def __setstate__(self, state):
    Trainer.__setstate__(self, state)
    self.train_exec_impl.shared_metrics.get().restore(
        state["train_exec_impl"])
def __init__(self, config=None, env=None, logger_creator=None):
    Trainer.__init__(self, config, env, logger_creator)
DEFAULT_CONFIG = Trainer.merge_trainer_configs(
    appo.DEFAULT_CONFIG,  # See keys in appo.py, which are also supported.
    {
        # TODO: Unify the buffer API, then clean up our existing
        #  implementations of different buffers.

        # This is the number of batches held at any time for each policy.
        "replay_buffer_capacity": 20,
        # e.g. ratio=0.2 -> 20% of samples in each train batch are
        # old (replayed) ones.
        "replay_buffer_replay_ratio": 0.5,

        # Timeout to use for `ray.wait()` when waiting for samplers to have
        # placed new data into the buffers. If no samples are ready within
        # the timeout, the buffers used for mixin-sampling will return only
        # older samples.
        "sample_wait_timeout": 0.0,
        # Timeout to use for `ray.wait()` when waiting for the policy learner
        # actors to have performed an update and returned learning stats. If
        # no learner actors have produced any learning results in the
        # meantime, their learner-stats in the results will be empty for that
        # iteration.
        "learn_wait_timeout": 0.0,

        # League-building parameters.
        # The LeagueBuilder class to be used for league building logic.
        "league_builder_config": {
            "type": AlphaStarLeagueBuilder,
            # The number of random policies to add to the league. This must
            # be an even number (including 0) as these will be evenly
            # distributed amongst league- and main-exploiters.
            "num_random_policies": 2,
            # The number of initially learning league-exploiters to create.
            "num_learning_league_exploiters": 4,
            # The number of initially learning main-exploiters to create.
            "num_learning_main_exploiters": 4,
            # Minimum win-rate (between 0.0 = 0% and 1.0 = 100%) of any policy
            # to be considered for snapshotting (cloning). The cloned copy may
            # then be frozen (no further learning) or keep learning
            # (independent of its ancestor policy).
            # Set this to lower values to speed up league growth.
            "win_rate_threshold_for_new_snapshot": 0.9,
            # If we took a new snapshot of any given policy, what's the
            # probability that this snapshot will continue to be trainable
            # (rather than become frozen/non-trainable)? By default, only keep
            # those policies trainable that have been trainable from the very
            # beginning.
            "keep_new_snapshot_training_prob": 0.0,
            # Probabilities of different match-types:
            # LE: Learning league_exploiter vs any.
            # ME: Learning main exploiter vs any main.
            # M: Main self-play (p=1.0 - LE - ME).
            "prob_league_exploiter_match": 0.33,
            "prob_main_exploiter_match": 0.33,
            # Only for ME matches: Prob to play against a learning
            # main (vs a snapshot main).
            "prob_main_exploiter_playing_against_learning_main": 0.5,
        },

        # The maximum number of trainable policies for this Trainer.
        # Each trainable policy will exist as an independent remote actor,
        # co-located with a replay buffer. This is besides its existence
        # inside the RolloutWorkers for training and evaluation.
        # Set to None for automatically inferring this value from the number
        # of trainable policies found in the `multiagent` config.
        "max_num_policies_to_train": None,

        # By default, don't drop the last timestep.
        # TODO: We should do the same for IMPALA and APPO at some point.
        "vtrace_drop_last_ts": False,

        # Reporting interval.
        "min_time_s_per_reporting": 2,

        # Use the `training_iteration` method instead of an execution plan.
        "_disable_execution_plan_api": True,
    },
    _allow_unknown_configs=True,
)
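# Hedged worked example (illustration only, derived from the match-type
# comment above): with the default probabilities, the implied probability of
# a main self-play match is
#   p(M) = 1.0 - prob_league_exploiter_match - prob_main_exploiter_match
prob_main_self_play = 1.0 - 0.33 - 0.33  # ~0.34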
def training_step(self) -> ResultDict:
    # W/o microbatching: Identical to Trainer's default implementation.
    # The only difference to a default Trainer is the value function loss
    # term and its value computations alongside each action.
    if self.config["microbatch_size"] is None:
        return Trainer.training_step(self)

    # In microbatch mode, we want to compute gradients on experience
    # microbatches, average a number of these microbatches, and then
    # apply the averaged gradient in one SGD step. This conserves GPU
    # memory, allowing for extremely large experience batches to be
    # used.
    if self._by_agent_steps:
        train_batch = synchronous_parallel_sample(
            worker_set=self.workers,
            max_agent_steps=self.config["microbatch_size"])
    else:
        train_batch = synchronous_parallel_sample(
            worker_set=self.workers,
            max_env_steps=self.config["microbatch_size"])
    self._counters[NUM_ENV_STEPS_SAMPLED] += train_batch.env_steps()
    self._counters[NUM_AGENT_STEPS_SAMPLED] += train_batch.agent_steps()

    with self._timers[COMPUTE_GRADS_TIMER]:
        grad, info = self.workers.local_worker().compute_gradients(
            train_batch, single_agent=True)

    # New microbatch accumulation phase.
    if self._microbatches_grads is None:
        self._microbatches_grads = grad
    # Existing gradients: Accumulate new gradients on top of existing ones.
    else:
        for i, g in enumerate(grad):
            self._microbatches_grads[i] += g
    self._microbatches_counts += train_batch.count
    self._num_microbatches += 1

    # If `train_batch_size` has been reached: Apply the accumulated gradients.
    num_microbatches = math.ceil(
        self.config["train_batch_size"] / self.config["microbatch_size"])
    if self._num_microbatches >= num_microbatches:
        # Update counters.
        self._counters[STEPS_TRAINED_COUNTER] += self._microbatches_counts
        self._counters[
            STEPS_TRAINED_THIS_ITER_COUNTER] = self._microbatches_counts

        # Apply gradients.
        apply_timer = self._timers[APPLY_GRADS_TIMER]
        with apply_timer:
            self.workers.local_worker().apply_gradients(
                self._microbatches_grads)
        apply_timer.push_units_processed(self._microbatches_counts)

        # Reset microbatch information.
        self._microbatches_grads = None
        self._microbatches_counts = self._num_microbatches = 0

        # Also update global vars of the local worker.
        # Create current global vars.
        global_vars = {
            "timestep": self._counters[NUM_AGENT_STEPS_SAMPLED],
        }
        with self._timers[WORKER_UPDATE_TIMER]:
            self.workers.sync_weights(
                policies=self.workers.local_worker().get_policies_to_train(),
                global_vars=global_vars,
            )

    train_results = {DEFAULT_POLICY_ID: info}
    return train_results
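# Hedged worked example (not from the original source; the config values are
# assumed for illustration): the microbatch logic above applies the
# accumulated gradients once ceil(train_batch_size / microbatch_size)
# microbatches have been processed.
import math

assert math.ceil(4000 / 1000) == 4  # e.g. apply gradients every 4 microbatches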
def __setstate__(self, state):
    Trainer.__setstate__(self, state)
    self.state = state["trainer_state"].copy()
    if self.train_pipeline:
        self.train_pipeline.metrics.restore(state["train_pipeline"])
def __getstate__(self):
    state = Trainer.__getstate__(self)
    state["trainer_state"] = self.state.copy()
    if self.train_pipeline:
        state["train_pipeline"] = self.train_pipeline.metrics.save()
    return state
def __setstate__(self, state):
    Trainer.__setstate__(self, state)
    self.state = state["trainer_state"].copy()
def validate_config(self, config: PartialTrainerConfigDict):
    # Call super (Trainer) validation method first.
    Trainer.validate_config(self, config)
    # Then call user defined one, if any.
    if validate_config is not None:
        validate_config(config)