def execution_plan(workers, config):
    """Run two synchronous training pipelines on disjoint worker groups.

    The remote workers of `workers` are split into two fixed groups
    (indices 0-5 and 6-10). Each group gets its own
    ParallelRollouts -> ConcatBatches -> TrainTFMultiGPU pipeline, and the
    two metrics streams are merged with `union`.

    Args:
        workers: The WorkerSet whose remote workers are divided between the
            two pipelines. `reset()` is called on shallow copies, never on
            this object itself.
        config: The trainer's configuration dict.

    Returns:
        The union of the two StandardMetricsReporting iterators.
    """
    import copy

    remote_workers = workers.remote_workers()

    def _make_group(group_remote_workers):
        # Each group needs its OWN WorkerSet object. Binding two names to
        # `workers` and calling reset() on both would leave a single object
        # holding only the last group, so both pipelines would sample from
        # the same workers.
        group = copy.copy(workers)
        group.reset(group_remote_workers)
        return group

    # Group sizes are fixed; remote workers past index 10 are left unused.
    groups = [
        _make_group(remote_workers[0:6]),
        _make_group(remote_workers[6:11]),
    ]

    rollouts = [ParallelRollouts(group, mode="bulk_sync") for group in groups]

    # One SGD pass over the whole train batch per training step.
    train_steps = [
        TrainTFMultiGPU(
            workers=group,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))
        for group in groups
    ]

    # Concatenate sampled batches up to `train_batch_size`, then train.
    train_ops = [
        group_rollouts.combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )).for_each(train_step)
        for group_rollouts, train_step in zip(rollouts, train_steps)
    ]

    first_report, second_report = [
        StandardMetricsReporting(train_op, group, config)
        for train_op, group in zip(train_ops, groups)
    ]
    return first_report.union(second_report)
def default_execution_plan(workers: WorkerSet, config: TrainerConfigDict):
    """Default plan: sample in parallel, concatenate, then train.

    Args:
        workers (WorkerSet): The WorkerSet used for sampling and training.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        The iterator produced by StandardMetricsReporting for the train op.
    """
    # Experiences are collected from the rollout workers.
    sampled = ParallelRollouts(workers, mode="bulk_sync")

    # Accumulate sample batches until `train_batch_size` is reached.
    batch_builder = ConcatBatches(
        min_batch_size=config["train_batch_size"],
        count_steps_by=config["multiagent"]["count_steps_by"],
    )
    batches = sampled.combine(batch_builder)

    # Only an explicit `True` selects the simple optimizer; any other value
    # (including a missing key) selects the multi-GPU train op.
    if config.get("simple_optimizer") is True:
        learner = TrainOneStep(workers)
    else:
        minibatch_size = config.get("sgd_minibatch_size",
                                    config["train_batch_size"])
        learner = TrainTFMultiGPU(
            workers=workers,
            sgd_minibatch_size=minibatch_size,
            num_sgd_iter=config.get("num_sgd_iter", 1),
            num_gpus=config["num_gpus"],
            shuffle_sequences=config.get("shuffle_sequences", False),
            _fake_gpus=config["_fake_gpus"],
            framework=config["framework"])

    trained = batches.for_each(learner)

    # Wrap the train op with the standard metrics reporting.
    return StandardMetricsReporting(trained, workers, config)
def execution_plan(workers, config):
    """Sample, standardize advantages, train, then update KL.

    Args:
        workers: The WorkerSet used for sampling and training.
        config: The trainer's configuration dict.

    Returns:
        The metrics iterator from StandardMetricsReporting, with
        `warn_about_bad_reward_scales` applied to every result.
    """
    # Sample -> keep trainable policies only -> concat to train batch size
    # -> standardize the "advantages" field.
    batches = (
        ParallelRollouts(workers, mode="bulk_sync")
        .for_each(SelectExperiences(workers.trainable_policies()))
        .combine(ConcatBatches(min_batch_size=config["train_batch_size"]))
        .for_each(StandardizeFields(["advantages"]))
    )

    if config["simple_optimizer"]:
        learner = TrainOneStep(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            sgd_minibatch_size=config["sgd_minibatch_size"])
    else:
        learner = TrainTFMultiGPU(
            workers,
            sgd_minibatch_size=config["sgd_minibatch_size"],
            num_sgd_iter=config["num_sgd_iter"],
            num_gpus=config["num_gpus"],
            rollout_fragment_length=config["rollout_fragment_length"],
            num_envs_per_worker=config["num_envs_per_worker"],
            train_batch_size=config["train_batch_size"],
            shuffle_sequences=config["shuffle_sequences"],
            _fake_gpus=config["_fake_gpus"])
    trained = batches.for_each(learner)

    def _second_element(train_result):
        # Only the second element of each train result is passed to UpdateKL.
        return train_result[1]

    # Update KL after each round of training.
    kl_updated = trained.for_each(_second_element).for_each(
        UpdateKL(workers))

    def _check_reward_scales(result):
        return warn_about_bad_reward_scales(config, result)

    metrics = StandardMetricsReporting(kl_updated, workers, config)
    return metrics.for_each(_check_reward_scales)
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the PPO algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    sampled = ParallelRollouts(workers, mode="bulk_sync")

    # Keep only the batches of the trainable policies.
    selected = sampled.for_each(
        SelectExperiences(workers.trainable_policies()))

    # Merge SampleBatches until `train_batch_size` is reached.
    batch_builder = ConcatBatches(
        min_batch_size=config["train_batch_size"],
        count_steps_by=config["multiagent"]["count_steps_by"],
    )
    combined = selected.combine(batch_builder)

    # Standardize advantages.
    standardized = combined.for_each(StandardizeFields(["advantages"]))

    # One training step on the combined + standardized batch.
    if config["simple_optimizer"]:
        learner = TrainOneStep(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            sgd_minibatch_size=config["sgd_minibatch_size"])
    else:
        learner = TrainTFMultiGPU(
            workers,
            sgd_minibatch_size=config["sgd_minibatch_size"],
            num_sgd_iter=config["num_sgd_iter"],
            num_gpus=config["num_gpus"],
            rollout_fragment_length=config["rollout_fragment_length"],
            num_envs_per_worker=config["num_envs_per_worker"],
            train_batch_size=config["train_batch_size"],
            shuffle_sequences=config["shuffle_sequences"],
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))
    trained = standardized.for_each(learner)

    def _second_element(train_result):
        # Only the second element of each train result is passed to UpdateKL.
        return train_result[1]

    # Update KL after each round of training.
    kl_updated = trained.for_each(_second_element).for_each(
        UpdateKL(workers))

    def _check_reward_scales(result):
        return warn_about_bad_reward_scales(config, result)

    # Warn about bad reward scales and return training metrics.
    metrics = StandardMetricsReporting(kl_updated, workers, config)
    return metrics.for_each(_check_reward_scales)
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan with an optional gradient-microbatching mode.

    Defines the distributed dataflow. If `config["microbatch_size"]` is
    truthy, gradients are computed per microbatch, averaged over
    ceil(train_batch_size / microbatch_size) microbatches and applied in a
    single step. Otherwise one training step is run per train batch.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if config["microbatch_size"]:
        num_microbatches = math.ceil(
            config["train_batch_size"] / config["microbatch_size"])
        # In microbatch mode, we want to compute gradients on experience
        # microbatches, average a number of these microbatches, and then
        # apply the averaged gradient in one SGD step. This conserves GPU
        # memory, allowing for extremely large experience batches to be
        # used.
        train_op = (
            rollouts.combine(
                ConcatBatches(
                    min_batch_size=config["microbatch_size"],
                    count_steps_by=config["multiagent"]["count_steps_by"]))
            .for_each(ComputeGradients(workers))  # (grads, info)
            .batch(num_microbatches)  # List[(grads, info)]
            .for_each(AverageGradients())  # (avg_grads, info)
            .for_each(ApplyGradients(workers)))
    else:
        # In normal mode, we execute one SGD step per each train batch.
        if config["simple_optimizer"]:
            train_step_op = TrainOneStep(workers)
        else:
            train_step_op = TrainTFMultiGPU(
                workers=workers,
                sgd_minibatch_size=config["train_batch_size"],
                num_sgd_iter=1,
                num_gpus=config["num_gpus"],
                shuffle_sequences=True,
                _fake_gpus=config["_fake_gpus"],
                framework=config.get("framework"))

        train_op = rollouts.combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )).for_each(train_step_op)

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the Simple Q algorithm. Defines the dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    buffer_kwargs = dict(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config["replay_sequence_length"],
    )
    buffer = LocalReplayBuffer(**buffer_kwargs)

    sampled = ParallelRollouts(workers, mode="bulk_sync")

    # (1) Sampling side: push every rollout into the local replay buffer.
    store_op = sampled.for_each(StoreToReplayBuffer(local_buffer=buffer))

    if config["simple_optimizer"]:
        learner = TrainOneStep(workers)
    else:
        learner = TrainTFMultiGPU(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    # (2) Learning side: replay from the buffer, train, then pass through
    # the target-network update op.
    replayed = Replay(local_buffer=buffer)
    trained = replayed.for_each(learner)
    replay_op = trained.for_each(
        UpdateTargetNetwork(workers, config["target_network_update_freq"]))

    # Alternate between (1) and (2) round-robin; only the output of (2)
    # (index 1) is emitted.
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config):
    """Sample, standardize advantages, train, then update KL per policy.

    Args:
        workers: The WorkerSet used for sampling and training.
        config: The trainer's configuration dict.

    Returns:
        The metrics iterator from StandardMetricsReporting, with
        `_warn_about_bad_reward_scales` applied to every result.
    """
    # Sample -> keep trainable policies only -> concat to train batch size
    # -> standardize the "advantages" field.
    batches = (
        ParallelRollouts(workers, mode="bulk_sync")
        .for_each(SelectExperiences(workers.trainable_policies()))
        .combine(ConcatBatches(min_batch_size=config["train_batch_size"]))
        .for_each(StandardizeFields(["advantages"]))
    )

    if config["simple_optimizer"]:
        learner = TrainOneStep(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            sgd_minibatch_size=config["sgd_minibatch_size"])
    else:
        learner = TrainTFMultiGPU(
            workers,
            sgd_minibatch_size=config["sgd_minibatch_size"],
            num_sgd_iter=config["num_sgd_iter"],
            num_gpus=config["num_gpus"],
            rollout_fragment_length=config["rollout_fragment_length"],
            num_envs_per_worker=config["num_envs_per_worker"],
            train_batch_size=config["train_batch_size"],
            shuffle_sequences=config["shuffle_sequences"],
            _fake_gpus=config["_fake_gpus"])
    trained = batches.for_each(learner)

    def update_kl(item):
        """Feed each trainable policy its "kl" value from the fetches.

        Returns None, so downstream ops receive None items.
        """
        _samples, fetches = item

        def _apply(policy, policy_id):
            if policy_id not in fetches:
                logger.warning(
                    "No data for {}, not updating kl".format(policy_id))
                return
            policy.update_kl(fetches[policy_id]["kl"])

        workers.local_worker().foreach_trainable_policy(_apply)

    # Update KL after each round of training.
    kl_updated = trained.for_each(update_kl)

    def _check_reward_scales(result):
        return _warn_about_bad_reward_scales(config, result)

    metrics = StandardMetricsReporting(kl_updated, workers, config)
    return metrics.for_each(_check_reward_scales)
def execution_plan(workers, config):
    """Build the train dataflow, with optional gradient microbatching.

    Args:
        workers: The WorkerSet used for sampling and training.
        config: The trainer's configuration dict.

    Returns:
        The iterator produced by StandardMetricsReporting for the train op.
    """
    sampled = ParallelRollouts(workers, mode="bulk_sync")

    microbatch_size = config["microbatch_size"]
    if not microbatch_size:
        # Normal mode: one SGD step per train batch.
        if config["simple_optimizer"]:
            learner = TrainOneStep(workers)
        else:
            learner = TrainTFMultiGPU(
                workers=workers,
                sgd_minibatch_size=config["train_batch_size"],
                num_sgd_iter=1,
                num_gpus=config["num_gpus"],
                shuffle_sequences=True,
                _fake_gpus=config["_fake_gpus"],
                framework=config.get("framework"))

        full_batches = sampled.combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"]))
        train_op = full_batches.for_each(learner)
        return StandardMetricsReporting(train_op, workers, config)

    # Microbatch mode: compute gradients per microbatch, average a number
    # of them and apply the average in one SGD step. This conserves GPU
    # memory, allowing very large experience batches.
    num_microbatches = math.ceil(config["train_batch_size"] / microbatch_size)

    microbatches = sampled.combine(
        ConcatBatches(
            min_batch_size=microbatch_size,
            count_steps_by=config["multiagent"]["count_steps_by"]))
    grads = microbatches.for_each(ComputeGradients(workers))  # (grads, info)
    grad_groups = grads.batch(num_microbatches)  # List[(grads, info)]
    averaged = grad_groups.for_each(AverageGradients())  # (avg_grads, info)
    train_op = averaged.for_each(ApplyGradients(workers))

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Execution plan of the DQN algorithm. Defines the distributed dataflow.

    Args:
        workers (WorkerSet): The WorkerSet for training the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    # Prioritization settings are forwarded to the buffer only when
    # prioritized replay is enabled.
    prio_args = {}
    if config.get("prioritized_replay"):
        prio_args = {
            key: config[key]
            for key in (
                "prioritized_replay_alpha",
                "prioritized_replay_beta",
                "prioritized_replay_eps",
            )
        }

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config.get("replay_sequence_length", 1),
        replay_burn_in=config.get("burn_in", 0),
        replay_zero_init_states=config.get("zero_init_states", True),
        **prio_args)

    sampled = ParallelRollouts(workers, mode="bulk_sync")

    # Two ops are run concurrently:
    # (1) Sampling: store every rollout in the local replay buffer. Calling
    # next() on store_op drives this.
    store_op = sampled.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        """Push new TD-errors to the buffer; pass the info dict through."""
        samples, info_dict = item
        if not config.get("prioritized_replay"):
            return info_dict

        priorities = {}
        for policy_id, info in info_dict.items():
            # TODO(sven): This is currently structured differently for
            #  torch/tf. Clean up these results/info dicts across
            #  policies (note: fixing this in torch_policy.py will
            #  break e.g. DDPPO!).
            td_error = info.get(
                "td_error", info[LEARNER_STATS_KEY].get("td_error"))
            policy_batch = samples.policy_batches[policy_id]
            # The get-interceptor is cleared before reading the indexes.
            policy_batch.set_get_interceptor(None)
            priorities[policy_id] = (
                policy_batch.get("batch_indexes"), td_error)
        local_replay_buffer.update_priorities(priorities)
        return info_dict

    # (2) Learning: every replayed batch goes through the optional
    # `before_learn_on_batch` hook, one training step, the priority update
    # and finally the target-network update op.
    post_fn = config.get("before_learn_on_batch")
    if not post_fn:

        def post_fn(b, *a):
            return b

    if config["simple_optimizer"]:
        learner = TrainOneStep(workers)
    else:
        learner = TrainTFMultiGPU(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    def _before_learn(batch):
        return post_fn(batch, workers, config)

    replay_op = (
        Replay(local_buffer=local_replay_buffer)
        .for_each(_before_learn)
        .for_each(learner)
        .for_each(update_prio)
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))
    )

    # Alternate between (1) and (2). Only the output of (2) is returned
    # since training metrics are not available until (2) runs.
    train_op = Concurrently(
        [store_op, replay_op],
        mode="round_robin",
        output_indexes=[1],
        round_robin_weights=calculate_rr_weights(config))

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config):
    """Replay-only training plan that publishes its buffer as a global.

    A LocalReplayBuffer is created and bound to the module-level name
    `replay_buffer`. No rollout/store op is built here; the plan only
    replays from the buffer, trains, updates priorities and passes through
    the target-network update op.
    NOTE(review): presumably other code fills the buffer through the
    `replay_buffer` global - confirm against the callers.

    Args:
        workers: The WorkerSet used for training.
        config: The trainer's configuration dict.

    Returns:
        The iterator produced by StandardMetricsReporting (called with
        `by_steps_trained=True`) for the replay op.
    """
    global replay_buffer

    use_prio = config.get("prioritized_replay")
    prio_args = {
        "prioritized_replay_alpha": config["prioritized_replay_alpha"],
        "prioritized_replay_beta": config["prioritized_replay_beta"],
        "prioritized_replay_eps": config["prioritized_replay_eps"],
    } if use_prio else {}

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        replay_sequence_length=config.get("replay_sequence_length", 1),
        replay_burn_in=config.get("burn_in", 0),
        replay_zero_init_states=config.get("zero_init_states", True),
        **prio_args)

    # Expose the buffer at module level.
    replay_buffer = local_replay_buffer

    def update_prio(item):
        """Push new TD-errors to the buffer; pass the info dict through."""
        samples, info_dict = item
        if config.get("prioritized_replay"):
            new_priorities = {}
            for pid, policy_info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                fallback = policy_info[LEARNER_STATS_KEY].get("td_error")
                td_error = policy_info.get("td_error", fallback)
                # The get-interceptor is cleared before reading the indexes.
                samples.policy_batches[pid].set_get_interceptor(None)
                indexes = samples.policy_batches[pid].get("batch_indexes")
                new_priorities[pid] = (indexes, td_error)
            local_replay_buffer.update_priorities(new_priorities)
        return info_dict

    # Every replayed batch goes through the optional `before_learn_on_batch`
    # hook, one training step, the priority update and finally the
    # target-network update op.
    hook = config.get("before_learn_on_batch")
    post_fn = hook if hook else (lambda b, *a: b)

    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        multi_gpu_kwargs = dict(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"),
        )
        train_step_op = TrainTFMultiGPU(**multi_gpu_kwargs)

    replayed = Replay(local_buffer=local_replay_buffer)
    hooked = replayed.for_each(lambda x: post_fn(x, workers, config))
    trained = hooked.for_each(train_step_op)
    reprioritized = trained.for_each(update_prio)
    replay_op = reprioritized.for_each(
        UpdateTargetNetwork(workers, config["target_network_update_freq"]))

    return StandardMetricsReporting(
        replay_op, workers, config, by_steps_trained=True)