Example #1
from ray.rllib.contrib.bandits.agents.policy import BanditPolicy

logger = logging.getLogger(__name__)

# yapf: disable
# __sphinx_doc_begin__
TS_CONFIG = with_common_config({
    # No remote workers by default.
    "num_workers": 0,
    "use_pytorch": True,

    # Do online learning one step at a time.
    "rollout_fragment_length": 1,
    "train_batch_size": 1,

    # Bandits can't afford to do just one timestep per iteration, as that is
    # extremely slow because of the metrics-collection overhead. With this
    # setting, the agent is trained 100 times within one RLlib iteration.
    "timesteps_per_iteration": 100,

    "exploration_config": {
        "type": "ray.rllib.contrib.bandits.exploration.ThompsonSampling"
    }
})
# __sphinx_doc_end__
# yapf: enable

LinTSTrainer = build_trainer(name="LinTS",
                             default_config=TS_CONFIG,
                             default_policy=BanditPolicy)
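A minimal usage sketch for the trainer built above, assuming Ray is available and a compatible bandit environment is registered under the illustrative name "my_bandit_env" (the env name is not part of the original snippet):

import ray
from ray import tune

ray.init()
tune.run(LinTSTrainer,
         config=dict(TS_CONFIG, env="my_bandit_env"),  # env name is illustrative
         stop={"training_iteration": 10})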
Example #2
        return [1, 1]
    # e.g., 32 / 4 -> native ratio of 8.0
    native_ratio = (config["train_batch_size"] /
                    config["rollout_fragment_length"])
    # Training intensity is specified in terms of
    # (steps_replayed / steps_sampled), so adjust for the native ratio.
    weights = [1, config["training_intensity"] / native_ratio]
    return weights
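A worked example of the weight calculation above, with illustrative (non-default) values; in DQN-style execution plans these weights are typically used for round-robin scheduling of the sampling vs. replay/training ops:

# Illustrative values: train_batch_size=32, rollout_fragment_length=4,
# training_intensity=16.
native_ratio = 32 / 4             # 8.0 steps trained per step sampled natively
weights = [1, 16 / native_ratio]  # [1, 2.0] -> two replay/train rounds per sampling round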


def get_policy_class(config: TrainerConfigDict) -> Type[Policy]:
    """Policy class picker function.

    Args:
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        Type[Policy]: The Policy class to use with SlateQTrainer.
    """
    if config["slateq_strategy"] == "RANDOM":
        return RandomPolicy
    else:
        return SlateQTorchPolicy


SlateQTrainer = build_trainer(name="SlateQ",
                              get_policy_class=get_policy_class,
                              default_config=DEFAULT_CONFIG,
                              validate_config=validate_config,
                              execution_plan=execution_plan)
Example #3
def ray_train(save_in_sub_folder=None,
              available_cluster_cpus=None,
              available_cluster_gpus=None,
              LOCAL_MODE=None,
              config=None,
              **mainkwargs):
    #config = gym.make(train_env_id).config

    subprocess.run(["chmod", "-R", "a+rwx", save_in_sub_folder + "/"])
    # Postprocess the perturbed config to ensure it's still valid

    s3pathname = 's3://datastore-s3/groups/Behavior/Pinaki'
    upload_dir_path = s3pathname + "/" + ray_folder + '/' + InceptcurrentDT
    if save_in_sub_folder is not None:
        local_dir_path = save_in_sub_folder
        # makedirpath(upload_dir_path)

    from ray.rllib.agents.impala.vtrace_policy import VTraceTFPolicy

    if is_predict_only() or LOCAL_MODE:
        delegated_cpus = 1
        delegated_gpus = 0
    else:
        delegated_cpus = available_cluster_cpus - 2
        delegated_gpus = available_cluster_gpus

    impala_config = impala.DEFAULT_CONFIG.copy()
    impala_config["num_gpus"] = 0
    ImpalaTrainer = build_trainer(
        name="IMPALA",
        default_config=impala_config,
        default_policy=VTraceTFPolicy,
        validate_config=impala.impala.validate_config,
        get_policy_class=impala.impala.choose_policy,
        make_workers=impala.impala.defer_make_workers,
        make_policy_optimizer=impala.impala.make_aggregators_and_optimizer,
        mixins=[impala.impala.OverrideDefaultResourceRequest])

    def make_async_optimizer(workers, config):
        return AsyncGradientsOptimizer(workers, grads_per_step=100)

    CustomTrainer = PPOTrainer.with_updates(
        make_policy_optimizer=make_async_optimizer)

    restore_folder = None
    algo = "PPO"  # RL Algorithm of choice
    LOAD_MODEL_FOLDER = config[
        "LOAD_MODEL_FOLDER"]  # Location of previous model (if needed) for training
    #RESTORE_COND = "NONE" # RESTORE: Use a previous model to start new training
    # RESTORE_AND_RESUME: Use a previous model to finish previous unfinished training
    # NONE: Start fresh

    # RESTORE_COND = config["RESTORE_COND"]
    RESTORE_COND = "NONE"
    if RESTORE_COND == "RESTORE_AND_RESUME":
        restore_folder, local_restore_path, _ = retrieve_ray_folder_info(
            LOAD_MODEL_FOLDER)
        local_dir = local_restore_path
        resume = True
    elif RESTORE_COND == "RESTORE":
        restore_folder, local_restore_path, _ = retrieve_ray_folder_info(
            LOAD_MODEL_FOLDER)
        local_dir = local_dir_path
        resume = False
    else:
        local_dir = local_dir_path
        resume = False

    checkpoint_freq = int(num_timesteps) // min(int(num_timesteps), 20)
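    # e.g., num_timesteps=1000 (illustrative) -> checkpoint_freq = 1000 // 20 = 50,
    # i.e., a checkpoint every 50 iterations, roughly 20 checkpoints per run.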

    retrieved_agent_policy = settings.retrieved_agent_policy

    model = config["MODEL"]
    print("delegated_cpus ", delegated_cpus, " delegated_gpus ",
          delegated_gpus)

    ray_trials = ray.tune.run(
        PPOTrainer,
        name="pygame-ray",
        stop={"training_iteration": int(num_timesteps)},
        checkpoint_freq=checkpoint_freq,
        checkpoint_at_end=True,
        local_dir=local_dir,
        # upload_dir=upload_dir_path,
        verbose=True,
        queue_trials=False,
        resume=resume,
        # scheduler=pbt,
        # trial_executor=RayTrialExecutor(),
        # resources_per_trial={"cpu": delegated_cpus, "gpu": 0},
        restore=restore_folder,
        #**es.DEFAULT_CONFIG,
        **{
            "num_samples": 1,
            "config": {
                "num_gpus_per_worker": 0,
                #"num_cpus_per_worker": 1,
                "num_gpus": delegated_gpus,
                "gamma": 0.85,
                "num_workers": delegated_cpus,
                "num_envs_per_worker": 2,
                "env": train_env_id,
                "remote_worker_envs": False,
                "model": model,
                "ignore_worker_failures": True,
                #"env_config": {
                #                "retrieved_agent_policy": 1,
                #              },
                #"callbacks": {
                #  "on_episode_start": ray.tune.function(on_episode_start),
                #             },
                # These params are tuned from a fixed starting value.
                # "lambda": 0.95,
                # "clip_param": 0.2,
                # "lr": 1e-4,
                # These params start off randomly drawn from a set.
                # "num_sgd_iter": sample_from(lambda spec: random.choice([10, 20, 30])),
                # "sgd_minibatch_size": sample_from(lambda spec: random.choice([128, 512, 2048])),
                # "train_batch_size": sample_from(lambda spec: random.choice([10000, 20000, 40000])),
            },
        })
    copy_terminal_output_file(
        save_folder=local_dir_path,
        terminal_output_file_name=terminal_output_file_name)
    subprocess.run(["chmod", "-R", "a+rwx", ray_folder + "/"])
Example #4
        return A3CTFPolicy


def validate_config(config):
    if config["entropy_coeff"] < 0:
        raise DeprecationWarning("entropy_coeff must be >= 0")
    if config["sample_async"] and config["use_pytorch"]:
        config["sample_async"] = False
        logger.warning(
            "The sample_async option is not supported with use_pytorch: "
            "Multithreading can lead to crashes if used with pytorch.")


def execution_plan(workers, config):
    # For A3C, compute policy gradients remotely on the rollout workers.
    grads = AsyncGradients(workers)

    # Apply the gradients as they arrive. We set update_all to False so that
    # only the worker sending the gradient is updated with new weights.
    train_op = grads.for_each(ApplyGradients(workers, update_all=False))

    return StandardMetricsReporting(train_op, workers, config)


A3CTrainer = build_trainer(name="A3C",
                           default_config=DEFAULT_CONFIG,
                           default_policy=A3CTFPolicy,
                           get_policy_class=get_policy_class,
                           validate_config=validate_config,
                           execution_plan=execution_plan)
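The class returned by build_trainer can also be used directly, without tune. A small sketch (the env name is just an example, not from the source):

trainer = A3CTrainer(config={"num_workers": 2}, env="CartPole-v0")
for _ in range(3):
    print(trainer.train()["episode_reward_mean"])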
Example #5
EXPERIMENT_NAME = "{scenario}-{algorithm}-{n_agent}"

scenario_root = (Path(__file__).parent / "../dataset_public").resolve()

scenario_paths = [
    scenario for scenario_dir in scenario_root.iterdir()
    for scenario in scenario_dir.iterdir() if scenario.is_dir()
]

print(f"training on {scenario_paths}")

from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG, execution_plan, validate_config
PPOTrainer = build_trainer(name="PPO_TORCH",
                           default_config=DEFAULT_CONFIG,
                           default_policy=PPOTorchPolicy,
                           execution_plan=execution_plan,
                           validate_config=validate_config)


def parse_args():
    parser = argparse.ArgumentParser("train on multi scenarios")

    # env setting
    parser.add_argument("--scenario",
                        type=str,
                        default=None,
                        help="Scenario name")
    parser.add_argument("--headless",
                        default=False,
                        action="store_true",
Example #6
    # Do online learning one step at a time.
    "rollout_fragment_length": 1,
    "train_batch_size": 1,

    # Bandits can't afford to do just one timestep per iteration, as that is
    # extremely slow because of the metrics-collection overhead. With this
    # setting, the agent is trained 100 times within one RLlib iteration.
    "timesteps_per_iteration": 100,

    "exploration_config": {
        "type": "ray.rllib.contrib.bandits.exploration.UCB"
    }
})
# __sphinx_doc_end__
# yapf: enable


def get_stats(trainer):
    env_metrics = trainer.collect_metrics()
    stats = trainer.optimizer.stats()
    # Uncomment if regret at each time step is needed
    # stats.update({"all_regrets": trainer.get_policy().regrets})
    return dict(env_metrics, **stats)


LinUCBTrainer = build_trainer(
    name="LinUCB",
    default_config=UCB_CONFIG,
    default_policy=BanditPolicy,
    collect_metrics_fn=get_stats)
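A quick note on the dict merge in get_stats above: with dict(env_metrics, **stats), any key present in both dicts takes its value from stats. A toy illustration (keys are made up):

a = {"episode_len": 10, "shared_key": 1}
b = {"shared_key": 2}
dict(a, **b)  # -> {"episode_len": 10, "shared_key": 2}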
Example #7
            .for_each(BroadcastUpdateLearnerWeights(
                learner_thread, workers,
                broadcast_interval=config["broadcast_interval"]))

    # This sub-flow updates the steps trained counter based on learner output.
    dequeue_op = Dequeue(
            learner_thread.outqueue, check=learner_thread.is_alive) \
        .for_each(record_steps_trained)

    merged_op = Concurrently(
        [enqueue_op, dequeue_op], mode="async", output_indexes=[1])
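    # Note: with output_indexes=[1], only results from dequeue_op (the learner
    # output) are yielded downstream into the metrics-reporting stage.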

    # Callback for APPO to use to update KL, target network periodically.
    # The input to the callback is the learner fetches dict.
    if config["after_train_step"]:
        merged_op = merged_op.for_each(lambda t: t[1]).for_each(
            config["after_train_step"](workers, config))

    return StandardMetricsReporting(merged_op, workers, config) \
        .for_each(learner_thread.add_learner_metrics)


ImpalaTrainer = build_trainer(
    name="IMPALA",
    default_config=DEFAULT_CONFIG,
    default_policy=VTraceTFPolicy,
    validate_config=validate_config,
    get_policy_class=get_policy_class,
    execution_plan=execution_plan,
    mixins=[OverrideDefaultResourceRequest])
Example #8
    # No remote workers by default.
    "num_workers": 0,
    # Learning rate.
    "lr": 0.0004,
    # Use the execution plan API instead of policy optimizers.
    "use_exec_api": True,
    "callbacks": MyCallbacks,
})

# Define the trainer.
# The _setup() function in trainer.py shows how the env is set up.
# The main entry point is _train() in trainer_template.py, which shows how
# the execution_plan (or other training logic) is invoked.
PGTrainer = build_trainer(
    name="PolicyGradientTrainer",
    default_config=DEFAULT_CONFIG,
    default_policy=PolicyGradient,
    execution_plan=execution_plan,
)


class InfoNumberRounds():
    def __init__(self, min_, max_, step):
        self.min = min_
        self.max = max_
        self.step = step


def self_play_workflow(config):
    """
    Expects in config:
        checkpoint
Example #9
parser = argparse.ArgumentParser()
parser.add_argument("--iters", type=int, default=200)


def policy_gradient_loss(policy, model, dist_class, train_batch):
    logits, _ = model({SampleBatch.CUR_OBS: train_batch[SampleBatch.CUR_OBS]})
    action_dist = dist_class(logits, model)
    log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS])
    return -train_batch[SampleBatch.REWARDS].dot(log_probs)
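A tiny numeric check of the loss above (tensor values are illustrative only):

import torch

rewards = torch.tensor([1.0, 1.0, 1.0])
log_probs = torch.tensor([-0.1, -0.2, -0.3])
loss = -rewards.dot(log_probs)  # tensor(0.6000); minimizing it raises log-probs of rewarded actions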


# <class 'ray.rllib.policy.torch_policy_template.MyTorchPolicy'>
MyTorchPolicy = build_torch_policy(name="MyTorchPolicy",
                                   loss_fn=policy_gradient_loss)

# <class 'ray.rllib.agents.trainer_template.MyCustomTrainer'>
MyTrainer = build_trainer(
    name="MyCustomTrainer",
    default_policy=MyTorchPolicy,
)

if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    tune.run(MyTrainer,
             stop={"training_iteration": args.iters},
             config={
                 "env": "CartPole-v0",
                 "num_workers": 2,
             })
Example #10
                         num_sgd_iter=config["num_sgd_iter"],
                         sgd_minibatch_size=config["sgd_minibatch_size"]))
    else:
        train_op = rollouts.for_each(
            TrainTFMultiGPU(workers=workers,
                            sgd_minibatch_size=config["sgd_minibatch_size"],
                            num_sgd_iter=config["num_sgd_iter"],
                            num_gpus=config["num_gpus"],
                            shuffle_sequences=config["shuffle_sequences"],
                            _fake_gpus=config["_fake_gpus"],
                            framework=config.get("framework")))

    # Update KL after each round of training.
    train_op = train_op.for_each(lambda t: t[1]).for_each(UpdateKL(workers))

    # Warn about bad reward scales and return training metrics.
    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: warn_about_bad_reward_scales(config, result))


# Build a child class of `Trainer`, which uses the framework specific Policy
# determined in `get_policy_class()` above.
PPOTrainer = build_trainer(
    name="PPO",
    default_config=DEFAULT_CONFIG,
    validate_config=validate_config,
    default_policy=PPOTFPolicy,
    get_policy_class=get_policy_class,
    execution_plan=execution_plan,
)
Example #11
        "use_gae": False,
        "vf_loss_coeff": 0.5,
        "entropy_coeff": 0.01,
        "truncate_episodes": True,
        "use_critic": True,
        "grad_clip": 40.0,
        "lr": 0.0001,
        "min_iter_time_s": 5,
        "sample_async": True,
        "lr_schedule": None,
    }
)


CA2CTFPolicy = build_tf_policy(
    name="CA2CTFPolicy",
    stats_fn=stats,
    grad_stats_fn=central_vf_stats,
    loss_fn=ac_loss_func,
    postprocess_fn=postprocess_trajectory,
    before_loss_init=setup_mixins,
    make_model=build_cac_model,
    mixins=[CentralizedValueMixin],
    get_default_config=lambda: DEFAULT_CONFIG,
)


CA2CTrainer = build_trainer(
    name="CA2C", default_policy=CA2CTFPolicy, default_config=DEFAULT_CONFIG
)
Example #12
    # policy configs, we have to explicitly set it in the multiagent config:
    policies = {
        "ppo_policy": (PPOTorchPolicy if args.torch or args.mixed_torch_tf else
                       PPOTFPolicy, obs_space, act_space, PPO_CONFIG),
        "dqn_policy": (DQNTorchPolicy if args.torch else DQNTFPolicy,
                       obs_space, act_space, DQN_CONFIG),
    }

    def policy_mapping_fn(agent_id):
        if agent_id % 2 == 0:
            return "ppo_policy"
        else:
            return "dqn_policy"

    MyTrainer = build_trainer(
        name="PPO_DQN_MultiAgent",
        default_policy=None,
        execution_plan=custom_training_workflow)

    config = {
        "rollout_fragment_length": 50,
        "num_workers": 0,
        "env": "multi_agent_cartpole",
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["dqn_policy", "ppo_policy"],
        },
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": "torch" if args.torch else "tf",
        "_use_trajectory_view_api": True,

Example #13
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    rollouts = ParallelRollouts(workers, mode="async")

    # Collect batches for the trainable policies.
    rollouts = rollouts.for_each(
        SelectExperiences(local_worker=workers.local_worker()))

    # Return training metrics.
    return StandardMetricsReporting(rollouts, workers, config)


RandomParametricTrainer = build_trainer(name="RandomParametric",
                                        default_config=DEFAULT_CONFIG,
                                        default_policy=RandomParametriclPolicy,
                                        execution_plan=execution_plan)


def main():
    register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10))
    trainer = RandomParametricTrainer(env="pa_cartpole")
    result = trainer.train()
    assert result["episode_reward_mean"] > 10, result
    print("Test: OK")


if __name__ == "__main__":
    ray.init()
    main()
Example #14
                         sgd_minibatch_size=config["sgd_minibatch_size"]))

    # Update KL after each round of training.
    train_op = train_op.for_each(lambda t: t[1]).for_each(UpdateKL(workers))

    return StandardMetricsReporting(train_op, workers, config) \
        .for_each(lambda result: warn_about_bad_reward_scales(config, result))


#######################################################################################################
#####################################   Trainer   #####################################################
#######################################################################################################

new_config = {
    # customs
    "embed_dim": 256,
    "encoder_type": "impala",
    "augmentation": True,
    "aug_num": 2,
    "max_shift": 4,
}
PPO_CONFIG = DEFAULT_CONFIG.copy()
PPO_CONFIG.update(new_config)

DrqPPOTrainer = build_trainer(name="DrqPPO",
                              default_config=PPO_CONFIG,
                              default_policy=DrqPPOTorchPolicy,
                              get_policy_class=get_policy_class,
                              execution_plan=execution_plan,
                              validate_config=validate_config)
Example #15
def get_policy_class(config):
    if config["framework"] == "torch":
        from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy
        return DQNTorchPolicy
    else:
        return DQNTFPolicy


def get_simple_policy_class(config):
    if config["framework"] == "torch":
        from ray.rllib.agents.dqn.simple_q_torch_policy import \
            SimpleQTorchPolicy
        return SimpleQTorchPolicy
    else:
        return SimpleQTFPolicy


GenericOffPolicyTrainer = build_trainer(name="GenericOffPolicyAlgorithm",
                                        default_policy=None,
                                        get_policy_class=get_policy_class,
                                        default_config=DEFAULT_CONFIG,
                                        validate_config=validate_config,
                                        execution_plan=execution_plan)

DQNTrainer = GenericOffPolicyTrainer.with_updates(
    name="DQN", default_policy=DQNTFPolicy, default_config=DEFAULT_CONFIG)

SimpleQTrainer = DQNTrainer.with_updates(
    default_policy=SimpleQTFPolicy, get_policy_class=get_simple_policy_class)
Example #16
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.pg.pg_policy import PGTFPolicy

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # No remote workers by default
    "num_workers": 0,
    # Learning rate
    "lr": 0.0004,
    # Use PyTorch as backend
    "use_pytorch": False,
})
# __sphinx_doc_end__
# yapf: enable


def get_policy_class(config):
    if config["use_pytorch"]:
        from ray.rllib.agents.pg.torch_pg_policy import PGTorchPolicy
        return PGTorchPolicy
    else:
        return PGTFPolicy


PGTrainer = build_trainer(name="PGTrainer",
                          default_config=DEFAULT_CONFIG,
                          default_policy=PGTFPolicy,
                          get_policy_class=get_policy_class)
Example #17
        raise ValueError("Must have an actual Env created on the driver "
                         "(local) worker! Set `create_env_on_driver` to True.")


def validate_env(env: EnvType, env_context: EnvContext):
    """Validates the local_worker's env object (after creation).

    Args:
        env (EnvType): The env object to check (for worker=0 only).
        env_context (EnvContext): The env context used for the instantiation of
            the local worker's env (worker=0).

    Raises:
        ValueError: In case something is wrong with the config.
    """
    if not hasattr(env, "reward") or not callable(env.reward):
        raise ValueError("Env {} does not have a `reward()` method, needed "
                         "for MB-MPO!".format(env))


# Build a child class of `Trainer`, which uses the default policy,
# MBMPOTorchPolicy. A TensorFlow version is not available yet.
MBMPOTrainer = build_trainer(
    name="MBMPO",
    default_config=DEFAULT_CONFIG,
    default_policy=MBMPOTorchPolicy,
    execution_plan=execution_plan,
    validate_config=validate_config,
    validate_env=validate_env,
)
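For illustration only, an env stub that would pass the reward() check in validate_env above. The argument list of reward() below is an assumption (the exact signature MB-MPO expects is not shown in this snippet):

import gym
import numpy as np


class MyModelBasedEnv(gym.Env):  # hypothetical example env
    observation_space = gym.spaces.Box(-1.0, 1.0, shape=(1,), dtype=np.float32)
    action_space = gym.spaces.Discrete(2)

    def reset(self):
        return np.zeros(1, dtype=np.float32)

    def step(self, action):
        obs = np.zeros(1, dtype=np.float32)
        return obs, self.reward(obs, action, obs), False, {}

    def reward(self, obs, action, obs_next):
        # Reward computable from (obs, action, next_obs) alone, so rollouts
        # through a learned dynamics model can be re-labeled with rewards.
        return 0.0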
Example #18
        return DQNTFPolicy


def get_simple_policy_class(config):
    if config["use_pytorch"]:
        from ray.rllib.agents.dqn.simple_q_torch_policy import \
            SimpleQTorchPolicy
        return SimpleQTorchPolicy
    else:
        return SimpleQTFPolicy


GenericOffPolicyTrainer = build_trainer(
    name="GenericOffPolicyAlgorithm",
    default_policy=None,
    get_policy_class=get_policy_class,
    default_config=DEFAULT_CONFIG,
    validate_config=validate_config,
    get_initial_state=get_initial_state,
    make_policy_optimizer=make_policy_optimizer,
    before_train_step=update_worker_exploration,
    after_optimizer_step=update_target_if_needed,
    after_train_result=after_train_result,
    execution_plan=execution_plan)

DQNTrainer = GenericOffPolicyTrainer.with_updates(
    name="DQN", default_policy=DQNTFPolicy, default_config=DEFAULT_CONFIG)

SimpleQTrainer = DQNTrainer.with_updates(
    default_policy=SimpleQTFPolicy, get_policy_class=get_simple_policy_class)
Example #19
            selected_workers=trainer.workers.remote_workers()
            [-len(trainer.workers.remote_workers()) // 3:])
    else:
        result = trainer.collect_metrics()
    return result


def disable_exploration(trainer):
    trainer.evaluation_workers.local_worker().foreach_policy(
        lambda p, _: p.set_epsilon(0))


GenericOffPolicyTrainer = build_trainer(
    name="GenericOffPolicyAlgorithm",
    default_policy=None,
    default_config=DEFAULT_CONFIG,
    validate_config=check_config_and_setup_param_noise,
    get_initial_state=get_initial_state,
    make_policy_optimizer=make_optimizer,
    before_init=setup_exploration,
    before_train_step=update_worker_explorations,
    after_optimizer_step=update_target_if_needed,
    after_train_result=add_trainer_metrics,
    collect_metrics_fn=collect_metrics,
    before_evaluate_fn=disable_exploration)

DQNTrainer = GenericOffPolicyTrainer.with_updates(
    name="NAF", default_policy=DQNTFPolicy, default_config=DEFAULT_CONFIG)

SimpleQTrainer = DQNTrainer.with_updates(default_policy=SimpleQPolicy)
Example #20
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    # (2) Read and train on experiences from the replay buffer.
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(train_step_op) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2).
    train_op = Concurrently([store_op, replay_op],
                            mode="round_robin",
                            output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)


# Build a child class of `Trainer`, which uses the framework specific Policy
# determined in `get_policy_class()` above.
SimpleQTrainer = build_trainer(
    name="SimpleQTrainer",
    default_policy=SimpleQTFPolicy,
    get_policy_class=get_policy_class,
    execution_plan=execution_plan,
    default_config=DEFAULT_CONFIG,
)
Example #21

class OverrideDefaultResourceRequest:
    @classmethod
    @override(Trainable)
    def default_resource_request(cls, config):
        cf = dict(cls._default_config, **config)
        Trainer._validate_config(cf)
        return Resources(
            cpu=cf["num_cpus_for_driver"],
            gpu=cf["num_gpus"],
            memory=cf["memory"],
            object_store_memory=cf["object_store_memory"],
            extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
            cf["num_aggregation_workers"],
            extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"],
            extra_memory=cf["memory_per_worker"] * cf["num_workers"],
            extra_object_store_memory=cf["object_store_memory_per_worker"] *
            cf["num_workers"])


ImpalaTrainer = build_trainer(
    name="IMPALA",
    default_config=DEFAULT_CONFIG,
    default_policy=VTraceTFPolicy,
    validate_config=validate_config,
    get_policy_class=choose_policy,
    make_workers=defer_make_workers,
    make_policy_optimizer=make_aggregators_and_optimizer,
    mixins=[OverrideDefaultResourceRequest])
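For intuition, the resource request above with illustrative (non-default) values:

# Illustrative values: num_cpus_for_driver=1, num_gpus=0, num_workers=4,
# num_cpus_per_worker=1, num_aggregation_workers=2, num_gpus_per_worker=0.
extra_cpu = 1 * 4 + 2  # 4 rollout-worker CPUs + 2 aggregation-worker CPUs = 6
extra_gpu = 0 * 4      # 0
# -> the trial requests 1 driver CPU plus 6 extra CPUs (7 total) and no GPUs.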
Example #22
def get_policy_class(config):
    if config["use_pytorch"]:
        from ray.rllib.agents.pg.pg_torch_policy import PGTorchPolicy
        return PGTorchPolicy
    else:
        return PGTFPolicy


# Experimental pipeline-based impl; enable with "use_pipeline_impl": True.
def training_pipeline(workers, config):
    # Collects experiences in parallel from multiple RolloutWorker actors.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Combine experience batches until we hit `train_batch_size` in size.
    # Then, train the policy on those experiences and update the workers.
    train_op = rollouts \
        .combine(ConcatBatches(
            min_batch_size=config["train_batch_size"])) \
        .for_each(TrainOneStep(workers))

    # Add on the standard episode reward, etc. metrics reporting. This returns
    # a LocalIterator[metrics_dict] representing metrics for each train step.
    return StandardMetricsReporting(train_op, workers, config)


PGTrainer = build_trainer(name="PG",
                          default_config=DEFAULT_CONFIG,
                          default_policy=PGTFPolicy,
                          get_policy_class=get_policy_class,
                          training_pipeline=training_pipeline)
Example #23
    reservoir_buffers = MultiAgentReservoirBuffer(
        reservoir_size, config["multiagent"]["policies"])
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # 2. define store operations
    store_op = rollouts.for_each(
        StoreToBuffers(replay_buffers, reservoir_buffers,
                       config['multiagent']['policies_to_train']))  # Sampling

    # 3. define replay/reservoir operations
    replay_op = SimpleLocalReplayMultiagent(replay_buffers, config["replay_train_batch_size"],
                                      config["replay_min_size_to_learn"],
                                      config["replay_train_every"]) \
        .for_each(TrainOneStep(workers))\
        .for_each(UpdateTargetNetwork(workers, config['dqn_policy']["target_network_update_freq"]))

    reservoir_op = LocalReservoirMultiagent(reservoir_buffers, config["reservoir_train_batch_size"],
                                            config["reservoir_min_size_to_learn"],
                                            config["reservoir_train_every"])\
        .for_each(TrainOneStep(workers))

    # 4. define main train loop
    train_op = Concurrently([replay_op, reservoir_op, store_op],
                            mode="round_robin")
    return LowMemoryMetricsReporting(train_op, workers, config)


NFSPTrainer = build_trainer(name='NFSPTrainer',
                            default_policy=NFSPPolicy,
                            default_config=NFSP_CONFIG,
                            execution_plan=execution_plan_nfsp)
Example #24
            "prioritized_replay_beta":
            config["prioritized_replay_beta"],
            "prioritized_replay_beta_annealing_timesteps":
            config["prioritized_replay_beta_annealing_timesteps"],
            "final_prioritized_replay_beta":
            config["final_prioritized_replay_beta"],
            "prioritized_replay_eps":
            config["prioritized_replay_eps"],
        })
    return SyncReplayOptimizer(workers,
                               learning_starts=config["learning_starts"],
                               buffer_size=config["buffer_size"],
                               train_batch_size=config["train_batch_size"],
                               before_learn_on_batch=before_learn_on_batch,
                               **kwargs)


DataAugmentingDQNTrainer = build_trainer(
    name="data_augmenting_dqn_trainer",
    default_policy=DataAugmentingDQNTFPolicy,
    get_policy_class=get_policy_class,
    default_config=DEFAULT_CONFIG,
    validate_config=validate_config,
    get_initial_state=get_initial_state,
    make_policy_optimizer=make_data_augmenting_policy_optimizer,
    before_train_step=update_worker_exploration,
    after_optimizer_step=update_target_if_needed,
    after_train_result=after_train_result,
)
#   execution_plan=execution_plan)
Example #25
from benchmark.networks.communicate import NetworkedMixin, postprocess_trajectory


def networked_pg_loss(policy, model, dist_class, train_batch):
    # make sure these batch keys are accessed so their gradients are kept
    for k in train_batch.keys():
        if "var" in k or "gamma" in k:
            _ = train_batch[k].shape

    return pg_tf_loss(policy, model, dist_class, train_batch)


def setupmixin(policy, obs_space, action_space, config):
    NetworkedMixin.__init__(policy)


NetworkedPG = build_tf_policy(
    name="NetworkedPG",
    get_default_config=lambda: PG_DEFAULT_CONFIG,
    postprocess_fn=postprocess_trajectory,
    loss_fn=networked_pg_loss,
    mixins=[NetworkedMixin],
    after_init=setupmixin,
)


NetworkedPGTrainer = build_trainer(
    name="NetworkedPGTrainer",
    default_policy=NetworkedPG,
)
Example #26
# above.
MultiPPOTorchPolicy = build_policy_class(
    name="MultiPPOTorchPolicy",
    framework="torch",
    get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
    loss_fn=ppo_surrogate_loss,
    stats_fn=kl_and_loss_stats,
    extra_action_out_fn=vf_preds_fetches,
    postprocess_fn=compute_gae_for_sample_batch,
    extra_grad_process_fn=apply_grad_clipping,
    before_init=setup_config,
    before_loss_init=setup_mixins_override,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin
    ],
)


def get_policy_class(config):
    return MultiPPOTorchPolicy


MultiPPOTrainer = build_trainer(
    name="MultiPPO",
    default_config=ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
    validate_config=ray.rllib.agents.ppo.ppo.validate_config,
    default_policy=MultiPPOTorchPolicy,
    get_policy_class=get_policy_class,
    execution_plan=ray.rllib.agents.ppo.ppo.execution_plan)
Example #27
                                     config["microbatch_size"])
        # In microbatch mode, we want to compute gradients on experience
        # microbatches, average a number of these microbatches, and then apply
        # the averaged gradient in one SGD step. This conserves GPU memory,
        # allowing for extremely large experience batches to be used.
        train_op = (
            rollouts.combine(
                ConcatBatches(
                    min_batch_size=config["microbatch_size"])).for_each(
                        ComputeGradients(workers))  # (grads, info)
            .batch(num_microbatches)  # List[(grads, info)]
            .for_each(AverageGradients())  # (avg_grads, info)
            .for_each(ApplyGradients(workers)))
    else:
        # In normal mode, we execute one SGD step per each train batch.
        train_op = rollouts \
            .combine(ConcatBatches(
                min_batch_size=config["train_batch_size"])) \
            .for_each(TrainOneStep(workers))

    return StandardMetricsReporting(train_op, workers, config)


A2CTrainer = build_trainer(name="A2C",
                           default_config=A2C_DEFAULT_CONFIG,
                           default_policy=A3CTFPolicy,
                           get_policy_class=get_policy_class,
                           make_policy_optimizer=choose_policy_optimizer,
                           validate_config=validate_config,
                           training_pipeline=training_pipeline)
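A worked example of the microbatch path above, assuming num_microbatches is derived as train_batch_size / microbatch_size (that computation is cut off at the top of the snippet); the values are illustrative:

# train_batch_size=4000, microbatch_size=1000 -> num_microbatches = 4:
# gradients are computed on 4 microbatches of 1000 steps each, averaged,
# and applied in a single SGD step, so peak memory scales with 1000 steps
# rather than 4000.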
Example #28
    if config["simple_optimizer"]:
        train_op = rollouts \
            .combine(ConcatBatches(
                min_batch_size=config["train_batch_size"])) \
            .for_each(TrainOneStep(
                workers, num_sgd_iter=config["num_sgd_iter"]))
    else:
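        # Off-policy variant: rollouts are stored in a replay buffer, and
        # training on replayed batches only starts once `learning_starts`
        # timesteps have been collected (see the WaitUntilTimestepsElapsed
        # filter below).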
        replay_buffer = SimpleReplayBuffer(config["buffer_size"])

        store_op = rollouts \
            .for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

        replay_op = Replay(local_buffer=replay_buffer) \
            .filter(WaitUntilTimestepsElapsed(config["learning_starts"])) \
            .combine(
                ConcatBatches(min_batch_size=config["train_batch_size"])) \
            .for_each(TrainOneStep(
                workers, num_sgd_iter=config["num_sgd_iter"]))

        train_op = Concurrently([store_op, replay_op],
                                mode="round_robin",
                                output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)


AlphaZeroTrainer = build_trainer(name="AlphaZero",
                                 default_config=DEFAULT_CONFIG,
                                 default_policy=AlphaZeroPolicyWrapperClass,
                                 execution_plan=execution_plan)
Example #29
        return MARWILTorchPolicy


def execution_plan(workers, config):
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    replay_buffer = SimpleReplayBuffer(config["replay_buffer_size"])

    store_op = rollouts \
        .for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

    replay_op = Replay(local_buffer=replay_buffer) \
        .combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )) \
        .for_each(TrainOneStep(workers))

    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)


MARWILTrainer = build_trainer(
    name="MARWIL",
    default_config=DEFAULT_CONFIG,
    default_policy=MARWILTFPolicy,
    get_policy_class=get_policy_class,
    execution_plan=execution_plan)
Example #30
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This is
    # usually slower, but you might want to try it if you run into issues with
    # the default optimizer.
    "simple_optimizer": False,
    # Use PyTorch as framework?
    "use_pytorch": False
})
# __sphinx_doc_end__
# yapf: enable

from ray.rllib.agents.ppo.ppo import choose_policy_optimizer, update_kl,\
 warn_about_bad_reward_scales, validate_config


def get_policy_class(config):
    if config.get("use_pytorch") is True:
        from algorithms.master_agent.master_policy import PPOTorchPolicy
        return PPOTorchPolicy
    else:
        return PPOTFPolicy


MasterAgent = build_trainer(name="MasterAgent",
                            default_config=DEFAULT_CONFIG,
                            default_policy=PPOTFPolicy,
                            get_policy_class=get_policy_class,
                            make_policy_optimizer=choose_policy_optimizer,
                            validate_config=validate_config,
                            after_optimizer_step=update_kl,
                            after_train_result=warn_about_bad_reward_scales)