Example #1
def __init__(self):
    super().__init__(
        with_common_config({
            # PPO specific keys:
            "use_critic": True,
            "use_gae": True,
            "lambda": 1.0,
            "kl_coeff": 0.2,
            "sgd_minibatch_size": 128,
            "shuffle_sequences": True,
            "num_sgd_iter": 30,
            "lr_schedule": None,
            "vf_loss_coeff": 1.0,
            "entropy_coeff": 0.0,
            "entropy_coeff_schedule": None,
            "clip_param": 0.3,
            "vf_clip_param": 10.0,
            "grad_clip": None,
            "kl_target": 0.01,
            "rollout_fragment_length": 200,
            # TrainerConfig overrides:
            "train_batch_size": 4000,
            "lr": 5e-5,
            "model": {
                "vf_share_layers": False,
            },
            "_disable_execution_plan_api": True,
        }))
Example #2
def get_default_config(cls) -> TrainerConfigDict:
    # Run this Trainer with the new `training_iteration` API and set some
    # PPO-specific parameters.
    return with_common_config({
        "num_sgd_iter": 10,
        "sgd_minibatch_size": 128,
    })
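For context, with_common_config() simply overlays the keys you pass on top of RLlib's COMMON_CONFIG defaults and returns the merged dict. A minimal sketch of that behavior (assuming Ray 1.x, where the helper lives in ray.rllib.agents; the printed "num_workers" default is 2 in most of those releases):

from ray.rllib.agents import with_common_config

config = with_common_config({
    "num_sgd_iter": 10,
    "sgd_minibatch_size": 128,
})
print(config["num_sgd_iter"])   # 10 -- explicitly overridden above
print(config["num_workers"])    # 2  -- falls back to the COMMON_CONFIG default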
Example #3
def rand_func():
    wandb.init(config={})
    config = with_common_config(wandb.config)
    config['evaluation_num_episodes'] = 10
    config['num_workers'] = mp.cpu_count() - 1
    config['num_envs_per_worker'] = 4
    config['metrics_smoothing_episodes'] = 2000
    config['observation_filter'] = 'NoFilter'
    config['env'] = default_config['env']
    config['env_config'] = {
        'env_config': {
            'instance_path': config['instance_path']
        }
    }
    config.pop('instance_path', None)

    config['callbacks'] = CustomCallbacks

    ray.init()

    stop = {
        "time_total_s": 600,
    }

    analysis = tune.run(RandomMaskedTrainer,
                        config=config,
                        stop=stop,
                        name="ppo-jss")
    result = analysis.results_df.to_dict('index')
    last_run_id = list(result.keys())[0]
    result = result[last_run_id]
    wandb.log({'time_step_min': result['custom_metrics.time_step_min']})
    if result['custom_metrics.time_step_max'] != float('inf'):
        wandb.log({'time_step_max': result['custom_metrics.time_step_max']})
        wandb.log({'time_step_mean': result['custom_metrics.time_step_mean']})
    wandb.log({'episode_reward_max': result['episode_reward_max']})
    wandb.log({'episode_reward_min': result['episode_reward_min']})
    wandb.log({'episode_reward_mean': result['episode_reward_mean']})
    wandb.log({'episodes_total': result['episodes_total']})
    wandb.log({'training_iteration': result['training_iteration']})

    ray.shutdown()
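Because rand_func() pulls its hyperparameters from wandb.config, it is presumably launched by a Weights & Biases sweep agent. A hedged sketch of such a launcher; the sweep parameters, values, and project name are assumptions and not taken from the snippet above:

import wandb

sweep_config = {
    "method": "random",
    "parameters": {
        # Hypothetical search space; "instance_path" must be whatever
        # rand_func() expects to read back from wandb.config.
        "instance_path": {"values": ["instances/ta41", "instances/ta42"]},
    },
}
sweep_id = wandb.sweep(sweep_config, project="ppo-jss")
wandb.agent(sweep_id, function=rand_func, count=5)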
Example #4
class MinimalTrainer(Trainer):
    _name = "MinimalTrainer"
    _default_config = with_common_config(
        {"workers": False, "optimizer": False, "tracker": False}
    )
    _policy = DummyPolicy

    def _init(self, config, env_creator):
        def make_workers():
            return self._make_workers(
                env_creator, self._policy, config, num_workers=config["num_workers"]
            )

        if config["tracker"]:
            self.tracker = StatsTracker(make_workers())
        elif config["optimizer"]:
            self.optimizer = PolicyOptimizer(make_workers())
        elif config["workers"]:
            self.workers = make_workers()

    def _train(self):
        return self._log_metrics({})
Example #5
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Initial coefficient for KL divergence
    "kl_coeff": 0.0005,
    # Size of batches collected from each worker
    "rollout_fragment_length": 200,
    # Do create an actual env on the local worker (worker-idx=0).
    "create_env_on_driver": True,
    # Stepsize of SGD
    "lr": 1e-3,
    "model": {
        # Share layers for value function.
        "vf_share_layers": False,
    },
    # Coefficient of the value function loss
    "vf_loss_coeff": 0.5,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
    # PPO clip parameter
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount
    "grad_clip": None,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "complete_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "NoFilter",
    # Number of Inner adaptation steps for the MAML algorithm
    "inner_adaptation_steps": 1,
    # Number of MAML steps per meta-update iteration (PPO steps)
    "maml_optimizer_steps": 5,
    # Inner Adaptation Step size
    "inner_lr": 0.1,
    # Use Meta Env Template
    "use_meta_env": True,

    # Deprecated keys:
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    # Use config.model.vf_share_layers instead.
    "vf_share_layers": DEPRECATED_VALUE,
})
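A hedged sketch of how a module-level DEFAULT_CONFIG like the one above is typically consumed: copy it, override a few keys, and hand the result to the registered trainer via Tune. The env string is a hypothetical placeholder (MAML needs a meta-learning env registered separately), not something defined in the snippet:

import ray
from ray import tune

ray.init()
config = dict(DEFAULT_CONFIG, **{
    "env": "my_registered_meta_env",  # hypothetical; must be a MAML-compatible env
    "num_workers": 2,
})
tune.run("MAML", config=config, stop={"training_iteration": 1})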
Example #6
logger = logging.getLogger(__name__)

Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    "noise_stdev": 0.02,  # std deviation of parameter noise
    "num_rollouts": 32,  # number of perturbs to try
    "rollouts_used": 32,  # number of perturbs to keep in gradient estimate
    "num_workers": 2,
    "sgd_stepsize": 0.01,  # sgd step-size
    "observation_filter": "MeanStdFilter",
    "noise_size": 250000000,
    "eval_prob": 0.03,  # probability of evaluating the parameter rewards
    "report_length": 10,  # how many of the last rewards we average over
    "offset": 0,
})
# __sphinx_doc_end__
# yapf: enable


@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers."""
    seed = 123
    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
    return noise
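The noise created by this remote task is normally wrapped in a shared lookup table on every worker. A hedged sketch of that pattern (it assumes Ray has been initialized and that a SharedNoiseTable helper with get()/sample_index() methods is defined alongside create_shared_noise, as in RLlib's ES/ARS modules):

noise_id = create_shared_noise.remote(DEFAULT_CONFIG["noise_size"])
noise_table = SharedNoiseTable(ray.get(noise_id))

# Draw a random offset and read a contiguous perturbation of `dim` floats.
dim = 100
idx = noise_table.sample_index(dim)
perturbation = noise_table.get(idx, dim)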
Example #7
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Initial coefficient for KL divergence
    "kl_coeff": 0.2,
    # Number of timesteps collected for each SGD round
    "timesteps_per_batch": 4000,
    # Number of SGD iterations in each outer loop
    "num_sgd_iter": 30,
    # Stepsize of SGD
    "sgd_stepsize": 5e-5,
    # Total SGD batch size across all devices for SGD
    "sgd_batchsize": 128,
    # Coefficient of the value function loss
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
    # PPO clip parameter
    "clip_param": 0.3,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Number of GPUs to use for SGD
    "num_gpus": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "complete_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "MeanStdFilter",
})
Example #8
from ray.rllib.agents.ars import policies
from ray.rllib.agents.es import tabular_logger as tlogger
from ray.rllib.agents.ars import utils

Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])

DEFAULT_CONFIG = with_common_config({
    'noise_stdev': 0.02,  # std deviation of parameter noise
    'num_deltas': 4,  # number of perturbations to try
    'deltas_used': 4,  # number of perturbations to keep in gradient estimate
    'num_workers': 2,
    'stepsize': 0.01,  # sgd step-size
    'observation_filter': "MeanStdFilter",
    'noise_size': 250000000,
    'eval_prob': 0.03,  # probability of evaluating the parameter rewards
    'env_config': {},
    'offset': 0,
    'policy_type': "LinearPolicy",  # ["LinearPolicy", "MLPPolicy"]
    "fcnet_hiddens": [32, 32],  # fcnet structure of MLPPolicy
})


@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers."""
    seed = 123
    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
    return noise
Example #9
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter.
    "lambda": 1.0,
    # Initial coefficient for KL divergence.
    "kl_coeff": 0.0005,
    # Size of batches collected from each worker.
    "rollout_fragment_length": 200,
    # Do create an actual env on the local worker (worker-idx=0).
    "create_env_on_driver": True,
    # Step size of SGD.
    "lr": 1e-3,
    # Coefficient of the value function loss.
    "vf_loss_coeff": 0.5,
    # Coefficient of the entropy regularizer.
    "entropy_coeff": 0.0,
    # PPO clip parameter.
    "clip_param": 0.5,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount.
    "grad_clip": None,
    # Target value for KL divergence.
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes".
    "batch_mode": "complete_episodes",
    # Which observation filter to apply to the observation.
    "observation_filter": "NoFilter",
    # Number of Inner adaptation steps for the MAML algorithm.
    "inner_adaptation_steps": 1,
    # Number of MAML steps per meta-update iteration (PPO steps).
    "maml_optimizer_steps": 8,
    # Inner adaptation step size.
    "inner_lr": 1e-3,
    # Horizon of the environment (200 in MB-MPO paper).
    "horizon": 200,
    # Dynamics ensemble hyperparameters.
    "dynamics_model": {
        "custom_model": DynamicsEnsembleCustomModel,
        # Number of Transition-Dynamics (TD) models in the ensemble.
        "ensemble_size": 5,
        # Hidden layers for each model in the TD-model ensemble.
        "fcnet_hiddens": [512, 512, 512],
        # Model learning rate.
        "lr": 1e-3,
        # Max number of training epochs per MBMPO iter.
        "train_epochs": 500,
        # Model batch size.
        "batch_size": 500,
        # Training/validation split.
        "valid_split_ratio": 0.2,
        # Normalize data (obs, action, and deltas).
        "normalize_data": True,
    },
    # Exploration for MB-MPO is based on StochasticSampling, but uses 8000
    # random timesteps up-front for worker=0.
    "exploration_config": {
        "type": MBMPOExploration,
        "random_timesteps": 8000,
    },
    # Workers sample from dynamics models, not from actual envs.
    "custom_vector_env": model_vector_env,
    # How many iterations through MAML per MBMPO iteration.
    "num_maml_steps": 10,

    # Deprecated keys:
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    # Use config.model.vf_share_layers instead.
    "vf_share_layers": DEPRECATED_VALUE,
})
Example #10
File: ppo.py Project: zdpau/ray-1
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Initial coefficient for KL divergence
    "kl_coeff": 0.2,
    # Size of batches collected from each worker
    "sample_batch_size": 200,
    # Number of timesteps collected for each SGD round
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD
    "sgd_minibatch_size": 128,
    # Number of SGD iterations in each outer loop
    "num_sgd_iter": 30,
    # Stepsize of SGD
    "lr": 5e-5,
    # Learning rate schedule
    "lr_schedule": None,
    # Share layers for value function
    "vf_share_layers": False,
    # Coefficient of the value function loss
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
    # PPO clip parameter
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "MeanStdFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This does
    # not support minibatches.
    "simple_optimizer": False,
})
Example #11
from abc import ABC

import ray

import numpy as np

from ray.rllib import Policy
from ray.rllib.agents import with_common_config
from ray.rllib.agents.trainer import Trainer
from ray.rllib.execution.rollout_ops import synchronous_parallel_sample
from ray.rllib.examples.env.parametric_actions_cartpole import ParametricActionsCartPole
from ray.rllib.models.modelv2 import restore_original_dimensions
from ray.rllib.utils import override
from ray.rllib.utils.typing import ResultDict
from ray.tune.registry import register_env

DEFAULT_CONFIG = with_common_config({
    # Run with new `training_iteration` API.
    "_disable_execution_plan_api": True,
})


class RandomParametricPolicy(Policy, ABC):
    """
    Just pick a random legal action
    The outputted state of the environment needs to be a dictionary with an
    'action_mask' key containing the legal actions for the agent.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()

    @override(Policy)
    def compute_actions(self,
Example #12
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # The GAE(lambda) parameter.
    "lambda": 1.0,
    # Initial coefficient for KL divergence.
    "kl_coeff": 0.2,
    # Size of batches collected from each worker.
    "sample_batch_size": 200,
    # Number of timesteps collected for each SGD round. This defines the size
    # of each SGD epoch.
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD. This defines the
    # minibatch size within each epoch.
    "sgd_minibatch_size": 128,
    # Whether to shuffle sequences in the batch when training (recommended).
    "shuffle_sequences": True,
    # Number of SGD iterations in each outer loop (i.e., number of epochs to
    # execute per train batch).
    "num_sgd_iter": 30,
    # Stepsize of SGD.
    "lr": 5e-5,
    # Learning rate schedule.
    "lr_schedule": None,
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    "vf_share_layers": False,
    # Coefficient of the value function loss. IMPORTANT: you must tune this if
    # you set vf_share_layers: True.
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer.
    "entropy_coeff": 0.0,
    # Decay schedule for the entropy regularizer.
    "entropy_coeff_schedule": None,
    # PPO clip parameter.
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount.
    "grad_clip": None,
    # Target value for KL divergence.
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes".
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation.
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This is
    # usually slower, but you might want to try it if you run into issues with
    # the default optimizer.
    "simple_optimizer": False,
    # Use the experimental torch multi-node SGD optimizer.
    "distributed_data_parallel_optimizer": False,
    # Use PyTorch as framework?
    "use_pytorch": False
})
Example #13
File: ppo.py Project: lucfisc/ray
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Initial coefficient for KL divergence
    "kl_coeff": 0.2,
    # Number of timesteps collected for each SGD round
    "timesteps_per_batch": 4000,
    # Number of SGD iterations in each outer loop
    "num_sgd_iter": 30,
    # Stepsize of SGD
    "sgd_stepsize": 5e-5,
    # Learning rate schedule
    "lr_schedule": None,
    # Share layers for value function
    "vf_share_layers": False,
    # Total SGD batch size across all devices for SGD (multi-gpu only)
    "sgd_batchsize": 128,
    # Coefficient of the value function loss
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
    # PPO clip parameter
    "clip_param": 0.3,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Number of GPUs to use for SGD
    "num_gpus": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "complete_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "MeanStdFilter",
    # Use the sync samples optimizer instead of the multi-gpu one
    "simple_optimizer": False,
    # Override model config
    "model": {
        # Whether to use LSTM model
        "use_lstm": False,
        # Max seq length for LSTM training.
        "max_seq_len": 20,
    },
})
Example #14
from ray.rllib.agents import with_common_config
from ray.rllib.utils.deprecation import DEPRECATED_VALUE

DEFAULT_CONFIG = with_common_config({
    "use_critic": True,
    "use_gae": True,
    "lambda": 1.0,
    "kl_coeff": 0.2,
    "rollout_fragment_length": 200,
    "train_batch_size": 4000,
    "sgd_minibatch_size": 128,
    "shuffle_sequences": True,
    "num_sgd_iter": 30,
    "lr": 5e-5,
    "lr_schedule": None,
    "vf_loss_coeff": 1.0,
    "model": {
        "vf_share_layers": False,
    },
    "entropy_coeff": 0.0,
    "entropy_coeff_schedule": None,
    "clip_param": 0.3,
    "vf_clip_param": 10.0,
    "grad_clip": None,
    "kl_target": 0.01,
    "batch_mode": "truncate_episodes",
    "observation_filter": "NoFilter",
    "vf_share_layers": DEPRECATED_VALUE,
})
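A hedged usage sketch for a merged config like the one above: override a couple of keys and pass the dict to Tune's registered PPO trainer. The env name and stopping criterion are assumptions, not part of the snippet:

import ray
from ray import tune

ray.init()
tune.run(
    "PPO",
    config={**DEFAULT_CONFIG, "env": "CartPole-v0", "num_workers": 1},
    stop={"training_iteration": 5},
)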
Example #15
from abc import ABC

from ray.rllib import Policy
from ray.rllib.agents import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.execution.metric_ops import StandardMetricsReporting
from ray.rllib.execution.rollout_ops import ParallelRollouts, SelectExperiences
from ray.rllib.examples.env.parametric_actions_cartpole import \
    ParametricActionsCartPole
from ray.rllib.models.modelv2 import restore_original_dimensions
from ray.rllib.utils import override
from ray.rllib.utils.typing import TrainerConfigDict
from ray.util.iter import LocalIterator
from ray.tune.registry import register_env

DEFAULT_CONFIG = with_common_config({})


class RandomParametricPolicy(Policy, ABC):
    """
    Just pick a random legal action
    The outputted state of the environment needs to be a dictionary with an
    'action_mask' key containing the legal actions for the agent.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.exploration = self._create_exploration()

    @override(Policy)
    def compute_actions(self,
                        obs_batch,
Example #16
from pathlib import Path

import numpy as np
import ray
import yaml
from ray.rllib.agents import with_common_config

from algorithms.trainer_ea import EATrainer
from utils.chromosome import VBNChromosome

DEFAULT_CONFIG = with_common_config(yaml.safe_load(
    Path('configs/config_ga_default.yaml').read_text()
))


class GATrainer(EATrainer):
    _name = "GA"
    _default_config = DEFAULT_CONFIG

    def _init(self, config, env_creator):
        """ Trainer class for the Coevolutionary Genetic Algorithm.
        This class distributes the mutation and evaluation workload over a number
        of workers and updates and maintains the population."""

        super(GATrainer, self)._init(config, env_creator)

        self.elites = [VBNChromosome(number_actions=self.config['number_actions'],
                                     input_channels=self.config['input_channels'])
                       for _ in range(config['number_elites'])]
        samples = self.collect_samples()
        for chrom in self.elites:
Example #17
logger = logging.getLogger(__name__)

Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    "l2_coeff": 0.005,
    "noise_stdev": 0.02,
    "episodes_per_batch": 1000,
    "train_batch_size": 10000,
    "eval_prob": 0.003,
    "return_proc_mode": "centered_rank",
    "num_workers": 10,
    "stepsize": 0.01,
    "observation_filter": "MeanStdFilter",
    "noise_size": 250000000,
    "report_length": 10,
})
# __sphinx_doc_end__
# yapf: enable


@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers."""
    seed = 123
    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
Example #18
    "eval_lengths", "novelty", "policy_weights"
])

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    "l2_coeff": 0.005,
    "noise_stdev": 0.02,
    "noise_stdevGA": 0.02,
    "episodes_per_batch": 7,
    "train_batch_size": 1000,
    "eval_prob": 0.003,
    "return_proc_mode": "centered_rank",
    "num_workers": 7,#10
    "stepsize": 0.01,
    "observation_filter": "MeanStdFilter",
    "noise_size": 250000000,
    "report_length": 10,
    "pop_size": 1,
    "population_size":20,
    "bf_sz":2000,
    "k":10,
    "epsilon":0.5,
    "neinum":10,
    "eta_c":20
})
# __sphinx_doc_end__
# yapf: enable


@ray.remote
Example #19
from ray.rllib.utils.annotations import override
from ray.rllib.utils import FilterManager

logger = logging.getLogger(__name__)

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    # 'theta_decay':              0.001,
    # 'alpha':                    .2,
    # 'noise_stdev':              0.02,
    # 'candidates_per_iteration': 144,
    'timestep_limit': None,
    'num_evals_per_iteration': 1,
    # 'return_proc_mode':   'centered_rank',
    'num_workers': 4,
    'request_interleaving': 2,
    # 'stepsize': 0.01,
    # 'observation_filter':      'MeanStdFilter',
    'noise_size': 33554432,
    'random_seed': None,
    # 'report_length':      10,
    "action_noise_std": 0.0,
})


# __sphinx_doc_end__
# yapf: enable


class Common:
Example #20
File: ppo.py Project: rlan/ray
DEFAULT_CONFIG = with_common_config({
    # Should use a critic as a baseline (otherwise don't use value baseline;
    # required for using GAE).
    "use_critic": True,
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # The GAE (lambda) parameter.
    "lambda": 1.0,
    # Initial coefficient for KL divergence.
    "kl_coeff": 0.2,
    # Size of batches collected from each worker.
    "rollout_fragment_length": 200,
    # Number of timesteps collected for each SGD round. This defines the size
    # of each SGD epoch.
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD. This defines the
    # minibatch size within each epoch.
    "sgd_minibatch_size": 128,
    # Whether to shuffle sequences in the batch when training (recommended).
    "shuffle_sequences": True,
    # Number of SGD iterations in each outer loop (i.e., number of epochs to
    # execute per train batch).
    "num_sgd_iter": 30,
    # Stepsize of SGD.
    "lr": 5e-5,
    # Learning rate schedule.
    "lr_schedule": None,
    # Coefficient of the value function loss. IMPORTANT: you must tune this if
    # you set vf_share_layers=True inside your model's config.
    "vf_loss_coeff": 1.0,
    "model": {
        # Share layers for value function. If you set this to True, it's
        # important to tune vf_loss_coeff.
        "vf_share_layers": False,
    },
    # Coefficient of the entropy regularizer.
    "entropy_coeff": 0.0,
    # Decay schedule for the entropy regularizer.
    "entropy_coeff_schedule": None,
    # PPO clip parameter.
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount.
    "grad_clip": None,
    # Target value for KL divergence.
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes".
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation.
    "observation_filter": "NoFilter",

    # Deprecated keys:
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    # Use config.model.vf_share_layers instead.
    "vf_share_layers": DEPRECATED_VALUE,
})
Example #21
GENERAL_CONFIGS = with_common_config({
    # Buffers
    "replay_buffer_size": int(2e4),
    "reservoir_buffer_size": int(1e5),

    # Train specs
    "replay_train_batch_size": 128,
    "reservoir_train_batch_size": 128,
    "replay_train_every": 10,
    "reservoir_train_every": 66,
    "reservoir_train_every_sims": 300,
    "replay_min_size_to_learn": 10,
    "reservoir_min_size_to_learn": 1000,
    "reservoir_min_size_to_learn_sims": 1000,
    "replay_num_episodes": 10,
    "replay_min_episodes_to_learn": 100,

    # Models
    "model": {
        "lstm_cell_size": 128,
        "max_seq_len": 999999,
        "fcnet_activation": 'relu',
        "fcnet_hiddens": [128, 128, 128]
    },
    "sig_model": {
        "fcnet_hiddens": [64],
        "fcnet_activation": 'relu',
        "max_seq_len": 20
    },

    # Generals
    "framework": "torch",
    "use_exec_api": True,

    # keys in obs space to be used at inference time
    "test_obs_keys": ["obs"],

    # keys in obs space to be used at training time
    "train_obs_keys": ["obs"],

    # Train statistics flag
    "log_stats": False,
    "stats_fn": None,

    # Wandb integration configs
    "logger_config": {
        "wandb": {
            "project": None,
            "api_key_file": "/home/coordination/wandb_api",
            "log_config": True
        }
    },

    # Param to support cases in which the training environment differs from the
    # execution environment (e.g. SIMS).
    "train_obs_space": None,

    # Param for hyperparameter tuning
    "model_struc": None,  # TODO (fede): remove once tuning is done

    # Debugging purposes, log probabilities for specific actions when training
    "relevant_obs": None
})
Example #22
from ray.rllib.agents import Trainer, with_common_config
from ray.rllib.optimizers import SyncSamplesOptimizer
from agents.ppo.ppo_torch_policy import PPOTorchPolicy

DEFAULT_CONFIG = with_common_config({
    'alpha': 0.1,
    'clip_ratio': 0.2,
    'gamma': 0.99,
    'lambda': 0.97,
    'lr_pi': 3e-4,
    'lr_vf': 1e-3,
    'max_episode_len': 1000,
    'model_hidden_sizes': (256, 128, 64),
    'policy': 'default',
    'num_workers': 4,
    'num_sgd_iter': 80,
    'num_skills': 10,
    'rollout_fragment_length': 200,
    'seed': 123,
    'sgd_minibatch_size': 128,
    'skill_input': None,
    'target_kl': 0.01,
    'train_batch_size': 4000,
    'use_diayn': True,
    'use_env_rewards': True,
    'use_gae': True,
})

policy_options = {
    'default': PPOTorchPolicy,
    'neuroblast': PPONeuroblastPolicy,
Example #23
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
numpy.random.seed(random_seed)
torch.manual_seed(random_seed)

CHECKPOINT_DIR = f"./rllib_temp_{gpu_id}"
CHECKPOINT_FILE = f"last_checkpoint_{gpu_id}.out"

config = with_common_config({
    'gamma': 0.99,
    'lr': 1e-5,
    'num_workers': 4,
    # 'framework': 'torch',
    'num_gpus': 1,
    'sgd_minibatch_size': 256,
    'num_sgd_iter': (2515 * 8) * 2**0 // 256,
    'train_batch_size': 2515 * 8,
    'entropy_coeff': 0.02,
    'vf_loss_coeff': 0.01,
    'model': {
        'fcnet_hiddens': [256, 256, 256]
    }
})


def train_it():
    ray.init()
    # Configure RLLib with The Roadwork Environment

    agent = ppo.PPOTrainer(env=StockEnvDOW30, config=config)
    # agent = ppo.PPOTrainer(env=StockEnvDOW30)
Example #24
DEFAULT_CONFIG = with_common_config({
    # PlaNET Model LR
    "td_model_lr": 6e-4,
    # Actor LR
    "actor_lr": 8e-5,
    # Critic LR
    "critic_lr": 8e-5,
    # Grad Clipping
    "grad_clip": 100.0,
    # Discount
    "discount": 0.99,
    # Lambda
    "lambda": 0.95,
    # Training iterations per data collection from real env
    "dreamer_train_iters": 100,
    # Horizon for Environment (1000 for Mujoco/DMC)
    "horizon": 1000,
    # Number of episodes to sample for Loss Calculation
    "batch_size": 50,
    # Length of each episode to sample for Loss Calculation
    "batch_length": 50,
    # Imagination Horizon for Training Actor and Critic
    "imagine_horizon": 15,
    # Free Nats
    "free_nats": 3.0,
    # KL Coeff for the Model Loss
    "kl_coeff": 1.0,
    # Distributed Dreamer not implemented yet
    "num_workers": 0,
    # Prefill Timesteps
    "prefill_timesteps": 5000,
    # This should be kept at 1 to preserve sample efficiency
    "num_envs_per_worker": 1,
    # Exploration Gaussian
    "explore_noise": 0.3,
    # Batch mode
    "batch_mode": "complete_episodes",
    # Custom Model
    "dreamer_model": {
        "custom_model": DreamerModel,
        # RSSM/PlaNET parameters
        "deter_size": 200,
        "stoch_size": 30,
        # CNN Decoder Encoder
        "depth_size": 32,
        # General Network Parameters
        "hidden_size": 400,
        # Action STD
        "action_init_std": 5.0,
    },

    "env_config": {
        # Repeats action send by policy for frame_skip times in env
        "frame_skip": 2,
    }
})
Example #25
import time

from ray.rllib.agents import Agent, with_common_config
from ray.rllib.evaluation.tf_policy_graph import LearningRateSchedule
from ray.rllib.optimizers import AsyncGradientsOptimizer
from ray.rllib.utils.annotations import override

DEFAULT_CONFIG = with_common_config(dict(

))


class A3CPolicyGraph:
    pass  # policy graph implementation omitted in this snippet


class A3CAgent(Agent):

    _agent_name = "A3C-Per"
    _policy_graph = None  # TODO


    @override(Agent)
    def _init(self, config, env_creator):

        policy_cls = self._policy_graph
        self.local_evaluator = self.make_local_evaluator(env_creator, policy_cls)
        self.remote_evaluators = self.make_remote_evaluators(env_creator, policy_cls, config["num_workers"])
        self.optimizer = self._make_optimizer()

    @override(Agent)
    def _train(self):
        prev_steps = self.optimizer.num_steps_sampled
Example #26
DEFAULT_CONFIG = with_common_config({
    # Size of batches collected from each worker
    "rollout_fragment_length": 200,
    # Number of timesteps collected for each SGD round
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD
    "sgd_minibatch_size": 128,
    # Whether to shuffle sequences in the batch when training (recommended)
    "shuffle_sequences": True,
    # Number of SGD iterations in each outer loop
    "num_sgd_iter": 30,
    # In case a buffer optimizer is used.
    "learning_starts": 1000,
    # Size of the replay buffer in batches (not timesteps!).
    "buffer_size": 1000,
    # Stepsize of SGD
    "lr": 5e-5,
    # Learning rate schedule
    "lr_schedule": None,
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    "vf_share_layers": False,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "complete_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This does
    # not support minibatches.
    "simple_optimizer": True,

    # === MCTS ===
    "mcts_config": {
        "puct_coefficient": 1.0,
        "num_simulations": 30,
        "temperature": 1.5,
        "dirichlet_epsilon": 0.25,
        "dirichlet_noise": 0.03,
        "argmax_tree_policy": False,
        "add_dirichlet_noise": True,
    },

    # === Ranked Rewards ===
    # implement the ranked reward (r2) algorithm
    # from: https://arxiv.org/pdf/1807.01672.pdf
    "ranked_rewards": {
        "enable": True,
        "percentile": 75,
        "buffer_max_length": 1000,
        # add rewards obtained from random policy to
        # "warm start" the buffer
        "initialize_buffer": True,
        "num_init_rewards": 100,
    },

    # === Evaluation ===
    # Extra configuration that disables exploration.
    "evaluation_config": {
        "mcts_config": {
            "argmax_tree_policy": True,
            "add_dirichlet_noise": False,
        },
    },

    # === Callbacks ===
    "callbacks": AlphaZeroDefaultCallbacks,

    "framework": "torch",  # Only PyTorch supported so far.
})
Example #27
    "eval_returns", "eval_lengths"
])

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    "action_noise_std": 0.0,
    "noise_stdev": 0.02,  # std deviation of parameter noise
    "num_rollouts": 32,  # number of perturbs to try
    "rollouts_used": 32,  # number of perturbs to keep in gradient estimate
    "num_workers": 2,
    "sgd_stepsize": 0.01,  # sgd step-size
    "observation_filter": "MeanStdFilter",
    "noise_size": 250000000,
    "eval_prob": 0.03,  # probability of evaluating the parameter rewards
    "report_length": 10,  # how many of the last rewards we average over
    "offset": 0,
    # ARS will use Trainer's evaluation WorkerSet (if evaluation_interval > 0).
    # Therefore, we must be careful not to use more than 1 env per eval worker
    # (would break ARSPolicy's compute_single_action method) and to not do
    # obs-filtering.
    "evaluation_config": {
        "num_envs_per_worker": 1,
        "observation_filter": "NoFilter"
    },
})
# __sphinx_doc_end__
# yapf: enable


@ray.remote
Example #28
File: es.py Project: zommiommy/ray
logger = logging.getLogger(__name__)

Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    "action_noise_std": 0.01,
    "l2_coeff": 0.005,
    "noise_stdev": 0.02,
    "episodes_per_batch": 1000,
    "train_batch_size": 10000,
    "eval_prob": 0.003,
    "return_proc_mode": "centered_rank",
    "num_workers": 10,
    "stepsize": 0.01,
    "observation_filter": "MeanStdFilter",
    "noise_size": 250000000,
    "report_length": 10,
})
# __sphinx_doc_end__
# yapf: enable


@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers."""
    seed = 123
    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
Example #29
logger = logging.getLogger(__name__)

Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    "noise_stdev": 0.02,  # std deviation of parameter noise
    "num_rollouts": 32,  # number of perturbs to try
    "rollouts_used": 32,  # number of perturbs to keep in gradient estimate
    "num_workers": 2,
    "sgd_stepsize": 0.01,  # sgd step-size
    "observation_filter": "MeanStdFilter",
    "noise_size": 250000000,
    "eval_prob": 0.03,  # probability of evaluating the parameter rewards
    "report_length": 10,  # how many of the last rewards we average over
    "offset": 0,
})
# __sphinx_doc_end__
# yapf: enable


@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers."""
    seed = 123
    noise = np.random.RandomState(seed).randn(count).astype(np.float32)
    return noise
Example #30
DEFAULT_CONFIG = with_common_config({
    # Should use a critic as a baseline (otherwise don't use value baseline;
    # required for using GAE).
    "use_critic": True,
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # The GAE (lambda) parameter.
    "lambda": 1.0,
    # Initial coefficient for KL divergence.
    "kl_coeff": 0.2,
    # Size of batches collected from each worker.
    "rollout_fragment_length": 200,
    # Number of timesteps collected for each SGD round. This defines the size
    # of each SGD epoch.
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD. This defines the
    # minibatch size within each epoch.
    "sgd_minibatch_size": 128,
    # Whether to shuffle sequences in the batch when training (recommended).
    "shuffle_sequences": True,
    # Number of SGD iterations in each outer loop (i.e., number of epochs to
    # execute per train batch).
    "ppo_epochs": 8,
    # Use dynamics model or not
    "use_dynamics": True,
    # Dynamics epochs
    "dynamics_epochs": 2,
    # Stepsize of SGD.
    "lr": 3e-4,
    # Learning rate schedule.
    "lr_schedule": None,
    # Skill dynamics learning rate.
    "dynamics_lr":3e-4,
    # dynamics orth reg
    "dynamics_orth_reg": True,
    # dynamics l2 reg
    "dynamics_l2_reg": False,
    # dynamics spectral norm
    "dynamics_spectral_norm": False,
    # dynamics apply reg to hidden or not
    "dynamics_reg_hiddens": False,
    # dads reward scale
    "dads_reward_scale": 1.0,
    # Coefficient of the value function loss. IMPORTANT: you must tune this if
    # you set vf_share_layers=True inside your model's config.
    "vf_loss_coeff": 1.0,
    "model": {
        # Share layers for value function. If you set this to True, it's
        # important to tune vf_loss_coeff.
        "vf_share_layers": True,
    },
    # Coefficient of the entropy regularizer.
    "entropy_coeff": 0.0,
    # Decay schedule for the entropy regularizer.
    "entropy_coeff_schedule": None,
    # PPO clip parameter.
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount.
    "grad_clip": None,
    # Target value for KL divergence.
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes".
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation.
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This is
    # usually slower, but you might want to try it if you run into issues with
    # the default optimizer.
    "simple_optimizer": False,
    # Whether to fake GPUs (using CPUs).
    # Set this to True for debugging on non-GPU machines (set `num_gpus` > 0).
    "_fake_gpus": False,

    # Deprecated keys:
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    # Use config.model.vf_share_layers instead.
    "vf_share_layers": DEPRECATED_VALUE,
})
Example #31
import logging

from ray.rllib.agents import with_common_config
from ray.rllib.agents.trainer_template import build_trainer

from dqn.dqn_policy import DQNPolicy
from dqn.prey_policy import PreyPolicy

logger = logging.getLogger(__name__)

DEFAULT_CONFIG = with_common_config({
    # Agent parameters.
    "lr": 0.001,
    "gamma": 0.9,
    "eps_start": 1,
    "eps_end": 0.05,
    "eps_decay": 0.9995,
    "replay_memory_size": 10000,
    "target_update_frequency": 10,
    "dqn_model": {
        "custom_model": "DQNModel",
        "custom_model_config":
        {},  # Extra options to pass to your model (e.g. network of model).
    }
})

# Custom trainer.
DQNTrainer = build_trainer(name="DQNAlgorithm",
                           default_policy=DQNPolicy,
                           default_config=DEFAULT_CONFIG)
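A hedged usage sketch for the trainer built above; the env name is an assumption, and DQNPolicy plus the "DQNModel" custom model come from the surrounding project, so they would need to work with whatever env is chosen:

import ray

ray.init()
trainer = DQNTrainer(config=dict(DEFAULT_CONFIG, env="CartPole-v0"))
for _ in range(3):
    result = trainer.train()
    logger.info("episode_reward_mean=%s", result.get("episode_reward_mean"))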
Example #32
    "eval_returns", "eval_lengths"
])

# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
    "action_noise_std": 0.01,
    "l2_coeff": 0.005,
    "noise_stdev": 0.02,
    "episodes_per_batch": 1000,
    "train_batch_size": 10000,
    "eval_prob": 0.003,
    "return_proc_mode": "centered_rank",
    "num_workers": 10,
    "stepsize": 0.01,
    "observation_filter": "MeanStdFilter",
    "noise_size": 250000000,
    "report_length": 10,
    # ES will use Trainer's evaluation WorkerSet (if evaluation_interval > 0).
    # Therefore, we must be careful not to use more than 1 env per eval worker
    # (would break ESPolicy's compute_single_action method) and to not do
    # obs-filtering.
    "evaluation_config": {
        "num_envs_per_worker": 1,
        "observation_filter": "NoFilter"
    },
})
# __sphinx_doc_end__
# yapf: enable


@ray.remote
Example #33
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Initial coefficient for KL divergence
    "kl_coeff": 0.2,
    # Size of batches collected from each worker
    "sample_batch_size": 200,
    # Number of timesteps collected for each SGD round
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD
    "sgd_minibatch_size": 128,
    # Number of SGD iterations in each outer loop
    "num_sgd_iter": 30,
    # Stepsize of SGD
    "lr": 5e-5,
    # Learning rate schedule
    "lr_schedule": None,
    # Share layers for value function
    "vf_share_layers": False,
    # Coefficient of the value function loss
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
    # PPO clip parameter
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount
    "grad_clip": None,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This does
    # not support minibatches.
    "simple_optimizer": False,
    # (Deprecated) Use the sampling behavior as of 0.6, which launches extra
    # sampling tasks for performance but can waste a large portion of samples.
    "straggler_mitigation": False,
})