Beispiel #1
0
 def __init__(self,
              observation_space,
              action_space,
              learning_rate=0.001,
              update_period=100,
              embedding_dim=10,
              net_fn=None,
              net_kwargs=None,
              device="cuda:best",
              rate_power=0.5,
              batch_size=10,
              memory_size=10000,
              with_action=False,
              **kwargs):
     """Store hyperparameters, resolve the network factory and reset.

     Parameters
     ----------
     observation_space : spaces.Box
         Observation space; anything other than a ``Box`` fails the
         assertion below.
     action_space
         Action space. ``action_space.n`` is read only when
         ``with_action`` is True.
     learning_rate, update_period, embedding_dim, rate_power, batch_size
         Stored unchanged as attributes of the same name.
     net_fn : str, callable or None
         Network factory. A string is resolved with ``load``; a falsy
         value falls back to ``get_network`` partially applied with the
         observation shape and the computed output size.
     net_kwargs : dict or None
         Extra keyword arguments kept in ``self.net_kwargs``. If it has
         an ``"out_size"`` entry, that entry is replaced by the computed
         output size. The caller's dict is copied, never modified.
     device : str
         Passed to ``choose_device``; the result is kept in
         ``self.device``.
     memory_size : int
         Capacity given to ``ReplayMemory``.
     with_action : bool
         If True the output size is ``embedding_dim * action_space.n``,
         otherwise it is ``embedding_dim``.
     **kwargs
         Accepted and ignored.
     """
     assert isinstance(observation_space, spaces.Box)
     UncertaintyEstimator.__init__(self, observation_space, action_space)
     self.learning_rate = learning_rate
     self.loss_fn = F.mse_loss
     self.update_period = update_period
     self.embedding_dim = embedding_dim
     self.rate_power = rate_power
     self.batch_size = batch_size
     self.with_action = with_action

     # One embedding per action when the estimator is action-dependent.
     out_size = embedding_dim * action_space.n if with_action else embedding_dim

     if isinstance(net_fn, str):
         self.net_fn = load(net_fn)
     else:
         self.net_fn = net_fn or partial(get_network,
                                         shape=observation_space.shape,
                                         embedding_dim=out_size)

     # Work on a copy: "out_size" may be overwritten below, and the
     # caller's dict (possibly shared between instances) must stay intact.
     self.net_kwargs = dict(net_kwargs) if net_kwargs else {}
     if "out_size" in self.net_kwargs:
         self.net_kwargs["out_size"] = out_size

     self.device = choose_device(device)
     self.memory = ReplayMemory(capacity=memory_size)
     self.reset()
Beispiel #2
0
def read_env_config(config_path):
    """
    Read .yaml config file for an environment instance.

    The file contains the environment constructor and, optionally, its
    params.

    Example:

    ``` env.yaml
        constructor: 'rlberry.envs.benchmarks.grid_exploration.nroom.NRoom'
        params:
            reward_free: false
            array_observation: true
            nrooms: 5
    ```

    Parameters
    ----------
    config_path : str
        yaml file name containing the env config

    Returns
    -------
    Tuple (constructor, kwargs) for the env. ``kwargs`` is an empty dict
    when the ``params`` entry is missing or empty.

    Raises
    ------
    KeyError
        If the file has no ``constructor`` entry.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default locale encoding.
    with open(config_path, encoding="utf-8") as file:
        env_config = yaml.safe_load(file)

    # The file is closed before the (possibly slow) constructor import.
    constructor = load(env_config["constructor"])

    # An omitted ``params`` key, or a bare ``params:`` line (parsed as None),
    # both mean "no keyword arguments"; return a dict that can be unpacked.
    params = env_config.get("params") or {}
    return constructor, params
Beispiel #3
0
def read_agent_config(config_path):
    """
    Load an Agent configuration from a .yaml file.

    The file names the agent class and its parameters. It may also contain
    a ``base_config`` entry pointing to another yaml file. In that case the
    base file is read and, for every key in ``_AGENT_KEYS``, its entry is
    updated with the entry of the same name from ``config_path`` (one level
    deep only). Entries outside ``_AGENT_KEYS`` are then taken from the base
    file alone.

    TODO: recursive update of base_config.

    Example:

    ``` myagent.yaml
        agent_class: 'rlberry.agents.kernel_based.rs_ucbvi.RSUCBVIAgent'
        gamma: 1.0
        lp_metric: 2
        min_dist: 0.0
        max_repr: 800
        bonus_scale_factor: 1.0
    ```

    Parameters
    ----------
    config_path : str
        yaml file name containing the agent config

    Returns
    -------
    agent_class
        Result of ``load`` applied to the ``agent_class`` entry.
    base_config : dict
        The resulting configuration, with the ``agent_class`` entry removed.
    """
    overrides = process_agent_yaml(config_path)
    parent_path = overrides.pop("base_config", None)

    if parent_path is None:
        # No parent file: the config is used as it is.
        merged = overrides
    else:
        # TODO: recursive update
        merged = process_agent_yaml(parent_path)
        for section in _AGENT_KEYS:
            try:
                inherited = merged[section]
            except KeyError:
                # Section absent from the parent: take the child's one.
                merged[section] = overrides[section]
            else:
                inherited.update(overrides[section])

    agent_class = load(merged.pop("agent_class"))
    return agent_class, merged
Beispiel #4
0
    def __init__(self,
                 env,
                 uncertainty_estimator_fn,
                 uncertainty_estimator_kwargs=None,
                 bonus_scale_factor=1.0,
                 bonus_max=np.inf):
        """Wrap ``env`` and build the uncertainty estimator attached to it.

        Parameters
        ----------
        env
            Environment to wrap; its ``observation_space`` and
            ``action_space`` are handed to the estimator factory.
        uncertainty_estimator_fn : str or callable
            Estimator factory. A string is resolved with ``load`` first.
            It is called as ``fn(observation_space, action_space, **kwargs)``.
        uncertainty_estimator_kwargs : dict, optional
            Keyword arguments for the factory; a falsy value means none.
        bonus_scale_factor : float
            Stored as ``self.bonus_scale_factor``.
        bonus_max : float
            Stored as ``self.bonus_max``.
        """
        Wrapper.__init__(self, env)

        self.bonus_scale_factor = bonus_scale_factor
        self.bonus_max = bonus_max

        estimator_kwargs = uncertainty_estimator_kwargs
        if not estimator_kwargs:
            estimator_kwargs = {}

        # The factory may be given as a string to be resolved by ``load``.
        estimator_fn = uncertainty_estimator_fn
        if isinstance(estimator_fn, str):
            estimator_fn = load(estimator_fn)

        self.uncertainty_estimator = estimator_fn(env.observation_space,
                                                  env.action_space,
                                                  **estimator_kwargs)
        self.previous_obs = None
Beispiel #5
0
class DQNAgent(AgentWithSimplePolicy):
    """DQN Agent based on PyTorch.

    Notes
    -----
    Uses Q(lambda) for computing targets by default. To recover
    the standard DQN, set :code:`lambda_ = 0.0` and :code:`chunk_size = 1`.

    Every constructor argument is also stored on the instance as an
    attribute of the same name (e.g. :code:`self.gamma`), and the extra
    keyword arguments are stored as :code:`self.kwargs`.

    Parameters
    ----------
    env: :class:`~rlberry.types.Env`
        Environment, can be a tuple (constructor, kwargs)
    gamma: float, default = 0.99
        Discount factor.
    batch_size: int, default = 32
        Batch size.
    chunk_size: int, default = 8
        Length of sub-trajectories sampled from the replay buffer.
    lambda_: float, default = 0.5
        Q(lambda) parameter.
    target_update_parameter : int or float, default = 0.005
        If int: interval (in total number of online updates) between
        updates of the target network.
        If float: soft update coefficient.
    device: str, default = "cuda:best"
        Torch device, see :func:`~rlberry.utils.torch.choose_device`
    learning_rate : float, default = 1e-3
        Optimizer learning rate.
    loss_function: {"l1", "l2", "smooth_l1"}, default: "l2"
        Loss function used to compute Bellman error.
    epsilon_init: float, default = 1.0
        Initial epsilon value for epsilon-greedy exploration.
    epsilon_final: float, default = 0.1
        Final epsilon value for epsilon-greedy exploration.
    epsilon_decay_interval : int, default = 20_000
        After :code:`epsilon_decay_interval` timesteps, epsilon approaches
        :code:`epsilon_final`.
    optimizer_type : {"ADAM", "RMS_PROP"}, default = "ADAM"
        Optimization algorithm.
    q_net_constructor : Callable, str or None
        Function/constructor that returns a torch module for the Q-network:
        :code:`qnet = q_net_constructor(env, **kwargs)`.
        A string is resolved with :code:`load`; None selects
        :code:`default_q_net_fn`.

        Module (Q-network) requirements:

        * Input shape = (batch_dim, chunk_size, obs_dims)

        * Output shape = (batch_dim, chunk_size, number_of_actions)

    q_net_kwargs : optional, dict
        Parameters for q_net_constructor.
    use_double_dqn : bool, default = False
        If True, use Double DQN.
    use_prioritized_replay : bool, default = False
        If True, use Prioritized Experience Replay.
    train_interval: int, default = 10
        Update the model every :code:`train_interval` steps.
        If -1, train only at the end of the episodes.
    gradient_steps: int, default = -1
        How many gradient steps to do at each update.
        If -1, take the number of timesteps since last update.
    max_replay_size : int, default = 200_000
        Maximum number of transitions in the replay buffer.
    learning_starts : int, default = 5_000
        How many steps of the model to collect transitions for before
        learning starts.
    eval_interval : int, default = None
        Interval (in number of transitions) between agent evaluations in fit().
        If None, never evaluate.
    **kwargs
        Forwarded to :code:`AgentWithSimplePolicy.__init__`.
    """

    name = "DQN"

    def __init__(
        self,
        env: types.Env,
        gamma: float = 0.99,
        batch_size: int = 32,
        chunk_size: int = 8,
        lambda_: float = 0.5,
        target_update_parameter: Union[int, float] = 0.005,
        device: str = "cuda:best",
        learning_rate: float = 1e-3,
        epsilon_init: float = 1.0,
        epsilon_final: float = 0.1,
        epsilon_decay_interval: int = 20_000,
        loss_function: str = "l2",
        optimizer_type: str = "ADAM",
        q_net_constructor: Optional[Callable[..., torch.nn.Module]] = None,
        q_net_kwargs: Optional[dict] = None,
        use_double_dqn: bool = False,
        use_prioritized_replay: bool = False,
        train_interval: int = 10,
        gradient_steps: int = -1,
        max_replay_size: int = 200_000,
        learning_starts: int = 5_000,
        eval_interval: Optional[int] = None,
        **kwargs,
    ):
        # For all parameters, define self.param = param.
        # Written out explicitly rather than through frame introspection:
        # popping "self" from the frame's locals relies on them being a plain
        # snapshot dict, which is no longer the case since Python 3.13
        # (PEP 667).
        self.env = env  # re-read below after the base class initialisation
        self.gamma = gamma
        self.batch_size = batch_size
        self.chunk_size = chunk_size
        self.lambda_ = lambda_
        self.target_update_parameter = target_update_parameter
        self.device = device
        self.learning_rate = learning_rate
        self.epsilon_init = epsilon_init
        self.epsilon_final = epsilon_final
        self.epsilon_decay_interval = epsilon_decay_interval
        self.loss_function = loss_function
        self.optimizer_type = optimizer_type
        self.q_net_constructor = q_net_constructor
        self.q_net_kwargs = q_net_kwargs
        self.use_double_dqn = use_double_dqn
        self.use_prioritized_replay = use_prioritized_replay
        self.train_interval = train_interval
        self.gradient_steps = gradient_steps
        self.max_replay_size = max_replay_size
        self.learning_starts = learning_starts
        self.eval_interval = eval_interval
        self.kwargs = kwargs

        AgentWithSimplePolicy.__init__(self, env, **kwargs)
        env = self.env
        assert isinstance(env.observation_space, spaces.Box)
        assert isinstance(env.action_space, spaces.Discrete)

        # Online and target Q networks, torch device
        self._device = choose_device(device)
        if isinstance(q_net_constructor, str):
            q_net_ctor = load(q_net_constructor)
        elif q_net_constructor is None:
            q_net_ctor = default_q_net_fn
        else:
            # A callable was given directly (the documented use case).
            q_net_ctor = q_net_constructor
        q_net_kwargs = q_net_kwargs or dict()
        self._qnet_online = q_net_ctor(env, **q_net_kwargs).to(self._device)
        self._qnet_target = q_net_ctor(env, **q_net_kwargs).to(self._device)

        # Optimizer and loss
        optimizer_kwargs = {
            "optimizer_type": optimizer_type,
            "lr": learning_rate
        }
        self._optimizer = optimizer_factory(self._qnet_online.parameters(),
                                            **optimizer_kwargs)
        # reduction="none": the loss is returned unreduced.
        self._loss_function = loss_function_factory(loss_function,
                                                    reduction="none")

        # Training params
        self._train_interval = train_interval
        self._gradient_steps = gradient_steps
        self._learning_starts = learning_starts
        self._eval_interval = eval_interval

        # Setup replay buffer. Environments that do not expose
        # `_max_episode_steps` are treated as having unbounded episodes.
        self._max_episode_steps = getattr(self.env, "_max_episode_steps",
                                          np.inf)

        self._replay_buffer = replay.ReplayBuffer(
            max_replay_size=max_replay_size,
            rng=self.rng,
            max_episode_steps=self._max_episode_steps,
            enable_prioritized=use_prioritized_replay,
        )
        self._replay_buffer.setup_entry("observations", np.float32)
        self._replay_buffer.setup_entry("next_observations", np.float32)
        self._replay_buffer.setup_entry("actions", np.int32)
        self._replay_buffer.setup_entry("rewards", np.float32)
        self._replay_buffer.setup_entry("dones", bool)

        # Counters
        self._total_timesteps = 0
        self._total_episodes = 0
        self._total_updates = 0
        self._timesteps_since_last_update = 0

        # epsilon scheduling
        self._epsilon_schedule = polynomial_schedule(
            self.epsilon_init,
            self.epsilon_final,
            power=1.0,
            transition_steps=self.epsilon_decay_interval,
            transition_begin=0,
        )
Beispiel #6
0
    def __init__(self,
                 env,
                 n_episodes=1000,
                 horizon=256,
                 gamma=0.99,
                 loss_function="l2",
                 batch_size=100,
                 device="cuda:best",
                 target_update=1,
                 learning_rate=0.001,
                 epsilon_init=1.0,
                 epsilon_final=0.1,
                 epsilon_decay=5000,
                 optimizer_type='ADAM',
                 qvalue_net_fn=None,
                 qvalue_net_kwargs=None,
                 double=True,
                 memory_capacity=10000,
                 use_bonus=False,
                 uncertainty_estimator_kwargs=None,
                 prioritized_replay=True,
                 update_frequency=1,
                 **kwargs):
        # Wrap arguments and initialize base class
        memory_kwargs = {
            'capacity': memory_capacity,
            'n_steps': 1,
            'gamma': gamma
        }
        exploration_kwargs = {
            'method': "EpsilonGreedy",
            'temperature': epsilon_init,
            'final_temperature': epsilon_final,
            'tau': epsilon_decay,
        }
        self.use_bonus = use_bonus
        if self.use_bonus:
            env = UncertaintyEstimatorWrapper(env,
                                              **uncertainty_estimator_kwargs)
        IncrementalAgent.__init__(self, env, **kwargs)
        self.horizon = horizon
        self.exploration_kwargs = exploration_kwargs or {}
        self.memory_kwargs = memory_kwargs or {}
        self.n_episodes = n_episodes
        self.batch_size = batch_size
        self.target_update = target_update
        self.double = double

        assert isinstance(env.action_space, spaces.Discrete), \
            "Only compatible with Discrete action spaces."

        self.prioritized_replay = prioritized_replay
        memory_class = PrioritizedReplayMemory if prioritized_replay else TransitionReplayMemory
        self.memory = memory_class(**self.memory_kwargs)
        self.exploration_policy = \
            exploration_factory(self.env.action_space,
                                **self.exploration_kwargs)
        self.training = True
        self.steps = 0
        self.episode = 0
        self.writer = None

        self.optimizer_kwargs = {
            'optimizer_type': optimizer_type,
            'lr': learning_rate
        }
        self.device = choose_device(device)
        self.loss_function = loss_function
        self.gamma = gamma

        qvalue_net_kwargs = qvalue_net_kwargs or {}
        qvalue_net_fn = load(qvalue_net_fn) if isinstance(qvalue_net_fn, str) else \
            qvalue_net_fn or default_qvalue_net_fn
        self.value_net = qvalue_net_fn(self.env, **qvalue_net_kwargs)
        self.target_net = qvalue_net_fn(self.env, **qvalue_net_kwargs)

        self.target_net.load_state_dict(self.value_net.state_dict())
        self.target_net.eval()
        logger.info("Number of trainable parameters: {}".format(
            trainable_parameters(self.value_net)))
        self.value_net.to(self.device)
        self.target_net.to(self.device)
        self.loss_function = loss_function_factory(self.loss_function)
        self.optimizer = optimizer_factory(self.value_net.parameters(),
                                           **self.optimizer_kwargs)
        self.update_frequency = update_frequency
        self.steps = 0