Code example #1
    def __init__(self,
                 env,
                 n_samples=10,
                 gamma=0.95,
                 horizon=None,
                 epsilon=1e-6,
                 **kwargs):
        # check environment compatibility, then initialize the base class
        assert env.is_generative(), \
            "MBQVI requires a generative model."
        assert isinstance(env.observation_space, Discrete), \
            "MBQVI requires a finite state space."
        assert isinstance(env.action_space, Discrete), \
            "MBQVI requires a finite action space."
        Agent.__init__(self, env, **kwargs)

        # parameters
        self.n_samples = n_samples
        self.gamma = gamma
        self.horizon = horizon
        self.epsilon = epsilon

        # empirical MDP, created in fit()
        self.R_hat = None
        self.P_hat = None

        # value functions
        self.V = None
        self.Q = None
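
The constructor above only declares the empirical model containers (R_hat, P_hat); they are filled later in fit(). For context, model-based Q-value iteration draws n_samples transitions per state-action pair from the generative model and then runs value iteration on the resulting empirical MDP. A standalone, hedged sketch of that idea (not the rlberry implementation; sample(s, a) is a hypothetical generative model returning a (next_state, reward) pair):

import numpy as np

def mbqvi_sketch(sample, S, A, n_samples=10, gamma=0.95, epsilon=1e-6):
    # build the empirical MDP from n_samples calls per (s, a)
    R_hat = np.zeros((S, A))
    P_hat = np.zeros((S, A, S))
    for s in range(S):
        for a in range(A):
            for _ in range(n_samples):
                ns, r = sample(s, a)
                R_hat[s, a] += r / n_samples
                P_hat[s, a, ns] += 1.0 / n_samples
    # value iteration on the empirical MDP, stopped at accuracy epsilon
    Q = np.zeros((S, A))
    while True:
        Q_new = R_hat + gamma * P_hat @ Q.max(axis=1)
        if np.abs(Q_new - Q).max() < epsilon:
            return Q_new
        Q = Q_new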
Code example #2
File: incremental_agent.py Project: lulica/rlberry
    def __init__(self, env, **kwargs):
        """

        Parameters
        ----------
        env : Model
            Environment used to fit the agent.
        """
        Agent.__init__(self, env, **kwargs)
Code example #3
File: cem.py Project: lulica/rlberry
    def __init__(self,
                 env,
                 n_episodes=1000,
                 horizon=100,
                 gamma=0.99,
                 batch_size=16,
                 percentile=70,
                 learning_rate=0.01,
                 optimizer_type='ADAM',
                 policy_net_fn=None,
                 **kwargs):
        Agent.__init__(self, env, **kwargs)

        # check environment
        assert isinstance(self.env.observation_space, spaces.Box)
        assert isinstance(self.env.action_space, spaces.Discrete)

        # parameters
        self.gamma = gamma
        self.batch_size = batch_size
        self.n_episodes = n_episodes
        self.percentile = percentile
        self.learning_rate = learning_rate
        self.horizon = horizon

        # random number generator
        self.rng = seeding.get_rng()

        # policy network constructor (use the default if none is given)
        self.policy_net_fn = policy_net_fn \
            or (lambda: default_policy_net_fn(self.env))

        self.optimizer_kwargs = {'optimizer_type': optimizer_type,
                                 'lr': learning_rate}

        # policy net
        self.policy_net = self.policy_net_fn().to(device)

        # loss function and optimizer
        self.loss_fn = nn.CrossEntropyLoss()
        self.optimizer = optimizer_factory(
                                    self.policy_net.parameters(),
                                    **self.optimizer_kwargs)

        # memory
        self.memory = CEMMemory(self.batch_size)

        # default writer
        self.writer = PeriodicWriter(self.name,
                                     log_every=5*logger.getEffectiveLevel())
Code example #4
    def __init__(self,
                 env,
                 policy,
                 learning_rate=7e-4,
                 n_steps: int = 5,
                 gamma: float = 0.99,
                 gae_lambda: float = 1.0,
                 ent_coef: float = 0.0,
                 vf_coef: float = 0.5,
                 max_grad_norm: float = 0.5,
                 rms_prop_eps: float = 1e-5,
                 use_rms_prop: bool = True,
                 use_sde: bool = False,
                 sde_sample_freq: int = -1,
                 normalize_advantage: bool = False,
                 tensorboard_log=None,
                 create_eval_env=False,
                 policy_kwargs=None,
                 verbose: int = 0,
                 seed=None,
                 device="auto",
                 _init_setup_model: bool = True,
                 **kwargs):

        # Generate seed for A2CStableBaselines using rlberry seeding
        self.rng = seeding.get_rng()
        seed = self.rng.integers(2**32).item()

        # init stable baselines class
        self.wrapped = A2CStableBaselines(
            policy, env, learning_rate, n_steps, gamma, gae_lambda, ent_coef,
            vf_coef, max_grad_norm, rms_prop_eps, use_rms_prop, use_sde,
            sde_sample_freq, normalize_advantage, tensorboard_log,
            create_eval_env, policy_kwargs, verbose, seed, device,
            _init_setup_model)

        # init rlberry base class
        Agent.__init__(self, env, **kwargs)
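
This example follows a wrapper pattern: the rlberry Agent interface is kept, while training is delegated to the stable-baselines A2C instance stored in self.wrapped. Presumably the remaining methods forward to the wrapped model roughly along these lines (a hedged sketch, not the actual class body; the budget argument name is an assumption):

    def fit(self, budget, **kwargs):
        # delegate training to the wrapped stable-baselines model
        self.wrapped.learn(total_timesteps=budget)

    def policy(self, observation, **kwargs):
        # query the wrapped model for an action
        action, _ = self.wrapped.predict(observation, deterministic=True)
        return action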
Code example #5
File: rs_kernel_ucbvi.py Project: lulica/rlberry
    def __init__(self,
                 env,
                 n_episodes=1000,
                 gamma=0.95,
                 horizon=None,
                 lp_metric=2,
                 kernel_type="epanechnikov",
                 scaling=None,
                 bandwidth=0.05,
                 min_dist=0.1,
                 max_repr=1000,
                 bonus_scale_factor=1.0,
                 beta=0.01,
                 bonus_type="simplified_bernstein",
                 **kwargs):
        # init base class
        Agent.__init__(self, env, **kwargs)

        self.n_episodes = n_episodes
        self.gamma = gamma
        self.horizon = horizon
        self.lp_metric = lp_metric
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth
        self.min_dist = min_dist
        self.bonus_scale_factor = bonus_scale_factor
        self.beta = beta
        self.bonus_type = bonus_type

        # check environment
        assert self.env.is_online()
        assert isinstance(self.env.observation_space, spaces.Box)
        assert isinstance(self.env.action_space, spaces.Discrete)

        # other checks
        assert gamma >= 0 and gamma <= 1.0
        if self.horizon is None:
            assert gamma < 1.0, \
                "If no horizon is given, gamma must be smaller than 1."
            self.horizon = int(np.ceil(1.0 / (1.0 - gamma)))

        # state dimension
        self.state_dim = self.env.observation_space.shape[0]

        # compute scaling, if it is None
        if scaling is None:
            # if high and low are bounded
            if (self.env.observation_space.high == np.inf).sum() == 0 \
                    and (self.env.observation_space.low == -np.inf).sum() == 0:
                scaling = self.env.observation_space.high \
                    - self.env.observation_space.low
            # if high or low are unbounded
            else:
                scaling = np.ones(self.state_dim)
        else:
            assert scaling.ndim == 1
            assert scaling.shape[0] == self.state_dim
        self.scaling = scaling

        # maximum value
        r_range = self.env.reward_range[1] - self.env.reward_range[0]
        if r_range == np.inf:
            logger.warning("{}: Reward range is infinity. ".format(self.name) +
                           "Clipping it to 1.")
            r_range = 1.0

        if self.gamma == 1.0:
            self.v_max = r_range * horizon
        else:
            self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon))\
                 / (1.0 - self.gamma)

        # number of representative states and number of actions
        if max_repr is None:
            max_repr = int(
                np.ceil((1.0 * np.sqrt(self.state_dim) /
                         self.min_dist)**self.state_dim))
        self.max_repr = max_repr

        # current number of representative states
        self.M = None
        self.A = self.env.action_space.n

        # declaring variables
        self.episode = None  # current episode
        self.representative_states = None  # coordinates of all repr states
        self.N_sa = None  # sum of weights at (s, a)
        self.B_sa = None  # bonus at (s, a)
        self.R_hat = None  # reward estimate
        self.P_hat = None  # transitions estimate
        self.Q = None  # Q function
        self.V = None  # V function

        self.Q_policy = None  # Q function for recommended policy

        # initialize
        self.reset()
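
The v_max computation above (repeated in examples #6 and #7) is the truncated geometric-series bound on the return: with rewards spanning a range of width r_range, the discounted sum over a horizon H is at most r_range * (1 - gamma^H) / (1 - gamma), i.e. r_range * (1 + gamma + ... + gamma^(H-1)), and simply r_range * H when gamma = 1. A small self-contained check of that identity:

import numpy as np

def v_max_bound(r_range, gamma, horizon):
    # upper bound on the discounted return over a finite horizon
    if gamma == 1.0:
        return r_range * horizon
    return r_range * (1.0 - gamma**horizon) / (1.0 - gamma)

# closed form agrees with summing the geometric series term by term
assert np.isclose(v_max_bound(1.0, 0.95, 20),
                  sum(0.95**t for t in range(20)))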
Code example #6
    def __init__(self,
                 env,
                 horizon,
                 feature_map_fn,
                 feature_map_kwargs=None,
                 n_episodes=100,
                 gamma=0.99,
                 bonus_scale_factor=1.0,
                 reg_factor=0.1,
                 **kwargs):
        Agent.__init__(self, env, **kwargs)

        self.horizon = horizon
        self.n_episodes = n_episodes
        self.gamma = gamma
        self.bonus_scale_factor = bonus_scale_factor
        self.reg_factor = reg_factor
        feature_map_kwargs = feature_map_kwargs or {}
        self.feature_map = feature_map_fn(self.env, **feature_map_kwargs)

        # rename the agent when the exploration bonus is disabled
        if self.bonus_scale_factor == 0.0:
            self.name = 'LSVI-Random-Expl'

        # maximum value
        r_range = self.env.reward_range[1] - self.env.reward_range[0]
        if r_range == np.inf:
            logger.warning("{}: Reward range is infinity. ".format(self.name) +
                           "Clipping it to 1.")
            r_range = 1.0

        if self.gamma == 1.0:
            self.v_max = r_range * horizon
        else:
            self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon))\
                 / (1.0 - self.gamma)

        # check the action space
        assert isinstance(self.env.action_space, Discrete), \
            "LSVI-UCB requires discrete actions."

        # feature dimension
        assert len(self.feature_map.shape) == 1
        self.dim = self.feature_map.shape[0]

        # attributes initialized in reset()
        self.episode = None
        self.lambda_mat = None  # lambda matrix
        self.lambda_mat_inv = None  # inverse of lambda matrix
        self.w_vec = None  # vector representation of Q
        self.w_policy = None  # representation of Q for final policy
        self.reward_hist = None  # reward history
        self.state_hist = None  # state history
        self.action_hist = None  # action history
        self.nstate_hist = None  # next state history

        self.feat_hist = None  # feature history
        self.feat_ns_all_actions = None  # next state features for all actions
        #

        # aux variables (init in reset() too)
        self._rewards = None

        # default writer
        self.writer = PeriodicWriter(self.name, log_every=15)
        # 5*logger.getEffectiveLevel()

        #
        self.reset()
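
lambda_mat and lambda_mat_inv hold the regularized design (Gram) matrix of the features, Lambda = reg_factor * I + sum_i phi_i phi_i^T, and LSVI-UCB's exploration bonus at (s, a) is proportional to sqrt(phi^T Lambda^{-1} phi). A hedged sketch of how the inverse can be maintained incrementally via the Sherman-Morrison formula (one standard choice; not necessarily how rlberry updates it):

import numpy as np

def sherman_morrison_update(lambda_inv, phi):
    # rank-one update of the inverse design matrix after observing phi
    v = lambda_inv @ phi
    return lambda_inv - np.outer(v, v) / (1.0 + phi @ v)

def exploration_bonus(lambda_inv, phi, bonus_scale_factor=1.0):
    # UCB-style bonus: large for rarely visited feature directions
    return bonus_scale_factor * np.sqrt(phi @ lambda_inv @ phi)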
Code example #7
File: kovi2.py Project: antoine-moulin/rlberry
    def __init__(self,
                 env,
                 horizon,
                 pd_kernel_fn,
                 pd_kernel_kwargs=None,
                 n_episodes=100,
                 gamma=0.99,
                 bonus_scale_factor=1.0,
                 reg_factor=0.1,
                 **kwargs):
        Agent.__init__(self, env, **kwargs)

        self.use_jit = True
        self.horizon = horizon
        self.n_episodes = n_episodes
        self.gamma = gamma
        self.bonus_scale_factor = bonus_scale_factor
        self.reg_factor = reg_factor
        self.total_time_steps = 0

        pd_kernel_kwargs = pd_kernel_kwargs or {}
        self.pd_kernel = pd_kernel_fn

        # rename the agent when the exploration bonus is disabled
        if self.bonus_scale_factor == 0.0:
            self.name = 'KOVI-Random-Expl'

        # maximum value
        r_range = self.env.reward_range[1] - self.env.reward_range[0]
        if r_range == np.inf:
            logger.warning("{}: Reward range is infinity. ".format(self.name) +
                           "Clipping it to 1.")
            r_range = 1.0

        if self.gamma == 1.0:
            self.v_max = r_range * horizon
        else:
            self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon))\
                 / (1.0 - self.gamma)

        # check the action space
        assert isinstance(self.env.action_space, Discrete), \
            "KOVI requires discrete actions."

        # attributes initialized in reset()
        self.episode = None
        self.gram_mat = None  # Gram matrix
        self.gram_mat_inv = None  # inverse of Gram matrix
        self.alphas = None  # vector representations of Q
        self.reward_hist = None  # reward history
        self.state_hist = None  # state history
        self.action_hist = None  # action history
        self.nstate_hist = None  # next state history
        self.rkhs_norm_hist = None  # norm history

        self.feat_hist = None  # feature history
        self.feat_ns_all_actions = None  # next state features for all actions

        self.new_gram_mat = None
        self.new_gram_mat_inv = None
        #

        # aux variables (init in reset() too)
        self._rewards = None

        # default writer
        self.writer = PeriodicWriter(self.name, log_every=15)
        # 5*logger.getEffectiveLevel()

        #
        self.reset()
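
KOVI is the kernelized counterpart of the previous example: instead of a d x d feature design matrix it maintains a t x t Gram matrix over the visited state-action pairs, its regularized inverse, and coefficients alphas so that Q is a kernel expansion over past data. A rough sketch of growing the Gram matrix and its inverse by one sample using a block (Schur complement) update (one common choice; the names k_vec and k_new are hypothetical, and rlberry's exact update may differ):

import numpy as np

def grow_gram_inverse(gram_mat, gram_mat_inv, k_vec, k_new, reg_factor=0.1):
    # k_vec: kernel values between the new point and the t stored points
    # k_new: kernel value of the new point with itself
    t = gram_mat.shape[0]
    new_gram = np.empty((t + 1, t + 1))
    new_gram[:t, :t] = gram_mat
    new_gram[:t, t] = k_vec
    new_gram[t, :t] = k_vec
    new_gram[t, t] = k_new + reg_factor
    # block update of the inverse via the Schur complement
    v = gram_mat_inv @ k_vec
    s = new_gram[t, t] - k_vec @ v
    new_inv = np.empty((t + 1, t + 1))
    new_inv[:t, :t] = gram_mat_inv + np.outer(v, v) / s
    new_inv[:t, t] = -v / s
    new_inv[t, :t] = -v / s
    new_inv[t, t] = 1.0 / s
    return new_gram, new_inv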