Example 1
    def __init__(self,
                 env,
                 horizon,
                 exploration_kwargs=None,
                 memory_kwargs=None,
                 n_episodes=1000,
                 batch_size=100,
                 target_update=1,
                 double=True,
                 **kwargs):
        ABC.__init__(self)
        IncrementalAgent.__init__(self, env, **kwargs)
        self.horizon = horizon
        self.exploration_kwargs = exploration_kwargs or {}
        self.memory_kwargs = memory_kwargs or {}
        self.n_episodes = n_episodes
        self.batch_size = batch_size
        self.target_update = target_update
        self.double = double

        assert isinstance(env.action_space, spaces.Discrete), \
            "Only compatible with Discrete action spaces."

        self.memory = ReplayMemory(**self.memory_kwargs)
        self.exploration_policy = \
            exploration_factory(self.env.action_space,
                                **self.exploration_kwargs)
        self.training = True
        self.steps = 0
        self.writer = None
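
The exploration policy itself comes from exploration_factory, which is not shown here. The sketch below illustrates the kind of epsilon-greedy object such a factory could return for a Discrete action space; the class name and interface are hypothetical, not the library's actual API.

import numpy as np


class EpsilonGreedySketch:
    """Minimal epsilon-greedy policy over a Discrete action space (illustrative only)."""

    def __init__(self, n_actions, temperature=1.0, final_temperature=0.05, tau=5000):
        self.n_actions = n_actions
        self.epsilon = temperature            # initial exploration rate
        self.final_epsilon = final_temperature
        self.tau = tau                        # decay time constant, in steps
        self.steps = 0
        self.rng = np.random.default_rng()

    def sample(self, q_values):
        # exponential decay of epsilon towards its final value
        eps = self.final_epsilon + \
            (self.epsilon - self.final_epsilon) * np.exp(-self.steps / self.tau)
        self.steps += 1
        if self.rng.random() < eps:
            return int(self.rng.integers(self.n_actions))  # explore
        return int(np.argmax(q_values))                    # exploit
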
Example 2
    def __init__(self,
                 env,
                 n_episodes=1000,
                 gamma=1.0,
                 horizon=50,
                 bonus_scale_factor=1.0,
                 bonus_type="simplified_bernstein",
                 **kwargs):
        IncrementalAgent.__init__(self, env, **kwargs)

        assert isinstance(self.env.observation_space, spaces.Box)
        assert isinstance(self.env.action_space, spaces.Discrete)

        self.n_episodes = n_episodes
        self.gamma = gamma
        self.horizon = horizon
        self.bonus_scale_factor = bonus_scale_factor
        self.bonus_type = bonus_type

        # maximum value
        r_range = self.env.reward_range[1] - self.env.reward_range[0]
        if r_range == np.inf or r_range == 0.0:
            logger.warning(
                "{}: Reward range is  zero or infinity. ".format(self.name) +
                "Setting it to 1.")
            r_range = 1.0

        self.v_max = np.zeros(self.horizon)
        self.v_max[-1] = r_range
        for hh in reversed(range(self.horizon - 1)):
            self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1]

        self.reset()
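
The backward recursion for v_max is the truncated geometric sum v_max[h] = r_range * (1 - gamma**(H - h)) / (1 - gamma). A quick self-contained check with numpy (values chosen arbitrarily):

import numpy as np

gamma, horizon, r_range = 0.95, 50, 1.0

v_max = np.zeros(horizon)
v_max[-1] = r_range
for hh in reversed(range(horizon - 1)):
    v_max[hh] = r_range + gamma * v_max[hh + 1]

# closed form of the truncated geometric sum (valid for gamma < 1)
closed_form = r_range * (1.0 - gamma ** (horizon - np.arange(horizon))) \
    / (1.0 - gamma)
assert np.allclose(v_max, closed_form)
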
Example 3
    def __init__(self, env,
                 n_episodes=4000,
                 batch_size=8,
                 horizon=256,
                 gamma=0.99,
                 entr_coef=0.01,
                 vf_coef=0.,
                 avec_coef=1.,
                 learning_rate=0.0003,
                 optimizer_type='ADAM',
                 eps_clip=0.2,
                 k_epochs=10,
                 policy_net_fn=None,
                 value_net_fn=None,
                 policy_net_kwargs=None,
                 value_net_kwargs=None,
                 use_bonus=False,
                 uncertainty_estimator_kwargs=None,
                 device="cuda:best",
                 **kwargs):
        self.use_bonus = use_bonus
        if self.use_bonus:
            env = UncertaintyEstimatorWrapper(
                env, **(uncertainty_estimator_kwargs or {}))
        IncrementalAgent.__init__(self, env, **kwargs)

        self.learning_rate = learning_rate
        self.gamma = gamma
        self.entr_coef = entr_coef
        self.vf_coef = vf_coef
        self.avec_coef = avec_coef
        self.eps_clip = eps_clip
        self.k_epochs = k_epochs
        self.horizon = horizon
        self.n_episodes = n_episodes
        self.batch_size = batch_size
        self.device = choose_device(device)

        self.policy_net_kwargs = policy_net_kwargs or {}
        self.value_net_kwargs = value_net_kwargs or {}

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n

        # default network constructors, used when none are provided
        self.policy_net_fn = policy_net_fn or default_policy_net_fn
        self.value_net_fn = value_net_fn or default_value_net_fn

        self.optimizer_kwargs = {'optimizer_type': optimizer_type,
                                 'lr': learning_rate}

        # check environment
        assert isinstance(self.env.observation_space, spaces.Box)
        assert isinstance(self.env.action_space, spaces.Discrete)

        self.cat_policy = None  # categorical policy function

        # initialize
        self.reset()
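
The string "cuda:best" is resolved by choose_device, whose implementation is not shown. A plausible stand-in (an assumption, not the library's actual code) simply falls back to the CPU when CUDA is unavailable:

import torch


def choose_device_sketch(device):
    # treat "cuda:best" as "use the GPU if one is available" (illustrative behaviour)
    if device.startswith("cuda") and not torch.cuda.is_available():
        return torch.device("cpu")
    if device == "cuda:best":
        return torch.device("cuda")
    return torch.device(device)
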
Example 4
    def __init__(self, env, n_episodes, hyperparameter=0, **kwargs):
        IncrementalAgent.__init__(self, env, **kwargs)
        self.name = "DummyAgent"
        self.n_episodes = n_episodes
        self.fitted = False
        self.hyperparameter = hyperparameter

        self.fraction_fitted = 0.0
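
For context, a hypothetical partial_fit matching this constructor could simply advance fraction_fitted; this sketches the incremental-fitting contract and is not the library's code:

def partial_fit_sketch(agent, fraction):
    """Advance the dummy agent's training by `fraction` of its total budget."""
    assert 0.0 < fraction <= 1.0
    agent.fraction_fitted = min(1.0, agent.fraction_fitted + fraction)
    agent.fitted = agent.fraction_fitted >= 1.0
    return {"n_episodes": int(agent.fraction_fitted * agent.n_episodes)}
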
Example 5
    def __init__(self,
                 env,
                 n_episodes=4000,
                 batch_size=8,
                 horizon=256,
                 gamma=0.99,
                 entr_coef=0.01,
                 vf_coef=0.,
                 avec_coef=1.,
                 learning_rate=0.0003,
                 optimizer_type='ADAM',
                 eps_clip=0.2,
                 k_epochs=10,
                 policy_net_fn=None,
                 value_net_fn=None,
                 **kwargs):
        IncrementalAgent.__init__(self, env, **kwargs)

        self.learning_rate = learning_rate
        self.gamma = gamma
        self.entr_coef = entr_coef
        self.vf_coef = vf_coef
        self.avec_coef = avec_coef
        self.eps_clip = eps_clip
        self.k_epochs = k_epochs
        self.horizon = horizon
        self.n_episodes = n_episodes
        self.batch_size = batch_size

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n

        # default network constructors, used when none are provided
        self.policy_net_fn = policy_net_fn \
            or (lambda: default_policy_net_fn(self.env))

        self.value_net_fn = value_net_fn \
            or (lambda: default_value_net_fn(self.env))

        self.optimizer_kwargs = {
            'optimizer_type': optimizer_type,
            'lr': learning_rate
        }

        # check environment
        assert isinstance(self.env.observation_space, spaces.Box)
        assert isinstance(self.env.action_space, spaces.Discrete)

        self.cat_policy = None  # categorical policy function

        # initialize
        self.reset()
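
The optimizer_kwargs dictionary is later consumed by an optimizer_factory. A minimal sketch of such a factory, assuming it only has to map the string 'ADAM' (or 'RMS_PROP') to the corresponding torch optimizer:

import torch


def optimizer_factory_sketch(params, optimizer_type="ADAM", lr=0.0003):
    # illustrative mapping only; the real factory may support more options
    if optimizer_type == "ADAM":
        return torch.optim.Adam(params, lr=lr)
    if optimizer_type == "RMS_PROP":
        return torch.optim.RMSprop(params, lr=lr)
    raise ValueError("Unknown optimizer_type: {}".format(optimizer_type))
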
Example 6
    def __init__(self,
                 env,
                 n_episodes=1000,
                 gamma=1.0,
                 horizon=100,
                 bonus_scale_factor=1.0,
                 bonus_type="simplified_bernstein",
                 reward_free=False,
                 stage_dependent=False,
                 real_time_dp=False,
                 **kwargs):
        # init base class
        IncrementalAgent.__init__(self, env, **kwargs)

        self.n_episodes = n_episodes
        self.gamma = gamma
        self.horizon = horizon
        self.bonus_scale_factor = bonus_scale_factor
        self.bonus_type = bonus_type
        self.reward_free = reward_free
        self.stage_dependent = stage_dependent
        self.real_time_dp = real_time_dp

        # check environment
        assert isinstance(self.env.observation_space, spaces.Discrete)
        assert isinstance(self.env.action_space, spaces.Discrete)

        # other checks
        assert 0.0 <= gamma <= 1.0
        if self.horizon is None:
            assert gamma < 1.0, \
                "If no horizon is given, gamma must be smaller than 1."
            self.horizon = int(np.ceil(1.0 / (1.0 - gamma)))

        # maximum value
        r_range = self.env.reward_range[1] - self.env.reward_range[0]
        if r_range == np.inf or r_range == 0.0:
            logger.warning(
                "{}: Reward range is  zero or infinity. ".format(self.name) +
                "Setting it to 1.")
            r_range = 1.0

        self.v_max = np.zeros(self.horizon)
        self.v_max[-1] = r_range
        for hh in reversed(range(self.horizon - 1)):
            self.v_max[hh] = r_range + self.gamma * self.v_max[hh + 1]

        # initialize
        self.reset()
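
The "simplified_bernstein" bonus itself is computed elsewhere in the agent. A commonly used count-based form, shown purely as an illustration (the exact constants used by the library may differ), scales like 1/sqrt(n) and is clipped at the stage's maximum value:

import numpy as np


def simplified_bernstein_bonus_sketch(n_visits, v_max_h, bonus_scale_factor=1.0):
    """Count-based bonus ~ scale * sqrt(1/n), clipped at v_max (illustrative form)."""
    n = max(1, n_visits)
    bonus = bonus_scale_factor * np.sqrt(1.0 / n) + 1.0 / n
    return min(bonus, v_max_h)
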
Example 7
    def __init__(self,
                 env,
                 n_episodes=4000,
                 batch_size=8,
                 horizon=256,
                 gamma=0.99,
                 entr_coef=0.01,
                 learning_rate=0.0001,
                 normalize=True,
                 optimizer_type='ADAM',
                 policy_net_fn=None,
                 policy_net_kwargs=None,
                 use_bonus_if_available=False,
                 device="cuda:best",
                 **kwargs):
        IncrementalAgent.__init__(self, env, **kwargs)

        self.n_episodes = n_episodes
        self.batch_size = batch_size
        self.horizon = horizon
        self.gamma = gamma
        self.entr_coef = entr_coef
        self.learning_rate = learning_rate
        self.normalize = normalize
        self.use_bonus_if_available = use_bonus_if_available
        self.device = choose_device(device)

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n

        self.policy_net_kwargs = policy_net_kwargs or {}

        # default policy network constructor, used when none is provided
        self.policy_net_fn = policy_net_fn or default_policy_net_fn

        self.optimizer_kwargs = {
            'optimizer_type': optimizer_type,
            'lr': learning_rate
        }

        # check environment
        assert isinstance(self.env.observation_space, spaces.Box)
        assert isinstance(self.env.action_space, spaces.Discrete)

        self.policy_net = None  # policy network

        # initialize
        self.reset()
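
The normalize flag usually refers to standardizing the discounted returns before the policy-gradient update; the numpy sketch below shows that computation under this assumption about what the flag controls:

import numpy as np


def discounted_returns(rewards, gamma=0.99, normalize=True):
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    if normalize:
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns


print(discounted_returns([1.0, 0.0, 1.0], gamma=0.9))
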
Example 8
    def __init__(self,
                 env,
                 n_episodes=1000,
                 horizon=100,
                 gamma=0.99,
                 entr_coef=0.1,
                 batch_size=16,
                 percentile=70,
                 learning_rate=0.01,
                 optimizer_type='ADAM',
                 on_policy=False,
                 policy_net_fn=None,
                 policy_net_kwargs=None,
                 device="cuda:best",
                 **kwargs):
        IncrementalAgent.__init__(self, env, **kwargs)

        # check environment
        assert isinstance(self.env.observation_space, spaces.Box)
        assert isinstance(self.env.action_space, spaces.Discrete)

        # parameters
        self.gamma = gamma
        self.entr_coef = entr_coef
        self.batch_size = batch_size
        self.n_episodes = n_episodes
        self.percentile = percentile
        self.learning_rate = learning_rate
        self.horizon = horizon
        self.on_policy = on_policy
        self.policy_net_kwargs = policy_net_kwargs or {}
        self.policy_net_fn = policy_net_fn or default_policy_net_fn
        self.optimizer_kwargs = {
            'optimizer_type': optimizer_type,
            'lr': learning_rate
        }
        self.device = choose_device(device)
        self.reset()
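
The percentile parameter is characteristic of the cross-entropy method: only episodes whose return exceeds that percentile of the batch are kept for the policy update. A minimal numpy sketch of the elite-selection step (illustrative, not the agent's exact code):

import numpy as np


def select_elite_episodes(episode_returns, percentile=70):
    """Return indices of episodes whose return is at or above the given percentile."""
    episode_returns = np.asarray(episode_returns)
    threshold = np.percentile(episode_returns, percentile)
    return np.nonzero(episode_returns >= threshold)[0]


print(select_elite_episodes([2.0, 5.0, 1.0, 7.0], percentile=70))  # -> [3]
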
Example 9
    def __init__(self,
                 env,
                 n_episodes=1000,
                 horizon=256,
                 gamma=0.99,
                 loss_function="l2",
                 batch_size=100,
                 device="cuda:best",
                 target_update=1,
                 learning_rate=0.001,
                 epsilon_init=1.0,
                 epsilon_final=0.1,
                 epsilon_decay=5000,
                 optimizer_type='ADAM',
                 qvalue_net_fn=None,
                 qvalue_net_kwargs=None,
                 double=True,
                 memory_capacity=10000,
                 use_bonus=False,
                 uncertainty_estimator_kwargs=None,
                 prioritized_replay=True,
                 update_frequency=1,
                 **kwargs):
        # Wrap arguments and initialize base class
        memory_kwargs = {
            'capacity': memory_capacity,
            'n_steps': 1,
            'gamma': gamma
        }
        exploration_kwargs = {
            'method': "EpsilonGreedy",
            'temperature': epsilon_init,
            'final_temperature': epsilon_final,
            'tau': epsilon_decay,
        }
        self.use_bonus = use_bonus
        if self.use_bonus:
            env = UncertaintyEstimatorWrapper(
                env, **(uncertainty_estimator_kwargs or {}))
        IncrementalAgent.__init__(self, env, **kwargs)
        self.horizon = horizon
        self.exploration_kwargs = exploration_kwargs or {}
        self.memory_kwargs = memory_kwargs or {}
        self.n_episodes = n_episodes
        self.batch_size = batch_size
        self.target_update = target_update
        self.double = double

        assert isinstance(env.action_space, spaces.Discrete), \
            "Only compatible with Discrete action spaces."

        self.prioritized_replay = prioritized_replay
        memory_class = PrioritizedReplayMemory if prioritized_replay \
            else TransitionReplayMemory
        self.memory = memory_class(**self.memory_kwargs)
        self.exploration_policy = \
            exploration_factory(self.env.action_space,
                                **self.exploration_kwargs)
        self.training = True
        self.steps = 0
        self.episode = 0
        self.writer = None

        self.optimizer_kwargs = {
            'optimizer_type': optimizer_type,
            'lr': learning_rate
        }
        self.device = choose_device(device)
        self.loss_function = loss_function
        self.gamma = gamma

        qvalue_net_kwargs = qvalue_net_kwargs or {}
        qvalue_net_fn = load(qvalue_net_fn) if isinstance(qvalue_net_fn, str) \
            else (qvalue_net_fn or default_qvalue_net_fn)
        self.value_net = qvalue_net_fn(self.env, **qvalue_net_kwargs)
        self.target_net = qvalue_net_fn(self.env, **qvalue_net_kwargs)

        self.target_net.load_state_dict(self.value_net.state_dict())
        self.target_net.eval()
        logger.info("Number of trainable parameters: {}".format(
            trainable_parameters(self.value_net)))
        self.value_net.to(self.device)
        self.target_net.to(self.device)
        self.loss_function = loss_function_factory(self.loss_function)
        self.optimizer = optimizer_factory(self.value_net.parameters(),
                                           **self.optimizer_kwargs)
        self.update_frequency = update_frequency
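
With double=True, the target is typically computed by letting the online network pick the greedy next action while the target network evaluates it. A short torch sketch of that step, reusing the value_net/target_net pair created above (the batch layout is an assumption):

import torch


def double_dqn_targets(value_net, target_net, rewards, next_states, dones, gamma):
    with torch.no_grad():
        # online network selects the greedy next action ...
        next_actions = value_net(next_states).argmax(dim=1, keepdim=True)
        # ... target network evaluates that action
        next_q = target_net(next_states).gather(1, next_actions).squeeze(1)
        return rewards + gamma * (1.0 - dones) * next_q
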
Example 10
    def __init__(self,
                 env,
                 n_episodes=4000,
                 batch_size=8,
                 horizon=256,
                 gamma=0.99,
                 entr_coef=0.01,
                 vf_coef=0.5,
                 learning_rate=0.01,
                 optimizer_type='ADAM',
                 k_epochs=5,
                 use_gae=True,
                 gae_lambda=0.95,
                 policy_net_fn=None,
                 value_net_fn=None,
                 policy_net_kwargs=None,
                 value_net_kwargs=None,
                 device="cuda:best",
                 use_bonus=False,
                 uncertainty_estimator_kwargs=None,
                 **kwargs):
        self.use_bonus = use_bonus
        if self.use_bonus:
            env = UncertaintyEstimatorWrapper(
                env, **(uncertainty_estimator_kwargs or {}))
        IncrementalAgent.__init__(self, env, **kwargs)

        self.n_episodes = n_episodes
        self.batch_size = batch_size
        self.horizon = horizon
        self.gamma = gamma
        self.entr_coef = entr_coef
        self.vf_coef = vf_coef
        self.learning_rate = learning_rate
        self.k_epochs = k_epochs
        self.use_gae = use_gae
        self.gae_lambda = gae_lambda
        self.damping = 0  # TODO: turn into argument
        self.max_kl = 0.1  # TODO: turn into argument
        self.use_entropy = False  # TODO: test, and eventually turn into argument
        self.normalize_advantage = True  # TODO: turn into argument
        self.normalize_reward = False  # TODO: turn into argument

        self.policy_net_kwargs = policy_net_kwargs or {}
        self.value_net_kwargs = value_net_kwargs or {}

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n

        # default network constructors, used when none are provided
        self.policy_net_fn = policy_net_fn or default_policy_net_fn
        self.value_net_fn = value_net_fn or default_value_net_fn

        self.device = choose_device(device)

        self.optimizer_kwargs = {
            'optimizer_type': optimizer_type,
            'lr': learning_rate
        }

        # check environment
        assert isinstance(self.env.observation_space, spaces.Box)
        assert isinstance(self.env.action_space, spaces.Discrete)

        # TODO: check
        self.cat_policy = None  # categorical policy function
        self.policy_optimizer = None

        self.value_net = None
        self.value_optimizer = None

        self.cat_policy_old = None

        self.value_loss_fn = None

        self.memory = None

        self.episode = 0

        self._rewards = None
        self._cumul_rewards = None

        # initialize
        self.reset()
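
When use_gae=True, advantages are obtained by generalized advantage estimation with parameter gae_lambda. A self-contained numpy version of that recursion (a sketch; the agent's own implementation lives in its update method):

import numpy as np


def gae_advantages(rewards, values, dones, gamma=0.99, gae_lambda=0.95):
    """A_t = sum_k (gamma * lambda)^k * delta_{t+k}, computed backwards in one pass."""
    advantages = np.zeros(len(rewards))
    last_adv = 0.0
    next_value = 0.0  # value after the final step (0 if the trajectory ends there)
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * non_terminal - values[t]
        last_adv = delta + gamma * gae_lambda * non_terminal * last_adv
        advantages[t] = last_adv
        next_value = values[t]
    return advantages
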
Example 11
    def __init__(self,
                 env,
                 n_episodes=1000,
                 gamma=0.99,
                 horizon=100,
                 lp_metric=2,
                 scaling=None,
                 min_dist=0.1,
                 max_repr=1000,
                 bonus_scale_factor=1.0,
                 bonus_type="simplified_bernstein",
                 reward_free=False,
                 **kwargs):
        # init base class
        IncrementalAgent.__init__(self, env, **kwargs)

        self.n_episodes = n_episodes
        self.gamma = gamma
        self.horizon = horizon
        self.lp_metric = lp_metric
        self.min_dist = min_dist
        self.bonus_scale_factor = bonus_scale_factor
        self.bonus_type = bonus_type
        self.reward_free = reward_free

        # check environment
        assert isinstance(self.env.observation_space, spaces.Box)
        assert isinstance(self.env.action_space, spaces.Discrete)

        # other checks
        assert 0.0 <= gamma <= 1.0
        if self.horizon is None:
            assert gamma < 1.0, \
                "If no horizon is given, gamma must be smaller than 1."
            self.horizon = int(np.ceil(1.0 / (1.0 - gamma)))

        # state dimension
        self.state_dim = self.env.observation_space.shape[0]

        # compute scaling, if it is None
        if scaling is None:
            # if high and low are bounded
            if (self.env.observation_space.high == np.inf).sum() == 0 \
                    and (self.env.observation_space.low == -np.inf).sum() == 0:
                scaling = self.env.observation_space.high \
                    - self.env.observation_space.low
            else:
                # if high or low are unbounded
                scaling = np.ones(self.state_dim)
        else:
            assert scaling.ndim == 1
            assert scaling.shape[0] == self.state_dim
        self.scaling = scaling

        # maximum value
        r_range = self.env.reward_range[1] - self.env.reward_range[0]
        if r_range == np.inf or r_range == 0.0:
            logger.warning(
                "{}: Reward range is  zero or infinity. ".format(self.name) +
                "Setting it to 1.")
            r_range = 1.0

        if self.gamma == 1.0:
            self.v_max = r_range * horizon
        else:
            self.v_max = r_range * (1.0 - np.power(self.gamma, self.horizon)) \
                / (1.0 - self.gamma)

        # number of representative states and number of actions
        if max_repr is None:
            max_repr = int(
                np.ceil((1.0 * np.sqrt(self.state_dim) /
                         self.min_dist)**self.state_dim))
        self.max_repr = max_repr

        # current number of representative states
        self.M = None
        self.A = self.env.action_space.n

        # declaring variables
        self.episode = None  # current episode
        self.representative_states = None  # coordinates of all repr states
        self.N_sa = None  # visits to (s, a)
        self.N_sas = None  # visits to (s, a, s')
        self.S_sa = None  # sum of rewards at (s, a)
        self.B_sa = None  # bonus at (s, a)
        self.Q = None  # Q function
        self.V = None  # V function

        self.Q_policy = None  # Q function for recommended policy

        # initialize
        self.reset()
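
lp_metric, scaling, and min_dist govern how observations are mapped to representative states: a new state is added only when its scaled Lp distance to every existing representative exceeds min_dist. A numpy sketch of that mapping (illustrative; array shapes are an assumption):

import numpy as np


def map_to_representative(state, repr_states, scaling, lp_metric=2, min_dist=0.1):
    """Return (index, updated repr_states), adding `state` as a new one if needed."""
    if len(repr_states) > 0:
        dists = np.linalg.norm((repr_states - state) / scaling,
                               ord=lp_metric, axis=1)
        nearest = int(np.argmin(dists))
        if dists[nearest] <= min_dist:
            return nearest, repr_states
    # too far from every representative (or none yet): add the state as a new one
    if len(repr_states) > 0:
        repr_states = np.vstack([repr_states, state])
    else:
        repr_states = state[None, :]
    return len(repr_states) - 1, repr_states
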