def __init__(self, observation_space, action_space, name="MBIE Agent", params={}, starting_policy=None): BaseAgent.__init__(self, observation_space, action_space, name, params=dict(MBIE_DEFAULTS, **params)) # Policy Setup if starting_policy: self.predict_policy = starting_policy else: self.predict_policy = DiscreteTabularPolicy(self.observation_space, self.action_space, default_value=1 / (1 - self.gamma)) self.backup_lim = int( np.log(1 / (self.params['epsilon_one'] * (1 - self.gamma))) / (1 - self.gamma)) self.policy_iterations = 0 # Model Setup self.model = DiscreteTabularModel( observation_space, action_space, default_reward=self.params['max_reward'], limit=self.params['known_threshold']) self.learn_policy = self.predict_policy
def __init__(self, observation_space, action_space, name="RMax Agent", params=None, starting_policy=None): BaseAgent.__init__(self, observation_space, action_space, name) # Hyper-parameters self.params = dict(RMAX_DEFAULTS) if params: for key, value in params: self.params[key] = value self.max_reward = self.params['max_reward'] self.epsilon_one = self.params['epsilon_one'] self.known_threshold = self.params['known_threshold'] self.gamma = self.params['gamma'] #self.max_reward = 1 / (1 - self.gamma) # Policy Setup self.starting_policy = starting_policy self.backup_lim = int( np.log(1 / (self.epsilon_one * (1 - self.gamma))) / (1 - self.gamma)) self.stepwise_backup_steps = 1 # self.backup_lim self.episodic_backup_steps = min(self.backup_lim, 5) # Model Setup self.model = KnownTabularModel(action_space.n, self.max_reward, self.known_threshold) self.reset()
def __init__(self, observation_space, action_space, name="Q-Learning Agent", parameters={}, starting_policy=None): BaseAgent.__init__(self, observation_space, action_space, name, params=dict(QLEARNING_CONSTS, **parameters)) # Policy Setup if starting_policy: self.predict_policy = starting_policy else: self.predict_policy = DiscreteTabularPolicy(self.observation_space, self.action_space, default_value=1/(1-self.gamma)) self.learn_policy = EpsilonGreedy( action_space=self.action_space, policy=self.predict_policy, epsilon=self.epsilon )
def __init__(self, observation_space, action_space, name="UCBVI Agent", params=None, starting_policy=None): BaseAgent.__init__(self, observation_space, action_space, name) # Hyper-parameters self.params = dict(UCBVI_DEFAULTS) if params: for key, value in params: self.params[key] = value self.max_reward = self.params['max_reward'] self.epsilon_one = self.params['epsilon_one'] self.known_threshold = self.params['known_threshold'] self.gamma = self.params['gamma'] #self.max_reward = 1 / (1 - self.gamma) self.delta = self.params['delta'] # Policy Setup self.starting_policy = starting_policy self.backup_lim = int( np.log(1 / (self.epsilon_one * (1 - self.gamma))) / (1 - self.gamma)) self.stepwise_backup_steps = 0 self.episodic_backup_steps = self.backup_lim self.policy_iterations = 0 # Model Setup self.model = DiscreteTabularModel(observation_space, action_space, default_reward=self.max_reward, limit=self.known_threshold) # Experience Tracking self.last_episode = [] # self.last_episode_model = KnownTabularModel(action_space.n, self.max_reward, 1) self.reset()