Example #1
class SMDPPlanning:
    """ Estimates value function given learned models R and P """
    def __init__(self,
                 env: MiniGridEnv,
                 R: np.ndarray,
                 P: np.ndarray,
                 loglevel: int = 10):
        self.env = env

        option_dim, self.state_space_dim = R.shape[0], R.shape[1:]
        state_space_flat = np.prod(self.state_space_dim)
        self.R = R.reshape((option_dim, state_space_flat))
        self.P = P.reshape((option_dim, state_space_flat, state_space_flat))
        self.V = np.zeros(state_space_flat)

        self.logger = ProjectLogger(level=loglevel, printing=True)

    def svi(self, θ: float = 1e-9):
        """ Iterative Policy Evaluation using Synchronous Value Iteration.
        
        Estimates V by acting greedy wrt to V_hat (current estimate of V)
        """
        δ = float('inf')

        while δ > θ:
            v_old = self.V
            self.V = (self.R + np.dot(self.P, self.V)).max(axis=0)
            δ = np.sum(np.abs(self.V - v_old))
            self.logger.debug(f'State-value delta: {δ}')
            yield self.V.reshape(self.state_space_dim)

        return self.V.reshape(self.state_space_dim)
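
The backup inside svi is the SMDP Bellman optimality update V(s) ← max_o [ R(s, o) + Σ_s' P(s, s' | o) V(s') ], where the γ^k discount of multi-step options is already folded into P. Below is a minimal, self-contained sketch of the same backup on a made-up 2-option, 3-state model; the arrays are purely illustrative and not taken from the environment above.

import numpy as np

# Hypothetical SMDP model: R[o, s] is the expected discounted reward of option o
# started in s; P[o, s, s'] is the discounted (hence sub-stochastic) transition model.
R = np.array([[1.0, 0.0, 0.0],
              [0.0, 0.5, 2.0]])
P = 0.9 * np.array([[[0.0, 1.0, 0.0],
                     [0.0, 0.0, 1.0],
                     [0.0, 0.0, 1.0]],
                    [[0.0, 0.0, 1.0],
                     [0.0, 0.0, 1.0],
                     [0.0, 0.0, 1.0]]])

V, delta = np.zeros(3), float('inf')
while delta > 1e-9:
    v_old = V
    V = (R + np.dot(P, V)).max(axis=0)   # greedy backup over options
    delta = np.abs(V - v_old).sum()
print(V)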
Example #2
class SMDPValueLearning:
    """ Algorithms for finding an optimal policy over a set of options in SMDP.
    
    Treats each option as an indivisible unit.
    Does not work well in this setting, since the rooms are larger than in the original experiment
    and thus the probability of stumbling across a goal state, while performing primitive actions only is much smaller.
    Consider a case when the agent is at the hallway state. It can try to make a primitive action
    in the direction of the goal. However, at the next state it can choose to take an option that takes it
    either to the other hallway or back with probability 2/5. As the agent makes progress towards the goal state,
    it's more likely that the option will get activated along the way. Since no intra-option learning is happening,
    the value is not attributed to the states surrounding the goal.
    """
    def __init__(self,
                 env: MiniGridEnv,
                 options: Options,
                 policy: PolicyOverOptions,
                 loglevel: int = 20):
        self.env = env
        self.options = options
        self.option_names_dict = {o.name: o for o in self.options}
        self.option_idx_dict = {
            name: i
            for i, name in enumerate(self.option_names_dict)
        }

        self._policy = policy
        self.logger = ProjectLogger(level=loglevel, printing=False)

    def policy(self, Q, state, *args, **kwargs):
        option = self._policy(Q, state, *args, **kwargs)
        return MarkovOption(starting_state=state,
                            initiation_set=option.initiation_set,
                            termination_function=option.termination,
                            policy=option.target_policy,
                            name=str(option))

    def q_learning(self,
                   n_episodes: int,
                   γ: float = 0.9,
                   Q: np.ndarray = None,
                   N: np.ndarray = None,
                   α: float = None,
                   render: bool = False):

        env = self.env.unwrapped
        n_options = len(self.options)
        state_space_dim = (4, env.width, env.height)
        dim = (n_options, *state_space_dim)

        if Q is None:
            N = np.zeros(dim)
            Q = np.zeros(dim)

        for episode in range(n_episodes):

            self.env.reset()
            state = (env.agent_dir, *reversed(env.agent_pos))
            executing_option = self.policy(Q, state)
            done = False

            while not done:

                # Step through environment
                a = executing_option.policy(state)
                obs, reward, done, info = self.env.step(a)
                # TODO: infer the state of the agent from obs, i.e. make it POMDP
                s_next = (env.agent_dir, *reversed(env.agent_pos))

                if render:
                    action_name = list(env.actions)[a].name
                    self.logger.debug(f"State: {state}, "
                                      f"Option: {executing_option}, "
                                      f"Action: {action_name}, "
                                      f"Next State: {s_next}")
                    self.env.render()
                    time.sleep(0.05)

                # Update the option's cumulant (reward at step k is discounted
                # by γ**k) before incrementing its duration
                executing_option.cumulant += γ**executing_option.k * reward
                executing_option.k += 1

                # Check for termination condition and update action-values
                if executing_option.termination_function(s_next) == 1 or done:

                    start_state = (self.option_idx_dict[executing_option.name],
                                   *executing_option.starting_state)

                    # Determine the step-size
                    if α is None:
                        N[start_state] += 1
                        alpha = 1 / N[start_state]
                    else:
                        alpha = α

                    # Update Q in the direction of the optimal action
                    r = executing_option.cumulant
                    k = executing_option.k
                    o = randargmax(Q[(slice(None), *s_next)])
                    target = r + γ**k * Q[(o, *s_next)]
                    Q[start_state] += alpha * (target - Q[start_state])

                    # Choose the next option
                    executing_option = self.policy(Q, s_next)

                # Reset the state
                state = s_next
            yield Q, self.env.step_count

        return Q, self.env.step_count

    @staticmethod
    def plot_episode_duration(steps_per_episode):
        traces = list()
        for option_set, steps in steps_per_episode.items():
            traces.append(
                go.Scatter(
                    mode='lines',
                    y=steps,
                    name=option_set,
                ))

        layout = dict(height=700,
                      showlegend=True,
                      xaxis=dict(title='Episodes'),
                      yaxis=dict(title='Steps per episode'))
        return {'data': traces, 'layout': layout}
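
A hedged usage sketch for the learner above. The environment id and the helpers make_env, make_hallway_options and EgreedyPolicyOverOptions stand in for whatever constructs the environment, the options and the policy over options in this project; they are not defined in these listings.

# Illustrative driver; the constructors below are assumptions, not part of the code above.
env = make_env('MiniGrid-FourRooms-v0')
options = make_hallway_options(env)
policy = EgreedyPolicyOverOptions(epsilon=0.1)

learner = SMDPValueLearning(env, options, policy)
steps_per_episode = [steps for _, steps in
                     learner.q_learning(n_episodes=100, γ=0.9, α=0.25)]
fig = SMDPValueLearning.plot_episode_duration({'hallway options': steps_per_episode})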
Example #3
class IntraOptionModelLearning:
    
    def __init__(self,
                 env: MiniGridEnv,
                 options: Options,
                 loglevel: int = 20):
        self.env = env
        self.options = options
        self.option_names_dict = {o.name: o for o in self.options}
        self.option_idx_dict = {name: i for i, name in
                                enumerate(self.option_names_dict)}
        self.logger = ProjectLogger(level=loglevel, printing=False)
    
    def __str__(self):
        return 'IntraOptionModelLearning'
    
    def choose_option(self, state):
        """ Picks an option at random """
        options = [o for o in self.options if o.initiation_set[state] == 1]
        return random.choice(options)
    
    def run_episode(self,
                    N: np.ndarray = None,
                    R: np.ndarray = None,
                    P: np.ndarray = None,
                    γ: float = 0.9,
                    α: float = None,
                    render: bool = False):
        
        env = self.env.unwrapped
        n_options = len(self.options)
        state_space = (4, env.width, env.height)
        dim = (n_options, *state_space)
        
        if R is None:
            R = np.zeros(dim)
            N = np.zeros(dim)
            P = np.zeros((n_options, *state_space, *state_space))
        
        state = self.env.reset()
        done = False
        executing_option = None
        
        while not done:
            
            if executing_option is None:
                executing_option = self.choose_option(state)
            
            a = executing_option.policy(state)
            s_next, reward, done, info = self.env.step(a)
            
            if render:
                action_name = list(env.actions)[a].name
                self.logger.debug(f"State: {state}, "
                                  f"Option: {executing_option}, "
                                  f"Action: {action_name}, "
                                  f"Next State: {s_next}")
                self.env.render()
                time.sleep(0.05)
            
            # Update model for every option consistent with last action taken
            for option in self.options:
                
                if option.policy(state) != a:
                    continue
                
                o = self.option_idx_dict[option.name]
                option_state = (o, *state)
                
                # Update visitation counter
                if α is None:
                    N[option_state] += 1
                    alpha = 1 / N[option_state]
                else:
                    alpha = α
                
                # Update reward matrix
                β = option.termination_function(s_next)
                target = reward + γ * (1 - β) * R[(o, *s_next)]
                R[option_state] += alpha * (target - R[option_state])
                
                # Update probability transition matrix (bootstrap from the row
                # of the next state; P is indexed as P[o, source, target])
                target = γ * (1 - β) * P[(o, *s_next)]
                P[option_state] += alpha * (target - P[option_state])
                P[(o, *state, *s_next)] += alpha * γ * β
            
            if executing_option.termination_function(s_next) == 1:
                executing_option = None
            
            state = s_next
            yield N, R, P
        
        return N, R, P
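
For reference, the two updates applied inside the loop above are the intra-option model-learning rules, applied at every step to every option o whose policy agrees with the action taken:

    R(s, o) ← R(s, o) + α [ r + γ (1 − β_o(s')) R(s', o) − R(s, o) ]
    P(x | s, o) ← P(x | s, o) + α [ γ (1 − β_o(s')) P(x | s', o) + γ β_o(s') 1{x = s'} − P(x | s, o) ]   for all x

where β_o(s') is the option's termination probability in the next state. Because every consistent option is updated from a single primitive transition, the models can be learned from the same experience more efficiently than with the Monte Carlo variant in the next example (SMDPModelLearning).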
Example #4
class SMDPModelLearning:
    """ Model learning in Semi-Markov Decision Process via MC sampling """
    def __init__(self, env: MiniGridEnv, options: Options, loglevel: int = 10):
        self.env = env
        self.options = options
        self.option_names_dict = {o.name: o for o in self.options}
        self.option_idx_dict = {
            name: i
            for i, name in enumerate(self.option_names_dict)
        }
        self.logger = ProjectLogger(level=loglevel, printing=True)

    def __str__(self):
        return 'SMDPModelLearning'

    def choose_option(self, state):
        """ Picks an option at random """
        options = [o for o in self.options if o.initiation_set[state] == 1]
        option = random.choice(options)
        return MarkovOption(starting_state=state,
                            initiation_set=option.initiation_set,
                            termination_function=option.termination_function,
                            policy=option.policy,
                            name=str(option))

    def run_episode(self,
                    N: np.ndarray = None,
                    R: np.ndarray = None,
                    P: np.ndarray = None,
                    γ: float = 0.9,
                    render: bool = False):

        env = self.env.unwrapped
        n_options = len(self.options)
        state_space_dim = (4, env.width, env.height)
        dim = (n_options, *state_space_dim)

        if R is None:
            R = np.zeros(dim)
            N = np.zeros(dim)
            P = np.zeros((n_options, *state_space_dim, *state_space_dim))

        self.env.reset()
        state = (env.agent_dir, *reversed(env.agent_pos))
        done = False
        executing_option = None

        while not done:

            if executing_option is None:
                executing_option = self.choose_option(state)

            a = executing_option.policy(state)
            obs, reward, done, info = self.env.step(a)
            state_next = (env.agent_dir, *reversed(env.agent_pos))

            if render:
                action_name = list(env.actions)[a].name
                self.logger.debug(f"State: {state}, "
                                  f"Option: {executing_option}, "
                                  f"Action: {action_name}, "
                                  f"Next State: {state_next}")
                self.env.render()
                time.sleep(0.05)

            # Accumulate the discounted cumulant before incrementing the duration
            executing_option.cumulant += γ**executing_option.k * reward
            executing_option.k += 1

            # Check for termination condition and update the model
            if executing_option.termination_function(state_next) == 1:
                option_state = (self.option_idx_dict[executing_option.name],
                                *executing_option.starting_state)
                # Update visitation counter
                N[option_state] += 1
                α = 1 / N[option_state]

                # Update reward matrix
                R[option_state] += α * (executing_option.cumulant -
                                        R[option_state])

                # Update probability transition matrix: shrink the whole row,
                # then add γ**k probability mass at the termination state
                P[option_state] -= α * P[option_state]
                P[(*option_state, *state_next)] += α * (γ**executing_option.k)

                executing_option = None

            state = state_next
            yield N, R, P

        return N, R, P

    def get_true_models(self, seed: int = 1337, γ: float = 0.9):
        """ Learn true dynamics (P) and reward (R) models by unrolling each
        option for each state in its initiation set until termination
        # TODO: need to call multiple times if the environment is stochastic?
        """

        np.random.seed(seed)

        n_options = len(self.options)
        state_space_dim = (4, self.env.width, self.env.height)
        dim = (n_options, *state_space_dim)

        R = np.zeros(dim)
        N = np.zeros(dim)
        P = np.zeros((n_options, *state_space_dim, *state_space_dim))

        self.env.reset()

        for option_i, option in tqdm(enumerate(self.options)):
            for state, active in np.ndenumerate(option.initiation_set):

                # Checks if the state is in initiation set
                if not active:
                    continue

                env = self.env.unwrapped
                env.agent_dir, env.agent_pos = state[0], tuple(
                    reversed(state[1:]))
                cell = self.env.grid.get(*env.agent_pos)

                # Check if the state is valid for the agent to be in
                if not (cell is None or cell.can_overlap()):
                    continue

                # Activate the option and run it until termination
                running = MarkovOption(starting_state=state,
                                       initiation_set=option.initiation_set,
                                       termination_function=option.termination_function,
                                       policy=option.policy,
                                       name=str(option))
                while True:
                    a = running.policy(state)
                    obs, reward, done, info = self.env.step(a)
                    env = self.env.unwrapped
                    state_next = (env.agent_dir, *reversed(env.agent_pos))
                    self.logger.debug(f"State: {state}, "
                                      f"Option: {running}, "
                                      f"Action: {a}, "
                                      f"Next State: {state_next}")
                    state = state_next
                    running.cumulant += γ**running.k * reward
                    running.k += 1
                    if running.termination_function(state):
                        break

                # Update option models
                option_state = (option_i, *running.starting_state)
                R[option_state] = running.cumulant
                P[(*option_state, *state)] = γ**running.k

        return N, R, P
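
A hedged usage sketch tying the model learner above to the planner from Example #1. The helpers make_env and make_hallway_options are illustrative placeholders, not defined in these listings.

# Illustrative driver; make_env and make_hallway_options are assumed helpers.
env = make_env('MiniGrid-FourRooms-v0')
learner = SMDPModelLearning(env, make_hallway_options(env), loglevel=20)

N = R = P = None
for episode in range(50):
    for N, R, P in learner.run_episode(N=N, R=R, P=P, γ=0.9):
        pass                      # models are refined at every option termination

# Evaluate the options with the learned models (Example #1)
planner = SMDPPlanning(env, R=R, P=P)
for V in planner.svi(θ=1e-9):
    pass                          # V approaches the optimal value over the options
# Alternatively, N, R, P = learner.get_true_models() gives the rollout-based models.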
Example #5
class IntraOptionQLearning(ControlDemon):
    """
        We turn now to the intra-option learning of option values and thus of optimal policies
        over options. If the options are semi-Markov, then again the SMDP methods described in
        Section 5.2 are probably the only feasible methods; a semi-Markov option must be completed
        before it can be evaluated in any way. But if the options are Markov and we are willing to
        look inside them, then we can consider intra-option methods. Just as in the case of model
        learning, intra-option methods for value learning are potentially more efficient than SMDP
        methods because they extract more training examples from the same experience.
    """
    
    def __init__(self,
                 env: MiniGridEnv,
                 options: List[Option],
                 target_policy: PolicyOverOptions,
                 feature_generator: FeatureGenerator,
                 weights,  # TODO: type?
                 lr: LearningRate = LearningRate(1, 1, 0),
                 gamma: float = 0.9,
                 loglevel: int = 20,
                 ):
        self.env = env
        # TODO: record all the rewards at the SMDP level?
        self.cumulant = lambda: 0
        
        super().__init__(target_policy=target_policy,
                         termination=lambda: 1,
                         eligibility=lambda: 1,
                         cumulant=self.cumulant,
                         feature=feature_generator,
                         behavioural_policy=target_policy,
                         weights=weights,
                         id=repr(self))
        
        self.options = self.Ω = options
        self.option_idx_dict = {str(o): i for i, o in enumerate(self.options)}
        
        self.lr = lr
        self.gamma = gamma
        
        self.logger = ProjectLogger(level=loglevel, printing=False)
    
    def advantage(self, state):
        """ Used as an unbiased estimator with reduced variance. """
        Q = self.predict(state)
        return Q - np.dot(self.π.pmf(state, Q), Q)
    
    def utility(self, state, option: Option) -> float:
        """ Utility of persisting with the same option vs picking another. """
        ω = self.option_idx_dict[str(option)]
        β = option.β.pmf(state)
        Q = self.predict(state)
        continuation_value = (1 - β) * Q[ω]
        termination_value = β * np.dot(self.π.pmf(state, Q), Q)
        return continuation_value + termination_value
    
    def loss(self,
             s0: np.ndarray,
             o: Option,
             r: float,
             s1: np.ndarray,
             done: bool):
        """ Calculates an Intra-Option Q-learning loss """
        # TODO: rewrite in terms of experience instead
        γ = self.gamma
        ω = self.option_idx_dict[str(o)]
        
        δ = r - self.predict(s0)[ω]
        if not done:
            δ += γ * self.utility(s1, o)
        return δ
    
    def update(self, state, reward, s_next, option: Option, done: bool):
        # To be implemented by a subclass; called per consistent option below
        raise NotImplementedError
    
    def learn_option_values(self, render: bool = False):
        
        env = self.env.unwrapped
        state = self.env.reset()
        executing_option = self.target_policy(state)
        done = False
        while not done:
            
            # Step through environment
            a = executing_option.policy(state)
            s_next, reward, done, info = self.env.step(a)
            
            action_name = list(env.actions)[a].name
            
            # TODO: structure experience in (s, a, r, s') tuples
            self.logger.debug(f"State: {state}, "
                              f"Option: {executing_option}, "
                              f"Action: {action_name}, "
                              f"Next State: {s_next}")
            
            if render:
                self.env.render()
                time.sleep(0.05)
            
            # Update option-values for every option consistent with `a`
            for option in self.options:
                if option.policy(state) == a:
                    self.update(state, reward, s_next, option, done)
            
            # Terminate the option
            if executing_option.termination(s_next) or done:
                executing_option = self.target_policy(s_next)
            
            # Reset the state
            state = s_next
        
        return self.env.step_count
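
For reference, loss and utility above implement the intra-option Q-learning TD error that learn_option_values applies to every option whose policy agrees with the action taken:

    δ = r + γ U(s', o) − Q(s, o)
    U(s', o) = (1 − β_o(s')) Q(s', o) + β_o(s') Σ_{o'} π(o' | s', Q) Q(s', o')

Note that the termination value in utility is an expectation under the target policy over options (the np.dot(self.π.pmf(state, Q), Q) term) rather than a hard max over Q(s', ·); with a greedy target policy the two coincide and this reduces to the standard intra-option Q-learning target.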