Example #1
class SMDPPlanning:
    """ Estimates value function given learned models R and P """
    def __init__(self,
                 env: MiniGridEnv,
                 R: np.ndarray,
                 P: np.ndarray,
                 loglevel: int = 10):
        self.env = env

        option_dim, self.state_space_dim = R.shape[0], R.shape[1:]
        state_space_flat = np.prod(self.state_space_dim)
        self.R = R.reshape((option_dim, state_space_flat))
        self.P = P.reshape((option_dim, state_space_flat, state_space_flat))
        self.V = np.zeros(state_space_flat)

        self.logger = ProjectLogger(level=loglevel, printing=True)

    def svi(self, θ: float = 1e-9):
        """ Synchronous Value Iteration over the learned option models.

        Estimates V by acting greedily w.r.t. V_hat (the current estimate
        of V), yielding the updated values after every sweep until the
        total change falls below θ.
        """
        δ = float('inf')

        while δ > θ:
            v_old = self.V
            self.V = (self.R + np.dot(self.P, self.V)).max(axis=0)
            δ = np.sum(np.abs(self.V - v_old))
            self.logger.debug(f'State-value delta: {δ}')
            yield self.V.reshape(self.state_space_dim)

        return self.V.reshape(self.state_space_dim)
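
The backup inside svi is the SMDP Bellman optimality equation over options,
V(s) = max_o [ R(s, o) + Σ_s' P(s' | s, o) V(s') ], applied synchronously to
every state. Below is a minimal, self-contained numpy sketch of the same
backup on a toy model; the 2-option/3-state shapes and the toy R and P are
made up for illustration and are not part of the class above.

import numpy as np

# Toy SMDP model: 2 options, 3 states. As in the learned models above, the
# discount γ^k is folded into P, so each row sums to less than one and the
# backup is a contraction.
R = np.array([[0.0, 0.5, 1.0],
              [0.1, 0.0, 0.8]])                # R[o, s]
P = 0.9 * np.stack([np.full((3, 3), 1 / 3),    # P[o, s, s']
                    np.eye(3)])

V = np.zeros(3)
delta, theta = float('inf'), 1e-9
while delta > theta:
    v_old = V
    # Synchronous backup: V(s) <- max_o [ R(s, o) + Σ_s' P(s' | s, o) V(s') ]
    V = (R + P @ V).max(axis=0)
    delta = np.abs(V - v_old).sum()
print(V.round(3))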
Example #2
 def __init__(self,
              env: MiniGridEnv,
              options: List[Option],
              target_policy: PolicyOverOptions,
              feature_generator: FeatureGenerator,
              weights,  # TODO: type?
              lr: LearningRate = LearningRate(1, 1, 0),
              gamma: float = 0.9,
              loglevel: int = 20,
              ):
     self.env = env
     # TODO: record all the rewards at the SMDP level?
     self.cumulant = lambda: 0
     
     super().__init__(target_policy=target_policy,
                      termination=lambda: 1,
                      eligibility=lambda: 1,
                      cumulant=self.cumulant,
                      feature=feature_generator,
                      behavioural_policy=target_policy,
                      weights=weights,
                      id=repr(self))
     
     self.options = self.Ω = options
     self.option_idx_dict = {str(o): i for i, o in enumerate(self.options)}
     
     self.lr = lr
     self.gamma = gamma
     
     self.logger = ProjectLogger(level=loglevel, printing=False)
Example #3
 def __init__(self, env: MiniGridEnv, options: Options, loglevel: int = 10):
     self.env = env
     self.options = options
     self.option_names_dict = {o.name: o for o in self.options}
     self.option_idx_dict = {
         name: i
         for i, name in enumerate(self.option_names_dict)
     }
     self.logger = ProjectLogger(level=loglevel, printing=True)
Example #4
    def __init__(self,
                 env: MiniGridEnv,
                 R: np.ndarray,
                 P: np.ndarray,
                 loglevel: int = 10):
        self.env = env

        option_dim, self.state_space_dim = R.shape[0], R.shape[1:]
        state_space_flat = np.prod(self.state_space_dim)
        self.R = R.reshape((option_dim, state_space_flat))
        self.P = P.reshape((option_dim, state_space_flat, state_space_flat))
        self.V = np.zeros(state_space_flat)

        self.logger = ProjectLogger(level=loglevel, printing=True)
Example #5
 def __init__(self,
              env: MiniGridEnv,
              critic: IntraOptionQLearning,
              actor: PolicyOverOptions,
              action_critic: IntraOptionActionLearning = None,
              gamma: float = 0.99,
              loglevel: int = 20):
     self.env = env
     self.γ = gamma
     self.critic = critic
     self.actor = actor
     self.logger = ProjectLogger(level=loglevel)
     if action_critic is not None:
         self.advantage_estimator = None
         self.action_critic = action_critic
     else:
         self.advantage_estimator = 'io'
          self.logger.info(f'Action-critic was not provided, '
                           f'so advantages for the policy gradient '
                           f'will be estimated instead')
Example #6
 def __init__(self,
              options: List[Option],
              rng: np.random.RandomState,
              loglevel: int):
     self.options = options
     self.option_names_dict = {str(o): o for o in self.options}
     self.option_idx_dict = {name: i for i, name in
                             enumerate(self.option_names_dict)}
     self.rng = rng
     self.logger = ProjectLogger(level=loglevel, printing=False)
Example #7
 def __init__(
         self,
         env: MiniGridEnv,
         feature_generator: NatureConvBody,
         critic: IntraOptionDeepQLearning,
         actor: PolicyOverOptions,
         optimizer: torch.optim.Optimizer,
         gamma: float = 0.99,
         loglevel: int = 20,
         rng: np.random.RandomState = np.random.RandomState(1),
 ):
     self.env = env
     self.γ = gamma
     # shared between both critic and actor
     self.feature_generator = feature_generator
     self.critic = critic
     self.actor = actor
     # self.network = torch.nn.Sequential(self.feature_generator, self.critic.critic)
     self.target_network = deepcopy(self.critic.critic)
     self.optimizer = optimizer
     self.logger = ProjectLogger(level=loglevel)
     self.rng = rng
Example #8
class SMDPValueLearning:
    """ Algorithms for finding an optimal policy over a set of options in SMDP.
    
    Treats each option as an indivisible unit.
    Does not work well in this setting, since the rooms are larger than in the original experiment
    and thus the probability of stumbling across a goal state, while performing primitive actions only is much smaller.
    Consider a case when the agent is at the hallway state. It can try to make a primitive action
    in the direction of the goal. However, at the next state it can choose to take an option that takes it
    either to the other hallway or back with probability 2/5. As the agent makes progress towards the goal state,
    it's more likely that the option will get activated along the way. Since no intra-option learning is happening,
    the value is not attributed to the states surrounding the goal.
    """
    def __init__(self,
                 env: MiniGridEnv,
                 options: Options,
                 policy: PolicyOverOptions,
                 loglevel: int = 20):
        self.env = env
        self.options = options
        self.option_names_dict = {o.name: o for o in self.options}
        self.option_idx_dict = {
            name: i
            for i, name in enumerate(self.option_names_dict)
        }

        self._policy = policy
        self.logger = ProjectLogger(level=loglevel, printing=False)

    def policy(self, Q, state, *args, **kwargs):
        """ Wraps the option chosen by the policy over options in a
        MarkovOption anchored at the state where it was initiated. """
        option = self._policy(Q, state, *args, **kwargs)
        return MarkovOption(starting_state=state,
                            initiation_set=option.initiation_set,
                            termination_function=option.termination,
                            policy=option.target_policy,
                            name=str(option))

    def q_learning(self,
                   n_episodes: int,
                   γ: float = 0.9,
                   Q: np.ndarray = None,
                   N: np.ndarray = None,
                   α: float = None,
                   render: bool = False):

        env = self.env.unwrapped
        n_options = len(self.options)
        state_space_dim = (4, env.width, env.height)
        dim = (n_options, *state_space_dim)

        if Q is None:
            Q = np.zeros(dim)
        if N is None:
            N = np.zeros(dim)

        for episode in range(n_episodes):

            self.env.reset()
            state = (env.agent_dir, *reversed(env.agent_pos))
            executing_option = self.policy(Q, state)
            done = False

            while not done:

                # Step through environment
                a = executing_option.policy(state)
                obs, reward, done, info = self.env.step(a)
                # TODO: infer the state of the agent from obs, i.e. make it POMDP
                s_next = (env.agent_dir, *reversed(env.agent_pos))

                if render:
                    action_name = list(env.actions)[a].name
                    self.logger.debug(f"State: {state}, "
                                      f"Option: {executing_option}, "
                                      f"Action: {action_name}, "
                                      f"Next State: {s_next}")
                    self.env.render()
                    time.sleep(0.05)

                # Accumulate the option's discounted return before
                # incrementing its step counter, so the first reward
                # is discounted by γ^0
                executing_option.cumulant += γ**executing_option.k * reward
                executing_option.k += 1

                # Check for termination condition and update action-values
                if executing_option.termination_function(s_next) == 1 or done:

                    start_state = (self.option_idx_dict[executing_option.name],
                                   *executing_option.starting_state)

                    # Determine the step-size
                    if α is None:
                        N[start_state] += 1
                        alpha = 1 / N[start_state]
                    else:
                        alpha = α

                    # Update Q in the direction of the optimal action
                    r = executing_option.cumulant
                    k = executing_option.k
                    o = randargmax(Q[(slice(None), *s_next)])
                    target = r + γ**k * Q[(o, *s_next)]
                    Q[start_state] += alpha * (target - Q[start_state])

                    # Choose the next option
                    executing_option = self.policy(Q, s_next)

                # Reset the state
                state = s_next
            yield Q, self.env.step_count

        return Q, self.env.step_count

    @staticmethod
    def plot_episode_duration(steps_per_episode):
        traces = list()
        for option_set, steps in steps_per_episode.items():
            traces.append(
                go.Scatter(
                    mode='lines',
                    y=steps,
                    name=option_set,
                ))

        layout = dict(height=700,
                      showlegend=True,
                      xaxis=dict(title='Episodes', ),
                      yaxis=dict(title='Steps per episode', ))
        return {'data': traces, 'layout': layout}
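
The update in q_learning above is the standard SMDP Q-learning backup,
applied once each time an option terminates:
Q(s, o) <- Q(s, o) + α [ G + γ^k max_o' Q(s', o') - Q(s, o) ],
where G is the return discounted inside the option and k its duration.
Below is a minimal tabular sketch of that single backup; the function name,
the (y, x)-style state tuples and the toy numbers are assumptions for
illustration, not part of the class above.

import numpy as np

def smdp_q_backup(Q, s, o, cumulant, k, s_next, alpha, gamma=0.9):
    """ One SMDP Q-learning backup, applied when option `o` terminates.

    `cumulant` is the return discounted inside the option,
    r_1 + γ r_2 + ... + γ^(k-1) r_k, and `k` is the option's duration.
    """
    target = cumulant + gamma ** k * Q[(slice(None), *s_next)].max()
    Q[(o, *s)] += alpha * (target - Q[(o, *s)])
    return Q

# Toy usage: 2 options over a 3x3 grid of (y, x) states
Q = np.zeros((2, 3, 3))
smdp_q_backup(Q, s=(0, 0), o=1, cumulant=0.81, k=2, s_next=(2, 2), alpha=0.5)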
Example #9
class SMDPModelLearning:
    """ Model learning in Semi-Markov Decision Process via MC sampling """
    def __init__(self, env: MiniGridEnv, options: Options, loglevel: int = 10):
        self.env = env
        self.options = options
        self.option_names_dict = {o.name: o for o in self.options}
        self.option_idx_dict = {
            name: i
            for i, name in enumerate(self.option_names_dict)
        }
        self.logger = ProjectLogger(level=loglevel, printing=True)

    def __str__(self):
        return 'SMDPModelLearning'

    def choose_option(self, state):
        """ Picks an option at random """
        options = [o for o in self.options if o.initiation_set[state] == 1]
        option = random.choice(options)
        return MarkovOption(starting_state=state,
                            initiation_set=option.initiation_set,
                            termination_function=option.termination_function,
                            policy=option.policy,
                            name=str(option))

    def run_episode(self,
                    N: np.ndarray = None,
                    R: np.ndarray = None,
                    P: np.ndarray = None,
                    γ: float = 0.9,
                    render: bool = False):

        env = self.env.unwrapped
        n_options = len(self.options)
        state_space_dim = (4, env.width, env.height)
        dim = (n_options, *state_space_dim)

        if R is None:
            R = np.zeros(dim)
            N = np.zeros(dim)
            P = np.zeros((n_options, *state_space_dim, *state_space_dim))

        self.env.reset()
        state = (self.env.agent_dir, *reversed(env.agent_pos))
        done = False
        executing_option = None

        while not done:

            if executing_option is None:
                executing_option = self.choose_option(state)

            a = executing_option.policy(state)
            obs, reward, done, info = self.env.step(a)
            state_next = (env.agent_dir, *reversed(env.agent_pos))

            if render:
                action_name = list(env.actions)[a].name
                self.logger.debug(f"State: {state}, "
                                  f"Option: {executing_option}, "
                                  f"Action: {action_name}, "
                                  f"Next State: {state_next}")
                self.env.render()
                time.sleep(0.05)

            # Accumulate the option's discounted return before incrementing
            # its step counter, so the first reward is discounted by γ^0
            executing_option.cumulant += γ**executing_option.k * reward
            executing_option.k += 1

            # Check for termination condition and update the model
            if executing_option.termination_function(state_next) == 1:
                option_state = (self.option_idx_dict[executing_option.name],
                                *executing_option.starting_state)
                # Update visitation counter
                N[option_state] += 1
                α = 1 / N[option_state]

                # Update reward matrix
                R[option_state] += α * (executing_option.cumulant -
                                        R[option_state])

                # Update probability transition matrix: shrink the whole row
                # first, then move probability mass onto the observed s'
                P[option_state] -= α * P[option_state]
                P[(*option_state, *state_next)] += α * (γ**executing_option.k)

                executing_option = None

            state = state_next
            yield N, R, P

        return N, R, P

    def get_true_models(self, seed: int = 1337, γ: float = 0.9):
        """ Learn true dynamics (P) and reward (R) models by unrolling each
        option for each state in its initiation set until termination
        # TODO: need to call multiple times if the environment is stochastic?
        """

        np.random.seed(seed)

        n_options = len(self.options)
        state_space_dim = (4, self.env.width, self.env.height)
        dim = (n_options, *state_space_dim)

        R = np.zeros(dim)
        N = np.zeros(dim)
        P = np.zeros((n_options, *state_space_dim, *state_space_dim))

        self.env.reset()

        for option_i, option in tqdm(enumerate(self.options),
                                     total=len(self.options)):
            for state, active in np.ndenumerate(option.initiation_set):

                # Checks if the state is in initiation set
                if not active:
                    continue

                env = self.env.unwrapped
                env.agent_dir, env.agent_pos = state[0], tuple(
                    reversed(state[1:]))
                cell = self.env.grid.get(*env.agent_pos)

                # Check if the state is valid for the agent to be in
                if not (cell is None or cell.can_overlap()):
                    continue

                # Activate an option and run until termination. Use a name
                # distinct from the raw `option` that the outer loops iterate
                # over, to avoid shadowing it.
                executing_option = MarkovOption(
                    starting_state=state,
                    initiation_set=option._initiation_set,
                    termination_set=option._termination_set,
                    policy=option._policy,
                    name=str(option))
                while True:
                    a = executing_option.policy(state)
                    obs, reward, done, info = self.env.step(a)
                    env = self.env.unwrapped
                    state_next = (env.agent_dir, *reversed(env.agent_pos))
                    self.logger.debug(f"State: {state}, "
                                      f"Option: {executing_option}, "
                                      f"Action: {a}, "
                                      f"Next State: {state_next}")
                    state = state_next
                    # Accumulate the discounted return before incrementing
                    # the step counter, so the first reward is discounted by γ^0
                    executing_option.cumulant += γ**executing_option.k * reward
                    executing_option.k += 1
                    if executing_option.termination_function(state):
                        break

                # Update option models
                option_state = (option_i, *executing_option.starting_state)
                R[option_state] = executing_option.cumulant
                P[(*option_state, *state)] = γ**executing_option.k

        return N, R, P
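
run_episode above is a Monte-Carlo estimator of the option models: each time
an option terminates, it moves R(s, o) towards the observed discounted return
and shifts the row P(s, o, ·) towards γ^k e_{s'}. Below is a compact sketch of
a single such update on flat state indices; the function and argument names
are assumptions for illustration only.

import numpy as np

def smdp_model_update(N, R, P, s, o, s_next, cumulant, k, gamma=0.9):
    """ Monte-Carlo update of an option model after one option execution.

    N and R have shape (n_options, n_states); P has shape
    (n_options, n_states, n_states). `cumulant` is the discounted return
    observed while the option ran and `k` its duration.
    """
    N[o, s] += 1
    alpha = 1 / N[o, s]
    # Reward model: running average of the observed discounted returns
    R[o, s] += alpha * (cumulant - R[o, s])
    # Transition model: shrink the whole row, then move mass onto the observed s'
    P[o, s] -= alpha * P[o, s]
    P[o, s, s_next] += alpha * gamma ** k
    return N, R, P

# Toy usage: a single option over 4 flattened states
N, R, P = np.zeros((1, 4)), np.zeros((1, 4)), np.zeros((1, 4, 4))
smdp_model_update(N, R, P, s=0, o=0, s_next=3, cumulant=0.72, k=3)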
Example #10
class IntraOptionQLearning(ControlDemon):
    """
        We turn now to the intra-option learning of option values and thus of optimal policies
        over options. If the options are semi-Markov, then again the SMDP methods described in
        Section 5.2 are probably the only feasible methods; a semi-Markov option must be completed
        before it can be evaluated in any way. But if the options are Markov and we are willing to
        look inside them, then we can consider intra-option methods. Just as in the case of model
        learning, intra-option methods for value learning are potentially more efficient than SMDP
        methods because they extract more training examples from the same experience.
    """
    
    def __init__(self,
                 env: MiniGridEnv,
                 options: List[Option],
                 target_policy: PolicyOverOptions,
                 feature_generator: FeatureGenerator,
                 weights,  # TODO: type?
                 lr: LearningRate = LearningRate(1, 1, 0),
                 gamma: float = 0.9,
                 loglevel: int = 20,
                 ):
        self.env = env
        # TODO: record all the rewards at the SMDP level?
        self.cumulant = lambda: 0
        
        super().__init__(target_policy=target_policy,
                         termination=lambda: 1,
                         eligibility=lambda: 1,
                         cumulant=self.cumulant,
                         feature=feature_generator,
                         behavioural_policy=target_policy,
                         weights=weights,
                         id=repr(self))
        
        self.options = self.Ω = options
        self.option_idx_dict = {str(o): i for i, o in enumerate(self.options)}
        
        self.lr = lr
        self.gamma = gamma
        
        self.logger = ProjectLogger(level=loglevel, printing=False)
    
    def advantage(self, state):
        """ Used as an unbiased estimator with reduced variance. """
        Q = self.predict(state)
        return Q - np.dot(self.π.pmf(state, Q), Q)
    
    def utility(self, state, option: Option) -> float:
        """ Utility of persisting with the same option vs picking another. """
        ω = self.option_idx_dict[str(option)]
        β = option.β.pmf(state)
        Q = self.predict(state)
        continuation_value = (1 - β) * Q[ω]
        termination_value = β * np.dot(self.π.pmf(state, Q), Q)
        return continuation_value + termination_value
    
    def loss(self,
             s0: np.ndarray,
             o: Option,
             r: float,
             s1: np.ndarray,
             done: bool):
        """ Calculates an Intra-Option Q-learning loss """
        # TODO: rewrite in terms of experience instead
        γ = self.gamma
        ω = self.option_idx_dict[str(o)]
        
        δ = r - self.predict(s0)[ω]
        if not done:
            δ += γ * self.utility(s1, o)
        return δ
    
    def update(self, experience: Transition):
        raise NotImplementedError
    
    def learn_option_values(self, render: bool = False):
        
        env = self.env.unwrapped
        state = self.env.reset()
        executing_option = self.target_policy(state)
        done = False
        while not done:
            
            # Step through environment
            a = executing_option.policy(state)
            s_next, reward, done, info = self.env.step(a)
            
            action_name = list(env.actions)[a].name
            
            # TODO: structure experience in (s, a, r, s') tuples
            self.logger.debug(f"State: {state}, "
                              f"Option: {executing_option}, "
                              f"Action: {action_name}, "
                              f"Next State: {s_next}")
            
            if render:
                self.env.render()
                time.sleep(0.05)
            
            # Update option-values for every option consistent with `a`
            for option in self.options:
                if option.policy(state) == a:
                    self.update(Transition(state, option, reward, s_next, done))
            
            # Terminate the option
            if executing_option.termination(s_next) or done:
                executing_option = self.target_policy(s_next)
            
            # Reset the state
            state = s_next
        
        return self.env.step_count
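
For reference, the backup that loss and learn_option_values implement is the
intra-option Q-learning update: after every primitive step, every option ω
whose policy would have chosen the executed action is updated by
Q(s, ω) <- Q(s, ω) + α [ r + γ U(s', ω) - Q(s, ω) ], with
U(s', ω) = (1 - β_ω(s')) Q(s', ω) + β_ω(s') max_ω' Q(s', ω').
The tabular sketch below assumes a plain Q table and a list of
(policy, termination) pairs; none of these names come from the class above.

import numpy as np

def intra_option_q_update(Q, s, a, r, s_next, done, options, alpha, gamma=0.9):
    """ One intra-option Q-learning step on a tabular Q of shape
    (n_options, n_states). `options` is a list of (policy, termination) pairs,
    where policy(s) returns an action and termination(s) a probability. """
    for w, (policy, termination) in enumerate(options):
        if policy(s) != a:      # only options consistent with the action taken
            continue
        beta = termination(s_next)
        u = (1 - beta) * Q[w, s_next] + beta * Q[:, s_next].max()
        target = r if done else r + gamma * u
        Q[w, s] += alpha * (target - Q[w, s])
    return Q

# Toy usage: 2 options over 5 states, both of which take action 0 everywhere
options = [(lambda s: 0, lambda s: 0.5), (lambda s: 0, lambda s: 1.0)]
Q = np.zeros((2, 5))
intra_option_q_update(Q, s=1, a=0, r=1.0, s_next=2, done=False,
                      options=options, alpha=0.1)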
Example #11
class OptionCriticNetwork:
    """
    
    Here the weights for both critic and actor are learned by a single NN
    with multiple heads.
    """
    def __init__(
            self,
            env: MiniGridEnv,
            feature_generator: NatureConvBody,
            critic: IntraOptionDeepQLearning,
            actor: PolicyOverOptions,
            optimizer: torch.optim.Optimizer,
            gamma: float = 0.99,
            loglevel: int = 20,
            rng: np.random.RandomState = np.random.RandomState(1),
    ):
        self.env = env
        self.γ = gamma
        # shared between both critic and actor
        self.feature_generator = feature_generator
        self.critic = critic
        self.actor = actor
        # self.network = torch.nn.Sequential(self.feature_generator, self.critic.critic)
        self.target_network = deepcopy(self.critic.critic)
        self.optimizer = optimizer
        self.logger = ProjectLogger(level=loglevel)
        self.rng = rng
        # TODO: check if the weights of the `network` change
        # self.target_network.load_state_dict(self.network.state_dict())

    def learn(self, config):

        # Trackers
        cumulant = 0.
        duration = 0
        option_switches = 0
        avgduration = 0.

        # Initialize
        s0 = self.env.reset()
        s0 = f.normalize(s0, dim=(2, 3))
        φ0 = self.feature_generator(s0)
        Q = self.critic(φ0)
        option = self.actor(φ0, action_values=Q)
        ω = self.actor.option_idx_dict[str(option)]

        # Run until episode termination
        done = False
        while not done:
            π = option.π(φ0)
            # print(π)
            dist = torch.distributions.Categorical(probs=π)
            action, entropy = dist.sample(), dist.entropy()

            s1, r, done, info = self.env.step(int(action))
            s1 = f.normalize(s1, dim=(2, 3))
            φ1 = self.feature_generator(s1)
            # with torch.no_grad():
            target_Q = self.critic(φ1)
            β = option.β(φ1)
            experience = TorchTransition(s0=s0,
                                         o=option,
                                         r=r,
                                         s1=s1,
                                         done=done,
                                         φ0=φ0,
                                         φ1=φ1,
                                         Q=Q,
                                         target_Q=target_Q,
                                         π=π,
                                         β=β)
            q_loss = self.critic.loss(experience).mean()

            critique = self.critic.estimate_advantage(experience)
            pi_loss = -(torch.log(π)[:, action] *
                        critique.detach()) - config.entropy_weight * entropy
            pi_loss = pi_loss.mean()
            termination_advantage = self.critic.advantage(φ1, Q)[:, ω]
            beta_loss = (β * (termination_advantage.detach() + config.η) *
                         (1 - done)).mean()

            self.optimizer.zero_grad()
            # print(pi_loss + q_loss + beta_loss)
            # print(pi_loss, q_loss, beta_loss, '\n')
            (pi_loss + q_loss + beta_loss).mean().backward(retain_graph=True)
            torch.nn.utils.clip_grad_norm_(self.feature_generator.parameters(),
                                           config.gradient_clip)
            torch.nn.utils.clip_grad_norm_(self.critic.critic.parameters(),
                                           config.gradient_clip)
            torch.nn.utils.clip_grad_norm_(option.π.parameters(),
                                           config.gradient_clip)
            torch.nn.utils.clip_grad_norm_(option.β.parameters(),
                                           config.gradient_clip)
            self.optimizer.step()

            # self.plot_grad_flow(self.feature_generator.named_parameters())
            # plt.show()
            # plt.close()
            # self.plot_grad_flow(self.critic.critic.named_parameters())
            # plt.show()
            # plt.close()

            # Choose another option in case the current one terminates
            if β > self.rng.uniform():
                option = self.actor(φ1, action_values=Q)
                ω = self.actor.option_idx_dict[str(option)]
                option_switches += 1
                avgduration += (1. / option_switches) * (duration -
                                                         avgduration)
                duration = 0

            s0 = s1
            φ0 = φ1
            Q = target_Q

            if self.env.step_count % 1000 == 0:
                print(self.env.step_count, self.env)

            # if self.env.step_count % config.target_network_update_freq == 0:
            #     self.target_network.load_state_dict(
            #         self.critic.critic.state_dict())

            cumulant += r
            duration += 1

        self.logger.info(f'steps {self.env.unwrapped.step_count}\n'
                         f'cumulant {round(cumulant, 2)}\n'
                         f'avg. duration {round(avgduration, 2)}\n'
                         f'switches {option_switches}\n'
                         f'critic lr {self.critic.lr.rate}\n'
                         f'')

    def plot_grad_flow(self, named_parameters):
        '''Plots the gradients flowing through different layers in the net during training.
        Can be used for checking for possible gradient vanishing / exploding problems.

        Usage: Plug this function in Trainer class after loss.backwards() as
        "plot_grad_flow(self.model.named_parameters())" to visualize the gradient flow'''
        ave_grads = []
        max_grads = []
        layers = []
        for n, p in named_parameters:
            if (p.requires_grad) and ("bias" not in n):
                layers.append(n)
                ave_grads.append(p.grad.abs().mean())
                max_grads.append(p.grad.abs().max())
        plt.bar(np.arange(len(max_grads)),
                max_grads,
                alpha=0.1,
                lw=1,
                color="c")
        plt.bar(np.arange(len(max_grads)),
                ave_grads,
                alpha=0.1,
                lw=1,
                color="b")
        plt.hlines(0, 0, len(ave_grads) + 1, lw=2, color="k")
        plt.xticks(range(0, len(ave_grads), 1), layers, rotation="vertical")
        plt.xlim(left=0, right=len(ave_grads))
        plt.ylim(bottom=-0.001,
                 top=0.02)  # zoom in on the lower gradient regions
        plt.xlabel("Layers")
        plt.ylabel("average gradient")
        plt.title("Gradient flow")
        plt.grid(True)
        plt.legend([
            Line2D([0], [0], color="c", lw=4),
            Line2D([0], [0], color="b", lw=4),
            Line2D([0], [0], color="k", lw=4)
        ], ['max-gradient', 'mean-gradient', 'zero-gradient'])
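
learn above combines three losses per transition: a TD loss on the critic
head, a policy-gradient loss on the intra-option policy weighted by a detached
advantage estimate (plus an entropy bonus), and a termination loss weighted by
the termination advantage plus the deliberation margin η. The sketch below
strips this down to toy tensors for a single transition; every tensor and name
in it (q, pi, beta, eta, ...) is a stand-in for illustration, not the
project's API.

import torch

n_options, n_actions, w = 3, 4, 1          # w: index of the executing option
q = torch.randn(1, n_options, requires_grad=True)         # critic head: Q(s, ·)
logits = torch.randn(1, n_actions, requires_grad=True)    # policy head of option w
pi = torch.softmax(logits, dim=-1)                         # π_w(·|s)
beta = torch.sigmoid(torch.randn(1, requires_grad=True))   # termination head: β_w(s')
action, r, done, gamma = 2, 0.0, False, 0.99
eta, entropy_weight = 0.01, 0.01

with torch.no_grad():
    q_next = torch.randn(1, n_options)                     # target-network Q(s', ·)
    u = (1 - beta) * q_next[:, w] + beta * q_next.max(dim=-1).values
    td_target = r + gamma * u * (1 - float(done))
    advantage = td_target - q[:, w]                        # critique for the policy loss
    termination_advantage = q_next[:, w] - q_next.max(dim=-1).values

dist = torch.distributions.Categorical(probs=pi)
q_loss = 0.5 * (q[:, w] - td_target).pow(2).mean()
pi_loss = (-torch.log(pi[:, action]) * advantage
           - entropy_weight * dist.entropy()).mean()
# Minimising β·(A + η) keeps β low while the option's value is still competitive
beta_loss = (beta * (termination_advantage + eta) * (1 - float(done))).mean()
(q_loss + pi_loss + beta_loss).backward()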
Example #12
from hrl.frameworks.options.hard_coded_options import HallwayOption, PrimitiveOption
from hrl.project_logger import ProjectLogger
from hrl.utils import cache
from hrl.visualization.plotter_one_hot import PlotterOneHot
""" Evaluate the benefits of planning with options. """

SAVEPATH = Path(f'{EXPERIMENT_DIR}/SMDP_planning')

if __name__ == '__main__':

    # Create environment
    env = FullyObsWrapper(FourRooms(goal_pos=(15, 15)))

    # Create loggers
    LOGLEVEL = 20
    logger = ProjectLogger(level=LOGLEVEL, printing=False)
    logger.critical(env)
    plotter = PlotterOneHot(env=env)
    SAVEPATH /= env.unwrapped.__class__.__name__
    SAVEPATH.mkdir(parents=True, exist_ok=True)

    # Create hard-coded options
    options = [
        HallwayOption(o, env.observation_space.shape[::-1])
        for o in sorted(HallwayOption.hallway_options)
    ]
    options += [
        PrimitiveOption(o, env.observation_space.shape[::-1])
        for o in sorted(PrimitiveOption.primitive_options)
    ]
Example #13
            ))

    return options


if __name__ == '__main__':
    # Create environment
    tasks = iter([(15, 15), (10, 17), (17, 10)])
    env = OneHotObsWrapper(
        SimplifyActionSpace(FourRooms(agent_pos=(1, 1), goal_pos=next(tasks))))
    env.unwrapped.max_steps = 1000000
    # env.step = partial(stochastic_step, env)

    # Set up loggers
    loglevel = 10
    logger = ProjectLogger(level=loglevel, printing=False)
    plotter = PlotterOneHot(env)
    # db = redis.StrictRedis(port=6379)

    # Create options
    rng = np.random.RandomState(1338)
    n = 8
    options = create_options(env, n=n, rng=rng)

    # Define actors
    actor = EgreedyPolicy(ε=0.02, rng=rng, options=options, loglevel=20)

    # Define critics
    α = LearningRate(start_rate=0.5, min_rate=0.4, decay=0.5 / 10000)

    class Feature:
Example #14
class IntraOptionModelLearning:
    
    def __init__(self,
                 env: MiniGridEnv,
                 options: Options,
                 loglevel: int = 20):
        self.env = env
        self.options = options
        self.option_names_dict = {o.name: o for o in self.options}
        self.option_idx_dict = {name: i for i, name in
                                enumerate(self.option_names_dict)}
        self.logger = ProjectLogger(level=loglevel, printing=False)
    
    def __str__(self):
        return 'IntraOptionModelLearning'
    
    def choose_option(self, state):
        """ Picks an option at random """
        options = [o for o in self.options if o.initiation_set[state] == 1]
        return random.choice(options)
    
    def run_episode(self,
                    N: np.ndarray = None,
                    R: np.ndarray = None,
                    P: np.ndarray = None,
                    γ: float = 0.9,
                    α: float = None,
                    render: bool = False):
        
        env = self.env.unwrapped
        n_options = len(self.options)
        state_space = (4, env.width, env.height)
        dim = (n_options, *state_space)
        
        if R is None:
            R = np.zeros(dim)
            N = np.zeros(dim)
            P = np.zeros((n_options, *state_space, *state_space))
        
        state = self.env.reset()
        done = False
        executing_option = None
        
        while not done:
            
            if executing_option is None:
                executing_option = self.choose_option(state)
            
            a = executing_option.policy(state)
            s_next, reward, done, info = self.env.step(a)
            
            if render:
                action_name = list(env.actions)[a].name
                self.logger.debug(f"State: {state}, "
                                  f"Option: {executing_option}, "
                                  f"Action: {action_name}, "
                                  f"Next State: {s_next}")
                self.env.render()
                time.sleep(0.05)
            
            # Update model for every option consistent with last action taken
            for option in self.options:
                
                if option.policy(state) != a:
                    continue
                
                o = self.option_idx_dict[option.name]
                option_state = (o, *state)
                
                # Update visitation counter
                if α is None:
                    N[option_state] += 1
                    alpha = 1 / N[option_state]
                else:
                    alpha = α
                
                # Update reward matrix
                β = option.termination_function(s_next)
                target = reward + γ * (1 - β) * R[(o, *s_next)]
                R[option_state] += alpha * (target - R[option_state])
                
                # Update probability transition matrix: bootstrap on the
                # option's transition model from s_next
                target = γ * (1 - β) * P[(o, *s_next)]
                P[option_state] += alpha * (target - P[option_state])
                P[(o, *state, *s_next)] += alpha * γ * β
            
            if executing_option.termination_function(s_next) == 1:
                executing_option = None
            
            state = s_next
            yield N, R, P
        
        return N, R, P
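
run_episode above applies, after every primitive step and for every option ω
consistent with the action taken, the intra-option model updates
R(s, ω) <- R(s, ω) + α [ r + γ (1 - β_ω(s')) R(s', ω) - R(s, ω) ] and
P(s, ω, x) <- P(s, ω, x) + α [ γ (1 - β_ω(s')) P(s', ω, x) + γ β_ω(s') 1{x = s'} - P(s, ω, x) ].
Below is a compact sketch of one such update on flat state indices; the
function and argument layout are assumptions for illustration only.

import numpy as np

def intra_option_model_update(R, P, s, s_next, r, beta, w, alpha, gamma=0.9):
    """ One intra-option model update for option index `w`.

    R has shape (n_options, n_states); P has shape
    (n_options, n_states, n_states). `beta` is the option's termination
    probability at s_next. """
    # Reward model: bootstrap on the option's own reward model at s'
    r_target = r + gamma * (1 - beta) * R[w, s_next]
    R[w, s] += alpha * (r_target - R[w, s])
    # Transition model: bootstrap on P(s', w, ·), then add mass on s'
    # in case the option terminates there
    p_target = gamma * (1 - beta) * P[w, s_next]
    P[w, s] += alpha * (p_target - P[w, s])
    P[w, s, s_next] += alpha * gamma * beta
    return R, P

# Toy usage: a single option over 4 states
R, P = np.zeros((1, 4)), np.zeros((1, 4, 4))
intra_option_model_update(R, P, s=0, s_next=1, r=0.0, beta=0.25, w=0, alpha=0.1)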
Example #15
class OptionCriticAgent:
    """  Actor-Critic style agent for learning options in a differentiable way.
    
    Architecture Overview:
    The option policies parameterized by θ, termination functions
    parameterized by υ and policy over options µθ belong to the actor part
    of the system while the critic consists of Q_U (action-value upon entering)
    and A_Ω (advantages of the policy over options).
    Policy over options is learned via Intra-Option Q-Learning, but could
    also be learned using policy gradients at SMDP level.
    The algorithm uses variations of Policy Gradients theorem, to learn option's
    policies and termination functions from a single stream of experience.
    """

    # TODO: add deliberation cost

    def __init__(self,
                 env: MiniGridEnv,
                 critic: IntraOptionQLearning,
                 actor: PolicyOverOptions,
                 action_critic: IntraOptionActionLearning = None,
                 gamma: float = 0.99,
                 loglevel: int = 20):
        self.env = env
        self.γ = gamma
        self.critic = critic
        self.actor = actor
        self.logger = ProjectLogger(level=loglevel)
        if action_critic is not None:
            self.advantage_estimator = None
            self.action_critic = action_critic
        else:
            self.advantage_estimator = 'io'
            self.logger.info(f'Action-critic was not provided, '
                             f'so advantages for the policy gradient '
                             f'will be estimated instead')

    def estimate_advantages(self, state, option: Option, reward: float,
                            s_next):
        ω = self.actor.option_idx_dict[str(option)]
        if self.advantage_estimator == 'io':
            # Intra-Option Advantage Estimator
            Q = self.critic(s_next)[ω]
            advantage = reward + self.γ * Q - self.critic(state)[ω]
        elif self.advantage_estimator == 'augmented':
            # Augmented Advantage Estimator
            # FIXME: utility should be calculated wrt to the next option!
            U = self.critic.utility(state, option)
            advantage = reward + self.γ * U - self.critic(state)[ω]
        else:
            raise ValueError(f'Unknown estimator {self.advantage_estimator}')
        return advantage

    def learn(self, baseline: bool = False, render: bool = False):

        # Trackers
        cumulant = 0.
        duration = 0
        option_switches = 0
        avgduration = 0.

        # Initialize s0 and pick an option
        s0 = self.env.reset()

        option = self.actor(s0, action_values=self.critic(s0))
        ω = self.actor.option_idx_dict[str(option)]

        # Run until episode termination
        done = False
        while not done:
            if render:
                self.env.render()
                time.sleep(0.05)

            # Take action (a), observe next state (s1) and reward (r)
            a = option.π(s0)
            s1, r, done, _ = self.env.step(a)
            experience = Transition(s0, option, r, s1, done)

            # Option evaluation step
            self.critic.update(experience)
            if self.advantage_estimator is None:
                self.action_critic.update(s0, a, r, s1, option, done)

            # Option improvement step
            if not isinstance(option.π, PrimitivePolicy):
                if self.advantage_estimator is None:
                    critique = self.action_critic(s0, ω, a)
                    if baseline:
                        critique -= self.critic(s0)[ω]
                else:
                    critique = self.estimate_advantages(s0, option, r, s1)

                if critique:
                    option.π.update(s0, a, critique)

                termination_advantage = self.critic.advantage(s1)[ω]
                if termination_advantage:
                    option.β.update(s1, termination_advantage)

            # Choose another option in case the current one terminates
            if option.termination(s1):
                option = self.actor(s1, action_values=self.critic(s1))
                ω = self.actor.option_idx_dict[str(option)]
                option_switches += 1
                avgduration += (1. / option_switches) * (duration -
                                                         avgduration)
                duration = 0

            s0 = s1
            cumulant += r
            duration += 1

        self.logger.info(f'steps {self.env.unwrapped.step_count}\n'
                         f'cumulant {round(cumulant, 2)}\n'
                         f'avg. duration {round(avgduration, 2)}\n'
                         f'switches {option_switches}\n'
                         f'critic lr {self.critic.lr.rate}\n'
                         f'')
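
The two estimators in estimate_advantages correspond to
critique = r + γ Q(s', ω) - Q(s, ω) (intra-option) and
critique = r + γ U(s', ω) - Q(s, ω) (augmented), where U mixes the
continuation and termination values; the class's FIXME flags that its current
utility term is not computed with the right arguments, so the sketch below
uses the common form, with U evaluated at the next state. Both functions work
on a plain tabular Q and are placeholders for illustration, not the critics
used above.

import numpy as np

def intra_option_advantage(Q, s, w, r, s_next, gamma=0.99):
    """ critique = r + γ Q(s', ω) - Q(s, ω) """
    return r + gamma * Q[w, s_next] - Q[w, s]

def augmented_advantage(Q, s, w, r, s_next, beta, gamma=0.99):
    """ critique = r + γ U(s', ω) - Q(s, ω), with
    U(s', ω) = (1 - β) Q(s', ω) + β max_ω' Q(s', ω'). """
    u = (1 - beta) * Q[w, s_next] + beta * Q[:, s_next].max()
    return r + gamma * u - Q[w, s]

# Toy usage: 2 options over 5 states
Q = np.random.default_rng(0).normal(size=(2, 5))
a_io = intra_option_advantage(Q, s=0, w=1, r=0.0, s_next=1)
a_aug = augmented_advantage(Q, s=0, w=1, r=0.0, s_next=1, beta=0.3)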
Example #16
     # ReseedWrapper
     env = Torch(FullyObsWrapper(SimplifyActionSpace(env)))
     # env.step = partial(stochastic_step, env)
     return env
 
 
 env = setup_env(FourRooms(goal_pos=tasks.pop(0)))
 env.unwrapped.max_steps = 1000000
 obs = env.reset()
 n_states = env.observation_space
 n_actions = env.action_space.n + 1
 
 # Set up loggers
 # TODO: use RLlog
 loglevel = 20
 logger = ProjectLogger(level=loglevel, printing=False)
 plotter = PlotterOneHot(env)
 db = redis.StrictRedis(port=6379)
 logger.critical(env)
 
 # Define a network shared across options' policies and terminations,
 # as well as the critic
 net = NatureConvBody(in_channels=3)
 params = [net.parameters()]
 
 # Create options
 rng = np.random.RandomState(1338)
 n_options = 8
 options, options_params = create_options(n_options, net.feature_dim,
                                          env.action_space.n)
 
Example #17
from hrl.frameworks.options.intra_option import IntraOptionValueLearning
from hrl.project_logger import ProjectLogger
from hrl.visualization import PlotterOneHot

SAVEPATH = Path(f'{EXPERIMENT_DIR}/value_learning')

if __name__ == '__main__':

    # Create environment
    tasks = iter([(15, 15), (10, 17), (17, 10), (17, 1), (8, 8)])
    env = FullyObsWrapper(FourRooms(goal_pos=next(tasks)))
    env.unwrapped.max_steps = 1000000

    # Create loggers
    LOGLEVEL = 10
    logger = ProjectLogger(level=LOGLEVEL, printing=False)
    logger.critical(env)
    plotter = PlotterOneHot(env=env)
    SAVEPATH /= env.unwrapped.__class__.__name__
    SAVEPATH.mkdir(parents=True, exist_ok=True)

    # Create hard-coded options
    options = [
        HallwayOption(o, env.observation_space.shape[::-1])
        for o in sorted(HallwayOption.hallway_options)
    ]
    options += [
        PrimitiveOption(o, env.observation_space.shape[::-1])
        for o in sorted(PrimitiveOption.primitive_options)
    ]