class SMDPPlanning:
    """ Estimates the value function given learned models R and P """
    
    def __init__(self,
                 env: MiniGridEnv,
                 R: np.ndarray,
                 P: np.ndarray,
                 loglevel: int = 10):
        self.env = env
        option_dim, self.state_space_dim = R.shape[0], R.shape[1:]
        state_space_flat = np.prod(self.state_space_dim)
        self.R = R.reshape((option_dim, state_space_flat))
        self.P = P.reshape((option_dim, state_space_flat, state_space_flat))
        self.V = np.zeros(state_space_flat)
        self.logger = ProjectLogger(level=loglevel, printing=True)
    
    def svi(self, θ: float = 1e-9):
        """ Synchronous Value Iteration.
        Estimates V by acting greedily w.r.t. V_hat (the current estimate of V).
        """
        δ = float('inf')
        while δ > θ:
            v_old = self.V
            self.V = (self.R + np.dot(self.P, self.V)).max(axis=0)
            δ = np.sum(np.abs(self.V - v_old))
            self.logger.debug(f'State-value delta: {δ}')
            yield self.V.reshape(self.state_space_dim)
        return self.V.reshape(self.state_space_dim)
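# Usage sketch (illustrative, not part of the original module): SMDP planning
# here is value iteration over the option models,
#     V(s) <- max_o [ R(s, o) + Σ_s' P(s' | s, o) V(s') ],
# which `svi` computes in vectorised form. R and P are assumed to come from
# SMDPModelLearning (defined further below); the tolerance is arbitrary.
def _demo_smdp_planning(env, options, θ: float = 1e-6):
    model_learner = SMDPModelLearning(env, options)
    N, R, P = model_learner.get_true_models()
    planner = SMDPPlanning(env, R=R, P=P)
    V = None
    for V in planner.svi(θ=θ):
        pass  # each yield is the current value estimate, reshaped to (4, W, H)
    return V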
    def __init__(self,
                 options: List[Option],
                 rng: np.random.RandomState,
                 loglevel: int):
        self.options = options
        self.option_names_dict = {str(o): o for o in self.options}
        self.option_idx_dict = {name: i for i, name in enumerate(self.option_names_dict)}
        self.rng = rng
        self.logger = ProjectLogger(level=loglevel, printing=False)
class SMDPValueLearning:
    """ Algorithms for finding an optimal policy over a set of options in an SMDP.
    Treats each option as an indivisible unit.
    
    Does not work well in this setting, since the rooms are larger than in the
    original experiment and thus the probability of stumbling across a goal
    state while performing primitive actions only is much smaller.
    
    Consider a case when the agent is at the hallway state. It can try to make
    a primitive action in the direction of the goal. However, at the next state
    it can choose to take an option that takes it either to the other hallway
    or back, with probability 2/5. As the agent makes progress towards the goal
    state, it is more likely that an option will get activated along the way.
    Since no intra-option learning is happening, the value is not attributed to
    the states surrounding the goal.
    """
    
    def __init__(self,
                 env: MiniGridEnv,
                 options: Options,
                 policy: PolicyOverOptions,
                 loglevel: int = 20):
        self.env = env
        self.options = options
        self.option_names_dict = {o.name: o for o in self.options}
        self.option_idx_dict = {
            name: i for i, name in enumerate(self.option_names_dict)
        }
        self._policy = policy
        self.logger = ProjectLogger(level=loglevel, printing=False)
    
    def policy(self, state, *args, **kwargs):
        option = self._policy(state, *args, **kwargs)
        return MarkovOption(starting_state=state,
                            initiation_set=option.initiation_set,
                            termination_function=option.termination,
                            policy=option.target_policy,
                            name=str(option))
    
    def q_learning(self,
                   n_episodes: int,
                   γ: float = 0.9,
                   Q: np.ndarray = None,
                   N: np.ndarray = None,
                   α: float = None,
                   render: bool = False):
        env = self.env.unwrapped
        n_options = len(self.options)
        state_space_dim = (4, env.width, env.height)
        dim = (n_options, *state_space_dim)
        if Q is None:
            N = np.zeros(dim)
            Q = np.zeros(dim)
        
        for episode in range(n_episodes):
            self.env.reset()
            state = (env.agent_dir, *reversed(env.agent_pos))
            executing_option = self.policy(Q, state)
            
            done = False
            while not done:
                # Step through the environment
                a = executing_option.policy(state)
                obs, reward, done, info = self.env.step(a)
                # TODO: infer the state of the agent from obs, i.e. make it POMDP
                s_next = (env.agent_dir, *reversed(env.agent_pos))
                
                if render:
                    action_name = list(env.actions)[a].name
                    self.logger.debug(f"State: {state}, "
                                      f"Option: {executing_option}, "
                                      f"Action: {action_name}, "
                                      f"Next State: {s_next}")
                    self.env.render()
                    time.sleep(0.05)
                
                # Update option statistics
                executing_option.k += 1
                executing_option.cumulant += γ**executing_option.k * reward
                
                # Check for termination condition and update action-values
                if executing_option.termination_function(s_next) == 1 or done:
                    start_state = (self.option_idx_dict[executing_option.name],
                                   *executing_option.starting_state)
                    
                    # Determine the step-size
                    if α is None:
                        N[start_state] += 1
                        alpha = 1 / N[start_state]
                    else:
                        alpha = α
                    
                    # Update Q in the direction of the optimal action
                    r = executing_option.cumulant
                    k = executing_option.k
                    o = randargmax(Q[(slice(None), *s_next)])
                    target = r + γ**k * Q[(o, *s_next)]
                    Q[start_state] += alpha * (target - Q[start_state])
                    
                    # Choose the next option
                    executing_option = self.policy(Q, s_next)
                
                # Move on to the next state
                state = s_next
            
            yield Q, self.env.step_count
        
        return Q, self.env.step_count
    
    @staticmethod
    def plot_episode_duration(steps_per_episode):
        traces = list()
        for option_set, steps in steps_per_episode.items():
            traces.append(
                go.Scatter(
                    mode='lines',
                    y=steps,
                    name=option_set,
                ))
        layout = dict(height=700,
                      showlegend=True,
                      xaxis=dict(title='Episodes'),
                      yaxis=dict(title='Steps per episode'))
        return {'data': traces, 'layout': layout}
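# Usage sketch (illustrative, not part of the original module): macro-Q
# learning over a fixed option set. The ε-greedy policy over options mirrors
# the experiment scripts further down; whether EgreedyPolicy exposes the
# interface that `policy()` expects of a PolicyOverOptions is an assumption.
def _demo_smdp_q_learning(env, options, n_episodes: int = 100):
    rng = np.random.RandomState(0)
    policy = EgreedyPolicy(ε=0.1, rng=rng, options=options, loglevel=20)
    learner = SMDPValueLearning(env, options, policy, loglevel=20)
    Q = steps = None
    for Q, steps in learner.q_learning(n_episodes=n_episodes, γ=0.9):
        pass  # one (Q, step_count) pair is yielded per episode
    return Q, steps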
class SMDPModelLearning:
    """ Model learning in a Semi-Markov Decision Process via MC sampling """
    
    def __init__(self,
                 env: MiniGridEnv,
                 options: Options,
                 loglevel: int = 10):
        self.env = env
        self.options = options
        self.option_names_dict = {o.name: o for o in self.options}
        self.option_idx_dict = {
            name: i for i, name in enumerate(self.option_names_dict)
        }
        self.logger = ProjectLogger(level=loglevel, printing=True)
    
    def __str__(self):
        return 'SMDPModelLearning'
    
    def choose_option(self, state):
        """ Picks an option at random """
        options = [o for o in self.options if o.initiation_set[state] == 1]
        option = random.choice(options)
        return MarkovOption(starting_state=state,
                            initiation_set=option.initiation_set,
                            termination_function=option.termination_function,
                            policy=option.policy,
                            name=str(option))
    
    def run_episode(self,
                    N: np.ndarray = None,
                    R: np.ndarray = None,
                    P: np.ndarray = None,
                    γ: float = 0.9,
                    render: bool = False):
        env = self.env.unwrapped
        n_options = len(self.options)
        state_space_dim = (4, env.width, env.height)
        dim = (n_options, *state_space_dim)
        if R is None:
            R = np.zeros(dim)
            N = np.zeros(dim)
            P = np.zeros((n_options, *state_space_dim, *state_space_dim))
        
        self.env.reset()
        state = (self.env.agent_dir, *reversed(env.agent_pos))
        
        done = False
        executing_option = None
        while not done:
            if executing_option is None:
                executing_option = self.choose_option(state)
            
            a = executing_option.policy(state)
            obs, reward, done, info = self.env.step(a)
            state_next = (env.agent_dir, *reversed(env.agent_pos))
            
            if render:
                action_name = list(env.actions)[a].name
                self.logger.debug(f"State: {state}, "
                                  f"Option: {executing_option}, "
                                  f"Action: {action_name}, "
                                  f"Next State: {state_next}")
                self.env.render()
                time.sleep(0.05)
            
            executing_option.k += 1
            executing_option.cumulant += γ**executing_option.k * reward
            
            # Check for termination condition and update the model
            if executing_option.termination_function(state_next) == 1:
                option_state = (self.option_idx_dict[executing_option.name],
                                *executing_option.starting_state)
                
                # Update visitation counter
                N[option_state] += 1
                α = 1 / N[option_state]
                
                # Update reward matrix
                R[option_state] += α * (executing_option.cumulant - R[option_state])
                
                # Update probability transition matrix
                P[(*option_state, *state_next)] += α * (γ**executing_option.k)
                P[option_state] -= α * P[option_state]
                
                executing_option = None
            
            state = state_next
            
            yield N, R, P
        
        return N, R, P
    
    def get_true_models(self, seed: int = 1337, γ: float = 0.9):
        """ Learn true dynamics (P) and reward (R) models by unrolling each
        option, for each state in its initiation set, until termination.
        # TODO: need to call multiple times if the environment is stochastic?
        """
        np.random.seed(seed)
        n_options = len(self.options)
        state_space_dim = (4, self.env.width, self.env.height)
        dim = (n_options, *state_space_dim)
        R = np.zeros(dim)
        N = np.zeros(dim)
        P = np.zeros((n_options, *state_space_dim, *state_space_dim))
        
        self.env.reset()
        for option_i, option in tqdm(enumerate(self.options)):
            for state, active in np.ndenumerate(option.initiation_set):
                # Check if the state is in the initiation set
                if not active:
                    continue
                
                env = self.env.unwrapped
                env.agent_dir, env.agent_pos = state[0], tuple(reversed(state[1:]))
                cell = self.env.grid.get(*env.agent_pos)
                
                # Check if the state is valid for the agent to be in
                if not (cell is None or cell.can_overlap()):
                    continue
                
                # Activate the option and run it until termination
                # (bound to a fresh name, so that `option` keeps referring to
                # the option whose initiation set is being enumerated)
                markov_option = MarkovOption(starting_state=state,
                                             initiation_set=option._initiation_set,
                                             termination_set=option._termination_set,
                                             policy=option._policy,
                                             name=str(option))
                while True:
                    a = markov_option.policy(state)
                    obs, reward, done, info = self.env.step(a)
                    env = self.env.unwrapped
                    state_next = (env.agent_dir, *reversed(env.agent_pos))
                    self.logger.debug(f"State: {state}, "
                                      f"Option: {markov_option}, "
                                      f"Action: {a}, "
                                      f"Next State: {state_next}")
                    state = state_next
                    
                    markov_option.k += 1
                    markov_option.cumulant += γ**markov_option.k * reward
                    
                    if markov_option.termination_function(state):
                        break
                
                # Update option models
                option_state = (option_i, *markov_option.starting_state)
                R[option_state] = markov_option.cumulant
                P[(*option_state, *state)] = γ**markov_option.k
        
        return N, R, P
class IntraOptionQLearning(ControlDemon):
    """ We turn now to the intra-option learning of option values and thus of
    optimal policies over options. If the options are semi-Markov, then again
    the SMDP methods described in Section 5.2 are probably the only feasible
    methods; a semi-Markov option must be completed before it can be evaluated
    in any way. But if the options are Markov and we are willing to look inside
    them, then we can consider intra-option methods. Just as in the case of
    model learning, intra-option methods for value learning are potentially
    more efficient than SMDP methods because they extract more training
    examples from the same experience.
    """
    
    def __init__(self,
                 env: MiniGridEnv,
                 options: List[Option],
                 target_policy: PolicyOverOptions,
                 feature_generator: FeatureGenerator,
                 weights,  # TODO: type?
                 lr: LearningRate = LearningRate(1, 1, 0),
                 gamma: float = 0.9,
                 loglevel: int = 20,
                 ):
        self.env = env
        # TODO: record all the rewards at the SMDP level?
        self.cumulant = lambda: 0
        super().__init__(target_policy=target_policy,
                         termination=lambda: 1,
                         eligibility=lambda: 1,
                         cumulant=self.cumulant,
                         feature=feature_generator,
                         behavioural_policy=target_policy,
                         weights=weights,
                         id=repr(self))
        self.options = self.Ω = options
        self.option_idx_dict = {str(o): i for i, o in enumerate(self.options)}
        self.lr = lr
        self.gamma = gamma
        self.logger = ProjectLogger(level=loglevel, printing=False)
    
    def advantage(self, state):
        """ Used as an unbiased estimator with reduced variance. """
        Q = self.predict(state)
        return Q - np.dot(self.π.pmf(state, Q), Q)
    
    def utility(self, state, option: Option) -> float:
        """ Utility of persisting with the same option vs. picking another. """
        ω = self.option_idx_dict[str(option)]
        β = option.β.pmf(state)
        Q = self.predict(state)
        continuation_value = (1 - β) * Q[ω]
        termination_value = β * np.dot(self.π.pmf(state, Q), Q)
        return continuation_value + termination_value
    
    def loss(self,
             s0: np.ndarray,
             o: Option,
             r: float,
             s1: np.ndarray,
             done: bool):
        """ Calculates the Intra-Option Q-learning TD error """
        # TODO: rewrite in terms of experience instead
        γ = self.gamma
        ω = self.option_idx_dict[str(o)]
        δ = r - self.predict(s0)[ω]
        if not done:
            δ += γ * self.utility(s1, o)
        return δ
    
    def update(self, experience: Transition):
        raise NotImplementedError
    
    def learn_option_values(self, render: bool = False):
        env = self.env.unwrapped
        state = self.env.reset()
        executing_option = self.target_policy(state)
        
        done = False
        while not done:
            # Step through the environment
            a = executing_option.policy(state)
            s_next, reward, done, info = self.env.step(a)
            
            action_name = list(env.actions)[a].name
            # TODO: structure experience in (s, a, r, s') tuples
            self.logger.debug(f"State: {state}, "
                              f"Option: {executing_option}, "
                              f"Action: {action_name}, "
                              f"Next State: {s_next}")
            if render:
                self.env.render()
                time.sleep(0.05)
            
            # Update option-values for every option consistent with `a`
            for option in self.options:
                if option.policy(state) == a:
                    self.update(Transition(state, option, reward, s_next, done))
            
            # Terminate the option
            if executing_option.termination(s_next) or done:
                executing_option = self.target_policy(s_next)
            
            # Move on to the next state
            state = s_next
        
        return self.env.step_count
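# Worked update (follows `loss` and `utility` above): after every primitive
# transition (s, a, r, s'), each Markov option o whose policy would have
# chosen a in s is updated off-policy with
#
#     U(s', o) = (1 - β_o(s')) Q(s', o) + β_o(s') Σ_{o'} π_Ω(o' | s') Q(s', o')
#     Q(s, o) <- Q(s, o) + α [ r + γ U(s', o) - Q(s, o) ],
#
# so a single stream of experience yields a learning target for every
# consistent option. A minimal tabular sketch follows; the Q-table layout
# Q[(option_idx, *state)] mirrors SMDPValueLearning above but is an assumption
# with respect to the ControlDemon weight representation, and the option and
# policy interfaces (`policy`, `termination`, `pmf`) are taken from this class.
def _tabular_intra_option_q_update(Q, s, a, r, s_next, done, options,
                                   option_idx_dict, π_Ω, α=0.1, γ=0.9):
    """ Illustrative only: one tabular intra-option Q-learning step. """
    q_next = Q[(slice(None), *s_next)]
    for option in options:
        if option.policy(s) != a:
            continue  # only options consistent with the executed action
        ω = option_idx_dict[str(option)]
        β = option.termination(s_next)
        u = (1 - β) * q_next[ω] + β * np.dot(π_Ω.pmf(s_next, q_next), q_next)
        target = r if done else r + γ * u
        Q[(ω, *s)] += α * (target - Q[(ω, *s)])
    return Q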
class OptionCriticNetwork:
    """ Here the weights for both critic and actor are learned by a single NN
    with multiple heads. """
    
    def __init__(
            self,
            env: MiniGridEnv,
            feature_generator: NatureConvBody,
            critic: IntraOptionDeepQLearning,
            actor: PolicyOverOptions,
            optimizer: torch.optim.Optimizer,
            gamma: float = 0.99,
            loglevel: int = 20,
            rng: np.random.RandomState = np.random.RandomState(1),
    ):
        self.env = env
        self.γ = gamma
        
        # shared between both critic and actor
        self.feature_generator = feature_generator
        self.critic = critic
        self.actor = actor
        # self.network = torch.nn.Sequential(self.feature_generator, self.critic.critic)
        self.target_network = deepcopy(self.critic.critic)
        self.optimizer = optimizer
        
        self.logger = ProjectLogger(level=loglevel)
        self.rng = rng
        
        # TODO: check if the weights of the `network` change
        # self.target_network.load_state_dict(self.network.state_dict())
    
    def learn(self, config):
        # Trackers
        cumulant = 0.
        duration = 0
        option_switches = 0
        avgduration = 0.
        
        # Initialize
        s0 = self.env.reset()
        s0 = f.normalize(s0, dim=(2, 3))
        φ0 = self.feature_generator(s0)
        Q = self.critic(φ0)
        option = self.actor(φ0, action_values=Q)
        ω = self.actor.option_idx_dict[str(option)]
        
        # Run until episode termination
        done = False
        while not done:
            π = option.π(φ0)
            dist = torch.distributions.Categorical(probs=π)
            action, entropy = dist.sample(), dist.entropy()
            
            s1, r, done, info = self.env.step(int(action))
            s1 = f.normalize(s1, dim=(2, 3))
            φ1 = self.feature_generator(s1)
            
            # with torch.no_grad():
            target_Q = self.critic(φ1)
            β = option.β(φ1)
            
            experience = TorchTransition(s0=s0, o=option, r=r, s1=s1, done=done,
                                         φ0=φ0, φ1=φ1,
                                         Q=Q, target_Q=target_Q,
                                         π=π, β=β)
            
            q_loss = self.critic.loss(experience).mean()
            
            critique = self.critic.estimate_advantage(experience)
            pi_loss = -(torch.log(π)[:, action] * critique.detach()) \
                      - config.entropy_weight * entropy
            pi_loss = pi_loss.mean()
            
            termination_advantage = self.critic.advantage(φ1, Q)[:, ω]
            beta_loss = (β * (termination_advantage.detach() + config.η)
                         * (1 - done)).mean()
            
            self.optimizer.zero_grad()
            (pi_loss + q_loss + beta_loss).mean().backward(retain_graph=True)
            torch.nn.utils.clip_grad_norm_(self.feature_generator.parameters(),
                                           config.gradient_clip)
            torch.nn.utils.clip_grad_norm_(self.critic.critic.parameters(),
                                           config.gradient_clip)
            torch.nn.utils.clip_grad_norm_(option.π.parameters(),
                                           config.gradient_clip)
            torch.nn.utils.clip_grad_norm_(option.β.parameters(),
                                           config.gradient_clip)
            self.optimizer.step()
            
            # self.plot_grad_flow(self.feature_generator.named_parameters())
            # self.plot_grad_flow(self.critic.critic.named_parameters())
            
            # Choose another option in case the current one terminates
            if β > self.rng.uniform():
                option = self.actor(φ1, action_values=Q)
                ω = self.actor.option_idx_dict[str(option)]
                option_switches += 1
                avgduration += (1. / option_switches) * (duration - avgduration)
                duration = 0
            
            s0 = s1
            φ0 = φ1
            Q = target_Q
            
            if self.env.step_count % 1000 == 0:
                print(self.env.step_count, self.env)
            
            # if self.env.step_count % config.target_network_update_freq == 0:
            #     self.target_network.load_state_dict(
            #         self.critic.critic.state_dict())
            
            cumulant += r
            duration += 1
        
        self.logger.info(f'steps {self.env.unwrapped.step_count}\n'
                         f'cumulant {round(cumulant, 2)}\n'
                         f'avg. duration {round(avgduration, 2)}\n'
                         f'switches {option_switches}\n'
                         f'critic lr {self.critic.lr.rate}\n'
                         f'')
    
    def plot_grad_flow(self, named_parameters):
        """ Plots the gradients flowing through the different layers of the net
        during training. Can be used for checking for possible gradient
        vanishing / exploding problems.
        
        Usage: call after loss.backward() as
        plot_grad_flow(self.model.named_parameters()) to visualize the
        gradient flow. """
        ave_grads = []
        max_grads = []
        layers = []
        for n, p in named_parameters:
            if p.requires_grad and ("bias" not in n):
                layers.append(n)
                ave_grads.append(p.grad.abs().mean())
                max_grads.append(p.grad.abs().max())
        plt.bar(np.arange(len(max_grads)), max_grads, alpha=0.1, lw=1, color="c")
        plt.bar(np.arange(len(max_grads)), ave_grads, alpha=0.1, lw=1, color="b")
        plt.hlines(0, 0, len(ave_grads) + 1, lw=2, color="k")
        plt.xticks(range(0, len(ave_grads), 1), layers, rotation="vertical")
        plt.xlim(left=0, right=len(ave_grads))
        plt.ylim(bottom=-0.001, top=0.02)  # zoom in on the lower gradient regions
        plt.xlabel("Layers")
        plt.ylabel("average gradient")
        plt.title("Gradient flow")
        plt.grid(True)
        plt.legend([Line2D([0], [0], color="c", lw=4),
                    Line2D([0], [0], color="b", lw=4),
                    Line2D([0], [0], color="k", lw=4)],
                   ['max-gradient', 'mean-gradient', 'zero-gradient'])
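# The three heads above are trained jointly from one stream of experience
# (option-critic style): the critic minimises the intra-option TD error
# (q_loss), the intra-option policy follows the policy gradient weighted by
# the critic's advantage estimate plus an entropy bonus (pi_loss), and the
# termination head is pushed towards terminating whenever its option has no
# advantage over the policy over options, plus a margin η (beta_loss).
# A minimal `config` for `learn` might look like the namespace below; only the
# field names are taken from the method body, the values are assumptions.
from types import SimpleNamespace

oc_config = SimpleNamespace(
    entropy_weight=0.01,              # weight of the entropy bonus in pi_loss
    η=0.01,                           # termination margin added to the advantage
    gradient_clip=5.0,                # max gradient norm per parameter group
    target_network_update_freq=1000,  # steps between target-network syncs
)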
from hrl.frameworks.options.hard_coded_options import HallwayOption, PrimitiveOption
from hrl.project_logger import ProjectLogger
from hrl.utils import cache
from hrl.visualization.plotter_one_hot import PlotterOneHot

""" Evaluate the benefits of planning with options. """

SAVEPATH = Path(f'{EXPERIMENT_DIR}/SMDP_planning')

if __name__ == '__main__':
    
    # Create environment
    env = FullyObsWrapper(FourRooms(goal_pos=(15, 15)))
    
    # Create loggers
    LOGLEVEL = 20
    logger = ProjectLogger(level=LOGLEVEL, printing=False)
    logger.critical(env)
    plotter = PlotterOneHot(env=env)
    
    SAVEPATH /= env.unwrapped.__class__.__name__
    SAVEPATH.mkdir(parents=True, exist_ok=True)
    
    # Create hard-coded options
    options = [
        HallwayOption(o, env.observation_space.shape[::-1])
        for o in sorted(HallwayOption.hallway_options)
    ]
    options += [
        PrimitiveOption(o, env.observation_space.shape[::-1])
        for o in sorted(PrimitiveOption.primitive_options)
    ]
        ))
    return options


if __name__ == '__main__':
    
    # Create environment
    tasks = iter([(15, 15), (10, 17), (17, 10)])
    env = OneHotObsWrapper(
        SimplifyActionSpace(FourRooms(agent_pos=(1, 1), goal_pos=next(tasks))))
    env.unwrapped.max_steps = 1000000
    # env.step = partial(stochastic_step, env)
    
    # Set up loggers
    loglevel = 10
    logger = ProjectLogger(level=loglevel, printing=False)
    plotter = PlotterOneHot(env)
    # db = redis.StrictRedis(port=6379)
    
    # Create options
    rng = np.random.RandomState(1338)
    n = 8
    options = create_options(env, n=n, rng=rng)
    
    # Define actors
    actor = EgreedyPolicy(ε=0.02, rng=rng, options=options, loglevel=20)
    
    # Define critics
    α = LearningRate(start_rate=0.5, min_rate=0.4, decay=0.5 / 10000)
    
    class Feature:
class IntraOptionModelLearning:
    
    def __init__(self,
                 env: MiniGridEnv,
                 options: Options,
                 loglevel: int = 20):
        self.env = env
        self.options = options
        self.option_names_dict = {o.name: o for o in self.options}
        self.option_idx_dict = {name: i for i, name in enumerate(self.option_names_dict)}
        self.logger = ProjectLogger(level=loglevel, printing=False)
    
    def __str__(self):
        return 'IntraOptionModelLearning'
    
    def choose_option(self, state):
        """ Picks an option at random """
        options = [o for o in self.options if o.initiation_set[state] == 1]
        return random.choice(options)
    
    def run_episode(self,
                    N: np.ndarray = None,
                    R: np.ndarray = None,
                    P: np.ndarray = None,
                    γ: float = 0.9,
                    α: float = None,
                    render: bool = False):
        env = self.env.unwrapped
        n_options = len(self.options)
        state_space = (4, env.width, env.height)
        dim = (n_options, *state_space)
        if R is None:
            R = np.zeros(dim)
            N = np.zeros(dim)
            P = np.zeros((n_options, *state_space, *state_space))
        
        state = self.env.reset()
        done = False
        executing_option = None
        while not done:
            if executing_option is None:
                executing_option = self.choose_option(state)
            
            a = executing_option.policy(state)
            s_next, reward, done, info = self.env.step(a)
            
            if render:
                action_name = list(env.actions)[a].name
                self.logger.debug(f"State: {state}, "
                                  f"Option: {executing_option}, "
                                  f"Action: {action_name}, "
                                  f"Next State: {s_next}")
                self.env.render()
                time.sleep(0.05)
            
            # Update the model of every option consistent with the last action taken
            for option in self.options:
                if option.policy(state) != a:
                    continue
                
                o = self.option_idx_dict[option.name]
                option_state = (o, *state)
                
                # Update visitation counter
                if α is None:
                    N[option_state] += 1
                    alpha = 1 / N[option_state]
                else:
                    alpha = α
                
                # Update reward matrix
                β = option.termination_function(s_next)
                target = reward + γ * (1 - β) * R[(o, *s_next)]
                R[option_state] += alpha * (target - R[option_state])
                
                # Update probability transition matrix
                # (bootstraps from the transition model of the *next* state)
                target = γ * (1 - β) * P[(o, *s_next)]
                P[option_state] += alpha * (target - P[option_state])
                P[(o, *state, *s_next)] += alpha * γ * β
            
            if executing_option.termination_function(s_next) == 1:
                executing_option = None
            
            state = s_next
            
            yield N, R, P
        
        return N, R, P
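# Worked update (for reference): for every option o whose policy is consistent
# with the executed action a in state s, with β = β_o(s') and step-size α,
#
#     R(s, o)       <- R(s, o)       + α [ r + γ (1 - β) R(s', o)       - R(s, o) ]
#     P(s'' | s, o) <- P(s'' | s, o) + α [ γ (1 - β) P(s'' | s', o) + γ β 1{s''=s'} - P(s'' | s, o) ]
#
# which matches the two vectorised updates in `run_episode` over the
# (option, direction, y, x) state layout used throughout this module.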
class OptionCriticAgent:
    """ Actor-Critic style agent for learning options in a differentiable way.
    
    Architecture overview:
    The option policies parameterized by θ, the termination functions
    parameterized by υ and the policy over options µθ belong to the actor part
    of the system, while the critic consists of Q_U (action-value upon entering)
    and A_Ω (advantages of the policy over options).
    
    The policy over options is learned via Intra-Option Q-Learning, but could
    also be learned using policy gradients at the SMDP level.
    
    The algorithm uses variations of the Policy Gradient theorem to learn the
    options' policies and termination functions from a single stream of
    experience.
    """
    
    # TODO: add deliberation cost
    
    def __init__(self,
                 env: MiniGridEnv,
                 critic: IntraOptionQLearning,
                 actor: PolicyOverOptions,
                 action_critic: IntraOptionActionLearning = None,
                 gamma: float = 0.99,
                 loglevel: int = 20):
        self.env = env
        self.γ = gamma
        self.critic = critic
        self.actor = actor
        self.logger = ProjectLogger(level=loglevel)
        
        if action_critic is not None:
            self.advantage_estimator = None
            self.action_critic = action_critic
        else:
            self.advantage_estimator = 'io'
            self.logger.info(f'Action-critic was not provided, '
                             f'so the advantages for PG will be estimated')
    
    def estimate_advantages(self, state, option: Option, reward: float, s_next):
        ω = self.actor.option_idx_dict[str(option)]
        if self.advantage_estimator == 'io':
            # Intra-Option Advantage Estimator
            Q = self.critic(s_next)[ω]
            advantage = reward + self.γ * Q - self.critic(state)[ω]
        elif self.advantage_estimator == 'augmented':
            # Augmented Advantage Estimator
            # FIXME: the utility should be calculated w.r.t. the next option!
            U = self.critic.utility(state, option)
            advantage = reward + self.γ * U - self.critic(state)[ω]
        else:
            raise ValueError(f'Unknown estimator {self.advantage_estimator}')
        return advantage
    
    def learn(self, baseline: bool = False, render: bool = False):
        # Trackers
        cumulant = 0.
        duration = 0
        option_switches = 0
        avgduration = 0.
        
        # Initialize s0 and pick an option
        s0 = self.env.reset()
        option = self.actor(s0, action_values=self.critic(s0))
        ω = self.actor.option_idx_dict[str(option)]
        
        # Run until episode termination
        done = False
        while not done:
            if render:
                self.env.render()
                time.sleep(0.05)
            
            # Take action (a), observe next state (s1) and reward (r)
            a = option.π(s0)
            s1, r, done, _ = self.env.step(a)
            experience = Transition(s0, option, r, s1, done)
            
            # Option evaluation step
            self.critic.update(experience)
            if self.advantage_estimator is None:
                self.action_critic.update(s0, a, r, s1, option, done)
            
            # Option improvement step
            if not isinstance(option.π, PrimitivePolicy):
                if self.advantage_estimator is None:
                    critique = self.action_critic(s0, ω, a)
                    if baseline:
                        critique -= self.critic(s0)[ω]
                else:
                    critique = self.estimate_advantages(s0, option, r, s1)
                if critique:
                    option.π.update(s0, a, critique)
                
                termination_advantage = self.critic.advantage(s1)[ω]
                if termination_advantage:
                    option.β.update(s1, termination_advantage)
            
            # Choose another option in case the current one terminates
            if option.termination(s1):
                option = self.actor(s1, action_values=self.critic(s1))
                ω = self.actor.option_idx_dict[str(option)]
                option_switches += 1
                avgduration += (1. / option_switches) * (duration - avgduration)
                duration = 0
            
            s0 = s1
            cumulant += r
            duration += 1
        
        self.logger.info(f'steps {self.env.unwrapped.step_count}\n'
                         f'cumulant {round(cumulant, 2)}\n'
                         f'avg. duration {round(avgduration, 2)}\n'
                         f'switches {option_switches}\n'
                         f'critic lr {self.critic.lr.rate}\n'
                         f'')
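# Usage sketch (illustrative, not from the original scripts): the tabular
# option-critic agent only needs a critic and a policy over options; how the
# IntraOptionQLearning critic, its feature generator and weights are built is
# repo-specific and therefore left to the caller here.
def _demo_option_critic(env, critic, actor, n_episodes: int = 100):
    agent = OptionCriticAgent(env, critic=critic, actor=actor, gamma=0.99)
    for _ in range(n_episodes):
        # `learn` resets the environment itself and runs one full episode
        agent.learn(baseline=True)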
    # ReseedWrapper
    env = Torch(FullyObsWrapper(SimplifyActionSpace(env)))
    # env.step = partial(stochastic_step, env)
    return env


env = setup_env(FourRooms(goal_pos=tasks.pop(0)))
env.unwrapped.max_steps = 1000000
obs = env.reset()
n_states = env.observation_space
n_actions = env.action_space.n + 1

# Set up loggers
# TODO: use RLlog
loglevel = 20
logger = ProjectLogger(level=loglevel, printing=False)
plotter = PlotterOneHot(env)
db = redis.StrictRedis(port=6379)
logger.critical(env)

# Define a network shared across options' policies and terminations,
# as well as the critic
net = NatureConvBody(in_channels=3)
params = [net.parameters()]

# Create options
rng = np.random.RandomState(1338)
n_options = 8
options, options_params = create_options(n_options,
                                         net.feature_dim,
                                         env.action_space.n)
from hrl.frameworks.options.intra_option import IntraOptionValueLearning
from hrl.project_logger import ProjectLogger
from hrl.visualization import PlotterOneHot

SAVEPATH = Path(f'{EXPERIMENT_DIR}/value_learning')

if __name__ == '__main__':
    
    # Create environment
    tasks = iter([(15, 15), (10, 17), (17, 10), (17, 1), (8, 8)])
    env = FullyObsWrapper(FourRooms(goal_pos=next(tasks)))
    env.unwrapped.max_steps = 1000000
    
    # Create loggers
    LOGLEVEL = 10
    logger = ProjectLogger(level=LOGLEVEL, printing=False)
    logger.critical(env)
    plotter = PlotterOneHot(env=env)
    
    SAVEPATH /= env.unwrapped.__class__.__name__
    SAVEPATH.mkdir(parents=True, exist_ok=True)
    
    # Create hard-coded options
    options = [
        HallwayOption(o, env.observation_space.shape[::-1])
        for o in sorted(HallwayOption.hallway_options)
    ]
    options += [
        PrimitiveOption(o, env.observation_space.shape[::-1])
        for o in sorted(PrimitiveOption.primitive_options)
    ]