def test_explorer(env_factory):
    env = env_factory()

    def select_action(engine, observation):
        return env.action_space.sample()

    explorer = Explorer(select_action)
    explorer.run(env, 2)
def test_explorer_mock():
    select_action = mock.MagicMock()
    select_action.return_value = 1, {}
    explorer = Explorer(select_action)
    explorer.run(Env(), 2)
    # 2 epochs of 6-step episodes -> one call per iteration
    assert select_action.call_count == 12
    assert isinstance(explorer.state.transition, Transition)
def test_explorer_cast(device):
    explorer = Explorer(lambda x, y: (None, {}), dtype=torch.int, device=device)

    # Observations are cast lazily
    @explorer.on(Events.ITERATION_STARTED)
    def _test(engine):
        assert engine.state.observation.device == device

    explorer.run(Env(), 1)
def test_explorer_transition_members():
    explorer = Explorer(lambda x, y: None)
    explorer.register_transition_members("foo", "bar")

    @explorer.on(Events.ITERATION_STARTED)
    def _add_foo(engine):
        engine.store_transition_members(foo=3, bar=4)
        assert engine.state.extra_transition_members == {"foo": 3, "bar": 4}
        engine.store_transition_members(foo=0)
        assert engine.state.extra_transition_members == {"foo": 0, "bar": 4}

    explorer.run(Env(), 2)
    assert explorer.state.transition.bar == 4
    assert not hasattr(explorer.state, "extra_transition_members")
def create_memory_qlearner(
    dqn: nn.Module,  # Callable[[Observation], QValues]
    random_action: Callable[[Observation], Action],
    optimizer: optim.Optimizer,
    discount: float = 0.99,
    epsilon: Union[float, num.Stepable] = 0.05,
    evaluation_mode: trainers.QLearningMode = trainers.QLearningMode.DOUBLE,
    optimizing_steps: int = 4,
    double_target_weight_copy_steps: int = 1000,
    memory_capacity: int = 10000,
    batch_size: int = 32,
    clip_grad_norm: Optional[float] = None,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
) -> Explorer:
    """Create a Q-learner.

    Optimization is done using TD(0) deep Q-learning with memory replay.
    Double and target evaluation are also available.

    Parameters
    ----------
    dqn:
        The neural network estimating q-values that is being optimized.
    random_action:
        A function to make random actions.
    optimizer:
        The optimizer used to update the `dqn` parameters.
    discount:
        Discount factor of the future rewards.
    epsilon:
        Probability of making a random action. The value is stepped
        (`epsilon.step()`) after every iteration when possible.
    evaluation_mode:
        Change the way targets are evaluated, either with a target network,
        or using double Q-learning.
    optimizing_steps:
        Number of steps between optimizations over the replay memory.
    double_target_weight_copy_steps:
        Number of steps between updates of the target/double network weights
        (when applicable).
    memory_capacity:
        Size of the replay memory (dataset).
    batch_size:
        Batch size when optimizing over the replay memory.
    clip_grad_norm:
        Optionally clip the norm of the `dqn` gradients before applying them.
    dtype:
        Type the observations/model are converted to.
    device:
        Device the observations/model are moved to.

    Returns
    -------
    agent:
        An ignite engine that explores the environment and optimizes a deep
        Q-network over the replay memory.

    """
    # Enable converting from string
    evaluation_mode = trainers.QLearningMode(evaluation_mode)
    dqn.to(device=device, dtype=dtype)
    if evaluation_mode == trainers.QLearningMode.SIMPLE:
        target_dqn = None
    else:
        target_dqn = copy.deepcopy(dqn)

    def select_action(engine, observation):
        """Epsilon greedy action selection."""
        with torch.no_grad():
            dqn.eval()
            if torch.rand(1).item() < epsilon:
                return random_action(observation)
            else:
                return dqn(observation).greedy()

    agent = Explorer(select_action=select_action, dtype=dtype, device=device)

    trainer = trainers.create_qlearning_trainer(
        dqn=dqn,
        target_dqn=target_dqn,
        optimizer=optimizer,
        discount=discount,
        evaluation_mode=evaluation_mode,
        clip_grad_norm=clip_grad_norm,
        dtype=dtype,
        device=device,
    )

    @agent.on(Events.STARTED)
    def add_memory_and_trainer_to_agent(engine):
        engine.state.memory = MemoryReplay(
            T.PinIfCuda(device=device), capacity=memory_capacity)
        engine.state.trainer = trainer

    @agent.on(Events.ITERATION_COMPLETED)
    def append_transition_and_step_epsilon(engine):
        engine.state.memory.append(engine.state.transition.cpu())
        if isinstance(epsilon, num.Stepable):
            epsilon.step()

    @agent.on(Events.ITERATION_COMPLETED)
    @utils.every(optimizing_steps)
    def optimize(engine):
        sample_elem = engine.state.memory[0]
        dataloader = DataLoader(
            dataset=engine.state.memory,
            batch_size=batch_size,
            collate_fn=sample_elem.__class__.collate,
            shuffle=True,
            drop_last=True,
        )
        engine.state.trainer.run(dataloader)

    @agent.on(Events.ITERATION_COMPLETED)
    @utils.every(double_target_weight_copy_steps)
    def copy_weights(engine):
        if target_dqn is not None:
            dqn.zero_grad()  # Avoid copying the gradients
            target_dqn.load_state_dict(copy.deepcopy(dqn.state_dict()))

    return agent
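# Usage sketch for `create_memory_qlearner` (illustrative, not part of the
# test-suite). `make_env` and `QNet` are hypothetical stand-ins: the factory
# only requires a gym-style environment and a `dqn` whose forward returns an
# object exposing `.greedy()` (a `QValues`-like container).
def example_memory_qlearner():
    env = make_env()  # hypothetical gym-style environment
    dqn = QNet()  # hypothetical nn.Module returning q-values with `.greedy()`
    agent = create_memory_qlearner(
        dqn,
        random_action=lambda obs: env.action_space.sample(),
        optimizer=optim.Adam(dqn.parameters(), lr=1e-3),
        epsilon=0.1,
    )
    agent.run(env, 100)  # explore and optimize for 100 episodes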
def test_EpisodeLength():
    agent = Explorer(lambda eng, obs: None)
    metrics.EpisodeLength().attach(agent, "Len")
    agent.run(Env(), 2)
    assert agent.state.metrics["Len"] == 6
def test_InfosMetric():
    agent = Explorer(lambda eng, obs: None)
    metrics.InfoMetric("info_member").attach(agent, "Member")
    agent.run(Env(), 2)
    assert agent.state.metrics["Member"] == 6 * 3
def test_Return():
    agent = Explorer(lambda eng, obs: None, metrics={"Return": metrics.Return()})
    agent.run(Env(), 2)
    assert agent.state.metrics["Return"] == 6
def test_TransitionMetric():
    agent = Explorer(lambda eng, obs: None)
    metrics.TransitionMetric("reward").attach(agent, "Return")
    agent.run(Env(), 2)
    assert agent.state.metrics["Return"] == 6
def create_ppo(
    actor_critic: nn.Module,
    optimizer: optim.Optimizer,
    discount: float = 0.99,
    lambda_: float = 0.9,
    ppo_clip: float = 0.02,
    exploration_loss_coef: float = 0.001,
    critic_loss_coef: float = 1.0,
    critic_loss_function: Callable = F.mse_loss,
    norm_returns: bool = True,
    norm_gaes: bool = True,
    dataset_size: int = 1024,
    n_epochs: int = 10,
    # FIXME change the way the dataloader is passed on to the function
    batch_size: int = 16,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
) -> Explorer:
    """Create an agent using the Proximal Policy Optimization learning algorithm.

    Parameters
    ----------
    actor_critic:
        The neural network used to model the policy and critic. Must return
        a tuple (action probability distribution, critic value).
    optimizer:
        The optimizer used to update the `model` parameters.
    discount:
        The discount rate used for computing the returns.
    lambda_:
        Lambda discount as defined in Generalized Advantage Estimation.
    ppo_clip:
        Clip parameter for the PPO loss.
    exploration_loss_coef:
        The entropy bonus for encouraging exploration.
    critic_loss_coef:
        Multiplier for the critic loss.
    critic_loss_function:
        Loss function used by the critic.
    norm_returns:
        Whether to normalize returns. Running averages are kept per task
        (use `task_id` to differentiate tasks) and used to scale back the
        critic for bootstrapping and GAEs.
    norm_gaes:
        Whether to normalize the advantages. Independent from the
        normalization of returns that is used to scale back the critic.
        This happens on the final advantages.
    dataset_size:
        Size of the PPO dataset to collect information from agents.
    n_epochs:
        Number of epochs of optimization performed on a single PPO dataset.
    batch_size:
        Batch size used to optimize over the PPO dataset.
    dtype:
        Type the observations/model are cast to.
    device:
        Device the observations/model are moved to.

    Returns
    -------
    agent:
        The ignite engine, exploring the environment and optimizing.

    """
    actor_critic.to(device=device, dtype=dtype)

    def select_action(engine, observation):
        with torch.no_grad():
            actor_critic.eval()
            action_distrib, critic_value = actor_critic(observation)
            action = action_distrib.sample()
            engine.store_transition_members(
                log_prob=action_distrib.log_prob(action),
                entropy=action_distrib.entropy(),
                critic_value=critic_value,
            )
            return action

    agent = Explorer(select_action=select_action, dtype=dtype, device=device)
    agent.register_transition_members("log_prob", "entropy", "critic_value")

    trainer = trainers.create_ppo_trainer(
        actor_critic=actor_critic,
        optimizer=optimizer,
        ppo_clip=ppo_clip,
        exploration_loss_coef=exploration_loss_coef,
        critic_loss_coef=critic_loss_coef,
        critic_loss_function=critic_loss_function,
        device=device,
        dtype=dtype,
    )

    @agent.on(Events.STARTED)
    def add_trajectories_and_trainer_to_engine(engine):
        engine.state.trajectories = Trajectories(T.compose(
            T.WithGAEs(
                discount=discount,
                lambda_=lambda_,
                norm_gaes=norm_gaes,
                norm_returns=norm_returns,
            ),
            partial(map, T.PinIfCuda(device=device)),
        ))
        engine.state.trainer = trainer

    @agent.on(Events.ITERATION_COMPLETED)
    def append_transition(engine):
        engine.state.trajectories.append(engine.state.transition.cpu())

    @agent.on(Events.EPOCH_COMPLETED)
    def terminate_trajectory_and_data_collection(engine):
        engine.state.trajectories.terminate_trajectory()

    @agent.on(Events.EPOCH_COMPLETED)
    def optimize(engine):
        if len(engine.state.trajectories) >= dataset_size:
            sample_elem = engine.state.trajectories[0]
            dataloader = DataLoader(
                dataset=engine.state.trajectories,
                batch_size=batch_size,
                collate_fn=sample_elem.__class__.collate,
                drop_last=True,
            )
            engine.state.trainer.run(dataloader, n_epochs)
            engine.state.trajectories.clear()

    return agent
def create_reinforce(
    policy: nn.Module,
    optimizer: optim.Optimizer,
    discount: float = 0.99,
    exploration: float = 0.001,
    norm_returns: bool = True,
    grad_norm_clip: Optional[float] = 1.0,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
) -> Explorer:
    """Create an agent using the Reinforce learning algorithm.

    Parameters
    ----------
    policy:
        The neural network used to model the policy.
    optimizer:
        The optimizer used to update the `model` parameters.
    discount:
        The discount rate used for computing the returns.
    exploration:
        The entropy bonus for encouraging exploration.
    norm_returns:
        Whether to normalize the rewards with zero mean and unit variance.
        Computed over an episode. Raises an error for episodes of length 1.
    grad_norm_clip:
        Value to clip the norm of the gradient at before applying an update.
    dtype:
        Type the observations/model are cast to.
    device:
        Device the observations/model are moved to.

    Returns
    -------
    agent:
        The ignite engine, exploring the environment and optimizing.

    """
    policy.to(device=device, dtype=dtype)

    def select_action(engine: Explorer, observation):
        policy.train()
        action_distrib = policy(observation)
        action = action_distrib.sample()
        engine.store_transition_members(
            log_prob=action_distrib.log_prob(action),
            entropy=action_distrib.entropy())
        return action

    agent = Explorer(select_action=select_action, dtype=dtype, device=device)
    agent.register_transition_members("log_prob", "entropy")

    @agent.on(Events.STARTED)
    def add_trajectories_to_engine(engine):
        engine.state.trajectories = Trajectories(
            T.WithReturns(discount=discount, norm_returns=norm_returns))

    @agent.on(Events.EPOCH_STARTED)
    def empty_trajectories(engine):
        engine.state.trajectories.clear()

    @agent.on(Events.ITERATION_COMPLETED)
    def append_transition(engine):
        engine.state.trajectories.append(engine.state.transition)

    @agent.on(Events.EPOCH_COMPLETED)
    def optimize(engine):
        engine.state.trajectories.terminate_trajectory()
        # The setting is simple enough that using a dataloader is overkill.
        optimizer.zero_grad()
        for transition in engine.state.trajectories:
            loss = -transition.retrn * transition.log_prob
            loss -= exploration * transition.entropy
            loss.backward()
        if grad_norm_clip is not None:
            nn.utils.clip_grad_norm_(policy.parameters(), grad_norm_clip)
        optimizer.step()

    return agent
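# Usage sketch for `create_reinforce` (illustrative). `make_env` and
# `PolicyNet` are hypothetical; `policy(observation)` must return a
# `torch.distributions.Distribution` supporting `sample`, `log_prob`,
# and `entropy`.
def example_reinforce():
    env = make_env()  # hypothetical gym-style environment
    policy = PolicyNet()  # hypothetical nn.Module returning a distribution
    agent = create_reinforce(
        policy,
        optimizer=optim.Adam(policy.parameters(), lr=1e-2),
    )
    agent.run(env, 500)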
def create_a2c(
    actor_critic: nn.Module,
    optimizer: optim.Optimizer,
    discount: float = 0.99,
    exploration: float = 0.001,
    norm_returns: bool = True,
    critic_loss: Callable = F.mse_loss,
    critic_multiplier: float = 1.0,
    grad_norm_clip: Optional[float] = 1.0,
    dtype: Optional[torch.dtype] = None,
    device: Optional[torch.device] = None,
) -> Explorer:
    """Create an agent using the Advantage Actor-Critic (A2C) learning algorithm.

    Parameters
    ----------
    actor_critic:
        The neural network used to model the policy and critic. Must return
        a tuple (action probability distribution, critic value).
    optimizer:
        The optimizer used to update the `model` parameters.
    discount:
        The discount rate used for computing the returns.
    exploration:
        The entropy bonus for encouraging exploration.
    norm_returns:
        Whether to normalize the rewards with zero mean and unit variance.
        Computed over an episode. Raises an error for episodes of length 1.
    critic_loss:
        The loss function used to learn the critic.
    critic_multiplier:
        Multiplier used for the critic loss in the total loss.
    grad_norm_clip:
        Value to clip the norm of the gradient at before applying an update.
    dtype:
        Type the observations/model are cast to.
    device:
        Device the observations/model are moved to.

    Returns
    -------
    agent:
        The ignite engine, exploring the environment and optimizing.

    """
    actor_critic.to(device=device, dtype=dtype)

    def select_action(engine, observation):
        actor_critic.train()
        action_distrib, critic_value = actor_critic(observation)
        action = action_distrib.sample()
        engine.store_transition_members(
            log_prob=action_distrib.log_prob(action),
            entropy=action_distrib.entropy(),
            critic_value=critic_value,
        )
        return action

    agent = Explorer(select_action=select_action, dtype=dtype, device=device)
    agent.register_transition_members("log_prob", "entropy", "critic_value")

    @agent.on(Events.STARTED)
    def add_trajectories_to_engine(engine):
        engine.state.trajectories = Trajectories(
            T.WithReturns(discount=discount, norm_returns=norm_returns))

    @agent.on(Events.EPOCH_STARTED)
    def empty_trajectories(engine):
        engine.state.trajectories.clear()

    @agent.on(Events.ITERATION_COMPLETED)
    def append_transition(engine):
        engine.state.trajectories.append(engine.state.transition)

    @agent.on(Events.EPOCH_COMPLETED)
    def optimize(engine):
        engine.state.trajectories.terminate_trajectory()
        # The setting is simple enough that using a dataloader is overkill.
        optimizer.zero_grad()
        for t in engine.state.trajectories:
            loss = -(t.retrn - t.critic_value.detach()) * t.log_prob
            loss -= exploration * t.entropy
            retrn = t.critic_value.new([t.retrn])  # Make tensor on same device
            loss += critic_multiplier * critic_loss(t.critic_value, retrn)
            loss.backward()
        if grad_norm_clip is not None:
            nn.utils.clip_grad_norm_(actor_critic.parameters(), grad_norm_clip)
        optimizer.step()

    return agent
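# Usage sketch for `create_a2c` (illustrative). Same hypothetical
# `ActorCritic` contract as in the `create_ppo` example: the forward pass
# returns (action distribution, critic value).
def example_a2c():
    env = make_env()  # hypothetical gym-style environment
    actor_critic = ActorCritic()  # hypothetical nn.Module
    agent = create_a2c(
        actor_critic,
        optimizer=optim.Adam(actor_critic.parameters(), lr=7e-4),
        exploration=0.01,
    )
    agent.run(env, 500)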
def test_explorer_transition_members_info():
    explorer = Explorer(lambda x, y: None)
    explorer.register_transition_members("info_member")
    explorer.run(Env(), 2)
    assert hasattr(explorer.state.transition, "info_member")