def build_networks(self) -> None: """Initializes the SAC policy and q networks (with targets)""" action_bias = torch.from_numpy((self.env.action_space.high + self.env.action_space.low) / 2) action_scale = torch.from_numpy((self.env.action_space.high - self.env.action_space.low) / 2) self.policy = ContinuousMLP(self.obs_shape, self.n_actions, action_bias=action_bias, action_scale=action_scale) concat_shape = [self.obs_shape[0] + self.n_actions] self.q1 = MLP(concat_shape, 1) self.q2 = MLP(concat_shape, 1) self.target_q1 = MLP(concat_shape, 1) self.target_q2 = MLP(concat_shape, 1) self.target_q1.load_state_dict(self.q1.state_dict()) self.target_q2.load_state_dict(self.q2.state_dict())
def setUp(self) -> None: self.env = ToTensor(gym.make("CartPole-v0")) self.obs_shape = self.env.observation_space.shape self.n_actions = self.env.action_space.n self.net = MLP(self.obs_shape, self.n_actions) self.agent = Agent(self.net) self.xp_stream = EpisodicExperienceStream(self.env, self.agent, Mock(), episodes=4) self.rl_dataloader = DataLoader(self.xp_stream) parent_parser = argparse.ArgumentParser(add_help=False) parent_parser = cli.add_base_args(parent=parent_parser) parent_parser = DQN.add_model_specific_args(parent_parser) args_list = [ "--algo", "dqn", "--warm_start_steps", "500", "--episode_length", "100", ] self.hparams = parent_parser.parse_args(args_list) self.model = Reinforce(**vars(self.hparams))
def __init__(
    self,
    env: str,
    gamma: float = 0.99,
    lr: float = 0.01,
    batch_size: int = 8,
    n_steps: int = 10,
    avg_reward_len: int = 100,
    entropy_beta: float = 0.01,
    epoch_len: int = 1000,
    **kwargs,
) -> None:
    """
    Args:
        env: gym environment tag
        gamma: discount factor
        lr: learning rate
        batch_size: size of minibatch pulled from the DataLoader
        n_steps: number of steps per discounted experience
        entropy_beta: dictates the level of entropy per batch
        avg_reward_len: how many episodes to take into account when calculating the avg reward
        epoch_len: how many batches before pseudo epoch
    """
    super().__init__()

    if not _GYM_AVAILABLE:  # pragma: no cover
        raise ModuleNotFoundError("This Module requires gym environment which is not installed yet.")

    # Hyperparameters
    self.lr = lr
    self.batch_size = batch_size
    self.batches_per_epoch = self.batch_size * epoch_len
    self.entropy_beta = entropy_beta
    self.gamma = gamma
    self.n_steps = n_steps

    self.save_hyperparameters()

    # Model components
    self.env = gym.make(env)
    self.net = MLP(self.env.observation_space.shape, self.env.action_space.n)
    self.agent = PolicyAgent(self.net)

    # Tracking metrics
    self.total_rewards = []
    self.episode_rewards = []
    self.done_episodes = 0
    self.avg_rewards = 0
    self.avg_reward_len = avg_reward_len
    self.eps = np.finfo(np.float32).eps.item()
    self.batch_states = []
    self.batch_actions = []

    self.state = self.env.reset()
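# The `gamma` and `eps` attributes above are typically used to compute and normalize the
# discounted returns of an episode before the policy-gradient loss. A minimal sketch of
# that computation, assuming a `rewards` list collected during a rollout -- the helper
# name `discount_rewards` is illustrative, not taken from the snippet above:
def discount_rewards(self, rewards: List[float]) -> torch.Tensor:
    # Walk the episode backwards, accumulating the discounted return at each step
    returns = []
    running_return = 0.0
    for r in reversed(rewards):
        running_return = r + self.gamma * running_return
        returns.insert(0, running_return)
    returns = torch.tensor(returns)
    # Normalize; self.eps guards against division by zero on constant rewards
    return (returns - returns.mean()) / (returns.std() + self.eps)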
def setUp(self) -> None: self.env = ToTensor(gym.make("CartPole-v0")) self.obs_shape = self.env.observation_space.shape self.n_actions = self.env.action_space.n self.net = MLP(self.obs_shape, self.n_actions) self.agent = Agent(self.net) parent_parser = argparse.ArgumentParser(add_help=False) parent_parser = VanillaPolicyGradient.add_model_specific_args(parent_parser) args_list = [ "--env", "CartPole-v0", "--batch_size", "32" ] self.hparams = parent_parser.parse_args(args_list) self.model = VanillaPolicyGradient(**vars(self.hparams))
def setUp(self) -> None: self.env = ToTensor(gym.make("CartPole-v0")) self.obs_shape = self.env.observation_space.shape self.n_actions = self.env.action_space.n self.net = MLP(self.obs_shape, self.n_actions) self.agent = Agent(self.net) self.exp_source = DiscountedExperienceSource(self.env, self.agent) parent_parser = argparse.ArgumentParser(add_help=False) parent_parser = Reinforce.add_model_specific_args(parent_parser) args_list = [ "--env", "CartPole-v0", "--batch_size", "32", "--gamma", "0.99" ] self.hparams = parent_parser.parse_args(args_list) self.model = Reinforce(**vars(self.hparams)) self.rl_dataloader = self.model.train_dataloader()
def build_networks(self) -> None: """Initializes the DQN train and target networks""" self.net = MLP(self.obs_shape, self.n_actions)
def __init__( self, env: str, gamma: float = 0.99, lr: float = 0.01, batch_size: int = 8, n_steps: int = 10, avg_reward_len: int = 100, entropy_beta: float = 0.01, epoch_len: int = 1000, num_batch_episodes: int = 4, **kwargs ) -> None: """ PyTorch Lightning implementation of `REINFORCE <https://papers.nips.cc/paper/ 1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf>`_ Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour Model implemented by: - `Donal Byrne <https://github.com/djbyrne>` Example: >>> from pl_bolts.models.rl.reinforce_model import Reinforce ... >>> model = Reinforce("CartPole-v0") Train:: trainer = Trainer() trainer.fit(model) Args: env: gym environment tag gamma: discount factor lr: learning rate batch_size: size of minibatch pulled from the DataLoader n_steps: number of stakes per discounted experience entropy_beta: entropy coefficient epoch_len: how many batches before pseudo epoch num_batch_episodes: how many episodes to rollout for each batch of training avg_reward_len: how many episodes to take into account when calculating the avg reward Note: This example is based on: https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter11/02_cartpole_reinforce.py Note: Currently only supports CPU and single GPU training with `distributed_backend=dp` """ super().__init__() if not _GYM_AVAILABLE: raise ModuleNotFoundError('This Module requires gym environment which is not installed yet.') # Hyperparameters self.lr = lr self.batch_size = batch_size self.batches_per_epoch = self.batch_size * epoch_len self.entropy_beta = entropy_beta self.gamma = gamma self.n_steps = n_steps self.num_batch_episodes = num_batch_episodes self.save_hyperparameters() # Model components self.env = gym.make(env) self.net = MLP(self.env.observation_space.shape, self.env.action_space.n) self.agent = PolicyAgent(self.net) # Tracking metrics self.total_steps = 0 self.total_rewards = [0] self.done_episodes = 0 self.avg_rewards = 0 self.reward_sum = 0.0 self.batch_episodes = 0 self.avg_reward_len = avg_reward_len self.batch_states = [] self.batch_actions = [] self.batch_qvals = [] self.cur_rewards = [] self.state = self.env.reset()
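# The `batch_qvals` buffer above holds the discounted return for every step of each
# finished episode. A minimal sketch of how those values can be computed from a
# per-episode reward list -- the helper name `calc_qvals` is an assumption, not taken
# from the snippet above:
def calc_qvals(self, rewards: List[float]) -> List[float]:
    # Accumulate discounted returns from the end of the episode backwards
    res = []
    sum_r = 0.0
    for r in reversed(rewards):
        sum_r = (sum_r * self.gamma) + r
        res.append(sum_r)
    # Reverse so the returns line up with the original step order
    return list(reversed(res))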
def __init__(
    self,
    env: str,
    gamma: float = 0.99,
    lr: float = 0.01,
    batch_size: int = 8,
    n_steps: int = 10,
    avg_reward_len: int = 100,
    entropy_beta: float = 0.01,
    epoch_len: int = 1000,
    num_batch_episodes: int = 4,
    **kwargs,
) -> None:
    """
    Args:
        env: gym environment tag
        gamma: discount factor
        lr: learning rate
        batch_size: size of minibatch pulled from the DataLoader
        n_steps: number of steps per discounted experience
        entropy_beta: entropy coefficient
        epoch_len: how many batches before pseudo epoch
        num_batch_episodes: how many episodes to rollout for each batch of training
        avg_reward_len: how many episodes to take into account when calculating the avg reward
    """
    super().__init__()

    if not _GYM_AVAILABLE:
        raise ModuleNotFoundError("This Module requires gym environment which is not installed yet.")

    # Hyperparameters
    self.lr = lr
    self.batch_size = batch_size
    self.batches_per_epoch = self.batch_size * epoch_len
    self.entropy_beta = entropy_beta
    self.gamma = gamma
    self.n_steps = n_steps
    self.num_batch_episodes = num_batch_episodes

    self.save_hyperparameters()

    # Model components
    self.env = gym.make(env)
    self.net = MLP(self.env.observation_space.shape, self.env.action_space.n)
    self.agent = PolicyAgent(self.net)

    # Tracking metrics
    self.total_steps = 0
    self.total_rewards = [0]
    self.done_episodes = 0
    self.avg_rewards = 0
    self.reward_sum = 0.0
    self.batch_episodes = 0
    self.avg_reward_len = avg_reward_len

    self.batch_states = []
    self.batch_actions = []
    self.batch_qvals = []
    self.cur_rewards = []

    self.state = self.env.reset()
def __init__(
    self,
    env: str,
    gamma: float = 0.99,
    lr: float = 0.01,
    batch_size: int = 8,
    n_steps: int = 10,
    avg_reward_len: int = 100,
    num_envs: int = 4,
    entropy_beta: float = 0.01,
    epoch_len: int = 1000,
    **kwargs,
) -> None:
    """
    PyTorch Lightning implementation of `Vanilla Policy Gradient <https://papers.nips.cc/paper/
    1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf>`_

    Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour

    Model implemented by:

        - `Donal Byrne <https://github.com/djbyrne>`_

    Example:
        >>> from pl_bolts.models.rl.vanilla_policy_gradient_model import VanillaPolicyGradient
        ...
        >>> model = VanillaPolicyGradient("PongNoFrameskip-v4")

    Train::

        trainer = Trainer()
        trainer.fit(model)

    Args:
        env: gym environment tag
        gamma: discount factor
        lr: learning rate
        batch_size: size of minibatch pulled from the DataLoader
        n_steps: number of steps per discounted experience
        avg_reward_len: how many episodes to take into account when calculating the avg reward
        num_envs: number of parallel environments to rollout
        entropy_beta: dictates the level of entropy per batch
        epoch_len: how many batches before pseudo epoch

    Note:
        This example is based on:
        https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter11/04_cartpole_pg.py

    Note:
        Currently only supports CPU and single GPU training with `distributed_backend=dp`
    """
    super().__init__()

    if not _GYM_AVAILABLE:
        raise ModuleNotFoundError("This Module requires gym environment which is not installed yet.")

    # Hyperparameters
    self.lr = lr
    self.batch_size = batch_size * num_envs
    self.batches_per_epoch = self.batch_size * epoch_len
    self.entropy_beta = entropy_beta
    self.gamma = gamma
    self.n_steps = n_steps

    self.save_hyperparameters()

    # Model components
    self.env = [gym.make(env) for _ in range(num_envs)]
    self.net = MLP(self.env[0].observation_space.shape, self.env[0].action_space.n)
    self.agent = PolicyAgent(self.net)
    self.exp_source = DiscountedExperienceSource(self.env, self.agent, gamma=gamma, n_steps=self.n_steps)

    # Tracking metrics
    self.total_steps = 0
    self.total_rewards = [0]
    self.done_episodes = 0
    self.avg_rewards = 0
    self.reward_sum = 0.0
    self.baseline = 0
    self.avg_reward_len = avg_reward_len
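# The `entropy_beta` coefficient above weights an entropy bonus that is subtracted from
# the policy-gradient loss to discourage the policy from collapsing too early. A minimal
# sketch of such a loss, assuming `states`, `actions`, and `scaled_rewards` tensors from
# a batch -- the method name and argument names are illustrative:
def policy_loss(self, states, actions, scaled_rewards) -> torch.Tensor:
    logits = self.net(states)
    log_prob = torch.log_softmax(logits, dim=1)
    # Log-probability of the actions actually taken, weighted by (reward - baseline)
    log_prob_actions = scaled_rewards * log_prob[range(len(actions)), actions]
    policy_loss = -log_prob_actions.mean()
    # Entropy bonus keeps the policy stochastic early in training
    prob = torch.softmax(logits, dim=1)
    entropy = -(prob * log_prob).sum(dim=1).mean()
    return policy_loss - self.entropy_beta * entropy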
def __init__(
    self,
    env: str,
    gamma: float = 0.99,
    lam: float = 0.95,
    lr_actor: float = 3e-4,
    lr_critic: float = 1e-3,
    max_episode_len: float = 200,
    batch_size: int = 512,
    steps_per_epoch: int = 2048,
    nb_optim_iters: int = 4,
    clip_ratio: float = 0.2,
    **kwargs: Any,
) -> None:
    """
    Args:
        env: gym environment tag
        gamma: discount factor
        lam: advantage discount factor (lambda in the paper)
        lr_actor: learning rate of actor network
        lr_critic: learning rate of critic network
        max_episode_len: maximum number of interactions (actions) in an episode
        batch_size: batch size used when training the network; can simulate the number of policy updates performed per epoch
        steps_per_epoch: how many action-state pairs to rollout for trajectory collection per epoch
        nb_optim_iters: how many steps of gradient descent to perform on each batch
        clip_ratio: hyperparameter for clipping in the policy objective
    """
    super().__init__()

    if not _GYM_AVAILABLE:  # pragma: no cover
        raise ModuleNotFoundError("This Module requires gym environment which is not installed yet.")

    # Hyperparameters
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.steps_per_epoch = steps_per_epoch
    self.nb_optim_iters = nb_optim_iters
    self.batch_size = batch_size
    self.gamma = gamma
    self.lam = lam
    self.max_episode_len = max_episode_len
    self.clip_ratio = clip_ratio
    self.save_hyperparameters()

    self.env = gym.make(env)

    # value network
    self.critic = MLP(self.env.observation_space.shape, 1)

    # policy network (agent)
    if isinstance(self.env.action_space, gym.spaces.box.Box):
        act_dim = self.env.action_space.shape[0]
        actor_mlp = MLP(self.env.observation_space.shape, act_dim)
        self.actor = ActorContinous(actor_mlp, act_dim)
    elif isinstance(self.env.action_space, gym.spaces.discrete.Discrete):
        actor_mlp = MLP(self.env.observation_space.shape, self.env.action_space.n)
        self.actor = ActorCategorical(actor_mlp)
    else:
        raise NotImplementedError(
            "Env action space should be of type Box (continuous) or Discrete (categorical). "
            f"Got type: {type(self.env.action_space)}"
        )

    self.batch_states = []
    self.batch_actions = []
    self.batch_adv = []
    self.batch_qvals = []
    self.batch_logp = []

    self.ep_rewards = []
    self.ep_values = []
    self.epoch_rewards = []

    self.episode_step = 0
    self.avg_ep_reward = 0
    self.avg_ep_len = 0
    self.avg_reward = 0

    self.state = torch.FloatTensor(self.env.reset())
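# The `clip_ratio` hyperparameter above parameterizes PPO's clipped surrogate objective,
# which bounds how far a single update can move the policy away from the one that
# collected the data. A minimal sketch of that loss, assuming `logp_old` was stored
# during rollout and the actor exposes a `get_log_prob` helper -- the method name and
# that helper are assumptions, not taken from the snippet above:
def actor_loss(self, state, action, logp_old, adv) -> torch.Tensor:
    pi, _ = self.actor(state)
    logp = self.actor.get_log_prob(pi, action)
    # Probability ratio between the current policy and the data-collecting policy
    ratio = torch.exp(logp - logp_old)
    clip_adv = torch.clamp(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * adv
    # Pessimistic bound: take the worse of the clipped and unclipped objectives
    return -torch.min(ratio * adv, clip_adv).mean()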