def test_policy_agent(self):
    policy_agent = PolicyAgent(self.net)
    action = policy_agent(self.states, self.device)
    self.assertIsInstance(action, list)
    self.assertEqual(action[0], 1)
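# A minimal sketch of the kind of agent this test exercises, assuming the network maps a
# batch of states to action logits and the agent samples one action per state, returning
# them as a Python list. Class and variable names below are illustrative; the real
# PolicyAgent in pl_bolts may differ in its details.
import torch
from torch import nn
from torch.distributions import Categorical


class SketchPolicyAgent:
    """Hypothetical agent that samples actions from a policy network's softmax output."""

    def __init__(self, net: nn.Module):
        self.net = net

    def __call__(self, states: torch.Tensor, device: str) -> list:
        logits = self.net(states.to(device))
        probs = logits.softmax(dim=-1)          # convert logits to action probabilities
        actions = Categorical(probs).sample()   # sample one action per state
        return actions.tolist()                 # list of ints, as asserted in test_policy_agent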
def __init__(self, env: str, gamma: float = 0.99, lr: float = 1e-4, batch_size: int = 32,
             entropy_beta: float = 0.01, batch_episodes: int = 4, *args, **kwargs) -> None:
    """
    PyTorch Lightning implementation of `Vanilla Policy Gradient
    <https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf>`_

    Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour

    Model implemented by:

        - `Donal Byrne <https://github.com/djbyrne>`

    Example:
        >>> from pl_bolts.models.rl.vanilla_policy_gradient_model import PolicyGradient
        ...
        >>> model = PolicyGradient("PongNoFrameskip-v4")

    Train::

        trainer = Trainer()
        trainer.fit(model)

    Args:
        env: gym environment tag
        gamma: discount factor
        lr: learning rate
        batch_size: size of minibatch pulled from the DataLoader
        batch_episodes: how many episodes to rollout for each batch of training
        entropy_beta: dictates the level of entropy per batch

    .. note::
        This example is based on:
        https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter11/04_cartpole_pg.py

    .. note:: Currently only supports CPU and single GPU training with `distributed_backend=dp`
    """
    super().__init__()

    # self.env = wrappers.make_env(self.hparams.env)  # use for Atari
    self.env = ToTensor(gym.make(env))  # use for Box2D/Control
    self.env.seed(123)

    self.obs_shape = self.env.observation_space.shape
    self.n_actions = self.env.action_space.n

    self.net = None
    self.build_networks()

    self.agent = PolicyAgent(self.net)
    self.source = NStepExperienceSource(env=self.env, agent=self.agent, n_steps=10)

    self.gamma = gamma
    self.lr = lr
    self.batch_size = batch_size
    self.batch_episodes = batch_episodes
    self.entropy_beta = entropy_beta
    self.baseline = 0

    # Metrics
    self.reward_sum = 0
    self.env_steps = 0
    self.total_steps = 0
    self.total_reward = 0
    self.episode_count = 0

    self.reward_list = []
    for _ in range(100):
        self.reward_list.append(torch.tensor(0, device=self.device))
    self.avg_reward = 0
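# self.net is created by build_networks(), which is not shown here. A minimal sketch,
# assuming a small fully connected policy network over flat observations; the helper name
# and architecture below are illustrative, not the actual pl_bolts implementation.
from torch import nn


def make_policy_net(obs_shape: tuple, n_actions: int) -> nn.Module:
    """Hypothetical builder for a simple MLP policy head."""
    return nn.Sequential(
        nn.Linear(obs_shape[0], 128),  # flat observation vector in
        nn.ReLU(),
        nn.Linear(128, n_actions),     # one logit per discrete action out
    )


# e.g. build_networks() could set: self.net = make_policy_net(self.obs_shape, self.n_actions)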
def __init__(
    self,
    env: str,
    gamma: float = 0.99,
    lr: float = 0.01,
    batch_size: int = 8,
    n_steps: int = 10,
    avg_reward_len: int = 100,
    entropy_beta: float = 0.01,
    epoch_len: int = 1000,
    num_batch_episodes: int = 4,
    **kwargs
) -> None:
    """
    PyTorch Lightning implementation of `REINFORCE
    <https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf>`_

    Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour

    Model implemented by:

        - `Donal Byrne <https://github.com/djbyrne>`

    Example:
        >>> from pl_bolts.models.rl.reinforce_model import Reinforce
        ...
        >>> model = Reinforce("CartPole-v0")

    Train::

        trainer = Trainer()
        trainer.fit(model)

    Args:
        env: gym environment tag
        gamma: discount factor
        lr: learning rate
        batch_size: size of minibatch pulled from the DataLoader
        n_steps: number of steps per discounted experience
        entropy_beta: entropy coefficient
        epoch_len: how many batches before a pseudo epoch
        num_batch_episodes: how many episodes to rollout for each batch of training
        avg_reward_len: how many episodes to take into account when calculating the avg reward

    Note:
        This example is based on:
        https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter11/02_cartpole_reinforce.py

    Note:
        Currently only supports CPU and single GPU training with `distributed_backend=dp`
    """
    super().__init__()

    if not _GYM_AVAILABLE:
        raise ModuleNotFoundError('This Module requires gym environment which is not installed yet.')

    # Hyperparameters
    self.lr = lr
    self.batch_size = batch_size
    self.batches_per_epoch = self.batch_size * epoch_len
    self.entropy_beta = entropy_beta
    self.gamma = gamma
    self.n_steps = n_steps
    self.num_batch_episodes = num_batch_episodes

    self.save_hyperparameters()

    # Model components
    self.env = gym.make(env)
    self.net = MLP(self.env.observation_space.shape, self.env.action_space.n)
    self.agent = PolicyAgent(self.net)

    # Tracking metrics
    self.total_steps = 0
    self.total_rewards = [0]
    self.done_episodes = 0
    self.avg_rewards = 0
    self.reward_sum = 0.0
    self.batch_episodes = 0
    self.avg_reward_len = avg_reward_len

    self.batch_states = []
    self.batch_actions = []
    self.batch_qvals = []
    self.cur_rewards = []

    self.state = self.env.reset()
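# batch_qvals above is filled with discounted returns computed from each finished episode.
# A minimal sketch of that calculation, assuming per-episode rewards are accumulated in
# cur_rewards and discounted with gamma; the helper name is illustrative.
from typing import List


def discounted_returns(rewards: List[float], gamma: float) -> List[float]:
    """Compute the discounted return G_t = r_t + gamma * G_{t+1} for every step."""
    returns = []
    running = 0.0
    for r in reversed(rewards):          # walk the episode backwards
        running = r + gamma * running    # accumulate the discounted tail
        returns.append(running)
    return list(reversed(returns))


# Example: discounted_returns([1.0, 1.0, 1.0], gamma=0.99) -> [2.9701, 1.99, 1.0]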
def __init__(self, env: str, gamma: float = 0.99, lr: float = 0.01, batch_size: int = 8,
             n_steps: int = 10, avg_reward_len: int = 100, entropy_beta: float = 0.01,
             epoch_len: int = 1000, num_batch_episodes: int = 4, **kwargs) -> None:
    """
    Args:
        env: gym environment tag
        gamma: discount factor
        lr: learning rate
        batch_size: size of minibatch pulled from the DataLoader
        n_steps: number of steps per discounted experience
        entropy_beta: entropy coefficient
        epoch_len: how many batches before a pseudo epoch
        num_batch_episodes: how many episodes to rollout for each batch of training
        avg_reward_len: how many episodes to take into account when calculating the avg reward
    """
    super().__init__()

    if not _GYM_AVAILABLE:
        raise ModuleNotFoundError(
            'This Module requires gym environment which is not installed yet.'
        )

    # Hyperparameters
    self.lr = lr
    self.batch_size = batch_size
    self.batches_per_epoch = self.batch_size * epoch_len
    self.entropy_beta = entropy_beta
    self.gamma = gamma
    self.n_steps = n_steps
    self.num_batch_episodes = num_batch_episodes

    self.save_hyperparameters()

    # Model components
    self.env = gym.make(env)
    self.net = MLP(self.env.observation_space.shape, self.env.action_space.n)
    self.agent = PolicyAgent(self.net)

    # Tracking metrics
    self.total_steps = 0
    self.total_rewards = [0]
    self.done_episodes = 0
    self.avg_rewards = 0
    self.reward_sum = 0.0
    self.batch_episodes = 0
    self.avg_reward_len = avg_reward_len

    self.batch_states = []
    self.batch_actions = []
    self.batch_qvals = []
    self.cur_rewards = []

    self.state = self.env.reset()
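# A minimal sketch of the REINFORCE surrogate loss this module would optimise from the
# batch_states, batch_actions and batch_qvals buffers above. The function name and exact
# reduction are illustrative, not necessarily the pl_bolts training_step implementation.
import torch
from torch import nn
from torch.nn import functional as F


def reinforce_loss(net: nn.Module, states: torch.Tensor, actions: torch.Tensor,
                   qvals: torch.Tensor) -> torch.Tensor:
    """Return -E[Q(s,a) * log pi(a|s)], the REINFORCE policy gradient loss."""
    log_probs = F.log_softmax(net(states), dim=-1)
    # pick the log-probability of each action actually taken, weighted by its return
    weighted_log_probs = qvals * log_probs[range(len(actions)), actions]
    return -weighted_log_probs.mean()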
def __init__(self, env: str, gamma: float = 0.99, lr: float = 0.01, batch_size: int = 8,
             n_steps: int = 10, avg_reward_len: int = 100, num_envs: int = 4,
             entropy_beta: float = 0.01, epoch_len: int = 1000, **kwargs) -> None:
    """
    PyTorch Lightning implementation of `Vanilla Policy Gradient
    <https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf>`_

    Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour

    Model implemented by:

        - `Donal Byrne <https://github.com/djbyrne>`

    Example:
        >>> from pl_bolts.models.rl.vanilla_policy_gradient_model import VanillaPolicyGradient
        ...
        >>> model = VanillaPolicyGradient("PongNoFrameskip-v4")

    Train::

        trainer = Trainer()
        trainer.fit(model)

    Args:
        env: gym environment tag
        gamma: discount factor
        lr: learning rate
        batch_size: size of minibatch pulled from the DataLoader
        n_steps: number of steps per discounted experience
        num_envs: number of environments to rollout in parallel
        entropy_beta: dictates the level of entropy per batch
        epoch_len: how many batches before a pseudo epoch
        avg_reward_len: how many episodes to take into account when calculating the avg reward

    Note:
        This example is based on:
        https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/master/Chapter11/04_cartpole_pg.py

    Note:
        Currently only supports CPU and single GPU training with `distributed_backend=dp`
    """
    super().__init__()

    # Hyperparameters
    self.lr = lr
    self.batch_size = batch_size * num_envs
    self.batches_per_epoch = self.batch_size * epoch_len
    self.entropy_beta = entropy_beta
    self.gamma = gamma
    self.n_steps = n_steps

    self.save_hyperparameters()

    # Model components
    self.env = [gym.make(env) for _ in range(num_envs)]
    self.net = MLP(self.env[0].observation_space.shape, self.env[0].action_space.n)
    self.agent = PolicyAgent(self.net)
    self.exp_source = DiscountedExperienceSource(self.env, self.agent, gamma=gamma, n_steps=self.n_steps)

    # Tracking metrics
    self.total_steps = 0
    self.total_rewards = [0]
    self.done_episodes = 0
    self.avg_rewards = 0
    self.reward_sum = 0.0
    self.baseline = 0
    self.avg_reward_len = avg_reward_len
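# A minimal sketch of a vanilla policy gradient loss with a reward baseline and an entropy
# bonus, matching the baseline and entropy_beta attributes above. Names and the exact form
# of the loss are illustrative; the pl_bolts implementation may differ.
import torch
from torch import nn
from torch.nn import functional as F


def vpg_loss(net: nn.Module, states: torch.Tensor, actions: torch.Tensor,
             rewards: torch.Tensor, baseline: float, entropy_beta: float) -> torch.Tensor:
    """Policy gradient loss scaled by (reward - baseline), minus an entropy bonus."""
    logits = net(states)
    log_probs = F.log_softmax(logits, dim=-1)
    probs = F.softmax(logits, dim=-1)

    # advantage-weighted log-probabilities of the actions actually taken
    advantages = rewards - baseline
    policy_loss = -(advantages * log_probs[range(len(actions)), actions]).mean()

    # entropy bonus encourages exploration; subtracting it lowers the total loss
    entropy = -(probs * log_probs).sum(dim=-1).mean()
    return policy_loss - entropy_beta * entropy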
def __init__(self, env: str, gamma: float = 0.99, lr: float = 1e-4, batch_size: int = 32,
             entropy_beta: float = 0.01, batch_episodes: int = 4, *args, **kwargs) -> None:
    """
    PyTorch Lightning implementation of `Vanilla Policy Gradient
    <https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf>`_

    Paper authors: Richard S. Sutton, David McAllester, Satinder Singh, Yishay Mansour

    Model implemented by:

        - `Donal Byrne <https://github.com/djbyrne>`

    Example:
        >>> from pl_bolts.models.rl.vanilla_policy_gradient.model import PolicyGradient
        ...
        >>> model = PolicyGradient("PongNoFrameskip-v4")

    Train::

        trainer = Trainer()
        trainer.fit(model)

    Args:
        env: gym environment tag
        gamma: discount factor
        lr: learning rate
        batch_size: size of minibatch pulled from the DataLoader
        batch_episodes: how many episodes to rollout for each batch of training
        entropy_beta: dictates the level of entropy per batch
    """
    super().__init__()

    # self.env = wrappers.make_env(self.hparams.env)  # use for Atari
    self.env = ToTensor(gym.make(env))  # use for Box2D/Control
    self.env.seed(123)

    self.obs_shape = self.env.observation_space.shape
    self.n_actions = self.env.action_space.n

    self.net = None
    self.build_networks()

    self.agent = PolicyAgent(self.net)

    self.gamma = gamma
    self.lr = lr
    self.batch_size = batch_size
    self.batch_episodes = batch_episodes

    self.total_reward = 0
    self.episode_reward = 0
    self.episode_count = 0
    self.episode_steps = 0
    self.total_episode_steps = 0
    self.entropy_beta = entropy_beta

    self.reward_list = []
    for _ in range(100):
        self.reward_list.append(0)
    self.avg_reward = 0
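# reward_list and avg_reward above track a running average over the last 100 episodes.
# A minimal sketch of how such a rolling average could be maintained; the helper class is
# illustrative, not the actual update code used by this module.
from collections import deque


class RollingReward:
    """Track the mean episode reward over the most recent `window` episodes."""

    def __init__(self, window: int = 100):
        # pre-fill with zeros, mirroring the zero-initialised reward_list above
        self.rewards = deque([0.0] * window, maxlen=window)

    def update(self, episode_reward: float) -> float:
        self.rewards.append(episode_reward)  # the oldest entry is dropped automatically
        return sum(self.rewards) / len(self.rewards)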