def play_steps(self, env: Environment, n_steps: int, storage: Storage) -> Storage:
    """Performs a given number of steps in the environment and appends the new
    states to the existing storage.

    Args:
        env: Environment
        n_steps: Number of steps to play
        storage: Storage (Memory, History) of the earlier games (used to perform the first action)

    Returns:
        Storage with the appended states, actions, rewards, etc.
    """
    state = storage.get_last_state()
    for i in range(n_steps):
        action = self.act(state)
        state, reward, done, _ = env.step(action)
        self._actual_reward_count += reward
        self._actual_episode_length += 1
        self.step_count += 1
        agent_logger.add("agent_step", self.step_count)
        storage.update(action, reward, done, state)
        # TODO step callback
        if done:
            self.episode_count += 1
            agent_logger.add("agent_episode", self.episode_count)
            # TODO episode callback
            agent_logger.add("episode_total_reward", self._actual_reward_count)
            agent_logger.add("episode_length", self._actual_episode_length)
            state = env.reset()
            self._actual_reward_count = 0
            self._actual_episode_length = 0
            storage.new_state_update(state)
    return storage
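For illustration, a minimal sketch of how play_steps can be driven from outside the agent. The agent instance, buffer size, and step counts are assumptions; the Memory constructor call mirrors the one in pre_train_setup below.

memory = Memory(env.reset(), np.int32, 10000)  # assumed buffer size; Memory signature as in pre_train_setup
for _ in range(100):  # assumed number of collection rounds
    memory = agent.play_steps(env, n_steps=32, storage=memory)  # `agent` is an instance of this agent class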
def play_episodes(self, env: Environment, episodes: int) -> History:
    """Plays full episodes; usually used to train agents.

    Args:
        env: Environment
        episodes: Number of episodes to play.

    Returns:
        History object representing the episodes' history
    """
    history_list = []
    for i in range(episodes):
        state = env.reset()
        history: History = History(state, np.int32, env.initial_history_length)
        while True:
            action = self.act(state)
            state, reward, done, _ = env.step(action)
            self.step_count += 1
            agent_logger.add("agent_step", self.step_count)
            history.update(action, reward, done, state)
            # TODO step callback
            if done:
                self.episode_count += 1
                agent_logger.add("agent_episode", self.episode_count)
                history_list.append(history)
                agent_logger.add("episode_total_reward", history.get_total_rewards()[-1])
                agent_logger.add("episode_length", len(history))
                # TODO episode callback
                break
    return reduce(iadd, history_list)
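The concatenation at the end relies on reduce and iadd; a sketch of the module-level imports these two methods assume is below (the prl-internal symbols Environment, Storage, History, Memory, and agent_logger come from the library itself and their import paths are not shown in this excerpt).

from functools import reduce
from operator import iadd

import numpy as np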
help="count of train iterations", type=int, const=100, nargs="?") args = parser.parse_args() gym_env = gym.make("CartPole-v0") obs_shape = gym_env.observation_space.shape state_transformer = StateShiftTransformer(np.zeros(obs_shape) - 0.1) reward_transformer = RewardShiftTransformer(-0.5) env = Environment( gym_env, state_transformer=state_transformer, reward_transformer=reward_transformer, expected_episode_length=128, ) y_size = env.action_space.n net = PytorchMLP( x_shape=env.observation_space.shape, y_size=y_size, output_activation=nn.Softmax(dim=1), hidden_sizes=[64, 64], ) optimizer = optim.Adam(params=net.parameters(), lr=0.01) loss = PolicyGradientLoss() policy_network = PytorchFA(net=net, loss=loss, optimizer=optimizer)
def pre_train_setup(self, env: Environment, **kwargs):
    self.action_space = env.action_space
    state = env.reset()
    self.replay_buffer = Memory(state, np.int32, self.replay_buffer_size)
    # To ensure that we have the next state after doing the first step.
    self.play_steps(env, n_steps=1, storage=self.replay_buffer)
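pre_train_setup leaves the replay buffer holding a single transition. A minimal sketch of how a caller might warm the buffer up further before learning starts; the agent instance and the warm-up length are assumptions:

WARMUP_STEPS = 1000  # assumed warm-up length
agent.pre_train_setup(env)  # `agent` is an instance of this agent class
agent.play_steps(env, n_steps=WARMUP_STEPS, storage=agent.replay_buffer)  # keeps appending to the same Memory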
from prl.function_approximators.pytorch_nn import PytorchConv
from prl.transformers.state_transformers import PongTransformer
from prl.utils import time_logger
from prl.callbacks import ValidationLogger

parser = argparse.ArgumentParser()
parser.add_argument("--n_iterations", help="count of train iterations", type=int, const=5, nargs="?")
args = parser.parse_args()

gym_env = gym.make("Pong-v0")
env = Environment(gym_env, expected_episode_length=1024, state_transformer=PongTransformer())

test_gym_env = gym.make("Pong-v0")
test_env = Environment(test_gym_env, expected_episode_length=1024, state_transformer=PongTransformer())

obs_shape = gym_env.observation_space.shape
y_size = env.action_space.n

net = PytorchConv(x_shape=env.observation_space.shape, hidden_sizes=[16, 16, 16], y_size=y_size)

optimizer = optim.Adam(params=net.parameters(), lr=0.01)
loss = PolicyGradientLoss()
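The Pong setup stops at the loss; wrapping the network in a function approximator mirrors the CartPole script above (the PytorchFA call is copied from that example, while hooking up test_env and ValidationLogger depends on signatures not shown here and is left out):

policy_network = PytorchFA(net=net, loss=loss, optimizer=optimizer)  # same wrapper call as in the CartPole example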
from prl.environments.environments import Environment
from prl.function_approximators import PytorchFA
from prl.function_approximators.pytorch_nn import DQNLoss
from prl.function_approximators.pytorch_nn import PytorchMLP
from prl.utils import time_logger

parser = argparse.ArgumentParser()
parser.add_argument("--n_iterations", help="count of train iterations", type=int, const=3000, nargs="?")
args = parser.parse_args()

gym_env = gym.make("CartPole-v0")
env = Environment(gym_env, expected_episode_length=128)
y_size = env.action_space.n

net = PytorchMLP(
    x_shape=env.observation_space.shape,
    y_size=y_size,
    output_activation=lambda x: x,
    hidden_sizes=[64],
)

optimizer = optim.Adam(params=net.parameters(), lr=0.01)
loss = DQNLoss(mode="mse")  # MSE works better than Huber loss on CartPole.

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
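With the device selected, the remaining glue is moving the network onto it and wrapping it in the function approximator. net.to(device) is plain PyTorch; the wrapper call repeats the one from the earlier example, and the q_network variable name plus any device argument to PytorchFA are assumptions and are omitted:

net.to(device)  # standard PyTorch device placement
q_network = PytorchFA(net=net, loss=loss, optimizer=optimizer)  # variable name assumed; wrapper call as seen earlier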