Example #1
    def play_steps(self, env: Environment, n_steps: int,
                   storage: Storage) -> Storage:
        """Method for performing a number of steps in the environment. Appends the
        new states to the existing storage.

        Args:
            env: Environment
            n_steps: Number of steps to play
            storage: Storage (Memory, History) of the earlier games (used to perform the first action)

        Returns:
            Storage with the appended states, actions, rewards, etc.
        """
        state = storage.get_last_state()
        for i in range(n_steps):
            action = self.act(state)
            state, reward, done, _ = env.step(action)
            self._actual_reward_count += reward
            self._actual_episode_length += 1
            self.step_count += 1
            agent_logger.add("agent_step", self.step_count)
            storage.update(action, reward, done, state)
            # TODO step callback
            if done:
                self.episode_count += 1
                agent_logger.add("agent_episode", self.episode_count)
                # TODO episode callback
                agent_logger.add("episode_total_reward",
                                 self._actual_reward_count)
                agent_logger.add("episode_length", self._actual_episode_length)
                state = env.reset()
                self._actual_reward_count = 0
                self._actual_episode_length = 0
                storage.new_state_update(state)
        return storage
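For comparison, a minimal standalone sketch of the same step-collection pattern using only the classic Gym step API (observation, reward, done, info); the collect_steps helper and its transition list are illustrative stand-ins for prl's Storage, not part of the library.

import gym


def collect_steps(env, policy_fn, n_steps):
    """Roll out n_steps transitions, resetting the environment whenever an episode ends."""
    transitions = []
    state = env.reset()
    episode_reward, episode_length = 0.0, 0
    for _ in range(n_steps):
        action = policy_fn(state)
        next_state, reward, done, _ = env.step(action)
        transitions.append((state, action, reward, done, next_state))
        episode_reward += reward
        episode_length += 1
        if done:
            print("episode finished: reward=%.1f, length=%d" % (episode_reward, episode_length))
            next_state = env.reset()
            episode_reward, episode_length = 0.0, 0
        state = next_state
    return transitions


env = gym.make("CartPole-v0")
batch = collect_steps(env, policy_fn=lambda s: env.action_space.sample(), n_steps=200)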
Example #2
    def play_episodes(self, env: Environment, episodes: int) -> History:
        """Method for playing full episodes used usually to train agents.

        Args:
            env: Environment
            episodes: Number of episodes to play.
        Returns:
            History object representing the history of the played episodes
        """
        history_list = []
        for i in range(episodes):
            state = env.reset()
            history: History = History(state, np.int32,
                                       env.initial_history_length)
            while True:
                action = self.act(state)
                state, reward, done, _ = env.step(action)
                self.step_count += 1
                agent_logger.add("agent_step", self.step_count)
                history.update(action, reward, done, state)
                # TODO step callback
                if done:
                    self.episode_count += 1
                    agent_logger.add("agent_episode", self.episode_count)
                    history_list.append(history)
                    agent_logger.add("episode_total_reward",
                                     history.get_total_rewards()[-1])
                    agent_logger.add("episode_length", len(history))
                    # TODO episode callback
                    break
        return reduce(iadd, history_list)
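The final reduce(iadd, history_list) line concatenates the per-episode histories into a single History. A toy sketch of that pattern with a stand-in container (EpisodeLog is hypothetical, not prl's History class):

from functools import reduce
from operator import iadd


class EpisodeLog:
    """Stand-in for prl's History: a container that supports in-place concatenation."""

    def __init__(self, rewards):
        self.rewards = list(rewards)

    def __iadd__(self, other):
        self.rewards.extend(other.rewards)
        return self

    def __len__(self):
        return len(self.rewards)


episodes = [EpisodeLog([1.0, 1.0]), EpisodeLog([1.0]), EpisodeLog([1.0, 1.0, 1.0])]
merged = reduce(iadd, episodes)  # same pattern as `reduce(iadd, history_list)` above
print(len(merged))               # 6 transitions collected from 3 episodes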
Example #3
                    help="count of train iterations",
                    type=int,
                    const=100,
                    nargs="?")
args = parser.parse_args()

gym_env = gym.make("CartPole-v0")

obs_shape = gym_env.observation_space.shape

state_transformer = StateShiftTransformer(np.zeros(obs_shape) - 0.1)
reward_transformer = RewardShiftTransformer(-0.5)

env = Environment(
    gym_env,
    state_transformer=state_transformer,
    reward_transformer=reward_transformer,
    expected_episode_length=128,
)

y_size = env.action_space.n

net = PytorchMLP(
    x_shape=env.observation_space.shape,
    y_size=y_size,
    output_activation=nn.Softmax(dim=1),
    hidden_sizes=[64, 64],
)

optimizer = optim.Adam(params=net.parameters(), lr=0.01)
loss = PolicyGradientLoss()
policy_network = PytorchFA(net=net, loss=loss, optimizer=optimizer)
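What the softmax head built above amounts to: the network outputs action probabilities that can be sampled, and their log-probabilities feed a policy-gradient loss. A plain-PyTorch sketch with assumed CartPole sizes (obs_dim=4, n_actions=2), not prl's PytorchMLP/PytorchFA:

import torch
import torch.nn as nn
from torch.distributions import Categorical

# Assumed CartPole sizes; the real example reads them from the environment.
obs_dim, n_actions = 4, 2
policy = nn.Sequential(
    nn.Linear(obs_dim, 64), nn.ReLU(),
    nn.Linear(64, 64), nn.ReLU(),
    nn.Linear(64, n_actions), nn.Softmax(dim=1),
)

state = torch.randn(1, obs_dim)   # a batch containing a single observation
probs = policy(state)             # action probabilities, shape (1, n_actions)
dist = Categorical(probs=probs)
action = dist.sample()            # the action passed to env.step(...)
log_prob = dist.log_prob(action)  # the term a policy-gradient loss weights by the return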
Example #4
    def pre_train_setup(self, env: Environment, **kwargs):
        self.action_space = env.action_space
        state = env.reset()
        self.replay_buffer = Memory(state, np.int32, self.replay_buffer_size)
        # To ensure that we have the next state after doing the first step.
        self.play_steps(env, n_steps=1, storage=self.replay_buffer)
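A rough standalone sketch of the same warm-up idea, with a plain deque as a stand-in replay buffer (the Memory class above is prl's own; the deque, the tuple layout, and the sampling call here are assumptions for illustration):

import random
from collections import deque

import gym

env = gym.make("CartPole-v0")
state = env.reset()

# Fixed-size ring buffer of (state, action, reward, done, next_state) transitions.
replay_buffer = deque(maxlen=10000)

# Warm-up: take one step so the buffer already contains a full transition before
# training starts, mirroring the play_steps(env, n_steps=1, ...) call above.
action = env.action_space.sample()
next_state, reward, done, _ = env.step(action)
replay_buffer.append((state, action, reward, done, next_state))

batch = random.sample(list(replay_buffer), k=min(32, len(replay_buffer)))  # later: training minibatches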
Example #5
from prl.function_approximators.pytorch_nn import PytorchConv
from prl.transformers.state_transformers import PongTransformer
from prl.utils import time_logger
from prl.callbacks import ValidationLogger

parser = argparse.ArgumentParser()
parser.add_argument("--n_iterations",
                    help="count of train iterations",
                    type=int,
                    const=5,
                    nargs="?")
args = parser.parse_args()

gym_env = gym.make("Pong-v0")
env = Environment(gym_env,
                  expected_episode_length=1024,
                  state_transformer=PongTransformer())

test_gym_env = gym.make("Pong-v0")
test_env = Environment(test_gym_env,
                       expected_episode_length=1024,
                       state_transformer=PongTransformer())
obs_shape = gym_env.observation_space.shape
y_size = env.action_space.n

net = PytorchConv(x_shape=env.observation_space.shape,
                  hidden_sizes=[16, 16, 16],
                  y_size=y_size)

optimizer = optim.Adam(params=net.parameters(), lr=0.01)
loss = PolicyGradientLoss()
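For intuition about what a Pong state transformer typically does (crop, downsample, strip the background) before frames reach the convolutional network, here is an illustrative preprocessing function; it is not PongTransformer's actual implementation:

import numpy as np


def preprocess_pong_frame(frame):
    """Illustrative Pong preprocessing (crop, downsample, erase background); not PongTransformer's code."""
    frame = frame[35:195]                    # crop the scoreboard and the bottom border
    frame = frame[::2, ::2, 0]               # downsample by a factor of 2, keep one colour channel
    frame = (frame != 144) & (frame != 109)  # erase the two background colours
    return frame.astype(np.float32)          # 80x80 float mask, ready for a conv net


dummy_frame = np.zeros((210, 160, 3), dtype=np.uint8)  # raw Pong-v0 observations are 210x160x3
print(preprocess_pong_frame(dummy_frame).shape)        # (80, 80)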
Example #6
from prl.environments.environments import Environment
from prl.function_approximators import PytorchFA
from prl.function_approximators.pytorch_nn import DQNLoss
from prl.function_approximators.pytorch_nn import PytorchMLP
from prl.utils import time_logger

parser = argparse.ArgumentParser()
parser.add_argument("--n_iterations",
                    help="count of train iterations",
                    type=int,
                    const=3000,
                    nargs="?")
args = parser.parse_args()

gym_env = gym.make("CartPole-v0")
env = Environment(gym_env, expected_episode_length=128)

y_size = env.action_space.n

net = PytorchMLP(
    x_shape=env.observation_space.shape,
    y_size=y_size,
    output_activation=lambda x: x,
    hidden_sizes=[64],
)

optimizer = optim.Adam(params=net.parameters(), lr=0.01)
loss = DQNLoss(mode="mse")  # MSE works better than Huber loss on CartPole.

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
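For intuition about what an MSE-mode DQN loss optimizes, here is an illustrative plain-PyTorch sketch of the one-step Q-learning target; the dqn_mse_loss helper, the dummy q_net, and the tensor shapes are assumptions for illustration, not prl's DQNLoss:

import torch
import torch.nn.functional as F


def dqn_mse_loss(q_net, states, actions, rewards, dones, next_states, gamma=0.99):
    """One-step Q-learning target with an MSE penalty (illustrative, not prl's DQNLoss)."""
    q_values = q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)  # Q(s, a) for the taken actions
    with torch.no_grad():
        next_q = q_net(next_states).max(dim=1).values                    # max_a' Q(s', a')
        targets = rewards + gamma * next_q * (1.0 - dones)               # bootstrap unless terminal
    return F.mse_loss(q_values, targets)


# Dummy batch with assumed CartPole shapes, just to show the expected tensor layout.
q_net = torch.nn.Sequential(torch.nn.Linear(4, 64), torch.nn.ReLU(), torch.nn.Linear(64, 2))
states, next_states = torch.randn(8, 4), torch.randn(8, 4)
actions = torch.randint(0, 2, (8,))
rewards, dones = torch.randn(8), torch.zeros(8)
print(dqn_mse_loss(q_net, states, actions, rewards, dones, next_states))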