Example #1
class Trainer:
    def __init__(self):
        # Prepare the environments
        self.envs = Envs()

        self.memory = ReplayBuffer()
        self.device = torch.device(settings.device)
        self.policy = Policy().to(self.device)
        self.policy_optim = Adam(self.policy.parameters(), lr=p.lr)

        self.critic = QNetwork().to(self.device)
        self.critic_target = QNetwork().to(self.device)

        self.critic_optim = Adam(self.critic.parameters(), lr=p.lr)
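        # tau=1.0 hard-copies the critic weights into the target critic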
        self.parameter_update(tau=1.0)

        if settings.mode == "test":
            self.policy.load_state_dict(
                torch.load("policy_seed_{}".format(settings.seed)))

        self.logger = Logger()

    def start(self):
        self.total_numsteps = 0

        if settings.mode == "train":
            self.add_random_steps()

            names = torch.FloatTensor(
                [i for i, _ in enumerate(settings.env_names)]).to(self.device)
            while self.total_numsteps < p.max_numsteps:
                self.run_test()
                leg_starts, states = self.envs.reset()
                for step in range(p._max_episode_steps):
                    self.total_numsteps += 1
                    actions = self.select_action(leg_starts, states, names)
                    next_states, rewards, dones = self.envs.step(actions)
                    self.memory.push(names, leg_starts, states, next_states,
                                     actions, rewards, dones)
                    states = self.envs.reset_dones(next_states, dones)

                    c1_loss, c2_loss, policy_loss = self.update_nets()

                    if (self.total_numsteps % 10) == 0:
                        self.logger.show_update(self.total_numsteps)

            torch.save(self.policy.state_dict(),
                       "policy_seed_{}".format(settings.seed))

        else:
            print("Seed: {}".format(settings.seed))
            self.run_test()

    def run_test(self):
        if settings.mode == "test":
            print("\nTesting current policy")
        leg_starts, states = self.envs.reset()
        done_filter = torch.FloatTensor(
            [1.0] * len(settings.env_names)).to(self.device)
        epsd_rewards = torch.FloatTensor(
            [0.0] * len(settings.env_names)).to(self.device)
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        for step in range(p._max_episode_steps):
            actions = self.select_action(leg_starts,
                                         states,
                                         names,
                                         evaluate=True)
            next_states, rewards, dones = self.envs.step(actions)
            epsd_rewards += done_filter * rewards
            done_filter *= (dones != 1).float()
            states = next_states

        self.logger.add_rewards(len(names), epsd_rewards, self.total_numsteps)
        self.logger.save()

    def add_random_steps(self):
        print("Adding random steps")
        leg_starts, states = self.envs.reset()
        names = torch.FloatTensor(
            [i for i, _ in enumerate(settings.env_names)]).to(self.device)
        while len(self.memory) <= p.batch_size * 10:
            actions = self.envs.sample_actions()
            next_states, rewards, dones = self.envs.step(actions)
            self.memory.push(names, leg_starts, states, next_states, actions,
                             rewards, dones)
            states = self.envs.reset_dones(next_states, dones)

    def select_action(self, leg_starts, states, names, evaluate=False):
        with torch.no_grad():
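            # Sample a stochastic action for training; when evaluate=True the
            # policy's third output (typically the deterministic/mean action in SAC)
            # is used instead.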

            if not evaluate:
                actions, _, _ = self.policy.sample(leg_starts, states, names)
            else:
                _, _, actions = self.policy.sample(leg_starts, states, names)

            return actions.cpu()

    def parameter_update(self, tau=p.tau):
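        # Soft (Polyak) update of the target critic:
        # target_param <- (1 - tau) * target_param + tau * param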
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def update_nets(self):
        (names_batch, leg_starts_batch, state_batch, action_batch, reward_batch,
         next_state_batch, mask_batch) = self.memory.sample()

        reward_batch = reward_batch.unsqueeze(1)
        mask_batch = mask_batch.unsqueeze(1)

        with torch.no_grad():
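            # Soft Q target: r + mask * gamma * (min(Q1', Q2') - alpha * log pi(a'|s'))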
            next_state_action, next_state_log_pi, _ = self.policy.sample(
                leg_starts_batch, next_state_batch, names_batch)
            qf1_next_target, qf2_next_target = self.critic_target(
                leg_starts_batch, next_state_batch, next_state_action,
                names_batch)
            min_qf_next_target = torch.min(
                qf1_next_target, qf2_next_target) - p.alpha * next_state_log_pi
            next_q_value = reward_batch + mask_batch * p.gamma * (
                min_qf_next_target)
        qf1, qf2 = self.critic(leg_starts_batch, state_batch, action_batch,
                               names_batch)

        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        qf_loss = qf1_loss + qf2_loss

        self.critic_optim.zero_grad()
        qf_loss.backward()
        self.critic_optim.step()

        pi, log_pi, _ = self.policy.sample(leg_starts_batch, state_batch,
                                           names_batch)
        qf1_pi, qf2_pi = self.critic(leg_starts_batch, state_batch, pi,
                                     names_batch)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
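        # SAC policy loss: E[alpha * log pi(a|s) - min(Q1(s, a), Q2(s, a))]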
        policy_loss = ((p.alpha * log_pi) - min_qf_pi).mean()

        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()

        self.parameter_update()

        return qf1_loss.item(), qf2_loss.item(), policy_loss.item()
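
# Usage sketch (not part of the original example): assuming the project-specific
# modules used above (settings, p, Envs, ReplayBuffer, Policy, QNetwork, Logger)
# and the torch imports (torch, torch.optim.Adam, torch.nn.functional as F) are
# available, the Trainer is driven like this:
if __name__ == "__main__":
    trainer = Trainer()
    trainer.start()  # train when settings.mode == "train", otherwise run a test rollout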
Example #2
# Load World
# ----------

LEFT = 5
RIGHT = 4
env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())
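# On Pong this prints ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE'],
# so RIGHT = 4 and LEFT = 5 above actually map to RIGHTFIRE and LEFTFIRE.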

# Load Agent
# ----------

from agent import Policy
import torch.optim as optim
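# Note: this excerpt omits the line that defines `device`; it is assumed to be
# something like torch.device("cuda:0" if torch.cuda.is_available() else "cpu").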

agent = Policy().to(device)
optimizer = optim.Adam(agent.parameters(), lr=1e-4)

# Load Parallel Environment
# -------------------------

from pong_utils import parallelEnv, preprocess_batch
envs = parallelEnv('PongDeterministic-v4', n=4, seed=12345)
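# Four copies of Pong running in parallel worker processes (envs.ps below holds
# the per-environment processes).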


#
def collect_trajectories(envs, agent, tmax=200, nrand=5):
    '''
    Collect trajectories from multiple agents of a parallelized environment
    '''
    n = len(envs.ps)
Example #3
    #    envs = SubprocVecEnv(envs)
    envs = ShmemVecEnv(envs)
    envs = VecToTensor(envs)

    date = datetime.now().strftime('%m_%d_%H_%M')
    mon_file_name = "./tmp/" + date
    envs = VecMonitor(envs, mon_file_name)

    train_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    step_policy = Policy(84, 84, 4, envs.action_space.n).to(device)
    step_policy.load_state_dict(train_policy.state_dict())
    step_policy.eval()

    runner = Runner(envs, step_policy, n_step, gamma)
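    # Runner gathers n-step rollouts from the envs using step_policy, a copy of
    # train_policy that is kept in eval mode.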

    optimizer = optim.RMSprop(train_policy.parameters(),
                              lr=lr,
                              alpha=alpha,
                              eps=epsilon)

    for i in tqdm(range(num_updates)):
        mb_obs, mb_rewards, mb_values, mb_actions = runner.run()

        action_logits, values = train_policy(mb_obs)

        mb_adv = mb_rewards - mb_values  # advantage = rollout returns minus the value baseline
        dist = Categorical(logits=action_logits)
        action_log_probs = dist.log_prob(mb_actions)
        pg_loss = torch.mean(-action_log_probs * mb_adv)

        vf_loss = F.mse_loss(values, mb_rewards)
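
        # The example is truncated here; a typical A2C update would continue
        # roughly as sketched below (not the original code -- the names vf_coef,
        # ent_coef and max_grad_norm are assumed hyperparameters):
        #
        #     entropy = dist.entropy().mean()
        #     loss = pg_loss + vf_coef * vf_loss - ent_coef * entropy
        #     optimizer.zero_grad()
        #     loss.backward()
        #     torch.nn.utils.clip_grad_norm_(train_policy.parameters(), max_grad_norm)
        #     optimizer.step()
        #     step_policy.load_state_dict(train_policy.state_dict())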
import torch.optim as optim
import torch
import gym
from agent import Policy
from collections import deque
import numpy as np

env = gym.make('CartPole-v1')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

n_episodes = 2000
max_t = 1000
gamma = 1.0
print_every = 100

scores_deque = deque(maxlen=100)  # rolling window of the last 100 episode returns
scores = []

for i_episode in range(1, n_episodes+1):
    saved_log_probs = []
    rewards = []
    state = env.reset()
    for t in range(max_t):
        action, log_prob = policy.act(state)
        saved_log_probs.append(log_prob)
        state, reward, done, _ = env.step(action)
        rewards.append(reward)
        if done: