Ejemplo n.º 1
0
def train(env, args, writer):
    """Train two DQN agents, one per player, on a two-player environment.

    Each player owns an online/target network pair, a replay buffer, n-step
    deques and an Adam optimizer. Both players act on every frame and are
    trained on the same schedule.

    Args:
        env: Environment whose ``reset()`` returns ``(p1_state, p2_state)`` and
            whose ``step({"1": a1, "2": a2})`` returns
            ``((p1_next_state, p2_next_state), reward, done, info)``;
            ``reward[0]`` / ``reward[1]`` are read as the players' rewards.
        args: Hyper-parameter namespace (``device``, ``noisy``, ``load_model``,
            ``eps_*``, ``beta_*``, ``multi_step``, ``gamma``, ``negative``, ...).
        writer: Scalar logger exposing ``add_scalar(tag, value, step)``.
    """
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    if args.noisy:
        p1_current_model.update_noisy_modules()
        p1_target_model.update_noisy_modules()
        p2_current_model.update_noisy_modules()
        p2_target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(p1_current_model, args, 1)
        load_model(p2_current_model, args, 2)
        # The target copies made above predate the checkpoint load; re-sync so
        # the targets do not keep the discarded random initialisation.
        update_target(p1_current_model, p1_target_model)
        update_target(p2_current_model, p2_target_model)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        p1_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        p2_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        p1_replay_buffer = ReplayBuffer(args.buffer_size)
        p2_replay_buffer = ReplayBuffer(args.buffer_size)

    # Sliding windows holding the last `multi_step` transitions of each player.
    p1_state_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)
    step_deques = (
        p1_state_deque, p2_state_deque,
        p1_reward_deque, p2_reward_deque,
        p1_action_deque, p2_action_deque,
    )

    p1_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    # Statistics accumulated between two evaluation/log points.
    length_list = []
    p1_reward_list, p1_loss_list = [], []
    p2_reward_list, p2_loss_list = [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            p1_current_model.sample_noise()
            p1_target_model.sample_noise()
            p2_current_model.sample_noise()
            p2_target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), epsilon)
        p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), epsilon)

        if args.render:
            env.render()

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)

        # With `args.negative`, every step costs each player one reward unit.
        if args.negative:
            p1_step_reward = reward[0] - 1
            p2_step_reward = reward[1] - 1
        else:
            p1_step_reward = reward[0]
            p2_step_reward = reward[1]

        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)
        p1_reward_deque.append(p1_step_reward)
        p1_action_deque.append(p1_action)
        p2_reward_deque.append(p2_step_reward)
        p2_action_deque.append(p2_action)

        # Store a transition once the window is full, or early when the
        # episode terminates before `multi_step` frames were collected.
        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state, np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state, np.float32(done))

        (p1_state, p2_state) = (p1_next_state, p2_next_state)
        p1_episode_reward += reward[0]
        p2_episode_reward += reward[1]
        if args.negative:
            p1_episode_reward -= 1
            p2_episode_reward -= 1
        episode_length += 1

        # Episodes end on `done` or when they exceed the configured length cap.
        if done or episode_length > args.max_episode_length:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/p1_episode_reward", p1_episode_reward, frame_idx)
            writer.add_scalar("data/p2_episode_reward", p2_episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            p1_episode_reward, p2_episode_reward, episode_length = 0, 0, 0
            # n-step windows must not span an episode boundary.
            for window in step_deques:
                window.clear()

        if len(p1_replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(p1_current_model, p1_target_model, p1_replay_buffer, p1_optimizer, args, beta)
            p1_loss_list.append(loss.item())
            writer.add_scalar("data/p1_loss", loss.item(), frame_idx)

            loss = compute_td_loss(p2_current_model, p2_target_model, p2_replay_buffer, p2_optimizer, args, beta)
            p2_loss_list.append(loss.item())
            writer.add_scalar("data/p2_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, p1_reward_list, length_list, p1_loss_list)
            print_log(frame_idx, prev_frame, prev_time, p2_reward_list, length_list, p2_loss_list)
            for stats in (p1_reward_list, p2_reward_list, length_list,
                          p1_loss_list, p2_loss_list):
                stats.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(p1_current_model, args, 1)
            save_model(p2_current_model, args, 2)

    save_model(p1_current_model, args, 1)
    save_model(p2_current_model, args, 2)
Ejemplo n.º 2
0
class Agent:
  """Categorical (distributional) DQN agent with an online and a target network.

  The online network selects the greedy next action and the target network
  evaluates it; the loss is the cross-entropy between the projected target
  distribution and the online distribution of the taken action.

  NOTE(review): the annotated fields together with ``__post_init__`` suggest
  this class is meant to be a ``@dataclass``; no decorator is visible on this
  definition, so confirm one is applied before constructing instances.
  """
  state: int  # forwarded to DQN as its first constructor argument
  actions: int  # number of discrete actions
  history: int = 4  # forwarded to DQN
  atoms: int = 5  # number of points in the value support (original note: 51)
  Vmin: float = -10  # lowest value of the support
  Vmax: float = 10  # highest value of the support

  lr: float = 1e-5  # Adam learning rate
  batch_size: int = 32  # transitions sampled per learning step
  discount: float = 0.99  # factor applied to the support in the Bellman update
  norm_clip: float = 10.  # max gradient norm

  def __post_init__(self):
    """Build the value support, both networks and the optimiser."""
    self.support = torch.linspace(self.Vmin, self.Vmax, self.atoms)
    self.delta_z = (self.Vmax - self.Vmin) / (self.atoms - 1)

    self.online_net = DQN(self.state, self.actions, self.history, self.atoms)
    self.online_net.train()

    self.target_net = DQN(self.state, self.actions, self.history, self.atoms)
    self.update_target_net()
    self.target_net.train()
    # The target network is only ever updated by copying the online weights.
    for param in self.target_net.parameters(): param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=self.lr)

  def act(self, state):
    """Return the greedy action index for a single (unbatched) state."""
    state = torch.FloatTensor(state).unsqueeze(0)
    with torch.no_grad():
      # Expected value per action = sum over atoms of probability * support.
      return (self.online_net(state) * self.support).sum(2).argmax(1).item()

  def act_e_greedy(self, state, epsilon=0.001):
    """Return a uniformly random action with probability ``epsilon``, else the greedy one."""
    return random.randrange(self.actions) if random.random() < epsilon else self.act(state)

  def learn(self, buffer):
    """Run one optimisation step on a batch from ``buffer`` and refresh its priorities.

    ``buffer.sample(batch_size)`` must return
    ``(state, action, reward, next_state, terminal, weights, idx)`` and
    ``buffer.update_priorities(idx, priorities)`` must accept the per-sample losses.
    """
    state, action, reward, next_state, terminal, weights, idx = buffer.sample(self.batch_size)
    state = torch.FloatTensor(state)
    action = torch.LongTensor(action)
    reward = torch.FloatTensor(reward)
    next_state = torch.FloatTensor(next_state)
    terminal = torch.FloatTensor(terminal)
    weights = torch.FloatTensor(weights)

    log_ps = self.online_net(state, log=True)
    log_ps_a = log_ps[range(self.batch_size), action]

    with torch.no_grad():
      # Calculate nth next state probabilities
      pns = self.online_net(next_state)
      dns = self.support.expand_as(pns) * pns
      argmax_indices_ns = dns.sum(2).argmax(1)  # greedy action chosen by the online net
      self.target_net.sample_noise()
      pns = self.target_net(next_state)
      pns_a = pns[range(self.batch_size), argmax_indices_ns]  # ...evaluated by the target net

      # Compute Bellman operator T applied to z
      Tz = reward.unsqueeze(1) + (1 - terminal).unsqueeze(1) * self.discount * self.support.unsqueeze(0)
      Tz.clamp_(min=self.Vmin, max=self.Vmax)

      # Compute L2 projection of Tz onto fixed support z
      b = (Tz - self.Vmin) / self.delta_z  # fractional atom index in [0, atoms - 1]
      l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
      # Fix disappearing probability mass when l = b = u (b is int)
      l[(u > 0) * (l == u)] -= 1
      u[(l < (self.atoms - 1)) * (l == u)] += 1

      # Distribute probability of Tz
      m = state.new_zeros(self.batch_size, self.atoms)
      # Per-row offsets so the flattened (batch * atoms) view can be indexed.
      offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand(self.batch_size, self.atoms).to(action)
      m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
      m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

    loss = -torch.sum(m * log_ps_a, 1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
    loss = weights * loss

    self.optimiser.zero_grad()
    loss.mean().backward()
    # Clip BEFORE stepping: clipping after the step cannot affect the update.
    nn.utils.clip_grad_norm_(self.online_net.parameters(), self.norm_clip)
    self.optimiser.step()

    # Priorities are plain numbers; detach so no autograd graph is involved.
    buffer.update_priorities(idx, loss.detach().tolist())

  def update_target_net(self):
    """Copy the online network's weights into the target network."""
    self.target_net.load_state_dict(self.online_net.state_dict())

  def sample_noise(self):
    """Resample the online network's noise."""
    self.online_net.sample_noise()

  def save(self, path):
    """Save the online network's state dict to ``path``."""
    torch.save(self.online_net.state_dict(), path)

  # Evaluates Q-value based on single state (no batch)
  def evaluate_q(self, state):
    """Return the highest expected action value for a single state tensor."""
    with torch.no_grad():
      # The network outputs one distribution per action, so reduce over the
      # support first (as `act` does) to obtain one scalar value per action.
      return (self.online_net(state.unsqueeze(0)) * self.support).sum(2).max(1)[0].item()

  def train(self):
    """Put the online network in training mode."""
    self.online_net.train()

  def eval(self):
    """Put the online network in evaluation mode."""
    self.online_net.eval()
Ejemplo n.º 3
0
def train(env, args):
    """Train a single DQN agent on ``env`` and log progress to Weights & Biases.

    Args:
        env: Environment with ``reset()``, ``render()`` and
            ``step(action) -> (next_state, reward, done, info)``.
        args: Hyper-parameter namespace (``device``, ``noisy``, ``load_model``,
            ``eps_*``, ``beta_*``, ``multi_step``, ``gamma``, ...); also passed
            to ``wandb.init`` as the run config.
    """
    # Init WandB
    wandb.init(config=args)

    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    # Start with the target equal to the online network (freshly initialised
    # or loaded); otherwise the first updates bootstrap from an unrelated
    # random network until the first periodic sync.
    update_target(current_model, target_model)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    # Sliding windows over the last `multi_step` transitions.
    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    # Statistics accumulated between two evaluation/log points.
    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, _ = env.step(action)
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        # Store a transition once the window is full, or early when the
        # episode ends before `multi_step` frames were collected.
        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            wandb.log({
                'episode_reward': episode_reward,
                'episode_length': episode_length,
            })
            episode_reward, episode_length = 0, 0
            # n-step windows must not span an episode boundary.
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if (len(replay_buffer) > args.learning_start
                and frame_idx % args.train_freq == 0):
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            wandb.log({'loss': loss.item()})

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list)
            reward_list.clear()
            length_list.clear()
            loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
Ejemplo n.º 4
0
def train(env, args, writer):
    """Train one shared DQN on a multi-agent environment.

    All agents act through the same model and push their n-step transitions
    into one replay buffer. Agents whose ``done`` flag is set are removed
    from the per-agent bookkeeping until the episode ends.

    Args:
        env: Environment whose ``step(action, save_screenshots=False)`` returns
            ``(next_state, reward, done, end)``; ``reward`` and ``done`` are
            indexed per agent and ``end`` marks the end of the episode.
        args: Hyper-parameter namespace (``device``, ``noisy``, ``load_model``,
            ``num_agents``, ``action_repeat``, ``multi_step``, ``gamma``, ...).
        writer: Scalar logger exposing ``add_scalar(tag, value, step)``.
    """
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model:  # and os.path.isfile(args.load_model)
        load_model(current_model, args)
        load_model(target_model, args)

    # Without a checkpoint the two networks are initialised independently;
    # sync once so the first updates bootstrap from the online weights.
    update_target(current_model, target_model)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    def new_agent_deques():
        # One n-step window per agent.
        return [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]

    state_buffer = deque(maxlen=args.action_repeat)
    states_deque = new_agent_deques()
    rewards_deque = new_agent_deques()
    actions_deque = new_agent_deques()

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    # Statistics accumulated between two evaluation/log points.
    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state, state_buffer = get_initial_state(env, state_buffer,
                                            args.action_repeat)
    for frame_idx in range(1, args.max_frames + 1):

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, end = env.step(action,
                                                 save_screenshots=False)
        add_state(next_state, state_buffer)
        next_state = recent_state(state_buffer)

        for agent_index in range(len(done)):
            states_deque[agent_index].append((state[agent_index]))
            rewards_deque[agent_index].append(reward[agent_index])
            actions_deque[agent_index].append(action[agent_index])
            # Store a transition once this agent's window is full, or early
            # when the agent finishes before `multi_step` frames were seen.
            if (len(states_deque[agent_index]) == args.multi_step
                    or done[agent_index]):
                n_reward = multi_step_reward(rewards_deque[agent_index],
                                             args.gamma)
                n_state = states_deque[agent_index][0]
                n_action = actions_deque[agent_index][0]
                replay_buffer.push(n_state, n_action, n_reward,
                                   next_state[agent_index],
                                   np.float32(done[agent_index]))

        # delete the agents that have reached the goal
        # `live_index` is the agent's position after earlier removals in this
        # pass, so it only advances past agents that are kept.
        live_index = 0
        for agent_done in done:
            # Truthiness instead of `is True`: an identity test silently
            # fails for NumPy booleans.
            if agent_done:
                state_buffer, states_deque, actions_deque, rewards_deque = del_record(
                    live_index, state_buffer, states_deque, actions_deque,
                    rewards_deque)
            else:
                live_index += 1
        next_state = recent_state(state_buffer)

        state = next_state
        episode_reward += np.array(reward).mean()
        episode_length += 1

        if end:
            if args.save_video and episode % 10 == 0:
                evaluate(env, current_model, args)
            state, state_buffer = get_initial_state(env, state_buffer,
                                                    args.action_repeat)
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0, 0
            # Rebuild the per-agent windows for the full set of agents; the
            # old (possibly shrunk) lists are simply dropped.
            states_deque = new_agent_deques()
            rewards_deque = new_agent_deques()
            actions_deque = new_agent_deques()
            episode += 1

        if (len(replay_buffer) > args.learning_start
                and frame_idx % args.train_freq == 0):
            beta = beta_by_frame(frame_idx)
            losses = 0
            for _ in range(1):
                loss = compute_td_loss(current_model, target_model,
                                       replay_buffer, optimizer, args, beta)
                losses += loss.item()
            loss_list.append(losses)
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list)
            reward_list.clear()
            length_list.clear()
            loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
Ejemplo n.º 5
0
def train(env, args, writer):
    """Train a single DQN agent with either Adam or LaProp.

    Args:
        env: Environment with ``reset()``, ``render()``, ``action_space.n`` and
            ``step(action) -> (next_state, reward, done, info)``.
        args: Hyper-parameter namespace (``device``, ``noisy``, ``load_model``,
            ``optim``, ``lr``, ``adam_eps``, ``beta2``, ``clip_rewards``, ...).
            ``args.buffer_size`` is overwritten with the prioritized buffer's
            ``it_capacity`` when prioritized replay is enabled.
        writer: Scalar logger exposing ``add_scalar(tag, value, step)``.

    Raises:
        ValueError: If ``args.optim`` is neither ``'adam'`` nor ``'laprop'``.
    """
    # Fail fast: an unknown optimizer name would otherwise leave `optimizer`
    # unbound and only surface as a NameError at the first training step.
    if args.optim not in ('adam', 'laprop'):
        raise ValueError(
            "unsupported optimizer {!r}; expected 'adam' or 'laprop'".format(
                args.optim))

    # Episodes reaching this many agent steps are finished with random actions.
    forced_end_length = 9950
    # Extra, uniquely named checkpoint every this many frames.
    snapshot_interval = 200000

    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    # The target network is only updated through `update_target`.
    for para in target_model.parameters():
        para.requires_grad = False

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()
    #target_model.eval()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    update_target(current_model, target_model)
    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        args.buffer_size = replay_buffer.it_capacity
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    print_args(args)
    # Sliding windows over the last `multi_step` transitions.
    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    if args.optim == 'adam':
        optimizer = optim.Adam(current_model.parameters(),
                               lr=args.lr,
                               eps=args.adam_eps,
                               betas=(0.9, args.beta2))
    else:  # 'laprop' (validated above)
        optimizer = laprop.LaProp(current_model.parameters(),
                                  lr=args.lr,
                                  betas=(0.9, args.beta2))

    # Statistics accumulated between two evaluation/log points.
    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0.
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    evaluation_interval = args.evaluation_interval
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, raw_reward, done, _ = env.step(action)
        # Training may use clipped rewards; episode statistics use raw ones.
        if args.clip_rewards:
            reward = np.clip(raw_reward, -1., 1.)
        else:
            reward = raw_reward
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        # Store a transition once the window is full, or early when the
        # episode ends before `multi_step` frames were collected.
        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += raw_reward
        episode_length += 1

        # Over-long episodes are played out with random actions; those steps
        # are neither stored nor counted in the episode statistics.
        if episode_length >= forced_end_length:
            while not done:
                _, _, done, _ = env.step(random.randrange(env.action_space.n))

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            # NOTE(review): episodes are forced to end once episode_length
            # reaches 9950, so this condition looks unreachable -- confirm
            # the intended threshold.
            if episode_length > 10000:
                print('{:.2f}'.format(episode_reward), end='')
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0., 0
            # n-step windows must not span an episode boundary.
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if (len(replay_buffer) > args.learning_start
                and frame_idx % args.train_freq == 0):
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % evaluation_interval == 0:
            if length_list:
                print_log(frame_idx, prev_frame, prev_time, reward_list,
                          length_list, loss_list, args)
                reward_list.clear()
                length_list.clear()
                loss_list.clear()
                prev_frame = frame_idx
                prev_time = time.time()
                save_model(current_model, args)
            else:
                # No episode finished in this window: widen the interval
                # instead of logging empty statistics.
                evaluation_interval += args.evaluation_interval
        if frame_idx % snapshot_interval == 0:
            if args.adam_eps == 1.5e-4:
                save_model(current_model,
                           args,
                           name="{}_{}".format(args.optim, frame_idx))
            else:
                save_model(current_model,
                           args,
                           name="{}{:.2e}_{}".format(args.optim, args.adam_eps,
                                                     frame_idx))

    # Report the (possibly unfinished) last episode together with the rest.
    reward_list.append(episode_reward)
    length_list.append(episode_length)
    print_log(frame_idx, prev_frame, prev_time, reward_list, length_list,
              loss_list, args)
    reward_list.clear()
    length_list.clear()
    loss_list.clear()
    prev_frame = frame_idx
    prev_time = time.time()

    save_model(current_model, args)