Example #1
def main():
    num_digits = 4
    state_size = 128
    embedding_dim = 8

    model = StateModel(num_digits, state_size, embedding_dim)
    
    env = gym.make("GuessNumEnv-v0")
    
    episodes = 100
    max_episode_len = 100

    replay_memory = ReplayMemory(1000)

    for ep in range(episodes):
        state, reward, done = env.reset()

        state = torch.from_numpy(state)
        action = torch.argmax(model((state[:, :-2].unsqueeze(0).long(), state[:, -2:].unsqueeze(0).float())), dim=-1) + 1 # Plus one because the action is composed of the numbers between 1 and 9
        
        next_state, reward, done = env.step(action.numpy().reshape(-1,))
        t = Transition(state=state, next_state=next_state, reward=reward, action=action)
        env.render()
        print(reward, done)
        break
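All of the examples below revolve around a Transition record and a replay buffer defined in their own repositories; as a point of reference, here is a minimal sketch of that pattern (field names vary per example, and some repos add done or is_empty fields):

# Minimal reference sketch; not taken from any one of the repositories above.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)  # oldest transitions are evicted first

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)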
Example #2
    def learn(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                              batch.next_state)), device=device, dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch.next_state
                                                    if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.tensor(batch.reward, dtype=torch.float32, device=device)


        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        next_state_values = torch.zeros(self.batch_size, device=device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
Example #3
def optimize_model():
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))
    non_final_mask = torch.tensor(tuple(
        map(lambda s: s is not None, batch.next_state)),
                                  device=device,
                                  dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_action_values = policy_net(state_batch.float()).gather(
        1, action_batch)

    next_state_values = torch.zeros(batch_size, device=device)
    next_state_values[non_final_mask] = target_net(
        non_final_next_states.float()).max(1)[0].detach()

    expected_state_action_values = (next_state_values *
                                    gama_discount) + reward_batch

    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values,
                     expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
Example #4
def optimize_dqn(bsz, opt_step):
    transitions = memory.sample(bsz)
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.ByteTensor(
        tuple(map(lambda s: s is not None, batch.next_state)))
    non_final_next_states_t = torch.cat(
        tuple(s for s in batch.next_state if s is not None)).type(dtype)
    non_final_next_states = Variable(non_final_next_states_t, volatile=True)
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    if USE_CUDA:
        state_batch = state_batch.cuda()
        action_batch = action_batch.cuda()
        reward_batch = reward_batch.cuda()
        non_final_mask = non_final_mask.cuda()
    q_vals = policy_net(state_batch)
    # Q(s_t, a_t): pick the Q-value of the action actually taken in each state
    state_action_values = q_vals.gather(1, action_batch.unsqueeze(1)).squeeze(1)

    # V(s_{t+1}) from the target network; stays zero for terminal states
    next_state_values = Variable(torch.zeros(bsz).type(dtype))
    next_state_values[non_final_mask] = target_net(
        non_final_next_states).data.max(1)[0]
    expected_state_action_values = (next_state_values *
                                    args.gamma) + reward_batch

    q_loss = F.mse_loss(state_action_values,
                        expected_state_action_values,
                        size_average=False)

    loss = q_loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
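Example #4 predates PyTorch 0.4 (Variable, volatile, size_average). Purely as a rough sketch using the same variable names, the bootstrap term would nowadays be computed under torch.no_grad():

# Sketch only: a modern (>=0.4) equivalent of the volatile=True block in Example #4.
non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                              dtype=torch.bool, device=state_batch.device)
with torch.no_grad():
    next_state_values = torch.zeros(bsz, device=state_batch.device)
    next_state_values[non_final_mask] = target_net(
        non_final_next_states_t.to(state_batch.device)).max(1)[0]
expected_state_action_values = next_state_values * args.gamma + reward_batch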
Example #5
    def optimize_model(self):
        """
        Train model.
        """
        if len(self.memory) < self.batch_size:
            return 0.0
        transitions = self.memory.sample(self.batch_size)
        # batch is ([state], [action], [next_state], [reward])
        batch = Transition(*zip(*transitions))
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device)
        non_final_next_states = torch.cat([
            torch.tensor([s], dtype=torch.float) for s in batch.next_state
            if s is not None
        ])
        state_batch = torch.cat(
            [torch.tensor([s], dtype=torch.float) for s in batch.state])
        action_batch = torch.cat(
            [torch.tensor([[s]], dtype=torch.long) for s in batch.action])
        reward_batch = torch.cat(
            [torch.tensor([[s]], dtype=torch.float) for s in batch.reward])
        q_eval = self.policy_net(state_batch).gather(1, action_batch)
        q_next = torch.zeros(self.batch_size, device=self.device)
        q_next[non_final_mask] = self.target_net(non_final_next_states).max(
            1)[0].detach()
        q_target = (q_next * self.gamma) + reward_batch.squeeze()

        loss = F.mse_loss(q_eval, q_target.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()
Example #6
def optimize_policy(replay_buffer, policy_net, target_net, optimizer,
                    loss_function):
    """
    This method optimizes the policy network by minimizing the TD error between the Q from the 
    policy network and the Q calculated through a Bellman backup via the target network.
    """
    global losses
    global eps_threshold
    if len(replay_buffer) < BATCH_SIZE: return
    transitions = replay_buffer.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Manage edge cases
    non_final_mask = torch.tensor(tuple(
        map(lambda x: x is not None, batch.next_state)),
                                  device=device,
                                  dtype=torch.bool)
    non_final_next_states = torch.stack(
        [x for x in batch.next_state if x is not None])

    # Create batch
    state_batch = torch.stack(batch.state)
    action_batch = torch.stack(batch.action)
    reward_batch = torch.stack(batch.reward)

    # Get Q value per policy network
    policy_net.train()
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Get Q value per target_network
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(
        1).values
    expected_state_action_values = (
        next_state_values.unsqueeze(1) *
        GAMMA) + reward_batch  # value at terminal state is reward_batch

    # Compute loss
    loss = loss_function(state_action_values, expected_state_action_values)
    losses.append(loss.item())

    # Optimize the policy network
    optimizer.zero_grad()
    loss.backward()
    clip_grad_norm_(policy_net.parameters(), 2.0)
    optimizer.step()

    eps_threshold = update_epsilon(eps_threshold)

    # Record output
    if RECORD:
        grad_norm = torch.stack(
            [params.grad.data.norm() for params in policy_net.parameters()])
        writer.add_scalar('TD Loss', loss.item(), total_iterations)
        writer.add_scalar('Min Gradient Norm',
                          grad_norm.min().item(), total_iterations)
        writer.add_scalar('Max Gradient Norm',
                          grad_norm.max().item(), total_iterations)
        writer.add_scalar('Epsilon', eps_threshold, total_iterations)
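update_epsilon in Example #6 is defined elsewhere; a typical implementation decays the exploration rate toward a floor. The constants below are placeholders, not values from the source:

# Hypothetical epsilon schedule matching the update_epsilon call in Example #6.
EPS_END = 0.05     # placeholder exploration floor
EPS_DECAY = 0.995  # placeholder multiplicative decay

def update_epsilon(eps_threshold):
    # Decay multiplicatively, never dropping below the floor
    return max(EPS_END, eps_threshold * EPS_DECAY)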
Example #7
def optimize_model(BATCH_SIZE, memory, device, policy_net, target_net, GAMMA,
                   optimizer):
    # Performs a single step of the optimization. It first samples a batch,
    # concatenates all the tensors into a single one, computes Q(s_t, a_t) and
    # V(s_{t+1}) = max_a Q(s_{t+1}, a), and combines them into our loss. By
    # definition we set V(s) = 0 if s is a terminal state. We also use a target
    # network to compute V(s_{t+1}) for added stability. The target network has
    # its weights kept frozen most of the time, but is updated with the policy
    # network's weights every so often. This is usually a set number of steps,
    # but we shall use episodes for simplicity.

    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(
        tuple(map(lambda s: s is not None, batch.next_state)),
        device=device,
        #  dtype=torch.uint8,
        dtype=torch.bool,
    )
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = (
        target_net(non_final_next_states).max(1)[0].detach())
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
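The comment in Example #7 mentions that the target network is refreshed with the policy network's weights every so often; in the training loop that is typically a hard copy like the sketch below (the interval and function name are assumptions):

# Sketch of the periodic hard update described in Example #7's comment.
def maybe_sync_target(episode, policy_net, target_net, target_update=10):
    # Copy the policy network's weights into the frozen target network
    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())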
Example #8
    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states)
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Task 4: DONE: Compute the expected Q values
        expected_q_values = [
            reward_batch[i].item() if batch.done[i] else
            reward_batch[i].item() + self.gamma * next_state_values[i].item()
            for i in range(len(batch.done))
        ]
        # Array is converted to numpy
        expected_q_values = np.array(expected_q_values)
        expected_state_action_values = torch.tensor(expected_q_values,
                                                    dtype=torch.float32)

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values.squeeze(),
                                expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.optimizer.step()
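The Python list that builds expected_q_values in Example #8 can be written as one vectorized expression with the same semantics (a done flag zeroes out the bootstrap term); a sketch under the shapes used above:

# Sketch: vectorized form of the expected-Q computation in Example #8.
done_mask = torch.tensor(batch.done, dtype=torch.float32)
expected_state_action_values = reward_batch + (1.0 - done_mask) * self.gamma * next_state_values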
Example #9
    def learn(self):
        """
        Learning function
        :return:
        """
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~T.tensor(batch.done, dtype=T.bool)

        # Resample if every transition in the batch is terminal, to avoid an
        # empty non_final_next_states tensor below
        while not non_final_mask.any():
            transitions = self.memory.sample(self.batch_size)
            batch = Transition(*zip(*transitions))
            non_final_mask = ~T.tensor(batch.done, dtype=T.bool)

        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = T.stack(non_final_next_states)
        state_batch = T.stack(batch.state)
        action_batch = T.cat(batch.action)
        reward_batch = T.cat(batch.reward)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = T.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        # Compute mse loss
        loss = F.mse_loss(state_action_values.squeeze(),
                          expected_state_action_values)
        # Optimize the model
        self.policy_net.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.policy_net.optimizer.step()
Example #10
def optimize_model(optimizer, memory, model, model_target, batch_size, gamma,
                   use_cuda):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = torch.ByteTensor(
        tuple(map(lambda s: s is not None, batch.next_state)))

    # We don't want to backprop through the expected action values and volatile
    # will save us on temporarily changing the model parameters'
    # requires_grad to False!
    non_final_next_states = Variable(torch.stack(
        [s for s in batch.next_state if s is not None]),
                                     volatile=True)
    state_batch = Variable(torch.stack(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.stack(batch.reward))

    if use_cuda:
        non_final_mask = non_final_mask.cuda()
        non_final_next_states = non_final_next_states.cuda()
        state_batch = state_batch.cuda()
        action_batch = action_batch.cuda()
        reward_batch = reward_batch.cuda()

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = Variable(torch.zeros(batch_size, 1).type(torch.Tensor))
    if use_cuda:
        next_state_values = next_state_values.cuda()

    next_state_values[non_final_mask] = model_target(
        non_final_next_states).max(1)[0]
    # Now, we don't want to mess up the loss with a volatile flag, so let's
    # clear it. After this, we'll just end up with a Variable that has
    # requires_grad=False
    next_state_values.volatile = False
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # Compute MSE loss
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
Example #11
def real_batch(policy, env, batch_size):
    states, actions, next_states, masks, rewards = rollout(policy, env, batch_size)
    rewards = np.array([item for sublist in rewards for item in sublist])
    batch = Transition(states,
                        actions,
                        masks,
                        next_states,
                        rewards)
    return batch
Example #12
def optimize_model(batch_size, memory, policy_net, target_net, optimizer, GAMMA=0.999, device='cuda'):
    """Optimize the model for one step
       Return mini-batch loss
    """

    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    # Transpose the batch. This converts batch-array of Transitions to Transition of batch-arrays
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), device=device,
                                  dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken.
    # These are the actions which would've been taken for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states
    # Expected values of actions for non_final_next_states are computed
    # This is merged based on the mask, such that we'll have either the expected state value or 0
    # in case the state was final

    # DOUBLE DQN implementation:
    # . we use the online policy net to greedily select the action
    # . and the target net to estimate the Q-value
    next_state_values = torch.zeros(batch_size, device=device)
    # the online policy net selects the greedy next action...
    next_action_policynet_decisions = policy_net(non_final_next_states).max(1)[1]
    # ...and the target net evaluates it (Double DQN)
    non_final_next_state_targetnet_values = target_net(non_final_next_states).gather(
        1, next_action_policynet_decisions.view(-1, 1)).squeeze(1)
    next_state_values[non_final_mask] = non_final_next_state_targetnet_values.detach()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()

    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

    # Return minibatch huber loss
    return loss.item()
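For contrast with Example #12: vanilla DQN lets the target network both select and evaluate the greedy next action, while Double DQN splits those roles between the two networks. A sketch using the same names as above:

# Vanilla DQN: the target net selects and evaluates the greedy next action.
vanilla_next_values = target_net(non_final_next_states).max(1)[0].detach()

# Double DQN (as in Example #12): the policy net selects, the target net evaluates.
next_actions = policy_net(non_final_next_states).max(1)[1].unsqueeze(1)
double_next_values = target_net(non_final_next_states).gather(1, next_actions).squeeze(1).detach()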
Example #13
def ma_batch(policies, env, batch_size):
    states, actions, next_states, masks, rewards, avg_reward = ma_rollout(
        policies, env, batch_size)
    batches = []
    for idx in range(len(states)):
        batches.append(
            Transition(np.array(states[idx]), np.array(actions[idx]),
                       np.array(masks[idx]).reshape(-1),
                       np.array(next_states[idx]),
                       np.array(rewards[idx]).reshape(-1)))
    return batches
Example #14
    def update_model(self):
        if self.use_PER:
            batch_index, batch, ImportanceSamplingWeights = self.replay.sample(
                self.batch_size)
        else:
            batch = self.replay.sample(self.batch_size)

        batch_tuple = Transition(*zip(*batch))

        state = torch.stack(batch_tuple.state)
        action = torch.stack(batch_tuple.action)
        reward = torch.stack(batch_tuple.reward)
        next_state = torch.stack(batch_tuple.next_state)
        done = torch.stack(batch_tuple.done)

        self.optimizer.zero_grad()
        if self.use_ICM:
            self.icm.optimizer.zero_grad()
            forward_loss = self.icm.get_forward_loss(state, action, next_state)
            inverse_loss = self.icm.get_inverse_loss(state, action, next_state)
            icm_loss = (1 - self.icm.beta) * inverse_loss.mean() + \
                self.icm.beta * forward_loss.mean()

        td_estimates = self.policy(state).gather(1, action).squeeze()

        td_targets = reward + (1 - done.float()) * self.gamma * \
            self.target(next_state).max(1)[0].detach_()

        if self.use_PER:
            # Weight each sample's TD error by its importance-sampling weight
            # (assumes self.loss_function returns per-sample losses)
            elementwise_loss = self.loss_function(td_estimates, td_targets)
            weights = torch.tensor(ImportanceSamplingWeights, device=self.device)
            loss = (weights * elementwise_loss).mean()

            errors = td_estimates - td_targets
            self.replay.batch_update(batch_index, errors.data.numpy())
        else:
            loss = self.loss_function(td_estimates, td_targets)

        if self.use_ICM:
            loss = self.icm.lambda_weight * loss + icm_loss

        loss.backward()

        for param in self.policy.parameters():
            param.grad.data.clamp_(-1, 1)

        if self.use_ICM:
            self.icm.optimizer.step()

        self.optimizer.step()

        return loss.item()
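Example #14's replay buffer returns ImportanceSamplingWeights; conventionally these are w_i = (N * P(i))^-beta, normalized by the largest weight. A sketch of that convention only (not the buffer implementation used above):

# Sketch of the usual PER importance-sampling weights; alpha and beta are the
# standard prioritization/correction exponents, not values from the source.
import numpy as np

def importance_sampling_weights(priorities, beta, alpha=0.6):
    priorities = np.asarray(priorities, dtype=np.float64)
    probs = priorities ** alpha
    probs /= probs.sum()
    weights = (len(probs) * probs) ** (-beta)
    return weights / weights.max()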
Example #15
def add_transition(rep_buffer,
                   ns_state,
                   ns_action,
                   ns_rew,
                   ns_nexts,
                   ns_done,
                   current_state,
                   empty_deque=False,
                   ns=10,
                   ns_gamma=0.99,
                   is_done=True):
    ns_rew_sum = 0.
    trans = {}
    if empty_deque:
        # emptying the deques: flush every remaining transition, recomputing
        # the discounted n-step reward sum for each one
        while len(ns_rew) > 0:
            ns_rew_sum = 0.
            for j in range(len(ns_rew)):
                ns_rew_sum += ns_rew[j] * ns_gamma**j

            # state, action, next_state, reward, n_step_rew_sum
            # don't use the done value because at this point the episode is done
            trans = Transition(ns_state.popleft(), ns_action.popleft(),
                               ns_nexts.popleft(), ns_rew.pop(0), ns_rew_sum)
            rep_buffer.add_sample(trans)
    else:
        for j in range(ns):
            ns_rew_sum += ns_rew[j] * ns_gamma**j

        # state,action,reward,
        # next_state,done, n_step_rew_sum, n_steps later
        # trans['sample'] = [ns_state.popleft(), ns_action.popleft(), ns_rew.pop(0),
        #                    ns_nexts.popleft(), ns_done.popleft(), ns_rew_sum, current_state]
        trans = Transition(ns_state.popleft(), ns_action.popleft(),
                           ns_nexts.popleft(), ns_rew.pop(0), ns_rew_sum)
        rep_buffer.add_sample(trans)
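The inner loops in Example #15 accumulate the usual discounted n-step reward sum, G = sum_j gamma^j * r_j; as a standalone helper (illustrative only) that is simply:

# Illustrative helper: the n-step discounted reward sum computed in add_transition.
def n_step_return(rewards, gamma):
    return sum(r * gamma ** j for j, r in enumerate(rewards))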
Example #16
    def add_experience(self, state, action, reward, new_state, final):
        """
        Add a SARS' tuple to the experience replay.
        :param state: source state
        :param action: action index
        :param reward: reward associated to the transition
        :param new_state: destination state
        :param final: whether the new state is absorbing
        """
        # Remove the oldest transition if the replay memory is full
        if len(self.experiences) >= self.replay_memory_size:
            self.experiences.pop(0)
        # Add a (state, action, reward, new_state, final) tuple to replay memory
        experience = Transition(state, action, reward, new_state, final)
        # print(f'add_experience: added {experience}')
        self.experiences.append(experience)
Example #17
    def add_transition(self, action, next_state, reward, done):
        if not done and self.index < self.nsteps:
            next_state = self.processor._observation(next_state)
            self.transitions.insert(0, Transition(self.state, self.add_noop(action), next_state, torch.FloatTensor([reward]), torch.zeros(1)))

            transitions = []
            gamma = 1
            for trans in self.transitions:
                transitions.append(trans._replace(n_reward= trans.n_reward + gamma * reward))
                gamma = gamma * GAMMA
            self.transitions = transitions
        else:
            for trans in self.transitions:
                self.memory.push(trans)
            self.transitions = []
        self.state = next_state
Example #18
    def update_policy_net(self) -> None:
        """Update policy_net via Q-learning approximation"""

        # check if memory has enough elements to sample
        if len(self.memory) < self.batch_size:
            return

        # get transitions
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # get elements from batch
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool).to(
            torch.device(device))
        non_final_next_obs = torch.stack([
            ob for nonfinal, ob in zip(non_final_mask, batch.next_ob)
            if nonfinal
        ]).to(torch.device(device))
        ob_batch = torch.stack(batch.ob).to(torch.device(device))
        rew_batch = torch.stack(batch.rew).to(torch.device(device))
        action_batch = torch.stack(batch.action).to(torch.device(device))

        # estimate Q(st, a) with the policy network
        state_action_values = (self.policy_net.forward(ob_batch).gather(
            1, action_batch).squeeze())

        # estimate V(st+1) with target network
        next_state_values = torch.zeros(self.batch_size).to(
            torch.device(device))
        next_state_values[non_final_mask] = (
            self.target_net.forward(non_final_next_obs).max(1)[0].detach())

        # expected Q value
        expected_state_action_values = (rew_batch.squeeze() +
                                        self.gamma * next_state_values)

        # loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)

        # optimize the network
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-0.1, 0.1)
        self.optimizer.step()
Example #19
	def update(self, state, action, reward, next_state, terminal):
		
		self._episode_transitions.append(
			Transition(state, action, reward, next_state, terminal))

		# Loop through the episode.
		# Compute discounted return from each state until the episode termination.
		# Use this computation to update both the actor and baseline.
		if terminal:
			discounted_return = 0
			for transition in reversed(self._episode_transitions):
				discounted_return = self.DISCOUNT * discounted_return + transition.reward

				baseline = self._get_baseline(transition.state)
				td_error = discounted_return - baseline

				self._update_actor(transition.state, transition.action, td_error)
				self._update_baseline(transition.state, discounted_return)
Example #20
    def update(self, batch_size=16):
        if len(self.memory.memory) < batch_size:
            batch_size = len(self.memory.memory)

        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))

        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        non_final_next_states = Variable(torch.cat(
            [s for s in batch.next_state if s is not None]),
                                         volatile=True)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = Variable(torch.zeros(batch_size).type(Tensor))
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0]

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        expected_state_action_values = Variable(
            expected_state_action_values.data)

        loss = F.mse_loss(state_action_values, expected_state_action_values)

        old_params = freeze_as_np_dict(self.policy_net.state_dict())
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            logging.debug(param.grad.data.sum())
            param.grad.data.clamp_(-1., 1.)
        self.optimizer.step()

        new_params = freeze_as_np_dict(self.policy_net.state_dict())
        check_params_changed(old_params, new_params)
        return loss.data[0]
Example #21
    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states)
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.get_state_act_vals(state_batch,
                                                      action_batch)
        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        state_action_values = state_action_values.view(-1, 1).repeat(
            1, len(self.q_models))
        next_state_values = self.get_max_next_state_vals(
            non_final_mask, non_final_next_states)
        expected_state_action_values = next_state_values + reward_batch.view(
            -1, 1).repeat(1, len(self.q_models))
        loss = (state_action_values - expected_state_action_values)**2
        coefs = self.get_hyperbolic_train_coeffs(self.k, len(self.q_models))
        loss = torch.sum(loss * coefs)
        # loss = F.smooth_l1_loss(state_action_values.squeeze(),
        #                         expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
Example #22
    def prime_buffer(self, env):
        """ Fill the n-step buffer each time the environment
            has been reset.
        """

        # Maybe something is in there, clear it out.
        self.nstep_buffer = []

        for step in range(self.config['n_steps']):
            action = self.online_network(self.state_transformer(self.state))
            action = self.action_transformer(action)
            next_state, reward, done, info = env.step(action)

            trans = Transition(state=self.state,
                               action=action,
                               reward=reward,
                               next_state=next_state,
                               done=done,
                               discounted_reward=0.,
                               nth_state=None,
                               n=None)
            self.nstep_buffer.append(trans)
            self.state = next_state
Example #23
    def train(self):
        batch = self.memory.sample(min(BATCH_SIZE, len(self.memory)))
        b_dict = [torch.stack(elem) for elem in Transition(*zip(*batch))]
        states, actions, rewards, next_states, dones = \
            b_dict[0], b_dict[1].view(-1, 1), \
            b_dict[2].view(-1, 1).float().to(device), b_dict[3], \
            b_dict[4].view(-1, 1).float().to(device)

        #  CRITIC LOSS: Q(s, a) += (r + gamma*Q'(s, π'(s)) - Q(s, a))
        # inputs computation
        inputs_critic = self.qnet(states, actions)
        # targets (no gradients should flow through the target networks)
        with torch.no_grad():
            policy_acts = self.policy_targ(next_states)
            targ_values = self.qnet_targ(next_states, policy_acts)
        targets_critics = rewards + GAMMA * (1 - dones) * targ_values
        loss_critic = self.MSE_loss(inputs_critic, targets_critics)
        self.q_optimizer.zero_grad()
        loss_critic.backward()
        # nn.utils.clip_grad_norm_(self.qnet.parameters(), GRAD_CLIP)
        self.q_optimizer.step()

        # ACTOR objective: derivative of Q(s, π(s | ø)) with respect to ø
        actor_loss = -self.qnet(states, self.policy(states)).mean()
        self.p_optimizer.zero_grad()
        actor_loss.backward()
        # nn.utils.clip_grad_norm_(self.policy.parameters(), GRAD_CLIP)
        self.p_optimizer.step()
        soft_update(self.policy_targ, self.policy, TAU)
        soft_update(self.qnet_targ, self.qnet, TAU)
        if self.args.use_writer:
            self.writer.add_scalar("critic_loss", loss_critic.item(),
                                   self.n_updates)
            self.writer.add_scalar("actor_loss", actor_loss.item(),
                                   self.n_updates)
        self.n_updates += 1
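soft_update and the TAU/GRAD_CLIP constants in Example #23 come from the surrounding module; the soft update is ordinarily Polyak averaging. A minimal sketch, assuming the (target, source, tau) argument order used above:

# Sketch of Polyak averaging matching the soft_update(target, source, tau) calls above.
import torch

def soft_update(target, source, tau):
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            # target <- (1 - tau) * target + tau * source
            t_param.data.mul_(1.0 - tau).add_(tau * s_param.data)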
Example #24
def train_agent_model_free(agent, env, params):

    update_timestep = params['update_every_n_steps']
    seed = params['seed']
    log_interval = 1000
    gif_interval = 500000
    n_random_actions = params['n_random_actions']
    n_evals = params['n_evals']
    n_collect_steps = params['n_collect_steps']
    use_statefilter = params['obs_filter']
    save_model = params['save_model']

    assert n_collect_steps > agent.batchsize, "We must initially collect as many steps as the batch size!"

    avg_length = 0
    time_step = 0
    cumulative_timestep = 0
    cumulative_log_timestep = 0
    n_updates = 0
    i_episode = 0
    log_episode = 0
    samples_number = 0
    episode_rewards = []
    episode_steps = []

    if use_statefilter:
        state_filter = MeanStdevFilter(env.env.observation_space.shape[0])
    else:
        state_filter = None

    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    env.action_space.np_random.seed(seed)

    max_steps = env.spec.max_episode_steps

    writer = SummaryWriter()

    while samples_number < 3e7:
        time_step = 0
        episode_reward = 0
        i_episode += 1
        log_episode += 1
        state = env.reset()
        if state_filter:
            state_filter.update(state)
        done = False

        while (not done):
            cumulative_log_timestep += 1
            cumulative_timestep += 1
            time_step += 1
            samples_number += 1
            if samples_number < n_random_actions:
                action = env.action_space.sample()
            else:
                action = agent.get_action(state, state_filter=state_filter)
            nextstate, reward, done, _ = env.step(action)
            # if we hit the time-limit, it's not a 'real' done; we don't want to assign low value to those states
            real_done = False if time_step == max_steps else done
            agent.replay_pool.push(
                Transition(state, action, reward, nextstate, real_done))
            state = nextstate
            if state_filter:
                state_filter.update(state)
            episode_reward += reward
            # update if it's time
            if cumulative_timestep % update_timestep == 0 and cumulative_timestep > n_collect_steps:
                q1_loss, q2_loss, pi_loss, a_loss = agent.optimize(
                    update_timestep, state_filter=state_filter)
                n_updates += 1
            # logging
            if cumulative_timestep % log_interval == 0 and cumulative_timestep > n_collect_steps:
                writer.add_scalar('Loss/Q-func_1', q1_loss, n_updates)
                writer.add_scalar('Loss/Q-func_2', q2_loss, n_updates)
                writer.add_scalar('Loss/policy', pi_loss, n_updates)
                writer.add_scalar('Loss/alpha', a_loss, n_updates)
                writer.add_scalar('Values/alpha',
                                  np.exp(agent.log_alpha.item()), n_updates)
                avg_length = np.mean(episode_steps)
                running_reward = np.mean(episode_rewards)
                eval_reward = evaluate_agent(env,
                                             agent,
                                             state_filter,
                                             n_starts=n_evals)
                writer.add_scalar('Reward/Train', running_reward,
                                  cumulative_timestep)
                writer.add_scalar('Reward/Test', eval_reward,
                                  cumulative_timestep)
                print(
                    'Episode {} \t Samples {} \t Avg length: {} \t Test reward: {} \t Train reward: {} \t Number of Policy Updates: {}'
                    .format(i_episode, samples_number, avg_length, eval_reward,
                            running_reward, n_updates))
                episode_steps = []
                episode_rewards = []
            if cumulative_timestep % gif_interval == 0:
                make_gif(agent, env, cumulative_timestep, state_filter)
                if save_model:
                    make_checkpoint(agent, cumulative_timestep, params['env'])

        episode_steps.append(time_step)
        episode_rewards.append(episode_reward)
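MeanStdevFilter in Example #24 is an online observation normalizer defined elsewhere; a rough sketch of the interface the loop relies on (update() plus, presumably, a call that normalizes a state), using Welford's running statistics:

# Rough sketch of a running mean/std filter like the one assumed in Example #24.
import numpy as np

class MeanStdevFilter:
    def __init__(self, shape, eps=1e-8):
        self.count = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)  # running sum of squared deviations (Welford)
        self.eps = eps

    def update(self, x):
        self.count += 1
        delta = x - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (x - self.mean)

    def __call__(self, x):
        std = np.sqrt(self.m2 / max(self.count - 1, 1)) + self.eps
        return (x - self.mean) / std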
Example #25
empty_color = []
empty_depth = []

for i in range(m1.length):
    M1.add(m1.tree.data[i])
    M2.add(m2.tree.data[i])
    M3.add(m3.tree.data[i])

for i in range(m1.length):
    # Invalid point is common
    if m1.tree.data[i].reward == -3 * R:
        transition = m1.tree.data[i]
        # Copy the pixel index so the stored transitions don't share (and later
        # overwrite) the same underlying list
        pixel_index = list(transition.pixel_idx)
        pixel_index[0] = 1
        transition_2 = Transition(transition.color, transition.depth,
                                  pixel_index, transition.reward,
                                  transition.next_color, transition.next_depth,
                                  transition.is_empty)
        M2.add(transition_2)
        pixel_index = list(transition.pixel_idx)
        pixel_index[0] = np.random.choice(range(2, 6))
        transition_3 = Transition(transition.color, transition.depth,
                                  pixel_index, transition.reward,
                                  transition.next_color, transition.next_depth,
                                  transition.is_empty)
        M3.add(transition_3)
    if m2.tree.data[i].reward == -3 * R:
        transition = m2.tree.data[i]
        pixel_index = list(transition.pixel_idx)
        pixel_index[0] = 0
        transition_1 = Transition(transition.color, transition.depth,
                                  pixel_index, transition.reward,
                                  transition.next_color, transition.next_depth,
                                  transition.is_empty)
        M1.add(transition_1)
Example #26
def optimize_model(policy_net, target_net, replay_memory, optimizer,
                   scheduler):
    if len(replay_memory) < config.BATCH_SIZE:
        return
    # print('Training...')
    policy_net.train()
    # print('Model mode:',policy_net.training)
    transitions = replay_memory.sample(config.BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(
        map(lambda s: s is not None, batch.next_state)),
                                  device=device,
                                  dtype=torch.bool)

    # Every sampled next state may be final, in which case there is nothing to stack
    if any(s is not None for s in batch.next_state):
        non_final_next_states = torch.stack(
            [s for s in batch.next_state if s is not None])
    else:
        non_final_next_states = None

    state_batch = torch.stack(batch.state)
    action_batch = torch.stack(batch.action)
    reward_batch = torch.stack(batch.reward)
    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)
    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(config.BATCH_SIZE, device=device)

    if non_final_next_states is not None:
        next_state_values[non_final_mask] = target_net(
            non_final_next_states).max(1)[0].detach()

    # next_state_action = policy_net(non_final_next_states).max(1)[1].view(-1,1).detach()
    # next_state_values[non_final_mask] = target_net(non_final_next_states).gather(1, next_state_action)

    next_state_values = next_state_values.view(config.BATCH_SIZE, 1).float()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values *
                                    config.GAMMA) + reward_batch.float()
    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    # for param_group in optimizer.param_groups:
    #     print(param_group['lr'])
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
    scheduler.step()
    policy_net.eval()
    # print('Model mode:',policy_net.training)
    return
Example #27
def main():
	# Parse input
	parser = argparse.ArgumentParser(prog="exp_1", description="Code for Exp 1., testing the model capacity for grasping")
	parser.add_argument("model", type=str, help="model path for testing")
	parser.add_argument("type", type=str, help="novel/hybrid")
	parser.add_argument("run", type=int, help="Which number is this run")
	parser.add_argument("episode", type=int, help="Which episode is this run")
	parser.add_argument("--obj_nums", type=int, default=8, help="Number of object, default is 6")
	parser.add_argument("--port", type=str, default="/dev/ttylight", help="Port for arduino, which controls the alram lamp, default is /dev/ttylight")
	parser.add_argument("--densenet_lr", type=float, default=1e-5, help="Learning rate for feature extraction part, default is 1e-5")
	parser.add_argument("--primitive_lr", type=float, default=5e-5, help="Learning rate for motion primitive subnetworks, default is 1e-4")
	args = parser.parse_args()
	utils.show_args(args)
	# Create directories
	r = rospkg.RosPack()
	package_path = r.get_path("grasp_suck")
	root_path, image_path, depth_path, pc_path, vis_path, grasp_path, mixed_paths, feat_paths = create_directories(package_path, args.episode, args.run, args.type)
	arduino = serial.Serial(args.port, 115200)
	reward = 5.0
	discount_factor = 0.5
	return_ = 0.0
	pick_item = 0
	# Service clients
	vacuum_pump_control      = rospy.ServiceProxy("/vacuum_pump_control_node/vacuum_control", SetBool)
	check_suck_success       = rospy.ServiceProxy("/vacuum_pump_control_node/check_suck_success", SetBool)
	go_home                  = rospy.ServiceProxy("/agent_server_node/go_home", Empty)
	go_place                 = rospy.ServiceProxy("/agent_server_node/go_place", Empty)
	fixed_home               = rospy.ServiceProxy("/agent_server_node/go_home_fix_orientation", Empty)
	publish_data_client      = rospy.ServiceProxy("/agent_server_node/publish_data", publish_info)
	record_bag_client        = rospy.ServiceProxy("/autonomous_recording_node/start_recording", recorder)
	stop_record_client       = rospy.ServiceProxy("/autonomous_recording_node/stop_recording", Empty)
	# Shared data between processes
	work = mp.Value(c_bool, True) # Can prediction thread continue working? <bool>
	ready = mp.Value(c_bool, False) # Is prediction thread ready? <bool>
	can_predict = mp.Value(c_bool, False) # Can prediction thread do predict? <bool>
	should_reset = mp.Value(c_bool, False) # Should prediction thread reset model? <bool>
	iteration = mp.Value("i", 0) # What iteration is this action? <int>
	path_queue = mp.Queue()
	path_queue.put([image_path, depth_path, pc_path, vis_path, feat_paths, mixed_paths])
	action_queue = mp.Queue() # Action placeholder, prediction thread will generate an action and main thread will consume it
	experience_queue = mp.Queue() # Transition placeholder, main thread will generate a transition and prediction thread will consume it
	# Start prediction thread
	p = mp.Process(target=prediction_process, args=(args, \
													action_queue, experience_queue, \
													work, ready, can_predict, should_reset, \
													iteration, \
													path_queue, ))
	p.start()
	# Initialize
	while not ready.value:
		pass
	go_home()
	vacuum_pump_control(SetBoolRequest(False))
	is_empty = False
	cmd = raw_input("[Main Thread] Press any key to continue...")
	program_ts = time.time()
	can_predict.value = True
	while 1:
		action_target = []
		is_empty_list = []
		print("Code: {}".format(encode_index(args.episode, args.run)))
		record_bag_client(recorderRequest(encode_index(args.episode, args.run))) # Start recording
		while not is_empty and iteration.value<args.obj_nums*2:
			print("\033[1;32m[{}] Iteration: {}\033[0m".format(time.time()-program_ts, iteration.value))
			arduino.write("b 1000")
			# Wait until there is action in the queue
			while action_queue.empty():
				pass
			action_obj = action_queue.get() # [action, action_str, points, angle, pixel_index]
			is_valid = utils.check_if_valid(action_obj[2])
			_viz(action_obj[2], action_obj[0], action_obj[3], is_valid)
			will_collide = None
			if is_valid:
				tool_id = (3-action_obj[0]) if action_obj[0] <2 else 1
				if tool_id == 1:
					will_collide = _check_collide(action_obj[2], action_obj[3])
				if not will_collide or tool_id!=1:
					_take_action(tool_id, action_obj[2], action_obj[3])
				else:
					print("[Main Thread] Will collide, abort request!")
			else:
				arduino.write("r 1000")
				action_success = False
			if is_valid:
				if action_obj[0] < 2:
					action_success = check_suck_success().success
				else:
					if not will_collide:
						action_success = _check_grasp_success(iteration.value, grasp_path)
					else:
						action_success = False
			if action_success: pick_item += 1
			info = publish_infoRequest(); info.execution = utils.wrap_execution_info(iteration.value, is_valid, action_obj[0], action_success); publish_data_client(info)
			empty_state = mp.Value(c_bool, False)
			iteration.value += 1
			next_state_thread = mp.Process(target=get_next_state, args=(empty_state, iteration.value-1, (pick_item==args.obj_nums), pc_path, image_path, depth_path))
			next_state_thread.start()
			if action_success:
				arduino.write("g 1000"); go_place(); fixed_home(); 
			else:
				fixed_home(); vacuum_pump_control(SetBoolRequest(False));
			current_reward = utils.reward_judgement(reward, is_valid, action_success)
			return_ += current_reward * np.power(discount_factor, iteration.value-1)
			print "\033[1;33mCurrent reward: {} \t Return: {}\033[0m".format(current_reward, return_)
			color_name, depth_name, next_color_name, next_depth_name = utils.wrap_strings(image_path, depth_path, iteration.value-1)
			next_state_thread.join(); is_empty = empty_state.value
			action_target.append(action_obj[4]); is_empty_list.append(is_empty)
			transition = Transition(color_name, depth_name, action_obj[4], current_reward, next_color_name, next_depth_name, is_empty)
			experience_queue.put(transition)
			if not is_empty and iteration.value < args.obj_nums*2: can_predict.value = True
		stop_record_client()
		
		if is_empty: 
			print("\033[1;33m[{}] Pass test with return: {}\033[0m".format(time.time()-program_ts, return_)) 
		else: 
			print("\033[1;31m[{}] Failed with return: {}\033[0m".format(time.time()-program_ts, return_))
		np.savetxt(root_path+"action_target.csv", action_target, delimiter=",")
		np.savetxt(root_path+"is_empty.csv", is_empty_list, delimiter=",")
		f = open(root_path+"{}.txt".format(encode_index(args.episode, args.run)), 'w')
		f.write("{}\n".format(is_empty)); f.write("{}".format(return_)); f.close()
		action_target = []; is_empty_list = []
		cmd = raw_input("Press 'r' to reset, 'e' to exit: ")
		if cmd == 'e' or cmd == 'E':
			break
			
		elif cmd == 'r' or cmd == 'R':
			print("[Main Thread] Receive reset command")
			ready.value = False
			should_reset.value = True
			args.run += 1
			root_path, image_path, depth_path, pc_path, vis_path, grasp_path, mixed_paths, feat_paths = create_directories(package_path, args.episode, args.run, args.type)
			path_queue.put([image_path, depth_path, pc_path, vis_path, feat_paths, mixed_paths])
			is_empty = False
			pick_item = 0
			return_ = 0.0
			iteration.value = 0
			# Wait until prediction thread ready
			while not ready.value: pass
			program_ts = time.time()
			can_predict.value = True # Tell prediction thread we can start
			
	# Stop prediction thread
	work.value = False
	p.join()
	print("Main thread stop")
Example #28
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)

    # print("trasitions:", transitions)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    # Batch is a name tuple, each field contains a list of batch size states.
    batch = Transition(*zip(*transitions))
    # print("batch", batch)
    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(
        map(lambda s: s is not None, batch.next_state)),
                                  device=device,
                                  dtype=torch.bool).to(device)

    #print("non_final_mask:",non_final_mask)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None]).to(device)
    #print("non_final_next_states", non_final_next_states)
    #print("non_final_next_states shape", non_final_next_states.shape)
    # (batch_size, state_h, state_w)
    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(batch.action).to(device)
    reward_batch = torch.cat(batch.reward).to(device)
    #print("reward batch:", reward_batch.shape)
    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    #print("state batch shape:",state_batch.shape)
    #print("action_batch:", action_batch)
    #print("unsqueeze:",action_batch.unsqueeze(1))
    state_action_values = policy_net(state_batch).gather(
        1, action_batch.unsqueeze(1))
    #print("state action values:",state_action_values)
    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(
        1)[0].detach()
    #print("next_state_values:", next_state_values.shape)
    # Compute the expected Q values
    expected_state_action_values = (next_state_values.view(BATCH_SIZE, 1) *
                                    GAMMA) + reward_batch
    #print("state action values size:",state_action_values.shape)
    #print("expected state action values:",expected_state_action_values)
    #print("expected state action values size:", expected_state_action_values.unsqueeze(1)[:,:,0].shape)
    # Compute Huber loss
    loss = F.smooth_l1_loss(
        state_action_values.view(BATCH_SIZE, 1),
        expected_state_action_values.unsqueeze(1).view(BATCH_SIZE, 1).float())
    #print("loss",loss)
    #input('press to continue')
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()

    # for param in policy_net.parameters():
    #     param.grad.data.clamp_(-1, 1)
    optimizer.step()
Example #29
    def train(self):
        """ Train the online network using the n-step loss. 
        """
        env = self.env_builder()
        self.state = env.reset()
        self.prime_buffer(env)

        self.step = 0
        score = 0
        for epoch in range(self.config['n_epochs']):
            epoch_loss = []
            start_time = time.time()
            for batch in range(self.config['n_batches_per_epoch']):

                scores = self.evaluator.evaluate(self.step)
                if scores is not None and self.batchsize_bandit is not None:

                    if self.step > 0:
                        reward = np.median(scores)
                        self.batchsize_bandit.step(reward)

                    batch_size, expert_batch_size = self.batchsize_bandit.sample(
                    )
                    self.config['batch_size'] = batch_size
                    self.config['expert_batch_size'] = expert_batch_size

                    wandb.log({
                        "bandit_batch_size": batch_size,
                        "bandit_expert_batch_size": expert_batch_size,
                        "bandit_values": self.batchsize_bandit.values
                    })

                # Choose an action based on the current state and
                # according to an epsilon-greedy policy.
                epsilon = self.epsilon_schedule.value(self.step)
                if random.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    action = self.action_transformer(
                        self.online_network(self.state_transformer(
                            self.state)))

                # Update the current state of the environment by taking
                # the action and building the current transition to be
                # added to the n-step buffer.  These states are only added
                # to the replay buffer after a delay of n-steps.
                next_state, reward, done, info = env.step(action)
                current_trans = Transition(state=self.state,
                                           action=action,
                                           next_state=next_state,
                                           reward=reward,
                                           discounted_reward=None,
                                           nth_state=None,
                                           done=done,
                                           n=None)
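                # The discounted_reward, nth_state and n fields are left as
                # None here; they are filled in when the oldest buffered
                # transition is re-packed as a delayed transition below.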

                # Now use the contents of the n-step buffer to construct
                # the delayed transition and add that to the prioritized
                # replay buffer to be sampled for learning.
                (delayed_states, delayed_actions, delayed_rewards,
                 delayed_next_states, delayed_discounted_rewards,
                 delayed_nth_states, delayed_dones,
                 delayed_ns) = expand_transitions(self.nstep_buffer,
                                                  torchify=False)

                # If the current episode has not ended, add a single delayed
                # transition; otherwise flush the remaining buffered
                # transitions so the tail of the episode is stored correctly.
                if not current_trans.done:
                    delayed_trans = Transition(
                        state=delayed_states[0],
                        action=delayed_actions[0],
                        reward=delayed_rewards[0],
                        next_state=delayed_next_states[0],
                        discounted_reward=np.sum([
                            reward * self.config['gamma']**i
                            for i, reward in enumerate(delayed_rewards)
                        ]),
                        nth_state=self.state,
                        done=done,
                        n=self.config['n_steps'])
                    self.buffer.add(delayed_trans)

                else:
                    for i in range(self.config['n_steps']):
                        delayed_trans = Transition(
                            state=delayed_states[i],
                            action=delayed_actions[i],
                            reward=delayed_rewards[i],
                            next_state=delayed_next_states[i],
                            discounted_reward=np.sum([
                                reward * self.config['gamma']**j
                                for j, reward in enumerate(delayed_rewards[i:])
                            ]),
                            nth_state=self.state,
                            done=done,
                            n=self.config['n_steps'] - i)
                        self.buffer.add(delayed_trans)

                # Now that we have used the buffer, we can add the current
                # transition to the queue.  Update the current state of the
                # environment.
                self.nstep_buffer.append(current_trans)
                if len(self.nstep_buffer) > self.config['n_steps']:
                    _ = self.nstep_buffer.pop(0)
                self.state = next_state

                beta = self.beta_schedule.value(self.step)
                if len(self.buffer) >= self.config[
                        'batch_size'] and self.config['batch_size'] > 0:
                    # Sample a batch of experience from the replay buffer and
                    # train with the n-step TD loss.
                    transitions, weights, indices = self.buffer.sample(
                        self.config['batch_size'], beta)
                    (states, actions, rewards, next_states, discounted_rewards,
                     nth_states, dones, ns) = expand_transitions(
                         transitions,
                         torchify=True,
                         state_transformer=self.state_transformer)

                    # Calculate the loss per transition.  This is not
                    # aggregated so that we can make the importance sampling
                    # correction to the loss.
                    #
                    # First we calculate the loss for 1-step ahead, then if
                    # required, we look ahead n-steps and add that to our loss.
                    # Importance sampling weights are based on the 1-step loss.
                    loss = ntd_loss(online_model=self.online_network,
                                    target_model=self.target_network,
                                    states=states,
                                    actions=actions,
                                    next_states=next_states,
                                    rewards=rewards,
                                    dones=dones,
                                    gamma=0.99,
                                    n=1)
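                    # Apply the importance sampling weights from the
                    # prioritized buffer, then reuse the weighted per-sample
                    # losses (offset by a small constant so they stay
                    # positive) as the updated priorities.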
                    weights = torch.FloatTensor(weights).to(self.device)
                    loss = loss * weights
                    priorities = loss + 1e-5
                    priorities = priorities.detach().cpu().numpy()
                    self.buffer.update_priorities(priorities, indices)
                    loss = loss.mean()

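                    # With multi-step returns enabled, add an n-step TD loss
                    # computed from the n-th states and the pre-computed
                    # discounted rewards.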
                    if self.config['n_steps'] > 1:
                        nstep_loss = ntd_loss(online_model=self.online_network,
                                              target_model=self.target_network,
                                              states=states,
                                              actions=actions,
                                              next_states=nth_states,
                                              rewards=discounted_rewards,
                                              dones=dones,
                                              gamma=0.99,
                                              n=ns)
                        nstep_loss = nstep_loss.mean()
                        loss += nstep_loss

                # Maybe we have an expert buffer, if so we should train some
                # samples from that expert buffer.
                if self.expert_buffer is not None and self.config[
                        'expert_batch_size'] > 0:
                    e_transitions, e_weights, e_indices = self.expert_buffer.sample(
                        self.config['expert_batch_size'], beta)

                    (e_states, e_actions, e_rewards, e_next_states,
                     e_discounted_rewards, e_nth_states, e_dones,
                     e_ns) = expand_transitions(
                         e_transitions,
                         torchify=True,
                         state_transformer=self.state_transformer)

                    e_loss = ntd_loss(online_model=self.online_network,
                                      target_model=self.target_network,
                                      states=e_states,
                                      actions=e_actions,
                                      next_states=e_next_states,
                                      rewards=e_rewards,
                                      dones=e_dones,
                                      gamma=0.99,
                                      n=1)
                    e_weights = torch.FloatTensor(e_weights).to(self.device)
                    e_loss = e_loss * e_weights
                    e_priorities = e_loss + 1e-5
                    e_priorities = e_priorities.detach().cpu().numpy()
                    self.expert_buffer.update_priorities(
                        e_priorities, e_indices)
                    e_loss = e_loss.mean()

                    if self.config['n_steps'] > 1:
                        e_nstep_loss = ntd_loss(
                            online_model=self.online_network,
                            target_model=self.target_network,
                            states=e_states,
                            actions=e_actions,
                            next_states=e_nth_states,
                            rewards=e_discounted_rewards,
                            dones=e_dones,
                            gamma=0.99,
                            n=e_ns)
                        e_nstep_loss = e_nstep_loss.mean()
                        e_loss += e_nstep_loss

                    # Add a supervised margin term on the expert samples so
                    # that the expert's actions are preferred by the online
                    # network.
                    q_values = self.online_network(e_states)
                    e_loss += torch.mean(margin_loss(q_values, e_actions))

                # Finally, combine the online and expert losses when expert
                # samples are available.
                if len(self.buffer) > self.config["batch_size"]:
                    if self.config['batch_size'] > 0 and self.config[
                            'expert_batch_size'] > 0:
                        loss = loss * self.online_coef + e_loss * self.expert_coef
                    elif self.config['batch_size'] > 0:
                        loss = loss  # keep the online loss as-is
                    elif self.config['expert_batch_size'] > 0:
                        loss = e_loss

                    # Take the step of updating online network parameters
                    # based on this batch loss.
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    # End of training step actions
                    epoch_loss.append(loss.detach().cpu().numpy())

                # End of every step actions
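                # Periodically copy the online network weights into the
                # target network.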
                if self.step % self.config['update_interval'] == 0:
                    self.target_network.load_state_dict(
                        self.online_network.state_dict())
                self.step += 1
                score += current_trans.reward

                if current_trans.done:
                    if score > self.best_episode:
                        self.best_episode = score

                    self.episodic_reward.append(score)
                    score = 0
                    self.state = env.reset()
                    self.prime_buffer(env)

                    wandb.log({"episodic_reward": self.episodic_reward[-1]})

            # End of epoch actions
            self.loss.append(np.mean(epoch_loss))
            print("Epoch {0}, Score {1:6.4f}, Loss {2:6.4f}, Time {3:6.4f}".
                  format(epoch, score, self.loss[-1],
                         time.time() - start_time))

            wandb.log({
                "time": time.time() - start_time,
                "loss": self.loss[-1],
                "epsilon": epsilon,
                "beta": beta
            })

        wandb.log({"best_episode": self.best_episode})
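The n-step bookkeeping above reduces to a short calculation. Below is a minimal, self-contained sketch (the reward values, gamma, and the bootstrap value are illustrative assumptions) of the discounted n-step return stored in `discounted_reward`, together with the usual n-step TD target that a loss like `ntd_loss` would presumably build from it.

import numpy as np

gamma = 0.99
delayed_rewards = [1.0, 0.0, 0.5]  # r_t, r_{t+1}, r_{t+2} from the n-step queue

# Same expression as in the train loop: sum_i gamma**i * r_{t+i}
discounted_reward = np.sum(
    [r * gamma**i for i, r in enumerate(delayed_rewards)])

# An n-step TD target then bootstraps from the n-th state:
#     y = discounted_reward + gamma**n * max_a Q_target(s_{t+n}, a)
n = len(delayed_rewards)
q_target_nth = 2.0  # placeholder for max_a Q_target(s_{t+n}, a)
target = discounted_reward + gamma ** n * q_target_nth
print(discounted_reward, target)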
Ejemplo n.º 30
0
    def add_transition(self, state, action, next_state, reward, done,
                       priority):
        """Wrap the experience in a Transition and push it with its priority."""
        trans = Transition(state, action, next_state, reward, done)
        self._buffer.push(item=trans, priority=priority)
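A possible end-to-end usage of this helper, with a toy priority store standing in for the real buffer (SimplePriorityBuffer, ReplayWrapper and the five-field Transition below are illustrative stand-ins, not part of the snippet above):

import heapq
import itertools
from collections import namedtuple

Transition = namedtuple("Transition",
                        ("state", "action", "next_state", "reward", "done"))

class SimplePriorityBuffer:
    """Toy priority store exposing the same push(item=..., priority=...) call."""

    def __init__(self):
        self._heap = []
        self._counter = itertools.count()

    def push(self, item, priority):
        # heapq is a min-heap, so negate the priority to pop the largest first.
        heapq.heappush(self._heap, (-priority, next(self._counter), item))

class ReplayWrapper:
    def __init__(self):
        self._buffer = SimplePriorityBuffer()

    def add_transition(self, state, action, next_state, reward, done,
                       priority):
        trans = Transition(state, action, next_state, reward, done)
        self._buffer.push(item=trans, priority=priority)

wrapper = ReplayWrapper()
# New experience is commonly pushed with a high initial priority so it is
# sampled at least once before its TD error is known.
wrapper.add_transition(state=[0.0, 1.0], action=1, next_state=[0.5, 0.5],
                       reward=1.0, done=False, priority=1.0)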