Example #1
def optimize_policy(policy, optimizer, memories, batch_size, num_envs, gamma,
                    alpha, beta):
    loss = 0
    for i_env in range(num_envs):
        size_to_sample = np.minimum(batch_size,
                                    memories[i_env].policy_length())
        transitions = memories[i_env].policy_sample(size_to_sample)
        batch = Transition(*zip(*transitions))

        state_batch = Variable(torch.cat(batch.state))
        time_batch = Variable(torch.cat(batch.time))

        action_batch = torch.cat(batch.action)
        cur_loss = (
            torch.pow(Variable(Tensor([gamma])), time_batch) *
            torch.log(policy(state_batch).gather(1, action_batch))).sum()

        loss -= cur_loss

    loss = (alpha / beta) * loss
    optimizer.zero_grad()
    loss.backward()

    for param in policy.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()
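The snippets in this collection omit their module-level setup. The sketch below is a minimal, assumed reconstruction of the imports, tensor aliases, `Transition` namedtuple, and replay-memory interface (`sample`, `policy_sample`, `policy_length`) the examples rely on; the definitions in the original repositories may differ.

# Assumed setup for the examples in this collection; the original
# repositories may define these helpers differently.
import random
from collections import namedtuple

import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable

use_cuda = torch.cuda.is_available()
Tensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor

# One transition per environment step; `time` stores the step index t used
# for the gamma^t discount in optimize_policy, and `next_state` is None for
# terminal transitions.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'time'))


class ReplayMemory(object):
    """Fixed-capacity buffer with uniform sampling."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, *args):
        self.memory.append(Transition(*args))
        if len(self.memory) > self.capacity:
            self.memory.pop(0)

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    # optimize_policy samples through these names; here they are simply
    # uniform sampling over the same buffer.
    def policy_sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def policy_length(self):
        return len(self.memory)

    def __len__(self):
        return len(self.memory)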
Example #2
def optimize_policy(policy, optimizer, memories, batch_size, num_envs, gamma,
                    device):
    loss = 0
    for i_env in range(num_envs):
        size_to_sample = np.minimum(batch_size, len(memories[i_env]))
        transitions = memories[i_env].policy_sample(size_to_sample)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state).to(device)
        # print(batch.action)
        gamma = Tensor([gamma]).to(device)
        time_batch = torch.cat(batch.time).to(device)
        # Select log pi(a_t | s_t) per sample with gather; indexing with
        # [:, actions] would pick every sampled action for every row of the
        # batch instead of the action actually taken in that transition.
        action_batch = torch.cat(batch.action).to(device)
        cur_loss = (
            torch.pow(gamma, time_batch) *
            torch.log(policy(state_batch).gather(1, action_batch))).sum()
        loss -= cur_loss
        # loss = cur_loss if i_env == 0 else loss + cur_loss

    optimizer.zero_grad()
    loss.backward()

    for param in policy.parameters():
        param.grad.data.clamp_(-500, 500)
        # print("policy:", param.grad.data)
    optimizer.step()
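Example #2 is the device-aware variant of Example #1. It expects a `torch.device`; a typical (assumed, not from the original source) way to obtain one is:

# Assumed device selection for the device-aware variants.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")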
Example #3
def optimize_model(policy, model, optimizer, memory, batch_size,
                    alpha, beta, gamma):
    '''
    Optimize w.r.t. task-specific policies via MSE and TD-learning
    '''

    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    # format transition tuple
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)))

    # extract all non-terminal next states
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

    # format state, action and rewards
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # calculate the numerator of equation 8 from Teh et al.
    pi0_a_pref = policy.forward_action_pref(state_batch)
    term = alpha*pi0_a_pref + beta*model(state_batch)
    max_term = torch.max(term, 1)[0].unsqueeze(1)

    # get pi_i(a_t | s_t) via equation 8 from Teh et al.
    pi_i = torch.exp(term-max_term)/(torch.exp(term-max_term).sum(1).unsqueeze(1))

    # compute regularized rewards as defined in Teh et al.
    reward_batch = (reward_batch.unsqueeze(1) + (alpha/beta)*torch.log(policy.forward(state_batch).gather(1, action_batch))
                     - (1/beta)*torch.log(pi_i.gather(1, action_batch)))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states, 2nd component of equation 7 from Teh et al.
    next_state_values = torch.zeros(batch_size).type(Tensor)
    next_state_values[non_final_mask] = ( torch.log(
        (torch.pow(policy.forward(non_final_next_states), alpha)
        * (torch.exp(beta * model(non_final_next_states)) + 1e-16)).sum(1)) / beta ).detach()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values.unsqueeze(1) * gamma) + reward_batch

    # Compute loss
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()
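For reference, the quantities the comments in Example #3 point to can be written out as below. This is a paraphrase of how the code implements equations 7 and 8 of Teh et al., not a quotation of the paper; h_0 denotes the distilled policy's action preferences returned by `forward_action_pref`.

% Task-specific policy (cf. equation 8), computed above as a
% max-stabilized softmax over alpha * h_0 + beta * Q_i:
\pi_i(a \mid s) =
  \frac{\exp\!\big(\alpha\, h_0(a \mid s) + \beta\, Q_i(s, a)\big)}
       {\sum_{a'} \exp\!\big(\alpha\, h_0(a' \mid s) + \beta\, Q_i(s, a')\big)}

% Regularized reward used as the immediate term of the TD target:
\tilde{r}_i(s, a) = r_i(s, a)
  + \frac{\alpha}{\beta} \log \pi_0(a \mid s)
  - \frac{1}{\beta} \log \pi_i(a \mid s)

% Soft value of a non-final next state (cf. equation 7), matching the
% log-sum-exp over pi_0^alpha * exp(beta * Q_i) in the code:
V_i(s') = \frac{1}{\beta}
  \log \sum_{a'} \pi_0(a' \mid s')^{\alpha} \exp\!\big(\beta\, Q_i(s', a')\big)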
Example #4
def optimize_model(policy, model, optimizer, memory, batch_size,
                    alpha, beta, gamma):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)))
    # We don't want to backprop through the expected action values; volatile
    # saves us from temporarily setting the model parameters'
    # requires_grad to False.
    non_final_next_states = Variable(torch.cat([s for s in batch.next_state
                                                if s is not None]),
                                     volatile=True)

    state_batch = Variable(torch.cat(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.cat(batch.reward))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = Variable(torch.zeros(batch_size).type(Tensor))
    next_state_values[non_final_mask] = torch.log(
        (torch.pow(policy(non_final_next_states), alpha)
        * (torch.exp(beta * model(non_final_next_states)) + 1e-16)).sum(1)) / beta
    # Debugging aid: dump the intermediate tensors if the NaN check raises
    try:
        np.isnan(next_state_values.sum().data[0])
    except Exception:
        print("next_state_values:", next_state_values)
        print(policy(non_final_next_states))
        print(torch.exp(beta * model(non_final_next_states)))
        print(model(non_final_next_states))

    # Now, we don't want to mess up the loss with a volatile flag, so let's
    # clear it. After this, we'll just end up with a Variable that has
    # requires_grad=False
    next_state_values.volatile = False
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # Compute MSE loss
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # for param in model.parameters():
    #     param.grad.data.clamp_(-1, 1)
    optimizer.step()
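Examples #4 and #6 target the pre-0.4 PyTorch API (`Variable`, `volatile`). On current PyTorch the same effect, not backpropagating through the bootstrap target, is usually obtained with `torch.no_grad()` or `.detach()`, roughly:

# Modern (post-0.4) equivalent of the Variable(..., volatile=True) pattern:
with torch.no_grad():
    next_state_values[non_final_mask] = torch.log(
        (torch.pow(policy(non_final_next_states), alpha)
         * (torch.exp(beta * model(non_final_next_states)) + 1e-16)).sum(1)) / beta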
Example #5
def optimize_model(policy, model, optimizer, memory, batch_size,
                    alpha, beta, gamma):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)))

    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # calculate pi_i
    term = model(state_batch)
    max_term = torch.max(term, 1)[0].unsqueeze(1)
    pi_i = torch.exp(term-max_term)/(torch.exp(term-max_term).sum(1).unsqueeze(1))

    # reg rewards
    reward_batch = (reward_batch.unsqueeze(1) +
                     (alpha/beta)*torch.log(policy.forward(state_batch).gather(1, action_batch))
                     - (1/beta)*torch.log(pi_i.gather(1, action_batch)))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states, 2nd component of equation 7
    next_state_values = torch.zeros(batch_size).type(Tensor)
    next_state_values[non_final_mask] = ( torch.log(
        (torch.pow(policy.forward(non_final_next_states), alpha)
        * (torch.exp(beta * model(non_final_next_states)) + 1e-16)).sum(1)) / beta ).detach()
    
    # Compute the expected Q values
    expected_state_action_values = (next_state_values.unsqueeze(1) * gamma) + reward_batch

    # Compute MSE loss
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()
Example #6
def optimize_model(model, optimizer, memory, BATCH_SIZE, GAMMA, BETA):
    global last_sync
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(
        tuple(map(lambda s: s is not None, batch.next_state)))
    # We don't want to backprop through the expected action values; volatile
    # saves us from temporarily setting the model parameters'
    # requires_grad to False.
    non_final_next_states = Variable(torch.cat(
        [s for s in batch.next_state if s is not None]),
                                     volatile=True)
    state_batch = Variable(torch.cat(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.cat(batch.reward))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor))
    next_state_values[non_final_mask] = torch.log(
        torch.exp(BETA * model(non_final_next_states)).sum(1)) / BETA
    # Now, we don't want to mess up the loss with a volatile flag, so let's
    # clear it. After this, we'll just end up with a Variable that has
    # requires_grad=False
    next_state_values.volatile = False
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute MSE loss
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()
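Example #6 drops the distilled-policy weighting and bootstraps with the plain soft (log-sum-exp) backup, i.e. it is closer to soft Q-learning than to the full Distral value used in the other examples:

V(s') = \frac{1}{\beta} \log \sum_{a'} \exp\!\big(\beta\, Q(s', a')\big)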
Example #7
def optimize_policy(policy, optimizer, memories, batch_size,
                    num_envs, gamma, alpha, beta):
    '''
    Optimize the distilled policy via stochastic gradient descent
    by sampling from equation 5 from Teh et al.
    '''

    loss = 0
    for i_env in range(num_envs):
        # determine sample size
        size_to_sample = np.minimum(batch_size, len(memories[i_env]))

        # extract sample
        transitions = memories[i_env].policy_sample(size_to_sample)

        # format sampled batch
        batch = Transition(*zip(*transitions))

        # format state, time and action batch
        state_batch = torch.cat(batch.state)
        time_batch = torch.cat(batch.time)
        action_batch = torch.cat(batch.action)

        # add to current loss according to equation 5 from Teh et al.
        cur_loss = (torch.pow(Variable(Tensor([gamma])), time_batch) *
            torch.log(policy(state_batch).gather(1, action_batch))).sum()

        loss -= cur_loss

    loss = (alpha/beta) * loss

    optimizer.zero_grad()
    loss.backward()

    for param in policy.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()
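The loss accumulated in Example #7 is the sampled distillation term of the Distral objective: the alpha/beta-weighted, gamma^t-discounted log-likelihood of the stored actions under the distilled policy pi_0. Written out as the code implements it (the full equation 5 of Teh et al. is the gradient of the complete objective):

L(\theta_0) = -\frac{\alpha}{\beta} \sum_{i=1}^{N_{\text{envs}}}
  \sum_{(s_t, a_t, t) \in \mathcal{B}_i} \gamma^{t} \log \pi_0(a_t \mid s_t; \theta_0)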
Example #8
def optimize_policy(policy, optimizer, memories, batch_size,
                    num_envs, gamma):
    loss = 0
    for i_env in range(num_envs):
        size_to_sample = np.minimum(batch_size, len(memories[i_env]))
        transitions = memories[i_env].policy_sample(size_to_sample)
        batch = Transition(*zip(*transitions))
        
        state_batch = Variable(torch.cat(batch.state))
        # print(batch.action)
        time_batch = Variable(torch.cat(batch.time))
        # Select log pi(a_t | s_t) per sample with gather; [:, actions]
        # indexing would broadcast every sampled action over every row.
        action_batch = Variable(torch.cat(batch.action))

        cur_loss = (torch.pow(Variable(Tensor([gamma])), time_batch) *
            torch.log(policy(state_batch).gather(1, action_batch))).sum()
        loss -= cur_loss
        # loss = cur_loss if i_env == 0 else loss + cur_loss

    optimizer.zero_grad()
    loss.backward()

    # for param in policy.parameters():
    #     param.grad.data.clamp_(-1, 1)
    optimizer.step()
Example #9
def optimize_model(policy, model, optimizer, memory, batch_size, alpha, beta,
                   gamma):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(
        tuple(map(lambda s: s is not None, batch.next_state)))
    # We don't want to backprop through the expected action values; here the
    # soft value backup below is .detach()-ed instead of relying on the old
    # volatile flag.
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])
    # non_final_next_states.requires_grad = False

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # calculate the numerator of equation 8
    # pi0_a_pref = policy.forward_action_pref(state_batch)
    term = beta * model(state_batch)
    max_term = torch.max(term, 1)[0].unsqueeze(1)

    # get equation 8, pi_i(a_t | s_t)
    pi_i = torch.exp(term - max_term) / (
        torch.exp(term - max_term).sum(1).unsqueeze(1))

    # reg rewards
    reward_batch = (
        reward_batch.unsqueeze(1) + (alpha / beta) *
        torch.log(policy.forward(state_batch).gather(1, action_batch)) -
        (1 / beta) * torch.log(pi_i.gather(1, action_batch)))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states, 2nd component of equation 7
    next_state_values = torch.zeros(batch_size).type(Tensor)
    next_state_values[non_final_mask] = (torch.log(
        (torch.pow(policy.forward(non_final_next_states), alpha) *
         (torch.exp(beta * model(non_final_next_states)) + 1e-16)).sum(1)) /
                                         beta).detach()

    # Debugging check: flag NaNs in the soft value backup
    if np.isnan(next_state_values.sum().data.numpy()):
        print('true')

    # Now, we don't want to mess up the loss with a volatile flag, so let's
    # clear it. After this, we'll just end up with a Variable that has
    # requires_grad=False
    # next_state_values.volatile = False
    # Compute the expected Q values
    expected_state_action_values = (next_state_values.unsqueeze(1) *
                                    gamma) + reward_batch

    # Compute MSE loss (the smooth L1 / Huber variant is left commented out)
    loss = F.mse_loss(state_action_values, expected_state_action_values)
    # loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()
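A minimal, hypothetical driver loop tying the two routines together is sketched below. All names (`envs`, `models`, `select_action`, the optimizers) and the hyperparameter values are placeholders for illustration, not taken from the original repositories; states, actions, rewards and times are assumed to be stored as row tensors so that the `torch.cat` calls in the optimizers work.

# Hypothetical wiring of optimize_model / optimize_policy; every name and
# hyperparameter below is a placeholder, not taken from the original code.
num_envs = len(envs)
memories = [ReplayMemory(10000) for _ in range(num_envs)]

for episode in range(num_episodes):
    for i_env, env in enumerate(envs):
        state = env.reset()                  # assumed 1 x state_dim FloatTensor
        time_step, done = 0, False
        while not done:
            # assumed helper returning a 1 x 1 LongTensor action index
            action = select_action(policy, models[i_env], state)
            next_state, reward, done, _ = env.step(action.item())
            memories[i_env].push(state, action,
                                 None if done else next_state,
                                 Tensor([reward]), Tensor([time_step]))
            state, time_step = next_state, time_step + 1

        # TD update of the task-specific Q-network (Example #3 / #9 style).
        optimize_model(policy, models[i_env], model_optimizers[i_env],
                       memories[i_env], batch_size, alpha, beta, gamma)

    # Distillation update of the shared policy across all tasks
    # (Example #1 / #7 style).
    optimize_policy(policy, policy_optimizer, memories,
                    batch_size, num_envs, gamma, alpha, beta)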