# Imports required by the optimization routines in this section. `Transition`,
# the replay-memory classes, and the `Tensor`/`ByteTensor` type aliases are
# assumed to be defined elsewhere in the repository.
import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable


def optimize_policy(policy, optimizer, memories, batch_size, num_envs,
                    gamma, alpha, beta):
    loss = 0
    for i_env in range(num_envs):
        # sample at most batch_size transitions from this environment's memory
        size_to_sample = np.minimum(batch_size, memories[i_env].policy_length())
        transitions = memories[i_env].policy_sample(size_to_sample)
        batch = Transition(*zip(*transitions))

        state_batch = Variable(torch.cat(batch.state))
        time_batch = Variable(torch.cat(batch.time))
        action_batch = torch.cat(batch.action)

        # discounted log-probability of the taken actions under the distilled policy
        cur_loss = (torch.pow(Variable(Tensor([gamma])), time_batch) *
                    torch.log(policy(state_batch).gather(1, action_batch))).sum()
        loss -= cur_loss

    loss = (alpha / beta) * loss
    optimizer.zero_grad()
    loss.backward()
    for param in policy.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()

def optimize_policy(policy, optimizer, memories, batch_size, num_envs, gamma, device):
    loss = 0
    # build the discount tensor once, outside the loop, so the scalar `gamma`
    # argument is not overwritten by a tensor on later iterations
    gamma = Tensor([gamma]).to(device)
    for i_env in range(num_envs):
        size_to_sample = np.minimum(batch_size, len(memories[i_env]))
        transitions = memories[i_env].policy_sample(size_to_sample)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state).to(device)
        time_batch = torch.cat(batch.time).to(device)
        actions = np.array(
            [action.cpu().numpy()[0][0] for action in batch.action])
        actions = torch.from_numpy(actions).to(device)

        # gather the probability of each taken action (one entry per transition)
        cur_loss = (
            torch.pow(gamma, time_batch) *
            torch.log(policy(state_batch).gather(1, actions.unsqueeze(1)))).sum()
        loss -= cur_loss

    optimizer.zero_grad()
    loss.backward()
    for param in policy.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()

def optimize_model(policy, model, optimizer, memory, batch_size, alpha, beta, gamma):
    '''
    Optimize w.r.t. task-specific policies via MSE and TD-learning
    '''
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)

    # format transition tuple
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(tuple(map(lambda s: s is not None, batch.next_state)))

    # extract all non-terminal next states
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

    # format state, action and reward batches
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # calculate the numerator of equation 8 from Teh et al.
    pi0_a_pref = policy.forward_action_pref(state_batch)
    term = alpha * pi0_a_pref + beta * model(state_batch)
    max_term = torch.max(term, 1)[0].unsqueeze(1)

    # get pi_i(a_t | s_t) via equation 8 from Teh et al.
    pi_i = torch.exp(term - max_term) / torch.exp(term - max_term).sum(1).unsqueeze(1)

    # compute regularized rewards as defined in Teh et al.
    reward_batch = (reward_batch.unsqueeze(1)
                    + (alpha / beta) * torch.log(policy.forward(state_batch).gather(1, action_batch))
                    - (1 / beta) * torch.log(pi_i.gather(1, action_batch)))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states, 2nd component of equation 7 from Teh et al.
    next_state_values = torch.zeros(batch_size).type(Tensor)
    next_state_values[non_final_mask] = (
        torch.log(
            (torch.pow(policy.forward(non_final_next_states), alpha) *
             (torch.exp(beta * model(non_final_next_states)) + 1e-16)).sum(1)) / beta
    ).detach()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values.unsqueeze(1) * gamma) + reward_batch

    # Compute MSE loss
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()

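# The following is an illustrative, self-contained sketch (not part of the
# repository) of the max-subtraction trick used in the pi_i computation above:
# exp(term - max) / sum(exp(term - max)) is mathematically the same softmax as
# exp(term) / sum(exp(term)), but it cannot overflow when alpha * pi0_a_pref +
# beta * Q is large. The tensor values below are made up for demonstration.
import torch
import torch.nn.functional as F

term = torch.tensor([[1000.0, 999.0, 998.0],   # large action preferences: naive softmax overflows
                     [0.5, -0.5, 2.0]])
max_term = torch.max(term, 1)[0].unsqueeze(1)
stable_pi = torch.exp(term - max_term) / torch.exp(term - max_term).sum(1).unsqueeze(1)
naive_pi = torch.exp(term) / torch.exp(term).sum(1).unsqueeze(1)

print(torch.isnan(naive_pi).any())                         # True: exp(1000) overflows to inf
print(torch.allclose(stable_pi, F.softmax(term, dim=1)))   # True: same distribution, computed stably
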
def optimize_model(policy, model, optimizer, memory, batch_size, alpha, beta, gamma):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)))
    # We don't want to backprop through the expected action values; the
    # volatile flag saves us from temporarily setting the model parameters'
    # requires_grad to False.
    non_final_next_states = Variable(torch.cat([s for s in batch.next_state
                                                if s is not None]),
                                     volatile=True)
    state_batch = Variable(torch.cat(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.cat(batch.reward))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = Variable(torch.zeros(batch_size).type(Tensor))
    next_state_values[non_final_mask] = torch.log(
        (torch.pow(policy(non_final_next_states), alpha) *
         (torch.exp(beta * model(non_final_next_states)) + 1e-16)).sum(1)) / beta

    # Diagnostic: dump the intermediate terms if the soft value estimate went NaN
    if np.isnan(next_state_values.sum().data[0]):
        print("next_state_values:", next_state_values)
        print(policy(non_final_next_states))
        print(torch.exp(beta * model(non_final_next_states)))
        print(model(non_final_next_states))

    # Now, we don't want to mess up the loss with a volatile flag, so let's
    # clear it. After this, we'll just end up with a Variable that has
    # requires_grad=False
    next_state_values.volatile = False
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # Compute MSE loss
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # for param in model.parameters():
    #     param.grad.data.clamp_(-1, 1)
    optimizer.step()

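# An illustrative, standalone sketch (toy example, not repository code) of why
# the newer variants in this section call .detach() on the bootstrapped value
# instead of relying on the deprecated `volatile` flag used above: detach()
# returns a tensor that shares storage but is excluded from the autograd graph,
# so no gradient flows through the TD target.
import torch

q = torch.tensor([1.0, 2.0], requires_grad=True)
target = (2.0 * q).detach() + 1.0          # treated as a constant by autograd
loss = ((q - target) ** 2).sum()
loss.backward()
print(q.grad)                              # tensor([-4., -6.]): gradient flows through q only, not target
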
def optimize_model(policy, model, optimizer, memory, batch_size, alpha, beta, gamma):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(tuple(map(lambda s: s is not None, batch.next_state)))
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # calculate pi_i
    term = model(state_batch)
    max_term = torch.max(term, 1)[0].unsqueeze(1)
    pi_i = torch.exp(term - max_term) / torch.exp(term - max_term).sum(1).unsqueeze(1)

    # reg rewards
    reward_batch = (reward_batch.unsqueeze(1)
                    + (alpha / beta) * torch.log(policy.forward(state_batch).gather(1, action_batch))
                    - (1 / beta) * torch.log(pi_i.gather(1, action_batch)))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states, 2nd component of equation 7
    next_state_values = torch.zeros(batch_size).type(Tensor)
    next_state_values[non_final_mask] = (
        torch.log(
            (torch.pow(policy.forward(non_final_next_states), alpha) *
             (torch.exp(beta * model(non_final_next_states)) + 1e-16)).sum(1)) / beta
    ).detach()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values.unsqueeze(1) * gamma) + reward_batch

    # Compute MSE loss
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()

def optimize_model(model, optimizer, memory, BATCH_SIZE, GAMMA, BETA):
    global last_sync
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(
        tuple(map(lambda s: s is not None, batch.next_state)))
    # We don't want to backprop through the expected action values; the
    # volatile flag saves us from temporarily setting the model parameters'
    # requires_grad to False.
    non_final_next_states = Variable(torch.cat(
        [s for s in batch.next_state if s is not None]), volatile=True)
    state_batch = Variable(torch.cat(batch.state))
    action_batch = Variable(torch.cat(batch.action))
    reward_batch = Variable(torch.cat(batch.reward))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute the soft value V(s_{t+1}) for all next states.
    next_state_values = Variable(torch.zeros(BATCH_SIZE).type(Tensor))
    next_state_values[non_final_mask] = torch.log(
        torch.exp(BETA * model(non_final_next_states)).sum(1)) / BETA
    # Now, we don't want to mess up the loss with a volatile flag, so let's
    # clear it. After this, we'll just end up with a Variable that has
    # requires_grad=False
    next_state_values.volatile = False
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute MSE loss
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()

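# An illustrative, standalone sketch (toy values, not repository code) of the
# soft value backup used above, V(s) = (1/BETA) * log sum_a exp(BETA * Q(s, a)).
# It is a smoothed maximum: it upper-bounds max_a Q(s, a) by at most
# log(num_actions) / BETA and converges to the hard max as BETA grows.
import torch

q_values = torch.tensor([[1.0, 2.0, 3.0]])     # hypothetical Q(s, .) for a single state
for BETA in (0.1, 1.0, 5.0, 50.0):
    soft_v = torch.logsumexp(BETA * q_values, dim=1) / BETA
    print(BETA, soft_v.item())                 # approaches 3.0 as BETA increases
print(q_values.max(dim=1)[0].item())           # hard max for comparison
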
def optimize_policy(policy, optimizer, memories, batch_size, num_envs, gamma, alpha, beta):
    '''
    Optimize the distilled policy via stochastic gradient descent by sampling
    from equation 5 from Teh et al.
    '''
    loss = 0
    for i_env in range(num_envs):
        # determine sample size
        size_to_sample = np.minimum(batch_size, len(memories[i_env]))

        # extract sample
        transitions = memories[i_env].policy_sample(size_to_sample)

        # format sampled batch
        batch = Transition(*zip(*transitions))

        # format state, time and action batch
        state_batch = torch.cat(batch.state)
        time_batch = torch.cat(batch.time)
        action_batch = torch.cat(batch.action)

        # add to current loss according to equation 5 from Teh et al.
        cur_loss = (torch.pow(Variable(Tensor([gamma])), time_batch) *
                    torch.log(policy(state_batch).gather(1, action_batch))).sum()
        loss -= cur_loss

    loss = (alpha / beta) * loss
    optimizer.zero_grad()
    loss.backward()
    for param in policy.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()

def optimize_policy(policy, optimizer, memories, batch_size, num_envs, gamma):
    loss = 0
    for i_env in range(num_envs):
        size_to_sample = np.minimum(batch_size, len(memories[i_env]))
        transitions = memories[i_env].policy_sample(size_to_sample)
        batch = Transition(*zip(*transitions))

        state_batch = Variable(torch.cat(batch.state))
        time_batch = Variable(torch.cat(batch.time))
        actions = np.array([action.numpy()[0][0] for action in batch.action])
        # one column index per transition, so gather() selects the probability
        # of the action actually taken in each sampled transition
        action_batch = Variable(torch.from_numpy(actions).long().unsqueeze(1))

        cur_loss = (torch.pow(Variable(Tensor([gamma])), time_batch) *
                    torch.log(policy(state_batch).gather(1, action_batch))).sum()
        loss -= cur_loss

    optimizer.zero_grad()
    loss.backward()
    # for param in policy.parameters():
    #     param.grad.data.clamp_(-1, 1)
    optimizer.step()

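# An illustrative, standalone sketch (toy tensors only) of why the taken-action
# probabilities are selected with gather() in the functions above: gather(1,
# action_batch) returns exactly one entry per transition (row i, column
# action_batch[i]), whereas plain column indexing `probs[:, actions]` crosses
# every row with every sampled action and yields a batch_size x batch_size matrix.
import torch

probs = torch.tensor([[0.1, 0.9],
                      [0.7, 0.3],
                      [0.4, 0.6]])             # hypothetical policy output: 3 transitions, 2 actions
actions = torch.tensor([[1], [0], [1]])        # action taken in each transition
print(probs.gather(1, actions))                # shape (3, 1): 0.9, 0.7, 0.6
print(probs[:, actions.squeeze(1)].shape)      # torch.Size([3, 3]): not the per-sample selection we want
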
def optimize_model(policy, model, optimizer, memory, batch_size, alpha, beta, gamma):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation).
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    non_final_mask = ByteTensor(
        tuple(map(lambda s: s is not None, batch.next_state)))
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # calculate the numerator of equation 8
    term = beta * model(state_batch)
    max_term = torch.max(term, 1)[0].unsqueeze(1)

    # get equation 8, pi_i(a_t | s_t)
    pi_i = torch.exp(term - max_term) / (
        torch.exp(term - max_term).sum(1).unsqueeze(1))

    # regularized rewards
    reward_batch = (
        reward_batch.unsqueeze(1) +
        (alpha / beta) * torch.log(policy.forward(state_batch).gather(1, action_batch)) -
        (1 / beta) * torch.log(pi_i.gather(1, action_batch)))

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = model(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states, 2nd component of equation 7
    next_state_values = torch.zeros(batch_size).type(Tensor)
    next_state_values[non_final_mask] = (torch.log(
        (torch.pow(policy.forward(non_final_next_states), alpha) *
         (torch.exp(beta * model(non_final_next_states)) + 1e-16)).sum(1)) /
        beta).detach()
    # warn if the soft value estimate has gone NaN
    if np.isnan(next_state_values.sum().data.numpy()):
        print('warning: NaN in next_state_values')

    # Compute the expected Q values
    expected_state_action_values = (next_state_values.unsqueeze(1) * gamma) + reward_batch

    # Compute MSE loss
    loss = F.mse_loss(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in model.parameters():
        param.grad.data.clamp_(-500, 500)
    optimizer.step()