Example #1
def act_with_softmax(algo, state, body, gpu):
    '''Assumes the actor network outputs one variable: the logits of a categorical probability distribution over the actions'''
    recurrent = algo.agent.len_state_buffer > 0
    torch_state = create_torch_state(state, body.state_buffer, gpu, recurrent,
                                     algo.agent.len_state_buffer)
    out = algo.get_actor_output(torch_state, evaluate=False)
    if type(out) is list:
        out = out[0]
    out.squeeze_(dim=0)
    probs = F.softmax(out, dim=0)
    m = Categorical(probs)
    action = m.sample()
    logger.debug2(f'Network output: {out.data}')
    logger.debug2(f'Probability of actions: {probs.data}')
    logger.debug(
        f'Action: {action.data[0]}, log prob: {m.log_prob(action).data[0]}')
    algo.saved_log_probs.append(m.log_prob(action))
    # Calculate entropy of the distribution
    H = -torch.sum(torch.mul(probs, torch.log(probs)))
    if np.isnan(H.data.cpu().numpy()):
        logger.debug(f'NaN entropy, setting to 0')
        H = torch.zeros(1)
        if torch.cuda.is_available() and gpu:
            H = H.cuda()
        H = Variable(H)
    algo.entropy.append(H)
    return action.data[0]
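For reference, the sampling, log-probability, and entropy computations above are also available directly on torch.distributions.Categorical in recent PyTorch versions, which avoids the manual -sum(p * log p) and the NaN guard. A minimal sketch, assuming a recent PyTorch and a made-up logits tensor in place of the actor output:

import torch
from torch.distributions import Categorical

logits = torch.tensor([0.2, -1.3, 0.5, 0.1])  # stand-in for the actor network output
dist = Categorical(logits=logits)             # applies softmax internally
action = dist.sample()
log_prob = dist.log_prob(action)              # what gets appended to saved_log_probs above
entropy = dist.entropy()                      # equivalent to -sum(p * log p)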
Example #2
def multi_act_with_boltzmann(nanflat_body_a, state_a, net, nanflat_tau_a, gpu):
    nanflat_state_a = util.nanflatten(state_a)
    cat_state_a = np.concatenate(nanflat_state_a).astype(float)
    torch_state = torch.from_numpy(cat_state_a).float()
    if torch.cuda.is_available() and gpu:
        torch_state = torch_state.cuda()
    torch_state = Variable(torch_state)
    out = net.wrap_eval(torch_state)
    nanflat_action_a = []
    start_idx = 0
    logger.debug2(f'taus: {nanflat_tau_a}')
    for body, tau in zip(nanflat_body_a, nanflat_tau_a):
        end_idx = start_idx + body.action_dim
        out_with_temp = torch.div(out[start_idx:end_idx], tau)
        logger.debug3(f'''
        tau: {tau}, out: {out},
        out select: {out[start_idx: end_idx]},
        out with temp: {out_with_temp}''')
        probs = F.softmax(Variable(out_with_temp.cpu()), dim=0).data.numpy()
        action = np.random.choice(list(range(body.action_dim)), p=probs)
        logger.debug3(f'''
        body: {body.aeb}, net idx: {start_idx}-{end_idx}
        probs: {probs}, action: {action}''')
        nanflat_action_a.append(action)
        start_idx = end_idx
    return nanflat_action_a
Example #3
 def get_R_ex_state_val_estimate(self, next_state_vals, rewards):
     nts = self.num_step_returns
     R = torch.zeros_like(next_state_vals)
     curr_reward_step = torch.zeros_like(next_state_vals)
     next_state_gammas = torch.zeros_like(next_state_vals)
     if nts >= next_state_vals.size(0):
         logger.debug2(
             f'Num step returns {self.num_step_returns} is greater than the batch length {next_state_vals.size(0)}. Updating to batch length'
         )
         nts = next_state_vals.size(0) - 1
     if nts == 0:
         next_state_gammas.fill_(1.0)
     else:
         j = -nts
         next_state_gammas[:j] = 1.0
     for i in range(nts, 0, -1):
         logger.debug(f'i: {i}, j: {j}')
         curr_reward_step[:j] = rewards[i:]
         next_state_gammas[:j] *= self.gamma
         R = curr_reward_step + self.gamma * R
         next_state_gammas[j] = 1.0
         j += 1
         logger.debug3(f'curr_reward_step: {curr_reward_step}')
         logger.debug3(f'next_state_gammas: {next_state_gammas}')
         logger.debug3(f'R: {R}')
     return (R, next_state_gammas)
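The loop above accumulates, per batch position, the discounted sum of the next nts rewards, together with the gamma^n factors later used to discount the bootstrap value. A minimal single-position sketch of the same quantity, using hypothetical plain-Python inputs:

def n_step_return(rewards, gamma, n, bootstrap_value=0.0):
    '''R = r_0 + gamma*r_1 + ... + gamma^(n-1)*r_{n-1} + gamma^n * V(s_n)'''
    n = min(n, len(rewards))
    R = 0.0
    for k in reversed(range(n)):
        R = rewards[k] + gamma * R
    return R + (gamma ** n) * bootstrap_value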
Example #4
 def compute_q_target_values(self, batch):
     '''Computes the target Q values for a batch of experiences. Note that the net references may differ based on the algorithm.'''
     q_sts = self.net.wrap_eval(batch['states'])
     # Use act_select network to select actions in next state
     q_next_st_acts = self.online_net.wrap_eval(batch['next_states'])
     _val, q_next_acts = torch.max(q_next_st_acts, dim=1)
     logger.debug2(f'Q next action: {q_next_acts.size()}')
     # Select q_next_st_maxs based on action selected in q_next_acts
     # Evaluate the action selection using the eval net
     q_next_sts = self.eval_net.wrap_eval(batch['next_states'])
     logger.debug2(f'Q next_states: {q_next_sts.size()}')
     idx = torch.from_numpy(np.array(list(range(self.batch_size))))
     q_next_st_maxs = q_next_sts[idx, q_next_acts]
     q_next_st_maxs.unsqueeze_(1)
     logger.debug2(f'Q next_states max {q_next_st_maxs.size()}')
     # Compute final q_target using reward and estimated best Q value from the next state if there is one
     # Make future reward 0 if the current state is done
     q_targets_max = batch['rewards'].data + self.gamma * \
         torch.mul((1 - batch['dones'].data), q_next_st_maxs)
     logger.debug2(f'Q targets max: {q_targets_max.size()}')
     # We only want to train the network for the action selected
     # For all other actions we set the q_target = q_sts
     # So that the loss for these actions is 0
     q_targets = torch.mul(q_targets_max, batch['actions'].data) + \
         torch.mul(q_sts, (1 - batch['actions'].data))
     logger.debug2(f'Q targets: {q_targets.size()}')
     return q_targets
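When online_net and eval_net are distinct networks, this is the double-DQN target: the online net selects the argmax action in the next state and the eval net evaluates it, which reduces the overestimation bias of maxing over a single network. A minimal sketch of the same target with current PyTorch indexing (online_net and eval_net stand for hypothetical nn.Module instances; rewards and dones are column tensors of shape (batch, 1)):

import torch

def double_dqn_targets(online_net, eval_net, next_states, rewards, dones, gamma):
    with torch.no_grad():
        next_actions = online_net(next_states).argmax(dim=1, keepdim=True)  # action selection
        next_q = eval_net(next_states).gather(1, next_actions)              # action evaluation
        return rewards + gamma * (1 - dones) * next_q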
Example #5
 def get_nstep_target_episodic(self, batch):
     '''Returns a list of tensors containing the estimate of the state-action values per batch using n-step returns'''
     nts = self.num_step_returns
     targets = []
     dones = batch['dones']
     next_states = batch['next_states']
     rewards = batch['rewards']
     for d, ns, r in zip(dones, next_states, rewards):
         next_state_vals = self.get_critic_output(ns).squeeze_(dim=1)
         r = r.data
         (R, next_state_gammas) = self.get_R_ex_state_val_estimate(
             next_state_vals, r)
         '''Complete the return for the 0th step and add the state-value estimate'''
         R = r + self.gamma * R
         next_state_gammas *= self.gamma
         logger.debug3(f'R: {R}')
         logger.debug3(f'next_state_gammas: {next_state_gammas}')
         logger.debug3(f'dones: {d}')
         '''Calculate appropriate state value accounting for terminal states and number of time steps'''
         discounted_state_val_estimate = torch.mul(next_state_vals,
                                                   next_state_gammas)
         discounted_state_val_estimate = torch.mul(
             discounted_state_val_estimate, 1 - d.data)
         if nts < next_state_vals.size(0):
             logger.debug2(
                 f'N-step returns less than episode length, adding bootstrap'
             )
             R += discounted_state_val_estimate
         logger.debug3(
             f'discounted_state_val_estimate: {discounted_state_val_estimate}'
         )
         logger.debug3(f'R: {R}')
         targets.append(R)
     return targets
Example #6
def multi_head_act_with_boltzmann(nanflat_body_a, state_a, net, nanflat_tau_a,
                                  gpu):
    nanflat_state_a = util.nanflatten(state_a)
    torch_states = []
    for state in nanflat_state_a:
        state = state.astype('float')
        torch_states.append(torch.from_numpy(state).float().unsqueeze_(dim=0))
    if torch.cuda.is_available() and gpu:
        # NOTE: build new lists; rebinding the loop variable would not modify torch_states
        torch_states = [torch_state.cuda() for torch_state in torch_states]
    torch_states = [Variable(torch_state) for torch_state in torch_states]
    outs = net.wrap_eval(torch_states)
    out_with_temp = [torch.div(x, t) for x, t in zip(outs, nanflat_tau_a)]
    logger.debug2(
        f'taus: {nanflat_tau_a}, outs: {outs}, out_with_temp: {out_with_temp}')
    nanflat_action_a = []
    for body, output in zip(nanflat_body_a, out_with_temp):
        probs = F.softmax(Variable(output.cpu()), dim=1).data.numpy()[0]
        action = np.random.choice(list(range(body.action_dim)), p=probs)
        logger.debug3(f'''
        body: {body.aeb}, output: {output},
        probs: {probs}, action: {action}''')
        nanflat_action_a.append(action)
    return nanflat_action_a
Example #7
def act_with_boltzmann(body, state, net, tau):
    torch_state = Variable(torch.from_numpy(state).float())
    out = net.wrap_eval(torch_state)
    out_with_temp = torch.div(out, tau)
    probs = F.softmax(Variable(out_with_temp), dim=0).data.numpy()
    action = np.random.choice(list(range(body.action_dim)), p=probs)
    logger.debug2('prob: {}, action: {}'.format(probs, action))
    return action
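The division by tau before the softmax is what makes this a Boltzmann (softmax-with-temperature) policy: a small tau concentrates probability on the highest-valued action, while a large tau approaches a uniform distribution. A small illustrative sketch with made-up Q-values:

import torch
import torch.nn.functional as F

def boltzmann_probs(q_values, tau):
    '''Low tau -> near-greedy, high tau -> near-uniform.'''
    return F.softmax(q_values / tau, dim=0)

q = torch.tensor([1.0, 2.0, 3.0])
print(boltzmann_probs(q, tau=0.1))   # probability mass concentrates on the best action
print(boltzmann_probs(q, tau=10.0))  # close to uniform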
Example #8
 def calc_advantage_batch(self, batch):
     '''Calculates advantage when memory is batch based.
        target and state_vals are Tensors.
        returns advantage as a single Tensor'''
     target = self.get_target(batch)
     state_vals = self.get_critic_output(batch['states']).squeeze_()
     advantage = target - state_vals
     advantage.squeeze_()
     logger.debug2(f'Advantage: {advantage.size()}')
     return advantage
Example #9
 def update(self, action, reward, state, done):
     '''Interface method to update memory'''
     if not np.isnan(reward):
         self.add_experience(self.last_state, action, reward, state, done)
         self.nan_idxs.append(0)
     else:
         self.nan_idxs.append(1)
         logger.debug2('NaN reward')
     self.last_state = state
     '''Clear the body's state buffer for recurrent nets'''
     if done:
         self.body.state_buffer = []
Example #10
def act_with_boltzmann(body, state, net, tau, gpu):
    recurrent = body.agent.len_state_buffer > 0
    logger.debug2(f'Length state buffer: {body.agent.len_state_buffer}')
    torch_state = create_torch_state(state, body.state_buffer, gpu, recurrent,
                                     body.agent.len_state_buffer)
    out = net.wrap_eval(torch_state)
    out_with_temp = torch.div(out, tau).squeeze_(dim=0)
    probs = F.softmax(Variable(out_with_temp.cpu()), dim=0).data.numpy()
    action = np.random.choice(list(range(body.action_dim)), p=probs)
    logger.debug2('out with temp: {}, prob: {}, action: {}'.format(
        out_with_temp, probs, action))
    return action
Example #11
def multi_head_act_with_epsilon_greedy(nanflat_body_a, state_a, net,
                                       nanflat_epsilon_a, gpu):
    '''Selects nanflat_action_a for multi-headed bodies in a single pass through net. Uses epsilon-greedy, but in a batched manner.'''
    nanflat_state_a = util.nanflatten(state_a)
    nanflat_action_a = []
    torch_states = []
    for state in nanflat_state_a:
        state = state.astype('float')
        torch_states.append(torch.from_numpy(state).float().unsqueeze_(dim=0))
    if torch.cuda.is_available() and gpu:
        # NOTE: build new lists; rebinding the loop variable would not modify torch_states
        torch_states = [torch_state.cuda() for torch_state in torch_states]
    torch_states = [Variable(torch_state) for torch_state in torch_states]
    outs = net.wrap_eval(torch_states)
    for body, e, output in zip(nanflat_body_a, nanflat_epsilon_a, outs):
        logger.debug2(f'body: {body.aeb}, epsilon: {e}')
        if e > np.random.rand():
            logger.debug2(f'Random action')
            action = np.random.randint(body.action_dim)
        else:
            logger.debug2(f'Greedy action')
            action = torch.max(output, dim=1)[1][0]
        nanflat_action_a.append(action)
        logger.debug2(f'epsilon: {e}, outputs: {output}, action: {action}')
    return nanflat_action_a
Example #12
 def compute_q_target_values(self, batch):
     '''Computes the target Q values for a batch of experiences'''
     # Calculate the Q values of the current and next states
     q_sts = self.net.wrap_eval(batch['states'])
     q_next_st = self.net.wrap_eval(batch['next_states'])
     q_next_actions = batch['next_actions']
     logger.debug2(f'Q next states: {q_next_st.size()}')
     # Get the q value for the next action that was actually taken
     idx = torch.from_numpy(np.array(list(range(q_next_st.size(0)))))
     if torch.cuda.is_available() and self.gpu:
         idx = idx.cuda()
     q_next_st_vals = q_next_st[idx, q_next_actions.squeeze_(1).data.long()]
     # Expand the dims so that q_next_st_vals can be broadcast
     q_next_st_vals.unsqueeze_(1)
     logger.debug2(f'Q next_states vals {q_next_st_vals.size()}')
     logger.debug3(f'Q next_states {q_next_st}')
     logger.debug3(f'Q next actions {q_next_actions}')
     logger.debug3(f'Q next_states vals {q_next_st_vals}')
     logger.debug3(f'Dones {batch["dones"]}')
     # Compute q_targets using reward and Q value corresponding to the action taken in the next state if there is one. Make next state Q value 0 if the current state is done
     q_targets_actual = batch['rewards'].data + self.gamma * \
         torch.mul((1 - batch['dones'].data), q_next_st_vals)
     logger.debug2(f'Q targets actual: {q_targets_actual.size()}')
     logger.debug3(f'Q states {q_sts}')
     logger.debug3(f'Q targets actual: {q_targets_actual}')
     # We only want to train the network for the action selected in the current state
     # For all other actions we set the q_target = q_sts so that the loss for these actions is 0
     q_targets = torch.mul(q_targets_actual, batch['actions_onehot'].data) + \
         torch.mul(q_sts, (1 - batch['actions_onehot'].data))
     logger.debug2(f'Q targets: {q_targets.size()}')
     logger.debug3(f'Q targets: {q_targets}')
     return q_targets
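Unlike the max-based targets elsewhere in these examples, this target uses the action actually taken in the next state, i.e. a SARSA-style on-policy target. A compact sketch of the same computation using gather, with hypothetical tensor arguments (next_actions of shape (batch, 1), rewards and dones as column tensors):

import torch

def sarsa_targets(net, next_states, next_actions, rewards, dones, gamma):
    with torch.no_grad():
        q_next = net(next_states).gather(1, next_actions.long())  # Q(s', a') for the taken a'
        return rewards + gamma * (1 - dones) * q_next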
Example #13
def multi_act_with_epsilon_greedy(nanflat_body_a, state_a, net,
                                  nanflat_epsilon_a):
    '''Selects nanflat_action_a for multiple bodies in a single pass through net. Uses epsilon-greedy, but in a batched manner.'''
    nanflat_state_a = util.nanflatten(state_a)
    cat_state_a = np.concatenate(nanflat_state_a)
    nanflat_action_a = []
    start_idx = 0
    for body, e in zip(nanflat_body_a, nanflat_epsilon_a):
        logger.debug2(f'body: {body.aeb}, epsilon: {e}')
        end_idx = start_idx + body.action_dim
        if e > np.random.rand():
            logger.debug2(f'Random action')
            action = np.random.randint(body.action_dim)
        else:
            logger.debug2(f'Greedy action')
            cat_state_a = cat_state_a.astype('float')
            torch_state = Variable(torch.from_numpy(cat_state_a).float())
            out = net.wrap_eval(torch_state)
            action = int(torch.max(out[start_idx:end_idx], dim=0)[1][0])
        nanflat_action_a.append(action)
        logger.debug2(f'''
        body: {body.aeb}, net idx: {start_idx}-{end_idx}
        action: {action}''')
        start_idx = end_idx  # advance after logging so the logged range matches this body
    return nanflat_action_a
Example #14
 def get_gae_target_batch(self, batch, critic_specific):
     '''Returns a tensor containing the estimate of the state-action values using generalized advantage estimation'''
     rewards = batch['rewards'].data
     if critic_specific:
         logger.debug2(f'Using critic specific target')
         '''Target is the discounted sum of returns for training the critic'''
         target = self.get_gae_critic_target(rewards)
     else:
         logger.debug2(f'Using actor specific target')
         '''Target is the Generalized advantage estimate + current state-value estimate'''
         states = batch['states']
         next_states = batch['next_states']
         dones = batch['dones']
         target = self.get_gae_actor_target(rewards, states, next_states, dones)
     return target
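For context, generalized advantage estimation accumulates the TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) with an exponentially decaying weight (gamma * lambda); the actor target above is this advantage plus the current state-value estimate. The helper methods are not shown in this excerpt, so here is a minimal reverse-pass sketch under those standard definitions (all arguments are hypothetical per-episode sequences):

def gae_advantages(rewards, values, next_values, dones, gamma, lam):
    advantages = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        not_done = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_values[t] * not_done - values[t]
        running = delta + gamma * lam * not_done * running
        advantages[t] = running
    return advantages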
Example #15
 def calc_advantage_episodic(self, batch):
     '''Calculates advantage when memory is episodic.
        target and state_vals are lists containing tensors per episode.
        returns advantage as a single tensor combined for all episodes'''
     target = self.get_target(batch)
     advantage = []
     states = batch['states']
     for s, t in zip(states, target):
         state_vals = self.get_critic_output(s).squeeze_()
         a = t - state_vals
         a.squeeze_()
         logger.debug2(f'Advantage: {a.size()}')
         advantage.append(a)
     advantage = torch.cat(advantage)
     return advantage
Example #16
 def update(self, action, reward, state, done):
     '''Interface method to update memory'''
     self.base_update(action, reward, state, done)
     state = self.preprocess_state(
         state,
         append=False)  # prevent conflict with preprocess in epi_reset
     if not np.isnan(reward):  # not the start of an episode
         logger.debug2(f'original reward: {reward}')
         reward = max(-1, min(1, reward))  # clip reward to [-1, 1]
         logger.debug(
             f'state: {state.shape}, reward: {reward}, last_state: {self.last_state.shape}'
         )
         self.add_experience(self.last_state, action, reward, state, done)
     self.last_state = state
Example #17
def act_with_epsilon_greedy(body, state, net, epsilon, gpu):
    '''
    Single body action with probability epsilon to select a random action,
    otherwise select the action associated with the largest q value
    '''
    if epsilon > np.random.rand():
        action = np.random.randint(body.action_dim)
    else:
        recurrent = body.agent.len_state_buffer > 0
        logger.debug2(f'Length state buffer: {body.agent.len_state_buffer}')
        torch_state = create_torch_state(state, body.state_buffer, gpu,
                                         recurrent,
                                         body.agent.len_state_buffer)
        out = net.wrap_eval(torch_state).squeeze_(dim=0)
        action = int(torch.max(out, dim=0)[1][0])
        logger.debug2(f'Outs {out} Action {action}')
    return action
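Stripped of the recurrent state-buffer handling, the epsilon-greedy rule itself is just a coin flip between a random action and the argmax of the Q-values. A minimal standalone sketch (the function name and arguments are made up for illustration):

import numpy as np
import torch

def epsilon_greedy(q_values, epsilon):
    '''q_values: 1-D tensor of action values for a single state.'''
    if np.random.rand() < epsilon:
        return np.random.randint(q_values.size(0))   # explore
    return int(q_values.argmax().item())             # exploit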
Example #18
def act_with_gaussian(algo, state, body):
    '''Assumes the net outputs two variables: the mean and standard deviation of a normal distribution'''
    recurrent = algo.agent.len_state_buffer > 0
    torch_state = create_torch_state(state, body.state_buffer, recurrent,
                                     algo.agent.len_state_buffer)
    [mu, sigma] = algo.get_actor_output(torch_state, evaluate=False)
    sigma = F.softplus(sigma) + 1e-5  # Ensures sigma > 0
    m = Normal(mu, sigma)
    action = m.sample()
    action = torch.clamp(action, -algo.continuous_action_clip,
                         algo.continuous_action_clip)
    logger.debug2(
        f'Action: {action.data[0]}, log prob: {m.log_prob(action).data[0]}')
    algo.saved_log_probs.append(m.log_prob(action))
    # Calculate entropy of the distribution
    H = 0.5 * torch.log(2.0 * np.pi * np.e * sigma * sigma)
    algo.entropy.append(H)
    return action.data
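The entropy formula H = 0.5 * log(2*pi*e*sigma^2) above is the closed-form entropy of a univariate Gaussian. Recent PyTorch versions provide the same sampling, log-probability, and entropy through torch.distributions.Normal; a minimal sketch with made-up actor outputs:

import torch
from torch.distributions import Normal

mu = torch.tensor([0.0])        # stand-ins for the actor's mean and std-dev outputs
sigma = torch.tensor([0.5])
dist = Normal(mu, sigma)
action = dist.sample()
log_prob = dist.log_prob(action)
entropy = dist.entropy()        # equals 0.5 * log(2 * pi * e * sigma^2)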
Example #19
 def check_sizes(self, advantage):
     '''Checks that log probs, advantage, and entropy all have the same size
        Occasionally they do not; this is caused by the first reward of an episode being NaN. If they are not the same size, this function removes the elements of the log probs and entropy that correspond to NaN rewards.'''
     body = self.agent.nanflat_body_a[0]
     nan_idxs = body.memory.last_nan_idxs
     num_nans = sum(nan_idxs)
     assert len(nan_idxs) == len(self.saved_log_probs)
     assert len(nan_idxs) == len(self.entropy)
     assert len(nan_idxs) - num_nans == advantage.size(0)
     logger.debug2(f'{num_nans} nans encountered when gathering data')
     if num_nans != 0:
         idxs = [x for x in range(len(nan_idxs)) if nan_idxs[x] == 1]
         logger.debug3(f'Nan indexes: {idxs}')
         for idx in idxs[::-1]:
             del self.saved_log_probs[idx]
             del self.entropy[idx]
     assert len(self.saved_log_probs) == advantage.size(0)
     assert len(self.entropy) == advantage.size(0)
     return advantage
Example #20
def create_torch_state(state, state_buffer, recurrent=False, length=0):
    if recurrent:
        '''Create sequence of inputs for recurrent net'''
        logger.debug3(f'length of state buffer: {length}')
        if len(state_buffer) < length:
            PAD = np.zeros_like(state)
            while len(state_buffer) < length:
                state_buffer.insert(0, PAD)
        state_buffer = np.asarray(state_buffer)
        '''Hack to fix buffer not storing the very first state in an epi'''
        if np.sum(state_buffer) == 0:
            state_buffer[-1] = state
        torch_state = Variable(torch.from_numpy(state_buffer).float())
        torch_state.unsqueeze_(dim=0)
    else:
        torch_state = Variable(torch.from_numpy(state).float())
    logger.debug2(f'State size: {torch_state.size()}')
    logger.debug3(f'Original state: {state}')
    logger.debug3(f'State: {torch_state}')
    return torch_state
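The recurrent branch builds a fixed-length sequence by left-padding the state history with zero-states and adding a batch dimension, so the input shape becomes (1, length, state_dim). A self-contained sketch of just that padding step (function name and arguments are hypothetical):

import numpy as np
import torch

def padded_sequence(state, state_buffer, length):
    '''Left-pad a short history with zero-states so the sequence always has `length` steps.'''
    buf = list(state_buffer)[-length:]
    while len(buf) < length:
        buf.insert(0, np.zeros_like(state))
    seq = np.asarray(buf, dtype=np.float32)
    return torch.from_numpy(seq).unsqueeze(0)   # shape (1, length, state_dim)

s = np.ones(4, dtype=np.float32)
print(padded_sequence(s, [s, s], length=5).size())   # torch.Size([1, 5, 4])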
Example #21
 def get_gae_target_episodic(self, batch, critic_specific):
     '''Returns a list of tensors containing the estimate of the state-action values per batch using generalized advantage estimation'''
     rewards = batch['rewards']
     targets = []
     if critic_specific:
         logger.debug2(f'Using critic specific target')
         '''Target is the discounted sum of returns for training the critic'''
         for r in rewards:
             t = self.get_gae_critic_target(r.data)
             targets.append(t)
     else:
         logger.debug2(f'Using actor specific target')
         '''Target is the Generalized advantage estimate + current state-value estimate'''
         states = batch['states']
         next_states = batch['next_states']
         dones = batch['dones']
         for r, s, ns, d in zip(rewards, states, next_states, dones):
             t = self.get_gae_actor_target(r.data, s, ns, d)
             targets.append(t)
     return targets
Example #22
 def train_shared(self):
     '''Trains the network when the actor and critic share parameters'''
     if self.to_train == 1:
         batch = self.sample()
         '''Calculate policy loss (actor)'''
         policy_loss = self.get_policy_loss(batch)
         '''Calculate state-value loss (critic)'''
         target = self.get_target(batch, critic_specific=True)
         states = batch['states']
         if self.is_episodic:
             target = torch.cat(target)
             states = torch.cat(states)
         if torch.cuda.is_available() and self.gpu:
             target = target.cuda()
         y = Variable(target.unsqueeze_(dim=-1))
         state_vals = self.get_critic_output(states, evaluate=False)
         assert state_vals.data.size() == y.data.size()
         val_loss = F.mse_loss(state_vals, y)
         '''Combine losses and train'''
         self.actorcritic.optim.zero_grad()
         total_loss = self.policy_loss_weight * policy_loss + self.val_loss_weight * val_loss
         loss = total_loss.data[0]
         total_loss.backward()
         if self.actorcritic.clamp_grad:
             logger.debug("Clipping actorcritic gradient...")
             torch.nn.utils.clip_grad_norm(self.actorcritic.params,
                                           self.actorcritic.clamp_grad_val)
         logger.debug2(
             f'Combined AC gradient norms: {self.actorcritic.get_grad_norms()}'
         )
         self.actorcritic.optim.step()
         self.to_train = 0
         self.saved_log_probs = []
         self.entropy = []
         logger.debug(
             "Losses: Critic: {:.2f}, Actor: {:.2f}, Total: {:.2f}".format(
                 val_loss.data[0], abs(policy_loss.data[0]), loss))
         return loss
     else:
         return np.nan
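The core of the shared-parameter update above is a weighted sum of the policy (actor) loss and the value (critic) loss, followed by a single backward pass, optional gradient clipping, and one optimizer step. A minimal sketch of that step with hypothetical names for the weights and parameters, using the current clip_grad_norm_ API:

import torch

def shared_ac_step(optimizer, params, policy_loss, val_loss,
                   policy_w=1.0, val_w=0.5, max_grad_norm=0.5):
    total_loss = policy_w * policy_loss + val_w * val_loss
    optimizer.zero_grad()
    total_loss.backward()
    torch.nn.utils.clip_grad_norm_(params, max_grad_norm)
    optimizer.step()
    return total_loss.item()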
Example #23
 def train_critic_episodic(self, batch):
     '''Trains the critic using entire episodes of data. Algorithm waits until episode has ended to train'''
     loss = 0
     for _i in range(self.training_iters_per_batch):
         target = self.get_target(batch, critic_specific=True)
         target = torch.cat(target)
         logger.debug2(f'Combined size: {target.size()}')
         x = []
         for state in batch['states']:
             x.append(state)
             logger.debug2(f'states: {state.size()}')
         x = torch.cat(x, dim=0)
         logger.debug2(f'Combined states: {x.size()}')
         y = Variable(target)
         loss = self.critic.training_step(x, y).data[0]
         logger.debug2(f'Critic grad norms: {self.critic.get_grad_norms()}')
     return loss
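critic.training_step is not shown in this excerpt; a typical implementation is a plain regression step that fits the critic's value predictions to the concatenated targets with an MSE loss. A minimal sketch of what such a step might look like (critic and optimizer are hypothetical):

import torch.nn.functional as F

def critic_training_step(critic, optimizer, states, targets):
    optimizer.zero_grad()
    loss = F.mse_loss(critic(states), targets)
    loss.backward()
    optimizer.step()
    return loss.item()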
Example #24
 def train(self):
     if self.to_train == 1:
         logger.debug2(f'Training...')
         # We only care about the rewards from the batch
         rewards = self.sample()['rewards']
         logger.debug3(f'Length first epi: {len(rewards[0])}')
         logger.debug3(f'Len log probs: {len(self.saved_log_probs)}')
         self.net.optim.zero_grad()
         policy_loss = self.get_policy_loss(rewards)
         loss = policy_loss.data[0]
         policy_loss.backward()
         if self.net.clamp_grad:
             logger.debug("Clipping gradient...")
             torch.nn.utils.clip_grad_norm(self.net.parameters(),
                                           self.net.clamp_grad_val)
         logger.debug2(f'Gradient norms: {self.net.get_grad_norms()}')
         self.net.optim.step()
         self.to_train = 0
         self.saved_log_probs = []
         self.entropy = []
         logger.debug(f'Policy loss: {loss}')
         return loss
     else:
         return np.nan
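get_policy_loss is not included in this excerpt; for REINFORCE it is typically the negative sum of the saved log-probabilities weighted by the (often normalized) discounted returns computed from these rewards. A minimal sketch under that assumption, with made-up argument names:

import torch

def reinforce_loss(saved_log_probs, returns):
    '''-sum_t log pi(a_t|s_t) * G_t; normalizing returns is a common variance-reduction step.'''
    returns = torch.tensor(returns, dtype=torch.float32)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return -(torch.stack(saved_log_probs) * returns).sum()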
Example #25
 def compute_q_target_values(self, batch):
     '''Computes the target Q values for a batch of experiences'''
     # Calculate the Q values of the current and next states
     q_sts = self.net.wrap_eval(batch['states'])
     q_next_st = self.net.wrap_eval(batch['next_states'])
     logger.debug2(f'Q next states: {q_next_st.size()}')
     # Get the max for each next state
     q_next_st_max, _ = torch.max(q_next_st, dim=1)
     # Expand the dims so that q_next_st_max can be broadcast
     q_next_st_max.unsqueeze_(1)
     logger.debug2(f'Q next_states max {q_next_st_max.size()}')
     # Compute q_targets using reward and estimated best Q value from the next state if there is one
     # Make future reward 0 if the current state is done
     q_targets_max = batch['rewards'].data + self.gamma * \
         torch.mul((1 - batch['dones'].data), q_next_st_max)
     logger.debug2(f'Q targets max: {q_targets_max.size()}')
     # We only want to train the network for the action selected
     # For all other actions we set the q_target = q_sts
     # So that the loss for these actions is 0
     q_targets = torch.mul(q_targets_max, batch['actions'].data) + \
         torch.mul(q_sts, (1 - batch['actions'].data))
     logger.debug2(f'Q targets: {q_targets.size()}')
     return q_targets
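This is the standard single-network DQN target: take the max over next-state Q-values, zero it out for terminal transitions, and add the reward. The same computation in a compact form with current PyTorch (net is a hypothetical nn.Module; rewards and dones are column tensors):

import torch

def dqn_targets(net, next_states, rewards, dones, gamma):
    with torch.no_grad():
        q_next_max = net(next_states).max(dim=1, keepdim=True)[0]
        return rewards + gamma * (1 - dones) * q_next_max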
Example #26
 def compute_q_target_values(self, batch):
     batches = batch['batches']
     # NOTE: q_sts, q_next_st_acts and q_next_sts are lists
     q_sts = self.net.wrap_eval(batch['states'])
     logger.debug3(f'Q sts: {q_sts}')
     q_next_st_acts = self.online_net.wrap_eval(batch['next_states'])
     logger.debug3(f'Q next st act vals: {q_next_st_acts}')
     q_next_acts = []
     for i, q in enumerate(q_next_st_acts):
         _val, q_next_act_b = torch.max(q, dim=1)
         logger.debug3(f'Q next action for body {i}: {q_next_act_b}')
         q_next_acts.append(q_next_act_b)
     # Select q_next_st_maxs based on action selected in q_next_acts
     q_next_sts = self.eval_net.wrap_eval(batch['next_states'])
     logger.debug3(f'Q next_states: {q_next_sts}')
     idx = torch.from_numpy(np.array(list(range(self.batch_size))))
     q_next_st_maxs = []
     for q_next_st_val_b, q_next_act_b in zip(q_next_sts, q_next_acts):
         q_next_st_max_b = q_next_st_val_b[idx, q_next_act_b]
         q_next_st_max_b.unsqueeze_(1)
         logger.debug2(f'Q next_states max {q_next_st_max_b.size()}')
         logger.debug3(f'Q next_states max {q_next_st_max_b}')
         q_next_st_maxs.append(q_next_st_max_b)
     # Compute q_targets per environment using reward and estimated best Q value from the next state if there is one
     # Make future reward 0 if the current state is done
     q_targets_maxs = []
     for b, batch_b in enumerate(batches):
         q_targets_max_b = batch_b['rewards'].data + self.gamma * \
             torch.mul((1 - batch_b['dones'].data), q_next_st_maxs[b])
         q_targets_maxs.append(q_targets_max_b)
         logger.debug2(
             f'Batch {b}, Q targets max: {q_targets_max_b.size()}')
     # As in the standard DQN we only want to train the network for the action selected
     # For all other actions we set the q_target = q_sts
     # So that the loss for these actions is 0
     q_targets = []
     for b, batch_b in enumerate(batches):
         q_targets_b = torch.mul(q_targets_maxs[b], batch_b['actions'].data) + \
             torch.mul(q_sts[b], (1 - batch_b['actions'].data))
         q_targets.append(q_targets_b)
         logger.debug2(f'Batch {b}, Q targets: {q_targets_b.size()}')
     return q_targets
Example #27
    def compute_q_target_values(self, batch):
        batches = batch['batches']
        q_sts = self.net.wrap_eval(batch['states'])
        logger.debug3(f'Q sts: {q_sts}')
        # TODO parametrize usage of eval or target_net
        q_next_st_acts = self.online_net.wrap_eval(batch['next_states'])
        logger.debug3(f'Q next st act vals: {q_next_st_acts}')
        start_idx = 0
        q_next_acts = []
        for body in self.agent.nanflat_body_a:
            end_idx = start_idx + body.action_dim
            _val, q_next_act_b = torch.max(q_next_st_acts[:,
                                                          start_idx:end_idx],
                                           dim=1)
            # Shift actions so that they have the right indices in the combined layer
            q_next_act_b += start_idx
            logger.debug2(
                f'Q next action for body {body.aeb}: {q_next_act_b.size()}')
            logger.debug3(f'Q next action for body {body.aeb}: {q_next_act_b}')
            q_next_acts.append(q_next_act_b)
            start_idx = end_idx

        # Select q_next_st_maxs based on action selected in q_next_acts
        q_next_sts = self.eval_net.wrap_eval(batch['next_states'])
        logger.debug2(f'Q next_states: {q_next_sts.size()}')
        logger.debug3(f'Q next_states: {q_next_sts}')
        idx = torch.from_numpy(np.array(list(range(self.batch_size))))
        q_next_st_maxs = []
        for q_next_act_b in q_next_acts:
            q_next_st_max_b = q_next_sts[idx, q_next_act_b]
            q_next_st_max_b.unsqueeze_(1)
            logger.debug2(f'Q next_states max {q_next_st_max_b.size()}')
            logger.debug3(f'Q next_states max {q_next_st_max_b}')
            q_next_st_maxs.append(q_next_st_max_b)

        # Compute final q_target using reward and estimated best Q value from the next state if there is one. Make future reward 0 if the current state is done. Do it individually first, then combine. Each individual target should automatically expand to the dimension of the relevant action space
        q_targets_maxs = []
        for b, batch_b in enumerate(batches):
            q_targets_max_b = (
                batch_b['rewards'].data + self.gamma * torch.mul(
                    (1 - batch_b['dones'].data), q_next_st_maxs[b])).numpy()
            q_targets_max_b = torch.from_numpy(
                np.broadcast_to(
                    q_targets_max_b,
                    (q_targets_max_b.shape[0], self.action_dims[b])))
            q_targets_maxs.append(q_targets_max_b)
            logger.debug2(f'Q targets max: {q_targets_max_b.size()}')
        q_targets_maxs = torch.cat(q_targets_maxs, dim=1)
        logger.debug2(f'Q targets maxes: {q_targets_maxs.size()}')
        logger.debug3(f'Q targets maxes: {q_targets_maxs}')
        # Also concat actions - each batch should have only two non zero dimensions
        actions = [batch_b['actions'] for batch_b in batches]
        combined_actions = torch.cat(actions, dim=1)
        logger.debug2(f'combined_actions: {combined_actions.size()}')
        logger.debug3(f'combined_actions: {combined_actions}')
        # We only want to train the network for the action selected
        # For all other actions we set the q_target = q_sts
        # So that the loss for these actions is 0
        q_targets = torch.mul(q_targets_maxs, combined_actions.data) + \
            torch.mul(q_sts, (1 - combined_actions.data))
        logger.debug2(f'Q targets: {q_targets.size()}')
        logger.debug3(f'Q targets: {q_targets}')
        return q_targets