Example 1
 def get_nstep_target_episodic(self, batch):
     '''Returns a list of tensors containing the estimate of the state-action values per batch using n-step returns'''
     nts = self.num_step_returns
     targets = []
     dones = batch['dones']
     next_states = batch['next_states']
     rewards = batch['rewards']
     for d, ns, r in zip(dones, next_states, rewards):
         next_state_vals = self.get_critic_output(ns).squeeze_(dim=1)
         r = r.data
         (R, next_state_gammas) = self.get_R_ex_state_val_estimate(
             next_state_vals, r)
         '''Complete for 0th step and add state-value estimate'''
         R = r + self.gamma * R
         next_state_gammas *= self.gamma
         logger.debug3(f'R: {R}')
         logger.debug3(f'next_state_gammas: {next_state_gammas}')
         logger.debug3(f'dones: {d}')
         '''Calculate appropriate state value accounting for terminal states and number of time steps'''
         discounted_state_val_estimate = torch.mul(next_state_vals,
                                                   next_state_gammas)
         discounted_state_val_estimate = torch.mul(
             discounted_state_val_estimate, 1 - d.data)
         if nts < next_state_vals.size(0):
          logger.debug2(
              f'N step returns less than episode length, adding bootstrap'
          )
             R += discounted_state_val_estimate
         logger.debug3(
             f'discounted_state_val_estimate: {discounted_state_val_estimate}'
         )
         logger.debug3(f'R: {R}')
         targets.append(R)
     return targets
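
For intuition, this is the standard n-step return computed in vectorized form by get_R_ex_state_val_estimate plus the 0th-step completion above. A minimal non-vectorized sketch for a single timestep follows; the function and argument names are illustrative, not from the source:

def nstep_return_sketch(rewards, next_state_val, gamma, n, done=False):
    # rewards: the up-to-n rewards observed from this timestep onward
    # R = r_0 + gamma*r_1 + ... + gamma^(n-1)*r_{n-1} + gamma^n * V(s_n)
    n = min(n, len(rewards))
    R = 0.0
    for r in reversed(rewards[:n]):
        R = r + gamma * R
    if not done:
        # bootstrap with the critic's value estimate only if the episode has not terminated
        R += (gamma ** n) * next_state_val
    return R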
Example 2
 def train(self):
     '''Completes one training step for the agent if it is time to train,
        i.e. if the environment timestep is greater than the minimum training
        timestep and is a multiple of the training_frequency.
        Each training step consists of sampling n batches from the agent's memory.
        For each batch, the target Q values (q_targets) are computed and
        a single training step is taken k times.
        Otherwise this function does nothing.
     '''
     t = util.s_get(self, 'aeb_space.clock').get('total_t')
     if (t > self.training_min_timestep
             and t % self.training_frequency == 0):
         logger.debug3(f'Training at t: {t}')
         nanflat_loss_a = np.zeros(self.agent.body_num)
         for _b in range(self.training_epoch):
             batch_losses = np.zeros(self.agent.body_num)
             batch = self.sample()
             for _i in range(self.training_iters_per_batch):
                 q_targets = self.compute_q_target_values(batch)
                 y = [Variable(q) for q in q_targets]
                 losses = self.net.training_step(batch['states'], y)
                 logger.debug(f'losses {losses}')
                 batch_losses += losses
             batch_losses /= self.training_iters_per_batch
             nanflat_loss_a += batch_losses
         nanflat_loss_a /= self.training_epoch
         loss_a = self.nanflat_to_data_a('loss', nanflat_loss_a)
         return loss_a
     else:
         logger.debug3('NOT training')
         return np.nan
Example 3
 def time_fn(*args, **kwargs):
     start = time.time()
     output = fn(*args, **kwargs)
     end = time.time()
     logger.debug3(
         f'Timed: {fn.__name__} {round((end - start) * 1000, 4)}ms')
     return output
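
Since time_fn closes over fn, it is presumably the inner wrapper of a timing decorator. A minimal sketch of how such a decorator could be assembled; the decorator name and the use of print in place of the project logger are assumptions made here:

import time
from functools import wraps

def timed(fn):  # hypothetical decorator name
    @wraps(fn)
    def time_fn(*args, **kwargs):
        start = time.time()
        output = fn(*args, **kwargs)
        end = time.time()
        print(f'Timed: {fn.__name__} {round((end - start) * 1000, 4)}ms')
        return output
    return time_fn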
Example 4
 def train(self):
     '''Completes one training step for the agent if it is time to train,
        i.e. if the environment timestep is greater than the minimum training
        timestep and is a multiple of the training_frequency.
        Each training step consists of sampling n batches from the agent's memory.
        For each batch, the target Q values (q_targets) are computed and
        a single training step is taken k times.
        Otherwise this function does nothing.
     '''
     t = util.s_get(self, 'aeb_space.clock').get('total_t')
     if (t > self.training_min_timestep
             and t % self.training_frequency == 0):
         logger.debug3(f'Training at t: {t}')
         total_loss = 0.0
         for _b in range(self.training_epoch):
             batch = self.sample()
             batch_loss = 0.0
             for _i in range(self.training_iters_per_batch):
                 q_targets = self.compute_q_target_values(batch)
                 y = Variable(q_targets)
                 loss = self.net.training_step(batch['states'], y)
                 batch_loss += loss.data[0]
             batch_loss /= self.training_iters_per_batch
             total_loss += batch_loss
         total_loss /= self.training_epoch
         logger.debug(f'total_loss {total_loss}')
         return total_loss
     else:
         logger.debug3('NOT training')
         return np.nan
Example 5
def multi_act_with_boltzmann(nanflat_body_a, state_a, net, nanflat_tau_a, gpu):
    nanflat_state_a = util.nanflatten(state_a)
    cat_state_a = np.concatenate(nanflat_state_a).astype(float)
    torch_state = torch.from_numpy(cat_state_a).float()
    if torch.cuda.is_available() and gpu:
        torch_state = torch_state.cuda()
    torch_state = Variable(torch_state)
    out = net.wrap_eval(torch_state)
    nanflat_action_a = []
    start_idx = 0
    logger.debug2(f'taus: {nanflat_tau_a}')
    for body, tau in zip(nanflat_body_a, nanflat_tau_a):
        end_idx = start_idx + body.action_dim
        out_with_temp = torch.div(out[start_idx:end_idx], tau)
        logger.debug3(f'''
        tau: {tau}, out: {out},
        out select: {out[start_idx: end_idx]},
        out with temp: {out_with_temp}''')
        probs = F.softmax(Variable(out_with_temp.cpu()), dim=0).data.numpy()
        action = np.random.choice(list(range(body.action_dim)), p=probs)
        logger.debug3(f'''
        body: {body.aeb}, net idx: {start_idx}-{end_idx}
        probs: {probs}, action: {action}''')
        nanflat_action_a.append(action)
        start_idx = end_idx
    return nanflat_action_a
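
The per-body selection above is plain Boltzmann (softmax-with-temperature) sampling over Q values. A self-contained NumPy sketch of that rule for a single action head; the standalone formulation and the function name are illustrative, not from the source:

import numpy as np

def boltzmann_sample(q_values, tau):
    # divide by the temperature, then softmax; higher tau gives more uniform exploration
    z = np.asarray(q_values, dtype=np.float64) / tau
    z -= z.max()  # subtract the max for numerical stability
    e = np.exp(z)
    probs = e / e.sum()
    return np.random.choice(len(probs), p=probs)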
Example 6
def multi_head_act_with_boltzmann(nanflat_body_a, state_a, net, nanflat_tau_a,
                                  gpu):
    nanflat_state_a = util.nanflatten(state_a)
    torch_states = []
    for state in nanflat_state_a:
        state = state.astype('float')
        torch_states.append(torch.from_numpy(state).float().unsqueeze_(dim=0))
    if torch.cuda.is_available() and gpu:
        torch_states = [torch_state.cuda() for torch_state in torch_states]
    # rebind the list entries; reassigning the loop variable alone would leave the originals untouched
    torch_states = [Variable(torch_state) for torch_state in torch_states]
    outs = net.wrap_eval(torch_states)
    out_with_temp = [torch.div(x, t) for x, t in zip(outs, nanflat_tau_a)]
    logger.debug2(
        f'taus: {nanflat_tau_a}, outs: {outs}, out_with_temp: {out_with_temp}')
    nanflat_action_a = []
    for body, output in zip(nanflat_body_a, out_with_temp):
        probs = F.softmax(Variable(output.cpu()), dim=1).data.numpy()[0]
        action = np.random.choice(list(range(body.action_dim)), p=probs)
        logger.debug3(f'''
        body: {body.aeb}, output: {output},
        probs: {probs}, action: {action}''')
        nanflat_action_a.append(action)
    return nanflat_action_a
Example 7
 def reset(self, state_space):
     logger.debug3('AgentSpace.reset')
     _action_v, _loss_v, _explore_var_v = self.aeb_space.init_data_v(AGENT_DATA_NAMES)
     for agent in self.agents:
         state_a = state_space.get(a=agent.a)
         agent.reset(state_a)
     _action_space, _loss_space, _explore_var_space = self.aeb_space.add(AGENT_DATA_NAMES, [_action_v, _loss_v, _explore_var_v])
     logger.debug3(f'action_space: {_action_space}')
     return _action_space
Example 8
def update_linear_decay(cls, space_clock):
    epi = space_clock.get('epi')
    rise = cls.explore_var_end - cls.explore_var_start
    slope = rise / float(cls.explore_anneal_epi)
    explore_var = max(slope * (epi - 1) + cls.explore_var_start,
                      cls.explore_var_end)
    cls.nanflat_explore_var_a = [explore_var] * cls.agent.body_num
    logger.debug3(f'nanflat_explore_var_a: {cls.nanflat_explore_var_a[0]}')
    return cls.nanflat_explore_var_a
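
As a worked example of the schedule above, with illustrative numbers that are not from the source:

def linear_decay(epi, start=1.0, end=0.1, anneal_epi=10):
    # slope = (end - start) / anneal_epi = -0.09 with these illustrative values
    slope = (end - start) / float(anneal_epi)
    return max(slope * (epi - 1) + start, end)

# linear_decay(1) -> 1.0, linear_decay(5) -> 0.64, linear_decay(11) -> 0.1 and clamped thereafter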
Example 9
 def reset(self):
     logger.debug3('EnvSpace.reset')
     _reward_v, state_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES)
     for env in self.envs:
         _reward_e, state_e, done_e = env.space_reset()
         state_v[env.e, 0:len(state_e)] = state_e
         done_v[env.e, 0:len(done_e)] = done_e
     _reward_space, state_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, (_reward_v, state_v, done_v))
     logger.debug3(f'\nstate_space: {state_space}')
     return _reward_space, state_space, done_space
Example 10
 def get_gae_critic_target(self, rewards):
     '''Target is the discounted sum of returns for training the critic'''
     target = []
     big_r = 0
     for i in range(rewards.size(0) - 1, -1, -1):
         big_r = rewards[i] + self.gamma * big_r
         target.insert(0, big_r)
     target = torch.Tensor(target)
     logger.debug3(f'Target: {target}')
     return target
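
As a quick check of the backward accumulation above, with illustrative numbers: rewards [1, 1, 1] and gamma = 0.9 give targets [1 + 0.9*(1 + 0.9*1), 1 + 0.9*1, 1] = [2.71, 1.9, 1.0].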
Example 11
 def act(self, state_space):
     data_names = ('action', )
     action_v, = self.aeb_space.init_data_v(data_names)
     for agent in self.agents:
         a = agent.a
         state_a = state_space.get(a=a)
         action_a = agent.space_act(state_a)
         action_v[a, 0:len(action_a)] = action_a
     action_space, = self.aeb_space.add(data_names, (action_v, ))
     logger.debug3(f'\naction_space: {action_space}')
     return action_space
Example 12
 def step(self, action_space):
     reward_v, state_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES)
     for env in self.envs:
         e = env.e
         action_e = action_space.get(e=e)
         reward_e, state_e, done_e = env.space_step(action_e)
         reward_v[e, 0:len(reward_e)] = reward_e
         state_v[e, 0:len(state_e)] = state_e
         done_v[e, 0:len(done_e)] = done_e
     reward_space, state_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, (reward_v, state_v, done_v))
     logger.debug3(f'\nreward_space: {reward_space}\nstate_space: {state_space}\ndone_space: {done_space}')
     return reward_space, state_space, done_space
Example 13
def update_multi_linear_decay(cls, _space_clock):
    nanflat_explore_var_a = []
    for body in cls.agent.nanflat_body_a:
        # use body-clock instead of space clock
        epi = body.env.clock.get('epi')
        rise = cls.explore_var_end - cls.explore_var_start
        slope = rise / float(cls.explore_anneal_epi)
        explore_var = max(slope * (epi - 1) + cls.explore_var_start,
                          cls.explore_var_end)
        nanflat_explore_var_a.append(explore_var)
    cls.nanflat_explore_var_a = nanflat_explore_var_a
    logger.debug3(f'nanflat_explore_var_a: {cls.nanflat_explore_var_a}')
    return cls.nanflat_explore_var_a
Example 14
 def train_separate(self):
     '''Trains the network when the actor and critic are separate networks'''
     if self.to_train == 1:
         batch = self.sample()
         logger.debug3(f'Batch states: {batch["states"]}')
         critic_loss = self.train_critic(batch)
         actor_loss = self.train_actor(batch)
         total_loss = critic_loss + abs(actor_loss)
         logger.debug(
             "Losses: Critic: {:.2f}, Actor: {:.2f}, Total: {:.2f}".format(
                 critic_loss, abs(actor_loss), total_loss))
         return total_loss
     else:
         return np.nan
Example 15
 def update(self, action_space, reward_space, state_space, done_space):
     data_names = ['loss', 'explore_var']
     loss_v, explore_var_v = self.aeb_space.init_data_v(data_names)
     for agent in self.agents:
         a = agent.a
         action_a = action_space.get(a=a)
         reward_a = reward_space.get(a=a)
         state_a = state_space.get(a=a)
         done_a = done_space.get(a=a)
         loss_a, explore_var_a = agent.update(action_a, reward_a, state_a, done_a)
         loss_v[a, 0:len(loss_a)] = loss_a
         explore_var_v[a, 0:len(explore_var_a)] = explore_var_a
     loss_space, explore_var_space = self.aeb_space.add(data_names, [loss_v, explore_var_v])
     logger.debug3(f'\nloss_space: {loss_space}\nexplore_var_space: {explore_var_space}')
     return loss_space, explore_var_space
Example 16
 def get_policy_loss(self, batch):
     '''Returns the policy loss for a batch of data.
     For REINFORCE, just the rewards are passed in as the batch.'''
     advantage = self.calc_advantage(batch)
     advantage = self.check_sizes(advantage)
     policy_loss = []
     for log_prob, a, e in zip(self.saved_log_probs, advantage,
                               self.entropy):
         logger.debug3(
             f'log prob: {log_prob.data[0]}, advantage: {a}, entropy: {e.data[0]}'
         )
         if self.add_entropy:
             policy_loss.append(-log_prob * a - self.entropy_weight * e)
         else:
             policy_loss.append(-log_prob * a)
     policy_loss = torch.cat(policy_loss).sum()
     return policy_loss
Example 17
 def get_nstep_target_batch(self, batch):
     '''Returns a tensor containing the estimate of the state-action values using n-step returns'''
     nts = self.num_step_returns
     next_state_vals = self.get_critic_output(batch['next_states']).squeeze_(dim=1)
     rewards = batch['rewards'].data
     (R, next_state_gammas) = self.get_R_ex_state_val_estimate(next_state_vals, rewards)
     '''Complete for 0th step and add state-value estimate'''
     R = rewards + self.gamma * R
     next_state_gammas *= self.gamma
     logger.debug3(f'R: {R}')
     logger.debug3(f'next_state_gammas: {next_state_gammas}')
     logger.debug3(f'dones: {batch["dones"]}')
     '''Calculate appropriate state value accounting for terminal states and number of time steps'''
     discounted_state_val_estimate = torch.mul(next_state_vals, next_state_gammas)
     discounted_state_val_estimate = torch.mul(discounted_state_val_estimate, 1 - batch['dones'].data)
     R += discounted_state_val_estimate
     logger.debug3(f'discounted_state_val_estimate: {discounted_state_val_estimate}')
     logger.debug3(f'R: {R}')
     return R
Example 18
 def check_sizes(self, advantage):
     '''Checks that log probs, advantage, and entropy all have the same size.
        Occasionally they do not; this is caused by the first reward of an episode being nan. If they are not the same size, this function removes the elements of the log probs and entropy that correspond to nan rewards.'''
     body = self.agent.nanflat_body_a[0]
     nan_idxs = body.memory.last_nan_idxs
     num_nans = sum(nan_idxs)
     assert len(nan_idxs) == len(self.saved_log_probs)
     assert len(nan_idxs) == len(self.entropy)
     assert len(nan_idxs) - num_nans == advantage.size(0)
     logger.debug2(f'{num_nans} nans encountered when gathering data')
     if num_nans != 0:
         idxs = [x for x in range(len(nan_idxs)) if nan_idxs[x] == 1]
         logger.debug3(f'Nan indexes: {idxs}')
         for idx in idxs[::-1]:
             del self.saved_log_probs[idx]
             del self.entropy[idx]
     assert len(self.saved_log_probs) == advantage.size(0)
     assert len(self.entropy) == advantage.size(0)
     return advantage
Example 19
 def compute_q_target_values(self, batch):
     batches = batch['batches']
     # NOTE: q_sts, q_next_st_acts and q_next_sts are lists
     q_sts = self.net.wrap_eval(batch['states'])
     logger.debug3(f'Q sts: {q_sts}')
     q_next_st_acts = self.online_net.wrap_eval(batch['next_states'])
     logger.debug3(f'Q next st act vals: {q_next_st_acts}')
     q_next_acts = []
     for i, q in enumerate(q_next_st_acts):
         _val, q_next_act_b = torch.max(q, dim=1)
         logger.debug3(f'Q next action for body {i}: {q_next_act_b}')
         q_next_acts.append(q_next_act_b)
     # Select q_next_st_maxs based on action selected in q_next_acts
     q_next_sts = self.eval_net.wrap_eval(batch['next_states'])
     logger.debug3(f'Q next_states: {q_next_sts}')
     idx = torch.from_numpy(np.array(list(range(self.batch_size))))
     q_next_st_maxs = []
     for q_next_st_val_b, q_next_act_b in zip(q_next_sts, q_next_acts):
         q_next_st_max_b = q_next_st_val_b[idx, q_next_act_b]
         q_next_st_max_b.unsqueeze_(1)
         logger.debug2(f'Q next_states max {q_next_st_max_b.size()}')
         logger.debug3(f'Q next_states max {q_next_st_max_b}')
         q_next_st_maxs.append(q_next_st_max_b)
     # Compute q_targets per environment using reward and estimated best Q value from the next state if there is one
     # Make future reward 0 if the current state is done
     q_targets_maxs = []
     for b, batch_b in enumerate(batches):
         q_targets_max_b = batch_b['rewards'].data + self.gamma * \
             torch.mul((1 - batch_b['dones'].data), q_next_st_maxs[b])
         q_targets_maxs.append(q_targets_max_b)
         logger.debug2(
             f'Batch {b}, Q targets max: {q_targets_max_b.size()}')
     # As in the standard DQN we only want to train the network for the action selected
     # For all other actions we set the q_target = q_sts
     # So that the loss for these actions is 0
     q_targets = []
     for b, batch_b in enumerate(batches):
         q_targets_b = torch.mul(q_targets_maxs[b], batch_b['actions'].data) + \
             torch.mul(q_sts[b], (1 - batch_b['actions'].data))
         q_targets.append(q_targets_b)
         logger.debug2(f'Batch {b}, Q targets: {q_targets_b.size()}')
     return q_targets
Example 20
 def get_R_ex_state_val_estimate(self, next_state_vals, rewards):
     nts = self.num_step_returns
     R = torch.zeros_like(next_state_vals)
     curr_reward_step = torch.zeros_like(next_state_vals)
     next_state_gammas = torch.zeros_like(next_state_vals)
     if nts >= next_state_vals.size(0):
         logger.debug2(
             f'Num step returns {self.num_step_returns} greater than length batch {next_state_vals.size(0)}. Updating to batch length'
         )
         nts = next_state_vals.size(0) - 1
     if nts == 0:
         next_state_gammas.fill_(1.0)
     else:
         j = -nts
         next_state_gammas[:j] = 1.0
     for i in range(nts, 0, -1):
         logger.debug(f'i: {i}, j: {j}')
         curr_reward_step[:j] = rewards[i:]
         next_state_gammas[:j] *= self.gamma
         R = curr_reward_step + self.gamma * R
         next_state_gammas[j] = 1.0
         j += 1
         logger.debug3(f'curr_reward_step: {curr_reward_step}')
         logger.debug3(f'next_state_gammas: {next_state_gammas}')
         logger.debug3(f'R: {R}')
     return (R, next_state_gammas)
Example 21
 def train(self):
     '''Completes one training step for the agent if it is time to train.
        Otherwise this function does nothing.
     '''
     t = util.s_get(self, 'aeb_space.clock').get('total_t')
     if self.to_train == 1:
         logger.debug3(f'Training at t: {t}')
         batch = self.sample()
         if batch['states'].size(0) < 2:
             logger.info(f'Batch too small to train with, skipping...')
             self.to_train = 0
             return np.nan
         q_targets = self.compute_q_target_values(batch)
         if torch.cuda.is_available() and self.gpu:
             q_targets = q_targets.cuda()
         y = Variable(q_targets)
         loss = self.net.training_step(batch['states'], y)
         logger.debug(f'loss {loss.data[0]}')
         self.to_train = 0
         return loss.data[0]
     else:
         logger.debug3('NOT training')
         return np.nan
Example 22
 def train(self):
     if self.to_train == 1:
         logger.debug2(f'Training...')
         # We only care about the rewards from the batch
         rewards = self.sample()['rewards']
         logger.debug3(f'Length first epi: {len(rewards[0])}')
         logger.debug3(f'Len log probs: {len(self.saved_log_probs)}')
         self.net.optim.zero_grad()
         policy_loss = self.get_policy_loss(rewards)
         loss = policy_loss.data[0]
         policy_loss.backward()
         if self.net.clamp_grad:
             logger.debug("Clipping gradient...")
             torch.nn.utils.clip_grad_norm(self.net.parameters(),
                                           self.net.clamp_grad_val)
         logger.debug2(f'Gradient norms: {self.net.get_grad_norms()}')
         self.net.optim.step()
         self.to_train = 0
         self.saved_log_probs = []
         self.entropy = []
         logger.debug(f'Policy loss: {loss}')
         return loss
     else:
         return np.nan
Example 23
 def calc_advantage(self, raw_rewards):
     '''Returns the advantage for each action'''
     advantage = []
     logger.debug3(f'Raw rewards: {raw_rewards}')
     for epi_rewards in raw_rewards:
         rewards = []
         big_r = 0
         for r in epi_rewards[::-1]:
             big_r = r + self.gamma * big_r
             rewards.insert(0, big_r)
         rewards = torch.Tensor(rewards)
         logger.debug3(f'Rewards: {rewards}')
         rewards = (rewards - rewards.mean()) / (rewards.std() +
                                                 np.finfo(np.float32).eps)
         logger.debug3(f'Normalized rewards: {rewards}')
         advantage.append(rewards)
     advantage = torch.cat(advantage)
     return advantage
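
The same per-episode computation in standalone form, using NumPy for brevity instead of torch; the function name and the NumPy formulation are assumptions made for illustration:

import numpy as np

def normalized_returns(epi_rewards, gamma):
    # discounted returns accumulated backward, then standardized per episode
    returns, R = [], 0.0
    for r in reversed(epi_rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = np.asarray(returns, dtype=np.float32)
    # np.std is the biased estimator while torch uses the unbiased one; the difference does not matter for illustration
    return (returns - returns.mean()) / (returns.std() + np.finfo(np.float32).eps)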
Example 24
def create_torch_state(state, state_buffer, recurrent=False, length=0):
    if recurrent:
        '''Create sequence of inputs for recurrent net'''
        logger.debug3(f'length of state buffer: {length}')
        if len(state_buffer) < length:
            PAD = np.zeros_like(state)
            while len(state_buffer) < length:
                state_buffer.insert(0, PAD)
        state_buffer = np.asarray(state_buffer)
        '''Hack to fix buffer not storing the very first state in an epi'''
        if np.sum(state_buffer) == 0:
            state_buffer[-1] = state
        torch_state = Variable(torch.from_numpy(state_buffer).float())
        torch_state.unsqueeze_(dim=0)
    else:
        torch_state = Variable(torch.from_numpy(state).float())
    logger.debug2(f'State size: {torch_state.size()}')
    logger.debug3(f'Original state: {state}')
    logger.debug3(f'State: {torch_state}')
    return torch_state
Example 25
 def forward(self, x):
     '''The feedforward step.
     Input is batch_size x sequence_length x state_dim'''
     '''Unstack input to (batch_size x sequence_length) x state_dim in order to transform all state inputs'''
     batch_size = x.size(0)
     x = x.view(-1, self.in_dim)
     x = self.state_proc_model(x)
     '''Restack to batch_size x sequence_length x rnn_input_dim'''
     x = x.view(-1, self.sequence_length, self.rnn_input_dim)
     hid_0 = self.init_hidden(batch_size)
     _, final_hid = self.rnn(x, hid_0)
     final_hid.squeeze_(dim=0)
     '''If only one head, return tensor, otherwise return list of outputs'''
     outs = []
     for layer in self.out_layers:
         out = layer(final_hid)
         outs.append(out)
     logger.debug3(f'Network input: {x.size()}')
     logger.debug3(f'Network input: {x.data}')
     logger.debug3(f'Network output: {outs}')
     if len(outs) == 1:
         return outs[0]
     else:
         return outs
Example 26
    def compute_q_target_values(self, batch):
        batches = batch['batches']
        q_sts = self.net.wrap_eval(batch['states'])
        logger.debug3(f'Q sts: {q_sts}')
        # TODO parametrize usage of eval or target_net
        q_next_st_acts = self.online_net.wrap_eval(batch['next_states'])
        logger.debug3(f'Q next st act vals: {q_next_st_acts}')
        start_idx = 0
        q_next_acts = []
        for body in self.agent.nanflat_body_a:
            end_idx = start_idx + body.action_dim
            _val, q_next_act_b = torch.max(q_next_st_acts[:,
                                                          start_idx:end_idx],
                                           dim=1)
            # Shift actions so that they have the right indices in the combined layer
            q_next_act_b += start_idx
            logger.debug2(
                f'Q next action for body {body.aeb}: {q_next_act_b.size()}')
            logger.debug3(f'Q next action for body {body.aeb}: {q_next_act_b}')
            q_next_acts.append(q_next_act_b)
            start_idx = end_idx

        # Select q_next_st_maxs based on action selected in q_next_acts
        q_next_sts = self.eval_net.wrap_eval(batch['next_states'])
        logger.debug2(f'Q next_states: {q_next_sts.size()}')
        logger.debug3(f'Q next_states: {q_next_sts}')
        idx = torch.from_numpy(np.array(list(range(self.batch_size))))
        q_next_st_maxs = []
        for q_next_act_b in q_next_acts:
            q_next_st_max_b = q_next_sts[idx, q_next_act_b]
            q_next_st_max_b.unsqueeze_(1)
            logger.debug2(f'Q next_states max {q_next_st_max_b.size()}')
            logger.debug3(f'Q next_states max {q_next_st_max_b}')
            q_next_st_maxs.append(q_next_st_max_b)

        # Compute final q_target using reward and estimated best Q value from the next state if there is one. Make future reward 0 if the current state is done. Do it individually first, then combine. Each individual target should automatically expand to the dimension of the relevant action space
        q_targets_maxs = []
        for b, batch_b in enumerate(batches):
            q_targets_max_b = (
                batch_b['rewards'].data + self.gamma * torch.mul(
                    (1 - batch_b['dones'].data), q_next_st_maxs[b])).numpy()
            q_targets_max_b = torch.from_numpy(
                np.broadcast_to(
                    q_targets_max_b,
                    (q_targets_max_b.shape[0], self.action_dims[b])))
            q_targets_maxs.append(q_targets_max_b)
            logger.debug2(f'Q targets max: {q_targets_max_b.size()}')
        q_targets_maxs = torch.cat(q_targets_maxs, dim=1)
        logger.debug2(f'Q targets maxes: {q_targets_maxs.size()}')
        logger.debug3(f'Q targets maxes: {q_targets_maxs}')
        # Also concat actions - each batch should have only two non zero dimensions
        actions = [batch_b['actions'] for batch_b in batches]
        combined_actions = torch.cat(actions, dim=1)
        logger.debug2(f'combined_actions: {combined_actions.size()}')
        logger.debug3(f'combined_actions: {combined_actions}')
        # We only want to train the network for the action selected
        # For all other actions we set the q_target = q_sts
        # So that the loss for these actions is 0
        q_targets = torch.mul(q_targets_maxs, combined_actions.data) + \
            torch.mul(q_sts, (1 - combined_actions.data))
        logger.debug2(f'Q targets: {q_targets.size()}')
        logger.debug3(f'Q targets: {q_targets}')
        return q_targets
Example 27
 def get_gae_actor_target(self, rewards, states, next_states, dones):
     '''Target is the Generalized advantage estimate + current state-value estimate'''
     '''First calculate the 1 step bootstrapped estimate of the advantage. Also described as the TD residual of V with discount self.gamma (Sutton & Barto, 1998)'''
     next_state_vals = self.get_critic_output(next_states).squeeze_(dim=1)
     next_state_vals = torch.mul(next_state_vals, 1 - dones.data)
     state_vals = self.get_critic_output(states).squeeze_(dim=1)
     deltas = rewards + self.gamma * next_state_vals - state_vals
     logger.debug3(f'State_vals: {state_vals}')
     logger.debug3(f'Next state_vals: {next_state_vals}')
     logger.debug3(f'Dones: {dones}')
     logger.debug3(f'Deltas: {deltas}')
     logger.debug3(f'Lambda: {self.lamda}, gamma: {self.gamma}')
     '''Then calculate GAE, the exponentially weighted average of the TD residuals'''
     advantage = []
     gae = 0
     for i in range(deltas.size(0) - 1, -1, -1):
         gae = deltas[i] + self.gamma * self.lamda * gae
         advantage.insert(0, gae)
     advantage = torch.Tensor(advantage)
     if torch.cuda.is_available() and self.gpu:
         advantage = advantage.cuda()
     '''Add state_vals so that calc_advantage() api is preserved'''
     target = advantage + state_vals
     logger.debug3(f'Advantage: {advantage}')
     logger.debug3(f'Target: {target}')
     return target
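
The loop above is the standard GAE recursion, A_t = delta_t + gamma * lambda * A_{t+1}, run backward over the TD residuals. A minimal standalone sketch over plain Python lists; the names are illustrative:

def gae_from_deltas(deltas, gamma, lam):
    # backward pass: A_t = delta_t + gamma * lam * A_{t+1}
    advantages, running = [], 0.0
    for delta in reversed(deltas):
        running = delta + gamma * lam * running
        advantages.insert(0, running)
    return advantages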
Example 28
 def compute_q_target_values(self, batch):
     '''Computes the target Q values for a batch of experiences'''
     # Calculate the Q values of the current and next states
     q_sts = self.net.wrap_eval(batch['states'])
     q_next_st = self.net.wrap_eval(batch['next_states'])
     q_next_actions = batch['next_actions']
     logger.debug2(f'Q next states: {q_next_st.size()}')
     # Get the q value for the next action that was actually taken
     idx = torch.from_numpy(np.array(list(range(q_next_st.size(0)))))
     if torch.cuda.is_available() and self.gpu:
         idx = idx.cuda()
     q_next_st_vals = q_next_st[idx, q_next_actions.squeeze_(1).data.long()]
     # Expand the dims so that q_next_st_vals can be broadcast
     q_next_st_vals.unsqueeze_(1)
     logger.debug2(f'Q next_states vals {q_next_st_vals.size()}')
     logger.debug3(f'Q next_states {q_next_st}')
     logger.debug3(f'Q next actions {q_next_actions}')
     logger.debug3(f'Q next_states vals {q_next_st_vals}')
     logger.debug3(f'Dones {batch["dones"]}')
     # Compute q_targets using reward and Q value corresponding to the action taken in the next state if there is one. Make next state Q value 0 if the current state is done
     q_targets_actual = batch['rewards'].data + self.gamma * \
         torch.mul((1 - batch['dones'].data), q_next_st_vals)
     logger.debug2(f'Q targets actual: {q_targets_actual.size()}')
     logger.debug3(f'Q states {q_sts}')
     logger.debug3(f'Q targets actual: {q_targets_actual}')
     # We only want to train the network for the action selected in the current state
     # For all other actions we set the q_target = q_sts so that the loss for these actions is 0
     q_targets = torch.mul(q_targets_actual, batch['actions_onehot'].data) + \
         torch.mul(q_sts, (1 - batch['actions_onehot'].data))
     logger.debug2(f'Q targets: {q_targets.size()}')
     logger.debug3(f'Q targets: {q_targets}')
     return q_targets
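
In scalar form the target computed above is q_target = reward + gamma * (1 - done) * Q(next_state, next_action), where next_action is the action actually taken in the next state (a SARSA-style target rather than a max over next actions); for every non-selected action the target is set to the network's own current estimate, so those actions contribute zero loss.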
Example 29
 def check_api(*args, **kwargs):
     output = fn(*args, **kwargs)
     logger.debug3(f'API method: {fn.__name__}, output: {output}')
     return output
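
As with time_fn in Example 3, check_api closes over fn, so it is presumably the inner wrapper of a decorator that logs the output of every call to the decorated API method.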