def get_nstep_target_episodic(self, batch):
    '''
    Build the n-step return targets, one tensor per entry of the batch.

    batch['dones'], batch['next_states'] and batch['rewards'] are iterated in lockstep
    (presumably one entry per episode - confirm against the memory that produces the batch).
    Returns the list of target tensors in the same order.
    '''
    nts = self.num_step_returns
    targets = []
    episodes = zip(batch['dones'], batch['next_states'], batch['rewards'])
    for d, ns, r in episodes:
        next_state_vals = self.get_critic_output(ns).squeeze_(dim=1)
        r = r.data
        R, next_state_gammas = self.get_R_ex_state_val_estimate(next_state_vals, r)
        # The helper covers steps 1..n; fold in the 0th step reward here and
        # discount the bootstrap coefficients once more to match.
        R = r + self.gamma * R
        next_state_gammas *= self.gamma
        logger.debug3(f'R: {R}')
        logger.debug3(f'next_state_gammas: {next_state_gammas}')
        logger.debug3(f'dones: {d}')
        # Bootstrap term: discounted critic value, zeroed wherever done == 1.
        not_done = 1 - d.data
        discounted_state_val_estimate = torch.mul(next_state_vals, next_state_gammas)
        discounted_state_val_estimate = torch.mul(discounted_state_val_estimate, not_done)
        # Only bootstrap when the sequence is longer than the n-step horizon.
        if nts < next_state_vals.size(0):
            logger.debug2(f'N step returns less than episode length, adding boostrap')
            R += discounted_state_val_estimate
        logger.debug3(f'discounted_state_val_estimate: {discounted_state_val_estimate}')
        logger.debug3(f'R: {R}')
        targets.append(R)
    return targets
def train(self):
    '''
    Run one training step if it is time to train, otherwise do nothing.

    Training happens when total_t is greater than training_min_timestep and is a
    multiple of training_frequency. training_epoch batches are sampled; for each
    batch the Q targets are recomputed and a training step is taken
    training_iters_per_batch times.
    Returns the per-body losses averaged over iterations and epochs, converted with
    nanflat_to_data_a, or np.nan when no training took place.
    '''
    t = util.s_get(self, 'aeb_space.clock').get('total_t')
    is_train_step = t > self.training_min_timestep and t % self.training_frequency == 0
    if not is_train_step:
        logger.debug3('NOT training')
        return np.nan

    logger.debug3(f'Training at t: {t}')
    nanflat_loss_a = np.zeros(self.agent.body_num)
    for _epoch in range(self.training_epoch):
        batch_losses = np.zeros(self.agent.body_num)
        batch = self.sample()
        for _iter in range(self.training_iters_per_batch):
            # compute_q_target_values yields a list of targets; wrap each one
            y = [Variable(q) for q in self.compute_q_target_values(batch)]
            losses = self.net.training_step(batch['states'], y)
            logger.debug(f'losses {losses}')
            batch_losses += losses
        # mean over the iterations taken on this batch
        batch_losses /= self.training_iters_per_batch
        nanflat_loss_a += batch_losses
    # mean over the sampled batches
    nanflat_loss_a /= self.training_epoch
    return self.nanflat_to_data_a('loss', nanflat_loss_a)
def time_fn(*args, **kwargs):
    '''
    Call `fn` with the given arguments, log how long the call took in milliseconds,
    and return its output unchanged.

    `fn` is not defined in this block; it is presumably the function captured by an
    enclosing decorator - confirm at the definition site.
    '''
    # perf_counter is monotonic and high resolution. time.time() follows the wall
    # clock, which can be adjusted mid-call and yield wrong or negative durations.
    start = time.perf_counter()
    output = fn(*args, **kwargs)
    end = time.perf_counter()
    logger.debug3(f'Timed: {fn.__name__} {round((end - start) * 1000, 4)}ms')
    return output
def train(self):
    '''
    Run one training step if it is time to train, otherwise do nothing.

    Training happens when total_t is greater than training_min_timestep and is a
    multiple of training_frequency. training_epoch batches are sampled; for each
    batch the Q targets are recomputed and a training step is taken
    training_iters_per_batch times.
    Returns the loss averaged over iterations and epochs, or np.nan when no
    training took place.
    '''
    t = util.s_get(self, 'aeb_space.clock').get('total_t')
    is_train_step = t > self.training_min_timestep and t % self.training_frequency == 0
    if not is_train_step:
        logger.debug3('NOT training')
        return np.nan

    logger.debug3(f'Training at t: {t}')
    total_loss = 0.0
    for _epoch in range(self.training_epoch):
        batch = self.sample()
        batch_loss = 0.0
        for _iter in range(self.training_iters_per_batch):
            y = Variable(self.compute_q_target_values(batch))
            loss = self.net.training_step(batch['states'], y)
            # pull the scalar out of the loss tensor before accumulating
            batch_loss += loss.data[0]
        batch_loss /= self.training_iters_per_batch
        total_loss += batch_loss
    total_loss /= self.training_epoch
    logger.debug(f'total_loss {total_loss}')
    return total_loss
def multi_act_with_boltzmann(nanflat_body_a, state_a, net, nanflat_tau_a, gpu):
    '''
    Sample one action per body from a softmax-with-temperature distribution.

    The flattened states are concatenated into a single input and `net` is evaluated
    once. Its output is cut into consecutive segments of body.action_dim entries,
    one per body; each segment is divided by that body's tau before the softmax.
    Returns the list of sampled action indices, in body order.
    '''
    cat_state_a = np.concatenate(util.nanflatten(state_a)).astype(float)
    torch_state = torch.from_numpy(cat_state_a).float()
    if torch.cuda.is_available() and gpu:
        torch_state = torch_state.cuda()
    out = net.wrap_eval(Variable(torch_state))
    logger.debug2(f'taus: {nanflat_tau_a}')

    nanflat_action_a = []
    start_idx = 0
    for body, tau in zip(nanflat_body_a, nanflat_tau_a):
        # this body owns out[start_idx:end_idx]
        end_idx = start_idx + body.action_dim
        out_with_temp = torch.div(out[start_idx:end_idx], tau)
        logger.debug3(f''' tau: {tau}, out: {out}, out select: {out[start_idx: end_idx]}, out with temp: {out_with_temp}''')
        # move to CPU so the probabilities can be handed to numpy
        softmaxed = F.softmax(Variable(out_with_temp.cpu()), dim=0)
        probs = softmaxed.data.numpy()
        candidates = list(range(body.action_dim))
        action = np.random.choice(candidates, p=probs)
        logger.debug3(f''' body: {body.aeb}, net idx: {start_idx}-{end_idx} probs: {probs}, action: {action}''')
        nanflat_action_a.append(action)
        start_idx = end_idx
    return nanflat_action_a
def multi_head_act_with_boltzmann(nanflat_body_a, state_a, net, nanflat_tau_a, gpu):
    '''
    Sample one action per body from a multi-headed net using a softmax with temperature.

    Each flattened state becomes its own input tensor with a leading dimension of 1.
    `net` is evaluated once on the list of inputs and is expected to yield one
    output per body; each output is divided by that body's tau before the softmax.
    Returns the list of sampled action indices, in body order.
    '''
    nanflat_state_a = util.nanflatten(state_a)
    use_cuda = torch.cuda.is_available() and gpu
    torch_states = []
    for state in nanflat_state_a:
        torch_state = torch.from_numpy(state.astype('float')).float().unsqueeze_(dim=0)
        # .cuda() and Variable() return new objects, so the result must be what
        # goes into the list. Rebinding a loop variable over the finished list
        # (as was done before) left the list holding plain CPU tensors.
        if use_cuda:
            torch_state = torch_state.cuda()
        torch_states.append(Variable(torch_state))
    # NOTE(review): assumes wrap_eval returns plain tensors for Variable inputs, as
    # the single-output sibling multi_act_with_boltzmann relies on - confirm.
    outs = net.wrap_eval(torch_states)
    out_with_temp = [torch.div(x, t) for x, t in zip(outs, nanflat_tau_a)]
    logger.debug2(f'taus: {nanflat_tau_a}, outs: {outs}, out_with_temp: {out_with_temp}')
    nanflat_action_a = []
    for body, output in zip(nanflat_body_a, out_with_temp):
        # softmax over the action dimension; [0] drops the leading dimension of 1
        probs = F.softmax(Variable(output.cpu()), dim=1).data.numpy()[0]
        action = np.random.choice(list(range(body.action_dim)), p=probs)
        logger.debug3(f''' body: {body.aeb}, output: {output}, probs: {probs}, action: {action}''')
        nanflat_action_a.append(action)
    return nanflat_action_a
def reset(self, state_space):
    '''
    Reset every agent with its slice of state_space and return the action space
    obtained by registering fresh agent data volumes with aeb_space.
    '''
    logger.debug3('AgentSpace.reset')
    _action_v, _loss_v, _explore_var_v = self.aeb_space.init_data_v(AGENT_DATA_NAMES)
    for agent in self.agents:
        agent.reset(state_space.get(a=agent.a))
    fresh_data_vs = [_action_v, _loss_v, _explore_var_v]
    _action_space, _loss_space, _explore_var_space = self.aeb_space.add(AGENT_DATA_NAMES, fresh_data_vs)
    logger.debug3(f'action_space: {_action_space}')
    return _action_space
def update_linear_decay(cls, space_clock):
    '''
    Update the exploration variable for the episode read from space_clock.

    The value moves linearly from explore_var_start towards explore_var_end over
    explore_anneal_epi episodes and never goes below explore_var_end.
    The same value is stored for every body in cls.nanflat_explore_var_a, which is returned.
    '''
    epi = space_clock.get('epi')
    start_val = cls.explore_var_start
    end_val = cls.explore_var_end
    slope = (end_val - start_val) / float(cls.explore_anneal_epi)
    # episode 1 maps to start_val, hence the (epi - 1) offset
    decayed = slope * (epi - 1) + start_val
    explore_var = max(decayed, end_val)
    cls.nanflat_explore_var_a = [explore_var] * cls.agent.body_num
    logger.debug3(f'nanflat_explore_var_a: {cls.nanflat_explore_var_a[0]}')
    return cls.nanflat_explore_var_a
def reset(self):
    '''
    Reset every env and collect the results into fresh data volumes.

    Only state and done from each env's space_reset() are written into the volumes;
    the reward volume is left as initialised.
    Returns (reward_space, state_space, done_space) as produced by aeb_space.add.
    '''
    logger.debug3('EnvSpace.reset')
    _reward_v, state_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES)
    for env in self.envs:
        e = env.e
        _reward_e, state_e, done_e = env.space_reset()
        # fill only the leading slots this env actually provides
        state_v[e, :len(state_e)] = state_e
        done_v[e, :len(done_e)] = done_e
    data_vs = (_reward_v, state_v, done_v)
    _reward_space, state_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, data_vs)
    logger.debug3(f'\nstate_space: {state_space}')
    return _reward_space, state_space, done_space
def get_gae_critic_target(self, rewards):
    '''
    Target for training the critic: the discounted sum of future rewards.

    For each index i the target is rewards[i] + gamma * target[i + 1], with the
    value after the last index taken as 0.
    Returns a torch.Tensor holding one target per reward.
    '''
    target = []
    big_r = 0
    # Accumulate from the last step backwards. Appending and reversing once is
    # linear, whereas inserting at the front of the list each step is quadratic.
    for i in reversed(range(rewards.size(0))):
        big_r = rewards[i] + self.gamma * big_r
        target.append(big_r)
    target.reverse()
    target = torch.Tensor(target)
    logger.debug3(f'Target: {target}')
    return target
def act(self, state_space):
    '''
    Ask every agent for its actions given its slice of state_space.

    Returns the action space produced by registering the filled action volume
    with aeb_space.
    '''
    data_names = ('action', )
    (action_v, ) = self.aeb_space.init_data_v(data_names)
    for agent in self.agents:
        a = agent.a
        action_a = agent.space_act(state_space.get(a=a))
        # fill only the leading slots this agent actually provides
        action_v[a, :len(action_a)] = action_a
    (action_space, ) = self.aeb_space.add(data_names, (action_v, ))
    logger.debug3(f'\naction_space: {action_space}')
    return action_space
def step(self, action_space):
    '''
    Step every env with its slice of action_space.

    Rewards, states and dones from each env are written into fresh data volumes.
    Returns (reward_space, state_space, done_space) as produced by aeb_space.add.
    '''
    reward_v, state_v, done_v = self.aeb_space.init_data_v(ENV_DATA_NAMES)
    for env in self.envs:
        e = env.e
        reward_e, state_e, done_e = env.space_step(action_space.get(e=e))
        # fill only the leading slots this env actually provides
        reward_v[e, :len(reward_e)] = reward_e
        state_v[e, :len(state_e)] = state_e
        done_v[e, :len(done_e)] = done_e
    data_vs = (reward_v, state_v, done_v)
    reward_space, state_space, done_space = self.aeb_space.add(ENV_DATA_NAMES, data_vs)
    logger.debug3(f'\nreward_space: {reward_space}\nstate_space: {state_space}\ndone_space: {done_space}')
    return reward_space, state_space, done_space
def update_multi_linear_decay(cls, _space_clock):
    '''
    Update the exploration variable separately for every body.

    Each body uses the episode count of its own env clock (the space clock argument
    is ignored). The value moves linearly from explore_var_start towards
    explore_var_end over explore_anneal_epi episodes and never goes below explore_var_end.
    The per-body values are stored in cls.nanflat_explore_var_a, which is returned.
    '''
    nanflat_explore_var_a = []
    for body in cls.agent.nanflat_body_a:
        # use body-clock instead of space clock
        epi = body.env.clock.get('epi')
        start_val = cls.explore_var_start
        end_val = cls.explore_var_end
        slope = (end_val - start_val) / float(cls.explore_anneal_epi)
        # episode 1 maps to start_val, hence the (epi - 1) offset
        decayed = slope * (epi - 1) + start_val
        nanflat_explore_var_a.append(max(decayed, end_val))
    cls.nanflat_explore_var_a = nanflat_explore_var_a
    logger.debug3(f'nanflat_explore_var_a: {cls.nanflat_explore_var_a}')
    return cls.nanflat_explore_var_a
def train_separate(self):
    '''
    Train when the actor and critic are separate networks.

    When to_train is 1, a batch is sampled, the critic is trained first and then
    the actor on the same batch. Returns critic loss plus the absolute actor loss.
    Returns np.nan when to_train is not 1.
    '''
    if self.to_train != 1:
        return np.nan

    batch = self.sample()
    logger.debug3(f'Batch states: {batch["states"]}')
    critic_loss = self.train_critic(batch)
    actor_loss = self.train_actor(batch)
    # the actor loss enters the total by magnitude only
    abs_actor_loss = abs(actor_loss)
    total_loss = critic_loss + abs_actor_loss
    logger.debug(
        "Losses: Critic: {:.2f}, Actor: {:.2f}, Total: {:.2f}".format(
            critic_loss, abs_actor_loss, total_loss))
    return total_loss
def update(self, action_space, reward_space, state_space, done_space):
    '''
    Pass every agent its slice of the four spaces and collect what it reports.

    Each agent.update call returns a loss and an exploration variable; both are
    written into fresh data volumes.
    Returns (loss_space, explore_var_space) as produced by aeb_space.add.
    '''
    data_names = ['loss', 'explore_var']
    loss_v, explore_var_v = self.aeb_space.init_data_v(data_names)
    for agent in self.agents:
        a = agent.a
        agent_inputs = (
            action_space.get(a=a),
            reward_space.get(a=a),
            state_space.get(a=a),
            done_space.get(a=a),
        )
        loss_a, explore_var_a = agent.update(*agent_inputs)
        # fill only the leading slots this agent actually provides
        loss_v[a, :len(loss_a)] = loss_a
        explore_var_v[a, :len(explore_var_a)] = explore_var_a
    loss_space, explore_var_space = self.aeb_space.add(data_names, [loss_v, explore_var_v])
    logger.debug3(f'\nloss_space: {loss_space}\nexplore_var_space: {explore_var_space}')
    return loss_space, explore_var_space
def get_policy_loss(self, batch):
    '''
    Return the summed policy loss for a batch of data.

    For REINFORCE just the rewards are passed in as the batch.
    Each step contributes -log_prob * advantage, minus entropy_weight * entropy
    when add_entropy is set.
    '''
    advantage = self.check_sizes(self.calc_advantage(batch))
    policy_loss = []
    steps = zip(self.saved_log_probs, advantage, self.entropy)
    for log_prob, a, e in steps:
        logger.debug3(f'log prob: {log_prob.data[0]}, advantage: {a}, entropy: {e.data[0]}')
        step_loss = -log_prob * a
        if self.add_entropy:
            step_loss = step_loss - self.entropy_weight * e
        policy_loss.append(step_loss)
    return torch.cat(policy_loss).sum()
def get_nstep_target_batch(self, batch):
    '''
    Return a tensor with the n-step return targets for the whole batch.

    Reads batch['next_states'], batch['rewards'] and batch['dones']. The bootstrap
    term (discounted critic value of the next state) is always added, and is zeroed
    wherever done == 1.
    '''
    # The unused local copy of self.num_step_returns was removed; the n-step
    # horizon is handled inside get_R_ex_state_val_estimate.
    next_state_vals = self.get_critic_output(batch['next_states']).squeeze_(dim=1)
    rewards = batch['rewards'].data
    (R, next_state_gammas) = self.get_R_ex_state_val_estimate(next_state_vals, rewards)
    # Complete for 0th step: fold in the immediate reward and discount the
    # bootstrap coefficients once more to match.
    R = rewards + self.gamma * R
    next_state_gammas *= self.gamma
    logger.debug3(f'R: {R}')
    logger.debug3(f'next_state_gammas: {next_state_gammas}')
    logger.debug3(f'dones: {batch["dones"]}')
    # Calculate appropriate state value accounting for terminal states and number of time steps
    discounted_state_val_estimate = torch.mul(next_state_vals, next_state_gammas)
    discounted_state_val_estimate = torch.mul(discounted_state_val_estimate, 1 - batch['dones'].data)
    R += discounted_state_val_estimate
    logger.debug3(f'discounted_state_val_estimate: {discounted_state_val_estimate}')
    logger.debug3(f'R: {R}')
    return R
def check_sizes(self, advantage):
    '''
    Make saved_log_probs and entropy line up with advantage.

    They can differ when a reward was nan (per the original note, the first reward
    of an episode). The first body's memory.last_nan_idxs flags those steps with 1;
    the flagged entries are deleted in place from self.saved_log_probs and
    self.entropy. advantage is returned unchanged.
    '''
    body = self.agent.nanflat_body_a[0]
    nan_idxs = body.memory.last_nan_idxs
    num_nans = sum(nan_idxs)
    total = len(nan_idxs)
    # sanity checks on the sizes before anything is removed
    assert total == len(self.saved_log_probs)
    assert total == len(self.entropy)
    assert total - num_nans == advantage.size(0)
    logger.debug2(f'{num_nans} nans encountered when gathering data')
    if num_nans != 0:
        idxs = [pos for pos, flag in enumerate(nan_idxs) if flag == 1]
        logger.debug3(f'Nan indexes: {idxs}')
        # delete from the back so the remaining indices stay valid
        for idx in reversed(idxs):
            del self.saved_log_probs[idx]
            del self.entropy[idx]
    assert len(self.saved_log_probs) == advantage.size(0)
    assert len(self.entropy) == advantage.size(0)
    return advantage
def compute_q_target_values(self, batch):
    '''
    Compute the Q targets for a multi-headed net, one tensor per body.

    The next action is picked per body with online_net and evaluated with eval_net.
    The target for the action taken is reward + gamma * (1 - done) * Q_next; all
    other actions keep the net's current Q value so their loss is 0.
    Returns a list with one target tensor per entry of batch['batches'].
    '''
    batches = batch['batches']
    # NOTE: q_sts, q_next_st_acts and q_next_sts are lists
    q_sts = self.net.wrap_eval(batch['states'])
    logger.debug3(f'Q sts: {q_sts}')
    q_next_st_acts = self.online_net.wrap_eval(batch['next_states'])
    logger.debug3(f'Q next st act vals: {q_next_st_acts}')
    q_next_acts = []
    for i, q in enumerate(q_next_st_acts):
        # only the argmax is needed, not the max value itself
        q_next_act_b = torch.max(q, dim=1)[1]
        logger.debug3(f'Q next action for body {i}: {q_next_act_b}')
        q_next_acts.append(q_next_act_b)

    # Select q_next_st_maxs based on action selected in q_next_acts
    q_next_sts = self.eval_net.wrap_eval(batch['next_states'])
    logger.debug3(f'Q next_states: {q_next_sts}')
    row_ids = np.array(list(range(self.batch_size)))
    idx = torch.from_numpy(row_ids)
    q_next_st_maxs = []
    for q_next_st_val_b, q_next_act_b in zip(q_next_sts, q_next_acts):
        q_next_st_max_b = q_next_st_val_b[idx, q_next_act_b]
        q_next_st_max_b.unsqueeze_(1)
        logger.debug2(f'Q next_states max {q_next_st_max_b.size()}')
        logger.debug3(f'Q next_states max {q_next_st_max_b}')
        q_next_st_maxs.append(q_next_st_max_b)

    # Compute q_targets per environment using reward and estimated best Q value from the next state if there is one
    # Make future reward 0 if the current state is done
    q_targets_maxs = []
    for b, batch_b in enumerate(batches):
        not_done = 1 - batch_b['dones'].data
        q_targets_max_b = batch_b['rewards'].data + self.gamma * torch.mul(not_done, q_next_st_maxs[b])
        q_targets_maxs.append(q_targets_max_b)
        logger.debug2(f'Batch {b}, Q targets max: {q_targets_max_b.size()}')

    # As in the standard DQN we only want to train the network for the action selected
    # For all other actions we set the q_target = q_sts
    # So that the loss for these actions is 0
    q_targets = []
    for b, batch_b in enumerate(batches):
        taken = torch.mul(q_targets_maxs[b], batch_b['actions'].data)
        untouched = torch.mul(q_sts[b], (1 - batch_b['actions'].data))
        q_targets_b = taken + untouched
        q_targets.append(q_targets_b)
        logger.debug2(f'Batch {b}, Q targets: {q_targets_b.size()}')
    return q_targets
def get_R_ex_state_val_estimate(self, next_state_vals, rewards):
    '''
    Accumulate the discounted rewards of steps 1..n (the 0th step and the
    state-value bootstrap are left to the caller), together with the matching
    per-position coefficients for the bootstrap term.

    The number of steps is self.num_step_returns, clamped to len - 1 when the
    input is not longer than that.
    Returns (R, next_state_gammas), both shaped like next_state_vals.
    '''
    nts = self.num_step_returns
    seq_len = next_state_vals.size(0)
    R = torch.zeros_like(next_state_vals)
    curr_reward_step = torch.zeros_like(next_state_vals)
    next_state_gammas = torch.zeros_like(next_state_vals)
    if nts >= seq_len:
        logger.debug2(f'Num step returns {self.num_step_returns} greater than length batch {next_state_vals.size(0)}. Updating to batch length')
        nts = seq_len - 1
    if nts == 0:
        next_state_gammas.fill_(1.0)
        return (R, next_state_gammas)

    # positions that have a full n steps ahead of them start with coefficient 1
    next_state_gammas[:-nts] = 1.0
    # i walks the look-ahead from n down to 1 while j walks the tail boundary
    # from -n up to -1; the two always move together.
    for i, j in zip(range(nts, 0, -1), range(-nts, 0)):
        logger.debug(f'i: {i}, j: {j}')
        # rewards shifted i steps ahead; positions past the end keep their old value
        curr_reward_step[:j] = rewards[i:]
        next_state_gammas[:j] *= self.gamma
        R = curr_reward_step + self.gamma * R
        # position j joins with coefficient 1 and is discounted on later passes
        next_state_gammas[j] = 1.0
        logger.debug3(f'curr_reward_step: {curr_reward_step}')
        logger.debug3(f'next_state_gammas: {next_state_gammas}')
        logger.debug3(f'R: {R}')
    return (R, next_state_gammas)
def train(self):
    '''
    Run one training step when to_train is 1, otherwise do nothing.

    A batch with fewer than 2 states is skipped. to_train is reset to 0 whenever
    a batch was sampled, whether or not it was trained on.
    Returns the scalar loss, or np.nan when no training step was taken.
    '''
    t = util.s_get(self, 'aeb_space.clock').get('total_t')
    if self.to_train != 1:
        logger.debug3('NOT training')
        return np.nan

    logger.debug3(f'Training at t: {t}')
    batch = self.sample()
    if batch['states'].size(0) < 2:
        logger.info(f'Batch too small to train with, skipping...')
        self.to_train = 0
        return np.nan

    q_targets = self.compute_q_target_values(batch)
    if torch.cuda.is_available() and self.gpu:
        q_targets = q_targets.cuda()
    loss = self.net.training_step(batch['states'], Variable(q_targets))
    logger.debug(f'loss {loss.data[0]}')
    self.to_train = 0
    return loss.data[0]
def train(self):
    '''
    Take one policy-gradient step when to_train is 1.

    Only the rewards of the sampled batch are used. After the optimiser step
    to_train is reset to 0 and the saved log probs and entropies are cleared.
    Returns the scalar policy loss read before the backward pass, or np.nan when
    to_train is not 1.
    '''
    if self.to_train != 1:
        return np.nan

    logger.debug2(f'Training...')
    # We only care about the rewards from the batch
    rewards = self.sample()['rewards']
    logger.debug3(f'Length first epi: {len(rewards[0])}')
    logger.debug3(f'Len log probs: {len(self.saved_log_probs)}')
    net = self.net
    net.optim.zero_grad()
    policy_loss = self.get_policy_loss(rewards)
    loss = policy_loss.data[0]
    policy_loss.backward()
    if net.clamp_grad:
        logger.debug("Clipping gradient...")
        torch.nn.utils.clip_grad_norm(net.parameters(), net.clamp_grad_val)
    logger.debug2(f'Gradient norms: {self.net.get_grad_norms()}')
    net.optim.step()
    # the trajectory has been consumed; start collecting afresh
    self.to_train = 0
    self.saved_log_probs = []
    self.entropy = []
    logger.debug(f'Policy loss: {loss}')
    return loss
def calc_advantage(self, raw_rewards):
    '''
    Return the advantage for each action.

    raw_rewards is iterated as a sequence of per-episode reward sequences. For each
    episode the discounted return of every step is computed and then normalised
    with that episode's mean and standard deviation. The per-episode results are
    concatenated into one tensor.
    '''
    advantage = []
    logger.debug3(f'Raw rewards: {raw_rewards}')
    eps = np.finfo(np.float32).eps
    for epi_rewards in raw_rewards:
        rewards = []
        big_r = 0
        # Accumulate from the last step backwards. Appending and reversing once
        # is linear, whereas inserting at the front each step is quadratic.
        for r in epi_rewards[::-1]:
            big_r = r + self.gamma * big_r
            rewards.append(big_r)
        rewards.reverse()
        rewards = torch.Tensor(rewards)
        logger.debug3(f'Rewards: {rewards}')
        # The sample standard deviation of fewer than two values is nan, which
        # would turn the advantage (and the summed policy loss) into nan. Treat
        # the spread as 0 so a single-step episode normalises to 0 instead.
        std = rewards.std() if rewards.numel() > 1 else 0.0
        rewards = (rewards - rewards.mean()) / (std + eps)
        logger.debug3(f'Normalized rewards: {rewards}')
        advantage.append(rewards)
    advantage = torch.cat(advantage)
    return advantage
def create_torch_state(state, state_buffer, recurrent=False, length=0):
    '''
    Convert a state to a torch Variable for the net.

    Non-recurrent: the state alone is converted.
    Recurrent: state_buffer is left-padded in place with zero states up to
    `length`, converted to an array, and given a leading dimension of 1.
    If the buffer sums to 0, its last slot is replaced by `state`.
    '''
    if not recurrent:
        torch_state = Variable(torch.from_numpy(state).float())
    else:
        # Create sequence of inputs for recurrent net
        logger.debug3(f'length of state buffer: {length}')
        missing = length - len(state_buffer)
        if missing > 0:
            PAD = np.zeros_like(state)
            # pads go at the front so the real states stay at the end
            for _ in range(missing):
                state_buffer.insert(0, PAD)
        state_buffer = np.asarray(state_buffer)
        # Hack to fix buffer not storing the very first state in an epi
        if np.sum(state_buffer) == 0:
            state_buffer[-1] = state
        torch_state = Variable(torch.from_numpy(state_buffer).float())
        torch_state.unsqueeze_(dim=0)
    logger.debug2(f'State size: {torch_state.size()}')
    logger.debug3(f'Original state: {state}')
    logger.debug3(f'State: {torch_state}')
    return torch_state
def forward(self, x):
    '''
    The feedforward step. Input is batch_size x sequence_length x state_dim.

    Every state is first passed through state_proc_model, the sequence is run
    through the rnn, and the final hidden state is fed to each output layer.
    Returns a single tensor when there is one output layer, otherwise a list.
    '''
    batch_size = x.size(0)
    # Unstack input to (batch_size x sequence_length) x state_dim in order to
    # transform all state inputs
    x = self.state_proc_model(x.view(-1, self.in_dim))
    # Restack to batch_size x sequence_length x rnn_input_dim
    x = x.view(-1, self.sequence_length, self.rnn_input_dim)
    _, final_hid = self.rnn(x, self.init_hidden(batch_size))
    final_hid.squeeze_(dim=0)
    outs = [layer(final_hid) for layer in self.out_layers]
    logger.debug3(f'Network input: {x.size()}')
    logger.debug3(f'Network input: {x.data}')
    logger.debug3(f'Network output: {outs}')
    # If only one head, return tensor, otherwise return list of outputs
    return outs[0] if len(outs) == 1 else outs
def compute_q_target_values(self, batch):
    '''
    Compute the Q targets for a net whose single output layer concatenates the
    action values of all bodies.

    The next action is picked per body with online_net on that body's slice of
    the output, and evaluated with eval_net. The target for the action taken is
    reward + gamma * (1 - done) * Q_next; all other actions keep the net's
    current Q value so their loss is 0.
    Returns one tensor covering the combined output layer.
    '''
    batches = batch['batches']
    q_sts = self.net.wrap_eval(batch['states'])
    logger.debug3(f'Q sts: {q_sts}')
    # TODO parametrize usage of eval or target_net
    q_next_st_acts = self.online_net.wrap_eval(batch['next_states'])
    logger.debug3(f'Q next st act vals: {q_next_st_acts}')
    q_next_acts = []
    start_idx = 0
    for body in self.agent.nanflat_body_a:
        end_idx = start_idx + body.action_dim
        body_q = q_next_st_acts[:, start_idx:end_idx]
        q_next_act_b = torch.max(body_q, dim=1)[1]
        # Shift action so that they have the right indices in combined layer
        q_next_act_b += start_idx
        logger.debug2(f'Q next action for body {body.aeb}: {q_next_act_b.size()}')
        logger.debug3(f'Q next action for body {body.aeb}: {q_next_act_b}')
        q_next_acts.append(q_next_act_b)
        start_idx = end_idx

    # Select q_next_st_maxs based on action selected in q_next_acts
    q_next_sts = self.eval_net.wrap_eval(batch['next_states'])
    logger.debug2(f'Q next_states: {q_next_sts.size()}')
    logger.debug3(f'Q next_states: {q_next_sts}')
    row_ids = np.array(list(range(self.batch_size)))
    idx = torch.from_numpy(row_ids)
    q_next_st_maxs = []
    for q_next_act_b in q_next_acts:
        q_next_st_max_b = q_next_sts[idx, q_next_act_b]
        q_next_st_max_b.unsqueeze_(1)
        logger.debug2(f'Q next_states max {q_next_st_max_b.size()}')
        logger.debug3(f'Q next_states max {q_next_st_max_b}')
        q_next_st_maxs.append(q_next_st_max_b)

    # Compute final q_target using reward and estimated best Q value from the
    # next state if there is one. Make future reward 0 if the current state is
    # done. Do it individually first, then combine.
    # Each individual target should automatically expand to the dimension of the
    # relevant action space
    q_targets_maxs = []
    for b, batch_b in enumerate(batches):
        not_done = 1 - batch_b['dones'].data
        target_b = batch_b['rewards'].data + self.gamma * torch.mul(not_done, q_next_st_maxs[b])
        target_np = target_b.numpy()
        # repeat the per-row target across this body's action columns
        expanded_shape = (target_np.shape[0], self.action_dims[b])
        q_targets_max_b = torch.from_numpy(np.broadcast_to(target_np, expanded_shape))
        q_targets_maxs.append(q_targets_max_b)
        logger.debug2(f'Q targets max: {q_targets_max_b.size()}')
    q_targets_maxs = torch.cat(q_targets_maxs, dim=1)
    logger.debug2(f'Q targets maxes: {q_targets_maxs.size()}')
    logger.debug3(f'Q targets maxes: {q_targets_maxs}')

    # Also concat actions - each batch should have only two non zero dimensions
    combined_actions = torch.cat([batch_b['actions'] for batch_b in batches], dim=1)
    logger.debug2(f'combined_actions: {combined_actions.size()}')
    logger.debug3(f'combined_actions: {combined_actions}')

    # We only want to train the network for the action selected
    # For all other actions we set the q_target = q_sts
    # So that the loss for these actions is 0
    taken = torch.mul(q_targets_maxs, combined_actions.data)
    untouched = torch.mul(q_sts, (1 - combined_actions.data))
    q_targets = taken + untouched
    logger.debug2(f'Q targets: {q_targets.size()}')
    logger.debug3(f'Q targets: {q_targets}')
    return q_targets
def get_gae_actor_target(self, rewards, states, next_states, dones):
    '''
    Target is the Generalized Advantage Estimate plus the current state-value estimate.

    The one-step TD residuals are computed first:
        delta = reward + gamma * V(next_state) * (1 - done) - V(state)
    and then accumulated backwards with weight gamma * lamda.
    The state values are added back so that the calc_advantage() api is preserved.
    '''
    # First calculate the 1 step bootstrapped estimate of the advantage. Also
    # described as the TD residual of V with discount self.gamma (Sutton & Barto, 1998)
    next_state_vals = self.get_critic_output(next_states).squeeze_(dim=1)
    next_state_vals = torch.mul(next_state_vals, 1 - dones.data)
    state_vals = self.get_critic_output(states).squeeze_(dim=1)
    deltas = rewards + self.gamma * next_state_vals - state_vals
    logger.debug3(f'State_vals: {state_vals}')
    logger.debug3(f'Next state_vals: {next_state_vals}')
    logger.debug3(f'Dones: {dones}')
    logger.debug3(f'Deltas: {deltas}')
    logger.debug3(f'Lamda: {self.lamda}, gamma: {self.gamma}')
    # Then calculate GAE, the exponentially weighted average of the TD residuals.
    # Appending and reversing once is linear, whereas inserting at the front of
    # the list each step is quadratic.
    advantage = []
    gae = 0
    for i in reversed(range(deltas.size(0))):
        gae = deltas[i] + self.gamma * self.lamda * gae
        advantage.append(gae)
    advantage.reverse()
    advantage = torch.Tensor(advantage)
    if torch.cuda.is_available() and self.gpu:
        advantage = advantage.cuda()
    # Add state_vals so that calc_advantage() api is preserved
    target = advantage + state_vals
    logger.debug3(f'Advantage: {advantage}')
    logger.debug3(f'Target: {target}')
    return target
def compute_q_target_values(self, batch):
    '''
    Computes the target Q values for a batch of experiences.

    The next-state value is the Q value of the action actually taken in the next
    state (batch['next_actions']). The target for the action taken in the current
    state is reward + gamma * (1 - done) * Q_next; all other actions keep the
    net's current Q value so their loss is 0.
    The batch is not modified.
    '''
    # Calculate the Q values of the current and next states
    q_sts = self.net.wrap_eval(batch['states'])
    q_next_st = self.net.wrap_eval(batch['next_states'])
    q_next_actions = batch['next_actions']
    logger.debug2(f'Q next states: {q_next_st.size()}')
    # Get the q value for the next action that was actually taken
    idx = torch.from_numpy(np.arange(q_next_st.size(0)))
    if torch.cuda.is_available() and self.gpu:
        idx = idx.cuda()
    # Out-of-place squeeze: the in-place squeeze_ reshaped the caller's
    # batch['next_actions'], so the batch could not safely be reused.
    next_action_idx = q_next_actions.squeeze(1).data.long()
    q_next_st_vals = q_next_st[idx, next_action_idx]
    # Expand the dims so that q_next_st_vals can be broadcast
    q_next_st_vals.unsqueeze_(1)
    logger.debug2(f'Q next_states vals {q_next_st_vals.size()}')
    logger.debug3(f'Q next_states {q_next_st}')
    logger.debug3(f'Q next actions {q_next_actions}')
    logger.debug3(f'Q next_states vals {q_next_st_vals}')
    logger.debug3(f'Dones {batch["dones"]}')
    # Compute q_targets using reward and Q value corresponding to the action taken
    # in the next state if there is one. Make next state Q value 0 if the current
    # state is done
    q_targets_actual = batch['rewards'].data + self.gamma * \
        torch.mul((1 - batch['dones'].data), q_next_st_vals)
    logger.debug2(f'Q targets actual: {q_targets_actual.size()}')
    logger.debug3(f'Q states {q_sts}')
    logger.debug3(f'Q targets actual: {q_targets_actual}')
    # We only want to train the network for the action selected in the current state
    # For all other actions we set the q_target = q_sts so that the loss for these actions is 0
    q_targets = torch.mul(q_targets_actual, batch['actions_onehot'].data) + \
        torch.mul(q_sts, (1 - batch['actions_onehot'].data))
    logger.debug2(f'Q targets: {q_targets.size()}')
    logger.debug3(f'Q targets: {q_targets}')
    return q_targets
def check_api(*args, **kwargs):
    '''
    Call `fn` with the given arguments, log its name and output, and return the
    output unchanged.

    `fn` is not defined in this block; it is presumably the function captured by an
    enclosing decorator - confirm at the definition site.
    '''
    output = fn(*args, **kwargs)
    # log after the call so the output is available to the message
    logger.debug3(f'API method: {fn.__name__}, output: {output}')
    return output