def eval_action_cert_rate(curr_model, env, args, epsilon=1e-4):
    """Play one episode greedily and measure how often the greedy action is certified.

    A step is "certified" when the lower bound of the chosen action's Q-value
    exceeds the upper bound of every other action under an epsilon-ball input
    perturbation, i.e. no perturbation within epsilon can change the argmax.

    Args:
        curr_model: wrapper whose ``forward`` yields Q-values and whose
            ``.model`` attribute is the raw network passed to ``network_bounds``.
        env: Gym-style environment (``reset``/``step`` with 4-tuple returns).
        args: namespace providing ``gpu_id`` (< 0 means CPU).
        epsilon: radius of the input perturbation ball.

    Returns:
        Fraction of steps whose greedy action was certified.
    """
    episode_reward = 0
    state = env.reset()
    total = 0
    certified = 0
    with torch.no_grad():
        while True:
            input_x = torch.FloatTensor(state).unsqueeze(0)
            if args.gpu_id >= 0:
                with torch.cuda.device(args.gpu_id):
                    input_x = input_x.cuda()
            output = curr_model.forward(input_x)
            action = torch.argmax(output, dim=1)
            upper, lower = network_bounds(curr_model.model, input_x,
                                          epsilon=epsilon)
            # Remove the selected action from the comparison by pushing its
            # upper bound far below any plausible Q-value.
            upper[:, action] = -1e10
            max_other = torch.max(upper, dim=1)[0]
            # Certified iff even the worst case for the chosen action beats
            # the best case of every competitor.
            if lower[:, action] > max_other:
                certified += 1
            total += 1
            next_state, reward, done, info = env.step(action)
            episode_reward += reward
            state = next_state
            if done and not info:
                # Life lost but episode continues (Atari-style wrappers):
                # reset and keep accumulating.
                state = env.reset()
            elif info:
                # True end of episode: without this break the loop never
                # terminates and the return below is unreachable.
                state = env.reset()
                break
    print('Reward:{}, action certification rate {:.4f}'.format(
        episode_reward, certified / total))
    return certified / total
def eval_greedy_wc(curr_model, env, args, epsilon=1e-4):
    """Play one episode choosing the worst plausible action at every step.

    Under an epsilon-ball input perturbation, an adversary could force any
    action whose Q-value upper bound is not dominated.  This picks, among
    those still-possible actions, the one with the smallest nominal Q-value,
    giving a lower bound on adversarial performance.

    Args:
        curr_model: wrapper whose ``forward`` yields Q-values and whose
            ``.model`` attribute is the raw network passed to ``network_bounds``.
        env: Gym-style environment (``reset``/``step`` with 4-tuple returns).
        args: namespace providing ``gpu_id`` (< 0 means CPU).
        epsilon: radius of the input perturbation ball.

    Returns:
        Total (worst-case) episode reward.
    """
    episode_reward = 0
    state = env.reset()
    with torch.no_grad():
        while True:
            input_x = torch.FloatTensor(state).unsqueeze(0)
            if args.gpu_id >= 0:
                with torch.cuda.device(args.gpu_id):
                    input_x = input_x.cuda()
            output = curr_model.forward(input_x)
            upper, lower = network_bounds(curr_model.model, input_x,
                                          epsilon=epsilon)
            # An action is impossible if its best case is worse than some
            # action's worst case.  keepdim=True keeps the (batch, 1) shape so
            # the broadcast is per-row correct for any batch size (identical
            # result for the batch-of-1 used here).
            impossible = upper < torch.max(lower, dim=1, keepdim=True)[0]
            # Add a large number to ignore impossible actions, then choose the
            # possible action with the smallest Q-value.
            worst_case_action = torch.argmin(output + 1e6 * impossible, dim=1)
            next_state, reward, done, info = env.step(worst_case_action[0])
            episode_reward += reward
            state = next_state
            if done and not info:
                # Life lost but episode continues: reset and keep going.
                state = env.reset()
            elif info:
                # True end of episode: without this break the loop never
                # terminates and the return below is unreachable.
                state = env.reset()
                break
    print('Worst case reward {}'.format(episode_reward))
    return episode_reward
def action_test_losses(self, bound_epsilon=None):
    """Take one greedy environment step while recording test-time loss terms.

    Runs the actor-critic model on the current state, appends the resulting
    value / log-prob / entropy to the agent's running lists, and (optionally)
    also records certified upper/lower bounds on the chosen action's log-prob
    under an epsilon input perturbation.

    Args:
        bound_epsilon: if truthy, radius of the perturbation ball used by
            ``network_bounds``; ``None``/0 skips the bound computation.

    Returns:
        self (fluent style, matching the file's other action methods).
    """
    # NOTE(review): the whole body runs under no_grad — these "losses" are
    # recorded for evaluation/logging, not for backprop.
    with torch.no_grad():
        # Model returns (value estimate, action logits).
        value, logit = self.model(Variable(self.state.unsqueeze(0)))
        # Clamp probs/log-probs away from 0 / -inf for numerical stability.
        prob = torch.clamp(F.softmax(logit, dim=1), 1e-6, 1)
        log_prob = torch.clamp(F.log_softmax(logit, dim=1), -30, -1e-6)
        entropy = -(log_prob * prob).sum(1)
        self.entropies.append(entropy)
        # Greedy (test-time) action.
        action = prob.argmax(1, keepdim=True).data
        if bound_epsilon:
            upper, lower = network_bounds(self.model.model, Variable(
                self.state.unsqueeze(0)), epsilon=bound_epsilon)
            # Drop column 0 of the bounds — presumably the value head shares
            # the output layer and logits start at index 1; TODO confirm
            # against the model definition.
            upper, lower = upper[:, 1:], lower[:, 1:]
            # NOTE(review): assumes a GPU is in use when bound_epsilon is set;
            # there is no gpu_id >= 0 guard here unlike the eval functions.
            with torch.cuda.device(self.gpu_id):
                onehot_action = torch.zeros(upper.shape).cuda()
            onehot_action[range(upper.shape[0]), action] = 1
            # Worst case for the chosen action: its logit at the lower bound,
            # all competitors at their upper bounds (and vice versa for the
            # best case), clamped like the nominal log-probs above.
            min_prob = torch.clamp(
                F.log_softmax(onehot_action * lower +
                              (1 - onehot_action) * upper, dim=1), -30, -1e-6)
            max_prob = torch.clamp(
                F.log_softmax(
                    (1 - onehot_action) * lower +
                    onehot_action * upper, dim=1), -30, -1e-6)
            self.max_log_probs.append(max_prob.gather(1, Variable(action)))
            self.min_log_probs.append(min_prob.gather(1, Variable(action)))
        log_prob = log_prob.gather(1, Variable(action))
        state, self.noclip_reward, self.done, self.info = self.env.step(
            action.cpu().numpy())
        # Standard reward clipping to [-1, 1].
        self.reward = max(min(self.noclip_reward, 1), -1)
        self.state = torch.from_numpy(state).float()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                self.state = self.state.cuda()
        self.values.append(value)
        self.log_probs.append(log_prob)
        self.rewards.append(self.reward)
        self.eps_len += 1
    return self
def _compute_robust_loss(curr_model, target_model, data, epsilon, kappa, gamma, device, args):
    """Blend a standard double-DQN TD loss with a certified worst-case loss.

    The standard term is an element-wise min of squared and absolute TD error
    (quadratic near zero, linear in the tails).  The robust term measures, per
    action, how far the network's output could drift from its regression
    target under an epsilon input perturbation, using interval bounds from
    ``network_bounds``.

    Args:
        curr_model: online Q-network wrapper (``.model`` is the raw net).
        target_model: target Q-network for double-DQN evaluation.
        data: (state, action, reward, next_state, done) batch tensors.
        epsilon: perturbation radius for the bound computation.
        kappa: mixing weight; 1 = pure standard loss, 0 = pure worst-case.
        gamma: discount factor.
        device: torch device for the one-hot mask.
        args: namespace; ``worse_bound`` selects the looser two-sided bound.

    Returns:
        (total loss, mean standard loss, mean worst-case loss).
    """
    state, action, reward, next_state, done = data

    # Double DQN: the online net selects the next action, the target net
    # evaluates it.
    q_all = curr_model(state)
    next_q_online = curr_model(next_state)
    next_q_target = target_model(next_state)

    q_taken = q_all.gather(1, action.unsqueeze(1)).squeeze(1)
    best_next = torch.argmax(next_q_online, 1, keepdim=True)
    next_q = next_q_target.gather(1, best_next).squeeze(1)
    td_target = reward + gamma * next_q * (1 - done)

    # Quadratic for small errors, linear for large (element-wise min).
    td_err = q_taken - td_target.detach()
    standard_loss = torch.min(td_err.pow(2), torch.abs(td_err))

    upper, lower = network_bounds(curr_model.model, state, epsilon)

    # One-hot mask marking the taken action in each row.
    onehot = torch.zeros(upper.shape).to(device)
    onehot[range(state.shape[0]), action] = 1

    # Per-row regression reference: the TD target for the taken action, the
    # current (detached-target-free) Q-values for the rest.
    off_term = q_all * (1 - onehot)
    on_term = td_target.detach().unsqueeze(1) * onehot

    if args.worse_bound:
        # Looser bound: worst deviation over both interval ends.
        hi_dev = torch.abs(upper - off_term - on_term)
        lo_dev = torch.abs(lower - off_term - on_term)
        wc_diff = torch.max(hi_dev, lo_dev)
    else:
        # Tighter bound: taken action at its lower end, others at their upper.
        worst_case = onehot * lower + (1 - onehot) * upper
        wc_diff = torch.abs(worst_case - off_term - on_term)

    # Sum over the action dimension, mean over the batch only.
    worst_case_loss = torch.min(wc_diff.pow(2), wc_diff).sum(dim=1).mean()
    standard_loss = standard_loss.mean()

    loss = kappa * standard_loss + (1 - kappa) * worst_case_loss
    return loss, standard_loss, worst_case_loss