import numpy as np
import torch
from torch.autograd import grad


def integrated_gradient(model, x, pred_label, step_size=0.02, n_iters=4):
    """Integrated gradients for the recurrent model: average the gradient over
    a straight-line path from the zero baseline to x, then scale by the input."""
    avg_grad = None
    for n in range(1, n_iters + 1):
        # Gradient at a scaled copy of the input (n/n_iters of the way from 0 to x).
        x_ = float(n) / n_iters * x
        x_ = x_.detach()
        gradient, _, _, _ = vanilla_gradient(model, x_, pred_label, step_size)
        if n == 1:
            avg_grad = gradient
        else:
            avg_grad += gradient
    avg_grad /= n_iters

    inte_grad = np.multiply(avg_grad, x.detach().cpu().data.numpy())
    scale = np.sum(inte_grad, axis=-1, keepdims=True)
    intp = np.multiply(avg_grad, scale)
    grad_l2 = np.sum(intp[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    model.hidden = model.init_hidden()
    pred, _ = model(x.cpu())
    p_prior = logit2prob(pred[0].data.numpy())

    intp /= np.sqrt(np.sum(intp[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after)

    model.hidden = model.init_hidden()
    pred, _ = model(x_after.cpu())
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return inte_grad, importance_score, x_after, changes_pred, avg_grad
def smooth_gradient(model, x0, pred_label, DEVICE, step_size,
                    noise_range=0.02, n_iters=20):
    """SmoothGrad for the recurrent model: average the gradient over noisy
    copies of the input embedding."""
    smooth_grad = None
    for n in range(n_iters):
        x0_ = x0 + torch.randn(x0.shape).to(DEVICE) * noise_range
        gradient, _, _, _ = vanilla_gradient(model, x0_, pred_label)
        if n == 0:
            smooth_grad = gradient
        else:
            smooth_grad += gradient
    smooth_grad /= n_iters

    grad_l2 = np.sum(smooth_grad[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    model.hidden = model.init_hidden()
    pred, _ = model(x0.cpu())
    p_prior = logit2prob(pred[0].data.numpy())

    smooth_grad /= np.sqrt(np.sum(smooth_grad[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x0.cpu().data.numpy())
    x_after = perturb_embedding(x_after, smooth_grad * step_size)
    x_after = torch.from_numpy(x_after)

    model.hidden = model.init_hidden()
    pred, _ = model(x_after)
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return smooth_grad, importance_score, x_after, changes_pred
def vanilla_gradient(model, x, pred_label, step_size=0.02):
    """Plain gradient of the predicted-class score with respect to the input
    embedding of the recurrent model, plus the effect of a small step along it."""
    model.batch_size = 1
    model.hidden = model.init_hidden()
    x = x.cpu()
    x.requires_grad = True
    pred, _ = model(x)
    x_prior = x.data.numpy()
    p_prior = logit2prob(pred[0].data.numpy())

    # Select the logit of the predicted class with a one-hot mask.
    one_hot = np.zeros((1, 2), dtype=np.float32)
    one_hot[0][pred_label[0]] = 1
    one_hot = torch.from_numpy(one_hot)
    one_hot.requires_grad = True
    one_hot = torch.sum(one_hot * pred[0])

    gradient = grad(one_hot, x)[0].numpy()
    grad_l2 = np.sum(gradient[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    gradient /= np.sqrt(np.sum(gradient[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x_prior)
    x_after = perturb_embedding(x_after, gradient * step_size)
    x_after = torch.from_numpy(x_after)

    model.hidden = model.init_hidden()
    pred, _ = model(x_after)
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return gradient, importance_score, x_after, changes_pred
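# ---------------------------------------------------------------------------
# NOTE: the helpers below are NOT part of the original listing. They are a
# minimal sketch of what logit2prob, normalize_score, and perturb_embedding
# are assumed to do, inferred only from how they are called above; the actual
# project implementations may differ.
def logit2prob(logits):
    # Assumed: softmax over the class logits of a single example.
    e = np.exp(logits - np.max(logits))
    return e / np.sum(e)


def normalize_score(scores):
    # Assumed: rescale the per-token scores so they sum to one.
    total = np.sum(scores)
    return scores / total if total > 0 else scores


def perturb_embedding(x, delta):
    # Assumed: shift the embedding along the (already scaled) direction delta.
    return (x + delta).astype(x.dtype)
# ---------------------------------------------------------------------------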
def gradient_times_input(model, row, pred_label, DEVICE, step_size=0.02):
    """Gradient-times-input for the BERT-style model (inputs_embeds interface)."""
    gradient, importance_score, x_after, changes_pred = vanilla_gradient(
        model, row, pred_label, DEVICE, step_size=step_size)
    x0, segments_ids, input_masks = row

    grad_times_input = np.multiply(gradient, x0.detach().cpu().data.numpy())
    scale = np.sum(grad_times_input, axis=-1, keepdims=True)
    intp = np.multiply(gradient, scale)
    grad_l2 = np.sum(intp[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    pred = model(inputs_embeds=x0,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    intp /= np.sqrt(np.sum(intp[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x0.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)

    pred = model(inputs_embeds=x_after,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return grad_times_input, importance_score, x_after, changes_pred
def vanilla_gradient(model, row, pred_label, DEVICE, step_size=0.02):
    """Plain gradient of the predicted-class logit with respect to the input
    embeddings of the BERT-style model, plus the effect of a small step along it."""
    x, segments_ids, input_masks = row
    x.requires_grad = True
    pred = model(inputs_embeds=x,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    x_prior = x.cpu().data.numpy()
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    # Select the logit of the predicted class with a one-hot mask.
    one_hot = np.zeros((1, 2), dtype=np.float32)
    one_hot[0][pred_label[0]] = 1
    one_hot = torch.from_numpy(one_hot).to(DEVICE)
    one_hot.requires_grad = True
    one_hot = torch.sum(one_hot * pred[0])

    gradient = grad(one_hot, x)[0].cpu().numpy()
    grad_l2 = np.sum(gradient[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    gradient_unit = gradient / np.sqrt(np.sum(gradient[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x_prior)
    x_after = perturb_embedding(x_after, gradient_unit * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)

    pred = model(inputs_embeds=x_after,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return gradient, importance_score, x_after, changes_pred
def integrated_gradient(model, row, pred_label, DEVICE, step_size=0.02, n_iters=7):
    """Integrated gradients for the BERT-style model: average the gradient over
    a straight-line path from the zero baseline to x, then scale by the input."""
    x, segments_ids, input_masks = row
    avg_grad = None
    for n in range(1, n_iters + 1):
        # Gradient at a scaled copy of the input (n/n_iters of the way from 0 to x).
        x_ = float(n) / n_iters * x
        x_ = x_.detach()
        gradient, _, _, _ = vanilla_gradient(model,
                                             [x_, segments_ids, input_masks],
                                             pred_label, DEVICE)
        if n == 1:
            avg_grad = gradient
        else:
            avg_grad += gradient
    avg_grad /= n_iters

    inte_grad = np.multiply(avg_grad, x.detach().cpu().data.numpy())
    scale = np.sum(inte_grad, axis=-1, keepdims=True)
    intp = np.multiply(avg_grad, scale)
    grad_l2 = np.sum(intp[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    pred = model(inputs_embeds=x,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    intp /= np.sqrt(np.sum(intp[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)

    pred = model(inputs_embeds=x_after,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return inte_grad, importance_score, x_after, changes_pred
def gradient_times_input(model, x, pred_label, step_size=0.02):
    """Gradient-times-input for the recurrent model."""
    gradient, importance_score, x_after, changes_pred = vanilla_gradient(
        model, x.detach(), pred_label, step_size=step_size)

    grad_times_input = np.multiply(gradient, x.detach().cpu().data.numpy())
    scale = np.sum(grad_times_input, axis=-1, keepdims=True)
    intp = np.multiply(gradient, scale)
    grad_l2 = np.sum(intp[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    model.hidden = model.init_hidden()
    pred, _ = model(x.cpu())
    p_prior = logit2prob(pred[0].data.numpy())

    intp /= np.sqrt(np.sum(intp[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after)

    model.hidden = model.init_hidden()
    pred, _ = model(x_after.cpu())
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return intp, importance_score, x_after, changes_pred
def smooth_gradient(model, row, pred_label, DEVICE, step_size, n_iters=20):
    """SmoothGrad for the BERT-style model: average the gradient over noisy
    copies of the input embeddings."""
    x0, segments_ids, input_masks = row
    noise_range = 0.4 * step_size
    smooth_grad = None
    for n in range(n_iters):
        noise = torch.randn(x0.shape)
        noise = noise / torch.sqrt(torch.sum(noise[0, :, :]**2)) * noise_range  # scale noise to length noise_range
        x0_ = x0 + noise.to(DEVICE)
        gradient, _, _, _ = vanilla_gradient(model,
                                             [x0_, segments_ids, input_masks],
                                             pred_label, DEVICE)
        if n == 0:
            smooth_grad = gradient
        else:
            smooth_grad += gradient
    smooth_grad /= n_iters

    grad_l2 = np.sum(smooth_grad[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    pred = model(inputs_embeds=x0,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    smooth_grad /= np.sqrt(np.sum(smooth_grad[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x0.cpu().data.numpy())
    x_after = perturb_embedding(x_after, smooth_grad * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)

    pred = model(inputs_embeds=x_after,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return smooth_grad, importance_score, x_after, changes_pred
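# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original listing). It shows one
# way the BERT-style variants above could be driven, assuming a HuggingFace
# BertForSequenceClassification model; the checkpoint name, example sentence,
# and variable names are illustrative assumptions, not the original setup.
if __name__ == "__main__":
    from transformers import BertForSequenceClassification, BertTokenizer

    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to(DEVICE)
    model.eval()

    enc = tokenizer("the movie was surprisingly good", return_tensors="pt").to(DEVICE)
    # The functions above take pre-computed word embeddings (inputs_embeds);
    # detach so requires_grad can be toggled on a leaf tensor inside them.
    x = model.bert.embeddings.word_embeddings(enc["input_ids"]).detach()
    row = [x, enc["token_type_ids"], enc["attention_mask"]]

    with torch.no_grad():
        logits = model(inputs_embeds=x,
                       token_type_ids=enc["token_type_ids"],
                       attention_mask=enc["attention_mask"])[0]
    pred_label = logits.argmax(dim=-1).cpu().numpy()

    sg, scores, x_after, delta_p = smooth_gradient(model, row, pred_label,
                                                   DEVICE, step_size=0.02)
    print("per-token importance:", scores)
    print("prediction change after perturbation:", delta_p)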