def attack(self, x_orig, target, max_change=0.4):
    """Population-based (genetic) attack on a sequence of word ids.

    Evolves a population of perturbed copies of `x_orig` until the
    classifier assigns class `target` to the best member, or until
    `self.max_iters` generations elapse.

    Returns the successful adversarial sequence, or None on failure.
    """
    n_words = np.sum(np.sign(x_orig))
    # First neighbour scan (up to 50 per word) is used only to build the
    # per-position sampling probabilities: positions with more close
    # neighbours are picked for substitution more often.
    scan = [
        glove_utils.pick_most_similar_words(x_orig[idx], self.dist_mat, 50, 0.5)
        for idx in range(n_words)
    ]
    counts = [len(words) for words, _ in scan]
    for idx in range(n_words):
        if x_orig[idx] < 27:
            # To prevent replacement of words like 'the', 'a', 'of', etc.
            counts[idx] = 0
    w_select_probs = counts / np.sum(counts)
    # Second scan keeps only the top_n candidates actually offered to the search.
    scan = [
        glove_utils.pick_most_similar_words(x_orig[idx], self.dist_mat,
                                            self.top_n, 0.5)
        for idx in range(n_words)
    ]
    candidate_words = [words for words, _ in scan]
    candidate_dists = [dists for _, dists in scan]
    pop = self.generate_population(x_orig, candidate_words, candidate_dists,
                                   w_select_probs, target, self.pop_size)
    for generation in range(self.max_iters):
        pop_preds = self.batch_model.predict(self.sess, np.array(pop))
        target_scores = pop_preds[:, target]
        print('\t\t', generation, ' -- ', np.max(target_scores))
        best = np.argsort(target_scores)[::-1][0]
        # Temperature-scaled softmax over target scores -> parent sampling probs.
        weights = np.exp(target_scores / self.temp)
        select_probs = weights / np.sum(weights)
        if np.argmax(pop_preds[best, :]) == target:
            print("Finished genetic attack in {} iterations".format(generation))
            return pop[best]
        elite = [pop[best]]  # best member survives unchanged
        parent1_idx = np.random.choice(self.pop_size,
                                       size=self.pop_size - 1, p=select_probs)
        parent2_idx = np.random.choice(self.pop_size,
                                       size=self.pop_size - 1, p=select_probs)
        offspring = [
            self.crossover(pop[parent1_idx[k]], pop[parent2_idx[k]])
            for k in range(self.pop_size - 1)
        ]
        offspring = [
            self.perturb(child, x_orig, candidate_words, candidate_dists,
                         w_select_probs, target)
            for child in offspring
        ]
        pop = elite + offspring
    return None
def perturb(self, x_cur, pos, x_orig, target):
    """Replace the word at `pos` in `x_cur` with its best-scoring neighbour.

    Collects up to 60 nearest words for the current word and delegates
    the choice to `select_best_replacement`.
    """
    x_len = np.sum(np.sign(x_cur))
    if pos % 50 == 0:
        # Lightweight progress indicator for long documents.
        print(' --- {} / {} '.format(pos, x_len))
    assert pos < x_len, "invalid position"
    current_word = x_cur[pos]
    candidates, _ = glove_utils.pick_most_similar_words(
        current_word, self.dist_mat, 60)
    # Id 0 marks "no neighbour" padding: fall back to the word itself.
    candidates = [current_word if c == 0 else c for c in candidates]
    return self.select_best_replacement(pos, x_cur, x_orig, target, candidates)
def attack(self, x_orig, target, max_change=0.4):
    """Greedy word-substitution attack.

    Repeatedly replaces one word of the current adversarial text with a
    GloVe neighbour, committing each round to the single candidate that
    maximises the model's probability of class `target`.

    Args:
        x_orig: 1-D array of word ids for the original document.
        target: index of the class the attack tries to reach.
        max_change: maximum fraction of words allowed to change.

    Returns:
        The adversarial word-id array on success, otherwise None.
    """
    x_adv = x_orig.copy()
    doc_len = np.sum(np.sign(x_orig))
    num_updates = 0
    while ((num_updates / doc_len) < max_change):
        # pick some word
        W = []  # Set of candidate updates: (position, replacement id).
        list_x_new = []
        for i, x in enumerate(x_adv):  # for each word in x_adv
            if x != self.dataset.dict["UNK"]:  # skip the UNK
                x_list, _ = glove_utils.pick_most_similar_words(
                    x, self.dist_mat)
                # TODO(malzantot) Score words in x_ based on the language model
                for j in range(len(x_list)):
                    # Take the first neighbour that differs from the original word.
                    if x_list[j] != x_orig[i]:
                        # BUGFIX: record the word actually substituted
                        # (x_list[j]); previously this always logged x_list[0].
                        W.append((i, x_list[j]))
                        x_new = x_adv.copy()
                        x_new[i] = x_list[j]
                        list_x_new.append(x_new)
                        break
        if not list_x_new:
            # No replaceable word produced a candidate: the search is stuck.
            # (Previously this crashed on np.argmax over an empty array.)
            return None
        x_new_pred_probs = np.array([
            self.model.predict(self.sess, x[np.newaxis, :])[0]
            for x in list_x_new
        ])
        x_new_preds = np.argmax(x_new_pred_probs, axis=1)
        x_new_scores = x_new_pred_probs[:, target]
        # Greedily commit to the candidate with the highest target score.
        top_attack = np.argsort(x_new_scores)[-1]
        x_adv = list_x_new[top_attack]
        num_updates += 1
        if x_new_preds[top_attack] == target:
            return x_adv
    return None
def attack(self, x_orig, target):
    """Genetic attack for a two-input model (e.g. premise/hypothesis pairs).

    Only the second sequence (x_orig[1]) is perturbed; the first sequence
    is kept fixed and tiled across the whole population.

    Returns (x1_orig, adversarial x2) on success, or None after
    `self.max_iters` generations.
    """
    x1_adv = x_orig[0].copy().ravel()
    x2_adv = x_orig[1].copy().ravel()
    x1_orig = x_orig[0].ravel()
    x2_orig = x_orig[1].ravel()
    x1_len = np.sum(np.sign(x1_adv))
    x2_len = np.sum(np.sign(x2_adv))
    # First neighbour scan (up to 50 per word) is only used to derive the
    # per-position sampling probabilities; id 0 is padding and gets none.
    tmp = [
        glove_utils.pick_most_similar_words(x2_adv[i], self.dist_mat, 50, 0.5)
        if x2_adv[i] != 0 else ([], []) for i in range(len(x2_adv))
    ]
    neighbours_list = [x[0] for x in tmp]
    neighbours_dist = [x[1] for x in tmp]
    neigbhours_len = [len(x) for x in neighbours_list]
    # Positions with more close neighbours are sampled more often.
    w_select_probs = neigbhours_len / np.sum(neigbhours_len)
    # Second scan keeps only the n1 candidates actually offered to the search.
    tmp = [
        glove_utils.pick_most_similar_words(
            x2_adv[i], self.dist_mat, self.n1, 0.5)
        if x2_adv[i] != 0 else ([], []) for i in range(len(x2_adv))
    ]
    neighbours_list = [x[0] for x in tmp]
    neighbours_dist = [x[1] for x in tmp]
    pop = np.array(
        self.generate_population(x2_adv, neighbours_list, w_select_probs,
                                 target, self.pop_size))
    pop = pop.reshape(self.pop_size, -1)
    # print(pop)
    # The fixed first input, repeated once per population member.
    pop_x1 = np.tile(x1_adv, (self.pop_size, 1, 1)).reshape(self.pop_size, -1)
    for iter_idx in range(self.max_iters):
        pop_preds = self.model.predict([pop_x1, pop])
        pop_scores = pop_preds[:, target]
        pop_ranks = np.argsort(pop_scores)[::-1]
        top_attack = pop_ranks[0]
        if np.argmax(pop_preds[top_attack, :]) == target:
            return x1_orig, pop[top_attack]
        print(iter_idx, ' : ', np.max(pop_scores))
        # Temperature-scaled softmax over target scores -> parent sampling probs.
        logits = np.exp(pop_scores / self.temp)
        pop_select_probs = logits / np.sum(logits)
        elite = [pop[top_attack]]  # best member survives unchanged
        parent1_idx = np.random.choice(self.pop_size,
                                       size=self.pop_size - 1,
                                       p=pop_select_probs)
        parent2_idx = np.random.choice(self.pop_size,
                                       size=self.pop_size - 1,
                                       p=pop_select_probs)
        childs = [
            self.crossover(pop[parent1_idx[i]], pop[parent2_idx[i]])
            for i in range(self.pop_size - 1)
        ]
        childs = [
            self.perturb(x, x2_orig, neighbours_list, w_select_probs, target)
            for x in childs
        ]
        pop = elite + childs
        pop = np.array(pop)
    return None
def attack(self, seq, target, l, max_change=0.4):
    """Genetic attack, PyTorch variant.

    `seq` is a tensor of word ids; `l` is its (tensor) length. Evolves a
    population of perturbed copies until the batch model predicts
    `target` for the best member, or `self.max_iters` generations pass.

    Returns the adversarial sequence on success, otherwise None.
    """
    seq = seq.numpy().squeeze()
    seq_adv = seq.copy()
    seq_len = np.sum(np.sign(seq))
    l = l.cpu()
    # To calculate the sampling probability: positions with more close
    # neighbours are picked for substitution more often.
    tmp = [
        glove_utils.pick_most_similar_words(self.compute_dist(seq[i]),
                                            ret_count=50, threshold=0.5)
        for i in range(l)
    ]
    neighbour_list = [t[0] for t in tmp]
    neighbour_dist = [t[1] for t in tmp]
    neighbour_len = [len(i) for i in neighbour_list]
    for i in range(seq_len):
        # Word ids below 27 are highly frequent words ('the', 'a', ...):
        # never replace them.
        if seq[i] < 27:
            neighbour_len[i] = 0
    prob_select = neighbour_len / np.sum(neighbour_len)
    # Second scan keeps only the top_n1 candidates offered to the search.
    tmp = [
        glove_utils.pick_most_similar_words(self.compute_dist(seq[i]),
                                            self.top_n1, 0.5)
        for i in range(l)
    ]
    neighbour_list = [t[0] for t in tmp]
    neighbour_dist = [t[1] for t in tmp]
    # Initial population: independent single-word perturbations of seq.
    pop = [
        self.perturb(seq_adv, seq, neighbour_list, neighbour_dist,
                     prob_select, seq_len, target, l)
        for _ in range(self.pop_size)
    ]
    l_tensor = l * torch.ones([len(pop)])
    # Stack the population into one (pop_size, seq_len) array.
    pop_np = np.expand_dims(pop[0], 0)
    for p in pop[1:]:
        pop_np = np.concatenate((pop_np, np.expand_dims(p, 0)), 0)
    for i in range(self.max_iters):
        pop_tensor = torch.tensor(pop_np).type(torch.LongTensor).to(
            self.device)
        l_tensor = l_tensor.to(self.device)
        self.batch_model.eval()
        with torch.no_grad():
            pop_preds = self.batch_model.pred(
                pop_tensor, l_tensor)[1].cpu().detach().numpy()
        pop_scores = pop_preds[:, target]
        print('\t\t', i, ' -- ', np.max(pop_scores))
        pop_ranks = np.argsort(pop_scores)[::-1]
        top_attack = pop_ranks[0]
        # Temperature-scaled softmax over target scores -> parent probs.
        logits = np.exp(pop_scores / self.temp)
        select_probs = logits / np.sum(logits)
        if np.argmax(pop_preds[top_attack, :]) == target:
            print('Success and score: {:.4f}'.format(
                pop_scores[top_attack]))
            return pop[top_attack]
        elite = [pop[top_attack]]  # elite: best member survives unchanged
        # print(select_probs.shape)
        parent1_idx = np.random.choice(self.pop_size,
                                       size=self.pop_size - 1,
                                       p=select_probs)
        parent2_idx = np.random.choice(self.pop_size,
                                       size=self.pop_size - 1,
                                       p=select_probs)
        childs = [
            self.crossover(pop[parent1_idx[i]], pop[parent2_idx[i]])
            for i in range(self.pop_size - 1)
        ]
        childs = [
            self.perturb(x, seq, neighbour_list, neighbour_dist,
                         prob_select, seq_len, target, l)
            for x in childs
        ]
        pop = elite + childs
        # Re-stack the new generation for the next model call.
        pop_np = np.expand_dims(pop[0], 0)
        for p in pop[1:]:
            pop_np = np.concatenate((pop_np, np.expand_dims(p, 0)), 0)
    return None
def attack(self, x_orig, target, max_change=0.4):
    """Genetic attack, PyTorch variant with variance-normalized selection.

    Like the plain genetic attack, but the parent-selection probabilities
    are standardized (mean/variance) before the softmax when the scores
    have non-negligible variance, to avoid overflow/degenerate sampling.

    Returns the adversarial sequence on success, otherwise None.
    """
    x_orig = x_orig.numpy().squeeze()
    x_adv = x_orig.copy()
    x_len = np.sum(np.sign(x_orig))
    # Neigbhours for every word. This first scan (up to 50 per word) only
    # feeds the per-position sampling probabilities.
    tmp = [
        glove_utils.pick_most_similar_words(x_orig[i], self.dist, 50, 0.5)
        for i in range(x_len)
    ]
    neigbhours_list = [x[0] for x in tmp]
    neighbours_dist = [x[1] for x in tmp]
    neighbours_len = [len(x) for x in neigbhours_list]
    for i in range(x_len):
        if (x_adv[i] < 27):
            # To prevent replacement of words like 'the', 'a', 'of', etc.
            neighbours_len[i] = 0
    w_select_probs = neighbours_len / np.sum(neighbours_len)
    # Second scan keeps only the top_n candidates offered to the search.
    tmp = [
        glove_utils.pick_most_similar_words(x_orig[i], self.dist,
                                            self.top_n, 0.5)
        for i in range(x_len)
    ]
    neigbhours_list = [x[0] for x in tmp]
    neighbours_dist = [x[1] for x in tmp]
    pop = self.generate_population(x_orig, neigbhours_list, neighbours_dist,
                                   w_select_probs, target, self.pop_size)
    for i in range(self.max_iters):
        # print(i)
        # All members share the same (original) length.
        l_tensor = x_len * torch.ones([len(pop)])
        # Stack the population into one (pop_size, seq_len) array.
        pop_np = np.expand_dims(pop[0], 0)
        for p in pop[1:]:
            pop_np = np.concatenate((pop_np, np.expand_dims(p, 0)), 0)
        pop_tensor = torch.tensor(pop_np).type(torch.LongTensor).to(
            self.device)
        l_tensor = l_tensor.to(self.device)
        self.batch_model.eval()
        with torch.no_grad():
            pop_preds = self.batch_model.pred(
                pop_tensor, l_tensor, False)[1].cpu().detach().numpy()
        # pop_preds = self.batch_model.predict(self.sess, np.array(pop))
        pop_scores = pop_preds[:, target]
        print('\t\t', i, ' -- ', np.max(pop_scores))
        pop_ranks = np.argsort(pop_scores)[::-1]
        top_attack = pop_ranks[0]
        ampl = pop_scores / self.temp
        # print(ampl)
        # np.cov of a 1-D vector is its (scalar) variance.
        covariance = np.cov(ampl)
        # print(covariance)
        if covariance > 10e-6:
            # Standardize before exponentiating to keep the softmax stable.
            mean = np.mean(ampl)
            # print(mean)
            ampl_update = (ampl - mean) / np.sqrt(covariance + 0.001)
            # print(ampl_update)
            logits = np.exp(ampl_update)
        else:
            # Near-constant scores: just rescale if they would overflow exp.
            if np.max(ampl) > 100:
                ampl = ampl / (np.max(ampl) / 5)
            logits = np.exp(ampl)
        # logits = np.exp(ampl)
        select_probs = logits / np.sum(logits)
        if np.argmax(pop_preds[top_attack, :]) == target:
            return pop[top_attack]
        elite = [pop[top_attack]]  # elite: best member survives unchanged
        # print(select_probs.shape)
        parent1_idx = np.random.choice(self.pop_size,
                                       size=self.pop_size - 1,
                                       p=select_probs)
        parent2_idx = np.random.choice(self.pop_size,
                                       size=self.pop_size - 1,
                                       p=select_probs)
        childs = [
            self.crossover(pop[parent1_idx[i]], pop[parent2_idx[i]])
            for i in range(self.pop_size - 1)
        ]
        childs = [
            self.perturb(x, x_orig, neigbhours_list, neighbours_dist,
                         w_select_probs, target)
            for x in childs
        ]
        pop = elite + childs
    return None
Author: Moustafa Alzantot ([email protected])
All rights reserved.
"""
import numpy as np
import tensorflow as tf

import glove_utils
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_VOCAB_SIZE = 50000
# NOTE(review): embeddings appear to be stored column-wise (one word per
# column), given the axis-0 reduction below — confirm against the build step.
embedding_matrix = np.load(
    ('aux_files/embeddings_counter_%d.npy' % (MAX_VOCAB_SIZE)))
missed = np.load(
    ('aux_files/missed_embeddings_counter_%d.npy' % (MAX_VOCAB_SIZE)))
# Pairwise squared Euclidean distances between all embedding columns via
# ||u - v||^2 = ||u||^2 + ||v||^2 - 2 * u.v, computed with one matmul.
c_ = -2 * np.dot(embedding_matrix.T, embedding_matrix)
a = np.sum(np.square(embedding_matrix), axis=0).reshape((1, -1))
b = a.T
dist = a + b + c_
np.save(('aux_files/dist_counter_%d.npy' % (MAX_VOCAB_SIZE)), dist)
# Try an example: sanity-check the matrix by listing neighbours of 'good'.
with open('aux_files/dataset_%d.pkl' % MAX_VOCAB_SIZE, 'rb') as f:
    dataset = pickle.load(f)
src_word = dataset.dict['good']
neighbours, neighbours_dist = glove_utils.pick_most_similar_words(
    src_word, dist)
print('Closest words to `good` are :')
result_words = [dataset.inv_dict[x] for x in neighbours]
print(result_words)
def attack(self, seq, target, l, max_change=0.5):
    """Genetic attack against a (BERT-style) subword-tokenized model.

    The population is kept as word strings; before every model call each
    member is re-tokenized, wrapped in [CLS]/[SEP], padded to the batch
    maximum, and the batch is sorted by descending length (presumably for
    a packed/recurrent consumer — TODO confirm against batch_model.pred).

    Returns (adversarial word list, original word list) on success,
    otherwise (None, seq_orig).
    """
    seq = seq.cpu().detach().numpy().squeeze(
    )  #'''label of change; convert'''
    # Recover the original word-id sequence, its word strings, and length.
    seq_orig, seq_orig_string, l_orig = self.orig_sentence(seq)
    l = l.cpu()
    # To calculate the sampling probability: positions with more close
    # neighbours are picked for substitution more often.
    tmp = [
        glove_utils.pick_most_similar_words(self.compute_dist(seq_orig[i]),
                                            50, 0.5) for i in range(l_orig)
    ]
    neighbour_list = [t[0] for t in tmp]
    neighbour_dist = [t[1] for t in tmp]
    neighbour_len = [len(i) for i in neighbour_list]
    for i in range(l_orig):
        if (seq_orig[i] < 27):
            # To prevent replacement of words like 'the', 'a', 'of', etc.
            neighbour_len[i] = 0
    prob_select = neighbour_len / np.sum(neighbour_len)
    # Second scan keeps only the top_n1 candidates offered to the search.
    tmp = [
        glove_utils.pick_most_similar_words(self.compute_dist(seq_orig[i]),
                                            self.top_n1, 0.5)
        for i in range(l_orig)
    ]
    neighbour_list = [t[0] for t in tmp]
    neighbour_dist = [t[1] for t in tmp]
    seq_adv = seq_orig_string.copy()
    # Initial population: independent single-word perturbations.
    pop = [
        self.perturb(seq_adv, seq_orig_string, l_orig, neighbour_list,
                     neighbour_dist, prob_select, target, l)
        for _ in range(self.pop_size)
    ]
    # Tokenize every member: [CLS] + subword ids + [SEP]; record lengths.
    l_tensor = torch.ones([len(pop)]).type(torch.LongTensor)
    pop_np = [[self.tokenizer.cls_token_id] +
              self.tokenizer.convert_tokens_to_ids(
                  self.tokenizer.tokenize(' '.join(pop[0]).strip())) +
              [self.tokenizer.sep_token_id]]
    l_tensor[0] = len(pop_np[0])
    for p in range(1, len(pop)):
        token_ids = [
            self.tokenizer.cls_token_id
        ] + self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.tokenize(' '.join(
                pop[p]).strip())) + [self.tokenizer.sep_token_id]
        pop_np.append(token_ids)
        l_tensor[p] = len(token_ids)
    l_max = torch.max(l_tensor)
    # Pad to the longest member, then sort the batch by descending length.
    pop_np = pad_sequences(pop_np, maxlen=l_max.item(), padding='post')
    pop_tensor = torch.tensor(pop_np)
    sort = torch.sort(l_tensor, descending=True)[1]
    pop_tensor = pop_tensor[sort]
    l_tensor = l_tensor[sort]
    # Keep pop in the same order as the tensors so indices line up.
    pop = np.array(pop)[sort].tolist()
    for i in range(self.max_iters):
        pop_tensor = pop_tensor.type(torch.LongTensor).to(self.device)
        l_tensor = l_tensor.to(self.device)
        self.batch_model.eval()
        with torch.no_grad():
            pop_preds = self.batch_model.pred(
                pop_tensor, l_tensor, False)[1].cpu().detach().numpy()
        pop_scores = pop_preds[:, target]
        print('\t\t', i, ' -- ', np.max(pop_scores))
        pop_ranks = np.argsort(pop_scores)[::-1]
        top_attack = pop_ranks[0]
        ampl = pop_scores / self.temp
        # np.cov of a 1-D vector is its (scalar) variance.
        covariance = np.cov(ampl)
        print(covariance)
        if covariance > 10e-6:
            # Standardize before exponentiating to keep the softmax stable.
            mean = np.mean(ampl)
            ampl_update = (ampl - mean) / np.sqrt(covariance + 0.001)
            logits = np.exp(ampl_update)
        else:
            # Near-constant scores: just rescale if they would overflow exp.
            if np.max(ampl) > 100:
                ampl = ampl / (np.max(ampl) / 5)
            logits = np.exp(ampl)
        select_probs = logits / np.sum(logits)
        # print('prob:', select_probs)
        # print([self.tokenizer.convert_ids_to_tokens([i]) for i in pop_np[top_attack]])
        if np.argmax(pop_preds[top_attack, :]) == target:
            print('Success and score: {:.4f}'.format(
                pop_scores[top_attack]))
            print(seq_orig_string)
            print(pop[top_attack])
            return pop[top_attack], seq_orig_string
        elite = [pop[top_attack]]  # elite: best member survives unchanged
        parent1_idx = np.random.choice(self.pop_size,
                                       size=self.pop_size - 1,
                                       p=select_probs)
        parent2_idx = np.random.choice(self.pop_size,
                                       size=self.pop_size - 1,
                                       p=select_probs)
        childs = [
            self.crossover(pop[parent1_idx[i]], pop[parent2_idx[i]])
            for i in range(self.pop_size - 1)
        ]
        childs = [
            self.perturb(x, seq_orig_string, l_orig, neighbour_list,
                         neighbour_dist, prob_select, target, l)
            for x in childs
        ]
        pop = elite + childs
        # Re-tokenize, re-pad, and re-sort the new generation, exactly as
        # was done for the initial population above.
        l_tensor = torch.ones([len(pop)]).type(torch.LongTensor)
        pop_np = [[self.tokenizer.cls_token_id] +
                  self.tokenizer.convert_tokens_to_ids(
                      self.tokenizer.tokenize(' '.join(pop[0]).strip())) +
                  [self.tokenizer.sep_token_id]]
        l_tensor[0] = len(pop_np[0])
        for p in range(1, len(pop)):
            token_ids = [
                self.tokenizer.cls_token_id
            ] + self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(' '.join(
                    pop[p]).strip())) + [self.tokenizer.sep_token_id]
            pop_np.append(token_ids)
            l_tensor[p] = len(token_ids)
        l_max = torch.max(l_tensor)
        pop_np = pad_sequences(pop_np, maxlen=l_max.item(), padding='post')
        pop_tensor = torch.tensor(pop_np)
        sort = torch.sort(l_tensor, descending=True)[1]
        pop_tensor = pop_tensor[sort]
        l_tensor = l_tensor[sort]
        pop = np.array(pop)[sort].tolist()
    return None, seq_orig