def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
    inputs_to_attack = data_to_tensors(data_to_attack, self.reader, self.vocab, self.device)
    orig_prob = self.get_clf_probs(inputs_to_attack)[self.label_to_index(data_to_attack.label)].item()

    adv_data = deepcopy(data_to_attack)
    amounts = generate_transaction_amounts(self.total_amount, self.num_tokens_to_add)

    # run the wrapped attacker once per sampled amount, chaining the adversarial data
    for amount in amounts:
        self.attacker.total_amount = amount
        output = self.attacker.attack(adv_data)

        # rebuild TransactionsData from the attacker output, keeping the adversarial label
        adv_data = output.to_dict()["adversarial_data"]
        adv_data["label"] = data_to_attack.label
        adv_data = TransactionsData(**adv_data)
        adv_data.label = output.adversarial_data["label"]

    # evaluate the final adversarial example against the classifier
    adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)
    adv_probs = self.get_clf_probs(adv_inputs)
    adv_prob = adv_probs[self.label_to_index(data_to_attack.label)].item()

    output = AttackerOutput(
        data=data_to_attack.to_dict(),
        adversarial_data=adv_data.to_dict(),
        probability=orig_prob,
        adversarial_probability=adv_prob,
        prob_diff=(orig_prob - adv_prob),
        wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
    )
    return output
def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
    inputs_to_attack = data_to_tensors(data_to_attack, self.reader, self.vocab, self.device)
    orig_prob = self.get_clf_probs(inputs_to_attack)[self.label_to_index(data_to_attack.label)].item()

    # sample candidate sequences from the language model distribution (special tokens masked out)
    logits = self.get_lm_logits(inputs_to_attack)
    logits = logits / self.temperature
    probs = torch.softmax(logits, dim=-1)
    probs[:, :, self.special_indexes] = 0.0
    indexes = Categorical(probs=probs[0]).sample((self.num_samples,))
    adversarial_sequences = [decode_indexes(idx, self.vocab) for idx in indexes]

    outputs = []
    adv_data = deepcopy(data_to_attack)
    for adv_sequence in adversarial_sequences:
        adv_data.transactions = adv_sequence
        adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)

        # classify the candidate and record the drop in the original label's probability
        adv_probs = self.get_clf_probs(adv_inputs)
        adv_data.label = self.probs_to_label(adv_probs)
        adv_prob = adv_probs[self.label_to_index(data_to_attack.label)].item()

        output = AttackerOutput(
            data=data_to_attack.to_dict(),
            adversarial_data=adv_data.to_dict(),
            probability=orig_prob,
            adversarial_probability=adv_prob,
            prob_diff=(orig_prob - adv_prob),
            wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
        )
        outputs.append(output)

    best_output = self.find_best_attack(outputs)
    # we don't need history here actually
    # best_output.history = [deepcopy(o.__dict__) for o in outputs]
    return best_output
def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
    orig_prob = self.get_probability_of_data(data_to_attack)
    adv_data = deepcopy(data_to_attack)

    num_steps = self._num_steps or len(data_to_attack)
    indexes_to_flip = np.random.randint(0, len(data_to_attack), size=num_steps)

    outputs = []
    for index_to_flip in indexes_to_flip:
        # brute-force every vocabulary token at the chosen position
        probabilities = {}
        for idx, token in self.vocab.get_index_to_token_vocabulary(namespace="transactions").items():
            curr_adv_data = deepcopy(adv_data)
            curr_adv_data.transactions[index_to_flip] = token

            curr_prob = self.get_probability_of_data(curr_adv_data)
            probabilities[token] = curr_prob

        # pick the token that minimizes the probability of the original label
        probabilities_sorted = sorted(probabilities.items(), key=lambda x: x[1], reverse=False)
        best_token, adv_prob = probabilities_sorted[0]

        prob_drop = orig_prob - adv_prob
        if prob_drop > 0.0:
            # apply the flip, then re-encode to obtain the adversarial label
            adv_data.transactions[index_to_flip] = best_token
            adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)
            adv_data.label = self.probs_to_label(self.get_clf_probs(adv_inputs))

            output = AttackerOutput(
                data=data_to_attack.to_dict(),
                adversarial_data=adv_data.to_dict(),
                probability=orig_prob,
                adversarial_probability=adv_prob,
                prob_diff=prob_drop,
                wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
            )
            outputs.append(output)

    # TODO: empty outputs
    best_output = self.find_best_attack(outputs)
    return best_output
def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
    # get inputs to the model
    inputs = data_to_tensors(data_to_attack, reader=self.reader, vocab=self.vocab, device=self.device)
    adversarial_indexes = inputs["transactions"]["tokens"]["tokens"][0]

    # original probability of the true label
    orig_prob = self.get_clf_probs(inputs)[self.label_to_index(data_to_attack.label)].item()

    # get mask and transaction embeddings
    emb_out = self.classifier.get_transaction_embeddings(transactions=inputs["transactions"])

    # disable gradients using a trick
    embeddings = emb_out["transaction_embeddings"].detach()
    embeddings_splitted = [e for e in embeddings[0]]

    outputs = []
    for step in range(self.num_steps):
        # choose random index of embeddings (except for start/end tokens)
        random_idx = random.randint(1, max(1, len(data_to_attack.transactions) - 2))

        # only one embedding can be modified
        embeddings_splitted[random_idx].requires_grad = True

        # calculate the loss for current embeddings
        loss = self.classifier.forward_on_transaction_embeddings(
            transaction_embeddings=torch.stack(embeddings_splitted, dim=0).unsqueeze(0),
            mask=emb_out["mask"],
            amounts=inputs["amounts"],
            label=inputs["label"],
        )["loss"]
        loss.backward()

        # update the chosen embedding along the sign of the loss gradient (FGSM-style step)
        embeddings_splitted[random_idx] = (
            embeddings_splitted[random_idx] + self.epsilon * embeddings_splitted[random_idx].grad.data.sign()
        )
        self.classifier.zero_grad()

        # find the closest embedding for the modified one
        distances = torch.nn.functional.pairwise_distance(embeddings_splitted[random_idx], self.emb_layer)
        # we don't choose special tokens
        distances[self.special_indexes] = 10 ** 16

        # swap embeddings, walking from the nearest candidate outwards
        closest_idx = distances.argsort(descending=False).tolist()
        for idx in closest_idx:
            embeddings_splitted[random_idx] = self.emb_layer[idx]
            embeddings_splitted = [e.detach() for e in embeddings_splitted]

            adversarial_indexes_lm = deepcopy(adversarial_indexes)
            adversarial_indexes_lm[random_idx] = idx

            adv_data_lm = deepcopy(data_to_attack)
            adv_data_lm.transactions = decode_indexes(adversarial_indexes_lm, vocab=self.vocab)
            adv_inputs_lm = data_to_tensors(adv_data_lm, self.reader, self.vocab, self.device)

            # accept the candidate only if the language model considers it plausible enough
            if self.lm(**adv_inputs_lm)["loss"] < self.lm_threshold:
                # get adversarial indexes
                adversarial_indexes[random_idx] = idx
                break

        adv_data = deepcopy(data_to_attack)
        adv_data.transactions = decode_indexes(adversarial_indexes, vocab=self.vocab)
        adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)

        # get adversarial probability and adversarial label
        adv_probs = self.get_clf_probs(adv_inputs)
        adv_data.label = self.probs_to_label(adv_probs)
        adv_prob = adv_probs[self.label_to_index(data_to_attack.label)].item()

        output = AttackerOutput(
            data=data_to_attack.to_dict(),
            adversarial_data=adv_data.to_dict(),
            probability=orig_prob,
            adversarial_probability=adv_prob,
            prob_diff=(orig_prob - adv_prob),
            wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
        )
        outputs.append(output)

    best_output = self.find_best_attack(outputs)
    best_output.history = [output.to_dict() for output in outputs]
    return best_output
def get_probability_of_data(self, data: TransactionsData) -> float:
    inputs_to_attack = data_to_tensors(data, self.reader, self.vocab, self.device)
    prob = self.get_clf_probs(inputs_to_attack)[self.label_to_index(data.label)].item()
    return prob
def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
    inputs_to_attack = data_to_tensors(data_to_attack, self.reader, self.vocab, self.device)
    orig_prob = self.get_clf_probs(inputs_to_attack)[self.label_to_index(data_to_attack.label)].item()

    adv_data = deepcopy(data_to_attack)
    amounts = generate_transaction_amounts(self.total_amount, self.num_tokens_to_add)

    # append random placeholder tokens (and their amounts) at the chosen position
    if self.position == Position.END:
        adv_data.transactions = adv_data.transactions + random.sample(self.all_tokens, self.num_tokens_to_add)
        adv_data.amounts = adv_data.amounts + amounts
    elif self.position == Position.START:
        adv_data.transactions = random.sample(self.all_tokens, self.num_tokens_to_add) + adv_data.transactions
        adv_data.amounts = amounts + adv_data.amounts
    else:
        raise NotImplementedError

    adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)

    logits = self.get_lm_logits(adv_inputs)
    # drop start and end tokens
    logits = logits[:, 1:-1]

    if self.position == Position.END:
        logits_to_sample = logits[:, -self.num_tokens_to_add:][0]
    elif self.position == Position.START:
        logits_to_sample = logits[:, :self.num_tokens_to_add][0]
    else:
        raise NotImplementedError

    # sample replacement tokens for the added positions from the language model
    logits_to_sample = logits_to_sample / self.temperature
    probs = torch.softmax(logits_to_sample, dim=-1)
    probs[:, self.special_indexes] = 0.0
    indexes = Categorical(probs=probs).sample((self.num_samples,))

    if self.position == Position.END:
        adversarial_sequences = [
            data_to_attack.transactions + decode_indexes(idx, self.vocab, drop_start_end=False)
            for idx in indexes
        ]
    elif self.position == Position.START:
        adversarial_sequences = [
            decode_indexes(idx, self.vocab, drop_start_end=False) + data_to_attack.transactions
            for idx in indexes
        ]
    else:
        raise NotImplementedError

    outputs = []
    for adv_sequence in adversarial_sequences:
        adv_data.transactions = adv_sequence
        adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)

        # classify the candidate and record the drop in the original label's probability
        adv_probs = self.get_clf_probs(adv_inputs)
        adv_label = self.probs_to_label(adv_probs)
        adv_data.label = adv_label
        adv_prob = adv_probs[self.label_to_index(data_to_attack.label)].item()

        output = AttackerOutput(
            data=data_to_attack.to_dict(),
            adversarial_data=adv_data.to_dict(),
            probability=orig_prob,
            adversarial_probability=adv_prob,
            prob_diff=(orig_prob - adv_prob),
            wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
        )
        outputs.append(output)

    best_output = self.find_best_attack(outputs)
    # we don't need history here actually
    # best_output.history = [deepcopy(o.__dict__) for o in outputs]
    return best_output
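# For reference, a minimal sketch of how one of the attackers above might be driven over a
# dataset and its AttackerOutput fields aggregated. The attacker constructor, the
# `read_transactions` helper, and the output file name are hypothetical placeholders; only
# the .attack() interface and the AttackerOutput fields (prob_diff, wer, ...) come from the
# code above.
import json

import numpy as np

attacker = ...                               # any attacker exposing .attack() as defined above (assumed)
dataset = read_transactions("data.jsonl")    # assumed helper yielding TransactionsData items

results = []
for data in dataset:
    output = attacker.attack(data)           # AttackerOutput, as returned by the methods above
    results.append(output.to_dict())

# aggregate the attack metrics produced by AttackerOutput
print("mean prob drop:", np.mean([r["prob_diff"] for r in results]))
print("mean WER:", np.mean([r["wer"] for r in results]))

with open("attack_outputs.jsonl", "w") as f:
    for r in results:
        f.write(json.dumps(r) + "\n")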