def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
    """Split the amount budget and run the inner attacker once per sub-amount.

    The total budget is divided into ``num_tokens_to_add`` amounts; each
    iteration re-attacks the adversarial data produced by the previous one,
    then the final result is scored against the classifier.
    """
    original_inputs = data_to_tensors(data_to_attack, self.reader, self.vocab, self.device)
    true_label_idx = self.label_to_index(data_to_attack.label)
    orig_prob = self.get_clf_probs(original_inputs)[true_label_idx].item()

    adv_data = deepcopy(data_to_attack)
    for amount in generate_transaction_amounts(self.total_amount, self.num_tokens_to_add):
        self.attacker.total_amount = amount
        step_output = self.attacker.attack(adv_data)
        # Rebuild a TransactionsData from the inner attacker's result; the
        # label is then replaced by the inner attacker's predicted label.
        adv_dict = step_output.to_dict()["adversarial_data"]
        adv_dict['label'] = data_to_attack.label
        adv_data = TransactionsData(**adv_dict)
        adv_data.label = step_output.adversarial_data['label']

    adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)
    adv_prob = self.get_clf_probs(adv_inputs)[true_label_idx].item()
    return AttackerOutput(
        data=data_to_attack.to_dict(),
        adversarial_data=adv_data.to_dict(),
        probability=orig_prob,
        adversarial_probability=adv_prob,
        prob_diff=(orig_prob - adv_prob),
        wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
    )
def data_to_tensors(
    data: TransactionsData,
    reader: DatasetReader,
    vocab: Vocabulary,
    device: Union[torch.device, int] = -1,
) -> ModelsInput:
    """Turn one TransactionsData sample into indexed model tensors on `device`."""
    batch = Batch([reader.text_to_instance(**data.to_dict())])
    batch.index_instances(vocab)
    return move_to_device(batch.as_tensor_dict(), device)
def main(
    config_path: str,
    samples: int = typer.Option(None, help="Number of samples"),
    device: int = typer.Option(-1, help="Device for the attacker (-1 = CPU)"),
):
    """Attack every sample of the configured dataset and dump results as jsonlines.

    Loads the attacker and data/output paths from the config at ``config_path``,
    attacks up to ``samples`` examples and writes one AttackerOutput dict per line.
    """
    params = Params.from_file(config_path)
    # Was hard-coded to -1 with an "enable for testing" comment left in; now a
    # proper CLI option whose default (-1) preserves the old CPU-only behaviour.
    params['attacker']['device'] = device
    attacker = Attacker.from_params(params["attacker"])
    data = load_jsonlines(params["data_path"])[:samples]
    output_path = params["output_path"]
    typer.secho(f"Saving results to {output_path} ...", fg="green")
    with jsonlines.open(output_path, "w") as writer:
        for el in tqdm(data):
            adversarial_output = attacker.attack(TransactionsData(**el))
            writer.write(adversarial_output.to_dict())
def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
    """Greedy random-position token-flip attack.

    For each randomly chosen position, every vocabulary token is tried and the
    one minimizing the true-label probability is kept (if it lowers it at all).

    Fixes:
    - the adversarial inputs were previously built from ``adv_data`` BEFORE
      the winning token was written into it, so the adversarial label was
      predicted on the pre-flip sequence;
    - the old ``# TODO: empty outputs``: if no flip ever drops the
      probability, a no-op output is returned instead of handing an empty
      list to ``find_best_attack``.
    """
    orig_prob = self.get_probability_of_data(data_to_attack)
    adv_data = deepcopy(data_to_attack)
    num_steps = self._num_steps or len(data_to_attack)
    indexes_to_flip = np.random.randint(0, len(data_to_attack), size=num_steps)

    outputs = []
    for index_to_flip in indexes_to_flip:
        # score every candidate replacement token at this position
        probabilities = {}
        for idx, token in self.vocab.get_index_to_token_vocabulary(namespace="transactions").items():
            curr_adv_data = deepcopy(adv_data)
            curr_adv_data.transactions[index_to_flip] = token
            probabilities[token] = self.get_probability_of_data(curr_adv_data)

        # token giving the LOWEST true-label probability is the best flip
        best_token, adv_prob = min(probabilities.items(), key=lambda kv: kv[1])
        prob_drop = orig_prob - adv_prob
        if prob_drop > 0.0:
            # BUG FIX: apply the flip BEFORE building tensors, so the
            # adversarial label is predicted on the flipped sequence.
            adv_data.transactions[index_to_flip] = best_token
            adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)
            adv_data.label = self.probs_to_label(self.get_clf_probs(adv_inputs))
            outputs.append(
                AttackerOutput(
                    data=data_to_attack.to_dict(),
                    adversarial_data=adv_data.to_dict(),
                    probability=orig_prob,
                    adversarial_probability=adv_prob,
                    prob_diff=prob_drop,
                    wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
                )
            )

    if not outputs:
        # no flip reduced the probability: return the unchanged sample
        outputs.append(
            AttackerOutput(
                data=data_to_attack.to_dict(),
                adversarial_data=data_to_attack.to_dict(),
                probability=orig_prob,
                adversarial_probability=orig_prob,
                prob_diff=0.0,
                wer=word_error_rate_on_sequences(data_to_attack.transactions, data_to_attack.transactions),
            )
        )
    return self.find_best_attack(outputs)
def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
    """Sample whole adversarial sequences from the language model.

    Draws ``num_samples`` candidate transaction sequences from the
    temperature-scaled LM distribution (special tokens masked to zero mass)
    and scores each against the classifier, returning the best attack.
    """
    original_inputs = data_to_tensors(data_to_attack, self.reader, self.vocab, self.device)
    true_label_idx = self.label_to_index(data_to_attack.label)
    orig_prob = self.get_clf_probs(original_inputs)[true_label_idx].item()

    # temperature-scaled LM distribution; special tokens are never sampled
    scaled_logits = self.get_lm_logits(original_inputs) / self.temperature
    token_probs = torch.softmax(scaled_logits, dim=-1)
    token_probs[:, :, self.special_indexes] = 0.0
    sampled_indexes = Categorical(probs=token_probs[0]).sample((self.num_samples, ))
    candidate_sequences = [decode_indexes(idx, self.vocab) for idx in sampled_indexes]

    adv_data = deepcopy(data_to_attack)
    outputs = []
    for candidate in candidate_sequences:
        adv_data.transactions = candidate
        adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)
        adv_probs = self.get_clf_probs(adv_inputs)
        adv_data.label = self.probs_to_label(adv_probs)
        adv_prob = adv_probs[true_label_idx].item()
        outputs.append(
            AttackerOutput(
                data=data_to_attack.to_dict(),
                adversarial_data=adv_data.to_dict(),
                probability=orig_prob,
                adversarial_probability=adv_prob,
                prob_diff=(orig_prob - adv_prob),
                wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
            )
        )

    # history is intentionally not recorded for this attacker
    return self.find_best_attack(outputs)
def test_from_params(self, config_path, clf_path):
    """Smoke test: an attacker builds from config and yields a sane output."""
    test_data_by_dataset = {
        "age": self.age_test_data,
        "gender": self.gender_test_data,
    }
    dataset = clf_path.parent.parent.parent.name
    if dataset not in test_data_by_dataset:
        raise NotImplementedError
    data = test_data_by_dataset[dataset]

    ext_vars = {
        "DATA_PATH": "",
        "OUTPUT_PATH": "",
        "CLF_PATH": str(clf_path),
        "MASKED_LM_PATH": str(clf_path.parent.parent / "lm/bert_with_amounts.tar.gz"),
    }
    try:
        params = Params.from_file(str(config_path), ext_vars=ext_vars)
        params["attacker"]["device"] = -1
        attacker = advsber.Attacker.from_params(params["attacker"])
    except Exception as e:
        raise AssertionError(f"unable to load params from {config_path}, because {e}")

    output = attacker.attack(TransactionsData(**data[0]))
    assert isinstance(output, advsber.AttackerOutput)
    assert isinstance(output.wer, int)
    assert output.wer >= 0
    assert isinstance(output.prob_diff, float)
    assert abs(output.prob_diff) <= 1.0
    assert isinstance(output.probability, float)
    assert output.probability >= 0.0
    assert isinstance(output.adversarial_probability, float)
    assert output.adversarial_probability >= 0.0
def __call__(
    self,
    trainer: GradientDescentTrainer,
    batch_inputs: List[List[TensorDict]],
    batch_outputs: List[Dict[str, Any]],
    epoch: int,
    batch_number: int,
    is_training: bool,
    is_master: bool,
) -> None:
    """Adversarial-training hook: attack each training batch and take an
    extra optimizer step on the adversarial examples.

    Fixes: the result of ``trainer.batch_outputs`` no longer shadows the
    ``batch_outputs`` parameter, and the unused ``reg_loss`` read was removed.
    """
    if not is_training:
        return
    attacker = Attacker(classifier=trainer.model, reader=self.reader, device=-1)
    for batch in batch_inputs:
        instances = []
        for element in batch:
            data = TransactionsData.from_tensors(inputs=element, vocab=trainer.model.vocab)
            adv_data = attacker.attack(data)
            # NOTE(review): sibling attackers return an AttackerOutput; the
            # ** unpacking below assumes a mapping — confirm this Attacker's
            # attack() return type actually supports it.
            instances.append(self.reader.text_to_instance(**adv_data))
        adv_batch = Batch(instances)
        adv_batch.index_instances(vocab=trainer.model.vocab)
        adv_tensors = adv_batch.as_tensor_dict()
        adv_outputs = trainer.batch_outputs(adv_tensors, for_training=True)
        loss = adv_outputs.get("loss")
        loss.backward()
        trainer.optimizer.step()
        trainer.optimizer.zero_grad()
def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
    """FGSM-style attack in transaction-embedding space.

    At each step a random position's embedding is nudged along the loss
    gradient sign, then snapped to the nearest vocabulary embedding whose
    decoded sequence passes the LM plausibility check (loss < ``lm_threshold``).

    BUG FIX: the LM check previously decoded the UNMODIFIED index sequence
    (``adversarial_idexes``) instead of the candidate one with the swap
    applied, so the plausibility filter never actually saw the proposed change.
    """
    # get inputs to the model
    inputs = data_to_tensors(data_to_attack, reader=self.reader, vocab=self.vocab, device=self.device)
    # current (mutable) token-index sequence of the sample
    adversarial_idexes = inputs["transactions"]["tokens"]["tokens"][0]

    # original probability of the true label
    orig_prob = self.get_clf_probs(inputs)[self.label_to_index(data_to_attack.label)].item()

    # get mask and transaction embeddings; detach so only the single chosen
    # embedding tracks gradients
    emb_out = self.classifier.get_transaction_embeddings(transactions=inputs["transactions"])
    embeddings = emb_out["transaction_embeddings"].detach()
    embeddings_splitted = [e for e in embeddings[0]]

    outputs = []
    for step in range(self.num_steps):
        # choose a random position (skipping start/end tokens); only this
        # one embedding may be modified
        random_idx = random.randint(1, max(1, len(data_to_attack.transactions) - 2))
        embeddings_splitted[random_idx].requires_grad = True

        # loss of the classifier on the current embeddings
        loss = self.classifier.forward_on_transaction_embeddings(
            transaction_embeddings=torch.stack(embeddings_splitted, dim=0).unsqueeze(0),
            mask=emb_out["mask"],
            amounts=inputs["amounts"],
            label=inputs["label"],
        )["loss"]
        loss.backward()

        # FGSM step on the chosen embedding
        embeddings_splitted[random_idx] = (
            embeddings_splitted[random_idx]
            + self.epsilon * embeddings_splitted[random_idx].grad.data.sign()
        )
        self.classifier.zero_grad()

        # distances to every vocabulary embedding; special tokens are pushed
        # to "infinity" so they are never chosen
        distances = torch.nn.functional.pairwise_distance(embeddings_splitted[random_idx], self.emb_layer)
        distances[self.special_indexes] = 10 ** 16

        # try candidates from nearest to farthest; accept the first one whose
        # decoded sequence the LM considers plausible
        for idx in distances.argsort(descending=False).tolist():
            embeddings_splitted[random_idx] = self.emb_layer[idx]
            embeddings_splitted = [e.detach() for e in embeddings_splitted]

            candidate_idexes = deepcopy(adversarial_idexes)
            candidate_idexes[random_idx] = idx
            adv_data_lm = deepcopy(data_to_attack)
            # BUG FIX: decode the candidate indexes (with the swap applied),
            # not the unmodified `adversarial_idexes`.
            adv_data_lm.transactions = decode_indexes(candidate_idexes, vocab=self.vocab)
            adv_inputs_lm = data_to_tensors(adv_data_lm, self.reader, self.vocab, self.device)
            if self.lm(**adv_inputs_lm)["loss"] < self.lm_threshold:
                # accept the swap
                adversarial_idexes[random_idx] = idx
                break

        adv_data = deepcopy(data_to_attack)
        adv_data.transactions = decode_indexes(adversarial_idexes, vocab=self.vocab)
        adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)

        # adversarial probability and adversarial label after this step
        adv_probs = self.get_clf_probs(adv_inputs)
        adv_data.label = self.probs_to_label(adv_probs)
        adv_prob = adv_probs[self.label_to_index(data_to_attack.label)].item()
        outputs.append(
            AttackerOutput(
                data=data_to_attack.to_dict(),
                adversarial_data=adv_data.to_dict(),
                probability=orig_prob,
                adversarial_probability=adv_prob,
                prob_diff=(orig_prob - adv_prob),
                wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
            )
        )

    best_output = self.find_best_attack(outputs)
    best_output.history = [output.to_dict() for output in outputs]
    return best_output
def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
    """Concatenation attack: append/prepend LM-sampled tokens with amounts.

    Placeholder tokens and generated amounts are attached at ``self.position``
    (START or END), the LM proposes replacements for the attached region via
    temperature-scaled sampling, and every sampled variant is scored against
    the classifier; the best attack is returned.
    """
    original_inputs = data_to_tensors(data_to_attack, self.reader, self.vocab, self.device)
    true_label_idx = self.label_to_index(data_to_attack.label)
    orig_prob = self.get_clf_probs(original_inputs)[true_label_idx].item()

    # attach placeholder tokens + generated amounts at the chosen position
    adv_data = deepcopy(data_to_attack)
    amounts = generate_transaction_amounts(self.total_amount, self.num_tokens_to_add)
    if self.position == Position.END:
        adv_data.transactions = adv_data.transactions + random.sample(self.all_tokens, self.num_tokens_to_add)
        adv_data.amounts = adv_data.amounts + amounts
    elif self.position == Position.START:
        adv_data.transactions = random.sample(self.all_tokens, self.num_tokens_to_add) + adv_data.transactions
        adv_data.amounts = amounts + adv_data.amounts
    else:
        raise NotImplementedError

    adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)
    # LM logits without the start/end token positions
    lm_logits = self.get_lm_logits(adv_inputs)[:, 1:-1]
    if self.position == Position.END:
        logits_to_sample = lm_logits[:, -self.num_tokens_to_add:][0]
    elif self.position == Position.START:
        logits_to_sample = lm_logits[:, :self.num_tokens_to_add][0]
    else:
        raise NotImplementedError

    # temperature-scaled distribution over the attached region; special
    # tokens get zero mass so they are never drawn
    sampling_probs = torch.softmax(logits_to_sample / self.temperature, dim=-1)
    sampling_probs[:, self.special_indexes] = 0.0
    sampled = Categorical(probs=sampling_probs).sample((self.num_samples, ))

    if self.position == Position.END:
        adversarial_sequences = [
            data_to_attack.transactions + decode_indexes(idx, self.vocab, drop_start_end=False)
            for idx in sampled
        ]
    elif self.position == Position.START:
        adversarial_sequences = [
            decode_indexes(idx, self.vocab, drop_start_end=False) + data_to_attack.transactions
            for idx in sampled
        ]
    else:
        raise NotImplementedError

    # score each sampled variant against the classifier
    outputs = []
    for candidate in adversarial_sequences:
        adv_data.transactions = candidate
        candidate_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)
        candidate_probs = self.get_clf_probs(candidate_inputs)
        adv_data.label = self.probs_to_label(candidate_probs)
        adv_prob = candidate_probs[true_label_idx].item()
        outputs.append(
            AttackerOutput(
                data=data_to_attack.to_dict(),
                adversarial_data=adv_data.to_dict(),
                probability=orig_prob,
                adversarial_probability=adv_prob,
                prob_diff=(orig_prob - adv_prob),
                wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
            )
        )

    # history is intentionally not recorded for this attacker
    return self.find_best_attack(outputs)