    def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
        inputs_to_attack = data_to_tensors(data_to_attack, self.reader,
                                           self.vocab, self.device)
        orig_prob = self.get_clf_probs(inputs_to_attack)[self.label_to_index(
            data_to_attack.label)].item()

        adv_data = deepcopy(data_to_attack)
        amounts = generate_transaction_amounts(self.total_amount,
                                               self.num_tokens_to_add)

        for amount in amounts:
            self.attacker.total_amount = amount
            output = self.attacker.attack(adv_data)
            # rebuild TransactionsData from the adversarial output, keeping
            # the original label while the iterative attack is running
            adv_data = output.to_dict()["adversarial_data"]
            adv_data["label"] = data_to_attack.label
            adv_data = TransactionsData(**adv_data)

        # after the last step, adopt the label predicted for the adversarial data
        adv_data.label = output.adversarial_data["label"]
        adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab,
                                     self.device)
        adv_probs = self.get_clf_probs(adv_inputs)
        adv_prob = adv_probs[self.label_to_index(data_to_attack.label)].item()

        output = AttackerOutput(
            data=data_to_attack.to_dict(),
            adversarial_data=adv_data.to_dict(),
            probability=orig_prob,
            adversarial_probability=adv_prob,
            prob_diff=(orig_prob - adv_prob),
            wer=word_error_rate_on_sequences(data_to_attack.transactions,
                                             adv_data.transactions),
        )
        return output
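
The helper generate_transaction_amounts is defined elsewhere in the repository. A minimal sketch of what it presumably does, assuming it splits total_amount into num_tokens_to_add random positive parts that sum to the total:

import random
from typing import List

def generate_transaction_amounts(total_amount: float,
                                 num_tokens_to_add: int) -> List[float]:
    # hypothetical sketch: draw random weights and rescale them
    # so the generated amounts sum to total_amount
    weights = [random.random() for _ in range(num_tokens_to_add)]
    norm = sum(weights)
    return [total_amount * w / norm for w in weights]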
Example #2
def data_to_tensors(
    data: TransactionsData, reader: DatasetReader, vocab: Vocabulary, device: Union[torch.device, int] = -1,
) -> ModelsInput:

    instances = Batch([reader.text_to_instance(**data.to_dict())])

    instances.index_instances(vocab)
    inputs = instances.as_tensor_dict()
    return move_to_device(inputs, device)
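
A usage sketch for data_to_tensors; the record fields and values are hypothetical, and reader/vocab are assumed to come from a trained model archive:

# hypothetical usage: encode one record and keep it on CPU (device=-1)
data = TransactionsData(transactions=["1023", "4017"],
                        amounts=[250.0, 13.5],
                        label=0)
inputs = data_to_tensors(data, reader=reader, vocab=vocab, device=-1)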
Example #3
def main(config_path: str, samples: int = typer.Option(None, help="Number of samples")):
    params = Params.from_file(config_path)
    # params["attacker"]["device"] = -1  # uncomment to force CPU for testing
    attacker = Attacker.from_params(params["attacker"])

    data = load_jsonlines(params["data_path"])[:samples]

    output_path = params["output_path"]
    typer.secho(f"Saving results to {output_path} ...", fg="green")
    with jsonlines.open(output_path, "w") as writer:
        for el in tqdm(data):
            adversarial_output = attacker.attack(TransactionsData(**el))
            writer.write(adversarial_output.to_dict())
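
main is a Typer command, so the script presumably ends with a standard entry point; the invocation below is a hypothetical example:

if __name__ == "__main__":
    typer.run(main)

# e.g.: python attack.py configs/attacker.jsonnet --samples 100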
Example #4
    def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
        orig_prob = self.get_probability_of_data(data_to_attack)
        adv_data = deepcopy(data_to_attack)

        num_steps = self._num_steps or len(data_to_attack)
        indexes_to_flip = np.random.randint(0,
                                            len(data_to_attack),
                                            size=num_steps)

        outputs = []
        for index_to_flip in indexes_to_flip:
            probabilities = {}

            for idx, token in self.vocab.get_index_to_token_vocabulary(
                    namespace="transactions").items():
                curr_adv_data = deepcopy(adv_data)
                curr_adv_data.transactions[index_to_flip] = token
                curr_prob = self.get_probability_of_data(curr_adv_data)
                probabilities[token] = curr_prob

            # ascending sort: the first item is the token that minimizes
            # the probability of the true label
            probabilities_sorted = sorted(probabilities.items(),
                                          key=lambda x: x[1])
            best_token, adv_prob = probabilities_sorted[0]

            prob_drop = orig_prob - adv_prob
            if prob_drop > 0.0:
                # apply the flip before re-encoding, so the predicted label
                # is computed on the updated sequence
                adv_data.transactions[index_to_flip] = best_token
                adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab,
                                             self.device)
                adv_data.label = self.probs_to_label(
                    self.get_clf_probs(adv_inputs))

                output = AttackerOutput(
                    data=data_to_attack.to_dict(),
                    adversarial_data=adv_data.to_dict(),
                    probability=orig_prob,
                    adversarial_probability=adv_prob,
                    prob_diff=prob_drop,
                    wer=word_error_rate_on_sequences(
                        data_to_attack.transactions, adv_data.transactions),
                )
                outputs.append(output)

        # TODO: handle the case when no flip decreased the probability (outputs is empty)
        best_output = self.find_best_attack(outputs)
        return best_output
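
find_best_attack is inherited from the base attacker and not shown on this page. A plausible sketch, assuming it prefers attacks that flipped the predicted label and, among those, the largest probability drop:

from typing import List

def find_best_attack(self, outputs: List[AttackerOutput]) -> AttackerOutput:
    # hypothetical sketch, not the repository's actual implementation
    flipped = [o for o in outputs
               if o.data["label"] != o.adversarial_data["label"]]
    candidates = flipped or outputs
    return max(candidates, key=lambda o: o.prob_diff)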
Example #5
    def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
        inputs_to_attack = data_to_tensors(data_to_attack, self.reader,
                                           self.vocab, self.device)

        orig_prob = self.get_clf_probs(inputs_to_attack)[self.label_to_index(
            data_to_attack.label)].item()

        logits = self.get_lm_logits(inputs_to_attack)
        logits = logits / self.temperature
        probs = torch.softmax(logits, dim=-1)
        probs[:, :, self.special_indexes] = 0.0
        indexes = Categorical(probs=probs[0]).sample((self.num_samples,))
        adversarial_sequences = [
            decode_indexes(idx, self.vocab) for idx in indexes
        ]

        outputs = []
        adv_data = deepcopy(data_to_attack)
        for adv_sequence in adversarial_sequences:
            adv_data.transactions = adv_sequence
            adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab,
                                         self.device)

            adv_probs = self.get_clf_probs(adv_inputs)
            adv_data.label = self.probs_to_label(adv_probs)
            adv_prob = adv_probs[self.label_to_index(
                data_to_attack.label)].item()

            output = AttackerOutput(
                data=data_to_attack.to_dict(),
                adversarial_data=adv_data.to_dict(),
                probability=orig_prob,
                adversarial_probability=adv_prob,
                prob_diff=(orig_prob - adv_prob),
                wer=word_error_rate_on_sequences(data_to_attack.transactions,
                                                 adv_data.transactions),
            )
            outputs.append(output)

        best_output = self.find_best_attack(outputs)
        # we don't need history here actually
        # best_output.history = [deepcopy(o.__dict__) for o in outputs]
        return best_output
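
decode_indexes maps sampled token ids back to transaction tokens. A sketch built on the AllenNLP Vocabulary API, reusing the "transactions" namespace from Example #4 and the drop_start_end flag seen in the last example below; the implementation details are assumptions:

from typing import List

def decode_indexes(indexes: torch.Tensor, vocab: Vocabulary,
                   drop_start_end: bool = True) -> List[str]:
    # hypothetical sketch: look every id up in the vocabulary
    tokens = [vocab.get_token_from_index(int(idx), namespace="transactions")
              for idx in indexes]
    if drop_start_end:
        tokens = tokens[1:-1]  # strip the start/end markers
    return tokens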
Example #6
    def test_from_params(self, config_path, clf_path):

        dataset = clf_path.parent.parent.parent.name
        if dataset == "age":
            data = self.age_test_data
        elif dataset == "gender":
            data = self.gender_test_data
        else:
            raise NotImplementedError

        try:
            params = Params.from_file(
                str(config_path),
                ext_vars={
                    "DATA_PATH": "",
                    "OUTPUT_PATH": "",
                    "CLF_PATH": str(clf_path),
                    "MASKED_LM_PATH": str(clf_path.parent.parent /
                                          "lm/bert_with_amounts.tar.gz"),
                },
            )
            params["attacker"]["device"] = -1
            attacker = advsber.Attacker.from_params(params["attacker"])
        except Exception as e:
            raise AssertionError(
                f"unable to load params from {config_path}: {e}")

        output = attacker.attack(TransactionsData(**data[0]))
        assert isinstance(output, advsber.AttackerOutput)
        assert isinstance(output.wer, int)
        assert output.wer >= 0
        assert isinstance(output.prob_diff, float)
        assert abs(output.prob_diff) <= 1.0
        assert isinstance(output.probability, float)
        assert output.probability >= 0.0
        assert isinstance(output.adversarial_probability, float)
        assert output.adversarial_probability >= 0.0
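
config_path and clf_path are presumably supplied by pytest parametrization; a hypothetical setup consistent with the path arithmetic above (a presets/<dataset>/models/<name>/model.tar.gz layout):

import pytest
from pathlib import Path

CLF_PATHS = sorted(Path("presets").glob("*/models/*/model.tar.gz"))
CONFIG_PATHS = sorted(Path("configs/attackers").glob("*.jsonnet"))

@pytest.mark.parametrize("clf_path", CLF_PATHS)
@pytest.mark.parametrize("config_path", CONFIG_PATHS)
def test_from_params(config_path, clf_path):
    ...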
Example #7
    def __call__(
        self,
        trainer: GradientDescentTrainer,
        batch_inputs: List[List[TensorDict]],
        batch_outputs: List[Dict[str, Any]],
        epoch: int,
        batch_number: int,
        is_training: bool,
        is_master: bool,
    ) -> None:

        if is_training:

            attacker = Attacker(classifier=trainer.model,
                                reader=self.reader,
                                device=-1)
            for batch in batch_inputs:

                instances = []
                for element in batch:
                    data = TransactionsData.from_tensors(
                        inputs=element, vocab=trainer.model.vocab)
                    adv_output = attacker.attack(data)

                    # attack() returns an AttackerOutput; unpack its
                    # adversarial_data dict into a new training instance
                    instance = self.reader.text_to_instance(
                        **adv_output.adversarial_data)
                    instances.append(instance)

                new_batch = Batch(instances)
                new_batch.index_instances(vocab=trainer.model.vocab)

                new_batch = new_batch.as_tensor_dict()

                outputs = trainer.batch_outputs(new_batch,
                                                for_training=True)
                loss = outputs.get("loss")
                _ = outputs.get("reg_loss")
                loss.backward()
                trainer.optimizer.step()
                trainer.optimizer.zero_grad()
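
The __call__ signature matches the AllenNLP 1.x BatchCallback interface, so this class is presumably registered as one; the class name and registration key below are hypothetical:

from allennlp.training.trainer import BatchCallback  # AllenNLP 1.x

@BatchCallback.register("adversarial_training")
class AdversarialTrainingCallback(BatchCallback):
    # self.reader would be set in __init__; the __call__ above goes here
    ...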
Example #8
    def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
        # get inputs to the model
        inputs = data_to_tensors(data_to_attack, reader=self.reader, vocab=self.vocab, device=self.device)

        adversarial_indexes = inputs["transactions"]["tokens"]["tokens"][0]

        # original probability of the true label
        orig_prob = self.get_clf_probs(inputs)[self.label_to_index(data_to_attack.label)].item()

        # get mask and transaction embeddings
        emb_out = self.classifier.get_transaction_embeddings(transactions=inputs["transactions"])

        # detach the embeddings so gradients flow only through the token we modify
        embeddings = emb_out["transaction_embeddings"].detach()
        embeddings_splitted = [e for e in embeddings[0]]

        outputs = []
        for step in range(self.num_steps):
            # choose random index of embeddings (except for start/end tokens)
            random_idx = random.randint(1, max(1, len(data_to_attack.transactions) - 2))
            # only one embedding can be modified
            embeddings_splitted[random_idx].requires_grad = True

            # calculate the loss for current embeddings
            loss = self.classifier.forward_on_transaction_embeddings(
                transaction_embeddings=torch.stack(embeddings_splitted, dim=0).unsqueeze(0),
                mask=emb_out["mask"],
                amounts=inputs["amounts"],
                label=inputs["label"],
            )["loss"]
            loss.backward()

            # update the chosen embedding
            embeddings_splitted[random_idx] = (
                embeddings_splitted[random_idx] + self.epsilon * embeddings_splitted[random_idx].grad.data.sign()
            )
            self.classifier.zero_grad()

            # find the closest embedding for the modified one
            distances = torch.nn.functional.pairwise_distance(embeddings_splitted[random_idx], self.emb_layer)
            # never pick special tokens as replacements
            distances[self.special_indexes] = 10 ** 16

            # swap embeddings
            closest_idx = distances.argsort(descending=False).tolist()
            for idx in closest_idx:
                embeddings_splitted[random_idx] = self.emb_layer[idx]
                embeddings_splitted = [e.detach() for e in embeddings_splitted]

                adversarial_indexes_lm = deepcopy(adversarial_indexes)
                adversarial_indexes_lm[random_idx] = idx
                adv_data_lm = deepcopy(data_to_attack)
                adv_data_lm.transactions = decode_indexes(adversarial_indexes_lm, vocab=self.vocab)
                adv_inputs_lm = data_to_tensors(adv_data_lm, self.reader, self.vocab, self.device)

                if self.lm(**adv_inputs_lm)["loss"] < self.lm_threshold:
                    # accept the swap and keep the adversarial index
                    adversarial_indexes[random_idx] = idx
                    break

            adv_data = deepcopy(data_to_attack)
            adv_data.transactions = decode_indexes(adversarial_indexes, vocab=self.vocab)

            adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)

            # get adversarial probability and adversarial label
            adv_probs = self.get_clf_probs(adv_inputs)
            adv_data.label = self.probs_to_label(adv_probs)
            adv_prob = adv_probs[self.label_to_index(data_to_attack.label)].item()

            output = AttackerOutput(
                data=data_to_attack.to_dict(),
                adversarial_data=adv_data.to_dict(),
                probability=orig_prob,
                adversarial_probability=adv_prob,
                prob_diff=(orig_prob - adv_prob),
                wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
            )
            outputs.append(output)

        best_output = self.find_best_attack(outputs)
        best_output.history = [output.to_dict() for output in outputs]

        return best_output
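
word_error_rate_on_sequences is used throughout as the distance between the original and adversarial sequences, and the test above requires an int. A minimal sketch, assuming a plain token-level Levenshtein distance:

from typing import List

def word_error_rate_on_sequences(ref: List[str], hyp: List[str]) -> int:
    # hypothetical sketch: dynamic-programming edit distance over tokens
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[len(ref)][len(hyp)]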
Example #9
    def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
        inputs_to_attack = data_to_tensors(data_to_attack, self.reader,
                                           self.vocab, self.device)

        orig_prob = self.get_clf_probs(inputs_to_attack)[self.label_to_index(
            data_to_attack.label)].item()

        adv_data = deepcopy(data_to_attack)
        amounts = generate_transaction_amounts(self.total_amount,
                                               self.num_tokens_to_add)
        if self.position == Position.END:
            adv_data.transactions = adv_data.transactions + random.sample(
                self.all_tokens, self.num_tokens_to_add)
            adv_data.amounts = adv_data.amounts + amounts
        elif self.position == Position.START:
            adv_data.transactions = random.sample(
                self.all_tokens,
                self.num_tokens_to_add) + adv_data.transactions
            adv_data.amounts = amounts + adv_data.amounts
        else:
            raise NotImplementedError

        adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab,
                                     self.device)

        logits = self.get_lm_logits(adv_inputs)
        # drop start and end tokens
        logits = logits[:, 1:-1]

        if self.position == Position.END:
            logits_to_sample = logits[:, -self.num_tokens_to_add:][0]
        elif self.position == Position.START:
            logits_to_sample = logits[:, :self.num_tokens_to_add][0]
        else:
            raise NotImplementedError

        logits_to_sample = logits_to_sample / self.temperature
        probs = torch.softmax(logits_to_sample, dim=-1)
        probs[:, self.special_indexes] = 0.0

        indexes = Categorical(probs=probs).sample((self.num_samples,))

        if self.position == Position.END:
            adversarial_sequences = [
                data_to_attack.transactions +
                decode_indexes(idx, self.vocab, drop_start_end=False)
                for idx in indexes
            ]
        elif self.position == Position.START:
            adversarial_sequences = [
                decode_indexes(idx, self.vocab, drop_start_end=False) +
                data_to_attack.transactions for idx in indexes
            ]
        else:
            raise NotImplementedError

        outputs = []
        for adv_sequence in adversarial_sequences:
            adv_data.transactions = adv_sequence
            adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab,
                                         self.device)

            adv_probs = self.get_clf_probs(adv_inputs)
            adv_label = self.probs_to_label(adv_probs)
            adv_data.label = adv_label

            adv_prob = adv_probs[self.label_to_index(
                data_to_attack.label)].item()

            output = AttackerOutput(
                data=data_to_attack.to_dict(),
                adversarial_data=adv_data.to_dict(),
                probability=orig_prob,
                adversarial_probability=adv_prob,
                prob_diff=(orig_prob - adv_prob),
                wer=word_error_rate_on_sequences(data_to_attack.transactions,
                                                 adv_data.transactions),
            )
            outputs.append(output)

        best_output = self.find_best_attack(outputs)
        # we don't need history here actually
        # best_output.history = [deepcopy(o.__dict__) for o in outputs]
        return best_output
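
Position is the enum that switches where the adversarial tokens are inserted; a hypothetical definition consistent with its use above:

from enum import Enum

class Position(str, Enum):
    START = "start"
    END = "end"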