def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
        inputs_to_attack = data_to_tensors(data_to_attack, self.reader,
                                           self.vocab, self.device)
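        # probability of the true label before the attack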
        orig_prob = self.get_clf_probs(inputs_to_attack)[self.label_to_index(
            data_to_attack.label)].item()

        adv_data = deepcopy(data_to_attack)
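        # generate one transaction amount per token to add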
        amounts = generate_transaction_amounts(self.total_amount,
                                               self.num_tokens_to_add)

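        # chain the wrapped attacker, feeding each result into the next step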
        for amount in amounts:
            self.attacker.total_amount = amount
            output = self.attacker.attack(adv_data)
            adv_dict = output.to_dict()["adversarial_data"]
            # carry the ground-truth label through the chained attacks
            adv_dict["label"] = data_to_attack.label
            adv_data = TransactionsData(**adv_dict)

        # take the label predicted for the last adversarial example
        adv_data.label = output.adversarial_data["label"]
        adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab,
                                     self.device)
        adv_probs = self.get_clf_probs(adv_inputs)
        adv_prob = adv_probs[self.label_to_index(data_to_attack.label)].item()

        output = AttackerOutput(
            data=data_to_attack.to_dict(),
            adversarial_data=adv_data.to_dict(),
            probability=orig_prob,
            adversarial_probability=adv_prob,
            prob_diff=(orig_prob - adv_prob),
            wer=word_error_rate_on_sequences(data_to_attack.transactions,
                                             adv_data.transactions),
        )
        return output
Example #2
    def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
        inputs_to_attack = data_to_tensors(data_to_attack, self.reader,
                                           self.vocab, self.device)

        orig_prob = self.get_clf_probs(inputs_to_attack)[self.label_to_index(
            data_to_attack.label)].item()

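        # sample candidate sequences from the language model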
        logits = self.get_lm_logits(inputs_to_attack)
        logits = logits / self.temperature
        probs = torch.softmax(logits, dim=-1)
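        # zero out special-token probabilities so they are never sampled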
        probs[:, :, self.special_indexes] = 0.0
        indexes = Categorical(probs=probs[0]).sample((self.num_samples, ))
        adversarial_sequences = [
            decode_indexes(idx, self.vocab) for idx in indexes
        ]

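        # evaluate each sampled sequence with the classifier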
        outputs = []
        adv_data = deepcopy(data_to_attack)
        for adv_sequence in adversarial_sequences:
            adv_data.transactions = adv_sequence
            adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab,
                                         self.device)

            adv_probs = self.get_clf_probs(adv_inputs)
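            # the adversarial label is the classifier's prediction on the candidate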
            adv_data.label = self.probs_to_label(adv_probs)
            adv_prob = adv_probs[self.label_to_index(
                data_to_attack.label)].item()

            output = AttackerOutput(
                data=data_to_attack.to_dict(),
                adversarial_data=adv_data.to_dict(),
                probability=orig_prob,
                adversarial_probability=adv_prob,
                prob_diff=(orig_prob - adv_prob),
                wer=word_error_rate_on_sequences(data_to_attack.transactions,
                                                 adv_data.transactions),
            )
            outputs.append(output)

        best_output = self.find_best_attack(outputs)
        # we don't need history here actually
        # best_output.history = [deepcopy(o.__dict__) for o in outputs]
        return best_output
Example #3
    def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
        orig_prob = self.get_probability_of_data(data_to_attack)
        adv_data = deepcopy(data_to_attack)

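        # choose one random position to attack at each step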
        num_steps = self._num_steps or len(data_to_attack)
        indexes_to_flip = np.random.randint(0,
                                            len(data_to_attack),
                                            size=num_steps)

        outputs = []
        for index_to_flip in indexes_to_flip:
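            # try every vocabulary token at the chosen position and record
            # the resulting probability of the true label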
            probabilities = {}

            for idx, token in self.vocab.get_index_to_token_vocabulary(
                    namespace="transactions").items():
                curr_adv_data = deepcopy(adv_data)
                curr_adv_data.transactions[index_to_flip] = token
                curr_prob = self.get_probability_of_data(curr_adv_data)
                probabilities[token] = curr_prob

            # the token with the lowest true-label probability gives
            # the largest probability drop
            probabilities_sorted = sorted(probabilities.items(),
                                          key=lambda x: x[1])
            best_token, adv_prob = probabilities_sorted[0]

            prob_drop = orig_prob - adv_prob
            if prob_drop > 0.0:
                # apply the flip before re-encoding so the predicted label
                # is computed on the flipped sequence
                adv_data.transactions[index_to_flip] = best_token
                adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab,
                                             self.device)
                adv_data.label = self.probs_to_label(
                    self.get_clf_probs(adv_inputs))

                output = AttackerOutput(
                    data=data_to_attack.to_dict(),
                    adversarial_data=adv_data.to_dict(),
                    probability=orig_prob,
                    adversarial_probability=adv_prob,
                    prob_diff=prob_drop,
                    wer=word_error_rate_on_sequences(
                        data_to_attack.transactions, adv_data.transactions),
                )
                outputs.append(output)

        # TODO: handle empty outputs (if no flip reduced the probability,
        # find_best_attack receives an empty list)
        best_output = self.find_best_attack(outputs)
        return best_output
Example #4
    def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
        # get inputs to the model
        inputs = data_to_tensors(data_to_attack, reader=self.reader, vocab=self.vocab, device=self.device)

        adversarial_indexes = inputs["transactions"]["tokens"]["tokens"][0]

        # original probability of the true label
        orig_prob = self.get_clf_probs(inputs)[self.label_to_index(data_to_attack.label)].item()

        # get mask and transaction embeddings
        emb_out = self.classifier.get_transaction_embeddings(transactions=inputs["transactions"])

        # detach so gradients flow only through the one embedding we mark
        # as requiring gradients below
        embeddings = emb_out["transaction_embeddings"].detach()
        embeddings_splitted = [e for e in embeddings[0]]

        outputs = []
        for step in range(self.num_steps):
            # choose random index of embeddings (except for start/end tokens)
            random_idx = random.randint(1, max(1, len(data_to_attack.transactions) - 2))
            # only one embedding can be modified
            embeddings_splitted[random_idx].requires_grad = True

            # calculate the loss for current embeddings
            loss = self.classifier.forward_on_transaction_embeddings(
                transaction_embeddings=torch.stack(embeddings_splitted, dim=0).unsqueeze(0),
                mask=emb_out["mask"],
                amounts=inputs["amounts"],
                label=inputs["label"],
            )["loss"]
            loss.backward()

            # FGSM-style step: move the embedding along the sign of the loss gradient
            embeddings_splitted[random_idx] = (
                embeddings_splitted[random_idx] + self.epsilon * embeddings_splitted[random_idx].grad.data.sign()
            )
            self.classifier.zero_grad()

            # find the closest embedding for the modified one
            distances = torch.nn.functional.pairwise_distance(embeddings_splitted[random_idx], self.emb_layer)
            # exclude special tokens from the nearest-neighbour search
            distances[self.special_indexes] = 10 ** 16

            # try candidate tokens from the nearest embedding outwards
            closest_idx = distances.argsort(descending=False).tolist()
            for idx in closest_idx:
                embeddings_splitted[random_idx] = self.emb_layer[idx]
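                # detach everything so the next backward pass starts clean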
                embeddings_splitted = [e.detach() for e in embeddings_splitted]

                # decode the candidate (not the current) indexes and score
                # the resulting sequence with the language model
                adversarial_indexes_lm = deepcopy(adversarial_indexes)
                adversarial_indexes_lm[random_idx] = idx
                adv_data_lm = deepcopy(data_to_attack)
                adv_data_lm.transactions = decode_indexes(adversarial_indexes_lm, vocab=self.vocab)
                adv_inputs_lm = data_to_tensors(adv_data_lm, self.reader, self.vocab, self.device)

                # accept the closest substitution the LM deems plausible enough
                if self.lm(**adv_inputs_lm)["loss"] < self.lm_threshold:
                    adversarial_indexes[random_idx] = idx
                    break

            adv_data = deepcopy(data_to_attack)
            adv_data.transactions = decode_indexes(adversarial_indexes, vocab=self.vocab)

            adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab, self.device)

            # get adversarial probability and adversarial label
            adv_probs = self.get_clf_probs(adv_inputs)
            adv_data.label = self.probs_to_label(adv_probs)
            adv_prob = adv_probs[self.label_to_index(data_to_attack.label)].item()

            output = AttackerOutput(
                data=data_to_attack.to_dict(),
                adversarial_data=adv_data.to_dict(),
                probability=orig_prob,
                adversarial_probability=adv_prob,
                prob_diff=(orig_prob - adv_prob),
                wer=word_error_rate_on_sequences(data_to_attack.transactions, adv_data.transactions),
            )
            outputs.append(output)

        best_output = self.find_best_attack(outputs)
        best_output.history = [output.to_dict() for output in outputs]

        return best_output
Example #5
    def get_probability_of_data(self, data: TransactionsData) -> float:
        inputs_to_attack = data_to_tensors(data, self.reader, self.vocab,
                                           self.device)
        prob = self.get_clf_probs(inputs_to_attack)[self.label_to_index(
            data.label)].item()
        return prob

    def attack(self, data_to_attack: TransactionsData) -> AttackerOutput:
        inputs_to_attack = data_to_tensors(data_to_attack, self.reader,
                                           self.vocab, self.device)

        orig_prob = self.get_clf_probs(inputs_to_attack)[self.label_to_index(
            data_to_attack.label)].item()

        adv_data = deepcopy(data_to_attack)
        amounts = generate_transaction_amounts(self.total_amount,
                                               self.num_tokens_to_add)
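        # pad the sequence with random placeholder tokens at the chosen
        # position; the language model below resamples exactly these positions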
        if self.position == Position.END:
            adv_data.transactions = adv_data.transactions + random.sample(
                self.all_tokens, self.num_tokens_to_add)
            adv_data.amounts = adv_data.amounts + amounts
        elif self.position == Position.START:
            adv_data.transactions = random.sample(
                self.all_tokens,
                self.num_tokens_to_add) + adv_data.transactions
            adv_data.amounts = amounts + adv_data.amounts
        else:
            raise NotImplementedError

        adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab,
                                     self.device)

        logits = self.get_lm_logits(adv_inputs)
        # drop start and end tokens
        logits = logits[:, 1:-1]

        if self.position == Position.END:
            logits_to_sample = logits[:, -self.num_tokens_to_add:][0]
        elif self.position == Position.START:
            logits_to_sample = logits[:, :self.num_tokens_to_add][0]
        else:
            raise NotImplementedError

        logits_to_sample = logits_to_sample / self.temperature
        probs = torch.softmax(logits_to_sample, dim=-1)
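        # forbid sampling special tokens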
        probs[:, self.special_indexes] = 0.0

        indexes = Categorical(probs=probs).sample((self.num_samples, ))

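        # stitch the sampled tokens back onto the original sequence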
        if self.position == Position.END:
            adversarial_sequences = [
                data_to_attack.transactions +
                decode_indexes(idx, self.vocab, drop_start_end=False)
                for idx in indexes
            ]
        elif self.position == Position.START:
            adversarial_sequences = [
                decode_indexes(idx, self.vocab, drop_start_end=False) +
                data_to_attack.transactions for idx in indexes
            ]
        else:
            raise NotImplementedError

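        # score every candidate sequence against the classifier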
        outputs = []
        for adv_sequence in adversarial_sequences:
            adv_data.transactions = adv_sequence
            adv_inputs = data_to_tensors(adv_data, self.reader, self.vocab,
                                         self.device)

            adv_probs = self.get_clf_probs(adv_inputs)
            adv_label = self.probs_to_label(adv_probs)
            adv_data.label = adv_label

            adv_prob = adv_probs[self.label_to_index(
                data_to_attack.label)].item()

            output = AttackerOutput(
                data=data_to_attack.to_dict(),
                adversarial_data=adv_data.to_dict(),
                probability=orig_prob,
                adversarial_probability=adv_prob,
                prob_diff=(orig_prob - adv_prob),
                wer=word_error_rate_on_sequences(data_to_attack.transactions,
                                                 adv_data.transactions),
            )
            outputs.append(output)

        best_output = self.find_best_attack(outputs)
        # we don't need history here actually
        # best_output.history = [deepcopy(o.__dict__) for o in outputs]
        return best_output