Example #1
    def _attack(self, initial_result):
        """Calls the ``SearchMethod`` to perturb the ``AttackedText`` stored in
        ``initial_result``.

        Args:
            initial_result: The initial ``GoalFunctionResult`` from which to perturb.

        Returns:
            A ``SuccessfulAttackResult``, ``FailedAttackResult``,
                or ``MaximizedAttackResult``.
        """
        final_result = self.search_method(initial_result)
        self.clear_cache()
        if final_result.goal_status == GoalFunctionResultStatus.SUCCEEDED:
            return SuccessfulAttackResult(
                initial_result,
                final_result,
            )
        elif final_result.goal_status == GoalFunctionResultStatus.SEARCHING:
            return FailedAttackResult(
                initial_result,
                final_result,
            )
        elif final_result.goal_status == GoalFunctionResultStatus.MAXIMIZING:
            return MaximizedAttackResult(
                initial_result,
                final_result,
            )
        else:
            raise ValueError(
                f"Unrecognized goal status {final_result.goal_status}")
Example #2
    def attack_one(self, original_tokenized_text, correct_output):
        max_words_changed = min(self.max_words_changed,
                                len(original_tokenized_text.words))
        original_result = self.goal_function.get_results(
            [original_tokenized_text], correct_output)[0]
        default_unswapped_word_indices = list(
            range(len(original_tokenized_text.words)))
        beam = [(original_tokenized_text, default_unswapped_word_indices)]
        num_words_changed = 0
        best_result = None
        while num_words_changed < max_words_changed:
            num_words_changed += 1
            potential_next_beam = []
            for text, unswapped_word_indices in beam:
                transformations = self.get_transformations(
                    text,
                    indices_to_replace=unswapped_word_indices,
                    original_text=original_tokenized_text)
                for next_text in transformations:
                    new_unswapped_word_indices = unswapped_word_indices.copy()
                    modified_word_index = next_text.attack_attrs[
                        'modified_word_index']
                    new_unswapped_word_indices.remove(modified_word_index)
                    potential_next_beam.append(
                        (next_text, new_unswapped_word_indices))
            if len(potential_next_beam) == 0:
                # If we did not find any possible perturbations, give up.
                return FailedAttackResult(original_result)
            transformed_text_candidates = [
                text for (text, _) in potential_next_beam
            ]
            results = self.goal_function.get_results(
                transformed_text_candidates, correct_output)
            scores = np.array([r.score for r in results])
            # Take the highest-scoring candidate; if it reached the goal, stop.
            best_result = results[scores.argmax()]
            if best_result.succeeded:
                break
            # Otherwise, refill the beam. This works by sorting the scores
            # in descending order and filling the beam from there.
            best_indices = (-scores).argsort()[:self.beam_width]
            beam = [potential_next_beam[i] for i in best_indices]

        if best_result is None:
            return FailedAttackResult(original_result)
        elif best_result.succeeded:
            return SuccessfulAttackResult(original_result, best_result)
        else:
            # The word-change budget ran out before the goal was reached.
            return FailedAttackResult(original_result, best_result)
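The core of this beam search is the refill step: keep only the beam_width highest-scoring candidates each round. A standalone sketch of that selection with made-up scores:

import numpy as np

scores = np.array([0.2, 0.9, 0.5, 0.7])
beam_width = 2
# argsort of the negated scores yields indices in descending score order.
best_indices = (-scores).argsort()[:beam_width]
print(best_indices)  # [1 3]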
Example #3
    def attack_one(self, initial_result):
        """
        Calls the ``SearchMethod`` to perturb the ``TokenizedText`` stored in 
        ``initial_result``.

        Args:
            initial_result: The initial ``GoalFunctionResult`` from which to perturb.

        Returns:
            Either a ``SuccessfulAttackResult`` or ``FailedAttackResult``.
        """
        final_result = self.search_method(initial_result)
        if final_result.succeeded:
            return SuccessfulAttackResult(initial_result, final_result)
        else:
            return FailedAttackResult(initial_result, final_result)
Example #4
    def attack_one(self, tokenized_text, correct_output):
        self.original_tokenized_text = tokenized_text
        self.correct_output = correct_output
        original_result = self.goal_function.get_results([tokenized_text],
                                                         correct_output)[0]
        neighbors_len = self._get_neighbors_len(tokenized_text)
        pop = self._generate_population(neighbors_len)
        cur_score = original_result.score
        for i in range(self.max_iters):
            pop_results = self.goal_function.get_results(
                [pm.tokenized_text for pm in pop], correct_output)
            for idx, result in enumerate(pop_results):
                pop[idx].result = result
            pop = sorted(pop, key=lambda x: -x.result.score)
            print('\t\t', i, ' -- ', float(pop[0].result.score))

            pop_scores = torch.Tensor([r.score for r in pop_results])
            # Softmax over scores: higher-scoring members get a higher
            # chance of being selected as parents.
            logits = (pop_scores / self.temp).exp()
            select_probs = (logits / logits.sum()).cpu().numpy()

            if pop[0].result.succeeded:
                return SuccessfulAttackResult(original_result, pop[0].result)

            if pop[0].result.score > cur_score:
                cur_score = pop[0].result.score
            elif self.give_up_if_no_improvement:
                break

            elite = [pop[0]]
            parent1_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)
            parent2_idx = np.random.choice(self.pop_size,
                                           size=self.pop_size - 1,
                                           p=select_probs)

            children = [
                self._crossover(pop[parent1_idx[idx]], pop[parent2_idx[idx]])
                for idx in range(self.pop_size - 1)
            ]
            for c in children:
                self._perturb(c)

            pop = elite + children

        return FailedAttackResult(original_result, pop[0].result)
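Parent selection in this genetic algorithm is fitness-proportional: a softmax over population scores becomes the sampling distribution passed to np.random.choice. A minimal standalone sketch with made-up scores, where temp plays the same role as self.temp above:

import numpy as np

def softmax_select_probs(scores, temp=0.3):
    scores = np.asarray(scores, dtype=float)
    # Subtract the max for numerical stability; higher-scoring members
    # get higher selection probability, and a lower temp sharpens this.
    logits = np.exp((scores - scores.max()) / temp)
    return logits / logits.sum()

probs = softmax_select_probs([0.1, 0.4, 0.9])
parent1, parent2 = np.random.choice(len(probs), size=2, p=probs)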
Example #5
    def attack_one(self, tokenized_text, correct_output):
        original_tokenized_text = tokenized_text
        num_words_changed = 0

        # Sort words by order of importance
        original_result = self.goal_function.get_results([tokenized_text],
                                                         correct_output)[0]
        cur_score = original_result.score
        len_text = len(tokenized_text.words)

        leave_one_texts = [
            tokenized_text.replace_word_at_index(i, self.replacement_str)
            for i in range(len_text)
        ]
        leave_one_scores = np.array([
            result.score for result in
            self.goal_function.get_results(leave_one_texts, correct_output)
        ])
        index_order = (-leave_one_scores).argsort()

        # ``results`` must exist even if no candidates are ever scored
        # (e.g., every transformation list comes back empty).
        results = []
        i = 0
        while ((self.max_depth is None) or
               num_words_changed <= self.max_depth) and i < len(index_order):
            transformed_text_candidates = self.get_transformations(
                tokenized_text,
                original_tokenized_text,
                indices_to_replace=[index_order[i]])
            i += 1
            if len(transformed_text_candidates) == 0:
                continue
            num_words_changed += 1
            results = sorted(self.goal_function.get_results(
                transformed_text_candidates, correct_output),
                             key=lambda x: -x.score)
            # Skip swaps which don't improve the score
            if results[0].score > cur_score:
                cur_score = results[0].score
            else:
                continue
            # If we succeeded, return the index with best similarity.
            if results[0].succeeded:
                best_result = results[0]
                # @TODO: Use vectorwise operations
                max_similarity = -float('inf')
                for result in results:
                    if not result.succeeded:
                        break
                    candidate = result.tokenized_text
                    try:
                        similarity_score = candidate.attack_attrs[
                            'similarity_score']
                    except KeyError:
                        # If the attack was run without any similarity metrics,
                        # candidates won't have a similarity score. In this
                        # case, break and return the candidate that changed
                        # the original score the most.
                        break
                    if similarity_score > max_similarity:
                        max_similarity = similarity_score
                        best_result = result
                return SuccessfulAttackResult(original_result, best_result)
            else:
                tokenized_text = results[0].tokenized_text

        if len(results):
            return FailedAttackResult(original_result, results[0])
        else:
            return FailedAttackResult(original_result)
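The ranking step at the top of this example is a leave-one-out importance estimate: substitute each word in turn, score the modified text, and visit indices in descending order of the resulting score. A standalone sketch, with score_fn as a hypothetical stand-in for the goal function:

import numpy as np

def importance_order(words, score_fn, replacement=""):
    # score_fn is a stand-in, not a library call: it maps a list of words
    # to a single attack score, where higher means closer to success.
    leave_one_scores = np.array([
        score_fn(words[:i] + [replacement] + words[i + 1:])
        for i in range(len(words))
    ])
    # Indices whose substitution raises the score most come first.
    return (-leave_one_scores).argsort()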