def InputReductionFeng2018(model):
    """Feng, Wallace, Grissom, Iyyer, Rodriguez, Boyd-Graber. (2018).

    Pathologies of Neural Models Make Interpretations Difficult.

    ArXiv, abs/1804.07781.
    """
    # At each step, we remove the word with the lowest importance value until
    # the model changes its prediction.
    transformation = WordDeletion()

    constraints = [RepeatModification(), StopwordModification()]
    #
    # Goal is input reduction: remove as many words as possible while the
    # model's prediction stays the same
    #
    goal_function = InputReduction(model, maximizable=True)
    #
    # "For each word in an input sentence, we measure its importance by the
    # change in the confidence of the original prediction when we remove
    # that word from the sentence."
    #
    # "Instead of looking at the words with high importance values—what
    # interpretation methods commonly do—we take a complementary approach
    # and study how the model behaves when the supposedly unimportant words are
    # removed."
    #
    search_method = GreedyWordSwapWIR(wir_method="delete")

    return Attack(goal_function, constraints, transformation, search_method)
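# A minimal end-to-end usage sketch for a recipe like the one above, assuming
# textattack>=0.3 plus the transformers and datasets packages are installed;
# the checkpoint and dataset names are illustrative, not prescribed by the recipe.
import transformers
import textattack
from textattack.models.wrappers import HuggingFaceModelWrapper

checkpoint = "textattack/bert-base-uncased-imdb"
model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

# Build the recipe defined above and run it over a handful of test examples.
attack = InputReductionFeng2018(model_wrapper)
dataset = textattack.datasets.HuggingFaceDataset("imdb", split="test")
attacker = textattack.Attacker(attack, dataset, textattack.AttackArgs(num_examples=5))
attacker.attack_dataset()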
Example #2
    def build(model):
        #
        #  we propose five bug generation methods for TEXTBUGGER:
        #
        transformation = CompositeTransformation(
            [
                # (1) Insert: Insert a space into the word.
                # Generally, words are segmented by spaces in English. Therefore,
                # we can deceive classifiers by inserting spaces into words.
                WordSwapRandomCharacterInsertion(
                    random_one=True,
                    letters_to_insert=" ",
                    skip_first_char=True,
                    skip_last_char=True,
                ),
                # (2) Delete: Delete a random character of the word except for the first
                # and the last character.
                WordSwapRandomCharacterDeletion(
                    random_one=True, skip_first_char=True, skip_last_char=True
                ),
                # (3) Swap: Swap random two adjacent letters in the word but do not
                # alter the first or last letter. This is a common occurrence when
                # typing quickly and is easy to implement.
                WordSwapNeighboringCharacterSwap(
                    random_one=True, skip_first_char=True, skip_last_char=True
                ),
                # (4) Substitute-C (Sub-C): Replace characters with visually similar
                # characters (e.g., replacing “o” with “0”, “l” with “1”, “a” with “@”)
                # or adjacent characters in the keyboard (e.g., replacing “m” with “n”).
                WordSwapHomoglyphSwap(),
                # (5) Substitute-W
                # (Sub-W): Replace a word with its topk nearest neighbors in a
                # context-aware word vector space. Specifically, we use the pre-trained
                # GloVe model [30] provided by Stanford for word embedding and set
                # topk = 5 in the experiment.
                WordSwapEmbedding(max_candidates=5),
            ]
        )

        constraints = [RepeatModification(), StopwordModification()]
        # In our experiment, we first use the Universal Sentence
        # Encoder [7], a model trained on a number of natural language
        # prediction tasks that require modeling the meaning of word
        # sequences, to encode sentences into high dimensional vectors.
        # Then, we use the cosine similarity to measure the semantic
        # similarity between original texts and adversarial texts.
        # ... "Furthermore, the semantic similarity threshold \eps is set
        # as 0.8 to guarantee a good trade-off between quality and
        # strength of the generated adversarial text."
        constraints.append(UniversalSentenceEncoder(threshold=0.8))
        #
        # Goal is untargeted classification
        #
        goal_function = UntargetedClassification(model)
        #
        # Greedily swap words with "Word Importance Ranking".
        #
        search_method = GreedyWordSwapWIR(wir_method="delete")

        return Attack(goal_function, constraints, transformation, search_method)
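# A self-contained toy illustration of four of the five TEXTBUGGER bug types on
# a single word (the fifth, Sub-W, needs a word-embedding model and is handled
# by WordSwapEmbedding above). Illustrative only; the recipe's transformations
# above are what the attack actually uses.
import random

random.seed(0)

# Small illustrative homoglyph map; WordSwapHomoglyphSwap ships a fuller table.
HOMOGLYPHS = {"o": "0", "l": "1", "a": "@", "e": "3", "i": "1"}


def insert_space(word):
    # (1) Insert: add a space inside the word so word-level tokenizers split it.
    i = random.randrange(1, len(word) - 1)
    return word[:i] + " " + word[i:]


def delete_inner_char(word):
    # (2) Delete: drop a character, keeping the first and last ones in place.
    i = random.randrange(1, len(word) - 1)
    return word[:i] + word[i + 1:]


def swap_adjacent(word):
    # (3) Swap: exchange two adjacent inner letters.
    i = random.randrange(1, len(word) - 2)
    return word[:i] + word[i + 1] + word[i] + word[i + 2:]


def substitute_char(word):
    # (4) Sub-C: replace the first character that has a visually similar twin.
    for j, ch in enumerate(word):
        if ch in HOMOGLYPHS:
            return word[:j] + HOMOGLYPHS[ch] + word[j + 1:]
    return word


print(insert_space("foolish"), delete_inner_char("foolish"),
      swap_adjacent("foolish"), substitute_char("foolish"))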
def Seq2SickCheng2018BlackBox(model, goal_function="non_overlapping"):
    """
        Cheng, Minhao, et al. 
        
        Seq2Sick: Evaluating the Robustness of Sequence-to-Sequence Models with 
        Adversarial Examples
        
        https://arxiv.org/abs/1803.01128    
        
        This is a greedy re-implementation of the seq2sick attack method. It does 
        not use gradient descent.
    """

    #
    # Goal is non-overlapping output.
    #
    goal_function = NonOverlappingOutput(model)
    # @TODO implement transformation / search method just like they do in
    # seq2sick.
    transformation = WordSwapEmbedding(max_candidates=50)
    #
    # Don't modify the same word twice or stopwords
    #
    constraints = [RepeatModification(), StopwordModification()]
    #
    # In these experiments, we hold the maximum difference
    # on edit distance (ϵ) to a constant 30 for each sample.
    #
    constraints.append(LevenshteinEditDistance(30))
    #
    # Greedily swap words with "Word Importance Ranking".
    #
    search_method = GreedyWordSwapWIR()

    return Attack(goal_function, constraints, transformation, search_method)
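# NonOverlappingOutput counts the attack as successful once the model's output
# for the perturbed input shares no words with its output for the original
# input. A toy version of that success test (illustrative only, not the
# TextAttack implementation):
def is_non_overlapping(original_output, perturbed_output):
    original_words = set(original_output.lower().split())
    perturbed_words = set(perturbed_output.lower().split())
    return not (original_words & perturbed_words)

# is_non_overlapping("a cat sits on the mat", "the dog sleeps") -> False ("the" overlaps)
# is_non_overlapping("a cat sits on the mat", "un chat dort")   -> True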
Example #4
def build_baegarg2019(model_wrapper, threshold_cosine=0.936338023, query_budget=None, max_candidates=50):
    """
    Modified from https://github.com/QData/TextAttack/blob/04b7c6f79bdb5301b360555bd5458c15aa2b8695/textattack/attack_recipes/bae_garg_2019.py
    """
    transformation = WordSwapMaskedLM(
        method="bae", max_candidates=max_candidates, min_confidence=0.0
    )
    constraints = [RepeatModification(), StopwordModification()]

    constraints.append(PartOfSpeech(allow_verb_noun_swap=True))

    use_constraint = UniversalSentenceEncoder(
        threshold=threshold_cosine,
        metric="cosine",
        compare_against_original=True,
        window_size=15,
        skip_text_shorter_than_window=True,
    )
    constraints.append(use_constraint)
    goal_function = UntargetedClassification(model_wrapper)
    if query_budget is not None:
        goal_function.query_budget = query_budget
    search_method = GreedyWordSwapWIR(wir_method="delete")

    return Attack(goal_function, constraints, transformation, search_method)
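# Example call, assuming `model_wrapper` is an already-constructed TextAttack
# model wrapper (e.g. the HuggingFaceModelWrapper from the usage sketch above).
# `query_budget` caps the number of model queries per attacked example; leaving
# it as None keeps the budget unbounded.
#
#     bae_attack = build_baegarg2019(model_wrapper, query_budget=500)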
Example #5
    def build(model, ensemble: bool = False):
        # [from correspondence with the author]
        # Candidate size K is set to 48 for all data-sets.
        transformation = WordSwapMaskedLM(method="bert-attack",
                                          max_candidates=48)
        #
        # Don't modify the same word twice or stopwords.
        #
        constraints = [RepeatModification(), StopwordModification()]

        # "We only take ε percent of the most important words since we tend to keep
        # perturbations minimum."
        #
        # [from correspondence with the author]
        # "Word percentage allowed to change is set to 0.4 for most data-sets, this
        # parameter is trivial since most attacks only need a few changes. This
        # epsilon is only used to avoid too much queries on those very hard samples."
        constraints.append(MaxWordsPerturbed(max_percent=0.4))

        # "As used in TextFooler (Jin et al., 2019), we also use Universal Sentence
        # Encoder (Cer et al., 2018) to measure the semantic consistency between the
        # adversarial sample and the original sequence. To balance between semantic
        # preservation and attack success rate, we set up a threshold of semantic
        # similarity score to filter the less similar examples."
        #
        # [from correspondence with author]
        # "Over the full texts, after generating all the adversarial samples, we filter
        # out low USE score samples. Thus the success rate is lower but the USE score
        # can be higher. (actually USE score is not a golden metric, so we simply
        # measure the USE score over the final texts for a comparison with TextFooler).
        # For datasets like IMDB, we set a higher threshold between 0.4-0.7; for
        # datasets like MNLI, we set threshold between 0-0.2."
        #
        # Since the threshold in the real world can't be determined from the training
        # data, the TextAttack implementation uses a fixed threshold - determined to
        # be 0.2 to be most fair.
        use_constraint = UniversalSentenceEncoder(
            threshold=0.2,
            metric="cosine",
            compare_against_original=True,
            window_size=None,
        )
        constraints.append(use_constraint)
        #
        # Goal is untargeted classification.
        #
        goal_function = UntargetedClassification(model)
        #
        # "We first select the words in the sequence which have a high significance
        # influence on the final output logit. Let S = [w0, ··· , wi ··· ] denote
        # the input sentence, and oy(S) denote the logit output by the target model
        # for correct label y, the importance score Iwi is defined as
        # Iwi = oy(S) − oy(S\wi), where S\wi = [w0, ··· , wi−1, [MASK], wi+1, ···]
        # is the sentence after replacing wi with [MASK]. Then we rank all the words
        # according to the ranking score Iwi in descending order to create word list
        # L."
        search_method = GreedyWordSwapWIR(wir_method="unk", ensemble=ensemble)

        return Attack(goal_function, constraints, transformation,
                      search_method)
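# A toy sketch of the importance ranking the comment above describes: replace
# each word with a mask token and rank words by the drop in the correct-label
# logit. `logit_fn` is a hypothetical stand-in for the target model;
# GreedyWordSwapWIR(wir_method="unk") does the same thing with an "[UNK]" token.
def rank_words_by_importance(words, logit_fn, mask_token="[MASK]"):
    base = logit_fn(" ".join(words))                      # o_y(S)
    scores = []
    for i in range(len(words)):
        masked = words[:i] + [mask_token] + words[i + 1:]
        scores.append(base - logit_fn(" ".join(masked)))  # I_wi = o_y(S) - o_y(S\wi)
    # Word indices sorted by importance score, highest first.
    return sorted(range(len(words)), key=lambda i: scores[i], reverse=True)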
Example #6
    def build(model):
        transformation = WordSwapWordNet()
        constraints = [RepeatModification(), StopwordModification()]
        goal_function = UntargetedClassification(model)
        # Search over words based on a combination of their saliency score and
        # how effective the best WordSwap for each word is.
        search_method = GreedyWordSwapWIR("weighted-saliency")
        return Attack(goal_function, constraints, transformation,
                      search_method)
Example #7
def TextFoolerJin2019Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    #
    # Swap words with their embedding nearest-neighbors. 
    #
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    # 50 nearest-neighbors with a cosine similarity of at least 0.5.
    # (The paper claims 0.7, but analysis of the code and some empirical
    # results show that it's definitely 0.5.)
    #
    transformation = WordSwapEmbedding(max_candidates=50, textfooler_stopwords=True)
    #
    # Minimum word embedding cosine similarity of 0.9.
    #
    constraints = []
    constraints.append(
            WordEmbeddingDistance(min_cos_sim=0.9)
    )
    #
    # Universal Sentence Encoder with a minimum angular similarity of ε = 0.7.
    #
    if sentence_encoder == 'bert':
        se_constraint = BERT(threshold=SE_thresh,
            metric='cosine', compare_with_original=False, window_size=15,
            skip_text_shorter_than_window=False)
    else:
        se_constraint = UniversalSentenceEncoder(threshold=SE_thresh,
            metric='cosine', compare_with_original=False, window_size=15,
            skip_text_shorter_than_window=False)
    constraints.append(se_constraint)
    #
    # Do grammar checking
    #
    constraints.append(
            LanguageTool(0)
    )
    
    #
    # Untargeted attack   
    #
    goal_function = UntargetedClassification(model)

    #
    # Greedily swap words with "Word Importance Ranking".
    #
    attack = GreedyWordSwapWIR(goal_function, transformation=transformation,
        constraints=constraints, max_depth=None)
    
    return attack
Example #8
def TextFoolerJin2019(model):
    #
    # Swap words with their embedding nearest-neighbors.
    #
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    # 50 nearest-neighbors with a cosine similarity of at least 0.5.
    # (The paper claims 0.7, but analysis of the code and some empirical
    # results show that it's definitely 0.5.)
    #
    transformation = WordSwapEmbedding(max_candidates=50,
                                       textfooler_stopwords=True)
    #
    # Minimum word embedding cosine similarity of 0.5.
    #
    constraints = []
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.5))
    #
    # Only replace words with the same part of speech (or nouns with verbs)
    #
    constraints.append(PartOfSpeech(allow_verb_noun_swap=True))
    #
    # Universal Sentence Encoder with a minimum angular similarity of ε = 0.7.
    #
    # In the TextFooler code, they forget to divide the angle between the two
    # embeddings by pi. So if the original threshold was that 1 - sim >= 0.7, the
    # new threshold is 1 - (0.3) / pi = 0.90445
    #
    use_constraint = UniversalSentenceEncoder(
        threshold=0.904458599,
        metric='angular',
        compare_with_original=False,
        window_size=15,
        skip_text_shorter_than_window=True)
    constraints.append(use_constraint)
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Greedily swap words with "Word Importance Ranking".
    #
    attack = GreedyWordSwapWIR(goal_function,
                               transformation=transformation,
                               constraints=constraints,
                               max_depth=None)

    return attack
Example #9
def DeepWordBugGao2018(model, use_all_transformations=True):
    """
        Gao, Lanchantin, Soffa, Qi.
        
        Black-box Generation of Adversarial Text Sequences to Evade Deep Learning 
        Classifiers.
        
        https://arxiv.org/abs/1801.04354
    """
    #
    # Swap characters out from words. Choose the best of four potential transformations.
    #
    if use_all_transformations:
        # We propose four similar methods:
        transformation = CompositeTransformation([
            # (1) Swap: Swap two adjacent letters in the word.
            WordSwapNeighboringCharacterSwap(),
            # (2) Substitution: Substitute a letter in the word with a random letter.
            WordSwapRandomCharacterSubstitution(),
            # (3) Deletion: Delete a random letter from the word.
            WordSwapRandomCharacterDeletion(),
            # (4) Insertion: Insert a random letter in the word.
            WordSwapRandomCharacterInsertion(),
        ])
    else:
        # We use the Combined Score and the Substitution Transformer to generate
        # adversarial samples, with the maximum edit distance difference of 30
        # (ϵ = 30).
        transformation = WordSwapRandomCharacterSubstitution()
    #
    # Don't modify the same word twice or stopwords
    #
    constraints = [RepeatModification(), StopwordModification()]
    #
    # In these experiments, we hold the maximum difference
    # on edit distance (ϵ) to a constant 30 for each sample.
    #
    constraints.append(LevenshteinEditDistance(30))
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Greedily swap words with "Word Importance Ranking".
    #
    search_method = GreedyWordSwapWIR()

    return Attack(goal_function, constraints, transformation, search_method)
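# The "Combined Score" mentioned above mixes DeepWordBug's two temporal scoring
# functions. A loose sketch, with `predict` a hypothetical stand-in returning the
# model's score for the current prediction on a token prefix/suffix (this is an
# illustration of the idea, not TextAttack's implementation):
def combined_score(tokens, predict, lam=1.0):
    scores = []
    for i in range(len(tokens)):
        # Temporal head score: what token i adds on top of the preceding tokens.
        head = predict(tokens[: i + 1]) - predict(tokens[:i])
        # Temporal tail score: what token i adds on top of the following tokens.
        tail = predict(tokens[i:]) - predict(tokens[i + 1:])
        scores.append(head + lam * tail)
    return scores  # higher score = token ranked as more important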
Example #10
def build_attack(model_wrapper, target_class=-1):
    """
    Same as bert-attack, except:
    - it uses TargetedClassification instead of UntargetedClassification when target_class != -1
    - it uses the "bae" method instead of "bert-attack" because of bert-attack's problems with sub-tokens
    Modified from https://github.com/QData/TextAttack/blob/36dfce6bdab933bdeed3a2093ae411e93018ebbf/textattack/attack_recipes/bert_attack_li_2020.py
    """

    # transformation = WordSwapMaskedLM(method="bert-attack", max_candidates=48)
    transformation = WordSwapMaskedLM(method="bae", max_candidates=100)
    constraints = [RepeatModification(), StopwordModification()]
    constraints.append(MaxWordsPerturbed(max_percent=0.4))

    use_constraint = UniversalSentenceEncoder(
        threshold=0.2,
        metric="cosine",
        compare_against_original=True,
        window_size=None,
    )
    constraints.append(use_constraint)
    if target_class == -1:
        goal_function = UntargetedClassification(model_wrapper)
    else:
        # We modify the goal
        goal_function = TargetedClassification(model_wrapper, target_class=target_class)
    search_method = GreedyWordSwapWIR(wir_method="unk")

    return Attack(goal_function, constraints, transformation, search_method)
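# Example calls, assuming `model_wrapper` is an already-built TextAttack model
# wrapper; target_class=-1 keeps the attack untargeted.
#
#     untargeted_attack = build_attack(model_wrapper)
#     targeted_attack = build_attack(model_wrapper, target_class=0)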


# def build_attack_2(model_wrapper, target_class):
#     """
#     Same as HotFlipEbrahimi2017 attack except:
#     - it is TargetedClassification instead of Untargeted
#     """
#     transformation = WordSwapGradientBased(model_wrapper, top_n=1)
#     constraints = [RepeatModification(), StopwordModification()]
#     constraints.append(MaxWordsPerturbed(max_num_words=2))
#     constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))
#     constraints.append(PartOfSpeech())
#     goal_function = TargetedClassification(model_wrapper)
    
#     search_method = BeamSearch(beam_width=10)

#     return Attack(goal_function, constraints, transformation, search_method)
Example #11
    def build(model_wrapper, mlm=False):
        """Build attack recipe.

        Args:
            model_wrapper (:class:`~textattack.models.wrappers.ModelWrapper`):
                Model wrapper containing both the model and the tokenizer.
            mlm (:obj:`bool`, `optional`, defaults to :obj:`False`):
                If :obj:`True`, load `A2T-MLM` attack. Otherwise, load regular `A2T` attack.

        Returns:
            :class:`~textattack.Attack`: A2T attack.
        """
        constraints = [RepeatModification(), StopwordModification()]
        input_column_modification = InputColumnModification(
            ["premise", "hypothesis"], {"premise"})
        constraints.append(input_column_modification)
        constraints.append(PartOfSpeech(allow_verb_noun_swap=False))
        constraints.append(MaxModificationRate(max_rate=0.1, min_threshold=4))
        sent_encoder = BERT(model_name="stsb-distilbert-base",
                            threshold=0.9,
                            metric="cosine")
        constraints.append(sent_encoder)

        if mlm:
            transformation = WordSwapMaskedLM(
                method="bae",
                max_candidates=20,
                min_confidence=0.0,
                batch_size=16)
        else:
            transformation = WordSwapEmbedding(max_candidates=20)
            constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))

        #
        # Goal is untargeted classification
        #
        goal_function = UntargetedClassification(model_wrapper,
                                                 model_batch_size=32)
        #
        # Greedily swap words with "Word Importance Ranking".
        #
        search_method = GreedyWordSwapWIR(wir_method="gradient")

        return Attack(goal_function, constraints, transformation,
                      search_method)
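# Example calls for the two A2T variants, assuming `model_wrapper` wraps a
# trained classifier. Note that wir_method="gradient" requires a model wrapper
# that exposes gradients (e.g. TextAttack's HuggingFaceModelWrapper implements
# get_grad).
#
#     a2t_attack = build(model_wrapper)                 # embedding-swap variant
#     a2t_mlm_attack = build(model_wrapper, mlm=True)   # masked-LM (A2T-MLM) variant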
Example #12
def PWWSRen2019(model):
    """An implementation of Probability Weighted Word Saliency from "Generating
    Natural Language Adversarial Examples through Probability Weighted Word
    Saliency", Ren et al., 2019.

    Words are prioritized for a synonym-swap transformation based on
    a combination of their saliency score and maximum word-swap effectiveness.
    Note that this implementation does not include the Named
    Entity adversarial swap from the original paper, because it requires
    access to the full dataset and ground truth labels in advance.

    https://www.aclweb.org/anthology/P19-1103/
    """
    transformation = WordSwapWordNet()
    constraints = [RepeatModification(), StopwordModification()]
    goal_function = UntargetedClassification(model)
    # Search over words based on a combination of their saliency score and how
    # effective the best WordSwap for each word is.
    search_method = GreedyWordSwapWIR("pwws")
    return Attack(goal_function, constraints, transformation, search_method)
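# A toy sketch of the "pwws" priority the comment above refers to: each word's
# score is softmax(word saliency) * (probability drop of its best synonym swap).
# `prob_true` and `synonyms` are hypothetical stand-ins for the victim model's
# P(y_true | text) and a WordNet synonym lookup; not TextAttack's implementation.
import math


def pwws_priorities(words, prob_true, synonyms):
    base = prob_true(" ".join(words))
    # Word saliency: probability drop when the word is replaced by "unk".
    saliency = [base - prob_true(" ".join(words[:i] + ["unk"] + words[i + 1:]))
                for i in range(len(words))]
    exp = [math.exp(s) for s in saliency]
    weights = [e / sum(exp) for e in exp]
    priorities = []
    for i, word in enumerate(words):
        # Best swap effectiveness: largest drop over the word's candidate synonyms.
        drops = [base - prob_true(" ".join(words[:i] + [s] + words[i + 1:]))
                 for s in synonyms(word)]
        priorities.append(weights[i] * max(drops, default=0.0))
    return priorities  # higher-priority words are attacked first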
Example #13
    def build(model, use_all_transformations=True, ensemble: bool=False):
        #
        # Swap characters out from words. Choose the best of four potential transformations.
        #
        if use_all_transformations:
            # We propose four similar methods:
            transformation = CompositeTransformation(
                [
                    # (1) Swap: Swap two adjacent letters in the word.
                    WordSwapNeighboringCharacterSwap(),
                    # (2) Substitution: Substitute a letter in the word with a random letter.
                    WordSwapRandomCharacterSubstitution(),
                    # (3) Deletion: Delete a random letter from the word.
                    WordSwapRandomCharacterDeletion(),
                    # (4) Insertion: Insert a random letter in the word.
                    WordSwapRandomCharacterInsertion(),
                ]
            )
        else:
            # We use the Combined Score and the Substitution Transformer to generate
            # adversarial samples, with the maximum edit distance difference of 30
            # (ϵ = 30).
            transformation = WordSwapRandomCharacterSubstitution()
        #
        # Don't modify the same word twice or stopwords
        #
        constraints = [RepeatModification(), StopwordModification()]
        #
        # In these experiments, we hold the maximum difference
        # on edit distance (ϵ) to a constant 30 for each sample.
        #
        constraints.append(LevenshteinEditDistance(30))
        #
        # Goal is untargeted classification
        #
        goal_function = UntargetedClassification(model)
        #
        # Greedily swap words with "Word Importance Ranking".
        #
        search_method = GreedyWordSwapWIR(ensemble=ensemble)

        return Attack(goal_function, constraints, transformation, search_method)
def Seq2SickCheng2018BlackBox(model, goal_function='non_overlapping'):
    #
    # Goal is non-overlapping output.
    #
    goal_function = NonOverlappingOutput(model)
    # @TODO implement transformation / search method just like they do in
    # seq2sick.
    transformation = WordSwapEmbedding(max_candidates=50)
    #
    # In these experiments, we hold the maximum difference
    # on edit distance (ϵ) to a constant 30 for each sample.
    #
    constraints = [LevenshteinEditDistance(30)]
    #
    # Greedily swap words with "Word Importance Ranking".
    #
    attack = GreedyWordSwapWIR(goal_function,
                               transformation=transformation,
                               constraints=constraints,
                               max_depth=10)

    return attack
Example #15
def DeepWordBugGao2018(model, use_all_transformations=True):
    #
    # Swap characters out from words. Choose the best of four potential transformations.
    #
    if use_all_transformations:
        # We propose four similar methods:
        transformation = CompositeTransformation([
            # (1) Swap: Swap two adjacent letters in the word.
            WordSwapNeighboringCharacterSwap(),
            # (2) Substitution: Substitute a letter in the word with a random letter.
            WordSwapRandomCharacterSubstitution(),
            # (3) Deletion: Delete a random letter from the word.
            WordSwapRandomCharacterDeletion(),
            # (4) Insertion: Insert a random letter in the word.
            WordSwapRandomCharacterInsertion()
        ])
    else:
        # We use the Combined Score and the Substitution Transformer to generate
        # adversarial samples, with the maximum edit distance difference of 30
        # (ϵ = 30).
        transformation = WordSwapRandomCharacterSubstitution()
    #
    # In these experiments, we hold the maximum difference
    # on edit distance (ϵ) to a constant 30 for each sample.
    #
    constraints = [LevenshteinEditDistance(30)]
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Greedily swap words with "Word Importance Ranking".
    #
    attack = GreedyWordSwapWIR(goal_function,
                               transformation=transformation,
                               constraints=constraints,
                               max_depth=None)

    return attack
Example #16
    def build(model):
        # At each step, we remove the word with the lowest importance value until
        # the model changes its prediction.
        transformation = WordDeletion()

        constraints = [RepeatModification(), StopwordModification()]
        #
        # Goal is input reduction: remove as many words as possible while the
        # model's prediction stays the same
        #
        goal_function = InputReduction(model, maximizable=True)
        #
        # "For each word in an input sentence, we measure its importance by the
        # change in the confidence of the original prediction when we remove
        # that word from the sentence."
        #
        # "Instead of looking at the words with high importance values—what
        # interpretation methods commonly do—we take a complementary approach
        # and study how the model behaves when the supposedly unimportant words are
        # removed."
        #
        search_method = GreedyWordSwapWIR(wir_method="delete")

        return Attack(goal_function, constraints, transformation, search_method)
    def build(model, goal_function="non_overlapping"):

        #
        # Goal is non-overlapping output.
        #
        goal_function = NonOverlappingOutput(model)
        transformation = WordSwapEmbedding(max_candidates=50)
        #
        # Don't modify the same word twice or stopwords
        #
        constraints = [RepeatModification(), StopwordModification()]
        #
        # In these experiments, we hold the maximum difference
        # on edit distance (ϵ) to a constant 30 for each sample.
        #
        constraints.append(LevenshteinEditDistance(30))
        #
        # Greedily swap words with "Word Importance Ranking".
        #
        search_method = GreedyWordSwapWIR(wir_method="unk")

        return Attack(goal_function, constraints, transformation,
                      search_method)
Example #18
def TextFoolerJin2019(model):
    """
        Jin, D., Jin, Z., Zhou, J.T., & Szolovits, P. (2019). 
        
        Is BERT Really Robust? Natural Language Attack on Text Classification and Entailment. 
        
        https://arxiv.org/abs/1907.11932 
    """
    #
    # Swap words with their 50 closest embedding nearest-neighbors.
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    transformation = WordSwapEmbedding(max_candidates=50)
    #
    # Don't modify the same word twice or the stopwords defined
    # in the TextFooler public implementation.
    #
    # fmt: off
    stopwords = set([
        "a", "about", "above", "across", "after", "afterwards", "again",
        "against", "ain", "all", "almost", "alone", "along", "already", "also",
        "although", "am", "among", "amongst", "an", "and", "another", "any",
        "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "aren",
        "aren't", "around", "as", "at", "back", "been", "before", "beforehand",
        "behind", "being", "below", "beside", "besides", "between", "beyond",
        "both", "but", "by", "can", "cannot", "could", "couldn", "couldn't",
        "d", "didn", "didn't", "doesn", "doesn't", "don", "don't", "down",
        "due", "during", "either", "else", "elsewhere", "empty", "enough",
        "even", "ever", "everyone", "everything", "everywhere", "except",
        "first", "for", "former", "formerly", "from", "hadn", "hadn't", "hasn",
        "hasn't", "haven", "haven't", "he", "hence", "her", "here",
        "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him",
        "himself", "his", "how", "however", "hundred", "i", "if", "in",
        "indeed", "into", "is", "isn", "isn't", "it", "it's", "its", "itself",
        "just", "latter", "latterly", "least", "ll", "may", "me", "meanwhile",
        "mightn", "mightn't", "mine", "more", "moreover", "most", "mostly",
        "must", "mustn", "mustn't", "my", "myself", "namely", "needn",
        "needn't", "neither", "never", "nevertheless", "next", "no", "nobody",
        "none", "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of",
        "off", "on", "once", "one", "only", "onto", "or", "other", "others",
        "otherwise", "our", "ours", "ourselves", "out", "over", "per",
        "please", "s", "same", "shan", "shan't", "she", "she's", "should've",
        "shouldn", "shouldn't", "somehow", "something", "sometime",
        "somewhere", "such", "t", "than", "that", "that'll", "the", "their",
        "theirs", "them", "themselves", "then", "thence", "there",
        "thereafter", "thereby", "therefore", "therein", "thereupon", "these",
        "they", "this", "those", "through", "throughout", "thru", "thus", "to",
        "too", "toward", "towards", "under", "unless", "until", "up", "upon",
        "used", "ve", "was", "wasn", "wasn't", "we", "were", "weren",
        "weren't", "what", "whatever", "when", "whence", "whenever", "where",
        "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
        "whether", "which", "while", "whither", "who", "whoever", "whole",
        "whom", "whose", "why", "with", "within", "without", "won", "won't",
        "would", "wouldn", "wouldn't", "y", "yet", "you", "you'd", "you'll",
        "you're", "you've", "your", "yours", "yourself", "yourselves"
    ])
    # fmt: on
    constraints = [
        RepeatModification(),
        StopwordModification(stopwords=stopwords)
    ]
    #
    # During entailment, we should only edit the hypothesis - keep the premise
    # the same.
    #
    input_column_modification = InputColumnModification(
        ["premise", "hypothesis"], {"premise"})
    constraints.append(input_column_modification)
    # Minimum word embedding cosine similarity of 0.5.
    # (The paper claims 0.7, but analysis of the released code and some empirical
    # results show that it's 0.5.)
    #
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.5))
    #
    # Only replace words with the same part of speech (or nouns with verbs)
    #
    constraints.append(PartOfSpeech(allow_verb_noun_swap=True))
    #
    # Universal Sentence Encoder with a minimum angular similarity of ε = 0.7.
    #
    # In the TextFooler code, they forget to divide the angle between the two
    # embeddings by pi. So if the original threshold was that 1 - sim >= 0.7, the
    # new threshold is 1 - (0.3) / pi = 0.90445
    #
    use_constraint = UniversalSentenceEncoder(
        threshold=0.904458599,
        metric="angular",
        compare_with_original=False,
        window_size=15,
        skip_text_shorter_than_window=True,
    )
    constraints.append(use_constraint)
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Greedily swap words with "Word Importance Ranking".
    #
    search_method = GreedyWordSwapWIR()

    return Attack(goal_function, constraints, transformation, search_method)
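# A small helper making the pi adjustment above explicit: it maps a
# TextFooler-style similarity threshold onto the scale TextAttack's USE
# constraint uses. For a 0.8 threshold this gives exactly the 0.936338023 used
# by the BAE recipes in this collection; for 0.7 it gives ~0.9045, close to the
# constant used here.
import math


def adjust_use_threshold(textfooler_threshold):
    return 1.0 - (1.0 - textfooler_threshold) / math.pi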
def TextFoolerJin2019Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    """
        Jin, D., Jin, Z., Zhou, J.T., & Szolovits, P. (2019). 
        
        Is BERT Really Robust? Natural Language Attack on Text Classification and Entailment. 
        
        https://arxiv.org/abs/1907.11932 
       
        Constraints adjusted from paper to align with human evaluation.
    """
    #
    # Swap words with their embedding nearest-neighbors.
    #
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    # 50 nearest-neighbors with a cosine similarity of at least 0.5.
    # (The paper claims 0.7, but analysis of the code and some empirical
    # results show that it's definitely 0.5.)
    #
    transformation = WordSwapEmbedding(max_candidates=50)
    #
    # Don't modify the same word twice or stopwords
    #
    constraints = [RepeatModification(), StopwordModification()]
    #
    # Minimum word embedding cosine similarity of 0.9.
    #
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.9))
    #
    # Universal Sentence Encoder with a minimum angular similarity of ε = 0.7.
    #
    if sentence_encoder == 'bert':
        se_constraint = BERT(threshold=SE_thresh,
                             metric='cosine',
                             compare_with_original=False,
                             window_size=15,
                             skip_text_shorter_than_window=False)
    else:
        se_constraint = UniversalSentenceEncoder(
            threshold=SE_thresh,
            metric='cosine',
            compare_with_original=False,
            window_size=15,
            skip_text_shorter_than_window=False)
    constraints.append(se_constraint)
    #
    # Do grammar checking
    #
    constraints.append(LanguageTool(0))

    #
    # Untargeted attack
    #
    goal_function = UntargetedClassification(model)

    #
    # Greedily swap words with "Word Importance Ranking".
    #
    search_method = GreedyWordSwapWIR()

    return Attack(goal_function, constraints, transformation, search_method)
Example #20
    def build(model):
        # "In this paper, we present a simple yet novel technique: BAE (BERT-based
        # Adversarial Examples), which uses a language model (LM) for token
        # replacement to best fit the overall context. We perturb an input sentence
        # by either replacing a token or inserting a new token in the sentence, by
        # means of masking a part of the input and using a LM to fill in the mask."
        #
        # We only consider the top K=50 synonyms from the MLM predictions.
        #
        # [from email correspondence with the author]
        # "When choosing the top-K candidates from the BERT masked LM, we filter out
        # the sub-words and only retain the whole words (by checking if they are
        # present in the GloVE vocabulary)"
        #
        transformation = WordSwapMaskedLM(method="bae", max_candidates=50)
        #
        # Don't modify the same word twice or stopwords.
        #
        constraints = [RepeatModification(), StopwordModification()]

        # For the R operations we add an additional check for
        # grammatical correctness of the generated adversarial example by filtering
        # out predicted tokens that do not form the same part of speech (POS) as the
        # original token t_i in the sentence.
        constraints.append(PartOfSpeech(allow_verb_noun_swap=True))

        # "To ensure semantic similarity on introducing perturbations in the input
        # text, we filter the set of top-K masked tokens (K is a pre-defined
        # constant) predicted by BERT-MLM using a Universal Sentence Encoder (USE)
        # (Cer et al., 2018)-based sentence similarity scorer."
        #
        # "[We] set a threshold of 0.8 for the cosine similarity between USE-based
        # embeddings of the adversarial and input text."
        #
        # [from email correspondence with the author]
        # "For a fair comparison of the benefits of using a BERT-MLM in our paper,
        # we retained the majority of TextFooler's specifications. Thus we:
        # 1. Use the USE for comparison within a window of size 15 around the word
        # being replaced/inserted.
        # 2. Set the similarity score threshold to 0.1 for inputs shorter than the
        # window size (this translates roughly to almost always accepting the new text).
        # 3. Perform the USE similarity thresholding of 0.8 with respect to the text
        # just before the replacement/insertion and not the original text (For
        # example: at the 3rd R/I operation, we compute the USE score on a window
        # of size 15 of the text obtained after the first 2 R/I operations and not
        # the original text).
        # ...
        # To address point (3) from above, compare the USE with the original text
        # at each iteration instead of the current one (While doing this change
        # for the R-operation is trivial, doing it for the I-operation with the
        # window based USE comparison might be more involved)."
        #
        # Finally, since the BAE code is based on the TextFooler code, we need to
        # adjust the threshold to account for the missing / pi in the cosine
        # similarity comparison. So the final threshold is 1 - (1 - 0.8) / pi
        # = 1 - (0.2 / pi) = 0.936338023.
        use_constraint = UniversalSentenceEncoder(
            threshold=0.936338023,
            metric="cosine",
            compare_against_original=True,
            window_size=15,
            skip_text_shorter_than_window=True,
        )
        constraints.append(use_constraint)
        #
        # Goal is untargeted classification.
        #
        goal_function = UntargetedClassification(model)
        #
        # "We estimate the token importance Ii of each token
        # t_i ∈ S = [t1, . . . , tn], by deleting ti from S and computing the
        # decrease in probability of predicting the correct label y, similar
        # to (Jin et al., 2019).
        #
        # • "If there are multiple tokens can cause C to misclassify S when they
        # replace the mask, we choose the token which makes Sadv most similar to
        # the original S based on the USE score."
        # • "If no token causes misclassification, we choose the perturbation that
        # decreases the prediction probability P(C(Sadv)=y) the most."
        #
        search_method = GreedyWordSwapWIR(wir_method="delete")

        return BAEGarg2019(goal_function, constraints, transformation, search_method)
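# A standalone sketch of the masked-LM replacement step that WordSwapMaskedLM
# (method="bae") performs: mask one token and take the language model's top-k
# predictions as candidate replacements. Assumes the transformers package; the
# checkpoint and sentence are illustrative, and BAE's sub-word filtering and
# USE checks are omitted here.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="bert-base-uncased")
for candidate in fill_mask("The food was absolutely [MASK].", top_k=5):
    print(candidate["token_str"], round(candidate["score"], 3))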
Example #21
def BERTAttackLi2020(model):
    """
        Li, L.., Ma, R., Guo, Q., Xiangyang, X., Xipeng, Q. (2020).
        
        BERT-ATTACK: Adversarial Attack Against BERT Using BERT
        
        https://arxiv.org/abs/2004.09984
        
        This is "attack mode" 1 from the paper, BAE-R, word replacement.
    """
    from textattack.shared.utils import logger

    logger.warn(
        "WARNING: This BERT-Attack implementation is based off of a"
        " preliminary draft of the paper, which lacked source code and"
        " did not include any hyperparameters. Attack reuslts are likely to"
        " change."
    )
    # [from correspondence with the author]
    # Candidate size K is set to 48 for all data-sets.
    transformation = WordSwapMaskedLM(method="bert-attack", max_candidates=48)
    #
    # Don't modify the same word twice or stopwords.
    #
    constraints = [RepeatModification(), StopwordModification()]

    # "We only take ε percent of the most important words since we tend to keep
    # perturbations minimum."
    #
    # [from correspondence with the author]
    # "Word percentage allowed to change is set to 0.4 for most data-sets, this
    # parameter is trivial since most attacks only need a few changes. This
    # epsilon is only used to avoid too much queries on those very hard samples."
    constraints.append(MaxWordsPerturbed(max_percent=0.4))

    # "As used in TextFooler (Jin et al., 2019), we also use Universal Sentence
    # Encoder (Cer et al., 2018) to measure the semantic consistency between the
    # adversarial sample and the original sequence. To balance between semantic
    # preservation and attack success rate, we set up a threshold of semantic
    # similarity score to filter the less similar examples."
    #
    # [from correspondence with author]
    # "Over the full texts, after generating all the adversarial samples, we filter
    # out low USE score samples. Thus the success rate is lower but the USE score
    # can be higher. (actually USE score is not a golden metric, so we simply
    # measure the USE score over the final texts for a comparison with TextFooler).
    # For datasets like IMDB, we set a higher threshold between 0.4-0.7; for
    # datasets like MNLI, we set threshold between 0-0.2."
    #
    # Since the threshold in the real world can't be determined from the training
    # data, the TextAttack implementation uses a fixed threshold - determined to
    # be 0.2 to be most fair.
    use_constraint = UniversalSentenceEncoder(
        threshold=0.2, metric="cosine", compare_with_original=True, window_size=None,
    )
    constraints.append(use_constraint)
    #
    # Goal is untargeted classification.
    #
    goal_function = UntargetedClassification(model)
    #
    # "We first select the words in the sequence which have a high significance
    # influence on the final output logit. Let S = [w0, ··· , wi ··· ] denote
    # the input sentence, and oy(S) denote the logit output by the target model
    # for correct label y, the importance score Iwi is defined as
    # Iwi = oy(S) − oy(S\wi), where S\wi = [w0, ··· , wi−1, [MASK], wi+1, ···]
    # is the sentence after replacing wi with [MASK]. Then we rank all the words
    # according to the ranking score Iwi in descending order to create word list
    # L."
    search_method = GreedyWordSwapWIR(wir_method="unk")

    return Attack(goal_function, constraints, transformation, search_method)