Ejemplo n.º 1
0
def build_baegarg2019(model_wrapper, threshold_cosine=0.936338023, query_budget=None, max_candidates=50):
    """
    Modified from https://github.com/QData/TextAttack/blob/04b7c6f79bdb5301b360555bd5458c15aa2b8695/textattack/attack_recipes/bae_garg_2019.py
    """
    transformation = WordSwapMaskedLM(
        method="bae", max_candidates=max_candidates, min_confidence=0.0
    )
    constraints = [RepeatModification(), StopwordModification()]

    constraints.append(PartOfSpeech(allow_verb_noun_swap=True))

    use_constraint = UniversalSentenceEncoder(
        threshold=threshold_cosine,
        metric="cosine",
        compare_against_original=True,
        window_size=15,
        skip_text_shorter_than_window=True,
    )
    constraints.append(use_constraint)
    goal_function = UntargetedClassification(model_wrapper)
    if query_budget is not None:
        goal_function.query_budget = query_budget
    search_method = GreedyWordSwapWIR(wir_method="delete")

    return Attack(goal_function, constraints, transformation, search_method)
Ejemplo n.º 2
0
def TextFoolerJin2019(model):
    #
    # Swap words with their embedding nearest-neighbors.
    #
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    # 50 nearest-neighbors with a cosine similarity of at least 0.5.
    # (The paper claims 0.7, but analysis of the code and some empirical
    # results show that it's definitely 0.5.)
    #
    transformation = WordSwapEmbedding(max_candidates=50,
                                       textfooler_stopwords=True)
    #
    # Minimum word embedding cosine similarity of 0.5.
    #
    constraints = []
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.5))
    #
    # Only replace words with the same part of speech (or nouns with verbs)
    #
    constraints.append(PartOfSpeech(allow_verb_noun_swap=True))
    #
    # Universal Sentence Encoder with a minimum angular similarity of ε = 0.7.
    #
    # In the TextFooler code, they forget to divide the angle between the two
    # embeddings by pi. So if the original threshold was that 1 - sim >= 0.7, the
    # new threshold is 1 - (0.3) / pi = 0.90445
    #
    use_constraint = UniversalSentenceEncoder(
        threshold=0.904458599,
        metric='angular',
        compare_with_original=False,
        window_size=15,
        skip_text_shorter_than_window=True)
    constraints.append(use_constraint)
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Greedily swap words with "Word Importance Ranking".
    #
    attack = GreedyWordSwapWIR(goal_function,
                               transformation=transformation,
                               constraints=constraints,
                               max_depth=None)

    return attack
def HotFlipEbrahimi2017(model):
    """
        Ebrahimi, J. et al. (2017)
        
        HotFlip: White-Box Adversarial Examples for Text Classification
        
        https://arxiv.org/abs/1712.06751
        
        This is a reproduction of the HotFlip word-level attack (section 5 of the 
        paper).
    """
    #
    # "HotFlip ... uses the gradient with respect to a one-hot input
    # representation to efficiently estimate which individual change has the
    # highest estimated loss."
    transformation = WordSwapGradientBased(model, top_n=1)
    #
    # Don't modify the same word twice or stopwords
    #
    constraints = [RepeatModification(), StopwordModification()]
    #
    # 0. "We were able to create only 41 examples (2% of the correctly-
    # classified instances of the SST test set) with one or two flips."
    #
    constraints.append(MaxWordsPerturbed(max_num_words=2))
    #
    # 1. "The cosine similarity between the embedding of words is bigger than a
    #   threshold (0.8)."
    #
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))
    #
    # 2. "The two words have the same part-of-speech."
    #
    constraints.append(PartOfSpeech())
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # "HotFlip ... uses a beam search to find a set of manipulations that work
    # well together to confuse a classifier ... The adversary uses a beam size
    # of 10."
    #
    search_method = BeamSearch(beam_width=10)

    return Attack(goal_function, constraints, transformation, search_method)
Ejemplo n.º 4
0
    def build(model_wrapper, mlm=False):
        """Build attack recipe.

        Args:
            model_wrapper (:class:`~textattack.models.wrappers.ModelWrapper`):
                Model wrapper containing both the model and the tokenizer.
            mlm (:obj:`bool`, `optional`, defaults to :obj:`False`):
                If :obj:`True`, load `A2T-MLM` attack. Otherwise, load regular `A2T` attack.

        Returns:
            :class:`~textattack.Attack`: A2T attack.
        """
        constraints = [RepeatModification(), StopwordModification()]
        input_column_modification = InputColumnModification(
            ["premise", "hypothesis"], {"premise"})
        constraints.append(input_column_modification)
        constraints.append(PartOfSpeech(allow_verb_noun_swap=False))
        constraints.append(MaxModificationRate(max_rate=0.1, min_threshold=4))
        sent_encoder = BERT(model_name="stsb-distilbert-base",
                            threshold=0.9,
                            metric="cosine")
        constraints.append(sent_encoder)

        if mlm:
            transformation = transformation = WordSwapMaskedLM(
                method="bae",
                max_candidates=20,
                min_confidence=0.0,
                batch_size=16)
        else:
            transformation = WordSwapEmbedding(max_candidates=20)
            constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))

        #
        # Goal is untargeted classification
        #
        goal_function = UntargetedClassification(model_wrapper,
                                                 model_batch_size=32)
        #
        # Greedily swap words with "Word Importance Ranking".
        #
        search_method = GreedyWordSwapWIR(wir_method="gradient")

        return Attack(goal_function, constraints, transformation,
                      search_method)
Ejemplo n.º 5
0
def HotFlipEbrahimi2017(model):
    #
    # "HotFlip ... uses the gradient with respect to a one-hot input
    # representation to efficiently estimate which individual change has the
    # highest estimated loss."
    transformation = GradientBasedWordSwap(model,
                                           top_n=1,
                                           replace_stopwords=False)
    constraints = []
    #
    # 0. "We were able to create only 41 examples (2% of the correctly-
    # classified instances of the SST test set) with one or two flips."
    #
    constraints.append(WordsPerturbed(max_num_words=2))
    #
    # 1. "The cosine similarity between the embedding of words is bigger than a
    #   threshold (0.8)."
    #
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))
    #
    # 2. "The two words have the same part-of-speech."
    #
    constraints.append(PartOfSpeech())
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # "HotFlip ... uses a beam search to find a set of manipulations that work
    # well together to confuse a classifier ... The adversary uses a beam size
    # of 10."
    #
    attack = BeamSearch(goal_function,
                        constraints=constraints,
                        transformation=transformation,
                        beam_width=10)

    return attack
    'thru', 'thus', 'to', 'too', 'toward', 'towards', 'under', 'unless',
    'until', 'up', 'upon', 'used', 've', 'was', 'wasn', "wasn't", 'we', 'were',
    'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever',
    'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
    'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
    'whole', 'whom', 'whose', 'why', 'with', 'within', 'without', 'won',
    "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you', "you'd",
    "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves'
])

# Lax Constraints
MAX_LENGTH = 256
USE_THRESHOLD = 0.9
ALLOW_VERB_NOUN_SWAP = False
TAGGER_TYPE = "flair"

CONSTRAINTS = [
    RepeatModification(),
    StopwordModification(stopwords=STOPWORDS),
    MaxWordIndexModification(max_length=MAX_LENGTH),
    InputColumnModification(["premise", "hypothesis"], {"premise"}),
    UniversalSentenceEncoder(
        threshold=USE_THRESHOLD,
        metric="angular",
        compare_against_original=False,
        window_size=15,
        skip_text_shorter_than_window=True,
    ),
    PartOfSpeech(tagger_type=TAGGER_TYPE,
                 allow_verb_noun_swap=ALLOW_VERB_NOUN_SWAP)
]
Ejemplo n.º 7
0
def TextFoolerJin2019(model):
    """
        Jin, D., Jin, Z., Zhou, J.T., & Szolovits, P. (2019). 
        
        Is BERT Really Robust? Natural Language Attack on Text Classification and Entailment. 
        
        https://arxiv.org/abs/1907.11932 
    """
    #
    # Swap words with their 50 closest embedding nearest-neighbors.
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    transformation = WordSwapEmbedding(max_candidates=50)
    #
    # Don't modify the same word twice or the stopwords defined
    # in the TextFooler public implementation.
    #
    # fmt: off
    stopwords = set([
        "a", "about", "above", "across", "after", "afterwards", "again",
        "against", "ain", "all", "almost", "alone", "along", "already", "also",
        "although", "am", "among", "amongst", "an", "and", "another", "any",
        "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "aren",
        "aren't", "around", "as", "at", "back", "been", "before", "beforehand",
        "behind", "being", "below", "beside", "besides", "between", "beyond",
        "both", "but", "by", "can", "cannot", "could", "couldn", "couldn't",
        "d", "didn", "didn't", "doesn", "doesn't", "don", "don't", "down",
        "due", "during", "either", "else", "elsewhere", "empty", "enough",
        "even", "ever", "everyone", "everything", "everywhere", "except",
        "first", "for", "former", "formerly", "from", "hadn", "hadn't", "hasn",
        "hasn't", "haven", "haven't", "he", "hence", "her", "here",
        "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him",
        "himself", "his", "how", "however", "hundred", "i", "if", "in",
        "indeed", "into", "is", "isn", "isn't", "it", "it's", "its", "itself",
        "just", "latter", "latterly", "least", "ll", "may", "me", "meanwhile",
        "mightn", "mightn't", "mine", "more", "moreover", "most", "mostly",
        "must", "mustn", "mustn't", "my", "myself", "namely", "needn",
        "needn't", "neither", "never", "nevertheless", "next", "no", "nobody",
        "none", "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of",
        "off", "on", "once", "one", "only", "onto", "or", "other", "others",
        "otherwise", "our", "ours", "ourselves", "out", "over", "per",
        "please", "s", "same", "shan", "shan't", "she", "she's", "should've",
        "shouldn", "shouldn't", "somehow", "something", "sometime",
        "somewhere", "such", "t", "than", "that", "that'll", "the", "their",
        "theirs", "them", "themselves", "then", "thence", "there",
        "thereafter", "thereby", "therefore", "therein", "thereupon", "these",
        "they", "this", "those", "through", "throughout", "thru", "thus", "to",
        "too", "toward", "towards", "under", "unless", "until", "up", "upon",
        "used", "ve", "was", "wasn", "wasn't", "we", "were", "weren",
        "weren't", "what", "whatever", "when", "whence", "whenever", "where",
        "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
        "whether", "which", "while", "whither", "who", "whoever", "whole",
        "whom", "whose", "why", "with", "within", "without", "won", "won't",
        "would", "wouldn", "wouldn't", "y", "yet", "you", "you'd", "you'll",
        "you're", "you've", "your", "yours", "yourself", "yourselves"
    ])
    # fmt: on
    constraints = [
        RepeatModification(),
        StopwordModification(stopwords=stopwords)
    ]
    #
    # During entailment, we should only edit the hypothesis - keep the premise
    # the same.
    #
    input_column_modification = InputColumnModification(
        ["premise", "hypothesis"], {"premise"})
    constraints.append(input_column_modification)
    # Minimum word embedding cosine similarity of 0.5.
    # (The paper claims 0.7, but analysis of the released code and some empirical
    # results show that it's 0.5.)
    #
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.5))
    #
    # Only replace words with the same part of speech (or nouns with verbs)
    #
    constraints.append(PartOfSpeech(allow_verb_noun_swap=True))
    #
    # Universal Sentence Encoder with a minimum angular similarity of ε = 0.7.
    #
    # In the TextFooler code, they forget to divide the angle between the two
    # embeddings by pi. So if the original threshold was that 1 - sim >= 0.7, the
    # new threshold is 1 - (0.3) / pi = 0.90445
    #
    use_constraint = UniversalSentenceEncoder(
        threshold=0.904458599,
        metric="angular",
        compare_with_original=False,
        window_size=15,
        skip_text_shorter_than_window=True,
    )
    constraints.append(use_constraint)
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Greedily swap words with "Word Importance Ranking".
    #
    search_method = GreedyWordSwapWIR()

    return Attack(goal_function, constraints, transformation, search_method)
Ejemplo n.º 8
0
    def build(model):
        # "In this paper, we present a simple yet novel technique: BAE (BERT-based
        # Adversarial Examples), which uses a language model (LM) for token
        # replacement to best fit the overall context. We perturb an input sentence
        # by either replacing a token or inserting a new token in the sentence, by
        # means of masking a part of the input and using a LM to fill in the mask."
        #
        # We only consider the top K=50 synonyms from the MLM predictions.
        #
        # [from email correspondance with the author]
        # "When choosing the top-K candidates from the BERT masked LM, we filter out
        # the sub-words and only retain the whole words (by checking if they are
        # present in the GloVE vocabulary)"
        #
        transformation = WordSwapMaskedLM(method="bae", max_candidates=50)
        #
        # Don't modify the same word twice or stopwords.
        #
        constraints = [RepeatModification(), StopwordModification()]

        # For the R operations we add an additional check for
        # grammatical correctness of the generated adversarial example by filtering
        # out predicted tokens that do not form the same part of speech (POS) as the
        # original token t_i in the sentence.
        constraints.append(PartOfSpeech(allow_verb_noun_swap=True))

        # "To ensure semantic similarity on introducing perturbations in the input
        # text, we filter the set of top-K masked tokens (K is a pre-defined
        # constant) predicted by BERT-MLM using a Universal Sentence Encoder (USE)
        # (Cer et al., 2018)-based sentence similarity scorer."
        #
        # "[We] set a threshold of 0.8 for the cosine similarity between USE-based
        # embeddings of the adversarial and input text."
        #
        # [from email correspondence with the author]
        # "For a fair comparison of the benefits of using a BERT-MLM in our paper,
        # we retained the majority of TextFooler's specifications. Thus we:
        # 1. Use the USE for comparison within a window of size 15 around the word
        # being replaced/inserted.
        # 2. Set the similarity score threshold to 0.1 for inputs shorter than the
        # window size (this translates roughly to almost always accepting the new text).
        # 3. Perform the USE similarity thresholding of 0.8 with respect to the text
        # just before the replacement/insertion and not the original text (For
        # example: at the 3rd R/I operation, we compute the USE score on a window
        # of size 15 of the text obtained after the first 2 R/I operations and not
        # the original text).
        # ...
        # To address point (3) from above, compare the USE with the original text
        # at each iteration instead of the current one (While doing this change
        # for the R-operation is trivial, doing it for the I-operation with the
        # window based USE comparison might be more involved)."
        #
        # Finally, since the BAE code is based on the TextFooler code, we need to
        # adjust the threshold to account for the missing / pi in the cosine
        # similarity comparison. So the final threshold is 1 - (1 - 0.8) / pi
        # = 1 - (0.2 / pi) = 0.936338023.
        use_constraint = UniversalSentenceEncoder(
            threshold=0.936338023,
            metric="cosine",
            compare_against_original=True,
            window_size=15,
            skip_text_shorter_than_window=True,
        )
        constraints.append(use_constraint)
        #
        # Goal is untargeted classification.
        #
        goal_function = UntargetedClassification(model)
        #
        # "We estimate the token importance Ii of each token
        # t_i ∈ S = [t1, . . . , tn], by deleting ti from S and computing the
        # decrease in probability of predicting the correct label y, similar
        # to (Jin et al., 2019).
        #
        # • "If there are multiple tokens can cause C to misclassify S when they
        # replace the mask, we choose the token which makes Sadv most similar to
        # the original S based on the USE score."
        # • "If no token causes misclassification, we choose the perturbation that
        # decreases the prediction probability P(C(Sadv)=y) the most."
        #
        search_method = GreedyWordSwapWIR(wir_method="delete")

        return BAEGarg2019(goal_function, constraints, transformation, search_method)