Exemple #1
0
    def build(model_wrapper):
        """Assemble a genetic-algorithm word-substitution attack.

        The configuration follows the paper excerpts summarised in the
        comments below, which adapt the attack of Alzantot et al. (2018).

        Args:
            model_wrapper: The victim model; handed unchanged to
                ``UntargetedClassification``.

        Returns:
            An ``Attack`` built from the goal function, the constraint list,
            the word-swap transformation and the genetic search method.
        """
        # ------------------------------------------------------------------
        # Background (paper, Section 5: Experiments)
        #
        # The allowed substitution sets S(x, i) are based on those of
        # Alzantot et al. (2018), who showed that their substitutions yield
        # adversarial examples that humans judge to be qualitatively similar
        # to the original input and to keep the original label.
        #
        # Alzantot et al. define the neighbors N(w) of a word w as its n = 8
        # nearest neighbors in a "counter-fitted" word vector space in which
        # antonyms are far apart (Mrkšić et al., 2016); neighbors must also
        # lie within a Euclidean distance threshold. A language model
        # constraint filters nonsensical perturbations: x_i may be replaced
        # by x~_i ∈ N(x_i) only if doing so does not lower the text's
        # log-likelihood under a pre-trained language model by more than a
        # threshold.
        #
        # The paper changes that approach in three ways:
        #
        # 1. Alzantot et al. apply substitutions one at a time and compute
        #    neighborhoods and language model scores relative to the
        #    *current, already altered* input. That makes the attack surface
        #    hard to define (changing one word can allow or disallow changes
        #    to other words), forces language model scores to be recomputed
        #    at every iteration of the genetic attack, and lets the same
        #    word be substituted repeatedly, causing semantic drift. Here,
        #    allowed substitutions are defined relative to the original
        #    sentence x and repeated substitutions are disallowed.
        #
        # 2. A faster language model is used, which permits querying longer
        #    contexts than the slower model used by Alzantot et al.
        #
        # 3. The language model constraint is applied only at test time; the
        #    model is trained against all perturbations in N(w) so that it is
        #    robust to a larger perturbation space rather than specialised to
        #    one language model. This is a model-specific adjustment and
        #    does not affect the attack recipe.
        #
        # Side notes quoted alongside the above: the model itself classifies
        # with a different set of pre-trained word vectors, the
        # counter-fitted vectors being used only to define the allowed
        # substitution words; and the paper's Equation (4) must be applied
        # before the model combines information from multiple words, but may
        # be delayed until each word has been processed independently.
        #
        # Appendix A.3: S(x, i) is pre-computed at each index i from the
        # original x. It contains the neighbors x~_i ∈ N(x_i) whose
        # substitution keeps the language-model log-probability of the
        # surrounding window (radius W) within δ of the original window's.
        # The paper uses W = 6 and δ = 5.
        # ------------------------------------------------------------------

        # Candidate generation: swap a word for one of its 8 nearest
        # neighbors in the counter-fitted Paragram embedding space.
        # Alzantot et al.: "[We] fix the hyperparameter values to S = 60,
        # N = 8, K = 4, and δ = 0.5".
        word_swap = WordSwapEmbedding(max_candidates=8)

        restrictions = [
            # Never touch a word twice, and leave stopwords alone.
            RepeatModification(),
            StopwordModification(),
            # At most 20% of the words may be perturbed.
            MaxWordsPerturbed(max_percent=0.2),
            # Embedding distance between original and substitute capped
            # at 0.5.
            WordEmbeddingDistance(max_mse_dist=0.5),
            # Language model constraint with W = 6 and δ = 5, scored
            # against the original text rather than the running
            # perturbation.
            LearningToWriteLanguageModel(
                window_size=6,
                max_log_prob_diff=5.0,
                compare_against_original=True,
            ),
        ]

        # The attack succeeds on any change of predicted label.
        objective = UntargetedClassification(model_wrapper)

        # Genetic search over word substitutions.
        genetic_search = AlzantotGeneticAlgorithm(
            pop_size=60,
            max_iters=20,
            post_crossover_check=False,
        )

        return Attack(objective, restrictions, word_swap, genetic_search)
Exemple #2
0
def TextFoolerJin2019Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    """Build the TextFooler attack with stricter, human-aligned constraints.

    Based on:

        Jin, D., Jin, Z., Zhou, J.T., & Szolovits, P. (2019).
        Is BERT Really Robust? Natural Language Attack on Text
        Classification and Entailment.
        https://arxiv.org/abs/1907.11932

    The constraints are adjusted from the paper to align with human
    evaluation.

    Args:
        model: The victim model; handed unchanged to
            ``UntargetedClassification``.
        SE_thresh: Threshold given to the sentence-encoder constraint.
        sentence_encoder: ``'bert'`` selects the ``BERT`` constraint; any
            other value selects ``UniversalSentenceEncoder``.

    Returns:
        An ``Attack`` built from the goal function, the constraint list,
        the word-swap transformation and the search method.
    """
    # Candidate generation: up to 50 nearest neighbors in the counter-fitted
    # PARAGRAM-SL999 embedding space. (The paper states a 0.7 cosine cut-off
    # for candidates, but its code and empirical results point to 0.5; the
    # tighter 0.9 bound below is what actually applies here.)
    swap = WordSwapEmbedding(max_candidates=50)

    limits = [
        # Never touch a word twice, and leave stopwords alone.
        RepeatModification(),
        StopwordModification(),
        # Substitute must have cosine similarity >= 0.9 to the original word.
        WordEmbeddingDistance(min_cos_sim=0.9),
    ]

    # Sentence-level similarity: both encoder constraints take the same
    # settings (cosine metric, SE_thresh threshold, 15-word window, compared
    # against the previous text rather than the original), so only the class
    # differs.
    encoder_cls = BERT if sentence_encoder == 'bert' else UniversalSentenceEncoder
    limits.append(
        encoder_cls(
            threshold=SE_thresh,
            metric='cosine',
            compare_against_original=False,
            window_size=15,
            skip_text_shorter_than_window=False,
        )
    )

    # Grammar checking. NOTE(review): the positional 0 is presumably the
    # number of newly introduced grammar errors tolerated -- confirm against
    # the LanguageTool constraint's signature.
    limits.append(LanguageTool(0))

    # The attack succeeds on any change of predicted label.
    objective = UntargetedClassification(model)

    # Greedy word swap ordered by "Word Importance Ranking".
    wir_search = GreedyWordSwapWIR()

    return Attack(objective, limits, swap, wir_search)
Exemple #3
0
    def build(model):
        """Assemble the TextFooler-style greedy word-swap attack.

        Args:
            model: The victim model; handed unchanged to
                ``UntargetedClassification``.

        Returns:
            An ``Attack`` built from the goal function, the constraint list,
            the word-swap transformation and the search method.
        """
        # Candidate generation: the 50 closest neighbors of a word in the
        # counter-fitted PARAGRAM-SL999 embedding space.
        word_swap = WordSwapEmbedding(max_candidates=50)

        # Stopword list taken from the TextFooler public implementation;
        # these words are never modified.
        # fmt: off
        protected_words = {
            "a", "about", "above", "across", "after", "afterwards", "again",
            "against", "ain", "all", "almost", "alone", "along", "already",
            "also", "although", "am", "among", "amongst", "an", "and",
            "another", "any", "anyhow", "anyone", "anything", "anyway",
            "anywhere", "are", "aren", "aren't", "around", "as", "at", "back",
            "been", "before", "beforehand", "behind", "being", "below",
            "beside", "besides", "between", "beyond", "both", "but", "by",
            "can", "cannot", "could", "couldn", "couldn't", "d", "didn",
            "didn't", "doesn", "doesn't", "don", "don't", "down", "due",
            "during", "either", "else", "elsewhere", "empty", "enough", "even",
            "ever", "everyone", "everything", "everywhere", "except", "first",
            "for", "former", "formerly", "from", "hadn", "hadn't", "hasn",
            "hasn't", "haven", "haven't", "he", "hence", "her", "here",
            "hereafter", "hereby", "herein", "hereupon", "hers", "herself",
            "him", "himself", "his", "how", "however", "hundred", "i", "if",
            "in", "indeed", "into", "is", "isn", "isn't", "it", "it's", "its",
            "itself", "just", "latter", "latterly", "least", "ll", "may", "me",
            "meanwhile", "mightn", "mightn't", "mine", "more", "moreover",
            "most", "mostly", "must", "mustn", "mustn't", "my", "myself",
            "namely", "needn", "needn't", "neither", "never", "nevertheless",
            "next", "no", "nobody", "none", "noone", "nor", "not", "nothing",
            "now", "nowhere", "o", "of", "off", "on", "once", "one", "only",
            "onto", "or", "other", "others", "otherwise", "our", "ours",
            "ourselves", "out", "over", "per", "please", "s", "same", "shan",
            "shan't", "she", "she's", "should've", "shouldn", "shouldn't",
            "somehow", "something", "sometime", "somewhere", "such", "t",
            "than", "that", "that'll", "the", "their", "theirs", "them",
            "themselves", "then", "thence", "there", "thereafter", "thereby",
            "therefore", "therein", "thereupon", "these", "they", "this",
            "those", "through", "throughout", "thru", "thus", "to", "too",
            "toward", "towards", "under", "unless", "until", "up", "upon",
            "used", "ve", "was", "wasn", "wasn't", "we", "were", "weren",
            "weren't", "what", "whatever", "when", "whence", "whenever",
            "where", "whereafter", "whereas", "whereby", "wherein",
            "whereupon", "wherever", "whether", "which", "while", "whither",
            "who", "whoever", "whole", "whom", "whose", "why", "with",
            "within", "without", "won", "won't", "would", "wouldn", "wouldn't",
            "y", "yet", "you", "you'd", "you'll", "you're", "you've", "your",
            "yours", "yourself", "yourselves",
        }
        # fmt: on

        restrictions = [
            # Never touch a word twice, and leave the stopwords above alone.
            RepeatModification(),
            StopwordModification(stopwords=protected_words),
            # For entailment inputs only the hypothesis may be edited; the
            # premise column is held fixed.
            InputColumnModification(["premise", "hypothesis"], {"premise"}),
            # Substitute must have cosine similarity >= 0.5 to the original
            # word. (The paper states 0.7, but the released code and
            # empirical results show 0.5.)
            WordEmbeddingDistance(min_cos_sim=0.5),
            # Keep the part of speech, except that nouns and verbs may be
            # exchanged for one another.
            PartOfSpeech(allow_verb_noun_swap=True),
            # Universal Sentence Encoder, nominal minimum angular similarity
            # ε = 0.5. The TextFooler code omits the division of the angle
            # by pi, so a stated threshold of 1 - sim >= 0.5 corresponds to
            # 1 - 0.5 / pi = 0.840845057 here.
            UniversalSentenceEncoder(
                threshold=0.840845057,
                metric="angular",
                compare_against_original=False,
                window_size=15,
                skip_text_shorter_than_window=True,
            ),
        ]

        # The attack succeeds on any change of predicted label.
        objective = UntargetedClassification(model)

        # Greedy word swap ordered by "Word Importance Ranking", with
        # importance estimated using the "delete" method.
        wir_search = GreedyWordSwapWIR(wir_method="delete")

        return Attack(objective, restrictions, word_swap, wir_search)