def build(model_wrapper):
    """Assemble the faster genetic-algorithm word-substitution attack.

    Background (Section 5 and Appendix A.3 of the paper this recipe
    reproduces):

    The allowed word substitutions S(x, i) are based on those of
    Alzantot et al. (2018), who showed that their substitutions yield
    adversarial examples that humans judge to be qualitatively similar to
    the original input and to retain the original label. Alzantot et al.
    define the neighbors N(w) of a word w as the n = 8 nearest neighbors
    of w in a "counter-fitted" word vector space where antonyms are far
    apart (Mrkšić et al., 2016); neighbors must also lie within some
    Euclidean distance threshold. A language-model constraint filters out
    nonsensical perturbations: x_i may be replaced by x~_i in N(x_i) only
    if doing so does not lower the log-likelihood of the text under a
    pre-trained language model by more than some threshold. The
    counter-fitted vectors are only used to define the set of allowed
    substitution words; the victim model itself classifies using a
    different set of pre-trained word vectors.

    Three modifications are made to that approach:

    1. In Alzantot et al. (2018) the adversary applies substitutions one
       at a time, and neighborhoods and language-model scores are computed
       relative to the current altered version of the input. That gives a
       hard-to-define attack surface (changing one word can allow or
       disallow changes to other words), requires recomputing
       language-model scores at every iteration of the genetic attack, and
       lets the same word be substituted repeatedly, causing semantic
       drift. Here the allowed substitutions are defined relative to the
       original sentence x, and repeated substitutions are disallowed.
    2. A faster language model is used, which allows querying longer
       contexts than the slower model used by Alzantot et al. (2018).
    3. The language-model constraint is applied only at test time; the
       model is trained against all perturbations in N(w). This is a
       model-specific adjustment and does not affect the attack recipe.

    Appendix A.3 pre-computes S(x, i) at index i from the original x using
    probabilities assigned by a pre-trained language model, with window
    radius W = 6 and threshold delta = 5.

    Args:
        model_wrapper: Victim model wrapper; passed unchanged to
            ``UntargetedClassification``.

    Returns:
        The ``Attack`` built from the goal function, constraints,
        transformation and search method configured below.
    """
    # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5"
    #
    # N = 8: swap each word with one of its 8 nearest neighbors in the
    # counter-fitted Paragram embedding space.
    word_swap = WordSwapEmbedding(max_candidates=8)

    # The list order below is the order in which the constraints are
    # constructed and handed to Attack.
    checks = [
        # Never edit the same word twice, and never edit stopwords.
        RepeatModification(),
        StopwordModification(),
        # Perturb at most 20% of the words.
        MaxWordsPerturbed(max_percent=0.2),
        # δ = 0.5: maximum word-embedding Euclidean (MSE) distance.
        WordEmbeddingDistance(max_mse_dist=0.5),
        # Language-model constraint with W = 6 and a maximum log-probability
        # drop of 5, scored against the original text.
        # (The source also carried a disabled alternative:
        #  LearningToWriteLanguageModel(window_size=5).)
        LearningToWriteLanguageModel(
            window_size=6,
            max_log_prob_diff=5.0,
            compare_against_original=True,
        ),
    ]

    # Goal: untargeted classification.
    goal = UntargetedClassification(model_wrapper)

    # S = 60: genetic-algorithm search with a population of 60, capped at
    # 20 iterations.
    genetic_search = AlzantotGeneticAlgorithm(
        pop_size=60,
        max_iters=20,
        post_crossover_check=False,
    )

    return Attack(goal, checks, word_swap, genetic_search)
def TextFoolerJin2019Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    """Build the TextFooler attack with tightened constraints.

    Jin, D., Jin, Z., Zhou, J.T., & Szolovits, P. (2019).
    Is BERT Really Robust? Natural Language Attack on Text
    Classification and Entailment.
    https://arxiv.org/abs/1907.11932

    Constraints adjusted from the paper to align with human evaluation.

    Args:
        model: Victim model; passed unchanged to
            ``UntargetedClassification``.
        SE_thresh: Threshold given to the sentence-encoder constraint
            (cosine metric).
        sentence_encoder: ``'bert'`` selects the ``BERT`` constraint; any
            other value selects ``UniversalSentenceEncoder``.

    Returns:
        The configured ``Attack``.
    """
    # Candidate replacements: the 50 nearest neighbors of each word in the
    # counter-fitted PARAGRAM-SL999 embedding space.
    swap = WordSwapEmbedding(max_candidates=50)

    # Both encoder constraints take exactly the same configuration, so only
    # the class is selected here and it is instantiated once further down.
    encoder_cls = BERT if sentence_encoder == 'bert' else UniversalSentenceEncoder

    # The list order below is the order in which the constraints are
    # constructed and handed to Attack.
    checks = [
        # Never edit the same word twice, and never edit stopwords.
        RepeatModification(),
        StopwordModification(),
        # Minimum word-embedding cosine similarity of 0.9. (The released
        # TextFooler code effectively uses 0.5 although the paper claims
        # 0.7; this adjusted recipe is stricter than both.)
        WordEmbeddingDistance(min_cos_sim=0.9),
        # Sentence-encoder similarity: cosine metric, threshold SE_thresh,
        # window of 15, not compared against the original text, and short
        # texts are not skipped.
        encoder_cls(
            threshold=SE_thresh,
            metric='cosine',
            compare_against_original=False,
            window_size=15,
            skip_text_shorter_than_window=False,
        ),
        # Grammar checking.
        # NOTE(review): the 0 is presumably the number of newly introduced
        # grammar errors tolerated - confirm against LanguageTool.
        LanguageTool(0),
    ]

    # Untargeted attack.
    goal = UntargetedClassification(model)

    # Greedy word swap ordered by "Word Importance Ranking".
    search = GreedyWordSwapWIR()

    return Attack(goal, checks, swap, search)
def build(model):
    """Assemble the TextFooler attack recipe.

    Args:
        model: Victim model; passed unchanged to
            ``UntargetedClassification``.

    Returns:
        The ``Attack`` built from the goal function, constraints,
        transformation and search method configured below.
    """
    # Swap words with their 50 closest embedding nearest-neighbors.
    # Embedding: counter-fitted PARAGRAM-SL999 vectors.
    word_swap = WordSwapEmbedding(max_candidates=50)

    # Stopword list taken from the public TextFooler implementation.
    # fmt: off
    textfooler_stopwords = {
        "a", "about", "above", "across", "after", "afterwards", "again",
        "against", "ain", "all", "almost", "alone", "along", "already",
        "also", "although", "am", "among", "amongst", "an", "and", "another",
        "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
        "aren", "aren't", "around", "as", "at", "back", "been", "before",
        "beforehand", "behind", "being", "below", "beside", "besides",
        "between", "beyond", "both", "but", "by", "can", "cannot", "could",
        "couldn", "couldn't", "d", "didn", "didn't", "doesn", "doesn't",
        "don", "don't", "down", "due", "during", "either", "else",
        "elsewhere", "empty", "enough", "even", "ever", "everyone",
        "everything", "everywhere", "except", "first", "for", "former",
        "formerly", "from", "hadn", "hadn't", "hasn", "hasn't", "haven",
        "haven't", "he", "hence", "her", "here", "hereafter", "hereby",
        "herein", "hereupon", "hers", "herself", "him", "himself", "his",
        "how", "however", "hundred", "i", "if", "in", "indeed", "into", "is",
        "isn", "isn't", "it", "it's", "its", "itself", "just", "latter",
        "latterly", "least", "ll", "may", "me", "meanwhile", "mightn",
        "mightn't", "mine", "more", "moreover", "most", "mostly", "must",
        "mustn", "mustn't", "my", "myself", "namely", "needn", "needn't",
        "neither", "never", "nevertheless", "next", "no", "nobody", "none",
        "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of", "off",
        "on", "once", "one", "only", "onto", "or", "other", "others",
        "otherwise", "our", "ours", "ourselves", "out", "over", "per",
        "please", "s", "same", "shan", "shan't", "she", "she's", "should've",
        "shouldn", "shouldn't", "somehow", "something", "sometime",
        "somewhere", "such", "t", "than", "that", "that'll", "the", "their",
        "theirs", "them", "themselves", "then", "thence", "there",
        "thereafter", "thereby", "therefore", "therein", "thereupon",
        "these", "they", "this", "those", "through", "throughout", "thru",
        "thus", "to", "too", "toward", "towards", "under", "unless", "until",
        "up", "upon", "used", "ve", "was", "wasn", "wasn't", "we", "were",
        "weren", "weren't", "what", "whatever", "when", "whence", "whenever",
        "where", "whereafter", "whereas", "whereby", "wherein", "whereupon",
        "wherever", "whether", "which", "while", "whither", "who", "whoever",
        "whole", "whom", "whose", "why", "with", "within", "without", "won",
        "won't", "would", "wouldn", "wouldn't", "y", "yet", "you", "you'd",
        "you'll", "you're", "you've", "your", "yours", "yourself",
        "yourselves",
    }
    # fmt: on

    # The list order below is the order in which the constraints are
    # constructed and handed to Attack.
    checks = [
        # Never edit the same word twice, and never edit the TextFooler
        # stopwords.
        RepeatModification(),
        StopwordModification(stopwords=textfooler_stopwords),
        # During entailment only the hypothesis should be edited; the
        # premise is kept the same.
        InputColumnModification(["premise", "hypothesis"], {"premise"}),
        # Minimum word-embedding cosine similarity of 0.5. (The paper claims
        # 0.7, but analysis of the released code and some empirical results
        # show that it is 0.5.)
        WordEmbeddingDistance(min_cos_sim=0.5),
        # Only replace words with the same part of speech (or nouns with
        # verbs).
        PartOfSpeech(allow_verb_noun_swap=True),
        # Universal Sentence Encoder with a minimum angular similarity of
        # ε = 0.5. The TextFooler code forgets to divide the angle between
        # the two embeddings by pi, so if the original threshold was
        # 1 - sim >= 0.5, the equivalent threshold here is
        # 1 - (0.5) / pi = 0.840845057.
        UniversalSentenceEncoder(
            threshold=0.840845057,
            metric="angular",
            compare_against_original=False,
            window_size=15,
            skip_text_shorter_than_window=True,
        ),
    ]

    # Goal: untargeted classification.
    goal = UntargetedClassification(model)

    # Greedy word swap ordered by "Word Importance Ranking", using the
    # "delete" ranking method.
    search = GreedyWordSwapWIR(wir_method="delete")

    return Attack(goal, checks, word_swap, search)