def build(model, max_num_word_swaps=1): # a combination of 4 different character-based transforms # ignore the first and last letter of each word, as in the paper transformation = CompositeTransformation([ WordSwapNeighboringCharacterSwap(random_one=False, skip_first_char=True, skip_last_char=True), WordSwapRandomCharacterDeletion(random_one=False, skip_first_char=True, skip_last_char=True), WordSwapRandomCharacterInsertion(random_one=False, skip_first_char=True, skip_last_char=True), WordSwapQWERTY(random_one=False, skip_first_char=True, skip_last_char=True), ]) # only edit words of length >= 4, edit max_num_word_swaps words. # note that we also are not editing the same word twice, so # max_num_word_swaps is really the max number of character # changes that can be made. The paper looks at 1 and 2 char attacks. constraints = [ MinWordLength(min_length=4), StopwordModification(), MaxWordsPerturbed(max_num_words=max_num_word_swaps), RepeatModification(), ] # untargeted attack goal_function = UntargetedClassification(model) search_method = GreedySearch() return Attack(goal_function, constraints, transformation, search_method)
def build(model, ensemble: bool = False): # [from correspondence with the author] # Candidate size K is set to 48 for all data-sets. transformation = WordSwapMaskedLM(method="bert-attack", max_candidates=48) # # Don't modify the same word twice or stopwords. # constraints = [RepeatModification(), StopwordModification()] # "We only take ε percent of the most important words since we tend to keep # perturbations minimum." # # [from correspondence with the author] # "Word percentage allowed to change is set to 0.4 for most data-sets, this # parameter is trivial since most attacks only need a few changes. This # epsilon is only used to avoid too much queries on those very hard samples." constraints.append(MaxWordsPerturbed(max_percent=0.4)) # "As used in TextFooler (Jin et al., 2019), we also use Universal Sentence # Encoder (Cer et al., 2018) to measure the semantic consistency between the # adversarial sample and the original sequence. To balance between semantic # preservation and attack success rate, we set up a threshold of semantic # similarity score to filter the less similar examples." # # [from correspondence with author] # "Over the full texts, after generating all the adversarial samples, we filter # out low USE score samples. Thus the success rate is lower but the USE score # can be higher. (actually USE score is not a golden metric, so we simply # measure the USE score over the final texts for a comparison with TextFooler). # For datasets like IMDB, we set a higher threshold between 0.4-0.7; for # datasets like MNLI, we set threshold between 0-0.2." # # Since the threshold in the real world can't be determined from the training # data, the TextAttack implementation uses a fixed threshold - determined to # be 0.2 to be most fair. use_constraint = UniversalSentenceEncoder( threshold=0.2, metric="cosine", compare_against_original=True, window_size=None, ) constraints.append(use_constraint) # # Goal is untargeted classification. # goal_function = UntargetedClassification(model) # # "We first select the words in the sequence which have a high significance # influence on the final output logit. Let S = [w0, ··· , wi ··· ] denote # the input sentence, and oy(S) denote the logit output by the target model # for correct label y, the importance score Iwi is defined as # Iwi = oy(S) − oy(S\wi), where S\wi = [w0, ··· , wi−1, [MASK], wi+1, ···] # is the sentence after replacing wi with [MASK]. Then we rank all the words # according to the ranking score Iwi in descending order to create word list # L." search_method = GreedyWordSwapWIR(wir_method="unk", ensemble=ensemble) return Attack(goal_function, constraints, transformation, search_method)
def GeneticAlgorithmAlzantot2018(model): """Alzantot, M., Sharma, Y., Elgohary, A., Ho, B., Srivastava, M.B., & Chang, K. (2018). Generating Natural Language Adversarial Examples. https://arxiv.org/abs/1804.07998 """ # # Swap words with their embedding nearest-neighbors. # # Embedding: Counter-fitted Paragram Embeddings. # # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5" # transformation = WordSwapEmbedding(max_candidates=8) # # Don't modify the same word twice or stopwords # constraints = [RepeatModification(), StopwordModification()] # # During entailment, we should only edit the hypothesis - keep the premise # the same. # input_column_modification = InputColumnModification( ["premise", "hypothesis"], {"premise"}) constraints.append(input_column_modification) # # Maximum words perturbed percentage of 20% # constraints.append(MaxWordsPerturbed(max_percent=0.2)) # # Maximum word embedding euclidean distance of 0.5. # constraints.append( WordEmbeddingDistance(max_mse_dist=0.5, compare_against_original=False)) # # Language Model # constraints.append( Google1BillionWordsLanguageModel(top_n_per_index=4, compare_against_original=False)) # # Goal is untargeted classification # goal_function = UntargetedClassification(model) # # Perform word substitution with a genetic algorithm. # search_method = GeneticAlgorithm(pop_size=60, max_iters=20, post_crossover_check=False) return Attack(goal_function, constraints, transformation, search_method)
def build(model): # # Swap words with their embedding nearest-neighbors. # # Embedding: Counter-fitted Paragram Embeddings. # # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5" # transformation = WordSwapEmbedding(max_candidates=8) # # Don't modify the same word twice or stopwords # constraints = [RepeatModification(), StopwordModification()] # # During entailment, we should only edit the hypothesis - keep the premise # the same. # input_column_modification = InputColumnModification( ["premise", "hypothesis"], {"premise"}) constraints.append(input_column_modification) # # Maximum words perturbed percentage of 20% # constraints.append(MaxWordsPerturbed(max_percent=0.2)) # # Maximum word embedding euclidean distance of 0.5. # constraints.append( WordEmbeddingDistance(max_mse_dist=0.5, compare_against_original=False)) # # Language Model # # constraints.append( # Google1BillionWordsLanguageModel( # top_n_per_index=4, compare_against_original=False # ) # ) # # Goal is untargeted classification # goal_function = UntargetedClassification(model) # # Perform word substitution with a genetic algorithm. # search_method = AlzantotGeneticAlgorithm(pop_size=60, max_iters=20, post_crossover_check=False) return Attack(goal_function, constraints, transformation, search_method)
def Kuleshov2017(model): """ Kuleshov, V. et al. Generating Natural Language Adversarial Examples. https://openreview.net/pdf?id=r1QZ3zbAZ. """ # # "Specifically, in all experiments, we used a target of τ = 0.7, # a neighborhood size of N = 15, and parameters λ_1 = 0.2 and δ = 0.5; we set # the syntactic bound to λ_2 = 2 nats for sentiment analysis" # # Word swap with top-15 counter-fitted embedding neighbors. # transformation = WordSwapEmbedding(max_candidates=15) # # Don't modify the same word twice or stopwords # constraints = [RepeatModification(), StopwordModification()] # # Maximum of 50% of words perturbed (δ in the paper). # constraints.append(MaxWordsPerturbed(max_percent=0.5)) # # Maximum thought vector Euclidean distance of λ_1 = 0.2. (eq. 4) # constraints.append( ThoughtVector(embedding_type='paragramcf', threshold=0.2, metric='max_euclidean')) # # # Maximum language model log-probability difference of λ_2 = 2. (eq. 5) # constraints.append(GPT2(max_log_prob_diff=2.0)) # # Goal is untargeted classification: reduce original probability score # to below τ = 0.7 (Algorithm 1). # goal_function = UntargetedClassification(model, target_max_score=0.7) # # Perform word substitution with a genetic algorithm. # search_method = GreedySearch() return Attack(goal_function, constraints, transformation, search_method)
def Pruthi2019(model, max_num_word_swaps=1): """ An implementation of the attack used in "Combating Adversarial Misspellings with Robust Word Recognition", Pruthi et al., 2019. This attack focuses on a small number of character-level changes that simulate common typos. It combines: - Swapping neighboring characters - Deleting characters - Inserting characters - Swapping characters for adjacent keys on a QWERTY keyboard. https://arxiv.org/abs/1905.11268 :param model: Model to attack. :param max_num_word_swaps: Maximum number of modifications to allow. """ # a combination of 4 different character-based transforms # ignore the first and last letter of each word, as in the paper transformation = CompositeTransformation( [ WordSwapNeighboringCharacterSwap( random_one=False, skip_first_char=True, skip_last_char=True ), WordSwapRandomCharacterDeletion( random_one=False, skip_first_char=True, skip_last_char=True ), WordSwapRandomCharacterInsertion( random_one=False, skip_first_char=True, skip_last_char=True ), WordSwapQWERTY(random_one=False, skip_first_char=True, skip_last_char=True), ] ) # only edit words of length >= 4, edit max_num_word_swaps words. # note that we also are not editing the same word twice, so # max_num_word_swaps is really the max number of character # changes that can be made. The paper looks at 1 and 2 char attacks. constraints = [ MinWordLength(min_length=4), StopwordModification(), MaxWordsPerturbed(max_num_words=max_num_word_swaps), RepeatModification(), ] # untargeted attack goal_function = UntargetedClassification(model) search_method = GreedySearch() return Attack(goal_function, constraints, transformation, search_method)
def HotFlipEbrahimi2017(model): """ Ebrahimi, J. et al. (2017) HotFlip: White-Box Adversarial Examples for Text Classification https://arxiv.org/abs/1712.06751 This is a reproduction of the HotFlip word-level attack (section 5 of the paper). """ # # "HotFlip ... uses the gradient with respect to a one-hot input # representation to efficiently estimate which individual change has the # highest estimated loss." transformation = WordSwapGradientBased(model, top_n=1) # # Don't modify the same word twice or stopwords # constraints = [RepeatModification(), StopwordModification()] # # 0. "We were able to create only 41 examples (2% of the correctly- # classified instances of the SST test set) with one or two flips." # constraints.append(MaxWordsPerturbed(max_num_words=2)) # # 1. "The cosine similarity between the embedding of words is bigger than a # threshold (0.8)." # constraints.append(WordEmbeddingDistance(min_cos_sim=0.8)) # # 2. "The two words have the same part-of-speech." # constraints.append(PartOfSpeech()) # # Goal is untargeted classification # goal_function = UntargetedClassification(model) # # "HotFlip ... uses a beam search to find a set of manipulations that work # well together to confuse a classifier ... The adversary uses a beam size # of 10." # search_method = BeamSearch(beam_width=10) return Attack(goal_function, constraints, transformation, search_method)
def build_attack(model_wrapper, target_class=-1): """ Same as bert-attack except: - it is TargetedClassification instead of Untargeted when target_class != -1 - using "bae" instead of "bert-attack" because of bert-attack's problem for subtokens Modified from https://github.com/QData/TextAttack/blob/36dfce6bdab933bdeed3a2093ae411e93018ebbf/textattack/attack_recipes/bert_attack_li_2020.py """ # transformation = WordSwapMaskedLM(method="bert-attack", max_candidates=48) transformation = WordSwapMaskedLM(method="bae", max_candidates=100) constraints = [RepeatModification(), StopwordModification()] constraints.append(MaxWordsPerturbed(max_percent=0.4)) use_constraint = UniversalSentenceEncoder( threshold=0.2, metric="cosine", compare_against_original=True, window_size=None, ) constraints.append(use_constraint) if target_class == -1: goal_function = UntargetedClassification(model_wrapper) else: # We modify the goal goal_function = TargetedClassification(model_wrapper, target_class=target_class) search_method = GreedyWordSwapWIR(wir_method="unk") return Attack(goal_function, constraints, transformation, search_method) # def build_attack_2(model_wrapper, target_class): # """ # Same as HotFlipEbrahimi2017 attack except: # - it is TargetedClassification instead of Untargeted # """ # transformation = WordSwapGradientBased(model_wrapper, top_n=1) # constraints = [RepeatModification(), StopwordModification()] # constraints.append(MaxWordsPerturbed(max_num_words=2)) # constraints.append(WordEmbeddingDistance(min_cos_sim=0.8)) # constraints.append(PartOfSpeech()) # goal_function = TargetedClassification(model_wrapper) # search_method = BeamSearch(beam_width=10) # return Attack(goal_function, constraints, transformation, search_method)
def IGAWang2019(model): """ Xiaosen Wang, Hao Jin, Kun He (2019). Natural Language Adversarial Attack and Defense in Word Level. http://arxiv.org/abs/1909.06723 """ # # Swap words with their embedding nearest-neighbors. # Embedding: Counter-fitted Paragram Embeddings. # Fix the hyperparameter value to N = Unrestricted (50)." # transformation = WordSwapEmbedding(max_candidates=50) # # Don't modify the stopwords # constraints = [StopwordModification()] # # Maximum words perturbed percentage of 20% # constraints.append(MaxWordsPerturbed(max_percent=0.2)) # # Maximum word embedding euclidean distance δ of 0.5. # constraints.append( WordEmbeddingDistance(max_mse_dist=0.5, compare_against_original=False)) # # Goal is untargeted classification # goal_function = UntargetedClassification(model) # # Perform word substitution with an improved genetic algorithm. # Fix the hyperparameter values to S = 60, M = 20, λ = 5." # search_method = GeneticAlgorithm( pop_size=60, max_iters=20, improved_genetic_algorithm=True, max_replace_times_per_index=5, post_crossover_check=False, ) return Attack(goal_function, constraints, transformation, search_method)
def Alzantot2018(model): """ Alzantot, M., Sharma, Y., Elgohary, A., Ho, B., Srivastava, M.B., & Chang, K. (2018). Generating Natural Language Adversarial Examples. https://arxiv.org/abs/1801.00554 """ # # Swap words with their embedding nearest-neighbors. # # Embedding: Counter-fitted Paragram Embeddings. # # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5" # transformation = WordSwapEmbedding(max_candidates=8) # # Don't modify the same word twice or stopwords # constraints = [RepeatModification(), StopwordModification()] # # Maximum words perturbed percentage of 20% # constraints.append(MaxWordsPerturbed(max_percent=0.2)) # # Maximum word embedding euclidean distance of 0.5. # constraints.append(WordEmbeddingDistance(max_mse_dist=0.5)) # # Language Model # constraints.append(Google1BillionWordsLanguageModel(top_n_per_index=4)) # # Goal is untargeted classification # goal_function = UntargetedClassification(model) # # Perform word substitution with a genetic algorithm. # search_method = GeneticAlgorithm(pop_size=60, max_iters=20) return Attack(goal_function, constraints, transformation, search_method)
def build(model_wrapper): # # Swap words with their embedding nearest-neighbors. # Embedding: Counter-fitted Paragram Embeddings. # Fix the hyperparameter value to N = Unrestricted (50)." # transformation = WordSwapEmbedding(max_candidates=50) # # Don't modify the stopwords # constraints = [StopwordModification()] # # Maximum words perturbed percentage of 20% # constraints.append(MaxWordsPerturbed(max_percent=0.2)) # # Maximum word embedding euclidean distance δ of 0.5. # constraints.append( WordEmbeddingDistance(max_mse_dist=0.5, compare_against_original=False)) # # Goal is untargeted classification # goal_function = UntargetedClassification(model_wrapper) # # Perform word substitution with an improved genetic algorithm. # Fix the hyperparameter values to S = 60, M = 20, λ = 5." # search_method = ImprovedGeneticAlgorithm( pop_size=60, max_iters=20, max_replace_times_per_index=5, post_crossover_check=False, ) return Attack(goal_function, constraints, transformation, search_method)
def build(model): # # Section 5: Experiments # # We base our sets of allowed word substitutions S(x, i) on the # substitutions allowed by Alzantot et al. (2018). They demonstrated that # their substitutions lead to adversarial examples that are qualitatively # similar to the original input and retain the original label, as judged # by humans. Alzantot et al. (2018) define the neighbors N(w) of a word w # as the n = 8 nearest neighbors of w in a “counter-fitted” word vector # space where antonyms are far apart (Mrksiˇ c´ et al., 2016). The # neighbors must also lie within some Euclidean distance threshold. They # also use a language model constraint to avoid nonsensical perturbations: # they allow substituting xi with x˜i ∈ N(xi) if and only if it does not # decrease the log-likelihood of the text under a pre-trained language # model by more than some threshold. # # We make three modifications to this approach: # # First, in Alzantot et al. (2018), the adversary # applies substitutions one at a time, and the # neighborhoods and language model scores are computed. # Equation (4) must be applied before the model # can combine information from multiple words, but it can # be delayed until after processing each word independently. # Note that the model itself classifies using a different # set of pre-trained word vectors; the counter-fitted vectors # are only used to define the set of allowed substitution words. # relative to the current altered version of the input. # This results in a hard-to-define attack surface, as # changing one word can allow or disallow changes # to other words. It also requires recomputing # language model scores at each iteration of the genetic # attack, which is inefficient. Moreover, the same # word can be substituted multiple times, leading # to semantic drift. We define allowed substitutions # relative to the original sentence x, and disallow # repeated substitutions. # # Second, we use a faster language model that allows us to query # longer contexts; Alzantot et al. (2018) use a slower language # model and could only query it with short contexts. # Finally, we use the language model constraint only # at test time; the model is trained against all perturbations in N(w). This encourages the model to be # robust to a larger space of perturbations, instead of # specializing for the particular choice of language # model. See Appendix A.3 for further details. [This is a model-specific # adjustment, so does not affect the attack recipe.] # # Appendix A.3: # # In Alzantot et al. (2018), the adversary applies replacements one at a # time, and the neighborhoods and language model scores are computed # relative to the current altered version of the input. This results in a # hard-to-define attack surface, as the same word can be replaced many # times, leading to semantic drift. We instead pre-compute the allowed # substitutions S(x, i) at index i based on the original x. We define # S(x, i) as the set of x_i ∈ N(x_i) such that where probabilities are # assigned by a pre-trained language model, and the window radius W and # threshold δ are hyperparameters. We use W = 6 and δ = 5. # # # Swap words with their embedding nearest-neighbors. # # Embedding: Counter-fitted Paragram Embeddings. # # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5" # transformation = WordSwapEmbedding(max_candidates=8) # # Don't modify the same word twice or stopwords # constraints = [RepeatModification(), StopwordModification()] # # Maximum words perturbed percentage of 20% # constraints.append(MaxWordsPerturbed(max_percent=0.2)) # # Maximum word embedding euclidean distance of 0.5. # constraints.append(WordEmbeddingDistance(max_mse_dist=0.5)) # # Language Model # # # constraints.append( LearningToWriteLanguageModel(window_size=6, max_log_prob_diff=5.0, compare_against_original=True)) # constraints.append(LearningToWriteLanguageModel(window_size=5)) # # Goal is untargeted classification # goal_function = UntargetedClassification(model) # # Perform word substitution with a genetic algorithm. # search_method = AlzantotGeneticAlgorithm(pop_size=60, max_iters=20, post_crossover_check=False) return Attack(goal_function, constraints, transformation, search_method)
def BERTAttackLi2020(model): """ Li, L.., Ma, R., Guo, Q., Xiangyang, X., Xipeng, Q. (2020). BERT-ATTACK: Adversarial Attack Against BERT Using BERT https://arxiv.org/abs/2004.09984 This is "attack mode" 1 from the paper, BAE-R, word replacement. """ from textattack.shared.utils import logger logger.warn( "WARNING: This BERT-Attack implementation is based off of a" " preliminary draft of the paper, which lacked source code and" " did not include any hyperparameters. Attack reuslts are likely to" " change." ) # [from correspondence with the author] # Candidate size K is set to 48 for all data-sets. transformation = WordSwapMaskedLM(method="bert-attack", max_candidates=48) # # Don't modify the same word twice or stopwords. # constraints = [RepeatModification(), StopwordModification()] # "We only take ε percent of the most important words since we tend to keep # perturbations minimum." # # [from correspondence with the author] # "Word percentage allowed to change is set to 0.4 for most data-sets, this # parameter is trivial since most attacks only need a few changes. This # epsilon is only used to avoid too much queries on those very hard samples." constraints.append(MaxWordsPerturbed(max_percent=0.4)) # "As used in TextFooler (Jin et al., 2019), we also use Universal Sentence # Encoder (Cer et al., 2018) to measure the semantic consistency between the # adversarial sample and the original sequence. To balance between semantic # preservation and attack success rate, we set up a threshold of semantic # similarity score to filter the less similar examples." # # [from correspondence with author] # "Over the full texts, after generating all the adversarial samples, we filter # out low USE score samples. Thus the success rate is lower but the USE score # can be higher. (actually USE score is not a golden metric, so we simply # measure the USE score over the final texts for a comparison with TextFooler). # For datasets like IMDB, we set a higher threshold between 0.4-0.7; for # datasets like MNLI, we set threshold between 0-0.2." # # Since the threshold in the real world can't be determined from the training # data, the TextAttack implementation uses a fixed threshold - determined to # be 0.2 to be most fair. use_constraint = UniversalSentenceEncoder( threshold=0.2, metric="cosine", compare_with_original=True, window_size=None, ) constraints.append(use_constraint) # # Goal is untargeted classification. # goal_function = UntargetedClassification(model) # # "We first select the words in the sequence which have a high significance # influence on the final output logit. Let S = [w0, ··· , wi ··· ] denote # the input sentence, and oy(S) denote the logit output by the target model # for correct label y, the importance score Iwi is defined as # Iwi = oy(S) − oy(S\wi), where S\wi = [w0, ··· , wi−1, [MASK], wi+1, ···] # is the sentence after replacing wi with [MASK]. Then we rank all the words # according to the ranking score Iwi in descending order to create word list # L." search_method = GreedyWordSwapWIR(wir_method="unk") return Attack(goal_function, constraints, transformation, search_method)
def main(): parser = argparse.ArgumentParser() parser.add_argument("--num_examples", default=3000, type=int) #50485 parser.add_argument("--model", default="hfl/chinese-roberta-wwm-ext", type=str) parser.add_argument("--num_labels", default=3, type=int) parser.add_argument("--cuda", default=0, type=int) parser.add_argument("--tokenizer", default="hfl/chinese-roberta-wwm-ext", type=str) parser.add_argument( "--transformation", type=str, required=False, default="word-swap-embedding", help= 'The transformation to apply. Usage: "--transformation {transformation}:{arg_1}={value_1},{arg_3}={value_3}". Choices: ', ) # add_model_args(parser) # add_dataset_args(parser) parser.add_argument( "--constraints", type=str, required=False, nargs="*", default=["repeat", "stopword"], help= 'Constraints to add to the attack. Usage: "--constraints {constraint}:{arg_1}={value_1},{arg_3}={value_3}". Choices: ', ) parser.add_argument( "--log-to-txt", "-l", nargs="?", default=None, const="", type=str, help= "Save attack logs to <install-dir>/outputs/~ by default; Include '/' at the end of argument to save " "output to specified directory in default naming convention; otherwise enter argument to specify " "file name", ) parser.add_argument( "--log-to-csv", nargs="?", default= "/home/guest/r09944010/2020MLSECURITY/final/ml-security-proj/attack/OCNLI/roberta/", const="", type=str, help= "Save attack logs to <install-dir>/outputs/~ by default; Include '/' at the end of argument to save " "output to specified directory in default naming convention; otherwise enter argument to specify " "file name", ) parser.add_argument( "--csv-style", default=None, const="fancy", nargs="?", type=str, help="Use --csv-style plain to remove [[]] around words", ) parser.add_argument("--enable-visdom", action="store_true", help="Enable logging to visdom.") parser.add_argument( "--enable-wandb", action="store_true", help="Enable logging to Weights & Biases.", ) parser.add_argument("--disable-stdout", action="store_true", help="Disable logging to stdout") parser.add_argument( "--interactive", action="store_true", default=False, help="Whether to run attacks interactively.", ) parser.add_argument( "--attack-n", action="store_true", default=False, help= "Whether to run attack until `n` examples have been attacked (not skipped).", ) parser.add_argument( "--parallel", action="store_true", default=False, help="Run attack using multiple GPUs.", ) # goal_function_choices = ", ".join(GOAL_FUNCTION_CLASS_NAMES.keys()) parser.add_argument( "--goal-function", "-g", default="untargeted-classification", # help=f"The goal function to use. choices: {goal_function_choices}", ) def str_to_int(s): return sum((ord(c) for c in s)) parser.add_argument("--random-seed", default=str_to_int("TEXTATTACK"), type=int) parser.add_argument( "--checkpoint-dir", required=False, type=str, default=None, help="The directory to save checkpoint files.", ) parser.add_argument( "--checkpoint-interval", required=False, type=int, help= "If set, checkpoint will be saved after attacking every N examples. If not set, no checkpoints will be saved.", ) parser.add_argument( "--query-budget", "-q", type=int, default=float("inf"), help= "The maximum number of model queries allowed per example attacked.", ) parser.add_argument( "--model-batch-size", type=int, default=26, help="The batch size for making calls to the model.", ) parser.add_argument( "--model-cache-size", type=int, default=2**18, help= "The maximum number of items to keep in the model results cache at once.", ) parser.add_argument( "--constraint-cache-size", type=int, default=2**18, help= "The maximum number of items to keep in the constraints cache at once.", ) attack_group = parser.add_mutually_exclusive_group(required=False) attack_group.add_argument( "--search", "--search-method", "-s", type=str, required=False, default="greedy-word-wir", # help=f"The search method to use. choices: {search_choices}", ) attack_group.add_argument( "--recipe", "--attack-recipe", "-r", type=str, required=False, default=None, # help="full attack recipe (overrides provided goal function, transformation & constraints)", # choices=ATTACK_RECIPE_NAMES.keys(), ) attack_group.add_argument( "--attack-from-file", type=str, required=False, default=None, help= "attack to load from file (overrides provided goal function, transformation & constraints)", ) args = parser.parse_args() # dataset = load_dataset() dataset = load_ocnliDataset(split="dev") dataset = HuggingFaceDataset(dataset) num_remaining_attacks = args.num_examples worklist = deque(range(0, args.num_examples)) worklist_tail = worklist[-1] config = BertConfig.from_pretrained( "hfl/chinese-macbert-base") # "hfl/chinese-macbert-base" config.output_attentions = False config.output_token_type_ids = False # config.max_length = 30 tokenizer = BertTokenizer.from_pretrained("hfl/chinese-macbert-base", config=config) config = AutoConfig.from_pretrained( './models/roberta/chinese-roberta-wwm-ext-OCNLI-2021-01-05-23-46-02-975289', num_labels=3) model = AutoModelForSequenceClassification.from_pretrained( './models/roberta/chinese-roberta-wwm-ext-OCNLI-2021-01-05-23-46-02-975289', config=config, ) model_wrapper = HuggingFaceModelWrapper(model, tokenizer, batch_size=28) # goal function goal_function = UntargetedClassification(model_wrapper) # constraints # stopwords = set( # ["个", "关于", "之上", "across", "之后", "afterwards", "再次", "against", "ain", "全部", "几乎", "单独", "along", "早已", "也", "虽然", "是", "among", "amongst", "一个", "和", "其他", "任何", "anyhow", "任何人", "anything", "anyway", "anywhere", "are", "aren", "没有", "around", "as", "at", "后", "been", "之前", "beforehand", "behind", "being", "below", "beside", "besides", "之間", "beyond", "皆是", "但", "by", "可以", "不可以", "是", "不是", "couldn't", "d", "didn", "didn't", "doesn", "doesn't", "don", "don't", "down", "due", "之間", "either", "之外", "elsewhere", "空", "足夠", "甚至", "ever", "任何人", "everything", "everywhere", "except", "first", "for", "former", "formerly", "from", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "i", "if", "in", "indeed", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "latter", "latterly", "least", "ll", "may", "me", "meanwhile", "mightn", "mightn't", "mine", "more", "moreover", "most", "mostly", "must", "mustn", "mustn't", "my", "myself", "namely", "needn", "needn't", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of", "off", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "per", "please", "s", "same", "shan", "shan't", "she", "she's", "should've", "shouldn", "shouldn't", "somehow", "something", "sometime", "somewhere", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "this", "those", "through", "throughout", "thru", "thus", "to", "too", "toward", "towards", "under", "unless", "until", "up", "upon", "used", "ve", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "with", "within", "without", "won", "won't", "would", "wouldn", "wouldn't", "y", "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"] # ) constraints = [RepeatModification(), StopwordModification()] # constraints = [RepeatModification(), StopwordModification(stopwords=stopwords)] input_column_modification = InputColumnModification( ["premise", "hypothesis"], {"premise"}) constraints.append(input_column_modification) constraints.append(MaxWordsPerturbed(max_percent=0.2)) constraints.append( WordEmbeddingDistance(max_mse_dist=0.5, compare_against_original=False)) # constraints.append( # Google1BillionWordsLanguageModel( # top_n_per_index=4, compare_against_original=False # ) # ) # use_constraint = UniversalSentenceEncoder( # threshold=0.840845057, # metric="angular", # compare_against_original=False, # window_size=15, # skip_text_shorter_than_window=True, # ) # constraints.append(use_constraint) transformation = WordSwapEmbedding(max_candidates=8) # transformation = WordDeletion() # search methods # search_method = GreedyWordSwapWIR(wir_method="delete") search_method = AlzantotGeneticAlgorithm(pop_size=60, max_iters=20, post_crossover_check=False) start_time = time.time() textattack.shared.utils.set_seed(args.random_seed) # attack attack = Attack(goal_function, constraints, transformation, search_method) print(attack) attack_log_manager = parse_logger_from_args(args) pbar = tqdm.tqdm(total=num_remaining_attacks, smoothing=0) num_results = 0 num_failures = 0 num_successes = 0 for result in attack.attack_dataset(dataset, indices=worklist): attack_log_manager.log_result(result) if not args.disable_stdout: print("\n") if (not args.attack_n) or (not isinstance( result, textattack.attack_results.SkippedAttackResult)): pbar.update(1) else: # worklist_tail keeps track of highest idx that has been part of worklist # Used to get the next dataset element when attacking with `attack_n` = True. worklist_tail += 1 worklist.append(worklist_tail) num_results += 1 if (type(result) == textattack.attack_results.SuccessfulAttackResult or type(result) == textattack.attack_results.MaximizedAttackResult): num_successes += 1 if type(result) == textattack.attack_results.FailedAttackResult: num_failures += 1 pbar.set_description( "[Succeeded / Failed / Total] {} / {} / {}".format( num_successes, num_failures, num_results)) if (args.checkpoint_interval and len(attack_log_manager.results) % args.checkpoint_interval == 0): new_checkpoint = textattack.shared.Checkpoint( args, attack_log_manager, worklist, worklist_tail) new_checkpoint.save() attack_log_manager.flush() pbar.close() print() # Enable summary stdout if args.disable_stdout: attack_log_manager.enable_stdout() attack_log_manager.log_summary() attack_log_manager.flush() print() # finish_time = time.time() textattack.shared.logger.info(f"Attack time: {time.time()}s") attack_log_manager.results
def build(model, max_perturbed_percent, synonym_boolean): # "In this paper, we present a simple yet novel technique: BAE (BERT-based # Adversarial Examples), which uses a language model (LM) for token # replacement to best fit the overall context. We perturb an input sentence # by either replacing a token or inserting a new token in the sentence, by # means of masking a part of the input and using a LM to fill in the mask." # # We only consider the top K=50 synonyms from the MLM predictions. # # [from email correspondance with the author] # "When choosing the top-K candidates from the BERT masked LM, we filter out # the sub-words and only retain the whole words (by checking if they are # present in the GloVE vocabulary)" # transformation = WordSwapMaskedLM(method="bae", max_candidates=50, min_confidence=0.0) # # Don't modify the same word twice or stopwords. # constraints = [RepeatModification(), StopwordModification()] # For the R operations we add an additional check for # grammatical correctness of the generated adversarial example by filtering # out predicted tokens that do not form the same part of speech (POS) as the # original token t_i in the sentence. constraints.append(PartOfSpeech(allow_verb_noun_swap=True)) # "To ensure semantic similarity on introducing perturbations in the input # text, we filter the set of top-K masked tokens (K is a pre-defined # constant) predicted by BERT-MLM using a Universal Sentence Encoder (USE) # (Cer et al., 2018)-based sentence similarity scorer." # # "[We] set a threshold of 0.8 for the cosine similarity between USE-based # embeddings of the adversarial and input text." # # [from email correspondence with the author] # "For a fair comparison of the benefits of using a BERT-MLM in our paper, # we retained the majority of TextFooler's specifications. Thus we: # 1. Use the USE for comparison within a window of size 15 around the word # being replaced/inserted. # 2. Set the similarity score threshold to 0.1 for inputs shorter than the # window size (this translates roughly to almost always accepting the new text). # 3. Perform the USE similarity thresholding of 0.8 with respect to the text # just before the replacement/insertion and not the original text (For # example: at the 3rd R/I operation, we compute the USE score on a window # of size 15 of the text obtained after the first 2 R/I operations and not # the original text). # ... # To address point (3) from above, compare the USE with the original text # at each iteration instead of the current one (While doing this change # for the R-operation is trivial, doing it for the I-operation with the # window based USE comparison might be more involved)." # # Finally, since the BAE code is based on the TextFooler code, we need to # adjust the threshold to account for the missing / pi in the cosine # similarity comparison. So the final threshold is 1 - (1 - 0.8) / pi # = 1 - (0.2 / pi) = 0.936338023. use_constraint = UniversalSentenceEncoder( threshold=0.936338023, metric="cosine", compare_against_original=True, window_size=15, skip_text_shorter_than_window=True, ) constraints.append(use_constraint) # "We only take ε percent of the most important words since we tend to keep # perturbations minimum." if max_perturbed_percent != 1: constraints.append( MaxWordsPerturbed(max_percent=max_perturbed_percent)) if synonym_boolean: constraints.append(SynonymConstraint(False)) # # Goal is untargeted classification. # goal_function = UntargetedClassification(model) # # "We estimate the token importance Ii of each token # t_i ∈ S = [t1, . . . , tn], by deleting ti from S and computing the # decrease in probability of predicting the correct label y, similar # to (Jin et al., 2019). # # • "If there are multiple tokens can cause C to misclassify S when they # replace the mask, we choose the token which makes Sadv most similar to # the original S based on the USE score." # • "If no token causes misclassification, we choose the perturbation that # decreases the prediction probability P(C(Sadv)=y) the most." # search_method = GreedyWordSwapWIR(wir_method="delete") return BAEGarg2019(goal_function, constraints, transformation, search_method)