Example #1
    def __init__(self, **kwargs):
        from textattack.transformations import WordSwapEmbedding

        transformation = WordSwapEmbedding(max_candidates=50)
        from textattack.constraints.semantics import WordEmbeddingDistance

        constraints = DEFAULT_CONSTRAINTS + [WordEmbeddingDistance(min_cos_sim=0.8)]
        super().__init__(transformation, constraints=constraints, **kwargs)
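This fragment is the __init__ of a class-based recipe from an older TextAttack release; DEFAULT_CONSTRAINTS and the parent class come from the enclosing module and are not shown. Below is a minimal self-contained sketch of the same recipe using TextAttack's four-component Attack API, assuming the default constraints are the usual repeat/stopword pre-transformation pair; the choice of GreedyWordSwapWIR as the search method is also an assumption for illustration, since the fragment does not show one.

from textattack import Attack
from textattack.constraints.pre_transformation import (RepeatModification,
                                                       StopwordModification)
from textattack.constraints.semantics import WordEmbeddingDistance
from textattack.goal_functions import UntargetedClassification
from textattack.search_methods import GreedyWordSwapWIR
from textattack.transformations import WordSwapEmbedding


def build_attack(model_wrapper):
    # Swap words with their 50 nearest neighbors in the embedding space,
    # requiring a cosine similarity of at least 0.8 between swapped words.
    transformation = WordSwapEmbedding(max_candidates=50)
    constraints = [
        RepeatModification(),
        StopwordModification(),
        WordEmbeddingDistance(min_cos_sim=0.8),
    ]
    goal_function = UntargetedClassification(model_wrapper)
    return Attack(goal_function, constraints, transformation,
                  GreedyWordSwapWIR())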
Example #2
def GeneticAlgorithmAlzantot2018(model):
    """Alzantot, M., Sharma, Y., Elgohary, A., Ho, B., Srivastava, M.B., &
    Chang, K. (2018).

    Generating Natural Language Adversarial Examples.

    https://arxiv.org/abs/1804.07998
    """
    #
    # Swap words with their embedding nearest-neighbors.
    #
    # Embedding: Counter-fitted Paragram Embeddings.
    #
    # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5"
    #
    transformation = WordSwapEmbedding(max_candidates=8)
    #
    # Don't modify the same word twice or stopwords
    #
    constraints = [RepeatModification(), StopwordModification()]
    #
    # During entailment, we should only edit the hypothesis - keep the premise
    # the same.
    #
    input_column_modification = InputColumnModification(
        ["premise", "hypothesis"], {"premise"})
    constraints.append(input_column_modification)
    #
    # Maximum words perturbed percentage of 20%
    #
    constraints.append(MaxWordsPerturbed(max_percent=0.2))
    #
    # Maximum word embedding euclidean distance of 0.5.
    #
    constraints.append(
        WordEmbeddingDistance(max_mse_dist=0.5,
                              compare_against_original=False))
    #
    # Language Model
    #
    constraints.append(
        Google1BillionWordsLanguageModel(top_n_per_index=4,
                                         compare_against_original=False))
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Perform word substitution with a genetic algorithm.
    #
    search_method = GeneticAlgorithm(pop_size=60,
                                     max_iters=20,
                                     post_crossover_check=False)

    return Attack(goal_function, constraints, transformation, search_method)
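A hedged usage sketch for this recipe: model and dataset are assumed to be already constructed for the same TextAttack version as the snippet, and attack_dataset is the iteration API that Example #14 below also uses.

# Hypothetical usage: build the attack and run it on the first ten examples.
attack = GeneticAlgorithmAlzantot2018(model)
for result in attack.attack_dataset(dataset, indices=range(10)):
    print(result.__str__(color_method="ansi"))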
Example #3
    def build(model):
        #
        # Swap words with their embedding nearest-neighbors.
        #
        # Embedding: Counter-fitted Paragram Embeddings.
        #
        # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5"
        #
        transformation = WordSwapEmbedding(max_candidates=8)
        #
        # Don't modify the same word twice or stopwords
        #
        constraints = [RepeatModification(), StopwordModification()]
        #
        # During entailment, we should only edit the hypothesis - keep the premise
        # the same.
        #
        input_column_modification = InputColumnModification(
            ["premise", "hypothesis"], {"premise"})
        constraints.append(input_column_modification)
        #
        # Maximum words perturbed percentage of 20%
        #
        constraints.append(MaxWordsPerturbed(max_percent=0.2))
        #
        # Maximum word embedding euclidean distance of 0.5.
        #
        constraints.append(
            WordEmbeddingDistance(max_mse_dist=0.5,
                                  compare_against_original=False))
        #
        # Language Model
        #
        # constraints.append(
        #     Google1BillionWordsLanguageModel(
        #         top_n_per_index=4, compare_against_original=False
        #     )
        # )
        #
        # Goal is untargeted classification
        #
        goal_function = UntargetedClassification(model)
        #
        # Perform word substitution with a genetic algorithm.
        #
        search_method = AlzantotGeneticAlgorithm(pop_size=60,
                                                 max_iters=20,
                                                 post_crossover_check=False)

        return Attack(goal_function, constraints, transformation,
                      search_method)
Example #4
def TextFoolerJin2019Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    #
    # Swap words with their embedding nearest-neighbors. 
    #
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    # 50 nearest-neighbors with a cosine similarity of at least 0.5.
    # (The paper claims 0.7, but analysis of the code and some empirical
    # results show that it's definitely 0.5.)
    #
    transformation = WordSwapEmbedding(max_candidates=50, textfooler_stopwords=True)
    #
    # Minimum word embedding cosine similarity of 0.9.
    #
    constraints = []
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.9))
    #
    # Sentence encoder constraint (BERT or the Universal Sentence Encoder)
    # with a minimum cosine similarity of SE_thresh (0.98 by default).
    #
    if sentence_encoder == 'bert':
        se_constraint = BERT(threshold=SE_thresh,
                             metric='cosine',
                             compare_with_original=False,
                             window_size=15,
                             skip_text_shorter_than_window=False)
    else:
        se_constraint = UniversalSentenceEncoder(
            threshold=SE_thresh,
            metric='cosine',
            compare_with_original=False,
            window_size=15,
            skip_text_shorter_than_window=False)
    constraints.append(se_constraint)
    #
    # Do grammar checking
    #
    constraints.append(LanguageTool(0))
    
    #
    # Untargeted attack   
    #
    goal_function = UntargetedClassification(model)

    #
    # Greedily swap words with "Word Importance Ranking".
    #
    attack = GreedyWordSwapWIR(goal_function, transformation=transformation,
        constraints=constraints, max_depth=None)
    
    return attack
Example #5
def TextFoolerJin2019(model):
    #
    # Swap words with their embedding nearest-neighbors.
    #
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    # 50 nearest-neighbors with a cosine similarity of at least 0.5.
    # (The paper claims 0.7, but analysis of the code and some empirical
    # results show that it's definitely 0.5.)
    #
    transformation = WordSwapEmbedding(max_candidates=50,
                                       textfooler_stopwords=True)
    #
    # Minimum word embedding cosine similarity of 0.5.
    #
    constraints = []
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.5))
    #
    # Only replace words with the same part of speech (or nouns with verbs)
    #
    constraints.append(PartOfSpeech(allow_verb_noun_swap=True))
    #
    # Universal Sentence Encoder with a minimum angular similarity of ε = 0.7.
    #
    # In the TextFooler code, they forget to divide the angle between the two
    # embeddings by pi. So if the original threshold was that 1 - sim >= 0.7, the
    # new threshold is 1 - (0.3) / pi = 0.90445
    #
    use_constraint = UniversalSentenceEncoder(
        threshold=0.904458599,
        metric='angular',
        compare_with_original=False,
        window_size=15,
        skip_text_shorter_than_window=True)
    constraints.append(use_constraint)
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Greedily swap words with "Word Importance Ranking".
    #
    attack = GreedyWordSwapWIR(goal_function,
                               transformation=transformation,
                               constraints=constraints,
                               max_depth=None)

    return attack
Example #6
def HotFlipEbrahimi2017(model):
    """
        Ebrahimi, J. et al. (2017)
        
        HotFlip: White-Box Adversarial Examples for Text Classification
        
        https://arxiv.org/abs/1712.06751
        
        This is a reproduction of the HotFlip word-level attack (section 5 of the 
        paper).
    """
    #
    # "HotFlip ... uses the gradient with respect to a one-hot input
    # representation to efficiently estimate which individual change has the
    # highest estimated loss."
    transformation = WordSwapGradientBased(model, top_n=1)
    #
    # Don't modify the same word twice or stopwords
    #
    constraints = [RepeatModification(), StopwordModification()]
    #
    # 0. "We were able to create only 41 examples (2% of the correctly-
    # classified instances of the SST test set) with one or two flips."
    #
    constraints.append(MaxWordsPerturbed(max_num_words=2))
    #
    # 1. "The cosine similarity between the embedding of words is bigger than a
    #   threshold (0.8)."
    #
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))
    #
    # 2. "The two words have the same part-of-speech."
    #
    constraints.append(PartOfSpeech())
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # "HotFlip ... uses a beam search to find a set of manipulations that work
    # well together to confuse a classifier ... The adversary uses a beam size
    # of 10."
    #
    search_method = BeamSearch(beam_width=10)

    return Attack(goal_function, constraints, transformation, search_method)
Example #7
    def build(model_wrapper, mlm=False):
        """Build attack recipe.

        Args:
            model_wrapper (:class:`~textattack.models.wrappers.ModelWrapper`):
                Model wrapper containing both the model and the tokenizer.
            mlm (:obj:`bool`, `optional`, defaults to :obj:`False`):
                If :obj:`True`, load `A2T-MLM` attack. Otherwise, load regular `A2T` attack.

        Returns:
            :class:`~textattack.Attack`: A2T attack.
        """
        constraints = [RepeatModification(), StopwordModification()]
        input_column_modification = InputColumnModification(
            ["premise", "hypothesis"], {"premise"})
        constraints.append(input_column_modification)
        constraints.append(PartOfSpeech(allow_verb_noun_swap=False))
        constraints.append(MaxModificationRate(max_rate=0.1, min_threshold=4))
        sent_encoder = BERT(model_name="stsb-distilbert-base",
                            threshold=0.9,
                            metric="cosine")
        constraints.append(sent_encoder)

        if mlm:
            transformation = WordSwapMaskedLM(method="bae",
                                              max_candidates=20,
                                              min_confidence=0.0,
                                              batch_size=16)
        else:
            transformation = WordSwapEmbedding(max_candidates=20)
            constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))

        #
        # Goal is untargeted classification
        #
        goal_function = UntargetedClassification(model_wrapper,
                                                 model_batch_size=32)
        #
        # Greedily swap words with "Word Importance Ranking".
        #
        search_method = GreedyWordSwapWIR(wir_method="gradient")

        return Attack(goal_function, constraints, transformation,
                      search_method)
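In TextAttack this build() method lives on the A2TYoo2021 recipe class. A hedged wiring sketch follows; the HuggingFace checkpoint name is just an example, and the gradient-based word importance ranking needs a wrapper that exposes gradients, which HuggingFaceModelWrapper does.

import textattack
import transformers

# Wrap a white-box HuggingFace classifier so GreedyWordSwapWIR("gradient")
# can query gradients through the wrapper.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "textattack/bert-base-uncased-SST-2")
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "textattack/bert-base-uncased-SST-2")
model_wrapper = textattack.models.wrappers.HuggingFaceModelWrapper(
    model, tokenizer)
attack = textattack.attack_recipes.A2TYoo2021.build(model_wrapper)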
Example #8
def IGAWang2019(model):
    """
        Xiaosen Wang, Hao Jin, Kun He (2019). 
        
        Natural Language Adversarial Attack and Defense in Word Level. 
        
        http://arxiv.org/abs/1909.06723 
    """
    #
    # Swap words with their embedding nearest-neighbors.
    # Embedding: Counter-fitted Paragram Embeddings.
    # "Fix the hyperparameter value to N = Unrestricted (50)."
    #
    transformation = WordSwapEmbedding(max_candidates=50)
    #
    # Don't modify the stopwords
    #
    constraints = [StopwordModification()]
    #
    # Maximum words perturbed percentage of 20%
    #
    constraints.append(MaxWordsPerturbed(max_percent=0.2))
    #
    # Maximum word embedding euclidean distance δ of 0.5.
    #
    constraints.append(
        WordEmbeddingDistance(max_mse_dist=0.5,
                              compare_against_original=False))
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Perform word substitution with an improved genetic algorithm.
    # "Fix the hyperparameter values to S = 60, M = 20, λ = 5."
    #
    search_method = GeneticAlgorithm(
        pop_size=60,
        max_iters=20,
        improved_genetic_algorithm=True,
        max_replace_times_per_index=5,
        post_crossover_check=False,
    )

    return Attack(goal_function, constraints, transformation, search_method)
Example #9
def Alzantot2018Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    #
    # Swap words with their embedding nearest-neighbors. 
    #
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5"
    #
    transformation = WordSwapEmbedding(max_candidates=50, textfooler_stopwords=True)
    #
    # Minimum word embedding cosine similarity of 0.9.
    #
    constraints = []
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.9))
    #
    # Sentence encoder constraint (BERT or the Universal Sentence Encoder)
    # with a minimum cosine similarity of SE_thresh (0.98 by default).
    #
    if sentence_encoder == 'bert':
        se_constraint = BERT(threshold=SE_thresh,
                             metric='cosine',
                             compare_with_original=False,
                             window_size=15,
                             skip_text_shorter_than_window=False)
    else:
        se_constraint = UniversalSentenceEncoder(
            threshold=SE_thresh,
            metric='cosine',
            compare_with_original=False,
            window_size=15,
            skip_text_shorter_than_window=False)
    constraints.append(se_constraint)
    #
    # Do grammar checking
    #
    constraints.append(LanguageTool(0))
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Perform word substitution with a genetic algorithm.
    #
    attack = GeneticAlgorithm(goal_function, transformation=transformation, 
        constraints=constraints, pop_size=60, max_iters=20)
    return attack
Example #10
def Alzantot2018(model):
    """
        Alzantot, M., Sharma, Y., Elgohary, A., Ho, B., Srivastava, M.B., & Chang, K. (2018). 
        
        Generating Natural Language Adversarial Examples. 
        
        https://arxiv.org/abs/1804.07998
    """
    #
    # Swap words with their embedding nearest-neighbors.
    #
    # Embedding: Counter-fitted Paragram Embeddings.
    #
    # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5"
    #
    transformation = WordSwapEmbedding(max_candidates=8)
    #
    # Don't modify the same word twice or stopwords
    #
    constraints = [RepeatModification(), StopwordModification()]
    #
    # Maximum words perturbed percentage of 20%
    #
    constraints.append(MaxWordsPerturbed(max_percent=0.2))
    #
    # Maximum word embedding euclidean distance of 0.5.
    #
    constraints.append(WordEmbeddingDistance(max_mse_dist=0.5))
    #
    # Language Model
    #
    constraints.append(Google1BillionWordsLanguageModel(top_n_per_index=4))
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Perform word substitution with a genetic algorithm.
    #
    search_method = GeneticAlgorithm(pop_size=60, max_iters=20)

    return Attack(goal_function, constraints, transformation, search_method)
Example #11
def HotFlipEbrahimi2017(model):
    #
    # "HotFlip ... uses the gradient with respect to a one-hot input
    # representation to efficiently estimate which individual change has the
    # highest estimated loss."
    transformation = GradientBasedWordSwap(model,
                                           top_n=1,
                                           replace_stopwords=False)
    constraints = []
    #
    # 0. "We were able to create only 41 examples (2% of the correctly-
    # classified instances of the SST test set) with one or two flips."
    #
    constraints.append(WordsPerturbed(max_num_words=2))
    #
    # 1. "The cosine similarity between the embedding of words is bigger than a
    #   threshold (0.8)."
    #
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.8))
    #
    # 2. "The two words have the same part-of-speech."
    #
    constraints.append(PartOfSpeech())
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # "HotFlip ... uses a beam search to find a set of manipulations that work
    # well together to confuse a classifier ... The adversary uses a beam size
    # of 10."
    #
    attack = BeamSearch(goal_function,
                        constraints=constraints,
                        transformation=transformation,
                        beam_width=10)

    return attack
Example #12
    def build(model_wrapper):
        #
        # Swap words with their embedding nearest-neighbors.
        # Embedding: Counter-fitted Paragram Embeddings.
        # "Fix the hyperparameter value to N = Unrestricted (50)."
        #
        transformation = WordSwapEmbedding(max_candidates=50)
        #
        # Don't modify the stopwords
        #
        constraints = [StopwordModification()]
        #
        # Maximum words perturbed percentage of 20%
        #
        constraints.append(MaxWordsPerturbed(max_percent=0.2))
        #
        # Maximum word embedding euclidean distance δ of 0.5.
        #
        constraints.append(
            WordEmbeddingDistance(max_mse_dist=0.5,
                                  compare_against_original=False))
        #
        # Goal is untargeted classification
        #
        goal_function = UntargetedClassification(model_wrapper)
        #
        # Perform word substitution with an improved genetic algorithm.
        # "Fix the hyperparameter values to S = 60, M = 20, λ = 5."
        #
        search_method = ImprovedGeneticAlgorithm(
            pop_size=60,
            max_iters=20,
            max_replace_times_per_index=5,
            post_crossover_check=False,
        )

        return Attack(goal_function, constraints, transformation,
                      search_method)
Example #13
def Alzantot2018(model):
    #
    # Swap words with their embedding nearest-neighbors.
    #
    # Embedding: Counter-fitted Paragram Embeddings.
    #
    # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5"
    #
    transformation = WordSwapEmbedding(max_candidates=8)
    constraints = []
    #
    # Maximum words perturbed percentage of 20%
    #
    constraints.append(WordsPerturbed(max_percent=0.2))
    #
    # Maximum word embedding euclidean distance of 0.5.
    #
    constraints.append(WordEmbeddingDistance(max_mse_dist=0.5))
    #
    # Language Model
    #
    constraints.append(Google1BillionWordsLanguageModel(top_n_per_index=4))
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Perform word substitution with a genetic algorithm.
    #
    attack = GeneticAlgorithm(goal_function,
                              constraints=constraints,
                              transformation=transformation,
                              pop_size=60,
                              max_iters=20)

    return attack
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--num_examples", default=3000, type=int)  #50485
    parser.add_argument("--model",
                        default="hfl/chinese-roberta-wwm-ext",
                        type=str)
    parser.add_argument("--num_labels", default=3, type=int)
    parser.add_argument("--cuda", default=0, type=int)
    parser.add_argument("--tokenizer",
                        default="hfl/chinese-roberta-wwm-ext",
                        type=str)
    parser.add_argument(
        "--transformation",
        type=str,
        required=False,
        default="word-swap-embedding",
        help=
        'The transformation to apply. Usage: "--transformation {transformation}:{arg_1}={value_1},{arg_3}={value_3}". Choices: ',
    )

    # add_model_args(parser)
    # add_dataset_args(parser)

    parser.add_argument(
        "--constraints",
        type=str,
        required=False,
        nargs="*",
        default=["repeat", "stopword"],
        help=
        'Constraints to add to the attack. Usage: "--constraints {constraint}:{arg_1}={value_1},{arg_3}={value_3}". Choices: ',
    )

    parser.add_argument(
        "--log-to-txt",
        "-l",
        nargs="?",
        default=None,
        const="",
        type=str,
        help=
        "Save attack logs to <install-dir>/outputs/~ by default; Include '/' at the end of argument to save "
        "output to specified directory in default naming convention; otherwise enter argument to specify "
        "file name",
    )

    parser.add_argument(
        "--log-to-csv",
        nargs="?",
        default=
        "/home/guest/r09944010/2020MLSECURITY/final/ml-security-proj/attack/OCNLI/roberta/",
        const="",
        type=str,
        help=
        "Save attack logs to <install-dir>/outputs/~ by default; Include '/' at the end of argument to save "
        "output to specified directory in default naming convention; otherwise enter argument to specify "
        "file name",
    )

    parser.add_argument(
        "--csv-style",
        default=None,
        const="fancy",
        nargs="?",
        type=str,
        help="Use --csv-style plain to remove [[]] around words",
    )

    parser.add_argument("--enable-visdom",
                        action="store_true",
                        help="Enable logging to visdom.")

    parser.add_argument(
        "--enable-wandb",
        action="store_true",
        help="Enable logging to Weights & Biases.",
    )

    parser.add_argument("--disable-stdout",
                        action="store_true",
                        help="Disable logging to stdout")

    parser.add_argument(
        "--interactive",
        action="store_true",
        default=False,
        help="Whether to run attacks interactively.",
    )

    parser.add_argument(
        "--attack-n",
        action="store_true",
        default=False,
        help=
        "Whether to run attack until `n` examples have been attacked (not skipped).",
    )

    parser.add_argument(
        "--parallel",
        action="store_true",
        default=False,
        help="Run attack using multiple GPUs.",
    )

    # goal_function_choices = ", ".join(GOAL_FUNCTION_CLASS_NAMES.keys())
    parser.add_argument(
        "--goal-function",
        "-g",
        default="untargeted-classification",
        # help=f"The goal function to use. choices: {goal_function_choices}",
    )

    def str_to_int(s):
        return sum((ord(c) for c in s))

    parser.add_argument("--random-seed",
                        default=str_to_int("TEXTATTACK"),
                        type=int)

    parser.add_argument(
        "--checkpoint-dir",
        required=False,
        type=str,
        default=None,
        help="The directory to save checkpoint files.",
    )

    parser.add_argument(
        "--checkpoint-interval",
        required=False,
        type=int,
        help=
        "If set, checkpoint will be saved after attacking every N examples. If not set, no checkpoints will be saved.",
    )

    parser.add_argument(
        "--query-budget",
        "-q",
        type=int,
        default=float("inf"),
        help=
        "The maximum number of model queries allowed per example attacked.",
    )
    parser.add_argument(
        "--model-batch-size",
        type=int,
        default=26,
        help="The batch size for making calls to the model.",
    )
    parser.add_argument(
        "--model-cache-size",
        type=int,
        default=2**18,
        help=
        "The maximum number of items to keep in the model results cache at once.",
    )
    parser.add_argument(
        "--constraint-cache-size",
        type=int,
        default=2**18,
        help=
        "The maximum number of items to keep in the constraints cache at once.",
    )

    attack_group = parser.add_mutually_exclusive_group(required=False)
    attack_group.add_argument(
        "--search",
        "--search-method",
        "-s",
        type=str,
        required=False,
        default="greedy-word-wir",
        # help=f"The search method to use. choices: {search_choices}",
    )
    attack_group.add_argument(
        "--recipe",
        "--attack-recipe",
        "-r",
        type=str,
        required=False,
        default=None,
        # help="full attack recipe (overrides provided goal function, transformation & constraints)",
        # choices=ATTACK_RECIPE_NAMES.keys(),
    )
    attack_group.add_argument(
        "--attack-from-file",
        type=str,
        required=False,
        default=None,
        help=
        "attack to load from file (overrides provided goal function, transformation & constraints)",
    )
    args = parser.parse_args()

    # dataset = load_dataset()
    dataset = load_ocnliDataset(split="dev")
    dataset = HuggingFaceDataset(dataset)

    num_remaining_attacks = args.num_examples
    worklist = deque(range(0, args.num_examples))
    worklist_tail = worklist[-1]

    config = BertConfig.from_pretrained(
        "hfl/chinese-macbert-base")  # "hfl/chinese-macbert-base"
    config.output_attentions = False
    config.output_token_type_ids = False
    # config.max_length = 30
    tokenizer = BertTokenizer.from_pretrained("hfl/chinese-macbert-base",
                                              config=config)

    config = AutoConfig.from_pretrained(
        './models/roberta/chinese-roberta-wwm-ext-OCNLI-2021-01-05-23-46-02-975289',
        num_labels=3)
    model = AutoModelForSequenceClassification.from_pretrained(
        './models/roberta/chinese-roberta-wwm-ext-OCNLI-2021-01-05-23-46-02-975289',
        config=config,
    )
    model_wrapper = HuggingFaceModelWrapper(model, tokenizer, batch_size=28)

    # goal function
    goal_function = UntargetedClassification(model_wrapper)
    # constraints
    # stopwords = set(
    #     ["个", "关于", "之上", "across", "之后", "afterwards", "再次", "against", "ain", "全部", "几乎", "单独", "along", "早已", "也", "虽然", "是", "among", "amongst", "一个", "和", "其他", "任何", "anyhow", "任何人", "anything", "anyway", "anywhere", "are", "aren", "没有", "around", "as", "at", "后", "been", "之前", "beforehand", "behind", "being", "below", "beside", "besides", "之間", "beyond", "皆是", "但", "by", "可以", "不可以", "是", "不是", "couldn't", "d", "didn", "didn't", "doesn", "doesn't", "don", "don't", "down", "due", "之間", "either", "之外", "elsewhere", "空", "足夠", "甚至", "ever", "任何人", "everything", "everywhere", "except", "first", "for", "former", "formerly", "from", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "i", "if", "in", "indeed", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "latter", "latterly", "least", "ll", "may", "me", "meanwhile", "mightn", "mightn't", "mine", "more", "moreover", "most", "mostly", "must", "mustn", "mustn't", "my", "myself", "namely", "needn", "needn't", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of", "off", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "per", "please", "s", "same", "shan", "shan't", "she", "she's", "should've", "shouldn", "shouldn't", "somehow", "something", "sometime", "somewhere", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "this", "those", "through", "throughout", "thru", "thus", "to", "too", "toward", "towards", "under", "unless", "until", "up", "upon", "used", "ve", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "with", "within", "without", "won", "won't", "would", "wouldn", "wouldn't", "y", "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"]
    # )
    constraints = [RepeatModification(), StopwordModification()]
    # constraints = [RepeatModification(), StopwordModification(stopwords=stopwords)]
    input_column_modification = InputColumnModification(
        ["premise", "hypothesis"], {"premise"})
    constraints.append(input_column_modification)
    constraints.append(MaxWordsPerturbed(max_percent=0.2))
    constraints.append(
        WordEmbeddingDistance(max_mse_dist=0.5,
                              compare_against_original=False))
    # constraints.append(
    #     Google1BillionWordsLanguageModel(
    #         top_n_per_index=4, compare_against_original=False
    #     )
    # )
    # use_constraint = UniversalSentenceEncoder(
    #     threshold=0.840845057,
    #     metric="angular",
    #     compare_against_original=False,
    #     window_size=15,
    #     skip_text_shorter_than_window=True,
    # )
    # constraints.append(use_constraint)
    transformation = WordSwapEmbedding(max_candidates=8)
    # transformation = WordDeletion()
    # search methods
    # search_method = GreedyWordSwapWIR(wir_method="delete")
    search_method = AlzantotGeneticAlgorithm(pop_size=60,
                                             max_iters=20,
                                             post_crossover_check=False)

    start_time = time.time()
    textattack.shared.utils.set_seed(args.random_seed)

    # attack
    attack = Attack(goal_function, constraints, transformation, search_method)
    print(attack)
    attack_log_manager = parse_logger_from_args(args)

    pbar = tqdm.tqdm(total=num_remaining_attacks, smoothing=0)
    num_results = 0
    num_failures = 0
    num_successes = 0

    for result in attack.attack_dataset(dataset, indices=worklist):
        attack_log_manager.log_result(result)
        if not args.disable_stdout:
            print("\n")
        if (not args.attack_n) or (not isinstance(
                result, textattack.attack_results.SkippedAttackResult)):
            pbar.update(1)
        else:
            # worklist_tail keeps track of highest idx that has been part of worklist
            # Used to get the next dataset element when attacking with `attack_n` = True.
            worklist_tail += 1
            worklist.append(worklist_tail)

        num_results += 1

        if (type(result) == textattack.attack_results.SuccessfulAttackResult
                or type(result)
                == textattack.attack_results.MaximizedAttackResult):
            num_successes += 1
        if type(result) == textattack.attack_results.FailedAttackResult:
            num_failures += 1
        pbar.set_description(
            "[Succeeded / Failed / Total] {} / {} / {}".format(
                num_successes, num_failures, num_results))

        if (args.checkpoint_interval
                and len(attack_log_manager.results) % args.checkpoint_interval
                == 0):
            new_checkpoint = textattack.shared.Checkpoint(
                args, attack_log_manager, worklist, worklist_tail)
            new_checkpoint.save()
            attack_log_manager.flush()

    pbar.close()
    print()
    # Enable summary stdout
    if args.disable_stdout:
        attack_log_manager.enable_stdout()
    attack_log_manager.log_summary()
    attack_log_manager.flush()
    print()
    finish_time = time.time()
    textattack.shared.logger.info(
        f"Attack time: {finish_time - start_time}s")
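The script defines main() but the snippet ends without the usual entry-point guard; the standard one would be:

if __name__ == "__main__":
    main()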
Example #15
    def build(model):
        #
        # Section 5: Experiments
        #
        # We base our sets of allowed word substitutions S(x, i) on the
        # substitutions allowed by Alzantot et al. (2018). They demonstrated that
        # their substitutions lead to adversarial examples that are qualitatively
        # similar to the original input and retain the original label, as judged
        # by humans. Alzantot et al. (2018) define the neighbors N(w) of a word w
        # as the n = 8 nearest neighbors of w in a “counter-fitted” word vector
        # space where antonyms are far apart (Mrksiˇ c´ et al., 2016). The
        # neighbors must also lie within some Euclidean distance threshold. They
        # also use a language model constraint to avoid nonsensical perturbations:
        # they allow substituting xi with x˜i ∈ N(xi) if and only if it does not
        # decrease the log-likelihood of the text under a pre-trained language
        # model by more than some threshold.
        #
        # We make three modifications to this approach:
        #
        # First, in Alzantot et al. (2018), the adversary
        # applies substitutions one at a time, and the
        # neighborhoods and language model scores are computed relative to
        # the current altered version of the input. (Note that the model
        # itself classifies using a different set of pre-trained word
        # vectors; the counter-fitted vectors are only used to define the
        # set of allowed substitution words.)
        # This results in a hard-to-define attack surface, as
        # changing one word can allow or disallow changes
        # to other words. It also requires recomputing
        # language model scores at each iteration of the genetic
        # attack, which is inefficient. Moreover, the same
        # word can be substituted multiple times, leading
        # to semantic drift. We define allowed substitutions
        # relative to the original sentence x, and disallow
        # repeated substitutions.
        #
        # Second, we use a faster language model that allows us to query
        # longer contexts; Alzantot et al. (2018) use a slower language
        # model and could only query it with short contexts.

        # Finally, we use the language model constraint only
        # at test time; the model is trained against all perturbations in N(w). This encourages the model to be
        # robust to a larger space of perturbations, instead of
        # specializing for the particular choice of language
        # model. See Appendix A.3 for further details. [This is a model-specific
        # adjustment, so does not affect the attack recipe.]
        #
        # Appendix A.3:
        #
        # In Alzantot et al. (2018), the adversary applies replacements one at a
        # time, and the neighborhoods and language model scores are computed
        # relative to the current altered version of the input. This results in a
        # hard-to-define attack surface, as the same word can be replaced many
        # times, leading to semantic drift. We instead pre-compute the allowed
        # substitutions S(x, i) at index i based on the original x. We define
        # S(x, i) as the set of x̃_i ∈ N(x_i) such that
        #     log P(x_{i-W:i-1}, x̃_i, x_{i+1:i+W}) ≥ log P(x_{i-W:i+W}) − δ,
        # where probabilities are assigned by a pre-trained language model,
        # and the window radius W and threshold δ are hyperparameters. We use
        # W = 6 and δ = 5.
        #
        #
        # Swap words with their embedding nearest-neighbors.
        #
        # Embedding: Counter-fitted Paragram Embeddings.
        #
        # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5"
        #
        transformation = WordSwapEmbedding(max_candidates=8)
        #
        # Don't modify the same word twice or stopwords
        #
        constraints = [RepeatModification(), StopwordModification()]
        #
        # Maximum words perturbed percentage of 20%
        #
        constraints.append(MaxWordsPerturbed(max_percent=0.2))
        #
        # Maximum word embedding euclidean distance of 0.5.
        #
        constraints.append(WordEmbeddingDistance(max_mse_dist=0.5))
        #
        # Language Model
        #
        constraints.append(
            LearningToWriteLanguageModel(window_size=6,
                                         max_log_prob_diff=5.0,
                                         compare_against_original=True))
        # constraints.append(LearningToWriteLanguageModel(window_size=5))
        #
        # Goal is untargeted classification
        #
        goal_function = UntargetedClassification(model)
        #
        # Perform word substitution with a genetic algorithm.
        #
        search_method = AlzantotGeneticAlgorithm(pop_size=60,
                                                 max_iters=20,
                                                 post_crossover_check=False)

        return Attack(goal_function, constraints, transformation,
                      search_method)
Example #16
def TextFoolerJin2019(model):
    """
        Jin, D., Jin, Z., Zhou, J.T., & Szolovits, P. (2019). 
        
        Is BERT Really Robust? Natural Language Attack on Text Classification and Entailment. 
        
        https://arxiv.org/abs/1907.11932 
    """
    #
    # Swap words with their 50 closest embedding nearest-neighbors.
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    transformation = WordSwapEmbedding(max_candidates=50)
    #
    # Don't modify the same word twice or the stopwords defined
    # in the TextFooler public implementation.
    #
    # fmt: off
    stopwords = set([
        "a", "about", "above", "across", "after", "afterwards", "again",
        "against", "ain", "all", "almost", "alone", "along", "already", "also",
        "although", "am", "among", "amongst", "an", "and", "another", "any",
        "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "aren",
        "aren't", "around", "as", "at", "back", "been", "before", "beforehand",
        "behind", "being", "below", "beside", "besides", "between", "beyond",
        "both", "but", "by", "can", "cannot", "could", "couldn", "couldn't",
        "d", "didn", "didn't", "doesn", "doesn't", "don", "don't", "down",
        "due", "during", "either", "else", "elsewhere", "empty", "enough",
        "even", "ever", "everyone", "everything", "everywhere", "except",
        "first", "for", "former", "formerly", "from", "hadn", "hadn't", "hasn",
        "hasn't", "haven", "haven't", "he", "hence", "her", "here",
        "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him",
        "himself", "his", "how", "however", "hundred", "i", "if", "in",
        "indeed", "into", "is", "isn", "isn't", "it", "it's", "its", "itself",
        "just", "latter", "latterly", "least", "ll", "may", "me", "meanwhile",
        "mightn", "mightn't", "mine", "more", "moreover", "most", "mostly",
        "must", "mustn", "mustn't", "my", "myself", "namely", "needn",
        "needn't", "neither", "never", "nevertheless", "next", "no", "nobody",
        "none", "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of",
        "off", "on", "once", "one", "only", "onto", "or", "other", "others",
        "otherwise", "our", "ours", "ourselves", "out", "over", "per",
        "please", "s", "same", "shan", "shan't", "she", "she's", "should've",
        "shouldn", "shouldn't", "somehow", "something", "sometime",
        "somewhere", "such", "t", "than", "that", "that'll", "the", "their",
        "theirs", "them", "themselves", "then", "thence", "there",
        "thereafter", "thereby", "therefore", "therein", "thereupon", "these",
        "they", "this", "those", "through", "throughout", "thru", "thus", "to",
        "too", "toward", "towards", "under", "unless", "until", "up", "upon",
        "used", "ve", "was", "wasn", "wasn't", "we", "were", "weren",
        "weren't", "what", "whatever", "when", "whence", "whenever", "where",
        "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
        "whether", "which", "while", "whither", "who", "whoever", "whole",
        "whom", "whose", "why", "with", "within", "without", "won", "won't",
        "would", "wouldn", "wouldn't", "y", "yet", "you", "you'd", "you'll",
        "you're", "you've", "your", "yours", "yourself", "yourselves"
    ])
    # fmt: on
    constraints = [
        RepeatModification(),
        StopwordModification(stopwords=stopwords)
    ]
    #
    # During entailment, we should only edit the hypothesis - keep the premise
    # the same.
    #
    input_column_modification = InputColumnModification(
        ["premise", "hypothesis"], {"premise"})
    constraints.append(input_column_modification)
    # Minimum word embedding cosine similarity of 0.5.
    # (The paper claims 0.7, but analysis of the released code and some empirical
    # results show that it's 0.5.)
    #
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.5))
    #
    # Only replace words with the same part of speech (or nouns with verbs)
    #
    constraints.append(PartOfSpeech(allow_verb_noun_swap=True))
    #
    # Universal Sentence Encoder with a minimum angular similarity of ε = 0.7.
    #
    # In the TextFooler code, they forget to divide the angle between the two
    # embeddings by pi. So if the original threshold was that 1 - sim >= 0.7, the
    # new threshold is 1 - (0.3) / pi = 0.90445
    #
    use_constraint = UniversalSentenceEncoder(
        threshold=0.904458599,
        metric="angular",
        compare_with_original=False,
        window_size=15,
        skip_text_shorter_than_window=True,
    )
    constraints.append(use_constraint)
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Greedily swap words with "Word Importance Ranking".
    #
    search_method = GreedyWordSwapWIR()

    return Attack(goal_function, constraints, transformation, search_method)
    "shouldn't", 'somehow', 'something', 'sometime', 'somewhere', 'such', 't',
    'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves',
    'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein',
    'thereupon', 'these', 'they', 'this', 'those', 'through', 'throughout',
    'thru', 'thus', 'to', 'too', 'toward', 'towards', 'under', 'unless',
    'until', 'up', 'upon', 'used', 've', 'was', 'wasn', "wasn't", 'we', 'were',
    'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever',
    'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
    'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
    'whole', 'whom', 'whose', 'why', 'with', 'within', 'without', 'won',
    "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you', "you'd",
    "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves'
])

# Strict Constraints
MAX_LENGTH = 256
COSINE_SIM = 0.9
BERT_SCORE_SIM = 0.9
ALLOW_VERB_NOUN_SWAP = False
TAGGER_TYPE = "flair"

CONSTRAINTS = [
    RepeatModification(),
    StopwordModification(stopwords=STOPWORDS),
    MaxWordIndexModification(max_length=MAX_LENGTH),
    InputColumnModification(["premise", "hypothesis"], {"premise"}),
    WordEmbeddingDistance(min_cos_sim=COSINE_SIM),
    BERTScore(min_bert_score=BERT_SCORE_SIM),
    PartOfSpeech(tagger_type=TAGGER_TYPE,
                 allow_verb_noun_swap=ALLOW_VERB_NOUN_SWAP)
]
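A minimal sketch of how the strict CONSTRAINTS list above could be wired into an attack; the transformation, search method, and model_wrapper are illustrative assumptions, not part of the original snippet.

from textattack import Attack
from textattack.goal_functions import UntargetedClassification
from textattack.search_methods import GreedyWordSwapWIR
from textattack.transformations import WordSwapEmbedding


def build_strict_attack(model_wrapper):
    # Reuse the strict constraint stack defined above.
    goal_function = UntargetedClassification(model_wrapper)
    transformation = WordSwapEmbedding(max_candidates=50)
    return Attack(goal_function, CONSTRAINTS, transformation,
                  GreedyWordSwapWIR())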
Example #18
def TextFoolerJin2019Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    """
        Jin, D., Jin, Z., Zhou, J.T., & Szolovits, P. (2019). 
        
        Is BERT Really Robust? Natural Language Attack on Text Classification and Entailment. 
        
        https://arxiv.org/abs/1907.11932 
       
        Constraints adjusted from paper to align with human evaluation.
    """
    #
    # Swap words with their embedding nearest-neighbors.
    #
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    # 50 nearest-neighbors with a cosine similarity of at least 0.5.
    # (The paper claims 0.7, but analysis of the code and some empirical
    # results show that it's definitely 0.5.)
    #
    transformation = WordSwapEmbedding(max_candidates=50)
    #
    # Don't modify the same word twice or stopwords
    #
    constraints = [RepeatModification(), StopwordModification()]
    #
    # Minimum word embedding cosine similarity of 0.9.
    #
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.9))
    #
    # Sentence encoder constraint (BERT or the Universal Sentence Encoder)
    # with a minimum cosine similarity of SE_thresh (0.98 by default).
    #
    if sentence_encoder == 'bert':
        se_constraint = BERT(threshold=SE_thresh,
                             metric='cosine',
                             compare_with_original=False,
                             window_size=15,
                             skip_text_shorter_than_window=False)
    else:
        se_constraint = UniversalSentenceEncoder(
            threshold=SE_thresh,
            metric='cosine',
            compare_with_original=False,
            window_size=15,
            skip_text_shorter_than_window=False)
    constraints.append(se_constraint)
    #
    # Do grammar checking
    #
    constraints.append(LanguageTool(0))

    #
    # Untargeted attack
    #
    goal_function = UntargetedClassification(model)

    #
    # Greedily swap words with "Word Importance Ranking".
    #
    search_method = GreedyWordSwapWIR()

    return Attack(goal_function, constraints, transformation, search_method)
Example #19
    "textattack/bert-base-uncased-imdb")
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "textattack/bert-base-uncased-imdb")
model_wrapper = textattack.models.wrappers.HuggingFaceModelWrapper(
    model, tokenizer)

# Construct our four components for `Attack`
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
from textattack.constraints.semantics import WordEmbeddingDistance

goal_function = textattack.goal_functions.UntargetedClassification(
    model_wrapper)
constraints = [
    RepeatModification(),
    StopwordModification(),
    WordEmbeddingDistance(min_cos_sim=0.9)
]
transformation = textattack.transformations.WordSwapEmbedding(
    max_candidates=50)
search_method = textattack.search_methods.AlzantotGeneticAlgorithm(
    pop_size=60,
    max_iters=20,
    temp=0.3,
    give_up_if_no_improvement=False,
    post_crossover_check=True,
    max_crossover_retries=20)

# Construct the actual attack
attack = textattack.Attack(goal_function, constraints, transformation,
                           search_method)
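To actually run the attack, one option in recent TextAttack releases is the Attacker/AttackArgs pair; the dataset choice below is an assumption for illustration.

# Attack ten IMDB test examples and print the results.
dataset = textattack.datasets.HuggingFaceDataset("imdb", split="test")
attack_args = textattack.AttackArgs(num_examples=10)
attacker = textattack.Attacker(attack, dataset, attack_args)
attacker.attack_dataset()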
Example #20
def Alzantot2018Adjusted(model, SE_thresh=0.98, sentence_encoder='bert'):
    """
        Alzantot, M., Sharma, Y., Elgohary, A., Ho, B., Srivastava, M.B., & Chang, 
        K. (2018). 
        
        Generating Natural Language Adversarial Examples.
        
        https://arxiv.org/abs/1804.07998

        Constraints adjusted from paper to align with human evaluation.
    """
    #
    # Swap words with their embedding nearest-neighbors.
    #
    # Embedding: Counter-fitted PARAGRAM-SL999 vectors.
    #
    # "[We] fix the hyperparameter values to S = 60, N = 8, K = 4, and δ = 0.5"
    #
    transformation = WordSwapEmbedding(max_candidates=50)
    #
    # Don't modify the same word twice or stopwords
    #
    constraints = [RepeatModification(), StopwordModification()]
    #
    # Minimum word embedding cosine similarity of 0.9.
    #
    constraints.append(WordEmbeddingDistance(min_cos_sim=0.9))
    #
    # Sentence encoder constraint (BERT or the Universal Sentence Encoder)
    # with a minimum cosine similarity of SE_thresh (0.98 by default).
    #
    if sentence_encoder == 'bert':
        se_constraint = BERT(threshold=SE_thresh,
                             metric='cosine',
                             compare_against_original=False,
                             window_size=15,
                             skip_text_shorter_than_window=False)
    else:
        se_constraint = UniversalSentenceEncoder(
            threshold=SE_thresh,
            metric='cosine',
            compare_against_original=False,
            window_size=15,
            skip_text_shorter_than_window=False)
    constraints.append(se_constraint)
    #
    # Do grammar checking
    #
    constraints.append(LanguageTool(0))
    #
    # Goal is untargeted classification
    #
    goal_function = UntargetedClassification(model)
    #
    # Perform word substitution with a genetic algorithm.
    #
    search_method = AlzantotGeneticAlgorithm(pop_size=60,
                                             max_iters=20,
                                             post_crossover_check=False)

    return Attack(goal_function, constraints, transformation, search_method)