Code example #1
File: sst.py  Project: nuaaxc/universal-triggers
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(
        lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub-sentences.
    reader = StanfordSentimentTreeBankDatasetReader(
        granularity="2-class",
        token_indexers={"tokens": single_id_indexer},
        use_subtrees=True)
    train_data = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(
        granularity="2-class", token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300

    # Load pretrained word vectors (the "w2v" setting points at fastText crawl vectors)
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size('tokens'),
            embedding_dim=300,
            weight=weight,
            trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(word_embedding_dim,
                      hidden_size=512,
                      num_layers=2,
                      batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (it has been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        iterator = BucketIterator(batch_size=32,
                                  sorting_keys=[("tokens", "num_tokens")])
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          iterator=iterator,
                          train_dataset=train_data,
                          validation_dataset=dev_data,
                          num_epochs=5,
                          patience=1,
                          cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(
        model)  # also save the word embedding matrix

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(targeted_dev_data,
                                         num_epochs=5,
                                         shuffle=True),
                                group_size=1):
        # get accuracy with current triggers
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train()  # rnn cannot do backwards in eval mode

        # get gradient w.r.t. trigger embeddings for current batch
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

        # pass the gradients to a particular attack to generate token candidates for each token (see the sketch after this listing).
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40,
                                                        increase_loss=True)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Tries all of the candidates and returns the trigger sequence with the highest loss.
        trigger_token_ids = utils.get_best_candidates(model, batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
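
A note on the attack step in this listing: attacks.hotflip_attack ranks replacement tokens with a first-order (HotFlip-style) estimate of how much swapping in each vocabulary word would change the loss. The snippet below is a minimal, hypothetical sketch of that candidate-selection idea written for this page; hotflip_candidates_sketch is an invented name, not the project's implementation.

import torch

def hotflip_candidates_sketch(averaged_grad, embedding_matrix,
                              num_candidates=40, increase_loss=True):
    # averaged_grad: (num_trigger_tokens, embed_dim) gradient of the loss
    #   w.r.t. the trigger embeddings, averaged over a batch.
    # embedding_matrix: (vocab_size, embed_dim) word embedding matrix.
    # Score every vocabulary word at every trigger position with the
    # first-order estimate of the loss change: gradient . embedding.
    scores = torch.einsum("ij,kj->ik", averaged_grad, embedding_matrix)
    if not increase_loss:
        scores = -scores  # pick tokens that decrease the loss instead
    # Keep the top-scoring words per position as replacement candidates.
    _, candidate_ids = scores.topk(num_candidates, dim=1)
    return candidate_ids  # (num_trigger_tokens, num_candidates)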
Code example #2
def main():
    # Load SNLI dataset
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    tokenizer = WordTokenizer(
        end_tokens=["@@NULL@@"]
    )  # add @@NULL@@ to the end of sentences
    reader = SnliReader(
        token_indexers={"tokens": single_id_indexer}, tokenizer=tokenizer
    )
    dev_dataset = reader.read(
        "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl"
    )
    # Load model and vocab
    model = load_archive(
        "https://allennlp.s3-us-west-2.amazonaws.com/models/esim-glove-snli-2019.04.23.tar.gz"
    ).model
    model.eval().cuda()
    vocab = model.vocab

    # add hooks for embeddings so we can compute gradients w.r.t. the input tokens
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(
        model
    )  # save the word embedding matrix

    # Batches of examples to construct triggers
    universal_perturb_batch_size = 32
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Subsample the dataset to one class to do a universal attack on that class
    dataset_label_filter = "entailment"  # only entailment examples
    # dataset_label_filter = 'contradiction' # only contradiction examples
    # dataset_label_filter = 'neutral' # only neutral examples
    subset_dev_dataset = []
    for instance in dev_dataset:
        if instance["label"].label == dataset_label_filter:
            subset_dev_dataset.append(instance)
    # the attack is targeted towards a specific class
    # target_label = "0" # flip to entailment
    target_label = "1"  # flip to contradiction
    # target_label = "2" # flip to neutral

    # A k-d tree if you want to do gradient + nearest neighbors
    # tree = KDTree(embedding_weight.numpy())

    # Get original accuracy before adding universal triggers
    utils.get_accuracy(
        model, subset_dev_dataset, vocab, trigger_token_ids=None, snli=True
    )
    model.train()  # rnn cannot do backwards in eval mode

    # Initialize triggers
    num_trigger_tokens = 1  # one token prepended
    trigger_token_ids = [vocab.get_token_index("a")] * num_trigger_tokens
    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(
        iterator(subset_dev_dataset, num_epochs=10, shuffle=True), group_size=1
    ):
        # get model accuracy with current triggers
        utils.get_accuracy(
            model, subset_dev_dataset, vocab, trigger_token_ids, snli=True
        )
        model.train()  # rnn cannot do backwards in eval mode

        # get grad of triggers
        averaged_grad = utils.get_average_grad(
            model, batch, trigger_token_ids, target_label, snli=True
        )

        # find attack candidates using an attack method
        cand_trigger_token_ids = attacks.hotflip_attack(
            averaged_grad, embedding_weight, trigger_token_ids, num_candidates=40
        )
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        decrease_prob=True)

        # query the model to get the best candidates (see the sketch after this listing)
        trigger_token_ids = utils.get_best_candidates(
            model, batch, trigger_token_ids, cand_trigger_token_ids, snli=True
        )
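
The utils.get_best_candidates call above then queries the model with each proposed replacement and keeps the trigger with the highest loss (the same step as in code example #1). Below is a minimal greedy (non-beam) sketch of that search; eval_loss is a hypothetical helper, assumed to return the model's loss on a batch with the given trigger token ids prepended.

def best_candidates_sketch(eval_loss, trigger_token_ids, cand_trigger_token_ids):
    # Greedy left-to-right sweep: at each trigger position, try every
    # candidate token and keep the replacement that yields the highest loss.
    best = list(trigger_token_ids)
    for pos in range(len(best)):
        best_loss = eval_loss(best)
        for cand in cand_trigger_token_ids[pos]:
            trial = list(best)
            trial[pos] = int(cand)
            loss = eval_loss(trial)
            if loss > best_loss:
                best_loss, best = loss, trial
    return best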
Code example #3
def main():
    # Read the SQuAD validation dataset using a word tokenizer
    single_id = SingleIdTokenIndexer(lowercase_tokens=True)
    reader = SquadReader(token_indexers={'tokens': single_id})
    dev_dataset = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-dev-v1.1.json'
    )
    # Load the model and its associated vocabulary.
    model = load_archive(
        'https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-glove-2019.05.09.tar.gz'
    ).model
    vocab = model.vocab
    model.eval().cuda()

    # filter to just certain `wh` questions
    who_questions_dev, what_questions_dev, where_questions_dev, when_questions_dev, \
        how_questions_dev, why_questions_dev, which_questions_dev, other_questions_dev = ([] for i in range(8))
    for item in dev_dataset:
        for word in item['question']:
            if word.text.lower() == 'who':
                who_questions_dev.append(item)
                break
            if word.text.lower() == 'what':
                what_questions_dev.append(item)
                break
            if word.text.lower() == 'where':
                where_questions_dev.append(item)
                break
            if word.text.lower() == 'when':
                when_questions_dev.append(item)
                break
            if word.text.lower() == 'how':
                how_questions_dev.append(item)
                break
            if word.text.lower() == 'why':
                why_questions_dev.append(item)
                break
            if word.text.lower() == 'which':
                which_questions_dev.append(item)
                break
            else:
                other_questions_dev.append(item)

    # Use batches to craft the universal perturbations
    universal_perturb_batch_size = 32
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # We register a gradient hook on the embeddings.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(
        model)  # save the word embedding matrix

    # Initialize the trigger. The first one is an initialization with "the" tokens
    # around the fixed target answer. You can customize it; make sure to set the fixed
    # target answer and the question type. The second (commented out) is a trigger
    # found after running the attack, as reported in our paper.
    trigger_init = "the the the the donald trump the the the the"
    target_answer = "donald trump"
    subsampled_dev_dataset = who_questions_dev  # universal attack on `who` questions
    # trigger_init = "why how ; known because : to kill american people ."
    # target_answer = "to kill american people"
    # subsampled_dev_dataset = why_questions_dev # universal attack on `why` questions

    # tokenizes the trigger, and finds the start/end span
    # make sure the trigger tokens are space separated
    trigger_token_ids = [
        vocab.get_token_index(t) for t in trigger_init.split(' ')
    ]
    span_start = trigger_init.split(' ').index(
        target_answer.split(' ')[0])  # start of target_answer
    span_end = trigger_init.split(' ').index(target_answer.split(' ')[-1])
    # we ignore replacement at the positions of the answer (the answer is fixed); see the worked example after this listing
    ignore_indices = [0]*(span_start) + \
        [1]*(span_end - span_start + 1) + [0]*(len(trigger_token_ids) - 1 - span_end)

    # larger values for these parameters give better results but are slower
    num_candidates = 20
    beam_size = 5
    for _ in range(100):
        # Get targeted accuracy
        squad_utils.get_accuracy_squad(model, subsampled_dev_dataset, vocab,
                                       trigger_token_ids, target_answer,
                                       span_start, span_end)
        model.train()

        # Get the gradient for the appended tokens averaged over the batch.
        averaged_grad = squad_utils.get_average_grad_squad(
            model, vocab, trigger_token_ids, subsampled_dev_dataset,
            span_start, span_end)

        # Use an attack method to get the top candidates
        cand_trigger_token_ids = attacks.hotflip_attack(
            averaged_grad,
            embedding_weight,
            trigger_token_ids,
            num_candidates=num_candidates,
            increase_loss=False)

        # Query the model with the top candidates to find the best tokens.
        trigger_token_ids = squad_utils.get_best_candidates_squad(
            model, trigger_token_ids, cand_trigger_token_ids, vocab,
            subsampled_dev_dataset, beam_size, ignore_indices, span_start,
            span_end)
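
For concreteness, this is what the span and mask computation near the top of this listing evaluates to for the default trigger; a small worked example, not part of the original file:

trigger_tokens = "the the the the donald trump the the the the".split(' ')
span_start = trigger_tokens.index("donald")   # 4
span_end = trigger_tokens.index("trump")      # 5
ignore_indices = [0] * span_start + [1] * (span_end - span_start + 1) \
    + [0] * (len(trigger_tokens) - 1 - span_end)
print(ignore_indices)  # [0, 0, 0, 0, 1, 1, 0, 0, 0, 0]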