def main():
    """Search for a universal trigger that flips SNLI predictions to one class.

    Steps:
      1. Read the SNLI dev set and load a pretrained ESIM model (with its vocab).
      2. Hook the embedding layer so gradients w.r.t. the input tokens are kept.
      3. Keep only dev examples whose gold label is ``dataset_label_filter``.
      4. Starting from the token "a", repeatedly: report accuracy with the
         current trigger, average the trigger gradient over one batch, ask the
         attack for candidate replacement tokens, and keep the best candidates.
      5. Report accuracy with the final trigger.

    Requires a CUDA device (the model is moved to the GPU) and network access
    to download the dataset and the model archive.
    """
    # Load the SNLI dev set with a lowercasing single-id indexer.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    # Word tokenizer that appends @@NULL@@ to the end of every sentence.
    tokenizer = WordTokenizer(end_tokens=["@@NULL@@"])
    reader = SnliReader(
        token_indexers={"tokens": single_id_indexer}, tokenizer=tokenizer
    )
    dev_dataset = reader.read(
        "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl"
    )

    # Load the pretrained model and reuse its vocabulary.
    model = load_archive(
        "https://allennlp.s3-us-west-2.amazonaws.com/models/esim-glove-snli-2019.04.23.tar.gz"
    ).model
    model.eval().cuda()
    vocab = model.vocab

    # Add hooks on the embeddings so we can compute gradients w.r.t. the input
    # tokens, and keep a copy of the word embedding matrix for the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)

    # Batches of examples used to construct the triggers.
    universal_perturb_batch_size = 32
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Subsample the dataset to one class to do a universal attack on that class.
    dataset_label_filter = "entailment"  # only entailment examples
    # dataset_label_filter = 'contradiction' # only contradiction examples
    # dataset_label_filter = 'neutral' # only neutral examples
    subset_dev_dataset = [
        instance
        for instance in dev_dataset
        if instance["label"].label == dataset_label_filter
    ]

    # The attack is targeted towards a specific class.
    # target_label = "0" # flip to entailment
    target_label = "1"  # flip to contradiction
    # target_label = "2" # flip to neutral

    # A k-d tree if you want to do gradient + nearest neighbors.
    # tree = KDTree(embedding_weight.numpy())

    # Get original accuracy before adding universal triggers.
    utils.get_accuracy(
        model, subset_dev_dataset, vocab, trigger_token_ids=None, snli=True
    )
    # RNNs cannot run backward in eval mode, so make sure the model is in
    # train mode before any gradient is computed.
    model.train()

    # Initialize triggers.
    num_trigger_tokens = 1  # one token prepended
    trigger_token_ids = [vocab.get_token_index("a")] * num_trigger_tokens

    # Sample batches, update the triggers, and repeat.
    batches = lazy_groups_of(
        iterator(subset_dev_dataset, num_epochs=10, shuffle=True), group_size=1
    )
    for batch in batches:
        # Get model accuracy with the current triggers.
        utils.get_accuracy(
            model, subset_dev_dataset, vocab, trigger_token_ids, snli=True
        )
        model.train()  # back to train mode: RNN backward fails in eval mode

        # Get the gradient of the triggers, averaged over the batch.
        averaged_grad = utils.get_average_grad(
            model, batch, trigger_token_ids, target_label, snli=True
        )

        # Find attack candidates using an attack method.
        # NOTE(review): the current trigger ids are passed as the third
        # argument, in the same position the alternative attack calls below
        # use - confirm against the signature of attacks.hotflip_attack.
        cand_trigger_token_ids = attacks.hotflip_attack(
            averaged_grad, embedding_weight, trigger_token_ids, num_candidates=40
        )
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        decrease_prob=True)

        # Query the model to get the best candidates.
        trigger_token_ids = utils.get_best_candidates(
            model, batch, trigger_token_ids, cand_trigger_token_ids, snli=True
        )

    # Report accuracy with the final triggers; inside the loop accuracy is only
    # measured before each update, so the last update would otherwise go
    # unevaluated.
    utils.get_accuracy(
        model, subset_dev_dataset, vocab, trigger_token_ids, snli=True
    )
def main():
    """Train (or load) a binary SST LSTM classifier and attack it with a universal trigger.

    Steps:
      1. Read the SST train/dev splits (2-class granularity) and build a vocab
         from the training instances.
      2. Build the token embedding selected by the module-level
         ``EMBEDDING_TYPE`` ("None" = randomly initialised, "w2v" = pretrained
         vectors downloaded from the URL below, kept frozen).
      3. Load cached weights/vocab from ``/tmp`` if present, otherwise train the
         model and cache them.
      4. Keep only dev examples labelled ``dataset_label_filter`` and, starting
         from three copies of "the", repeatedly: report accuracy, average the
         trigger gradient over one batch, generate candidates with the HotFlip
         attack, and keep the best candidates.
      5. Report accuracy with the final trigger.

    Requires a CUDA device and network access for the dataset downloads.

    Raises:
        ValueError: if ``EMBEDDING_TYPE`` is neither "None" nor "w2v".
    """
    # Fail fast on an unsupported setting; otherwise token_embedding would be
    # left undefined and the script would die with a NameError only after the
    # datasets had been downloaded and the vocabulary built.
    if EMBEDDING_TYPE not in ("None", "w2v"):
        raise ValueError(
            "Unsupported EMBEDDING_TYPE: {!r} (expected 'None' or 'w2v')".format(
                EMBEDDING_TYPE
            )
        )

    # Load the binary SST dataset with a lowercasing single-id indexer.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    # use_subtrees gives us a bit of extra data by breaking down each example
    # into sub sentences (training split only).
    reader = StanfordSentimentTreeBankDatasetReader(
        granularity="2-class",
        token_indexers={"tokens": single_id_indexer},
        use_subtrees=True,
    )
    train_data = reader.read(
        "https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt"
    )
    reader = StanfordSentimentTreeBankDatasetReader(
        granularity="2-class", token_indexers={"tokens": single_id_indexer}
    )
    dev_data = reader.read(
        "https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt"
    )
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)

    word_embedding_dim = 300
    if EMBEDDING_TYPE == "None":
        # Randomly initialize vectors.
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("tokens"),
            embedding_dim=word_embedding_dim,
        )
    else:
        # EMBEDDING_TYPE == "w2v": load pretrained 300-d vectors. Despite the
        # setting's name, the URL points at fastText crawl vectors.
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(
            embedding_path,
            embedding_dim=word_embedding_dim,
            vocab=vocab,
            namespace="tokens",
        )
        token_embedding = Embedding(
            num_embeddings=vocab.get_vocab_size("tokens"),
            embedding_dim=word_embedding_dim,
            weight=weight,
            trainable=False,
        )

    # Initialize model and move it to the GPU.
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(
        torch.nn.LSTM(
            word_embedding_dim, hidden_size=512, num_layers=2, batch_first=True
        )
    )
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # Where to save the model and its vocabulary.
    model_path = os.path.join("/tmp", EMBEDDING_TYPE + "_model.th")
    vocab_path = os.path.join("/tmp", EMBEDDING_TYPE + "_vocab")

    if os.path.isfile(model_path):
        # The model already exists (it has been trained): load the pre-trained
        # weights and vocabulary.
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        # NOTE: torch.load unpickles the file, and this path lives in a shared
        # temp directory - only run this where that location is trusted.
        with open(model_path, "rb") as f:
            model.load_state_dict(torch.load(f))
    else:
        # Otherwise train the model from scratch and save its weights.
        iterator = BucketIterator(
            batch_size=32, sorting_keys=[("tokens", "num_tokens")]
        )
        iterator.index_with(vocab)
        optimizer = optim.Adam(model.parameters())
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            iterator=iterator,
            train_dataset=train_data,
            validation_dataset=dev_data,
            num_epochs=5,
            patience=1,
            cuda_device=0,
        )
        trainer.train()
        with open(model_path, "wb") as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)

    # RNNs cannot run backward in eval mode, so keep the model in train mode;
    # .cuda() also moves a freshly re-created classifier onto the GPU.
    model.train().cuda()

    # Register a gradient hook on the embeddings. This saves the gradient
    # w.r.t. the word embeddings. We use the gradient later in the attack.
    utils.add_hooks(model)
    # Also save the word embedding matrix.
    embedding_weight = utils.get_embedding_weight(model)

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Build k-d Tree if you are using gradient + nearest neighbor attack.
    # tree = KDTree(embedding_weight.numpy())

    # Filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction).
    dataset_label_filter = "0"
    targeted_dev_data = [
        instance
        for instance in dev_data
        if instance["label"].label == dataset_label_filter
    ]

    # Get accuracy before adding triggers.
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # back to train mode: RNN backward fails in eval mode

    # Initialize triggers which are concatenated to the input.
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # Sample batches, update the triggers, and repeat.
    batches = lazy_groups_of(
        iterator(targeted_dev_data, num_epochs=5, shuffle=True), group_size=1
    )
    for batch in batches:
        # Get accuracy with current triggers.
        utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
        model.train()  # back to train mode: RNN backward fails in eval mode

        # Get gradient w.r.t. trigger embeddings for the current batch.
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

        # Pass the gradients to a particular attack to generate token
        # candidates for each token.
        cand_trigger_token_ids = attacks.hotflip_attack(
            averaged_grad,
            embedding_weight,
            trigger_token_ids,
            num_candidates=40,
            increase_loss=True,
        )
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        num_candidates=40,
        #                                                        increase_loss=True)

        # Tries all of the candidates and returns the trigger sequence with
        # highest loss.
        trigger_token_ids = utils.get_best_candidates(
            model, batch, trigger_token_ids, cand_trigger_token_ids
        )

    # Print accuracy after adding triggers.
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)