def attack(text_ls, true_label, predictor, model, batch_size=1):
    # first check the prediction of the original text
    orig_probs = predictor([text_ls]).squeeze()
    orig_label = torch.argmax(orig_probs)
    orig_prob = orig_probs.max()
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0
    else:
        # print(text_ls)
        cw = CarliniL2(debug=False, targeted=False)
        cw.num_classes = 2
        num_queries = 1
        adv_seq, success = model.attack([text_ls], true_label, cw, batch_size)
        if adv_seq is not None:
            text_prime = model.dataset.transform_back_text(adv_seq)
            print("adv texts:", text_prime)
        else:
            print("optimize fail")
            text_prime = text_ls
        num_changed = 0
        return ' '.join(text_prime), num_changed, orig_label.cpu().item(), \
            torch.argmax(predictor([text_prime])).cpu().item(), num_queries
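
# A hypothetical usage sketch for attack() above, showing how the return tuple can be
# consumed. `clf_predictor` (a callable returning class probabilities for a list of
# word lists) and `victim_model` (exposing .attack() and .dataset as used inside
# attack()) are assumed names for illustration only; they are not defined in this file.
def _demo_attack(words, label, clf_predictor, victim_model):
    adv_text, num_changed, orig_label, adv_label, num_queries = attack(
        words, label, clf_predictor, victim_model, batch_size=1)
    # the attack is untargeted here, so success just means the prediction flipped
    return adv_label != orig_label
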
def cw_tree_attack(data_val, tree_data):
    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_correct = 0
    tot = 0
    orig_append_correct = 0
    adv_pickle = []

    cw = CarliniL2(debug=args.debugging)
    embed = torch.load(args.word_vector)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    vocab = Vocab(filename=args.dictionary, data=[PAD_WORD, UNK_WORD, EOS_WORD, SOS_WORD])
    generator = Generator(args.test_data, vocab=vocab, embed=embed, data_set=data_val)
    bert_transfered_embedding = torch.load('bert_transfered_embedding.pth')
    transfer_emb = torch.nn.Embedding(
        bert_transfered_embedding.size(0), bert_transfered_embedding.size(1)).to(device)
    # transfer_emb = torch.nn.Embedding.from_pretrained(bert_transfered_embedding).to(device)
    transfer_emb.weight.data.copy_(bert_transfered_embedding)
    seqback = WrappedSeqback(embed, device, attack=True, seqback_model=generator.seqback_model,
                             vocab=vocab, transfer_emb=transfer_emb)
    treelstm = generator.tree_model
    generator.load_state_dict(torch.load(args.load_ae))

    class TreeModel(nn.Module):
        def __init__(self):
            super(TreeModel, self).__init__()

        def forward(self, hidden):
            # decode the (perturbed) tree embedding back to word embeddings and
            # feed them to the victim model
            self.embedding = seqback(hidden)
            return model(batch['data'], batch['seq_len'], perturbed=self.embedding)['pred']

        def set_temp(self, temp):
            seqback.temp = temp

        def get_embedding(self):
            return self.embedding

        def get_seqback(self):
            return seqback

    tree_model = TreeModel()
    for batch in get_tree_batch(data_val, tree_data, vocab):
        input_embedding = model.bert.embeddings.word_embeddings(batch['data'])
        batch['tree'] = [generator.get_tree(tree) for tree in batch['tree']]
        seqback.sentences = input_embedding.clone().detach()
        seqback.batch_trees = batch['tree']
        seqback.batch_add_sent = batch['ae_add_sents']
        seqback.start = batch['add_start']
        seqback.end = batch['add_end']
        seqback.adv_sent = []

        # encode the appended sentences with the tree-LSTM; these hidden vectors are
        # the variables the attack perturbs
        batch_tree_embedding = []
        for bi, append_sent in enumerate(batch['ae_add_sents']):
            sentences = [torch.tensor(append_sent, dtype=torch.long, device=device)]
            trees = [batch['tree'][bi]]
            tree_embedding = treelstm(sentences, trees)[0][0].detach()
            batch_tree_embedding.append(tree_embedding)

        hidden = torch.cat(batch_tree_embedding, dim=0)
        cw.batch_info = batch
        adv_hidden = cw.run(tree_model, hidden, batch['attack_targets'],
                            batch_size=hidden.shape[0], input_token=input_embedding)
        seqback.adv_sent = []

        # splice the best decoded adversarial sentences back into the token sequence
        adv_seq = torch.tensor(batch['data']).to(device)
        for bi, (add_start, add_end) in enumerate(zip(batch['add_start'], batch['add_end'])):
            if bi in cw.o_best_sent:
                ae_words = cw.o_best_sent[bi]
                bert_tokens = tokenizer.convert_tokens_to_ids(ae_words)
                adv_seq[bi, add_start:add_end] = torch.LongTensor(bert_tokens)

        out = model(adv_seq, batch['seq_len'])['pred']
        prediction = torch.max(out, 1)[1]
        orig_correct += batch['orig_correct'].item()
        orig_append_correct += batch['orig_append_correct'].item()
        adv_correct += torch.sum((prediction == batch['label']).float()).item()
        targeted_success += torch.sum((prediction == batch['attack_targets']).float()).item()
        untargeted_success += untargeted_success_rate(prediction, batch['label'])
        tot += len(batch['label'])

        for i in range(len(batch['label'])):
            adv_pickle.append({
                'raw_text': transform(adv_seq[i]),
                'label': batch['label'][i].item()
            })
            try:
                logger.info(("orig:", transform(batch['add_sents'][i])))
                logger.info(("adv:", cw.o_best_sent[i]))
            except:
                continue
        logger.info(("orig_correct:", orig_correct))
        logger.info(("orig_append_correct:", orig_append_correct))
        logger.info(("adv_correct:", adv_correct))
        logger.info(("targeted success rate:", targeted_success))
        logger.info(("untargeted success rate:", untargeted_success))
        logger.info(("tot:", tot))

    joblib.dump(adv_pickle, root_dir + '/adv_text.pkl')
    logger.info(("orig_correct:", orig_correct / tot))
    logger.info(("orig_append_correct:", orig_append_correct / tot))
    logger.info(("adv_correct:", adv_correct / tot))
    logger.info(("targeted success rate:", targeted_success / tot))
    logger.info(("untargeted success rate:", untargeted_success / tot))
def cw_word_attack(data_val):
    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_correct = 0
    orig_append_correct = 0
    tot = 0
    adv_pickle = []

    cw = CarliniL2(debug=args.debugging)
    for batch in get_batch(data_val):
        data = batch['data']
        seq_len = batch['seq_len']
        label = batch['label']
        batch_add_start = batch['add_start']
        batch_add_end = batch['add_end']
        attack_targets = batch['attack_targets']
        add_sents = batch['add_sents']
        tot += len(label)

        # attack in the BERT embedding space; the mask restricts the perturbation
        # to the appended tokens
        input_embedding = model.bert.embeddings.word_embeddings(data)
        cw_mask = np.zeros(input_embedding.shape).astype(np.float32)
        for bi, (add_start, add_end) in enumerate(zip(batch_add_start, batch_add_end)):
            cw_mask[bi, add_start:add_end] = 1
        cw_mask = torch.from_numpy(cw_mask).float().to(device)
        cw.wv = model.bert.embeddings.word_embeddings.weight
        cw.mask = cw_mask
        cw.seq = data
        cw.batch_info = batch
        cw.seq_len = seq_len
        adv_data = cw.run(model, input_embedding, attack_targets)

        # map the optimized embeddings back to discrete tokens via cw.o_best_sent
        adv_seq = torch.tensor(batch['data']).to(device)
        for bi, (add_start, add_end) in enumerate(zip(batch_add_start, batch_add_end)):
            if bi in cw.o_best_sent:
                adv_seq.data[bi, add_start:add_end] = torch.LongTensor(cw.o_best_sent[bi])

        out = model(adv_seq, seq_len)['pred']
        prediction = torch.max(out, 1)[1]
        orig_correct += batch['orig_correct'].item()
        orig_append_correct += batch['orig_append_correct'].item()
        adv_correct += torch.sum((prediction == label).float()).item()
        targeted_success += torch.sum((prediction == attack_targets).float()).item()
        untargeted_success += untargeted_success_rate(prediction, label)

        for i in range(len(add_sents)):
            adv_pickle.append({
                'raw_text': transform(adv_seq[i]),
                'label': label[i].item()
            })
            try:
                logger.info(("orig:", transform(add_sents[i][1:])))
                logger.info(("adv:", transform(cw.o_best_sent[i])))
            except:
                continue
        logger.info(("orig_correct:", orig_correct))
        logger.info(("orig_append_correct:", orig_append_correct))
        logger.info(("adv_correct:", adv_correct))
        logger.info(("targeted success rate:", targeted_success))
        logger.info(("untargeted success rate:", untargeted_success))
        logger.info(("tot:", tot))

    joblib.dump(adv_pickle, root_dir + '/adv_text.pkl')
    logger.info(("orig_correct:", orig_correct / tot))
    logger.info(("orig_append_correct:", orig_append_correct / tot))
    logger.info(("adv_correct:", adv_correct / tot))
    logger.info(("targeted success rate:", targeted_success / tot))
    logger.info(("untargeted success rate:", untargeted_success / tot))
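
# transform() is called above but defined elsewhere in the repository; it turns a
# tensor of BERT token ids back into readable text for logging. Below is a minimal
# sketch of that behaviour, assuming the global `tokenizer` is the same BERT tokenizer
# used for convert_tokens_to_ids() above (the real helper may also strip padding and
# special tokens).
def _ids_to_text(token_ids):
    tokens = tokenizer.convert_ids_to_tokens(token_ids.cpu().tolist())
    # merge WordPiece continuation pieces ("##...") back into whole words
    return ' '.join(tokens).replace(' ##', '')
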
def cw_tree_attack(data_val):
    init_attack()
    cw = CarliniL2()
    embed = torch.load(args.word_vector)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    generator = Generator(args.test_data, vocab=vocab, embed=embed, data_set=data_val)
    seqback = WrappedSeqback(embed, device, attack=True, seqback_model=generator.seqback_model,
                             vocab=vocab, transfer_emb=model.encoder.bilstm.encoder)
    treelstm = generator.tree_model
    generator.load_state_dict(torch.load(args.load_ae))

    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_append_correct = 0
    orig_correct = 0
    tot = 0
    adv_pickle = []

    class TreeModel(nn.Module):
        def __init__(self):
            super(TreeModel, self).__init__()

        def forward(self, hidden):
            self.embedding = seqback(hidden)
            return model(self.embedding)

        def set_temp(self, temp):
            seqback.temp = temp

        def get_embedding(self):
            return self.embedding

        def get_seqback(self):
            return seqback

    tree_model = TreeModel()
    for batch in get_batch(data_val, has_tree=True):
        seqback.sentences = batch['data']
        seqback.batch_trees = batch['tree']
        seqback.batch_masks = batch['mask']
        seqback.batch_splitted_sentences = batch['split_text']
        seqback.start = batch['add_start']
        seqback.end = batch['add_end']
        batch_add_start = batch['add_start']
        batch_add_end = batch['add_end']
        seqback.adv_sent = []

        batch_tree_embedding = []
        for bi, split_text in enumerate(batch['split_text']):
            # TODO: by default this uses the embedding of the first segment only
            batch['split_text'][bi] = [torch.tensor(x, dtype=torch.long, device=device)
                                       for x in split_text]
            sentences = [batch['split_text'][bi][0]]
            trees = [batch['tree'][bi][0]]
            masks = [batch['mask'][bi][0]]
            tree_embedding = treelstm(sentences, trees, masks)[0][0].detach()
            batch_tree_embedding.append(tree_embedding)

        hidden = torch.cat(batch_tree_embedding, dim=0)
        data = batch['data']
        model.encoder.raw_inp = batch['data']
        model.init_hidden(data.size(1))
        model.encoder.bilstm.attack_mode = True
        input_embedding = model.encoder.bilstm.encoder(data)
        # np.save('tree_attack/input.npy', input_token.cpu().numpy())

        if args.baseline:
            # baseline: add a large random perturbation instead of the optimized one
            modifier = torch.randn_like(hidden, device=device)
            modifier = F.normalize(modifier, p=2, dim=1) * 1e2
            adv_hidden = hidden + modifier
        else:
            with torch.autograd.detect_anomaly():
                adv_hidden = cw.run(tree_model, hidden, batch['attack_targets'],
                                    batch_size=hidden.shape[0], input_token=input_embedding)
        seqback.adv_sent = []

        # splice the best adversarial sentences back into the (seq_len, batch) layout
        adv_seq = torch.tensor(data).to(device)
        for bi, (add_start, add_end) in enumerate(zip(batch_add_start, batch_add_end)):
            if bi in cw.o_best_sent:
                adv_seq[add_start:add_end, bi] = cw.o_best_sent[bi]

        for i in range(len(batch['label'])):
            adv_pickle.append({
                'raw_text': vocab.tensorConvertToLabels(adv_seq[:, i]),
                'label': batch['label'][i]
            })
            try:
                logger.info(("orig:", vocab.convertToLabels(batch['add_words'][i])))
                logger.info(("adv:", vocab.tensorConvertToLabels(cw.o_best_sent[i])))
            except:
                continue

        model.encoder.raw_inp = None
        model.encoder.bilstm.attack_mode = False
        output, attention = model(adv_seq)
        output_flat = output.view(data.size(1), -1)
        prediction = torch.max(output_flat, 1)[1]
        orig_correct += batch['orig_correct'].item()
        orig_append_correct += batch['orig_append_correct'].item()
        adv_correct += torch.sum((prediction == batch['targets']).float()).item()
        targeted_success += torch.sum((prediction == batch['attack_targets']).float()).item()
        untargeted_success += untargeted_success_rate(prediction, batch['label'])
        tot += len(batch['label'])

        logger.info(("orig_correct:", orig_correct))
        logger.info(("orig_append_correct:", orig_append_correct))
        logger.info(("adv_correct:", adv_correct))
        logger.info(("targeted success rate:", targeted_success))
        logger.info(("untargeted success rate:", untargeted_success))
        logger.info(("tot:", tot))

    joblib.dump(adv_pickle, root_dir + '/adv_text.pkl')
    logger.info(("orig_correct:", orig_correct / tot))
    logger.info(("orig_append_correct:", orig_append_correct / tot))
    logger.info(("adv_correct:", adv_correct / tot))
    logger.info(("targeted success rate:", targeted_success / tot))
    logger.info(("untargeted success rate:", untargeted_success / tot))
def cw_seq_attack(data_val):
    init_attack()
    cw = CarliniL2()
    embed = torch.load(args.word_vector)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    encoder = EncoderRNN(vocab, embed.size(1), args.hidden_dim, device)
    decoder = Decoder(embed.size(1), args.hidden_dim, vocab.size(), dropout=0.0)
    generator = Seq2SeqGenerator(encoder, decoder, embed=embed).to(device)
    seqback = WrappedSeqDecoder(decoder, vocab)
    generator.load_state_dict(torch.load(args.load_ae))

    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_correct = 0
    tot = 0

    def get_seq_hidden(batch_add_words):
        # get the LSTM hidden embedding of the appended sentences
        encoder_output, hidden = encoder(batch_add_words)
        hidden = torch.stack(hidden).transpose_(0, 2).detach()
        encoder_output = encoder_output.detach()
        return encoder_output, hidden

    class SeqModel(nn.Module):
        def __init__(self):
            super(SeqModel, self).__init__()

        def forward(self, hidden):
            embedding = seqback(hidden)
            return model(embedding)

    seq_model = SeqModel()
    from torch.nn.utils.rnn import pad_sequence
    for batch in get_batch(data_val):
        # wrap each appended sentence with SOS/EOS and pad to (max_len, batch)
        batch_add_words = batch['add_words']
        sos = torch.tensor([vocab.getIndex(SOS_WORD)], dtype=torch.long)
        eos = torch.tensor([vocab.getIndex(EOS_WORD)], dtype=torch.long)
        for i, sentence in enumerate(batch_add_words):
            sentence = torch.tensor(sentence)
            sentence = torch.cat((sos, sentence, eos), 0)
            sentence = sentence.to(device)
            batch_add_words[i] = sentence
        batch_add_words = pad_sequence(batch_add_words, padding_value=vocab.getIndex(PAD_WORD))

        encoder_output, hidden = get_seq_hidden(batch_add_words)
        seqback.trg = batch_add_words
        seqback.encoder_output = encoder_output
        seqback.start = batch['add_start']
        seqback.end = batch['add_end']
        seqback.sentences = batch['data']
        seqback.adv_sent = []

        data = batch['data']
        model.encoder.raw_inp = batch['data']
        model.init_hidden(data.size(1))
        model.encoder.bilstm.attack_mode = True

        if args.baseline:
            modifier = torch.randn_like(hidden, device=device)
            modifier = F.normalize(modifier, p=2, dim=3) * 1e2
            adv_hidden = hidden + modifier
        else:
            adv_hidden = cw.run(seq_model, hidden, batch['attack_targets'],
                                batch_size=hidden.shape[0])
            adv_hidden = torch.tensor(adv_hidden).to(device)
        seqback.adv_sent = []

        output, attention = seq_model(adv_hidden)
        output_flat = output.view(data.size(1), -1)
        prediction = torch.max(output_flat, 1)[1]
        orig_correct += batch['orig_correct'].item()
        adv_correct += torch.sum((prediction == batch['targets']).float()).item()
        targeted_success += torch.sum((prediction == batch['attack_targets']).float()).item()
        untargeted_success += untargeted_success_rate(prediction, batch['label'])
        tot += len(batch['label'])

        for adv, orig in zip(seqback.adv_sent, batch['add_words']):
            print("orig:", vocab.tensorConvertToLabels(orig[1:], vocab.getIndex(PAD_WORD))[:-1],
                  file=adv_sent_file)
            print("adv:", adv[:-1], file=adv_sent_file)

        print("orig_correct:", orig_correct)
        print("adv_correct:", adv_correct)
        print("targeted success rate:", targeted_success)
        print("untargeted success rate:", untargeted_success)
        print("tot:", tot)

    print("orig_correct:", orig_correct / tot)
    print("adv_correct:", adv_correct / tot)
    print("targeted success rate:", targeted_success / tot)
    print("untargeted success rate:", untargeted_success / tot)
def cw_word_attack(data_val):
    init_attack()

    # dump the word vectors once, in word2vec text format, if not already present
    # fname = "/home/wbx/yelp/vectors.kv"
    fname = "full-vectors.kv"
    if not os.path.isfile(fname):
        embed = model.encoder.bilstm.encoder.weight
        with open(fname, "a") as f:
            print(len(vocab.idxToLabel), embed.shape[1], file=f)
            for k, v in vocab.idxToLabel.items():
                vector = " ".join(str(x) for x in embed[k].cpu().numpy())
                print(v, vector, file=f)

    device = torch.device("cuda:0" if args.cuda else "cpu")
    adv_correct = 0
    targeted_success = 0
    untargeted_success = 0
    orig_correct = 0
    orig_append_correct = 0
    tot = 0
    adv_pickle = []

    cw = CarliniL2(debug=args.debugging)
    for batch in get_batch(data_val):
        data = batch['data']
        attack_targets = batch['attack_targets']
        batch_add_start = batch['add_start']
        batch_add_end = batch['add_end']
        text = batch['text']
        split_text = batch['split_text']
        label = batch['label']

        # convert text into embeddings and attack in the embedding space
        model.encoder.raw_inp = data
        model.init_hidden(data.size(1))
        model.encoder.bilstm.attack_mode = True
        input_embedding = model.encoder.bilstm.encoder(data)

        # mask restricts the perturbation to the appended tokens, (seq_len, batch) layout
        cw_mask = np.zeros(input_embedding.shape).astype(np.float32)
        for bi, (add_start, add_end) in enumerate(zip(batch_add_start, batch_add_end)):
            cw_mask[add_start:add_end, bi] = 1
        cw_mask = torch.from_numpy(cw_mask).float()
        if args.cuda:
            cw_mask = cw_mask.cuda()
        cw.batch_info = batch
        cw.wv = model.encoder.bilstm.encoder.weight

        if args.baseline:
            modifier = torch.randn_like(input_embedding, device=device)
            modifier = F.normalize(modifier, p=2, dim=2) * 10
            adv_data = input_embedding + modifier * cw_mask
            adv_data = adv_data.cpu().detach().numpy()
        else:
            cw.mask = cw_mask
            adv_data = cw.run(model, input_embedding, attack_targets)
        # adv_hidden = torch.tensor(adv_data).to(device)

        # splice the best adversarial tokens back into the sequence
        adv_seq = torch.tensor(data).to(device)
        for bi, (add_start, add_end) in enumerate(zip(batch_add_start, batch_add_end)):
            if bi in cw.o_best_sent:
                adv_seq.data[add_start:add_end, bi] = torch.LongTensor(cw.o_best_sent[bi])

        for i in range(len(split_text)):
            adv_pickle.append({
                'raw_text': vocab.tensorConvertToLabels(adv_seq[:, i]),
                'label': label[i]
            })
            try:
                logger.info(("orig:", vocab.convertToLabels(split_text[i][0])))
                logger.info(("adv:", vocab.convertToLabels(cw.o_best_sent[i])))
            except:
                continue

        model.encoder.raw_inp = None
        model.encoder.bilstm.attack_mode = False
        output, attention = model(adv_seq)
        output_flat = output.view(data.size(1), -1)
        prediction = torch.max(output_flat, 1)[1]
        targets = batch['targets']
        orig_correct += batch['orig_correct'].item()
        orig_append_correct += batch['orig_append_correct'].item()
        adv_correct += torch.sum((prediction == targets).float()).item()
        targeted_success += torch.sum((prediction == attack_targets).float()).item()
        untargeted_success += untargeted_success_rate(prediction, label)
        tot += len(label)

        logger.info(("orig_correct:", orig_correct))
        logger.info(("orig_append_correct:", orig_append_correct))
        logger.info(("adv_correct:", adv_correct))
        logger.info(("targeted success rate:", targeted_success))
        logger.info(("untargeted success rate:", untargeted_success))
        logger.info(("tot:", tot))

    joblib.dump(adv_pickle, root_dir + '/adv_text.pkl')
    logger.info(("orig_correct:", orig_correct / tot))
    logger.info(("adv_correct:", adv_correct / tot))
    logger.info(("orig_append_correct:", orig_append_correct / tot))
    logger.info(("targeted success rate:", targeted_success / tot))
    logger.info(("untargeted success rate:", untargeted_success / tot))
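
# The "full-vectors.kv" file written above follows the word2vec text format
# ("<vocab_size> <dim>" header, then one "<word> <v1> ... <vn>" line per word), so it
# can be reloaded outside the attack for nearest-neighbour inspection, e.g. with
# gensim. This is an optional convenience and nothing in the attack depends on it:
#
#   from gensim.models import KeyedVectors
#   wv = KeyedVectors.load_word2vec_format("full-vectors.kv", binary=False)
#   print(wv.most_similar("good", topn=5))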