def get_word_imp(origSents, orig_label_sents, sent2imp, sent2cluster):  # TODO: revise the name of sent2cluster
    '''Computes word importance scores.

    Lower scores are treated as more important downstream (np.argsort ascending),
    so adjectives (+0) are tried first, adverbs/verbs (+15) next, all other tags
    (+50) after that, and words from sentences that no longer predict the
    original label (constant 300) last.

    Args:
        origSents (list): Sentences of the original text
        orig_label_sents (list): Sentences whose predicted label matches the original label of the text
        sent2imp (dict): Maps sentences to their importance ranking
        sent2cluster (dict): Maps sentences in origSents to the sentence cluster they belong to
            (sentences with the original label are merged into clusters when the text
            has more than 12 of them, to reduce the length of origSents)
    '''
    import_scores = []
    nlp = spacy.load('en')
    for sent in origSents:
        if " " not in str(sent):
            text_sent = [str(sent)]
        else:
            text_tokens = nlp(sent)
            text_sent = [str(word) for word in text_tokens]
        if sent not in orig_label_sents:
            # sentence no longer carries the original label; rank its words last
            import_scores.extend([300] * len(text_sent))
        else:
            pos_tags = criteria.get_pos(text_sent)
            if sent in sent2cluster:
                sent_imp = sent2imp[sent2cluster[sent]]
            else:
                sent_imp = sent2imp[sent]
            for i1 in range(len(text_sent)):
                if pos_tags[i1] == 'ADV':
                    import_scores.append(sent_imp + 15)
                elif pos_tags[i1] == 'VERB':
                    import_scores.append(sent_imp + 15)
                elif pos_tags[i1] == 'ADJ':
                    import_scores.append(sent_imp)
                else:
                    import_scores.append(sent_imp + 50)
    import_scores = np.array(import_scores)
    return import_scores

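
# A minimal, self-contained sketch (illustrative; not called by the pipeline) of
# the offset rule used in get_word_imp above, without the spacy/criteria
# dependencies. The function name is hypothetical.
def _toy_word_offsets(pos_tags, sent_imp):
    """Map coarse POS tags to the importance offsets used by get_word_imp."""
    offsets = {'ADJ': 0, 'ADV': 15, 'VERB': 15}
    return [sent_imp + offsets.get(tag, 50) for tag in pos_tags]

# e.g. _toy_word_offsets(['ADJ', 'VERB', 'NOUN'], 10) -> [10, 25, 60]
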
def random_attack(text_ls, true_label, predictor, perturb_ratio, stop_words_set,
                  word2idx, idx2word, cos_sim, sim_predictor=None,
                  import_score_threshold=-1., sim_score_threshold=0.5,
                  sim_score_window=15, synonym_num=50, batch_size=32):
    # first check the prediction of the original text
    orig_probs = predictor([text_ls]).squeeze()
    orig_label = torch.argmax(orig_probs)
    orig_prob = orig_probs.max()
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0
    else:
        len_text = len(text_ls)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        half_sim_score_window = (sim_score_window - 1) // 2
        num_queries = 1

        # get the pos and verb tense info
        pos_ls = criteria.get_pos(text_ls)

        # randomly pick the words to perturb
        perturb_idxes = random.sample(range(len_text), int(len_text * perturb_ratio))
        words_perturb = [(idx, text_ls[idx]) for idx in perturb_idxes]

        # find synonyms
        words_perturb_idx = [word2idx[word] for idx, word in words_perturb if word in word2idx]
        synonym_words, _ = pick_most_similar_words_batch(words_perturb_idx, cos_sim, idx2word, synonym_num, 0.5)
        synonyms_all = []
        for idx, word in words_perturb:
            if word in word2idx:
                synonyms = synonym_words.pop(0)
                if synonyms:
                    synonyms_all.append((idx, synonyms))

        # start replacing and attacking
        text_prime = text_ls[:]
        text_cache = text_prime[:]
        num_changed = 0
        for idx, synonyms in synonyms_all:
            new_texts = [text_prime[:idx] + [synonym] + text_prime[min(idx + 1, len_text):]
                         for synonym in synonyms]
            new_probs = predictor(new_texts, batch_size=batch_size)

            # compute semantic similarity over a window around the perturbed word
            if idx >= half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
                text_range_min = idx - half_sim_score_window
                text_range_max = idx + half_sim_score_window + 1
            elif idx < half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
                text_range_min = 0
                text_range_max = sim_score_window
            elif idx >= half_sim_score_window and len_text - idx - 1 < half_sim_score_window:
                text_range_min = len_text - sim_score_window
                text_range_max = len_text
            else:
                text_range_min = 0
                text_range_max = len_text
            semantic_sims = \
                sim_predictor.semantic_sim([' '.join(text_cache[text_range_min:text_range_max])] * len(new_texts),
                                           list(map(lambda x: ' '.join(x[text_range_min:text_range_max]), new_texts)))[0]

            num_queries += len(new_texts)
            if len(new_probs.shape) < 2:
                new_probs = new_probs.unsqueeze(0)
            new_probs_mask = (orig_label != torch.argmax(new_probs, dim=-1)).data.cpu().numpy()
            # prevent bad synonyms
            new_probs_mask *= (semantic_sims >= sim_score_threshold)
            # prevent incompatible pos
            synonyms_pos_ls = [criteria.get_pos(new_text[max(idx - 4, 0):idx + 5])[min(4, idx)]
                               if len(new_text) > 10 else criteria.get_pos(new_text)[idx]
                               for new_text in new_texts]
            pos_mask = np.array(criteria.pos_filter(pos_ls[idx], synonyms_pos_ls))
            new_probs_mask *= pos_mask

            if np.sum(new_probs_mask) > 0:
                text_prime[idx] = synonyms[(new_probs_mask * semantic_sims).argmax()]
                num_changed += 1
                break
            else:
                new_label_probs = new_probs[:, orig_label] + torch.from_numpy(
                    (semantic_sims < sim_score_threshold) + (1 - pos_mask).astype(float)).float().cuda()
                new_label_prob_min, new_label_prob_argmin = torch.min(new_label_probs, dim=-1)
                if new_label_prob_min < orig_prob:
                    text_prime[idx] = synonyms[new_label_prob_argmin]
                    num_changed += 1
            text_cache = text_prime[:]
        return ' '.join(text_prime), num_changed, orig_label, torch.argmax(predictor([text_prime])), num_queries

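
# The window clamping above reappears in every attack function in this file;
# the code elsewhere wraps the same logic in helpers (get_semantic_sim_window,
# calc_window). A standalone sketch, assuming nothing beyond the standard
# library; the name _sim_window is hypothetical.
def _sim_window(idx, len_text, sim_score_window=15):
    """Return (min, max) bounds of a sim_score_window-sized slice centred on idx,
    clamped at the text boundaries."""
    half = (sim_score_window - 1) // 2
    if idx >= half and len_text - idx - 1 >= half:
        return idx - half, idx + half + 1
    if idx < half <= len_text - idx - 1:
        return 0, sim_score_window
    if idx >= half > len_text - idx - 1:
        return len_text - sim_score_window, len_text
    return 0, len_text

# e.g. _sim_window(0, 50) -> (0, 15); _sim_window(25, 50) -> (18, 33)
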
def contextual_attack(text_ls, true_label, predictor, maskedLM_predictor, stop_words_set,
                      word2idx, idx2word, cos_sim, sim_predictor=None,
                      import_score_threshold=-1., sim_score_threshold=0.5,
                      sim_score_window=15, synonym_num=50, batch_size=32):
    # first check the prediction of the original text
    orig_probs = predictor([text_ls]).squeeze()
    orig_label = torch.argmax(orig_probs)
    orig_prob = orig_probs.max()
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0
    else:
        len_text = len(text_ls)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        half_sim_score_window = (sim_score_window - 1) // 2
        num_queries = 1

        # get the pos and verb tense info
        pos_ls = criteria.get_pos(text_ls)

        # get importance scores by masking out one word at a time
        leave_1_texts = [text_ls[:ii] + ['<oov>'] + text_ls[min(ii + 1, len_text):]
                         for ii in range(len_text)]
        leave_1_probs = predictor(leave_1_texts, batch_size=batch_size)
        num_queries += len(leave_1_texts)
        leave_1_probs_argmax = torch.argmax(leave_1_probs, dim=-1)
        import_scores = (orig_prob - leave_1_probs[:, orig_label]
                         + (leave_1_probs_argmax != orig_label).float()
                         * (leave_1_probs.max(dim=-1)[0]
                            - torch.index_select(orig_probs, 0, leave_1_probs_argmax))).data.cpu().numpy()

        # get words to perturb, ranked by importance score
        words_perturb = []
        for idx, score in sorted(enumerate(import_scores), key=lambda x: x[1], reverse=True):
            try:
                if score > import_score_threshold and text_ls[idx] not in stop_words_set:
                    words_perturb.append((idx, text_ls[idx]))
            except:
                print(idx, len(text_ls), import_scores.shape, text_ls, len(leave_1_texts))

        # generate candidate substitutions with the masked language model
        synonyms_all = []
        for idx, word in words_perturb:
            synonyms = []
            if idx >= 127:
                # the masked LM runs with a maximum sequence length of 128
                continue
            masked_text = text_ls[:idx] + ['[MASK]'] + text_ls[min(idx + 1, len_text):]
            # query the masked LM for this position only; re-predicting an
            # accumulated batch of all masked texts would break the fixed
            # (1, 128, -1) reshape below
            masked_lm_probs = maskedLM_predictor.text_pred([masked_text], batch_size=batch_size)
            values, indices = torch.topk(masked_lm_probs, 25, dim=-1)
            tokens = maskedLM_predictor.convert_ids_to_tokens(indices.view(-1).cpu().numpy())
            tokens = np.reshape(tokens, (1, 128, -1))
            for i in range(25):
                candidate = tokens[0][idx][i]
                if candidate in word2idx:
                    synonyms.append(candidate)
            if synonyms:
                synonyms_all.append((idx, synonyms))

        # start replacing and attacking
        text_prime = text_ls[:]
        text_cache = text_prime[:]
        num_changed = 0
        for idx, synonyms in synonyms_all:
            new_texts = [text_prime[:idx] + [synonym] + text_prime[min(idx + 1, len_text):]
                         for synonym in synonyms]
            if not new_texts:
                continue
            new_probs = predictor(new_texts, batch_size=batch_size)

            # compute semantic similarity over a window around the perturbed word
            if idx >= half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
                text_range_min = idx - half_sim_score_window
                text_range_max = idx + half_sim_score_window + 1
            elif idx < half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
                text_range_min = 0
                text_range_max = sim_score_window
            elif idx >= half_sim_score_window and len_text - idx - 1 < half_sim_score_window:
                text_range_min = len_text - sim_score_window
                text_range_max = len_text
            else:
                text_range_min = 0
                text_range_max = len_text
            semantic_sims = \
                sim_predictor.semantic_sim([' '.join(text_cache[text_range_min:text_range_max])] * len(new_texts),
                                           list(map(lambda x: ' '.join(x[text_range_min:text_range_max]), new_texts)))[0]

            num_queries += len(new_texts)
            if len(new_probs.shape) < 2:
                new_probs = new_probs.unsqueeze(0)
            new_probs_mask = (orig_label != torch.argmax(new_probs, dim=-1)).data.cpu().numpy()
            # prevent bad synonyms
            new_probs_mask *= (semantic_sims >= sim_score_threshold)
            # prevent incompatible pos
            synonyms_pos_ls = [criteria.get_pos(new_text[max(idx - 4, 0):idx + 5])[min(4, idx)]
                               if len(new_text) > 10 else criteria.get_pos(new_text)[idx]
                               for new_text in new_texts]
            pos_mask = np.array(criteria.pos_filter(pos_ls[idx], synonyms_pos_ls))
            new_probs_mask *= pos_mask

            if np.sum(new_probs_mask) > 0:
                text_prime[idx] = synonyms[(new_probs_mask * semantic_sims).argmax()]
                num_changed += 1
                break
            else:
                new_label_probs = new_probs[:, orig_label] + torch.from_numpy(
                    (semantic_sims < sim_score_threshold) + (1 - pos_mask).astype(float)).float().cuda()
                new_label_prob_min, new_label_prob_argmin = torch.min(new_label_probs, dim=-1)
                if new_label_prob_min < orig_prob:
                    text_prime[idx] = synonyms[new_label_prob_argmin]
                    num_changed += 1
            text_cache = text_prime[:]
        return ' '.join(text_prime), num_changed, orig_label, torch.argmax(predictor([text_prime])), num_queries

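
# For reference, a self-contained sketch of the per-position masked-LM
# candidate generation that contextual_attack delegates to maskedLM_predictor.
# This assumes the HuggingFace transformers (>=4) API rather than the
# project's wrapper, so treat it as illustrative only.
def _demo_mlm_candidates(text="the film was [MASK] watchable", k=25):
    import torch
    from transformers import BertTokenizer, BertForMaskedLM
    tok = BertTokenizer.from_pretrained('bert-base-uncased')
    mlm = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()
    enc = tok(text, return_tensors='pt')
    # locate the [MASK] position in the wordpiece sequence
    pos = (enc['input_ids'][0] == tok.mask_token_id).nonzero(as_tuple=True)[0]
    with torch.no_grad():
        logits = mlm(**enc).logits              # (1, seq_len, vocab_size)
    top = logits[0, pos].topk(k, dim=-1)        # top-k fillers for the mask
    return tok.convert_ids_to_tokens(top.indices[0].tolist())
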
def attack(cmodel, gcp_nlp_json_link, text_ls, true_label, stop_words_set,
           word2idx_rev, idx2word_rev, idx2word_vocab, cos_sim, pos_filter,
           sim_score_threshold=0.5, sim_score_window=15, synonym_num=80,
           syn_sim=0.65):
    '''Implementation of the black-box attack algorithm.

    Args:
        cmodel (Model obj): Model to be attacked
        gcp_nlp_json_link (str): Path to the Google Cloud Platform NLP API JSON key file
        text_ls (str): Text to be attacked
        true_label (int): True class of text_ls
        stop_words_set (set): Words that must not be perturbed
        word2idx_rev (dict): Maps words to indices in the precomputed cosine-similarity square matrix
        idx2word_rev (dict): Maps indices of the precomputed cosine-similarity square matrix back to words
        idx2word_vocab (dict): Maps vocabulary indices back to words
        cos_sim (numpy array): Precomputed cosine-similarity square matrix
        pos_filter (str): 'fine' for fine-grained (NLTK) POS filtering, otherwise coarse tags are used
        sim_score_threshold (float): Semantic-similarity threshold for accepting or rejecting synonyms, default 0.5
        sim_score_window (int): Window size around the perturbed word for computing semantic similarity
            between the actual and perturbed text
        synonym_num (int): Maximum number of candidate synonyms to be analysed
        syn_sim (float): Cosine-similarity threshold between candidate synonyms and the original word, default 0.65
    '''
    if gcp_nlp_json_link:
        tmodel = model(gcp=True, gcp_nlp_json_link=gcp_nlp_json_link)
    else:
        tmodel = model(cmodel)
    text_temp = text_ls[:]
    orig_label = tmodel.getPredictions([text_ls])[0]
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0
    else:
        nlp = spacy.load('en')
        doc = nlp(str(text_ls))
        text_ls = [str(j) for j in doc]
        len_text = len(text_ls)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        num_queries = 1

        # get the pos info
        if pos_filter == 'fine':
            pos_ls1 = nltk.pos_tag(text_ls)
            pos_ls = [pos_ls1[i][1] for i in range(len(text_ls))]
        else:
            pos_ls = criteria.get_pos(text_ls)

        # sentence segmentation
        # TODO: look into spacy for a more principled way to get phrases or chunks
        sents_sentiment_dic = defaultdict(list)
        text_sentences = nlp(text_temp)
        sents1 = text_sentences.sents
        sents = [str(sent) for sent in sents1]
        if len(sents) == 1:
            # a single sentence: fall back to fixed four-token chunks
            sent = sents[0]
            tokens = nlp(sent)
            if len(tokens) > 4:
                sents = [str(tokens[i:i + 4]) for i in range(0, len(tokens), 4)]

        # segregate positive and negative sentences
        preds = tmodel.getPredictions(list(sents))
        num_queries += len(sents)
        for i in range(len(preds)):
            sents_sentiment_dic[preds[i]].append(sents[i])
        orig_sents_ln = len(sents)
        origSents = sents[:]
        orig_label_sents = sents_sentiment_dic[orig_label][:]

        # curtail sentences carrying the original label: merge them into at most 12 clusters
        sent2cluster = {}
        if len(orig_label_sents) > 12:
            ln = len(orig_label_sents)
            mult = int(np.ceil(ln / 12))
            new_list = []
            for q in range(0, ln, mult):
                if q + mult < ln:
                    new_sent_list = sents_sentiment_dic[orig_label][q:q + mult]
                else:
                    new_sent_list = sents_sentiment_dic[orig_label][q:]
                new_sent_str = ' '.join(new_sent_list)
                new_list.append(new_sent_str)
                for snt in new_sent_list:
                    sent2cluster[snt] = new_sent_str
                    sents.remove(snt)
                sents.append(new_sent_str)
            sents_sentiment_dic[orig_label] = new_list

        # get sentence importance ranking
        top_sent_imp, word_agg_dic, sent2imp, num_queries = get_sentence_imp_ranking(
            sents_sentiment_dic, num_queries, orig_label, tmodel)
        # get word importance scores
        import_scores = get_word_imp(origSents, orig_label_sents, sent2imp, sent2cluster)

        # get words to perturb, ranked by importance score
        words_perturb = []
        text_prime = text_ls[:]
        imp_indxs = np.argsort(import_scores).tolist()
        for idx in imp_indxs:
            if text_prime[idx] not in stop_words_set:
                words_perturb.append((idx, text_prime[idx]))

        # find synonyms
        words_perturb_idx = [word2idx_rev[word] for idx, word in words_perturb if word in word2idx_rev]
        synonym_words, _ = pick_most_similar_words_batch(
            words_perturb_idx, cos_sim, idx2word_vocab, synonym_num, syn_sim)
        synonyms_all = []
        for idx, word in words_perturb:
            if word in word2idx_rev:
                synonyms = synonym_words.pop(0)
                if synonyms:
                    synonyms_all.append((idx, synonyms))

        # start replacing and attacking
        num_changed = 0
        backtrack_dic = {}
        misclassified = False
        visited = {}
        # map each word to its indices in text_prime
        word_idx_dic = {}
        for idx in range(len(text_prime)):
            word = text_prime[idx]
            if word in word_idx_dic:
                word_idx_dic[word].append(idx)
            else:
                word_idx_dic[word] = [idx]
            visited[word] = False
        origtext_prime = text_prime.copy()
        len_text = len(text_prime)

        for idx, synonyms in synonyms_all:
            if len(origtext_prime[idx]) <= 1 or visited[origtext_prime[idx]]:
                continue
            if misclassified:
                # backtrack to undo unnecessary perturbations
                for (wrd, index) in backtrack_dic:
                    txt_temp = text_prime[:]
                    txt_temp[index] = backtrack_dic[(wrd, index)]
                    txt_temp = vowel_correction(txt_temp, index)
                    pred = tmodel.getPredictions([' '.join(txt_temp)])[0]
                    num_queries += 1
                    if pred != orig_label:
                        # still misclassified without this perturbation; keep the revert
                        text_prime = txt_temp[:]
                        num_changed -= 1
                break
            if num_queries >= 5000:
                break
            text_range_min, text_range_max = get_semantic_sim_window(idx, len_text, sim_score_window)

            # Step 1: find all aggregates (with orig_label) to which the target word belongs
            target_word = text_prime[idx]
            visited[target_word] = True
            agg_list = []
            if target_word in word_agg_dic:
                agg_list = list(set(word_agg_dic[target_word]))
                word_agg_dic[target_word] = []
            orig_sentiment_sent = []
            for sent1 in sents_sentiment_dic[orig_label]:
                if target_word in sent1 and sent1 not in agg_list:
                    orig_sentiment_sent.append(sent1)

            # check whether any synonym makes the entire review/text misclassify
            if pos_filter == 'fine':
                new_pos = np.array([nltk.pos_tag(text_prime[:idx] + [syn] + text_prime[idx + 1:])[idx][1]
                                    for syn in synonyms])
            else:
                new_pos = np.array([criteria.get_pos(text_prime[:idx] + [syn] + text_prime[idx + 1:])[idx]
                                    for syn in synonyms])
            pos_mask = (new_pos == pos_ls[idx]).astype(int)
            rev_with_syns1 = [text_prime[:idx] + [syn] + text_prime[idx + 1:] for syn in synonyms]
            sem_sims1 = np.array([semantic_sim([' '.join(rev_with_syn[text_range_min:text_range_max])],
                                               [' '.join(text_prime[text_range_min:text_range_max])])
                                  for rev_with_syn in rev_with_syns1])
            sem_sim_mask = (sem_sims1 >= sim_score_threshold).astype(int)
            # apply the pos and semantic-similarity masks to the synonyms
            synonyms_masked = [synonyms[i] for i in range(len(synonyms))
                               if pos_mask[i] == 1 and sem_sim_mask[i] == 1]
            rev_with_syns = [text_prime[:idx] + [syn] + text_prime[idx + 1:] for syn in synonyms_masked]
            sem_sims = np.array([semantic_sim([' '.join(rev_with_syn[text_range_min:text_range_max])],
                                              [' '.join(text_prime[text_range_min:text_range_max])])
                                 for rev_with_syn in rev_with_syns])
            # sort the surviving synonyms by semantic similarity score
            sort_order = dict(zip(synonyms_masked, sem_sims))
            synonyms_sorted = sorted(synonyms_masked, key=sort_order.get)

            rev_str = ' '.join(text_prime)
            vowels = {'a', 'e', 'i', 'o', 'u'}
            # fix the preceding article when the replacement's onset switches
            # between vowel and consonant, then substitute the word itself
            revs_with_synonyms1 = [
                re.sub(r'\b{}\s+{}\b'.format('a', target_word), 'an ' + syn, rev_str)
                if syn[0] in vowels
                else re.sub(r'\b{}\s+{}\b'.format('an', target_word), 'a ' + syn, rev_str)
                for syn in synonyms_sorted
            ]
            revs_with_synonyms = [
                re.sub(r'\b{}\b'.format(target_word), synonyms_sorted[i], revs_with_synonyms1[i])
                for i in range(len(synonyms_sorted))
            ]
            changed = False
            for i in range(len(revs_with_synonyms)):
                num_queries += 1
                pred = tmodel.getPredictions([revs_with_synonyms[i]])[0]
                if pred != orig_label:
                    changed = True
                    sel_sym = synonyms_sorted[i]
                    misclassified = True
                    break

            # check whether any synonym flips a sentence that originally had
            # the same label as the review
            if not changed and len(orig_sentiment_sent) > 0:
                sents_with_syns1 = [[re.sub(r'\b{}\s+{}\b'.format('a', target_word), 'an ' + syn, sent)
                                     if syn[0] in vowels
                                     else re.sub(r'\b{}\s+{}\b'.format('an', target_word), 'a ' + syn, sent)
                                     for sent in orig_sentiment_sent]
                                    for syn in synonyms_sorted]
                sents_with_syns = [[re.sub(r'\b{}\b'.format(target_word), synonyms_sorted[i], sent)
                                    for sent in sents_with_syns1[i]]
                                   for i in range(len(synonyms_sorted))]
                for i in range(len(sents_with_syns)):
                    num_queries += len(sents_with_syns[i])
                    if tmodel.getPredictions(sents_with_syns[i]).count(orig_label) < len(sents_with_syns[i]):
                        changed = True
                        sel_sym = synonyms_sorted[i]
                        break

            # check whether any synonym flips an aggregate that originally had
            # the same label as the review
            if not changed and len(agg_list) > 0:
                aggs_with_syns1 = [[re.sub(r'\b{}\s+{}\b'.format('a', target_word), 'an ' + syn, agg)
                                    if syn[0] in vowels
                                    else re.sub(r'\b{}\s+{}\b'.format('an', target_word), 'a ' + syn, agg)
                                    for agg in agg_list]
                                   for syn in synonyms_sorted]
                aggs_with_syns = [[re.sub(r'\b{}\b'.format(target_word), synonyms_sorted[i], agg)
                                   for agg in aggs_with_syns1[i]]
                                  for i in range(len(synonyms_sorted))]
                for i in range(len(synonyms_sorted)):
                    num_queries += len(agg_list)
                    if tmodel.getPredictions(aggs_with_syns[i]).count(orig_label) < len(aggs_with_syns[i]):
                        changed = True
                        sel_sym = synonyms_sorted[i]
                        break

            if changed:
                # replace every occurrence of the target word and remember the
                # original for possible backtracking
                for indx in word_idx_dic[str(target_word)]:
                    text_prime[indx] = sel_sym
                    text_prime = vowel_correction(text_prime[:], indx)
                    backtrack_dic[(sel_sym, indx)] = target_word
                num_changed += 1

        text_prime = ' '.join(text_prime)
        probs = tmodel.getPredictions([text_prime])
        return text_prime, num_changed, orig_label, probs[0], num_queries

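
# A standalone check (toy input, not part of the attack) of the article fix-up
# used above: when the replacement's onset switches between vowel and
# consonant, the preceding "a"/"an" is rewritten before the word itself. The
# function name and defaults are hypothetical.
def _demo_article_fix(rev="it was a terrible film", target="terrible", syn="awful"):
    import re
    vowels = {'a', 'e', 'i', 'o', 'u'}
    if syn[0] in vowels:
        rev = re.sub(r'\b{}\s+{}\b'.format('a', target), 'an ' + syn, rev)
    else:
        rev = re.sub(r'\b{}\s+{}\b'.format('an', target), 'a ' + syn, rev)
    return re.sub(r'\b{}\b'.format(target), syn, rev)

# _demo_article_fix() -> "it was an awful film"
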
def attack(text_ls, true_label, predictor, stop_words_set, word2idx, idx2word,
           cos_sim, sim_predictor=None, import_score_threshold=-1.0,
           sim_score_threshold=0.5, sim_score_window=15, synonym_num=50,
           batch_size=32):
    # first check the prediction of the original text
    orig_probs = predictor([text_ls]).squeeze()
    orig_label = torch.argmax(orig_probs)
    orig_prob = orig_probs.max()
    if true_label != orig_label:
        return "", 0, orig_label, orig_label, 0
    else:
        len_text = len(text_ls)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        half_sim_score_window = (sim_score_window - 1) // 2
        num_queries = 1

        # get the pos and verb tense info
        pos_ls = criteria.get_pos(text_ls)

        # get importance score
        leave_1_texts = [text_ls[:ii] + ["<oov>"] + text_ls[min(ii + 1, len_text):]
                         for ii in range(len_text)]
        leave_1_probs = predictor(leave_1_texts, batch_size=batch_size)
        num_queries += len(leave_1_texts)
        leave_1_probs_argmax = torch.argmax(leave_1_probs, dim=-1)
        import_scores = ((orig_prob - leave_1_probs[:, orig_label]
                          + (leave_1_probs_argmax != orig_label).float()
                          * (leave_1_probs.max(dim=-1)[0]
                             - torch.index_select(orig_probs, 0, leave_1_probs_argmax)))
                         .data.cpu().numpy())

        # get words to perturb, ranked by importance score
        words_perturb = []
        for idx, score in sorted(enumerate(import_scores), key=lambda x: x[1], reverse=True):
            try:
                if (score > import_score_threshold
                        and text_ls[idx] not in stop_words_set):
                    words_perturb.append((idx, text_ls[idx]))
            except:
                print(idx, len(text_ls), import_scores.shape, text_ls, len(leave_1_texts))

        # find synonyms
        words_perturb_idx = [word2idx[word] for idx, word in words_perturb if word in word2idx]
        synonym_words, _ = pick_most_similar_words_batch(
            words_perturb_idx, cos_sim, idx2word, synonym_num, 0.5)
        synonyms_all = []
        for idx, word in words_perturb:
            if word in word2idx:
                synonyms = synonym_words.pop(0)
                if synonyms:
                    synonyms_all.append((idx, synonyms))

        # start replacing and attacking
        text_prime = text_ls[:]
        text_cache = text_prime[:]
        num_changed = 0
        for idx, synonyms in synonyms_all:
            new_texts = [text_prime[:idx] + [synonym] + text_prime[min(idx + 1, len_text):]
                         for synonym in synonyms]
            new_probs = predictor(new_texts, batch_size=batch_size)

            # compute semantic similarity over a window around the perturbed word
            if (idx >= half_sim_score_window
                    and len_text - idx - 1 >= half_sim_score_window):
                text_range_min = idx - half_sim_score_window
                text_range_max = idx + half_sim_score_window + 1
            elif (idx < half_sim_score_window
                    and len_text - idx - 1 >= half_sim_score_window):
                text_range_min = 0
                text_range_max = sim_score_window
            elif (idx >= half_sim_score_window
                    and len_text - idx - 1 < half_sim_score_window):
                text_range_min = len_text - sim_score_window
                text_range_max = len_text
            else:
                text_range_min = 0
                text_range_max = len_text
            semantic_sims = sim_predictor.semantic_sim(
                [" ".join(text_cache[text_range_min:text_range_max])] * len(new_texts),
                list(map(lambda x: " ".join(x[text_range_min:text_range_max]), new_texts)))[0]

            num_queries += len(new_texts)
            if len(new_probs.shape) < 2:
                new_probs = new_probs.unsqueeze(0)
            new_probs_mask = (orig_label != torch.argmax(new_probs, dim=-1)).data.cpu().numpy()
            # prevent bad synonyms
            new_probs_mask *= semantic_sims >= sim_score_threshold
            # prevent incompatible pos
            synonyms_pos_ls = [criteria.get_pos(new_text[max(idx - 4, 0):idx + 5])[min(4, idx)]
                               if len(new_text) > 10 else criteria.get_pos(new_text)[idx]
                               for new_text in new_texts]
            pos_mask = np.array(criteria.pos_filter(pos_ls[idx], synonyms_pos_ls))
            new_probs_mask *= pos_mask

            if np.sum(new_probs_mask) > 0:
                text_prime[idx] = synonyms[(new_probs_mask * semantic_sims).argmax()]
                num_changed += 1
                break
            else:
                new_label_probs = (new_probs[:, orig_label] + torch.from_numpy(
                    (semantic_sims < sim_score_threshold) + (1 - pos_mask).astype(float)).float().cuda())
                new_label_prob_min, new_label_prob_argmin = torch.min(new_label_probs, dim=-1)
                if new_label_prob_min < orig_prob:
                    text_prime[idx] = synonyms[new_label_prob_argmin]
                    num_changed += 1
            text_cache = text_prime[:]
        return (" ".join(text_prime), num_changed, orig_label,
                torch.argmax(predictor([text_prime])), num_queries)

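
# A toy recomputation (fabricated probabilities, illustration only) of the
# leave-one-out importance score used above: the drop in the original class's
# probability, plus a bonus term when deleting the word already flips the label.
def _demo_import_scores():
    import torch
    orig_probs = torch.tensor([0.9, 0.1])       # original prediction, label 0
    orig_label = torch.argmax(orig_probs)
    orig_prob = orig_probs.max()
    leave_1_probs = torch.tensor([[0.8, 0.2],   # word 0 removed: small drop, no flip
                                  [0.4, 0.6]])  # word 1 removed: prediction flips
    argmax = torch.argmax(leave_1_probs, dim=-1)
    return (orig_prob - leave_1_probs[:, orig_label]
            + (argmax != orig_label).float()
            * (leave_1_probs.max(dim=-1)[0]
               - torch.index_select(orig_probs, 0, argmax)))

# _demo_import_scores() -> tensor([0.1000, 1.0000]); word 1 ranks far higher
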
def attack(fuzz_val, top_k_words, qrs, wts, sample_index, text_ls, true_label,
           predictor, stop_words_set, word2idx, idx2word, cos_sim, word_embedding,
           sim_predictor=None, import_score_threshold=-1., sim_score_threshold=0.5,
           sim_score_window=15, synonym_num=50, batch_size=32):
    nlp = spacy.load('en_core_web_sm')
    masked_lang_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # re-tokenize the input with spacy and split contractions on the apostrophe
    doc = nlp(' '.join(text_ls))
    text = []
    for sent in doc.sents:
        for token in sent:
            text.append(token.text)
    tok_text = []
    for item in text:
        ap = item.find("'")
        if ap >= 0:
            tok_text.append(item[0:ap])
            tok_text.append("'")
            tok_text.append(item[ap + 1:len(item)])
        else:
            tok_text.append(item)
    text = [item for item in tok_text if len(item) > 0]
    text_ls = text[:]

    # first check the prediction of the original text
    orig_probs = predictor([text_ls]).squeeze()
    orig_label = torch.argmax(orig_probs)
    orig_prob = orig_probs.max()
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0, [], []
    else:
        len_text = len(text_ls)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        half_sim_score_window = (sim_score_window - 1) // 2
        num_queries = 1

        # get the pos and verb tense info
        pos_ls = criteria.get_pos(text_ls)

        # get importance score
        leave_1_texts = [text_ls[:ii] + ['<oov>'] + text_ls[min(ii + 1, len_text):]
                         for ii in range(len_text)]
        leave_1_probs = predictor(leave_1_texts, batch_size=batch_size)
        num_queries += len(leave_1_texts)
        leave_1_probs_argmax = torch.argmax(leave_1_probs, dim=-1)
        import_scores = (orig_prob - leave_1_probs[:, orig_label]
                         + (leave_1_probs_argmax != orig_label).float()
                         * (leave_1_probs.max(dim=-1)[0]
                            - torch.index_select(orig_probs, 0, leave_1_probs_argmax))).data.cpu().numpy()

        # get words to perturb, ranked by importance score
        words_perturb = []
        for idx, score in sorted(enumerate(import_scores), key=lambda x: x[1], reverse=True):
            try:
                if score > import_score_threshold and text_ls[idx] not in stop_words_set \
                        and len(text_ls[idx]) > 2:
                    words_perturb.append((idx, score))
            except:
                print(idx, len(text_ls), import_scores.shape, text_ls, len(leave_1_texts))

        words_perturb_idx = [word2idx[word] for idx, word in words_perturb if word in word2idx]

        # start replacing and attacking
        text_prime = text_ls[:]
        sims = []
        text_cache = text_prime[:]
        num_changed = 0
        ps = PorterStemmer()
        masked_lang_model.to('cuda')
        masked_lang_model.eval()
        for idx, score in words_perturb:
            # build a local window around the target word for the masked LM
            text_range_min, text_range_max = calc_window(idx, 3, 10, len_text)
            sliced_text = text_prime[text_range_min:text_range_max]
            masked_idx = idx - text_range_min
            tokens, words, position = gen.convert_sentence_to_token(
                ' '.join(sliced_text), 1000, tokenizer)
            assert len(words) == len(position)
            mask_position = position[masked_idx]
            if isinstance(mask_position, list):
                feature = gen.convert_whole_word_to_feature(tokens, mask_position, 1000, tokenizer)
            else:
                feature = gen.convert_token_to_feature(tokens, mask_position, 1000, tokenizer)
            tokens_tensor = torch.tensor([feature.input_ids]).to('cuda')
            token_type_ids = torch.tensor([feature.input_type_ids]).to('cuda')
            attention_mask = torch.tensor([feature.input_mask]).to('cuda')
            with torch.no_grad():
                prediction_scores = masked_lang_model(tokens_tensor, token_type_ids, attention_mask)
            if isinstance(mask_position, list):
                predicted_top = prediction_scores[0, mask_position[0]].topk(50)
            else:
                predicted_top = prediction_scores[0, mask_position].topk(50)
            pre_tokens = tokenizer.convert_ids_to_tokens(predicted_top[1].cpu().numpy())
            synonyms_initial = gen.substitution_generation(
                words[masked_idx], pre_tokens, predicted_top[0].cpu().numpy(), ps, 50)

            new_texts = []
            avg = []
            synonyms = []
            assert words[masked_idx] == text_ls[idx]
            for candidate_word in synonyms_initial:
                if candidate_word in word_embedding and words[masked_idx] in word_embedding:
                    # filter by embedding cosine similarity when both vectors exist
                    candidate_similarity = calc_similarity(
                        word_embedding[words[masked_idx]], word_embedding[candidate_word])
                    avg.append(candidate_similarity)
                    if candidate_similarity >= 0.2:
                        new_texts.append(text_prime[:idx] + [candidate_word] + text_prime[min(idx + 1, len_text):])
                        synonyms.append(candidate_word)
                else:
                    # no embedding available; keep the candidate
                    new_texts.append(text_prime[:idx] + [candidate_word] + text_prime[min(idx + 1, len_text):])
                    synonyms.append(candidate_word)
            if len(new_texts) == 0:
                continue

            text_range_min, text_range_max = calc_window(
                idx, half_sim_score_window, sim_score_window, len_text)
            semantic_sims = \
                sim_predictor.semantic_sim([' '.join(text_cache[text_range_min:text_range_max])] * len(new_texts),
                                           list(map(lambda x: ' '.join(x[text_range_min:text_range_max]), new_texts)))[0]
            sims.append(np.sum(semantic_sims) / len(semantic_sims))

            new_probs_mask = np.ones(len(new_texts))
            # prevent bad synonyms
            new_probs_mask *= (semantic_sims >= sim_score_threshold)
            # prevent incompatible pos
            synonyms_pos_ls = [criteria.get_pos(new_text[max(idx - 4, 0):idx + 5])[min(4, idx)]
                               if len(new_text) > 10 else criteria.get_pos(new_text)[idx]
                               for new_text in new_texts]
            pos_mask = np.array(criteria.pos_filter(pos_ls[idx], synonyms_pos_ls))
            new_probs_mask *= pos_mask

            # keep only candidates that pass both masks
            new_vals = semantic_sims * new_probs_mask
            index = []
            for i in range(len(new_vals)):
                if new_vals[i] > 0:
                    index.append((new_vals[i], i))
            if len(index) == 0:
                continue
            new_texts1 = [new_texts[ind] for val, ind in index]
            num_queries += len(new_texts1)
            if num_queries > qrs:
                return '', 0, orig_label, orig_label, 0, [], []
            new_probs = predictor(new_texts1, batch_size=batch_size)
            if len(new_probs.shape) < 2:
                new_probs = new_probs.unsqueeze(0)
            pr = (orig_label != torch.argmax(new_probs, dim=-1)).data.cpu().numpy()
            if np.sum(pr) > 0:
                text_prime[idx] = synonyms[index[pr.argmax()][1]]
                num_changed += 1
                break
            else:
                new_label_probs = new_probs[:, orig_label]
                new_label_prob_min, new_label_prob_argmin = torch.min(new_label_probs, dim=-1)
                if new_label_prob_min < orig_prob:
                    text_prime[idx] = synonyms[index[new_label_prob_argmin][1]]
                    num_changed += 1
            text_cache = text_prime[:]
            # stop early once the perturbed text drifts too far from the original
            if fuzz.token_set_ratio(' '.join(text_ls), ' '.join(text_cache)) < fuzz_val:
                return (' '.join(text_prime), num_changed, orig_label,
                        torch.argmax(predictor([text_prime])), num_queries, words_perturb, sims)
        return (' '.join(text_prime), num_changed, orig_label,
                torch.argmax(predictor([text_prime])), num_queries, words_perturb, sims)

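
# The early exit above relies on fuzzywuzzy's token_set_ratio, a 0-100
# token-overlap score that ignores word order and duplication. A quick sketch
# of the guard, assuming the fuzzywuzzy package; the function name is
# hypothetical.
def _demo_fuzz_guard(orig, perturbed, fuzz_val=90):
    from fuzzywuzzy import fuzz
    return fuzz.token_set_ratio(orig, perturbed) < fuzz_val  # True -> stop early
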
def text_fooler(text_ls, true_label, model, stop_words_set, word2idx, idx2word,
                cos_sim, sim_predictor=None, import_score_threshold=-1.,
                sim_score_threshold=0.7, sim_score_window=15, synonym_num=50,
                batch_size=32):
    adversaries = []
    # first check the prediction of the original text
    ref_ans, stud_ans = text_ls
    stud_ans = list_to_string(stud_ans).split(" ")
    orig_logits = predict(model, ref_ans, stud_ans, true_label)
    orig_probs = F.softmax(orig_logits, dim=0)
    orig_label = torch.argmax(orig_probs).item()
    orig_prob = orig_probs.max().item()
    if true_label != orig_label:
        return '', 0, orig_label, orig_label, 0
    else:
        len_text = len(stud_ans)
        if len_text < sim_score_window:
            sim_score_threshold = 0.1  # shut down the similarity thresholding function
        half_sim_score_window = (sim_score_window - 1) // 2
        num_queries = 1

        # get the pos and verb tense info
        pos_ls = criteria.get_pos(stud_ans)

        # get importance score
        leave_1_texts = [stud_ans[:ii] + ['[UNK]'] + stud_ans[min(ii + 1, len_text):]
                         for ii in range(len_text)]
        leave_1_probs = []
        num_queries += len(leave_1_texts)
        for new_ans in leave_1_texts:
            new_logits = predict(model, ref_ans, new_ans, true_label)
            leave_1_probs.append(F.softmax(new_logits, dim=0))
        leave_1_probs = torch.stack(leave_1_probs)
        leave_1_probs_argmax = torch.argmax(leave_1_probs, dim=-1)
        import_scores = (orig_prob - leave_1_probs[:, orig_label]
                         + (leave_1_probs_argmax != orig_label).float()
                         * (leave_1_probs.max(dim=-1)[0]
                            - torch.index_select(orig_probs, 0, leave_1_probs_argmax))).data.cpu().numpy()

        # get words to perturb, ranked by importance score
        words_perturb = []
        for idx, score in sorted(enumerate(import_scores), key=lambda x: x[1], reverse=True):
            try:
                if score > import_score_threshold and stud_ans[idx] not in stop_words_set:
                    words_perturb.append((idx, stud_ans[idx]))
            except:
                print(idx, len(stud_ans), import_scores.shape, stud_ans, len(leave_1_texts))

        # find synonyms
        words_perturb_idx = [word2idx[word] for idx, word in words_perturb if word in word2idx]
        synonym_words, _ = pick_most_similar_words_batch(
            words_perturb_idx, cos_sim, idx2word, synonym_num, 0.5)
        synonyms_all = []
        for idx, word in words_perturb:
            if word in word2idx:
                synonyms = synonym_words.pop(0)
                if synonyms:
                    synonyms_all.append((idx, synonyms))

        # start replacing and attacking
        text_prime = stud_ans[:]
        text_cache = text_prime[:]
        num_changed = 0
        for idx, synonyms in synonyms_all:
            new_texts = [text_prime[:idx] + [synonym] + text_prime[min(idx + 1, len_text):]
                         for synonym in synonyms]
            new_probs = []
            for syn_text in new_texts:
                syn_logits = predict(model, ref_ans, syn_text, true_label)
                new_probs.append(F.softmax(syn_logits, dim=0))
            new_probs = torch.stack(new_probs)

            # compute semantic similarity over a window around the perturbed word
            if idx >= half_sim_score_window and len_text - idx - 1 >= half_sim_score_window:
                text_range_min = idx - half_sim_score_window
                text_range_max = idx + half_sim_score_window + 1
            elif idx < half_sim_score_window <= len_text - idx - 1:
                text_range_min = 0
                text_range_max = sim_score_window
            elif idx >= half_sim_score_window > len_text - idx - 1:
                text_range_min = len_text - sim_score_window
                text_range_max = len_text
            else:
                text_range_min = 0
                text_range_max = len_text
            semantic_sims = \
                sim_predictor.semantic_sim([' '.join(text_cache[text_range_min:text_range_max])] * len(new_texts),
                                           list(map(lambda x: ' '.join(x[text_range_min:text_range_max]), new_texts)))[0]

            num_queries += len(new_texts)
            if len(new_probs.shape) < 2:
                new_probs = new_probs.unsqueeze(0)
            # an attack counts as successful only when the prediction flips to class 2
            new_probs_mask = (2 == torch.argmax(new_probs, dim=-1)).data.cpu().numpy()
            # prevent bad synonyms
            new_probs_mask *= (semantic_sims >= sim_score_threshold)
            # prevent incompatible pos (maybe not)
            synonyms_pos_ls = [criteria.get_pos(new_text[max(idx - 4, 0):idx + 5])[min(4, idx)]
                               if len(new_text) > 10 else criteria.get_pos(new_text)[idx]
                               for new_text in new_texts]
            pos_mask = np.array(criteria.pos_filter(pos_ls[idx], synonyms_pos_ls))
            # Uncomment to invert the mask and only allow candidates whose POS differs
            # pos_mask = np.invert(pos_mask)
            new_probs_mask *= pos_mask

            if np.sum(new_probs_mask) > 0:
                text_prime[idx] = synonyms[(new_probs_mask * semantic_sims).argmax()]
                num_changed += 1
                adversaries.append(tuple(text_prime))
                break
            """
            else:
                new_label_probs = new_probs[:, orig_label] + torch.from_numpy(
                    (semantic_sims < sim_score_threshold) + (1 - pos_mask).astype(float)).float()
                new_label_prob_min, new_label_prob_argmin = torch.min(new_label_probs, dim=-1)
                if new_label_prob_min < orig_prob:
                    text_prime[idx] = synonyms[new_label_prob_argmin]
                    num_changed += 1
                text_cache = text_prime[:]
                adversaries.append(text_cache)
            """

        # combine adversaries, keeping only those that differ from the original answer
        result = set(i for i in adversaries if list(i) != stud_ans)
        return num_changed, num_queries, result

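
# A compact illustration (toy arrays, not project data) of how the three masks
# above combine: a candidate survives only if it flips the prediction, stays
# above the semantic-similarity threshold, and passes the POS filter; the
# survivor with the highest similarity is chosen. The function name is
# hypothetical.
def _demo_candidate_masks():
    import numpy as np
    flip_mask = np.array([1, 1, 0, 1])      # prediction changed
    sims = np.array([0.9, 0.4, 0.8, 0.75])  # semantic similarity
    pos_mask = np.array([1, 1, 1, 0])       # POS preserved
    mask = flip_mask * (sims >= 0.7) * pos_mask
    # pick the surviving candidate with the highest similarity, if any
    return int((mask * sims).argmax()) if mask.sum() > 0 else None  # -> 0
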