def get_fool_sentence_pwws(self, sentence: str, label: int, index: int):
    start = time.perf_counter()
    maxlen = config_data[config_dataset].padding_maxlen
    vector = str2seq(sentence, self.vocab, self.tokenizer, maxlen).to(config_device)
    label = torch.tensor(label).to(config_device)
    flag = -1  # -1: model already misclassifies, 0: attack failed, 1: attack succeeded
    end = -1   # elapsed time; stays -1 when no attack is attempted
    predict = self.net.predict_class(vector)[0]
    if predict == label:
        sentence, adv_y, sub_rate, NE_rate, change_tuple_list = adversarial_paraphrase(
            sentence,
            vector,
            label,
            self.tokenizer,
            self.vocab,
            self.net,
            self.verbose,
            self.sub_rate_limit,
        )
        if adv_y != label:
            flag = 1
        else:
            flag = 0
        self.__write_log(index, flag, sub_rate, NE_rate, change_tuple_list)
        end = time.perf_counter() - start
    return sentence, flag, end
def get_fool_sentence_textfool(self, sentence: str, label: int, index: int):
    start = time.perf_counter()
    maxlen = config_data[config_dataset].padding_maxlen
    flag = -1  # -1: model already misclassifies, 0: attack failed, 1: attack succeeded
    end = -1   # elapsed time; stays -1 when no attack is attempted
    vector = str2seq(sentence, self.vocab, self.tokenizer, maxlen).to(config_device)
    predict = self.net.predict_class(vector)[0]
    if predict == label:
        sentence, adv_y, sub_rate, NE_rate, change_tuple_list = textfool_perturb_text(
            sentence,
            self.net,
            self.vocab,
            self.tokenizer,
            maxlen,
            label,
            self.use_typos,
            verbose=self.verbose,
            sub_rate_limit=self.sub_rate_limit,
        )
        if adv_y != label:
            flag = 1
        else:
            flag = 0
        self.__write_log(index, flag, sub_rate, NE_rate, change_tuple_list)
        end = time.perf_counter() - start
    return sentence, flag, end
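# A minimal sketch of a driver loop over a dataset, assuming a hypothetical
# `attacker` object exposing the two methods above and an iterable `dataset`
# of (sentence, label) pairs; all names here are illustrative, not from the
# repo. The `flag` convention follows the methods above.
#
# success, failure, skipped = 0, 0, 0
# for i, (sentence, label) in enumerate(dataset):
#     adv_sentence, flag, elapsed = attacker.get_fool_sentence_pwws(sentence, label, i)
#     if flag == 1:        # prediction flipped
#         success += 1
#     elif flag == 0:      # attack ran but failed
#         failure += 1
#     else:                # flag == -1: model was already wrong, no attack attempted
#         skipped += 1
# attacked = success + failure
# print(f'attack success rate: {success / max(attacked, 1):.3f}')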
def random_attack(sentence, y_true, net, vocab, tokenizer, maxlen,
                  verbose=False, sub_rate_limit=None):
    '''
    Baseline attack: randomly pick token positions and replace each with a
    random WordNet synonym candidate, stopping early once the prediction flips.
    '''
    doc = nlp(sentence)
    if sub_rate_limit:
        sub_rate_limit = int(sub_rate_limit * len(doc))
    else:
        sub_rate_limit = len(doc)

    def halt_condition_fn(perturbed_text):
        # Halt once the model's prediction differs from the true label.
        perturbed_vector = str2seq(perturbed_text, vocab, tokenizer, maxlen).to(config_device)
        predict = net.predict_class(perturbed_vector)[0]
        return predict != y_true

    # Collect synonym candidates for every token inside the padded window.
    candidates_list = []
    for idx, token in enumerate(doc):
        if idx >= maxlen:
            break
        candidates = _generate_synonym_candidates(token=token, token_position=idx, rank_fn=None)
        if len(candidates) > 0:
            candidates_list.append((idx, candidates))

    # Sample how many and which positions to substitute.
    upper = min(len(candidates_list), sub_rate_limit)
    lower = upper // 3
    sub_num = get_random(lower, upper)
    sub_pos = random.sample(candidates_list, sub_num)

    change_tuple_list = []
    accepted_candidates = []
    for token_pos, candidates in sub_pos:
        substitution = random.sample(candidates, 1)[0]
        accepted_candidates.append(substitution)
        change_tuple_list.append((token_pos, substitution.original_token,
                                  substitution.candidate_word, None, 'tagNONE'))
        if verbose:
            print(f'origin token pos {token_pos}, '
                  f'origin token {substitution.original_token}, '
                  f'candidate token {substitution.candidate_word}')
        perturbed_text = ' '.join(_compile_perturbed_tokens(doc, accepted_candidates))
        if halt_condition_fn(perturbed_text):
            break

    perturbed_text = ' '.join(_compile_perturbed_tokens(doc, accepted_candidates))
    sub_rate = len(change_tuple_list) / len(doc)
    ne_rate = 0.0
    adv_vec = str2seq(perturbed_text, vocab, tokenizer, maxlen).to(config_device)
    adv_y = net.predict_class(adv_vec)[0]
    return perturbed_text, adv_y, sub_rate, ne_rate, change_tuple_list
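# A minimal sketch of invoking the baseline directly, assuming `net`, `vocab`,
# and `tokenizer` are already loaded; the sentence, label, and sub_rate_limit
# below are illustrative.
#
# maxlen = config_data[config_dataset].padding_maxlen
# adv_text, adv_y, sub_rate, ne_rate, changes = random_attack(
#     'the movie was a complete waste of time',
#     y_true=0, net=net, vocab=vocab, tokenizer=tokenizer,
#     maxlen=maxlen, verbose=True,
#     sub_rate_limit=0.25,  # replace at most 25% of tokens
# )
# print(adv_y, sub_rate, changes)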
def adversarial_paraphrase(input_text, origin_vector, true_y, tokenizer: Tokenizer,
                           vocab: 'Vocab', net: nn.Module, verbose=False,
                           sub_rate_limit=None):
    '''
    Compute a perturbation, greedily choosing the synonym that causes the most
    significant change in the classification probability after replacement.

    :return perturbed_text: generated adversarial example
    :return perturbed_y: predicted class of perturbed_text
    :return sub_rate: word replacement rate shown in Table 3
    :return NE_rate: named-entity replacement rate
    :return change_tuple_list: list of substituted words
    '''

    def halt_condition_fn(perturbed_text):
        '''Halt once the model output changes.'''
        maxlen = config_data[config_dataset].padding_maxlen
        perturbed_vector = str2seq(perturbed_text, vocab, tokenizer, maxlen).to(config_device)
        predict = net.predict_class(perturbed_vector)[0]
        return predict != true_y

    def heuristic_fn(text, candidate):
        '''
        Return the difference between the classification probability of the
        original word and the candidate substitute synonym, as defined in
        Eq.(4) and Eq.(5).
        '''
        doc = nlp(text)
        maxlen = config_data[config_dataset].padding_maxlen
        perturbed_tokens = _compile_perturbed_tokens(doc, [candidate])
        perturbed_doc = ' '.join(perturbed_tokens)
        perturbed_vector = str2seq(perturbed_doc, vocab, tokenizer, maxlen).to(config_device)
        adv_y = net.predict_prob(perturbed_vector, true_y)[0]
        ori_y = net.predict_prob(origin_vector, true_y)[0]
        return ori_y - adv_y

    doc = nlp(input_text)

    # PWWS: rank words by saliency, then substitute greedily.
    position_word_list, word_saliency_list = evaluate_word_saliency(doc, origin_vector, true_y, net)
    perturbed_text, sub_rate, NE_rate, change_tuple_list = PWWS(
        doc,
        true_y,
        word_saliency_list=word_saliency_list,
        heuristic_fn=heuristic_fn,
        halt_condition_fn=halt_condition_fn,
        verbose=verbose,
        sub_rate_limit=sub_rate_limit,
    )

    maxlen = config_data[config_dataset].padding_maxlen
    perturbed_vector = str2seq(perturbed_text, vocab, tokenizer, maxlen).to(config_device)
    perturbed_y = net.predict_class(perturbed_vector)[0]
    if verbose:
        origin_prob = net.predict_prob(origin_vector, true_y)[0]
        perturbed_prob = net.predict_prob(perturbed_vector, true_y)[0]
        raw_score = origin_prob - perturbed_prob
        print('Prob before: ', origin_prob, '. Prob after: ', perturbed_prob,
              '. Prob shift: ', raw_score)
    return perturbed_text, perturbed_y, sub_rate, NE_rate, change_tuple_list
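# For context, the word-saliency step above follows the PWWS idea: a word's
# saliency is the drop in the true-class probability when that word is knocked
# out (replaced by an unknown token). The sketch below is a hypothetical
# reimplementation under that assumption, not the repo's evaluate_word_saliency;
# the helper name `word_saliency` and the 'UNK' placeholder (assumed to map to
# the vocabulary's unknown token) are illustrative.
#
# def word_saliency(doc, origin_vector, true_y, net, vocab, tokenizer, maxlen):
#     ori_prob = net.predict_prob(origin_vector, true_y)[0]
#     saliency_list = []
#     for idx, token in enumerate(doc):
#         if idx >= maxlen:
#             break
#         tokens = [t.text for t in doc]
#         tokens[idx] = 'UNK'  # knock out one word at a time
#         vec = str2seq(' '.join(tokens), vocab, tokenizer, maxlen).to(config_device)
#         saliency_list.append((idx, token, ori_prob - net.predict_prob(vec, true_y)[0]))
#     return saliency_list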
def textfool_perturb_text(sentence, net, vocab, tokenizer, maxlen, true_y,
                          use_typos=False, rank_fn=None, heuristic_fn=None,
                          verbose=False, sub_rate_limit=None):
    '''
    Perturb the text by replacing some words with their WordNet synonyms,
    sorting by GloVe similarity between the synonym and the original context
    window, and an optional heuristic.

    :param sentence: text to perturb.
    :param rank_fn: see `_generate_synonym_candidates`.
    :param heuristic_fn: ranks the best synonyms using the heuristic. If the
        value of the heuristic is negative, the candidate substitution is
        rejected.
    :param verbose: print each accepted candidate.

    ATTENTION: this function originates from
    https://github.com/bogdan-kulynych/textfool
    '''

    def halt_condition_fn(perturbed_text):
        # Halt once the model's prediction differs from the true label.
        perturbed_vector = str2seq(perturbed_text, vocab, tokenizer, maxlen).to(config_device)
        predict = net.predict_class(perturbed_vector)[0]
        return predict != true_y

    doc = nlp(sentence)
    heuristic_fn = heuristic_fn or (lambda _, candidate: candidate.similarity_rank)
    candidates = textfool_generate_synonym_candidates(doc, rank_fn=rank_fn)
    if use_typos:
        candidates.extend(_generate_typo_candidates(doc))
    if sub_rate_limit:
        sub_rate_limit = int(sub_rate_limit * len(doc))
    else:
        sub_rate_limit = len(doc)

    perturbed_positions = set()
    accepted_candidates = []
    perturbed_text = doc.text
    if verbose:
        print('Got {} candidates'.format(len(candidates)))

    # Score all candidates, sorted ascending, so pop() yields the best-scored one.
    sorted_candidates = zip(
        map(partial(heuristic_fn, perturbed_text), candidates), candidates)
    sorted_candidates = list(sorted(sorted_candidates, key=lambda t: t[0]))

    change_tuple_list = []
    while len(sorted_candidates) > 0 and len(change_tuple_list) < sub_rate_limit:
        score, candidate = sorted_candidates.pop()
        if score < 0:
            # Negative heuristic: reject this substitution.
            continue
        if candidate.token_position >= maxlen:
            break
        if candidate.token_position not in perturbed_positions:
            perturbed_positions.add(candidate.token_position)
            accepted_candidates.append(candidate)
            if verbose:
                print('Candidate:', candidate)
                print('Candidate score:', heuristic_fn(perturbed_text, candidate))
                print('Candidate accepted.')
            perturbed_text = ' '.join(_compile_perturbed_tokens(doc, accepted_candidates))
            change_tuple_list.append((candidate.token_position, candidate.original_token,
                                      candidate.candidate_word, score, 'TAGNONE'))
            if halt_condition_fn(perturbed_text):
                break
            # Re-score the remaining candidates against the updated text.
            if len(sorted_candidates) > 0:
                _, candidates = zip(*sorted_candidates)
                sorted_candidates = zip(
                    map(partial(heuristic_fn, perturbed_text), candidates), candidates)
                sorted_candidates = list(sorted(sorted_candidates, key=lambda t: t[0]))

    sub_rate = len(change_tuple_list) / len(doc)
    ne_rate = 0.0
    adv_vec = str2seq(perturbed_text, vocab, tokenizer, maxlen).to(config_device)
    adv_y = net.predict_class(adv_vec)[0]
    return perturbed_text, adv_y, sub_rate, ne_rate, change_tuple_list
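# A minimal sketch of invoking the textfool attack directly, assuming a loaded
# `net`, `vocab`, and `tokenizer`; the sentence, label, and limits below are
# illustrative.
#
# maxlen = config_data[config_dataset].padding_maxlen
# adv_text, adv_y, sub_rate, ne_rate, changes = textfool_perturb_text(
#     'an utterly charming and funny film',
#     net, vocab, tokenizer, maxlen,
#     true_y=1,
#     use_typos=False,     # set True to also consider typo candidates
#     sub_rate_limit=0.2,  # replace at most 20% of tokens
#     verbose=True,
# )
# print(adv_y, sub_rate)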