def get_score_of_word(sentence_words, index, batch_size=256):
    """
    Calculates the score (gradient) of given word for all given sentences

    Every sentence is word2vec-encoded (padded to the length of the longest
    sentence via ``min_length``), pushed through ``gradient.get_gradient`` in
    batches, and reduced to a single score per sentence.

    :param sentence_words: Sentences to test
    :type sentence_words: list of lists of words
    :param index: Index of the word to score
    :type index: int
    :param batch_size: Batch size. The larger the batch, the faster the
        calculation but also the higher the RAM consumption
    :type batch_size: int

    :returns: list containing scores for every sentence
    """
    # All sentences are padded to the longest one so they stack into one array
    length = max((len(sentence) for sentence in sentence_words), default=0)
    result = []

    # Ceiling division: an exact multiple of batch_size must not produce an
    # extra, empty batch (which previously skewed the progress counter)
    number_batches = (len(sentence_words) + batch_size - 1) // batch_size
    batch_starts = range(0, len(sentence_words), batch_size)
    for current_batch, batch_start in enumerate(batch_starts, start=1):
        print('Calculating score: {0:3d} of {1:3d}'.format(
            current_batch, number_batches), end='\r')

        sentence_w2v = [
            encode_sentence.encode_word2vec(' '.join(words),
                                            min_length=length,
                                            cache_model=True)
            for words in sentence_words[batch_start:batch_start + batch_size]
        ]

        gradient_input = numpy.asarray(sentence_w2v)
        sentence_gradient = gradient.get_gradient(network, gradient_input,
                                                  session)

        for output_gradient in sentence_gradient:
            # NOTE(review): `index` is never used - the score is always read
            # at position 0 of the reduced gradient. This looks like it was
            # meant to be `[index]`; confirm against the layout returned by
            # gradient.get_gradient before changing it.
            result.append(numpy.amax(output_gradient, axis=1)[0])

    print()  # Terminate the '\r' progress line
    return result
input_tries = input_tries + 1 input_index = random.randint(0, len(dataset_x_train)) # Test if we have already tried this input if os.path.exists('./adversarial_example/{}-{}.json'.format( DATASET.value, input_index)) or os.path.exists( './adversarial_example_unsuccessful/{}-{}.json'.format( DATASET.value, input_index)) or os.path.exists( './adversarial_example_input_unusable/{}-{}.json'. format(DATASET.value, input_index)): print('(duplicate input)') continue input_target_w2v = numpy.asarray([ encode_sentence.encode_word2vec(dataset_x_train[input_index], cache_model=True) ]) output_class_target_w2v = target_w2v.predict(input_target_w2v) output_class_target_w2v = numpy.argmax(output_class_target_w2v) output_class_target_w2v_retrained = target_w2v_retrained.predict( input_target_w2v) output_class_target_w2v_retrained = numpy.argmax( output_class_target_w2v_retrained) output_class_target_w2v_second_half = target_w2v_second_half.predict( input_target_w2v) output_class_target_w2v_second_half = numpy.argmax( output_class_target_w2v_second_half) output_class_target_w2v_alternative = target_w2v_alternative.predict(
def attack_w2v(network: keras.models.Model,
               input_data: str,
               session: tensorflow.InteractiveSession,
               data_x: list,
               data_y: list,
               attack_key=None,
               use_keywords=True,
               typo_chance=1.0) -> tuple:
    """
    Runs an adversarial attack against a network which uses the Google
    word2vec encoding. The attack is modelled after the attack described by
    Suranjana Samanta and Sameep Mehta in "Towards Crafting Text Adversarial
    Samples". https://arxiv.org/abs/1707.02812

    Words are visited in order of decreasing gradient magnitude. Per round the
    selected word is either deleted (adverbs, except the first word), or the
    best scoring insertion/replacement candidate is applied. The loop ends as
    soon as the predicted class differs from the original one.

    :raises: NoAdversarialExampleFound if every word has been tried without
        changing the predicted class, or if there is no class other than the
        predicted one to pick as a target

    :param network: Network to attack
    :type network: keras.models.Model
    :param input_data: Input sentence
    :type input_data: str
    :param session: Session to run the network under
    :type session: tensorflow.InteractiveSession
    :param data_x: Dataset in normal string form
    :type data_x: [str, str, ...]
    :param data_y: Dataset classes as normal integers
    :type data_y: [int, int, ...]
    :param attack_key: Key to store keyword list for this game. None for no
        caching.
    :type attack_key: str
    :param use_keywords: If set to True, 'Genre specific keywords' will be
        used (see Samanta and Mehta)
    :type use_keywords: bool
    :param typo_chance: Chance that typos are included into the search
    :type typo_chance: float

    :returns: String containing the resulting attack, int number of changes,
        array containing changes
    :rtype: tuple
    """
    input_sentence = input_data

    if not os.path.exists('./datasets/nltk/'):
        os.makedirs('./datasets/nltk/')

    keyword_list = _create_keyword_list(data_x, data_y, attack_key)
    number_changes = 0
    changes_list = []

    # Download the NLTK resources only once per process
    nltk.data.path = ['./datasets/nltk/']
    global _ATTACKS_WORDNET_CHECKED
    global _ATTACKS_AVERAGED_PERCEPTRON_TAGGER_CHECKED
    if not _ATTACKS_WORDNET_CHECKED:
        nltk.download('wordnet', download_dir='./datasets/nltk/')
        _ATTACKS_WORDNET_CHECKED = True
    if not _ATTACKS_AVERAGED_PERCEPTRON_TAGGER_CHECKED:
        nltk.download('averaged_perceptron_tagger',
                      download_dir='./datasets/nltk/')
        _ATTACKS_AVERAGED_PERCEPTRON_TAGGER_CHECKED = True

    w2v_input = numpy.asarray(
        [encode_sentence.encode_word2vec(input_sentence, cache_model=True)])

    # Calculate origin class and current class
    origin_class = int(numpy.argmax(network.predict(w2v_input)))
    current_class = origin_class

    # Select target class for the keyword list. Picking uniformly among the
    # classes that differ from the origin class is equivalent to the former
    # rejection sampling but cannot loop forever when only one class exists.
    number_classes = int(numpy.max(data_y)) + 1
    possible_targets = [
        candidate_class for candidate_class in range(number_classes)
        if candidate_class != origin_class
    ]
    if not possible_targets:
        raise NoAdversarialExampleFound()
    target_class = random.choice(possible_targets)

    # Calculate priorities based on gradient: word indices ordered by
    # decreasing score. A stable sort keeps the lower index first on ties,
    # matching a repeated argmax without needing a magic sentinel value.
    input_gradient = gradient.get_gradient(network, w2v_input, session)
    word_score = numpy.amax(input_gradient, axis=2)[0]
    word_priority_list = numpy.argsort(-word_score, kind='mergesort').tolist()

    print(input_sentence)

    # IMPORTANT: input_words and input_words_tagged have to be kept in sync!
    # Tagging once up front reduces miss-tags later after modifying.
    input_words = input_sentence.split(" ")
    # Empty words (from consecutive spaces) are tagged as ' ' and restored
    # to '' afterwards so the original sentence can be reconstructed
    tagged = nltk.pos_tag([word if word != '' else ' ' for word in input_words])
    input_words_tagged = [
        (word, tag) for word, (_, tag) in zip(input_words, tagged)
    ]

    # Helper function for getting scores
    def get_score_of_word(sentence_words, index, batch_size=256):
        """
        Calculates the score (gradient) of given word for all given sentences

        :param sentence_words: Sentences to test
        :type sentence_words: list of lists of words
        :param index: Index of the word to score
        :type index: int
        :param batch_size: Batch size. The larger the batch, the faster the
            calculation but also the higher the RAM consumption
        :type batch_size: int

        :returns: list containing scores for every sentence
        """
        # Pad every sentence to the longest one so a batch stacks cleanly
        length = max((len(sentence) for sentence in sentence_words),
                     default=0)
        result = []

        # Ceiling division - no spurious empty batch on exact multiples
        number_batches = (len(sentence_words) + batch_size - 1) // batch_size
        batch_starts = range(0, len(sentence_words), batch_size)
        for current_batch, batch_start in enumerate(batch_starts, start=1):
            print('Calculating score: {0:3d} of {1:3d}'.format(
                current_batch, number_batches), end='\r')

            sentence_w2v = [
                encode_sentence.encode_word2vec(' '.join(words),
                                                min_length=length,
                                                cache_model=True)
                for words in
                sentence_words[batch_start:batch_start + batch_size]
            ]

            gradient_input = numpy.asarray(sentence_w2v)
            sentence_gradient = gradient.get_gradient(network, gradient_input,
                                                      session)

            for output_gradient in sentence_gradient:
                # NOTE(review): `index` is unused - position 0 is always
                # read. Possibly meant to be `[index]`; confirm before
                # changing, as it alters which candidates win.
                result.append(numpy.amax(output_gradient, axis=1)[0])

        print()
        return result

    def build_test_sentences(candidates, position, replace_existing):
        """Return one copy of input_words per candidate with the candidate
        inserted at (or replacing the word at) the given position."""
        sentences = []
        for candidate in candidates:
            words = list(input_words)
            if replace_existing:
                # Remove word temporarily - the new one is inserted here
                words.pop(position)
            # Multiple words are connected by '_', e.g. 'a_lot'
            words[position:position] = candidate.split('_')
            sentences.append(words)
        return sentences

    def shift_priorities(position, offset):
        """Move the index of all pending words located after position."""
        for i, word_index in enumerate(word_priority_list):
            if word_index > position:
                word_priority_list[i] = word_index + offset

    # WordNet part of speech (attribute name) per POS tag fragment; resolved
    # lazily so the corpus is only touched when a tag actually matches
    synonym_sources = (('NN', 'NOUN'), ('VB', 'VERB'), ('JJ', 'ADJ'),
                       ('RB', 'ADV'))

    # Basic sanity tests
    assert all(
        word_index < len(input_words) for word_index in word_priority_list)

    while origin_class == current_class:
        # Always change most important word per round
        print('Remaining words to test: {}'.format(len(word_priority_list)))
        if not word_priority_list:
            # We have tried changing all words - aborting
            raise NoAdversarialExampleFound()
        current_word = word_priority_list.pop(0)
        current_word_tagged = input_words_tagged[current_word]

        if 'RB' in current_word_tagged[1] and current_word != 0:
            # The word is an adverb - delete it. Workaround: never delete
            # the first word as it might lead to incorrect sentences.
            number_changes += 1
            changes_list.append(
                ['deletion', int(current_word), input_words[current_word]])
            input_words.pop(current_word)
            input_words_tagged.pop(current_word)
            # Update position of all words after the deleted one
            shift_priorities(current_word, -1)
        else:
            insert = False
            best_word = input_words[current_word]
            best_prediction = -100.0

            # Insertion: adverb keywords of the target class may be placed
            # in front of an adjective
            if 'JJ' in current_word_tagged[1]:
                insert_candidates = list({
                    keyword[0] for keyword in keyword_list[target_class]
                    if 'RB' in keyword[1]
                })
                test_data = build_test_sentences(insert_candidates,
                                                 current_word, False)
                # Don't do anything if no candidate exists
                if test_data:
                    test_score = get_score_of_word(test_data, current_word)
                    for candidate, score in zip(insert_candidates,
                                                test_score):
                        if score > best_prediction:
                            best_prediction = score
                            best_word = candidate
                            insert = True

            # Replacement
            candidate_set = set()

            # Keywords for candidates
            # (Doesn't work well at least for TREC - produces unrecognisable
            # questions)
            if use_keywords:
                for keyword in keyword_list[target_class]:
                    if keyword[1] == current_word_tagged[1]:
                        candidate_set.add(keyword[0])

            # Synonyms for candidates (noun, verb, adjective, adverb)
            for tag_fragment, wordnet_pos_name in synonym_sources:
                if tag_fragment not in current_word_tagged[1]:
                    continue
                wordnet_pos = getattr(nltk.corpus.wordnet, wordnet_pos_name)
                for synonym_class in nltk.corpus.wordnet.synsets(
                        input_words[current_word], pos=wordnet_pos):
                    for synonym in synonym_class.lemma_names():
                        candidate_set.add(str(synonym))

            # Typos for candidates
            if random.random() < typo_chance:
                for typo in typos.get_typos(input_words[current_word]):
                    candidate_set.add(typo)

            # Find best word; a better replacement overrides an insertion
            replace_candidates = list(candidate_set)
            test_data = build_test_sentences(replace_candidates, current_word,
                                             True)
            # Don't do anything if no candidate exists
            if test_data:
                test_score = get_score_of_word(test_data, current_word)
                for candidate, score in zip(replace_candidates, test_score):
                    if score > best_prediction:
                        best_prediction = score
                        best_word = candidate
                        insert = False

            # Multiple words are connected by '_', e.g. 'a_lot'
            new_words = best_word.split('_')
            if insert:
                number_changes += 1
                changes_list.append(
                    ['insertion', int(current_word), best_word])
                input_words[current_word:current_word] = new_words
                input_words_tagged[current_word:current_word] = [
                    (single_word, 'RB') for single_word in new_words
                ]
                # Update position of all words after the inserted one
                shift_priorities(current_word, len(new_words))
            elif best_word != input_words[current_word]:
                number_changes += 1
                changes_list.append([
                    'modification',
                    int(current_word), input_words[current_word], best_word
                ])
                # Swap the old word for the new one(s), keeping the POS tag
                input_words.pop(current_word)
                old_pos_class = input_words_tagged.pop(current_word)[1]
                input_words[current_word:current_word] = new_words
                input_words_tagged[current_word:current_word] = [
                    (single_word, old_pos_class) for single_word in new_words
                ]
                # Update index if more than one word was inserted
                shift_priorities(current_word, len(new_words) - 1)

        # Basic sanity tests
        assert len(input_words) == len(input_words_tagged)
        for word, tagged_word in zip(input_words, input_words_tagged):
            assert word == tagged_word[0]
        assert all(
            word_index < len(input_words)
            for word_index in word_priority_list)

        # Test modified input
        input_sentence = ' '.join(input_words)
        print(input_sentence)
        w2v_input = numpy.asarray([
            encode_sentence.encode_word2vec(input_sentence, cache_model=True)
        ])
        current_class = int(numpy.argmax(network.predict(w2v_input)))

    # Sanity check
    assert number_changes == len(changes_list)

    return input_sentence, number_changes, changes_list
# Get all adversarial example data
print()
print('--- Test input data ---')
print()

# Keep only those TREC adversarials whose original sentence is classified as
# its target class by trec_network; the ids of all others are recorded under
# 'wrong_classification'.
print('trec')
trec_used_adversarials = []
trec_used_adversarials_original_class = []
trec_used_adversarials_id = []
trec_total = len(trec_input)
for i, trec_sentence in enumerate(trec_input):
    print('{0:5d} of {1:5d}'.format(i + 1, trec_total), end='\r')

    trec_encoded = numpy.asarray(
        [encode_sentence.encode_word2vec(trec_sentence, cache_model=True)])
    trec_predicted = trec_network.predict(trec_encoded)

    if trec_target_class[i] != int(numpy.argmax(trec_predicted)):
        trec_results['wrong_classification'].append(trec_id[i])
        continue

    trec_used_adversarials.append(trec_adversarial_input[i])
    trec_used_adversarials_original_class.append(trec_target_class[i])
    trec_used_adversarials_id.append(trec_id[i])
print()  # Terminate the '\r' progress line

print('ag')
ag_used_adversarials = []
ag_used_adversarials_original_class = []
ag_used_adversarials_id = []
for i in range(len(ag_input)):
    print('{0:5d} of {1:5d}'.format(i + 1, len(ag_input)), end='\r')
amazonmovie_w2v_retrained = trained_networks.get_network(dataset.DatasetType.AMAZONMOVIE, dataset.Encoding.WORD2VEC, every_xth_trainings_data=2, skip_trainings_data=0, cache_prefix='retrained-') amazonmovie_w2v_second_half = trained_networks.get_network(dataset.DatasetType.AMAZONMOVIE, dataset.Encoding.WORD2VEC, every_xth_trainings_data=2, skip_trainings_data=1) amazonmovie_w2v_alternative = trained_networks.get_network(dataset.DatasetType.AMAZONMOVIE, dataset.Encoding.WORD2VEC, kernel_variation=[3, 3, 5, 5], every_xth_trainings_data=2, skip_trainings_data=0) amazonmovie_character = trained_networks.get_network(dataset.DatasetType.AMAZONMOVIE, dataset.Encoding.CHARACTER, every_xth_trainings_data=2, skip_trainings_data=0) number_calculations = 0 else: number_calculations += 1 input_filename = './adversarial_example/' + filename with open(input_filename, 'r') as file: file_content = json.load(file) gradient_file = dict() gradient_file['input_sentence'] = file_content['input_sentence'] gradient_file['dataset'] = file_content['dataset'] input_sentence_w2v = numpy.asarray([encode_sentence.encode_word2vec(file_content['input_sentence'], cache_model=True)]) input_sentence_character = numpy.asarray([encode_sentence.encode_character(file_content['input_sentence'])]) if file_content['dataset'] == dataset.DatasetType.TREC.value: gradient_file['target_network'] = gradient.get_gradient(trec_w2v, input_sentence_w2v, session).tolist()[0] gradient_file['w2v_retrained'] = gradient.get_gradient(trec_w2v_retrained, input_sentence_w2v, session).tolist()[0] gradient_file['w2v_second_half'] = gradient.get_gradient(trec_w2v_second_half, input_sentence_w2v, session).tolist()[0] gradient_file['w2v_alternative_first_half'] = gradient.get_gradient(trec_w2v_alternative, input_sentence_w2v, session).tolist()[0] gradient_file['character'] = gradient.get_gradient(trec_character, input_sentence_character, session).tolist()[0] elif file_content['dataset'] == dataset.DatasetType.AG.value: 
gradient_file['target_network'] = gradient.get_gradient(ag_w2v, input_sentence_w2v, session).tolist()[0] gradient_file['w2v_retrained'] = gradient.get_gradient(ag_w2v_retrained, input_sentence_w2v, session).tolist()[0] gradient_file['w2v_second_half'] = gradient.get_gradient(ag_w2v_second_half, input_sentence_w2v, session).tolist()[0] gradient_file['w2v_alternative_first_half'] = gradient.get_gradient(ag_w2v_alternative, input_sentence_w2v, session).tolist()[0] gradient_file['character'] = gradient.get_gradient(ag_character, input_sentence_character, session).tolist()[0] elif file_content['dataset'] == dataset.DatasetType.AMAZONMOVIE.value: gradient_file['target_network'] = gradient.get_gradient(amazonmovie_w2v, input_sentence_w2v, session).tolist()[0]
print() # Get all adversarial example data print() print('--- Test input data ---') print() print('trec') trec_used_adversarials = list() trec_used_adversarials_original_class = list() trec_used_adversarials_id = list() for i in range(len(trec_input)): print('{0:5d} of {1:5d}'.format(i+1, len(trec_input)), end='\r') if trec_target_class[i] == int(numpy.argmax(trec_w2v_retrained.predict(numpy.asarray([encode_sentence.encode_word2vec(trec_input[i], cache_model=True)])))) and trec_target_class[i] == int(numpy.argmax(trec_w2v_alternative.predict(numpy.asarray([encode_sentence.encode_word2vec(trec_input[i], cache_model=True)])))) and trec_target_class[i] == int(numpy.argmax(trec_w2v_second_half.predict(numpy.asarray([encode_sentence.encode_word2vec(trec_input[i], cache_model=True)])))) and trec_target_class[i] == int(numpy.argmax(trec_character.predict(numpy.asarray([encode_sentence.encode_character(trec_input[i])])))): trec_used_adversarials.append(trec_adversarial_input[i]) trec_used_adversarials_original_class.append(trec_target_class[i]) trec_used_adversarials_id.append(trec_id[i]) else: results['trec_wrong_classification'].append(trec_id[i]) print() print('ag') ag_used_adversarials = list() ag_used_adversarials_original_class = list() ag_used_adversarials_id = list() for i in range(len(ag_input)): print('{0:5d} of {1:5d}'.format(i+1, len(ag_input)), end='\r') if ag_target_class[i] == int(numpy.argmax(ag_w2v_retrained.predict(numpy.asarray([encode_sentence.encode_word2vec(ag_input[i], cache_model=True)])))) and ag_target_class[i] == int(numpy.argmax(ag_w2v_alternative.predict(numpy.asarray([encode_sentence.encode_word2vec(ag_input[i], cache_model=True)])))) and ag_target_class[i] == int(numpy.argmax(ag_w2v_second_half.predict(numpy.asarray([encode_sentence.encode_word2vec(ag_input[i], cache_model=True)])))) and ag_target_class[i] == int(numpy.argmax(ag_character.predict(numpy.asarray([encode_sentence.encode_character(ag_input[i])])))):