Example #1
    def get_score_of_word(sentence_words, index, batch_size=256):
        """
        Calculates the score (gradient) of the given word for all given sentences

        :param sentence_words: Sentences to test
        :type sentence_words: list of lists of words
        :param index: Index of the word to score
        :type index: int
        :param batch_size: Batch size. The larger the batch, the faster the calculation but also the higher the RAM consumption
        :type batch_size: int
        :returns: List containing a score for every sentence
        """
        length = 0
        for sentence in sentence_words:
            length = max(length, len(sentence))
        result = []

        # Build batches
        number_batches = (len(sentence_words) // batch_size) + 1
        for current_batch in range(number_batches):
            print('Calculating score: {0:3d} of {1:3d}'.format(
                current_batch + 1, number_batches),
                  end='\r')
            sentence_w2v = list()
            for i in range(batch_size * current_batch,
                           batch_size * (current_batch + 1)):
                if i < len(sentence_words):
                    sentence_w2v.append(
                        encode_sentence.encode_word2vec(' '.join(
                            sentence_words[i]),
                                                        min_length=length,
                                                        cache_model=True))
                else:
                    break
            if len(sentence_w2v) == 0:
                break
            gradient_input = numpy.asarray(sentence_w2v)
            sentence_gradient = gradient.get_gradient(network, gradient_input,
                                                      session)
            for output_gradient in sentence_gradient:
                result.append(numpy.amax(output_gradient, axis=1)[0])
        print()
        return result
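A minimal, self-contained sketch of the batching pattern used above, with a plain Python scorer standing in for the encode/gradient calls (the helper name score_in_batches and the dummy scorer are hypothetical):

def score_in_batches(sentences, scorer, batch_size=256):
    """Apply `scorer` to `sentences` in fixed-size batches and collect the scores."""
    number_batches = -(-len(sentences) // batch_size)  # Ceiling division
    scores = []
    for current_batch in range(number_batches):
        batch = sentences[batch_size * current_batch:batch_size * (current_batch + 1)]
        scores.extend(scorer(batch))
    return scores

# Dummy scorer: one float per sentence (here simply its word count).
print(score_in_batches([['a', 'b'], ['c'], ['d', 'e', 'f']],
                       lambda batch: [float(len(words)) for words in batch],
                       batch_size=2))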
Example #2
        input_tries = input_tries + 1

        input_index = random.randint(0, len(dataset_x_train) - 1)  # randint is inclusive on both ends

        # Test if we have already tried this input
        if os.path.exists('./adversarial_example/{}-{}.json'.format(
                DATASET.value, input_index)) or os.path.exists(
                    './adversarial_example_unsuccessful/{}-{}.json'.format(
                        DATASET.value, input_index)) or os.path.exists(
                            './adversarial_example_input_unusable/{}-{}.json'.
                            format(DATASET.value, input_index)):
            print('(duplicate input)')
            continue

        input_target_w2v = numpy.asarray([
            encode_sentence.encode_word2vec(dataset_x_train[input_index],
                                            cache_model=True)
        ])
        output_class_target_w2v = target_w2v.predict(input_target_w2v)
        output_class_target_w2v = numpy.argmax(output_class_target_w2v)

        output_class_target_w2v_retrained = target_w2v_retrained.predict(
            input_target_w2v)
        output_class_target_w2v_retrained = numpy.argmax(
            output_class_target_w2v_retrained)

        output_class_target_w2v_second_half = target_w2v_second_half.predict(
            input_target_w2v)
        output_class_target_w2v_second_half = numpy.argmax(
            output_class_target_w2v_second_half)

        output_class_target_w2v_alternative = target_w2v_alternative.predict(
            input_target_w2v)
        output_class_target_w2v_alternative = numpy.argmax(
            output_class_target_w2v_alternative)
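The three nested os.path.exists checks above amount to a single membership test over the outcome directories; a hedged sketch of the same check as a helper (the function name already_tried is hypothetical):

import os

def already_tried(dataset_value, input_index):
    """Return True if this (dataset, index) pair already has a result file in any outcome directory."""
    directories = ('./adversarial_example',
                   './adversarial_example_unsuccessful',
                   './adversarial_example_input_unusable')
    return any(
        os.path.exists('{}/{}-{}.json'.format(directory, dataset_value,
                                              input_index))
        for directory in directories)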
Example #3
def attack_w2v(network: keras.models.Model,
               input_data: str,
               session: tensorflow.InteractiveSession,
               data_x: list,
               data_y: list,
               attack_key=None,
               use_keywords=True,
               typo_chance=1.0) -> str:
    """
    Runs an adversarial attack against a network which uses the Google word2vec encoding.

    The attack is modelled after the attack described by Suranjana Samanta and Sameep Mehta in "Towards Crafting Text Adversarial Samples".
    https://arxiv.org/abs/1707.02812

    :raises: NoAdversarialExampleFound

    :param network: Network to attack
    :type network: keras.models.Model
    :param input_data: Input sentence
    :type input_data: str
    :param session: Session to run the network under
    :type session: tensorflow.InteractiveSession
    :param data_x: Dataset in normal string form
    :type data_x: [str, str, ...]
    :param data_y: Dataset classes as normal integers
    :type data_y: [int, int, ...]
    :param attack_key: Key under which to cache the keyword list. None for no caching.
    :type attack_key: str
    :param use_keywords: If set to True, 'Genre specific keywords' will be used (see Samanta and Mehta)
    :type use_keywords: bool
    :param typo_chance: Chance that typos are included in the search
    :type typo_chance: float
    :returns: Tuple of the resulting adversarial sentence (str), the number of changes (int), and the list of changes
    """
    input_sentence = input_data

    if not os.path.exists('./datasets/nltk/'):
        os.makedirs('./datasets/nltk/')

    keyword_list = _create_keyword_list(data_x, data_y, attack_key)

    number_changes = 0
    changes_list = []

    nltk.data.path = ['./datasets/nltk/']
    global _ATTACKS_WORDNET_CHECKED
    global _ATTACKS_AVERAGED_PERCEPTRON_TAGGER_CHECKED
    if not _ATTACKS_WORDNET_CHECKED:
        nltk.download('wordnet', download_dir='./datasets/nltk/')
        _ATTACKS_WORDNET_CHECKED = True
    if not _ATTACKS_AVERAGED_PERCEPTRON_TAGGER_CHECKED:
        nltk.download('averaged_perceptron_tagger',
                      download_dir='./datasets/nltk/')
        _ATTACKS_AVERAGED_PERCEPTRON_TAGGER_CHECKED = True

    w2v_input = numpy.asarray(
        [encode_sentence.encode_word2vec(input_sentence, cache_model=True)])

    # Calculate origin class and current class
    origin_class = int(numpy.argmax(network.predict(w2v_input)))
    current_class = origin_class

    # Select target class for the keyword list
    target_class = random.randint(0, numpy.max(data_y))
    while target_class == origin_class:
        target_class = random.randint(0, numpy.max(data_y))

    # Calculate priorities based on gradient
    input_gradient = gradient.get_gradient(network, w2v_input, session)

    word_score = numpy.amax(input_gradient, axis=2)[0]

    word_priority_list = list()
    for i in range(input_gradient.shape[1]):
        maximum_value = numpy.argmax(word_score)
        word_priority_list.append(maximum_value)
        word_score[maximum_value] = -99999

    print(input_sentence)

    # IMPORTANT: These two have to be kept in sync!
    # This reduces mis-tagging later, after the sentence has been modified
    input_words = input_sentence.split(" ")
    for i in range(len(input_words)):
        if input_words[i] == '':  # Avoid empty words for tagging
            input_words[i] = ' '
    input_words_tagged = nltk.pos_tag(input_words)
    for i in range(len(input_words)):
        if input_words[i] == ' ':  # Reconstruct original sentence
            input_words[i] = ''
            input_words_tagged[i] = ('', input_words_tagged[i][1])

    # Helper function for getting scores
    def get_score_of_word(sentence_words, index, batch_size=256):
        """
        Calculates the score (gradient) of the given word for all given sentences

        :param sentence_words: Sentences to test
        :type sentence_words: list of lists of words
        :param index: Index of the word to score
        :type index: int
        :param batch_size: Batch size. The larger the batch, the faster the calculation but also the higher the RAM consumption
        :type batch_size: int
        :returns: List containing a score for every sentence
        """
        length = 0
        for sentence in sentence_words:
            length = max(length, len(sentence))
        result = []

        # Build batches
        number_batches = (len(sentence_words) // batch_size) + 1
        for current_batch in range(number_batches):
            print('Calculating score: {0:3d} of {1:3d}'.format(
                current_batch + 1, number_batches),
                  end='\r')
            sentence_w2v = list()
            for i in range(batch_size * current_batch,
                           batch_size * (current_batch + 1)):
                if i < len(sentence_words):
                    sentence_w2v.append(
                        encode_sentence.encode_word2vec(' '.join(
                            sentence_words[i]),
                                                        min_length=length,
                                                        cache_model=True))
                else:
                    break
            if len(sentence_w2v) == 0:
                break
            gradient_input = numpy.asarray(sentence_w2v)
            sentence_gradient = gradient.get_gradient(network, gradient_input,
                                                      session)
            for output_gradient in sentence_gradient:
                result.append(numpy.amax(output_gradient, axis=1)[0])
        print()
        return result

    # Basic sanity tests
    for i in range(len(word_priority_list)):
        assert word_priority_list[i] < len(input_words)

    while origin_class == current_class:  # Always change most important word per round
        print('Remaining words to test: {}'.format(len(word_priority_list)))
        if len(word_priority_list
               ) == 0:  # We have tried changing all words - aborting
            raise NoAdversarialExampleFound()

        current_word = word_priority_list.pop(0)
        current_word_tagged = input_words_tagged[current_word]

        if 'RB' in current_word_tagged[
                1] and current_word != 0:  # The word is an adverb; as a workaround, never delete the first word, since that might produce incorrect sentences
            number_changes += 1
            changes_list.append(
                ['deletion',
                 int(current_word), input_words[current_word]])
            input_words.pop(current_word)
            input_words_tagged.pop(current_word)
            for i in range(
                    len(word_priority_list)
            ):  # Update position of all words after the deleted one
                if word_priority_list[i] > current_word:
                    word_priority_list[i] = word_priority_list[i] - 1
        else:
            insert = False
            best_word = input_words[current_word]
            best_prediction = -100.0

            # Insertion
            if 'JJ' in current_word_tagged[1]:
                insert_candidate_set = set()
                for keyword in keyword_list[target_class]:
                    if 'RB' in keyword[1]:
                        insert_candidate_set.add(keyword[0])

                # Find best word
                insert_candidate_set = list(insert_candidate_set)
                test_data = []
                for test_word in insert_candidate_set:
                    test_input_words = copy.deepcopy(input_words)
                    for single_word in reversed(
                            test_word.split('_')
                    ):  # Multiple words are connected by '_', e.g. 'a_lot'
                        test_input_words.insert(current_word, single_word)
                    test_data.append(test_input_words)

                # Don't do anything if no candidate exists
                if len(test_data) != 0:
                    test_score = get_score_of_word(test_data, current_word)
                    for i in range(len(test_data)):
                        if test_score[i] > best_prediction:
                            best_prediction = test_score[i]
                            best_word = insert_candidate_set[i]
                            insert = True

            # Replacement
            candidate_set = set()
            # Keywords for candidates
            if use_keywords:
                for keyword in keyword_list[
                        target_class]:  # Doesn't work well, at least for TREC - produces unrecognisable questions
                    if keyword[1] == current_word_tagged[1]:
                        candidate_set.add(keyword[0])

            # Synonyms for candidates
            if 'NN' in current_word_tagged[1]:  # Noun
                for synonym_class in nltk.corpus.wordnet.synsets(
                        input_words[current_word],
                        pos=nltk.corpus.wordnet.NOUN):
                    for synonym in synonym_class.lemma_names():
                        candidate_set.add(str(synonym))

            if 'VB' in current_word_tagged[1]:  # Verb
                for synonym_class in nltk.corpus.wordnet.synsets(
                        input_words[current_word],
                        pos=nltk.corpus.wordnet.VERB):
                    for synonym in synonym_class.lemma_names():
                        candidate_set.add(str(synonym))

            if 'JJ' in current_word_tagged[1]:  # Adjective
                for synonym_class in nltk.corpus.wordnet.synsets(
                        input_words[current_word],
                        pos=nltk.corpus.wordnet.ADJ):
                    for synonym in synonym_class.lemma_names():
                        candidate_set.add(str(synonym))

            if 'RB' in current_word_tagged[1]:  # Adverb
                for synonym_class in nltk.corpus.wordnet.synsets(
                        input_words[current_word],
                        pos=nltk.corpus.wordnet.ADV):
                    for synonym in synonym_class.lemma_names():
                        candidate_set.add(str(synonym))

            # Typos for candidates
            if random.random() < typo_chance:
                for typo in typos.get_typos(input_words[current_word]):
                    candidate_set.add(typo)

            # Find best word
            test_data = []
            candidate_set = list(candidate_set)
            for test_word in candidate_set:
                test_input_words = copy.deepcopy(input_words)
                test_input_words.pop(
                    current_word
                )  # Remove word temporarily - the new one will be inserted here
                for single_word in reversed(
                        test_word.split('_')
                ):  # Multiple words are connected by '_', e.g. 'a_lot'
                    test_input_words.insert(current_word, single_word)
                test_data.append(test_input_words)

            # Don't do anything if no candidate exists
            if len(test_data) != 0:
                test_score = get_score_of_word(test_data, current_word)
                for i in range(len(test_data)):
                    if test_score[i] > best_prediction:
                        best_prediction = test_score[i]
                        best_word = candidate_set[i]
                        insert = False

            if insert:
                number_changes += 1
                changes_list.append(
                    ['insertion', int(current_word), best_word])
                for single_word in reversed(
                        best_word.split('_')
                ):  # Multiple words are connected by '_', e.g. 'a_lot'
                    input_words.insert(current_word, single_word)
                    input_words_tagged.insert(current_word,
                                              (single_word, 'RB'))
                number_words = 1 + best_word.count('_')
                for i in range(
                        len(word_priority_list)
                ):  # Update position of all words after the inserted one
                    if word_priority_list[i] > current_word:
                        word_priority_list[
                            i] = word_priority_list[i] + number_words
            else:
                if best_word != input_words[current_word]:
                    number_changes += 1
                    changes_list.append([
                        'modification',
                        int(current_word), input_words[current_word], best_word
                    ])
                input_words.pop(
                    current_word
                )  # Remove word temporarily - the new one will be inserted here
                old_pos_class = input_words_tagged.pop(current_word)[1]
                for single_word in reversed(
                        best_word.split('_')
                ):  # Multiple words are connected by '_', e.g. 'a_lot'
                    input_words.insert(current_word, single_word)
                    input_words_tagged.insert(current_word,
                                              (single_word, old_pos_class))
                additional_inserted = best_word.count('_')
                for i in range(len(word_priority_list)
                               ):  # Update index if more than one word
                    if word_priority_list[i] > current_word:
                        word_priority_list[
                            i] = word_priority_list[i] + additional_inserted

        # Basic sanity tests
        assert len(input_words) == len(input_words_tagged)
        for i in range(len(input_words)):
            assert input_words[i] == input_words_tagged[i][0]
        for i in range(len(word_priority_list)):
            assert word_priority_list[i] < len(input_words)

        # Test modified input
        input_sentence = ' '.join(input_words)
        print(input_sentence)
        w2v_input = numpy.asarray([
            encode_sentence.encode_word2vec(input_sentence, cache_model=True)
        ])
        current_class = int(numpy.argmax(network.predict(w2v_input)))

    # Sanity check
    assert number_changes == len(changes_list)
    return input_sentence, number_changes, changes_list
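The four near-identical WordNet lookups in attack_w2v (noun, verb, adjective, adverb) can be collapsed into one helper keyed by the Penn Treebank tag prefix; a hedged, behavior-equivalent sketch (the helper name wordnet_synonyms is hypothetical, and the WordNet corpus is assumed to be downloaded as in the function above):

import nltk

# WordNet POS constants: 'n' = noun, 'v' = verb, 'a' = adjective, 'r' = adverb.
_WORDNET_POS = {'NN': 'n', 'VB': 'v', 'JJ': 'a', 'RB': 'r'}

def wordnet_synonyms(word, treebank_tag):
    """Collect WordNet lemma names for `word`, restricted to the POS implied by `treebank_tag`."""
    synonyms = set()
    for tag_prefix, wordnet_pos in _WORDNET_POS.items():
        if tag_prefix in treebank_tag:
            for synonym_class in nltk.corpus.wordnet.synsets(word, pos=wordnet_pos):
                for synonym in synonym_class.lemma_names():
                    synonyms.add(str(synonym))
    return synonyms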
Example #4
# Get all adversarial example data
print()
print('--- Test input data ---')
print()

print('trec')
trec_used_adversarials = list()
trec_used_adversarials_original_class = list()
trec_used_adversarials_id = list()

for i in range(len(trec_input)):
    print('{0:5d} of {1:5d}'.format(i + 1, len(trec_input)), end='\r')
    trec_predicted = trec_network.predict(
        numpy.asarray(
            [encode_sentence.encode_word2vec(trec_input[i],
                                             cache_model=True)]))
    if trec_target_class[i] == int(numpy.argmax(trec_predicted)):
        trec_used_adversarials.append(trec_adversarial_input[i])
        trec_used_adversarials_original_class.append(trec_target_class[i])
        trec_used_adversarials_id.append(trec_id[i])
    else:
        trec_results['wrong_classification'].append(trec_id[i])
print()

print('ag')
ag_used_adversarials = list()
ag_used_adversarials_original_class = list()
ag_used_adversarials_id = list()

for i in range(len(ag_input)):
    print('{0:5d} of {1:5d}'.format(i + 1, len(ag_input)), end='\r')
Example #5
        amazonmovie_w2v_retrained = trained_networks.get_network(
            dataset.DatasetType.AMAZONMOVIE, dataset.Encoding.WORD2VEC,
            every_xth_trainings_data=2, skip_trainings_data=0,
            cache_prefix='retrained-')
        amazonmovie_w2v_second_half = trained_networks.get_network(
            dataset.DatasetType.AMAZONMOVIE, dataset.Encoding.WORD2VEC,
            every_xth_trainings_data=2, skip_trainings_data=1)
        amazonmovie_w2v_alternative = trained_networks.get_network(
            dataset.DatasetType.AMAZONMOVIE, dataset.Encoding.WORD2VEC,
            kernel_variation=[3, 3, 5, 5], every_xth_trainings_data=2,
            skip_trainings_data=0)
        amazonmovie_character = trained_networks.get_network(
            dataset.DatasetType.AMAZONMOVIE, dataset.Encoding.CHARACTER,
            every_xth_trainings_data=2, skip_trainings_data=0)
        number_calculations = 0
    else:
        number_calculations += 1

    input_filename = './adversarial_example/' + filename

    with open(input_filename, 'r') as file:
        file_content = json.load(file)
    gradient_file = dict()
    gradient_file['input_sentence'] = file_content['input_sentence']
    gradient_file['dataset'] = file_content['dataset']
    input_sentence_w2v = numpy.asarray([encode_sentence.encode_word2vec(file_content['input_sentence'], cache_model=True)])
    input_sentence_character = numpy.asarray([encode_sentence.encode_character(file_content['input_sentence'])])
    if file_content['dataset'] == dataset.DatasetType.TREC.value:
        gradient_file['target_network'] = gradient.get_gradient(trec_w2v, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_retrained'] = gradient.get_gradient(trec_w2v_retrained, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_second_half'] = gradient.get_gradient(trec_w2v_second_half, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_alternative_first_half'] = gradient.get_gradient(trec_w2v_alternative, input_sentence_w2v, session).tolist()[0]
        gradient_file['character'] = gradient.get_gradient(trec_character, input_sentence_character, session).tolist()[0]
    elif file_content['dataset'] == dataset.DatasetType.AG.value:
        gradient_file['target_network'] = gradient.get_gradient(ag_w2v, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_retrained'] = gradient.get_gradient(ag_w2v_retrained, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_second_half'] = gradient.get_gradient(ag_w2v_second_half, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_alternative_first_half'] = gradient.get_gradient(ag_w2v_alternative, input_sentence_w2v, session).tolist()[0]
        gradient_file['character'] = gradient.get_gradient(ag_character, input_sentence_character, session).tolist()[0]
    elif file_content['dataset'] == dataset.DatasetType.AMAZONMOVIE.value:
        gradient_file['target_network'] = gradient.get_gradient(amazonmovie_w2v, input_sentence_w2v, session).tolist()[0]
    print()


    # Get all adversarial example data
    print()
    print('--- Test input data ---')
    print()

    print('trec')
    trec_used_adversarials = list()
    trec_used_adversarials_original_class = list()
    trec_used_adversarials_id = list()

    for i in range(len(trec_input)):
        print('{0:5d} of {1:5d}'.format(i+1, len(trec_input)), end='\r')
        if (trec_target_class[i] == int(numpy.argmax(trec_w2v_retrained.predict(
                numpy.asarray([encode_sentence.encode_word2vec(trec_input[i], cache_model=True)]))))
                and trec_target_class[i] == int(numpy.argmax(trec_w2v_alternative.predict(
                    numpy.asarray([encode_sentence.encode_word2vec(trec_input[i], cache_model=True)]))))
                and trec_target_class[i] == int(numpy.argmax(trec_w2v_second_half.predict(
                    numpy.asarray([encode_sentence.encode_word2vec(trec_input[i], cache_model=True)]))))
                and trec_target_class[i] == int(numpy.argmax(trec_character.predict(
                    numpy.asarray([encode_sentence.encode_character(trec_input[i])]))))):
            trec_used_adversarials.append(trec_adversarial_input[i])
            trec_used_adversarials_original_class.append(trec_target_class[i])
            trec_used_adversarials_id.append(trec_id[i])
        else:
            results['trec_wrong_classification'].append(trec_id[i])
    print()

    print('ag')
    ag_used_adversarials = list()
    ag_used_adversarials_original_class = list()
    ag_used_adversarials_id = list()

    for i in range(len(ag_input)):
        print('{0:5d} of {1:5d}'.format(i+1, len(ag_input)), end='\r')
        if (ag_target_class[i] == int(numpy.argmax(ag_w2v_retrained.predict(
                numpy.asarray([encode_sentence.encode_word2vec(ag_input[i], cache_model=True)]))))
                and ag_target_class[i] == int(numpy.argmax(ag_w2v_alternative.predict(
                    numpy.asarray([encode_sentence.encode_word2vec(ag_input[i], cache_model=True)]))))
                and ag_target_class[i] == int(numpy.argmax(ag_w2v_second_half.predict(
                    numpy.asarray([encode_sentence.encode_word2vec(ag_input[i], cache_model=True)]))))
                and ag_target_class[i] == int(numpy.argmax(ag_character.predict(
                    numpy.asarray([encode_sentence.encode_character(ag_input[i])]))))):
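The long agreement conditions in the two loops above repeat the same test per network; a hedged sketch of how that check might be factored out (the helper name all_models_agree and the grouping of models by encoding are assumptions, not part of the original code):

import numpy

def all_models_agree(models, encoded_input, target_class):
    """Return True if every model's argmax prediction on `encoded_input` equals `target_class`."""
    return all(
        int(numpy.argmax(model.predict(encoded_input))) == target_class
        for model in models)

# Usage sketch with the objects from the loop above (word2vec and character
# encodings have to be passed separately, as in the original condition):
#   w2v_ok = all_models_agree(
#       [ag_w2v_retrained, ag_w2v_alternative, ag_w2v_second_half],
#       numpy.asarray([encode_sentence.encode_word2vec(ag_input[i], cache_model=True)]),
#       ag_target_class[i])
#   character_ok = all_models_agree(
#       [ag_character],
#       numpy.asarray([encode_sentence.encode_character(ag_input[i])]),
#       ag_target_class[i])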