Example 1
 def test_get_gradient(self):
     x, y = sympy.symbols('x y')
     expr = 100 * (y - x**2)**2 + (1 - x)**2
     # whose gradient is (-400xy+400x^3+2x-2, 200(y-x^2))
     position1 = np.array([0., 0.])
     # the value should be (-2, 0)
     self.assertTrue(
         np.allclose(np.array([-2, 0]), get_gradient(expr, position1)))
     position2 = np.array([1., 1.])
     # the value should be (0, 0)
     self.assertTrue(
         np.allclose(np.array([0., 0.]), get_gradient(expr, position2)))
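
The get_gradient function exercised by this test is not shown in the example; a minimal sketch, assuming it differentiates the sympy expression with respect to its free symbols taken in name order (so that 'x y' lines up with the position array), could look like this:

import numpy as np
import sympy

def get_gradient(expression, position):
    # Hypothetical sketch, not the project's actual implementation:
    # differentiate with respect to each free symbol (sorted by name)
    # and evaluate the partial derivatives at the given position.
    symbols = sorted(expression.free_symbols, key=lambda s: s.name)
    subs = dict(zip(symbols, position))
    return np.array([float(sympy.diff(expression, s).subs(subs))
                     for s in symbols])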
Example 2
def derivate_in_search(expression, direction, position):
    """
	Handle the derivative operation in the line search.
	para::expression: the expression of a function
	para::direction: the direction of the line search, array-like object (python list, numpy 1d array, etc.)
	para::position: array-like object (python list, numpy 1d array, etc.)
	return::derivate: the numerical value of the directional derivative
	background:
		we want to find the minimum of the function
		phi(alpha) = f(x_k + alpha * direction_k)
		whose derivative is
		phi'(alpha) = dot(gradient(f)(x_k + alpha * direction_k), direction_k)
	"""
    expr_gradient = get_gradient(expression, position)
    derivate = np.dot(expr_gradient, direction)
    return derivate
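
A brief usage sketch, reusing the Rosenbrock-style expression from Example 1 and the hypothetical get_gradient sketched above; at alpha = 0 with the steepest-descent direction the result is -||grad f(x_0)||**2:

x, y = sympy.symbols('x y')
expr = 100 * (y - x**2)**2 + (1 - x)**2
position = np.array([0., 0.])
direction = -get_gradient(expr, position)             # steepest descent: (2, 0)
print(derivate_in_search(expr, direction, position))  # -> -4.0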
Example 3
def pix_arr_from_file(filename):
	'''
		Given a filename for an image, builds a 2D list of Pixel objects (one row
		per image row), pairing each pixel's color with its value in the
		Laplacian gradient of the image.
	'''
	img_grad = gradient.get_gradient(filename)
	img = Image.open(filename)
	
	pix_arr = []
	for row in range(img.size[1]):
		new_row = []
		for col in range(img.size[0]):
			new_row.append(Pixel(img.getpixel((col, row)), img_grad[row][col]))
		pix_arr.append(new_row)

	img.close()

	return pix_arr
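
This example depends on a Pixel class and a gradient module that are not shown; a minimal Pixel matching the constructor call above might look like the following (hypothetical - the real class may carry more state), with gradient.get_gradient assumed to return a 2D array of Laplacian values indexed as [row][col]:

class Pixel:
    def __init__(self, color, grad_value):
        self.color = color            # RGB tuple from Image.getpixel((col, row))
        self.grad_value = grad_value  # Laplacian gradient value at this pixel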
Example 4
    def get_score_of_word(sentence_words, index, batch_size=256):
        """
        Calculates the score (gradient) of given word for all given sentences

        :param sentence_words: Sentences to test
        :type sentence_words: list of lists of words
        :param index: Index of the word to score
        :type index: int
        :param batch_size: Batch size. The larger the batch, the faster the calculation but also the higher the RAM consumption
        :type batch_size: int
        :returns: list containing scores for every sentence
        """
        length = 0
        for sentence in sentence_words:
            length = max(length, len(sentence))
        result = []

        # Build batches
        number_batches = (len(sentence_words) // batch_size) + 1
        for current_batch in range(number_batches):
            print('Calculating score: {0:3d} of {1:3d}'.format(
                current_batch + 1, number_batches),
                  end='\r')
            sentence_w2v = list()
            for i in range(batch_size * current_batch,
                           batch_size * (current_batch + 1)):
                if i < len(sentence_words):
                    sentence_w2v.append(
                        encode_sentence.encode_word2vec(' '.join(
                            sentence_words[i]),
                                                        min_length=length,
                                                        cache_model=True))
                else:
                    break
            if len(sentence_w2v) == 0:
                break
            gradient_input = numpy.asarray(sentence_w2v)
            sentence_gradient = gradient.get_gradient(network, gradient_input,
                                                      session)
            for output_gradient in sentence_gradient:
                result.append(numpy.amax(output_gradient, axis=1)[0])
        print()
        return result
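
In Example 5 this helper is called on a batch of candidate sentences, each given as a list of words, together with the index of the word currently under consideration:

# test_data: list of candidate sentences (each a list of words),
# current_word: index of the word being scored (see Example 5 below).
test_score = get_score_of_word(test_data, current_word)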
Example 5
def attack_w2v(network: keras.models.Model,
               input_data: str,
               session: tensorflow.InteractiveSession,
               data_x: list,
               data_y: list,
               attack_key=None,
               use_keywords=True,
               typo_chance=1.0) -> tuple:
    """
    Runs an adversarial attack against a network which uses the Google word2vec encoding.

    The attack is modelled after the attack described by Suranjana Samanta and Sameep Mehta in "Towards Crafting Text Adversarial Samples".
    https://arxiv.org/abs/1707.02812

    :raises: NoAdversarialExampleFound

    :param network: Network to attack
    :type network: keras.model.Model
    :param input_data: Input sentence
    :type input_data: str
    :param session: Session to run the network under
    :type session: tensorflow.InteractiveSession
    :param data_x: Dataset in normal string form
    :type data_x: [str, str, ...]
    :param data_y: Dataset classes as normal integers
    :type data_y: [int, int, ...]
    :param attack_key: Key to store keyword list for this game. None for no caching.
    :type attack_key: str
    :param use_keywords: If set to True, 'Genre specific keywords' will be used (see Samanta and Mehta)
    :type use_keywords: bool
    :param typo_chance: Chance that typos are included into the search
    :type typo_chance: float
    :returns: Tuple of the resulting attack string, the number of changes (int), and the list of changes
    """
    input_sentence = input_data

    if not os.path.exists('./datasets/nltk/'):
        os.makedirs('./datasets/nltk/')

    keyword_list = _create_keyword_list(data_x, data_y, attack_key)

    number_changes = 0
    changes_list = []

    nltk.data.path = ['./datasets/nltk/']
    global _ATTACKS_WORDNET_CHECKED
    global _ATTACKS_AVERAGED_PERCEPTRON_TAGGER_CHECKED
    if not _ATTACKS_WORDNET_CHECKED:
        nltk.download('wordnet', download_dir='./datasets/nltk/')
        _ATTACKS_WORDNET_CHECKED = True
    if not _ATTACKS_AVERAGED_PERCEPTRON_TAGGER_CHECKED:
        nltk.download('averaged_perceptron_tagger',
                      download_dir='./datasets/nltk/')
        _ATTACKS_AVERAGED_PERCEPTRON_TAGGER_CHECKED = True

    w2v_input = numpy.asarray(
        [encode_sentence.encode_word2vec(input_sentence, cache_model=True)])

    # Calculate origin class and current class
    origin_class = int(numpy.argmax(network.predict(w2v_input)))
    current_class = origin_class

    # Select target class for the keyword list
    target_class = random.randint(0, numpy.max(data_y))
    while target_class == origin_class:
        target_class = random.randint(0, numpy.max(data_y))

    # Calculate priorities based on gradient
    input_gradient = gradient.get_gradient(network, w2v_input, session)

    word_score = numpy.amax(input_gradient, axis=2)[0]

    word_priority_list = list()
    for i in range(input_gradient.shape[1]):
        maximum_value = numpy.argmax(word_score)
        word_priority_list.append(maximum_value)
        word_score[maximum_value] = -99999

    print(input_sentence)

    # IMPORTANT: These two have to be kept in sync!
    # This is to reduce mis-tags later, after the sentence has been modified
    input_words = input_sentence.split(" ")
    for i in range(len(input_words)):
        if input_words[i] == '':  # Avoid empty words for tagging
            input_words[i] = ' '
    input_words_tagged = nltk.pos_tag(input_words)
    for i in range(len(input_words)):
        if input_words[i] == ' ':  # Reconstruct original sentence
            input_words[i] = ''
            input_words_tagged[i] = ('', input_words_tagged[i][1])

    # Helper function for getting scores
    def get_score_of_word(sentence_words, index, batch_size=256):
        """
        Calculates the score (gradient) of given word for all given sentences

        :param sentence_words: Sentences to test
        :type sentence_words: list of lists of words
        :param index: Index of the word to score
        :type index: int
        :param batch_size: Batch size. The larger the batch, the faster the calculation but also the higher the RAM consumption
        :type batch_size: int
        :returns: list containing scores for every sentence
        """
        length = 0
        for sentence in sentence_words:
            length = max(length, len(sentence))
        result = []

        # Build batches
        number_batches = (len(sentence_words) // batch_size) + 1
        for current_batch in range(number_batches):
            print('Calculating score: {0:3d} of {1:3d}'.format(
                current_batch + 1, number_batches),
                  end='\r')
            sentence_w2v = list()
            for i in range(batch_size * current_batch,
                           batch_size * (current_batch + 1)):
                if i < len(sentence_words):
                    sentence_w2v.append(
                        encode_sentence.encode_word2vec(' '.join(
                            sentence_words[i]),
                                                        min_length=length,
                                                        cache_model=True))
                else:
                    break
            if len(sentence_w2v) == 0:
                break
            gradient_input = numpy.asarray(sentence_w2v)
            sentence_gradient = gradient.get_gradient(network, gradient_input,
                                                      session)
            for output_gradient in sentence_gradient:
                result.append(numpy.amax(output_gradient, axis=1)[0])
        print()
        return result

    # Basic sanity tests
    for i in range(len(word_priority_list)):
        assert word_priority_list[i] < len(input_words)

    while origin_class == current_class:  # Always change most important word per round
        print('Remaining words to test: {}'.format(len(word_priority_list)))
        if len(word_priority_list
               ) == 0:  # We have tried changing all words - aborting
            raise NoAdversarialExampleFound()

        current_word = word_priority_list.pop(0)
        current_word_tagged = input_words_tagged[current_word]

        if 'RB' in current_word_tagged[
                1] and current_word != 0:  # The word is an adverb - workaround to not delete the first word as it might lead to incorrect sentences
            number_changes += 1
            changes_list.append(
                ['deletion',
                 int(current_word), input_words[current_word]])
            input_words.pop(current_word)
            input_words_tagged.pop(current_word)
            for i in range(
                    len(word_priority_list)
            ):  # Update position of all words after the deleted one
                if word_priority_list[i] > current_word:
                    word_priority_list[i] = word_priority_list[i] - 1
        else:
            insert = False
            best_word = input_words[current_word]
            best_prediction = -100.0

            # Insertion
            if 'JJ' in current_word_tagged[1]:
                insert_candidate_set = set()
                for keyword in keyword_list[target_class]:
                    if 'RB' in keyword[1]:
                        insert_candidate_set.add(keyword[0])

                # Find best word
                insert_candidate_set = list(insert_candidate_set)
                test_data = []
                for test_word in insert_candidate_set:
                    test_input_words = copy.deepcopy(input_words)
                    for single_word in reversed(
                            test_word.split('_')
                    ):  # Multiple words are connected by '_', e.g. 'a_lot'
                        test_input_words.insert(current_word, single_word)
                    test_data.append(test_input_words)

                # Don't do anything if no candidate exists
                if len(test_data) != 0:
                    test_score = get_score_of_word(test_data, current_word)
                    for i in range(len(test_data)):
                        if test_score[i] > best_prediction:
                            best_prediction = test_score[i]
                            best_word = insert_candidate_set[i]
                            insert = True

            # Replacement
            candidate_set = set()
            # Keywords for candidates
            if use_keywords:
                for keyword in keyword_list[
                        target_class]:  # Doesn't work well, at least for TREC - can produce unrecognisable questions
                    if keyword[1] == current_word_tagged[1]:
                        candidate_set.add(keyword[0])

            # Synonyms for candidates
            if 'NN' in current_word_tagged[1]:  # Noun
                for synonym_class in nltk.corpus.wordnet.synsets(
                        input_words[current_word],
                        pos=nltk.corpus.wordnet.NOUN):
                    for synonym in synonym_class.lemma_names():
                        candidate_set.add(str(synonym))

            if 'VB' in current_word_tagged[1]:  # Verb
                for synonym_class in nltk.corpus.wordnet.synsets(
                        input_words[current_word],
                        pos=nltk.corpus.wordnet.VERB):
                    for synonym in synonym_class.lemma_names():
                        candidate_set.add(str(synonym))

            if 'JJ' in current_word_tagged[1]:  # Adjective
                for synonym_class in nltk.corpus.wordnet.synsets(
                        input_words[current_word],
                        pos=nltk.corpus.wordnet.ADJ):
                    for synonym in synonym_class.lemma_names():
                        candidate_set.add(str(synonym))

            if 'RB' in current_word_tagged[1]:  # Adverb
                for synonym_class in nltk.corpus.wordnet.synsets(
                        input_words[current_word],
                        pos=nltk.corpus.wordnet.ADV):
                    for synonym in synonym_class.lemma_names():
                        candidate_set.add(str(synonym))

            # Typos for candidates
            if random.random() < typo_chance:
                for typo in typos.get_typos(input_words[current_word]):
                    candidate_set.add(typo)

            # Find best word
            test_data = []
            candidate_set = list(candidate_set)
            for test_word in candidate_set:
                test_input_words = copy.deepcopy(input_words)
                test_input_words.pop(
                    current_word
                )  # Remove word temporarily - the new one will be inserted here
                for single_word in reversed(
                        test_word.split('_')
                ):  # Multiple words are connected by '_', e.g. 'a_lot'
                    test_input_words.insert(current_word, single_word)
                test_data.append(test_input_words)

            # Don't do anything if no candidate exists
            if len(test_data) != 0:
                test_score = get_score_of_word(test_data, current_word)
                for i in range(len(test_data)):
                    if test_score[i] > best_prediction:
                        best_prediction = test_score[i]
                        best_word = candidate_set[i]
                        insert = False

            if insert:
                number_changes += 1
                changes_list.append(
                    ['insertion', int(current_word), best_word])
                for single_word in reversed(
                        best_word.split('_')
                ):  # Multiple words are connected by '_', e.g. 'a_lot'
                    input_words.insert(current_word, single_word)
                    input_words_tagged.insert(current_word,
                                              (single_word, 'RB'))
                number_words = 1 + best_word.count('_')
                for i in range(
                        len(word_priority_list)
                ):  # Update position of all words after the inserted one
                    if word_priority_list[i] > current_word:
                        word_priority_list[
                            i] = word_priority_list[i] + number_words
            else:
                if best_word != input_words[current_word]:
                    number_changes += 1
                    changes_list.append([
                        'modification',
                        int(current_word), input_words[current_word], best_word
                    ])
                input_words.pop(
                    current_word
                )  # Remove word temporarily - the new one will be inserted here
                old_pos_class = input_words_tagged.pop(current_word)[1]
                for single_word in reversed(
                        best_word.split('_')
                ):  # Multiple words are connected by '_', e.g. 'a_lot'
                    input_words.insert(current_word, single_word)
                    input_words_tagged.insert(current_word,
                                              (single_word, old_pos_class))
                additional_inserted = best_word.count('_')
                for i in range(len(word_priority_list)
                               ):  # Update index if more than one word
                    if word_priority_list[i] > current_word:
                        word_priority_list[
                            i] = word_priority_list[i] + additional_inserted

        # Basic sanity tests
        assert len(input_words) == len(input_words_tagged)
        for i in range(len(input_words)):
            assert input_words[i] == input_words_tagged[i][0]
        for i in range(len(word_priority_list)):
            assert word_priority_list[i] < len(input_words)

        # Test modified input
        input_sentence = ' '.join(input_words)
        print(input_sentence)
        w2v_input = numpy.asarray([
            encode_sentence.encode_word2vec(input_sentence, cache_model=True)
        ])
        current_class = int(numpy.argmax(network.predict(w2v_input)))

    # Sanity check
    assert number_changes == len(changes_list)
    return input_sentence, number_changes, changes_list
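
A hypothetical invocation sketch; the trained Keras model, the active tensorflow.InteractiveSession, and the data_x / data_y lists are assumed to already exist (for instance obtained via trained_networks.get_network as in Example 7), and the input question and attack_key are only illustrative:

try:
    adv_sentence, n_changes, changes = attack_w2v(
        network, 'how far is it from denver to aspen ?', session,
        data_x, data_y, attack_key='trec', use_keywords=True, typo_chance=1.0)
    print(adv_sentence, n_changes, changes)
except NoAdversarialExampleFound:
    print('no adversarial example found for this input')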
Example 6
def view_gradient_descent(expression,
                          position,
                          epsilon,
                          alpha_bar=3,
                          rho=0.4,
                          sigma=0.7):
    """
	carry out the gradient descent method to find the minimum of the given function
	para::expression: the expression of the function
	para::position: the initial search point
	para::epsilon: the termination error
	para::alpha_bar: the initial step length for the line search
	para::rho: parameter of the line search, in (0, 1/2)
	para::sigma: parameter of the line search, in (rho, 1)
	..note: to use this method, it is best to construct the expression in this way:
		declare all the variables
		write the expression

		for example:
		>>> import sympy 
		>>> import numpy as np
		>>> x, y, z = sympy.symbols('x y z')
		>>> expr = sympy.cos(x) + sympy.exp(y*z)
	"""
    # convert the expression to a numpy-callable function for plotting;
    # sort the free symbols by name so the argument order (x, y) is deterministic
    show_func = sympy.lambdify(sorted(expression.free_symbols, key=lambda s: s.name),
                               expression, 'numpy')
    # the new points in each iteration
    show_points = [position.tolist()]
    iter_time = 0
    while True:
        expr_gradient = get_gradient(expression, position)
        if npnorm(expr_gradient) < epsilon:
            break
        direction = -expr_gradient
        # get the step length by line search
        alpha = line_search(expression, position, direction, alpha_bar, rho,
                            sigma)
        position = position + alpha * direction
        show_points.append(position.tolist())
        iter_time += 1
    print('iterations: ', iter_time)
    print('point: ', show_points[-1])
    # draw the contour and line segments between the iterate points
    fig = plt.figure()
    ccx = np.linspace(-50, 50, 1000)
    ccy = np.linspace(-50, 50, 1000)
    X, Y = np.meshgrid(ccx, ccy)
    Z = show_func(X, Y)
    plt.contour(X, Y, Z, colors='black')
    """
	for k in range(len(show_points) -1):
		plt.plot((show_points[k], show_points[k+1]),
			color='brown', marker='o')
	"""
    # draw the lines
    segs = [[k, k + 1] for k in range(len(show_points) - 1)]
    lines = [[tuple(show_points[j]) for j in i] for i in segs]
    lc = matplotlib.collections.LineCollection(lines)
    ax = fig.add_subplot(111)
    ax.add_collection(lc)
    plt.xlim([-50, 50])
    plt.ylim([-50, 50])
    plt.show()
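
A usage sketch following the docstring's construction advice, reusing the Rosenbrock-style expression from Example 1; the starting point and tolerance are illustrative:

x, y = sympy.symbols('x y')
expr = 100 * (y - x**2)**2 + (1 - x)**2
view_gradient_descent(expr, np.array([-3., 2.]), epsilon=1e-5)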
Example 7
        amazonmovie_character = trained_networks.get_network(dataset.DatasetType.AMAZONMOVIE, dataset.Encoding.CHARACTER, every_xth_trainings_data=2, skip_trainings_data=0)
        number_calculations = 0
    else:
        number_calculations += 1

    input_filename = './adversarial_example/' + filename

    with open(input_filename, 'r') as file:
        file_content = json.load(file)
    gradient_file = dict()
    gradient_file['input_sentence'] = file_content['input_sentence']
    gradient_file['dataset'] = file_content['dataset']
    input_sentence_w2v = numpy.asarray([encode_sentence.encode_word2vec(file_content['input_sentence'], cache_model=True)])
    input_sentence_character = numpy.asarray([encode_sentence.encode_character(file_content['input_sentence'])])
    if file_content['dataset'] == dataset.DatasetType.TREC.value:
        gradient_file['target_network'] = gradient.get_gradient(trec_w2v, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_retrained'] = gradient.get_gradient(trec_w2v_retrained, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_second_half'] = gradient.get_gradient(trec_w2v_second_half, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_alternative_first_half'] = gradient.get_gradient(trec_w2v_alternative, input_sentence_w2v, session).tolist()[0]
        gradient_file['character'] = gradient.get_gradient(trec_character, input_sentence_character, session).tolist()[0]
    elif file_content['dataset'] == dataset.DatasetType.AG.value:
        gradient_file['target_network'] = gradient.get_gradient(ag_w2v, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_retrained'] = gradient.get_gradient(ag_w2v_retrained, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_second_half'] = gradient.get_gradient(ag_w2v_second_half, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_alternative_first_half'] = gradient.get_gradient(ag_w2v_alternative, input_sentence_w2v, session).tolist()[0]
        gradient_file['character'] = gradient.get_gradient(ag_character, input_sentence_character, session).tolist()[0]
    elif file_content['dataset'] == dataset.DatasetType.AMAZONMOVIE.value:
        gradient_file['target_network'] = gradient.get_gradient(amazonmovie_w2v, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_retrained'] = gradient.get_gradient(amazonmovie_w2v_retrained, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_second_half'] = gradient.get_gradient(amazonmovie_w2v_second_half, input_sentence_w2v, session).tolist()[0]
        gradient_file['w2v_alternative_first_half'] = gradient.get_gradient(amazonmovie_w2v_alternative, input_sentence_w2v, session).tolist()[0]