import random

import numpy as np

# Gru and mathutils come from elsewhere in this project; their import paths
# are not shown in this excerpt.


def test_multi_step_gradient(self):
    input_size = 5
    hidden_size = 6
    n = Gru(input_size, hidden_size)
    xs = [frand(size=input_size) for _ in range(10)]
    h0 = frand(hidden_size)

    intermediate_results = {}
    hs = n.forward_prop(xs, h0, intermediate_results)
    n.back_prop([derr(h) for h in hs], intermediate_results)

    delta = 1e-4
    for index in scalar_indices(n):
        array_name = gru_array_names(n)[index[0]]

        # Nudge one weight down and measure the total error over the sequence.
        slightly_less = clone(n)
        gru_arrays(slightly_less)[index[0]][index[1]] -= delta
        slightly_less_hs = slightly_less.forward_prop(xs, h0, {})
        err_slightly_less = sum(err(h) for h in slightly_less_hs)

        # Nudge the same weight up and measure again.
        slightly_more = clone(n)
        gru_arrays(slightly_more)[index[0]][index[1]] += delta
        slightly_more_hs = slightly_more.forward_prop(xs, h0, {})
        err_slightly_more = sum(err(h) for h in slightly_more_hs)

        # The central-difference estimate should match the gradient that
        # back_prop accumulated analytically.
        numerical_grad = (err_slightly_more - err_slightly_less) / (2 * delta)
        analytic_grad = gru_results_arrays(
            intermediate_results)[index[0]][index[1]]
        self.assertTrue(
            abs(numerical_grad - analytic_grad) < 0.01,
            "{}: {} not within threshold of {}".format(
                array_name, analytic_grad, numerical_grad))
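# The gradient tests rely on a few module-level helpers that this excerpt does
# not define. A minimal sketch follows, under the assumption that err/derr are
# any differentiable scalar error and its gradient (derr = d(err)/dh); the
# names come from the tests, the bodies are illustrative. Both tests use the
# central difference (E(w + d) - E(w - d)) / (2d) with d = 1e-4, whose
# truncation error is O(d^2) rather than the O(d) of a one-sided difference.


def frand(size):
    # Small uniform random values; keeps the gates away from saturation.
    return np.random.uniform(-0.2, 0.2, size)


def err(h):
    # Half the squared norm of a hidden state: an arbitrary scalar error.
    return 0.5 * np.sum(np.square(h))


def derr(h):
    # Gradient of err with respect to h.
    return h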
def clone(gru: Gru):
    # Copy every trainable array into a fresh Gru so a test can perturb one
    # weight without disturbing the network under test.
    gru_clone = Gru(0, 0)
    gru_clone.w_rx = np.copy(gru.w_rx)
    gru_clone.w_rh = np.copy(gru.w_rh)
    gru_clone.b_r = np.copy(gru.b_r)
    gru_clone.w_zx = np.copy(gru.w_zx)
    gru_clone.w_zh = np.copy(gru.w_zh)
    gru_clone.b_z = np.copy(gru.b_z)
    gru_clone.w_hx = np.copy(gru.w_hx)
    gru_clone.w_hh = np.copy(gru.w_hh)
    gru_clone.b_h = np.copy(gru.b_h)
    return gru_clone
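# The remaining helpers enumerate the GRU's trainable arrays so a test can
# perturb any single scalar weight. A sketch under the assumption that the
# nine arrays follow the attribute order used in clone above (reset gate r,
# update gate z, candidate state h), and that back_prop stores its accumulated
# gradients in intermediate_results under "d"-prefixed keys; GRU_ARRAY_NAMES
# and those key names are hypothetical.

GRU_ARRAY_NAMES = ["w_rx", "w_rh", "b_r", "w_zx", "w_zh", "b_z",
                   "w_hx", "w_hh", "b_h"]


def gru_array_names(gru):
    return GRU_ARRAY_NAMES


def gru_arrays(gru):
    return [getattr(gru, name) for name in GRU_ARRAY_NAMES]


def gru_results_arrays(intermediate_results):
    # Accumulated gradients, in the same order as gru_arrays.
    return [intermediate_results["d" + name] for name in GRU_ARRAY_NAMES]


def scalar_indices(gru):
    # Yields an (array index, element index) pair for every scalar parameter.
    for array_index, array in enumerate(gru_arrays(gru)):
        for element_index in np.ndindex(array.shape):
            yield array_index, element_index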
def test_learn_word_vectors_from_char_vector_sequence(self):
    text = ("please learn how to infer word vectors from sequences of "
            "character vectors")
    index_to_word = list(set(text.split()))
    index_to_char = list(set(text))
    word_to_index = {word: index for index, word in enumerate(index_to_word)}
    char_to_index = {char: index for index, char in enumerate(index_to_char)}

    def to_char_vector_sequence(word):
        # One-hot over a background of -1s for each character, terminated by
        # an all-zeros vector.
        sequence = []
        for char in word:
            vector = np.ones(len(char_to_index)) * -1
            vector[char_to_index[char]] = 1
            sequence.append(vector)
        sequence.append(np.zeros(len(char_to_index)))
        return sequence

    def to_word_vector(word):
        vector = np.ones(len(word_to_index)) * -1
        vector[word_to_index[word]] = 1
        return vector

    training_data = [(to_char_vector_sequence(word), to_word_vector(word))
                     for word in text.split()]

    n = Gru(len(index_to_char), len(index_to_word))
    for i in range(1000):
        for char_vectors, word_vector in training_data:
            intermediate_results = {}
            hs = n.forward_prop(char_vectors, np.zeros(len(index_to_word)),
                                intermediate_results)
            # Only the final hidden state is scored; every earlier step
            # receives zero error.
            dhs = [np.zeros(shape=word_vector.shape) for _ in range(len(hs))]
            dhs[-1] = ce_err_prime(hs[-1], word_vector)
            n.back_prop(dhs, intermediate_results)
            n.train(0.1, intermediate_results)
        if i % 200 == 0:
            total_err = 0
            for char_vectors, word_vector in training_data:
                hs = n.forward_prop(char_vectors,
                                    np.zeros(len(index_to_word)), {})
                total_err += mathutils.mse(hs[-1], word_vector)
            print(total_err / len(training_data))

    result = n.forward_prop(to_char_vector_sequence("infer"),
                            np.zeros(len(index_to_word)), {})[-1]
    self.assertEqual("infer", index_to_word[np.argmax(result)])
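# ce_err_prime above is the derivative of the training error with respect to
# the final hidden state; this excerpt does not define it. Given the +/-1
# targets and tanh-range outputs, one plausible form (a sketch, not the
# project's actual definition) rescales both into (0, 1) and differentiates
# the cross-entropy. mathutils.mse is assumed to be an ordinary mean squared
# error, roughly np.mean(np.square(actual - expected)).


def ce_err_prime(h, target):
    # Map the tanh range (-1, 1) onto probabilities in (0, 1), clipped for
    # numerical safety, then take d(cross-entropy)/dh; dp/dh = 1/2 accounts
    # for the trailing division by 2.
    p = np.clip((h + 1) / 2, 1e-7, 1 - 1e-7)
    t = (target + 1) / 2
    return (p - t) / (p * (1 - p)) / 2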
def test_single_step_gradient(self):
    input_size = 5
    hidden_size = 6
    n = Gru(input_size, hidden_size)
    xs = [frand(size=input_size)]
    h0 = frand(hidden_size)

    intermediate_results = {}
    hs = n.forward_prop(xs, h0, intermediate_results)
    dh0 = n.back_prop([derr(hs[-1])], intermediate_results)

    delta = 1e-4
    for index in scalar_indices(n):
        array_name = gru_array_names(n)[index[0]]

        slightly_less = clone(n)
        gru_arrays(slightly_less)[index[0]][index[1]] -= delta
        err_slightly_less = err(slightly_less.forward_prop(xs, h0, {})[-1])

        slightly_more = clone(n)
        gru_arrays(slightly_more)[index[0]][index[1]] += delta
        err_slightly_more = err(slightly_more.forward_prop(xs, h0, {})[-1])

        numerical_grad = (err_slightly_more - err_slightly_less) / (2 * delta)
        analytic_grad = gru_results_arrays(
            intermediate_results)[index[0]][index[1]]
        self.assertTrue(
            abs(numerical_grad - analytic_grad) < 0.01,
            "{}: {} not within threshold of {}".format(
                array_name, analytic_grad, numerical_grad))

    # back_prop also returns the gradient with respect to the initial hidden
    # state; check it the same way.
    for index in np.ndindex(h0.shape):
        slightly_less_h0 = np.copy(h0)
        slightly_less_h0[index] -= delta
        err_slightly_less_h0 = err(
            n.forward_prop(xs, slightly_less_h0, {})[-1])

        slightly_more_h0 = np.copy(h0)
        slightly_more_h0[index] += delta
        err_slightly_more_h0 = err(
            n.forward_prop(xs, slightly_more_h0, {})[-1])

        numerical_grad = (err_slightly_more_h0 -
                          err_slightly_less_h0) / (2 * delta)
        analytic_grad = dh0[index]
        self.assertTrue(
            abs(numerical_grad - analytic_grad) < 0.01,
            "h0: {} not within threshold of {}".format(
                analytic_grad, numerical_grad))
def testTranslateWordsIntoInitialisms(self):
    text = "Born in Vienna into one of Europe's richest families, he inherited a large fortune " \
           "from his father in 1913. He gave some considerable sums to poor artists. In a period " \
           "of severe personal depression after the first World War, he then gave away his entire " \
           "fortune to his brothers and sisters. Three of his brothers committed suicide, with " \
           "Wittgenstein contemplating it too. He left academia several times—serving as an " \
           "officer on the front line during World War I, where he was decorated a number of times " \
           "for his courage; teaching in schools in remote Austrian villages where he encountered " \
           "controversy for hitting children when they made mistakes in mathematics; and working " \
           "as a hospital porter during World War II in London where he told patients not to take " \
           "the drugs they were prescribed while largely managing to keep secret the fact that he " \
           "was one of the world's most famous philosophers."
    index_to_word = sorted(set(text.split(sep=" ")))
    word_to_index = {word: i for i, word in enumerate(index_to_word)}
    index_to_char = sorted(set(word[0].upper() for word in index_to_word))
    char_to_index = {char: i for i, char in enumerate(index_to_char)}

    def vector_from_word(word):
        index = word_to_index[word]
        vec = np.zeros(len(index_to_word))
        vec[index] = 1
        return vec

    def word_from_vector(vector):
        index = vector.argmax()
        if vector[index] < 0.3:
            return "?"
        return index_to_word[index]

    def vector_from_char(char):
        # Characters outside the initials alphabet map to the zero vector.
        vec = np.zeros(len(index_to_char))
        upper = char.upper()
        if upper in char_to_index:
            index = char_to_index[upper]
            vec[index] = 1
        return vec

    def char_from_vector(vector):
        # Strongly negative reads as the end-of-sequence marker, near zero as
        # "don't know", anything else as a character.
        index = vector.argmax()
        if vector[index] < -0.3:
            return " "
        if vector[index] < 0.3:
            return "?"
        return index_to_char[index]

    max_seq_size = 5
    training_set = []
    for _ in range(500):
        seq_size = random.randint(1, max_seq_size)
        word_indices = [random.randrange(0, len(index_to_word))
                        for _ in range(seq_size)]
        words = [index_to_word[index] for index in word_indices]
        initials = [word[0].upper() for word in words]
        training_set.append(([vector_from_word(word) for word in words],
                             [vector_from_char(char) for char in initials]))

    encoder_hidden_state_size = 50
    encoder = Gru(len(index_to_word), encoder_hidden_state_size)
    # Each decoder input is the encoded state concatenated with the character
    # emitted at the previous step.
    decoder = Gru(len(index_to_char) + encoder_hidden_state_size,
                  len(index_to_char))
    encoder_h0 = np.random.uniform(-0.2, 0.2, encoder_hidden_state_size)
    decoder_h0 = np.random.uniform(-0.2, 0.2, len(index_to_char))
    end_of_sequence = np.ones(len(index_to_char)) * -1

    for epoch in range(10000):
        debug = epoch % 5000 == 0
        for word_vectors, char_vectors in random.sample(training_set, 30):
            encoder_results = {}
            encoded_state = encoder.forward_prop(word_vectors, encoder_h0,
                                                 encoder_results)[-1]

            decoder_results = {}

            def decoder_input_generator(max_len):
                # First input: encoded state plus an empty character slot.
                yield np.concatenate(
                    [encoded_state, np.zeros(len(index_to_char))])
                i = 0
                while i <= max_len:
                    # Re-read the decoder's latest hidden state after every
                    # yield and feed its character back in, stopping at the
                    # end-of-sequence marker.
                    prev_h = decoder_results["hs"][-1]
                    resulting_char = char_from_vector(prev_h)
                    if resulting_char == " ":
                        break
                    yield np.concatenate(
                        [encoded_state, vector_from_char(resulting_char)])
                    i += 1

            hs = decoder.forward_prop(
                decoder_input_generator(len(char_vectors)), decoder_h0,
                decoder_results)

            if len(hs) <= len(char_vectors):
                targets = char_vectors[:len(hs)]
            else:
                # Pad with end-of-sequence targets if the decoder ran long.
                targets = char_vectors + [
                    end_of_sequence
                    for _ in range(len(hs) - len(char_vectors))
                ]
            decoder_errors = [h - target for h, target in zip(hs, targets)]
            decoder.back_prop(decoder_errors, decoder_results)
            decoder.train(0.1, decoder_results)

            # Only the encoded-state slice of the decoder's input gradient
            # flows back into the encoder, and only at its final time step.
            encoded_state_error = decoder_results["dx"][:len(encoded_state)]
            encoder_errors = ([np.zeros(encoder_hidden_state_size)] *
                              (len(word_vectors) - 1)) + [encoded_state_error]
            encoder.back_prop(encoder_errors, encoder_results)
            encoder.train(0.1, encoder_results)

            if debug:
                print(" ".join(word_from_vector(word_vector)
                               for word_vector in word_vectors))
                print("".join(char_from_vector(h) for h in hs))
                print(sum(np.sum(np.square(e)) for e in decoder_errors))
                encoder.save("encoder")
                decoder.save("decoder")
                debug = False
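# For reference, a hedged sketch of greedy inference with a trained
# encoder/decoder pair, mirroring the input generator above but outside the
# training loop. decode_initialism and its parameters are illustrative names,
# not part of the tested API; char_from_vector and vector_from_char follow
# the conventions of the test's local helpers.


def decode_initialism(encoder, decoder, word_vectors, encoder_h0, decoder_h0,
                      char_from_vector, vector_from_char, n_chars,
                      max_len=10):
    # Compress the whole word sequence into the encoder's final hidden state.
    encoded_state = encoder.forward_prop(word_vectors, encoder_h0, {})[-1]
    chars = []
    decoder_input = np.concatenate([encoded_state, np.zeros(n_chars)])
    h = decoder_h0
    for _ in range(max_len):
        # One decoder step per character, feeding the previous output back in.
        h = decoder.forward_prop([decoder_input], h, {})[-1]
        char = char_from_vector(h)
        if char == " ":  # the end-of-sequence reading used in training
            break
        chars.append(char)
        decoder_input = np.concatenate(
            [encoded_state, vector_from_char(char)])
    return "".join(chars)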