def test_grad_bs(self):
    """Check back-propagated bias gradients against finite differences.

    Runs one forward and one backward pass through a 4-7-2-3 network with
    an all-zero target, reads the gradients found under ``"dbs"`` in the
    intermediate-results dict, and compares each of them (to 3 decimals)
    with a central-difference estimate obtained by nudging one bias entry
    at a time by ``+/- delta`` on cloned networks.
    """
    n = FeedForwardNetwork([4, 7, 2, 3])
    x0 = np.random.uniform(size=4).astype(TYPE)
    intermediate_results = {}
    y = n.forward_prop(x0, intermediate_results)
    t = np.zeros(3).astype(TYPE)
    dy = mathutils.mean_squared_error_prime(y, t)
    n.back_prop(dy, intermediate_results)
    dbs = intermediate_results["dbs"]

    delta = 1e-4
    exp_dbs = []
    for i, b in enumerate(n.bs):
        exp_db = np.zeros(b.shape)
        for index in np.ndindex(b.shape):
            # Perturb a single bias entry in opposite directions on two
            # copies, leaving `n` itself untouched (assumes clone() returns
            # an independent copy -- TODO confirm).
            n1 = clone(n)
            n2 = clone(n)
            n1.bs[i][index] -= delta
            n2.bs[i][index] += delta
            # Central difference: (E(b + delta) - E(b - delta)) / (2 * delta)
            exp_grad = (err(n2.forward_prop(x0, {}))
                        - err(n1.forward_prop(x0, {}))) / (2 * delta)
            exp_db[index] = exp_grad
        exp_dbs.append(exp_db)

    # zip() stops at the shorter sequence, so without this check a missing
    # or truncated "dbs" list would let the comparison pass vacuously.
    self.assertEqual(len(exp_dbs), len(dbs),
                     "back_prop must yield one gradient per bias array")
    for db, exp_db in zip(dbs, exp_dbs):
        npt.assert_array_almost_equal(db, exp_db, decimal=3)
def test_grad_ws(self):
    """Check back-propagated weight gradients against finite differences.

    Runs one forward and one backward pass through a 5-4-3-2 network with
    an all-zero target, reads the gradients found under ``"dws"`` in the
    intermediate-results dict, and compares each of them (to 3 decimals)
    with a central-difference estimate obtained by nudging one weight
    entry at a time by ``+/- delta`` on cloned networks.
    """
    n = FeedForwardNetwork([5, 4, 3, 2])
    x0 = np.random.uniform(size=5).astype(TYPE)
    intermediate_results = {}
    y = n.forward_prop(x0, intermediate_results)
    t = np.zeros(2).astype(TYPE)
    dy = mathutils.mean_squared_error_prime(y, t)
    n.back_prop(dy, intermediate_results)
    dws = intermediate_results["dws"]

    delta = 1e-4
    exp_dws = []
    for i, w in enumerate(n.ws):
        exp_dw = np.zeros(w.shape)
        for index in np.ndindex(w.shape):
            # Two copies per entry so each perturbation starts from the
            # unmodified network (assumes clone() returns an independent
            # copy -- TODO confirm).
            n1 = clone(n)
            n2 = clone(n)
            n1.ws[i][index] -= delta
            n2.ws[i][index] += delta
            # Central difference: (E(w + delta) - E(w - delta)) / (2 * delta)
            exp_grad = (err(n2.forward_prop(x0, {}))
                        - err(n1.forward_prop(x0, {}))) / (2 * delta)
            exp_dw[index] = exp_grad
        exp_dws.append(exp_dw)

    # Guard against zip() silently dropping entries: if "dws" were shorter
    # than the list of weight arrays, some gradients would go unchecked.
    self.assertEqual(len(exp_dws), len(dws),
                     "back_prop must yield one gradient per weight array")
    for dw, exp_dw in zip(dws, exp_dws):
        npt.assert_array_almost_equal(dw, exp_dw, decimal=3)
def test_grad_x(self):
    """Compare the input gradient returned by back_prop with a numeric one.

    A 3-4-4-2 network is evaluated on a random input with an all-zero
    target. The value returned by ``back_prop`` is then checked (to 3
    decimals) against a central-difference estimate built by shifting one
    input component at a time by ``+/- step``.
    """
    net = FeedForwardNetwork([3, 4, 4, 2])
    x0 = np.random.uniform(size=3).astype(TYPE)
    cache = {}
    output = net.forward_prop(x0, cache)
    target = np.zeros(2).astype(TYPE)
    output_grad = mathutils.mean_squared_error_prime(output, target)
    dx = net.back_prop(output_grad, cache)

    step = 1e-4
    numeric_dx = np.zeros(x0.shape)
    for idx in np.ndindex(x0.shape):
        # Work on copies so the reference input stays unchanged.
        lower = np.copy(x0)
        upper = np.copy(x0)
        lower[idx] -= step
        upper[idx] += step
        err_upper = err(net.forward_prop(upper, {}))
        err_lower = err(net.forward_prop(lower, {}))
        numeric_dx[idx] = (err_upper - err_lower) / (2 * step)

    npt.assert_array_almost_equal(dx, numeric_dx, decimal=3)
def test_learn_word_vectors_from_char_vector_sequence(self):
    """Train an LSTM plus feed-forward net to map characters to a word.

    Every word of a fixed sentence is encoded as a sequence of +1/-1
    character vectors followed by one all-zero vector, and paired with a
    +1/-1 word vector. After 1000 passes over the words, the test asserts
    that feeding the characters of "infer" through both networks makes
    the index of the largest output select "infer".

    NOTE(review): a test with this exact name is defined again later in
    this file; if both live in the same class, the later definition
    replaces this one and this body never runs.
    """
    text = "please learn how to infer word vectors from sequences of character vectors"
    index_to_word = list(set(text.split()))
    index_to_char = list(set(text))
    word_to_index = {word: index for index, word in enumerate(index_to_word)}
    char_to_index = {char: index for index, char in enumerate(index_to_char)}

    def to_char_vector_sequence(word):
        """Encode each character as a +1/-1 vector, then append zeros."""
        sequence = []
        for char in word:
            vector = np.ones(len(char_to_index)) * -1
            vector[char_to_index[char]] = 1
            sequence.append(vector)
        # A trailing all-zero vector closes every sequence.
        sequence.append(np.zeros(len(char_to_index)))
        return np.asarray(sequence)

    def to_word_vector(word):
        """Return a vector of -1 with +1 at the word's index."""
        vector = np.ones(len(word_to_index)) * -1
        vector[word_to_index[word]] = 1
        return vector

    training_data = [(to_char_vector_sequence(word), to_word_vector(word))
                     for word in text.split()]

    # The hidden size equals the vocabulary size, which is what allows the
    # LSTM output to be compared directly with a word vector below.
    hidden_size = len(index_to_word)
    lstm = NoOutputLstm(len(index_to_char), hidden_size)
    ffn = FeedForwardNetwork([hidden_size, 50, 20, len(index_to_word)])
    h0 = np.random.uniform(-1, 1, size=hidden_size)
    learning_rate = 0.5

    for i in range(1000):
        for char_vectors, word_vector in training_data:
            hs, f_gs, i_gs, cs, lstm_output = lstm.forward_prop(char_vectors, h0)
            res = {}
            # Return value unused; the call is kept because it receives
            # `res`, which train() is handed afterwards.
            ffn.forward_prop(lstm_output, res)
            # NOTE(review): the error gradient is taken on the LSTM output,
            # not on the feed-forward output, and is also what
            # lstm.back_prop receives below. An earlier variant used the
            # feed-forward output here and the result of ffn.dx for the
            # LSTM -- confirm which one is intended.
            dy = mathutils.mean_squared_error_prime(lstm_output, word_vector)
            # Result unused; call retained in case it updates `res`
            # -- TODO confirm.
            ffn.dx(lstm_output, dy, res)
            ffn.train(learning_rate, lstm_output, dy, res)
            (dw_xf_g, dw_hf_g, db_f_g,
             dw_xi_g, dw_hi_g, db_i_g,
             dw_xc, dw_hc, db_c) = lstm.back_prop(
                char_vectors, hs, f_gs, i_gs, cs, dy)
            # Plain gradient-descent step on every LSTM parameter.
            lstm.w_xf_g -= dw_xf_g * learning_rate
            lstm.w_hf_g -= dw_hf_g * learning_rate
            lstm.b_f_g -= db_f_g * learning_rate
            lstm.w_xi_g -= dw_xi_g * learning_rate
            lstm.w_hi_g -= dw_hi_g * learning_rate
            lstm.b_i_g -= db_i_g * learning_rate
            lstm.w_xc -= dw_xc * learning_rate
            lstm.w_hc -= dw_hc * learning_rate
            lstm.b_c -= db_c * learning_rate

        if i % 200 == 0:
            # Periodic progress report: mean error over all training words.
            total_err = 0
            for char_vectors, word_vector in training_data:
                h = lstm.activate(char_vectors, h0)
                # NOTE(review): h[-1] is used here while the final check
                # below passes activate()'s whole result -- confirm which
                # matches activate()'s return value.
                output_vector = ffn.forward_prop(h[-1], {})
                total_err += mathutils.mean_squared_error(output_vector, word_vector)
            print(total_err/len(training_data))

    lstm_out = lstm.activate(to_char_vector_sequence("infer"), h0)
    result = ffn.forward_prop(lstm_out, {})
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual("infer", index_to_word[np.argmax(result)])
def test_learn_word_vectors_from_char_vector_sequence(self):
    """Learn to recover a word from the sequence of its character vectors.

    Builds +1/-1 encodings for the characters and words of a fixed
    sentence, trains a ``NoOutputLstm`` together with a
    ``FeedForwardNetwork`` for 1000 passes over the words, printing the
    mean error every 200 passes, and finally asserts that the characters
    of "infer" produce an output whose largest component selects "infer".

    NOTE(review): a test with this exact name is also defined earlier in
    this file; if both live in the same class, this later definition is
    the one that runs and the earlier one is shadowed.
    """
    text = "please learn how to infer word vectors from sequences of character vectors"
    index_to_word = list(set(text.split()))
    index_to_char = list(set(text))
    word_to_index = {
        word: index
        for index, word in enumerate(index_to_word)
    }
    char_to_index = {
        char: index
        for index, char in enumerate(index_to_char)
    }

    def to_char_vector_sequence(word):
        """Return one +1/-1 vector per character plus a final zero vector."""
        sequence = []
        for char in word:
            vector = np.ones(len(char_to_index)) * -1
            vector[char_to_index[char]] = 1
            sequence.append(vector)
        # The all-zero vector is appended once, after the last character.
        sequence.append(np.zeros(len(char_to_index)))
        return np.asarray(sequence)

    def to_word_vector(word):
        """Return a -1 vector with +1 at the position of `word`."""
        vector = np.ones(len(word_to_index)) * -1
        vector[word_to_index[word]] = 1
        return vector

    training_data = [(to_char_vector_sequence(word), to_word_vector(word))
                     for word in text.split()]

    # Hidden size is tied to the vocabulary size so that the LSTM output
    # and the word vectors have the same length.
    hidden_size = len(index_to_word)
    lstm = NoOutputLstm(len(index_to_char), hidden_size)
    ffn = FeedForwardNetwork([hidden_size, 50, 20, len(index_to_word)])
    h0 = np.random.uniform(-1, 1, size=hidden_size)
    learning_rate = 0.5

    for i in range(1000):
        for char_vectors, word_vector in training_data:
            hs, f_gs, i_gs, cs, lstm_output = lstm.forward_prop(
                char_vectors, h0)
            res = {}
            # Only the contents written to `res` matter here; the returned
            # output is not used by the current training step.
            ffn.forward_prop(lstm_output, res)
            # NOTE(review): the gradient is computed from the LSTM output
            # rather than from the feed-forward output, and the same
            # gradient drives lstm.back_prop. A previous variant used the
            # feed-forward output and ffn.dx's result instead -- confirm
            # which is intended.
            dy = mathutils.mean_squared_error_prime(
                lstm_output, word_vector)
            # Result discarded; call kept in case it touches `res`
            # -- TODO confirm.
            ffn.dx(lstm_output, dy, res)
            ffn.train(learning_rate, lstm_output, dy, res)
            (dw_xf_g, dw_hf_g, db_f_g,
             dw_xi_g, dw_hi_g, db_i_g,
             dw_xc, dw_hc, db_c) = lstm.back_prop(
                char_vectors, hs, f_gs, i_gs, cs, dy)
            # Gradient-descent update of each LSTM weight and bias.
            lstm.w_xf_g -= dw_xf_g * learning_rate
            lstm.w_hf_g -= dw_hf_g * learning_rate
            lstm.b_f_g -= db_f_g * learning_rate
            lstm.w_xi_g -= dw_xi_g * learning_rate
            lstm.w_hi_g -= dw_hi_g * learning_rate
            lstm.b_i_g -= db_i_g * learning_rate
            lstm.w_xc -= dw_xc * learning_rate
            lstm.w_hc -= dw_hc * learning_rate
            lstm.b_c -= db_c * learning_rate

        if i % 200 == 0:
            # Report the mean error across the training words.
            total_err = 0
            for char_vectors, word_vector in training_data:
                h = lstm.activate(char_vectors, h0)
                # NOTE(review): this path indexes h[-1], whereas the final
                # assertion path feeds activate()'s full result to the
                # feed-forward net -- confirm which is correct.
                output_vector = ffn.forward_prop(h[-1], {})
                total_err += mathutils.mean_squared_error(
                    output_vector, word_vector)
            print(total_err / len(training_data))

    lstm_out = lstm.activate(to_char_vector_sequence("infer"), h0)
    result = ffn.forward_prop(lstm_out, {})
    # Use assertEqual: the assertEquals alias is deprecated and no longer
    # exists in Python 3.12+.
    self.assertEqual("infer", index_to_word[np.argmax(result)])