def test_tanh_rnn():
    """Smoke-test a linear -> tanh-recurrent -> linear regression model.

    The model is built on the module-level ``X``, ``X_mask``, ``y`` and
    ``y_mask`` arrays, optimized with SGD, and handed to
    ``early_stopping_trainer`` for a single epoch.
    """
    # Fixed seed so the script is deterministic.
    rng = np.random.RandomState(1999)
    # Home of the computational graph.
    net = OrderedDict()

    hidden_units = 10
    # Output width is read from the last axis of X.
    output_units = X.shape[-1]

    # Inputs have time as their first dimension.
    symbols = add_datasets_to_graph(
        [X, X_mask, y, y_mask],
        ["X", "X_mask", "y", "y_mask"],
        net,
        list_of_test_values=[X, X_mask, y, y_mask])
    X_sym, X_mask_sym, y_sym, y_mask_sym = symbols

    # Layers are created in the same order as before: they share one
    # random state and one graph container.
    projected = linear_layer([X_sym], net, 'l1_proj', hidden_units, rng)
    recurrent = tanh_recurrent_layer([projected], X_mask_sym, hidden_units,
                                     net, 'l1_rec', rng)
    # Linear output activation.
    prediction = linear_layer([recurrent], net, 'l2_proj', output_units, rng)

    # Masked squared error between prediction and target, averaged.
    elementwise_error = squared_error(prediction, y_sym)
    mean_cost = masked_cost(elementwise_error, y_mask_sym).mean()

    # Stochastic gradient descent over every parameter in the graph.
    params, grads = get_params_and_grads(net, mean_cost)
    optimizer = sgd(params)
    step_size = 0.001
    updates = optimizer.updates(params, grads, step_size)

    fit_function = theano.function(
        [X_sym, X_mask_sym, y_sym, y_mask_sym], [mean_cost],
        updates=updates, mode="FAST_COMPILE")
    cost_function = theano.function(
        [X_sym, X_mask_sym, y_sym, y_mask_sym], [mean_cost],
        mode="FAST_COMPILE")

    # Training and validation index arrays are separate objects that both
    # span axis 1 of X.
    early_stopping_trainer(
        fit_function, cost_function, {}, [X, y], minibatch_size,
        np.arange(X.shape[1]), np.arange(X.shape[1]),
        fit_function_output_names=["cost"],
        cost_function_output_name="valid_cost",
        n_epochs=1)
def test_conditional_gru_recurrent():
    """Smoke-test a GRU layer feeding a conditional GRU layer.

    A GRU layer runs over ``X``; its output conditions a conditional GRU
    layer over ``y``. A softmax over the decoder state, the context and the
    shifted targets yields the prediction. Only a cost function is
    compiled, and it is passed to ``early_stopping_trainer`` in both
    function slots for one epoch.
    """
    seeded_rng = np.random.RandomState(1999)
    graph = OrderedDict()
    hidden_units = 5
    output_units = n_chars

    # Inputs have time as their first dimension.
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        [X_mb, X_mask, y_mb, y_mask],
        ["X", "X_mask", "y", "y_mask"],
        graph)

    source_states = gru_recurrent_layer(
        [X_sym], X_mask_sym, hidden_units, graph, 'l1_end', seeded_rng)
    previous_targets = shift_layer([y_sym], graph, 'shift')
    decoder_states, context = conditional_gru_recurrent_layer(
        [y_sym], [source_states], y_mask_sym, hidden_units, graph, 'l2_dec',
        seeded_rng)

    # Softmax output over decoder state, context and shifted targets.
    char_probs = softmax_layer(
        [decoder_states, context, previous_targets], graph, 'l2_proj',
        output_units, random_state=seeded_rng)

    # Masked categorical cross-entropy against the targets, averaged.
    cost = masked_cost(
        categorical_crossentropy(char_probs, y_sym), y_mask_sym).mean()

    # Disabled training setup, kept verbatim as an inert string literal.
    """ params, grads = get_params_and_grads(graph, cost) # Use stochastic gradient descent to optimize opt = sgd(params) learning_rate = 0.00000 updates = opt.updates(params, grads, learning_rate) fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym], [cost], updates=updates, mode="FAST_COMPILE") """

    cost_function = theano.function(
        [X_sym, X_mask_sym, y_sym, y_mask_sym], [cost], mode="FAST_COMPILE")

    # No fit function exists, so the cost function fills both slots.
    early_stopping_trainer(
        cost_function, cost_function,
        np.arange(len(X)), np.arange(len(X)),
        {}, [X, y], minibatch_size,
        list_of_minibatch_functions=[text_minibatch_func],
        list_of_train_output_names=["cost"],
        valid_output_name="valid_cost",
        n_epochs=1)
def test_conditional_gru_recurrent():
    """Smoke-test a GRU layer feeding a conditional GRU layer.

    A GRU layer runs over ``X``; its output conditions a conditional GRU
    layer over ``y``, followed by a softmax projection. No training
    function is compiled: the compiled cost function is passed to
    ``early_stopping_trainer`` twice and run for one epoch.
    """
    graph = OrderedDict()
    seeded_rng = np.random.RandomState(1999)
    hidden_units, output_units = 5, n_chars

    # Inputs have time as their first dimension.
    arrays = [X_mb, X_mask, y_mb, y_mask]
    labels = ["X", "X_mask", "y", "y_mask"]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        arrays, labels, graph)

    source_states = gru_recurrent_layer([X_sym], X_mask_sym, hidden_units,
                                        graph, 'l1_end', seeded_rng)
    previous_targets = shift_layer([y_sym], graph, 'shift')
    decoder_out = conditional_gru_recurrent_layer(
        [y_sym], [source_states], y_mask_sym, hidden_units, graph, 'l2_dec',
        seeded_rng)
    decoder_states, context = decoder_out

    # Softmax output over decoder state, context and shifted targets.
    softmax_inputs = [decoder_states, context, previous_targets]
    char_probs = softmax_layer(softmax_inputs, graph, 'l2_proj',
                               output_units, seeded_rng)

    # Masked categorical cross-entropy against the targets, averaged.
    cost = categorical_crossentropy(char_probs, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()

    # Disabled training setup, kept verbatim as an inert string literal.
    """ params, grads = get_params_and_grads(graph, cost) # Use stochastic gradient descent to optimize opt = sgd(params) learning_rate = 0.00000 updates = opt.updates(params, grads, learning_rate) fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym], [cost], updates=updates, mode="FAST_COMPILE") """

    cost_function = theano.function(
        [X_sym, X_mask_sym, y_sym, y_mask_sym], [cost], mode="FAST_COMPILE")

    checkpoints = {}
    train_idx = np.arange(len(X))
    valid_idx = np.arange(len(X))
    # The cost function stands in for the missing fit function.
    early_stopping_trainer(
        cost_function, cost_function, checkpoints, [X, y], minibatch_size,
        train_idx, valid_idx,
        list_of_minibatch_functions=[text_minibatch_func],
        fit_function_output_names=["cost"],
        cost_function_output_name="valid_cost",
        n_epochs=1)
def test_tanh_rnn():
    """Smoke-test a linear -> tanh-recurrent -> linear regression model.

    Uses the module-level ``X``, ``X_mask``, ``y`` and ``y_mask`` arrays,
    compiles SGD training and validation functions, and runs
    ``early_stopping_trainer`` for one epoch.
    """
    # Fixed seed so the script is deterministic.
    seeded_rng = np.random.RandomState(1999)
    # Home of the computational graph.
    model_graph = OrderedDict()
    # Hidden width is fixed; output width equals the last axis of X.
    hidden_units = 10
    output_units = X.shape[-1]

    # Inputs have time as their first dimension.
    arrays = [X, X_mask, y, y_mask]
    labels = ["X", "X_mask", "y", "y_mask"]
    probe_values = [X, X_mask, y, y_mask]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        arrays, labels, model_graph, list_of_test_values=probe_values)

    # Weights: input projection, recurrence, then linear read-out.
    projection = linear_layer(
        [X_sym], model_graph, 'l1_proj',
        proj_dim=hidden_units, random_state=seeded_rng)
    hidden = tanh_recurrent_layer(
        [projection], X_mask_sym, hidden_units, model_graph, 'l1_rec',
        seeded_rng)
    readout = linear_layer(
        [hidden], model_graph, 'l2_proj',
        proj_dim=output_units, random_state=seeded_rng)

    # Masked squared error between read-out and target, averaged.
    objective = masked_cost(squared_error(readout, y_sym), y_mask_sym).mean()

    # Stochastic gradient descent; the learning rate goes to the optimizer.
    params, grads = get_params_and_grads(model_graph, objective)
    step_size = 0.001
    optimizer = sgd(params, step_size)
    updates = optimizer.updates(params, grads)

    symbolic_inputs = (X_sym, X_mask_sym, y_sym, y_mask_sym)
    fit_function = theano.function(
        list(symbolic_inputs), [objective],
        updates=updates, mode="FAST_COMPILE")
    cost_function = theano.function(
        list(symbolic_inputs), [objective], mode="FAST_COMPILE")

    # Separate index arrays, both spanning axis 1 of X.
    n_sequences = X.shape[1]
    train_idx = np.arange(n_sequences)
    valid_idx = np.arange(n_sequences)
    early_stopping_trainer(
        fit_function, cost_function, train_idx, valid_idx, {}, [X, y],
        minibatch_size,
        list_of_train_output_names=["cost"],
        valid_output_name="valid_cost",
        n_epochs=1)
def test_rnn_correlated_mixture_density():
    """Smoke-test a GRU model with a Bernoulli + correlated mixture output.

    Builds a tanh projection followed by a GRU layer and a
    ``bernoulli_and_correlated_log_gaussian_mixture_layer``, compiles the
    masked mean cost, and runs ``fixed_n_epochs_trainer`` for one epoch
    with the cost function in both function slots (no parameter updates
    are compiled here).

    NOTE: the local variable names matter. ``locals()`` is handed to
    ``create_checkpoint_dict`` below, so renaming, adding or removing a
    local before that call changes what is passed in.
    """
    random_state = np.random.RandomState(1999)
    # graph holds information necessary to build layers from parents
    graph = OrderedDict()
    minibatch_size = 5
    # Repeat the module-level bernoulli_X / bernoulli_y once per minibatch
    # entry to form the sequence collections.
    X_seq = np.array([bernoulli_X for i in range(minibatch_size)])
    y_seq = np.array([bernoulli_y for i in range(minibatch_size)])
    # One masked minibatch covering every sequence; used only to register
    # the datasets in the graph.
    X_mb, X_mb_mask = make_masked_minibatch(X_seq, slice(0, minibatch_size))
    y_mb, y_mb_mask = make_masked_minibatch(y_seq, slice(0, minibatch_size))
    datasets_list = [X_mb, X_mb_mask, y_mb, y_mb_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph)
    n_hid = 5
    # Training and validation use the same indices (all sequences).
    train_indices = np.arange(len(X_seq))
    valid_indices = np.arange(len(X_seq))
    l1 = tanh_layer([X_sym], graph, 'l1', proj_dim=n_hid,
                    random_state=random_state)
    h = gru_recurrent_layer([l1], X_mask_sym, n_hid, graph, 'l1_rec',
                            random_state=random_state)
    # The mixture layer returns five outputs, unpacked on the next line.
    rval = bernoulli_and_correlated_log_gaussian_mixture_layer(
        [h], graph, 'hw', proj_dim=2, n_components=3,
        random_state=random_state)
    binary, coeffs, mus, log_sigmas, corr = rval
    cost = bernoulli_and_correlated_log_gaussian_mixture_cost(
        binary, coeffs, mus, log_sigmas, corr, y_sym)
    # Apply the target mask, then average.
    cost = masked_cost(cost, y_mask_sym).mean()
    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")
    # Passes every local defined so far to create_checkpoint_dict.
    checkpoint_dict = create_checkpoint_dict(locals())
    # cost_function fills both function slots; the raw sequences are
    # re-batched by make_masked_minibatch, one function per dataset.
    epoch_results = fixed_n_epochs_trainer(
        cost_function, cost_function, train_indices, valid_indices,
        checkpoint_dict, [X_seq, y_seq], minibatch_size,
        list_of_minibatch_functions=[make_masked_minibatch,
                                     make_masked_minibatch],
        list_of_train_output_names=["train_cost"],
        valid_output_name="valid_cost", n_epochs=1)
def test_masked_cost():
    """Check that a Gaussian error and a masked cost compile together.

    Registers the module-level ``X`` and ``y`` in a fresh graph and
    compiles one Theano function that outputs both expressions. The
    compiled function is never called.
    """
    net = OrderedDict()
    symbols = add_datasets_to_graph([X, y], ["X", "y"], net)
    X_sym, y_sym = symbols

    # Two separate half-scaled copies of X_sym are passed, with X_sym last.
    gaussian = gaussian_error(0.5 * X_sym, 0.5 * X_sym, X_sym)
    # X_sym is the first argument and y_sym the second.
    masked = masked_cost(X_sym, y_sym)

    theano.function([X_sym, y_sym], [gaussian, masked], mode="FAST_COMPILE")
datasets_list = [X_mb, X_mb_mask, y_mb, y_mb_mask] names_list = ["X", "X_mask", "y", "y_mask"] X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph( datasets_list, names_list, graph, list_of_test_values=datasets_list) n_hid = 256 n_out = 8 h = location_attention_tanh_recurrent_layer( [X_sym], [y_sym], X_mask_sym, y_mask_sym, n_hid, graph, 'l1_att_rec', random_state=random_state) X_hat = sigmoid_layer([h], graph, 'output', proj_dim=n_out, random_state=random_state) cost = binary_crossentropy(X_hat, X_sym).mean() cost = masked_cost(cost, X_mask_sym).mean() params, grads = get_params_and_grads(graph, cost) opt = adadelta(params) updates = opt.updates(params, grads) fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym], [cost], updates=updates) valid_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym], [cost]) checkpoint_dict = {} checkpoint_dict["fit_function"] = fit_function checkpoint_dict["valid_function"] = valid_function TL = TrainingLoop(fit_function, valid_function, train_itr, valid_itr, checkpoint_dict=checkpoint_dict, list_of_train_output_names=["train_cost"], valid_output_name="valid_cost", n_epochs=500,
def test_masked_cost():
    """Check that a Gaussian error and a masked cost compile together.

    Relies on ``X_sym`` and ``y_sym`` from the enclosing scope. Compiles
    one Theano function that outputs both expressions; the compiled
    function is never called.
    """
    outputs = [
        # Two separate half-scaled copies of X_sym, with X_sym last.
        gaussian_error(0.5 * X_sym, 0.5 * X_sym, X_sym),
        # X_sym is the first argument and y_sym the second.
        masked_cost(X_sym, y_sym),
    ]
    theano.function([X_sym, y_sym], outputs, mode="FAST_COMPILE")
# Model setup: relu -> LSTM -> relu -> Bernoulli + correlated mixture.
# datasets_list, names_list, n_hid, rnn_dim and random_state are defined
# before this fragment.
graph = OrderedDict()
X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
    datasets_list, names_list, graph)
l1 = relu_layer([X_sym], graph, 'l1', proj_dim=n_hid,
                random_state=random_state)
h = lstm_recurrent_layer([l1], X_mask_sym, rnn_dim, graph, 'l1_rec',
                         random_state=random_state)
l2 = relu_layer([h], graph, 'l2', proj_dim=n_hid,
                random_state=random_state)
# The mixture layer returns five outputs, unpacked on the next line.
rval = bernoulli_and_correlated_log_gaussian_mixture_layer(
    [l2], graph, 'hw', proj_dim=2, n_components=20,
    random_state=random_state)
binary, coeffs, mus, sigmas, corr = rval
cost = bernoulli_and_correlated_log_gaussian_mixture_cost(
    binary, coeffs, mus, sigmas, corr, y_sym)
# Mask, sum over axis 0 (presumably time -- TODO confirm), then average.
cost = masked_cost(cost, y_mask_sym).sum(axis=0).mean()
params, grads = get_params_and_grads(graph, cost)
learning_rate = 0.0003
opt = adam(params, learning_rate)
# Gradients are clipped before the optimizer builds its updates.
clipped_grads = gradient_clipping(grads)
updates = opt.updates(params, clipped_grads)
# fit_function applies the updates; cost_function only evaluates the cost.
fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                               [cost], updates=updates)
cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                [cost])
# predict_function takes only the inputs and returns the five mixture
# layer outputs.
predict_function = theano.function([X_sym, X_mask_sym],
                                   [binary, coeffs, mus, sigmas, corr])
# Passes every name in the current scope to create_checkpoint_dict, so the
# variable names above matter.
checkpoint_dict = create_checkpoint_dict(locals())
# Model setup: relu -> LSTM -> relu -> Bernoulli + correlated mixture.
# datasets_list, names_list, n_hid, rnn_dim, random_state, X, y,
# minibatch_size and train_end are defined before this fragment.
graph = OrderedDict()
X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
    datasets_list, names_list, graph)
l1 = relu_layer([X_sym], graph, 'l1', proj_dim=n_hid,
                random_state=random_state)
h = lstm_recurrent_layer([l1], X_mask_sym, rnn_dim, graph, 'l1_rec',
                         random_state=random_state)
l2 = relu_layer([h], graph, 'l2', proj_dim=n_hid,
                random_state=random_state)
# The mixture layer returns five outputs, unpacked on the next line.
rval = bernoulli_and_correlated_log_gaussian_mixture_layer(
    [l2], graph, 'hw', proj_dim=2, n_components=20,
    random_state=random_state)
binary, coeffs, mus, sigmas, corr = rval
cost = bernoulli_and_correlated_log_gaussian_mixture_cost(
    binary, coeffs, mus, sigmas, corr, y_sym)
# Mask, sum over axis 0 (presumably time -- TODO confirm), then average.
cost = masked_cost(cost, y_mask_sym).sum(axis=0).mean()
params, grads = get_params_and_grads(graph, cost)
learning_rate = 0.0003
opt = adam(params, learning_rate)
# Gradients are clipped before the optimizer builds its updates.
clipped_grads = gradient_clipping(grads)
updates = opt.updates(params, clipped_grads)
# fit_function applies the updates; cost_function only evaluates the cost.
fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                               [cost], updates=updates)
cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                [cost])
# predict_function takes only the inputs and returns the five mixture
# layer outputs.
predict_function = theano.function([X_sym, X_mask_sym],
                                   [binary, coeffs, mus, sigmas, corr])
# Validation iterator over [X, y] along axis 1, with masks, starting at
# train_end (presumably the end of the training split -- TODO confirm).
valid_itr = list_iterator([X, y], minibatch_size, axis=1, make_mask=True,
                          start_index=train_end)