def test_feedforward_theano_mix():
    minibatch_size = 100
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)

    l1_o = linear_layer([X_sym], graph, 'l1', proj_dim=20,
                        random_state=random_state)
    # mix a raw Theano op in between library layers
    l1_o = .999 * l1_o
    y_pred = softmax_layer([l1_o], graph, 'pred', n_classes,
                           random_state=random_state)

    cost = categorical_crossentropy(y_pred, y_sym).mean()
    params, grads = get_params_and_grads(graph, cost)

    learning_rate = 0.001
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym, y_sym], [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym, y_sym], [cost],
                                    mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(fit_function, cost_function, checkpoint_dict,
                           [X, y], minibatch_size,
                           train_indices, valid_indices,
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)
def test_conditional_gru_recurrent():
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    n_hid = 5
    n_out = n_chars

    # input (where first dimension is time)
    datasets_list = [X_mb, X_mask, y_mb, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph)

    h = gru_recurrent_layer([X_sym], X_mask_sym, n_hid, graph, 'l1_end',
                            random_state)
    shifted_y_sym = shift_layer([y_sym], graph, 'shift')
    h_dec, context = conditional_gru_recurrent_layer([y_sym], [h], y_mask_sym,
                                                     n_hid, graph, 'l2_dec',
                                                     random_state)

    # linear output activation
    y_hat = softmax_layer([h_dec, context, shifted_y_sym], graph, 'l2_proj',
                          n_out, random_state=random_state)

    # error between output and target
    cost = categorical_crossentropy(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()

    # Parameters of the model
    """
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    opt = sgd(params)
    learning_rate = 0.00000
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost], updates=updates,
                                   mode="FAST_COMPILE")
    """
    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(cost_function, cost_function,
                           train_indices, valid_indices,
                           checkpoint_dict, [X, y], minibatch_size,
                           list_of_minibatch_functions=[text_minibatch_func],
                           list_of_train_output_names=["cost"],
                           valid_output_name="valid_cost",
                           n_epochs=1)
def test_tanh_rnn():
    # random state so script is deterministic
    random_state = np.random.RandomState(1999)
    # home of the computational graph
    graph = OrderedDict()

    # number of hidden features
    n_hid = 10
    # number of output_features = input_features
    n_out = X.shape[-1]

    # input (where first dimension is time)
    datasets_list = [X, X_mask, y, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    test_values_list = [X, X_mask, y, y_mask]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph,
        list_of_test_values=test_values_list)

    # Setup weights
    l1 = linear_layer([X_sym], graph, 'l1_proj', n_hid, random_state)

    h = tanh_recurrent_layer([l1], X_mask_sym, n_hid, graph, 'l1_rec',
                             random_state)

    # linear output activation
    y_hat = linear_layer([h], graph, 'l2_proj', n_out, random_state)

    # error between output and target
    cost = squared_error(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()

    # Parameters of the model
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    opt = sgd(params)
    learning_rate = 0.001
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(X.shape[1])
    valid_indices = np.arange(X.shape[1])
    early_stopping_trainer(fit_function, cost_function, checkpoint_dict,
                           [X, y], minibatch_size,
                           train_indices, valid_indices,
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)
def test_conditional_gru_recurrent():
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    n_hid = 5
    n_out = n_chars

    # input (where first dimension is time)
    datasets_list = [X_mb, X_mask, y_mb, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph)

    h = gru_recurrent_layer([X_sym], X_mask_sym, n_hid, graph, 'l1_end',
                            random_state)
    shifted_y_sym = shift_layer([y_sym], graph, 'shift')
    h_dec, context = conditional_gru_recurrent_layer([y_sym], [h], y_mask_sym,
                                                     n_hid, graph, 'l2_dec',
                                                     random_state)

    # linear output activation
    y_hat = softmax_layer([h_dec, context, shifted_y_sym], graph, 'l2_proj',
                          n_out, random_state)

    # error between output and target
    cost = categorical_crossentropy(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()

    # Parameters of the model
    """
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    opt = sgd(params)
    learning_rate = 0.00000
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost], updates=updates,
                                   mode="FAST_COMPILE")
    """
    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(cost_function, cost_function, checkpoint_dict,
                           [X, y], minibatch_size,
                           train_indices, valid_indices,
                           list_of_minibatch_functions=[text_minibatch_func],
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)
def test_tanh_rnn():
    # random state so script is deterministic
    random_state = np.random.RandomState(1999)
    # home of the computational graph
    graph = OrderedDict()

    # number of hidden features
    n_hid = 10
    # number of output_features = input_features
    n_out = X.shape[-1]

    # input (where first dimension is time)
    datasets_list = [X, X_mask, y, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    test_values_list = [X, X_mask, y, y_mask]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph,
        list_of_test_values=test_values_list)

    # Setup weights
    l1 = linear_layer([X_sym], graph, 'l1_proj', proj_dim=n_hid,
                      random_state=random_state)

    h = tanh_recurrent_layer([l1], X_mask_sym, n_hid, graph, 'l1_rec',
                             random_state)

    # linear output activation
    y_hat = linear_layer([h], graph, 'l2_proj', proj_dim=n_out,
                         random_state=random_state)

    # error between output and target
    cost = squared_error(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()

    # Parameters of the model
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    learning_rate = 0.001
    opt = sgd(params, learning_rate)
    updates = opt.updates(params, grads)

    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(X.shape[1])
    valid_indices = np.arange(X.shape[1])
    early_stopping_trainer(fit_function, cost_function,
                           train_indices, valid_indices,
                           checkpoint_dict, [X, y], minibatch_size,
                           list_of_train_output_names=["cost"],
                           valid_output_name="valid_cost",
                           n_epochs=1)
def test_vae():
    minibatch_size = 10
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    X_sym = add_datasets_to_graph([X], ["X"], graph)

    # encoder
    l1_enc = softplus_layer([X_sym], graph, 'l1_enc', proj_dim=100,
                            random_state=random_state)
    mu = linear_layer([l1_enc], graph, 'mu', proj_dim=50,
                      random_state=random_state)
    log_sigma = linear_layer([l1_enc], graph, 'log_sigma', proj_dim=50,
                             random_state=random_state)

    # sample from the latent Gaussian
    samp = gaussian_log_sample_layer([mu], [log_sigma], graph,
                                     'gaussian_log_sample',
                                     random_state=random_state)

    # decoder
    l1_dec = softplus_layer([samp], graph, 'l1_dec', proj_dim=100,
                            random_state=random_state)
    out = sigmoid_layer([l1_dec], graph, 'out', proj_dim=X.shape[1],
                        random_state=random_state)

    # negative lower bound: reconstruction cost plus KL to the prior
    kl = gaussian_log_kl([mu], [log_sigma], graph, 'gaussian_kl').mean()
    cost = binary_crossentropy(out, X_sym).mean() + kl
    params, grads = get_params_and_grads(graph, cost)

    learning_rate = 0.00000
    opt = sgd(params, learning_rate)
    updates = opt.updates(params, grads)

    fit_function = theano.function([X_sym], [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym], [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(fit_function, cost_function,
                           train_indices, valid_indices,
                           checkpoint_dict, [X], minibatch_size,
                           list_of_train_output_names=["cost"],
                           valid_output_name="valid_cost",
                           n_epochs=1)
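# The Gaussian layers used above are not spelled out in this section, so the
# following is a minimal pure-numpy sketch of what such layers conventionally
# compute: the reparameterization trick and the closed-form KL from
# N(mu, sigma^2) to N(0, 1), assuming sigma = exp(log_sigma). This is an
# assumption about the convention, not the library's actual implementation.
import numpy as np


def _gaussian_log_sample(mu, log_sigma, random_state):
    # z = mu + sigma * eps, with eps ~ N(0, 1)
    eps = random_state.randn(*mu.shape)
    return mu + np.exp(log_sigma) * eps


def _gaussian_log_kl(mu, log_sigma):
    # KL(N(mu, sigma^2) || N(0, 1)) per sample, summed over latent dimensions
    return -0.5 * np.sum(1 + 2 * log_sigma - mu ** 2 - np.exp(2 * log_sigma),
                         axis=-1)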
fit_function = theano.function([X_sym, y_sym], [cost], updates=updates)
cost_function = theano.function([X_sym, y_sym], [cost])
predict_function = theano.function([X_sym], [y_pred])

checkpoint_dict = {}
checkpoint_dict["fit_function"] = fit_function
checkpoint_dict["cost_function"] = cost_function
checkpoint_dict["predict_function"] = predict_function
previous_results = None


def error(X_mb, y_mb):
    y_pred = predict_function(X_mb)[0]
    return 1 - np.mean((np.argmax(y_pred, axis=1).ravel()) ==
                       (np.argmax(y_mb, axis=1).ravel()))

epoch_results = early_stopping_trainer(
    fit_function, error,
    train_indices, valid_indices,
    checkpoint_dict, [X, y], minibatch_size,
    list_of_train_output_names=["train_cost"],
    valid_output_name="valid_error",
    n_epochs=1000,
    optimizer_object=opt,
    previous_results=previous_results)
updates = opt.updates(params, grads, learning_rate)

# Checkpointing
try:
    checkpoint_dict = load_last_checkpoint()
    fit_function = checkpoint_dict["fit_function"]
    cost_function = checkpoint_dict["cost_function"]
    encode_function = checkpoint_dict["encode_function"]
    decode_function = checkpoint_dict["decode_function"]
    previous_epoch_results = checkpoint_dict["previous_epoch_results"]
except KeyError:
    fit_function = theano.function([X_sym], [nll, kl, nll + kl],
                                   updates=updates)
    cost_function = theano.function([X_sym], [nll + kl])
    encode_function = theano.function([X_sym], [code_mu, code_log_sigma])
    decode_function = theano.function([samp], [out])
    checkpoint_dict = {}
    checkpoint_dict["fit_function"] = fit_function
    checkpoint_dict["cost_function"] = cost_function
    checkpoint_dict["encode_function"] = encode_function
    checkpoint_dict["decode_function"] = decode_function
    previous_epoch_results = None

epoch_results = early_stopping_trainer(
    fit_function, cost_function, checkpoint_dict, [X], minibatch_size,
    train_indices, valid_indices,
    fit_function_output_names=["nll", "kl", "lower_bound"],
    cost_function_output_name="valid_lower_bound",
    n_epochs=500,
    previous_epoch_results=previous_epoch_results,
    shuffle=True, random_state=random_state)
learning_rate = 1E-4
momentum = 0.95
opt = rmsprop(params, learning_rate, momentum)
updates = opt.updates(params, grads)

fit_function = theano.function([X_sym, y_sym], [cost], updates=updates)
cost_function = theano.function([X_sym, y_sym], [cost])
predict_function = theano.function([X_sym], [y_pred])

checkpoint_dict = create_checkpoint_dict(locals())


def error(*args):
    xargs = args[:-1]
    y = args[-1]
    y_pred = predict_function(*xargs)[0]
    return 1 - np.mean((np.argmax(y_pred, axis=1).ravel()) ==
                       (np.argmax(y, axis=1).ravel()))

epoch_results = early_stopping_trainer(
    fit_function, error,
    train_indices, valid_indices,
    checkpoint_dict, [X, y], minibatch_size,
    list_of_train_output_names=["train_cost"],
    valid_output_name="valid_error",
    n_epochs=1000, optimizer_object=opt)
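# Quick standalone check (pure numpy, hypothetical toy values) of the argmax
# comparison used by the error helpers above: predictions and one-hot targets
# are reduced to class indices, and error is one minus the match rate.
import numpy as np

y_true = np.array([[1, 0, 0], [0, 1, 0]])              # one-hot targets
y_prob = np.array([[0.7, 0.2, 0.1], [0.6, 0.3, 0.1]])  # predicted probabilities
err = 1 - np.mean(np.argmax(y_prob, axis=1).ravel() ==
                  np.argmax(y_true, axis=1).ravel())
assert err == 0.5  # row 0 correct (class 0), row 1 wrong (argmax 0, target 1)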
# Checkpointing
try:
    checkpoint_dict = load_last_checkpoint()
    fit_function = checkpoint_dict["fit_function"]
    cost_function = checkpoint_dict["cost_function"]
    predict_function = checkpoint_dict["predict_function"]
    previous_epoch_results = checkpoint_dict["previous_epoch_results"]
except KeyError:
    fit_function = theano.function([X_sym, y_sym], [cost], updates=updates)
    cost_function = theano.function([X_sym, y_sym], [cost])
    predict_function = theano.function([X_sym], [y_pred])
    checkpoint_dict = {}
    checkpoint_dict["fit_function"] = fit_function
    checkpoint_dict["cost_function"] = cost_function
    checkpoint_dict["predict_function"] = predict_function
    previous_epoch_results = None

epoch_results = early_stopping_trainer(
    fit_function, cost_function, checkpoint_dict, [X, y], minibatch_size,
    train_indices, valid_indices,
    fit_function_output_names=["cost"],
    cost_function_output_name="valid_cost",
    n_epochs=100,
    previous_epoch_results=previous_epoch_results,
    shuffle=True, random_state=random_state)
updates = opt.updates(params, grads, learning_rate)

print("Compiling fit...")
fit_function = theano.function(X_story_syms + [X_story_mask_sym] +
                               X_query_syms + [X_query_mask_sym, y_sym],
                               [cost], updates=updates)
print("Compiling cost...")
cost_function = theano.function(X_story_syms + [X_story_mask_sym] +
                                X_query_syms + [X_query_mask_sym, y_sym],
                                [cost])
print("Compiling predict...")
predict_function = theano.function(X_story_syms + [X_story_mask_sym] +
                                   X_query_syms + [X_query_mask_sym],
                                   [y_pred])


def accuracy(*args):
    xargs = args[:-1]
    y = args[-1]
    y_pred = predict_function(*xargs)[0]
    return np.mean((np.argmax(y_pred, axis=1).ravel()) ==
                   (np.argmax(y, axis=1).ravel()))

checkpoint_dict = {}
epoch_results = early_stopping_trainer(
    fit_function, accuracy, checkpoint_dict,
    [X_story, X_query, y_answer], minibatch_size,
    train_indices, valid_indices,
    list_of_minibatch_functions=[make_embedding_minibatch,
                                 make_embedding_minibatch,
                                 make_minibatch],
    fit_function_output_names=["cost"],
    cost_function_output_name="valid_cost",
    n_epochs=20)
learning_rate = 0.0002
opt = adam(params)
updates = opt.updates(params, grads, learning_rate)

# Checkpointing
try:
    checkpoint_dict = load_last_checkpoint()
    fit_function = checkpoint_dict["fit_function"]
    cost_function = checkpoint_dict["cost_function"]
    predict_function = checkpoint_dict["predict_function"]
    previous_epoch_results = checkpoint_dict["previous_epoch_results"]
except KeyError:
    fit_function = theano.function([X_sym, y_sym], [cost], updates=updates)
    cost_function = theano.function([X_sym, y_sym], [cost])
    predict_function = theano.function([X_sym], [y_pred])
    checkpoint_dict = {}
    checkpoint_dict["fit_function"] = fit_function
    checkpoint_dict["cost_function"] = cost_function
    checkpoint_dict["predict_function"] = predict_function
    previous_epoch_results = None

epoch_results = early_stopping_trainer(
    fit_function, cost_function, checkpoint_dict, [X, y], minibatch_size,
    train_indices, valid_indices,
    fit_function_output_names=["cost"],
    cost_function_output_name="valid_cost",
    n_epochs=100,
    previous_epoch_results=previous_epoch_results,
    shuffle=True, random_state=random_state)
def test_vae():
    minibatch_size = 10
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    X_sym = add_datasets_to_graph([X], ["X"], graph)

    l1_enc = softplus_layer([X_sym], graph, 'l1_enc', proj_dim=100,
                            random_state=random_state)
    mu = linear_layer([l1_enc], graph, 'mu', proj_dim=50,
                      random_state=random_state)
    log_sigma = linear_layer([l1_enc], graph, 'log_sigma', proj_dim=50,
                             random_state=random_state)
    samp = gaussian_log_sample_layer([mu], [log_sigma], graph,
                                     'gaussian_log_sample',
                                     random_state=random_state)
    l1_dec = softplus_layer([samp], graph, 'l1_dec', proj_dim=100,
                            random_state=random_state)
    out = sigmoid_layer([l1_dec], graph, 'out', proj_dim=X.shape[1],
                        random_state=random_state)

    kl = gaussian_log_kl([mu], [log_sigma], graph, 'gaussian_kl').mean()
    cost = binary_crossentropy(out, X_sym).mean() + kl
    params, grads = get_params_and_grads(graph, cost)

    learning_rate = 0.00000
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym], [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym], [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(fit_function, cost_function, checkpoint_dict,
                           [X], minibatch_size,
                           train_indices, valid_indices,
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)