def test_vae():
    minibatch_size = 100
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    l1_enc = relu_layer([X_sym, y_sym], graph, 'l1_enc', proj_dim=20,
                        random_state=random_state)
    mu = linear_layer([l1_enc], graph, 'mu', proj_dim=10,
                      random_state=random_state)
    log_sigma = linear_layer([l1_enc], graph, 'log_sigma', proj_dim=10,
                             random_state=random_state)
    samp = gaussian_log_sample_layer([mu], [log_sigma], graph,
                                     'gaussian_log_sample',
                                     random_state=random_state)
    l1_dec = relu_layer([samp], graph, 'l1_dec', proj_dim=20,
                        random_state=random_state)
    out = sigmoid_layer([l1_dec], graph, 'out', proj_dim=X.shape[1],
                        random_state=random_state)
    kl = gaussian_log_kl([mu], [log_sigma], graph, 'gaussian_kl').mean()
    cost = binary_crossentropy(out, X_sym).mean() + kl
    params, grads = get_params_and_grads(graph, cost)

    learning_rate = 0.001
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    train_function = theano.function([X_sym, y_sym], [cost], updates=updates,
                                     mode="FAST_COMPILE")
    iterate_function(train_function, [X, y], minibatch_size,
                     list_of_output_names=["cost"], n_epochs=1)
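# Note: the test functions in this section reference module-level fixtures
# (X, y, n_classes, minibatch_size) and library imports defined at the top of
# the test module, which are not shown here. A minimal, purely hypothetical
# stand-in, assuming 2D features in [0, 1] and one-hot targets:
from collections import OrderedDict
import numpy as np
import theano

random_state = np.random.RandomState(1999)
X = random_state.rand(500, 64).astype(theano.config.floatX)  # features in [0, 1]
labels = random_state.randint(0, 10, 500)
n_classes = len(np.unique(labels))
y = np.eye(n_classes)[labels].astype(theano.config.floatX)   # one-hot targets
minibatch_size = 100
# The layer, cost, optimizer, and trainer helpers used below are imported from
# the library under test; their import paths are not reproduced here.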
def test_tanh_rnn():
    # random state so script is deterministic
    random_state = np.random.RandomState(1999)
    # home of the computational graph
    graph = OrderedDict()

    # number of hidden features
    n_hid = 10
    # number of output_features = input_features
    n_out = X.shape[-1]

    # input (where first dimension is time)
    datasets_list = [X, X_mask, y, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    test_values_list = [X, X_mask, y, y_mask]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph,
        list_of_test_values=test_values_list)

    # Setup weights
    l1 = linear_layer([X_sym], graph, 'l1_proj', n_hid, random_state)

    h = tanh_recurrent_layer([l1], X_mask_sym, n_hid, graph, 'l1_rec',
                             random_state)

    # linear output activation
    y_hat = linear_layer([h], graph, 'l2_proj', n_out, random_state)

    # error between output and target
    cost = squared_error(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()
    # Parameters of the model
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    opt = sgd(params)
    learning_rate = 0.001
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(X.shape[1])
    valid_indices = np.arange(X.shape[1])
    early_stopping_trainer(fit_function, cost_function, checkpoint_dict,
                           [X, y], minibatch_size,
                           train_indices, valid_indices,
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)
def test_tanh_rnn():
    # random state so script is deterministic
    random_state = np.random.RandomState(1999)
    # home of the computational graph
    graph = OrderedDict()

    # number of hidden features
    n_hid = 10
    # number of output_features = input_features
    n_out = X.shape[-1]

    # input (where first dimension is time)
    datasets_list = [X, X_mask, y, y_mask]
    names_list = ["X", "X_mask", "y", "y_mask"]
    test_values_list = [X, X_mask, y, y_mask]
    X_sym, X_mask_sym, y_sym, y_mask_sym = add_datasets_to_graph(
        datasets_list, names_list, graph,
        list_of_test_values=test_values_list)

    # Setup weights
    l1 = linear_layer([X_sym], graph, 'l1_proj', proj_dim=n_hid,
                      random_state=random_state)

    h = tanh_recurrent_layer([l1], X_mask_sym, n_hid, graph, 'l1_rec',
                             random_state)

    # linear output activation
    y_hat = linear_layer([h], graph, 'l2_proj', proj_dim=n_out,
                         random_state=random_state)

    # error between output and target
    cost = squared_error(y_hat, y_sym)
    cost = masked_cost(cost, y_mask_sym).mean()
    # Parameters of the model
    params, grads = get_params_and_grads(graph, cost)

    # Use stochastic gradient descent to optimize
    learning_rate = 0.001
    opt = sgd(params, learning_rate)
    updates = opt.updates(params, grads)

    fit_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                   [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym, X_mask_sym, y_sym, y_mask_sym],
                                    [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(X.shape[1])
    valid_indices = np.arange(X.shape[1])
    early_stopping_trainer(fit_function, cost_function,
                           train_indices, valid_indices,
                           checkpoint_dict, [X, y], minibatch_size,
                           list_of_train_output_names=["cost"],
                           valid_output_name="valid_cost",
                           n_epochs=1)
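# The two test_tanh_rnn variants above rely on module-level sequence fixtures
# shaped (time, minibatch, features) with matching (time, minibatch) masks,
# since they index the minibatch axis with X.shape[1]. A hypothetical stand-in
# consistent with that usage:
import numpy as np
import theano

random_state = np.random.RandomState(1999)
n_timesteps, n_samples, n_features = 10, 20, 4
X = random_state.rand(n_timesteps, n_samples, n_features).astype(
    theano.config.floatX)
y = X[::-1]  # e.g. predict the time-reversed sequence
X_mask = np.ones((n_timesteps, n_samples), dtype=theano.config.floatX)
y_mask = np.ones((n_timesteps, n_samples), dtype=theano.config.floatX)
minibatch_size = n_samples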
def test_gaussian_sample_layer():
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    mu = linear_layer([X_sym], graph, 'mu', proj_dim=20,
                      random_state=random_state)
    sigma = softplus_layer([X_sym], graph, 'sigma', proj_dim=20,
                           random_state=random_state)
    samp = gaussian_sample_layer([mu], [sigma], graph, 'gaussian_sample',
                                 random_state=random_state)
    out = linear_layer([samp], graph, 'out', proj_dim=10,
                       random_state=random_state)
    f = theano.function([X_sym], [out], mode="FAST_COMPILE")
def test_feedforward_theano_mix():
    minibatch_size = 100
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)

    l1_o = linear_layer([X_sym], graph, 'l1', proj_dim=20,
                        random_state=random_state)
    # mix a raw theano op in with graph-built layers
    l1_o = .999 * l1_o

    y_pred = softmax_layer([l1_o], graph, 'pred', n_classes,
                           random_state=random_state)

    cost = categorical_crossentropy(y_pred, y_sym).mean()
    params, grads = get_params_and_grads(graph, cost)

    learning_rate = 0.001
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym, y_sym], [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym, y_sym], [cost],
                                    mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(fit_function, cost_function, checkpoint_dict,
                           [X, y], minibatch_size,
                           train_indices, valid_indices,
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)
def test_vae():
    minibatch_size = 10
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()

    X_sym = add_datasets_to_graph([X], ["X"], graph)
    l1_enc = softplus_layer([X_sym], graph, 'l1_enc', proj_dim=100,
                            random_state=random_state)
    mu = linear_layer([l1_enc], graph, 'mu', proj_dim=50,
                      random_state=random_state)
    log_sigma = linear_layer([l1_enc], graph, 'log_sigma', proj_dim=50,
                             random_state=random_state)
    samp = gaussian_log_sample_layer([mu], [log_sigma], graph,
                                     'gaussian_log_sample',
                                     random_state=random_state)
    l1_dec = softplus_layer([samp], graph, 'l1_dec', proj_dim=100,
                            random_state=random_state)
    out = sigmoid_layer([l1_dec], graph, 'out', proj_dim=X.shape[1],
                        random_state=random_state)
    kl = gaussian_log_kl([mu], [log_sigma], graph, 'gaussian_kl').mean()
    cost = binary_crossentropy(out, X_sym).mean() + kl
    params, grads = get_params_and_grads(graph, cost)

    learning_rate = 0.00000
    opt = sgd(params, learning_rate)
    updates = opt.updates(params, grads)

    fit_function = theano.function([X_sym], [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym], [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(fit_function, cost_function,
                           train_indices, valid_indices,
                           checkpoint_dict, [X], minibatch_size,
                           list_of_train_output_names=["cost"],
                           valid_output_name="valid_cost",
                           n_epochs=1)
def test_softmax_sample_layer():
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    softmax = softmax_layer([X_sym], graph, 'softmax', proj_dim=20,
                            random_state=random_state)
    samp = softmax_sample_layer([softmax], graph, 'softmax_sample',
                                random_state=random_state)
    out = linear_layer([samp], graph, 'out', proj_dim=10,
                       random_state=random_state)
    f = theano.function([X_sym], [out], mode="FAST_COMPILE")
def test_gaussian_log_sample_layer():
    random_state = np.random.RandomState(42)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
    mu = linear_layer([X_sym], graph, 'mu', proj_dim=20,
                      random_state=random_state)
    log_sigma = linear_layer([X_sym], graph, 'log_sigma', proj_dim=20,
                             random_state=random_state)
    samp = gaussian_log_sample_layer([mu], [log_sigma], graph,
                                     'gaussian_sample',
                                     random_state=random_state)
    out = linear_layer([samp], graph, 'out', proj_dim=10,
                       random_state=random_state)
    f = theano.function([X_sym], [out], mode="FAST_COMPILE")
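# The sampling-layer tests above only compile their functions. A hypothetical
# call, assuming the 2D stand-in X fixture sketched earlier, would look like:
sample_out, = f(X)
assert sample_out.shape == (X.shape[0], 10)  # proj_dim of the 'out' layer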
def test_feedforward_classifier():
    minibatch_size = 100
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)

    l1_o = linear_layer([X_sym], graph, "l1", proj_dim=20,
                        random_state=random_state)
    y_pred = softmax_layer([l1_o], graph, "pred", n_classes,
                           random_state=random_state)

    cost = categorical_crossentropy(y_pred, y_sym).mean()
    params, grads = get_params_and_grads(graph, cost)

    learning_rate = 0.001
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    train_function = theano.function([X_sym, y_sym], [cost], updates=updates,
                                     mode="FAST_COMPILE")
    iterate_function(train_function, [X, y], minibatch_size,
                     list_of_output_names=["cost"], n_epochs=1)
def test_embedding_layer():
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    max_index = 100
    proj_dim = 12
    fake_str_int = [[1, 5, 7, 1, 6, 2], [2, 3, 6, 2], [3, 3, 3, 3, 3, 3, 3]]

    minibatch, mask = make_embedding_minibatch(fake_str_int, slice(0, 3))
    (emb_slices,), (emb_mask,) = add_embedding_datasets_to_graph(
        [minibatch], [mask], "emb", graph)
    emb = embedding_layer(emb_slices, max_index, proj_dim, graph, 'emb',
                          random_state)
    followup_dim = 17
    proj = linear_layer([emb], graph, 'proj', followup_dim, random_state)
    f = theano.function(emb_slices, [proj], mode="FAST_COMPILE")
    out, = f(*minibatch)
    assert (out.shape[-1] == 17)
    assert (out.shape[-2] == len(fake_str_int))
def test_embedding_layer():
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()
    max_index = 100
    proj_dim = 12
    fake_str_int = [[1, 5, 7, 1, 6, 2], [2, 3, 6, 2], [3, 3, 3, 3, 3, 3, 3]]

    minibatch, mask = make_embedding_minibatch(
        fake_str_int, slice(0, 3))
    (emb_slices,), (emb_mask,) = add_embedding_datasets_to_graph(
        [minibatch], [mask], "emb", graph)
    emb = embedding_layer(emb_slices, max_index, proj_dim, graph, 'emb',
                          random_state)
    followup_dim = 17
    proj = linear_layer([emb], graph, 'proj', followup_dim,
                        random_state=random_state)
    f = theano.function(emb_slices, [proj], mode="FAST_COMPILE")
    out, = f(*minibatch)
    assert(out.shape[-1] == 17)
    assert(out.shape[-2] == len(fake_str_int))
def test_fixed_projection_layer():
    random_state = np.random.RandomState(1999)
    rand_projection = random_state.randn(64, 12)

    graph = OrderedDict()
    X_sym = add_datasets_to_graph([X], ["X"], graph)
    out = fixed_projection_layer([X_sym], rand_projection, graph, 'proj')
    out2 = fixed_projection_layer([X_sym], rand_projection, graph, 'proj',
                                  pre=rand_projection[:, 0])
    out3 = fixed_projection_layer([X_sym], rand_projection, graph, 'proj',
                                  post=rand_projection[0])
    final = linear_layer([out2], graph, 'linear', 17,
                         random_state=random_state)
    # Test that it compiles with and without bias
    f = theano.function([X_sym], [out, out2, out3, final],
                        mode="FAST_COMPILE")

    # Test updates
    params, grads = get_params_and_grads(graph, final.mean())
    opt = sgd(params)
    updates = opt.updates(params, grads, .1)
    f2 = theano.function([X_sym], [out2, final], updates=updates)

    ret = f(np.ones_like(X))[0]
    assert ret.shape[1] != X.shape[1]
    ret2 = f(np.ones_like(X))[1]
    assert ret2.shape[1] != X.shape[1]

    out1, final1 = f2(X)
    out2, final2 = f2(X)
    # Make sure fixed basis is unchanged
    assert_almost_equal(out1, out2)
    # Make sure linear layer is updated
    assert_raises(AssertionError, assert_almost_equal, final1, final2)
def test_fixed_projection_layer():
    random_state = np.random.RandomState(1999)
    rand_projection = random_state.randn(64, 12)

    graph = OrderedDict()
    X_sym = add_datasets_to_graph([X], ["X"], graph)
    out = fixed_projection_layer([X_sym], rand_projection, graph, 'proj')
    out2 = fixed_projection_layer([X_sym], rand_projection, graph, 'proj',
                                  pre=rand_projection[:, 0])
    out3 = fixed_projection_layer([X_sym], rand_projection, graph, 'proj',
                                  post=rand_projection[0])
    final = linear_layer([out2], graph, 'linear', 17,
                         random_state=random_state)
    # Test that it compiles with and without bias
    f = theano.function([X_sym], [out, out2, out3, final],
                        mode="FAST_COMPILE")

    # Test updates
    params, grads = get_params_and_grads(graph, final.mean())
    opt = sgd(params, .1)
    updates = opt.updates(params, grads)
    f2 = theano.function([X_sym], [out2, final], updates=updates)

    ret = f(np.ones_like(X))[0]
    assert ret.shape[1] != X.shape[1]
    ret2 = f(np.ones_like(X))[1]
    assert ret2.shape[1] != X.shape[1]

    out1, final1 = f2(X)
    out2, final2 = f2(X)
    # Make sure fixed basis is unchanged
    assert_almost_equal(out1, out2)
    # Make sure linear layer is updated
    assert_raises(AssertionError, assert_almost_equal, final1, final2)
def test_vae():
    minibatch_size = 10
    random_state = np.random.RandomState(1999)
    graph = OrderedDict()

    X_sym = add_datasets_to_graph([X], ["X"], graph)
    l1_enc = softplus_layer([X_sym], graph, 'l1_enc', proj_dim=100,
                            random_state=random_state)
    mu = linear_layer([l1_enc], graph, 'mu', proj_dim=50,
                      random_state=random_state)
    log_sigma = linear_layer([l1_enc], graph, 'log_sigma', proj_dim=50,
                             random_state=random_state)
    samp = gaussian_log_sample_layer([mu], [log_sigma], graph,
                                     'gaussian_log_sample',
                                     random_state=random_state)
    l1_dec = softplus_layer([samp], graph, 'l1_dec', proj_dim=100,
                            random_state=random_state)
    out = sigmoid_layer([l1_dec], graph, 'out', proj_dim=X.shape[1],
                        random_state=random_state)
    kl = gaussian_log_kl([mu], [log_sigma], graph, 'gaussian_kl').mean()
    cost = binary_crossentropy(out, X_sym).mean() + kl
    params, grads = get_params_and_grads(graph, cost)

    learning_rate = 0.00000
    opt = sgd(params)
    updates = opt.updates(params, grads, learning_rate)

    fit_function = theano.function([X_sym], [cost], updates=updates,
                                   mode="FAST_COMPILE")
    cost_function = theano.function([X_sym], [cost], mode="FAST_COMPILE")

    checkpoint_dict = {}
    train_indices = np.arange(len(X))
    valid_indices = np.arange(len(X))
    early_stopping_trainer(fit_function, cost_function, checkpoint_dict,
                           [X], minibatch_size,
                           train_indices, valid_indices,
                           fit_function_output_names=["cost"],
                           cost_function_output_name="valid_cost",
                           n_epochs=1)
valid_indices = np.arange(len(sine_y))
X = sine_x
y = sine_y

# graph holds information necessary to build layers from parents
graph = OrderedDict()
X_sym, y_sym = add_datasets_to_graph([X, y], ["X", "y"], graph)
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = len(sine_y)
n_hid = 20
n_out = 1

l1 = tanh_layer([X_sym], graph, 'l1', proj_dim=n_hid,
                random_state=random_state)
y_pred = linear_layer([l1], graph, 'y_pred', proj_dim=n_out,
                      random_state=random_state)
cost = ((y_pred - y_sym) ** 2).mean()
# Can also define cost this way using dagbldr
# cost = squared_error(y_pred, y_sym).mean()
params, grads = get_params_and_grads(graph, cost)

learning_rate = 1E-3
momentum = 0.8
opt = rmsprop(params, learning_rate, momentum)
updates = opt.updates(params, grads)

fit_function = theano.function([X_sym, y_sym], [cost], updates=updates)
cost_function = theano.function([X_sym, y_sym], [cost])
predict_function = theano.function([X_sym], [y_pred])

checkpoint_dict = create_checkpoint_dict(locals())
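# The snippet stops after building checkpoint_dict; the library's trainer
# utilities would normally take over from here. As a minimal stand-in,
# assuming sine_x / sine_y are 2D float arrays and full-batch training
# (minibatch_size = len(sine_y)), the compiled functions can be driven by hand:
for epoch in range(1000):
    train_cost, = fit_function(X, y)
valid_cost, = cost_function(X, y)
y_hat, = predict_function(X)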
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = 100
n_code = 400
n_enc_layer = [600, 600]
n_dec_layer = [600, 600]
width = 48
height = 48
n_input = width * height

# encode path aka q
l1_enc = softplus_layer([X_sym], graph, 'l1_enc', n_enc_layer[0],
                        random_state)
l2_enc = softplus_layer([l1_enc], graph, 'l2_enc', n_enc_layer[1],
                        random_state)
code_mu = linear_layer([l2_enc], graph, 'code_mu', n_code, random_state)
code_log_sigma = linear_layer([l2_enc], graph, 'code_log_sigma', n_code,
                              random_state)
kl = gaussian_log_kl([code_mu], [code_log_sigma], graph, 'kl').mean()
samp = gaussian_log_sample_layer([code_mu], [code_log_sigma], graph, 'samp',
                                 random_state)

# decode path aka p
l1_dec = softplus_layer([samp], graph, 'l1_dec', n_dec_layer[0], random_state)
l2_dec = softplus_layer([l1_dec], graph, 'l2_dec', n_dec_layer[1],
                        random_state)
out = linear_layer([l2_dec], graph, 'out', n_input, random_state)

nll = squared_error(out, X_sym).mean()
# log p(x) = -nll so swap sign
# want to minimize cost in optimization so multiply by -1
cost = -1 * (-nll - kl)
# random state so script is deterministic
random_state = np.random.RandomState(1999)

minibatch_size = 100
n_code = 100
n_enc_layer = [200, 200]
n_dec_layer = [200, 200]
width = 28
height = 28
n_input = width * height

# encode path aka q
l1_enc = softplus_layer([X_sym], graph, 'l1_enc', n_enc_layer[0],
                        random_state)
l2_enc = softplus_layer([l1_enc], graph, 'l2_enc', n_enc_layer[1],
                        random_state)
code_mu = linear_layer([l2_enc], graph, 'code_mu', n_code, random_state)
code_log_sigma = linear_layer([l2_enc], graph, 'code_log_sigma', n_code,
                              random_state)
kl = gaussian_log_kl([code_mu], [code_log_sigma], graph, 'kl').mean()
samp = gaussian_log_sample_layer([code_mu], [code_log_sigma], graph, 'samp',
                                 random_state)

# decode path aka p
l1_dec = softplus_layer([samp], graph, 'l1_dec', n_dec_layer[0], random_state)
l2_dec = softplus_layer([l1_dec], graph, 'l2_dec', n_dec_layer[1],
                        random_state)
out = sigmoid_layer([l2_dec], graph, 'out', n_input, random_state)

nll = binary_crossentropy(out, X_sym).mean()
# log p(x) = -nll so swap sign
# want to minimize cost in optimization so multiply by -1
cost = -1 * (-nll - kl)
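# For reference, the sign juggling in the two VAE cost definitions above is
# just the negative variational lower bound (ELBO):
#   log p(x) >= E_{q(z|x)}[log p(x|z)] - KL(q(z|x) || p(z)) = -nll - kl
# so cost = -(-nll - kl) = nll + kl, i.e. reconstruction error plus KL.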
# combined q(y_pred | x) and partial q(z | x) for q(z | x, y_pred)
l3_enc = softplus_layer([X_l2_enc, y_pred], graph, 'l3_enc', n_enc_layer,
                        random_state=random_state)
l4_enc = softplus_layer([l3_enc], graph, 'l4_enc', n_enc_layer,
                        random_state=random_state)

# code layer
code_mu = linear_layer([l4_enc], graph, 'code_mu', n_code,
                       random_state=random_state)
code_log_sigma = linear_layer([l4_enc], graph, 'code_log_sigma', n_code,
                              random_state=random_state)
kl = gaussian_log_kl([code_mu], [code_log_sigma], graph, 'kl').mean()
samp = gaussian_log_sample_layer([code_mu], [code_log_sigma], graph, 'samp',
                                 random_state)

# decode path aka p(x | z, y) for labeled data
l1_dec = softplus_layer([samp, y_sym], graph, 'l1_dec',