def nnet_dropout(X, Y):
    """Neural net with dropout."""
    reg = 0.001  # Weight prior
    noise = .5   # Likelihood st. dev.

    net = (
        ab.InputLayer(name="X", n_samples=n_samples) >>
        ab.DenseMAP(output_dim=30, l2_reg=reg, l1_reg=0.) >>
        ab.Activation(tf.tanh) >>
        ab.DropOut(keep_prob=0.95) >>
        ab.DenseMAP(output_dim=20, l2_reg=reg, l1_reg=0.) >>
        ab.Activation(tf.tanh) >>
        ab.DropOut(keep_prob=0.95) >>
        ab.DenseMAP(output_dim=10, l2_reg=reg, l1_reg=0.) >>
        ab.Activation(tf.tanh) >>
        ab.DropOut(keep_prob=0.95) >>
        ab.DenseMAP(output_dim=5, l2_reg=reg, l1_reg=0.) >>
        ab.Activation(tf.tanh) >>
        ab.DenseMAP(output_dim=1, l2_reg=reg, l1_reg=0.)
    )

    phi, reg = net(X=X)
    lkhood = tf.distributions.Normal(loc=phi, scale=noise)
    loss = ab.max_posterior(lkhood, Y, reg)
    return phi, loss
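# The model builders in this collection all return a (prediction, loss) pair,
# so they share the same training scaffolding. Below is a minimal, hedged
# sketch of that scaffolding; the placeholder names, batch settings and the
# Adam optimiser are illustrative assumptions, not part of the original demos,
# and any module-level sample-count globals the builders reference must be
# defined before calling this.
def train_model(model, x, y, batch_size=10, n_iterations=1000):
    """Fit one of the (prediction, loss) model builders to data."""
    X_ = tf.placeholder(tf.float32, [None, x.shape[1]])
    Y_ = tf.placeholder(tf.float32, [None, 1])

    phi, loss = model(X_, Y_)
    train = tf.train.AdamOptimizer().minimize(loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # ab.batch yields feed dicts of random mini-batches
        batches = ab.batch({X_: x, Y_: y}, batch_size=batch_size,
                           n_iter=n_iterations)
        for data in batches:
            sess.run(train, feed_dict=data)

        # Predictive mean, averaged over the leading sample dimension
        y_hat = sess.run(ab.sample_mean(phi), feed_dict={X_: x})

    return y_hat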
def test_categorical_likelihood(make_data):
    """Test aboleth with a tf.distributions.Categorical likelihood.

    Since it is a bit of an odd half-multivariate case.
    """
    x, y, _, = make_data
    N, K = x.shape

    # Make two classes (K = 2)
    Y = np.zeros(len(y), dtype=np.int32)
    Y[y[:, 0] > 0] = 1

    layers = ab.stack(
        ab.InputLayer(name='X', n_samples=10),
        lambda X: (X, 0.0)  # Mock a sampling layer, with 2-class output
    )

    nn, reg = layers(X=x.astype(np.float32))
    like = tf.distributions.Categorical(logits=nn)

    ELBO = ab.elbo(like, Y, N, reg)
    MAP = ab.max_posterior(like, Y, reg)

    tc = tf.test.TestCase()
    with tc.test_session():
        tf.global_variables_initializer().run()

        assert like.probs.eval().shape == (10, N, K)
        assert like.prob(Y).eval().shape == (10, N)

        L = ELBO.eval()
        assert np.isscalar(L)

        L = MAP.eval()
        assert np.isscalar(L)
def test_concat(make_data):
    """Test concatenation layer."""
    x, _, X = make_data

    # This replicates the input layer behaviour
    f = ab.InputLayer('X', n_samples=3)
    g = ab.InputLayer('Y', n_samples=3)

    catlayer = ab.Concat(f, g)

    F, KL = catlayer(X=x, Y=x)

    tc = tf.test.TestCase()
    with tc.test_session():
        forked = F.eval()
        orig = X.eval()
        assert forked.shape == orig.shape[0:2] + (2 * orig.shape[2],)
        assert np.all(forked == np.dstack((orig, orig)))
        assert KL.eval() == 0.0
def test_input(make_data):
    """Test the input layer."""
    x, _, X = make_data
    s = ab.InputLayer(name='myname')

    F, KL = s(myname=x)
    tc = tf.test.TestCase()
    with tc.test_session():
        f = F.eval()
        assert KL == 0.0
        assert np.array_equal(f, x[np.newaxis, ...])
def test_ncp_output(make_data):
    """Test we are making the ncp extra samples correctly, and KL is OK."""
    x, _, X = make_data
    x = x.astype(np.float32)

    net_ncp = (
        ab.InputLayer(name='X', n_samples=1) >>
        ab.NCPContinuousPerturb(input_noise=1.) >>
        ab.DenseNCP(output_dim=1)
    )

    net = (
        ab.InputLayer(name='X', n_samples=1) >>
        ab.DenseVariational(output_dim=1)
    )

    F, KL = net_ncp(X=x)
    F_var, KL_var = net(X=x)

    tc = tf.test.TestCase()
    with tc.test_session():
        tf.global_variables_initializer().run()
        f, f_var = F.eval(), F_var.eval()
        assert f.shape[0] == 1
        assert f.shape == f_var.shape
        assert KL.eval() >= KL_var.eval()
def my_model(features, labels, mode, params):
    N = params["N"]
    n_samples = NSAMPLES if mode == tf.estimator.ModeKeys.TRAIN \
        else NPREDICTSAMPLES

    X = tf.feature_column.input_layer(features, params['feature_columns'])

    kernel = ab.RBF(LENSCALE, learn_lenscale=True)
    net = (
        ab.InputLayer(name="X", n_samples=n_samples) >>
        ab.RandomFourier(n_features=NFEATURES, kernel=kernel) >>
        ab.Dense(output_dim=64, init_fn="autonorm") >>
        ab.Activation(tf.nn.selu) >>
        ab.DenseVariational(output_dim=1, full=False, prior_std=1.0,
                            learn_prior=True)
    )

    phi, kl = net(X=X)
    std = ab.pos_variable(NOISE, name="noise")
    ll_f = tf.distributions.Normal(loc=phi, scale=std)
    predict_mean = ab.sample_mean(phi)

    # Compute predictions.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {'predictions': predict_mean, 'samples': phi}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    ll = ll_f.log_prob(labels)
    loss = ab.elbo(ll, kl, N)
    tf.summary.scalar('loss', loss)

    # Compute evaluation metrics.
    mse = tf.metrics.mean_squared_error(labels=labels,
                                        predictions=predict_mean,
                                        name='mse_op')
    r2 = r2_metric(labels, predict_mean)
    metrics = {'mse': mse, 'r2': r2}

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=metrics)

    # Create training op.
    assert mode == tf.estimator.ModeKeys.TRAIN

    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss,
                                  global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
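# A hedged sketch of how a model function like my_model plugs into the
# tf.estimator API. The feature column name 'X', the toy data handling and the
# training settings below are illustrative assumptions; Estimator,
# numpy_input_fn, train and predict are standard tf.estimator (TF 1.x) calls.
def run_estimator(x_train, y_train):
    """Train my_model with tf.estimator and return per-example predictions."""
    feature_columns = [
        tf.feature_column.numeric_column('X', shape=x_train.shape[1:])
    ]
    estimator = tf.estimator.Estimator(
        model_fn=my_model,
        params={"N": len(x_train), "feature_columns": feature_columns}
    )

    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'X': x_train}, y=y_train,
        batch_size=32, num_epochs=None, shuffle=True
    )
    estimator.train(train_input_fn, steps=5000)

    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'X': x_train}, num_epochs=1, shuffle=False
    )
    predictions = [p['predictions'] for p in
                   estimator.predict(predict_input_fn)]
    return predictions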
def test_input_sample(make_data):
    """Test the input and tiling layer."""
    x, _, X = make_data
    s = ab.InputLayer(name='myname', n_samples=3)

    F, KL = s(myname=x)
    tc = tf.test.TestCase()
    with tc.test_session():
        f = F.eval()
        X_array = X.eval()
        assert KL == 0.0
        assert np.array_equal(f, X_array)
        for i in range(3):
            assert np.array_equal(f[i], x)
def linear(X, Y):
    """Linear regression with l2 regularization."""
    lambda_ = 1e-4  # Weight regularizer
    noise = 1.      # Likelihood st. dev.

    net = (
        ab.InputLayer(name="X") >>
        ab.DenseMAP(output_dim=1, l2_reg=lambda_, l1_reg=0.)
    )

    Xw, reg = net(X=X)
    lkhood = tf.distributions.Normal(loc=Xw, scale=noise)
    loss = ab.max_posterior(lkhood, Y, reg)
    # loss = 0.5 * tf.reduce_mean((Y - Xw)**2) + reg
    return Xw, loss
def bayesian_linear(X, Y):
    """Bayesian Linear Regression."""
    noise = ab.pos_variable(1.0)

    net = (
        ab.InputLayer(name="X", n_samples=n_samples_) >>
        ab.DenseVariational(output_dim=1, full=True)
    )

    f, kl = net(X=X)
    lkhood = tf.distributions.Normal(loc=f, scale=noise).log_prob(Y)
    loss = ab.elbo(lkhood, kl, N)
    return f, loss
def bayesian_linear(X, Y):
    """Bayesian Linear Regression."""
    lambda_ = 100.
    std = (1 / lambda_) ** .5  # Weight st. dev. prior
    noise = tf.Variable(1.)    # Likelihood st. dev. initialisation, and learning

    net = (
        ab.InputLayer(name="X", n_samples=n_samples_) >>
        ab.DenseVariational(output_dim=1, std=std, full=True)
    )

    f, kl = net(X=X)
    lkhood = tf.distributions.Normal(loc=f, scale=ab.pos(noise))
    loss = ab.elbo(lkhood, Y, N, kl)
    return f, loss
def linear(X, Y):
    """Linear regression with l2 regularization."""
    reg = .01   # Weight prior
    noise = .5  # Likelihood st. dev.

    net = (
        ab.InputLayer(name="X", n_samples=1) >>
        ab.DenseMAP(output_dim=1, l2_reg=reg, l1_reg=0.)
    )

    phi, reg = net(X=X)
    lkhood = tf.distributions.Normal(loc=phi, scale=noise)
    loss = ab.max_posterior(lkhood, Y, reg)
    return phi, loss
def bayesian_linear(X, Y):
    """Bayesian Linear Regression."""
    reg = .01                # Initial weight prior std. dev., this is optimised later
    noise = tf.Variable(.5)  # Likelihood st. dev. initialisation, and learning

    net = (
        ab.InputLayer(name="X", n_samples=n_samples) >>
        ab.DenseVariational(output_dim=1, std=reg, full=True)
    )

    phi, kl = net(X=X)
    lkhood = tf.distributions.Normal(loc=phi, scale=ab.pos(noise))
    loss = ab.elbo(lkhood, Y, N, kl)
    return phi, loss
def make_graph():
    """Make the requirements for making a simple tf graph."""
    x, Y, X = data()

    layers = ab.stack(
        ab.InputLayer(name='X', n_samples=10),
        lambda X: (X[:, :, 0:1], 0.0)  # Mock a sampling layer
    )

    N = len(x)
    X_ = tf.placeholder(tf.float32, x.shape)
    Y_ = tf.placeholder(tf.float32, Y.shape)
    N_ = tf.placeholder(tf.float32)

    return x, Y, N, X_, Y_, N_, layers
def test_ncp_con_input_samples(make_data):
    """Test we are making the ncp extra samples correctly."""
    x, _, X = make_data

    net = (
        ab.InputLayer(name='X', n_samples=1) >>
        ab.NCPContinuousPerturb(input_noise=1.)
    )

    F, KL = net(X=x)

    tc = tf.test.TestCase()
    with tc.test_session():
        f = F.eval()
        assert KL.eval() == 0.0
        assert f.shape[0] == 2
        assert np.all(f[0] == x)
        assert np.all(f[1] != x)
def svr(X, Y):
    """Support vector regressor."""
    reg = 0.1
    eps = 0.01
    lenscale = 1.

    kern = ab.RBF(lenscale=lenscale)  # RBF kernel with a fixed length scale

    net = (
        ab.InputLayer(name="X", n_samples=1) >>
        ab.RandomFourier(n_features=50, kernel=kern) >>
        ab.DenseMAP(output_dim=1, l2_reg=reg, l1_reg=0.)
    )

    phi, reg = net(X=X)
    loss = tf.reduce_mean(tf.maximum(tf.abs(Y - phi - eps), 0.)) + reg
    return phi, loss
def nnet_bayesian(X, Y):
    """Bayesian neural net."""
    lambda_ = 1e-1             # Weight prior
    noise = tf.Variable(0.01)  # Likelihood st. dev. initialisation

    net = (
        ab.InputLayer(name="X", n_samples=n_samples_) >>
        ab.DenseVariational(output_dim=20, std=lambda_) >>
        ab.Activation(tf.nn.relu) >>
        ab.DenseVariational(output_dim=7, std=lambda_) >>
        ab.Activation(tf.nn.relu) >>
        ab.DenseVariational(output_dim=5, std=lambda_) >>
        ab.Activation(tf.tanh) >>
        ab.DenseVariational(output_dim=1, std=lambda_)
    )

    f, kl = net(X=X)
    lkhood = tf.distributions.Normal(loc=f, scale=ab.pos(noise))
    loss = ab.elbo(lkhood, Y, N, kl)
    return f, loss
def gaussian_process(X, Y):
    """Gaussian Process Regression."""
    noise = ab.pos_variable(.5)
    kern = ab.RBF(learn_lenscale=False)  # keep the length scale fixed

    net = (
        ab.InputLayer(name="X", n_samples=n_samples_) >>
        ab.RandomFourier(n_features=50, kernel=kern) >>
        ab.DenseVariational(output_dim=1, full=True, learn_prior=True)
    )

    f, kl = net(X=X)
    lkhood = tf.distributions.Normal(loc=f, scale=noise).log_prob(Y)
    loss = ab.elbo(lkhood, kl, N)
    return f, loss
def deep_gaussian_process(X, Y):
    """Deep Gaussian Process Regression."""
    noise = ab.pos_variable(.1)

    net = (
        ab.InputLayer(name="X", n_samples=n_samples_) >>
        ab.RandomFourier(n_features=20, kernel=ab.RBF(learn_lenscale=True)) >>
        ab.DenseVariational(output_dim=5, full=False) >>
        ab.RandomFourier(n_features=10, kernel=ab.RBF(1., seed=1)) >>
        ab.DenseVariational(output_dim=1, full=False, learn_prior=True)
    )

    f, kl = net(X=X)
    lkhood = tf.distributions.Normal(loc=f, scale=noise).log_prob(Y)
    loss = ab.elbo(lkhood, kl, N)
    return f, loss
def nnet(X, Y):
    """Neural net with regularization."""
    lambda_ = 1e-4  # Weight regularizer
    noise = .5      # Likelihood st. dev.

    net = (
        ab.InputLayer(name="X", n_samples=1) >>
        ab.DenseMAP(output_dim=40, l2_reg=lambda_, l1_reg=0.) >>
        ab.Activation(tf.tanh) >>
        ab.DenseMAP(output_dim=20, l2_reg=lambda_, l1_reg=0.) >>
        ab.Activation(tf.tanh) >>
        ab.DenseMAP(output_dim=10, l2_reg=lambda_, l1_reg=0.) >>
        ab.Activation(tf.tanh) >>
        ab.DenseMAP(output_dim=1, l2_reg=lambda_, l1_reg=0.)
    )

    f, reg = net(X=X)
    lkhood = tf.distributions.Normal(loc=f, scale=noise)
    loss = ab.max_posterior(lkhood, Y, reg)
    return f, loss
def deep_gaussian_process(X, Y):
    """Deep Gaussian Process Regression."""
    lambda_ = 0.1               # Initial weight prior std. dev., this is optimised later
    noise = tf.Variable(.01)    # Likelihood st. dev. initialisation
    lenscale = tf.Variable(1.)  # learn the length scale

    net = (
        ab.InputLayer(name="X", n_samples=n_samples_) >>
        ab.RandomFourier(n_features=20, kernel=ab.RBF(ab.pos(lenscale))) >>
        ab.DenseVariational(output_dim=5, std=lambda_, full=False) >>
        ab.RandomFourier(n_features=10, kernel=ab.RBF(1.)) >>
        ab.DenseVariational(output_dim=1, std=lambda_, full=False)
    )

    f, kl = net(X=X)
    lkhood = tf.distributions.Normal(loc=f, scale=ab.pos(noise))
    loss = ab.elbo(lkhood, Y, N, kl)
    return f, loss
def test_categorical_likelihood(make_data, likelihood):
    """Test aboleth with discrete likelihoods.

    Since these are kind of corner cases...
    """
    x, y, _, = make_data
    like, K = likelihood
    N, _ = x.shape

    # Make two classes (K = 2)
    Y = np.zeros(len(y), dtype=np.int32)
    Y[y[:, 0] > 0] = 1

    if K == 1:
        Y = Y[:, np.newaxis]

    X_ = tf.placeholder(tf.float32, x.shape)
    Y_ = tf.placeholder(tf.int32, Y.shape)
    n_samples_ = tf.placeholder(tf.int32)

    layers = ab.stack(
        ab.InputLayer(name='X', n_samples=n_samples_),
        ab.Dense(output_dim=K)
    )

    nn, reg = layers(X=X_)
    like = like(logits=nn)
    log_like = like.log_prob(Y_)
    prob = like.prob(Y_)

    ELBO = ab.elbo(log_like, reg, N)
    MAP = ab.max_posterior(log_like, reg)

    fd = {X_: x, Y_: Y, n_samples_: 10}
    tc = tf.test.TestCase()
    with tc.test_session():
        tf.global_variables_initializer().run()

        assert like.probs.eval(feed_dict=fd).shape == (10, N, K)
        assert prob.eval(feed_dict=fd).shape == (10,) + Y.shape

        L = ELBO.eval(feed_dict=fd)
        assert np.isscalar(L)

        L = MAP.eval(feed_dict=fd)
        assert np.isscalar(L)
def gaussian_process(X, Y):
    """Gaussian Process Regression."""
    lambda_ = 0.1               # Initial weight prior std. dev., this is optimised later
    noise = tf.Variable(.5)     # Likelihood st. dev. initialisation, and learning
    lenscale = tf.Variable(1.)  # learn the length scale

    kern = ab.RBF(lenscale=ab.pos(lenscale))  # keep the length scale positive
    # kern = ab.RBFVariational(lenscale=ab.pos(lenscale))

    net = (
        ab.InputLayer(name="X", n_samples=n_samples_) >>
        ab.RandomFourier(n_features=50, kernel=kern) >>
        ab.DenseVariational(output_dim=1, std=lambda_, full=True)
    )

    f, kl = net(X=X)
    lkhood = tf.distributions.Normal(loc=f, scale=ab.pos(noise))
    # lkhood = tf.distributions.StudentT(df=1., loc=f, scale=ab.pos(noise))
    loss = ab.elbo(lkhood, Y, N, kl)
    return f, loss
def svr(X, Y):
    """Support vector regressor, kind of..."""
    lambda_ = 1e-4
    eps = 0.01
    lenscale = 1.

    # Specify which kernel to approximate with the random Fourier features
    kern = ab.RBF(lenscale=lenscale)

    net = (
        # ab.InputLayer(name="X", n_samples=n_samples_) >>
        ab.InputLayer(name="X", n_samples=1) >>
        ab.RandomFourier(n_features=50, kernel=kern) >>
        # ab.DropOut(keep_prob=0.9) >>
        ab.DenseMAP(output_dim=1, l2_reg=lambda_, l1_reg=0.)
    )

    f, reg = net(X=X)
    loss = tf.reduce_mean(tf.nn.relu(tf.abs(Y - f) - eps)) + reg
    return f, loss
def nnet_bayesian(X, Y):
    """Bayesian neural net."""
    noise = 0.01

    net = (
        ab.InputLayer(name="X", n_samples=n_samples_) >>
        ab.DenseVariational(output_dim=5) >>
        ab.Activation(tf.nn.selu) >>
        ab.DenseVariational(output_dim=4) >>
        ab.Activation(tf.nn.selu) >>
        ab.DenseVariational(output_dim=3) >>
        ab.Activation(tf.nn.selu) >>
        ab.DenseVariational(output_dim=1)
    )

    f, kl = net(X=X)
    lkhood = tf.distributions.Normal(loc=f, scale=noise).log_prob(Y)
    loss = ab.elbo(lkhood, kl, N)
    return f, loss
def test_ncp_cat_input_samples(make_categories):
    """Test we are making the ncp extra samples correctly."""
    x, K = make_categories
    pflip = 0.5
    pcats = 1. / K
    pdiff = pflip * (1 - pcats)

    net = (
        ab.InputLayer(name='X', n_samples=1) >>
        ab.NCPCategoricalPerturb(flip_prob=pflip, n_categories=K)
    )

    F, KL = net(X=x)

    tc = tf.test.TestCase()
    with tc.test_session():
        f = F.eval()
        assert KL.eval() == 0.0
        assert f.shape[0] == 2
        assert np.all(f[0] == x)
        prob = np.mean(f[1] != x)
        assert np.allclose(prob, pdiff, atol=0.1)
def nnet_dropout(X, Y):
    """Neural net with dropout."""
    lambda_ = 1e-3  # Weight prior
    noise = .5      # Likelihood st. dev.

    net = (
        ab.InputLayer(name="X", n_samples=n_samples_) >>
        ab.Dense(output_dim=32, l2_reg=lambda_) >>
        ab.Activation(tf.nn.selu) >>
        ab.DropOut(keep_prob=0.9, independent=True) >>
        ab.Dense(output_dim=16, l2_reg=lambda_) >>
        ab.Activation(tf.nn.selu) >>
        ab.DropOut(keep_prob=0.95, independent=True) >>
        ab.Dense(output_dim=8, l2_reg=lambda_) >>
        ab.Activation(tf.nn.selu) >>
        ab.Dense(output_dim=1, l2_reg=lambda_)
    )

    f, reg = net(X=X)
    lkhood = tf.distributions.Normal(loc=f, scale=noise).log_prob(Y)
    loss = ab.max_posterior(lkhood, reg)
    return f, loss
def nnet_ncp(X, Y):
    """Noise contrastive prior network."""
    noise = ab.pos_variable(.5)
    lstd = 1.
    perturb_noise = 10.

    net = (
        ab.InputLayer(name="X", n_samples=n_samples_) >>
        ab.NCPContinuousPerturb(input_noise=perturb_noise) >>
        ab.Dense(output_dim=32) >>
        ab.Activation(tf.nn.selu) >>
        ab.Dense(output_dim=16) >>
        ab.Activation(tf.nn.selu) >>
        ab.Dense(output_dim=8) >>
        ab.Activation(tf.nn.selu) >>
        ab.DenseNCP(output_dim=1, prior_std=.1, latent_std=lstd)
    )

    f, kl = net(X=X)
    lkhood = tf.distributions.Normal(loc=f, scale=noise).log_prob(Y)
    loss = ab.elbo(lkhood, kl, N)
    return f, loss
def main():
    """Run the demo."""
    # Get continuous and categorical data
    df_train, df_test = fetch_data()
    df = pd.concat((df_train, df_test))
    X_con, X_cat, n_cats, Y = input_fn(df)

    n_samples_ = tf.placeholder_with_default(T_SAMPLES, [])

    # Define the continuous layers
    con_layer = (
        ab.InputLayer(name='con', n_samples=n_samples_) >>
        ab.RandomFourier(100, kernel=ab.RBF(learn_lenscale=True)) >>
        ab.Dense(output_dim=16, init_fn="autonorm")
    )

    # Now define the categorical layers, which we embed
    # Note every Embed call can be different, this is just "lazy"
    cat_layer_list = [ab.Embed(EMBED_DIMS, i, init_fn="autonorm")
                      for i in n_cats]
    cat_layer = (
        ab.InputLayer(name='cat', n_samples=n_samples_) >>
        ab.PerFeature(*cat_layer_list) >>  # Assign columns to embedding layers
        ab.Activation(tf.nn.selu) >>
        ab.Dense(16, init_fn="autonorm")
    )

    # Now we can feed the initial continuous and categorical layers to further
    # "joint" layers after we concatenate them
    net = (
        ab.Concat(con_layer, cat_layer) >>
        ab.Activation(tf.nn.selu) >>
        ab.DenseVariational(output_dim=1)
    )

    # Split data into training and testing
    Xt_con, Xs_con = np.split(X_con, [len(df_train)], axis=0)
    Xt_cat, Xs_cat = np.split(X_cat, [len(df_train)], axis=0)
    Yt, Ys = np.split(Y, [len(df_train)], axis=0)

    # Graph place holders
    X_con_ = tf.placeholder(tf.float32, [None, Xt_con.shape[1]])
    X_cat_ = tf.placeholder(tf.int32, [None, Xt_cat.shape[1]])
    Y_ = tf.placeholder(tf.float32, [None, 1])

    # Feed dicts
    train_dict = {X_con_: Xt_con, X_cat_: Xt_cat, Y_: Yt}
    test_dict = {X_con_: Xs_con, X_cat_: Xs_cat, n_samples_: P_SAMPLES}

    # Make model
    N = len(Xt_con)
    nn, kl = net(con=X_con_, cat=X_cat_)
    likelihood = tf.distributions.Bernoulli(logits=nn)
    prob = ab.sample_mean(likelihood.probs)

    loss = ab.elbo(likelihood.log_prob(Y_), kl, N)
    optimizer = tf.train.AdamOptimizer()
    train = optimizer.minimize(loss)
    init = tf.global_variables_initializer()

    with tf.Session(config=CONFIG):
        init.run()

        # We're going to just use a feed_dict to feed in batches, which we
        # generate here
        batches = ab.batch(train_dict, batch_size=BSIZE, n_iter=NITER)

        for i, data in enumerate(batches):
            train.run(feed_dict=data)

            if i % 1000 == 0:
                loss_val = loss.eval(feed_dict=data)
                print("Iteration {}, loss = {}".format(i, loss_val))

        # Predict
        Ep = prob.eval(feed_dict=test_dict)

    Ey = Ep > 0.5  # Max probability assignment

    acc = accuracy_score(Ys.flatten(), Ey.flatten())
    logloss = log_loss(Ys.flatten(), np.hstack((1 - Ep, Ep)))

    print("Accuracy = {}, log loss = {}".format(acc, logloss))
rseed = 100
ab.set_hyperseed(rseed)

# Optimization
n_epochs = 50
batch_size = 100
config = tf.ConfigProto(device_count={'GPU': 0})  # Use GPU ?

reg = 0.1

l_samples = 5
p_samples = 5

# Network architecture
net = ab.stack(
    ab.InputLayer(name='X', n_samples=l_samples),    # LSAMPLES, BATCH_SIZE, 28*28
    ab.Conv2D(filters=32, kernel_size=(5, 5),
              l2_reg=reg),                           # LSAMPLES, BATCH_SIZE, 28, 28, 32
    ab.Activation(h=tf.nn.relu),
    ab.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),  # LSAMPLES, BATCH_SIZE, 14, 14, 32
    ab.Conv2D(filters=64, kernel_size=(5, 5),
              l2_reg=reg),                           # LSAMPLES, BATCH_SIZE, 14, 14, 64
    ab.Activation(h=tf.nn.relu),
    ab.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),  # LSAMPLES, BATCH_SIZE, 7, 7, 64
    ab.Flatten(),                                    # LSAMPLES, BATCH_SIZE, 7*7*64
    ab.Dense(output_dim=1024, l2_reg=reg),           # LSAMPLES, BATCH_SIZE, 1024
    ab.Activation(h=tf.nn.relu),
    ab.DropOut(0.5),
    ab.Dense(output_dim=10, l2_reg=reg),             # LSAMPLES, BATCH_SIZE, 10
)
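# A hedged sketch of how a classification loss might be attached to the stacked
# network above, following the max_posterior pattern used in the other
# MAP-style examples in this collection. The placeholder shapes and the use of
# a Categorical likelihood over the 10 output logits are assumptions for
# illustration; adjust the input shape to match how the images are fed.
X_ = tf.placeholder(tf.float32, [None, 28, 28, 1])  # assumed image-shaped input
Y_ = tf.placeholder(tf.int32, [None])                # integer class labels 0-9

logits, reg_term = net(X=X_)
likelihood = tf.distributions.Categorical(logits=logits)
loss = ab.max_posterior(likelihood.log_prob(Y_), reg_term)

train_step = tf.train.AdamOptimizer().minimize(loss)

# Per-class probabilities, averaged over the leading l_samples dimension
probs = ab.sample_mean(likelihood.probs)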
class WrapperLayer(ab.layers.SampleLayer):
    # Wraps a Keras layer class so it can be used in an aboleth net. The base
    # class here is an assumption: any aboleth layer base whose __call__
    # dispatches to _build(X) on the per-sample input will do.

    def __init__(self, layer, *args, **kwargs):
        self.layer = layer(*args, **kwargs)

    def _build(self, X):
        """Build the graph of this layer."""
        Net = self.layer(X)
        # aggregate layer regularization terms
        KL = tf.reduce_sum(self.layer.losses)
        return Net, KL


n_samples_ = tf.placeholder(tf.int32)
l1_l2_reg = tf.keras.regularizers.l1_l2(l1=0., l2=0.)

net = (
    ab.InputLayer(name="X", n_samples=n_samples_) >>
    WrapperLayer(tf.keras.layers.Dense, units=64, activation='tanh',
                 kernel_regularizer=l1_l2_reg, bias_regularizer=l1_l2_reg) >>
    ab.DropOut(keep_prob=.9) >>
    WrapperLayer(tf.keras.layers.Dense, units=32, activation='tanh',
                 kernel_regularizer=l1_l2_reg, bias_regularizer=l1_l2_reg) >>
    ab.DropOut(keep_prob=.9) >>
    WrapperLayer(tf.keras.layers.Dense, units=1,
                 kernel_regularizer=l1_l2_reg, bias_regularizer=l1_l2_reg)
)
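# A hedged sketch of training the Keras-wrapped net above. The toy data, the
# Normal likelihood, the fixed noise scale and the max_posterior call follow
# the newer-style MAP examples in this collection and are assumptions here,
# not part of the original snippet.
D = 10
x_train = np.random.randn(100, D).astype(np.float32)
y_train = np.random.randn(100, 1).astype(np.float32)

X_ = tf.placeholder(tf.float32, [None, D])
Y_ = tf.placeholder(tf.float32, [None, 1])

phi, reg_term = net(X=X_)
likelihood = tf.distributions.Normal(loc=phi, scale=1.)
loss = ab.max_posterior(likelihood.log_prob(Y_), reg_term)

train_step = tf.train.AdamOptimizer().minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for data in ab.batch({X_: x_train, Y_: y_train},
                         batch_size=32, n_iter=5000):
        data[n_samples_] = 5  # number of dropout samples per training step
        sess.run(train_step, feed_dict=data)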